ethnidata 4.0.1 (ethnidata-4.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ethnidata has been flagged as potentially problematic.

ethnidata/__init__.py ADDED
@@ -0,0 +1,96 @@
+ """
+ EthniData v4.0.1 - STATE-OF-THE-ART NAME ANALYSIS ENGINE
+ Predict nationality, ethnicity, gender, region, language AND religion!
+
+ 🔥 NEW in v4.0.0 - EXPLAINABLE AI & TRANSPARENCY:
+ - 🧠 **Explainability Layer** - Understand WHY predictions are made
+ - 📊 **Ambiguity Scoring** - Shannon entropy for uncertainty quantification (0-1)
+ - 🔍 **Morphology Detection** - Rule-based pattern recognition (9 cultural groups)
+ - 📈 **Confidence Breakdown** - Interpretable confidence components
+ - 🎯 **Synthetic Data Engine** - Privacy-safe test data generation
+ - 📚 **Academic-Grade** - Transparent, reproducible, legally compliant
+
+ Database:
+ - 📊 **5.9M+ records** (14x increase from v2.0.0)
+ - 🌍 **238 countries** - complete global coverage
+ - 🗣️ **72 languages**
+ - 🕌 **6 MAJOR WORLD RELIGIONS**:
+   - Christianity: 3.9M+ records (65.2%)
+   - Buddhism: 1.3M+ records (22.1%)
+   - Islam: 504K+ records (8.5%)
+   - Judaism: 121K+ records (2.0%)
+   - Hinduism: 90K+ records (1.5%)
+   - Sikhism: 24K+ records (0.4%)
+
+ Features:
+ - ✅ Nationality prediction (238 countries)
+ - ✅ Religion prediction (6 major world religions)
+ - ✅ Gender prediction
+ - ✅ Region prediction (5 continents)
+ - ✅ Language prediction (72 languages)
+ - ✅ Ethnicity prediction
+ - ✅ Full name analysis
+ - 🆕 Explainable AI (explain=True)
+ - 🆕 Morphology pattern detection
+ - 🆕 Ambiguity scoring (Shannon entropy)
+ - 🆕 Confidence breakdown
+ - 🆕 Synthetic data generation
+
+ Usage:
+     from ethnidata import EthniData
+
+     ed = EthniData()
+
+     # Basic prediction
+     result = ed.predict_nationality("Ahmet")
+
+     # v4.0.0: With explainability
+     result = ed.predict_nationality("Yılmaz", name_type="last", explain=True)
+     print(result['ambiguity_score'])     # Shannon entropy
+     print(result['confidence_level'])    # 'High', 'Medium', 'Low'
+     print(result['morphology_signal'])   # Detected patterns
+     print(result['explanation']['why'])  # Human-readable reasons
+
+     # Full name with explanation
+     result = ed.predict_full_name("Mehmet", "Yılmaz", explain=True)
+
+     # Morphology-only analysis
+     from ethnidata.morphology import MorphologyEngine
+     signal = MorphologyEngine.get_morphological_signal("O'Connor", "last")
+     # Returns: {'primary_pattern': "o'", 'pattern_type': 'gaelic', ...}
+
+     # Synthetic data generation
+     from ethnidata.synthetic import SyntheticDataEngine, SyntheticConfig
+     engine = SyntheticDataEngine(freq_provider)
+     config = SyntheticConfig(size=10000, country="TUR")
+     records = engine.generate(config)
+ """
+
+ __version__ = "4.0.1"
+ __author__ = "Teyfik Oz"
+ __license__ = "MIT"
+
+ from .predictor import EthniData
+ from .explainability import ExplainabilityEngine
+ from .morphology import MorphologyEngine, NameFeatureExtractor
+
+ # Synthetic module imports (optional, may not have freq_provider)
+ try:
+     from .synthetic import SyntheticDataEngine, SyntheticConfig, SyntheticRecord, FrequencyProvider
+     __all__ = [
+         "EthniData",
+         "ExplainabilityEngine",
+         "MorphologyEngine",
+         "NameFeatureExtractor",
+         "SyntheticDataEngine",
+         "SyntheticConfig",
+         "SyntheticRecord",
+         "FrequencyProvider"
+     ]
+ except ImportError:
+     __all__ = [
+         "EthniData",
+         "ExplainabilityEngine",
+         "MorphologyEngine",
+         "NameFeatureExtractor"
+     ]
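The try/except above means the synthetic-data names are only re-exported when the optional ethnidata.synthetic module imports cleanly. A minimal consumer-side sketch of the same pattern (the HAS_SYNTHETIC flag is illustrative, not part of the package):

    # Probe for the optional synthetic module the same way the package does
    try:
        from ethnidata import SyntheticDataEngine, SyntheticConfig
        HAS_SYNTHETIC = True
    except ImportError:
        HAS_SYNTHETIC = False

    from ethnidata import EthniData  # always exported

    ed = EthniData()
    result = ed.predict_nationality("Ahmet")  # basic prediction, per the docstring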
@@ -0,0 +1,143 @@
+ """
+ Database downloader for EthniData
+ Downloads the full v3.0.0 database (5.8M records) on first use
+ """
+
+ import urllib.request
+ from pathlib import Path
+ from typing import Optional
+
+ # Database versions and URLs
+ DATABASES = {
+     'v2.0.0': {
+         'url': 'https://github.com/teyfikoz/ethnidata/releases/download/v2.0.0/ethnidata_v2.db',
+         'size': '75 MB',
+         'records': '415K',
+         'filename': 'ethnidata.db'
+     },
+     'v3.0.0': {
+         'url': 'https://github.com/teyfikoz/ethnidata/releases/download/v3.0.0/ethnidata_v3.db',
+         'size': '1.1 GB',
+         'records': '5.8M',
+         'filename': 'ethnidata_v3.db'
+     }
+ }
+
+ DEFAULT_VERSION = 'v2.0.0'  # Included in package
+ FULL_VERSION = 'v3.0.0'  # Downloaded on demand
+
+ class DatabaseDownloader:
+     """Handles database downloads"""
+
+     def __init__(self, package_dir: Path):
+         self.package_dir = package_dir
+         self.db_path = package_dir / "ethnidata.db"
+         self.v3_path = package_dir / "ethnidata_v3.db"
+
+     def check_database(self, version: str = DEFAULT_VERSION) -> bool:
+         """Check if database exists"""
+         if version == 'v2.0.0':
+             return self.db_path.exists()
+         elif version == 'v3.0.0':
+             return self.v3_path.exists()
+         return False
+
+     def download_database(self, version: str = FULL_VERSION, force: bool = False) -> str:
+         """
+         Download the database if it does not already exist
+
+         Args:
+             version: Database version to download ('v2.0.0' or 'v3.0.0')
+             force: Force download even if exists
+
+         Returns:
+             Path to database file
+         """
+         if version not in DATABASES:
+             raise ValueError(f"Unknown version: {version}. Available: {list(DATABASES.keys())}")
+
+         db_info = DATABASES[version]
+         target_path = self.v3_path if version == 'v3.0.0' else self.db_path
+
+         # Check if already exists
+         if target_path.exists() and not force:
+             print(f"✅ Database {version} already exists ({db_info['records']} records)")
+             return str(target_path)
+
+         print(f"\n📥 Downloading EthniData {version} database...")
+         print(f"   Records: {db_info['records']}")
+         print(f"   Size: {db_info['size']}")
+         print("   This may take a few minutes...")
+
+         try:
+             # Download with progress
+             def report_progress(block_num, block_size, total_size):
+                 # total_size can be -1 if the server sends no Content-Length
+                 if total_size <= 0:
+                     return
+                 downloaded = block_num * block_size
+                 percent = min(downloaded * 100 / total_size, 100)
+                 print(f"\r   Progress: {percent:.1f}%", end='', flush=True)
+
+             urllib.request.urlretrieve(
+                 db_info['url'],
+                 target_path,
+                 reporthook=report_progress
+             )
+             print(f"\n✅ Download complete: {target_path}")
+             return str(target_path)
+
+         except Exception as e:
+             print(f"\n❌ Download failed: {e}")
+             print("\n💡 You can manually download from:")
+             print(f"   {db_info['url']}")
+             print(f"   And save it as: {target_path}")
+             raise
+
+     def get_database_path(self, prefer_v3: bool = False) -> str:
+         """
+         Get database path, downloading if necessary
+
+         Args:
+             prefer_v3: If True, use v3.0.0 (5.8M records) instead of v2.0.0 (415K records)
+
+         Returns:
+             Path to database file
+         """
+         if prefer_v3:
+             # Try to use v3, download if not exists
+             if not self.v3_path.exists():
+                 print("\n🚀 EthniData v3.0.0 offers 14x more data (5.8M vs 415K records)!")
+                 print(f"   Would you like to download it? ({DATABASES['v3.0.0']['size']})")
+                 response = input("   Download v3.0.0? [y/N]: ").strip().lower()
+
+                 if response in ['y', 'yes']:
+                     return self.download_database('v3.0.0')
+                 else:
+                     print(f"   Using v2.0.0 ({DATABASES['v2.0.0']['records']} records)")
+                     return str(self.db_path)
+             return str(self.v3_path)
+         else:
+             # Use v2 (included in package)
+             if not self.db_path.exists():
+                 raise FileNotFoundError(
+                     f"Database not found at {self.db_path}. "
+                     f"Please reinstall: pip install --upgrade --force-reinstall ethnidata"
+                 )
+             return str(self.db_path)
+
+
+ def download_v3_database(package_dir: Optional[Path] = None) -> str:
+     """
+     Convenience function to download v3.0.0 database
+
+     Args:
+         package_dir: Package directory (auto-detected if None)
+
+     Returns:
+         Path to downloaded database
+     """
+     if package_dir is None:
+         package_dir = Path(__file__).parent
+
+     downloader = DatabaseDownloader(package_dir)
+     return downloader.download_database('v3.0.0')
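For reference, a minimal usage sketch of the downloader above. The module name download_db is a guess, since this hunk's file header is missing from the diff; adjust the import path to the actual module:

    from pathlib import Path

    # Hypothetical module name -- the diff does not show this file's name
    from ethnidata.download_db import DatabaseDownloader, download_v3_database

    # Use the bundled v2.0.0 database; pass prefer_v3=True to be prompted
    # for the 1.1 GB v3.0.0 download instead
    downloader = DatabaseDownloader(Path("ethnidata"))
    db_path = downloader.get_database_path(prefer_v3=False)

    # Or fetch v3.0.0 directly (package_dir is auto-detected when omitted)
    # db_path = download_v3_database()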
ethnidata/ethnidata.db ADDED
Binary file
ethnidata/explainability.py ADDED
@@ -0,0 +1,233 @@
+ """
+ EthniData Explainability Layer (v2.0)
+
+ Provides detailed explanations for predictions, confidence breakdowns,
+ and transparency into the decision-making process.
+
+ License: MIT
+ """
+
+ from typing import Dict, List, Any, Optional
+ import math
+
+
+ class ExplainabilityEngine:
+     """
+     Generates human-readable explanations for EthniData predictions.
+     """
+
+     @staticmethod
+     def calculate_ambiguity_score(probabilities: List[float]) -> float:
+         """
+         Calculate ambiguity score using Shannon entropy.
+
+         Args:
+             probabilities: List of probabilities (must sum to ~1.0)
+
+         Returns:
+             Ambiguity score in [0, 1]
+             - 0 = very certain (one clear winner)
+             - 1 = highly ambiguous (uniform distribution)
+         """
+         if not probabilities or len(probabilities) == 1:
+             return 0.0
+
+         # Normalize probabilities
+         total = sum(probabilities)
+         if total <= 0:
+             return 1.0
+
+         probs = [p / total for p in probabilities]
+
+         # Calculate Shannon entropy
+         entropy = 0.0
+         for p in probs:
+             if p > 0:
+                 entropy -= p * math.log2(p)
+
+         # Normalize by max entropy (log2(n))
+         max_entropy = math.log2(len(probs))
+         if max_entropy == 0:
+             return 0.0
+
+         ambiguity = entropy / max_entropy
+         return min(1.0, max(0.0, ambiguity))
+
+     @staticmethod
+     def get_confidence_level(confidence: float, ambiguity: float) -> str:
+         """
+         Convert numeric confidence to human-readable level.
+
+         Args:
+             confidence: Confidence score (0-1)
+             ambiguity: Ambiguity score (0-1)
+
+         Returns:
+             "High", "Medium", or "Low"
+         """
+         # Adjust confidence by ambiguity penalty
+         adjusted = confidence * (1.0 - ambiguity * 0.5)
+
+         if adjusted >= 0.7:
+             return "High"
+         elif adjusted >= 0.4:
+             return "Medium"
+         else:
+             return "Low"
+
+     @staticmethod
+     def decompose_confidence(
+         frequency_strength: float,
+         cross_source_agreement: float = 0.0,
+         name_uniqueness: float = 0.0,
+         morphology_signal: float = 0.0,
+         entropy_penalty: float = 0.0
+     ) -> Dict[str, float]:
+         """
+         Break down confidence score into interpretable components.
+
+         Args:
+             frequency_strength: Base frequency-based confidence (0-1)
+             cross_source_agreement: Agreement across multiple sources (0-1)
+             name_uniqueness: How unique/rare the name is (0-1)
+             morphology_signal: Strength of morphological patterns (0-1)
+             entropy_penalty: Reduction due to high ambiguity (0-1)
+
+         Returns:
+             Dictionary with normalized breakdown components
+         """
+         components = {
+             "frequency_strength": frequency_strength,
+             "cross_source_agreement": cross_source_agreement,
+             "name_uniqueness": name_uniqueness,
+             "morphology_signal": morphology_signal,
+             "entropy_penalty": -entropy_penalty  # Negative contribution
+         }
+
+         # Normalize so positive components sum to ~1.0
+         total_positive = sum(v for v in components.values() if v > 0)
+         if total_positive > 0:
+             for key in components:
+                 if components[key] > 0:
+                     components[key] = components[key] / total_positive
+
+         return components
+
+     @staticmethod
+     def generate_explanation(
+         name: str,
+         prediction: Dict[str, Any],
+         confidence_breakdown: Dict[str, float],
+         ambiguity_score: float,
+         morphology_patterns: Optional[List[str]] = None,
+         sources: Optional[List[str]] = None
+     ) -> Dict[str, Any]:
+         """
+         Generate a comprehensive explanation for a prediction.
+
+         Args:
+             name: The input name
+             prediction: The prediction result
+             confidence_breakdown: Decomposed confidence components
+             ambiguity_score: Ambiguity score (0-1)
+             morphology_patterns: Detected morphological patterns
+             sources: Data sources used
+
+         Returns:
+             Complete explanation structure
+         """
+         top_country = prediction.get('country_name', 'Unknown')
+         confidence = prediction.get('confidence', 0.0)
+
+         # Generate textual explanations
+         why = []
+
+         # Frequency-based reasoning
+         freq_strength = confidence_breakdown.get('frequency_strength', 0.0)
+         if freq_strength > 0.5:
+             why.append(f"High frequency in {top_country} name databases")
+         elif freq_strength > 0.3:
+             why.append(f"Moderate presence in {top_country} records")
+         else:
+             why.append("Limited data available for this name")
+
+         # Cross-source agreement
+         cross_source = confidence_breakdown.get('cross_source_agreement', 0.0)
+         if cross_source > 0.2:
+             n_sources = len(sources) if sources else 2
+             why.append(f"Cross-source agreement across {n_sources} datasets")
+
+         # Morphology patterns
+         if morphology_patterns and len(morphology_patterns) > 0:
+             patterns_str = ", ".join(morphology_patterns)
+             why.append(f"Strong morphological patterns detected: {patterns_str}")
+
+         # Name uniqueness
+         uniqueness = confidence_breakdown.get('name_uniqueness', 0.0)
+         if uniqueness > 0.5:
+             why.append("Name is relatively unique/rare globally")
+         elif uniqueness < 0.2:
+             why.append("Name is very common across multiple regions")
+
+         # Ambiguity warning
+         if ambiguity_score > 0.6:
+             why.append(f"⚠️ High ambiguity: name is common in {len(prediction.get('top_countries', []))} countries")
+
+         confidence_level = ExplainabilityEngine.get_confidence_level(confidence, ambiguity_score)
+
+         return {
+             "prediction": {
+                 "name": name,
+                 "country": prediction.get('country'),
+                 "country_name": top_country,
+                 "confidence": round(confidence, 4),
+                 "top_countries": prediction.get('top_countries', [])
+             },
+             "explanation": {
+                 "why": why,
+                 "confidence_breakdown": {
+                     k: round(v, 4) for k, v in confidence_breakdown.items()
+                 },
+                 "ambiguity_score": round(ambiguity_score, 4),
+                 "confidence_level": confidence_level
+             }
+         }
+
+     @staticmethod
+     def explain_batch(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Generate explanations for a batch of predictions.
+
+         Args:
+             predictions: List of prediction results
+
+         Returns:
+             List of explanation structures
+         """
+         explanations = []
+
+         for pred in predictions:
+             # Extract probabilities for ambiguity calculation
+             top_countries = pred.get('top_countries', [])
+             probs = [c.get('probability', 0.0) for c in top_countries]
+
+             ambiguity = ExplainabilityEngine.calculate_ambiguity_score(probs)
+
+             # Simple confidence breakdown (can be enhanced with actual data)
+             freq_strength = pred.get('confidence', 0.0)
+             breakdown = ExplainabilityEngine.decompose_confidence(
+                 frequency_strength=freq_strength,
+                 cross_source_agreement=0.2 if len(top_countries) > 1 else 0.0,
+                 entropy_penalty=ambiguity * 0.3
+             )
+
+             explanation = ExplainabilityEngine.generate_explanation(
+                 name=pred.get('name', ''),
+                 prediction=pred,
+                 confidence_breakdown=breakdown,
+                 ambiguity_score=ambiguity
+             )
+
+             explanations.append(explanation)
+
+         return explanations
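A quick sanity check of the entropy math above, using only the static methods shown in this file (values rounded; the import path follows the `from .explainability import ExplainabilityEngine` line in `__init__.py`):

    from ethnidata.explainability import ExplainabilityEngine

    # One dominant probability -> low normalized entropy, low ambiguity
    print(ExplainabilityEngine.calculate_ambiguity_score([0.9, 0.05, 0.05]))  # ~0.36

    # Uniform distribution -> maximal ambiguity (entropy / log2(n) = 1.0)
    print(ExplainabilityEngine.calculate_ambiguity_score([1/3, 1/3, 1/3]))  # 1.0

    # Labels apply the penalty: adjusted = confidence * (1 - ambiguity * 0.5)
    print(ExplainabilityEngine.get_confidence_level(0.8, 0.2))  # 'High' (adjusted 0.72)
    print(ExplainabilityEngine.get_confidence_level(0.8, 0.9))  # 'Medium' (adjusted 0.44)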