ethnidata 4.0.1 (ethnidata-4.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ethnidata has been flagged as potentially problematic.

ethnidata/__init__.py ADDED
@@ -0,0 +1,96 @@
+ """
+ EthniData v4.0.1 - STATE-OF-THE-ART NAME ANALYSIS ENGINE
+ Predict nationality, ethnicity, gender, region, language AND religion!
+
+ 🔥 NEW in v4.0.0 - EXPLAINABLE AI & TRANSPARENCY:
+ - 🧠 **Explainability Layer** - Understand WHY predictions are made
+ - 📊 **Ambiguity Scoring** - Shannon entropy for uncertainty quantification (0-1)
+ - 🔍 **Morphology Detection** - Rule-based pattern recognition (9 cultural groups)
+ - 📈 **Confidence Breakdown** - Interpretable confidence components
+ - 🎯 **Synthetic Data Engine** - Privacy-safe test data generation
+ - 📚 **Academic-Grade** - Transparent, reproducible, legally compliant
+
+ Database:
+ - 📊 **5.9M+ records** (14x increase from v2.0.0)
+ - 🌍 **238 countries** - complete global coverage
+ - 🗣️ **72 languages**
+ - 🕌 **6 MAJOR WORLD RELIGIONS**:
+   - Christianity: 3.9M+ records (65.2%)
+   - Buddhism: 1.3M+ records (22.1%)
+   - Islam: 504K+ records (8.5%)
+   - Judaism: 121K+ records (2.0%)
+   - Hinduism: 90K+ records (1.5%)
+   - Sikhism: 24K+ records (0.4%)
+
+ Features:
+ - ✅ Nationality prediction (238 countries)
+ - ✅ Religion prediction (6 major world religions)
+ - ✅ Gender prediction
+ - ✅ Region prediction (5 continents)
+ - ✅ Language prediction (72 languages)
+ - ✅ Ethnicity prediction
+ - ✅ Full name analysis
+ - 🆕 Explainable AI (explain=True)
+ - 🆕 Morphology pattern detection
+ - 🆕 Ambiguity scoring (Shannon entropy)
+ - 🆕 Confidence breakdown
+ - 🆕 Synthetic data generation
+
+ Usage:
+     from ethnidata import EthniData
+
+     ed = EthniData()
+
+     # Basic prediction
+     result = ed.predict_nationality("Ahmet")
+
+     # v4.0.0: With explainability
+     result = ed.predict_nationality("Yılmaz", name_type="last", explain=True)
+     print(result['ambiguity_score'])     # Shannon entropy
+     print(result['confidence_level'])    # 'High', 'Medium', 'Low'
+     print(result['morphology_signal'])   # Detected patterns
+     print(result['explanation']['why'])  # Human-readable reasons
+
+     # Full name with explanation
+     result = ed.predict_full_name("Mehmet", "Yılmaz", explain=True)
+
+     # Morphology-only analysis
+     from ethnidata.morphology import MorphologyEngine
+     signal = MorphologyEngine.get_morphological_signal("O'Connor", "last")
+     # Returns: {'primary_pattern': "o'", 'pattern_type': 'gaelic', ...}
+
+     # Synthetic data generation
+     from ethnidata.synthetic import SyntheticDataEngine, SyntheticConfig
+     engine = SyntheticDataEngine(freq_provider)
+     config = SyntheticConfig(size=10000, country="TUR")
+     records = engine.generate(config)
+ """
+
+ __version__ = "4.0.1"
+ __author__ = "Teyfik Oz"
+ __license__ = "MIT"
+
+ from .predictor import EthniData
+ from .explainability import ExplainabilityEngine
+ from .morphology import MorphologyEngine, NameFeatureExtractor
+
+ # Synthetic module imports (optional, may not have freq_provider)
+ try:
+     from .synthetic import SyntheticDataEngine, SyntheticConfig, SyntheticRecord, FrequencyProvider
+     __all__ = [
+         "EthniData",
+         "ExplainabilityEngine",
+         "MorphologyEngine",
+         "NameFeatureExtractor",
+         "SyntheticDataEngine",
+         "SyntheticConfig",
+         "SyntheticRecord",
+         "FrequencyProvider"
+     ]
+ except ImportError:
+     __all__ = [
+         "EthniData",
+         "ExplainabilityEngine",
+         "MorphologyEngine",
+         "NameFeatureExtractor"
+     ]
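The try/except above means the synthetic-data names are only re-exported when the optional ethnidata.synthetic module imports cleanly. A minimal consumer-side sketch of the same pattern (the HAS_SYNTHETIC flag is illustrative, not part of the package):

    # Probe for the optional synthetic module the same way the package does
    try:
        from ethnidata import SyntheticDataEngine, SyntheticConfig
        HAS_SYNTHETIC = True
    except ImportError:
        HAS_SYNTHETIC = False

    from ethnidata import EthniData  # always exported

    ed = EthniData()
    result = ed.predict_nationality("Ahmet")  # basic prediction, per the docstring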
@@ -0,0 +1,143 @@
+ """
+ Database downloader for EthniData
+ Downloads the full v3.0.0 database (5.8M records) on first use
+ """
+
+ import urllib.request
+ from pathlib import Path
+ from typing import Optional
+
+ # Database versions and URLs
+ DATABASES = {
+     'v2.0.0': {
+         'url': 'https://github.com/teyfikoz/ethnidata/releases/download/v2.0.0/ethnidata_v2.db',
+         'size': '75 MB',
+         'records': '415K',
+         'filename': 'ethnidata.db'
+     },
+     'v3.0.0': {
+         'url': 'https://github.com/teyfikoz/ethnidata/releases/download/v3.0.0/ethnidata_v3.db',
+         'size': '1.1 GB',
+         'records': '5.8M',
+         'filename': 'ethnidata_v3.db'
+     }
+ }
+
+ DEFAULT_VERSION = 'v2.0.0'  # Included in package
+ FULL_VERSION = 'v3.0.0'  # Downloaded on demand
+
+ class DatabaseDownloader:
+     """Handles database downloads"""
+
+     def __init__(self, package_dir: Path):
+         self.package_dir = package_dir
+         self.db_path = package_dir / "ethnidata.db"
+         self.v3_path = package_dir / "ethnidata_v3.db"
+
+     def check_database(self, version: str = DEFAULT_VERSION) -> bool:
+         """Check if database exists"""
+         if version == 'v2.0.0':
+             return self.db_path.exists()
+         elif version == 'v3.0.0':
+             return self.v3_path.exists()
+         return False
+
+     def download_database(self, version: str = FULL_VERSION, force: bool = False) -> str:
+         """
+         Download the database if it does not already exist
+
+         Args:
+             version: Database version to download ('v2.0.0' or 'v3.0.0')
+             force: Force download even if exists
+
+         Returns:
+             Path to database file
+         """
+         if version not in DATABASES:
+             raise ValueError(f"Unknown version: {version}. Available: {list(DATABASES.keys())}")
+
+         db_info = DATABASES[version]
+         target_path = self.v3_path if version == 'v3.0.0' else self.db_path
+
+         # Check if already exists
+         if target_path.exists() and not force:
+             print(f"✅ Database {version} already exists ({db_info['records']} records)")
+             return str(target_path)
+
+         print(f"\n📥 Downloading EthniData {version} database...")
+         print(f"   Records: {db_info['records']}")
+         print(f"   Size: {db_info['size']}")
+         print("   This may take a few minutes...")
+
+         try:
+             # Download with progress
+             def report_progress(block_num, block_size, total_size):
+                 # total_size can be -1 if the server sends no Content-Length
+                 if total_size <= 0:
+                     return
+                 downloaded = block_num * block_size
+                 percent = min(downloaded * 100 / total_size, 100)
+                 print(f"\r   Progress: {percent:.1f}%", end='', flush=True)
+
+             urllib.request.urlretrieve(
+                 db_info['url'],
+                 target_path,
+                 reporthook=report_progress
+             )
+             print(f"\n✅ Download complete: {target_path}")
+             return str(target_path)
+
+         except Exception as e:
+             print(f"\n❌ Download failed: {e}")
+             print("\n💡 You can manually download from:")
+             print(f"   {db_info['url']}")
+             print(f"   And save it as: {target_path}")
+             raise
+
+     def get_database_path(self, prefer_v3: bool = False) -> str:
+         """
+         Get database path, downloading if necessary
+
+         Args:
+             prefer_v3: If True, use v3.0.0 (5.8M records) instead of v2.0.0 (415K records)
+
+         Returns:
+             Path to database file
+         """
+         if prefer_v3:
+             # Try to use v3, download if not exists
+             if not self.v3_path.exists():
+                 print("\n🚀 EthniData v3.0.0 offers 14x more data (5.8M vs 415K records)!")
+                 print(f"   Would you like to download it? ({DATABASES['v3.0.0']['size']})")
+                 response = input("   Download v3.0.0? [y/N]: ").strip().lower()
+
+                 if response in ['y', 'yes']:
+                     return self.download_database('v3.0.0')
+                 else:
+                     print(f"   Using v2.0.0 ({DATABASES['v2.0.0']['records']} records)")
+                     return str(self.db_path)
+             return str(self.v3_path)
+         else:
+             # Use v2 (included in package)
+             if not self.db_path.exists():
+                 raise FileNotFoundError(
+                     f"Database not found at {self.db_path}. "
+                     f"Please reinstall: pip install --upgrade --force-reinstall ethnidata"
+                 )
+             return str(self.db_path)
+
+
+ def download_v3_database(package_dir: Optional[Path] = None) -> str:
+     """
+     Convenience function to download v3.0.0 database
+
+     Args:
+         package_dir: Package directory (auto-detected if None)
+
+     Returns:
+         Path to downloaded database
+     """
+     if package_dir is None:
+         package_dir = Path(__file__).parent
+
+     downloader = DatabaseDownloader(package_dir)
+     return downloader.download_database('v3.0.0')
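For reference, a minimal usage sketch of the downloader above. The module name download_db is a guess, since this hunk's file header is missing from the diff; adjust the import path to the actual module:

    from pathlib import Path

    # Hypothetical module name -- the diff does not show this file's name
    from ethnidata.download_db import DatabaseDownloader, download_v3_database

    # Use the bundled v2.0.0 database; pass prefer_v3=True to be prompted
    # for the 1.1 GB v3.0.0 download instead
    downloader = DatabaseDownloader(Path("ethnidata"))
    db_path = downloader.get_database_path(prefer_v3=False)

    # Or fetch v3.0.0 directly (package_dir is auto-detected when omitted)
    # db_path = download_v3_database()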
ethnidata/ethnidata.db ADDED
Binary file
ethnidata/explainability.py ADDED
@@ -0,0 +1,233 @@
+ """
+ EthniData Explainability Layer (v2.0)
+
+ Provides detailed explanations for predictions, confidence breakdowns,
+ and transparency into the decision-making process.
+
+ License: MIT
+ """
+
+ from typing import Dict, List, Any, Optional
+ import math
+
+
+ class ExplainabilityEngine:
+     """
+     Generates human-readable explanations for EthniData predictions.
+     """
+
+     @staticmethod
+     def calculate_ambiguity_score(probabilities: List[float]) -> float:
+         """
+         Calculate ambiguity score using Shannon entropy.
+
+         Args:
+             probabilities: List of probabilities (must sum to ~1.0)
+
+         Returns:
+             Ambiguity score in [0, 1]
+             - 0 = very certain (one clear winner)
+             - 1 = highly ambiguous (uniform distribution)
+         """
+         if not probabilities or len(probabilities) == 1:
+             return 0.0
+
+         # Normalize probabilities
+         total = sum(probabilities)
+         if total <= 0:
+             return 1.0
+
+         probs = [p / total for p in probabilities]
+
+         # Calculate Shannon entropy
+         entropy = 0.0
+         for p in probs:
+             if p > 0:
+                 entropy -= p * math.log2(p)
+
+         # Normalize by max entropy (log2(n))
+         max_entropy = math.log2(len(probs))
+         if max_entropy == 0:
+             return 0.0
+
+         ambiguity = entropy / max_entropy
+         return min(1.0, max(0.0, ambiguity))
+
+     @staticmethod
+     def get_confidence_level(confidence: float, ambiguity: float) -> str:
+         """
+         Convert numeric confidence to human-readable level.
+
+         Args:
+             confidence: Confidence score (0-1)
+             ambiguity: Ambiguity score (0-1)
+
+         Returns:
+             "High", "Medium", or "Low"
+         """
+         # Adjust confidence by ambiguity penalty
+         adjusted = confidence * (1.0 - ambiguity * 0.5)
+
+         if adjusted >= 0.7:
+             return "High"
+         elif adjusted >= 0.4:
+             return "Medium"
+         else:
+             return "Low"
+
+     @staticmethod
+     def decompose_confidence(
+         frequency_strength: float,
+         cross_source_agreement: float = 0.0,
+         name_uniqueness: float = 0.0,
+         morphology_signal: float = 0.0,
+         entropy_penalty: float = 0.0
+     ) -> Dict[str, float]:
+         """
+         Break down confidence score into interpretable components.
+
+         Args:
+             frequency_strength: Base frequency-based confidence (0-1)
+             cross_source_agreement: Agreement across multiple sources (0-1)
+             name_uniqueness: How unique/rare the name is (0-1)
+             morphology_signal: Strength of morphological patterns (0-1)
+             entropy_penalty: Reduction due to high ambiguity (0-1)
+
+         Returns:
+             Dictionary with normalized breakdown components
+         """
+         components = {
+             "frequency_strength": frequency_strength,
+             "cross_source_agreement": cross_source_agreement,
+             "name_uniqueness": name_uniqueness,
+             "morphology_signal": morphology_signal,
+             "entropy_penalty": -entropy_penalty  # Negative contribution
+         }
+
+         # Normalize so positive components sum to ~1.0
+         total_positive = sum(v for v in components.values() if v > 0)
+         if total_positive > 0:
+             for key in components:
+                 if components[key] > 0:
+                     components[key] = components[key] / total_positive
+
+         return components
+
+     @staticmethod
+     def generate_explanation(
+         name: str,
+         prediction: Dict[str, Any],
+         confidence_breakdown: Dict[str, float],
+         ambiguity_score: float,
+         morphology_patterns: Optional[List[str]] = None,
+         sources: Optional[List[str]] = None
+     ) -> Dict[str, Any]:
+         """
+         Generate a comprehensive explanation for a prediction.
+
+         Args:
+             name: The input name
+             prediction: The prediction result
+             confidence_breakdown: Decomposed confidence components
+             ambiguity_score: Ambiguity score (0-1)
+             morphology_patterns: Detected morphological patterns
+             sources: Data sources used
+
+         Returns:
+             Complete explanation structure
+         """
+         top_country = prediction.get('country_name', 'Unknown')
+         confidence = prediction.get('confidence', 0.0)
+
+         # Generate textual explanations
+         why = []
+
+         # Frequency-based reasoning
+         freq_strength = confidence_breakdown.get('frequency_strength', 0.0)
+         if freq_strength > 0.5:
+             why.append(f"High frequency in {top_country} name databases")
+         elif freq_strength > 0.3:
+             why.append(f"Moderate presence in {top_country} records")
+         else:
+             why.append("Limited data available for this name")
+
+         # Cross-source agreement
+         cross_source = confidence_breakdown.get('cross_source_agreement', 0.0)
+         if cross_source > 0.2:
+             n_sources = len(sources) if sources else 2
+             why.append(f"Cross-source agreement across {n_sources} datasets")
+
+         # Morphology patterns
+         if morphology_patterns and len(morphology_patterns) > 0:
+             patterns_str = ", ".join(morphology_patterns)
+             why.append(f"Strong morphological patterns detected: {patterns_str}")
+
+         # Name uniqueness
+         uniqueness = confidence_breakdown.get('name_uniqueness', 0.0)
+         if uniqueness > 0.5:
+             why.append("Name is relatively unique/rare globally")
+         elif uniqueness < 0.2:
+             why.append("Name is very common across multiple regions")
+
+         # Ambiguity warning
+         if ambiguity_score > 0.6:
+             why.append(f"⚠️ High ambiguity: name is common in {len(prediction.get('top_countries', []))} countries")
+
+         confidence_level = ExplainabilityEngine.get_confidence_level(confidence, ambiguity_score)
+
+         return {
+             "prediction": {
+                 "name": name,
+                 "country": prediction.get('country'),
+                 "country_name": top_country,
+                 "confidence": round(confidence, 4),
+                 "top_countries": prediction.get('top_countries', [])
+             },
+             "explanation": {
+                 "why": why,
+                 "confidence_breakdown": {
+                     k: round(v, 4) for k, v in confidence_breakdown.items()
+                 },
+                 "ambiguity_score": round(ambiguity_score, 4),
+                 "confidence_level": confidence_level
+             }
+         }
+
+     @staticmethod
+     def explain_batch(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Generate explanations for a batch of predictions.
+
+         Args:
+             predictions: List of prediction results
+
+         Returns:
+             List of explanation structures
+         """
+         explanations = []
+
+         for pred in predictions:
+             # Extract probabilities for ambiguity calculation
+             top_countries = pred.get('top_countries', [])
+             probs = [c.get('probability', 0.0) for c in top_countries]
+
+             ambiguity = ExplainabilityEngine.calculate_ambiguity_score(probs)
+
+             # Simple confidence breakdown (can be enhanced with actual data)
+             freq_strength = pred.get('confidence', 0.0)
+             breakdown = ExplainabilityEngine.decompose_confidence(
+                 frequency_strength=freq_strength,
+                 cross_source_agreement=0.2 if len(top_countries) > 1 else 0.0,
+                 entropy_penalty=ambiguity * 0.3
+             )
+
+             explanation = ExplainabilityEngine.generate_explanation(
+                 name=pred.get('name', ''),
+                 prediction=pred,
+                 confidence_breakdown=breakdown,
+                 ambiguity_score=ambiguity
+             )
+
+             explanations.append(explanation)
+
+         return explanations
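A quick sanity check of the entropy math above, using only the static methods shown in this file (values rounded; the import path follows the `from .explainability import ExplainabilityEngine` line in `__init__.py`):

    from ethnidata.explainability import ExplainabilityEngine

    # One dominant probability -> low normalized entropy, low ambiguity
    print(ExplainabilityEngine.calculate_ambiguity_score([0.9, 0.05, 0.05]))  # ~0.36

    # Uniform distribution -> maximal ambiguity (entropy / log2(n) = 1.0)
    print(ExplainabilityEngine.calculate_ambiguity_score([1/3, 1/3, 1/3]))  # 1.0

    # Labels apply the penalty: adjusted = confidence * (1 - ambiguity * 0.5)
    print(ExplainabilityEngine.get_confidence_level(0.8, 0.2))  # 'High' (adjusted 0.72)
    print(ExplainabilityEngine.get_confidence_level(0.8, 0.9))  # 'Medium' (adjusted 0.44)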