ethnidata 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ethnidata might be problematic. Click here for more details.
- ethnidata/__init__.py +96 -0
- ethnidata/downloader.py +143 -0
- ethnidata/ethnidata.db +0 -0
- ethnidata/explainability.py +233 -0
- ethnidata/morphology.py +286 -0
- ethnidata/predictor.py +699 -0
- ethnidata/predictor_old.py +277 -0
- ethnidata/synthetic/__init__.py +23 -0
- ethnidata/synthetic/engine.py +253 -0
- ethnidata-4.0.1.dist-info/METADATA +534 -0
- ethnidata-4.0.1.dist-info/RECORD +14 -0
- ethnidata-4.0.1.dist-info/WHEEL +5 -0
- ethnidata-4.0.1.dist-info/licenses/LICENSE +21 -0
- ethnidata-4.0.1.dist-info/top_level.txt +1 -0
ethnidata/morphology.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Morphology & Linguistics Engine (v2.0)
|
|
3
|
+
|
|
4
|
+
Rule-based pattern detection for name morphology.
|
|
5
|
+
Detects cultural/linguistic patterns in names (suffixes, prefixes, etc.)
|
|
6
|
+
|
|
7
|
+
License: MIT
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
from typing import Any, Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MorphologyEngine:
    """
    Detects morphological patterns in names (rule-based, no ML required).

    Each entry in ``PATTERNS`` maps a cultural/linguistic group to its
    characteristic suffixes/prefixes, the regions where the pattern occurs,
    and the ISO alpha-3 codes of the countries it most often indicates.
    """

    # Suffix/Prefix patterns by cultural/linguistic group
    PATTERNS = {
        # Slavic patterns
        "slavic": {
            "suffixes": ["-ov", "-ova", "-ev", "-eva", "-ski", "-ska", "-sky", "-ic", "-vić", "-vich"],
            "prefixes": [],
            "regions": ["Eastern Europe", "Balkans", "Russia"],
            "countries": ["RUS", "UKR", "POL", "CZE", "SRB", "BIH"]
        },

        # Turkic patterns
        "turkic": {
            "suffixes": ["-oğlu", "-oglu", "-soy", "-gil", "-başı", "-can", "-han", "-bey"],
            "prefixes": [],
            "regions": ["Anatolia", "Balkans", "Central Asia"],
            "countries": ["TUR", "AZE", "KAZ", "UZB", "TKM"]
        },

        # Nordic patterns
        "nordic": {
            "suffixes": ["-son", "-sen", "-sson", "-dóttir", "-dottir", "-berg", "-ström", "-lund"],
            "prefixes": [],
            "regions": ["Scandinavia", "Iceland"],
            "countries": ["SWE", "NOR", "DNK", "ISL", "FIN"]
        },

        # Arabic patterns
        "arabic": {
            "suffixes": [],
            "prefixes": ["al-", "el-", "bin", "binti", "ibn", "abu", "abd", "abdul"],
            "regions": ["Middle East", "North Africa"],
            "countries": ["SAU", "EGY", "IRQ", "SYR", "JOR", "MAR", "DZA"]
        },

        # Gaelic/Celtic patterns
        "gaelic": {
            "suffixes": [],
            "prefixes": ["mc", "mac", "o'", "ó"],
            "regions": ["Ireland", "Scotland"],
            "countries": ["IRL", "GBR"]
        },

        # Iberian patterns
        "iberian": {
            "suffixes": ["-ez", "-es", "-az", "-is", "-os"],
            "prefixes": ["de", "del", "da", "dos"],
            "regions": ["Iberia", "Latin America"],
            "countries": ["ESP", "PRT", "MEX", "ARG", "BRA"]
        },

        # Germanic patterns
        "germanic": {
            "suffixes": ["-mann", "-schmidt", "-schneider", "-meyer", "-müller", "-bauer"],
            "prefixes": ["von", "van", "der", "de"],
            "regions": ["Central Europe", "Netherlands"],
            "countries": ["DEU", "AUT", "CHE", "NLD", "BEL"]
        },

        # East Asian patterns
        "east_asian": {
            "suffixes": [],
            "prefixes": [],  # East Asian names typically don't have affixes
            "regions": ["East Asia"],
            "countries": ["CHN", "JPN", "KOR", "TWN"],
            "note": "East Asian names follow different morphological rules"
        },

        # South Asian patterns
        "south_asian": {
            "suffixes": ["-kumar", "-singh", "-sharma", "-patel", "-reddy", "-rao"],
            "prefixes": ["sri", "shri"],
            "regions": ["South Asia"],
            "countries": ["IND", "PAK", "BGD", "LKA", "NPL"]
        }
    }

    @staticmethod
    def normalize_for_pattern(name: str) -> str:
        """Normalize name for pattern matching (lowercase, strip spaces and hyphens)."""
        return name.lower().strip().replace(" ", "").replace("-", "")

    @classmethod
    def detect_patterns(cls, name: str) -> List[Dict[str, Any]]:
        """
        Detect morphological patterns in a name.

        Args:
            name: Name to analyze (first or last)

        Returns:
            List of detected patterns with confidence scores. At most one
            suffix match and one prefix match is recorded per pattern group.
        """
        normalized = cls.normalize_for_pattern(name)
        detected = []

        for pattern_type, pattern_data in cls.PATTERNS.items():
            # Check suffixes
            for suffix in pattern_data.get("suffixes", []):
                # Hyphens are stripped from both pattern and name, so e.g.
                # "-ov" matches "Petrov" as well as "Petr-ov".
                suffix_norm = suffix.lower().replace("-", "")
                if normalized.endswith(suffix_norm):
                    detected.append({
                        "pattern_type": pattern_type,
                        "pattern": suffix,
                        "match_type": "suffix",
                        "regions": pattern_data.get("regions", []),
                        "likely_countries": pattern_data.get("countries", []),
                        "confidence": 0.8  # High confidence for suffix match
                    })
                    break  # One suffix per type

            # Check prefixes
            for prefix in pattern_data.get("prefixes", []):
                # BUGFIX: keep apostrophes in the prefix. normalize_for_pattern()
                # does not strip apostrophes from names, so stripping them from
                # the pattern (turning "o'" into "o") made EVERY name beginning
                # with "o" register as Gaelic. "O'Brien" still matches "o'"
                # directly; "Ó Briain" still matches "ó".
                prefix_norm = prefix.lower().replace("-", "")
                if normalized.startswith(prefix_norm):
                    detected.append({
                        "pattern_type": pattern_type,
                        "pattern": prefix,
                        "match_type": "prefix",
                        "regions": pattern_data.get("regions", []),
                        "likely_countries": pattern_data.get("countries", []),
                        "confidence": 0.75  # Slightly lower for prefix (more ambiguous)
                    })
                    break  # One prefix per type

        return detected

    @classmethod
    def get_morphological_signal(cls, name: str, name_type: str = "last") -> Optional[Dict[str, Any]]:
        """
        Get aggregated morphological signal for a name.

        Args:
            name: Name to analyze
            name_type: "first" or "last" (last names have stronger patterns;
                any value other than "last" scales confidence by 0.7)

        Returns:
            Morphological signal dict or None if no patterns detected
        """
        patterns = cls.detect_patterns(name)

        if not patterns:
            return None

        # Take the highest confidence pattern
        best_pattern = max(patterns, key=lambda p: p['confidence'])

        # Adjust confidence based on name type
        confidence_multiplier = 1.0 if name_type == "last" else 0.7

        return {
            "detected_patterns": [p['pattern'] for p in patterns],
            "pattern_types": list(set(p['pattern_type'] for p in patterns)),
            "likely_regions": list(set(r for p in patterns for r in p['regions'])),
            "likely_countries": list(set(c for p in patterns for c in p['likely_countries'])),
            "pattern_confidence": round(best_pattern['confidence'] * confidence_multiplier, 4),
            "primary_pattern": best_pattern['pattern'],
            "primary_type": best_pattern['pattern_type']
        }

    @classmethod
    def explain_pattern(cls, pattern_signal: Dict[str, Any]) -> str:
        """
        Generate human-readable explanation for detected pattern.

        Args:
            pattern_signal: Output from get_morphological_signal()
                (may be None/empty, in which case a default message is returned)

        Returns:
            Textual explanation with a qualitative confidence suffix
        """
        if not pattern_signal:
            return "No distinctive morphological patterns detected"

        pattern_type = pattern_signal.get('primary_type', 'unknown')
        pattern = pattern_signal.get('primary_pattern', '')
        regions = pattern_signal.get('likely_regions', [])
        confidence = pattern_signal.get('pattern_confidence', 0.0)

        # Only the first two regions are mentioned to keep the text short.
        region_str = ", ".join(regions[:2]) if regions else "unknown"

        explanations = {
            "slavic": f"Slavic name pattern ('{pattern}') common in {region_str}",
            "turkic": f"Turkic name pattern ('{pattern}') typical of {region_str}",
            "nordic": f"Nordic name pattern ('{pattern}') from {region_str}",
            "arabic": f"Arabic name pattern ('{pattern}') associated with {region_str}",
            "gaelic": f"Gaelic name pattern ('{pattern}') from {region_str}",
            "iberian": f"Iberian name pattern ('{pattern}') typical of {region_str}",
            "germanic": f"Germanic name pattern ('{pattern}') from {region_str}",
            "south_asian": f"South Asian name pattern ('{pattern}') from {region_str}",
        }

        # Unknown types (e.g. "east_asian") fall back to a generic message.
        explanation = explanations.get(pattern_type, f"Name pattern '{pattern}' detected")

        if confidence > 0.7:
            return f"{explanation} (high confidence)"
        elif confidence > 0.5:
            return f"{explanation} (moderate confidence)"
        else:
            return f"{explanation} (low confidence)"
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class NameFeatureExtractor:
    """
    Extract linguistic features from names for analysis.
    """

    @staticmethod
    def get_name_features(name: str) -> Dict[str, Any]:
        """
        Extract various linguistic features from a name.

        Args:
            name: Name to analyze

        Returns:
            Dictionary of features. Character-class checks operate on the
            lowercased, stripped form; the literal-character flags
            (hyphen/apostrophe/space) are tested against the raw input.
        """
        normalized = name.lower().strip()
        # Latin vowels including common accented forms; used for all
        # vowel checks below so the set is defined only once.
        vowel_chars = "aeiouáéíóúàèìòù"

        features = {
            "length": len(normalized),
            "has_hyphen": "-" in name,
            "has_apostrophe": "'" in name,
            "has_space": " " in name,
            # Guard against empty input before indexing.
            "starts_with_vowel": normalized[0] in vowel_chars if normalized else False,
            "ends_with_vowel": normalized[-1] in vowel_chars if normalized else False,
            # Three or more consecutive ASCII consonants.
            "consonant_clusters": bool(re.search(r'[bcdfghjklmnpqrstvwxyz]{3,}', normalized)),
            "double_letters": bool(re.search(r'(.)\1', normalized)),
            "numeric_chars": bool(re.search(r'\d', normalized)),
            # Anything outside the expected Latin letters, whitespace,
            # hyphen, or apostrophe counts as special.
            "special_chars": bool(re.search(r'[^a-záéíóúàèìòùäöüßñçåæø\s\-\']', normalized)),
        }

        # Vowel/consonant ratio (0.0 for empty names).
        vowels = sum(1 for c in normalized if c in vowel_chars)
        features["vowel_ratio"] = vowels / len(normalized) if normalized else 0.0

        return features

    @staticmethod
    def is_likely_romanized(name: str) -> bool:
        """
        Detect if name is likely a romanized/transliterated non-Latin name.

        Heuristic: counts independent indicators — long consonant clusters,
        low vowel ratio, and transliteration markers ("x" or the digraphs
        "zh"/"ch"/"sh") — and reports True when at least two fire.

        Args:
            name: Name to check

        Returns:
            True if likely romanized
        """
        features = NameFeatureExtractor.get_name_features(name)

        indicators = 0

        if features['consonant_clusters']:
            indicators += 1

        if features['vowel_ratio'] < 0.25:
            indicators += 1

        # Substring membership: entries may be digraphs, not single chars.
        if any(marker in name.lower() for marker in ['x', 'zh', 'ch', 'sh']):
            indicators += 1

        return indicators >= 2
|