ethnidata-4.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ethnidata might be problematic.

@@ -0,0 +1,286 @@
+ """
+ EthniData Morphology & Linguistics Engine (v2.0)
+
+ Rule-based pattern detection for name morphology.
+ Detects cultural/linguistic patterns in names (suffixes, prefixes, etc.).
+
+ License: MIT
+ """
+
+ from typing import Any, Dict, List, Optional
+ import re
+
+
+ class MorphologyEngine:
+     """
+     Detects morphological patterns in names (rule-based, no ML required).
+     """
+
+     # Suffix/Prefix patterns by cultural/linguistic group
+     PATTERNS = {
+         # Slavic patterns
+         "slavic": {
+             "suffixes": ["-ov", "-ova", "-ev", "-eva", "-ski", "-ska", "-sky", "-ic", "-vić", "-vich"],
+             "prefixes": [],
+             "regions": ["Eastern Europe", "Balkans", "Russia"],
+             "countries": ["RUS", "UKR", "POL", "CZE", "SRB", "BIH"]
+         },
+
+         # Turkic patterns
+         "turkic": {
+             "suffixes": ["-oğlu", "-oglu", "-soy", "-gil", "-başı", "-can", "-han", "-bey"],
+             "prefixes": [],
+             "regions": ["Anatolia", "Balkans", "Central Asia"],
+             "countries": ["TUR", "AZE", "KAZ", "UZB", "TKM"]
+         },
+
+         # Nordic patterns
+         "nordic": {
+             "suffixes": ["-son", "-sen", "-sson", "-dóttir", "-dottir", "-berg", "-ström", "-lund"],
+             "prefixes": [],
+             "regions": ["Scandinavia", "Iceland"],
+             "countries": ["SWE", "NOR", "DNK", "ISL", "FIN"]
+         },
+
+         # Arabic patterns
+         "arabic": {
+             "suffixes": [],
+             "prefixes": ["al-", "el-", "bin", "binti", "ibn", "abu", "abd", "abdul"],
+             "regions": ["Middle East", "North Africa"],
+             "countries": ["SAU", "EGY", "IRQ", "SYR", "JOR", "MAR", "DZA"]
+         },
+
+         # Gaelic/Celtic patterns
+         "gaelic": {
+             "suffixes": [],
+             "prefixes": ["mc", "mac", "o'", "ó"],
+             "regions": ["Ireland", "Scotland"],
+             "countries": ["IRL", "GBR"]
+         },
+
+         # Iberian patterns
+         "iberian": {
+             "suffixes": ["-ez", "-es", "-az", "-is", "-os"],
+             "prefixes": ["de", "del", "da", "dos"],
+             "regions": ["Iberia", "Latin America"],
+             "countries": ["ESP", "PRT", "MEX", "ARG", "BRA"]
+         },
+
+         # Germanic patterns
+         "germanic": {
+             "suffixes": ["-mann", "-schmidt", "-schneider", "-meyer", "-müller", "-bauer"],
+             "prefixes": ["von", "van", "der", "de"],
+             "regions": ["Central Europe", "Netherlands"],
+             "countries": ["DEU", "AUT", "CHE", "NLD", "BEL"]
+         },
+
+         # East Asian patterns
+         "east_asian": {
+             "suffixes": [],
+             "prefixes": [],  # East Asian names typically don't have affixes
+             "regions": ["East Asia"],
+             "countries": ["CHN", "JPN", "KOR", "TWN"],
+             "note": "East Asian names follow different morphological rules"
+         },
+
+         # South Asian patterns
+         "south_asian": {
+             "suffixes": ["-kumar", "-singh", "-sharma", "-patel", "-reddy", "-rao"],
+             "prefixes": ["sri", "shri"],
+             "regions": ["South Asia"],
+             "countries": ["IND", "PAK", "BGD", "LKA", "NPL"]
+         }
+     }
+
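Each PATTERNS entry pairs affix lists with human-readable regions and ISO 3166-1 alpha-3 country codes. As a hedged illustration only (not part of the packaged file, and assuming MorphologyEngine is in scope, e.g. imported from this module), reading one entry directly looks like this:

# Illustrative sketch only: inspect one PATTERNS entry directly.
nordic = MorphologyEngine.PATTERNS["nordic"]
print(nordic["suffixes"])   # ['-son', '-sen', '-sson', '-dóttir', ...]
print(nordic["countries"])  # ['SWE', 'NOR', 'DNK', 'ISL', 'FIN']
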
+     @staticmethod
+     def normalize_for_pattern(name: str) -> str:
+         """Normalize name for pattern matching (lowercase, remove spaces and hyphens)."""
+         return name.lower().strip().replace(" ", "").replace("-", "")
+
+     @classmethod
+     def detect_patterns(cls, name: str) -> List[Dict[str, Any]]:
+         """
+         Detect morphological patterns in a name.
+
+         Args:
+             name: Name to analyze (first or last)
+
+         Returns:
+             List of detected patterns with confidence scores
+         """
+         normalized = cls.normalize_for_pattern(name)
+         detected = []
+
+         for pattern_type, pattern_data in cls.PATTERNS.items():
+             # Check suffixes
+             for suffix in pattern_data.get("suffixes", []):
+                 suffix_norm = suffix.lower().replace("-", "")
+                 if normalized.endswith(suffix_norm):
+                     detected.append({
+                         "pattern_type": pattern_type,
+                         "pattern": suffix,
+                         "match_type": "suffix",
+                         "regions": pattern_data.get("regions", []),
+                         "likely_countries": pattern_data.get("countries", []),
+                         "confidence": 0.8  # High confidence for suffix match
+                     })
+                     break  # One suffix per type
+
+             # Check prefixes
+             for prefix in pattern_data.get("prefixes", []):
+                 prefix_norm = prefix.lower().replace("-", "").replace("'", "")
+                 if normalized.startswith(prefix_norm):
+                     detected.append({
+                         "pattern_type": pattern_type,
+                         "pattern": prefix,
+                         "match_type": "prefix",
+                         "regions": pattern_data.get("regions", []),
+                         "likely_countries": pattern_data.get("countries", []),
+                         "confidence": 0.75  # Slightly lower for prefix (more ambiguous)
+                     })
+                     break  # One prefix per type
+
+         return detected
+
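A minimal usage sketch for detect_patterns (illustrative only, not part of the packaged file); the output shape follows the dictionaries appended above:

# Illustrative sketch only: run suffix/prefix detection on a surname.
matches = MorphologyEngine.detect_patterns("Petrova")
for m in matches:
    print(m["pattern_type"], m["pattern"], m["match_type"], m["confidence"])
# Per the rules above, "Petrova" yields a single "slavic" suffix match on "-ova" with confidence 0.8.
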
+     @classmethod
+     def get_morphological_signal(cls, name: str, name_type: str = "last") -> Optional[Dict[str, Any]]:
+         """
+         Get aggregated morphological signal for a name.
+
+         Args:
+             name: Name to analyze
+             name_type: "first" or "last" (last names have stronger patterns)
+
+         Returns:
+             Morphological signal dict, or None if no patterns are detected
+         """
+         patterns = cls.detect_patterns(name)
+
+         if not patterns:
+             return None
+
+         # Take the highest-confidence pattern
+         best_pattern = max(patterns, key=lambda p: p['confidence'])
+
+         # Adjust confidence based on name type
+         confidence_multiplier = 1.0 if name_type == "last" else 0.7
+
+         return {
+             "detected_patterns": [p['pattern'] for p in patterns],
+             "pattern_types": list(set(p['pattern_type'] for p in patterns)),
+             "likely_regions": list(set(r for p in patterns for r in p['regions'])),
+             "likely_countries": list(set(c for p in patterns for c in p['likely_countries'])),
+             "pattern_confidence": round(best_pattern['confidence'] * confidence_multiplier, 4),
+             "primary_pattern": best_pattern['pattern'],
+             "primary_type": best_pattern['pattern_type']
+         }
+
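A hedged sketch of the aggregated signal (illustrative only); note the 0.7 multiplier applies only when name_type is "first":

# Illustrative sketch only: aggregate detections for a last name.
signal = MorphologyEngine.get_morphological_signal("Johansson", name_type="last")
if signal is not None:
    print(signal["primary_type"])        # 'nordic'
    print(signal["primary_pattern"])     # '-son' (first matching suffix in the list)
    print(signal["pattern_confidence"])  # 0.8 (suffix confidence * 1.0 for a last name)
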
+     @classmethod
+     def explain_pattern(cls, pattern_signal: Optional[Dict[str, Any]]) -> str:
+         """
+         Generate a human-readable explanation for a detected pattern.
+
+         Args:
+             pattern_signal: Output from get_morphological_signal()
+
+         Returns:
+             Textual explanation
+         """
+         if not pattern_signal:
+             return "No distinctive morphological patterns detected"
+
+         pattern_type = pattern_signal.get('primary_type', 'unknown')
+         pattern = pattern_signal.get('primary_pattern', '')
+         regions = pattern_signal.get('likely_regions', [])
+         confidence = pattern_signal.get('pattern_confidence', 0.0)
+
+         region_str = ", ".join(regions[:2]) if regions else "unknown"
+
+         explanations = {
+             "slavic": f"Slavic name pattern ('{pattern}') common in {region_str}",
+             "turkic": f"Turkic name pattern ('{pattern}') typical of {region_str}",
+             "nordic": f"Nordic name pattern ('{pattern}') from {region_str}",
+             "arabic": f"Arabic name pattern ('{pattern}') associated with {region_str}",
+             "gaelic": f"Gaelic name pattern ('{pattern}') from {region_str}",
+             "iberian": f"Iberian name pattern ('{pattern}') typical of {region_str}",
+             "germanic": f"Germanic name pattern ('{pattern}') from {region_str}",
+             "south_asian": f"South Asian name pattern ('{pattern}') from {region_str}",
+         }
+
+         explanation = explanations.get(pattern_type, f"Name pattern '{pattern}' detected")
+
+         if confidence > 0.7:
+             return f"{explanation} (high confidence)"
+         elif confidence > 0.5:
+             return f"{explanation} (moderate confidence)"
+         else:
+             return f"{explanation} (low confidence)"
+
+
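Chaining get_morphological_signal with explain_pattern gives an end-to-end sketch (illustrative only; region order may vary because the signal builds its region list from a set):

# Illustrative sketch only: signal plus human-readable explanation.
signal = MorphologyEngine.get_morphological_signal("O'Brien")
print(MorphologyEngine.explain_pattern(signal))
# e.g. "Gaelic name pattern ('o'') from Ireland, Scotland (high confidence)"
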
+ class NameFeatureExtractor:
+     """
+     Extract linguistic features from names for analysis.
+     """
+
+     @staticmethod
+     def get_name_features(name: str) -> Dict[str, Any]:
+         """
+         Extract various linguistic features from a name.
+
+         Args:
+             name: Name to analyze
+
+         Returns:
+             Dictionary of features
+         """
+         normalized = name.lower().strip()
+
+         features = {
+             "length": len(normalized),
+             "has_hyphen": "-" in name,
+             "has_apostrophe": "'" in name,
+             "has_space": " " in name,
+             "starts_with_vowel": normalized[0] in "aeiouáéíóúàèìòù" if normalized else False,
+             "ends_with_vowel": normalized[-1] in "aeiouáéíóúàèìòù" if normalized else False,
+             "consonant_clusters": bool(re.search(r'[bcdfghjklmnpqrstvwxyz]{3,}', normalized)),
+             "double_letters": bool(re.search(r'(.)\1', normalized)),
+             "numeric_chars": bool(re.search(r'\d', normalized)),
+             "special_chars": bool(re.search(r'[^a-záéíóúàèìòùäöüßñçåæø\s\-\']', normalized)),
+         }
+
+         # Vowel ratio (vowels over total characters)
+         vowels = sum(1 for c in normalized if c in "aeiouáéíóúàèìòù")
+         features["vowel_ratio"] = vowels / len(normalized) if normalized else 0.0
+
+         return features
+
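A short sketch of the resulting feature dictionary (illustrative only, not part of the packaged file, assuming NameFeatureExtractor is in scope):

# Illustrative sketch only: surface-level features for a hyphenated name.
features = NameFeatureExtractor.get_name_features("Jean-Pierre")
print(features["has_hyphen"])             # True
print(features["double_letters"])         # True ('rr')
print(round(features["vowel_ratio"], 2))  # 0.45 (5 vowels over 11 characters, hyphen included)
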
+     @staticmethod
+     def is_likely_romanized(name: str) -> bool:
+         """
+         Detect whether a name is likely a romanized/transliterated non-Latin name.
+
+         Args:
+             name: Name to check
+
+         Returns:
+             True if likely romanized
+         """
+         features = NameFeatureExtractor.get_name_features(name)
+
+         # Heuristic indicators of romanization:
+         # - Uncommon consonant clusters
+         # - Low vowel ratio (high consonant share)
+         # - Letter sequences common in transliteration (x, zh, ch, sh)
+
+         indicators = 0
+
+         if features['consonant_clusters']:
+             indicators += 1
+
+         if features['vowel_ratio'] < 0.25:
+             indicators += 1
+
+         if any(seq in name.lower() for seq in ['x', 'zh', 'ch', 'sh']):
+             indicators += 1
+
+         return indicators >= 2
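
Finally, a sketch of the romanization heuristic, which needs at least two of the three indicators to fire (illustrative only):

# Illustrative sketch only: two indicators fire for "Zhang", none for "Maria".
print(NameFeatureExtractor.is_likely_romanized("Zhang"))  # True ('zh' sequence + vowel ratio 0.2)
print(NameFeatureExtractor.is_likely_romanized("Maria"))  # False (no indicator fires)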