ethnidata 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ethnidata might be problematic; see the package registry's advisory page for more details.

ethnidata/predictor.py ADDED
@@ -0,0 +1,699 @@
1
+ """
2
+ EthniData Predictor v4.0.0 - State-of-the-Art Features
3
+ Yeni özellikler:
4
+ - Gender prediction (Cinsiyet tahmini)
5
+ - Region prediction (Bölge: Europe, Asia, Americas, Africa, Oceania)
6
+ - Language prediction (Yaygın dil tahmini)
7
+ - Explainability layer (Açıklanabilirlik)
8
+ - Ambiguity scoring (Belirsizlik skoru - Shannon entropy)
9
+ - Morphology pattern detection (Morfoljik kalıp tespiti)
10
+ - Confidence breakdown (Güven skoru ayrıştırması)
11
+ """
12
+
13
+ import sqlite3
14
+ from pathlib import Path
15
+ from typing import Dict, List, Optional, Literal
16
+ from unidecode import unidecode
17
+ import pycountry
18
+
19
+ # v4.0.0 new modules
20
+ from .explainability import ExplainabilityEngine
21
+ from .morphology import MorphologyEngine, NameFeatureExtractor
22
+
23
class EthniData:
    """Ethnicity, Nationality, Gender, Region and Language predictor."""

    def __init__(self, db_path: Optional[str] = None, use_v3: bool = False):
        """
        Create a predictor backed by a SQLite name database.

        Args:
            db_path: Path to the SQLite database. If None, a bundled database
                next to this module is used.
            use_v3: If True, attempts to use the v3.0.0 database (5.8M records);
                if it is not installed, falls back to the bundled v2.0.0
                database (415K records) with a hint on how to download v3.

        Raises:
            FileNotFoundError: if the resolved database file does not exist.
        """
        if db_path is None:
            package_dir = Path(__file__).parent
            # Default to the bundled v2 database; upgrade to v3 only when the
            # caller asked for it AND the optional file is actually present.
            db_path = package_dir / "ethnidata.db"
            if use_v3:
                v3_path = package_dir / "ethnidata_v3.db"
                if v3_path.exists():
                    db_path = v3_path
                else:
                    # v3 is downloaded separately; tell the user how to get it.
                    print(f"\n💡 EthniData v3.0.0 (5.8M records) is not installed.")
                    print(f"   To download: from ethnidata.downloader import download_v3_database")
                    print(f"                download_v3_database()")
                    print(f"\n   Using v2.0.0 (415K records) for now...")

        self.db_path = Path(db_path)

        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database not found: {self.db_path}\n"
                f"Please reinstall: pip install --upgrade --force-reinstall ethnidata"
            )

        self.conn = sqlite3.connect(self.db_path)
        # sqlite3.Row lets the query code address columns by name.
        self.conn.row_factory = sqlite3.Row
63
+ def __del__(self):
64
+ """Close database connection"""
65
+ if hasattr(self, 'conn'):
66
+ self.conn.close()
67
+
68
+ @staticmethod
69
+ def normalize_name(name: str) -> str:
70
+ """Normalize name (lowercase, remove accents)"""
71
+ return unidecode(name.strip().lower())
72
+
73
+ def predict_nationality(
74
+ self,
75
+ name: str,
76
+ name_type: Literal["first", "last"] = "first",
77
+ top_n: int = 5,
78
+ explain: bool = False
79
+ ) -> Dict:
80
+ """
81
+ Predict nationality from name - ENHANCED v4.0.0
82
+
83
+ Args:
84
+ name: First or last name
85
+ name_type: "first" or "last"
86
+ top_n: Number of top predictions
87
+ explain: If True, includes explainability layer (v4.0.0 NEW!)
88
+
89
+ Returns:
90
+ {
91
+ 'name': str,
92
+ 'country': str (ISO 3166-1 alpha-3),
93
+ 'country_name': str,
94
+ 'confidence': float (0-1),
95
+ 'region': str,
96
+ 'language': str,
97
+ 'top_countries': [...],
98
+
99
+ # NEW v4.0.0 fields (if explain=True):
100
+ 'ambiguity_score': float, # Shannon entropy (0-1)
101
+ 'confidence_level': str, # 'High', 'Medium', 'Low'
102
+ 'morphology_signal': {...}, # Detected patterns
103
+ 'explanation': {...} # Full human-readable explanation
104
+ }
105
+ """
106
+
107
+ normalized = self.normalize_name(name)
108
+
109
+ query = """
110
+ SELECT country_code, region, language, COUNT(*) as frequency
111
+ FROM names
112
+ WHERE name = ? AND name_type = ?
113
+ GROUP BY country_code, region, language
114
+ ORDER BY frequency DESC
115
+ LIMIT ?
116
+ """
117
+
118
+ cursor = self.conn.cursor()
119
+ cursor.execute(query, (normalized, name_type, top_n))
120
+ results = cursor.fetchall()
121
+
122
+ if not results:
123
+ base_result = {
124
+ 'name': normalized,
125
+ 'country': None,
126
+ 'country_name': None,
127
+ 'confidence': 0.0,
128
+ 'region': None,
129
+ 'language': None,
130
+ 'top_countries': []
131
+ }
132
+
133
+ # v4.0.0: Add explain fields even when no results
134
+ if explain:
135
+ # Still try to detect morphological patterns
136
+ morphology_signal = MorphologyEngine.get_morphological_signal(name, name_type)
137
+
138
+ base_result['ambiguity_score'] = 1.0 # Maximum ambiguity (no data)
139
+ base_result['confidence_level'] = "Low"
140
+ base_result['morphology_signal'] = morphology_signal
141
+ base_result['explanation'] = {
142
+ 'why': ["Name not found in database"],
143
+ 'confidence_breakdown': {
144
+ 'frequency_strength': 0.0,
145
+ 'cross_source_agreement': 0.0,
146
+ 'name_uniqueness': 0.0,
147
+ 'morphology_signal': morphology_signal['pattern_confidence'] if morphology_signal else 0.0,
148
+ 'entropy_penalty': 0.0
149
+ },
150
+ 'ambiguity_score': 1.0,
151
+ 'confidence_level': "Low"
152
+ }
153
+
154
+ return base_result
155
+
156
+ # Calculate probabilities
157
+ total_freq = sum(row['frequency'] for row in results)
158
+
159
+ top_countries = []
160
+ for row in results:
161
+ prob = row['frequency'] / total_freq
162
+
163
+ try:
164
+ country = pycountry.countries.get(alpha_3=row['country_code'])
165
+ country_name = country.name if country else row['country_code']
166
+ except:
167
+ country_name = row['country_code']
168
+
169
+ top_countries.append({
170
+ 'country': row['country_code'],
171
+ 'country_name': country_name,
172
+ 'region': row['region'],
173
+ 'language': row['language'],
174
+ 'probability': round(prob, 4),
175
+ 'frequency': row['frequency']
176
+ })
177
+
178
+ top = top_countries[0]
179
+
180
+ # Base result
181
+ result = {
182
+ 'name': normalized,
183
+ 'country': top['country'],
184
+ 'country_name': top['country_name'],
185
+ 'confidence': top['probability'],
186
+ 'region': top['region'],
187
+ 'language': top['language'],
188
+ 'top_countries': top_countries
189
+ }
190
+
191
+ # v4.0.0: Add explainability features if requested
192
+ if explain:
193
+ # Calculate ambiguity score (Shannon entropy)
194
+ probs = [c['probability'] for c in top_countries]
195
+ ambiguity = ExplainabilityEngine.calculate_ambiguity_score(probs)
196
+
197
+ # Detect morphological patterns
198
+ morphology_signal = MorphologyEngine.get_morphological_signal(name, name_type)
199
+
200
+ # Calculate confidence breakdown
201
+ freq_strength = top['probability']
202
+ morph_signal_strength = morphology_signal['pattern_confidence'] if morphology_signal else 0.0
203
+
204
+ breakdown = ExplainabilityEngine.decompose_confidence(
205
+ frequency_strength=freq_strength,
206
+ cross_source_agreement=0.15 if len(top_countries) > 1 else 0.0,
207
+ morphology_signal=morph_signal_strength,
208
+ entropy_penalty=ambiguity * 0.3
209
+ )
210
+
211
+ # Generate full explanation
212
+ morphology_patterns = [morphology_signal['primary_pattern']] if morphology_signal else None
213
+
214
+ explanation = ExplainabilityEngine.generate_explanation(
215
+ name=name,
216
+ prediction=result,
217
+ confidence_breakdown=breakdown,
218
+ ambiguity_score=ambiguity,
219
+ morphology_patterns=morphology_patterns,
220
+ sources=["EthniData Database"]
221
+ )
222
+
223
+ # Add v4.0.0 fields to result
224
+ result['ambiguity_score'] = round(ambiguity, 4)
225
+ result['confidence_level'] = explanation['explanation']['confidence_level']
226
+ result['morphology_signal'] = morphology_signal
227
+ result['explanation'] = explanation['explanation']
228
+
229
+ return result
230
+
231
+ def predict_gender(
232
+ self,
233
+ name: str
234
+ ) -> Dict:
235
+ """
236
+ Predict gender from first name
237
+
238
+ Args:
239
+ name: First name
240
+
241
+ Returns:
242
+ {
243
+ 'name': str,
244
+ 'gender': str ('M' or 'F' or None),
245
+ 'confidence': float,
246
+ 'distribution': {'M': prob, 'F': prob, None: prob}
247
+ }
248
+ """
249
+
250
+ normalized = self.normalize_name(name)
251
+
252
+ query = """
253
+ SELECT gender, COUNT(*) as count
254
+ FROM names
255
+ WHERE name = ? AND name_type = 'first'
256
+ GROUP BY gender
257
+ """
258
+
259
+ cursor = self.conn.cursor()
260
+ cursor.execute(query, (normalized,))
261
+ results = cursor.fetchall()
262
+
263
+ if not results:
264
+ return {
265
+ 'name': normalized,
266
+ 'gender': None,
267
+ 'confidence': 0.0,
268
+ 'distribution': {}
269
+ }
270
+
271
+ # Count by gender
272
+ gender_counts = {}
273
+ total = 0
274
+
275
+ for row in results:
276
+ gender = row['gender']
277
+ count = row['count']
278
+ gender_counts[gender] = count
279
+ total += count
280
+
281
+ # Calculate probabilities
282
+ distribution = {g: round(c / total, 4) for g, c in gender_counts.items()}
283
+
284
+ # Top gender
285
+ top_gender = max(gender_counts.items(), key=lambda x: x[1])[0]
286
+ confidence = gender_counts[top_gender] / total
287
+
288
+ return {
289
+ 'name': normalized,
290
+ 'gender': top_gender,
291
+ 'confidence': round(confidence, 4),
292
+ 'distribution': distribution
293
+ }
294
+
295
+ def predict_region(
296
+ self,
297
+ name: str,
298
+ name_type: Literal["first", "last"] = "first"
299
+ ) -> Dict:
300
+ """
301
+ Predict geographic region from name
302
+
303
+ Args:
304
+ name: First or last name
305
+ name_type: "first" or "last"
306
+
307
+ Returns:
308
+ {
309
+ 'name': str,
310
+ 'region': str (Europe, Asia, Americas, Africa, Oceania, Other),
311
+ 'confidence': float,
312
+ 'distribution': {region: probability, ...}
313
+ }
314
+ """
315
+
316
+ normalized = self.normalize_name(name)
317
+
318
+ query = """
319
+ SELECT region, COUNT(*) as total_freq
320
+ FROM names
321
+ WHERE name = ? AND name_type = ?
322
+ GROUP BY region
323
+ ORDER BY total_freq DESC
324
+ """
325
+
326
+ cursor = self.conn.cursor()
327
+ cursor.execute(query, (normalized, name_type))
328
+ results = cursor.fetchall()
329
+
330
+ if not results:
331
+ return {
332
+ 'name': normalized,
333
+ 'region': None,
334
+ 'confidence': 0.0,
335
+ 'distribution': {}
336
+ }
337
+
338
+ total = sum(row['total_freq'] for row in results)
339
+
340
+ distribution = {}
341
+ for row in results:
342
+ region = row['region']
343
+ prob = row['total_freq'] / total
344
+ distribution[region] = round(prob, 4)
345
+
346
+ top_region = results[0]['region']
347
+ confidence = results[0]['total_freq'] / total
348
+
349
+ return {
350
+ 'name': normalized,
351
+ 'region': top_region,
352
+ 'confidence': round(confidence, 4),
353
+ 'distribution': distribution
354
+ }
355
+
356
+ def predict_language(
357
+ self,
358
+ name: str,
359
+ name_type: Literal["first", "last"] = "first",
360
+ top_n: int = 5
361
+ ) -> Dict:
362
+ """
363
+ Predict most likely language from name
364
+
365
+ Args:
366
+ name: First or last name
367
+ name_type: "first" or "last"
368
+ top_n: Number of top predictions
369
+
370
+ Returns:
371
+ {
372
+ 'name': str,
373
+ 'language': str,
374
+ 'confidence': float,
375
+ 'top_languages': [{language, probability}, ...]
376
+ }
377
+ """
378
+
379
+ normalized = self.normalize_name(name)
380
+
381
+ query = """
382
+ SELECT language, COUNT(*) as total_freq
383
+ FROM names
384
+ WHERE name = ? AND name_type = ? AND language IS NOT NULL
385
+ GROUP BY language
386
+ ORDER BY total_freq DESC
387
+ LIMIT ?
388
+ """
389
+
390
+ cursor = self.conn.cursor()
391
+ cursor.execute(query, (normalized, name_type, top_n))
392
+ results = cursor.fetchall()
393
+
394
+ if not results:
395
+ return {
396
+ 'name': normalized,
397
+ 'language': None,
398
+ 'confidence': 0.0,
399
+ 'top_languages': []
400
+ }
401
+
402
+ total = sum(row['total_freq'] for row in results)
403
+
404
+ top_languages = []
405
+ for row in results:
406
+ lang = row['language']
407
+ prob = row['total_freq'] / total
408
+ top_languages.append({
409
+ 'language': lang,
410
+ 'probability': round(prob, 4)
411
+ })
412
+
413
+ return {
414
+ 'name': normalized,
415
+ 'language': top_languages[0]['language'],
416
+ 'confidence': top_languages[0]['probability'],
417
+ 'top_languages': top_languages
418
+ }
419
+
420
+ def predict_religion(
421
+ self,
422
+ name: str,
423
+ name_type: Literal["first", "last"] = "first",
424
+ top_n: int = 5
425
+ ) -> Dict:
426
+ """
427
+ Predict religion from name - NEW in v1.3.0!
428
+
429
+ Args:
430
+ name: First or last name
431
+ name_type: "first" or "last"
432
+ top_n: Number of top predictions
433
+
434
+ Returns:
435
+ {
436
+ 'name': str,
437
+ 'religion': str (Christianity, Islam, Hinduism, Buddhism, Judaism),
438
+ 'confidence': float,
439
+ 'top_religions': [{religion, probability}, ...]
440
+ }
441
+ """
442
+
443
+ normalized = self.normalize_name(name)
444
+
445
+ query = """
446
+ SELECT religion, COUNT(*) as total_freq
447
+ FROM names
448
+ WHERE name = ? AND name_type = ? AND religion IS NOT NULL
449
+ GROUP BY religion
450
+ ORDER BY total_freq DESC
451
+ LIMIT ?
452
+ """
453
+
454
+ cursor = self.conn.cursor()
455
+ cursor.execute(query, (normalized, name_type, top_n))
456
+ results = cursor.fetchall()
457
+
458
+ if not results:
459
+ return {
460
+ 'name': normalized,
461
+ 'religion': None,
462
+ 'confidence': 0.0,
463
+ 'top_religions': []
464
+ }
465
+
466
+ total = sum(row['total_freq'] for row in results)
467
+
468
+ top_religions = []
469
+ for row in results:
470
+ religion = row['religion']
471
+ prob = row['total_freq'] / total
472
+ top_religions.append({
473
+ 'religion': religion,
474
+ 'probability': round(prob, 4)
475
+ })
476
+
477
+ return {
478
+ 'name': normalized,
479
+ 'religion': top_religions[0]['religion'],
480
+ 'confidence': top_religions[0]['probability'],
481
+ 'top_religions': top_religions
482
+ }
483
+
484
+ def predict_ethnicity(
485
+ self,
486
+ name: str,
487
+ name_type: Literal["first", "last"] = "first"
488
+ ) -> Dict:
489
+ """Predict ethnicity from name (uses nationality as proxy)"""
490
+
491
+ # Use nationality as ethnicity proxy since we don't have separate ethnicity data
492
+ nationality = self.predict_nationality(name, name_type, top_n=1)
493
+
494
+ return {
495
+ 'name': nationality['name'],
496
+ 'ethnicity': nationality['country_name'], # Use country as ethnicity
497
+ 'country': nationality['country'],
498
+ 'country_name': nationality['country_name'],
499
+ 'region': nationality.get('region'),
500
+ 'language': nationality.get('language'),
501
+ 'confidence': nationality['confidence']
502
+ }
503
+
504
+ def predict_full_name(
505
+ self,
506
+ first_name: str,
507
+ last_name: str,
508
+ top_n: int = 5,
509
+ explain: bool = False
510
+ ) -> Dict:
511
+ """
512
+ Predict from full name (first + last) - ENHANCED v4.0.0
513
+
514
+ Returns nationality, region, language
515
+
516
+ Args:
517
+ first_name: First name
518
+ last_name: Last name
519
+ top_n: Number of top predictions
520
+ explain: If True, includes explainability layer (v4.0.0 NEW!)
521
+ """
522
+
523
+ first_pred = self.predict_nationality(first_name, "first", top_n=top_n, explain=False)
524
+ last_pred = self.predict_nationality(last_name, "last", top_n=top_n, explain=False)
525
+
526
+ # Combine scores
527
+ combined_scores = {}
528
+
529
+ for item in first_pred['top_countries']:
530
+ combined_scores[item['country']] = {
531
+ 'score': item['probability'] * 0.4,
532
+ 'region': item['region'],
533
+ 'language': item['language']
534
+ }
535
+
536
+ for item in last_pred['top_countries']:
537
+ if item['country'] in combined_scores:
538
+ combined_scores[item['country']]['score'] += item['probability'] * 0.6
539
+ else:
540
+ combined_scores[item['country']] = {
541
+ 'score': item['probability'] * 0.6,
542
+ 'region': item['region'],
543
+ 'language': item['language']
544
+ }
545
+
546
+ # Sort
547
+ sorted_countries = sorted(
548
+ combined_scores.items(),
549
+ key=lambda x: x[1]['score'],
550
+ reverse=True
551
+ )[:top_n]
552
+
553
+ # Format
554
+ top_countries = []
555
+ for country_code, data in sorted_countries:
556
+ try:
557
+ country = pycountry.countries.get(alpha_3=country_code)
558
+ country_name = country.name if country else country_code
559
+ except:
560
+ country_name = country_code
561
+
562
+ top_countries.append({
563
+ 'country': country_code,
564
+ 'country_name': country_name,
565
+ 'region': data['region'],
566
+ 'language': data['language'],
567
+ 'probability': round(data['score'], 4)
568
+ })
569
+
570
+ top = top_countries[0] if top_countries else {}
571
+
572
+ # Base result
573
+ result = {
574
+ 'first_name': self.normalize_name(first_name),
575
+ 'last_name': self.normalize_name(last_name),
576
+ 'country': top.get('country'),
577
+ 'country_name': top.get('country_name'),
578
+ 'region': top.get('region'),
579
+ 'language': top.get('language'),
580
+ 'confidence': top.get('probability', 0.0),
581
+ 'top_countries': top_countries
582
+ }
583
+
584
+ # v4.0.0: Add explainability features if requested
585
+ if explain:
586
+ # Calculate ambiguity score
587
+ probs = [c['probability'] for c in top_countries]
588
+ ambiguity = ExplainabilityEngine.calculate_ambiguity_score(probs)
589
+
590
+ # Detect morphological patterns in both names
591
+ first_morph = MorphologyEngine.get_morphological_signal(first_name, "first")
592
+ last_morph = MorphologyEngine.get_morphological_signal(last_name, "last")
593
+
594
+ # Use last name morphology (stronger signal)
595
+ morphology_signal = last_morph if last_morph else first_morph
596
+
597
+ # Calculate confidence breakdown
598
+ freq_strength = top.get('probability', 0.0)
599
+ morph_signal_strength = morphology_signal['pattern_confidence'] if morphology_signal else 0.0
600
+
601
+ breakdown = ExplainabilityEngine.decompose_confidence(
602
+ frequency_strength=freq_strength,
603
+ cross_source_agreement=0.20 if len(top_countries) > 1 else 0.0,
604
+ morphology_signal=morph_signal_strength,
605
+ entropy_penalty=ambiguity * 0.3
606
+ )
607
+
608
+ # Generate full explanation
609
+ morphology_patterns = []
610
+ if first_morph:
611
+ morphology_patterns.append(f"{first_morph['primary_pattern']} (first)")
612
+ if last_morph:
613
+ morphology_patterns.append(f"{last_morph['primary_pattern']} (last)")
614
+
615
+ explanation = ExplainabilityEngine.generate_explanation(
616
+ name=f"{first_name} {last_name}",
617
+ prediction=result,
618
+ confidence_breakdown=breakdown,
619
+ ambiguity_score=ambiguity,
620
+ morphology_patterns=morphology_patterns if morphology_patterns else None,
621
+ sources=["EthniData Database"]
622
+ )
623
+
624
+ # Add v4.0.0 fields
625
+ result['ambiguity_score'] = round(ambiguity, 4)
626
+ result['confidence_level'] = explanation['explanation']['confidence_level']
627
+ result['morphology_signal'] = {
628
+ 'first_name': first_morph,
629
+ 'last_name': last_morph
630
+ }
631
+ result['explanation'] = explanation['explanation']
632
+
633
+ return result
634
+
635
+ def predict_all(
636
+ self,
637
+ name: str,
638
+ name_type: Literal["first", "last"] = "first"
639
+ ) -> Dict:
640
+ """
641
+ Predict ALL attributes at once - UPDATED v1.3.0
642
+ Now includes: nationality, gender, region, language, religion, ethnicity
643
+
644
+ Args:
645
+ name: First or last name
646
+ name_type: "first" or "last"
647
+
648
+ Returns:
649
+ {
650
+ 'name': str,
651
+ 'nationality': {...},
652
+ 'gender': {...}, # Only for first names
653
+ 'region': {...},
654
+ 'language': {...},
655
+ 'religion': {...}, # NEW in v1.3.0!
656
+ 'ethnicity': {...}
657
+ }
658
+ """
659
+
660
+ normalized = self.normalize_name(name)
661
+
662
+ result = {
663
+ 'name': normalized,
664
+ 'nationality': self.predict_nationality(name, name_type),
665
+ 'region': self.predict_region(name, name_type),
666
+ 'language': self.predict_language(name, name_type),
667
+ 'religion': self.predict_religion(name, name_type), # NEW!
668
+ 'ethnicity': self.predict_ethnicity(name, name_type)
669
+ }
670
+
671
+ # Gender only for first names
672
+ if name_type == "first":
673
+ result['gender'] = self.predict_gender(name)
674
+
675
+ return result
676
+
677
+ def get_stats(self) -> Dict:
678
+ """Get database statistics"""
679
+
680
+ cursor = self.conn.cursor()
681
+
682
+ stats = {}
683
+
684
+ cursor.execute("SELECT COUNT(*) as count FROM names WHERE name_type = 'first'")
685
+ stats['total_first_names'] = cursor.fetchone()['count']
686
+
687
+ cursor.execute("SELECT COUNT(*) as count FROM names WHERE name_type = 'last'")
688
+ stats['total_last_names'] = cursor.fetchone()['count']
689
+
690
+ cursor.execute("SELECT COUNT(DISTINCT country_code) as count FROM names")
691
+ stats['countries'] = cursor.fetchone()['count']
692
+
693
+ cursor.execute("SELECT COUNT(DISTINCT region) as count FROM names WHERE region IS NOT NULL")
694
+ stats['regions'] = cursor.fetchone()['count']
695
+
696
+ cursor.execute("SELECT COUNT(DISTINCT language) as count FROM names WHERE language IS NOT NULL")
697
+ stats['languages'] = cursor.fetchone()['count']
698
+
699
+ return stats