ethnidata 3.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ethnidata/__init__.py +61 -0
- ethnidata/downloader.py +143 -0
- ethnidata/ethnidata.db +0 -0
- ethnidata/predictor.py +560 -0
- ethnidata/predictor_old.py +277 -0
- ethnidata-3.1.5.dist-info/METADATA +257 -0
- ethnidata-3.1.5.dist-info/RECORD +10 -0
- ethnidata-3.1.5.dist-info/WHEEL +5 -0
- ethnidata-3.1.5.dist-info/licenses/LICENSE +21 -0
- ethnidata-3.1.5.dist-info/top_level.txt +1 -0
ethnidata/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData v3.0.1 - ULTRA MASSIVE EXPANSION! Global Demographics Prediction
|
|
3
|
+
Predict nationality, ethnicity, gender, region, language AND religion!
|
|
4
|
+
|
|
5
|
+
🔥 NEW in v3.0.1 - COMPLETE RELIGIOUS COVERAGE:
|
|
6
|
+
- 📊 **5.9M+ records** (14x increase from v2.0.0 - 1,326% growth!)
|
|
7
|
+
- 🌍 **238 countries** - complete global coverage
|
|
8
|
+
- 🗣️ **72 languages**
|
|
9
|
+
- 🕌 **ALL 6 MAJOR WORLD RELIGIONS** - Complete coverage:
|
|
10
|
+
- Christianity: 3.9M+ records (65.2%)
|
|
11
|
+
- Buddhism: 1.3M+ records (22.1%)
|
|
12
|
+
- Islam: 504K+ records (8.5%)
|
|
13
|
+
- Judaism: 121K+ records (2.0%) ✡️
|
|
14
|
+
- Hinduism: 90K+ records (1.5%)
|
|
15
|
+
- Sikhism: 24K+ records (0.4%) 🪯 NEW!
|
|
16
|
+
- 🌎 **Perfectly balanced regional distribution**:
|
|
17
|
+
- Asia: 33% • Americas: 32% • Africa: 30% • Europe: 3% • Oceania: 0.1%
|
|
18
|
+
|
|
19
|
+
Features:
|
|
20
|
+
- ✅ Nationality prediction (238 countries)
|
|
21
|
+
- ✅ Religion prediction (6 major world religions)
|
|
22
|
+
- ✅ Gender prediction
|
|
23
|
+
- ✅ Region prediction (5 continents)
|
|
24
|
+
- ✅ Language prediction (72 languages)
|
|
25
|
+
- ✅ Ethnicity prediction
|
|
26
|
+
- ✅ Full name analysis
|
|
27
|
+
|
|
28
|
+
Usage:
|
|
29
|
+
from ethnidata import EthniData
|
|
30
|
+
|
|
31
|
+
ed = EthniData()
|
|
32
|
+
|
|
33
|
+
# Nationality
|
|
34
|
+
result = ed.predict_nationality("Ahmet")
|
|
35
|
+
|
|
36
|
+
# Religion (NOW WITH 6 RELIGIONS!)
|
|
37
|
+
result = ed.predict_religion("Muhammad") # Islam
|
|
38
|
+
result = ed.predict_religion("Cohen") # Judaism
|
|
39
|
+
result = ed.predict_religion("Singh") # Sikhism
|
|
40
|
+
|
|
41
|
+
# Gender
|
|
42
|
+
result = ed.predict_gender("Emma")
|
|
43
|
+
|
|
44
|
+
# Region
|
|
45
|
+
result = ed.predict_region("Chen")
|
|
46
|
+
|
|
47
|
+
# Language
|
|
48
|
+
result = ed.predict_language("José")
|
|
49
|
+
|
|
50
|
+
# ALL at once
|
|
51
|
+
result = ed.predict_all("Maria")
|
|
52
|
+
# Returns: nationality, religion, gender, region, language, ethnicity
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
# Package metadata. Keep __version__ in step with the distribution version
# (this wheel is published as ethnidata 3.1.5).
__version__ = "3.1.5"
__author__ = "Teyfik Oz"
__license__ = "MIT"
|
|
58
|
+
|
|
59
|
+
from .predictor import EthniData
|
|
60
|
+
|
|
61
|
+
# Public API of the package: only the predictor class is re-exported.
__all__ = ["EthniData"]
|
ethnidata/downloader.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database downloader for EthniData
|
|
3
|
+
Downloads the full v3.0.0 database (5.8M records) on demand
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import urllib.request
|
|
8
|
+
import shutil
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
# Database versions and URLs
|
|
13
|
+
# Catalogue of published database builds, keyed by release tag. Each entry
# records where the file is hosted, its advertised size and record count, and
# the file name it is stored under inside the package directory.
DATABASES = dict([
    ('v2.0.0', dict(
        url='https://github.com/teyfikoz/ethnidata/releases/download/v2.0.0/ethnidata_v2.db',
        size='75 MB',
        records='415K',
        filename='ethnidata.db',
    )),
    ('v3.0.0', dict(
        url='https://github.com/teyfikoz/ethnidata/releases/download/v3.0.0/ethnidata_v3.db',
        size='1.1 GB',
        records='5.8M',
        filename='ethnidata_v3.db',
    )),
])

# The small build ships inside the wheel; the large one is fetched on request.
DEFAULT_VERSION = 'v2.0.0'
FULL_VERSION = 'v3.0.0'
|
|
30
|
+
|
|
31
|
+
class DatabaseDownloader:
    """Locate and download the EthniData SQLite database files.

    Two files are managed inside ``package_dir``: ``ethnidata.db`` (v2.0.0)
    and ``ethnidata_v3.db`` (v3.0.0). Download URLs and descriptive metadata
    come from the module-level ``DATABASES`` table.
    """

    def __init__(self, package_dir: Path):
        """Remember the directory that holds (or will hold) the databases.

        Args:
            package_dir: Directory containing the database files.
        """
        self.package_dir = package_dir
        self.db_path = package_dir / "ethnidata.db"
        self.v3_path = package_dir / "ethnidata_v3.db"

    def check_database(self, version: str = DEFAULT_VERSION) -> bool:
        """Return True if the database file for ``version`` exists.

        Unknown version strings yield False rather than raising.
        """
        if version == 'v2.0.0':
            return self.db_path.exists()
        if version == 'v3.0.0':
            return self.v3_path.exists()
        return False

    def download_database(self, version: str = FULL_VERSION, force: bool = False) -> str:
        """Download the database for ``version`` unless it is already present.

        The payload is written to a ``.part`` file next to the target and
        renamed into place only after the transfer finished, so an interrupted
        download never leaves a truncated file that would later be mistaken
        for a valid database (and a forced re-download keeps the old file
        until the new one is complete).

        Args:
            version: Database version to download ('v2.0.0' or 'v3.0.0').
            force: Download even if the target file already exists.

        Returns:
            Path to the database file, as a string.

        Raises:
            ValueError: If ``version`` is not a key of ``DATABASES``.
            Exception: Any error raised by the download is re-raised after
                manual-download instructions have been printed.
        """
        if version not in DATABASES:
            raise ValueError(f"Unknown version: {version}. Available: {list(DATABASES.keys())}")

        db_info = DATABASES[version]
        target_path = self.v3_path if version == 'v3.0.0' else self.db_path

        # Nothing to do when the file is already there and no refresh was asked for.
        if target_path.exists() and not force:
            print(f"✅ Database {version} already exists ({db_info['records']} records)")
            return str(target_path)

        print(f"\n📥 Downloading EthniData {version} database...")
        print(f" Records: {db_info['records']}")
        print(f" Size: {db_info['size']}")
        print(" This may take a few minutes...")

        partial_path = target_path.with_name(target_path.name + ".part")

        def report_progress(block_num, block_size, total_size):
            downloaded = block_num * block_size
            if total_size <= 0:
                # urlretrieve passes -1 (or 0) when the server does not
                # announce a length; a percentage is meaningless then.
                print(f"\r Downloaded: {downloaded / 1048576:.1f} MB", end='', flush=True)
                return
            percent = min(downloaded * 100 / total_size, 100)
            print(f"\r Progress: {percent:.1f}%", end='', flush=True)

        try:
            urllib.request.urlretrieve(
                db_info['url'],
                str(partial_path),
                reporthook=report_progress
            )
            # Atomic within one directory: readers see the old or the new file.
            os.replace(partial_path, target_path)
            print(f"\n✅ Download complete: {target_path}")
            return str(target_path)

        except Exception as e:
            # Best-effort cleanup of the incomplete payload; the original
            # error is what the caller needs to see.
            try:
                partial_path.unlink()
            except OSError:
                pass
            print(f"\n❌ Download failed: {e}")
            print("\n💡 You can manually download from:")
            print(f" {db_info['url']}")
            print(f" And save it as: {target_path}")
            raise

    def get_database_path(self, prefer_v3: bool = False) -> str:
        """Return the path of the database to use.

        With ``prefer_v3`` and no v3 file on disk, the user is asked on stdin
        whether to download it; declining (or having no interactive stdin)
        falls back to the v2 path.

        Args:
            prefer_v3: If True, use v3.0.0 (5.8M records) instead of
                v2.0.0 (415K records).

        Returns:
            Path to the database file, as a string.

        Raises:
            FileNotFoundError: If ``prefer_v3`` is False and the bundled v2
                database is missing.
        """
        if prefer_v3:
            if self.v3_path.exists():
                return str(self.v3_path)

            print("\n🚀 EthniData v3.0.0 offers 14x more data (5.8M vs 415K records)!")
            print(f" Would you like to download it? ({DATABASES['v3.0.0']['size']})")
            try:
                response = input(" Download v3.0.0? [y/N]: ").strip().lower()
            except EOFError:
                # No interactive stdin (scripts, services, CI): take the
                # prompt's default answer "N" instead of crashing.
                response = ''

            if response in ['y', 'yes']:
                return self.download_database('v3.0.0')

            print(f" Using v2.0.0 ({DATABASES['v2.0.0']['records']} records)")
            return str(self.db_path)

        # v2 is expected to be shipped inside the package.
        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database not found at {self.db_path}. "
                f"Please reinstall: pip install --upgrade --force-reinstall ethnidata"
            )
        return str(self.db_path)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def download_v3_database(package_dir: Optional[Path] = None) -> str:
    """Fetch the v3.0.0 database into the package directory.

    Args:
        package_dir: Directory to store the database in. When omitted, the
            directory containing this module is used.

    Returns:
        Path to the downloaded database, as returned by
        ``DatabaseDownloader.download_database``.
    """
    base_dir = Path(__file__).parent if package_dir is None else package_dir
    return DatabaseDownloader(base_dir).download_database('v3.0.0')
|
ethnidata/ethnidata.db
ADDED
|
Binary file
|
ethnidata/predictor.py
ADDED
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Predictor v2.0 - with advanced features
|
|
3
|
+
New features:
|
|
4
|
+
- Gender prediction (Cinsiyet tahmini)
|
|
5
|
+
- Region prediction (Bölge: Europe, Asia, Americas, Africa, Oceania)
|
|
6
|
+
- Language prediction (Yaygın dil tahmini)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlite3
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional, Literal
|
|
12
|
+
from unidecode import unidecode
|
|
13
|
+
import pycountry
|
|
14
|
+
|
|
15
|
+
class EthniData:
    """Ethnicity, Nationality, Gender, Region, Language and Religion predictor.

    Every prediction is a frequency look-up in the ``names`` table of a SQLite
    database. Names are normalised with :meth:`normalize_name` before the
    look-up. The instance owns one SQLite connection (``self.conn``); release
    it with :meth:`close` or by using the instance as a context manager.
    """

    def __init__(self, db_path: Optional[str] = None, use_v3: bool = False):
        """
        Initialize EthniData predictor

        Args:
            db_path: Path to SQLite database. If None, uses default location
                (next to this module).
            use_v3: If True, attempts to use v3.0.0 database (5.8M records)
                and falls back to v2.0.0 with a printed hint when it is not
                installed. If False, uses v2.0.0 database (415K records,
                included in package). Ignored when ``db_path`` is given.

        Raises:
            FileNotFoundError: If the selected database file does not exist.
        """
        if db_path is None:
            package_dir = Path(__file__).parent
            db_path = package_dir / "ethnidata.db"

            if use_v3:
                v3_path = package_dir / "ethnidata_v3.db"
                if v3_path.exists():
                    db_path = v3_path
                else:
                    print("\n💡 EthniData v3.0.0 (5.8M records) is not installed.")
                    print(" To download: from ethnidata.downloader import download_v3_database")
                    print(" download_v3_database()")
                    print("\n Using v2.0.0 (415K records) for now...")

        self.db_path = Path(db_path)

        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database not found: {self.db_path}\n"
                f"Please reinstall: pip install --upgrade --force-reinstall ethnidata"
            )

        self.conn = sqlite3.connect(self.db_path)
        # Rows are accessed by column name throughout this class.
        self.conn.row_factory = sqlite3.Row

    def close(self) -> None:
        """Close the database connection (safe to call more than once)."""
        # getattr: __init__ may have raised before the connection was opened.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    def __enter__(self):
        """Return the predictor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block."""
        self.close()
        return False

    def __del__(self):
        """Close database connection"""
        self.close()

    @staticmethod
    def normalize_name(name: str) -> str:
        """Normalize name (strip, lowercase, remove accents)"""
        return unidecode(name.strip().lower())

    @staticmethod
    def _take_top(rows, top_n):
        """Return the first ``top_n`` rows; a negative or None ``top_n`` keeps
        everything (the meaning SQLite gives to a negative LIMIT)."""
        if top_n is None or top_n < 0:
            return rows
        return rows[:top_n]

    @staticmethod
    def _country_name(country_code):
        """Return the pycountry display name for an alpha-3 code, or the code
        itself when it cannot be resolved."""
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            return country.name if country else country_code
        except Exception:
            # Deliberate best-effort: depending on the pycountry version an
            # unknown code may raise instead of returning None.
            return country_code

    def predict_nationality(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first",
        top_n: int = 5
    ) -> Dict:
        """
        Predict nationality from name

        Probabilities are shares of all matching records, so they do not
        change with ``top_n``.

        Args:
            name: First or last name
            name_type: "first" or "last"
            top_n: Number of top predictions

        Returns:
            {
                'name': str,
                'country': str (ISO 3166-1 alpha-3),
                'country_name': str,
                'confidence': float (0-1),
                'region': str,
                'language': str,
                'top_countries': [...]
            }
            With no match, the scalar fields are None, 'confidence' is 0.0
            and 'top_countries' is empty.
        """
        normalized = self.normalize_name(name)

        # No LIMIT here: the total must cover every matching row, otherwise
        # the probabilities would only be relative to the truncated list.
        query = """
            SELECT country_code, region, language, COUNT(*) as frequency
            FROM names
            WHERE name = ? AND name_type = ?
            GROUP BY country_code, region, language
            ORDER BY frequency DESC
        """

        rows = self.conn.execute(query, (normalized, name_type)).fetchall()
        kept = self._take_top(rows, top_n)

        if not kept:
            return {
                'name': normalized,
                'country': None,
                'country_name': None,
                'confidence': 0.0,
                'region': None,
                'language': None,
                'top_countries': []
            }

        total_freq = sum(row['frequency'] for row in rows)

        top_countries = [
            {
                'country': row['country_code'],
                'country_name': self._country_name(row['country_code']),
                'region': row['region'],
                'language': row['language'],
                'probability': round(row['frequency'] / total_freq, 4),
                'frequency': row['frequency']
            }
            for row in kept
        ]

        top = top_countries[0]

        return {
            'name': normalized,
            'country': top['country'],
            'country_name': top['country_name'],
            'confidence': top['probability'],
            'region': top['region'],
            'language': top['language'],
            'top_countries': top_countries
        }

    def predict_gender(
        self,
        name: str
    ) -> Dict:
        """
        Predict gender from first name

        Args:
            name: First name

        Returns:
            {
                'name': str,
                'gender': most frequent stored gender value (may be None
                          when most records carry no gender),
                'confidence': float,
                'distribution': {gender value: probability, ...}
            }
        """
        normalized = self.normalize_name(name)

        query = """
            SELECT gender, COUNT(*) as count
            FROM names
            WHERE name = ? AND name_type = 'first'
            GROUP BY gender
        """

        rows = self.conn.execute(query, (normalized,)).fetchall()

        if not rows:
            return {
                'name': normalized,
                'gender': None,
                'confidence': 0.0,
                'distribution': {}
            }

        gender_counts = {row['gender']: row['count'] for row in rows}
        total = sum(gender_counts.values())

        distribution = {g: round(c / total, 4) for g, c in gender_counts.items()}

        # max() keeps the first entry on ties, i.e. the GROUP BY row order.
        top_gender = max(gender_counts.items(), key=lambda x: x[1])[0]
        confidence = gender_counts[top_gender] / total

        return {
            'name': normalized,
            'gender': top_gender,
            'confidence': round(confidence, 4),
            'distribution': distribution
        }

    def predict_region(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first"
    ) -> Dict:
        """
        Predict geographic region from name

        Args:
            name: First or last name
            name_type: "first" or "last"

        Returns:
            {
                'name': str,
                'region': str (value of the region column),
                'confidence': float,
                'distribution': {region: probability, ...}
            }
        """
        normalized = self.normalize_name(name)

        query = """
            SELECT region, COUNT(*) as total_freq
            FROM names
            WHERE name = ? AND name_type = ?
            GROUP BY region
            ORDER BY total_freq DESC
        """

        rows = self.conn.execute(query, (normalized, name_type)).fetchall()

        if not rows:
            return {
                'name': normalized,
                'region': None,
                'confidence': 0.0,
                'distribution': {}
            }

        total = sum(row['total_freq'] for row in rows)
        distribution = {
            row['region']: round(row['total_freq'] / total, 4) for row in rows
        }

        # Rows are sorted by frequency, so the first one is the prediction.
        best = rows[0]

        return {
            'name': normalized,
            'region': best['region'],
            'confidence': round(best['total_freq'] / total, 4),
            'distribution': distribution
        }

    def predict_language(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first",
        top_n: int = 5
    ) -> Dict:
        """
        Predict most likely language from name

        Records without a language are ignored. Probabilities are shares of
        all remaining records, independent of ``top_n``.

        Args:
            name: First or last name
            name_type: "first" or "last"
            top_n: Number of top predictions

        Returns:
            {
                'name': str,
                'language': str,
                'confidence': float,
                'top_languages': [{language, probability}, ...]
            }
        """
        normalized = self.normalize_name(name)

        # LIMIT is applied in Python so the total covers every language.
        query = """
            SELECT language, COUNT(*) as total_freq
            FROM names
            WHERE name = ? AND name_type = ? AND language IS NOT NULL
            GROUP BY language
            ORDER BY total_freq DESC
        """

        rows = self.conn.execute(query, (normalized, name_type)).fetchall()
        kept = self._take_top(rows, top_n)

        if not kept:
            return {
                'name': normalized,
                'language': None,
                'confidence': 0.0,
                'top_languages': []
            }

        total = sum(row['total_freq'] for row in rows)

        top_languages = [
            {
                'language': row['language'],
                'probability': round(row['total_freq'] / total, 4)
            }
            for row in kept
        ]

        return {
            'name': normalized,
            'language': top_languages[0]['language'],
            'confidence': top_languages[0]['probability'],
            'top_languages': top_languages
        }

    def predict_religion(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first",
        top_n: int = 5
    ) -> Dict:
        """
        Predict religion from name

        Records without a religion are ignored. Probabilities are shares of
        all remaining records, independent of ``top_n``.

        Args:
            name: First or last name
            name_type: "first" or "last"
            top_n: Number of top predictions

        Returns:
            {
                'name': str,
                'religion': str (value of the religion column),
                'confidence': float,
                'top_religions': [{religion, probability}, ...]
            }
        """
        normalized = self.normalize_name(name)

        # LIMIT is applied in Python so the total covers every religion.
        query = """
            SELECT religion, COUNT(*) as total_freq
            FROM names
            WHERE name = ? AND name_type = ? AND religion IS NOT NULL
            GROUP BY religion
            ORDER BY total_freq DESC
        """

        rows = self.conn.execute(query, (normalized, name_type)).fetchall()
        kept = self._take_top(rows, top_n)

        if not kept:
            return {
                'name': normalized,
                'religion': None,
                'confidence': 0.0,
                'top_religions': []
            }

        total = sum(row['total_freq'] for row in rows)

        top_religions = [
            {
                'religion': row['religion'],
                'probability': round(row['total_freq'] / total, 4)
            }
            for row in kept
        ]

        return {
            'name': normalized,
            'religion': top_religions[0]['religion'],
            'confidence': top_religions[0]['probability'],
            'top_religions': top_religions
        }

    def predict_ethnicity(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first"
    ) -> Dict:
        """Predict ethnicity from name (uses nationality as proxy)

        There is no separate ethnicity data; the top nationality's country
        name is reported as the ethnicity.
        """
        nationality = self.predict_nationality(name, name_type, top_n=1)

        return {
            'name': nationality['name'],
            'ethnicity': nationality['country_name'],  # country stands in for ethnicity
            'country': nationality['country'],
            'country_name': nationality['country_name'],
            'region': nationality.get('region'),
            'language': nationality.get('language'),
            'confidence': nationality['confidence']
        }

    def predict_full_name(
        self,
        first_name: str,
        last_name: str,
        top_n: int = 5
    ) -> Dict:
        """
        Predict from full name (first + last)

        The per-country probabilities of the first name (weight 0.4) and of
        the last name (weight 0.6) are added up and the best ``top_n``
        countries are returned.

        Args:
            first_name: First name
            last_name: Last name
            top_n: Number of top predictions

        Returns:
            {
                'first_name': str, 'last_name': str,
                'country': str, 'country_name': str,
                'region': str, 'language': str,
                'confidence': float,
                'top_countries': [...]
            }
        """
        first_pred = self.predict_nationality(first_name, "first", top_n=top_n)
        last_pred = self.predict_nationality(last_name, "last", top_n=top_n)

        # A country can occur more than once in one prediction (rows are
        # grouped by country, region and language), so scores are always
        # accumulated; region/language come from the first, i.e. most
        # frequent, occurrence.
        combined_scores = {}
        for prediction, weight in ((first_pred, 0.4), (last_pred, 0.6)):
            for item in prediction['top_countries']:
                entry = combined_scores.setdefault(item['country'], {
                    'score': 0.0,
                    'region': item['region'],
                    'language': item['language']
                })
                entry['score'] += item['probability'] * weight

        ranked = sorted(
            combined_scores.items(),
            key=lambda x: x[1]['score'],
            reverse=True
        )

        top_countries = [
            {
                'country': country_code,
                'country_name': self._country_name(country_code),
                'region': data['region'],
                'language': data['language'],
                'probability': round(data['score'], 4)
            }
            for country_code, data in self._take_top(ranked, top_n)
        ]

        top = top_countries[0] if top_countries else {}

        return {
            'first_name': self.normalize_name(first_name),
            'last_name': self.normalize_name(last_name),
            'country': top.get('country'),
            'country_name': top.get('country_name'),
            'region': top.get('region'),
            'language': top.get('language'),
            'confidence': top.get('probability', 0.0),
            'top_countries': top_countries
        }

    def predict_all(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first"
    ) -> Dict:
        """
        Predict ALL attributes at once

        Args:
            name: First or last name
            name_type: "first" or "last"

        Returns:
            {
                'name': str,
                'nationality': {...},
                'region': {...},
                'language': {...},
                'religion': {...},
                'ethnicity': {...},
                'gender': {...}  # only present for first names
            }
        """
        result = {
            'name': self.normalize_name(name),
            'nationality': self.predict_nationality(name, name_type),
            'region': self.predict_region(name, name_type),
            'language': self.predict_language(name, name_type),
            'religion': self.predict_religion(name, name_type),
            'ethnicity': self.predict_ethnicity(name, name_type)
        }

        # Gender only for first names
        if name_type == "first":
            result['gender'] = self.predict_gender(name)

        return result

    def get_stats(self) -> Dict:
        """Get database statistics

        Returns:
            Row counts for first and last names, and the number of distinct
            countries, regions and languages in the ``names`` table.
        """
        queries = (
            ('total_first_names',
             "SELECT COUNT(*) as count FROM names WHERE name_type = 'first'"),
            ('total_last_names',
             "SELECT COUNT(*) as count FROM names WHERE name_type = 'last'"),
            ('countries',
             "SELECT COUNT(DISTINCT country_code) as count FROM names"),
            ('regions',
             "SELECT COUNT(DISTINCT region) as count FROM names WHERE region IS NOT NULL"),
            ('languages',
             "SELECT COUNT(DISTINCT language) as count FROM names WHERE language IS NOT NULL"),
        )

        cursor = self.conn.cursor()
        stats = {}
        for key, sql in queries:
            cursor.execute(sql)
            stats[key] = cursor.fetchone()['count']

        return stats
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Predictor - main prediction module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Literal
|
|
8
|
+
from unidecode import unidecode
|
|
9
|
+
import pycountry
|
|
10
|
+
|
|
11
|
+
class EthniData:
|
|
12
|
+
"""Ethnicity and Nationality Data predictor"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, db_path: Optional[str] = None):
|
|
15
|
+
"""
|
|
16
|
+
Initialize EthniData predictor
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
db_path: Path to SQLite database. If None, uses default location.
|
|
20
|
+
"""
|
|
21
|
+
if db_path is None:
|
|
22
|
+
db_path = Path(__file__).parent / "ethnidata.db"
|
|
23
|
+
|
|
24
|
+
self.db_path = Path(db_path)
|
|
25
|
+
|
|
26
|
+
if not self.db_path.exists():
|
|
27
|
+
raise FileNotFoundError(
|
|
28
|
+
f"Database not found: {self.db_path}\n"
|
|
29
|
+
f"Please run scripts/6_create_database.py first"
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
self.conn = sqlite3.connect(self.db_path)
|
|
33
|
+
self.conn.row_factory = sqlite3.Row
|
|
34
|
+
|
|
35
|
+
def __del__(self):
|
|
36
|
+
"""Close database connection"""
|
|
37
|
+
if hasattr(self, 'conn'):
|
|
38
|
+
self.conn.close()
|
|
39
|
+
|
|
40
|
+
@staticmethod
|
|
41
|
+
def normalize_name(name: str) -> str:
|
|
42
|
+
"""Normalize name (lowercase, remove accents)"""
|
|
43
|
+
return unidecode(name.strip().lower())
|
|
44
|
+
|
|
45
|
+
def predict_nationality(
|
|
46
|
+
self,
|
|
47
|
+
name: str,
|
|
48
|
+
name_type: Literal["first", "last"] = "first",
|
|
49
|
+
top_n: int = 5
|
|
50
|
+
) -> Dict:
|
|
51
|
+
"""
|
|
52
|
+
Predict nationality from name
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
name: First or last name
|
|
56
|
+
name_type: "first" or "last"
|
|
57
|
+
top_n: Number of top predictions to return
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
{
|
|
61
|
+
'name': normalized name,
|
|
62
|
+
'country': top country code (ISO 3166-1 alpha-3),
|
|
63
|
+
'confidence': confidence score (0-1),
|
|
64
|
+
'top_countries': [{country, probability, frequency}, ...]
|
|
65
|
+
}
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
normalized = self.normalize_name(name)
|
|
69
|
+
|
|
70
|
+
table = "first_names" if name_type == "first" else "last_names"
|
|
71
|
+
|
|
72
|
+
# Query database
|
|
73
|
+
query = f"""
|
|
74
|
+
SELECT country_code, frequency
|
|
75
|
+
FROM {table}
|
|
76
|
+
WHERE name = ?
|
|
77
|
+
ORDER BY frequency DESC
|
|
78
|
+
LIMIT ?
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
cursor = self.conn.cursor()
|
|
82
|
+
cursor.execute(query, (normalized, top_n))
|
|
83
|
+
|
|
84
|
+
results = cursor.fetchall()
|
|
85
|
+
|
|
86
|
+
if not results:
|
|
87
|
+
return {
|
|
88
|
+
'name': normalized,
|
|
89
|
+
'country': None,
|
|
90
|
+
'confidence': 0.0,
|
|
91
|
+
'top_countries': []
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
# Calculate probabilities
|
|
95
|
+
total_frequency = sum(row['frequency'] for row in results)
|
|
96
|
+
|
|
97
|
+
top_countries = []
|
|
98
|
+
for row in results:
|
|
99
|
+
prob = row['frequency'] / total_frequency
|
|
100
|
+
|
|
101
|
+
# Country name lookup
|
|
102
|
+
try:
|
|
103
|
+
country = pycountry.countries.get(alpha_3=row['country_code'])
|
|
104
|
+
country_name = country.name if country else row['country_code']
|
|
105
|
+
except:
|
|
106
|
+
country_name = row['country_code']
|
|
107
|
+
|
|
108
|
+
top_countries.append({
|
|
109
|
+
'country': row['country_code'],
|
|
110
|
+
'country_name': country_name,
|
|
111
|
+
'probability': round(prob, 4),
|
|
112
|
+
'frequency': row['frequency']
|
|
113
|
+
})
|
|
114
|
+
|
|
115
|
+
# Top prediction
|
|
116
|
+
top = top_countries[0]
|
|
117
|
+
|
|
118
|
+
return {
|
|
119
|
+
'name': normalized,
|
|
120
|
+
'country': top['country'],
|
|
121
|
+
'country_name': top['country_name'],
|
|
122
|
+
'confidence': top['probability'],
|
|
123
|
+
'top_countries': top_countries
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
def predict_ethnicity(
    self,
    name: str,
    name_type: Literal["first", "last"] = "first"
) -> Dict:
    """
    Predict ethnicity from name

    Selects the highest-frequency row for the normalized name that carries
    an ethnicity label. When no such row exists, falls back to
    ``predict_nationality`` and reports ``ethnicity`` as ``None``.

    Args:
        name: First or last name
        name_type: "first" queries the ``first_names`` table; any other
            value queries ``last_names``

    Returns:
        When an ethnicity-labelled row is found:
        {
            'name': normalized name,
            'ethnicity': predicted ethnicity,
            'country': country code of the selected row,
            'country_name': country name (the code itself if the lookup fails),
            'frequency': frequency of the selected row,
            'confidence': share of the selected row's frequency among all
                          ethnicity-labelled rows for this name (0-1)
        }
        Otherwise (fallback to nationality):
        {
            'name': normalized name,
            'ethnicity': None,
            'country': most likely country or None,
            'country_name': country name or None,
            'confidence': confidence reported by predict_nationality
        }
    """

    normalized = self.normalize_name(name)

    # The table name comes from a fixed two-value choice, never from user
    # input, so interpolating it into the SQL text is safe. The name itself
    # is always bound as a parameter.
    table = "first_names" if name_type == "first" else "last_names"

    # Query with ethnicity
    query = f"""
        SELECT country_code, ethnicity, frequency
        FROM {table}
        WHERE name = ? AND ethnicity IS NOT NULL
        ORDER BY frequency DESC
        LIMIT 1
    """

    cursor = self.conn.cursor()
    cursor.execute(query, (normalized,))

    result = cursor.fetchone()

    if result:
        country_code = result['country_code']
        ethnicity = result['ethnicity']
        frequency = result['frequency']

        # Total frequency over every ethnicity-labelled row for this name,
        # used to turn the winning row's frequency into a 0-1 confidence.
        cursor.execute(
            f"""
            SELECT SUM(frequency) AS total
            FROM {table}
            WHERE name = ? AND ethnicity IS NOT NULL
            """,
            (normalized,)
        )
        total = cursor.fetchone()['total']
        confidence = round(frequency / total, 4) if frequency and total else 0.0

        # Country name: best effort, fall back to the raw code on any
        # lookup problem (but let KeyboardInterrupt/SystemExit propagate).
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            country_name = country.name if country else country_code
        except Exception:
            country_name = country_code

        return {
            'name': normalized,
            'ethnicity': ethnicity,
            'country': country_code,
            'country_name': country_name,
            'frequency': frequency,
            'confidence': confidence
        }

    # Fallback to nationality prediction
    # NOTE(review): predict_nationality normalizes probabilities over the
    # rows it fetched, so with top_n=1 the confidence of a hit appears to be
    # always 1.0 - confirm whether a larger top_n is wanted here.
    nationality = self.predict_nationality(name, name_type, top_n=1)

    return {
        'name': normalized,
        'ethnicity': None,
        'country': nationality['country'],
        'country_name': nationality.get('country_name'),
        'confidence': nationality['confidence']
    }
|
|
191
|
+
|
|
192
|
+
def predict_full_name(
    self,
    first_name: str,
    last_name: str,
    top_n: int = 5
) -> Dict:
    """
    Predict nationality from full name (first + last)

    Combines predictions from both first and last names: each country's
    first-name probability is weighted 0.4 and its last-name probability
    0.6, and the weighted values are summed per country.

    Args:
        first_name: First name
        last_name: Last name
        top_n: Number of top predictions requested from each name lookup
            and kept in the combined result

    Returns:
        {
            'first_name': normalized first name,
            'last_name': normalized last name,
            'country': top country code, or None if neither name matched,
            'country_name': top country name, or None,
            'confidence': combined score of the top country (0.0 if none),
            'top_countries': [{country, country_name, probability}, ...]
        }
    """

    first_pred = self.predict_nationality(first_name, "first", top_n=top_n)
    last_pred = self.predict_nationality(last_name, "last", top_n=top_n)

    # Combine probabilities. Both sides accumulate, so a country that
    # appears more than once in a single prediction list keeps all of its
    # probability mass instead of being overwritten by the last entry.
    combined_scores = {}
    weighted_predictions = ((first_pred, 0.4), (last_pred, 0.6))

    for prediction, weight in weighted_predictions:
        for item in prediction['top_countries']:
            code = item['country']
            contribution = item['probability'] * weight
            if code in combined_scores:
                combined_scores[code] += contribution
            else:
                combined_scores[code] = contribution

    # Sort by combined score
    sorted_countries = sorted(
        combined_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )[:top_n]

    # Format results
    top_countries = []
    for country_code, score in sorted_countries:
        # Best-effort name lookup; fall back to the raw code on failure
        # without swallowing KeyboardInterrupt/SystemExit.
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            country_name = country.name if country else country_code
        except Exception:
            country_name = country_code

        top_countries.append({
            'country': country_code,
            'country_name': country_name,
            'probability': round(score, 4)
        })

    best = top_countries[0] if top_countries else None

    return {
        'first_name': self.normalize_name(first_name),
        'last_name': self.normalize_name(last_name),
        'country': best['country'] if best else None,
        'country_name': best['country_name'] if best else None,
        'confidence': best['probability'] if best else 0.0,
        'top_countries': top_countries
    }
|
|
257
|
+
|
|
258
|
+
def get_stats(self) -> Dict:
    """
    Get database statistics

    Returns:
        Dict with the row counts of the ``first_names`` and ``last_names``
        tables ('total_first_names', 'total_last_names') and the number of
        distinct country codes in each ('countries_first', 'countries_last').
    """

    # (result key, query) pairs, executed in order on one shared cursor.
    stat_queries = (
        ('total_first_names',
         "SELECT COUNT(*) as count FROM first_names"),
        ('total_last_names',
         "SELECT COUNT(*) as count FROM last_names"),
        ('countries_first',
         "SELECT COUNT(DISTINCT country_code) as count FROM first_names"),
        ('countries_last',
         "SELECT COUNT(DISTINCT country_code) as count FROM last_names"),
    )

    cur = self.conn.cursor()
    collected = {}

    for key, sql in stat_queries:
        cur.execute(sql)
        collected[key] = cur.fetchone()['count']

    return collected
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ethnidata
|
|
3
|
+
Version: 3.1.5
|
|
4
|
+
Summary: Predict nationality, ethnicity, gender, region, language and religion from names - 238 countries, 6 major religions, 5.9M+ names, complete religious coverage
|
|
5
|
+
Author-email: Teyfik OZ <teyfikoz@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/teyfikoz/ethnidata
|
|
8
|
+
Project-URL: Documentation, https://github.com/teyfikoz/ethnidata#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/teyfikoz/ethnidata.git
|
|
10
|
+
Project-URL: Issues, https://github.com/teyfikoz/ethnidata/issues
|
|
11
|
+
Keywords: names,nationality,ethnicity,demographics,prediction,NLP
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pycountry>=22.3.5
|
|
26
|
+
Requires-Dist: unidecode>=1.3.6
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
30
|
+
Provides-Extra: build
|
|
31
|
+
Requires-Dist: requests>=2.31.0; extra == "build"
|
|
32
|
+
Requires-Dist: pandas>=2.0.0; extra == "build"
|
|
33
|
+
Requires-Dist: numpy>=1.24.0; extra == "build"
|
|
34
|
+
Requires-Dist: beautifulsoup4>=4.12.0; extra == "build"
|
|
35
|
+
Requires-Dist: lxml>=4.9.0; extra == "build"
|
|
36
|
+
Requires-Dist: tqdm>=4.65.0; extra == "build"
|
|
37
|
+
Requires-Dist: wikipedia-api>=0.6.0; extra == "build"
|
|
38
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == "build"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# EthniData - Ethnicity and Nationality Prediction
|
|
42
|
+
|
|
43
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
44
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
45
|
+
[PyPI version](https://badge.fury.io/py/ethnidata)
|
|
46
|
+
|
|
47
|
+
Predict **nationality**, **ethnicity**, and **demographics** from names using a comprehensive global database built from multiple authoritative sources.
|
|
48
|
+
|
|
49
|
+
## 🌟 Features
|
|
50
|
+
|
|
51
|
+
- **190+ Countries** - Comprehensive coverage from Wikipedia/Wikidata
|
|
52
|
+
- **106 Countries** - Enhanced with names-dataset
|
|
53
|
+
- **120 Years** of Olympic athlete names
|
|
54
|
+
- **Multiple Sources** - Phone directories, census data, public records
|
|
55
|
+
- **Fast Predictions** - SQLite-based for instant lookups
|
|
56
|
+
- **Normalized Data** - Unicode-aware, case-insensitive matching
|
|
57
|
+
- **Ethnicity Support** - Where available in source data
|
|
58
|
+
- **Simple API** - Easy to use Python interface
|
|
59
|
+
|
|
60
|
+
## 📊 Data Sources
|
|
61
|
+
|
|
62
|
+
1. **Wikipedia/Wikidata** - 190+ countries, biographical data with ethnicity
|
|
63
|
+
2. **names-dataset** - 106 countries, curated name lists
|
|
64
|
+
3. **Olympics Dataset** - 120 years of athlete names (271,116 records)
|
|
65
|
+
4. **Phone Directories** - Public domain name lists from multiple countries
|
|
66
|
+
5. **Census Data** - US Census and other government open data
|
|
67
|
+
|
|
68
|
+
## 🚀 Installation
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install ethnidata
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 📖 Usage
|
|
75
|
+
|
|
76
|
+
### Basic Usage
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from ethnidata import EthniData
|
|
80
|
+
|
|
81
|
+
# Initialize
|
|
82
|
+
ed = EthniData()
|
|
83
|
+
|
|
84
|
+
# Predict nationality from first name
|
|
85
|
+
result = ed.predict_nationality("Ahmet", name_type="first")
|
|
86
|
+
print(result)
|
|
87
|
+
# {
|
|
88
|
+
# 'name': 'ahmet',
|
|
89
|
+
# 'country': 'TUR',
|
|
90
|
+
# 'country_name': 'Turkey',
|
|
91
|
+
# 'confidence': 0.89,
|
|
92
|
+
# 'top_countries': [
|
|
93
|
+
# {'country': 'TUR', 'country_name': 'Turkey', 'probability': 0.89},
|
|
94
|
+
# {'country': 'DEU', 'country_name': 'Germany', 'probability': 0.07},
|
|
95
|
+
# ...
|
|
96
|
+
# ]
|
|
97
|
+
# }
|
|
98
|
+
|
|
99
|
+
# Predict from last name
|
|
100
|
+
result = ed.predict_nationality("Tanaka", name_type="last")
|
|
101
|
+
print(result['country']) # 'JPN'
|
|
102
|
+
|
|
103
|
+
# Predict from full name (combines both)
|
|
104
|
+
result = ed.predict_full_name("Wei", "Chen")
|
|
105
|
+
print(result['country']) # 'CHN'
|
|
106
|
+
|
|
107
|
+
# Predict ethnicity (when available)
|
|
108
|
+
result = ed.predict_ethnicity("Muhammad", name_type="first")
|
|
109
|
+
print(result)
|
|
110
|
+
# {
|
|
111
|
+
# 'name': 'muhammad',
|
|
112
|
+
# 'ethnicity': 'Arab',
|
|
113
|
+
# 'country': 'SAU',
|
|
114
|
+
# 'country_name': 'Saudi Arabia'
|
|
115
|
+
# }
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Advanced Usage
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# Get top 10 predictions
|
|
122
|
+
result = ed.predict_nationality("Maria", name_type="first", top_n=10)
|
|
123
|
+
|
|
124
|
+
for country in result['top_countries']:
|
|
125
|
+
print(f"{country['country_name']}: {country['probability']:.2%}")
|
|
126
|
+
# Spain: 35.4%
|
|
127
|
+
# Italy: 28.2%
|
|
128
|
+
# Portugal: 15.1%
|
|
129
|
+
# ...
|
|
130
|
+
|
|
131
|
+
# Database statistics
|
|
132
|
+
stats = ed.get_stats()
|
|
133
|
+
print(stats)
|
|
134
|
+
# {
|
|
135
|
+
# 'total_first_names': 123456,
|
|
136
|
+
# 'total_last_names': 234567,
|
|
137
|
+
# 'countries_first': 195,
|
|
138
|
+
# 'countries_last': 198
|
|
139
|
+
# }
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## 🏗️ Project Structure
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
ethnidata/
|
|
146
|
+
├── ethnidata/ # Main package
|
|
147
|
+
│ ├── __init__.py
|
|
148
|
+
│ ├── predictor.py # Core prediction logic
|
|
149
|
+
│ └── ethnidata.db # SQLite database
|
|
150
|
+
├── scripts/ # Data collection scripts
|
|
151
|
+
│ ├── 1_fetch_names_dataset.py
|
|
152
|
+
│ ├── 2_fetch_wikipedia.py
|
|
153
|
+
│ ├── 3_fetch_olympics.py
|
|
154
|
+
│ ├── 4_fetch_phone_directories.py
|
|
155
|
+
│ ├── 5_merge_all_data.py
|
|
156
|
+
│ └── 6_create_database.py
|
|
157
|
+
├── tests/ # Unit tests
|
|
158
|
+
├── examples/ # Example scripts
|
|
159
|
+
├── docs/ # Documentation
|
|
160
|
+
├── setup.py
|
|
161
|
+
├── pyproject.toml
|
|
162
|
+
└── README.md
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## 🔬 Accuracy & Methodology
|
|
166
|
+
|
|
167
|
+
### How it works
|
|
168
|
+
|
|
169
|
+
1. **Name Normalization**: Names are lowercased and Unicode-normalized (e.g., "José" → "jose")
|
|
170
|
+
2. **Database Lookup**: Queries SQLite database for matching names
|
|
171
|
+
3. **Frequency-Based Scoring**: Countries are ranked by how often the name appears
|
|
172
|
+
4. **Probability Calculation**: Frequencies are converted to probabilities
|
|
173
|
+
5. **Full Name Combination**: First name (40%) + last name (60%) weights
|
|
174
|
+
|
|
175
|
+
### Limitations
|
|
176
|
+
|
|
177
|
+
- **Bias**: The database reflects historical Olympic participation and Wikipedia coverage, so some countries and eras are over-represented
|
|
178
|
+
- **Missing Names**: Rare or new names may not be in database
|
|
179
|
+
- **Ethnicity**: Only available where source data included it
|
|
180
|
+
- **Migration**: Doesn't account for diaspora or modern migration patterns
|
|
181
|
+
- **Multiple Origins**: Common names (e.g., "Ali", "Maria") exist in many cultures
|
|
182
|
+
|
|
183
|
+
## 🛠️ Development
|
|
184
|
+
|
|
185
|
+
### Build Database from Scratch
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/teyfikoz/ethnidata.git
|
|
189
|
+
cd ethnidata
|
|
190
|
+
|
|
191
|
+
# Install dependencies
|
|
192
|
+
pip install -r requirements.txt
|
|
193
|
+
|
|
194
|
+
# Fetch all data (takes 10-30 minutes)
|
|
195
|
+
cd scripts
|
|
196
|
+
python 1_fetch_names_dataset.py
|
|
197
|
+
python 2_fetch_wikipedia.py
|
|
198
|
+
python 3_fetch_olympics.py
|
|
199
|
+
python 4_fetch_phone_directories.py
|
|
200
|
+
python 5_merge_all_data.py
|
|
201
|
+
python 6_create_database.py
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Run Tests
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
pip install -e ".[dev]"
|
|
208
|
+
pytest tests/ -v
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## 📜 License
|
|
212
|
+
|
|
213
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
|
214
|
+
|
|
215
|
+
## 🤝 Contributing
|
|
216
|
+
|
|
217
|
+
Contributions welcome! Please:
|
|
218
|
+
|
|
219
|
+
1. Fork the repository
|
|
220
|
+
2. Create a feature branch
|
|
221
|
+
3. Commit your changes
|
|
222
|
+
4. Push to the branch
|
|
223
|
+
5. Open a Pull Request
|
|
224
|
+
|
|
225
|
+
## 📚 Citations
|
|
226
|
+
|
|
227
|
+
If you use this database in research, please cite:
|
|
228
|
+
|
|
229
|
+
```bibtex
|
|
230
|
+
@software{ethnidata_2024,
|
|
231
|
+
title = {EthniData: Ethnicity and Nationality Prediction from Names},
|
|
232
|
+
author = {Oz, Teyfik},
|
|
233
|
+
year = {2024},
|
|
234
|
+
url = {https://github.com/teyfikoz/ethnidata}
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Data Source Citations
|
|
239
|
+
|
|
240
|
+
- **Olympics Data**: Randi Griffin (2018). 120 years of Olympic history. [Kaggle](https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results)
|
|
241
|
+
- **names-dataset**: Philippe Remy (2021). [name-dataset](https://github.com/philipperemy/name-dataset)
|
|
242
|
+
- **Wikidata**: Wikimedia Foundation. [Wikidata](https://www.wikidata.org)
|
|
243
|
+
|
|
244
|
+
## 🔗 Related Projects
|
|
245
|
+
|
|
246
|
+
- [ethnicolr](https://github.com/appeler/ethnicolr) - Ethnicity prediction using LSTM
|
|
247
|
+
- [name-dataset](https://github.com/philipperemy/name-dataset) - Name database (106 countries)
|
|
248
|
+
- [gender-guesser](https://github.com/lead-ratings/gender-guesser) - Gender prediction
|
|
249
|
+
|
|
250
|
+
## 📧 Contact
|
|
251
|
+
|
|
252
|
+
- GitHub Issues: [Report bugs or request features](https://github.com/teyfikoz/ethnidata/issues)
|
|
253
|
+
- GitHub: [@teyfikoz](https://github.com/teyfikoz)
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
**Built with ❤️ using open data**
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
ethnidata/__init__.py,sha256=xeg_6_eiFa05LNQKzUtQ1F1EHhQbGRrBSkq6lLuZ1CY,1858
|
|
2
|
+
ethnidata/downloader.py,sha256=GNohBtHyn_14TuWPhRUMxGNHy0UieXzwFCC5z-oiVQs,5057
|
|
3
|
+
ethnidata/ethnidata.db,sha256=edDXYMOoNNtVprbYGqQ7qPgdn2U7e6Y1PPyocBADVOs,78405632
|
|
4
|
+
ethnidata/predictor.py,sha256=fmmLSVpluMpBeKxABAwT7OVIaY4c_H5TksiZu7dETEQ,17687
|
|
5
|
+
ethnidata/predictor_old.py,sha256=dGmfYWTO2BRYxQUzzE7foZMEEDaOd6VWPZk4ib5Gp9E,8696
|
|
6
|
+
ethnidata-3.1.5.dist-info/licenses/LICENSE,sha256=p5pRNvuSoG_JxH4Xy11FK2iXc3hyAnzOKUx9gBltulk,1095
|
|
7
|
+
ethnidata-3.1.5.dist-info/METADATA,sha256=WEsx8FKprwzqhA2GmYGN7i52FzTCpqs-lTZqMV2040M,8182
|
|
8
|
+
ethnidata-3.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
ethnidata-3.1.5.dist-info/top_level.txt,sha256=V5Cuyv_Ib3mDSp2KL8MocXzLyp3o3r2FG01rFA7Iatk,10
|
|
10
|
+
ethnidata-3.1.5.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 NBD Database Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ethnidata
|