ethnidata 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ethnidata might be problematic. Click here for more details.

@@ -0,0 +1,277 @@
1
+ """
2
+ EthniData Predictor - main prediction module
3
+ """
4
+
5
+ import sqlite3
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Literal
8
+ from unidecode import unidecode
9
+ import pycountry
10
+
11
class EthniData:
    """Ethnicity and nationality predictor backed by a SQLite frequency database.

    The queries in this class read the ``first_names`` and ``last_names``
    tables and their ``name``, ``country_code``, ``ethnicity`` and
    ``frequency`` columns.

    Each instance owns one SQLite connection. Call :meth:`close` (or use the
    instance as a context manager) to release it deterministically; the
    destructor closes it as a last resort.
    """

    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize EthniData predictor

        Args:
            db_path: Path to SQLite database. If None, uses ``ethnidata.db``
                located next to this module.

        Raises:
            FileNotFoundError: If the database file does not exist.
        """
        if db_path is None:
            db_path = Path(__file__).parent / "ethnidata.db"

        self.db_path = Path(db_path)

        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database not found: {self.db_path}\n"
                f"Please run scripts/6_create_database.py first"
            )

        self.conn = sqlite3.connect(self.db_path)
        # Row objects allow column access by name (row['frequency']).
        self.conn.row_factory = sqlite3.Row

    def close(self) -> None:
        """Close the database connection (safe to call more than once)."""
        # hasattr guard: __init__ may have raised before `conn` was assigned.
        if hasattr(self, 'conn'):
            self.conn.close()

    def __enter__(self) -> "EthniData":
        """Return self so the predictor can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the connection when leaving the ``with`` block."""
        self.close()

    def __del__(self):
        """Close database connection"""
        self.close()

    @staticmethod
    def normalize_name(name: str) -> str:
        """Normalize name (strip, lowercase, transliterate via unidecode)"""
        return unidecode(name.strip().lower())

    @staticmethod
    def _table_for(name_type: str) -> str:
        """Map a name type to its table; anything other than "first" uses last_names."""
        return "first_names" if name_type == "first" else "last_names"

    @staticmethod
    def _country_name(country_code: str) -> str:
        """Return the pycountry display name for an alpha-3 code, or the code itself."""
        try:
            country = pycountry.countries.get(alpha_3=country_code)
        except Exception:
            # Best-effort lookup: a failed lookup must not break a prediction,
            # but unlike a bare `except:` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return country_code
        return country.name if country else country_code

    def _total_frequency(self, table: str, normalized: str) -> float:
        """Sum the frequency of *normalized* over every row of *table* (0 if absent)."""
        # `table` only ever comes from _table_for, so interpolating it is safe.
        row = self.conn.execute(
            f"SELECT COALESCE(SUM(frequency), 0) FROM {table} WHERE name = ?",
            (normalized,),
        ).fetchone()
        return row[0]

    def predict_nationality(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first",
        top_n: int = 5
    ) -> Dict:
        """
        Predict nationality from name

        Probabilities are the share of the name's total frequency across all
        countries in the table, not just across the ``top_n`` rows returned.
        Normalising over the returned rows only would make ``top_n=1`` always
        report a confidence of 1.0.

        Args:
            name: First or last name
            name_type: "first" or "last"
            top_n: Number of top predictions to return

        Returns:
            {
                'name': normalized name,
                'country': top country code as stored in the database, or None,
                'country_name': display name of the top country, or None,
                'confidence': probability of the top country (0-1),
                'top_countries': [{country, country_name, probability, frequency}, ...]
            }
        """
        normalized = self.normalize_name(name)
        table = self._table_for(name_type)

        query = f"""
            SELECT country_code, frequency
            FROM {table}
            WHERE name = ?
            ORDER BY frequency DESC
            LIMIT ?
        """
        rows = self.conn.execute(query, (normalized, top_n)).fetchall()

        if not rows:
            return {
                'name': normalized,
                'country': None,
                'country_name': None,
                'confidence': 0.0,
                'top_countries': []
            }

        total_frequency = self._total_frequency(table, normalized)

        top_countries = []
        for row in rows:
            frequency = row['frequency']
            # Guard against a table whose frequencies are all zero.
            prob = frequency / total_frequency if total_frequency > 0 else 0.0
            top_countries.append({
                'country': row['country_code'],
                'country_name': self._country_name(row['country_code']),
                'probability': round(prob, 4),
                'frequency': frequency
            })

        top = top_countries[0]

        return {
            'name': normalized,
            'country': top['country'],
            'country_name': top['country_name'],
            'confidence': top['probability'],
            'top_countries': top_countries
        }

    def predict_ethnicity(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first"
    ) -> Dict:
        """
        Predict ethnicity from name

        Uses the most frequent row that carries an ethnicity label. If the
        name has no labelled row, falls back to the nationality prediction
        and reports ``ethnicity`` as None.

        Args:
            name: First or last name
            name_type: "first" or "last"

        Returns:
            {
                'name': normalized name,
                'ethnicity': predicted ethnicity or None,
                'country': most likely country code or None,
                'country_name': display name of that country or None,
                'confidence': share of the name's total frequency (0-1),
                'frequency': frequency of the selected row or None
            }
        """
        normalized = self.normalize_name(name)
        table = self._table_for(name_type)

        query = f"""
            SELECT country_code, ethnicity, frequency
            FROM {table}
            WHERE name = ? AND ethnicity IS NOT NULL
            ORDER BY frequency DESC
            LIMIT 1
        """
        result = self.conn.execute(query, (normalized,)).fetchone()

        if result:
            total_frequency = self._total_frequency(table, normalized)
            frequency = result['frequency']
            confidence = frequency / total_frequency if total_frequency > 0 else 0.0
            return {
                'name': normalized,
                'ethnicity': result['ethnicity'],
                'country': result['country_code'],
                'country_name': self._country_name(result['country_code']),
                'confidence': round(confidence, 4),
                'frequency': frequency
            }

        # Fallback to nationality prediction
        nationality = self.predict_nationality(name, name_type, top_n=1)
        top_countries = nationality['top_countries']

        return {
            'name': normalized,
            'ethnicity': None,
            'country': nationality['country'],
            'country_name': nationality.get('country_name'),
            'confidence': nationality['confidence'],
            'frequency': top_countries[0]['frequency'] if top_countries else None
        }

    def predict_full_name(
        self,
        first_name: str,
        last_name: str,
        top_n: int = 5,
        *,
        first_weight: float = 0.4,
        last_weight: float = 0.6
    ) -> Dict:
        """
        Predict nationality from full name (first + last)

        Each country's score is the weighted sum of its first-name and
        last-name probabilities. A country that appears for only one of the
        two names receives only that name's weighted share.

        Args:
            first_name: First name
            last_name: Last name
            top_n: Number of top predictions
            first_weight: Weight applied to first-name probabilities
            last_weight: Weight applied to last-name probabilities

        Returns:
            {
                'first_name': normalized first name,
                'last_name': normalized last name,
                'country': top country code or None,
                'country_name': display name of the top country or None,
                'confidence': combined score of the top country (0.0 if none),
                'top_countries': [{country, country_name, probability}, ...]
            }
        """
        first_pred = self.predict_nationality(first_name, "first", top_n=top_n)
        last_pred = self.predict_nationality(last_name, "last", top_n=top_n)

        # Accumulate weighted probabilities per country code.
        combined_scores: Dict[str, float] = {}
        for prediction, weight in ((first_pred, first_weight), (last_pred, last_weight)):
            for item in prediction['top_countries']:
                code = item['country']
                combined_scores[code] = combined_scores.get(code, 0.0) + item['probability'] * weight

        ranked = sorted(
            combined_scores.items(),
            key=lambda pair: pair[1],
            reverse=True
        )[:top_n]

        top_countries = [
            {
                'country': country_code,
                'country_name': self._country_name(country_code),
                'probability': round(score, 4)
            }
            for country_code, score in ranked
        ]
        best = top_countries[0] if top_countries else None

        return {
            'first_name': self.normalize_name(first_name),
            'last_name': self.normalize_name(last_name),
            'country': best['country'] if best else None,
            'country_name': best['country_name'] if best else None,
            'confidence': best['probability'] if best else 0.0,
            'top_countries': top_countries
        }

    def get_stats(self) -> Dict:
        """Get database statistics (row counts and distinct country counts per table)"""
        queries = {
            'total_first_names': "SELECT COUNT(*) as count FROM first_names",
            'total_last_names': "SELECT COUNT(*) as count FROM last_names",
            'countries_first': "SELECT COUNT(DISTINCT country_code) as count FROM first_names",
            'countries_last': "SELECT COUNT(DISTINCT country_code) as count FROM last_names",
        }

        cursor = self.conn.cursor()
        stats = {}
        for key, sql in queries.items():
            cursor.execute(sql)
            stats[key] = cursor.fetchone()['count']

        return stats
@@ -0,0 +1,23 @@
1
+ """
2
+ EthniData Synthetic Data Engine (v2.0)
3
+
4
+ Privacy-safe synthetic name population generator.
5
+
6
+ License: MIT
7
+ """
8
+
9
+ from .engine import (
10
+ SyntheticDataEngine,
11
+ SyntheticConfig,
12
+ SyntheticRecord,
13
+ FrequencyProvider,
14
+ WeightedSampler
15
+ )
16
+
17
+ __all__ = [
18
+ 'SyntheticDataEngine',
19
+ 'SyntheticConfig',
20
+ 'SyntheticRecord',
21
+ 'FrequencyProvider',
22
+ 'WeightedSampler'
23
+ ]
@@ -0,0 +1,253 @@
1
+ """
2
+ EthniData Synthetic Data Engine (v2.0)
3
+ License: MIT
4
+
5
+ Goal:
6
+ - Generate privacy-safe, statistically plausible synthetic name populations
7
+ - No real-person generation: sampling from aggregated frequency tables
8
+ - Deterministic with seed
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from typing import Dict, List, Optional, Any
15
+ import random
16
+ import csv
17
+ import json
18
+ from collections import Counter
19
+
20
+
21
@dataclass(frozen=True)
class SyntheticConfig:
    """Immutable settings for one synthetic-population run."""

    # Seed for the random generator that drives all sampling.
    seed: int = 42
    # Number of records to generate.
    size: int = 10000
    # Base origin country code for the population.
    country: str = "TUR"
    # Optional context country; when unset, no diaspora mixture is built.
    context_country: Optional[str] = None
    # Share of the mixture weight assigned to non-base origin countries.
    diaspora_ratio: float = 0.15
    # Multiplier applied to each non-base origin weight.
    diaspora_strength: float = 1.0
    # Values above 1 flatten name frequencies (weight = freq ** (1 / boost)).
    rare_name_boost: float = 1.0
    # Relative +/- jitter applied to each name weight before sampling.
    noise_level: float = 0.02
    # NOTE(review): first_weight / last_weight are not read by the engine
    # code visible here - confirm where they are consumed.
    first_weight: float = 0.4
    last_weight: float = 0.6
    # Attach nationality predictions to every record.
    include_probabilities: bool = True
    # Attach an ethnicity profile to every record.
    include_ethnicity_profile: bool = False
    # Export format; the engine's export() accepts "csv" or "jsonl".
    export_format: str = "csv"
    # Destination file used by export().
    output_path: str = "synthetic_population.csv"
38
+
39
@dataclass
class SyntheticRecord:
    """One generated person: sampled names plus optional prediction output."""

    # Sampled first name.
    first_name: str
    # Sampled last name.
    last_name: str
    # Country the names were sampled from.
    origin_country: str
    # Context country copied from the run configuration (may be None).
    context_country: Optional[str]
    # Top predicted country, filled in when probabilities are requested.
    nationality_top1: Optional[str] = None
    # Ranked country predictions, filled in when probabilities are requested.
    nationality_topk: Optional[List[Dict[str, Any]]] = None
    # Ethnicity profile, filled in when an ethnicity profile is requested.
    ethnicity_topk: Optional[List[Dict[str, Any]]] = None
49
+
50
class WeightedSampler:
    """Weighted sampler over a fixed item list using a cumulative distribution.

    Item ``i`` owns the half-open interval ``[cdf[i-1], cdf[i])`` of the unit
    range, so an item with zero weight owns an empty interval and is never
    returned.
    """

    def __init__(self, items: List[str], weights: List[float], rng: random.Random):
        """Build the normalised cumulative distribution.

        Args:
            items: Values to sample from.
            weights: One non-negative, finite weight per item.
            rng: Source of uniform draws; only ``rng.random()`` is used.

        Raises:
            ValueError: If the lists are empty or differ in length, the total
                weight is not positive, or any weight is negative, NaN or
                infinite.
        """
        if len(items) != len(weights) or not items:
            raise ValueError("items/weights mismatch or empty")
        total = sum(weights)
        if total <= 0:
            raise ValueError("non-positive total weight")
        # A negative weight makes the CDF non-monotonic and NaN/inf poison
        # every entry; either would make the binary search return garbage.
        # The chained comparison is False for NaN, so NaN is rejected too.
        if any(not (0 <= w < float("inf")) for w in weights):
            raise ValueError("weights must be finite and non-negative")

        self.items = items
        self.cdf: List[float] = []
        cum = 0.0
        last_positive = 0
        for idx, w in enumerate(weights):
            cum += w / total
            self.cdf.append(cum)
            if w > 0:
                last_positive = idx
        # Upper bound for the search: rounding can leave the final CDF value
        # slightly below 1.0, and trailing zero-weight items must stay
        # unreachable even then.
        self._last = last_positive
        self.rng = rng

    def sample(self) -> str:
        """Draw one item with probability proportional to its weight."""
        r = self.rng.random()
        lo, hi = 0, self._last
        # Find the first index whose cumulative value is strictly above r.
        # The comparison is strict because r lies in [0, 1): with `<=`, a draw
        # landing exactly on a boundary would select a zero-weight item.
        while lo < hi:
            mid = (lo + hi) // 2
            if r < self.cdf[mid]:
                hi = mid
            else:
                lo = mid + 1
        return self.items[lo]
77
+
78
+
79
class FrequencyProvider:
    """
    Adapter interface between the synthetic engine and an EthniData backend.

    Subclasses must supply the two frequency lookups. The remaining methods
    return neutral defaults, so a provider only overrides them when it can
    supply migration weights or predictions.
    """

    def get_first_name_freq(self, country: str) -> Dict[str, int]:
        """Return a first-name -> frequency mapping for *country* (must be overridden)."""
        raise NotImplementedError()

    def get_last_name_freq(self, country: str) -> Dict[str, int]:
        """Return a last-name -> frequency mapping for *country* (must be overridden)."""
        raise NotImplementedError()

    def get_migration_weights(self, context_country: str) -> Dict[str, float]:
        """Return origin-country -> weight for *context_country*; empty by default."""
        no_weights: Dict[str, float] = {}
        return no_weights

    def predict_full_name(self, first: str, last: str, context_country: Optional[str] = None) -> Dict[str, Any]:
        """Return a nationality prediction for the full name; an empty prediction by default."""
        empty_prediction: Dict[str, Any] = {"country": None, "top_countries": []}
        return empty_prediction

    def predict_ethnicity(self, name: str, name_type: str = "first", context_country: Optional[str] = None) -> Dict[str, Any]:
        """Return an ethnicity prediction for *name*; an empty profile by default."""
        empty_profile: Dict[str, Any] = {"top_ethnicities": []}
        return empty_profile
98
+
99
+
100
class SyntheticDataEngine:
    """
    Privacy-safe synthetic data generator.

    Names are sampled from the frequency tables exposed by a
    FrequencyProvider; all randomness comes from a generator seeded with
    ``cfg.seed``.
    """

    def __init__(self, freq_provider: FrequencyProvider):
        """Store the provider used for frequency tables and predictions."""
        self.freq_provider = freq_provider

    def generate(self, cfg: SyntheticConfig) -> List[SyntheticRecord]:
        """Generate ``cfg.size`` synthetic records.

        Frequency tables are fetched from the provider once per origin
        country and reused for the rest of the run, instead of being
        re-queried for every record. The sequence of random draws is
        unaffected by the caching.

        Args:
            cfg: Run configuration.

        Returns:
            The generated records, in generation order.

        Raises:
            ValueError: If *cfg* fails validation.
        """
        self._validate_cfg(cfg)
        rng = random.Random(cfg.seed)

        mixture = self._build_country_mixture(cfg, rng)
        provider = self.freq_provider
        first_freqs: Dict[str, Dict[str, int]] = {}
        last_freqs: Dict[str, Dict[str, int]] = {}
        records: List[SyntheticRecord] = []

        for _ in range(cfg.size):
            origin_country = mixture.sample()

            if origin_country not in first_freqs:
                first_freqs[origin_country] = provider.get_first_name_freq(origin_country)
            first = self._sample_name(
                first_freqs[origin_country],
                rng, cfg.rare_name_boost, cfg.noise_level
            )

            if origin_country not in last_freqs:
                last_freqs[origin_country] = provider.get_last_name_freq(origin_country)
            last = self._sample_name(
                last_freqs[origin_country],
                rng, cfg.rare_name_boost, cfg.noise_level
            )

            rec = SyntheticRecord(
                first_name=first,
                last_name=last,
                origin_country=origin_country,
                context_country=cfg.context_country
            )

            if cfg.include_probabilities:
                nat = provider.predict_full_name(first, last, context_country=cfg.context_country)
                rec.nationality_top1 = nat.get("country")
                rec.nationality_topk = nat.get("top_countries")

            if cfg.include_ethnicity_profile:
                eth = provider.predict_ethnicity(first, name_type="first", context_country=cfg.context_country)
                # Accept either result key; an empty or missing profile becomes None.
                rec.ethnicity_topk = eth.get("top_ethnicities") or eth.get("ethnic_profile") or None

            records.append(rec)

        return records

    def export(self, records: List[SyntheticRecord], cfg: SyntheticConfig) -> None:
        """Write *records* to ``cfg.output_path`` in ``cfg.export_format``.

        Raises:
            ValueError: If the format is neither "csv" nor "jsonl".
        """
        if cfg.export_format == "csv":
            self._export_csv(records, cfg.output_path)
        elif cfg.export_format == "jsonl":
            self._export_jsonl(records, cfg.output_path)
        else:
            raise ValueError(f"Unsupported export_format: {cfg.export_format}")

    def sanity_report(self, records: List[SyntheticRecord]) -> Dict[str, Any]:
        """Distribution checks for debugging: counts and the ten most common values."""
        origins = Counter(r.origin_country for r in records)
        firsts = Counter(r.first_name for r in records)
        lasts = Counter(r.last_name for r in records)

        return {
            "n": len(records),
            "unique_first_names": len(firsts),
            "unique_last_names": len(lasts),
            "top_origin_countries": origins.most_common(10),
            "top_first_names": firsts.most_common(10),
            "top_last_names": lasts.most_common(10),
        }

    def _build_country_mixture(self, cfg: SyntheticConfig, rng: random.Random) -> WeightedSampler:
        """Build the sampler that picks an origin country for each record.

        Without a context country, or with a non-positive diaspora ratio, the
        base country is the only origin. Otherwise the base country gets
        weight ``1 - diaspora_ratio`` and every other origin gets its share
        of the migration weights scaled by ``diaspora_ratio *
        diaspora_strength``. Every weight is floored at 1e-9.
        """
        base_country = cfg.country

        if not cfg.context_country or cfg.diaspora_ratio <= 0:
            return WeightedSampler([base_country], [1.0], rng)

        # Copy so the provider's mapping is never mutated.
        mig = dict(self.freq_provider.get_migration_weights(cfg.context_country))
        # Only the presence of the base country matters here; its weight is
        # taken from diaspora_ratio below, not from this value.
        mig.setdefault(base_country, 1.0)

        origins = list(mig.keys())
        total_other = sum(w for c, w in mig.items() if c != base_country)

        weights = []
        for c in origins:
            if c == base_country:
                weights.append(max(1e-9, 1.0 - cfg.diaspora_ratio))
            elif total_other <= 0:
                weights.append(1e-9)
            else:
                w = (mig[c] / total_other) * cfg.diaspora_ratio * cfg.diaspora_strength
                weights.append(max(1e-9, w))

        return WeightedSampler(origins, weights, rng)

    def _sample_name(self, freq_map: Dict[str, int], rng: random.Random, rare_name_boost: float, noise_level: float) -> str:
        """Sample one name from *freq_map*, or return "unknown" when it is empty.

        Each weight is ``max(1, freq) ** (1 / rare_name_boost)``, optionally
        jittered by a uniform relative noise of +/- ``noise_level``.
        """
        if not freq_map:
            return "unknown"

        items = list(freq_map.keys())
        # Loop-invariant: depends only on the boost, not on the name.
        exponent = 1.0 / max(1e-9, rare_name_boost)
        weights: List[float] = []

        for name in items:
            f = max(1, int(freq_map[name]))
            w = float(f) ** exponent

            if noise_level > 0:
                w *= (1.0 + rng.uniform(-noise_level, noise_level))

            weights.append(max(1e-12, w))

        return WeightedSampler(items, weights, rng).sample()

    def _export_csv(self, records: List[SyntheticRecord], path: str) -> None:
        """Write *records* as UTF-8 CSV; list-valued fields are JSON-encoded, empty ones are blank."""
        fieldnames = ["first_name", "last_name", "origin_country", "context_country", "nationality_top1", "nationality_topk", "ethnicity_topk"]
        with open(path, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            for r in records:
                w.writerow({
                    "first_name": r.first_name,
                    "last_name": r.last_name,
                    "origin_country": r.origin_country,
                    "context_country": r.context_country or "",
                    "nationality_top1": r.nationality_top1 or "",
                    "nationality_topk": json.dumps(r.nationality_topk, ensure_ascii=False) if r.nationality_topk else "",
                    "ethnicity_topk": json.dumps(r.ethnicity_topk, ensure_ascii=False) if r.ethnicity_topk else "",
                })

    def _export_jsonl(self, records: List[SyntheticRecord], path: str) -> None:
        """Write *records* as UTF-8 JSON Lines, one object per record."""
        with open(path, "w", encoding="utf-8") as f:
            for r in records:
                obj = {
                    "first_name": r.first_name,
                    "last_name": r.last_name,
                    "origin_country": r.origin_country,
                    "context_country": r.context_country,
                    "nationality_top1": r.nationality_top1,
                    "nationality_topk": r.nationality_topk,
                    "ethnicity_topk": r.ethnicity_topk,
                }
                f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    def _validate_cfg(self, cfg: SyntheticConfig) -> None:
        """Raise ValueError when a numeric setting is out of range."""
        if cfg.size <= 0:
            raise ValueError("size must be > 0")
        if not (0.0 <= cfg.diaspora_ratio <= 1.0):
            raise ValueError("diaspora_ratio must be in [0,1]")
        if cfg.noise_level < 0:
            raise ValueError("noise_level must be >= 0")
        if cfg.rare_name_boost <= 0:
            raise ValueError("rare_name_boost must be > 0")