ethnidata 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ethnidata might be problematic. Click here for more details.
- ethnidata/__init__.py +96 -0
- ethnidata/downloader.py +143 -0
- ethnidata/ethnidata.db +0 -0
- ethnidata/explainability.py +233 -0
- ethnidata/morphology.py +286 -0
- ethnidata/predictor.py +699 -0
- ethnidata/predictor_old.py +277 -0
- ethnidata/synthetic/__init__.py +23 -0
- ethnidata/synthetic/engine.py +253 -0
- ethnidata-4.0.1.dist-info/METADATA +534 -0
- ethnidata-4.0.1.dist-info/RECORD +14 -0
- ethnidata-4.0.1.dist-info/WHEEL +5 -0
- ethnidata-4.0.1.dist-info/licenses/LICENSE +21 -0
- ethnidata-4.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Predictor - Main prediction module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Literal
|
|
8
|
+
from unidecode import unidecode
|
|
9
|
+
import pycountry
|
|
10
|
+
|
|
11
|
+
class EthniData:
    """Predict nationality and ethnicity from names using a SQLite database.

    The queries below read the ``first_names`` and ``last_names`` tables and
    their ``name``, ``country_code``, ``frequency`` and ``ethnicity`` columns.
    Instances can be used as context managers so the connection is closed
    deterministically instead of relying on garbage collection.
    """

    # Relative weight of each name part when predict_full_name() merges the
    # first-name and last-name predictions.
    _FIRST_NAME_WEIGHT = 0.4
    _LAST_NAME_WEIGHT = 0.6

    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize EthniData predictor

        Args:
            db_path: Path to SQLite database. If None, uses ``ethnidata.db``
                located next to this module.

        Raises:
            FileNotFoundError: If the database file does not exist.
        """
        if db_path is None:
            db_path = Path(__file__).parent / "ethnidata.db"

        self.db_path = Path(db_path)

        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database not found: {self.db_path}\n"
                f"Please run scripts/6_create_database.py first"
            )

        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row

    def close(self) -> None:
        """Close the database connection; safe to call more than once."""
        # __init__ may have raised before ``conn`` was assigned.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    def __enter__(self):
        """Return the predictor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection when leaving the ``with`` block."""
        self.close()

    def __del__(self):
        """Close database connection"""
        self.close()

    @staticmethod
    def normalize_name(name: str) -> str:
        """Normalize name (strip, lowercase, transliterate via unidecode)"""
        return unidecode(name.strip().lower())

    @staticmethod
    def _table_for(name_type: str) -> str:
        """Return the table for *name_type*; anything but "first" maps to last names."""
        # The table name is interpolated into SQL, so it must only ever come
        # from this fixed pair of identifiers.
        return "first_names" if name_type == "first" else "last_names"

    @staticmethod
    def _country_name(country_code: str) -> str:
        """Return the pycountry display name for *country_code*, or the code itself."""
        try:
            country = pycountry.countries.get(alpha_3=country_code)
        except LookupError:
            # NOTE(review): some pycountry releases appear to raise KeyError
            # for unknown codes instead of returning None - confirm.
            return country_code
        return country.name if country else country_code

    def predict_nationality(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first",
        top_n: int = 5
    ) -> Dict:
        """
        Predict nationality from name

        Args:
            name: First or last name
            name_type: "first" or "last"
            top_n: Number of top predictions to return

        Returns:
            {
                'name': normalized name,
                'country': top country code as stored in the database,
                'country_name': display name of the top country,
                'confidence': probability of the top country (0-1),
                'top_countries': [{country, country_name, probability, frequency}, ...]
            }
            When the name is unknown, 'country' is None, 'confidence' is 0.0,
            'top_countries' is empty and 'country_name' is omitted.
        """
        normalized = self.normalize_name(name)
        table = self._table_for(name_type)
        cursor = self.conn.cursor()

        # Probabilities are normalised against *every* row for the name, not
        # just the rows that survive LIMIT; otherwise top_n=1 would always
        # report a confidence of 1.0.
        cursor.execute(
            f"SELECT COALESCE(SUM(frequency), 0) AS total FROM {table} WHERE name = ?",
            (normalized,)
        )
        total_frequency = cursor.fetchone()['total']

        cursor.execute(
            f"""
            SELECT country_code, frequency
            FROM {table}
            WHERE name = ?
            ORDER BY frequency DESC
            LIMIT ?
            """,
            (normalized, top_n)
        )
        results = cursor.fetchall()

        if not results:
            return {
                'name': normalized,
                'country': None,
                'confidence': 0.0,
                'top_countries': []
            }

        top_countries = []
        for row in results:
            # Guard against a table whose frequencies are all zero.
            prob = row['frequency'] / total_frequency if total_frequency else 0.0
            top_countries.append({
                'country': row['country_code'],
                'country_name': self._country_name(row['country_code']),
                'probability': round(prob, 4),
                'frequency': row['frequency']
            })

        top = top_countries[0]

        return {
            'name': normalized,
            'country': top['country'],
            'country_name': top['country_name'],
            'confidence': top['probability'],
            'top_countries': top_countries
        }

    def predict_ethnicity(
        self,
        name: str,
        name_type: Literal["first", "last"] = "first"
    ) -> Dict:
        """
        Predict ethnicity from name

        Args:
            name: First or last name
            name_type: "first" or "last"

        Returns:
            {
                'name': normalized name,
                'ethnicity': predicted ethnicity (None if no row carries one),
                'country': most likely country,
                'country_name': display name of that country,
                'frequency': frequency of the matched row (ethnicity match only),
                'confidence': confidence score (0-1)
            }
            For an ethnicity match, 'confidence' is the matched row's share
            of the frequency of all rows for the name that carry an ethnicity.
            Otherwise the result falls back to predict_nationality().
        """
        normalized = self.normalize_name(name)
        table = self._table_for(name_type)
        cursor = self.conn.cursor()

        cursor.execute(
            f"""
            SELECT country_code, ethnicity, frequency
            FROM {table}
            WHERE name = ? AND ethnicity IS NOT NULL
            ORDER BY frequency DESC
            LIMIT 1
            """,
            (normalized,)
        )
        result = cursor.fetchone()

        if result is not None:
            cursor.execute(
                f"""
                SELECT COALESCE(SUM(frequency), 0) AS total
                FROM {table}
                WHERE name = ? AND ethnicity IS NOT NULL
                """,
                (normalized,)
            )
            total = cursor.fetchone()['total']
            confidence = round(result['frequency'] / total, 4) if total else 0.0

            return {
                'name': normalized,
                'ethnicity': result['ethnicity'],
                'country': result['country_code'],
                'country_name': self._country_name(result['country_code']),
                'frequency': result['frequency'],
                'confidence': confidence
            }

        # Fallback to nationality prediction
        nationality = self.predict_nationality(name, name_type, top_n=1)

        return {
            'name': normalized,
            'ethnicity': None,
            'country': nationality['country'],
            'country_name': nationality.get('country_name'),
            'confidence': nationality['confidence']
        }

    def predict_full_name(
        self,
        first_name: str,
        last_name: str,
        top_n: int = 5
    ) -> Dict:
        """
        Predict nationality from full name (first + last)

        Combines the per-country probabilities of both name parts as a
        weighted sum (0.4 for the first name, 0.6 for the last name).

        Args:
            first_name: First name
            last_name: Last name
            top_n: Number of top predictions

        Returns:
            Combined prediction with 'first_name', 'last_name', 'country',
            'country_name', 'confidence' and 'top_countries'. The country
            fields are None and 'confidence' is 0.0 when neither part is known.
        """
        weighted_predictions = (
            (self.predict_nationality(first_name, "first", top_n=top_n), self._FIRST_NAME_WEIGHT),
            (self.predict_nationality(last_name, "last", top_n=top_n), self._LAST_NAME_WEIGHT),
        )

        # Combine probabilities
        combined_scores = {}
        for prediction, weight in weighted_predictions:
            for item in prediction['top_countries']:
                code = item['country']
                combined_scores[code] = combined_scores.get(code, 0.0) + item['probability'] * weight

        # Sort by combined score
        sorted_countries = sorted(
            combined_scores.items(),
            key=lambda pair: pair[1],
            reverse=True
        )[:top_n]

        top_countries = [
            {
                'country': country_code,
                'country_name': self._country_name(country_code),
                'probability': round(score, 4)
            }
            for country_code, score in sorted_countries
        ]
        best = top_countries[0] if top_countries else None

        return {
            'first_name': self.normalize_name(first_name),
            'last_name': self.normalize_name(last_name),
            'country': best['country'] if best else None,
            'country_name': best['country_name'] if best else None,
            'confidence': best['probability'] if best else 0.0,
            'top_countries': top_countries
        }

    def get_stats(self) -> Dict:
        """Get database statistics (row counts and distinct country counts per table)"""
        queries = {
            'total_first_names': "SELECT COUNT(*) as count FROM first_names",
            'total_last_names': "SELECT COUNT(*) as count FROM last_names",
            'countries_first': "SELECT COUNT(DISTINCT country_code) as count FROM first_names",
            'countries_last': "SELECT COUNT(DISTINCT country_code) as count FROM last_names",
        }

        cursor = self.conn.cursor()
        stats = {}
        for key, sql in queries.items():
            cursor.execute(sql)
            stats[key] = cursor.fetchone()['count']

        return stats
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Synthetic Data Engine (v2.0)
|
|
3
|
+
|
|
4
|
+
Privacy-safe synthetic name population generator.
|
|
5
|
+
|
|
6
|
+
License: MIT
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .engine import (
|
|
10
|
+
SyntheticDataEngine,
|
|
11
|
+
SyntheticConfig,
|
|
12
|
+
SyntheticRecord,
|
|
13
|
+
FrequencyProvider,
|
|
14
|
+
WeightedSampler
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Names exported by ``from ... import *`` for this package.
__all__ = [
    "SyntheticDataEngine",
    "SyntheticConfig",
    "SyntheticRecord",
    "FrequencyProvider",
    "WeightedSampler",
]
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EthniData Synthetic Data Engine (v2.0)
|
|
3
|
+
License: MIT
|
|
4
|
+
|
|
5
|
+
Goal:
|
|
6
|
+
- Generate privacy-safe, statistically plausible synthetic name populations
|
|
7
|
+
- No real-person generation: sampling from aggregated frequency tables
|
|
8
|
+
- Deterministic with seed
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import bisect
import csv
import json
import random
from collections import Counter
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class SyntheticConfig:
    """Immutable settings for one synthetic-population run."""

    # Seed for the run's random.Random instance.
    seed: int = 42
    # Number of records to generate.
    size: int = 10000
    # Base origin country of the population.
    country: str = "TUR"
    # Context country; when set (and diaspora_ratio > 0) origins are mixed
    # using the provider's migration weights for it.
    context_country: Optional[str] = None
    # Share of the mixture given to non-base origin countries, in [0, 1].
    diaspora_ratio: float = 0.15
    # Multiplier applied to the non-base origin weights.
    diaspora_strength: float = 1.0
    # Name frequencies are raised to the power 1 / rare_name_boost.
    rare_name_boost: float = 1.0
    # Half-width of the uniform multiplicative noise applied to name weights.
    noise_level: float = 0.02
    # NOTE(review): first_weight / last_weight are not read by the engine
    # code in this module - presumably consumed by a provider; confirm.
    first_weight: float = 0.4
    last_weight: float = 0.6
    # Attach the provider's nationality prediction to every record.
    include_probabilities: bool = True
    # Attach the provider's ethnicity prediction to every record.
    include_ethnicity_profile: bool = False
    # Export format: "csv" or "jsonl".
    export_format: str = "csv"
    # Destination file used when exporting.
    output_path: str = "synthetic_population.csv"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class SyntheticRecord:
    """One generated person: sampled names plus optional predictions."""

    # Sampled given name and family name.
    first_name: str
    last_name: str
    # Country whose frequency tables the names were drawn from.
    origin_country: str
    # Context country copied from the run configuration (may be None).
    context_country: Optional[str]
    # Provider's nationality prediction: best country and ranked candidates.
    nationality_top1: Optional[str] = None
    nationality_topk: Optional[List[Dict[str, Any]]] = None
    # Provider's ethnicity prediction for the first name, when requested.
    ethnicity_topk: Optional[List[Dict[str, Any]]] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class WeightedSampler:
    """Draw items with probability proportional to their weight.

    The normalised cumulative distribution is built once in the constructor;
    each draw consumes one ``rng.random()`` value and binary-searches it.
    """

    def __init__(self, items: List[str], weights: List[float], rng: random.Random):
        """
        Args:
            items: Candidate values; must be non-empty.
            weights: One non-negative weight per item, with a positive sum.
            rng: Random source used by sample().

        Raises:
            ValueError: If the lengths differ, the input is empty, any
                weight is negative or NaN, or the weights do not sum to a
                positive number.
        """
        if len(items) != len(weights) or not items:
            raise ValueError("items/weights mismatch or empty")
        # "not w >= 0" also rejects NaN, which would otherwise yield a
        # non-monotone cumulative distribution.
        if any(not w >= 0 for w in weights):
            raise ValueError("negative or NaN weight")
        total = sum(weights)
        if total <= 0:
            raise ValueError("non-positive total weight")

        # Copy so later mutation of the caller's list cannot desynchronise
        # the items from the cumulative distribution.
        self.items = list(items)
        self.cdf = []
        cum = 0.0
        for w in weights:
            cum += w / total
            self.cdf.append(cum)
        self.rng = rng

        # Floating-point rounding can leave cdf[-1] slightly below 1.0; draws
        # above it are clamped to the last item that has a positive weight.
        self._last_positive = max(i for i, w in enumerate(weights) if w > 0)

    def sample(self) -> str:
        """Return one randomly chosen item."""
        r = self.rng.random()
        # bisect_right returns the first bucket whose upper bound is strictly
        # greater than r, so zero-width (zero-weight) buckets are never hit.
        index = bisect.bisect_right(self.cdf, r)
        return self.items[min(index, self._last_positive)]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class FrequencyProvider:
    """
    Data-source adapter used by the synthetic engine.

    Subclasses must supply the two frequency lookups; the remaining methods
    ship with empty defaults and can be overridden as needed.
    """

    def get_first_name_freq(self, country: str) -> Dict[str, int]:
        """Return a first-name -> frequency mapping for *country* (must be overridden)."""
        raise NotImplementedError

    def get_last_name_freq(self, country: str) -> Dict[str, int]:
        """Return a last-name -> frequency mapping for *country* (must be overridden)."""
        raise NotImplementedError

    def get_migration_weights(self, context_country: str) -> Dict[str, float]:
        """Return origin-country weights for *context_country*; empty by default."""
        return dict()

    def predict_full_name(self, first: str, last: str, context_country: Optional[str] = None) -> Dict[str, Any]:
        """Return a nationality prediction for a full name; empty by default."""
        prediction: Dict[str, Any] = {"country": None, "top_countries": []}
        return prediction

    def predict_ethnicity(self, name: str, name_type: str = "first", context_country: Optional[str] = None) -> Dict[str, Any]:
        """Return an ethnicity prediction for a single name; empty by default."""
        prediction: Dict[str, Any] = {"top_ethnicities": []}
        return prediction
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class SyntheticDataEngine:
    """
    Privacy-safe synthetic data generator.

    Names are sampled from the provider's per-country frequency tables; each
    record's origin country comes from a weighted mixture of the configured
    base country and optional diaspora origins.
    """

    # Column order shared by the CSV and JSONL exporters.
    _EXPORT_FIELDS = [
        "first_name",
        "last_name",
        "origin_country",
        "context_country",
        "nationality_top1",
        "nationality_topk",
        "ethnicity_topk",
    ]

    def __init__(self, freq_provider: FrequencyProvider):
        """Store the provider that supplies frequencies and predictions."""
        self.freq_provider = freq_provider

    def generate(self, cfg: SyntheticConfig) -> List[SyntheticRecord]:
        """Generate ``cfg.size`` records using a ``random.Random(cfg.seed)`` source.

        Raises:
            ValueError: If *cfg* fails validation.
        """
        self._validate_cfg(cfg)
        rng = random.Random(cfg.seed)

        mixture = self._build_country_mixture(cfg, rng)
        records: List[SyntheticRecord] = []

        # Frequency tables are fetched once per origin country instead of
        # once per record; the provider is typically backed by a database.
        first_freqs: Dict[str, Dict[str, int]] = {}
        last_freqs: Dict[str, Dict[str, int]] = {}

        for _ in range(cfg.size):
            origin_country = mixture.sample()
            if origin_country not in first_freqs:
                first_freqs[origin_country] = self.freq_provider.get_first_name_freq(origin_country)
                last_freqs[origin_country] = self.freq_provider.get_last_name_freq(origin_country)

            first = self._sample_name(
                first_freqs[origin_country],
                rng, cfg.rare_name_boost, cfg.noise_level
            )
            last = self._sample_name(
                last_freqs[origin_country],
                rng, cfg.rare_name_boost, cfg.noise_level
            )

            rec = SyntheticRecord(
                first_name=first,
                last_name=last,
                origin_country=origin_country,
                context_country=cfg.context_country
            )

            if cfg.include_probabilities:
                nat = self.freq_provider.predict_full_name(first, last, context_country=cfg.context_country)
                rec.nationality_top1 = nat.get("country")
                rec.nationality_topk = nat.get("top_countries")

            if cfg.include_ethnicity_profile:
                eth = self.freq_provider.predict_ethnicity(first, name_type="first", context_country=cfg.context_country)
                # Providers may report the profile under either key.
                rec.ethnicity_topk = eth.get("top_ethnicities") or eth.get("ethnic_profile") or None

            records.append(rec)

        return records

    def export(self, records: List[SyntheticRecord], cfg: SyntheticConfig) -> None:
        """Write *records* to ``cfg.output_path`` as "csv" or "jsonl".

        Raises:
            ValueError: If ``cfg.export_format`` is neither "csv" nor "jsonl".
        """
        if cfg.export_format == "csv":
            self._export_csv(records, cfg.output_path)
        elif cfg.export_format == "jsonl":
            self._export_jsonl(records, cfg.output_path)
        else:
            raise ValueError(f"Unsupported export_format: {cfg.export_format}")

    def sanity_report(self, records: List[SyntheticRecord]) -> Dict[str, Any]:
        """Distribution checks for debugging."""
        origins = Counter(r.origin_country for r in records)
        firsts = Counter(r.first_name for r in records)
        lasts = Counter(r.last_name for r in records)

        return {
            "n": len(records),
            "unique_first_names": len(firsts),
            "unique_last_names": len(lasts),
            "top_origin_countries": origins.most_common(10),
            "top_first_names": firsts.most_common(10),
            "top_last_names": lasts.most_common(10),
        }

    def _build_country_mixture(self, cfg: SyntheticConfig, rng: random.Random) -> WeightedSampler:
        """Build the sampler that picks each record's origin country."""
        base_country = cfg.country

        # No context country or no diaspora share: everyone is from the base.
        if not cfg.context_country or cfg.diaspora_ratio <= 0:
            return WeightedSampler([base_country], [1.0], rng)

        # Copy so the provider's mapping is never mutated.
        mig = dict(self.freq_provider.get_migration_weights(cfg.context_country))
        mig.setdefault(base_country, 1.0)

        origins = list(mig)
        total_other = sum(w for c, w in mig.items() if c != base_country)

        weights = []
        for c in origins:
            if c == base_country:
                # The base country keeps whatever the diaspora does not take.
                w = 1.0 - cfg.diaspora_ratio
            elif total_other <= 0:
                w = 0.0
            else:
                w = (mig[c] / total_other) * cfg.diaspora_ratio * cfg.diaspora_strength
            # Floor every weight so no listed origin is entirely impossible.
            weights.append(max(1e-9, w))

        return WeightedSampler(origins, weights, rng)

    def _sample_name(self, freq_map: Dict[str, int], rng: random.Random, rare_name_boost: float, noise_level: float) -> str:
        """Draw one name from *freq_map*, or "unknown" when the map is empty.

        Each frequency (floored at 1) is raised to ``1 / rare_name_boost``
        and, when ``noise_level > 0``, scaled by a uniform factor in
        ``[1 - noise_level, 1 + noise_level]``.
        """
        if not freq_map:
            return "unknown"

        items = list(freq_map)
        # Loop-invariant: depends only on the boost, not on the name.
        exponent = 1.0 / max(1e-9, rare_name_boost)
        weights: List[float] = []

        for name in items:
            w = float(max(1, int(freq_map[name]))) ** exponent
            if noise_level > 0:
                w *= (1.0 + rng.uniform(-noise_level, noise_level))
            weights.append(max(1e-12, w))

        return WeightedSampler(items, weights, rng).sample()

    def _export_csv(self, records: List[SyntheticRecord], path: str) -> None:
        """Write records to *path* as CSV; list-valued fields are JSON-encoded."""
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self._EXPORT_FIELDS)
            writer.writeheader()
            for r in records:
                writer.writerow({
                    "first_name": r.first_name,
                    "last_name": r.last_name,
                    "origin_country": r.origin_country,
                    # Missing values become empty cells.
                    "context_country": r.context_country or "",
                    "nationality_top1": r.nationality_top1 or "",
                    "nationality_topk": json.dumps(r.nationality_topk, ensure_ascii=False) if r.nationality_topk else "",
                    "ethnicity_topk": json.dumps(r.ethnicity_topk, ensure_ascii=False) if r.ethnicity_topk else "",
                })

    def _export_jsonl(self, records: List[SyntheticRecord], path: str) -> None:
        """Write records to *path* as one JSON object per line."""
        with open(path, "w", encoding="utf-8") as f:
            for r in records:
                obj = {field: getattr(r, field) for field in self._EXPORT_FIELDS}
                f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    def _validate_cfg(self, cfg: SyntheticConfig) -> None:
        """Raise ValueError when a generation setting is out of range."""
        if cfg.size <= 0:
            raise ValueError("size must be > 0")
        if not (0.0 <= cfg.diaspora_ratio <= 1.0):
            raise ValueError("diaspora_ratio must be in [0,1]")
        if cfg.noise_level < 0:
            raise ValueError("noise_level must be >= 0")
        if cfg.rare_name_boost <= 0:
            raise ValueError("rare_name_boost must be > 0")
|