@elanlanguages/bridge-anonymization 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +73 -1
  2. package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
  3. package/dist/crypto/pii-map-crypto.js +8 -8
  4. package/dist/crypto/pii-map-crypto.js.map +1 -1
  5. package/dist/index.d.ts +25 -20
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +103 -52
  8. package/dist/index.js.map +1 -1
  9. package/dist/ner/model-manager.d.ts.map +1 -1
  10. package/dist/ner/model-manager.js +10 -8
  11. package/dist/ner/model-manager.js.map +1 -1
  12. package/dist/ner/ner-model.d.ts.map +1 -1
  13. package/dist/ner/ner-model.js +9 -9
  14. package/dist/ner/ner-model.js.map +1 -1
  15. package/dist/ner/onnx-runtime.d.ts +3 -3
  16. package/dist/ner/onnx-runtime.d.ts.map +1 -1
  17. package/dist/ner/onnx-runtime.js +1 -1
  18. package/dist/ner/onnx-runtime.js.map +1 -1
  19. package/dist/ner/tokenizer.js +3 -3
  20. package/dist/ner/tokenizer.js.map +1 -1
  21. package/dist/pipeline/index.d.ts +7 -4
  22. package/dist/pipeline/index.d.ts.map +1 -1
  23. package/dist/pipeline/index.js +7 -4
  24. package/dist/pipeline/index.js.map +1 -1
  25. package/dist/pipeline/resolver.d.ts.map +1 -1
  26. package/dist/pipeline/resolver.js +3 -2
  27. package/dist/pipeline/resolver.js.map +1 -1
  28. package/dist/pipeline/semantic-data-loader.d.ts +157 -0
  29. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  30. package/dist/pipeline/semantic-data-loader.js +662 -0
  31. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  32. package/dist/pipeline/semantic-enricher.d.ts +102 -0
  33. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  34. package/dist/pipeline/semantic-enricher.js +268 -0
  35. package/dist/pipeline/semantic-enricher.js.map +1 -0
  36. package/dist/pipeline/tagger.d.ts +52 -12
  37. package/dist/pipeline/tagger.d.ts.map +1 -1
  38. package/dist/pipeline/tagger.js +226 -21
  39. package/dist/pipeline/tagger.js.map +1 -1
  40. package/dist/pipeline/title-extractor.d.ts +79 -0
  41. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  42. package/dist/pipeline/title-extractor.js +801 -0
  43. package/dist/pipeline/title-extractor.js.map +1 -0
  44. package/dist/types/index.d.ts +66 -3
  45. package/dist/types/index.d.ts.map +1 -1
  46. package/dist/types/index.js +14 -3
  47. package/dist/types/index.js.map +1 -1
  48. package/dist/utils/index.d.ts +3 -3
  49. package/dist/utils/index.js +3 -3
  50. package/package.json +7 -5
@@ -0,0 +1,662 @@
1
+ /**
2
+ * Semantic Data Loader
3
+ * Handles automatic downloading, caching, and parsing of semantic enrichment data.
4
+ *
5
+ * Data sources:
6
+ * - nam_dict.txt: Name-gender mappings from gender-guesser (~40K names)
7
+ * - cities15000.txt: GeoNames cities with population > 15,000 (~25K cities)
8
+ * - countryInfo.txt: Country names and codes (~250 countries)
9
+ * - admin1CodesASCII.txt: First-level admin divisions (~4K regions)
10
+ *
11
+ * Data is cached in the same location as NER models:
12
+ * - macOS: ~/Library/Caches/bridge-anonymization/semantic-data/
13
+ * - Linux: ~/.cache/bridge-anonymization/semantic-data/
14
+ * - Windows: %LOCALAPPDATA%/bridge-anonymization/semantic-data/
15
+ */
16
+ import * as fs from "fs";
17
+ import * as fsPromises from "fs/promises";
18
+ import * as path from "path";
19
+ import * as os from "os";
20
/**
 * Translates the gender codes used in nam_dict.txt into the gender labels
 * used by this module.
 *
 * "M"/"1M"/"?M" all collapse to "male" and "F"/"1F"/"?F" to "female"
 * (the "?"-prefixed forms mean "mostly"); a bare "?" marks a unisex name.
 */
const GENDER_CODE_MAP = Object.fromEntries([
    ...["M", "1M", "?M"].map((code) => [code, "male"]),
    ...["F", "1F", "?F"].map((code) => [code, "female"]),
    ["?", "neutral"],
]);
32
/**
 * Position of each country's frequency value in a nam_dict.txt line
 * (0-indexed, counted from the first frequency column).
 *
 * The order follows the country list in the nam_dict.txt header. That list
 * contains an "East Frisia" column between the Netherlands and Germany, and
 * separate columns for Bosnia and Herzegovina, Kosovo, Montenegro and Albania;
 * skipping them shifts every later index.
 * NOTE(review): order taken from the gender-guesser COUNTRIES list - confirm
 * against the header of the cached nam_dict.txt before relying on it.
 */
const COUNTRY_COLUMNS = {
    gb: 0, // Great Britain
    ie: 1, // Ireland
    us: 2, // USA
    it: 3, // Italy
    mt: 4, // Malta
    pt: 5, // Portugal
    es: 6, // Spain
    fr: 7, // France
    be: 8, // Belgium
    lu: 9, // Luxembourg
    nl: 10, // Netherlands
    // index 11 is East Frisia, which has no ISO country code
    de: 12, // Germany
    at: 13, // Austria
    ch: 14, // Swiss
    is: 15, // Iceland
    dk: 16, // Denmark
    no: 17, // Norway
    se: 18, // Sweden
    fi: 19, // Finland
    ee: 20, // Estonia
    lv: 21, // Latvia
    lt: 22, // Lithuania
    pl: 23, // Poland
    cz: 24, // Czech Republic
    sk: 25, // Slovakia
    hu: 26, // Hungary
    ro: 27, // Romania
    bg: 28, // Bulgaria
    ba: 29, // Bosnia and Herzegovina
    hr: 30, // Croatia
    xk: 31, // Kosovo
    mk: 32, // Macedonia
    me: 33, // Montenegro
    rs: 34, // Serbia
    si: 35, // Slovenia
    al: 36, // Albania
    gr: 37, // Greece
    ru: 38, // Russia
    by: 39, // Belarus
    md: 40, // Moldova
    ua: 41, // Ukraine
    am: 42, // Armenia
    az: 43, // Azerbaijan
    ge: 44, // Georgia
    kz: 45, // Kazakhstan/Uzbekistan
    tr: 46, // Turkey
    sa: 47, // Arabia/Persia
    il: 48, // Israel
    cn: 49, // China
    in: 50, // India/Sri Lanka
    jp: 51, // Japan
    kr: 52, // Korea
    vn: 53, // Vietnam
};
// Keep COUNTRY_COLUMNS for future locale-specific gender lookups
void COUNTRY_COLUMNS;
// Global data store (lazily loaded)
let semanticData = null;
91
+ // =============================================================================
92
+ // Cache Directory Management
93
+ // =============================================================================
94
/**
 * Gets the cache directory for semantic data.
 *
 * Resolves to "<base>/bridge-anonymization/semantic-data", where <base> is:
 * - macOS:   ~/Library/Caches
 * - Windows: %LOCALAPPDATA%, or ~/AppData/Local when it is unset or empty
 * - others:  $XDG_CACHE_HOME, or ~/.cache when it is unset, empty or relative
 *
 * @returns {string} Path of the semantic data cache directory
 */
export function getSemanticDataCacheDir() {
    const homeDir = os.homedir();
    // An empty override must fall back to the default: joining "" would
    // silently yield a path relative to the current working directory.
    const readEnvDir = (variable) => {
        const value = process.env[variable];
        return value === undefined || value === "" ? undefined : value;
    };
    let baseDir;
    switch (process.platform) {
        case "darwin":
            baseDir = path.join(homeDir, "Library", "Caches");
            break;
        case "win32":
            baseDir = readEnvDir("LOCALAPPDATA") ?? path.join(homeDir, "AppData", "Local");
            break;
        default: {
            // Linux and others. The XDG spec requires absolute paths and says
            // invalid (relative) values are to be ignored.
            const xdgCacheHome = readEnvDir("XDG_CACHE_HOME");
            baseDir =
                xdgCacheHome !== undefined && path.isAbsolute(xdgCacheHome)
                    ? xdgCacheHome
                    : path.join(homeDir, ".cache");
        }
    }
    return path.join(baseDir, "bridge-anonymization", "semantic-data");
}
110
/**
 * Backwards-compatible alias: returns whatever getSemanticDataCacheDir() returns.
 *
 * @returns {string} Path of the semantic data cache directory
 */
export function getDataDirectory() {
    const cacheDir = getSemanticDataCacheDir();
    return cacheDir;
}
116
/**
 * Registry of the semantic data files and where each one is downloaded from.
 *
 * Each entry has: filename (name inside the cache directory), url (download
 * source; a ".zip" URL is extracted after download), required (whether a
 * failed download is fatal), description and size (human-readable labels).
 */
export const SEMANTIC_DATA_FILES = [
    [
        "nam_dict.txt",
        "https://raw.githubusercontent.com/lead-ratings/gender-guesser/master/gender_guesser/data/nam_dict.txt",
        true,
        "Name-gender mappings (~40K names)",
        "~1.5 MB",
    ],
    [
        "cities15000.txt",
        "https://download.geonames.org/export/dump/cities15000.zip",
        true,
        "GeoNames cities with population > 15,000",
        "~2 MB (compressed)",
    ],
    [
        "countryInfo.txt",
        "https://download.geonames.org/export/dump/countryInfo.txt",
        true,
        "Country names and codes",
        "~25 KB",
    ],
    [
        "admin1CodesASCII.txt",
        "https://download.geonames.org/export/dump/admin1CodesASCII.txt",
        false,
        "First-level admin divisions (states/regions)",
        "~350 KB",
    ],
].map(([filename, url, required, description, size]) => ({
    filename,
    url,
    required,
    description,
    size,
}));
149
/**
 * Reports whether every required semantic data file exists in the cache directory.
 *
 * @returns {Promise<boolean>} true when all required files are accessible,
 *   false as soon as one of them is not
 */
export async function isSemanticDataDownloaded() {
    const cacheDir = getSemanticDataCacheDir();
    try {
        const requiredFiles = SEMANTIC_DATA_FILES.filter(({ required }) => required);
        // Checked one after another; the first inaccessible file rejects and
        // drops into the catch below.
        for (const { filename } of requiredFiles) {
            await fsPromises.access(path.join(cacheDir, filename));
        }
    }
    catch {
        return false;
    }
    return true;
}
167
/**
 * Synchronous check that every required semantic data file in the cache
 * directory is readable.
 *
 * @returns {boolean} true when all required files pass a read-access check
 */
export function isSemanticDataAvailable() {
    const cacheDir = getSemanticDataCacheDir();
    for (const { required, filename } of SEMANTIC_DATA_FILES) {
        if (!required) {
            continue;
        }
        try {
            fs.accessSync(path.join(cacheDir, filename), fs.constants.R_OK);
        }
        catch {
            return false;
        }
    }
    return true;
}
183
/**
 * Downloads a URL into a local file, reporting progress per received chunk.
 *
 * The payload is buffered in memory, written to "<destPath>.part" and then
 * renamed onto destPath, so an interrupted write never leaves a truncated
 * file under the final name.
 *
 * @param {string} url - Source URL
 * @param {string} destPath - Destination file path (parent directories are created)
 * @param {(progress: {file: string, bytesDownloaded: number, totalBytes: number|null, percent: number|null}) => void} [onProgress]
 *   Called after every chunk; totalBytes/percent are null when the size is unknown
 * @returns {Promise<void>}
 * @throws {Error} On a non-2xx response, an unreadable body, or a short download
 */
async function downloadFile(url, destPath, onProgress) {
    const response = await fetch(url, {
        headers: {
            "User-Agent": "bridge-anonymization/1.0.0",
        },
    });
    if (!response.ok) {
        if (response.status === 404) {
            throw new Error(`File not found: ${url}`);
        }
        throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
    }
    // content-length counts the bytes on the wire. When the response is
    // content-encoded the chunks read below are already decoded, so the two
    // numbers are not comparable and the total is reported as unknown.
    const contentEncoding = (response.headers.get("content-encoding") ?? "").trim().toLowerCase();
    const isEncoded = contentEncoding !== "" && contentEncoding !== "identity";
    const declaredLength = Number.parseInt(response.headers.get("content-length") ?? "", 10);
    const total = !isEncoded && Number.isFinite(declaredLength) && declaredLength >= 0
        ? declaredLength
        : null;
    // Ensure directory exists
    await fsPromises.mkdir(path.dirname(destPath), { recursive: true });
    const fileName = path.basename(destPath);
    const reader = response.body?.getReader();
    if (reader === undefined) {
        throw new Error("Response body is not readable");
    }
    const chunks = [];
    let bytesDownloaded = 0;
    for (;;) {
        const result = await reader.read();
        if (result.done) {
            break;
        }
        const value = result.value;
        chunks.push(value);
        bytesDownloaded += value.length;
        if (onProgress) {
            onProgress({
                file: fileName,
                bytesDownloaded,
                totalBytes: total,
                percent: total !== null && total > 0
                    ? Math.min(100, Math.round((bytesDownloaded / total) * 100))
                    : null,
            });
        }
    }
    if (total !== null && bytesDownloaded !== total) {
        throw new Error(`Incomplete download of ${url}: received ${bytesDownloaded} of ${total} bytes`);
    }
    // Write to a sibling temp file first, then move it into place.
    const tempPath = `${destPath}.part`;
    try {
        await fsPromises.writeFile(tempPath, Buffer.concat(chunks));
        await fsPromises.rename(tempPath, destPath);
    }
    catch (err) {
        await fsPromises.rm(tempPath, { force: true });
        throw err;
    }
}
232
/**
 * Extracts a zip archive into a directory and deletes the archive afterwards.
 *
 * Tries the `unzip` command first and falls back to `tar -xf`. Both are run
 * with an argument vector (no shell), so quotes, spaces or `$` in the paths
 * cannot alter the command. The archive is only removed after a successful
 * extraction.
 *
 * @param {string} zipPath - Path of the archive to extract
 * @param {string} destDir - Directory to extract into
 * @returns {Promise<void>}
 * @throws {Error} When neither command manages to extract the archive
 */
async function extractZip(zipPath, destDir) {
    const { execFile } = await import("child_process");
    const { promisify } = await import("util");
    const execFileAsync = promisify(execFile);
    try {
        // Try using unzip command (available on most systems)
        await execFileAsync("unzip", ["-o", zipPath, "-d", destDir]);
    }
    catch {
        // If unzip is not available, try using tar (some systems have it)
        try {
            await execFileAsync("tar", ["-xf", zipPath, "-C", destDir]);
        }
        catch (tarError) {
            throw new Error("Could not extract zip file. Please install 'unzip' command or extract manually.", { cause: tarError });
        }
    }
    // Clean up zip file
    await fsPromises.unlink(zipPath);
}
258
/**
 * Downloads every file listed in SEMANTIC_DATA_FILES into the cache directory.
 *
 * Files whose URL ends in ".zip" are downloaded as "<filename>.zip" and
 * extracted; the expected file must exist afterwards. A failure on a required
 * file aborts with an error that carries the original failure as `cause`;
 * a failure on an optional file is reported through onStatus and skipped.
 *
 * @param {Function} [onProgress] - Forwarded to downloadFile for byte-level progress
 * @param {(message: string) => void} [onStatus] - Receives human-readable status lines
 * @returns {Promise<string>} The cache directory the files were written to
 * @throws {Error} When a required file cannot be downloaded or extracted
 */
export async function downloadSemanticData(onProgress, onStatus) {
    const dataDir = getSemanticDataCacheDir();
    // Create directory
    await fsPromises.mkdir(dataDir, { recursive: true });
    onStatus?.("Downloading semantic enrichment data...");
    onStatus?.(`Cache directory: ${dataDir}`);
    for (const file of SEMANTIC_DATA_FILES) {
        const destPath = path.join(dataDir, file.filename);
        const isZip = file.url.endsWith(".zip");
        onStatus?.(`Downloading ${file.description}...`);
        try {
            if (isZip) {
                // Download zip and extract
                const zipPath = `${destPath}.zip`;
                await downloadFile(file.url, zipPath, onProgress);
                onStatus?.(`Extracting ${file.filename}...`);
                await extractZip(zipPath, dataDir);
                // The archive decides what lands on disk; make sure the file
                // the rest of the module expects is actually there.
                try {
                    await fsPromises.access(destPath);
                }
                catch (missing) {
                    throw new Error(`Archive ${file.url} did not contain ${file.filename}`, { cause: missing });
                }
            }
            else {
                await downloadFile(file.url, destPath, onProgress);
            }
        }
        catch (e) {
            if (file.required) {
                throw new Error(`Failed to download required file ${file.filename}: ${String(e)}`, { cause: e });
            }
            // Optional files can fail silently
            onStatus?.(`Skipping optional file ${file.filename}`);
        }
    }
    onStatus?.("Semantic data download complete!");
    return dataDir;
}
294
/**
 * Makes sure the semantic data files are present, downloading them when allowed.
 *
 * @param {object} [options]
 * @param {boolean} [options.autoDownload=true] - Download missing data instead of throwing
 * @param {Function} [options.onProgress] - Forwarded to downloadSemanticData
 * @param {(message: string) => void} [options.onStatus] - Receives status messages
 * @returns {Promise<string>} The semantic data cache directory
 * @throws {Error} When the data is missing and autoDownload is false
 */
export async function ensureSemanticData(options = {}) {
    const { autoDownload = true, onProgress, onStatus } = options;
    const dataDir = getSemanticDataCacheDir();
    // Fast path: everything is already cached.
    if (await isSemanticDataDownloaded()) {
        onStatus?.(`Using cached semantic data: ${dataDir}`);
        return dataDir;
    }
    if (!autoDownload) {
        const messageLines = [
            `Semantic data not found at ${dataDir}.`,
            ``,
            `To download automatically, use:`,
            ` createAnonymizer({ semantic: { enabled: true, autoDownload: true } })`,
            ``,
            `Or disable semantic masking:`,
            ` createAnonymizer({ semantic: { enabled: false } })`,
        ];
        throw new Error(messageLines.join("\n"));
    }
    await downloadSemanticData(onProgress, onStatus);
    return dataDir;
}
317
/**
 * Deletes the semantic data cache directory from disk, then drops the
 * in-memory copy as well.
 *
 * @returns {Promise<void>}
 */
export async function clearSemanticDataCache() {
    const removalOptions = { recursive: true, force: true };
    await fsPromises.rm(getSemanticDataCacheDir(), removalOptions);
    // Only reached when the removal succeeded
    clearSemanticData();
}
326
/**
 * Describes the semantic data set: the file registry, the cache directory
 * and an approximate total download size.
 *
 * @returns {{files: Array<object>, cacheDir: string, totalSize: string}}
 */
export function getSemanticDataInfo() {
    const files = SEMANTIC_DATA_FILES;
    const cacheDir = getSemanticDataCacheDir();
    return { files, cacheDir, totalSize: "~4 MB" };
}
336
+ // =============================================================================
337
+ // Data Parsing Functions
338
+ // =============================================================================
339
/**
 * Parses nam_dict.txt and extracts name-gender mappings.
 *
 * Each data line starts with a gender code followed by a name; the code is
 * translated through GENDER_CODE_MAP. Comment lines ("#"), blank lines, lines
 * with "+" in column 30 and lines whose code is not recognised are skipped.
 *
 * A "+" inside a name is the file's placeholder for "-", " " or no separator
 * (NOTE(review): per the nam_dict.txt format notes - confirm), so such names
 * are additionally indexed under all three spellings. A name listed with
 * conflicting genders is stored as "neutral" rather than keeping whichever
 * line happened to come first.
 *
 * @param {string} filePath - Path to nam_dict.txt (read as ISO-8859-1)
 * @returns {Map<string, {gender: string, localeOverrides: (object|undefined)}>}
 *   Entries keyed by lower-cased name
 */
function parseNameDict(filePath) {
    const names = new Map();
    const content = fs.readFileSync(filePath, "latin1"); // File uses ISO-8859-1 encoding
    // Lower-cased keys a name should be reachable under.
    const expandNameKeys = (name) => {
        if (!name.includes("+")) {
            return [name];
        }
        const parts = name.split("+");
        // Keep the literal form too so existing lookups keep working.
        return [...new Set([name, parts.join(" "), parts.join("-"), parts.join("")])];
    };
    // Accept both LF and CRLF line endings.
    for (const line of content.split(/\r?\n/)) {
        // Skip comments and empty lines
        if (line.startsWith("#") || line.trim() === "") {
            continue;
        }
        // Skip lines with '+' in column 30 (expanded umlauts duplicates)
        if (line.length > 29 && line[29] === "+") {
            continue;
        }
        // Format: <gender_code> <name> <spaces> <frequencies>
        const match = line.match(/^(\?[MF]?|1[MF]|[MF])\s+(\S+)/);
        if (!match) {
            continue;
        }
        const [, genderCode, name] = match;
        const gender = GENDER_CODE_MAP[genderCode];
        if (gender === undefined) {
            continue;
        }
        for (const key of expandNameKeys(name.toLowerCase())) {
            const existing = names.get(key);
            if (existing === undefined) {
                // No frequency columns are parsed, so there are no locale overrides yet.
                names.set(key, { gender, localeOverrides: undefined });
            }
            else if (existing.gender !== gender) {
                // Same name listed with a different gender: without the
                // per-country frequencies neither can be preferred.
                existing.gender = "neutral";
            }
        }
    }
    return names;
}
395
/**
 * Parses cities15000.txt (tab-separated) into a lookup of city names.
 *
 * Every city is indexed under its name, its ASCII name (when different) and
 * each alternate name longer than two characters. When several rows claim the
 * same key, the row with the larger population wins.
 *
 * @param {string} filePath - Path to cities15000.txt (read as UTF-8)
 * @returns {Map<string, {country: string, population: number}>}
 *   Entries keyed by lower-cased name
 */
function parseCities(filePath) {
    const cities = new Map();
    // Stores the entry unless a known entry for the key is at least as populous.
    const remember = (label, entry) => {
        const key = label.toLowerCase();
        const known = cities.get(key);
        if (known && !(entry.population > known.population)) {
            return;
        }
        cities.set(key, entry);
    };
    const rows = fs.readFileSync(filePath, "utf-8").split("\n");
    for (const row of rows) {
        if (row.trim() === "") {
            continue;
        }
        const columns = row.split("\t");
        if (columns.length < 15) {
            continue;
        }
        // Columns used: 1 = name, 2 = ASCII name, 3 = alternate names,
        // 8 = country code, 14 = population.
        const name = columns[1];
        const countryCode = columns[8];
        if (name === undefined || name === "" || countryCode === undefined || countryCode === "") {
            continue;
        }
        const asciiName = columns[2];
        const alternateNames = columns[3]?.split(",") ?? [];
        const entry = {
            country: countryCode,
            population: parseInt(columns[14] ?? "0", 10),
        };
        remember(name, entry);
        const hasDistinctAscii = asciiName !== undefined &&
            asciiName !== "" &&
            asciiName.toLowerCase() !== name.toLowerCase();
        if (hasDistinctAscii) {
            remember(asciiName, entry);
        }
        alternateNames
            .map((alternate) => alternate.trim())
            .filter((alternate) => alternate.length > 2 && !alternate.includes(","))
            .forEach((alternate) => remember(alternate, entry));
    }
    return cities;
}
446
/**
 * Parses countryInfo.txt (tab-separated) into a name -> country code lookup.
 *
 * Column 0 is used as the code and column 4 as the country name. Each country
 * is indexed under its name and under every variation returned by
 * getCountryVariations(), all lower-cased; later rows overwrite earlier ones
 * on key collisions.
 *
 * @param {string} filePath - Path to countryInfo.txt (read as UTF-8)
 * @returns {Map<string, string>} Country codes keyed by lower-cased name or variation
 */
function parseCountries(filePath) {
    const countries = new Map();
    const dataRows = fs
        .readFileSync(filePath, "utf-8")
        .split("\n")
        // Drop comment and blank lines up front
        .filter((row) => !row.startsWith("#") && row.trim() !== "");
    for (const row of dataRows) {
        const columns = row.split("\t");
        if (columns.length < 5) {
            continue;
        }
        const code = columns[0];
        const name = columns[4];
        const isUsable = code !== undefined && code !== "" && name !== undefined && name !== "";
        if (!isUsable) {
            continue;
        }
        // The official name goes first, then e.g. "USA", "US", "America"
        const labels = [name, ...getCountryVariations(name, code)];
        for (const label of labels) {
            countries.set(label.toLowerCase(), code);
        }
    }
    return countries;
}
475
/**
 * Common abbreviations and foreign-language spellings, keyed by the English
 * country name. A Map is used so that a name such as "constructor" cannot
 * resolve to an inherited Object.prototype member.
 * NOTE(review): "Czechia" and "The Netherlands" are included because the
 * country file may use those spellings instead of "Czech Republic" /
 * "Netherlands" - confirm against the cached countryInfo.txt.
 */
const COUNTRY_NAME_VARIANTS = new Map([
    ["United States", ["USA", "US", "America", "United States of America"]],
    ["United Kingdom", ["UK", "Britain", "Great Britain", "England"]],
    ["Germany", ["Deutschland"]],
    ["France", ["Frankreich"]],
    ["Spain", ["España", "Espana", "Spanien"]],
    ["Italy", ["Italia", "Italien"]],
    ["Netherlands", ["Holland", "The Netherlands", "Niederlande"]],
    ["The Netherlands", ["Holland", "Netherlands", "Niederlande"]],
    ["Switzerland", ["Schweiz", "Suisse", "Svizzera"]],
    ["Austria", ["Österreich", "Oesterreich"]],
    ["Belgium", ["Belgien", "Belgique"]],
    ["Russia", ["Russland", "Russian Federation"]],
    ["China", ["People's Republic of China", "PRC"]],
    ["Japan", ["Nippon"]],
    ["South Korea", ["Korea", "Republic of Korea"]],
    ["United Arab Emirates", ["UAE", "Emirates"]],
    ["Czech Republic", ["Czechia", "Tschechien"]],
    ["Czechia", ["Czech Republic", "Tschechien"]],
]);
/**
 * Generates common country name variations.
 *
 * @param {string} name - Country name, matched exactly against the variant table
 * @param {string} code - Country code; its lower-cased form is always the first variation
 * @returns {string[]} A new array: the lower-cased code followed by any known variants
 */
function getCountryVariations(name, code) {
    return [code.toLowerCase(), ...(COUNTRY_NAME_VARIANTS.get(name) ?? [])];
}
504
/**
 * Parses admin1CodesASCII.txt (tab-separated) into a lookup of region names.
 *
 * Column 0 holds a code such as "US.CA" whose part before the first "." is
 * taken as the country; column 1 is the region name and column 2 its ASCII
 * form. The file is optional: any error while reading it yields whatever has
 * been collected so far (an empty map when the file is missing).
 *
 * @param {string} filePath - Path to admin1CodesASCII.txt (read as UTF-8)
 * @returns {Map<string, {country: string, name: string}>}
 *   Entries keyed by lower-cased region name / ASCII name
 */
function parseRegions(filePath) {
    const regions = new Map();
    const isBlank = (value) => value === undefined || value === "";
    try {
        const rows = fs.readFileSync(filePath, "utf-8").split("\n");
        for (const row of rows) {
            if (row.trim() === "") {
                continue;
            }
            const columns = row.split("\t");
            if (columns.length < 2) {
                continue;
            }
            const [code, name, asciiName] = columns; // code format: "US.CA"
            if (isBlank(code) || isBlank(name)) {
                continue;
            }
            const countryCode = code.split(".")[0];
            if (isBlank(countryCode)) {
                continue;
            }
            const entry = { country: countryCode, name };
            const nameKey = name.toLowerCase();
            regions.set(nameKey, entry);
            // Index the ASCII spelling too when it is a different key
            if (!isBlank(asciiName) && asciiName.toLowerCase() !== nameKey) {
                regions.set(asciiName.toLowerCase(), entry);
            }
        }
    }
    catch {
        // admin1 file is optional
    }
    return regions;
}
545
+ // =============================================================================
546
+ // Data Loading and Lookup
547
+ // =============================================================================
548
/**
 * One-step setup: makes sure the data files exist (via ensureSemanticData,
 * which receives the options unchanged) and then loads them into memory.
 *
 * @param {object} [options] - Passed straight to ensureSemanticData
 * @returns {Promise<void>}
 */
export async function initializeSemanticData(options = {}) {
    // Step 1: files on disk
    await ensureSemanticData(options);
    // Step 2: parse them into the in-memory store
    loadSemanticData();
}
557
/**
 * Parses the cached data files into the module-level store and returns it.
 * When the store is already loaded it is returned as-is without touching disk.
 *
 * @returns {{names: Map, cities: Map, countries: Map, regions: Map, loaded: boolean}}
 * @throws {Error} If the required data files are not available in the cache directory
 */
export function loadSemanticData() {
    if (semanticData?.loaded === true) {
        return semanticData;
    }
    const dataDir = getSemanticDataCacheDir();
    if (!isSemanticDataAvailable()) {
        throw new Error(`Semantic data files not found in ${dataDir}. ` +
            `Use ensureSemanticData() or createAnonymizer({ semantic: { enabled: true, autoDownload: true } }) to download.`);
    }
    const inDataDir = (filename) => path.join(dataDir, filename);
    // The store is only replaced once every parser has finished.
    semanticData = {
        names: parseNameDict(inDataDir("nam_dict.txt")),
        cities: parseCities(inDataDir("cities15000.txt")),
        countries: parseCountries(inDataDir("countryInfo.txt")),
        regions: parseRegions(inDataDir("admin1CodesASCII.txt")),
        loaded: true,
    };
    return semanticData;
}
583
/**
 * Returns the in-memory semantic data, loading it from the cache on first use.
 *
 * @returns {object} The loaded semantic data store
 */
export function getSemanticData() {
    const isReady = semanticData !== null && semanticData.loaded === true;
    return isReady ? semanticData : loadSemanticData();
}
592
/**
 * Drops the in-memory semantic data by resetting the module-level store to
 * null, so the next access parses the cached files again (useful for testing).
 *
 * @returns {void}
 */
export function clearSemanticData() {
    semanticData = null;
}
598
/**
 * Looks up the gender recorded for a name.
 *
 * The name is trimmed and lower-cased before the lookup (the same
 * normalisation lookupLocationType applies to locations). When a non-empty
 * locale is given and the entry carries its own override for exactly that
 * locale key, the override wins over the entry's default gender.
 *
 * @param {string} name - Name to look up
 * @param {string} [locale] - Locale key to check in the entry's localeOverrides
 * @returns {string|undefined} The gender label, or undefined for an unknown name
 */
export function lookupGender(name, locale) {
    const data = getSemanticData();
    const entry = data.names.get(name.trim().toLowerCase());
    if (entry === undefined) {
        return undefined;
    }
    const overrides = entry.localeOverrides;
    // Own-property check: a locale such as "constructor" must not pick up an
    // inherited Object.prototype member as if it were a gender.
    if (locale !== undefined &&
        locale !== "" &&
        overrides !== undefined &&
        Object.hasOwn(overrides, locale) &&
        overrides[locale] !== undefined) {
        return overrides[locale];
    }
    return entry.gender;
}
615
/**
 * Minimum population for a city to count as "major" and outrank a region
 * of the same name.
 */
const MAJOR_CITY_POPULATION = 500000;
/**
 * Classifies a location string as a country, city or region.
 *
 * The input is lower-cased and trimmed, then resolved in this order:
 * country, major city (population >= MAJOR_CITY_POPULATION), region,
 * any other city.
 *
 * @param {string} location - Location text to classify
 * @returns {{type: string, countryCode: string}|undefined}
 *   The match, or undefined when the location is unknown
 */
export function lookupLocationType(location) {
    const { countries, cities, regions } = getSemanticData();
    const key = location.toLowerCase().trim();
    // Countries win outright (keeps e.g. "USA" from resolving as a city)
    const countryCode = countries.get(key);
    if (countryCode !== undefined) {
        return { type: "country", countryCode };
    }
    const city = cities.get(key);
    const isMajorCity = Boolean(city) && city.population >= MAJOR_CITY_POPULATION;
    if (!isMajorCity) {
        // Only a non-major (or unknown) city can be outranked by a region
        const region = regions.get(key);
        if (region) {
            return { type: "region", countryCode: region.country };
        }
    }
    return city ? { type: "city", countryCode: city.country } : undefined;
}
647
/**
 * Reports how many entries each in-memory lookup table holds.
 * Never triggers a load: when nothing is loaded, all counts are zero and
 * `loaded` is false.
 *
 * @returns {{names: number, cities: number, countries: number, regions: number, loaded: boolean}}
 */
export function getDataStats() {
    const isLoaded = semanticData !== null && semanticData.loaded === true;
    if (!isLoaded) {
        return { names: 0, cities: 0, countries: 0, regions: 0, loaded: false };
    }
    const { names, cities, countries, regions } = semanticData;
    return {
        names: names.size,
        cities: cities.size,
        countries: countries.size,
        regions: regions.size,
        loaded: true,
    };
}
662
+ //# sourceMappingURL=semantic-data-loader.js.map