@elanlanguages/bridge-anonymization 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -1
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
- package/dist/crypto/pii-map-crypto.js +8 -8
- package/dist/crypto/pii-map-crypto.js.map +1 -1
- package/dist/index.d.ts +25 -20
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -52
- package/dist/index.js.map +1 -1
- package/dist/ner/model-manager.d.ts.map +1 -1
- package/dist/ner/model-manager.js +10 -8
- package/dist/ner/model-manager.js.map +1 -1
- package/dist/ner/ner-model.d.ts.map +1 -1
- package/dist/ner/ner-model.js +9 -9
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/onnx-runtime.d.ts +3 -3
- package/dist/ner/onnx-runtime.d.ts.map +1 -1
- package/dist/ner/onnx-runtime.js +1 -1
- package/dist/ner/onnx-runtime.js.map +1 -1
- package/dist/ner/tokenizer.js +3 -3
- package/dist/ner/tokenizer.js.map +1 -1
- package/dist/pipeline/index.d.ts +7 -4
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +7 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/resolver.d.ts.map +1 -1
- package/dist/pipeline/resolver.js +3 -2
- package/dist/pipeline/resolver.js.map +1 -1
- package/dist/pipeline/semantic-data-loader.d.ts +157 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +662 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +102 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +268 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +52 -12
- package/dist/pipeline/tagger.d.ts.map +1 -1
- package/dist/pipeline/tagger.js +226 -21
- package/dist/pipeline/tagger.js.map +1 -1
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/types/index.d.ts +66 -3
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +14 -3
- package/dist/types/index.js.map +1 -1
- package/dist/utils/index.d.ts +3 -3
- package/dist/utils/index.js +3 -3
- package/package.json +7 -5
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Data Loader
|
|
3
|
+
* Handles automatic downloading, caching, and parsing of semantic enrichment data.
|
|
4
|
+
*
|
|
5
|
+
* Data sources:
|
|
6
|
+
* - nam_dict.txt: Name-gender mappings from gender-guesser (~40K names)
|
|
7
|
+
* - cities15000.txt: GeoNames cities with population > 15,000 (~25K cities)
|
|
8
|
+
* - countryInfo.txt: Country names and codes (~250 countries)
|
|
9
|
+
* - admin1CodesASCII.txt: First-level admin divisions (~4K regions)
|
|
10
|
+
*
|
|
11
|
+
* Data is cached in the same location as NER models:
|
|
12
|
+
* - macOS: ~/Library/Caches/bridge-anonymization/semantic-data/
|
|
13
|
+
* - Linux: ~/.cache/bridge-anonymization/semantic-data/
|
|
14
|
+
* - Windows: %LOCALAPPDATA%/bridge-anonymization/semantic-data/
|
|
15
|
+
*/
|
|
16
|
+
import * as fs from "fs";
|
|
17
|
+
import * as fsPromises from "fs/promises";
|
|
18
|
+
import * as path from "path";
|
|
19
|
+
import * as os from "os";
|
|
20
|
+
/**
 * Translates the gender codes used in nam_dict.txt into the gender labels
 * stored by this module. The "mostly male" / "mostly female" codes are folded
 * into the plain labels; the bare "?" code marks a unisex name.
 */
const GENDER_CODE_MAP = Object.fromEntries([
    ["M", "male"],
    ["1M", "male"],
    ["?M", "male"], // mostly male
    ["F", "female"],
    ["1F", "female"],
    ["?F", "female"], // mostly female
    ["?", "neutral"], // unisex
]);
|
|
32
|
+
/**
 * Offsets of the per-country frequency columns in nam_dict.txt, counted from
 * the first frequency column (string index 30 of a data line).
 *
 * The offsets follow the country list in the nam_dict.txt header, which has a
 * column for East Frisia between the Netherlands and Germany and separate
 * columns for Bosnia and Herzegovina, Kosovo, Montenegro and Albania. Earlier
 * revisions of this table skipped those columns, which shifted every offset
 * from Germany onwards.
 *
 * Offset 11 (East Frisia) and offset 54 ("other countries") have no ISO
 * country code and are therefore not listed.
 */
const COUNTRY_COLUMNS = {
    gb: 0, // Great Britain
    ie: 1, // Ireland
    us: 2, // USA
    it: 3, // Italy
    mt: 4, // Malta
    pt: 5, // Portugal
    es: 6, // Spain
    fr: 7, // France
    be: 8, // Belgium
    lu: 9, // Luxembourg
    nl: 10, // Netherlands
    // 11: East Frisia (no ISO code)
    de: 12, // Germany
    at: 13, // Austria
    ch: 14, // Swiss
    is: 15, // Iceland
    dk: 16, // Denmark
    no: 17, // Norway
    se: 18, // Sweden
    fi: 19, // Finland
    ee: 20, // Estonia
    lv: 21, // Latvia
    lt: 22, // Lithuania
    pl: 23, // Poland
    cz: 24, // Czech Republic
    sk: 25, // Slovakia
    hu: 26, // Hungary
    ro: 27, // Romania
    bg: 28, // Bulgaria
    ba: 29, // Bosnia and Herzegovina
    hr: 30, // Croatia
    xk: 31, // Kosovo
    mk: 32, // Macedonia
    me: 33, // Montenegro
    rs: 34, // Serbia
    si: 35, // Slovenia
    al: 36, // Albania
    gr: 37, // Greece
    ru: 38, // Russia
    by: 39, // Belarus
    md: 40, // Moldova
    ua: 41, // Ukraine
    am: 42, // Armenia
    az: 43, // Azerbaijan
    ge: 44, // Georgia
    kz: 45, // Kazakhstan/Uzbekistan
    tr: 46, // Turkey
    sa: 47, // Arabia/Persia
    il: 48, // Israel
    cn: 49, // China
    in: 50, // India/Sri Lanka
    jp: 51, // Japan
    kr: 52, // Korea
    vn: 53, // Vietnam
};
// Keep COUNTRY_COLUMNS for future locale-specific gender lookups
void COUNTRY_COLUMNS;
|
|
89
|
+
// Module-level cache of the parsed data sets; stays null until loadSemanticData() populates it
|
|
90
|
+
let semanticData = null;
|
|
91
|
+
// =============================================================================
|
|
92
|
+
// Cache Directory Management
|
|
93
|
+
// =============================================================================
|
|
94
|
+
/**
 * Returns the directory in which the semantic data files are cached.
 *
 * Location by platform:
 * - macOS:   ~/Library/Caches/bridge-anonymization/semantic-data
 * - Windows: %LOCALAPPDATA%/bridge-anonymization/semantic-data
 *            (falls back to ~/AppData/Local)
 * - others:  $XDG_CACHE_HOME/bridge-anonymization/semantic-data
 *            (falls back to ~/.cache)
 *
 * An environment override is only honoured when it holds an absolute path.
 * An unset, empty or relative value falls back to the default, as the XDG
 * Base Directory Specification requires; otherwise the cache location would
 * silently depend on the current working directory.
 *
 * @returns {string} Path of the cache directory (the directory is not created here).
 */
export function getSemanticDataCacheDir() {
    const homeDir = os.homedir();
    // Picks the environment override when it is usable, the default otherwise.
    const baseFromEnv = (variable, fallback) => {
        const value = process.env[variable];
        return value !== undefined && path.isAbsolute(value) ? value : fallback;
    };
    switch (process.platform) {
        case "darwin":
            return path.join(homeDir, "Library", "Caches", "bridge-anonymization", "semantic-data");
        case "win32":
            return path.join(baseFromEnv("LOCALAPPDATA", path.join(homeDir, "AppData", "Local")), "bridge-anonymization", "semantic-data");
        default:
            // Linux and others - use XDG_CACHE_HOME or ~/.cache
            return path.join(baseFromEnv("XDG_CACHE_HOME", path.join(homeDir, ".cache")), "bridge-anonymization", "semantic-data");
    }
}
|
|
110
|
+
/**
 * Backwards-compatible alias: returns exactly what getSemanticDataCacheDir()
 * returns.
 */
export function getDataDirectory() {
    const cacheDir = getSemanticDataCacheDir();
    return cacheDir;
}
|
|
116
|
+
/**
 * Download registry for the semantic data set.
 *
 * Each entry names the file as it is stored in the cache directory
 * (`filename`), where it is fetched from (`url`), whether the data set is
 * unusable without it (`required`), and two human-readable labels
 * (`description`, `size`). An entry whose `url` ends in ".zip" is downloaded
 * as an archive and extracted into the cache directory.
 */
export const SEMANTIC_DATA_FILES = [
    // Name -> gender dictionary
    {
        filename: "nam_dict.txt",
        url: "https://raw.githubusercontent.com/lead-ratings/gender-guesser/master/gender_guesser/data/nam_dict.txt",
        required: true,
        description: "Name-gender mappings (~40K names)",
        size: "~1.5 MB",
    },
    // City list; shipped as a zip archive
    {
        filename: "cities15000.txt",
        url: "https://download.geonames.org/export/dump/cities15000.zip",
        required: true,
        description: "GeoNames cities with population > 15,000",
        size: "~2 MB (compressed)",
    },
    // Country names and ISO codes
    {
        filename: "countryInfo.txt",
        url: "https://download.geonames.org/export/dump/countryInfo.txt",
        required: true,
        description: "Country names and codes",
        size: "~25 KB",
    },
    // States / regions; the only optional file
    {
        filename: "admin1CodesASCII.txt",
        url: "https://download.geonames.org/export/dump/admin1CodesASCII.txt",
        required: false,
        description: "First-level admin divisions (states/regions)",
        size: "~350 KB",
    },
];
|
|
149
|
+
/**
 * Reports whether every required semantic data file exists in the cache
 * directory (asynchronous variant).
 *
 * @returns {Promise<boolean>} true when all required files are accessible,
 *   false as soon as any of them is not.
 */
export async function isSemanticDataDownloaded() {
    const cacheDir = getSemanticDataCacheDir();
    try {
        const probes = SEMANTIC_DATA_FILES
            .filter((entry) => entry.required)
            .map((entry) => fsPromises.access(path.join(cacheDir, entry.filename)));
        // Any rejected probe means a required file is missing.
        await Promise.all(probes);
        return true;
    }
    catch {
        return false;
    }
}
|
|
167
|
+
/**
 * Synchronous check that every required semantic data file in the cache
 * directory is readable.
 *
 * @returns {boolean} false at the first required file that cannot be read,
 *   true otherwise.
 */
export function isSemanticDataAvailable() {
    const cacheDir = getSemanticDataCacheDir();
    for (const entry of SEMANTIC_DATA_FILES) {
        if (!entry.required) {
            continue;
        }
        try {
            fs.accessSync(path.join(cacheDir, entry.filename), fs.constants.R_OK);
        }
        catch {
            return false;
        }
    }
    return true;
}
|
|
183
|
+
/**
 * Downloads `url` into `destPath`, reporting progress after every received chunk.
 *
 * The body is collected in memory and then written to a temporary sibling file
 * that is renamed onto `destPath`, so an interrupted write never leaves a
 * truncated file at the final location (the availability checks in this module
 * only test that the file exists).
 *
 * @param {string} url - Address to fetch.
 * @param {string} destPath - Final location of the file; parent directories are created.
 * @param {(progress: {file: string, bytesDownloaded: number, totalBytes: number|null, percent: number|null}) => void} [onProgress]
 *   Optional callback invoked once per received chunk.
 * @returns {Promise<void>}
 * @throws {Error} "File not found: <url>" on HTTP 404, "Failed to download ..." on
 *   any other non-2xx status, "Response body is not readable" when the response
 *   has no body stream; file-system errors are propagated.
 */
async function downloadFile(url, destPath, onProgress) {
    const response = await fetch(url, {
        headers: {
            "User-Agent": "bridge-anonymization/1.0.0",
        },
    });
    if (!response.ok) {
        if (response.status === 404) {
            throw new Error(`File not found: ${url}`);
        }
        throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
    }
    // content-length counts the bytes on the wire. For a content-encoded
    // (e.g. gzip) response fetch hands out the decoded bytes, so the header is
    // not a usable total and is reported as unknown instead.
    const contentEncoding = (response.headers.get("content-encoding") ?? "").trim().toLowerCase();
    const isEncoded = contentEncoding !== "" && contentEncoding !== "identity";
    const declaredLength = Number.parseInt(response.headers.get("content-length") ?? "", 10);
    const total = !isEncoded && Number.isFinite(declaredLength) ? declaredLength : null;
    // Ensure directory exists
    await fsPromises.mkdir(path.dirname(destPath), { recursive: true });
    const fileName = path.basename(destPath);
    const reader = response.body?.getReader();
    if (reader === undefined) {
        throw new Error("Response body is not readable");
    }
    const chunks = [];
    let bytesDownloaded = 0;
    for (;;) {
        const { done, value } = await reader.read();
        if (done) {
            break;
        }
        chunks.push(value);
        bytesDownloaded += value.length;
        if (onProgress) {
            onProgress({
                file: fileName,
                bytesDownloaded,
                totalBytes: total,
                // Clamp: a wrong content-length must not produce more than 100%.
                percent: total !== null && total > 0
                    ? Math.min(100, Math.round((bytesDownloaded / total) * 100))
                    : null,
            });
        }
    }
    // Write to a temporary file first, then move it into place.
    const tempPath = `${destPath}.${process.pid}.part`;
    try {
        await fsPromises.writeFile(tempPath, Buffer.concat(chunks));
        await fsPromises.rename(tempPath, destPath);
    }
    catch (err) {
        await fsPromises.rm(tempPath, { force: true });
        throw err;
    }
}
|
|
232
|
+
/**
 * Extracts a zip archive (used for cities15000.zip) into `destDir` and deletes
 * the archive afterwards.
 *
 * Extraction is delegated to external tools: `unzip` is tried first, `tar` as
 * a fallback. Both are started with an argument vector instead of a shell
 * command line, so characters such as quotes, `$` or backticks in either path
 * are passed through literally and are never interpreted by a shell.
 *
 * When both tools fail the archive is left in place, since the error message
 * asks the user to extract it manually.
 *
 * @param {string} zipPath - Archive to extract; removed after a successful extraction.
 * @param {string} destDir - Directory that receives the archive contents.
 * @returns {Promise<void>}
 * @throws {Error} When neither tool could extract the archive (the failure of
 *   the fallback tool is attached as `cause`).
 */
async function extractZip(zipPath, destDir) {
    const { execFile } = await import("child_process");
    const { promisify } = await import("util");
    const execFileAsync = promisify(execFile);
    try {
        // Try using unzip command (available on most systems)
        await execFileAsync("unzip", ["-o", zipPath, "-d", destDir]);
    }
    catch {
        // If unzip is not available, try using tar (some systems have it)
        try {
            await execFileAsync("tar", ["-xf", zipPath, "-C", destDir]);
        }
        catch (tarError) {
            throw new Error("Could not extract zip file. Please install 'unzip' command or extract manually.", { cause: tarError });
        }
    }
    // Clean up zip file
    await fsPromises.unlink(zipPath);
}
|
|
258
|
+
/**
 * Downloads every file listed in SEMANTIC_DATA_FILES into the cache directory.
 *
 * Entries whose URL ends in ".zip" are stored as "<filename>.zip", extracted
 * into the cache directory, and then checked: the extraction only counts as
 * successful when the expected `filename` exists afterwards.
 *
 * A failing required file aborts the whole download; a failing optional file
 * is reported through `onStatus` and skipped.
 *
 * @param {Function} [onProgress] - Forwarded to downloadFile for per-chunk progress.
 * @param {(message: string) => void} [onStatus] - Receives human-readable status lines.
 * @returns {Promise<string>} The cache directory.
 * @throws {Error} "Failed to download required file ..." with the underlying
 *   error attached as `cause`.
 */
export async function downloadSemanticData(onProgress, onStatus) {
    const dataDir = getSemanticDataCacheDir();
    // Create directory
    await fsPromises.mkdir(dataDir, { recursive: true });
    onStatus?.("Downloading semantic enrichment data...");
    onStatus?.(`Cache directory: ${dataDir}`);
    for (const file of SEMANTIC_DATA_FILES) {
        const destPath = path.join(dataDir, file.filename);
        const isZip = file.url.endsWith(".zip");
        onStatus?.(`Downloading ${file.description}...`);
        try {
            if (isZip) {
                // Download zip and extract
                const zipPath = destPath + ".zip";
                await downloadFile(file.url, zipPath, onProgress);
                onStatus?.(`Extracting ${file.filename}...`);
                await extractZip(zipPath, dataDir);
                // The archive decides which names it creates; make sure the
                // one the loader will read is really there.
                try {
                    await fsPromises.access(destPath);
                }
                catch (accessError) {
                    throw new Error(`Archive did not contain ${file.filename}`, { cause: accessError });
                }
            }
            else {
                await downloadFile(file.url, destPath, onProgress);
            }
        }
        catch (e) {
            if (file.required) {
                // Keep the original error reachable for callers that need details.
                throw new Error(`Failed to download required file ${file.filename}: ${String(e)}`, { cause: e });
            }
            // Optional files may be missing; report and continue
            onStatus?.(`Skipping optional file ${file.filename}`);
        }
    }
    onStatus?.("Semantic data download complete!");
    return dataDir;
}
|
|
294
|
+
/**
 * Makes sure the semantic data files are present in the cache directory.
 *
 * @param {object} [options]
 * @param {boolean} [options.autoDownload=true] - Download missing data instead of throwing.
 * @param {Function} [options.onProgress] - Forwarded to downloadSemanticData.
 * @param {(message: string) => void} [options.onStatus] - Receives status lines.
 * @returns {Promise<string>} The cache directory.
 * @throws {Error} When the data is missing and `autoDownload` is false.
 */
export async function ensureSemanticData(options = {}) {
    const { autoDownload = true, onProgress, onStatus } = options;
    const dataDir = getSemanticDataCacheDir();
    // Fast path: everything is already cached.
    if (await isSemanticDataDownloaded()) {
        onStatus?.(`Using cached semantic data: ${dataDir}`);
        return dataDir;
    }
    if (!autoDownload) {
        const message = `Semantic data not found at ${dataDir}.\n\n` +
            `To download automatically, use:\n` +
            ` createAnonymizer({ semantic: { enabled: true, autoDownload: true } })\n\n` +
            `Or disable semantic masking:\n` +
            ` createAnonymizer({ semantic: { enabled: false } })`;
        throw new Error(message);
    }
    await downloadSemanticData(onProgress, onStatus);
    return dataDir;
}
|
|
317
|
+
/**
 * Deletes the cache directory with all downloaded files and then drops the
 * data held in memory. A cache directory that does not exist is not an error.
 *
 * @returns {Promise<void>}
 */
export async function clearSemanticDataCache() {
    const removal = { recursive: true, force: true };
    await fsPromises.rm(getSemanticDataCacheDir(), removal);
    // Also clear in-memory data
    clearSemanticData();
}
|
|
326
|
+
/**
 * Describes the semantic data set: the file registry, the cache directory and
 * a rough total download size.
 *
 * @returns {{files: Array, cacheDir: string, totalSize: string}}
 */
export function getSemanticDataInfo() {
    const files = SEMANTIC_DATA_FILES;
    const cacheDir = getSemanticDataCacheDir();
    return { files, cacheDir, totalSize: "~4 MB" };
}
|
|
336
|
+
// =============================================================================
|
|
337
|
+
// Data Parsing Functions
|
|
338
|
+
// =============================================================================
|
|
339
|
+
/**
 * Parses nam_dict.txt into a map from lower-cased first name to its gender.
 *
 * Handling of the input:
 * - Encoding: the bytes are decoded as strict UTF-8; if they are not valid
 *   UTF-8 the file is decoded as ISO-8859-1 instead. Decoding UTF-8 data as
 *   ISO-8859-1 would corrupt every accented name and shift the fixed column
 *   positions, so UTF-8 is tried first.
 * - Lines starting with "#", blank lines, and lines with "+" at string index
 *   29 (expanded-umlaut duplicates) are skipped, as are lines whose gender
 *   code is not one of the codes in GENDER_CODE_MAP.
 * - A "+" inside a name is a placeholder between the parts of a compound
 *   name; such a name is registered in its joined, space-separated and
 *   hyphenated spellings (as the gender-guesser reference parser does)
 *   instead of under the literal "+" spelling, which no input would match.
 * - When a name occurs more than once, the first occurrence wins.
 *
 * Locale-specific overrides are not derived from the frequency columns yet,
 * so `localeOverrides` is always undefined on the returned entries.
 *
 * @param {string} filePath - Location of nam_dict.txt.
 * @returns {Map<string, {gender: string, localeOverrides: undefined}>}
 * @throws Propagates the file-system error when the file cannot be read.
 */
function parseNameDict(filePath) {
    const names = new Map();
    const raw = fs.readFileSync(filePath);
    let content;
    try {
        content = new TextDecoder("utf-8", { fatal: true }).decode(raw);
    }
    catch {
        // Not valid UTF-8: treat the bytes as ISO-8859-1.
        content = raw.toString("latin1");
    }
    for (const line of content.split("\n")) {
        // Skip comments and empty lines
        if (line.startsWith("#") || line.trim() === "") {
            continue;
        }
        // Skip lines with '+' in column 30 (expanded umlauts duplicates)
        if (line.length > 29 && line[29] === "+") {
            continue;
        }
        // Format: <gender_code> <name> <spaces> <frequencies>
        const match = line.match(/^(\?[MF]?|1[MF]|[MF])\s+(\S+)/);
        if (!match) {
            continue;
        }
        const [, genderCode, name] = match;
        const gender = GENDER_CODE_MAP[genderCode];
        if (gender === undefined) {
            continue;
        }
        const normalizedName = name.toLowerCase();
        const spellings = normalizedName.includes("+")
            ? ["", " ", "-"].map((joiner) => normalizedName.split("+").join(joiner))
            : [normalizedName];
        for (const spelling of spellings) {
            if (!names.has(spelling)) {
                names.set(spelling, { gender, localeOverrides: undefined });
            }
        }
    }
    return names;
}
|
|
395
|
+
/**
 * Parses cities15000.txt (tab-separated) into a map from lower-cased city
 * name to `{ country, population }`.
 *
 * Columns read per row: 1 = name, 2 = ASCII name, 3 = comma-separated
 * alternate names, 8 = country code, 14 = population. Rows with fewer than 15
 * columns, or without a name or country code, are skipped. A city is
 * registered under its name, its ASCII name and every alternate name longer
 * than two characters.
 *
 * When several rows claim the same key, the row with the highest population
 * wins. A missing or non-numeric population counts as 0; left as NaN it would
 * make every later comparison false and pin the first row for good.
 *
 * @param {string} filePath - Location of cities15000.txt.
 * @returns {Map<string, {country: string, population: number}>}
 * @throws Propagates the file-system error when the file cannot be read.
 */
function parseCities(filePath) {
    const cities = new Map();
    // Registers `entry` under `name` unless a more populous entry is already there.
    const addCity = (name, entry) => {
        const normalized = name.toLowerCase();
        const existing = cities.get(normalized);
        if (!existing || entry.population > existing.population) {
            cities.set(normalized, entry);
        }
    };
    for (const line of fs.readFileSync(filePath, "utf-8").split("\n")) {
        if (line.trim() === "") {
            continue;
        }
        const parts = line.split("\t");
        if (parts.length < 15) {
            continue;
        }
        const name = parts[1];
        const asciiName = parts[2];
        const countryCode = parts[8];
        if (name === "" || countryCode === "") {
            continue;
        }
        const parsedPopulation = Number.parseInt(parts[14], 10);
        const population = Number.isNaN(parsedPopulation) ? 0 : parsedPopulation;
        const cityEntry = { country: countryCode, population };
        addCity(name, cityEntry);
        if (asciiName !== "" && asciiName.toLowerCase() !== name.toLowerCase()) {
            addCity(asciiName, cityEntry);
        }
        for (const altName of parts[3].split(",")) {
            const trimmed = altName.trim();
            // Very short alternates (codes, abbreviations) are too ambiguous to keep.
            if (trimmed.length > 2) {
                addCity(trimmed, cityEntry);
            }
        }
    }
    return cities;
}
|
|
446
|
+
/**
 * Parses countryInfo.txt (tab-separated) into a map from lower-cased country
 * name to country code.
 *
 * Column 0 is read as the code and column 4 as the name. Comment lines ("#"),
 * blank lines, rows with fewer than five columns and rows with an empty code
 * or name are ignored. Besides the name itself, every spelling returned by
 * getCountryVariations() is registered for the same code.
 *
 * @param {string} filePath - Location of countryInfo.txt.
 * @returns {Map<string, string>}
 * @throws Propagates the file-system error when the file cannot be read.
 */
function parseCountries(filePath) {
    const countries = new Map();
    const rows = fs
        .readFileSync(filePath, "utf-8")
        .split("\n")
        .filter((line) => !line.startsWith("#") && line.trim() !== "")
        .map((line) => line.split("\t"))
        .filter((columns) => columns.length >= 5);
    for (const columns of rows) {
        const code = columns[0];
        const name = columns[4];
        if (code === "" || name === "") {
            continue;
        }
        // The official name first, then abbreviations and translated names.
        const spellings = [name, ...getCountryVariations(name, code)];
        for (const spelling of spellings) {
            countries.set(spelling.toLowerCase(), code);
        }
    }
    return countries;
}
|
|
475
|
+
/**
 * Alternative spellings for a handful of countries. An entry applies when
 * either its `name` or its ISO `code` matches, so the aliases are still found
 * when the data file spells the official name differently (for example
 * "The Netherlands" or "Czechia").
 */
const COUNTRY_NAME_VARIANTS = [
    { code: "US", name: "United States", variants: ["USA", "US", "America", "United States of America"] },
    { code: "GB", name: "United Kingdom", variants: ["UK", "Britain", "Great Britain", "England"] },
    { code: "DE", name: "Germany", variants: ["Deutschland"] },
    { code: "FR", name: "France", variants: ["Frankreich"] },
    { code: "ES", name: "Spain", variants: ["España", "Espana", "Spanien"] },
    { code: "IT", name: "Italy", variants: ["Italia", "Italien"] },
    { code: "NL", name: "Netherlands", variants: ["Holland", "The Netherlands", "Niederlande", "Netherlands"] },
    { code: "CH", name: "Switzerland", variants: ["Schweiz", "Suisse", "Svizzera"] },
    { code: "AT", name: "Austria", variants: ["Österreich", "Oesterreich"] },
    { code: "BE", name: "Belgium", variants: ["Belgien", "Belgique"] },
    { code: "RU", name: "Russia", variants: ["Russland", "Russian Federation"] },
    { code: "CN", name: "China", variants: ["People's Republic of China", "PRC"] },
    { code: "JP", name: "Japan", variants: ["Nippon"] },
    { code: "KR", name: "South Korea", variants: ["Korea", "Republic of Korea"] },
    { code: "AE", name: "United Arab Emirates", variants: ["UAE", "Emirates"] },
    { code: "CZ", name: "Czech Republic", variants: ["Czechia", "Tschechien", "Czech Republic"] },
];
/**
 * Lists additional spellings under which a country should be found.
 *
 * The result always starts with the lower-cased country code, followed by the
 * known aliases (abbreviations and translated names) when the country is in
 * the alias table. Unknown countries yield the code only.
 *
 * @param {string} name - Country name as it appears in the data file.
 * @param {string} code - Country code of that row.
 * @returns {string[]} Spellings in their original casing (the code is lower-cased).
 */
function getCountryVariations(name, code) {
    const upperCode = code.toUpperCase();
    const known = COUNTRY_NAME_VARIANTS.find((entry) => entry.name === name || entry.code === upperCode);
    return known === undefined
        ? [code.toLowerCase()]
        : [code.toLowerCase(), ...known.variants];
}
|
|
504
|
+
/**
 * Parses admin1CodesASCII.txt (tab-separated) into a map from lower-cased
 * region name to `{ country, name }`.
 *
 * Column 0 holds a code such as "US.CA" whose part before the first dot is
 * taken as the country code; column 1 is the region name and column 2 its
 * ASCII spelling, which is registered as an additional key when it differs.
 * Later rows overwrite earlier rows with the same key.
 *
 * The file is optional: if it cannot be read, an empty map is returned.
 *
 * @param {string} filePath - Location of admin1CodesASCII.txt.
 * @returns {Map<string, {country: string, name: string}>}
 */
function parseRegions(filePath) {
    const regions = new Map();
    let content;
    try {
        content = fs.readFileSync(filePath, "utf-8");
    }
    catch {
        // admin1 file is optional
        return regions;
    }
    for (const row of content.split("\n")) {
        if (row.trim() === "") {
            continue;
        }
        const columns = row.split("\t");
        if (columns.length < 2) {
            continue;
        }
        const [code, name, asciiName] = columns; // code format: "US.CA"
        if (code === "" || name === "") {
            continue;
        }
        const countryCode = code.split(".")[0];
        if (countryCode === "") {
            continue;
        }
        const regionEntry = { country: countryCode, name };
        const primaryKey = name.toLowerCase();
        regions.set(primaryKey, regionEntry);
        const hasDistinctAscii = asciiName !== undefined &&
            asciiName !== "" &&
            asciiName.toLowerCase() !== primaryKey;
        if (hasDistinctAscii) {
            regions.set(asciiName.toLowerCase(), regionEntry);
        }
    }
    return regions;
}
|
|
545
|
+
// =============================================================================
|
|
546
|
+
// Data Loading and Lookup
|
|
547
|
+
// =============================================================================
|
|
548
|
+
/**
 * One-step setup: makes sure the data files exist (see ensureSemanticData for
 * the accepted options) and then loads them into memory.
 *
 * @param {object} [options] - Passed through to ensureSemanticData unchanged.
 * @returns {Promise<void>}
 */
export async function initializeSemanticData(options = {}) {
    // Loading must not start before the files are known to be in place.
    const filesReady = ensureSemanticData(options);
    await filesReady;
    loadSemanticData();
}
|
|
557
|
+
/**
 * Parses the cached data files and stores the result in the module-level
 * cache. If data is already loaded, the cached object is returned without
 * touching the disk.
 *
 * @returns {{names: Map, cities: Map, countries: Map, regions: Map, loaded: boolean}}
 * @throws Error if required data files are not available
 */
export function loadSemanticData() {
    if (semanticData?.loaded === true) {
        return semanticData;
    }
    const dataDir = getSemanticDataCacheDir();
    if (!isSemanticDataAvailable()) {
        throw new Error(`Semantic data files not found in ${dataDir}. ` +
            `Use ensureSemanticData() or createAnonymizer({ semantic: { enabled: true, autoDownload: true } }) to download.`);
    }
    const inCache = (filename) => path.join(dataDir, filename);
    // The cache is only replaced once every parser has finished.
    semanticData = {
        names: parseNameDict(inCache("nam_dict.txt")),
        cities: parseCities(inCache("cities15000.txt")),
        countries: parseCountries(inCache("countryInfo.txt")),
        regions: parseRegions(inCache("admin1CodesASCII.txt")),
        loaded: true,
    };
    return semanticData;
}
|
|
583
|
+
/**
 * Returns the in-memory semantic data, loading it from the cache directory
 * first when nothing has been loaded yet.
 *
 * @returns {object} The loaded data set.
 */
export function getSemanticData() {
    const isReady = semanticData !== null && semanticData.loaded === true;
    return isReady ? semanticData : loadSemanticData();
}
|
|
592
|
+
/**
|
|
593
|
+
* Drops the in-memory semantic data by resetting the module-level cache to null, so the next getSemanticData() call loads it again (useful for testing)
|
|
594
|
+
*/
|
|
595
|
+
export function clearSemanticData() {
|
|
596
|
+
semanticData = null;
|
|
597
|
+
}
|
|
598
|
+
/**
 * Looks up the gender recorded for a first name.
 *
 * The name is matched case-insensitively and surrounding whitespace is
 * ignored. When `locale` is given and the entry carries its own override for
 * exactly that locale, the override is returned instead of the default
 * gender. Only the entry's own keys count: a locale such as "constructor"
 * must not resolve to a member inherited from Object.prototype.
 *
 * @param {string} name - First name to look up.
 * @param {string} [locale] - Key into the entry's `localeOverrides`.
 * @returns {string|undefined} The gender label, or undefined for an unknown name.
 */
export function lookupGender(name, locale) {
    const data = getSemanticData();
    const entry = data.names.get(name.toLowerCase().trim());
    if (entry === undefined) {
        return undefined;
    }
    // Check for locale-specific override
    const overrides = entry.localeOverrides;
    if (locale !== undefined &&
        locale !== "" &&
        overrides !== undefined &&
        Object.hasOwn(overrides, locale) &&
        overrides[locale] !== undefined) {
        return overrides[locale];
    }
    return entry.gender;
}
|
|
615
|
+
/**
|
|
616
|
+
* Minimum population (inclusive) at which a city counts as "major" and is matched before regions of the same name
|
|
617
|
+
*/
|
|
618
|
+
const MAJOR_CITY_POPULATION = 500000;
|
|
619
|
+
/**
 * Classifies a location string as country, city or region.
 *
 * The input is lower-cased and trimmed, then matched in this order:
 * 1. countries (so that e.g. "USA" is never reported as a city),
 * 2. cities with a population of at least MAJOR_CITY_POPULATION,
 * 3. regions,
 * 4. all remaining cities.
 *
 * @param {string} location - Location text to classify.
 * @returns {{type: "country"|"city"|"region", countryCode: string}|undefined}
 *   undefined when the location is not known.
 */
export function lookupLocationType(location) {
    const { countries, cities, regions } = getSemanticData();
    const key = location.toLowerCase().trim();
    const countryCode = countries.get(key);
    if (countryCode !== undefined) {
        return { type: "country", countryCode };
    }
    const city = cities.get(key);
    const asCity = () => ({ type: "city", countryCode: city.country });
    if (city && city.population >= MAJOR_CITY_POPULATION) {
        return asCity();
    }
    const region = regions.get(key);
    if (region) {
        return { type: "region", countryCode: region.country };
    }
    // Smaller cities only win when no region carries the same name.
    return city ? asCity() : undefined;
}
|
|
647
|
+
/**
 * Reports how many entries each loaded data set holds. Nothing is loaded on
 * demand: while no data is in memory all counts are 0 and `loaded` is false.
 *
 * @returns {{names: number, cities: number, countries: number, regions: number, loaded: boolean}}
 */
export function getDataStats() {
    if (semanticData?.loaded !== true) {
        return { names: 0, cities: 0, countries: 0, regions: 0, loaded: false };
    }
    const { names, cities, countries, regions } = semanticData;
    return {
        names: names.size,
        cities: cities.size,
        countries: countries.size,
        regions: regions.size,
        loaded: true,
    };
}
|
|
662
|
+
//# sourceMappingURL=semantic-data-loader.js.map
|