@elanlanguages/bridge-anonymization 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -1
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
- package/dist/crypto/pii-map-crypto.js +8 -8
- package/dist/crypto/pii-map-crypto.js.map +1 -1
- package/dist/index.d.ts +25 -20
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -52
- package/dist/index.js.map +1 -1
- package/dist/ner/model-manager.d.ts.map +1 -1
- package/dist/ner/model-manager.js +10 -8
- package/dist/ner/model-manager.js.map +1 -1
- package/dist/ner/ner-model.d.ts.map +1 -1
- package/dist/ner/ner-model.js +9 -9
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/onnx-runtime.d.ts +3 -3
- package/dist/ner/onnx-runtime.d.ts.map +1 -1
- package/dist/ner/onnx-runtime.js +1 -1
- package/dist/ner/onnx-runtime.js.map +1 -1
- package/dist/ner/tokenizer.js +3 -3
- package/dist/ner/tokenizer.js.map +1 -1
- package/dist/pipeline/index.d.ts +7 -4
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +7 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/resolver.d.ts.map +1 -1
- package/dist/pipeline/resolver.js +3 -2
- package/dist/pipeline/resolver.js.map +1 -1
- package/dist/pipeline/semantic-data-loader.d.ts +157 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +662 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +102 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +268 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +52 -12
- package/dist/pipeline/tagger.d.ts.map +1 -1
- package/dist/pipeline/tagger.js +226 -21
- package/dist/pipeline/tagger.js.map +1 -1
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/types/index.d.ts +66 -3
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +14 -3
- package/dist/types/index.js.map +1 -1
- package/dist/utils/index.d.ts +3 -3
- package/dist/utils/index.js +3 -3
- package/package.json +7 -5
|
@@ -0,0 +1,801 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Title Extractor
|
|
3
|
+
* Extracts and strips honorific titles/prefixes from PERSON entities
|
|
4
|
+
* so that titles remain visible in anonymized text for translation.
|
|
5
|
+
*
|
|
6
|
+
* Supported languages: ar, de, en, es, fr, it, lv, nl, pt, zh
|
|
7
|
+
*/
|
|
8
|
+
import { PIIType, DetectionSource, } from "../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* Comprehensive list of honorific titles by language
|
|
11
|
+
* Organized by language code, then by category
|
|
12
|
+
*
|
|
13
|
+
* Each pattern includes:
|
|
14
|
+
* - The title text (case-insensitive matching)
|
|
15
|
+
* - Whether it requires a period (for abbreviations)
|
|
16
|
+
* - Common variants
|
|
17
|
+
*/
|
|
18
|
+
// English honorifics, grouped by category. Dotted and undotted
// abbreviations are listed separately because matching is literal.
const EN_TITLES = [
    // Basic honorifics ("Mx" is gender-neutral)
    "Mr", "Mr.", "Mister",
    "Mrs", "Mrs.", "Missus",
    "Ms", "Ms.", "Miss",
    "Mx", "Mx.",
    // Professional/Academic
    "Dr", "Dr.", "Doctor",
    "Prof", "Prof.", "Professor",
    "Rev", "Rev.", "Reverend",
    "Fr", "Fr.", "Father",
    "Sr", "Sr.", "Sister",
    "Br", "Br.", "Brother",
    // Military
    "Capt", "Capt.", "Captain",
    "Col", "Col.", "Colonel",
    "Gen", "Gen.", "General",
    "Lt", "Lt.", "Lieutenant",
    "Sgt", "Sgt.", "Sergeant",
    "Maj", "Maj.", "Major",
    "Cpl", "Cpl.", "Corporal",
    "Pvt", "Pvt.", "Private",
    "Adm", "Adm.", "Admiral",
    "Cmdr", "Cmdr.", "Commander",
    // Nobility/Honorific
    "Sir", "Dame", "Lord", "Lady",
    "Hon", "Hon.", "Honorable", "The Honorable",
    "Rt Hon", "Rt. Hon.", "Right Honorable", "The Right Honorable",
    // Legal
    "Esq", "Esq.", "Esquire",
    "Atty", "Atty.", "Attorney",
    "Judge", "Justice",
];
|
|
105
|
+
// German honorifics, including the stacked academic forms
// ("Dr. med.", "Prof. Dr.") used in German-speaking countries.
const DE_TITLES = [
    // Basic honorifics
    "Herr", "Frau", "Fräulein",
    // Professional/Academic
    "Dr", "Dr.", "Doktor",
    "Prof", "Prof.", "Professor",
    "Mag", "Mag.", "Magister",
    "Dipl", "Dipl.", "Diplom",
    "Dipl.-Ing", "Dipl.-Ing.", "Diplomingenieur",
    "Ing", "Ing.", "Ingenieur",
    // Combinations common in German-speaking countries
    "Dr. med", "Dr. med.",
    "Dr. jur", "Dr. jur.",
    "Dr. phil", "Dr. phil.",
    "Dr. rer. nat", "Dr. rer. nat.",
    "Dr. h.c", "Dr. h.c.",
    "Prof. Dr", "Prof. Dr.",
    // Military/Official
    "Gen", "Gen.", "General",
    "Oberst", "Major", "Hauptmann",
    // Religious
    "Pfarrer", "Pastor", "Bischof",
];
|
|
155
|
+
// French honorifics. Accented and unaccented spellings ("Maître" /
// "Maitre", "Sœur" / "Soeur", "Gén" / "Gen") are both listed.
const FR_TITLES = [
    // Basic honorifics
    "M", "M.", "Monsieur",
    "Mme", "Mme.", "Madame",
    "Mlle", "Mlle.", "Mademoiselle",
    // Professional ("Me" / "Maître" is used for lawyers/notaries)
    "Dr", "Dr.", "Docteur",
    "Pr", "Pr.", "Professeur",
    "Prof", "Prof.",
    "Me", "Me.", "Maître", "Maitre",
    "Mgr", "Mgr.", "Monseigneur",
    // Military
    "Gén", "Gén.", "Général",
    "Gen", "Gen.",
    "Col", "Col.", "Colonel",
    "Cdt", "Cdt.", "Commandant",
    "Capt", "Capt.", "Capitaine",
    "Lt", "Lt.", "Lieutenant",
    // Religious
    "Père", "Frère", "Sœur", "Soeur", "Abbé",
];
|
|
208
|
+
// Spanish honorifics, with masculine and feminine forms listed side by side.
const ES_TITLES = [
    // Basic honorifics
    "Sr", "Sr.", "Señor",
    "Sra", "Sra.", "Señora",
    "Srta", "Srta.", "Señorita",
    // Traditional
    "Don", "Doña", "D.", "Dña.",
    // Professional
    "Dr", "Dr.", "Doctor",
    "Dra", "Dra.", "Doctora",
    "Prof", "Prof.", "Profesor",
    "Profa", "Profa.", "Profesora",
    "Lic", "Lic.", "Licenciado", "Licenciada",
    "Ing", "Ing.", "Ingeniero", "Ingeniera",
    "Arq", "Arq.", "Arquitecto", "Arquitecta",
    "Abog", "Abog.", "Abogado", "Abogada",
    // Military
    "Gral", "Gral.", "General",
    "Cnel", "Cnel.", "Coronel",
    "Cap", "Cap.", "Capitán",
    "Tte", "Tte.", "Teniente",
    // Religious
    "Padre", "Fray", "Sor",
];
|
|
272
|
+
// Italian honorifics, including the dotted feminine abbreviations
// ("Sig.ra", "Dott.ssa", "Prof.ssa").
const IT_TITLES = [
    // Basic honorifics
    "Sig", "Sig.", "Signor", "Signore",
    "Sig.ra", "Signora",
    "Sig.na", "Signorina",
    // Professional
    "Dott", "Dott.", "Dottore", "Dottor",
    "Dott.ssa", "Dottoressa",
    "Prof", "Prof.", "Professore", "Professor",
    "Prof.ssa", "Professoressa",
    "Ing", "Ing.", "Ingegnere",
    "Avv", "Avv.", "Avvocato",
    "Arch", "Arch.", "Architetto",
    "Rag", "Rag.", "Ragioniere",
    "Geom", "Geom.", "Geometra",
    // Nobility
    "Conte", "Contessa",
    "Marchese", "Marchesa",
    "Principe", "Principessa",
    "Duca", "Duchessa",
    // Religious
    "Don", "Padre", "Fra", "Suor",
    "Mons", "Mons.", "Monsignore",
];
|
|
329
|
+
// Portuguese honorifics, with masculine and feminine forms listed side by side.
const PT_TITLES = [
    // Basic honorifics
    "Sr", "Sr.", "Senhor",
    "Sra", "Sra.", "Senhora",
    "Srta", "Srta.", "Senhorita",
    // Professional
    "Dr", "Dr.", "Doutor",
    "Dra", "Dra.", "Doutora",
    "Prof", "Prof.", "Professor",
    "Profa", "Profa.", "Professora",
    "Eng", "Eng.", "Engenheiro", "Engenheira",
    "Arq", "Arq.", "Arquiteto", "Arquiteta",
    // Traditional (Brazil/Portugal)
    "Dom", "Dona",
    // Military
    "Gen", "Gen.", "General",
    "Cel", "Cel.", "Coronel",
    "Cap", "Cap.", "Capitão",
    "Ten", "Ten.", "Tenente",
    // Religious
    "Padre", "Frei", "Irmã", "Irmão",
];
|
|
384
|
+
// Dutch honorifics. Note that "Mr" / "Mr." here abbreviates "Meester",
// a legal title, rather than the English "Mister".
const NL_TITLES = [
    // Basic honorifics
    "Dhr", "Dhr.", "De heer", "Meneer", "Mijnheer",
    "Mevr", "Mevr.", "Mevrouw",
    "Mw", "Mw.",
    "Mejuffrouw", "Juffrouw",
    // Professional/Academic
    "Dr", "Dr.", "Doctor",
    "Prof", "Prof.", "Professor",
    "Ir", "Ir.", "Ingenieur",
    "Mr", "Mr.", "Meester",
    "Drs", "Drs.", "Doctorandus",
    "Ing", "Ing.",
    // Military
    "Gen", "Gen.", "Generaal",
    "Kol", "Kol.", "Kolonel",
    "Kapt", "Kapt.", "Kapitein",
    // Religious
    "Ds", "Ds.", "Dominee",
    "Pastoor", "Pater",
];
|
|
434
|
+
// Latvian honorifics. Latvian uses fewer abbreviated titles than the
// Western languages above, so this list is short.
const LV_TITLES = [
    // Basic honorifics
    "kungs", "kundze", "jaunkundze",
    "k-gs", "k-dze",
    // Professional
    "Dr", "Dr.", "doktors",
    "Prof", "Prof.", "profesors",
];
|
|
451
|
+
// Arabic honorifics, in Arabic script and in common Latin transliterations.
// Arabic-script entries are kept one per line so that right-to-left
// rendering cannot visually reorder neighbouring literals.
const AR_TITLES = [
    // Basic honorifics - Arabic script
    "السيد",
    "السيدة",
    "الآنسة",
    // Basic honorifics - transliterated
    "Al-Sayyid", "As-Sayyid", "Sayyid",
    "Al-Sayyida", "As-Sayyida", "Sayyida",
    "Al-Aanisa", "Aanisa",
    // Professional - Arabic script
    "الدكتور",
    "الدكتورة",
    "الأستاذ",
    "الأستاذة",
    "المهندس",
    "المهندسة",
    // Professional - transliterated
    "Dr", "Dr.", "Doktor",
    "Ustadh", "Ustadha", "Ustaaz",
    "Muhandis", "Muhandisa",
    // Religious
    "الشيخ",
    "Sheikh", "Shaikh", "Shaykh",
    "الإمام",
    "Imam",
    "الحاج",
    "Hajj", "Hajji", "Al-Hajj",
    // Nobility/Honorific
    "أمير",
    "Amir", "Emir",
    "سلطان",
    "Sultan",
];
|
|
500
|
+
// Chinese honorifics. Where the simplified and traditional spellings
// differ, both are listed (simplified first).
const ZH_TITLES = [
    // Basic honorifics:
    // xiānsheng (Mr.), nǚshì (Ms.), xiǎojiě (Miss), tàitai (Mrs.)
    "先生", "女士", "小姐", "太太",
    // Professional/Academic:
    // bóshì (Doctor, PhD), jiàoshòu (Professor), yīshēng (Doctor, medical)
    "博士", "教授", "医生",
    // lǎoshī (Teacher), gōngchéngshī (Engineer), lǜshī (Lawyer)
    "老师", "老師",
    "工程师", "工程師",
    "律师", "律師",
    // Military: jiāngjūn (General), shàngxiào (Colonel), shàngwèi (Captain)
    "将军", "將軍",
    "上校", "上尉",
    // Traditional/Formal:
    // géxià (Your Excellency), dàrén (Lord/Sir, historical)
    "阁下", "閣下",
    "大人",
];
|
|
527
|
+
/**
 * Every supported title across all languages, de-duplicated and ordered
 * longest first, so that a compound title such as "Prof. Dr." is tried
 * before its shorter prefix "Prof.".
 *
 * Several titles ("Dr", "Dr.", "Prof", "General", ...) appear in more than
 * one language list. Passing the concatenation through a Set keeps only the
 * first occurrence of each exact string, so no identical regex is compiled
 * twice and getAllTitles() reports each title once. Array.prototype.sort is
 * stable, so titles of equal length keep their first-occurrence order.
 */
const ALL_TITLES = [
    ...new Set([
        ...EN_TITLES,
        ...DE_TITLES,
        ...FR_TITLES,
        ...ES_TITLES,
        ...IT_TITLES,
        ...PT_TITLES,
        ...NL_TITLES,
        ...LV_TITLES,
        ...AR_TITLES,
        ...ZH_TITLES,
    ]),
].sort((a, b) => b.length - a.length);
|
|
543
|
+
/**
 * One pre-compiled, case-insensitive regex per entry of ALL_TITLES.
 * Each regex is anchored at the start of the string and requires the
 * title to be followed by whitespace or by the end of the string, so
 * "Dr" does not match the beginning of "Drake".
 */
const TITLE_PATTERNS = ALL_TITLES.map((title) => {
    const source = "^" + escapeRegex(title) + "(?:\\s+|$)";
    return { pattern: new RegExp(source, "i"), title };
});
|
|
551
|
+
/**
 * Returns `str` with every regex metacharacter prefixed by a backslash,
 * so the result can be embedded in a RegExp source and match literally.
 *
 * @param str - Text to escape
 * @returns The escaped text
 */
function escapeRegex(str) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return str.replace(metaCharacters, (character) => `\\${character}`);
}
|
|
557
|
+
/**
 * Extracts a title from the beginning of a name.
 *
 * The patterns in TITLE_PATTERNS are tried in order and the first one that
 * matches AND leaves a non-empty name behind wins. A string that consists
 * of a title only (e.g. "Dr.") is therefore reported as having no title.
 *
 * @param name - Full name potentially starting with a title
 * @returns An object with:
 *   - `title`: the title exactly as written in the input (original case),
 *     or `undefined` when no title was found
 *   - `nameWithoutTitle`: the remaining name, trimmed
 *   - `titleLength`: offset into `name` at which `nameWithoutTitle` starts,
 *     i.e. any leading whitespace + the title + the whitespace after it;
 *     0 when no title was found
 *
 * @example
 * extractTitle("Dr. John Smith") // { title: "Dr.", nameWithoutTitle: "John Smith", titleLength: 4 }
 * extractTitle("John Smith") // { title: undefined, nameWithoutTitle: "John Smith", titleLength: 0 }
 */
export function extractTitle(name) {
    const trimmed = name.trim();
    // Matching runs on the trimmed text, but callers use titleLength as an
    // offset into the untrimmed input, so leading whitespace must be counted.
    const leadingWhitespace = name.length - name.trimStart().length;
    for (const { pattern } of TITLE_PATTERNS) {
        const match = pattern.exec(trimmed);
        if (match === null) {
            continue;
        }
        const matchedText = match[0];
        const nameWithoutTitle = trimmed.slice(matchedText.length).trim();
        // Only extract if there's still a name left after the title;
        // otherwise keep looking for a shorter title that leaves one.
        if (nameWithoutTitle.length === 0) {
            continue;
        }
        return {
            title: matchedText.trimEnd(), // Keep original case from input
            nameWithoutTitle,
            titleLength: leadingWhitespace + matchedText.length,
        };
    }
    return {
        title: undefined,
        nameWithoutTitle: trimmed,
        titleLength: 0,
    };
}
|
|
590
|
+
/**
 * Processes PERSON spans to extract titles.
 *
 * For every PERSON span whose text starts with a known title, the span is
 * narrowed so that it begins at the name itself: the title stays outside
 * the span (and therefore visible in the text) and is recorded in
 * `semantic.title`. Spans of other types, PERSON spans without a title, and
 * spans whose name cannot be located in `originalText` are returned as-is.
 * Only `start`, `text` and `semantic` are changed; `end` is left untouched.
 *
 * @param spans - Array of detected PII spans
 * @param originalText - The original text (needed to verify span boundaries)
 * @returns Array of spans with titles extracted from PERSON entities
 */
export function extractTitlesFromSpans(spans, originalText) {
    return spans.map((span) => {
        // Only process PERSON entities
        if (span.type !== PIIType.PERSON) {
            return span;
        }
        const { title, nameWithoutTitle } = extractTitle(span.text);
        // No title, or nothing left after the title: keep the span as it is
        if (title === undefined || nameWithoutTitle.length === 0) {
            return span;
        }
        const nameStart = locateNameStart(span, nameWithoutTitle, originalText);
        if (nameStart === -1) {
            return span; // Can't find the name without title, return original
        }
        return {
            ...span,
            start: nameStart,
            text: nameWithoutTitle,
            semantic: {
                ...span.semantic,
                title,
            },
        };
    });
}
/**
 * Finds the offset in `originalText` at which `name` (the title-less name,
 * a suffix of the trimmed span text) begins inside `span`.
 *
 * Returns -1 when the name cannot be located strictly after `span.start`
 * and fully inside the span.
 */
function locateNameStart(span, name, originalText) {
    // The name is the tail of the span text, so measure from the end. This
    // stays correct when the span text has leading whitespace and does not
    // depend on how the title length was computed.
    const candidate = span.start + span.text.trimEnd().length - name.length;
    if (candidate > span.start && candidate < span.end) {
        const tail = originalText.slice(candidate, span.end);
        if (tail.trim().toLowerCase() === name.toLowerCase()) {
            // Skip whitespace so the span starts on the name's first character
            return candidate + (tail.length - tail.trimStart().length);
        }
    }
    // Span text and original text disagree (e.g. whitespace differences).
    // Search backwards from the end of the span: the name sits at the end,
    // so the last occurrence that still fits cannot be part of the title.
    const found = originalText.lastIndexOf(name, span.end - name.length);
    if (found > span.start && found + name.length <= span.end) {
        return found;
    }
    return -1;
}
|
|
653
|
+
/**
 * Gets all supported titles for a specific language.
 *
 * @param langCode - Language code; one of
 *   "ar", "de", "en", "es", "fr", "it", "lv", "nl", "pt", "zh"
 * @returns A fresh array with that language's titles, or an empty array
 *   for any other value. The array is a copy, so callers may modify it
 *   without affecting the module's own lists.
 */
export function getTitlesForLanguage(langCode) {
    const titleMap = {
        ar: AR_TITLES,
        de: DE_TITLES,
        en: EN_TITLES,
        es: ES_TITLES,
        fr: FR_TITLES,
        it: IT_TITLES,
        lv: LV_TITLES,
        nl: NL_TITLES,
        pt: PT_TITLES,
        zh: ZH_TITLES,
    };
    // Own-property check: a bare titleMap[langCode] would return inherited
    // members such as Object.prototype.toString for langCode "toString".
    if (!Object.prototype.hasOwnProperty.call(titleMap, langCode)) {
        return [];
    }
    return [...titleMap[langCode]];
}
|
|
671
|
+
/**
 * Gets all supported titles across all languages.
 *
 * @returns A shallow copy of ALL_TITLES, in the same order
 */
export function getAllTitles() {
    return Array.from(ALL_TITLES);
}
|
|
677
|
+
/**
 * Checks if a string starts with a known title.
 *
 * Delegates to extractTitle and reports whether it found a title.
 *
 * @param text - Text to inspect
 * @returns `true` when extractTitle returns a defined `title`
 */
export function startsWithTitle(text) {
    const { title } = extractTitle(text);
    return title !== undefined;
}
|
|
684
|
+
/**
 * Checks if a text consists entirely of a title (with optional punctuation).
 *
 * Trailing punctuation (any of . , ! ? ; :) is ignored, so "Mrs", "Mrs."
 * and "Mrs.," all count as title-only. Some titles are listed only in
 * their dotted form ("Dña.", "D.", "Rt. Hon."); for those, stripping the
 * punctuation would also strip part of the title, so the stripped text
 * with a single period put back is tried as well.
 *
 * @param text - Text to inspect
 * @returns `true` when the text, ignoring surrounding whitespace and
 *   trailing punctuation, equals a known title (case-insensitive)
 */
export function isOnlyTitle(text) {
    const trimmed = text.trim();
    // Remove trailing punctuation for comparison
    const withoutPunctuation = trimmed.replace(/[.,!?;:]+$/, "").trim();
    const candidates = [withoutPunctuation];
    // If the stripped tail began with a period, that period may belong to
    // the title itself, so also test the dotted form.
    const strippedTail = trimmed.slice(withoutPunctuation.length);
    if (strippedTail.startsWith(".")) {
        candidates.push(`${withoutPunctuation}.`);
    }
    return candidates.some((candidate) => {
        // A pattern match that leaves nothing behind means title-only
        for (const { pattern } of TITLE_PATTERNS) {
            const match = candidate.match(pattern);
            if (match !== null && candidate.slice(match[0].length).trim() === "") {
                return true;
            }
        }
        // Also check for exact match (case-insensitive)
        const normalized = candidate.toLowerCase();
        return ALL_TITLES.some((title) => title.toLowerCase() === normalized);
    });
}
|
|
711
|
+
/**
 * Merges adjacent PERSON spans when one is a title.
 *
 * This fixes issues where NER models split "Mrs. Smith" into two entities:
 * - PERSON: "Mrs" (or "Mrs.")
 * - PERSON: "Smith"
 *
 * After merging: PERSON: "Mrs. Smith"
 *
 * A title-only PERSON span is merged with the next PERSON span when the two
 * are at most `maxGap` characters apart and the text between them is only
 * whitespace/punctuation. Spans of other types that sort between the title
 * and the name are kept in the result (placed after the merged span); they
 * are never dropped. The merged span carries the higher confidence of the
 * two, the shared source (or HYBRID when the sources differ), and the
 * semantic attributes of both spans (the name's values win on conflict).
 *
 * @param spans - Array of detected PII spans
 * @param originalText - The original text
 * @param maxGap - Maximum characters between spans to consider them adjacent (default: 3)
 * @returns Array of spans, sorted by start, with adjacent title+name PERSON
 *   entities merged. When `spans` has at most one element it is returned
 *   unchanged (same array).
 */
export function mergeAdjacentTitleSpans(spans, originalText, maxGap = 3) {
    if (spans.length <= 1) {
        return spans;
    }
    // Drop holes/undefined entries, then sort by start position
    const sorted = [...spans]
        .filter((span) => span !== undefined)
        .sort((a, b) => a.start - b.start);
    const result = [];
    let i = 0;
    while (i < sorted.length) {
        const current = sorted[i];
        // Only a PERSON span that is nothing but a title can absorb a name
        if (current.type !== PIIType.PERSON || !isOnlyTitle(current.text)) {
            result.push(current);
            i++;
            continue;
        }
        const nameIndex = findNameSpanIndex(sorted, i, originalText, maxGap);
        if (nameIndex === -1) {
            result.push(current);
            i++;
            continue;
        }
        result.push(buildMergedSpan(current, sorted[nameIndex], originalText));
        // Spans between the title and the name are not PERSON spans (the
        // search stops at the first PERSON). Keep them rather than
        // skipping over them.
        for (let k = i + 1; k < nameIndex; k++) {
            result.push(sorted[k]);
        }
        i = nameIndex + 1;
    }
    return result;
}
/**
 * Returns the index of the first PERSON span after `titleIndex` that is
 * close enough to the title to be merged with it, or -1 if there is none.
 */
function findNameSpanIndex(sorted, titleIndex, originalText, maxGap) {
    const title = sorted[titleIndex];
    for (let j = titleIndex + 1; j < sorted.length; j++) {
        const candidate = sorted[j];
        // Spans are sorted by start, so once one is too far away all
        // later ones are too.
        if (candidate.start - title.end > maxGap) {
            return -1;
        }
        // The gap may contain whitespace/punctuation only
        const gapText = originalText.slice(title.end, candidate.start);
        if (!/^[\s.,;:!?]*$/.test(gapText)) {
            return -1;
        }
        if (candidate.type === PIIType.PERSON) {
            return j;
        }
    }
    return -1;
}
/**
 * Builds the PERSON span that covers both the title span and the name span.
 */
function buildMergedSpan(title, name, originalText) {
    // When the two spans overlap, the name may end before the title does;
    // never let the merged span shrink.
    const end = Math.max(title.end, name.end);
    return {
        type: PIIType.PERSON,
        start: title.start,
        end,
        // Use the higher confidence
        confidence: Math.max(title.confidence, name.confidence),
        // Mark as hybrid when the two spans came from different detectors
        source: title.source === name.source ? title.source : DetectionSource.HYBRID,
        text: originalText.slice(title.start, end),
        // Preserve any existing semantic attributes from either span
        semantic: {
            ...title.semantic,
            ...name.semantic,
        },
    };
}
|
|
801
|
+
//# sourceMappingURL=title-extractor.js.map
|