@elanlanguages/bridge-anonymization 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/README.md +73 -1
  2. package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
  3. package/dist/crypto/pii-map-crypto.js +8 -8
  4. package/dist/crypto/pii-map-crypto.js.map +1 -1
  5. package/dist/index.d.ts +25 -20
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +103 -52
  8. package/dist/index.js.map +1 -1
  9. package/dist/ner/model-manager.d.ts.map +1 -1
  10. package/dist/ner/model-manager.js +10 -8
  11. package/dist/ner/model-manager.js.map +1 -1
  12. package/dist/ner/ner-model.d.ts.map +1 -1
  13. package/dist/ner/ner-model.js +9 -9
  14. package/dist/ner/ner-model.js.map +1 -1
  15. package/dist/ner/onnx-runtime.d.ts +3 -3
  16. package/dist/ner/onnx-runtime.d.ts.map +1 -1
  17. package/dist/ner/onnx-runtime.js +1 -1
  18. package/dist/ner/onnx-runtime.js.map +1 -1
  19. package/dist/ner/tokenizer.js +3 -3
  20. package/dist/ner/tokenizer.js.map +1 -1
  21. package/dist/pipeline/index.d.ts +7 -4
  22. package/dist/pipeline/index.d.ts.map +1 -1
  23. package/dist/pipeline/index.js +7 -4
  24. package/dist/pipeline/index.js.map +1 -1
  25. package/dist/pipeline/resolver.d.ts.map +1 -1
  26. package/dist/pipeline/resolver.js +3 -2
  27. package/dist/pipeline/resolver.js.map +1 -1
  28. package/dist/pipeline/semantic-data-loader.d.ts +157 -0
  29. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  30. package/dist/pipeline/semantic-data-loader.js +662 -0
  31. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  32. package/dist/pipeline/semantic-enricher.d.ts +102 -0
  33. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  34. package/dist/pipeline/semantic-enricher.js +268 -0
  35. package/dist/pipeline/semantic-enricher.js.map +1 -0
  36. package/dist/pipeline/tagger.d.ts +52 -12
  37. package/dist/pipeline/tagger.d.ts.map +1 -1
  38. package/dist/pipeline/tagger.js +226 -21
  39. package/dist/pipeline/tagger.js.map +1 -1
  40. package/dist/pipeline/title-extractor.d.ts +79 -0
  41. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  42. package/dist/pipeline/title-extractor.js +801 -0
  43. package/dist/pipeline/title-extractor.js.map +1 -0
  44. package/dist/types/index.d.ts +66 -3
  45. package/dist/types/index.d.ts.map +1 -1
  46. package/dist/types/index.js +14 -3
  47. package/dist/types/index.js.map +1 -1
  48. package/dist/utils/index.d.ts +3 -3
  49. package/dist/utils/index.js +3 -3
  50. package/package.json +7 -5
@@ -0,0 +1,801 @@
1
+ /**
2
+ * Title Extractor
3
+ * Extracts and strips honorific titles/prefixes from PERSON entities
4
+ * so that titles remain visible in anonymized text for translation.
5
+ *
6
+ * Supported languages: ar, de, en, es, fr, it, lv, nl, pt, zh
7
+ */
8
+ import { PIIType, DetectionSource, } from "../types/index.js";
9
/**
 * Honorific titles grouped by language code, then by category.
 *
 * Each list holds plain title strings. Abbreviations are listed both with
 * and without the trailing period, alongside common spelling variants.
 * Matching against these lists is case-insensitive.
 */
18
+ // English titles
19
+ const EN_TITLES = [
20
+ // Basic honorifics
21
+ "Mr",
22
+ "Mr.",
23
+ "Mister",
24
+ "Mrs",
25
+ "Mrs.",
26
+ "Missus",
27
+ "Ms",
28
+ "Ms.",
29
+ "Miss",
30
+ "Mx",
31
+ "Mx.", // Gender-neutral
32
+ // Professional/Academic
33
+ "Dr",
34
+ "Dr.",
35
+ "Doctor",
36
+ "Prof",
37
+ "Prof.",
38
+ "Professor",
39
+ "Rev",
40
+ "Rev.",
41
+ "Reverend",
42
+ "Fr",
43
+ "Fr.",
44
+ "Father",
45
+ "Sr",
46
+ "Sr.",
47
+ "Sister",
48
+ "Br",
49
+ "Br.",
50
+ "Brother",
51
+ // Military
52
+ "Capt",
53
+ "Capt.",
54
+ "Captain",
55
+ "Col",
56
+ "Col.",
57
+ "Colonel",
58
+ "Gen",
59
+ "Gen.",
60
+ "General",
61
+ "Lt",
62
+ "Lt.",
63
+ "Lieutenant",
64
+ "Sgt",
65
+ "Sgt.",
66
+ "Sergeant",
67
+ "Maj",
68
+ "Maj.",
69
+ "Major",
70
+ "Cpl",
71
+ "Cpl.",
72
+ "Corporal",
73
+ "Pvt",
74
+ "Pvt.",
75
+ "Private",
76
+ "Adm",
77
+ "Adm.",
78
+ "Admiral",
79
+ "Cmdr",
80
+ "Cmdr.",
81
+ "Commander",
82
+ // Nobility/Honorific
83
+ "Sir",
84
+ "Dame",
85
+ "Lord",
86
+ "Lady",
87
+ "Hon",
88
+ "Hon.",
89
+ "Honorable",
90
+ "The Honorable",
91
+ "Rt Hon",
92
+ "Rt. Hon.",
93
+ "Right Honorable",
94
+ "The Right Honorable",
95
+ // Legal
96
+ "Esq",
97
+ "Esq.",
98
+ "Esquire",
99
+ "Atty",
100
+ "Atty.",
101
+ "Attorney",
102
+ "Judge",
103
+ "Justice",
104
+ ];
105
+ // German titles
106
+ const DE_TITLES = [
107
+ // Basic honorifics
108
+ "Herr",
109
+ "Frau",
110
+ "Fräulein",
111
+ // Professional/Academic
112
+ "Dr",
113
+ "Dr.",
114
+ "Doktor",
115
+ "Prof",
116
+ "Prof.",
117
+ "Professor",
118
+ "Mag",
119
+ "Mag.",
120
+ "Magister",
121
+ "Dipl",
122
+ "Dipl.",
123
+ "Diplom",
124
+ "Dipl.-Ing",
125
+ "Dipl.-Ing.",
126
+ "Diplomingenieur",
127
+ "Ing",
128
+ "Ing.",
129
+ "Ingenieur",
130
+ // Combinations common in German-speaking countries
131
+ "Dr. med",
132
+ "Dr. med.",
133
+ "Dr. jur",
134
+ "Dr. jur.",
135
+ "Dr. phil",
136
+ "Dr. phil.",
137
+ "Dr. rer. nat",
138
+ "Dr. rer. nat.",
139
+ "Dr. h.c",
140
+ "Dr. h.c.",
141
+ "Prof. Dr",
142
+ "Prof. Dr.",
143
+ // Military/Official
144
+ "Gen",
145
+ "Gen.",
146
+ "General",
147
+ "Oberst",
148
+ "Major",
149
+ "Hauptmann",
150
+ // Religious
151
+ "Pfarrer",
152
+ "Pastor",
153
+ "Bischof",
154
+ ];
155
+ // French titles
156
+ const FR_TITLES = [
157
+ // Basic honorifics
158
+ "M",
159
+ "M.",
160
+ "Monsieur",
161
+ "Mme",
162
+ "Mme.",
163
+ "Madame",
164
+ "Mlle",
165
+ "Mlle.",
166
+ "Mademoiselle",
167
+ // Professional
168
+ "Dr",
169
+ "Dr.",
170
+ "Docteur",
171
+ "Pr",
172
+ "Pr.",
173
+ "Professeur",
174
+ "Prof",
175
+ "Prof.",
176
+ "Me",
177
+ "Me.",
178
+ "Maître",
179
+ "Maitre", // For lawyers/notaries
180
+ "Mgr",
181
+ "Mgr.",
182
+ "Monseigneur",
183
+ // Military
184
+ "Gén",
185
+ "Gén.",
186
+ "Général",
187
+ "Gen",
188
+ "Gen.",
189
+ "Col",
190
+ "Col.",
191
+ "Colonel",
192
+ "Cdt",
193
+ "Cdt.",
194
+ "Commandant",
195
+ "Capt",
196
+ "Capt.",
197
+ "Capitaine",
198
+ "Lt",
199
+ "Lt.",
200
+ "Lieutenant",
201
+ // Religious
202
+ "Père",
203
+ "Frère",
204
+ "Sœur",
205
+ "Soeur",
206
+ "Abbé",
207
+ ];
208
+ // Spanish titles
209
+ const ES_TITLES = [
210
+ // Basic honorifics
211
+ "Sr",
212
+ "Sr.",
213
+ "Señor",
214
+ "Sra",
215
+ "Sra.",
216
+ "Señora",
217
+ "Srta",
218
+ "Srta.",
219
+ "Señorita",
220
+ // Traditional
221
+ "Don",
222
+ "Doña",
223
+ "D.",
224
+ "Dña.",
225
+ // Professional
226
+ "Dr",
227
+ "Dr.",
228
+ "Doctor",
229
+ "Dra",
230
+ "Dra.",
231
+ "Doctora",
232
+ "Prof",
233
+ "Prof.",
234
+ "Profesor",
235
+ "Profa",
236
+ "Profa.",
237
+ "Profesora",
238
+ "Lic",
239
+ "Lic.",
240
+ "Licenciado",
241
+ "Licenciada",
242
+ "Ing",
243
+ "Ing.",
244
+ "Ingeniero",
245
+ "Ingeniera",
246
+ "Arq",
247
+ "Arq.",
248
+ "Arquitecto",
249
+ "Arquitecta",
250
+ "Abog",
251
+ "Abog.",
252
+ "Abogado",
253
+ "Abogada",
254
+ // Military
255
+ "Gral",
256
+ "Gral.",
257
+ "General",
258
+ "Cnel",
259
+ "Cnel.",
260
+ "Coronel",
261
+ "Cap",
262
+ "Cap.",
263
+ "Capitán",
264
+ "Tte",
265
+ "Tte.",
266
+ "Teniente",
267
+ // Religious
268
+ "Padre",
269
+ "Fray",
270
+ "Sor",
271
+ ];
272
+ // Italian titles
273
+ const IT_TITLES = [
274
+ // Basic honorifics
275
+ "Sig",
276
+ "Sig.",
277
+ "Signor",
278
+ "Signore",
279
+ "Sig.ra",
280
+ "Signora",
281
+ "Sig.na",
282
+ "Signorina",
283
+ // Professional
284
+ "Dott",
285
+ "Dott.",
286
+ "Dottore",
287
+ "Dottor",
288
+ "Dott.ssa",
289
+ "Dottoressa",
290
+ "Prof",
291
+ "Prof.",
292
+ "Professore",
293
+ "Professor",
294
+ "Prof.ssa",
295
+ "Professoressa",
296
+ "Ing",
297
+ "Ing.",
298
+ "Ingegnere",
299
+ "Avv",
300
+ "Avv.",
301
+ "Avvocato",
302
+ "Arch",
303
+ "Arch.",
304
+ "Architetto",
305
+ "Rag",
306
+ "Rag.",
307
+ "Ragioniere",
308
+ "Geom",
309
+ "Geom.",
310
+ "Geometra",
311
+ // Nobility
312
+ "Conte",
313
+ "Contessa",
314
+ "Marchese",
315
+ "Marchesa",
316
+ "Principe",
317
+ "Principessa",
318
+ "Duca",
319
+ "Duchessa",
320
+ // Religious
321
+ "Don",
322
+ "Padre",
323
+ "Fra",
324
+ "Suor",
325
+ "Mons",
326
+ "Mons.",
327
+ "Monsignore",
328
+ ];
329
+ // Portuguese titles
330
+ const PT_TITLES = [
331
+ // Basic honorifics
332
+ "Sr",
333
+ "Sr.",
334
+ "Senhor",
335
+ "Sra",
336
+ "Sra.",
337
+ "Senhora",
338
+ "Srta",
339
+ "Srta.",
340
+ "Senhorita",
341
+ // Professional
342
+ "Dr",
343
+ "Dr.",
344
+ "Doutor",
345
+ "Dra",
346
+ "Dra.",
347
+ "Doutora",
348
+ "Prof",
349
+ "Prof.",
350
+ "Professor",
351
+ "Profa",
352
+ "Profa.",
353
+ "Professora",
354
+ "Eng",
355
+ "Eng.",
356
+ "Engenheiro",
357
+ "Engenheira",
358
+ "Arq",
359
+ "Arq.",
360
+ "Arquiteto",
361
+ "Arquiteta",
362
+ // Traditional (Brazil/Portugal)
363
+ "Dom",
364
+ "Dona",
365
+ // Military
366
+ "Gen",
367
+ "Gen.",
368
+ "General",
369
+ "Cel",
370
+ "Cel.",
371
+ "Coronel",
372
+ "Cap",
373
+ "Cap.",
374
+ "Capitão",
375
+ "Ten",
376
+ "Ten.",
377
+ "Tenente",
378
+ // Religious
379
+ "Padre",
380
+ "Frei",
381
+ "Irmã",
382
+ "Irmão",
383
+ ];
384
+ // Dutch titles
385
+ const NL_TITLES = [
386
+ // Basic honorifics
387
+ "Dhr",
388
+ "Dhr.",
389
+ "De heer",
390
+ "Meneer",
391
+ "Mijnheer",
392
+ "Mevr",
393
+ "Mevr.",
394
+ "Mevrouw",
395
+ "Mw",
396
+ "Mw.",
397
+ "Mejuffrouw",
398
+ "Juffrouw",
399
+ // Professional/Academic
400
+ "Dr",
401
+ "Dr.",
402
+ "Doctor",
403
+ "Prof",
404
+ "Prof.",
405
+ "Professor",
406
+ "Ir",
407
+ "Ir.",
408
+ "Ingenieur",
409
+ "Mr",
410
+ "Mr.",
411
+ "Meester", // Legal title
412
+ "Drs",
413
+ "Drs.",
414
+ "Doctorandus",
415
+ "Ing",
416
+ "Ing.",
417
+ // Military
418
+ "Gen",
419
+ "Gen.",
420
+ "Generaal",
421
+ "Kol",
422
+ "Kol.",
423
+ "Kolonel",
424
+ "Kapt",
425
+ "Kapt.",
426
+ "Kapitein",
427
+ // Religious
428
+ "Ds",
429
+ "Ds.",
430
+ "Dominee",
431
+ "Pastoor",
432
+ "Pater",
433
+ ];
434
+ // Latvian titles
435
+ const LV_TITLES = [
436
+ // Basic honorifics
437
+ "kungs",
438
+ "kundze",
439
+ "jaunkundze",
440
+ "k-gs",
441
+ "k-dze",
442
+ // Professional
443
+ "Dr",
444
+ "Dr.",
445
+ "doktors",
446
+ "Prof",
447
+ "Prof.",
448
+ "profesors",
449
+ // Note: Latvian uses fewer abbreviated titles than Western languages
450
+ ];
451
+ // Arabic titles (transliterated and Arabic script)
452
+ const AR_TITLES = [
453
+ // Basic honorifics - Arabic script
454
+ "السيد",
455
+ "السيدة",
456
+ "الآنسة",
457
+ // Basic honorifics - transliterated
458
+ "Al-Sayyid",
459
+ "As-Sayyid",
460
+ "Sayyid",
461
+ "Al-Sayyida",
462
+ "As-Sayyida",
463
+ "Sayyida",
464
+ "Al-Aanisa",
465
+ "Aanisa",
466
+ // Professional - Arabic script
467
+ "الدكتور",
468
+ "الدكتورة",
469
+ "الأستاذ",
470
+ "الأستاذة",
471
+ "المهندس",
472
+ "المهندسة",
473
+ // Professional - transliterated
474
+ "Dr",
475
+ "Dr.",
476
+ "Doktor",
477
+ "Ustadh",
478
+ "Ustadha",
479
+ "Ustaaz",
480
+ "Muhandis",
481
+ "Muhandisa",
482
+ // Religious
483
+ "الشيخ",
484
+ "Sheikh",
485
+ "Shaikh",
486
+ "Shaykh",
487
+ "الإمام",
488
+ "Imam",
489
+ "الحاج",
490
+ "Hajj",
491
+ "Hajji",
492
+ "Al-Hajj",
493
+ // Nobility/Honorific
494
+ "أمير",
495
+ "Amir",
496
+ "Emir",
497
+ "سلطان",
498
+ "Sultan",
499
+ ];
500
+ // Chinese titles (simplified and traditional)
501
+ const ZH_TITLES = [
502
+ // Basic honorifics
503
+ "先生", // xiānsheng - Mr.
504
+ "女士", // nǚshì - Ms.
505
+ "小姐", // xiǎojiě - Miss
506
+ "太太", // tàitai - Mrs.
507
+ // Professional/Academic
508
+ "博士", // bóshì - Doctor (PhD)
509
+ "教授", // jiàoshòu - Professor
510
+ "医生", // yīshēng - Doctor (medical)
511
+ "老师",
512
+ "老師", // lǎoshī - Teacher
513
+ "工程师",
514
+ "工程師", // gōngchéngshī - Engineer
515
+ "律师",
516
+ "律師", // lǜshī - Lawyer
517
+ // Military
518
+ "将军",
519
+ "將軍", // jiāngjūn - General
520
+ "上校", // shàngxiào - Colonel
521
+ "上尉", // shàngwèi - Captain
522
+ // Traditional/Formal
523
+ "阁下",
524
+ "閣下", // géxià - Your Excellency
525
+ "大人", // dàrén - Lord/Sir (historical)
526
+ ];
527
+ /**
528
+ * All titles combined and sorted by length (longest first)
529
+ * This ensures "Prof. Dr." matches before "Prof."
530
+ */
531
+ const ALL_TITLES = [
532
+ ...EN_TITLES,
533
+ ...DE_TITLES,
534
+ ...FR_TITLES,
535
+ ...ES_TITLES,
536
+ ...IT_TITLES,
537
+ ...PT_TITLES,
538
+ ...NL_TITLES,
539
+ ...LV_TITLES,
540
+ ...AR_TITLES,
541
+ ...ZH_TITLES,
542
+ ].sort((a, b) => b.length - a.length);
543
+ /**
544
+ * Pre-compiled regex patterns for title matching
545
+ * Matches title at start of string, followed by whitespace
546
+ */
547
+ const TITLE_PATTERNS = ALL_TITLES.map((title) => ({
548
+ pattern: new RegExp(`^${escapeRegex(title)}(?:\\s+|$)`, "i"),
549
+ title,
550
+ }));
551
+ /**
552
+ * Escapes special regex characters in a string
553
+ */
554
+ function escapeRegex(str) {
555
+ return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
556
+ }
557
+ /**
558
+ * Extracts a title from the beginning of a name
559
+ *
560
+ * @param name - Full name potentially starting with a title
561
+ * @returns Extraction result with title, remaining name, and offset
562
+ *
563
+ * @example
564
+ * extractTitle("Dr. John Smith") // { title: "Dr.", nameWithoutTitle: "John Smith", titleLength: 4 }
565
+ * extractTitle("John Smith") // { title: undefined, nameWithoutTitle: "John Smith", titleLength: 0 }
566
+ */
567
+ export function extractTitle(name) {
568
+ const trimmed = name.trim();
569
+ for (const { pattern } of TITLE_PATTERNS) {
570
+ const match = trimmed.match(pattern);
571
+ if (match !== null) {
572
+ const matchedText = match[0];
573
+ const nameWithoutTitle = trimmed.slice(matchedText.length).trim();
574
+ // Only extract if there's still a name left after the title
575
+ if (nameWithoutTitle.length > 0) {
576
+ return {
577
+ title: matchedText.trimEnd(), // Keep original case from input
578
+ nameWithoutTitle,
579
+ titleLength: matchedText.length,
580
+ };
581
+ }
582
+ }
583
+ }
584
+ return {
585
+ title: undefined,
586
+ nameWithoutTitle: trimmed,
587
+ titleLength: 0,
588
+ };
589
+ }
590
+ /**
591
+ * Processes PERSON spans to extract titles
592
+ * Titles are removed from the span and stored in semantic attributes
593
+ * The span boundaries are adjusted so the title remains visible
594
+ *
595
+ * @param spans - Array of detected PII spans
596
+ * @param originalText - The original text (needed to verify span boundaries)
597
+ * @returns Array of spans with titles extracted from PERSON entities
598
+ */
599
+ export function extractTitlesFromSpans(spans, originalText) {
600
+ return spans.map((span) => {
601
+ // Only process PERSON entities
602
+ if (span.type !== PIIType.PERSON) {
603
+ return span;
604
+ }
605
+ // Get the actual text from the span
606
+ const spanText = span.text;
607
+ const extraction = extractTitle(spanText);
608
+ // If no title found, return original span
609
+ if (extraction.title === undefined || extraction.titleLength === 0) {
610
+ return span;
611
+ }
612
+ // Verify the extraction makes sense (remaining name is not empty)
613
+ if (extraction.nameWithoutTitle.length === 0) {
614
+ return span;
615
+ }
616
+ // Calculate new span boundaries
617
+ // The title stays in the text, we adjust the span to exclude it
618
+ const newStart = span.start + extraction.titleLength;
619
+ // Make sure new start doesn't exceed original end
620
+ if (newStart >= span.end) {
621
+ return span;
622
+ }
623
+ // Verify the text at the new position matches what we expect
624
+ const expectedText = originalText.slice(newStart, span.end).trim();
625
+ if (expectedText.toLowerCase() !== extraction.nameWithoutTitle.toLowerCase()) {
626
+ // Text doesn't match - might be whitespace differences, try to find actual start
627
+ const actualStart = originalText.indexOf(extraction.nameWithoutTitle, span.start);
628
+ if (actualStart === -1 || actualStart >= span.end) {
629
+ return span; // Can't find the name without title, return original
630
+ }
631
+ return {
632
+ ...span,
633
+ start: actualStart,
634
+ text: extraction.nameWithoutTitle,
635
+ semantic: {
636
+ ...span.semantic,
637
+ title: extraction.title,
638
+ },
639
+ };
640
+ }
641
+ // Return new span with adjusted boundaries and title in semantic attributes
642
+ return {
643
+ ...span,
644
+ start: newStart,
645
+ text: extraction.nameWithoutTitle,
646
+ semantic: {
647
+ ...span.semantic,
648
+ title: extraction.title,
649
+ },
650
+ };
651
+ });
652
+ }
653
+ /**
654
+ * Gets all supported titles for a specific language
655
+ */
656
+ export function getTitlesForLanguage(langCode) {
657
+ const titleMap = {
658
+ ar: AR_TITLES,
659
+ de: DE_TITLES,
660
+ en: EN_TITLES,
661
+ es: ES_TITLES,
662
+ fr: FR_TITLES,
663
+ it: IT_TITLES,
664
+ lv: LV_TITLES,
665
+ nl: NL_TITLES,
666
+ pt: PT_TITLES,
667
+ zh: ZH_TITLES,
668
+ };
669
+ return titleMap[langCode] ?? [];
670
+ }
671
+ /**
672
+ * Gets all supported titles across all languages
673
+ */
674
+ export function getAllTitles() {
675
+ return [...ALL_TITLES];
676
+ }
677
+ /**
678
+ * Checks if a string starts with a known title
679
+ */
680
+ export function startsWithTitle(text) {
681
+ const result = extractTitle(text);
682
+ return result.title !== undefined;
683
+ }
684
+ /**
685
+ * Checks if a text consists entirely of a title (with optional punctuation)
686
+ */
687
+ export function isOnlyTitle(text) {
688
+ const trimmed = text.trim();
689
+ // Remove trailing punctuation for comparison
690
+ const withoutPunctuation = trimmed.replace(/[.,!?;:]+$/, "").trim();
691
+ for (const { pattern } of TITLE_PATTERNS) {
692
+ // Check if the text matches the title pattern completely
693
+ const match = withoutPunctuation.match(pattern);
694
+ if (match !== null) {
695
+ const remaining = withoutPunctuation.slice(match[0].length).trim();
696
+ // If nothing left after the title, it's only a title
697
+ if (remaining === "") {
698
+ return true;
699
+ }
700
+ }
701
+ }
702
+ // Also check for exact match (case-insensitive)
703
+ const normalizedText = withoutPunctuation.toLowerCase();
704
+ for (const title of ALL_TITLES) {
705
+ if (normalizedText === title.toLowerCase()) {
706
+ return true;
707
+ }
708
+ }
709
+ return false;
710
+ }
711
+ /**
712
+ * Merges adjacent PERSON spans when one is a title
713
+ *
714
+ * This fixes issues where NER models split "Mrs. Smith" into two entities:
715
+ * - PERSON: "Mrs" (or "Mrs.")
716
+ * - PERSON: "Smith"
717
+ *
718
+ * After merging: PERSON: "Mrs. Smith"
719
+ *
720
+ * @param spans - Array of detected PII spans
721
+ * @param originalText - The original text
722
+ * @param maxGap - Maximum characters between spans to consider them adjacent (default: 3)
723
+ * @returns Array of spans with adjacent title+name PERSON entities merged
724
+ */
725
+ export function mergeAdjacentTitleSpans(spans, originalText, maxGap = 3) {
726
+ if (spans.length <= 1) {
727
+ return spans;
728
+ }
729
+ // Sort by start position
730
+ const sorted = [...spans].sort((a, b) => a.start - b.start);
731
+ const result = [];
732
+ let i = 0;
733
+ while (i < sorted.length) {
734
+ const current = sorted[i];
735
+ if (current === undefined) {
736
+ i++;
737
+ continue;
738
+ }
739
+ // Only process PERSON entities
740
+ if (current.type !== PIIType.PERSON) {
741
+ result.push(current);
742
+ i++;
743
+ continue;
744
+ }
745
+ // Check if this span is only a title
746
+ if (!isOnlyTitle(current.text)) {
747
+ result.push(current);
748
+ i++;
749
+ continue;
750
+ }
751
+ // Look for the next PERSON span that's close enough
752
+ let merged = false;
753
+ for (let j = i + 1; j < sorted.length; j++) {
754
+ const next = sorted[j];
755
+ if (next === undefined)
756
+ continue;
757
+ // Calculate gap between spans
758
+ const gap = next.start - current.end;
759
+ // If gap is too large, stop looking
760
+ if (gap > maxGap) {
761
+ break;
762
+ }
763
+ // Check what's in the gap (should be whitespace/punctuation only)
764
+ const gapText = originalText.slice(current.end, next.start);
765
+ if (!/^[\s.,;:!?]*$/.test(gapText)) {
766
+ break;
767
+ }
768
+ // If next is also PERSON, merge them
769
+ if (next.type === PIIType.PERSON) {
770
+ const mergedText = originalText.slice(current.start, next.end);
771
+ const mergedSpan = {
772
+ type: PIIType.PERSON,
773
+ start: current.start,
774
+ end: next.end,
775
+ // Use the higher confidence
776
+ confidence: Math.max(current.confidence, next.confidence),
777
+ // Mark as hybrid since we merged NER results
778
+ source: current.source === next.source
779
+ ? current.source
780
+ : DetectionSource.HYBRID,
781
+ text: mergedText,
782
+ // Preserve any existing semantic attributes from either span
783
+ semantic: {
784
+ ...current.semantic,
785
+ ...next.semantic,
786
+ },
787
+ };
788
+ result.push(mergedSpan);
789
+ merged = true;
790
+ i = j + 1; // Skip the merged span
791
+ break;
792
+ }
793
+ }
794
+ if (!merged) {
795
+ result.push(current);
796
+ i++;
797
+ }
798
+ }
799
+ return result;
800
+ }
801
+ //# sourceMappingURL=title-extractor.js.map