rehydra 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
@@ -0,0 +1,374 @@
1
+ /**
2
+ * Replacement Tagger
3
+ * Replaces PII spans with placeholder tags and builds the PII map
4
+ */
5
+ import { PIIType, } from "../types/index.js";
6
+ import { sortSpansByPosition } from "../utils/offsets.js";
/**
 * Builds the placeholder tag that stands in for a PII span.
 *
 * Output shape: `<PII type="TYPE" id="N"/>`. A `gender="…"` and/or
 * `scope="…"` attribute is inserted between `type` and `id` (in that order)
 * when the matching semantic attribute is set and is not "unknown".
 *
 * @param type - PII type written into the `type` attribute
 * @param id - Identifier written into the `id` attribute
 * @param semantic - Optional semantic attributes (`gender`, `scope`)
 * @returns The self-closing placeholder tag
 */
export function generateTag(type, id, semantic) {
  // An attribute is emitted only when it carries a value other than "unknown".
  const isMeaningful = (value) => Boolean(value) && value !== "unknown";

  const attributes = [`type="${type}"`];

  const gender = semantic?.gender;
  if (isMeaningful(gender)) {
    attributes.push(`gender="${gender}"`);
  }

  const scope = semantic?.scope;
  if (isMeaningful(scope)) {
    attributes.push(`scope="${scope}"`);
  }

  // `id` always comes last; the strict parser relies on this ordering.
  attributes.push(`id="${id}"`);

  return `<PII ${attributes.join(" ")}/>`;
}
/**
 * Parses a single canonical placeholder tag into its components.
 *
 * The whole string must be one tag of the form
 *   <PII type="TYPE" [gender="X"] [scope="Y"] id="N"/>
 * with exactly this attribute order, double quotes and an upper-case type.
 *
 * @param tag - Candidate tag text
 * @returns `{ type, id, semantic }`, or `null` when the text is not a
 *   well-formed tag or its type is not one of the `PIIType` values.
 *   `semantic` is `undefined` when neither gender nor scope is present;
 *   a gender/scope value outside the recognised lists is dropped (which can
 *   leave `semantic` as an empty object).
 */
export function parseTag(tag) {
  const parts = tag.match(/^<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>$/);
  if (parts === null) {
    return null;
  }

  const typeStr = parts[1];
  const genderStr = parts[2];
  const scopeStr = parts[3];
  const idStr = parts[4];
  if (typeStr === undefined || idStr === undefined) {
    return null;
  }

  // Reject type names that are syntactically fine but not a known PIIType.
  if (!Object.values(PIIType).includes(typeStr)) {
    return null;
  }

  const isPresent = (value) => value !== undefined && value !== "";
  // Returns the value when it is present and whitelisted, otherwise undefined.
  const pick = (value, allowed) =>
    isPresent(value) && allowed.includes(value) ? value : undefined;

  let semantic;
  if (isPresent(genderStr) || isPresent(scopeStr)) {
    semantic = {};
    const gender = pick(genderStr, ["male", "female", "neutral", "unknown"]);
    if (gender !== undefined) {
      semantic.gender = gender;
    }
    const scope = pick(scopeStr, ["city", "country", "region", "unknown"]);
    if (scope !== undefined) {
      semantic.scope = scope;
    }
  }

  return { type: typeStr, id: parseInt(idStr, 10), semantic };
}
/**
 * Builds the PII-map key for a type/id pair: the two values joined by an
 * underscore (e.g. type "PERSON" and id 1 give "PERSON_1").
 *
 * @param type - PII type
 * @param id - Numeric identifier of the entity
 * @returns The map key string
 */
export function createPIIMapKey(type, id) {
  const separator = "_";
  return "".concat(type, separator, id);
}
/**
 * Replaces every detected PII span in `text` with a placeholder tag and
 * records the original values in a PII map.
 *
 * IDs are handed out in ascending span order, starting at 1. When
 * `policy.reuseIdsForRepeatedPII` is truthy, spans with the same type and
 * the same text share one ID.
 *
 * @param text - Original text
 * @param matches - Detected spans (each read for `type`, `text`, `start`,
 *   `end`, `confidence`, `source`, `semantic`)
 * @param policy - Policy object; only `reuseIdsForRepeatedPII` is read here
 * @returns `{ anonymizedText, entities, piiMap }`
 */
export function tagEntities(text, matches, policy) {
  if (matches.length === 0) {
    return { anonymizedText: text, entities: [], piiMap: new Map() };
  }

  let lastIssuedId = 0;
  const idByTypeAndText = new Map(); // "TYPE:text" -> id

  // Returns the ID for a span, reusing an earlier one when the policy asks for it.
  const allocateId = (span) => {
    if (!policy.reuseIdsForRepeatedPII) {
      return ++lastIssuedId;
    }
    const reuseKey = `${span.type}:${span.text}`;
    const known = idByTypeAndText.get(reuseKey);
    if (known !== undefined) {
      return known;
    }
    const fresh = ++lastIssuedId;
    idByTypeAndText.set(reuseKey, fresh);
    return fresh;
  };

  // Number the spans in ascending position order.
  const numbered = Array.from(sortSpansByPosition(matches), (span) => ({
    ...span,
    id: allocateId(span),
  }));

  // Key -> original text lookup used later for rehydration.
  const piiMap = new Map(
    numbered.map((entity) => [createPIIMapKey(entity.type, entity.id), entity.text]),
  );

  // Substitute from the end of the text towards the start so that the
  // offsets of spans that are still to be replaced stay valid.
  const anonymizedText = [...numbered]
    .sort((a, b) => b.start - a.start)
    .reduce(
      (current, entity) =>
        current.slice(0, entity.start) +
        generateTag(entity.type, entity.id, entity.semantic) +
        current.slice(entity.end),
      text,
    );

  // Public entity records: the span text is exposed as `original`.
  const entities = numbered.map(
    ({ type, id, start, end, confidence, source, text: original, semantic }) => ({
      type,
      id,
      start,
      end,
      confidence,
      source,
      original,
      semantic,
    }),
  );

  return {
    anonymizedText,
    entities: sortSpansByPosition(entities),
    piiMap,
  };
}
/**
 * Tells whether `tag` is a well-formed placeholder tag, i.e. whether
 * `parseTag` accepts it.
 *
 * @param tag - Candidate tag text
 * @returns `true` when `parseTag(tag)` yields a result, `false` otherwise
 */
export function isValidTag(tag) {
  const parsed = parseTag(tag);
  return parsed !== null;
}
/**
 * Regex character class with the quote characters that may surround an
 * attribute value after translation: standard quotes, smart quotes,
 * German quotes, French guillemets, etc.
 *
 * Unicode references:
 * - \u0022 (") Standard double quote
 * - \u0027 (') Standard single quote
 * - \u0060 (`) Backtick
 * - \u00AB («) Left guillemet
 * - \u00BB (») Right guillemet
 * - \u2018 (‘) Left single curly quote
 * - \u2019 (’) Right single curly quote
 * - \u201A (‚) Single low-9 quote
 * - \u201C (“) Left double curly quote
 * - \u201D (”) Right double curly quote
 * - \u201E („) Double low-9 quote (German)
 */
const QUOTE_CHARS = "[\"'`\u00AB\u00BB\u2018\u2019\u201A\u201C\u201D\u201E]";
/**
 * Regex source for optional whitespace, including the no-break space and
 * the U+2000–U+200B range (which ends with the zero-width space).
 */
const FLEXIBLE_WS = `[\\s\\u00A0\\u2000-\\u200B]*`;
/** Same character class as FLEXIBLE_WS, but at least one character is required. */
const FLEXIBLE_WS_REQUIRED = `[\\s\\u00A0\\u2000-\\u200B]+`;
/**
 * Builds the regular expressions used for fuzzy PII tag matching.
 * They tolerate translation artifacts: other quote characters, extra or
 * missing whitespace, case changes (the "i" flag), `id` placed before
 * `type`, and `/>` rewritten as `/ >` or `>`.
 *
 * A fresh pair of global ("g") regexes is returned on every call, so callers
 * never share `lastIndex` state.
 *
 * @returns A two-element array:
 *   - [0] type-first form; capture groups: type=1, gender=2, scope=3, id=4
 *   - [1] id-first form; capture groups: id=1, type=2. `gender`/`scope`
 *     attributes are accepted after either attribute but are NOT captured,
 *     which keeps the group numbering stable for existing callers.
 */
function buildFuzzyTagPatterns() {
  // type = "VALUE" with flexible spacing and quotes
  const typeAttr = `type${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}([A-Z_]+)${QUOTE_CHARS}`;
  // id = "VALUE" with flexible spacing and quotes
  const idAttr = `id${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\d+)${QUOTE_CHARS}`;
  // Optional, captured gender attribute (type-first form only)
  const genderAttr = `(?:${FLEXIBLE_WS}gender${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\w+)${QUOTE_CHARS})?`;
  // Optional, captured scope attribute (type-first form only)
  const scopeAttr = `(?:${FLEXIBLE_WS}scope${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\w+)${QUOTE_CHARS})?`;
  // Any number of gender/scope attributes, matched but not captured.
  // Without this, a reordered tag such as <PII id="1" type="PERSON" gender="male"/>
  // matched neither pattern and its placeholder was left behind in the text.
  const uncapturedSemanticAttrs = `(?:${FLEXIBLE_WS}(?:gender|scope)${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}\\w+${QUOTE_CHARS})*`;
  // Tag endings: />, / >, >, etc.
  const selfClosing = `${FLEXIBLE_WS}\\/?${FLEXIBLE_WS}>`;
  const tagOpen = `<${FLEXIBLE_WS}PII${FLEXIBLE_WS_REQUIRED}`;
  return [
    // type first: <PII type="X" gender="Y" scope="Z" id="N"/>
    // Groups: type=1, gender=2, scope=3, id=4
    new RegExp(`${tagOpen}${typeAttr}${genderAttr}${scopeAttr}${FLEXIBLE_WS_REQUIRED}${idAttr}${selfClosing}`, "gi"),
    // id first: <PII id="N" type="X"/> (semantic attributes tolerated, not captured)
    // Groups: id=1, type=2
    new RegExp(`${tagOpen}${idAttr}${uncapturedSemanticAttrs}${FLEXIBLE_WS_REQUIRED}${typeAttr}${uncapturedSemanticAttrs}${selfClosing}`, "gi"),
  ];
}
/**
 * Finds every PII tag in `anonymizedText` using the fuzzy patterns from
 * `buildFuzzyTagPatterns`, so tags mangled by translation are still found.
 *
 * Translation can mangle tags by:
 * - Changing quote types (" → “ or „ or « etc.)
 * - Adding/removing whitespace
 * - Changing case (type → Type, PII → pii)
 * - Reordering attributes (id before type)
 * - Modifying self-closing syntax (/> → / > or >)
 *
 * @param anonymizedText - Text that may contain PII tags
 * @returns Tag records `{ type, id, position, matchedText, semantic }`
 *   sorted by ascending `position`. `type` is upper-cased; gender/scope are
 *   lower-cased and only kept when they are recognised values. Matches whose
 *   type is not a `PIIType` value are ignored.
 */
export function extractTags(anonymizedText) {
  const GENDERS = ["male", "female", "neutral", "unknown"];
  const SCOPES = ["city", "country", "region", "unknown"];

  const isPresent = (raw) => raw !== undefined && raw !== "";
  // Lower-cases a raw attribute value and returns it only if whitelisted.
  const normalise = (raw, allowed) => {
    if (!isPresent(raw)) {
      return undefined;
    }
    const lowered = raw.toLowerCase();
    return allowed.includes(lowered) ? lowered : undefined;
  };

  const found = [];
  // Start offsets already recorded, so a later pattern cannot add the same tag again.
  const claimedOffsets = new Set();

  for (const [patternIndex, pattern] of buildFuzzyTagPatterns().entries()) {
    if (pattern === undefined) {
      continue;
    }
    // Pattern 0 is type-first (type=1, gender=2, scope=3, id=4);
    // every other pattern is id-first (id=1, type=2).
    const typeFirst = patternIndex === 0;
    pattern.lastIndex = 0;

    for (let hit = pattern.exec(anonymizedText); hit !== null; hit = pattern.exec(anonymizedText)) {
      if (claimedOffsets.has(hit.index)) {
        continue;
      }

      const typeStr = typeFirst ? hit[1] : hit[2];
      const idStr = typeFirst ? hit[4] : hit[1];
      const genderStr = typeFirst ? hit[2] : undefined;
      const scopeStr = typeFirst ? hit[3] : undefined;
      if (typeStr === undefined || idStr === undefined) {
        continue;
      }

      const type = typeStr.toUpperCase();
      if (!Object.values(PIIType).includes(type)) {
        continue;
      }

      let semantic;
      if (isPresent(genderStr) || isPresent(scopeStr)) {
        semantic = {};
        const gender = normalise(genderStr, GENDERS);
        if (gender !== undefined) {
          semantic.gender = gender;
        }
        const scope = normalise(scopeStr, SCOPES);
        if (scope !== undefined) {
          semantic.scope = scope;
        }
      }

      found.push({
        type,
        id: parseInt(idStr, 10),
        position: hit.index,
        matchedText: hit[0],
        semantic,
      });
      claimedOffsets.add(hit.index);
    }
  }

  found.sort((a, b) => a.position - b.position);
  return found;
}
/**
 * Finds PII tags using strict matching only: the canonical form
 *   <PII type="X" [gender="Y"] [scope="Z"] id="N"/>
 * with double quotes, upper-case type and this exact attribute order.
 * Useful when the tags are known not to have been mangled.
 *
 * @param anonymizedText - Text that may contain PII tags
 * @returns Tag records `{ type, id, position, matchedText, semantic }` in
 *   the order they occur in the text. Matches whose type is not a `PIIType`
 *   value are ignored; gender/scope are kept only when they are recognised
 *   values (compared case-sensitively).
 */
export function extractTagsStrict(anonymizedText) {
  const GENDERS = ["male", "female", "neutral", "unknown"];
  const SCOPES = ["city", "country", "region", "unknown"];
  const isPresent = (raw) => raw !== undefined && raw !== "";

  const strictTag = /<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>/g;
  const found = [];

  for (let hit = strictTag.exec(anonymizedText); hit !== null; hit = strictTag.exec(anonymizedText)) {
    const [matchedText, typeStr, genderStr, scopeStr, idStr] = hit;
    if (typeStr === undefined || idStr === undefined) {
      continue;
    }
    if (!Object.values(PIIType).includes(typeStr)) {
      continue;
    }

    let semantic;
    if (isPresent(genderStr) || isPresent(scopeStr)) {
      semantic = {};
      if (isPresent(genderStr) && GENDERS.includes(genderStr)) {
        semantic.gender = genderStr;
      }
      if (isPresent(scopeStr) && SCOPES.includes(scopeStr)) {
        semantic.scope = scopeStr;
      }
    }

    found.push({
      type: typeStr,
      id: parseInt(idStr, 10),
      position: hit.index,
      matchedText,
      semantic,
    });
  }

  return found;
}
/**
 * Tallies entities per PII type.
 *
 * @param entities - Entities to count; only each entity's `type` is read
 * @returns Object mapping type to count. Every `PIIType` value is present
 *   (0 when no entity has that type).
 */
export function countEntitiesByType(entities) {
  // Seed the tally with a zero for every known type.
  const counts = Object.fromEntries(
    Object.values(PIIType).map((piiType) => [piiType, 0]),
  );

  for (const { type } of entities) {
    counts[type] = (counts[type] ?? 0) + 1;
  }

  return counts;
}
/**
 * Restores original values in anonymized text using the PII map.
 * Fuzzy tag matching is used by default so tags mangled by translation are
 * still recognised.
 *
 * @param anonymizedText - Text containing PII tags (possibly mangled)
 * @param piiMap - Map of PII keys (see `createPIIMapKey`) to original values
 * @param strict - If true, only canonical tags are matched. Default: false
 * @returns Text with each tag that has a map entry replaced by its original
 *   value; tags without an entry are left as they are
 */
export function rehydrate(anonymizedText, piiMap, strict = false) {
  const locateTags = strict ? extractTagsStrict : extractTags;

  // Work from the last tag to the first so earlier offsets stay valid.
  const lastToFirst = locateTags(anonymizedText).sort((a, b) => b.position - a.position);

  return lastToFirst.reduce((current, tag) => {
    const original = piiMap.get(createPIIMapKey(tag.type, tag.id));
    if (original === undefined) {
      return current;
    }
    // Cut out exactly what was matched: a mangled tag can be longer or
    // shorter than its canonical form.
    const tagEnd = tag.position + tag.matchedText.length;
    return current.slice(0, tag.position) + original + current.slice(tagEnd);
  }, anonymizedText);
}
374
+ //# sourceMappingURL=tagger.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tagger.js","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,GAKR,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AA+B1D;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CACzB,IAAa,EACb,EAAU,EACV,QAA6B;IAE7B,IAAI,KAAK,GAAG,SAAS,IAAI,GAAG,CAAC;IAE7B,oDAAoD;IACpD,IAAI,QAAQ,EAAE,MAAM,IAAI,QAAQ,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;QACtD,KAAK,IAAI,YAAY,QAAQ,CAAC,MAAM,GAAG,CAAC;IAC1C,CAAC;IACD,IAAI,QAAQ,EAAE,KAAK,IAAI,QAAQ,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;QACpD,KAAK,IAAI,WAAW,QAAQ,CAAC,KAAK,GAAG,CAAC;IACxC,CAAC;IAED,KAAK,IAAI,QAAQ,EAAE,GAAG,CAAC;IAEvB,OAAO,QAAQ,KAAK,IAAI,CAAC;AAC3B,CAAC;AAWD;;;;;;;;;GASG;AACH,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,oEAAoE;IACpE,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CACrB,yFAAyF,CAC1F,CAAC;IAEF,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,CAAC,GAAG,KAAK,CAAC;IACtD,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QACjD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,IAAI,GAAG,OAAkB,CAAC;IAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAE/B,mCAAmC;IACnC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,uCAAuC;IACvC,IAAI,QAAwC,CAAC;IAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;QAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;QACD,QAAQ,GAAG,EAAE,CAAC;QACd,IACE,SAAS,KAAK,SAAS;YACvB,SAAS,KAAK,EAAE;YAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;YACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;QAC9D,CAAC;QACD,IACE,QAAQ,KAAK,SAAS;YACtB,QAAQ,KAAK,EAAE;YACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;YACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAAC,IAAa,EAAE,EAAU;IACvD,OAAO,GAAG,IAAI,IAAI,EAAE,EAAE,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAoB,EACpB,MAA2B;IAE3B
,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO;YACL,cAAc,EAAE,IAAI;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,IAAI,GAAG,EAAE;SAClB,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,MAAM,eAAe,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAErD,aAAa;IACb,MAAM,eAAe,GAAsC,EAAE,CAAC;IAC9D,IAAI,MAAM,GAAG,CAAC,CAAC;IAEf,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,aAAa;IAEzD,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,EAAU,CAAC;QAEf,IAAI,MAAM,CAAC,sBAAsB,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACrC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,EAAE,GAAG,UAAU,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,EAAE,GAAG,MAAM,EAAE,CAAC;gBACd,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,MAAM,EAAE,CAAC;QAChB,CAAC;QAED,eAAe,CAAC,IAAI,CAAC,EAAE,GAAG,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,gBAAgB;IAChB,MAAM,MAAM,GAAc,IAAI,GAAG,EAAE,CAAC;IACpC,KAAK,MAAM,MAAM,IAAI,eAAe,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;QACpD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IAED,oDAAoD;IACpD,0DAA0D;IAC1D,MAAM,gBAAgB,GAAG,CAAC,GAAG,eAAe,CAAC,CAAC,IAAI,CAChD,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAC5B,CAAC;IAEF,uBAAuB;IACvB,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACtC,MAAM,GAAG,GAAG,WAAW,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;QACjE,cAAc;YACZ,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC;gBACrC,GAAG;gBACH,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IACrC,CAAC;IAED,iDAAiD;IACjD,MAAM,QAAQ,GAAqB,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,IAAI;QAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;KACrB,CAAC,CAAC,CAAC;IAEJ,OAAO;QACL,cAAc;QACd,QAAQ,EAAE,mBAAm
B,CAAC,QAAQ,CAAC;QACvC,MAAM;KACP,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,OAAO,QAAQ,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC;AAChC,CAAC;AAeD;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,WAAW,GAAG,wDAAwD,CAAC;AAE7E;;GAEG;AACH,MAAM,WAAW,GAAG,8BAA8B,CAAC;AACnD,MAAM,oBAAoB,GAAG,8BAA8B,CAAC;AAE5D;;;GAGG;AACH,SAAS,qBAAqB;IAC5B,2EAA2E;IAC3E,MAAM,QAAQ,GAAG,OAAO,WAAW,IAAI,WAAW,GAAG,WAAW,YAAY,WAAW,EAAE,CAAC;IAC1F,uEAAuE;IACvE,MAAM,MAAM,GAAG,KAAK,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,EAAE,CAAC;IACnF,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,WAAW,SAAS,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAC9G,2BAA2B;IAC3B,MAAM,SAAS,GAAG,MAAM,WAAW,QAAQ,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAE5G,6CAA6C;IAC7C,MAAM,WAAW,GAAG,GAAG,WAAW,OAAO,WAAW,GAAG,CAAC;IAExD,OAAO;QACL,qFAAqF;QACrF,0CAA0C;QAC1C,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,QAAQ,GAAG,UAAU,GAAG,SAAS,GAAG,oBAAoB,GAAG,MAAM,GAAG,WAAW,EAAE,EAC7H,IAAI,CACL;QACD,mCAAmC;QACnC,uBAAuB;QACvB,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,MAAM,GAAG,oBAAoB,GAAG,QAAQ,GAAG,WAAW,EAAE,EACpG,IAAI,CACL;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,WAAW,CAAC,cAAsB;IAChD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,MAAM,QAAQ,GAAG,qBAAqB,EAAE,CAAC;IAEzC,sFAAsF;IACtF,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAAU,CAAC;IAE3C,KAAK,IAAI,YAAY,GAAG,CAAC,EAAE,YAAY,GAAG,QAAQ,CAAC,MAAM,EAAE,YAAY,EAAE,EAAE,CAAC;QAC1E,MAAM,OAAO,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;QACvC,IAAI,OAAO,KAAK,SAAS;YAAE,SAAS;QAEpC,IAAI,KAA6B,CAAC;QAClC,mCAAmC;QACnC,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;QAEtB,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACvD,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,SAAS,CAAC,4CAA4C;YACxD,CAAC;YAED,2EAA2E;YAC3E,6FAA6F;YAC7F,6CAA6C;YAC7C,IAAI,OAA2B,CAAC;YAChC,IAAI,KAAyB,CAAC;YAC9B,IAAI,SAA6B,CAAC;YAClC,IAAI,QAA4B,CAAC;YAEjC,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACnB,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACpB,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,KA
AK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACjB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;YAED,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;gBACjD,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAa,CAAC;gBAC9C,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;gBAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC1C,uCAAuC;oBACvC,IAAI,QAAwC,CAAC;oBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;wBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;wBACD,QAAQ,GAAG,EAAE,CAAC;wBACd,IACE,SAAS,KAAK,SAAS;4BACvB,SAAS,KAAK,EAAE;4BAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,SAAS,CAAC,WAAW,EAAE,CACxB,EACD,CAAC;4BACD,QAAQ,CAAC,MAAM;gCACb,SAAS,CAAC,WAAW,EAAkC,CAAC;wBAC5D,CAAC;wBACD,IACE,QAAQ,KAAK,SAAS;4BACtB,QAAQ,KAAK,EAAE;4BACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,QAAQ,CAAC,WAAW,EAAE,CACvB,EACD,CAAC;4BACD,QAAQ,CAAC,KAAK;gCACZ,QAAQ,CAAC,WAAW,EAAiC,CAAC;wBAC1D,CAAC;oBACH,CAAC;oBAED,IAAI,CAAC,IAAI,CAAC;wBACR,IAAI;wBACJ,EAAE;wBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;wBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;wBACrB,QAAQ;qBACT,CAAC,CAAC;oBACH,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBACpC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,cAAsB;IACtD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,mEAAmE;IACnE,MAAM,UAAU,GACd,wFAAwF,CAAC;IAE3F,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1D,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACzB,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEvB,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACjD,MAAM,IAAI,GAAG,OAAkB,CAAC;YAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1C,uCAAuC;gBACvC,IAAI,QAAwC,
CAAC;gBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;oBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;oBACD,QAAQ,GAAG,EAAE,CAAC;oBACd,IACE,SAAS,KAAK,SAAS;wBACvB,SAAS,KAAK,EAAE;wBAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;wBACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;oBAC9D,CAAC;oBACD,IACE,QAAQ,KAAK,SAAS;wBACtB,QAAQ,KAAK,EAAE;wBACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;wBACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC,IAAI,CAAC;oBACR,IAAI;oBACJ,EAAE;oBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;oBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;oBACrB,QAAQ;iBACT,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CACjC,QAA0B;IAE1B,MAAM,MAAM,GAA4B,EAA6B,CAAC;IAEtE,4BAA4B;IAC5B,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;IAED,iBAAiB;IACjB,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,SAAS,CACvB,cAAsB,EACtB,MAAiB,EACjB,SAAkB,KAAK;IAEvB,IAAI,MAAM,GAAG,cAAc,CAAC;IAC5B,MAAM,IAAI,GAAG,MAAM;QACjB,CAAC,CAAC,iBAAiB,CAAC,cAAc,CAAC;QACnC,CAAC,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC;IAEhC,8CAA8C;IAC9C,0DAA0D;IAC1D,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,KAAK,MAAM,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC;QACvD,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjC,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,qDAAqD;YACrD,6EAA6E;YAC7E,MAAM;gBACJ,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC;oBACzB,QAAQ;oBACR,MAAM,CAAC,KAAK,CAAC,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Title Extractor
3
+ * Extracts and strips honorific titles/prefixes from PERSON entities
4
+ * so that titles remain visible in anonymized text for translation.
5
+ *
6
+ * Supported languages: ar, de, en, es, fr, it, lv, nl, pt, zh
7
+ */
8
+ import { SpanMatch, SemanticAttributes } from "../types/index.js";
9
+ /**
10
+ * Result returned by extractTitle: the detected title (if any), the remaining name, and the offset at which that name starts
11
+ */
12
+ export interface TitleExtractionResult {
13
+ /** The extracted title (e.g., "Dr.", "Mr."), or undefined if the name carries no title */
14
+ title?: string;
15
+ /** The name with the title removed (the unchanged input when there is no title) */
16
+ nameWithoutTitle: string;
17
+ /** Character offset where the name starts (after title + space); 0 when there is no title */
18
+ titleLength: number;
19
+ }
20
+ /**
21
+ * Splits a leading honorific title (e.g., "Dr.") off the beginning of a name
22
+ *
23
+ * @param name - Full name that may start with a title
24
+ * @returns The title (if one was found), the remaining name, and the offset at which the remaining name starts
25
+ *
26
+ * @example
27
+ * extractTitle("Dr. John Smith") // { title: "Dr.", nameWithoutTitle: "John Smith", titleLength: 4 }
28
+ * extractTitle("John Smith") // { title: undefined, nameWithoutTitle: "John Smith", titleLength: 0 }
29
+ */
30
+ export declare function extractTitle(name: string): TitleExtractionResult;
31
+ /**
32
+ * SemanticAttributes extended with an optional extracted title
33
+ */
34
+ export interface SemanticAttributesWithTitle extends SemanticAttributes {
35
+ /** Title prefix extracted from the entity (e.g., "Dr.", "Mrs."); absent when none was extracted */
36
+ title?: string;
37
+ }
38
+ /**
39
+ * Processes PERSON spans to extract titles.
40
+ * Titles are removed from the span and stored in its semantic attributes.
41
+ * The span boundaries are adjusted so the title stays visible in the anonymized text.
42
+ *
43
+ * @param spans - Array of detected PII spans
44
+ * @param originalText - The original text (needed to verify span boundaries)
45
+ * @returns Array of spans in which titles have been extracted from PERSON entities
46
+ */
47
+ export declare function extractTitlesFromSpans(spans: SpanMatch[], originalText: string): SpanMatch[];
48
+ /**
49
+ * Gets all supported titles for one language, selected by its two-letter code (ar, de, en, es, fr, it, lv, nl, pt or zh)
50
+ */
51
+ export declare function getTitlesForLanguage(langCode: "ar" | "de" | "en" | "es" | "fr" | "it" | "lv" | "nl" | "pt" | "zh"): string[];
52
+ /**
53
+ * Gets the supported titles of all languages as a single string array
54
+ */
55
+ export declare function getAllTitles(): string[];
56
+ /**
57
+ * Checks whether the given text starts with a known title
58
+ */
59
+ export declare function startsWithTitle(text: string): boolean;
60
+ /**
61
+ * Checks whether the given text consists entirely of a title (with optional punctuation)
62
+ */
63
+ export declare function isOnlyTitle(text: string): boolean;
64
+ /**
65
+ * Merges adjacent PERSON spans when one of them is a title
66
+ *
67
+ * This fixes cases where an NER model splits "Mrs. Smith" into two entities:
68
+ * - PERSON: "Mrs" (or "Mrs.")
69
+ * - PERSON: "Smith"
70
+ *
71
+ * After merging there is a single entity: PERSON: "Mrs. Smith"
72
+ *
73
+ * @param spans - Array of detected PII spans
74
+ * @param originalText - The original text the spans refer to
75
+ * @param maxGap - Maximum number of characters between two spans for them to count as adjacent (default: 3)
76
+ * @returns Array of spans in which adjacent title + name PERSON entities have been merged
77
+ */
78
+ export declare function mergeAdjacentTitleSpans(spans: SpanMatch[], originalText: string, maxGap?: number): SpanMatch[];
79
+ //# sourceMappingURL=title-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"title-extractor.d.ts","sourceRoot":"","sources":["../../src/pipeline/title-extractor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EACL,SAAS,EAET,kBAAkB,EAEnB,MAAM,mBAAmB,CAAC;AAE3B;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,wEAAwE;IACxE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IACzB,mEAAmE;IACnE,WAAW,EAAE,MAAM,CAAC;CACrB;AAqjBD;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,qBAAqB,CAyBhE;AAED;;GAEG;AACH,MAAM,WAAW,2BAA4B,SAAQ,kBAAkB;IACrE,mDAAmD;IACnD,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAiEb;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAC5E,MAAM,EAAE,CAcV;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,MAAM,EAAE,CAEvC;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAGrD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CA0BjD;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,MAAU,GACjB,SAAS,EAAE,CAuFb"}