@elanlanguages/bridge-anonymization 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -1
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
- package/dist/crypto/pii-map-crypto.js +8 -8
- package/dist/crypto/pii-map-crypto.js.map +1 -1
- package/dist/index.d.ts +25 -20
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -52
- package/dist/index.js.map +1 -1
- package/dist/ner/model-manager.d.ts.map +1 -1
- package/dist/ner/model-manager.js +10 -8
- package/dist/ner/model-manager.js.map +1 -1
- package/dist/ner/ner-model.d.ts.map +1 -1
- package/dist/ner/ner-model.js +10 -10
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/onnx-runtime.d.ts +3 -3
- package/dist/ner/onnx-runtime.d.ts.map +1 -1
- package/dist/ner/onnx-runtime.js +1 -1
- package/dist/ner/onnx-runtime.js.map +1 -1
- package/dist/ner/tokenizer.d.ts +26 -53
- package/dist/ner/tokenizer.d.ts.map +1 -1
- package/dist/ner/tokenizer.js +174 -196
- package/dist/ner/tokenizer.js.map +1 -1
- package/dist/pipeline/index.d.ts +7 -4
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +7 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/resolver.d.ts.map +1 -1
- package/dist/pipeline/resolver.js +3 -2
- package/dist/pipeline/resolver.js.map +1 -1
- package/dist/pipeline/semantic-data-loader.d.ts +157 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +662 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +102 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +268 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +52 -12
- package/dist/pipeline/tagger.d.ts.map +1 -1
- package/dist/pipeline/tagger.js +226 -21
- package/dist/pipeline/tagger.js.map +1 -1
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/types/index.d.ts +66 -3
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +14 -3
- package/dist/types/index.js.map +1 -1
- package/dist/utils/index.d.ts +3 -3
- package/dist/utils/index.js +3 -3
- package/package.json +7 -5
package/dist/pipeline/tagger.js
CHANGED
|
@@ -2,25 +2,43 @@
|
|
|
2
2
|
* Replacement Tagger
|
|
3
3
|
* Replaces PII spans with placeholder tags and builds the PII map
|
|
4
4
|
*/
|
|
5
|
-
import { PIIType, } from
|
|
6
|
-
import { sortSpansByPosition } from
|
|
5
|
+
import { PIIType, } from "../types/index.js";
|
|
6
|
+
import { sortSpansByPosition } from "../utils/offsets.js";
|
|
7
7
|
/**
|
|
8
8
|
* Generates a PII placeholder tag
|
|
9
|
-
* Format: <PII type="TYPE" id="N"/>
|
|
9
|
+
* Format: <PII type="TYPE" id="N"/> or <PII type="TYPE" gender="X" id="N"/> etc.
|
|
10
|
+
*
|
|
11
|
+
* Semantic attributes (gender, scope) are included when provided and not 'unknown'
|
|
10
12
|
*/
|
|
11
|
-
export function generateTag(type, id) {
|
|
12
|
-
|
|
13
|
+
export function generateTag(type, id, semantic) {
|
|
14
|
+
let attrs = `type="${type}"`;
|
|
15
|
+
// Add semantic attributes if present and meaningful
|
|
16
|
+
if (semantic?.gender && semantic.gender !== "unknown") {
|
|
17
|
+
attrs += ` gender="${semantic.gender}"`;
|
|
18
|
+
}
|
|
19
|
+
if (semantic?.scope && semantic.scope !== "unknown") {
|
|
20
|
+
attrs += ` scope="${semantic.scope}"`;
|
|
21
|
+
}
|
|
22
|
+
attrs += ` id="${id}"`;
|
|
23
|
+
return `<PII ${attrs}/>`;
|
|
13
24
|
}
|
|
14
25
|
/**
|
|
15
|
-
* Parses a PII tag to extract type and
|
|
26
|
+
* Parses a PII tag to extract type, id, and semantic attributes
|
|
16
27
|
* Returns null if not a valid tag
|
|
28
|
+
*
|
|
29
|
+
* Supports formats:
|
|
30
|
+
* - <PII type="TYPE" id="N"/>
|
|
31
|
+
* - <PII type="TYPE" gender="X" id="N"/>
|
|
32
|
+
* - <PII type="TYPE" scope="X" id="N"/>
|
|
33
|
+
* - <PII type="TYPE" gender="X" scope="Y" id="N"/>
|
|
17
34
|
*/
|
|
18
35
|
export function parseTag(tag) {
|
|
19
|
-
|
|
36
|
+
// More flexible regex that handles optional gender/scope attributes
|
|
37
|
+
const match = tag.match(/^<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>$/);
|
|
20
38
|
if (match === null) {
|
|
21
39
|
return null;
|
|
22
40
|
}
|
|
23
|
-
const [, typeStr, idStr] = match;
|
|
41
|
+
const [, typeStr, genderStr, scopeStr, idStr] = match;
|
|
24
42
|
if (typeStr === undefined || idStr === undefined) {
|
|
25
43
|
return null;
|
|
26
44
|
}
|
|
@@ -30,7 +48,23 @@ export function parseTag(tag) {
|
|
|
30
48
|
if (!Object.values(PIIType).includes(type)) {
|
|
31
49
|
return null;
|
|
32
50
|
}
|
|
33
|
-
|
|
51
|
+
// Build semantic attributes if present
|
|
52
|
+
let semantic;
|
|
53
|
+
if ((genderStr !== undefined && genderStr !== "") ||
|
|
54
|
+
(scopeStr !== undefined && scopeStr !== "")) {
|
|
55
|
+
semantic = {};
|
|
56
|
+
if (genderStr !== undefined &&
|
|
57
|
+
genderStr !== "" &&
|
|
58
|
+
["male", "female", "neutral", "unknown"].includes(genderStr)) {
|
|
59
|
+
semantic.gender = genderStr;
|
|
60
|
+
}
|
|
61
|
+
if (scopeStr !== undefined &&
|
|
62
|
+
scopeStr !== "" &&
|
|
63
|
+
["city", "country", "region", "unknown"].includes(scopeStr)) {
|
|
64
|
+
semantic.scope = scopeStr;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return { type, id, semantic };
|
|
34
68
|
}
|
|
35
69
|
/**
|
|
36
70
|
* Creates a key for the PII map
|
|
@@ -86,9 +120,11 @@ export function tagEntities(text, matches, policy) {
|
|
|
86
120
|
// Perform replacements
|
|
87
121
|
let anonymizedText = text;
|
|
88
122
|
for (const entity of sortedDescending) {
|
|
89
|
-
const tag = generateTag(entity.type, entity.id);
|
|
123
|
+
const tag = generateTag(entity.type, entity.id, entity.semantic);
|
|
90
124
|
anonymizedText =
|
|
91
|
-
anonymizedText.slice(0, entity.start) +
|
|
125
|
+
anonymizedText.slice(0, entity.start) +
|
|
126
|
+
tag +
|
|
127
|
+
anonymizedText.slice(entity.end);
|
|
92
128
|
}
|
|
93
129
|
// Build final entities list (sorted by position)
|
|
94
130
|
const entities = entitiesWithIds.map((e) => ({
|
|
@@ -99,6 +135,7 @@ export function tagEntities(text, matches, policy) {
|
|
|
99
135
|
confidence: e.confidence,
|
|
100
136
|
source: e.source,
|
|
101
137
|
original: e.text,
|
|
138
|
+
semantic: e.semantic,
|
|
102
139
|
}));
|
|
103
140
|
return {
|
|
104
141
|
anonymizedText,
|
|
@@ -113,20 +150,176 @@ export function isValidTag(tag) {
|
|
|
113
150
|
return parseTag(tag) !== null;
|
|
114
151
|
}
|
|
115
152
|
/**
|
|
116
|
-
*
|
|
153
|
+
* Quote characters that might appear after translation
|
|
154
|
+
* Includes: standard quotes, smart quotes, German quotes, French quotes, etc.
|
|
155
|
+
*
|
|
156
|
+
* Unicode references:
|
|
157
|
+
* - \u0022 (") Standard double quote
|
|
158
|
+
* - \u0027 (') Standard single quote
|
|
159
|
+
* - \u0060 (`) Backtick
|
|
160
|
+
* - \u00AB («) Left guillemet
|
|
161
|
+
* - \u00BB (») Right guillemet
|
|
162
|
+
* - \u2018 (') Left single curly quote
|
|
163
|
+
* - \u2019 (') Right single curly quote
|
|
164
|
+
* - \u201A (‚) Single low-9 quote
|
|
165
|
+
* - \u201C (") Left double curly quote
|
|
166
|
+
* - \u201D (") Right double curly quote
|
|
167
|
+
* - \u201E („) Double low-9 quote (German)
|
|
168
|
+
*/
|
|
169
|
+
const QUOTE_CHARS = "[\"'`\u00AB\u00BB\u2018\u2019\u201A\u201C\u201D\u201E]";
|
|
170
|
+
/**
|
|
171
|
+
* Whitespace pattern including various unicode spaces
|
|
172
|
+
*/
|
|
173
|
+
const FLEXIBLE_WS = `[\\s\\u00A0\\u2000-\\u200B]*`;
|
|
174
|
+
const FLEXIBLE_WS_REQUIRED = `[\\s\\u00A0\\u2000-\\u200B]+`;
|
|
175
|
+
/**
|
|
176
|
+
* Builds patterns for fuzzy PII tag matching
|
|
177
|
+
* Handles various translation artifacts and optional semantic attributes
|
|
178
|
+
*/
|
|
179
|
+
function buildFuzzyTagPatterns() {
|
|
180
|
+
// Pattern for type attribute: type = "VALUE" (flexible spacing and quotes)
|
|
181
|
+
const typeAttr = `type${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}([A-Z_]+)${QUOTE_CHARS}`;
|
|
182
|
+
// Pattern for id attribute: id = "VALUE" (flexible spacing and quotes)
|
|
183
|
+
const idAttr = `id${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\d+)${QUOTE_CHARS}`;
|
|
184
|
+
// Optional gender attribute
|
|
185
|
+
const genderAttr = `(?:${FLEXIBLE_WS}gender${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\w+)${QUOTE_CHARS})?`;
|
|
186
|
+
// Optional scope attribute
|
|
187
|
+
const scopeAttr = `(?:${FLEXIBLE_WS}scope${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(\\w+)${QUOTE_CHARS})?`;
|
|
188
|
+
// Self-closing tag endings: />, / >, >, etc.
|
|
189
|
+
const selfClosing = `${FLEXIBLE_WS}\\/?${FLEXIBLE_WS}>`;
|
|
190
|
+
return [
|
|
191
|
+
// type first with optional gender/scope: <PII type="X" gender="Y" scope="Z" id="N"/>
|
|
192
|
+
// Groups: type=1, gender=2, scope=3, id=4
|
|
193
|
+
new RegExp(`<${FLEXIBLE_WS}PII${FLEXIBLE_WS_REQUIRED}${typeAttr}${genderAttr}${scopeAttr}${FLEXIBLE_WS_REQUIRED}${idAttr}${selfClosing}`, "gi"),
|
|
194
|
+
// id first: <PII id="N" type="X"/>
|
|
195
|
+
// Groups: id=1, type=2
|
|
196
|
+
new RegExp(`<${FLEXIBLE_WS}PII${FLEXIBLE_WS_REQUIRED}${idAttr}${FLEXIBLE_WS_REQUIRED}${typeAttr}${selfClosing}`, "gi"),
|
|
197
|
+
];
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Extracts all PII tags from anonymized text using fuzzy matching
|
|
201
|
+
* Handles mangled tags that may occur after translation
|
|
202
|
+
*
|
|
203
|
+
* Translation can mangle tags by:
|
|
204
|
+
* - Changing quote types (" → " or „ or « etc.)
|
|
205
|
+
* - Adding/removing whitespace
|
|
206
|
+
* - Changing case (type → Type, PII → pii)
|
|
207
|
+
* - Reordering attributes (id before type)
|
|
208
|
+
* - Modifying self-closing syntax (/> → / > or >)
|
|
117
209
|
*/
|
|
118
210
|
export function extractTags(anonymizedText) {
|
|
119
211
|
const tags = [];
|
|
120
|
-
const
|
|
212
|
+
const patterns = buildFuzzyTagPatterns();
|
|
213
|
+
// Track positions we've already matched to avoid duplicates from overlapping patterns
|
|
214
|
+
const matchedPositions = new Set();
|
|
215
|
+
for (let patternIndex = 0; patternIndex < patterns.length; patternIndex++) {
|
|
216
|
+
const pattern = patterns[patternIndex];
|
|
217
|
+
if (pattern === undefined)
|
|
218
|
+
continue;
|
|
219
|
+
let match;
|
|
220
|
+
// Reset lastIndex for each pattern
|
|
221
|
+
pattern.lastIndex = 0;
|
|
222
|
+
while ((match = pattern.exec(anonymizedText)) !== null) {
|
|
223
|
+
if (matchedPositions.has(match.index)) {
|
|
224
|
+
continue; // Skip duplicates from overlapping patterns
|
|
225
|
+
}
|
|
226
|
+
// Extract type, id, and semantic attributes based on which pattern matched
|
|
227
|
+
// Pattern 0: type first with optional gender/scope (groups: type=1, gender=2, scope=3, id=4)
|
|
228
|
+
// Pattern 1: id first (groups: id=1, type=2)
|
|
229
|
+
let typeStr;
|
|
230
|
+
let idStr;
|
|
231
|
+
let genderStr;
|
|
232
|
+
let scopeStr;
|
|
233
|
+
if (patternIndex === 0) {
|
|
234
|
+
typeStr = match[1];
|
|
235
|
+
genderStr = match[2];
|
|
236
|
+
scopeStr = match[3];
|
|
237
|
+
idStr = match[4];
|
|
238
|
+
}
|
|
239
|
+
else {
|
|
240
|
+
idStr = match[1];
|
|
241
|
+
typeStr = match[2];
|
|
242
|
+
}
|
|
243
|
+
if (typeStr !== undefined && idStr !== undefined) {
|
|
244
|
+
const type = typeStr.toUpperCase();
|
|
245
|
+
const id = parseInt(idStr, 10);
|
|
246
|
+
if (Object.values(PIIType).includes(type)) {
|
|
247
|
+
// Build semantic attributes if present
|
|
248
|
+
let semantic;
|
|
249
|
+
if ((genderStr !== undefined && genderStr !== "") ||
|
|
250
|
+
(scopeStr !== undefined && scopeStr !== "")) {
|
|
251
|
+
semantic = {};
|
|
252
|
+
if (genderStr !== undefined &&
|
|
253
|
+
genderStr !== "" &&
|
|
254
|
+
["male", "female", "neutral", "unknown"].includes(genderStr.toLowerCase())) {
|
|
255
|
+
semantic.gender =
|
|
256
|
+
genderStr.toLowerCase();
|
|
257
|
+
}
|
|
258
|
+
if (scopeStr !== undefined &&
|
|
259
|
+
scopeStr !== "" &&
|
|
260
|
+
["city", "country", "region", "unknown"].includes(scopeStr.toLowerCase())) {
|
|
261
|
+
semantic.scope =
|
|
262
|
+
scopeStr.toLowerCase();
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
tags.push({
|
|
266
|
+
type,
|
|
267
|
+
id,
|
|
268
|
+
position: match.index,
|
|
269
|
+
matchedText: match[0],
|
|
270
|
+
semantic,
|
|
271
|
+
});
|
|
272
|
+
matchedPositions.add(match.index);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
// Sort by position ascending
|
|
278
|
+
tags.sort((a, b) => a.position - b.position);
|
|
279
|
+
return tags;
|
|
280
|
+
}
|
|
281
|
+
/**
|
|
282
|
+
* Extracts tags using strict matching (original behavior)
|
|
283
|
+
* Useful when you know tags haven't been mangled
|
|
284
|
+
* Supports optional gender and scope attributes
|
|
285
|
+
*/
|
|
286
|
+
export function extractTagsStrict(anonymizedText) {
|
|
287
|
+
const tags = [];
|
|
288
|
+
// Pattern matches: <PII type="X" [gender="Y"] [scope="Z"] id="N"/>
|
|
289
|
+
const tagPattern = /<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>/g;
|
|
121
290
|
let match;
|
|
122
291
|
while ((match = tagPattern.exec(anonymizedText)) !== null) {
|
|
123
292
|
const typeStr = match[1];
|
|
124
|
-
const
|
|
293
|
+
const genderStr = match[2];
|
|
294
|
+
const scopeStr = match[3];
|
|
295
|
+
const idStr = match[4];
|
|
125
296
|
if (typeStr !== undefined && idStr !== undefined) {
|
|
126
297
|
const type = typeStr;
|
|
127
298
|
const id = parseInt(idStr, 10);
|
|
128
299
|
if (Object.values(PIIType).includes(type)) {
|
|
129
|
-
|
|
300
|
+
// Build semantic attributes if present
|
|
301
|
+
let semantic;
|
|
302
|
+
if ((genderStr !== undefined && genderStr !== "") ||
|
|
303
|
+
(scopeStr !== undefined && scopeStr !== "")) {
|
|
304
|
+
semantic = {};
|
|
305
|
+
if (genderStr !== undefined &&
|
|
306
|
+
genderStr !== "" &&
|
|
307
|
+
["male", "female", "neutral", "unknown"].includes(genderStr)) {
|
|
308
|
+
semantic.gender = genderStr;
|
|
309
|
+
}
|
|
310
|
+
if (scopeStr !== undefined &&
|
|
311
|
+
scopeStr !== "" &&
|
|
312
|
+
["city", "country", "region", "unknown"].includes(scopeStr)) {
|
|
313
|
+
semantic.scope = scopeStr;
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
tags.push({
|
|
317
|
+
type,
|
|
318
|
+
id,
|
|
319
|
+
position: match.index,
|
|
320
|
+
matchedText: match[0],
|
|
321
|
+
semantic,
|
|
322
|
+
});
|
|
130
323
|
}
|
|
131
324
|
}
|
|
132
325
|
}
|
|
@@ -149,19 +342,31 @@ export function countEntitiesByType(entities) {
|
|
|
149
342
|
}
|
|
150
343
|
/**
|
|
151
344
|
* Rehydrates anonymized text using the PII map
|
|
152
|
-
*
|
|
345
|
+
* Uses fuzzy matching to handle tags that may have been mangled by translation
|
|
346
|
+
*
|
|
347
|
+
* @param anonymizedText - Text containing PII tags (possibly mangled)
|
|
348
|
+
* @param piiMap - Map of PII keys to original values
|
|
349
|
+
* @param strict - If true, use strict matching (original behavior). Default: false
|
|
350
|
+
* @returns Text with PII tags replaced by original values
|
|
153
351
|
*/
|
|
154
|
-
export function rehydrate(anonymizedText, piiMap) {
|
|
352
|
+
export function rehydrate(anonymizedText, piiMap, strict = false) {
|
|
155
353
|
let result = anonymizedText;
|
|
156
|
-
const tags =
|
|
354
|
+
const tags = strict
|
|
355
|
+
? extractTagsStrict(anonymizedText)
|
|
356
|
+
: extractTags(anonymizedText);
|
|
157
357
|
// Sort by position descending for replacement
|
|
358
|
+
// (replacing from end to start preserves earlier offsets)
|
|
158
359
|
tags.sort((a, b) => b.position - a.position);
|
|
159
|
-
for (const { type, id, position } of tags) {
|
|
360
|
+
for (const { type, id, position, matchedText } of tags) {
|
|
160
361
|
const key = createPIIMapKey(type, id);
|
|
161
362
|
const original = piiMap.get(key);
|
|
162
363
|
if (original !== undefined) {
|
|
163
|
-
|
|
164
|
-
|
|
364
|
+
// Use the actual matched text length for replacement
|
|
365
|
+
// This handles mangled tags where the length differs from the canonical form
|
|
366
|
+
result =
|
|
367
|
+
result.slice(0, position) +
|
|
368
|
+
original +
|
|
369
|
+
result.slice(position + matchedText.length);
|
|
165
370
|
}
|
|
166
371
|
}
|
|
167
372
|
return result;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tagger.js","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,GAKR,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,mBAAmB,
|
|
1
|
+
{"version":3,"file":"tagger.js","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,GAKR,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AA+B1D;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CACzB,IAAa,EACb,EAAU,EACV,QAA6B;IAE7B,IAAI,KAAK,GAAG,SAAS,IAAI,GAAG,CAAC;IAE7B,oDAAoD;IACpD,IAAI,QAAQ,EAAE,MAAM,IAAI,QAAQ,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;QACtD,KAAK,IAAI,YAAY,QAAQ,CAAC,MAAM,GAAG,CAAC;IAC1C,CAAC;IACD,IAAI,QAAQ,EAAE,KAAK,IAAI,QAAQ,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;QACpD,KAAK,IAAI,WAAW,QAAQ,CAAC,KAAK,GAAG,CAAC;IACxC,CAAC;IAED,KAAK,IAAI,QAAQ,EAAE,GAAG,CAAC;IAEvB,OAAO,QAAQ,KAAK,IAAI,CAAC;AAC3B,CAAC;AAWD;;;;;;;;;GASG;AACH,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,oEAAoE;IACpE,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CACrB,yFAAyF,CAC1F,CAAC;IAEF,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,CAAC,GAAG,KAAK,CAAC;IACtD,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QACjD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,IAAI,GAAG,OAAkB,CAAC;IAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAE/B,mCAAmC;IACnC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,uCAAuC;IACvC,IAAI,QAAwC,CAAC;IAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;QAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;QACD,QAAQ,GAAG,EAAE,CAAC;QACd,IACE,SAAS,KAAK,SAAS;YACvB,SAAS,KAAK,EAAE;YAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;YACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;QAC9D,CAAC;QACD,IACE,QAAQ,KAAK,SAAS;YACtB,QAAQ,KAAK,EAAE;YACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;YACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAAC,IAAa,EAAE,EAAU;IACvD,OAAO,GAAG,IAAI,IAAI,EAAE,EAAE,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAoB,EACpB,MAA2B;IAE3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO;YACL,cAAc,EAAE,IAAI;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,IAAI,GAAG,EAAE;SAClB,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,MAAM,eAAe,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAErD,aAAa;IACb,MAAM,eAAe,GAAsC,EAAE,CAAC;IAC9D,IAAI,MAAM,GAAG,CAAC,CAAC;IAEf,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,aAAa;IAEzD,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,EAAU,CAAC;QAEf,IAAI,MAAM,CAAC,sBAAsB,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACrC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,EAAE,GAAG,UAAU,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,EAAE,GAAG,MAAM,EAAE,CAAC;gBACd,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,MAAM,EAAE,CAAC;QAChB,CAAC;QAED,eAAe,CAAC,IAAI,CAAC,EAAE,GAAG,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,gBAAgB;IAChB,MAAM,MAAM,GAAc,IAAI,GAAG,EAAE,CAAC;IACpC,KAAK,MAAM,MAAM,IAAI,eAAe,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;QACpD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IAED,oDAAoD;IACpD,0DAA0D;IAC1D,MAAM,gBAAgB,GAAG,CAAC,GAAG,eAAe,CAAC,CAAC,IAAI,CAChD,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAC5B,CAAC;IAEF,uBAAuB;IACvB,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACtC,MAAM,GAAG,GAAG,WAAW,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;QACjE,cAAc;YACZ,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC;gBACrC,GAAG;gBACH,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IACrC,CAAC;IAED,iDAAiD;IACjD,MAAM,QAAQ,GAAqB,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,IAAI;QAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;KACrB,CAAC,CAAC,CAAC;IAEJ,OAAO;QACL,cAAc;QACd,QAAQ,EAAE,mBAAmB,CAAC,QAAQ,CAAC;QACvC,MAAM;KACP,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,OAAO,QAAQ,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC;AAChC,CAAC;AAeD;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,WAAW,GAAG,wDAAwD,CAAC;AAE7E;;GAEG;AACH,MAAM,WAAW,GAAG,8BAA8B,CAAC;AACnD,MAAM,oBAAoB,GAAG,8BAA8B,CAAC;AAE5D;;;GAGG;AACH,SAAS,qBAAqB;IAC5B,2EAA2E;IAC3E,MAAM,QAAQ,GAAG,OAAO,WAAW,IAAI,WAAW,GAAG,WAAW,YAAY,WAAW,EAAE,CAAC;IAC1F,uEAAuE;IACvE,MAAM,MAAM,GAAG,KAAK,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,EAAE,CAAC;IACnF,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,WAAW,SAAS,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAC9G,2BAA2B;IAC3B,MAAM,SAAS,GAAG,MAAM,WAAW,QAAQ,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAE5G,6CAA6C;IAC7C,MAAM,WAAW,GAAG,GAAG,WAAW,OAAO,WAAW,GAAG,CAAC;IAExD,OAAO;QACL,qFAAqF;QACrF,0CAA0C;QAC1C,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,QAAQ,GAAG,UAAU,GAAG,SAAS,GAAG,oBAAoB,GAAG,MAAM,GAAG,WAAW,EAAE,EAC7H,IAAI,CACL;QACD,mCAAmC;QACnC,uBAAuB;QACvB,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,MAAM,GAAG,oBAAoB,GAAG,QAAQ,GAAG,WAAW,EAAE,EACpG,IAAI,CACL;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,WAAW,CAAC,cAAsB;IAChD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,MAAM,QAAQ,GAAG,qBAAqB,EAAE,CAAC;IAEzC,sFAAsF;IACtF,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAAU,CAAC;IAE3C,KAAK,IAAI,YAAY,GAAG,CAAC,EAAE,YAAY,GAAG,QAAQ,CAAC,MAAM,EAAE,YAAY,EAAE,EAAE,CAAC;QAC1E,MAAM,OAAO,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;QACvC,IAAI,OAAO,KAAK,SAAS;YAAE,SAAS;QAEpC,IAAI,KAA6B,CAAC;QAClC,mCAAmC;QACnC,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;QAEtB,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACvD,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,SAAS,CAAC,4CAA4C;YACxD,CAAC;YAED,2EAA2E;YAC3E,6FAA6F;YAC7F,6CAA6C;YAC7C,IAAI,OAA2B,CAAC;YAChC,IAAI,KAAyB,CAAC;YAC9B,IAAI,SAA6B,CAAC;YAClC,IAAI,QAA4B,CAAC;YAEjC,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACnB,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACpB,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACjB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;YAED,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;gBACjD,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAa,CAAC;gBAC9C,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;gBAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC1C,uCAAuC;oBACvC,IAAI,QAAwC,CAAC;oBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;wBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;wBACD,QAAQ,GAAG,EAAE,CAAC;wBACd,IACE,SAAS,KAAK,SAAS;4BACvB,SAAS,KAAK,EAAE;4BAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,SAAS,CAAC,WAAW,EAAE,CACxB,EACD,CAAC;4BACD,QAAQ,CAAC,MAAM;gCACb,SAAS,CAAC,WAAW,EAAkC,CAAC;wBAC5D,CAAC;wBACD,IACE,QAAQ,KAAK,SAAS;4BACtB,QAAQ,KAAK,EAAE;4BACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,QAAQ,CAAC,WAAW,EAAE,CACvB,EACD,CAAC;4BACD,QAAQ,CAAC,KAAK;gCACZ,QAAQ,CAAC,WAAW,EAAiC,CAAC;wBAC1D,CAAC;oBACH,CAAC;oBAED,IAAI,CAAC,IAAI,CAAC;wBACR,IAAI;wBACJ,EAAE;wBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;wBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;wBACrB,QAAQ;qBACT,CAAC,CAAC;oBACH,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBACpC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,cAAsB;IACtD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,mEAAmE;IACnE,MAAM,UAAU,GACd,wFAAwF,CAAC;IAE3F,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1D,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACzB,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEvB,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACjD,MAAM,IAAI,GAAG,OAAkB,CAAC;YAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1C,uCAAuC;gBACvC,IAAI,QAAwC,CAAC;gBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;oBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;oBACD,QAAQ,GAAG,EAAE,CAAC;oBACd,IACE,SAAS,KAAK,SAAS;wBACvB,SAAS,KAAK,EAAE;wBAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;wBACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;oBAC9D,CAAC;oBACD,IACE,QAAQ,KAAK,SAAS;wBACtB,QAAQ,KAAK,EAAE;wBACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;wBACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC,IAAI,CAAC;oBACR,IAAI;oBACJ,EAAE;oBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;oBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;oBACrB,QAAQ;iBACT,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CACjC,QAA0B;IAE1B,MAAM,MAAM,GAA4B,EAA6B,CAAC;IAEtE,4BAA4B;IAC5B,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;IAED,iBAAiB;IACjB,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,SAAS,CACvB,cAAsB,EACtB,MAAiB,EACjB,SAAkB,KAAK;IAEvB,IAAI,MAAM,GAAG,cAAc,CAAC;IAC5B,MAAM,IAAI,GAAG,MAAM;QACjB,CAAC,CAAC,iBAAiB,CAAC,cAAc,CAAC;QACnC,CAAC,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC;IAEhC,8CAA8C;IAC9C,0DAA0D;IAC1D,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,KAAK,MAAM,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC;QACvD,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjC,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,qDAAqD;YACrD,6EAA6E;YAC7E,MAAM;gBACJ,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC;oBACzB,QAAQ;oBACR,MAAM,CAAC,KAAK,CAAC,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Title Extractor
|
|
3
|
+
* Extracts and strips honorific titles/prefixes from PERSON entities
|
|
4
|
+
* so that titles remain visible in anonymized text for translation.
|
|
5
|
+
*
|
|
6
|
+
* Supported languages: ar, de, en, es, fr, it, lv, nl, pt, zh
|
|
7
|
+
*/
|
|
8
|
+
import { SpanMatch, SemanticAttributes } from "../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* Title extraction result
|
|
11
|
+
*/
|
|
12
|
+
export interface TitleExtractionResult {
|
|
13
|
+
/** The extracted title (e.g., "Dr.", "Mr.") or undefined if no title */
|
|
14
|
+
title?: string;
|
|
15
|
+
/** The name without the title */
|
|
16
|
+
nameWithoutTitle: string;
|
|
17
|
+
/** Character offset where the name starts (after title + space) */
|
|
18
|
+
titleLength: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Extracts a title from the beginning of a name
|
|
22
|
+
*
|
|
23
|
+
* @param name - Full name potentially starting with a title
|
|
24
|
+
* @returns Extraction result with title, remaining name, and offset
|
|
25
|
+
*
|
|
26
|
+
* @example
|
|
27
|
+
* extractTitle("Dr. John Smith") // { title: "Dr.", nameWithoutTitle: "John Smith", titleLength: 4 }
|
|
28
|
+
* extractTitle("John Smith") // { title: undefined, nameWithoutTitle: "John Smith", titleLength: 0 }
|
|
29
|
+
*/
|
|
30
|
+
export declare function extractTitle(name: string): TitleExtractionResult;
|
|
31
|
+
/**
|
|
32
|
+
* Extended semantic attributes including title
|
|
33
|
+
*/
|
|
34
|
+
export interface SemanticAttributesWithTitle extends SemanticAttributes {
|
|
35
|
+
/** Extracted title prefix (e.g., "Dr.", "Mrs.") */
|
|
36
|
+
title?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Processes PERSON spans to extract titles
|
|
40
|
+
* Titles are removed from the span and stored in semantic attributes
|
|
41
|
+
* The span boundaries are adjusted so the title remains visible
|
|
42
|
+
*
|
|
43
|
+
* @param spans - Array of detected PII spans
|
|
44
|
+
* @param originalText - The original text (needed to verify span boundaries)
|
|
45
|
+
* @returns Array of spans with titles extracted from PERSON entities
|
|
46
|
+
*/
|
|
47
|
+
export declare function extractTitlesFromSpans(spans: SpanMatch[], originalText: string): SpanMatch[];
|
|
48
|
+
/**
|
|
49
|
+
* Gets all supported titles for a specific language
|
|
50
|
+
*/
|
|
51
|
+
export declare function getTitlesForLanguage(langCode: "ar" | "de" | "en" | "es" | "fr" | "it" | "lv" | "nl" | "pt" | "zh"): string[];
|
|
52
|
+
/**
|
|
53
|
+
* Gets all supported titles across all languages
|
|
54
|
+
*/
|
|
55
|
+
export declare function getAllTitles(): string[];
|
|
56
|
+
/**
|
|
57
|
+
* Checks if a string starts with a known title
|
|
58
|
+
*/
|
|
59
|
+
export declare function startsWithTitle(text: string): boolean;
|
|
60
|
+
/**
|
|
61
|
+
* Checks if a text consists entirely of a title (with optional punctuation)
|
|
62
|
+
*/
|
|
63
|
+
export declare function isOnlyTitle(text: string): boolean;
|
|
64
|
+
/**
|
|
65
|
+
* Merges adjacent PERSON spans when one is a title
|
|
66
|
+
*
|
|
67
|
+
* This fixes issues where NER models split "Mrs. Smith" into two entities:
|
|
68
|
+
* - PERSON: "Mrs" (or "Mrs.")
|
|
69
|
+
* - PERSON: "Smith"
|
|
70
|
+
*
|
|
71
|
+
* After merging: PERSON: "Mrs. Smith"
|
|
72
|
+
*
|
|
73
|
+
* @param spans - Array of detected PII spans
|
|
74
|
+
* @param originalText - The original text
|
|
75
|
+
* @param maxGap - Maximum characters between spans to consider them adjacent (default: 3)
|
|
76
|
+
* @returns Array of spans with adjacent title+name PERSON entities merged
|
|
77
|
+
*/
|
|
78
|
+
export declare function mergeAdjacentTitleSpans(spans: SpanMatch[], originalText: string, maxGap?: number): SpanMatch[];
|
|
79
|
+
//# sourceMappingURL=title-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"title-extractor.d.ts","sourceRoot":"","sources":["../../src/pipeline/title-extractor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EACL,SAAS,EAET,kBAAkB,EAEnB,MAAM,mBAAmB,CAAC;AAE3B;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,wEAAwE;IACxE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IACzB,mEAAmE;IACnE,WAAW,EAAE,MAAM,CAAC;CACrB;AAqjBD;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,qBAAqB,CAyBhE;AAED;;GAEG;AACH,MAAM,WAAW,2BAA4B,SAAQ,kBAAkB;IACrE,mDAAmD;IACnD,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAiEb;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAC5E,MAAM,EAAE,CAcV;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,MAAM,EAAE,CAEvC;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAGrD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CA0BjD;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,MAAU,GACjB,SAAS,EAAE,CAuFb"}
|