rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Replacement Tagger
|
|
3
|
+
* Replaces PII spans with placeholder tags and builds the PII map
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, } from "../types/index.js";
|
|
6
|
+
import { sortSpansByPosition } from "../utils/offsets.js";
|
|
7
|
+
/**
|
|
8
|
+
* Generates a PII placeholder tag
|
|
9
|
+
* Format: <PII type="TYPE" id="N"/> or <PII type="TYPE" gender="X" id="N"/> etc.
|
|
10
|
+
*
|
|
11
|
+
* Semantic attributes (gender, scope) are included when provided and not 'unknown'
|
|
12
|
+
*/
|
|
13
|
+
export function generateTag(type, id, semantic) {
    // A semantic value is only worth emitting when it is set and is not the
    // "unknown" placeholder.
    const isMeaningful = (value) => Boolean(value) && value !== "unknown";
    // Attribute order is fixed: type, then gender, then scope, then id.
    const attributes = [`type="${type}"`];
    const gender = semantic?.gender;
    if (isMeaningful(gender)) {
        attributes.push(`gender="${gender}"`);
    }
    const scope = semantic?.scope;
    if (isMeaningful(scope)) {
        attributes.push(`scope="${scope}"`);
    }
    attributes.push(`id="${id}"`);
    return `<PII ${attributes.join(" ")}/>`;
}
|
|
25
|
+
/**
|
|
26
|
+
* Parses a PII tag to extract type, id, and semantic attributes
|
|
27
|
+
* Returns null if not a valid tag
|
|
28
|
+
*
|
|
29
|
+
* Supports formats:
|
|
30
|
+
* - <PII type="TYPE" id="N"/>
|
|
31
|
+
* - <PII type="TYPE" gender="X" id="N"/>
|
|
32
|
+
* - <PII type="TYPE" scope="X" id="N"/>
|
|
33
|
+
* - <PII type="TYPE" gender="X" scope="Y" id="N"/>
|
|
34
|
+
*/
|
|
35
|
+
export function parseTag(tag) {
    // Canonical form only: type first, optional gender, optional scope, id last.
    const found = tag.match(/^<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>$/);
    if (!found) {
        return null;
    }
    const typeStr = found[1];
    const genderStr = found[2];
    const scopeStr = found[3];
    const idStr = found[4];
    if (typeStr === undefined || idStr === undefined) {
        return null;
    }
    // Reject type names that are not members of PIIType.
    if (!Object.values(PIIType).includes(typeStr)) {
        return null;
    }
    const id = parseInt(idStr, 10);
    const hasGender = genderStr !== undefined && genderStr !== "";
    const hasScope = scopeStr !== undefined && scopeStr !== "";
    // `semantic` stays undefined unless the tag carried at least one semantic
    // attribute; an attribute with an unrecognised value is dropped, which
    // can leave an empty object.
    let semantic;
    if (hasGender || hasScope) {
        semantic = {};
        if (hasGender && ["male", "female", "neutral", "unknown"].includes(genderStr)) {
            semantic.gender = genderStr;
        }
        if (hasScope && ["city", "country", "region", "unknown"].includes(scopeStr)) {
            semantic.scope = scopeStr;
        }
    }
    return { type: typeStr, id, semantic };
}
|
|
69
|
+
/**
|
|
70
|
+
* Creates a key for the PII map
|
|
71
|
+
*/
|
|
72
|
+
export function createPIIMapKey(type, id) {
    // Key layout is "<TYPE>_<id>", e.g. "PERSON_1".
    const key = `${type}_${id}`;
    return key;
}
|
|
75
|
+
/**
|
|
76
|
+
* Tags PII spans in text and builds the PII map
|
|
77
|
+
*/
|
|
78
|
+
export function tagEntities(text, matches, policy) {
    // Nothing detected: return the text untouched with empty bookkeeping.
    if (matches.length === 0) {
        return { anonymizedText: text, entities: [], piiMap: new Map() };
    }
    // IDs are handed out in reading order, starting at 1.
    const numbered = [];
    const piiMap = new Map();
    const idByTypeAndText = new Map(); // "<type>:<text>" -> id already issued
    let lastIssuedId = 0;
    for (const span of sortSpansByPosition(matches)) {
        let id;
        if (!policy.reuseIdsForRepeatedPII) {
            id = ++lastIssuedId;
        }
        else {
            const reuseKey = `${span.type}:${span.text}`;
            const known = idByTypeAndText.get(reuseKey);
            if (known === undefined) {
                id = ++lastIssuedId;
                idByTypeAndText.set(reuseKey, id);
            }
            else {
                id = known;
            }
        }
        const entity = { ...span, id };
        numbered.push(entity);
        // Record the original text under its "<TYPE>_<id>" key.
        piiMap.set(createPIIMapKey(entity.type, entity.id), entity.text);
    }
    // Substitute from the last span to the first so the start/end offsets of
    // spans that have not been replaced yet stay valid.
    const fromEnd = numbered.slice().sort((left, right) => right.start - left.start);
    const anonymizedText = fromEnd.reduce((current, span) => {
        const tag = generateTag(span.type, span.id, span.semantic);
        return current.slice(0, span.start) + tag + current.slice(span.end);
    }, text);
    // Public entity records expose the matched text as `original`.
    const entities = numbered.map(({ type, id, start, end, confidence, source, text: original, semantic }) => ({
        type,
        id,
        start,
        end,
        confidence,
        source,
        original,
        semantic,
    }));
    return {
        anonymizedText,
        entities: sortSpansByPosition(entities),
        piiMap,
    };
}
|
|
146
|
+
/**
|
|
147
|
+
* Validates that a tag is well-formed
|
|
148
|
+
*/
|
|
149
|
+
export function isValidTag(tag) {
    // A tag is well-formed exactly when parseTag can decode it.
    const parsed = parseTag(tag);
    return parsed !== null;
}
|
|
152
|
+
/**
|
|
153
|
+
* Quote characters that might appear after translation
|
|
154
|
+
* Includes: standard quotes, smart quotes, German quotes, French quotes, etc.
|
|
155
|
+
*
|
|
156
|
+
* Unicode references:
|
|
157
|
+
* - \u0022 (") Standard double quote
|
|
158
|
+
* - \u0027 (') Standard single quote
|
|
159
|
+
* - \u0060 (`) Backtick
|
|
160
|
+
* - \u00AB («) Left guillemet
|
|
161
|
+
* - \u00BB (») Right guillemet
|
|
162
|
+
* - \u2018 (‘) Left single curly quote
|
|
163
|
+
* - \u2019 (’) Right single curly quote
|
|
164
|
+
* - \u201A (‚) Single low-9 quote
|
|
165
|
+
* - \u201C (“) Left double curly quote
|
|
166
|
+
* - \u201D (”) Right double curly quote
|
|
167
|
+
* - \u201E („) Double low-9 quote (German)
|
|
168
|
+
*/
|
|
169
|
+
const QUOTE_CHARS = "[\"'`\u00AB\u00BB\u2018\u2019\u201A\u201C\u201D\u201E]";
/**
 * Whitespace pattern including various unicode spaces
 * (\u00A0 no-break space, \u2000-\u200B en quad through zero-width space).
 * FLEXIBLE_WS allows zero or more, FLEXIBLE_WS_REQUIRED one or more.
 */
const FLEXIBLE_WS = `[\\s\\u00A0\\u2000-\\u200B]*`;
const FLEXIBLE_WS_REQUIRED = `[\\s\\u00A0\\u2000-\\u200B]+`;
/**
 * Builds patterns for fuzzy PII tag matching
 * Handles various translation artifacts and optional semantic attributes
 *
 * Returns two fresh global, case-insensitive RegExp objects:
 * - [0] type first: <PII type="X" [gender="Y"] [scope="Z"] id="N"/>
 *       Groups: type=1, gender=2, scope=3, id=4
 * - [1] id first:   <PII id="N" type="X" [gender="Y"] [scope="Z"]/>
 *       Groups: id=1, type=2, gender=3, scope=4
 *
 * The optional gender/scope groups of the id-first pattern come after the
 * mandatory ones, so code that only reads groups 1 and 2 is unaffected.
 * Without them a reordered tag that still carries semantic attributes would
 * match neither pattern.
 *
 * Opening and closing quotes are matched independently (any member of
 * QUOTE_CHARS on either side), since translation may change only one of them.
 */
function buildFuzzyTagPatterns() {
    // name = "VALUE" with flexible spacing and quotes; the value is captured.
    const attr = (name, valuePattern) => `${name}${FLEXIBLE_WS}=${FLEXIBLE_WS}${QUOTE_CHARS}(${valuePattern})${QUOTE_CHARS}`;
    // Same, but the whole attribute (and the whitespace before it) may be absent.
    const optionalAttr = (name) => `(?:${FLEXIBLE_WS}${attr(name, "\\w+")})?`;
    const typeAttr = attr("type", "[A-Z_]+");
    const idAttr = attr("id", "\\d+");
    const genderAttr = optionalAttr("gender");
    const scopeAttr = optionalAttr("scope");
    // Tag opener: "<", optional whitespace, "PII", mandatory whitespace.
    const tagOpen = `<${FLEXIBLE_WS}PII${FLEXIBLE_WS_REQUIRED}`;
    // Self-closing tag endings: />, / >, >, etc.
    const selfClosing = `${FLEXIBLE_WS}\\/?${FLEXIBLE_WS}>`;
    return [
        // type first with optional gender/scope
        new RegExp(`${tagOpen}${typeAttr}${genderAttr}${scopeAttr}${FLEXIBLE_WS_REQUIRED}${idAttr}${selfClosing}`, "gi"),
        // id first, optionally followed by gender/scope after the type
        new RegExp(`${tagOpen}${idAttr}${FLEXIBLE_WS_REQUIRED}${typeAttr}${genderAttr}${scopeAttr}${selfClosing}`, "gi"),
    ];
}
|
|
199
|
+
/**
|
|
200
|
+
* Extracts all PII tags from anonymized text using fuzzy matching
|
|
201
|
+
* Handles mangled tags that may occur after translation
|
|
202
|
+
*
|
|
203
|
+
* Translation can mangle tags by:
|
|
204
|
+
* - Changing quote types (" → “ or „ or « etc.)
|
|
205
|
+
* - Adding/removing whitespace
|
|
206
|
+
* - Changing case (type → Type, PII → pii)
|
|
207
|
+
* - Reordering attributes (id before type)
|
|
208
|
+
* - Modifying self-closing syntax (/> → / > or >)
|
|
209
|
+
*/
|
|
210
|
+
export function extractTags(anonymizedText) {
    const isPresent = (value) => value !== undefined && value !== "";
    const found = [];
    // Offsets that already produced a tag, so a later pattern cannot report
    // the same location twice.
    const claimedOffsets = new Set();
    for (const [patternIndex, pattern] of buildFuzzyTagPatterns().entries()) {
        if (pattern === undefined)
            continue;
        // Global regexes keep state between exec() calls; start from the top.
        pattern.lastIndex = 0;
        for (let hit = pattern.exec(anonymizedText); hit !== null; hit = pattern.exec(anonymizedText)) {
            if (claimedOffsets.has(hit.index)) {
                continue;
            }
            // Pattern 0 is type-first (type=1, gender=2, scope=3, id=4);
            // any other pattern is id-first (id=1, type=2) and contributes
            // no semantic attributes here.
            const typeFirst = patternIndex === 0;
            const typeStr = typeFirst ? hit[1] : hit[2];
            const idStr = typeFirst ? hit[4] : hit[1];
            const genderStr = typeFirst ? hit[2] : undefined;
            const scopeStr = typeFirst ? hit[3] : undefined;
            if (typeStr === undefined || idStr === undefined) {
                continue;
            }
            // Translation may have changed the case of the type name.
            const type = typeStr.toUpperCase();
            if (!Object.values(PIIType).includes(type)) {
                continue;
            }
            // Semantic values are lower-cased before they are validated and
            // stored; unrecognised values are dropped.
            let semantic;
            if (isPresent(genderStr) || isPresent(scopeStr)) {
                semantic = {};
                if (isPresent(genderStr)) {
                    const gender = genderStr.toLowerCase();
                    if (["male", "female", "neutral", "unknown"].includes(gender)) {
                        semantic.gender = gender;
                    }
                }
                if (isPresent(scopeStr)) {
                    const scope = scopeStr.toLowerCase();
                    if (["city", "country", "region", "unknown"].includes(scope)) {
                        semantic.scope = scope;
                    }
                }
            }
            found.push({
                type,
                id: parseInt(idStr, 10),
                position: hit.index,
                matchedText: hit[0],
                semantic,
            });
            claimedOffsets.add(hit.index);
        }
    }
    // Report tags in reading order regardless of which pattern found them.
    found.sort((left, right) => left.position - right.position);
    return found;
}
|
|
281
|
+
/**
|
|
282
|
+
* Extracts tags using strict matching (original behavior)
|
|
283
|
+
* Useful when you know tags haven't been mangled
|
|
284
|
+
* Supports optional gender and scope attributes
|
|
285
|
+
*/
|
|
286
|
+
export function extractTagsStrict(anonymizedText) {
    const isPresent = (value) => value !== undefined && value !== "";
    const found = [];
    // Pattern matches: <PII type="X" [gender="Y"] [scope="Z"] id="N"/>
    const canonicalTag = /<PII\s+type="([A-Z_]+)"(?:\s+gender="(\w+)")?(?:\s+scope="(\w+)")?\s+id="(\d+)"\s*\/>/g;
    for (let hit = canonicalTag.exec(anonymizedText); hit !== null; hit = canonicalTag.exec(anonymizedText)) {
        const [matchedText, typeStr, genderStr, scopeStr, idStr] = hit;
        if (typeStr === undefined || idStr === undefined) {
            continue;
        }
        // Skip type names that are not members of PIIType.
        if (!Object.values(PIIType).includes(typeStr)) {
            continue;
        }
        // Semantic values are taken verbatim (no case folding in strict
        // mode); unrecognised values are dropped.
        let semantic;
        if (isPresent(genderStr) || isPresent(scopeStr)) {
            semantic = {};
            if (isPresent(genderStr) && ["male", "female", "neutral", "unknown"].includes(genderStr)) {
                semantic.gender = genderStr;
            }
            if (isPresent(scopeStr) && ["city", "country", "region", "unknown"].includes(scopeStr)) {
                semantic.scope = scopeStr;
            }
        }
        found.push({
            type: typeStr,
            id: parseInt(idStr, 10),
            position: hit.index,
            matchedText,
            semantic,
        });
    }
    return found;
}
|
|
328
|
+
/**
|
|
329
|
+
* Counts entities by type
|
|
330
|
+
*/
|
|
331
|
+
export function countEntitiesByType(entities) {
    // Seed every PIIType value with zero so absent types still appear in the result.
    const tally = Object.values(PIIType).reduce((seeded, piiType) => {
        seeded[piiType] = 0;
        return seeded;
    }, {});
    for (const { type } of entities) {
        // `?? 0` covers a type that was not among the seeded values.
        const soFar = tally[type] ?? 0;
        tally[type] = soFar + 1;
    }
    return tally;
}
|
|
343
|
+
/**
|
|
344
|
+
* Rehydrates anonymized text using the PII map
|
|
345
|
+
* Uses fuzzy matching to handle tags that may have been mangled by translation
|
|
346
|
+
*
|
|
347
|
+
* @param anonymizedText - Text containing PII tags (possibly mangled)
|
|
348
|
+
* @param piiMap - Map of PII keys to original values
|
|
349
|
+
* @param strict - If true, use strict matching (original behavior). Default: false
|
|
350
|
+
* @returns Text with PII tags replaced by original values
|
|
351
|
+
*/
|
|
352
|
+
export function rehydrate(anonymizedText, piiMap, strict = false) {
    // Strict mode only recognises canonical tags; the default tolerates
    // tags mangled by translation.
    const locateTags = strict ? extractTagsStrict : extractTags;
    // Work from the last tag to the first so the positions of tags that
    // have not been substituted yet remain valid.
    const fromEnd = locateTags(anonymizedText).sort((left, right) => right.position - left.position);
    return fromEnd.reduce((restored, tag) => {
        const original = piiMap.get(createPIIMapKey(tag.type, tag.id));
        if (original === undefined) {
            // No entry for this tag: leave it in the text as it is.
            return restored;
        }
        // Cut out exactly the text that was matched, which for a mangled
        // tag can differ in length from the canonical form.
        const tagEnd = tag.position + tag.matchedText.length;
        return restored.slice(0, tag.position) + original + restored.slice(tagEnd);
    }, anonymizedText);
}
|
|
374
|
+
//# sourceMappingURL=tagger.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tagger.js","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,GAKR,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AA+B1D;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CACzB,IAAa,EACb,EAAU,EACV,QAA6B;IAE7B,IAAI,KAAK,GAAG,SAAS,IAAI,GAAG,CAAC;IAE7B,oDAAoD;IACpD,IAAI,QAAQ,EAAE,MAAM,IAAI,QAAQ,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;QACtD,KAAK,IAAI,YAAY,QAAQ,CAAC,MAAM,GAAG,CAAC;IAC1C,CAAC;IACD,IAAI,QAAQ,EAAE,KAAK,IAAI,QAAQ,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;QACpD,KAAK,IAAI,WAAW,QAAQ,CAAC,KAAK,GAAG,CAAC;IACxC,CAAC;IAED,KAAK,IAAI,QAAQ,EAAE,GAAG,CAAC;IAEvB,OAAO,QAAQ,KAAK,IAAI,CAAC;AAC3B,CAAC;AAWD;;;;;;;;;GASG;AACH,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,oEAAoE;IACpE,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CACrB,yFAAyF,CAC1F,CAAC;IAEF,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,CAAC,GAAG,KAAK,CAAC;IACtD,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QACjD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,IAAI,GAAG,OAAkB,CAAC;IAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAE/B,mCAAmC;IACnC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,uCAAuC;IACvC,IAAI,QAAwC,CAAC;IAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;QAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;QACD,QAAQ,GAAG,EAAE,CAAC;QACd,IACE,SAAS,KAAK,SAAS;YACvB,SAAS,KAAK,EAAE;YAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;YACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;QAC9D,CAAC;QACD,IACE,QAAQ,KAAK,SAAS;YACtB,QAAQ,KAAK,EAAE;YACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;YACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAAC,IAAa,EAAE,EAAU;IACvD,OAAO,GAAG,IAAI,IAAI,EAAE,EAAE,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAoB,EACpB,MAA2B;IAE3B,I
AAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO;YACL,cAAc,EAAE,IAAI;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,IAAI,GAAG,EAAE;SAClB,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,MAAM,eAAe,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAErD,aAAa;IACb,MAAM,eAAe,GAAsC,EAAE,CAAC;IAC9D,IAAI,MAAM,GAAG,CAAC,CAAC;IAEf,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,aAAa;IAEzD,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,EAAU,CAAC;QAEf,IAAI,MAAM,CAAC,sBAAsB,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACrC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,EAAE,GAAG,UAAU,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,EAAE,GAAG,MAAM,EAAE,CAAC;gBACd,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,MAAM,EAAE,CAAC;QAChB,CAAC;QAED,eAAe,CAAC,IAAI,CAAC,EAAE,GAAG,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,gBAAgB;IAChB,MAAM,MAAM,GAAc,IAAI,GAAG,EAAE,CAAC;IACpC,KAAK,MAAM,MAAM,IAAI,eAAe,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;QACpD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IAED,oDAAoD;IACpD,0DAA0D;IAC1D,MAAM,gBAAgB,GAAG,CAAC,GAAG,eAAe,CAAC,CAAC,IAAI,CAChD,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAC5B,CAAC;IAEF,uBAAuB;IACvB,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACtC,MAAM,GAAG,GAAG,WAAW,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;QACjE,cAAc;YACZ,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC;gBACrC,GAAG;gBACH,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IACrC,CAAC;IAED,iDAAiD;IACjD,MAAM,QAAQ,GAAqB,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,IAAI;QAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;KACrB,CAAC,CAAC,CAAC;IAEJ,OAAO;QACL,cAAc;QACd,QAAQ,EAAE,mBAAmB,
CAAC,QAAQ,CAAC;QACvC,MAAM;KACP,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,OAAO,QAAQ,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC;AAChC,CAAC;AAeD;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,WAAW,GAAG,wDAAwD,CAAC;AAE7E;;GAEG;AACH,MAAM,WAAW,GAAG,8BAA8B,CAAC;AACnD,MAAM,oBAAoB,GAAG,8BAA8B,CAAC;AAE5D;;;GAGG;AACH,SAAS,qBAAqB;IAC5B,2EAA2E;IAC3E,MAAM,QAAQ,GAAG,OAAO,WAAW,IAAI,WAAW,GAAG,WAAW,YAAY,WAAW,EAAE,CAAC;IAC1F,uEAAuE;IACvE,MAAM,MAAM,GAAG,KAAK,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,EAAE,CAAC;IACnF,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,WAAW,SAAS,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAC9G,2BAA2B;IAC3B,MAAM,SAAS,GAAG,MAAM,WAAW,QAAQ,WAAW,IAAI,WAAW,GAAG,WAAW,SAAS,WAAW,IAAI,CAAC;IAE5G,6CAA6C;IAC7C,MAAM,WAAW,GAAG,GAAG,WAAW,OAAO,WAAW,GAAG,CAAC;IAExD,OAAO;QACL,qFAAqF;QACrF,0CAA0C;QAC1C,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,QAAQ,GAAG,UAAU,GAAG,SAAS,GAAG,oBAAoB,GAAG,MAAM,GAAG,WAAW,EAAE,EAC7H,IAAI,CACL;QACD,mCAAmC;QACnC,uBAAuB;QACvB,IAAI,MAAM,CACR,IAAI,WAAW,MAAM,oBAAoB,GAAG,MAAM,GAAG,oBAAoB,GAAG,QAAQ,GAAG,WAAW,EAAE,EACpG,IAAI,CACL;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,WAAW,CAAC,cAAsB;IAChD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,MAAM,QAAQ,GAAG,qBAAqB,EAAE,CAAC;IAEzC,sFAAsF;IACtF,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAAU,CAAC;IAE3C,KAAK,IAAI,YAAY,GAAG,CAAC,EAAE,YAAY,GAAG,QAAQ,CAAC,MAAM,EAAE,YAAY,EAAE,EAAE,CAAC;QAC1E,MAAM,OAAO,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;QACvC,IAAI,OAAO,KAAK,SAAS;YAAE,SAAS;QAEpC,IAAI,KAA6B,CAAC;QAClC,mCAAmC;QACnC,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;QAEtB,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACvD,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,SAAS,CAAC,4CAA4C;YACxD,CAAC;YAED,2EAA2E;YAC3E,6FAA6F;YAC7F,6CAA6C;YAC7C,IAAI,OAA2B,CAAC;YAChC,IAAI,KAAyB,CAAC;YAC9B,IAAI,SAA6B,CAAC;YAClC,IAAI,QAA4B,CAAC;YAEjC,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACnB,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACpB,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,KAAK
,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACjB,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;YAED,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;gBACjD,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAa,CAAC;gBAC9C,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;gBAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC1C,uCAAuC;oBACvC,IAAI,QAAwC,CAAC;oBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;wBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;wBACD,QAAQ,GAAG,EAAE,CAAC;wBACd,IACE,SAAS,KAAK,SAAS;4BACvB,SAAS,KAAK,EAAE;4BAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,SAAS,CAAC,WAAW,EAAE,CACxB,EACD,CAAC;4BACD,QAAQ,CAAC,MAAM;gCACb,SAAS,CAAC,WAAW,EAAkC,CAAC;wBAC5D,CAAC;wBACD,IACE,QAAQ,KAAK,SAAS;4BACtB,QAAQ,KAAK,EAAE;4BACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAC/C,QAAQ,CAAC,WAAW,EAAE,CACvB,EACD,CAAC;4BACD,QAAQ,CAAC,KAAK;gCACZ,QAAQ,CAAC,WAAW,EAAiC,CAAC;wBAC1D,CAAC;oBACH,CAAC;oBAED,IAAI,CAAC,IAAI,CAAC;wBACR,IAAI;wBACJ,EAAE;wBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;wBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;wBACrB,QAAQ;qBACT,CAAC,CAAC;oBACH,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBACpC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,cAAsB;IACtD,MAAM,IAAI,GAAmB,EAAE,CAAC;IAChC,mEAAmE;IACnE,MAAM,UAAU,GACd,wFAAwF,CAAC;IAE3F,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1D,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACzB,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEvB,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACjD,MAAM,IAAI,GAAG,OAAkB,CAAC;YAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1C,uCAAuC;gBACvC,IAAI,QAAwC,CA
AC;gBAC7C,IACE,CAAC,SAAS,KAAK,SAAS,IAAI,SAAS,KAAK,EAAE,CAAC;oBAC7C,CAAC,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,EAC3C,CAAC;oBACD,QAAQ,GAAG,EAAE,CAAC;oBACd,IACE,SAAS,KAAK,SAAS;wBACvB,SAAS,KAAK,EAAE;wBAChB,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,EAC5D,CAAC;wBACD,QAAQ,CAAC,MAAM,GAAG,SAAyC,CAAC;oBAC9D,CAAC;oBACD,IACE,QAAQ,KAAK,SAAS;wBACtB,QAAQ,KAAK,EAAE;wBACf,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC3D,CAAC;wBACD,QAAQ,CAAC,KAAK,GAAG,QAAuC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC,IAAI,CAAC;oBACR,IAAI;oBACJ,EAAE;oBACF,QAAQ,EAAE,KAAK,CAAC,KAAK;oBACrB,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC;oBACrB,QAAQ;iBACT,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CACjC,QAA0B;IAE1B,MAAM,MAAM,GAA4B,EAA6B,CAAC;IAEtE,4BAA4B;IAC5B,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;IAED,iBAAiB;IACjB,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,SAAS,CACvB,cAAsB,EACtB,MAAiB,EACjB,SAAkB,KAAK;IAEvB,IAAI,MAAM,GAAG,cAAc,CAAC;IAC5B,MAAM,IAAI,GAAG,MAAM;QACjB,CAAC,CAAC,iBAAiB,CAAC,cAAc,CAAC;QACnC,CAAC,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC;IAEhC,8CAA8C;IAC9C,0DAA0D;IAC1D,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,KAAK,MAAM,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC;QACvD,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjC,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,qDAAqD;YACrD,6EAA6E;YAC7E,MAAM;gBACJ,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC;oBACzB,QAAQ;oBACR,MAAM,CAAC,KAAK,CAAC,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Title Extractor
|
|
3
|
+
* Extracts and strips honorific titles/prefixes from PERSON entities
|
|
4
|
+
* so that titles remain visible in anonymized text for translation.
|
|
5
|
+
*
|
|
6
|
+
* Supported languages: ar, de, en, es, fr, it, lv, nl, pt, zh
|
|
7
|
+
*/
|
|
8
|
+
import { SpanMatch, SemanticAttributes } from "../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* Title extraction result
|
|
11
|
+
*/
|
|
12
|
+
export interface TitleExtractionResult {
|
|
13
|
+
/** The extracted title (e.g., "Dr.", "Mr.") or undefined if no title */
|
|
14
|
+
title?: string;
|
|
15
|
+
/** The name without the title */
|
|
16
|
+
nameWithoutTitle: string;
|
|
17
|
+
/** Length of the title prefix including its trailing space, i.e. the character offset where the name starts (0 when there is no title) */
|
|
18
|
+
titleLength: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Extracts a title from the beginning of a name
|
|
22
|
+
*
|
|
23
|
+
* @param name - Full name potentially starting with a title
|
|
24
|
+
* @returns Extraction result with title, remaining name, and offset
|
|
25
|
+
*
|
|
26
|
+
* @example
|
|
27
|
+
* extractTitle("Dr. John Smith") // { title: "Dr.", nameWithoutTitle: "John Smith", titleLength: 4 }
|
|
28
|
+
* extractTitle("John Smith") // { title: undefined, nameWithoutTitle: "John Smith", titleLength: 0 }
|
|
29
|
+
*/
|
|
30
|
+
export declare function extractTitle(name: string): TitleExtractionResult;
|
|
31
|
+
/**
|
|
32
|
+
* Extended semantic attributes including title
|
|
33
|
+
*/
|
|
34
|
+
export interface SemanticAttributesWithTitle extends SemanticAttributes {
|
|
35
|
+
/** Extracted title prefix (e.g., "Dr.", "Mrs.") */
|
|
36
|
+
title?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Processes PERSON spans to extract titles.
|
|
40
|
+
* Titles are removed from the span and stored in semantic attributes.
|
|
41
|
+
* The span boundaries are adjusted so the title stays outside the span and remains visible in the anonymized text.
|
|
42
|
+
*
|
|
43
|
+
* @param spans - Array of detected PII spans
|
|
44
|
+
* @param originalText - The original text (needed to verify span boundaries)
|
|
45
|
+
* @returns Array of spans with titles extracted from PERSON entities
|
|
46
|
+
*/
|
|
47
|
+
export declare function extractTitlesFromSpans(spans: SpanMatch[], originalText: string): SpanMatch[];
|
|
48
|
+
/**
|
|
49
|
+
* Gets all supported titles for a specific language
|
|
50
|
+
*/
|
|
51
|
+
export declare function getTitlesForLanguage(langCode: "ar" | "de" | "en" | "es" | "fr" | "it" | "lv" | "nl" | "pt" | "zh"): string[];
|
|
52
|
+
/**
|
|
53
|
+
* Gets all supported titles across all languages
|
|
54
|
+
*/
|
|
55
|
+
export declare function getAllTitles(): string[];
|
|
56
|
+
/**
|
|
57
|
+
* Checks if a string starts with a known title
|
|
58
|
+
*/
|
|
59
|
+
export declare function startsWithTitle(text: string): boolean;
|
|
60
|
+
/**
|
|
61
|
+
* Checks if a text consists entirely of a title (with optional punctuation)
|
|
62
|
+
*/
|
|
63
|
+
export declare function isOnlyTitle(text: string): boolean;
|
|
64
|
+
/**
|
|
65
|
+
* Merges adjacent PERSON spans when one is a title
|
|
66
|
+
*
|
|
67
|
+
* This fixes issues where NER models split "Mrs. Smith" into two entities:
|
|
68
|
+
* - PERSON: "Mrs" (or "Mrs.")
|
|
69
|
+
* - PERSON: "Smith"
|
|
70
|
+
*
|
|
71
|
+
* After merging: PERSON: "Mrs. Smith"
|
|
72
|
+
*
|
|
73
|
+
* @param spans - Array of detected PII spans
|
|
74
|
+
* @param originalText - The original text
|
|
75
|
+
* @param maxGap - Maximum characters between spans to consider them adjacent (default: 3)
|
|
76
|
+
* @returns Array of spans with adjacent title+name PERSON entities merged
|
|
77
|
+
*/
|
|
78
|
+
export declare function mergeAdjacentTitleSpans(spans: SpanMatch[], originalText: string, maxGap?: number): SpanMatch[];
|
|
79
|
+
//# sourceMappingURL=title-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"title-extractor.d.ts","sourceRoot":"","sources":["../../src/pipeline/title-extractor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EACL,SAAS,EAET,kBAAkB,EAEnB,MAAM,mBAAmB,CAAC;AAE3B;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,wEAAwE;IACxE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IACzB,mEAAmE;IACnE,WAAW,EAAE,MAAM,CAAC;CACrB;AAqjBD;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,qBAAqB,CAyBhE;AAED;;GAEG;AACH,MAAM,WAAW,2BAA4B,SAAQ,kBAAkB;IACrE,mDAAmD;IACnD,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAiEb;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,IAAI,GAC5E,MAAM,EAAE,CAcV;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,MAAM,EAAE,CAEvC;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAGrD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CA0BjD;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,MAAU,GACjB,SAAS,EAAE,CAuFb"}
|