@elanlanguages/bridge-anonymization 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +382 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +100 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +163 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +173 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +294 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +102 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +253 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +240 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +45 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +99 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +140 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +341 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +9 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +9 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +238 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +74 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +169 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +134 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +69 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/package.json +62 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WordPiece Tokenizer
|
|
3
|
+
* Tokenizes text into subword tokens while maintaining character offset mapping
|
|
4
|
+
* Compatible with BERT-style models
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Default tokenizer configuration for BERT-style models
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Default tokenizer configuration for BERT-style models.
 * Mirrors the conventional uncased bert-base special tokens and limits.
 */
export const DEFAULT_TOKENIZER_CONFIG = {
    // Maximum sequence length in tokens, including [CLS]/[SEP].
    maxLength: 512,
    // Special token strings expected in the vocabulary.
    unkToken: '[UNK]',
    clsToken: '[CLS]',
    sepToken: '[SEP]',
    padToken: '[PAD]',
    maskToken: '[MASK]',
    // Text preprocessing flags (uncased-model defaults).
    doLowerCase: true,
    stripAccents: true,
};
|
|
19
|
+
/**
|
|
20
|
+
* WordPiece Tokenizer implementation
|
|
21
|
+
*/
|
|
22
|
+
export class WordPieceTokenizer {
    // Token string -> integer id, as loaded from the model vocab.
    vocab;
    // Integer id -> token string (inverse of `vocab`, built in constructor).
    inverseVocab;
    // Effective configuration: DEFAULT_TOKENIZER_CONFIG merged with overrides.
    config;
    // Special token IDs
    unkId;
    clsId;
    sepId;
    padId;
    /**
     * @param vocab - Map from token string to integer id.
     * @param config - Partial overrides of DEFAULT_TOKENIZER_CONFIG.
     */
    constructor(vocab, config = {}) {
        this.vocab = vocab;
        this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
        // Build inverse vocab
        this.inverseVocab = new Map();
        for (const [token, id] of vocab) {
            this.inverseVocab.set(id, token);
        }
        // Get special token IDs. The numeric fallbacks (0/101/102) are the
        // conventional bert-base ids, used only when the vocab is missing
        // the corresponding special token.
        this.unkId = this.vocab.get(this.config.unkToken) ?? 0;
        this.clsId = this.vocab.get(this.config.clsToken) ?? 101;
        this.sepId = this.vocab.get(this.config.sepToken) ?? 102;
        this.padId = this.vocab.get(this.config.padToken) ?? 0;
    }
    /**
     * Tokenizes text into tokens with offset tracking.
     *
     * The result is wrapped as [CLS] ... [SEP] and truncated to
     * `config.maxLength` tokens ([SEP] is re-appended after truncation).
     * `tokenToCharSpan[i]` holds the [start, end) character span of token i,
     * or null for special tokens.
     *
     * NOTE(review): word spans are computed on the preprocessed (lowercased,
     * accent-stripped) text but used as offsets into the original text. This
     * assumes preprocessing is length-preserving, which can fail for
     * pre-decomposed input (e.g. "e" + U+0301) — confirm inputs are NFC
     * upstream.
     */
    tokenize(text) {
        const tokens = [];
        const tokenToCharSpan = [];
        // Add [CLS] token
        tokens.push({
            id: this.clsId,
            token: this.config.clsToken,
            start: 0,
            end: 0,
            isContinuation: false,
            isSpecial: true,
        });
        tokenToCharSpan.push(null);
        // Preprocess text
        const processedText = this.preprocess(text);
        // Split into words by whitespace
        const wordSpans = this.splitIntoWords(processedText, text);
        // Tokenize each word
        for (const { word, start, end } of wordSpans) {
            const wordTokens = this.tokenizeWord(word, start, end);
            tokens.push(...wordTokens);
            for (const t of wordTokens) {
                tokenToCharSpan.push([t.start, t.end]);
            }
        }
        // Add [SEP] token
        tokens.push({
            id: this.sepId,
            token: this.config.sepToken,
            start: text.length,
            end: text.length,
            isContinuation: false,
            isSpecial: true,
        });
        tokenToCharSpan.push(null);
        // Truncate if necessary (drop overflow, then re-append [SEP] so the
        // sequence always ends with a separator)
        const maxTokens = this.config.maxLength;
        if (tokens.length > maxTokens) {
            tokens.length = maxTokens - 1;
            tokenToCharSpan.length = maxTokens - 1;
            // Add [SEP] at end
            tokens.push({
                id: this.sepId,
                token: this.config.sepToken,
                start: text.length,
                end: text.length,
                isContinuation: false,
                isSpecial: true,
            });
            tokenToCharSpan.push(null);
        }
        // Build arrays. No padding is applied here, so the attention mask is
        // all 1s; single-sequence input, so token type ids are all 0.
        const inputIds = tokens.map((t) => t.id);
        const attentionMask = tokens.map(() => 1);
        const tokenTypeIds = tokens.map(() => 0);
        return {
            tokens,
            inputIds,
            attentionMask,
            tokenTypeIds,
            tokenToCharSpan,
        };
    }
    /**
     * Preprocesses text (lowercase, accent stripping) per the config flags.
     */
    preprocess(text) {
        let processed = text;
        if (this.config.doLowerCase) {
            processed = processed.toLowerCase();
        }
        if (this.config.stripAccents) {
            processed = this.stripAccents(processed);
        }
        return processed;
    }
    /**
     * Strips accents by NFD-decomposing and removing combining diacritical
     * marks (U+0300–U+036F).
     */
    stripAccents(text) {
        return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
    }
    /**
     * Splits text into whitespace-delimited words while tracking character
     * offsets. Offsets are taken from `processedText`; `originalText` is
     * currently unused (see the length-preservation NOTE in tokenize()).
     */
    splitIntoWords(processedText, originalText) {
        const words = [];
        // Split on whitespace and punctuation while keeping track of positions
        const wordPattern = /\S+/g;
        let match;
        while ((match = wordPattern.exec(processedText)) !== null) {
            // Find corresponding position in original text
            // Since we may have lowercased, we need to map positions
            const start = match.index;
            const end = start + match[0].length;
            words.push({
                word: match[0],
                start,
                end,
            });
        }
        return words;
    }
    /**
     * Tokenizes a single word using the WordPiece algorithm. Punctuation is
     * first split into standalone pieces; each piece is looked up directly,
     * falling back to the longest matching vocab prefix.
     *
     * NOTE(review): when a piece is not in the vocab, only the longest
     * matching prefix is emitted and the remainder of the piece is skipped;
     * canonical WordPiece would keep tokenizing the remainder as '##'
     * continuations — confirm this is intended.
     */
    tokenizeWord(word, startOffset, endOffset) {
        const tokens = [];
        // Handle punctuation separately
        const subwords = this.splitWordIntoPieces(word);
        let currentOffset = startOffset;
        for (let i = 0; i < subwords.length; i++) {
            let subword = subwords[i];
            const isContinuation = i > 0;
            // For continuation tokens, add ## prefix for vocab lookup
            const vocabKey = isContinuation ? '##' + subword : subword;
            // Look up in vocabulary
            let tokenId = this.vocab.get(vocabKey);
            // If not found, try to find longest matching prefix
            if (tokenId === undefined) {
                const { id, token } = this.findLongestMatch(subword, isContinuation);
                tokenId = id;
                subword = token;
            }
            const tokenLength = subword.length;
            // Clamp to the word's end so spans never spill past the word.
            const tokenEnd = Math.min(currentOffset + tokenLength, endOffset);
            tokens.push({
                id: tokenId,
                token: isContinuation ? '##' + subword : subword,
                start: currentOffset,
                end: tokenEnd,
                isContinuation,
                isSpecial: false,
            });
            currentOffset = tokenEnd;
        }
        return tokens;
    }
    /**
     * Splits a word into pieces: each punctuation character becomes its own
     * piece; runs of non-punctuation characters stay together.
     */
    splitWordIntoPieces(word) {
        const pieces = [];
        let current = '';
        for (const char of word) {
            if (this.isPunctuation(char)) {
                if (current.length > 0) {
                    pieces.push(current);
                    current = '';
                }
                pieces.push(char);
            }
            else {
                current += char;
            }
        }
        if (current.length > 0) {
            pieces.push(current);
        }
        return pieces;
    }
    /**
     * Checks if a character is punctuation (ASCII punctuation ranges plus
     * the Unicode General Punctuation and CJK punctuation blocks).
     */
    isPunctuation(char) {
        const code = char.charCodeAt(0);
        // ASCII punctuation and some Unicode punctuation
        return ((code >= 33 && code <= 47) ||
            (code >= 58 && code <= 64) ||
            (code >= 91 && code <= 96) ||
            (code >= 123 && code <= 126) ||
            /[\u2000-\u206F]/.test(char) || // General punctuation
            /[\u3000-\u303F]/.test(char) // CJK punctuation
        );
    }
    /**
     * Finds the longest prefix of `word` present in the vocabulary (with a
     * '##' prefix prepended for continuation pieces). Falls back to [UNK]
     * covering the whole word when no prefix matches.
     */
    findLongestMatch(word, isContinuation) {
        const prefix = isContinuation ? '##' : '';
        // Try progressively shorter substrings
        for (let end = word.length; end > 0; end--) {
            const subword = word.slice(0, end);
            const vocabKey = prefix + subword;
            const id = this.vocab.get(vocabKey);
            if (id !== undefined) {
                return { id, token: subword };
            }
        }
        // Fall back to unknown token
        return { id: this.unkId, token: word };
    }
    /**
     * Decodes token IDs back to text. Special tokens are dropped, '##'
     * continuations are glued to the preceding token, and a single space is
     * inserted before every new word — including punctuation, so output is
     * a rough detokenization, not an exact inverse of tokenize().
     */
    decode(tokenIds) {
        const tokens = [];
        for (const id of tokenIds) {
            const token = this.inverseVocab.get(id);
            if (token === undefined)
                continue;
            // Skip special tokens
            if (token === this.config.clsToken ||
                token === this.config.sepToken ||
                token === this.config.padToken) {
                continue;
            }
            // Handle continuation tokens
            if (token.startsWith('##')) {
                tokens.push(token.slice(2));
            }
            else {
                if (tokens.length > 0) {
                    tokens.push(' ');
                }
                tokens.push(token);
            }
        }
        return tokens.join('');
    }
    /**
     * Gets vocabulary size
     */
    get vocabSize() {
        return this.vocab.size;
    }
    /**
     * Gets a token ID by string (undefined when absent).
     */
    getTokenId(token) {
        return this.vocab.get(token);
    }
    /**
     * Gets a token string by ID (undefined when absent).
     */
    getToken(id) {
        return this.inverseVocab.get(id);
    }
}
|
|
287
|
+
/**
|
|
288
|
+
* Loads vocabulary from a text file (one token per line)
|
|
289
|
+
*/
|
|
290
|
+
/**
 * Loads a WordPiece vocabulary from a text file (one token per line).
 *
 * @param path - Filesystem path to the vocab file (UTF-8).
 * @returns Map from token string to its zero-based line index (token id).
 */
export async function loadVocabFromFile(path) {
    // Dynamic import keeps this module loadable in environments without
    // Node's fs as long as this function is never called.
    const fs = await import('fs/promises');
    const raw = await fs.readFile(path, 'utf-8');
    return parseVocab(raw);
}
|
|
295
|
+
/**
|
|
296
|
+
* Parses vocabulary from string content
|
|
297
|
+
*/
|
|
298
|
+
/**
 * Parses vocabulary content into a token -> id map.
 *
 * Each non-empty line (after trimming) becomes a token whose id is its
 * zero-based line index. Blank lines keep their index but add no entry,
 * so ids stay aligned with the file's line numbers.
 */
export function parseVocab(content) {
    const vocab = new Map();
    content.split('\n').forEach((line, index) => {
        const token = line?.trim();
        if (token !== undefined && token.length > 0) {
            vocab.set(token, index);
        }
    });
    return vocab;
}
|
|
309
|
+
/**
|
|
310
|
+
* Creates a minimal vocabulary for testing
|
|
311
|
+
*/
|
|
312
|
+
/**
 * Creates a minimal fixed vocabulary for testing.
 * Ids are assigned by array position, so the five special tokens occupy
 * ids 0–4, matching what WordPieceTokenizer expects for [PAD]/[UNK]/etc.
 */
export function createTestVocab() {
    const entries = [
        // Special tokens (ids 0-4)
        '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]',
        // Common words
        'the', 'a', 'is', 'was',
        // Names / places
        'john', 'smith', 'berlin', 'germany',
        // Continuation pieces
        '##s', '##ed', '##ing',
        // Punctuation
        ',', '.', '!', '?',
    ];
    return new Map(entries.map((token, id) => [token, id]));
}
|
|
341
|
+
//# sourceMappingURL=tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AA8DH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,SAAS,EAAE,QAAQ;IACnB,WAAW,EAAE,IAAI;IACjB,YAAY,EAAE,IAAI;CACnB,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IAEhC,oBAAoB;IACZ,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IAEtB,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QAE5C,iCAAiC;QACjC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;QAE3D,qBAAqB;QACrB,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,SAAS,EAAE,CAAC;YAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;YAC3B,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;
gBAC3B,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACzC,CAAC;QACH,CAAC;QAED,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,mBAAmB;YACnB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,IAAY;QAC7B,IAAI,SAAS,GAAG,IAAI,CAAC;QAErB,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;YAC5B,SAAS,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC;YAC7B,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY;QAC/B,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,aAAqB,EACrB,YAAoB;QAEpB,MAAM,KAAK,GAAwD,EAAE,CAAC;QAEtE,uEAAuE;QACvE,MAAM,WAAW,GAAG,MAAM,CAAC;QAC3B,IAAI,KAA6B,CAAC;QAElC,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC1D,+CAA+C;YAC/C,yDAAyD;YACzD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,GAAG,GAAG,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YAEpC,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,
EAAE,KAAK,CAAC,CAAC,CAAC;gBACd,KAAK;gBACL,GAAG;aACJ,CAAC,CAAC;QACL,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY,EAAE,WAAmB,EAAE,SAAiB;QACvE,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,aAAa,GAAG,WAAW,CAAC;QAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,IAAI,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;YAC3B,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;YAE7B,0DAA0D;YAC1D,MAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAE3D,wBAAwB;YACxB,IAAI,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAEvC,oDAAoD;YACpD,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC1B,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;gBACrE,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,GAAG,KAAK,CAAC;YAClB,CAAC;YAED,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,aAAa,GAAG,WAAW,EAAE,SAAS,CAAC,CAAC;YAElE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,OAAO;gBACX,KAAK,EAAE,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO;gBAChD,KAAK,EAAE,aAAa;gBACpB,GAAG,EAAE,QAAQ;gBACb,cAAc;gBACd,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YAEH,aAAa,GAAG,QAAQ,CAAC;QAC3B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,IAAY;QACtC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;QAEjB,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACrB,OAAO,GAAG,EAAE,CAAC;gBACf,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACpB,CAAC;iBAAM,CAAC;gBACN,OAAO,IAAI,IAAI,CAAC;YAClB,CAAC;QACH,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY;QAChC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAChC,iDAAiD;QACjD,OAAO,CACL,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,C
AAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,GAAG,CAAC;YAC5B,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,sBAAsB;YACtD,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,kBAAkB;SAChD,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,gBAAgB,CACtB,IAAY,EACZ,cAAuB;QAEvB,MAAM,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1C,uCAAuC;QACvC,KAAK,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC;YAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACnC,MAAM,QAAQ,GAAG,MAAM,GAAG,OAAO,CAAC;YAElC,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACpC,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;YAChC,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,OAAO,EAAE,EAAE,EAAE,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACzC,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAElC,sBAAsB;YACtB,IACE,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ,EAC9B,CAAC;gBACD,SAAS;YACX,CAAC;YAED,6BAA6B;YAC7B,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAAY;IAClD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,EAAE,OAA
O,CAAC,CAAC;IACjD,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,OAAO;QACP,OAAO;QACP,OAAO;QACP,OAAO;QACP,QAAQ;QACR,KAAK;QACL,GAAG;QACH,IAAI;QACJ,KAAK;QACL,MAAM;QACN,OAAO;QACP,QAAQ;QACR,SAAS;QACT,KAAK;QACL,MAAM;QACN,OAAO;QACP,GAAG;QACH,GAAG;QACH,GAAG;QACH,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Pre-normalization
 * Normalizes text before PII detection
 */
/**
 * Pre-normalization options
 */
export interface PrenormalizeOptions {
    /** Normalize line endings to \n (\r\n and lone \r both become \n) */
    normalizeLineEndings: boolean;
    /** Apply Unicode NFKC normalization (may change string length, invalidating offsets; off by default) */
    unicodeNormalize: boolean;
    /** Trim leading/trailing whitespace (shifts offsets; off by default) */
    trim: boolean;
}
/**
 * Default pre-normalization options
 */
export declare const DEFAULT_PRENORMALIZE_OPTIONS: PrenormalizeOptions;
/**
 * Pre-normalizes text for PII detection
 * Note: This currently only normalizes line endings by default, to preserve character offsets
 *
 * @param text - Original input text
 * @param options - Normalization options (merged over DEFAULT_PRENORMALIZE_OPTIONS)
 * @returns Normalized text
 */
export declare function prenormalize(text: string, options?: Partial<PrenormalizeOptions>): string;
/**
 * Calculates offset adjustments when text is modified
 * Used when prenormalization changes text length
 */
export interface OffsetMapping {
    /** Map from original offset to normalized offset */
    toNormalized: (originalOffset: number) => number;
    /** Map from normalized offset to original offset */
    toOriginal: (normalizedOffset: number) => number;
}
/**
 * Creates an identity offset mapping (no changes)
 */
export declare function createIdentityMapping(): OffsetMapping;
/**
 * Creates offset mapping for line ending normalization
 * This handles \r\n -> \n replacement (each pair removes one character)
 */
export declare function createLineEndingMapping(originalText: string): OffsetMapping;
|
|
48
|
+
//# sourceMappingURL=prenormalize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prenormalize.d.ts","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,oBAAoB,EAAE,OAAO,CAAC;IAC9B,uCAAuC;IACvC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,uCAAuC;IACvC,IAAI,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,eAAO,MAAM,4BAA4B,EAAE,mBAI1C,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,OAAO,CAAC,mBAAmB,CAAM,GACzC,MAAM,CAqBR;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC5B,oDAAoD;IACpD,YAAY,EAAE,CAAC,cAAc,EAAE,MAAM,KAAK,MAAM,CAAC;IACjD,oDAAoD;IACpD,UAAU,EAAE,CAAC,gBAAgB,EAAE,MAAM,KAAK,MAAM,CAAC;CAClD;AAED;;GAEG;AACH,wBAAgB,qBAAqB,IAAI,aAAa,CAKrD;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,YAAY,EAAE,MAAM,GAAG,aAAa,CA4C3E"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-normalization
|
|
3
|
+
* Normalizes text before PII detection
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Default pre-normalization options
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Default pre-normalization options.
 * Only line-ending normalization is enabled: NFKC and trimming can change
 * the string's length/positions and therefore invalidate character offsets.
 */
export const DEFAULT_PRENORMALIZE_OPTIONS = {
    normalizeLineEndings: true,
    unicodeNormalize: false, // Disabled by default to preserve offsets
    trim: false, // Disabled by default to preserve offsets
};
|
|
13
|
+
/**
|
|
14
|
+
* Pre-normalizes text for PII detection
|
|
15
|
+
* Note: This currently only normalizes line endings to preserve character offsets
|
|
16
|
+
*
|
|
17
|
+
* @param text - Original input text
|
|
18
|
+
* @param options - Normalization options
|
|
19
|
+
* @returns Normalized text
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Pre-normalizes text for PII detection.
 * By default only line endings are normalized, which keeps most character
 * offsets stable (CRLF collapse is handled by createLineEndingMapping).
 *
 * @param text - Original input text.
 * @param options - Partial overrides of DEFAULT_PRENORMALIZE_OPTIONS.
 * @returns Normalized text.
 */
export function prenormalize(text, options = {}) {
    const settings = { ...DEFAULT_PRENORMALIZE_OPTIONS, ...options };
    let normalized = text;
    if (settings.normalizeLineEndings) {
        // Replace \r\n first so a CRLF pair becomes exactly one \n,
        // then fold any remaining lone \r into \n as well.
        normalized = normalized.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    }
    if (settings.unicodeNormalize) {
        // NFKC can change string length; callers opting in must remap
        // offsets themselves.
        normalized = normalized.normalize('NFKC');
    }
    if (settings.trim) {
        normalized = normalized.trim();
    }
    return normalized;
}
|
|
39
|
+
/**
|
|
40
|
+
* Creates an identity offset mapping (no changes)
|
|
41
|
+
*/
|
|
42
|
+
/**
 * Creates an identity offset mapping, for when normalization did not
 * change the text length (original and normalized offsets coincide).
 */
export function createIdentityMapping() {
    const passthrough = (offset) => offset;
    return {
        toNormalized: passthrough,
        toOriginal: passthrough,
    };
}
|
|
48
|
+
/**
|
|
49
|
+
* Creates offset mapping for line ending normalization
|
|
50
|
+
* This handles \r\n -> \n replacement
|
|
51
|
+
*/
|
|
52
|
+
/**
 * Creates an offset mapping for line-ending normalization (\r\n -> \n).
 *
 * Each CRLF pair loses its '\r' during normalization, so offsets after a
 * pair shift left by one per preceding pair. toNormalized maps an offset in
 * the original text to the normalized text; toOriginal is the inverse.
 * (Lone \r -> \n replacement is length-preserving and needs no mapping.)
 *
 * @param originalText - Text before normalization (may contain \r\n).
 * @returns OffsetMapping between original and normalized coordinates.
 */
export function createLineEndingMapping(originalText) {
    // Positions of the '\r' of every '\r\n' pair, in ascending order.
    const crlfPositions = [];
    for (let i = 0; i < originalText.length - 1; i++) {
        if (originalText[i] === '\r' && originalText[i + 1] === '\n') {
            crlfPositions.push(i);
        }
    }
    if (crlfPositions.length === 0) {
        // No CRLF pairs: normalization preserves length, offsets coincide.
        // (Inlined identity instead of calling createIdentityMapping so this
        // function is self-contained.)
        const passthrough = (offset) => offset;
        return { toNormalized: passthrough, toOriginal: passthrough };
    }
    return {
        toNormalized(originalOffset) {
            // Each '\r' removed strictly before the offset shifts it left by one.
            let adjustment = 0;
            for (const pos of crlfPositions) {
                if (pos < originalOffset) {
                    adjustment++;
                }
                else {
                    break;
                }
            }
            return originalOffset - adjustment;
        },
        toOriginal(normalizedOffset) {
            // The k-th CRLF pair's surviving '\n' sits at normalized position
            // (pos_k - k). Count every pair whose '\n' lands at or before the
            // target offset; each adds one removed '\r' back.
            //
            // BUG FIX: the previous version compared against
            // `currentNormalized + (pos - adjustment)`, where the running
            // `currentNormalized` term inflated the threshold for the 2nd and
            // later pairs, so offsets past multiple CRLFs mapped back too
            // small (e.g. "ab\r\ncd\r\nef": normalized 6 returned 7, not 8).
            let adjustment = 0;
            for (const pos of crlfPositions) {
                if (pos - adjustment <= normalizedOffset) {
                    adjustment++;
                }
                else {
                    break;
                }
            }
            return normalizedOffset + adjustment;
        },
    };
}
|
|
94
|
+
//# sourceMappingURL=prenormalize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prenormalize.js","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAcH;;GAEG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAwB;IAC/D,oBAAoB,EAAE,IAAI;IAC1B,gBAAgB,EAAE,KAAK,EAAE,0CAA0C;IACnE,IAAI,EAAE,KAAK,EAAE,0CAA0C;CACxD,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,UAAwC,EAAE;IAE1C,MAAM,IAAI,GAAG,EAAE,GAAG,4BAA4B,EAAE,GAAG,OAAO,EAAE,CAAC;IAC7D,IAAI,MAAM,GAAG,IAAI,CAAC;IAElB,gDAAgD;IAChD,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;QAC9B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC9D,CAAC;IAED,8DAA8D;IAC9D,sDAAsD;IACtD,sCAAsC;IACtC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACd,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAaD;;GAEG;AACH,MAAM,UAAU,qBAAqB;IACnC,OAAO;QACL,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;QAChC,UAAU,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;KAC/B,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,YAAoB;IAC1D,0BAA0B;IAC1B,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,IAAI,YAAY,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7D,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,qBAAqB,EAAE,CAAC;IACjC,CAAC;IAED,OAAO;QACL,YAAY,CAAC,cAAsB;YACjC,mDAAmD;YACnD,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;oBACzB,UAAU,EAAE,CAAC;gBACf,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YACD,OAAO,cAAc,GAAG,UAAU,CAAC;QACrC,CAAC;QAED,UAAU,CAAC,gBAAwB;YACjC,qCAAqC;YACrC,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAE1B,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,iBAAiB,GAAG,CAAC,GAAG,GAAG,UAAU,CAAC,IAAI,gBAAgB,EAAE,CAAC;oBAC/D,UAAU,EAAE,CAAC;oBACb,iBAAiB,GAAG,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;gBAC3C,CAAC;
qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YAED,OAAO,gBAAgB,GAAG,UAAU,CAAC;QACvC,CAAC;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity Resolver
|
|
3
|
+
* Merges, deduplicates, and resolves overlapping entity detections
|
|
4
|
+
*/
|
|
5
|
+
import { SpanMatch, AnonymizationPolicy } from '../types/index.js';
|
|
6
|
+
/**
 * Resolution strategy for choosing a winner between overlapping
 * entity detections (e.g. a regex match and an NER match covering
 * the same text region).
 */
export declare enum OverlapStrategy {
    /** Regex matches always win over NER */
    REGEX_PRIORITY = "REGEX_PRIORITY",
    /** Longer span wins */
    LONGER_SPAN = "LONGER_SPAN",
    /** Higher confidence wins */
    HIGHER_CONFIDENCE = "HIGHER_CONFIDENCE",
    /** Use type priority from policy */
    TYPE_PRIORITY = "TYPE_PRIORITY"
}
|
|
19
|
+
/**
 * Entity resolver configuration.
 * See DEFAULT_RESOLVER_CONFIG for the defaults applied when a field
 * is omitted.
 */
export interface ResolverConfig {
    /** Primary strategy for overlap resolution */
    overlapStrategy: OverlapStrategy;
    /** Whether regex matches always take precedence over NER matches */
    regexPriority: boolean;
    /** Minimum confidence to keep an entity (presumably a 0..1 score — confirm against resolver.ts) */
    minConfidence: number;
}
|
|
30
|
+
/**
 * Default resolver configuration.
 * NOTE(review): presumably merged under any partial `config` passed to
 * `resolveEntities` — confirm the exact default values in resolver.ts.
 */
export declare const DEFAULT_RESOLVER_CONFIG: ResolverConfig;
|
|
34
|
+
/**
 * Resolves and merges entity detections from the regex and NER passes
 * into a single list of spans.
 *
 * @param regexMatches - Spans produced by the regex detectors
 * @param nerMatches - Spans produced by the NER model
 * @param policy - Anonymization policy in effect for this run
 * @param originalText - Text the span offsets refer to
 * @param config - Optional partial overrides of the resolver configuration
 * @returns The resolved set of entity spans
 */
export declare function resolveEntities(regexMatches: SpanMatch[], nerMatches: SpanMatch[], policy: AnonymizationPolicy, originalText: string, config?: Partial<ResolverConfig>): SpanMatch[];
|
|
38
|
+
/**
 * Creates protected spans from regex matches.
 * Used to mask regex matches from NER to avoid double-detection of
 * the same text region.
 *
 * @param regexMatches - Spans produced by the regex detectors
 * @returns Start/end character ranges covering each regex match
 */
export declare function createProtectedSpans(regexMatches: SpanMatch[]): Array<{
    start: number;
    end: number;
}>;
|
|
46
|
+
/**
 * Checks if a span overlaps with any protected span.
 *
 * @param span - Candidate start/end character range
 * @param protectedSpans - Ranges, e.g. as produced by `createProtectedSpans`
 * @returns true when `span` overlaps at least one entry of `protectedSpans`
 */
export declare function isInProtectedSpan(span: {
    start: number;
    end: number;
}, protectedSpans: Array<{
    start: number;
    end: number;
}>): boolean;
|
|
56
|
+
//# sourceMappingURL=resolver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAEL,SAAS,EAET,mBAAmB,EAEpB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,oBAAY,eAAe;IACzB,wCAAwC;IACxC,cAAc,mBAAmB;IACjC,uBAAuB;IACvB,WAAW,gBAAgB;IAC3B,6BAA6B;IAC7B,iBAAiB,sBAAsB;IACvC,oCAAoC;IACpC,aAAa,kBAAkB;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,8CAA8C;IAC9C,eAAe,EAAE,eAAe,CAAC;IACjC,mDAAmD;IACnD,aAAa,EAAE,OAAO,CAAC;IACvB,2CAA2C;IAC3C,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,uBAAuB,EAAE,cAIrC,CAAC;AAEF;;GAEG;AACH,wBAAgB,eAAe,CAC7B,YAAY,EAAE,SAAS,EAAE,EACzB,UAAU,EAAE,SAAS,EAAE,EACvB,MAAM,EAAE,mBAAmB,EAC3B,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,OAAO,CAAC,cAAc,CAAM,GACnC,SAAS,EAAE,CAyBb;AAiOD;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,YAAY,EAAE,SAAS,EAAE,GACxB,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,CAEvC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,EACpC,cAAc,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,GACpD,OAAO,CAET"}
|