@elanlanguages/bridge-anonymization 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/README.md +382 -0
  2. package/dist/crypto/index.d.ts +6 -0
  3. package/dist/crypto/index.d.ts.map +1 -0
  4. package/dist/crypto/index.js +6 -0
  5. package/dist/crypto/index.js.map +1 -0
  6. package/dist/crypto/pii-map-crypto.d.ts +100 -0
  7. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  8. package/dist/crypto/pii-map-crypto.js +163 -0
  9. package/dist/crypto/pii-map-crypto.js.map +1 -0
  10. package/dist/index.d.ts +173 -0
  11. package/dist/index.d.ts.map +1 -0
  12. package/dist/index.js +294 -0
  13. package/dist/index.js.map +1 -0
  14. package/dist/ner/bio-decoder.d.ts +64 -0
  15. package/dist/ner/bio-decoder.d.ts.map +1 -0
  16. package/dist/ner/bio-decoder.js +216 -0
  17. package/dist/ner/bio-decoder.js.map +1 -0
  18. package/dist/ner/index.d.ts +10 -0
  19. package/dist/ner/index.d.ts.map +1 -0
  20. package/dist/ner/index.js +10 -0
  21. package/dist/ner/index.js.map +1 -0
  22. package/dist/ner/model-manager.d.ts +102 -0
  23. package/dist/ner/model-manager.d.ts.map +1 -0
  24. package/dist/ner/model-manager.js +253 -0
  25. package/dist/ner/model-manager.js.map +1 -0
  26. package/dist/ner/ner-model.d.ts +114 -0
  27. package/dist/ner/ner-model.d.ts.map +1 -0
  28. package/dist/ner/ner-model.js +240 -0
  29. package/dist/ner/ner-model.js.map +1 -0
  30. package/dist/ner/onnx-runtime.d.ts +45 -0
  31. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  32. package/dist/ner/onnx-runtime.js +99 -0
  33. package/dist/ner/onnx-runtime.js.map +1 -0
  34. package/dist/ner/tokenizer.d.ts +140 -0
  35. package/dist/ner/tokenizer.d.ts.map +1 -0
  36. package/dist/ner/tokenizer.js +341 -0
  37. package/dist/ner/tokenizer.js.map +1 -0
  38. package/dist/pipeline/index.d.ts +9 -0
  39. package/dist/pipeline/index.d.ts.map +1 -0
  40. package/dist/pipeline/index.js +9 -0
  41. package/dist/pipeline/index.js.map +1 -0
  42. package/dist/pipeline/prenormalize.d.ts +48 -0
  43. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  44. package/dist/pipeline/prenormalize.js +94 -0
  45. package/dist/pipeline/prenormalize.js.map +1 -0
  46. package/dist/pipeline/resolver.d.ts +56 -0
  47. package/dist/pipeline/resolver.d.ts.map +1 -0
  48. package/dist/pipeline/resolver.js +238 -0
  49. package/dist/pipeline/resolver.js.map +1 -0
  50. package/dist/pipeline/tagger.d.ts +74 -0
  51. package/dist/pipeline/tagger.d.ts.map +1 -0
  52. package/dist/pipeline/tagger.js +169 -0
  53. package/dist/pipeline/tagger.js.map +1 -0
  54. package/dist/pipeline/validator.d.ts +65 -0
  55. package/dist/pipeline/validator.d.ts.map +1 -0
  56. package/dist/pipeline/validator.js +264 -0
  57. package/dist/pipeline/validator.js.map +1 -0
  58. package/dist/recognizers/base.d.ts +78 -0
  59. package/dist/recognizers/base.d.ts.map +1 -0
  60. package/dist/recognizers/base.js +100 -0
  61. package/dist/recognizers/base.js.map +1 -0
  62. package/dist/recognizers/bic-swift.d.ts +10 -0
  63. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  64. package/dist/recognizers/bic-swift.js +107 -0
  65. package/dist/recognizers/bic-swift.js.map +1 -0
  66. package/dist/recognizers/credit-card.d.ts +32 -0
  67. package/dist/recognizers/credit-card.d.ts.map +1 -0
  68. package/dist/recognizers/credit-card.js +160 -0
  69. package/dist/recognizers/credit-card.js.map +1 -0
  70. package/dist/recognizers/custom-id.d.ts +28 -0
  71. package/dist/recognizers/custom-id.d.ts.map +1 -0
  72. package/dist/recognizers/custom-id.js +116 -0
  73. package/dist/recognizers/custom-id.js.map +1 -0
  74. package/dist/recognizers/email.d.ts +10 -0
  75. package/dist/recognizers/email.d.ts.map +1 -0
  76. package/dist/recognizers/email.js +75 -0
  77. package/dist/recognizers/email.js.map +1 -0
  78. package/dist/recognizers/iban.d.ts +14 -0
  79. package/dist/recognizers/iban.d.ts.map +1 -0
  80. package/dist/recognizers/iban.js +67 -0
  81. package/dist/recognizers/iban.js.map +1 -0
  82. package/dist/recognizers/index.d.ts +20 -0
  83. package/dist/recognizers/index.d.ts.map +1 -0
  84. package/dist/recognizers/index.js +42 -0
  85. package/dist/recognizers/index.js.map +1 -0
  86. package/dist/recognizers/ip-address.d.ts +14 -0
  87. package/dist/recognizers/ip-address.d.ts.map +1 -0
  88. package/dist/recognizers/ip-address.js +183 -0
  89. package/dist/recognizers/ip-address.js.map +1 -0
  90. package/dist/recognizers/phone.d.ts +10 -0
  91. package/dist/recognizers/phone.d.ts.map +1 -0
  92. package/dist/recognizers/phone.js +145 -0
  93. package/dist/recognizers/phone.js.map +1 -0
  94. package/dist/recognizers/registry.d.ts +59 -0
  95. package/dist/recognizers/registry.d.ts.map +1 -0
  96. package/dist/recognizers/registry.js +113 -0
  97. package/dist/recognizers/registry.js.map +1 -0
  98. package/dist/recognizers/url.d.ts +14 -0
  99. package/dist/recognizers/url.d.ts.map +1 -0
  100. package/dist/recognizers/url.js +121 -0
  101. package/dist/recognizers/url.js.map +1 -0
  102. package/dist/types/index.d.ts +134 -0
  103. package/dist/types/index.d.ts.map +1 -0
  104. package/dist/types/index.js +69 -0
  105. package/dist/types/index.js.map +1 -0
  106. package/dist/types/pii-types.d.ts +50 -0
  107. package/dist/types/pii-types.d.ts.map +1 -0
  108. package/dist/types/pii-types.js +114 -0
  109. package/dist/types/pii-types.js.map +1 -0
  110. package/dist/utils/iban-checksum.d.ts +23 -0
  111. package/dist/utils/iban-checksum.d.ts.map +1 -0
  112. package/dist/utils/iban-checksum.js +106 -0
  113. package/dist/utils/iban-checksum.js.map +1 -0
  114. package/dist/utils/index.d.ts +8 -0
  115. package/dist/utils/index.d.ts.map +1 -0
  116. package/dist/utils/index.js +8 -0
  117. package/dist/utils/index.js.map +1 -0
  118. package/dist/utils/luhn.d.ts +17 -0
  119. package/dist/utils/luhn.d.ts.map +1 -0
  120. package/dist/utils/luhn.js +55 -0
  121. package/dist/utils/luhn.js.map +1 -0
  122. package/dist/utils/offsets.d.ts +86 -0
  123. package/dist/utils/offsets.d.ts.map +1 -0
  124. package/dist/utils/offsets.js +124 -0
  125. package/dist/utils/offsets.js.map +1 -0
  126. package/package.json +62 -0
@@ -0,0 +1,341 @@
1
+ /**
2
+ * WordPiece Tokenizer
3
+ * Tokenizes text into subword tokens while maintaining character offset mapping
4
+ * Compatible with BERT-style models
5
+ */
6
/**
 * Default tokenizer configuration for BERT-style models.
 *
 * - maxLength: maximum number of tokens returned by `tokenize`, including
 *   the leading [CLS] and trailing [SEP].
 * - unkToken / clsToken / sepToken / padToken / maskToken: surface forms of
 *   the special tokens, looked up in the vocabulary by the constructor.
 * - doLowerCase: lower-case the text before vocabulary lookup.
 * - stripAccents: NFD-normalize and drop combining marks U+0300..U+036F.
 */
export const DEFAULT_TOKENIZER_CONFIG = {
  maxLength: 512,
  unkToken: '[UNK]',
  clsToken: '[CLS]',
  sepToken: '[SEP]',
  padToken: '[PAD]',
  maskToken: '[MASK]',
  doLowerCase: true,
  stripAccents: true,
};

/**
 * WordPiece tokenizer that keeps a character span into the ORIGINAL text
 * for every token.
 *
 * Pipeline: optional lower-casing / accent stripping -> split on whitespace
 * -> isolate punctuation characters -> greedy longest-match-first WordPiece
 * on each piece. A piece that cannot be fully covered by vocabulary entries
 * becomes a single unknown token.
 */
export class WordPieceTokenizer {
  vocab;
  inverseVocab;
  config;
  // Special token IDs
  unkId;
  clsId;
  sepId;
  padId;

  /**
   * @param vocab - Map from token string to token id
   * @param config - Partial overrides merged over DEFAULT_TOKENIZER_CONFIG
   */
  constructor(vocab, config = {}) {
    this.vocab = vocab;
    this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
    // Reverse lookup used by decode() / getToken()
    this.inverseVocab = new Map();
    for (const [token, id] of vocab) {
      this.inverseVocab.set(id, token);
    }
    // Fall back to fixed ids when a special token is missing from the vocab
    this.unkId = this.vocab.get(this.config.unkToken) ?? 0;
    this.clsId = this.vocab.get(this.config.clsToken) ?? 101;
    this.sepId = this.vocab.get(this.config.sepToken) ?? 102;
    this.padId = this.vocab.get(this.config.padToken) ?? 0;
  }

  /**
   * Tokenizes text into tokens with offset tracking.
   *
   * @param text - Original input text
   * @returns tokens, inputIds, attentionMask (all 1), tokenTypeIds (all 0)
   *   and tokenToCharSpan ([start, end] into `text`, or null for [CLS]/[SEP])
   */
  tokenize(text) {
    const { processed, starts, ends } = this.alignToOriginal(text);
    const tokens = [this.specialToken(this.clsId, this.config.clsToken, 0)];

    for (const { word, start, end } of this.splitIntoWords(processed, text)) {
      for (const token of this.tokenizeWord(word, start, end)) {
        // tokenizeWord reports offsets in the processed text; translate them
        // back so callers can slice the original input.
        const origStart = token.start < starts.length ? starts[token.start] : text.length;
        const origEnd = token.end > token.start ? ends[token.end - 1] : origStart;
        token.start = origStart;
        token.end = origEnd;
        tokens.push(token);
      }
    }

    // Leave room for the trailing [SEP] when the sequence is too long
    const maxTokens = this.config.maxLength;
    if (tokens.length + 1 > maxTokens) {
      tokens.length = Math.max(maxTokens - 1, 0);
    }
    tokens.push(this.specialToken(this.sepId, this.config.sepToken, text.length));

    return {
      tokens,
      inputIds: tokens.map((t) => t.id),
      attentionMask: tokens.map(() => 1),
      tokenTypeIds: tokens.map(() => 0),
      tokenToCharSpan: tokens.map((t) => (t.isSpecial ? null : [t.start, t.end])),
    };
  }

  /**
   * Builds a zero-width special token anchored at `offset`.
   */
  specialToken(id, token, offset) {
    return {
      id,
      token,
      start: offset,
      end: offset,
      isContinuation: false,
      isSpecial: true,
    };
  }

  /**
   * Preprocesses `text` one code point at a time and records, for every
   * UTF-16 unit of the result, the [start, end) range of the original code
   * point it came from. This keeps offsets correct when preprocessing
   * changes the string length (decomposed accents, NFD expansion, ...).
   */
  alignToOriginal(text) {
    const { doLowerCase, stripAccents } = this.config;
    // Whole-string lower-casing keeps context-sensitive rules; it can only be
    // sliced per code point when it did not change the length.
    const lowered = doLowerCase ? text.toLowerCase() : text;
    const aligned = lowered.length === text.length;

    const parts = [];
    const starts = [];
    const ends = [];
    let index = 0;
    for (const char of text) {
      const next = index + char.length;
      let piece = aligned ? lowered.slice(index, next) : char.toLowerCase();
      if (stripAccents) {
        piece = this.stripAccents(piece);
      }
      if (piece.length === 0 && ends.length > 0) {
        // A stripped combining mark belongs to the preceding character
        ends[ends.length - 1] = next;
      }
      for (let k = 0; k < piece.length; k++) {
        starts.push(index);
        ends.push(next);
      }
      parts.push(piece);
      index = next;
    }
    return { processed: parts.join(''), starts, ends };
  }

  /**
   * Preprocesses text (lowercase, accent stripping) according to the config.
   */
  preprocess(text) {
    let processed = text;
    if (this.config.doLowerCase) {
      processed = processed.toLowerCase();
    }
    if (this.config.stripAccents) {
      processed = this.stripAccents(processed);
    }
    return processed;
  }

  /**
   * Strips accents: NFD-normalizes and removes U+0300..U+036F.
   */
  stripAccents(text) {
    return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  }

  /**
   * Splits text into whitespace-delimited words. Offsets are positions in
   * `processedText`; `originalText` is accepted for signature compatibility
   * and is not read.
   */
  splitIntoWords(processedText, originalText) {
    const words = [];
    const wordPattern = /\S+/g;
    let match;
    while ((match = wordPattern.exec(processedText)) !== null) {
      const start = match.index;
      words.push({ word: match[0], start, end: start + match[0].length });
    }
    return words;
  }

  /**
   * Tokenizes a single whitespace-delimited word.
   *
   * Punctuation characters become standalone tokens (never `##`
   * continuations); every other piece is covered greedily with vocabulary
   * entries, or emitted as one unknown token when that is impossible.
   *
   * @param word - Preprocessed word
   * @param startOffset - Offset of the word's first character
   * @param endOffset - Offset just past the word; token ends are clamped to it
   */
  tokenizeWord(word, startOffset, endOffset) {
    const tokens = [];
    let pieceStart = startOffset;
    for (const piece of this.splitWordIntoPieces(word)) {
      const pieceEnd = Math.min(pieceStart + piece.length, endOffset);
      const subTokens = this.greedyWordPiece(piece, pieceStart, endOffset);
      if (subTokens === undefined) {
        // Keep the surface text in `token`, as before, but use the unknown id
        tokens.push({
          id: this.unkId,
          token: piece,
          start: pieceStart,
          end: pieceEnd,
          isContinuation: false,
          isSpecial: false,
        });
      } else {
        tokens.push(...subTokens);
      }
      pieceStart = pieceEnd;
    }
    return tokens;
  }

  /**
   * Covers `piece` with vocabulary entries, longest match first. Returns
   * undefined when some remainder has no matching entry.
   */
  greedyWordPiece(piece, offset, limit) {
    const matched = [];
    let pos = 0;
    while (pos < piece.length) {
      const isContinuation = pos > 0;
      const hit = this.matchPrefix(piece.slice(pos), isContinuation);
      if (hit === undefined) {
        return undefined;
      }
      const next = pos + hit.token.length;
      matched.push({
        id: hit.id,
        token: isContinuation ? '##' + hit.token : hit.token,
        start: Math.min(offset + pos, limit),
        end: Math.min(offset + next, limit),
        isContinuation,
        isSpecial: false,
      });
      pos = next;
    }
    return matched;
  }

  /**
   * Splits a word into pieces, isolating each punctuation character.
   */
  splitWordIntoPieces(word) {
    const pieces = [];
    let current = '';
    for (const char of word) {
      if (this.isPunctuation(char)) {
        if (current.length > 0) {
          pieces.push(current);
          current = '';
        }
        pieces.push(char);
      } else {
        current += char;
      }
    }
    if (current.length > 0) {
      pieces.push(current);
    }
    return pieces;
  }

  /**
   * Checks if a character is punctuation: the ASCII punctuation ranges plus
   * U+2000..U+206F and U+3000..U+303F.
   */
  isPunctuation(char) {
    const code = char.charCodeAt(0);
    return (
      (code >= 33 && code <= 47) ||
      (code >= 58 && code <= 64) ||
      (code >= 91 && code <= 96) ||
      (code >= 123 && code <= 126) ||
      /[\u2000-\u206F\u3000-\u303F]/.test(char)
    );
  }

  /**
   * Longest vocabulary entry that is a prefix of `text` (looked up with a
   * `##` prefix for continuations), or undefined when there is none.
   */
  matchPrefix(text, isContinuation) {
    const prefix = isContinuation ? '##' : '';
    for (let end = text.length; end > 0; end--) {
      const candidate = text.slice(0, end);
      const id = this.vocab.get(prefix + candidate);
      if (id !== undefined) {
        return { id, token: candidate };
      }
    }
    return undefined;
  }

  /**
   * Finds the longest matching token in vocabulary, falling back to the
   * unknown id paired with the whole word.
   */
  findLongestMatch(word, isContinuation) {
    return this.matchPrefix(word, isContinuation) ?? { id: this.unkId, token: word };
  }

  /**
   * Decodes token IDs back to text. Unknown ids and the [CLS]/[SEP]/[PAD]
   * tokens are skipped; `##` pieces are glued to the previous token and all
   * other tokens are separated by a single space.
   */
  decode(tokenIds) {
    const tokens = [];
    for (const id of tokenIds) {
      const token = this.inverseVocab.get(id);
      if (token === undefined) continue;
      if (
        token === this.config.clsToken ||
        token === this.config.sepToken ||
        token === this.config.padToken
      ) {
        continue;
      }
      if (token.startsWith('##')) {
        tokens.push(token.slice(2));
      } else {
        if (tokens.length > 0) {
          tokens.push(' ');
        }
        tokens.push(token);
      }
    }
    return tokens.join('');
  }

  /**
   * Gets vocabulary size
   */
  get vocabSize() {
    return this.vocab.size;
  }

  /**
   * Gets a token ID by string (undefined when absent)
   */
  getTokenId(token) {
    return this.vocab.get(token);
  }

  /**
   * Gets a token string by ID (undefined when absent)
   */
  getToken(id) {
    return this.inverseVocab.get(id);
  }
}
287
/**
 * Reads a vocabulary file (one token per line, UTF-8) and parses it.
 *
 * @param path - Location of the vocabulary file
 * @returns Promise of the token -> id map produced by parseVocab
 */
export async function loadVocabFromFile(path) {
  // Loaded lazily so importing this module does not pull in the fs module
  const { readFile } = await import('fs/promises');
  return parseVocab(await readFile(path, 'utf-8'));
}
295
/**
 * Builds a token -> id map from vocabulary text.
 *
 * Each line is trimmed; a token's id is the zero-based index of its line,
 * so blank lines are skipped but still consume an index.
 *
 * @param content - Vocabulary text, one token per line
 * @returns Map from token to line index
 */
export function parseVocab(content) {
  const vocab = new Map();
  content.split('\n').forEach((rawLine, lineIndex) => {
    const token = rawLine.trim();
    if (token.length > 0) {
      vocab.set(token, lineIndex);
    }
  });
  return vocab;
}
309
/**
 * Creates a minimal vocabulary for testing.
 *
 * Ids are assigned in listing order: special tokens first, then whole
 * words, `##` suffix pieces and punctuation.
 *
 * @returns Map from token to id (20 entries)
 */
export function createTestVocab() {
  const specialTokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'];
  const words = ['the', 'a', 'is', 'was', 'john', 'smith', 'berlin', 'germany'];
  const suffixes = ['##s', '##ed', '##ing'];
  const punctuation = [',', '.', '!', '?'];

  const vocab = new Map();
  for (const token of [...specialTokens, ...words, ...suffixes, ...punctuation]) {
    // All tokens are distinct, so the current size is the next free id
    vocab.set(token, vocab.size);
  }
  return vocab;
}
341
+ //# sourceMappingURL=tokenizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AA8DH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,SAAS,EAAE,QAAQ;IACnB,WAAW,EAAE,IAAI;IACjB,YAAY,EAAE,IAAI;CACnB,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IAEhC,oBAAoB;IACZ,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IAEtB,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QAE5C,iCAAiC;QACjC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;QAE3D,qBAAqB;QACrB,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,SAAS,EAAE,CAAC;YAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;YAC3B,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAA
C;gBAC3B,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACzC,CAAC;QACH,CAAC;QAED,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,mBAAmB;YACnB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,IAAY;QAC7B,IAAI,SAAS,GAAG,IAAI,CAAC;QAErB,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;YAC5B,SAAS,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC;YAC7B,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY;QAC/B,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,aAAqB,EACrB,YAAoB;QAEpB,MAAM,KAAK,GAAwD,EAAE,CAAC;QAEtE,uEAAuE;QACvE,MAAM,WAAW,GAAG,MAAM,CAAC;QAC3B,IAAI,KAA6B,CAAC;QAElC,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC1D,+CAA+C;YAC/C,yDAAyD;YACzD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,GAAG,GAAG,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YAEpC,KAAK,CAAC,IAAI,CAAC;gBACT,IAA
I,EAAE,KAAK,CAAC,CAAC,CAAC;gBACd,KAAK;gBACL,GAAG;aACJ,CAAC,CAAC;QACL,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY,EAAE,WAAmB,EAAE,SAAiB;QACvE,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,aAAa,GAAG,WAAW,CAAC;QAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,IAAI,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;YAC3B,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;YAE7B,0DAA0D;YAC1D,MAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAE3D,wBAAwB;YACxB,IAAI,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAEvC,oDAAoD;YACpD,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC1B,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;gBACrE,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,GAAG,KAAK,CAAC;YAClB,CAAC;YAED,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,aAAa,GAAG,WAAW,EAAE,SAAS,CAAC,CAAC;YAElE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,OAAO;gBACX,KAAK,EAAE,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO;gBAChD,KAAK,EAAE,aAAa;gBACpB,GAAG,EAAE,QAAQ;gBACb,cAAc;gBACd,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YAEH,aAAa,GAAG,QAAQ,CAAC;QAC3B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,IAAY;QACtC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;QAEjB,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACrB,OAAO,GAAG,EAAE,CAAC;gBACf,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACpB,CAAC;iBAAM,CAAC;gBACN,OAAO,IAAI,IAAI,CAAC;YAClB,CAAC;QACH,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY;QAChC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAChC,iDAAiD;QACjD,OAAO,CACL,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B
,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,GAAG,CAAC;YAC5B,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,sBAAsB;YACtD,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,kBAAkB;SAChD,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,gBAAgB,CACtB,IAAY,EACZ,cAAuB;QAEvB,MAAM,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1C,uCAAuC;QACvC,KAAK,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC;YAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACnC,MAAM,QAAQ,GAAG,MAAM,GAAG,OAAO,CAAC;YAElC,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACpC,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;YAChC,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,OAAO,EAAE,EAAE,EAAE,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACzC,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAElC,sBAAsB;YACtB,IACE,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ,EAC9B,CAAC;gBACD,SAAS;YACX,CAAC;YAED,6BAA6B;YAC7B,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAAY;IAClD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,EAAE,O
AAO,CAAC,CAAC;IACjD,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,OAAO;QACP,OAAO;QACP,OAAO;QACP,OAAO;QACP,QAAQ;QACR,KAAK;QACL,GAAG;QACH,IAAI;QACJ,KAAK;QACL,MAAM;QACN,OAAO;QACP,QAAQ;QACR,SAAS;QACT,KAAK;QACL,MAAM;QACN,OAAO;QACP,GAAG;QACH,GAAG;QACH,GAAG;QACH,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Pipeline Module
3
+ * Exports all pipeline components
4
+ */
5
+ export * from './prenormalize.js';
6
+ export * from './resolver.js';
7
+ export * from './tagger.js';
8
+ export * from './validator.js';
9
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Pipeline Module
3
+ * Exports all pipeline components
4
+ */
5
+ export * from './prenormalize.js';
6
+ export * from './resolver.js';
7
+ export * from './tagger.js';
8
+ export * from './validator.js';
9
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Pre-normalization
3
+ * Normalizes text before PII detection
4
+ */
5
+ /**
6
+ * Pre-normalization options
7
+ */
8
+ export interface PrenormalizeOptions {
9
+ /** Convert "\r\n" and lone "\r" to "\n"; each "\r\n" pair shortens the text by one character */
10
+ normalizeLineEndings: boolean;
11
+ /** Apply Unicode NFKC normalization; this can change the string length and therefore character offsets */
12
+ unicodeNormalize: boolean;
13
+ /** Trim leading/trailing whitespace; removing leading whitespace shifts every later offset */
14
+ trim: boolean;
15
+ }
16
+ /**
17
+ * Default pre-normalization options: line-ending normalization on, Unicode normalization and trimming off
18
+ */
19
+ export declare const DEFAULT_PRENORMALIZE_OPTIONS: PrenormalizeOptions;
20
+ /**
21
+ * Pre-normalizes text for PII detection
22
+ * Note: with the default options only line endings are normalized; "\r\n" -> "\n" still shortens the text, so use createLineEndingMapping to translate offsets
23
+ *
24
+ * @param text - Original input text
25
+ * @param options - Overrides merged over DEFAULT_PRENORMALIZE_OPTIONS
26
+ * @returns Normalized text
27
+ */
28
+ export declare function prenormalize(text: string, options?: Partial<PrenormalizeOptions>): string;
29
+ /**
30
+ * Pair of functions translating character offsets between the original and the pre-normalized text
31
+ * Needed whenever prenormalization changes the text length
32
+ */
33
+ export interface OffsetMapping {
34
+ /** Maps an offset in the original text to the corresponding offset in the normalized text */
35
+ toNormalized: (originalOffset: number) => number;
36
+ /** Maps an offset in the normalized text back to the corresponding offset in the original text */
37
+ toOriginal: (normalizedOffset: number) => number;
38
+ }
39
+ /**
40
+ * Creates an offset mapping whose two functions return their argument unchanged
41
+ */
42
+ export declare function createIdentityMapping(): OffsetMapping;
43
+ /**
44
+ * Creates offset mapping for line ending normalization
45
+ * Only "\r\n" -> "\n" is tracked; a lone "\r" is replaced one-for-one and needs no adjustment
46
+ */
47
+ export declare function createLineEndingMapping(originalText: string): OffsetMapping;
48
+ //# sourceMappingURL=prenormalize.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prenormalize.d.ts","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,oBAAoB,EAAE,OAAO,CAAC;IAC9B,uCAAuC;IACvC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,uCAAuC;IACvC,IAAI,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,eAAO,MAAM,4BAA4B,EAAE,mBAI1C,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,OAAO,CAAC,mBAAmB,CAAM,GACzC,MAAM,CAqBR;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC5B,oDAAoD;IACpD,YAAY,EAAE,CAAC,cAAc,EAAE,MAAM,KAAK,MAAM,CAAC;IACjD,oDAAoD;IACpD,UAAU,EAAE,CAAC,gBAAgB,EAAE,MAAM,KAAK,MAAM,CAAC;CAClD;AAED;;GAEG;AACH,wBAAgB,qBAAqB,IAAI,aAAa,CAKrD;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,YAAY,EAAE,MAAM,GAAG,aAAa,CA4C3E"}
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Pre-normalization
3
+ * Normalizes text before PII detection
4
+ */
5
/**
 * Default pre-normalization options.
 * Only line-ending normalization is enabled; the other two steps are off
 * because they can move character offsets.
 */
export const DEFAULT_PRENORMALIZE_OPTIONS = {
  normalizeLineEndings: true,
  unicodeNormalize: false, // Disabled by default to preserve offsets
  trim: false, // Disabled by default to preserve offsets
};

/**
 * Pre-normalizes text for PII detection.
 *
 * Steps run in a fixed order: line endings, then NFKC, then trim; each one
 * is applied only when its option is enabled.
 *
 * @param text - Original input text
 * @param options - Overrides merged over DEFAULT_PRENORMALIZE_OPTIONS
 * @returns Normalized text
 */
export function prenormalize(text, options = {}) {
  const { normalizeLineEndings, unicodeNormalize, trim } = {
    ...DEFAULT_PRENORMALIZE_OPTIONS,
    ...options,
  };

  // \r\n -> \n first, so the remaining lone \r can be mapped afterwards
  const lineNormalized = normalizeLineEndings
    ? text.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
    : text;

  // NFKC may change the string length; callers needing offsets must map them
  const unicodeNormalized = unicodeNormalize
    ? lineNormalized.normalize('NFKC')
    : lineNormalized;

  return trim ? unicodeNormalized.trim() : unicodeNormalized;
}
39
/**
 * Creates an offset mapping for text that was not changed: both directions
 * hand back the offset they receive.
 */
export function createIdentityMapping() {
  return {
    toNormalized(offset) {
      return offset;
    },
    toOriginal(offset) {
      return offset;
    },
  };
}
48
/**
 * Creates offset mapping for line ending normalization.
 *
 * Only "\r\n" -> "\n" is tracked, because it is the only line-ending rewrite
 * that removes a character; a lone "\r" is replaced one-for-one.
 *
 * @param originalText - Text before line-ending normalization
 * @returns Mapping between offsets in `originalText` and offsets in the
 *   normalized text. An offset that lands on a normalized "\n" which came
 *   from "\r\n" maps back to the position of the "\r".
 */
export function createLineEndingMapping(originalText) {
  // Index of the '\r' of every "\r\n" pair, in ascending order
  const crlfPositions = [];
  for (
    let at = originalText.indexOf('\r\n');
    at !== -1;
    at = originalText.indexOf('\r\n', at + 2)
  ) {
    crlfPositions.push(at);
  }

  // With no pairs both loops below are no-ops, i.e. the mapping is identity.
  return {
    toNormalized(originalOffset) {
      // Every pair that starts before the offset removed one character
      let removed = 0;
      while (removed < crlfPositions.length && crlfPositions[removed] < originalOffset) {
        removed++;
      }
      return originalOffset - removed;
    },
    toOriginal(normalizedOffset) {
      // The k-th pair's "\n" sits at crlfPositions[k] - k in the normalized
      // text; every pair strictly before the offset gets its '\r' added back.
      let restored = 0;
      while (
        restored < crlfPositions.length &&
        crlfPositions[restored] - restored < normalizedOffset
      ) {
        restored++;
      }
      return normalizedOffset + restored;
    },
  };
}
94
+ //# sourceMappingURL=prenormalize.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prenormalize.js","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAcH;;GAEG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAwB;IAC/D,oBAAoB,EAAE,IAAI;IAC1B,gBAAgB,EAAE,KAAK,EAAE,0CAA0C;IACnE,IAAI,EAAE,KAAK,EAAE,0CAA0C;CACxD,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,UAAwC,EAAE;IAE1C,MAAM,IAAI,GAAG,EAAE,GAAG,4BAA4B,EAAE,GAAG,OAAO,EAAE,CAAC;IAC7D,IAAI,MAAM,GAAG,IAAI,CAAC;IAElB,gDAAgD;IAChD,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;QAC9B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC9D,CAAC;IAED,8DAA8D;IAC9D,sDAAsD;IACtD,sCAAsC;IACtC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACd,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAaD;;GAEG;AACH,MAAM,UAAU,qBAAqB;IACnC,OAAO;QACL,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;QAChC,UAAU,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;KAC/B,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,YAAoB;IAC1D,0BAA0B;IAC1B,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,IAAI,YAAY,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7D,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,qBAAqB,EAAE,CAAC;IACjC,CAAC;IAED,OAAO;QACL,YAAY,CAAC,cAAsB;YACjC,mDAAmD;YACnD,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;oBACzB,UAAU,EAAE,CAAC;gBACf,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YACD,OAAO,cAAc,GAAG,UAAU,CAAC;QACrC,CAAC;QAED,UAAU,CAAC,gBAAwB;YACjC,qCAAqC;YACrC,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAE1B,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,iBAAiB,GAAG,CAAC,GAAG,GAAG,UAAU,CAAC,IAAI,gBAAgB,EAAE,CAAC;oBAC/D,UAAU,EAAE,CAAC;oBACb,iBAAiB,GAAG,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;gBAC3C,CAA
C;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YAED,OAAO,gBAAgB,GAAG,UAAU,CAAC;QACvC,CAAC;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Entity Resolver
3
+ * Merges, deduplicates, and resolves overlapping entity detections
4
+ */
5
+ import { SpanMatch, AnonymizationPolicy } from '../types/index.js';
6
+ /**
7
+ * Resolution strategy for overlapping entities: names the rule used to pick a winner. String enum; each member's value equals its name.
8
+ */
9
+ export declare enum OverlapStrategy {
10
+ /** Regex matches always win over NER matches */
11
+ REGEX_PRIORITY = "REGEX_PRIORITY",
12
+ /** The longer span wins */
13
+ LONGER_SPAN = "LONGER_SPAN",
14
+ /** The match with the higher confidence wins */
15
+ HIGHER_CONFIDENCE = "HIGHER_CONFIDENCE",
16
+ /** Use the type priority from the policy to pick the winner */
17
+ TYPE_PRIORITY = "TYPE_PRIORITY"
18
+ }
19
+ /**
20
+ * Entity resolver configuration: options for overlap resolution and confidence filtering.
21
+ */
22
+ export interface ResolverConfig {
23
+ /** Primary strategy for overlap resolution */
24
+ overlapStrategy: OverlapStrategy;
25
+ /** Whether regex matches always take precedence. NOTE(review): overlaps in meaning with OverlapStrategy.REGEX_PRIORITY; how the two interact is not visible in this declaration - confirm in the implementation. */
26
+ regexPriority: boolean;
27
+ /** Minimum confidence to keep an entity. NOTE(review): expected range (presumably 0-1) and whether the bound is inclusive are not visible here - confirm. */
28
+ minConfidence: number;
29
+ }
30
+ /**
31
+ * Default resolver configuration. The concrete values are not part of this declaration; NOTE(review): presumably the baseline that a partial config passed to resolveEntities is merged over - confirm.
32
+ */
33
+ export declare const DEFAULT_RESOLVER_CONFIG: ResolverConfig;
34
+ /**
35
+ * Resolves and merges entity detections from regex and NER into a single SpanMatch list. Takes both match lists, the anonymization policy and the original text; `config` is optional and may supply only some ResolverConfig fields.
36
+ */
37
+ export declare function resolveEntities(regexMatches: SpanMatch[], nerMatches: SpanMatch[], policy: AnonymizationPolicy, originalText: string, config?: Partial<ResolverConfig>): SpanMatch[];
38
+ /**
39
+ * Creates protected spans from regex matches, returned as an array of `{ start, end }` numeric ranges.
40
+ * Used to mask regex matches from NER to avoid double-detection. NOTE(review): whether `end` is inclusive or exclusive is not visible here - confirm.
41
+ */
42
+ export declare function createProtectedSpans(regexMatches: SpanMatch[]): Array<{
43
+ start: number;
44
+ end: number;
45
+ }>;
46
+ /**
47
+ * Checks if a span overlaps with any protected span; returns a boolean. NOTE(review): the name suggests containment while this description says overlap - confirm which one the implementation tests.
48
+ */
49
+ export declare function isInProtectedSpan(span: {
50
+ start: number;
51
+ end: number;
52
+ }, protectedSpans: Array<{
53
+ start: number;
54
+ end: number;
55
+ }>): boolean;
56
+ //# sourceMappingURL=resolver.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAEL,SAAS,EAET,mBAAmB,EAEpB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,oBAAY,eAAe;IACzB,wCAAwC;IACxC,cAAc,mBAAmB;IACjC,uBAAuB;IACvB,WAAW,gBAAgB;IAC3B,6BAA6B;IAC7B,iBAAiB,sBAAsB;IACvC,oCAAoC;IACpC,aAAa,kBAAkB;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,8CAA8C;IAC9C,eAAe,EAAE,eAAe,CAAC;IACjC,mDAAmD;IACnD,aAAa,EAAE,OAAO,CAAC;IACvB,2CAA2C;IAC3C,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,uBAAuB,EAAE,cAIrC,CAAC;AAEF;;GAEG;AACH,wBAAgB,eAAe,CAC7B,YAAY,EAAE,SAAS,EAAE,EACzB,UAAU,EAAE,SAAS,EAAE,EACvB,MAAM,EAAE,mBAAmB,EAC3B,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,OAAO,CAAC,cAAc,CAAM,GACnC,SAAS,EAAE,CAyBb;AAiOD;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,YAAY,EAAE,SAAS,EAAE,GACxB,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,CAEvC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,EACpC,cAAc,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,GACpD,OAAO,CAET"}