rehydra 0.1.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
package/dist/ner/tokenizer.js
@@ -0,0 +1,332 @@
+ /**
+  * HuggingFace Tokenizer
+  * Loads and uses tokenizers from HuggingFace's tokenizer.json format
+  * Supports Unigram (SentencePiece) and BPE tokenizers
+  */
+ /**
+  * Default tokenizer configuration
+  */
+ export const DEFAULT_TOKENIZER_CONFIG = {
+     maxLength: 512,
+     doLowerCase: false, // XLM-RoBERTa doesn't lowercase
+ };
+ /**
+  * WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
+  */
+ export class WordPieceTokenizer {
+     vocab;
+     inverseVocab;
+     config;
+     sortedVocab;
+     // Special token IDs (XLM-RoBERTa style)
+     clsId = 0; // <s>
+     sepId = 2; // </s>
+     padId = 1; // <pad>
+     unkId = 3; // <unk>
+     // Special token strings
+     clsToken = '<s>';
+     sepToken = '</s>';
+     padToken = '<pad>';
+     unkToken = '<unk>';
+     constructor(vocab, config = {}) {
+         this.vocab = vocab;
+         this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
+         // Build inverse vocab
+         this.inverseVocab = new Map();
+         for (const [token, id] of vocab) {
+             this.inverseVocab.set(id, token);
+         }
+         // Sort vocab by token length (longest first) for greedy matching
+         this.sortedVocab = Array.from(vocab.entries()).sort((a, b) => b[0].length - a[0].length);
+         // Try to detect special tokens from vocab
+         this.detectSpecialTokens();
+     }
+     /**
+      * Detect special tokens from vocabulary
+      */
+     detectSpecialTokens() {
+         // XLM-RoBERTa style
+         if (this.vocab.has('<s>')) {
+             this.clsToken = '<s>';
+             this.clsId = this.vocab.get('<s>') ?? 0;
+             this.sepToken = '</s>';
+             this.sepId = this.vocab.get('</s>') ?? 2;
+             this.padToken = '<pad>';
+             this.padId = this.vocab.get('<pad>') ?? 1;
+             this.unkToken = '<unk>';
+             this.unkId = this.vocab.get('<unk>') ?? 3;
+         }
+         // BERT style
+         else if (this.vocab.has('[CLS]')) {
+             this.clsToken = '[CLS]';
+             this.clsId = this.vocab.get('[CLS]') ?? 101;
+             this.sepToken = '[SEP]';
+             this.sepId = this.vocab.get('[SEP]') ?? 102;
+             this.padToken = '[PAD]';
+             this.padId = this.vocab.get('[PAD]') ?? 0;
+             this.unkToken = '[UNK]';
+             this.unkId = this.vocab.get('[UNK]') ?? 100;
+         }
+     }
+     /**
+      * Tokenizes text into tokens with offset tracking
+      */
+     tokenize(text) {
+         const tokens = [];
+         const tokenToCharSpan = [];
+         // Add CLS token
+         tokens.push({
+             id: this.clsId,
+             token: this.clsToken,
+             start: 0,
+             end: 0,
+             isContinuation: false,
+             isSpecial: true,
+         });
+         tokenToCharSpan.push(null);
+         // Preprocess text
+         const processedText = this.config.doLowerCase ? text.toLowerCase() : text;
+         // Tokenize using greedy longest-match
+         let pos = 0;
+         while (pos < processedText.length) {
+             // Skip whitespace
+             if (/\s/.test(processedText[pos])) {
+                 pos++;
+                 continue;
+             }
+             // Find the longest matching token starting at this position
+             const { token, id, length } = this.findBestToken(processedText, pos);
+             const isFirstOfWord = pos === 0 || /\s/.test(processedText[pos - 1]);
+             tokens.push({
+                 id,
+                 token,
+                 start: pos,
+                 end: pos + length,
+                 isContinuation: !isFirstOfWord && !token.startsWith('▁'),
+                 isSpecial: false,
+             });
+             tokenToCharSpan.push([pos, pos + length]);
+             pos += length;
+         }
+         // Add SEP token
+         tokens.push({
+             id: this.sepId,
+             token: this.sepToken,
+             start: text.length,
+             end: text.length,
+             isContinuation: false,
+             isSpecial: true,
+         });
+         tokenToCharSpan.push(null);
+         // Truncate if necessary
+         const maxTokens = this.config.maxLength;
+         if (tokens.length > maxTokens) {
+             tokens.length = maxTokens - 1;
+             tokenToCharSpan.length = maxTokens - 1;
+             tokens.push({
+                 id: this.sepId,
+                 token: this.sepToken,
+                 start: text.length,
+                 end: text.length,
+                 isContinuation: false,
+                 isSpecial: true,
+             });
+             tokenToCharSpan.push(null);
+         }
+         // Build arrays
+         const inputIds = tokens.map((t) => t.id);
+         const attentionMask = tokens.map(() => 1);
+         const tokenTypeIds = tokens.map(() => 0);
+         return {
+             tokens,
+             inputIds,
+             attentionMask,
+             tokenTypeIds,
+             tokenToCharSpan,
+         };
+     }
+     /**
+      * Find the best matching token using greedy longest-match
+      */
+     findBestToken(text, startPos) {
+         const remaining = text.slice(startPos);
+         // Check if this starts a new word (preceded by space or start)
+         const isWordStart = startPos === 0 || /\s/.test(text[startPos - 1]);
+         // For SentencePiece models, word-initial tokens start with ▁
+         if (isWordStart) {
+             // Try with ▁ prefix first
+             const withPrefix = '▁' + remaining;
+             for (const [vocabToken, id] of this.sortedVocab) {
+                 if (withPrefix.startsWith(vocabToken)) {
+                     // Return the match length without the ▁ since that's not in original text
+                     return {
+                         token: vocabToken,
+                         id,
+                         length: vocabToken.length - 1 // Subtract 1 for the ▁
+                     };
+                 }
+             }
+         }
+         // Try exact match without prefix
+         for (const [vocabToken, id] of this.sortedVocab) {
+             // Skip special tokens and tokens starting with ▁ for non-word-start positions
+             if (vocabToken.startsWith('<') || vocabToken.startsWith('['))
+                 continue;
+             if (!isWordStart && vocabToken.startsWith('▁'))
+                 continue;
+             if (remaining.startsWith(vocabToken.replace(/^▁/, ''))) {
+                 const matchLength = vocabToken.replace(/^▁/, '').length;
+                 if (matchLength > 0) {
+                     return { token: vocabToken, id, length: matchLength };
+                 }
+             }
+         }
+         // Single character fallback
+         const char = remaining[0];
+         const charId = this.vocab.get(char) ?? this.vocab.get('▁' + char) ?? this.unkId;
+         return { token: char, id: charId, length: 1 };
+     }
+     /**
+      * Decodes token IDs back to text
+      */
+     decode(tokenIds) {
+         const parts = [];
+         for (const id of tokenIds) {
+             const token = this.inverseVocab.get(id);
+             if (token === undefined)
+                 continue;
+             if (token === this.clsToken || token === this.sepToken || token === this.padToken)
+                 continue;
+             // SentencePiece uses ▁ to mark word boundaries
+             if (token.startsWith('▁')) {
+                 parts.push(' ' + token.slice(1));
+             }
+             else {
+                 parts.push(token);
+             }
+         }
+         return parts.join('').trim();
+     }
+     /**
+      * Gets vocabulary size
+      */
+     get vocabSize() {
+         return this.vocab.size;
+     }
+     /**
+      * Gets a token ID by string
+      */
+     getTokenId(token) {
+         return this.vocab.get(token);
+     }
+     /**
+      * Gets a token string by ID
+      */
+     getToken(id) {
+         return this.inverseVocab.get(id);
+     }
+ }
+ /**
+  * Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
+  * Uses storage abstraction for browser compatibility
+  */
+ export async function loadVocabFromFile(filePath) {
+     const { getStorageProvider } = await import('../utils/storage.js');
+     const storage = await getStorageProvider();
+     const content = await storage.readTextFile(filePath);
+     // Detect format
+     if (filePath.endsWith('.json') || content.trim().startsWith('{')) {
+         return parseHFTokenizerJson(content);
+     }
+     else {
+         return parseVocab(content);
+     }
+ }
+ /**
+  * Loads vocabulary from content string (for when content is already available)
+  */
+ export function loadVocabFromContent(content, format = 'json') {
+     if (format === 'json' || content.trim().startsWith('{')) {
+         return parseHFTokenizerJson(content);
+     }
+     else {
+         return parseVocab(content);
+     }
+ }
+ /**
+  * Parses HuggingFace tokenizer.json format
+  */
+ export function parseHFTokenizerJson(content) {
+     const vocab = new Map();
+     try {
+         const config = JSON.parse(content);
+         // Add special tokens first
+         if (Array.isArray(config.added_tokens)) {
+             for (const token of config.added_tokens) {
+                 vocab.set(token.content, token.id);
+             }
+         }
+         // Add model vocabulary
+         if (config.model !== undefined && config.model.vocab !== undefined) {
+             if (Array.isArray(config.model.vocab)) {
+                 // Unigram format: array of [token, score] pairs
+                 for (let i = 0; i < config.model.vocab.length; i++) {
+                     const entry = config.model.vocab[i];
+                     if (entry && typeof entry[0] === 'string') {
+                         vocab.set(entry[0], i);
+                     }
+                 }
+             }
+             else {
+                 // BPE/WordPiece format: object mapping token -> id
+                 for (const [token, id] of Object.entries(config.model.vocab)) {
+                     vocab.set(token, id);
+                 }
+             }
+         }
+     }
+     catch (e) {
+         throw new Error(`Failed to parse tokenizer.json: ${String(e)}`);
+     }
+     return vocab;
+ }
+ /**
+  * Parses vocabulary from string content (vocab.txt format)
+  */
+ export function parseVocab(content) {
+     const vocab = new Map();
+     const lines = content.split('\n');
+     for (let i = 0; i < lines.length; i++) {
+         const token = lines[i]?.trim();
+         if (token !== undefined && token.length > 0) {
+             vocab.set(token, i);
+         }
+     }
+     return vocab;
+ }
+ /**
+  * Creates a minimal vocabulary for testing
+  */
+ export function createTestVocab() {
+     const tokens = [
+         '<s>',
+         '<pad>',
+         '</s>',
+         '<unk>',
+         '▁Hello',
+         '▁John',
+         '▁Smith',
+         '▁from',
+         '▁Acme',
+         '▁Corp',
+         '▁in',
+         '▁Berlin',
+         '!',
+     ];
+     const vocab = new Map();
+     tokens.forEach((token, index) => {
+         vocab.set(token, index);
+     });
+     return vocab;
+ }
+ //# sourceMappingURL=tokenizer.js.map
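
For reference, a minimal usage sketch of the tokenizer above, assuming the dist module can be imported directly from the installed package (the import path is illustrative; the package's exports map is not part of this hunk):

    // Illustrative import path; adjust to however rehydra exposes dist/ner/tokenizer.js.
    import { WordPieceTokenizer, createTestVocab } from 'rehydra/dist/ner/tokenizer.js';

    // Build the small test vocabulary shipped with the module and tokenize a sentence.
    const tokenizer = new WordPieceTokenizer(createTestVocab());
    const result = tokenizer.tokenize('Hello John Smith from Acme Corp in Berlin!');

    // inputIds includes the <s> and </s> specials; tokenToCharSpan maps each
    // non-special token back to a [start, end] character span in the input.
    console.log(result.inputIds);
    console.log(result.tokenToCharSpan);

    // decode() drops special tokens and re-joins SentencePiece pieces on ▁ boundaries.
    console.log(tokenizer.decode(result.inputIds)); // 'Hello John Smith from Acme Corp in Berlin!'
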
package/dist/ner/tokenizer.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkEH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,KAAK,EAAE,gCAAgC;CACrD,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IACxB,WAAW,CAA0B;IAE7C,wCAAwC;IAChC,KAAK,GAAW,CAAC,CAAC,CAAE,MAAM;IAC1B,KAAK,GAAW,CAAC,CAAC,CAAE,OAAO;IAC3B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAC5B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAEpC,wBAAwB;IAChB,QAAQ,GAAW,KAAK,CAAC;IACzB,QAAQ,GAAW,MAAM,CAAC;IAC1B,QAAQ,GAAW,OAAO,CAAC;IAC3B,QAAQ,GAAW,OAAO,CAAC;IAEnC,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,iEAAiE;QACjE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEzF,0CAA0C;QAC1C,IAAI,CAAC,mBAAmB,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,oBAAoB;QACpB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;YACtB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC;YACvB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QACD,aAAa;aACR,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;QAC9C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAE1E,sCAAsC;QACtC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,OAAO,GAAG,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;YAClC,kBAAkB;YAClB,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAE,CAAC,EAAE,CAAC;gBACnC,GAAG,EAAE,CAAC;gBACN,SAAS;YACX,CAAC;YAED,4DAA4D;YAC5D,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;YAErE,MAAM,aAAa,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC,CAAC;YAEtE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,KAAK;gBACL,KAAK,EAAE,GAAG;gBACV,GAAG,EAAE,GAAG,GAAG,MAAM;gBACjB,cAAc,EAAE,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;gBACxD,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YAE1C,GAAG,IAAI,MAAM,CAAC;QAChB,CAAC;QAED,gBAAgB;QAChB,MAAM,CAAC,IA
AI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,QAAQ;gBACpB,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,QAAgB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEvC,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAE,CAAC,CAAC;QAErE,6DAA6D;QAC7D,IAAI,WAAW,EAAE,CAAC;YAChB,0BAA0B;YAC1B,MAAM,UAAU,GAAG,GAAG,GAAG,SAAS,CAAC;YACnC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBAChD,IAAI,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;oBACtC,0EAA0E;oBAC1E,OAAO;wBACL,KAAK,EAAE,UAAU;wBACjB,EAAE;wBACF,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,uBAAuB;qBACtD,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAChD,8EAA8E;YAC9E,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YACvE,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEzD,IAAI,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC;gBACxD,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;oBACpB,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;gBACxD,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC;QAChF,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAClC,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAE5F,+CAA+C;YAC/C,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB;IACtD,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IACnE,MAAM,OAAO,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAC3C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,YAAY,CAAC,QAAQ,CAAC,CAA
C;IAErD,gBAAgB;IAChB,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe,EAAE,SAAyB,MAAM;IACnF,IAAI,MAAM,KAAK,MAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACxD,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAsB,CAAC;QAExD,2BAA2B;QAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,CAAC,EAAE,CAAC;YACvC,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACxC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,MAAM,CAAC,KAAK,KAAK,SAAS,IAAI,MAAM,CAAC,KAAK,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;YACnE,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,gDAAgD;gBAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,QAAQ,EAAE,CAAC;wBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;oBACzB,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,mDAAmD;gBACnD,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7D,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAClE,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,KAAK;QACL,OAAO;QACP,MAAM;QACN,OAAO;QACP,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,OAAO;QACP,OAAO;QACP,OAAO;QACP,KAAK;QACL,SAAS;QACT,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
package/dist/pipeline/index.d.ts
@@ -0,0 +1,12 @@
+ /**
+  * Pipeline Module
+  * Exports all pipeline components
+  */
+ export * from "./prenormalize.js";
+ export * from "./resolver.js";
+ export * from "./tagger.js";
+ export * from "./validator.js";
+ export * from "./semantic-enricher.js";
+ export * from "./semantic-data-loader.js";
+ export * from "./title-extractor.js";
+ //# sourceMappingURL=index.d.ts.map
package/dist/pipeline/index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC;AAC/B,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,sBAAsB,CAAC"}
package/dist/pipeline/index.js
@@ -0,0 +1,12 @@
+ /**
+  * Pipeline Module
+  * Exports all pipeline components
+  */
+ export * from "./prenormalize.js";
+ export * from "./resolver.js";
+ export * from "./tagger.js";
+ export * from "./validator.js";
+ export * from "./semantic-enricher.js";
+ export * from "./semantic-data-loader.js";
+ export * from "./title-extractor.js";
+ //# sourceMappingURL=index.js.map
package/dist/pipeline/index.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC;AAC/B,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,sBAAsB,CAAC"}
package/dist/pipeline/prenormalize.d.ts
@@ -0,0 +1,48 @@
+ /**
+  * Pre-normalization
+  * Normalizes text before PII detection
+  */
+ /**
+  * Pre-normalization options
+  */
+ export interface PrenormalizeOptions {
+     /** Normalize line endings to \n */
+     normalizeLineEndings: boolean;
+     /** Apply Unicode NFKC normalization */
+     unicodeNormalize: boolean;
+     /** Trim leading/trailing whitespace */
+     trim: boolean;
+ }
+ /**
+  * Default pre-normalization options
+  */
+ export declare const DEFAULT_PRENORMALIZE_OPTIONS: PrenormalizeOptions;
+ /**
+  * Pre-normalizes text for PII detection
+  * Note: This currently only normalizes line endings to preserve character offsets
+  *
+  * @param text - Original input text
+  * @param options - Normalization options
+  * @returns Normalized text
+  */
+ export declare function prenormalize(text: string, options?: Partial<PrenormalizeOptions>): string;
+ /**
+  * Calculates offset adjustments when text is modified
+  * Used when prenormalization changes text length
+  */
+ export interface OffsetMapping {
+     /** Map from original offset to normalized offset */
+     toNormalized: (originalOffset: number) => number;
+     /** Map from normalized offset to original offset */
+     toOriginal: (normalizedOffset: number) => number;
+ }
+ /**
+  * Creates an identity offset mapping (no changes)
+  */
+ export declare function createIdentityMapping(): OffsetMapping;
+ /**
+  * Creates offset mapping for line ending normalization
+  * This handles \r\n -> \n replacement
+  */
+ export declare function createLineEndingMapping(originalText: string): OffsetMapping;
+ //# sourceMappingURL=prenormalize.d.ts.map
package/dist/pipeline/prenormalize.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"prenormalize.d.ts","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,oBAAoB,EAAE,OAAO,CAAC;IAC9B,uCAAuC;IACvC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,uCAAuC;IACvC,IAAI,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,eAAO,MAAM,4BAA4B,EAAE,mBAI1C,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,OAAO,CAAC,mBAAmB,CAAM,GACzC,MAAM,CAqBR;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC5B,oDAAoD;IACpD,YAAY,EAAE,CAAC,cAAc,EAAE,MAAM,KAAK,MAAM,CAAC;IACjD,oDAAoD;IACpD,UAAU,EAAE,CAAC,gBAAgB,EAAE,MAAM,KAAK,MAAM,CAAC;CAClD;AAED;;GAEG;AACH,wBAAgB,qBAAqB,IAAI,aAAa,CAKrD;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,YAAY,EAAE,MAAM,GAAG,aAAa,CA4C3E"}
package/dist/pipeline/prenormalize.js
@@ -0,0 +1,94 @@
+ /**
+  * Pre-normalization
+  * Normalizes text before PII detection
+  */
+ /**
+  * Default pre-normalization options
+  */
+ export const DEFAULT_PRENORMALIZE_OPTIONS = {
+     normalizeLineEndings: true,
+     unicodeNormalize: false, // Disabled by default to preserve offsets
+     trim: false, // Disabled by default to preserve offsets
+ };
+ /**
+  * Pre-normalizes text for PII detection
+  * Note: This currently only normalizes line endings to preserve character offsets
+  *
+  * @param text - Original input text
+  * @param options - Normalization options
+  * @returns Normalized text
+  */
+ export function prenormalize(text, options = {}) {
+     const opts = { ...DEFAULT_PRENORMALIZE_OPTIONS, ...options };
+     let result = text;
+     // Normalize line endings (\r\n -> \n, \r -> \n)
+     if (opts.normalizeLineEndings) {
+         result = result.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
+     }
+     // Note: Unicode normalization (NFKC) can change string length
+     // We skip it by default to preserve character offsets
+     // If needed, implement offset mapping
+     if (opts.unicodeNormalize) {
+         result = result.normalize('NFKC');
+     }
+     if (opts.trim) {
+         result = result.trim();
+     }
+     return result;
+ }
+ /**
+  * Creates an identity offset mapping (no changes)
+  */
+ export function createIdentityMapping() {
+     return {
+         toNormalized: (offset) => offset,
+         toOriginal: (offset) => offset,
+     };
+ }
+ /**
+  * Creates offset mapping for line ending normalization
+  * This handles \r\n -> \n replacement
+  */
+ export function createLineEndingMapping(originalText) {
+     // Find all \r\n positions
+     const crlfPositions = [];
+     for (let i = 0; i < originalText.length - 1; i++) {
+         if (originalText[i] === '\r' && originalText[i + 1] === '\n') {
+             crlfPositions.push(i);
+         }
+     }
+     if (crlfPositions.length === 0) {
+         return createIdentityMapping();
+     }
+     return {
+         toNormalized(originalOffset) {
+             // Count how many \r\n pairs are before this offset
+             let adjustment = 0;
+             for (const pos of crlfPositions) {
+                 if (pos < originalOffset) {
+                     adjustment++;
+                 }
+                 else {
+                     break;
+                 }
+             }
+             return originalOffset - adjustment;
+         },
+         toOriginal(normalizedOffset) {
+             // Add back the removed \r characters
+             let adjustment = 0;
+             let currentNormalized = 0;
+             for (const pos of crlfPositions) {
+                 if (currentNormalized + (pos - adjustment) <= normalizedOffset) {
+                     adjustment++;
+                     currentNormalized = pos - adjustment + 1;
+                 }
+                 else {
+                     break;
+                 }
+             }
+             return normalizedOffset + adjustment;
+         },
+     };
+ }
+ //# sourceMappingURL=prenormalize.js.map
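
A short sketch of how prenormalize and createLineEndingMapping compose, so spans detected in the normalized text can be reported against the original input (import path illustrative):

    // Illustrative import path.
    import { prenormalize, createLineEndingMapping } from 'rehydra/dist/pipeline/prenormalize.js';

    const original = 'line one\r\nline two\r\nemail: a@b.example';
    const normalized = prenormalize(original); // CRLF collapsed to LF by default

    // Map offsets found in the normalized text back to the original string.
    const mapping = createLineEndingMapping(original);
    const idxNormalized = normalized.indexOf('a@b.example');
    const idxOriginal = mapping.toOriginal(idxNormalized);

    console.log(original.slice(idxOriginal, idxOriginal + 'a@b.example'.length)); // 'a@b.example'
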
package/dist/pipeline/prenormalize.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"prenormalize.js","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAcH;;GAEG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAwB;IAC/D,oBAAoB,EAAE,IAAI;IAC1B,gBAAgB,EAAE,KAAK,EAAE,0CAA0C;IACnE,IAAI,EAAE,KAAK,EAAE,0CAA0C;CACxD,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,UAAwC,EAAE;IAE1C,MAAM,IAAI,GAAG,EAAE,GAAG,4BAA4B,EAAE,GAAG,OAAO,EAAE,CAAC;IAC7D,IAAI,MAAM,GAAG,IAAI,CAAC;IAElB,gDAAgD;IAChD,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;QAC9B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC9D,CAAC;IAED,8DAA8D;IAC9D,sDAAsD;IACtD,sCAAsC;IACtC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACd,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAaD;;GAEG;AACH,MAAM,UAAU,qBAAqB;IACnC,OAAO;QACL,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;QAChC,UAAU,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;KAC/B,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,YAAoB;IAC1D,0BAA0B;IAC1B,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,IAAI,YAAY,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7D,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,qBAAqB,EAAE,CAAC;IACjC,CAAC;IAED,OAAO;QACL,YAAY,CAAC,cAAsB;YACjC,mDAAmD;YACnD,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;oBACzB,UAAU,EAAE,CAAC;gBACf,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YACD,OAAO,cAAc,GAAG,UAAU,CAAC;QACrC,CAAC;QAED,UAAU,CAAC,gBAAwB;YACjC,qCAAqC;YACrC,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAE1B,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,iBAAiB,GAAG,CAAC,GAAG,GAAG,UAAU,CAAC,IAAI,gBAAgB,EAAE,CAAC;oBAC/D,UAAU,EAAE,CAAC;oBACb,iBAAiB,GAAG,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;gBAC3C,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YAED,OAAO,gBAAgB,GAAG,UAAU,CAAC;QACvC,CAAC;KACF,CAAC;AACJ,CAAC"}
package/dist/pipeline/resolver.d.ts
@@ -0,0 +1,56 @@
+ /**
+  * Entity Resolver
+  * Merges, deduplicates, and resolves overlapping entity detections
+  */
+ import { SpanMatch, AnonymizationPolicy } from '../types/index.js';
+ /**
+  * Resolution strategy for overlapping entities
+  */
+ export declare enum OverlapStrategy {
+     /** Regex matches always win over NER */
+     REGEX_PRIORITY = "REGEX_PRIORITY",
+     /** Longer span wins */
+     LONGER_SPAN = "LONGER_SPAN",
+     /** Higher confidence wins */
+     HIGHER_CONFIDENCE = "HIGHER_CONFIDENCE",
+     /** Use type priority from policy */
+     TYPE_PRIORITY = "TYPE_PRIORITY"
+ }
+ /**
+  * Entity resolver configuration
+  */
+ export interface ResolverConfig {
+     /** Primary strategy for overlap resolution */
+     overlapStrategy: OverlapStrategy;
+     /** Whether regex matches always take precedence */
+     regexPriority: boolean;
+     /** Minimum confidence to keep an entity */
+     minConfidence: number;
+ }
+ /**
+  * Default resolver configuration
+  */
+ export declare const DEFAULT_RESOLVER_CONFIG: ResolverConfig;
+ /**
+  * Resolves and merges entity detections from regex and NER
+  */
+ export declare function resolveEntities(regexMatches: SpanMatch[], nerMatches: SpanMatch[], policy: AnonymizationPolicy, originalText: string, config?: Partial<ResolverConfig>): SpanMatch[];
+ /**
+  * Creates protected spans from regex matches
+  * Used to mask regex matches from NER to avoid double-detection
+  */
+ export declare function createProtectedSpans(regexMatches: SpanMatch[]): Array<{
+     start: number;
+     end: number;
+ }>;
+ /**
+  * Checks if a span overlaps with any protected span
+  */
+ export declare function isInProtectedSpan(span: {
+     start: number;
+     end: number;
+ }, protectedSpans: Array<{
+     start: number;
+     end: number;
+ }>): boolean;
+ //# sourceMappingURL=resolver.d.ts.map
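
The protected-span helpers declared above operate on plain { start, end } objects, so their intended use can be sketched from the signatures alone; the expected results below follow from the documented overlap semantics, not from the implementation, which is not shown in this hunk (import path illustrative):

    // Illustrative import path.
    import { isInProtectedSpan } from 'rehydra/dist/pipeline/resolver.js';

    // Spans already claimed by regex recognizers (e.g. an email match at 10..25).
    const protectedSpans = [
        { start: 10, end: 25 },
        { start: 40, end: 52 },
    ];

    // A candidate NER span overlapping the first regex match should be reported as protected;
    // one that falls outside both regions should not.
    console.log(isInProtectedSpan({ start: 12, end: 20 }, protectedSpans)); // expected: true
    console.log(isInProtectedSpan({ start: 26, end: 35 }, protectedSpans)); // expected: false
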
package/dist/pipeline/resolver.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAEL,SAAS,EAET,mBAAmB,EAEpB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,oBAAY,eAAe;IACzB,wCAAwC;IACxC,cAAc,mBAAmB;IACjC,uBAAuB;IACvB,WAAW,gBAAgB;IAC3B,6BAA6B;IAC7B,iBAAiB,sBAAsB;IACvC,oCAAoC;IACpC,aAAa,kBAAkB;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,8CAA8C;IAC9C,eAAe,EAAE,eAAe,CAAC;IACjC,mDAAmD;IACnD,aAAa,EAAE,OAAO,CAAC;IACvB,2CAA2C;IAC3C,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,uBAAuB,EAAE,cAIrC,CAAC;AAEF;;GAEG;AACH,wBAAgB,eAAe,CAC7B,YAAY,EAAE,SAAS,EAAE,EACzB,UAAU,EAAE,SAAS,EAAE,EACvB,MAAM,EAAE,mBAAmB,EAC3B,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,OAAO,CAAC,cAAc,CAAM,GACnC,SAAS,EAAE,CAyBb;AAkOD;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,YAAY,EAAE,SAAS,EAAE,GACxB,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,CAEvC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,EACpC,cAAc,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,GACpD,OAAO,CAET"}