rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HuggingFace Tokenizer
|
|
3
|
+
* Loads and uses tokenizers from HuggingFace's tokenizer.json format
|
|
4
|
+
* Supports Unigram (SentencePiece) and BPE tokenizers
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Default tokenizer configuration.
 *
 * `maxLength` is the cap applied to the number of emitted tokens (special
 * tokens included); `doLowerCase` controls whether input text is lowercased
 * before matching.
 */
export const DEFAULT_TOKENIZER_CONFIG = {
    maxLength: 512,
    doLowerCase: false, // XLM-RoBERTa doesn't lowercase
};
|
|
13
|
+
/**
 * Greedy longest-match subword tokenizer over a `Map<token, id>` vocabulary.
 *
 * Despite the class name, matching is SentencePiece-aware: at the start of a
 * word the tokenizer first looks for a vocabulary entry carrying the '▁'
 * word-boundary marker. Special tokens are detected from the vocabulary
 * (XLM-RoBERTa `<s>` style first, BERT `[CLS]` style second).
 */
export class WordPieceTokenizer {
    vocab;
    inverseVocab;
    config;
    sortedVocab;
    // Special token IDs (XLM-RoBERTa style defaults, refined by detectSpecialTokens)
    clsId = 0; // <s>
    sepId = 2; // </s>
    padId = 1; // <pad>
    unkId = 3; // <unk>
    // Special token strings
    clsToken = '<s>';
    sepToken = '</s>';
    padToken = '<pad>';
    unkToken = '<unk>';
    /**
     * @param vocab - Map from token string to token id.
     * @param config - Partial configuration merged over DEFAULT_TOKENIZER_CONFIG.
     */
    constructor(vocab, config = {}) {
        this.vocab = vocab;
        this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
        // Build inverse vocab (id -> token) for decoding
        this.inverseVocab = new Map();
        for (const [token, id] of vocab) {
            this.inverseVocab.set(id, token);
        }
        // Sort vocab by token length (longest first) for greedy matching
        this.sortedVocab = Array.from(vocab.entries()).sort((a, b) => b[0].length - a[0].length);
        // Try to detect special tokens from vocab
        this.detectSpecialTokens();
    }
    /**
     * Detect special tokens from vocabulary.
     * Leaves the XLM-RoBERTa defaults in place when neither style is present.
     */
    detectSpecialTokens() {
        // XLM-RoBERTa style
        if (this.vocab.has('<s>')) {
            this.clsToken = '<s>';
            this.clsId = this.vocab.get('<s>') ?? 0;
            this.sepToken = '</s>';
            this.sepId = this.vocab.get('</s>') ?? 2;
            this.padToken = '<pad>';
            this.padId = this.vocab.get('<pad>') ?? 1;
            this.unkToken = '<unk>';
            this.unkId = this.vocab.get('<unk>') ?? 3;
        }
        // BERT style
        else if (this.vocab.has('[CLS]')) {
            this.clsToken = '[CLS]';
            this.clsId = this.vocab.get('[CLS]') ?? 101;
            this.sepToken = '[SEP]';
            this.sepId = this.vocab.get('[SEP]') ?? 102;
            this.padToken = '[PAD]';
            this.padId = this.vocab.get('[PAD]') ?? 0;
            this.unkToken = '[UNK]';
            this.unkId = this.vocab.get('[UNK]') ?? 100;
        }
    }
    /**
     * Tokenizes text into tokens with offset tracking.
     *
     * @param text - Input text.
     * @returns `tokens` (id, token, start/end offsets, flags), `inputIds`,
     *   `attentionMask` (all 1), `tokenTypeIds` (all 0) and `tokenToCharSpan`
     *   (`[start, end]` per token, `null` for special tokens).
     */
    tokenize(text) {
        const tokens = [];
        const tokenToCharSpan = [];
        // Special tokens carry a zero-width span and no char mapping.
        const pushSpecial = (id, token, offset) => {
            tokens.push({
                id,
                token,
                start: offset,
                end: offset,
                isContinuation: false,
                isSpecial: true,
            });
            tokenToCharSpan.push(null);
        };
        // Add CLS token
        pushSpecial(this.clsId, this.clsToken, 0);
        // Preprocess text
        const processedText = this.config.doLowerCase ? text.toLowerCase() : text;
        // Tokenize using greedy longest-match
        let pos = 0;
        while (pos < processedText.length) {
            // Skip whitespace
            if (/\s/.test(processedText[pos])) {
                pos++;
                continue;
            }
            // Find the longest matching token starting at this position
            const { token, id, length } = this.findBestToken(processedText, pos);
            const isFirstOfWord = pos === 0 || /\s/.test(processedText[pos - 1]);
            tokens.push({
                id,
                token,
                start: pos,
                end: pos + length,
                isContinuation: !isFirstOfWord && !token.startsWith('▁'),
                isSpecial: false,
            });
            tokenToCharSpan.push([pos, pos + length]);
            // findBestToken always reports length >= 1, so this loop terminates.
            pos += length;
        }
        // Add SEP token
        pushSpecial(this.sepId, this.sepToken, text.length);
        // Truncate if necessary, keeping a SEP as the final token
        const maxTokens = this.config.maxLength;
        if (tokens.length > maxTokens) {
            tokens.length = maxTokens - 1;
            tokenToCharSpan.length = maxTokens - 1;
            pushSpecial(this.sepId, this.sepToken, text.length);
        }
        // Build arrays
        const inputIds = tokens.map((t) => t.id);
        const attentionMask = tokens.map(() => 1);
        const tokenTypeIds = tokens.map(() => 0);
        return {
            tokens,
            inputIds,
            attentionMask,
            tokenTypeIds,
            tokenToCharSpan,
        };
    }
    /**
     * Find the best matching token using greedy longest-match.
     *
     * @param text - Full (preprocessed) text.
     * @param startPos - Offset in `text` at which to match.
     * @returns `{ token, id, length }` where `length` is the number of UTF-16
     *   code units of `text` consumed; it is always at least 1.
     */
    findBestToken(text, startPos) {
        const remaining = text.slice(startPos);
        // Check if this starts a new word (preceded by space or start)
        const isWordStart = startPos === 0 || /\s/.test(text[startPos - 1]);
        // For SentencePiece models, word-initial tokens start with ▁
        if (isWordStart) {
            // Try with ▁ prefix first
            const withPrefix = '▁' + remaining;
            for (const [vocabToken, id] of this.sortedVocab) {
                // Require '▁' plus at least one real character: the bare '▁'
                // entry (or an empty token) would consume 0 / -1 characters and
                // make tokenize() loop forever.
                if (vocabToken.length > 1 && withPrefix.startsWith(vocabToken)) {
                    // Return the match length without the ▁ since that's not in original text
                    return {
                        token: vocabToken,
                        id,
                        length: vocabToken.length - 1, // Subtract 1 for the ▁
                    };
                }
            }
        }
        // Try exact match without prefix
        for (const [vocabToken, id] of this.sortedVocab) {
            // Skip special tokens and tokens starting with ▁ for non-word-start positions
            if (vocabToken.startsWith('<') || vocabToken.startsWith('['))
                continue;
            if (!isWordStart && vocabToken.startsWith('▁'))
                continue;
            const bare = vocabToken.replace(/^▁/, '');
            if (bare.length > 0 && remaining.startsWith(bare)) {
                return { token: vocabToken, id, length: bare.length };
            }
        }
        // Single character fallback. String iteration yields whole code points,
        // so a surrogate pair (e.g. an emoji) stays one token instead of being
        // split into two unmatched halves.
        const [char] = remaining;
        const charId = this.vocab.get(char) ?? this.vocab.get('▁' + char) ?? this.unkId;
        return { token: char, id: charId, length: char === undefined ? 1 : char.length };
    }
    /**
     * Decodes token IDs back to text.
     * Unknown ids and CLS/SEP/PAD tokens are dropped; a leading '▁' becomes a space.
     *
     * @param tokenIds - Iterable of token ids.
     * @returns The reconstructed, trimmed text.
     */
    decode(tokenIds) {
        const parts = [];
        for (const id of tokenIds) {
            const token = this.inverseVocab.get(id);
            if (token === undefined)
                continue;
            if (token === this.clsToken || token === this.sepToken || token === this.padToken)
                continue;
            // SentencePiece uses ▁ to mark word boundaries
            if (token.startsWith('▁')) {
                parts.push(' ' + token.slice(1));
            }
            else {
                parts.push(token);
            }
        }
        return parts.join('').trim();
    }
    /**
     * Gets vocabulary size
     */
    get vocabSize() {
        return this.vocab.size;
    }
    /**
     * Gets a token ID by string (undefined when the token is not in the vocabulary)
     */
    getTokenId(token) {
        return this.vocab.get(token);
    }
    /**
     * Gets a token string by ID (undefined when the id is not in the vocabulary)
     */
    getToken(id) {
        return this.inverseVocab.get(id);
    }
}
|
|
229
|
+
/**
 * Reads a vocabulary file through the storage provider and parses it.
 *
 * The content is treated as HuggingFace tokenizer.json when the path ends in
 * `.json` or the trimmed content begins with `{`; otherwise it is parsed as a
 * one-token-per-line vocab.txt.
 *
 * @param filePath - Path handed to the storage provider's `readTextFile`.
 * @returns Promise resolving to the token -> id map.
 */
export async function loadVocabFromFile(filePath) {
    const { getStorageProvider } = await import('../utils/storage.js');
    const provider = await getStorageProvider();
    const raw = await provider.readTextFile(filePath);
    // Detect format
    const isPlainVocab = !filePath.endsWith('.json') && !raw.trim().startsWith('{');
    if (isPlainVocab) {
        return parseVocab(raw);
    }
    return parseHFTokenizerJson(raw);
}
|
|
245
|
+
/**
 * Parses a vocabulary from an in-memory string.
 *
 * @param content - Raw file content.
 * @param format - `'json'` (the default) selects the tokenizer.json parser;
 *   any other value selects the vocab.txt parser unless the trimmed content
 *   begins with `{`.
 * @returns The token -> id map.
 */
export function loadVocabFromContent(content, format = 'json') {
    const treatAsJson = format === 'json' || content.trim().startsWith('{');
    return treatAsJson ? parseHFTokenizerJson(content) : parseVocab(content);
}
|
|
256
|
+
/**
 * Parses HuggingFace tokenizer.json format into a token -> id map.
 *
 * `added_tokens` are inserted first, then `model.vocab`, which may be either an
 * array of `[token, score]` pairs (Unigram; the id is the array index) or an
 * object mapping token -> id (BPE/WordPiece). Malformed entries are skipped
 * rather than inserted, so the returned map only ever holds string keys and
 * integer ids.
 *
 * @param content - Raw JSON text of tokenizer.json.
 * @returns The token -> id map (empty when the file carries no vocabulary).
 * @throws Error prefixed with "Failed to parse tokenizer.json:" when the
 *   content is not valid JSON or its root is not an object.
 */
export function parseHFTokenizerJson(content) {
    const vocab = new Map();
    try {
        const config = JSON.parse(content);
        if (config === null || typeof config !== 'object') {
            throw new Error('root value is not a JSON object');
        }
        // Add special tokens first
        if (Array.isArray(config.added_tokens)) {
            for (const token of config.added_tokens) {
                // Guard against null / partial entries, which would otherwise
                // insert an undefined key or id (or abort the whole parse).
                if (token !== null &&
                    typeof token === 'object' &&
                    typeof token.content === 'string' &&
                    Number.isInteger(token.id)) {
                    vocab.set(token.content, token.id);
                }
            }
        }
        // Add model vocabulary
        const modelVocab = config.model?.vocab;
        if (Array.isArray(modelVocab)) {
            // Unigram format: array of [token, score] pairs
            modelVocab.forEach((entry, index) => {
                if (entry && typeof entry[0] === 'string') {
                    vocab.set(entry[0], index);
                }
            });
        }
        else if (modelVocab !== undefined && modelVocab !== null) {
            // BPE/WordPiece format: object mapping token -> id
            for (const [token, id] of Object.entries(modelVocab)) {
                if (Number.isInteger(id)) {
                    vocab.set(token, id);
                }
            }
        }
    }
    catch (e) {
        throw new Error(`Failed to parse tokenizer.json: ${String(e)}`);
    }
    return vocab;
}
|
|
293
|
+
/**
 * Builds a token -> id map from vocab.txt content (one token per line).
 *
 * Each line is trimmed; a token's id is its zero-based line index, so blank
 * lines are skipped but still advance the numbering.
 *
 * @param content - Raw text of the vocabulary file.
 * @returns The token -> id map.
 */
export function parseVocab(content) {
    const vocab = new Map();
    content.split('\n').forEach((rawLine, lineIndex) => {
        const token = rawLine.trim();
        if (token.length === 0) {
            return;
        }
        vocab.set(token, lineIndex);
    });
    return vocab;
}
|
|
307
|
+
/**
 * Builds a small fixed vocabulary for tests.
 *
 * Ids follow list order: the four XLM-RoBERTa style special tokens come first
 * (0-3), followed by a few '▁'-prefixed words and '!'.
 *
 * @returns A fresh token -> id map.
 */
export function createTestVocab() {
    const orderedTokens = [
        '<s>', '<pad>', '</s>', '<unk>',
        '▁Hello', '▁John', '▁Smith', '▁from',
        '▁Acme', '▁Corp', '▁in', '▁Berlin',
        '!',
    ];
    const vocab = new Map();
    for (const [id, token] of orderedTokens.entries()) {
        vocab.set(token, id);
    }
    return vocab;
}
|
|
332
|
+
//# sourceMappingURL=tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkEH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,KAAK,EAAE,gCAAgC;CACrD,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IACxB,WAAW,CAA0B;IAE7C,wCAAwC;IAChC,KAAK,GAAW,CAAC,CAAC,CAAE,MAAM;IAC1B,KAAK,GAAW,CAAC,CAAC,CAAE,OAAO;IAC3B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAC5B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAEpC,wBAAwB;IAChB,QAAQ,GAAW,KAAK,CAAC;IACzB,QAAQ,GAAW,MAAM,CAAC;IAC1B,QAAQ,GAAW,OAAO,CAAC;IAC3B,QAAQ,GAAW,OAAO,CAAC;IAEnC,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,iEAAiE;QACjE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEzF,0CAA0C;QAC1C,IAAI,CAAC,mBAAmB,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,oBAAoB;QACpB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;YACtB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC;YACvB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QACD,aAAa;aACR,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,
CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;QAC9C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAE1E,sCAAsC;QACtC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,OAAO,GAAG,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;YAClC,kBAAkB;YAClB,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAE,CAAC,EAAE,CAAC;gBACnC,GAAG,EAAE,CAAC;gBACN,SAAS;YACX,CAAC;YAED,4DAA4D;YAC5D,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;YAErE,MAAM,aAAa,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC,CAAC;YAEtE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,KAAK;gBACL,KAAK,EAAE,GAAG;gBACV,GAAG,EAAE,GAAG,GAAG,MAAM;gBACjB,cAAc,EAAE,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;gBACxD,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YAE1C,GAAG,IAAI,MAAM,CAAC;QAChB,CAAC;QAED,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,MAAM,CAAC,I
AAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,QAAQ;gBACpB,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,QAAgB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEvC,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAE,CAAC,CAAC;QAErE,6DAA6D;QAC7D,IAAI,WAAW,EAAE,CAAC;YAChB,0BAA0B;YAC1B,MAAM,UAAU,GAAG,GAAG,GAAG,SAAS,CAAC;YACnC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBAChD,IAAI,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;oBACtC,0EAA0E;oBAC1E,OAAO;wBACL,KAAK,EAAE,UAAU;wBACjB,EAAE;wBACF,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,uBAAuB;qBACtD,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAChD,8EAA8E;YAC9E,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YACvE,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEzD,IAAI,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC;gBACxD,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;oBACpB,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;gBACxD,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC;QAChF,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAChD,CAAC;IA
ED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAClC,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAE5F,+CAA+C;YAC/C,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB;IACtD,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IACnE,MAAM,OAAO,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAC3C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IAErD,gBAAgB;IAChB,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe,EAAE,SAAyB,MAAM;IACnF,IAAI,MAAM,KAAK,MAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACxD,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAsB,CAAC;QAExD,2BAA2B;QAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,CAAC,EAAE,CAAC;YACvC,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACxC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,MAA
M,CAAC,KAAK,KAAK,SAAS,IAAI,MAAM,CAAC,KAAK,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;YACnE,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,gDAAgD;gBAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,QAAQ,EAAE,CAAC;wBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;oBACzB,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,mDAAmD;gBACnD,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7D,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAClE,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,KAAK;QACL,OAAO;QACP,MAAM;QACN,OAAO;QACP,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,OAAO;QACP,OAAO;QACP,OAAO;QACP,KAAK;QACL,SAAS;QACT,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline Module
|
|
3
|
+
* Exports all pipeline components
|
|
4
|
+
*/
|
|
5
|
+
export * from "./prenormalize.js";
|
|
6
|
+
export * from "./resolver.js";
|
|
7
|
+
export * from "./tagger.js";
|
|
8
|
+
export * from "./validator.js";
|
|
9
|
+
export * from "./semantic-enricher.js";
|
|
10
|
+
export * from "./semantic-data-loader.js";
|
|
11
|
+
export * from "./title-extractor.js";
|
|
12
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC;AAC/B,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline Module
|
|
3
|
+
* Exports all pipeline components
|
|
4
|
+
*/
|
|
5
|
+
export * from "./prenormalize.js";
|
|
6
|
+
export * from "./resolver.js";
|
|
7
|
+
export * from "./tagger.js";
|
|
8
|
+
export * from "./validator.js";
|
|
9
|
+
export * from "./semantic-enricher.js";
|
|
10
|
+
export * from "./semantic-data-loader.js";
|
|
11
|
+
export * from "./title-extractor.js";
|
|
12
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/pipeline/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,mBAAmB,CAAC;AAClC,cAAc,eAAe,CAAC;AAC9B,cAAc,aAAa,CAAC;AAC5B,cAAc,gBAAgB,CAAC;AAC/B,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-normalization
|
|
3
|
+
* Normalizes text before PII detection
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Pre-normalization options.
 *
 * Three independent boolean switches, one per normalization step.
 */
export interface PrenormalizeOptions {
    /** Normalize line endings to \n */
    normalizeLineEndings: boolean;
    /** Apply Unicode NFKC normalization */
    unicodeNormalize: boolean;
    /** Trim leading/trailing whitespace */
    trim: boolean;
}
|
|
16
|
+
/**
|
|
17
|
+
* Default pre-normalization options
|
|
18
|
+
*/
|
|
19
|
+
export declare const DEFAULT_PRENORMALIZE_OPTIONS: PrenormalizeOptions;
|
|
20
|
+
/**
|
|
21
|
+
* Pre-normalizes text for PII detection
|
|
22
|
+
* Note: This currently only normalizes line endings to preserve character offsets
|
|
23
|
+
*
|
|
24
|
+
* @param text - Original input text
|
|
25
|
+
* @param options - Normalization options
|
|
26
|
+
* @returns Normalized text
|
|
27
|
+
*/
|
|
28
|
+
export declare function prenormalize(text: string, options?: Partial<PrenormalizeOptions>): string;
|
|
29
|
+
/**
 * Pair of functions that translate character offsets between the original
 * text and its normalized form.
 * Used when prenormalization changes text length
 */
export interface OffsetMapping {
    /** Map from original offset to normalized offset */
    toNormalized: (originalOffset: number) => number;
    /** Map from normalized offset to original offset */
    toOriginal: (normalizedOffset: number) => number;
}
|
|
39
|
+
/**
|
|
40
|
+
* Creates an identity offset mapping: both directions return the offset unchanged
|
|
41
|
+
*/
|
|
42
|
+
export declare function createIdentityMapping(): OffsetMapping;
|
|
43
|
+
/**
|
|
44
|
+
* Creates offset mapping for line ending normalization
|
|
45
|
+
* This handles \r\n -> \n replacement only; a lone \r rewritten to \n keeps the length and needs no adjustment
|
|
46
|
+
*/
|
|
47
|
+
export declare function createLineEndingMapping(originalText: string): OffsetMapping;
|
|
48
|
+
//# sourceMappingURL=prenormalize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prenormalize.d.ts","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,oBAAoB,EAAE,OAAO,CAAC;IAC9B,uCAAuC;IACvC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,uCAAuC;IACvC,IAAI,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,eAAO,MAAM,4BAA4B,EAAE,mBAI1C,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,OAAO,CAAC,mBAAmB,CAAM,GACzC,MAAM,CAqBR;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC5B,oDAAoD;IACpD,YAAY,EAAE,CAAC,cAAc,EAAE,MAAM,KAAK,MAAM,CAAC;IACjD,oDAAoD;IACpD,UAAU,EAAE,CAAC,gBAAgB,EAAE,MAAM,KAAK,MAAM,CAAC;CAClD;AAED;;GAEG;AACH,wBAAgB,qBAAqB,IAAI,aAAa,CAKrD;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,YAAY,EAAE,MAAM,GAAG,aAAa,CA4C3E"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-normalization
|
|
3
|
+
* Normalizes text before PII detection
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Default pre-normalization options (only line-ending normalization is switched on)
|
|
7
|
+
*/
|
|
8
|
+
export const DEFAULT_PRENORMALIZE_OPTIONS = {
|
|
9
|
+
normalizeLineEndings: true,
|
|
10
|
+
unicodeNormalize: false, // Disabled by default: NFKC can change string length and shift offsets
|
|
11
|
+
trim: false, // Disabled by default: removing leading whitespace would shift offsets
|
|
12
|
+
};
|
|
13
|
+
/**
 * Pre-normalizes text before PII detection.
 *
 * The enabled steps run in a fixed order: line endings, then Unicode NFKC,
 * then trim. With the default options only line endings are normalized.
 * Collapsing "\r\n" to "\n" shortens the text by one character per pair, and
 * NFKC or trimming can change the length as well, so offsets into the result
 * may differ from offsets into the input.
 *
 * @param text - Original input text
 * @param options - Overrides merged on top of DEFAULT_PRENORMALIZE_OPTIONS
 * @returns Normalized text
 */
export function prenormalize(text, options = {}) {
    const { normalizeLineEndings, unicodeNormalize, trim } = {
        ...DEFAULT_PRENORMALIZE_OPTIONS,
        ...options,
    };

    // "\r\n" is collapsed first so that its "\r" is not rewritten a second time
    const unixNewlines = normalizeLineEndings
        ? text.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
        : text;

    // NFKC may change the string length; it is opt-in for that reason
    const composed = unicodeNormalize ? unixNewlines.normalize('NFKC') : unixNewlines;

    return trim ? composed.trim() : composed;
}
|
|
39
|
+
/**
 * Builds an offset mapping for text whose length was not changed:
 * both directions hand the offset back untouched.
 *
 * @returns Mapping whose toNormalized and toOriginal are the identity function
 */
export function createIdentityMapping() {
    const passThrough = (offset) => offset;
    return { toNormalized: passThrough, toOriginal: passThrough };
}
|
|
48
|
+
/**
 * Creates an offset mapping for "\r\n" -> "\n" line-ending normalization.
 *
 * Only "\r\n" pairs are tracked, because each one removes a character.
 * A lone "\r" that is rewritten to "\n" keeps its position and needs no
 * adjustment.
 *
 * Conventions:
 * - toNormalized: both characters of a "\r\n" pair map to the single
 *   normalized "\n".
 * - toOriginal: a normalized "\n" that came from a pair maps back to the
 *   original "\n" (the second character of the pair), so
 *   toNormalized(toOriginal(n)) === n for every normalized offset n.
 *
 * @param originalText - Text before line-ending normalization
 * @returns Mapping between offsets in originalText and in its normalized form
 */
export function createLineEndingMapping(originalText) {
    // Original indices of the "\r" of every "\r\n" pair, in ascending order
    const crlfPositions = [];
    let found = originalText.indexOf('\r\n');
    while (found !== -1) {
        crlfPositions.push(found);
        // Pairs cannot overlap, so resume the search after this pair
        found = originalText.indexOf('\r\n', found + 2);
    }
    if (crlfPositions.length === 0) {
        return createIdentityMapping();
    }
    return {
        toNormalized(originalOffset) {
            // Every pair that starts before the offset has lost its "\r"
            let removed = 0;
            for (const pos of crlfPositions) {
                if (pos < originalOffset) {
                    removed++;
                }
                else {
                    break;
                }
            }
            return originalOffset - removed;
        },
        toOriginal(normalizedOffset) {
            // Add back the "\r" of every pair located at or before the offset.
            // With `restored` pairs already counted, the next pair's "\n" sits at
            // (pos - restored) in the normalized text. These indices are strictly
            // increasing, so the scan can stop at the first pair past the offset.
            let restored = 0;
            for (const pos of crlfPositions) {
                if (pos - restored <= normalizedOffset) {
                    restored++;
                }
                else {
                    break;
                }
            }
            return normalizedOffset + restored;
        },
    };
}
|
|
94
|
+
//# sourceMappingURL=prenormalize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prenormalize.js","sourceRoot":"","sources":["../../src/pipeline/prenormalize.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAcH;;GAEG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAwB;IAC/D,oBAAoB,EAAE,IAAI;IAC1B,gBAAgB,EAAE,KAAK,EAAE,0CAA0C;IACnE,IAAI,EAAE,KAAK,EAAE,0CAA0C;CACxD,CAAC;AAEF;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,UAAwC,EAAE;IAE1C,MAAM,IAAI,GAAG,EAAE,GAAG,4BAA4B,EAAE,GAAG,OAAO,EAAE,CAAC;IAC7D,IAAI,MAAM,GAAG,IAAI,CAAC;IAElB,gDAAgD;IAChD,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;QAC9B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC9D,CAAC;IAED,8DAA8D;IAC9D,sDAAsD;IACtD,sCAAsC;IACtC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC1B,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACd,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAaD;;GAEG;AACH,MAAM,UAAU,qBAAqB;IACnC,OAAO;QACL,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;QAChC,UAAU,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM;KAC/B,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,YAAoB;IAC1D,0BAA0B;IAC1B,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,IAAI,YAAY,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7D,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,qBAAqB,EAAE,CAAC;IACjC,CAAC;IAED,OAAO;QACL,YAAY,CAAC,cAAsB;YACjC,mDAAmD;YACnD,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;oBACzB,UAAU,EAAE,CAAC;gBACf,CAAC;qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YACD,OAAO,cAAc,GAAG,UAAU,CAAC;QACrC,CAAC;QAED,UAAU,CAAC,gBAAwB;YACjC,qCAAqC;YACrC,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,IAAI,iBAAiB,GAAG,CAAC,CAAC;YAE1B,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,iBAAiB,GAAG,CAAC,GAAG,GAAG,UAAU,CAAC,IAAI,gBAAgB,EAAE,CAAC;oBAC/D,UAAU,EAAE,CAAC;oBACb,iBAAiB,GAAG,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;gBAC3C,CAAC;
qBAAM,CAAC;oBACN,MAAM;gBACR,CAAC;YACH,CAAC;YAED,OAAO,gBAAgB,GAAG,UAAU,CAAC;QACvC,CAAC;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity Resolver
|
|
3
|
+
* Merges, deduplicates, and resolves overlapping entity detections
|
|
4
|
+
*/
|
|
5
|
+
import { SpanMatch, AnonymizationPolicy } from '../types/index.js';
|
|
6
|
+
/**
|
|
7
|
+
* Resolution strategy for overlapping entities (string enum; each member's value equals its name)
|
|
8
|
+
*/
|
|
9
|
+
export declare enum OverlapStrategy {
|
|
10
|
+
/** Regex matches always win over NER */
|
|
11
|
+
REGEX_PRIORITY = "REGEX_PRIORITY",
|
|
12
|
+
/** Longer span wins */
|
|
13
|
+
LONGER_SPAN = "LONGER_SPAN",
|
|
14
|
+
/** Higher confidence wins */
|
|
15
|
+
HIGHER_CONFIDENCE = "HIGHER_CONFIDENCE",
|
|
16
|
+
/** Use type priority from policy */
|
|
17
|
+
TYPE_PRIORITY = "TYPE_PRIORITY"
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Entity resolver configuration
|
|
21
|
+
*/
|
|
22
|
+
export interface ResolverConfig {
|
|
23
|
+
/** Primary strategy for overlap resolution */
|
|
24
|
+
overlapStrategy: OverlapStrategy;
|
|
25
|
+
/** Whether regex matches always take precedence. NOTE(review): how this flag interacts with OverlapStrategy.REGEX_PRIORITY is not visible in this declaration - confirm against the implementation */
|
|
26
|
+
regexPriority: boolean;
|
|
27
|
+
/** Minimum confidence to keep an entity */
|
|
28
|
+
minConfidence: number;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Default resolver configuration (the concrete values are not part of this declaration file)
|
|
32
|
+
*/
|
|
33
|
+
export declare const DEFAULT_RESOLVER_CONFIG: ResolverConfig;
|
|
34
|
+
/**
|
|
35
|
+
* Resolves and merges entity detections from regex and NER into one SpanMatch list; config is an optional, partial ResolverConfig
|
|
36
|
+
*/
|
|
37
|
+
export declare function resolveEntities(regexMatches: SpanMatch[], nerMatches: SpanMatch[], policy: AnonymizationPolicy, originalText: string, config?: Partial<ResolverConfig>): SpanMatch[];
|
|
38
|
+
/**
|
|
39
|
+
* Creates protected spans ({ start, end } offset pairs) from regex matches
|
|
40
|
+
* Used to mask regex matches from NER to avoid double-detection
|
|
41
|
+
*/
|
|
42
|
+
export declare function createProtectedSpans(regexMatches: SpanMatch[]): Array<{
|
|
43
|
+
start: number;
|
|
44
|
+
end: number;
|
|
45
|
+
}>;
|
|
46
|
+
/**
|
|
47
|
+
* Checks if a span overlaps with any protected span (NOTE(review): whether `end` is inclusive or exclusive is not visible in this declaration - confirm)
|
|
48
|
+
*/
|
|
49
|
+
export declare function isInProtectedSpan(span: {
|
|
50
|
+
start: number;
|
|
51
|
+
end: number;
|
|
52
|
+
}, protectedSpans: Array<{
|
|
53
|
+
start: number;
|
|
54
|
+
end: number;
|
|
55
|
+
}>): boolean;
|
|
56
|
+
//# sourceMappingURL=resolver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAEL,SAAS,EAET,mBAAmB,EAEpB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,oBAAY,eAAe;IACzB,wCAAwC;IACxC,cAAc,mBAAmB;IACjC,uBAAuB;IACvB,WAAW,gBAAgB;IAC3B,6BAA6B;IAC7B,iBAAiB,sBAAsB;IACvC,oCAAoC;IACpC,aAAa,kBAAkB;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,8CAA8C;IAC9C,eAAe,EAAE,eAAe,CAAC;IACjC,mDAAmD;IACnD,aAAa,EAAE,OAAO,CAAC;IACvB,2CAA2C;IAC3C,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,uBAAuB,EAAE,cAIrC,CAAC;AAEF;;GAEG;AACH,wBAAgB,eAAe,CAC7B,YAAY,EAAE,SAAS,EAAE,EACzB,UAAU,EAAE,SAAS,EAAE,EACvB,MAAM,EAAE,mBAAmB,EAC3B,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,OAAO,CAAC,cAAc,CAAM,GACnC,SAAS,EAAE,CAyBb;AAkOD;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,YAAY,EAAE,SAAS,EAAE,GACxB,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,CAEvC;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,EACpC,cAAc,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,GACpD,OAAO,CAET"}
|