rehydra 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
@@ -0,0 +1,253 @@
1
+ /**
2
+ * NER Model Wrapper
3
+ * ONNX Runtime integration for Named Entity Recognition
4
+ * Supports both onnxruntime-node and onnxruntime-web
5
+ */
6
+ import { loadRuntime } from "./onnx-runtime.js";
7
+ import { WordPieceTokenizer, loadVocabFromFile, } from "./tokenizer.js";
8
+ import { decodeBIOTags, convertToSpanMatches, cleanupSpanBoundaries, mergeAdjacentSpans, } from "./bio-decoder.js";
9
+ import { getStorageProvider, isBrowser } from "../utils/storage.js";
10
/**
 * Default label map for common NER models (CoNLL-style BIO tags).
 *
 * createNERModel() falls back to this list when the caller supplies no
 * `labelMap`. NERModel.processLogits() looks labels up by index, so the
 * position of each entry must match the model's output class index.
 */
export const DEFAULT_LABEL_MAP = [
    // Index 0
    "O",
    // Indices 1-2
    "B-PER", "I-PER",
    // Indices 3-4
    "B-ORG", "I-ORG",
    // Indices 5-6
    "B-LOC", "I-LOC",
    // Indices 7-8
    "B-MISC", "I-MISC",
];
24
/**
 * NER model wrapper for ONNX inference.
 *
 * Owns the ONNX runtime module, an inference session and a WordPiece
 * tokenizer. Call load() before predict(), and dispose() when finished.
 */
export class NERModel {
    /** ONNX runtime module returned by loadRuntime(); null until load() runs. */
    ort = null;
    /** Inference session created by load(); null while not loaded. */
    session = null;
    /** WordPieceTokenizer built from the configured vocabulary; null while not loaded. */
    tokenizer = null;
    /** Configuration object handed to the constructor (stored by reference). */
    config;
    /** True after load() completes and until dispose() is called. */
    isLoaded = false;

    /**
     * @param config Object providing modelPath, vocabPath, labelMap,
     *   maxLength, doLowerCase and modelVersion (see createNERModel()).
     */
    constructor(config) {
        this.config = config;
    }

    /**
     * Loads the ONNX runtime, the model session and the tokenizer.
     * Returns immediately when the model is already loaded.
     */
    async load() {
        if (this.isLoaded)
            return;
        // Load ONNX runtime (auto-detects best runtime for environment)
        this.ort = await loadRuntime();
        // In browsers the model is read through the storage provider and passed
        // as bytes, because file paths do not work there; in Node.js the path is
        // handed to the runtime directly.
        if (isBrowser()) {
            const storage = await getStorageProvider();
            const modelData = await storage.readFile(this.config.modelPath);
            this.session = await this.ort.InferenceSession.create(modelData);
        }
        else {
            this.session = await this.ort.InferenceSession.create(this.config.modelPath);
        }
        // Load tokenizer vocabulary
        const vocab = await loadVocabFromFile(this.config.vocabPath);
        this.tokenizer = new WordPieceTokenizer(vocab, {
            maxLength: this.config.maxLength,
            doLowerCase: this.config.doLowerCase,
        });
        this.isLoaded = true;
    }

    /**
     * Predicts entities in text.
     *
     * @param text Input text to analyse.
     * @param policy Optional policy. When given, its `enabledTypes` and
     *   `nerEnabledTypes` sets filter the spans and its
     *   `confidenceThresholds` map feeds getMinConfidence().
     * @returns Promise of `{ spans, processingTimeMs, modelVersion }`.
     * @throws Error (as a rejected promise) when load() has not completed.
     */
    async predict(text, policy) {
        const startTime = performance.now();
        if (!this.isLoaded || this.session === null || this.tokenizer === null) {
            throw new Error("Model not loaded. Call load() first.");
        }
        // Tokenize input
        const tokenization = this.tokenizer.tokenize(text);
        // Run inference
        const { labels, confidences } = await this.runInference(tokenization);
        // Decode BIO tags to entities
        const rawEntities = decodeBIOTags(tokenization.tokens, labels, confidences, text);
        // Convert to SpanMatch format with confidence filtering.
        // NOTE(review): only the lowest NER threshold is applied here; per-type
        // thresholds are presumably enforced elsewhere - confirm.
        const minConfidence = this.getMinConfidence(policy);
        let spans = convertToSpanMatches(rawEntities, minConfidence);
        // Post-process spans
        spans = cleanupSpanBoundaries(spans, text);
        spans = mergeAdjacentSpans(spans, text);
        // Keep only types enabled both globally and for NER
        if (policy !== undefined) {
            spans = spans.filter((span) => policy.enabledTypes.has(span.type) &&
                policy.nerEnabledTypes.has(span.type));
        }
        const endTime = performance.now();
        return {
            spans,
            processingTimeMs: endTime - startTime,
            modelVersion: this.config.modelVersion,
        };
    }

    /**
     * Runs ONNX inference for one tokenized input.
     *
     * @param tokenization Object with `inputIds`, `attentionMask` and
     *   `tokenTypeIds` number arrays.
     * @returns `{ labels, confidences }` with one entry per input id.
     * @throws Error when the session/runtime is missing or the model yields no output.
     */
    async runInference(tokenization) {
        if (this.session === null || this.ort === null) {
            throw new Error("Session not initialized");
        }
        const { session, ort } = this;
        const seqLength = tokenization.inputIds.length;
        // Every input is fed as an int64 tensor of dims [1, seqLength]
        const toInt64Tensor = (values) => new ort.Tensor("int64", BigInt64Array.from(values.map(BigInt)), [1, seqLength]);
        const feeds = {
            input_ids: toInt64Tensor(tokenization.inputIds),
            attention_mask: toInt64Tensor(tokenization.attentionMask),
        };
        // Some models also need token_type_ids; the tensor is only built on demand
        if (session.inputNames.includes("token_type_ids")) {
            feeds["token_type_ids"] = toInt64Tensor(tokenization.tokenTypeIds);
        }
        const results = await session.run(feeds);
        // The first declared output is treated as the logits
        const outputName = session.outputNames[0];
        if (outputName === undefined) {
            throw new Error("No output from model");
        }
        const logits = results[outputName];
        if (logits === undefined) {
            throw new Error("Logits output not found");
        }
        return this.processLogits(logits, seqLength);
    }

    /**
     * Converts model logits into a label and a confidence per token.
     *
     * The per-token stride is taken from the tensor's last dimension so the
     * rows stay aligned even when the model emits a different number of
     * classes than `config.labelMap` contains; it falls back to the label-map
     * length when `dims` is unavailable. Class indices without a label map
     * entry are reported as "O".
     *
     * @param logits Tensor-like object with a flat `data` buffer and optional `dims`.
     * @param seqLength Number of tokens to read.
     * @returns `{ labels, confidences }`, each of length `seqLength`.
     */
    processLogits(logits, seqLength) {
        const data = logits.data;
        const labelMap = this.config.labelMap;
        const lastDim = logits.dims?.[logits.dims.length - 1];
        const numLabels = Number.isInteger(lastDim) && lastDim > 0 ? lastDim : labelMap.length;
        const labels = [];
        const confidences = [];
        for (let i = 0; i < seqLength; i++) {
            // Collect this token's logits; positions past the buffer count as 0
            const tokenLogits = [];
            for (let j = 0; j < numLabels; j++) {
                tokenLogits.push(data[i * numLabels + j] ?? 0);
            }
            const probs = softmax(tokenLogits);
            // Argmax; the first index wins on ties
            let maxIdx = 0;
            let maxProb = probs[0] ?? 0;
            for (let j = 1; j < probs.length; j++) {
                const prob = probs[j] ?? 0;
                if (prob > maxProb) {
                    maxProb = prob;
                    maxIdx = j;
                }
            }
            labels.push(labelMap[maxIdx] ?? "O");
            confidences.push(maxProb);
        }
        return { labels, confidences };
    }

    /**
     * Gets the minimum confidence threshold from a policy.
     *
     * @param policy Optional policy with `nerEnabledTypes` (iterable) and
     *   `confidenceThresholds` (object with `get`).
     * @returns 0.5 without a policy; otherwise the lowest threshold across the
     *   NER-enabled types, where a type without an entry counts as 0.5 and an
     *   empty type set yields 1.0.
     */
    getMinConfidence(policy) {
        if (policy === undefined)
            return 0.5;
        let minThreshold = 1.0;
        for (const type of policy.nerEnabledTypes) {
            const threshold = policy.confidenceThresholds.get(type) ?? 0.5;
            if (threshold < minThreshold) {
                minThreshold = threshold;
            }
        }
        return minThreshold;
    }

    /**
     * Gets model version
     */
    get version() {
        return this.config.modelVersion;
    }

    /**
     * Checks if model is loaded
     */
    get loaded() {
        return this.isLoaded;
    }

    /**
     * Disposes of model resources.
     *
     * References are cleared synchronously, then the session's `release()`
     * method is awaited when the runtime provides one, so native/WASM
     * resources are freed instead of waiting for garbage collection.
     *
     * @returns Promise that settles once the session has been released; it
     *   rejects if `release()` fails (the references are cleared regardless).
     */
    async dispose() {
        const session = this.session;
        this.session = null;
        this.tokenizer = null;
        this.isLoaded = false;
        if (session !== null && typeof session.release === "function") {
            await session.release();
        }
    }
}
202
/**
 * Softmax: turns a list of logits into probabilities.
 *
 * The largest logit is subtracted before exponentiating so that Math.exp
 * never receives a large positive argument.
 *
 * @param logits Array of numbers.
 * @returns Array of the same length; empty input yields an empty array.
 */
function softmax(logits) {
    let peak = -Infinity;
    for (const value of logits) {
        peak = Math.max(peak, value);
    }
    const weights = logits.map((value) => Math.exp(value - peak));
    let total = 0;
    for (const weight of weights) {
        total += weight;
    }
    return weights.map((weight) => weight / total);
}
211
/**
 * Builds a NERModel from a partial configuration.
 *
 * `modelPath` and `vocabPath` are copied as given. Any other option that is
 * null or undefined receives a default: DEFAULT_LABEL_MAP, maxLength 512,
 * doLowerCase false (XLM-RoBERTa is cased) and modelVersion "1.0.0".
 *
 * @param config Options object.
 * @returns A new, not yet loaded NERModel.
 */
export function createNERModel(config) {
    const { modelPath, vocabPath } = config;
    return new NERModel({
        modelPath,
        vocabPath,
        labelMap: config.labelMap ?? DEFAULT_LABEL_MAP,
        maxLength: config.maxLength ?? 512,
        doLowerCase: config.doLowerCase ?? false,
        modelVersion: config.modelVersion ?? "1.0.0",
    });
}
225
/**
 * Stand-in used when no NER model is available (for example regex-only mode).
 * It reports itself as loaded and every prediction contains no spans.
 */
export class NERModelStub {
    version = "stub-1.0.0";
    loaded = true;

    /** Nothing to load; resolves immediately. */
    load() {
        return Promise.resolve();
    }

    /**
     * Ignores both arguments and resolves with an empty result carrying the
     * stub's version.
     */
    async predict(_text, _policy) {
        const emptyResult = {
            spans: [],
            processingTimeMs: 0,
            modelVersion: this.version,
        };
        return emptyResult;
    }

    /** Nothing to release; resolves immediately. */
    async dispose() {
        // No-op
    }
}
247
/**
 * Factory for the no-op NER model (testing or regex-only mode).
 *
 * @returns A fresh NERModelStub instance on every call.
 */
export function createNERModelStub() {
    const stub = new NERModelStub();
    return stub;
}
253
+ //# sourceMappingURL=ner-model.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAElB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAC1B,OAAO,EAAE,kBAAkB,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAgCpE;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAY,IAAI,CAAC;IACxB,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,oFAAoF;QACpF,4FAA4F;QAC5F,IAAI,SAAS,EAAE,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,MAAM,kBAAkB,EAAE,CAAC;YAC3C,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;YAChE,8CAA8C;YAC9C,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACnE,CAAC;aAAM,CAAC;YACN,gDAAgD;YAChD,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CACnD,IAAI,CAAC,MAAM,CAAC,SAAS,CACtB,CAAC;QACJ,CAAC;QAED,0EAA0E;QAC1E,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAA
Y,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC;gBAClC,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CACxC,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAMpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,s
BAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC;QACzB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,OAAO;QACL,+EAA+E;QAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACpB,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;QAC
tB,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;IAC3B,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,MAA0E;IAE1E,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,KAAK,EAAE,uBAAuB;QACjE,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,OAAO,CACL,KAAa,EACb,OAA6B;QAE7B,OAAO,OAAO,CAAC,OAAO,CAAC;YACrB,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,QAAQ;QACR,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;IAC3B,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
@@ -0,0 +1,46 @@
1
+ /**
2
+ * ONNX Runtime Abstraction
3
+ * Allows switching between onnxruntime-node and onnxruntime-web
4
+ *
5
+ * In browsers without a bundler, automatically loads onnxruntime-web from CDN
6
+ */
7
+ /** Structural subset of an ONNX Runtime tensor that this package relies on. */ export interface OrtTensor {
8
+ /** Typed-array element buffer; NERModel.processLogits indexes it as one flat array. */ data: Float32Array | BigInt64Array | Int32Array;
9
+ /** Tensor dimensions; NERModel.runInference builds its inputs with dims [1, sequenceLength]. */ dims: readonly number[];
10
+ }
11
+ /** Structural subset of an ONNX Runtime inference session used by NERModel. */ export interface OrtSession {
12
+ /** Names of the model inputs; NERModel checks this list for "token_type_ids". */ inputNames: readonly string[];
13
+ /** Names of the model outputs; NERModel treats the first entry as the logits output. */ outputNames: readonly string[];
14
+ /** Runs the model on tensors keyed by input name and resolves with tensors keyed by output name. */ run(feeds: Record<string, OrtTensor>): Promise<Record<string, OrtTensor>>;
15
+ }
16
+ /** Session factory exposed as `InferenceSession` on the loaded runtime. */ export interface OrtInferenceSession {
17
+ /** Creates a session; NERModel.load passes the model file path in Node.js and the bytes read via the storage provider in browsers. */ create(pathOrBuffer: string | ArrayBuffer | Uint8Array, options?: unknown): Promise<OrtSession>;
18
+ }
19
+ /** Constructor signature exposed as `Tensor` on the loaded runtime. */ export interface OrtTensorConstructor {
20
+ /** Builds a tensor from an element type name, the element data and the dims; NERModel calls it as ("int64", BigInt64Array, [1, sequenceLength]). */ new (type: string, data: Float32Array | BigInt64Array | Int32Array | number[] | bigint[], dims: number[]): OrtTensor;
21
+ }
22
+ /** Shape of the runtime module that loadRuntime() resolves with (onnxruntime-node or onnxruntime-web). */ export interface OrtRuntime {
23
+ /** Factory used to create inference sessions. */ InferenceSession: OrtInferenceSession;
24
+ /** Tensor constructor used to build model inputs. */ Tensor: OrtTensorConstructor;
25
+ }
26
+ /**
27
+ * Detects the best ONNX runtime for the current environment.
+ *
+ * Returns "web" when `globalThis.window` or `globalThis.Deno` is defined.
+ * Under Bun it returns "node" only if `require.resolve("onnxruntime-node")`
+ * succeeds and "web" otherwise. Everything else is reported as "node".
28
+ */
29
+ export declare function detectRuntime(): "node" | "web";
30
+ /**
31
+ * Loads the appropriate ONNX runtime and caches it at module level.
+ *
+ * A previously loaded runtime is returned as is, in which case
+ * `preferredRuntime` is ignored. Otherwise `preferredRuntime` (or the
+ * result of detectRuntime() when omitted) is tried first and the other
+ * runtime second. The promise rejects with an Error when both fail.
32
+ */
33
+ export declare function loadRuntime(preferredRuntime?: "node" | "web"): Promise<OrtRuntime>;
34
+ /**
35
+ * Gets the currently loaded runtime type.
+ *
+ * Returns null until loadRuntime() has succeeded, and again after
+ * resetRuntime() has been called.
36
+ */
37
+ export declare function getRuntimeType(): "node" | "web" | null;
38
+ /**
39
+ * Resets the runtime (useful for testing).
+ *
+ * Clears the cached runtime module and its type, so the next
+ * loadRuntime() call runs the detection and loading logic again.
40
+ */
41
+ export declare function resetRuntime(): void;
42
+ /** Ambient globals that detectRuntime() probes with `typeof globalThis.Bun` / `typeof globalThis.Deno`. */ declare global {
43
+ /** Defined when running under Bun. */ var Bun: unknown;
44
+ /** Defined when running under Deno. */ var Deno: unknown;
45
+ }
46
+ //# sourceMappingURL=onnx-runtime.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"onnx-runtime.d.ts","sourceRoot":"","sources":["../../src/ner/onnx-runtime.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAQH,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,YAAY,GAAG,aAAa,GAAG,UAAU,CAAC;IAChD,IAAI,EAAE,SAAS,MAAM,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;IAC9B,WAAW,EAAE,SAAS,MAAM,EAAE,CAAC;IAC/B,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC;CAC3E;AAED,MAAM,WAAW,mBAAmB;IAClC,MAAM,CACJ,YAAY,EAAE,MAAM,GAAG,WAAW,GAAG,UAAU,EAC/C,OAAO,CAAC,EAAE,OAAO,GAChB,OAAO,CAAC,UAAU,CAAC,CAAC;CACxB;AAED,MAAM,WAAW,oBAAoB;IACnC,KACE,IAAI,EAAE,MAAM,EACZ,IAAI,EAAE,YAAY,GAAG,aAAa,GAAG,UAAU,GAAG,MAAM,EAAE,GAAG,MAAM,EAAE,EACrE,IAAI,EAAE,MAAM,EAAE,GACb,SAAS,CAAC;CACd;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,EAAE,mBAAmB,CAAC;IACtC,MAAM,EAAE,oBAAoB,CAAC;CAC9B;AAQD;;GAEG;AACH,wBAAgB,aAAa,IAAI,MAAM,GAAG,KAAK,CA4B9C;AAiCD;;GAEG;AACH,wBAAsB,WAAW,CAC/B,gBAAgB,CAAC,EAAE,MAAM,GAAG,KAAK,GAChC,OAAO,CAAC,UAAU,CAAC,CA2CrB;AAED;;GAEG;AACH,wBAAgB,cAAc,IAAI,MAAM,GAAG,KAAK,GAAG,IAAI,CAEtD;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,IAAI,CAGnC;AAGD,OAAO,CAAC,MAAM,CAAC;IAEb,IAAI,GAAG,EAAE,OAAO,CAAC;IAEjB,IAAI,IAAI,EAAE,OAAO,CAAC;CACnB"}
@@ -0,0 +1,130 @@
1
+ /**
2
+ * ONNX Runtime Abstraction
3
+ * Allows switching between onnxruntime-node and onnxruntime-web
4
+ *
5
+ * In browsers without a bundler, automatically loads onnxruntime-web from CDN
6
+ */
7
/**
 * Module-level cache filled by loadRuntime() and cleared by resetRuntime().
 * `_runtime` holds the imported runtime module, `_runtimeType` records
 * whether it came from the "node" or the "web" package.
 */
let _runtimeType = null;
let _runtime = null;
/**
 * CDN location of onnxruntime-web, imported by loadOnnxWeb() when the bare
 * "onnxruntime-web" import fails in a browser. This is the bundled ESM
 * build that includes the WebAssembly backend.
 */
const ONNX_WEB_CDN_URL = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.19.2/dist/ort.bundle.min.mjs";
15
/**
 * Picks the ONNX runtime flavour that suits the current environment.
 *
 * @returns "web" in browser-like environments (a global `window`) and in
 *   Deno; under Bun "node" if onnxruntime-node can be resolved, else "web";
 *   "node" in every other case.
 */
export function detectRuntime() {
    const hasGlobal = (name) => typeof globalThis[name] !== "undefined";
    if (hasGlobal("window") || hasGlobal("Deno")) {
        return "web";
    }
    if (!hasGlobal("Bun")) {
        // Default to node for Node.js
        return "node";
    }
    // For Bun, try node first, fall back to web
    try {
        // Quick check if onnxruntime-node is loadable
        require.resolve("onnxruntime-node");
        return "node";
    }
    catch {
        return "web";
    }
}
43
/**
 * Attempts to load onnxruntime-web, first via bare import, then via CDN.
 *
 * The bare specifier works with bundlers or import maps. If it fails and a
 * global `window` exists, the module is imported from ONNX_WEB_CDN_URL.
 *
 * @returns The imported onnxruntime-web module namespace.
 * @throws Error "onnxruntime-web is not available" outside browsers, or
 *   "Failed to load onnxruntime-web from CDN: ..." when the CDN import
 *   fails. In both cases `error.cause` holds the underlying import failure.
 */
async function loadOnnxWeb() {
    let bareImportError;
    try {
        // onnxruntime-web is optional and may not be installed
        return await import("onnxruntime-web");
    }
    catch (err) {
        bareImportError = err;
    }
    // The CDN fallback is only meaningful for browsers without a bundler
    const isBrowser = typeof globalThis.window !== "undefined";
    if (!isBrowser) {
        throw new Error("onnxruntime-web is not available", { cause: bareImportError });
    }
    try {
        // NOTE(review): this executes third-party code fetched at runtime with
        // no integrity check; confirm that is acceptable for deployments.
        return await import(
        /* webpackIgnore: true */ ONNX_WEB_CDN_URL);
    }
    catch (cdnError) {
        throw new Error(`Failed to load onnxruntime-web from CDN: ${String(cdnError)}`, { cause: cdnError });
    }
}
72
/**
 * Imports one runtime flavour: onnxruntime-node for "node", otherwise
 * onnxruntime-web (with CDN fallback for browsers).
 */
async function importRuntimeModule(runtimeType) {
    if (runtimeType === "node") {
        return await import("onnxruntime-node");
    }
    return await loadOnnxWeb();
}
/**
 * Loads the appropriate ONNX runtime.
 *
 * The first successful load is cached at module level and returned by every
 * later call, regardless of `preferredRuntime`.
 *
 * @param preferredRuntime Optional "node" or "web"; defaults to the result
 *   of detectRuntime(). Any value other than "node" selects the web runtime.
 * @returns The loaded runtime module.
 * @throws Error when neither runtime can be loaded. The message reports the
 *   failure of the first attempt, which is also attached as `error.cause`.
 */
export async function loadRuntime(preferredRuntime) {
    if (_runtime !== null) {
        return _runtime;
    }
    const primaryType = (preferredRuntime ?? detectRuntime()) === "node" ? "node" : "web";
    // If the preferred runtime fails, try the other
    const fallbackType = primaryType === "node" ? "web" : "node";
    let primaryError;
    try {
        _runtime = await importRuntimeModule(primaryType);
        _runtimeType = primaryType;
        return _runtime;
    }
    catch (e) {
        primaryError = e;
    }
    try {
        _runtime = await importRuntimeModule(fallbackType);
        _runtimeType = fallbackType;
        return _runtime;
    }
    catch {
        throw new Error(`Failed to load ONNX runtime. Install either 'onnxruntime-node' or 'onnxruntime-web'.\n` +
            `Original error: ${String(primaryError)}`, { cause: primaryError });
    }
}
117
/**
 * Reports which runtime flavour is currently cached.
 *
 * @returns "node" or "web" after a successful loadRuntime(); null before
 *   that and after resetRuntime().
 */
export function getRuntimeType() {
    const currentType = _runtimeType;
    return currentType;
}
123
/**
 * Forgets the cached runtime and its type (useful for testing), so that
 * the next loadRuntime() call performs detection and loading again.
 */
export function resetRuntime() {
    _runtimeType = null;
    _runtime = null;
}
130
+ //# sourceMappingURL=onnx-runtime.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"onnx-runtime.js","sourceRoot":"","sources":["../../src/ner/onnx-runtime.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,uEAAuE;AACvE,kEAAkE;AAClE,MAAM,gBAAgB,GACpB,6EAA6E,CAAC;AAkChF;;GAEG;AACH,IAAI,QAAQ,GAAsB,IAAI,CAAC;AACvC,IAAI,YAAY,GAA0B,IAAI,CAAC;AAE/C;;GAEG;AACH,MAAM,UAAU,aAAa;IAC3B,wBAAwB;IACxB,MAAM,KAAK,GAAG,OAAO,UAAU,CAAC,GAAG,KAAK,WAAW,CAAC;IAEpD,+CAA+C;IAC/C,uEAAuE;IACvE,MAAM,SAAS,GAAG,OAAO,UAAU,CAAC,MAAM,KAAK,WAAW,CAAC;IAE3D,yBAAyB;IACzB,MAAM,MAAM,GAAG,OAAO,UAAU,CAAC,IAAI,KAAK,WAAW,CAAC;IAEtD,IAAI,SAAS,IAAI,MAAM,EAAE,CAAC;QACxB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,4CAA4C;IAC5C,IAAI,KAAK,EAAE,CAAC;QACV,IAAI,CAAC;YACH,8CAA8C;YAC9C,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;YACpC,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,WAAW;IACxB,6DAA6D;IAC7D,IAAI,CAAC;QACH,6DAA6D;QAC7D,oDAAoD;QACpD,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAe,CAAC;QAC5D,OAAO,GAAG,CAAC;IACb,CAAC;IAAC,MAAM,CAAC;QACP,8DAA8D;QAC9D,uEAAuE;QACvE,MAAM,SAAS,GAAG,OAAO,UAAU,CAAC,MAAM,KAAK,WAAW,CAAC;QAC3D,IAAI,SAAS,EAAE,CAAC;YACd,IAAI,CAAC;gBACH,8BAA8B;gBAC9B,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM;gBACvB,yBAAyB,CAAC,gBAAgB,CAC3C,CAAe,CAAC;gBACjB,OAAO,GAAG,CAAC;YACb,CAAC;YAAC,OAAO,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CACb,4CAA4C,MAAM,CAAC,QAAQ,CAAC,EAAE,CAC/D,CAAC;YACJ,CAAC;QACH,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACtD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,gBAAiC;IAEjC,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;QACtB,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,MAAM,WAAW,GAAG,gBAAgB,IAAI,aAAa,EAAE,CAAC;IAExD,IAAI,CAAC;QACH,IAAI,WAAW,KAAK,MAAM,EAAE,CAAC;YAC3B,sCAAsC;YACtC,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAe,CAAC;YAC7D,QAAQ,GAAG,GAAG,CAAC;YACf,YAAY,GAAG,MAAM,CAAC;QACxB,CAAC;aAAM,CAAC;YACN,wDAAwD;YACxD,MAAM,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;YAChC,QAAQ,GAAG,GAAG,CAAC;YACf,YAAY,GAAG,KAAK,CAAC;QACvB,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,4CAA4C;QAC5C,MAAM,YAAY,GAAG
,WAAW,KAAK,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;QAE7D,IAAI,CAAC;YACH,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAe,CAAC;gBAC7D,QAAQ,GAAG,GAAG,CAAC;gBACf,YAAY,GAAG,MAAM,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,wDAAwD;gBACxD,MAAM,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;gBAChC,QAAQ,GAAG,GAAG,CAAC;gBACf,YAAY,GAAG,KAAK,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,wFAAwF;gBACtF,mBAAmB,MAAM,CAAC,CAAC,CAAC,EAAE,CACjC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc;IAC5B,OAAO,YAAY,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY;IAC1B,QAAQ,GAAG,IAAI,CAAC;IAChB,YAAY,GAAG,IAAI,CAAC;AACtB,CAAC"}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * HuggingFace Tokenizer
3
+ * Loads and uses tokenizers from HuggingFace's tokenizer.json format
4
+ * Supports Unigram (SentencePiece) and BPE tokenizers
5
+ */
6
+ /**
7
+ * Token with offset information
8
+ */
9
+ export interface Token {
10
+ /** Token ID in vocabulary */
11
+ id: number;
12
+ /** Token string */
13
+ token: string;
14
+ /** Start character offset in original text */
15
+ start: number;
16
+ /** End character offset in original text */
17
+ end: number;
18
+ /** Whether this is a continuation token */
19
+ isContinuation: boolean;
20
+ /** Whether this is a special token */
21
+ isSpecial: boolean;
22
+ }
23
+ /**
24
+ * Tokenization result with metadata
25
+ */
26
+ export interface TokenizationResult {
27
+ /** Array of tokens */
28
+ tokens: Token[];
29
+ /** Input IDs for model */
30
+ inputIds: number[];
31
+ /** Attention mask */
32
+ attentionMask: number[];
33
+ /** Token type IDs (for BERT-style models) */
34
+ tokenTypeIds: number[];
35
+ /** Mapping from token index to character span [start, end] */
36
+ tokenToCharSpan: Array<[number, number] | null>;
37
+ }
38
+ /**
39
+ * Tokenizer configuration
40
+ */
41
+ export interface TokenizerConfig {
42
+ /** Maximum sequence length */
43
+ maxLength: number;
44
+ /** Whether to lowercase input */
45
+ doLowerCase: boolean;
46
+ }
47
+ /**
48
+ * Default tokenizer configuration
49
+ */
50
+ export declare const DEFAULT_TOKENIZER_CONFIG: TokenizerConfig;
51
+ /**
52
+ * WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
53
+ */
54
+ export declare class WordPieceTokenizer {
55
+ private vocab;
56
+ private inverseVocab;
57
+ private config;
58
+ private sortedVocab;
59
+ private clsId;
60
+ private sepId;
61
+ private padId;
62
+ private unkId;
63
+ private clsToken;
64
+ private sepToken;
65
+ private padToken;
66
+ private unkToken;
67
+ constructor(vocab: Map<string, number>, config?: Partial<TokenizerConfig>);
68
+ /**
69
+ * Detect special tokens from vocabulary
70
+ */
71
+ private detectSpecialTokens;
72
+ /**
73
+ * Tokenizes text into tokens with offset tracking
74
+ */
75
+ tokenize(text: string): TokenizationResult;
76
+ /**
77
+ * Find the best matching token using greedy longest-match
78
+ */
79
+ private findBestToken;
80
+ /**
81
+ * Decodes token IDs back to text
82
+ */
83
+ decode(tokenIds: number[]): string;
84
+ /**
85
+ * Gets vocabulary size
86
+ */
87
+ get vocabSize(): number;
88
+ /**
89
+ * Gets a token ID by string
90
+ */
91
+ getTokenId(token: string): number | undefined;
92
+ /**
93
+ * Gets a token string by ID
94
+ */
95
+ getToken(id: number): string | undefined;
96
+ }
97
+ /**
98
+ * Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
99
+ * Uses storage abstraction for browser compatibility
100
+ */
101
+ export declare function loadVocabFromFile(filePath: string): Promise<Map<string, number>>;
102
+ /**
103
+ * Loads vocabulary from content string (for when content is already available)
104
+ */
105
+ export declare function loadVocabFromContent(content: string, format?: 'json' | 'txt'): Map<string, number>;
106
+ /**
107
+ * Parses HuggingFace tokenizer.json format
108
+ */
109
+ export declare function parseHFTokenizerJson(content: string): Map<string, number>;
110
+ /**
111
+ * Parses vocabulary from string content (vocab.txt format)
112
+ */
113
+ export declare function parseVocab(content: string): Map<string, number>;
114
+ /**
115
+ * Creates a minimal vocabulary for testing
116
+ */
117
+ export declare function createTestVocab(): Map<string, number>;
118
+ //# sourceMappingURL=tokenizer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,2CAA2C;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAsBD;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;CACtB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAGtC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,WAAW,CAA0B;IAG7C,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAG1B,OAAO,CAAC,QAAQ,CAAiB;IACjC,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,QAAQ,CAAmB;IACnC,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAyB3B;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAsF1C;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IAmBlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;CAGzC;AAED;;;GAGG;AACH,wBAAsB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAWtF;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,GAAE,MAAM,GAAG,KAAc,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAM1G;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAmCzE;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC
,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAuBrD"}