@elanlanguages/bridge-anonymization 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ELAN Languages
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
package/README.md CHANGED
@@ -13,10 +13,10 @@ On-device PII anonymization module for high-privacy translation workflows. Detec
13
13
  ## Installation
14
14
 
15
15
  ```bash
16
- npm install bridge-anonymization
16
+ npm install @elanlanguages/bridge-anonymization
17
17
  ```
18
18
 
19
- > **Bun users**: Install `onnxruntime-web` additionally: `bun add bridge-anonymization onnxruntime-web`
19
+ > **Bun users**: Install `onnxruntime-web` additionally: `bun add @elanlanguages/bridge-anonymization onnxruntime-web`
20
20
 
21
21
  ## Quick Start
22
22
 
@@ -25,7 +25,7 @@ npm install bridge-anonymization
25
25
  For structured PII like emails, phones, IBANs, credit cards:
26
26
 
27
27
  ```typescript
28
- import { anonymizeRegexOnly } from 'bridge-anonymization';
28
+ import { anonymizeRegexOnly } from '@elanlanguages/bridge-anonymization';
29
29
 
30
30
  const result = await anonymizeRegexOnly(
31
31
  'Contact john@example.com or call +49 30 123456. IBAN: DE89370400440532013000'
@@ -40,7 +40,7 @@ console.log(result.anonymizedText);
40
40
  The NER model is automatically downloaded on first use (~280 MB for quantized):
41
41
 
42
42
  ```typescript
43
- import { createAnonymizer } from 'bridge-anonymization';
43
+ import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
44
44
 
45
45
  const anonymizer = createAnonymizer({
46
46
  ner: {
@@ -62,7 +62,7 @@ console.log(result.anonymizedText);
62
62
  ### One-liner with NER
63
63
 
64
64
  ```typescript
65
- import { anonymizeWithNER } from 'bridge-anonymization';
65
+ import { anonymizeWithNER } from '@elanlanguages/bridge-anonymization';
66
66
 
67
67
  const result = await anonymizeWithNER(
68
68
  'Contact John Smith at john@example.com',
@@ -75,7 +75,7 @@ const result = await anonymizeWithNER(
75
75
  ### Configuration Options
76
76
 
77
77
  ```typescript
78
- import { createAnonymizer, InMemoryKeyProvider } from 'bridge-anonymization';
78
+ import { createAnonymizer, InMemoryKeyProvider } from '@elanlanguages/bridge-anonymization';
79
79
 
80
80
  const anonymizer = createAnonymizer({
81
81
  // NER configuration
@@ -211,7 +211,7 @@ interface AnonymizationResult {
211
211
  ### Anonymization Policy
212
212
 
213
213
  ```typescript
214
- import { createAnonymizer, PIIType } from 'bridge-anonymization';
214
+ import { createAnonymizer, PIIType } from '@elanlanguages/bridge-anonymization';
215
215
 
216
216
  const anonymizer = createAnonymizer({
217
217
  ner: { mode: 'quantized' },
@@ -239,7 +239,7 @@ const anonymizer = createAnonymizer({
239
239
  Add domain-specific patterns:
240
240
 
241
241
  ```typescript
242
- import { createCustomIdRecognizer, PIIType, createAnonymizer } from 'bridge-anonymization';
242
+ import { createCustomIdRecognizer, PIIType, createAnonymizer } from '@elanlanguages/bridge-anonymization';
243
243
 
244
244
  const customRecognizer = createCustomIdRecognizer([
245
245
  {
@@ -271,7 +271,7 @@ import {
271
271
  clearModelCache,
272
272
  listDownloadedModels,
273
273
  getModelCacheDir
274
- } from 'bridge-anonymization';
274
+ } from '@elanlanguages/bridge-anonymization';
275
275
 
276
276
  // Check if model is downloaded
277
277
  const hasModel = await isModelDownloaded('quantized');
@@ -293,7 +293,7 @@ await clearModelCache('quantized'); // or clearModelCache() for all
293
293
  The PII map is encrypted using AES-256-GCM:
294
294
 
295
295
  ```typescript
296
- import { createAnonymizer, KeyProvider, generateKey } from 'bridge-anonymization';
296
+ import { createAnonymizer, KeyProvider, generateKey } from '@elanlanguages/bridge-anonymization';
297
297
 
298
298
  class SecureKeyProvider implements KeyProvider {
299
299
  async getKey(): Promise<Buffer> {
@@ -320,13 +320,13 @@ const anonymizer = createAnonymizer({
320
320
  This library works with [Bun](https://bun.sh). Since `onnxruntime-node` is a native Node.js addon, Bun users need `onnxruntime-web`:
321
321
 
322
322
  ```bash
323
- bun add bridge-anonymization onnxruntime-web
323
+ bun add @elanlanguages/bridge-anonymization onnxruntime-web
324
324
  ```
325
325
 
326
326
  Usage is identical - the library auto-detects the runtime:
327
327
 
328
328
  ```typescript
329
- import { createAnonymizer } from 'bridge-anonymization';
329
+ import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
330
330
 
331
331
  const anonymizer = createAnonymizer({
332
332
  ner: { mode: 'quantized' }
@@ -41,8 +41,8 @@ export interface ModelInfo {
41
41
  /**
42
42
  * Registry of available models hosted on Hugging Face Hub
43
43
  *
44
- * Using Xenova's ONNX exports which are optimized for JS/ONNX runtime
45
- * https://huggingface.co/Xenova
44
+ * Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
45
+ * https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
46
46
  */
47
47
  export declare const MODEL_REGISTRY: Record<'standard' | 'quantized', ModelInfo>;
48
48
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,CA0BtE,CAAC;AAYF;;;GAGG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAYzC;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,MAAM,CAEnE;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,CAkBxF;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,CAAC,QAAQ,EAAE;IAChD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAuEX;;GAEG;AACH,wBAAsB,aAAa,CACjC,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,UAAU,CAAC,EAAE,wBAAwB,EACrC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CAiDjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,wBAAwB,CAAC;IACtC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC,CAiCzE;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,IAAI,CAAC,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAQpF;AAED;;GAEG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CAAC,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,GAAG,WAAW,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC,CAY3H;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAEtE"}
1
+ {"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,CAyBtE,CAAC;AAYF;;;GAGG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAYzC;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,MAAM,CAEnE;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,CAkBxF;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,CAAC,QAAQ,EAAE;IAChD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAuEX;;GAEG;AACH,wBAAsB,aAAa,CACjC,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,UAAU,CAAC,EAAE,wBAAwB,EACrC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CAiDjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,wBAAwB,CAAC;IACtC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC,CAiCzE;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,IAAI,CAAC,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAQpF;AAED;;GAEG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CAAC,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,GAAG,WAAW,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC,CAY3H;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAEtE"}
@@ -8,8 +8,8 @@ import * as os from 'os';
8
8
  /**
9
9
  * Registry of available models hosted on Hugging Face Hub
10
10
  *
11
- * Using Xenova's ONNX exports which are optimized for JS/ONNX runtime
12
- * https://huggingface.co/Xenova
11
+ * Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
12
+ * https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
13
13
  */
14
14
  export const MODEL_REGISTRY = {
15
15
  standard: {
@@ -17,25 +17,24 @@ export const MODEL_REGISTRY = {
17
17
  name: 'XLM-RoBERTa NER (Standard)',
18
18
  description: 'Multilingual NER model supporting EN, DE, FR, ES, and more',
19
19
  size: '~1.1 GB',
20
- hfRepo: 'Xenova/xlm-roberta-base-ner-hrl',
20
+ hfRepo: 'tjruesch/xlm-roberta-base-ner-hrl-onnx',
21
21
  hfSubfolder: 'onnx',
22
22
  files: [
23
23
  { repoFile: 'model.onnx', localFile: 'model.onnx', required: true },
24
- { repoFile: 'model.onnx_data', localFile: 'model.onnx_data', required: false },
25
24
  ],
26
- labelMap: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE'],
25
+ labelMap: ['O', 'B-DATE', 'I-DATE', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'],
27
26
  },
28
27
  quantized: {
29
28
  id: 'xlm-roberta-ner-quantized',
30
29
  name: 'XLM-RoBERTa NER (Quantized)',
31
30
  description: 'Quantized version, ~4x smaller with minimal accuracy loss',
32
- size: '~280 MB',
33
- hfRepo: 'Xenova/xlm-roberta-base-ner-hrl',
31
+ size: '~265 MB',
32
+ hfRepo: 'tjruesch/xlm-roberta-base-ner-hrl-onnx',
34
33
  hfSubfolder: 'onnx',
35
34
  files: [
36
35
  { repoFile: 'model_quantized.onnx', localFile: 'model.onnx', required: true },
37
36
  ],
38
- labelMap: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE'],
37
+ labelMap: ['O', 'B-DATE', 'I-DATE', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'],
39
38
  },
40
39
  };
41
40
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"model-manager.js","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AA0CzB;;;;;GAKG;AACH,MAAM,CAAC,MAAM,cAAc,GAAgD;IACzE,QAAQ,EAAE;QACR,EAAE,EAAE,0BAA0B;QAC9B,IAAI,EAAE,4BAA4B;QAClC,WAAW,EAAE,4DAA4D;QACzE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,iCAAiC;QACzC,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;YACnE,EAAE,QAAQ,EAAE,iBAAiB,EAAE,SAAS,EAAE,iBAAiB,EAAE,QAAQ,EAAE,KAAK,EAAE;SAC/E;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,CAAC;KAC1F;IACD,SAAS,EAAE;QACT,EAAE,EAAE,2BAA2B;QAC/B,IAAI,EAAE,6BAA6B;QACnC,WAAW,EAAE,2DAA2D;QACxE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,iCAAiC;QACzC,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,sBAAsB,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;SAC9E;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,CAAC;KAC1F;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAAoB;IACvC,EAAE,QAAQ,EAAE,gBAAgB,EAAE,SAAS,EAAE,gBAAgB,EAAE,QAAQ,EAAE,IAAI,EAAE;IAC3E,EAAE,QAAQ,EAAE,uBAAuB,EAAE,SAAS,EAAE,uBAAuB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC1F,EAAE,QAAQ,EAAE,yBAAyB,EAAE,SAAS,EAAE,yBAAyB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC9F,EAAE,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE;CACvE,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB;IAC9B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;IAE7B,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;QACzB,KAAK,QAAQ;YACX,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QACnF,KAAK,OAAO;YACV,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QAC5H;YACE,oDAAoD;YACpD,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;IACtH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,CAAC,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAA8B;IACpE,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,CAAC;QACH,6BAA6B;QAC7B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpF,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;QAC5D,CAAC;QAED,4BAA4B;QAC5B,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC,CAAC;QAEvD,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAYD;;GAEG;AACH,SAAS,iBAAiB,CAAC,IAAY,EAAE,QAAgB,EAAE,SAAkB;IAC3E,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,GAAG,SAAS,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;IACnE,OAAO,0BAA0B,IAAI,iBAAiB,QAAQ,EAAE,CAAC;AACnE,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,YAAY,CACzB,GAAW,EACX,QAAgB,EAChB,UAAqC;IAErC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;QAChC,OAAO,EAAE;YACP,YAAY,EAAE,4BAA4B;SAC3C;KACF,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE5D,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,wDAAwD;IACxD,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,IAAI,eAAe,GAAG,CAAC,CAAC;IAExB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QAE5C,IAAI,IAAI;YAAE,MAAM;QAEhB,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,eAAe,IAAI,KAAK,CAAC,MAAM,CAAC;QAEhC,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC;gBACT,IAAI,EAAE,QAAQ;gBACd,eAAe;gBACf,UAAU,EAAE,KAAK;gBACjB,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI;aACpE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACrC,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;AACvC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,IAA8B,EAC9B,UAAqC,EACrC,QAAmC;IAEnC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAEpC,mBAAmB;IACnB,MAAM,EAAE,CAAC,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,IAAI,2BAA2B,CAAC,CAAC;IAChE,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAEzC,uBAAuB;IACvB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QAC5E,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,QAAQ,KAAK,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;YACD,mCAAmC;YACnC,QAAQ,EAAE,CAAC,0BAA0B,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;QACxD,CAAC;IACH,CAAC;IAED,2DAA2D;IAC3D,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;QACH,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAC3D,MAAM,EAAE,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEzE,QAAQ,EAAE,CAAC,oBAAoB,CAAC,CAAC;IAEjC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAA8B,EAC9B,UAII,EAAE;IAEN,MAAM,EAAE,YAAY,GAAG,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;IAE9D,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,8BAA8B;IAC9B,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAEnD,IAAI,CAAC,YAAY,EAAE,CAAC;QAClB,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CACb,cAAc,IAAI,kBAAkB,QAAQ,OAAO;gBACnD,mCAAmC;gBACnC,sCAAsC,IAAI,gCAAgC;gBAC1E,2BAA2B;gBAC3B,mDAAmD,CACpD,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,CAAC,IAAI,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IAClD,CAAC;SAAM,CAAC;QACN,QAAQ,EAAE,CAAC,uBAAuB,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,kBAAkB;IAClB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,YAAY,CAAC,CAAC;IAErE,OAAO;QACL,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,IAAI,YAAY,CAAC;QACpE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;QAChD,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KACpD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAA+B;IACnE,IAAI,IAAI,EAAE,CAAC;QACT,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;SAAM,CAAC;QACN,MAAM,QAAQ,GAAG,gBAAgB,EAAE,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB;IACxC,MAAM,MAAM,GAA0E,EAAE,CAAC;IAEzF,KAAK,MAAM,IAAI,IAAI,CAAC,UAAU,EAAE,WAAW,CAAU,EAAE,CAAC;QACtD,IAAI,MAAM,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC;YAClC,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
1
+ {"version":3,"file":"model-manager.js","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AA0CzB;;;;;GAKG;AACH,MAAM,CAAC,MAAM,cAAc,GAAgD;IACzE,QAAQ,EAAE;QACR,EAAE,EAAE,0BAA0B;QAC9B,IAAI,EAAE,4BAA4B;QAClC,WAAW,EAAE,4DAA4D;QACzE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,wCAAwC;QAChD,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;SACpE;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;IACD,SAAS,EAAE;QACT,EAAE,EAAE,2BAA2B;QAC/B,IAAI,EAAE,6BAA6B;QACnC,WAAW,EAAE,2DAA2D;QACxE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,wCAAwC;QAChD,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,sBAAsB,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;SAC9E;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAAoB;IACvC,EAAE,QAAQ,EAAE,gBAAgB,EAAE,SAAS,EAAE,gBAAgB,EAAE,QAAQ,EAAE,IAAI,EAAE;IAC3E,EAAE,QAAQ,EAAE,uBAAuB,EAAE,SAAS,EAAE,uBAAuB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC1F,EAAE,QAAQ,EAAE,yBAAyB,EAAE,SAAS,EAAE,yBAAyB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC9F,EAAE,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE;CACvE,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB;IAC9B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;IAE7B,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;QACzB,KAAK,QAAQ;YACX,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QACnF,KAAK,OAAO;YACV,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QAC5H;YACE,oDAAoD;YACpD,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;IACtH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,CAAC,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAA8B;IACpE,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,CAAC;QACH,6BAA6B;QAC7B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpF,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;QAC5D,CAAC;QAED,4BAA4B;QAC5B,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC,CAAC;QAEvD,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAYD;;GAEG;AACH,SAAS,iBAAiB,CAAC,IAAY,EAAE,QAAgB,EAAE,SAAkB;IAC3E,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,GAAG,SAAS,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;IACnE,OAAO,0BAA0B,IAAI,iBAAiB,QAAQ,EAAE,CAAC;AACnE,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,YAAY,CACzB,GAAW,EACX,QAAgB,EAChB,UAAqC;IAErC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;QAChC,OAAO,EAAE;YACP,YAAY,EAAE,4BAA4B;SAC3C;KACF,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE5D,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,wDAAwD;IACxD,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,IAAI,eAAe,GAAG,CAAC,CAAC;IAExB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QAE5C,IAAI,IAAI;YAAE,MAAM;QAEhB,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,eAAe,IAAI,KAAK,CAAC,MAAM,CAAC;QAEhC,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC;gBACT,IAAI,EAAE,QAAQ;gBACd,eAAe;gBACf,UAAU,EAAE,KAAK;gBACjB,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI;aACpE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACrC,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;AACvC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,IAA8B,EAC9B,UAAqC,EACrC,QAAmC;IAEnC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAEpC,mBAAmB;IACnB,MAAM,EAAE,CAAC,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,IAAI,2BAA2B,CAAC,CAAC;IAChE,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAEzC,uBAAuB;IACvB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QAC5E,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,QAAQ,KAAK,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;YACD,mCAAmC;YACnC,QAAQ,EAAE,CAAC,0BAA0B,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;QACxD,CAAC;IACH,CAAC;IAED,2DAA2D;IAC3D,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;QACH,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAC3D,MAAM,EAAE,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEzE,QAAQ,EAAE,CAAC,oBAAoB,CAAC,CAAC;IAEjC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAA8B,EAC9B,UAII,EAAE;IAEN,MAAM,EAAE,YAAY,GAAG,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;IAE9D,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,8BAA8B;IAC9B,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAEnD,IAAI,CAAC,YAAY,EAAE,CAAC;QAClB,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CACb,cAAc,IAAI,kBAAkB,QAAQ,OAAO;gBACnD,mCAAmC;gBACnC,sCAAsC,IAAI,gCAAgC;gBAC1E,2BAA2B;gBAC3B,mDAAmD,CACpD,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,CAAC,IAAI,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IAClD,CAAC;SAAM,CAAC;QACN,QAAQ,EAAE,CAAC,uBAAuB,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,kBAAkB;IAClB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,YAAY,CAAC,CAAC;IAErE,OAAO;QACL,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,IAAI,YAAY,CAAC;QACpE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;QAChD,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KACpD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAA+B;IACnE,IAAI,IAAI,EAAE,CAAC;QACT,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;SAAM,CAAC;QACN,MAAM,QAAQ,GAAG,gBAAgB,EAAE,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB;IACxC,MAAM,MAAM,GAA0E,EAAE,CAAC;IAEzF,KAAK,MAAM,IAAI,IAAI,CAAC,UAAU,EAAE,WAAW,CAAU,EAAE,CAAC;QACtD,IAAI,MAAM,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC;YAClC,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
@@ -205,7 +205,7 @@ export function createNERModel(config) {
205
205
  vocabPath: config.vocabPath,
206
206
  labelMap: config.labelMap ?? DEFAULT_LABEL_MAP,
207
207
  maxLength: config.maxLength ?? 512,
208
- doLowerCase: config.doLowerCase ?? true,
208
+ doLowerCase: config.doLowerCase ?? false, // XLM-RoBERTa is cased
209
209
  modelVersion: config.modelVersion ?? '1.0.0',
210
210
  };
211
211
  return new NERModel(fullConfig);
@@ -1 +1 @@
1
- {"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,IAAI;QACvC,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAa,EAAE,OAA6B;QACxD,OAAO;YACL,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO;QACX,QAAQ;IACV,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
1
+ {"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,KAAK,EAAE,uBAAuB;QACjE,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAa,EAAE,OAA6B;QACxD,OAAO;YACL,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO;QACX,QAAQ;IACV,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
@@ -1,7 +1,7 @@
1
1
  /**
2
- * WordPiece Tokenizer
3
- * Tokenizes text into subword tokens while maintaining character offset mapping
4
- * Compatible with BERT-style models
2
+ * HuggingFace Tokenizer
3
+ * Loads and uses tokenizers from HuggingFace's tokenizer.json format
4
+ * Supports Unigram (SentencePiece) and BPE tokenizers
5
5
  */
6
6
  /**
7
7
  * Token with offset information
@@ -15,9 +15,9 @@ export interface Token {
15
15
  start: number;
16
16
  /** End character offset in original text */
17
17
  end: number;
18
- /** Whether this is a continuation token (starts with ##) */
18
+ /** Whether this is a continuation token */
19
19
  isContinuation: boolean;
20
- /** Whether this is a special token ([CLS], [SEP], etc.) */
20
+ /** Whether this is a special token */
21
21
  isSpecial: boolean;
22
22
  }
23
23
  /**
@@ -39,75 +39,44 @@ export interface TokenizationResult {
39
39
  * Tokenizer configuration
40
40
  */
41
41
  export interface TokenizerConfig {
42
- /** Path to vocabulary file */
43
- vocabPath?: string;
44
- /** Vocabulary as a Map */
45
- vocab?: Map<string, number>;
46
42
  /** Maximum sequence length */
47
43
  maxLength: number;
48
- /** Unknown token */
49
- unkToken: string;
50
- /** Classification token */
51
- clsToken: string;
52
- /** Separator token */
53
- sepToken: string;
54
- /** Padding token */
55
- padToken: string;
56
- /** Mask token */
57
- maskToken: string;
58
44
  /** Whether to lowercase input */
59
45
  doLowerCase: boolean;
60
- /** Strip accents */
61
- stripAccents: boolean;
62
46
  }
63
47
  /**
64
- * Default tokenizer configuration for BERT-style models
48
+ * Default tokenizer configuration
65
49
  */
66
50
  export declare const DEFAULT_TOKENIZER_CONFIG: TokenizerConfig;
67
51
  /**
68
- * WordPiece Tokenizer implementation
52
+ * WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
69
53
  */
70
54
  export declare class WordPieceTokenizer {
71
55
  private vocab;
72
56
  private inverseVocab;
73
57
  private config;
74
- private unkId;
58
+ private sortedVocab;
75
59
  private clsId;
76
60
  private sepId;
77
61
  private padId;
62
+ private unkId;
63
+ private clsToken;
64
+ private sepToken;
65
+ private padToken;
66
+ private unkToken;
78
67
  constructor(vocab: Map<string, number>, config?: Partial<TokenizerConfig>);
79
68
  /**
80
- * Tokenizes text into tokens with offset tracking
81
- */
82
- tokenize(text: string): TokenizationResult;
83
- /**
84
- * Preprocesses text (lowercase, accent stripping)
85
- */
86
- private preprocess;
87
- /**
88
- * Strips accents from text
89
- */
90
- private stripAccents;
91
- /**
92
- * Splits text into words while tracking character offsets
93
- */
94
- private splitIntoWords;
95
- /**
96
- * Tokenizes a single word using WordPiece algorithm
97
- */
98
- private tokenizeWord;
99
- /**
100
- * Splits a word into pieces, handling punctuation
69
+ * Detect special tokens from vocabulary
101
70
  */
102
- private splitWordIntoPieces;
71
+ private detectSpecialTokens;
103
72
  /**
104
- * Checks if a character is punctuation
73
+ * Tokenizes text into tokens with offset tracking
105
74
  */
106
- private isPunctuation;
75
+ tokenize(text: string): TokenizationResult;
107
76
  /**
108
- * Finds the longest matching token in vocabulary
77
+ * Find the best matching token using greedy longest-match
109
78
  */
110
- private findLongestMatch;
79
+ private findBestToken;
111
80
  /**
112
81
  * Decodes token IDs back to text
113
82
  */
@@ -126,11 +95,15 @@ export declare class WordPieceTokenizer {
126
95
  getToken(id: number): string | undefined;
127
96
  }
128
97
  /**
129
- * Loads vocabulary from a text file (one token per line)
98
+ * Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
99
+ */
100
+ export declare function loadVocabFromFile(filePath: string): Promise<Map<string, number>>;
101
+ /**
102
+ * Parses HuggingFace tokenizer.json format
130
103
  */
131
- export declare function loadVocabFromFile(path: string): Promise<Map<string, number>>;
104
+ export declare function parseHFTokenizerJson(content: string): Map<string, number>;
132
105
  /**
133
- * Parses vocabulary from string content
106
+ * Parses vocabulary from string content (vocab.txt format)
134
107
  */
135
108
  export declare function parseVocab(content: string): Map<string, number>;
136
109
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,4DAA4D;IAC5D,cAAc,EAAE,OAAO,CAAC;IACxB,2DAA2D;IAC3D,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0BAA0B;IAC1B,KAAK,CAAC,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC5B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,sBAAsB;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;IACrB,oBAAoB;IACpB,YAAY,EAAE,OAAO,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAStC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAGhC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;gBAEV,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAwE1C;;OAEG;IACH,OAAO,CAAC,UAAU;IAclB;;OAEG;IACH,OAAO,CAAC,YAAY;IAIpB;;OAEG;IACH,OAAO,CAAC,cAAc;IA0BtB;;OAEG;IACH,OAAO,CAAC,YAAY;IA2CpB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAuB3B;;OAEG;IACH,OAAO,CAAC,aAAa;IAarB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAqBxB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IA8BlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;CAGzC;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAIlF;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CA8BrD"}
1
+ {"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,2CAA2C;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAsBD;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;CACtB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAGtC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,WAAW,CAA0B;IAG7C,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAG1B,OAAO,CAAC,QAAQ,CAAiB;IACjC,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,QAAQ,CAAmB;IACnC,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAyB3B;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAsF1C;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IAmBlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;CAGzC;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAUtF;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAmCzE;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAuBrD"}
@@ -1,33 +1,33 @@
1
1
  /**
2
- * WordPiece Tokenizer
3
- * Tokenizes text into subword tokens while maintaining character offset mapping
4
- * Compatible with BERT-style models
2
+ * HuggingFace Tokenizer
3
+ * Loads and uses tokenizers from HuggingFace's tokenizer.json format
4
+ * Supports Unigram (SentencePiece) and BPE tokenizers
5
5
  */
6
6
  /**
7
- * Default tokenizer configuration for BERT-style models
7
+ * Default tokenizer configuration
8
8
  */
9
9
  export const DEFAULT_TOKENIZER_CONFIG = {
10
10
  maxLength: 512,
11
- unkToken: '[UNK]',
12
- clsToken: '[CLS]',
13
- sepToken: '[SEP]',
14
- padToken: '[PAD]',
15
- maskToken: '[MASK]',
16
- doLowerCase: true,
17
- stripAccents: true,
11
+ doLowerCase: false, // XLM-RoBERTa doesn't lowercase
18
12
  };
19
13
  /**
20
- * WordPiece Tokenizer implementation
14
+ * WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
21
15
  */
22
16
  export class WordPieceTokenizer {
23
17
  vocab;
24
18
  inverseVocab;
25
19
  config;
26
- // Special token IDs
27
- unkId;
28
- clsId;
29
- sepId;
30
- padId;
20
+ sortedVocab;
21
+ // Special token IDs (XLM-RoBERTa style)
22
+ clsId = 0; // <s>
23
+ sepId = 2; // </s>
24
+ padId = 1; // <pad>
25
+ unkId = 3; // <unk>
26
+ // Special token strings
27
+ clsToken = '<s>';
28
+ sepToken = '</s>';
29
+ padToken = '<pad>';
30
+ unkToken = '<unk>';
31
31
  constructor(vocab, config = {}) {
32
32
  this.vocab = vocab;
33
33
  this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
@@ -36,11 +36,37 @@ export class WordPieceTokenizer {
36
36
  for (const [token, id] of vocab) {
37
37
  this.inverseVocab.set(id, token);
38
38
  }
39
- // Get special token IDs
40
- this.unkId = this.vocab.get(this.config.unkToken) ?? 0;
41
- this.clsId = this.vocab.get(this.config.clsToken) ?? 101;
42
- this.sepId = this.vocab.get(this.config.sepToken) ?? 102;
43
- this.padId = this.vocab.get(this.config.padToken) ?? 0;
39
+ // Sort vocab by token length (longest first) for greedy matching
40
+ this.sortedVocab = Array.from(vocab.entries()).sort((a, b) => b[0].length - a[0].length);
41
+ // Try to detect special tokens from vocab
42
+ this.detectSpecialTokens();
43
+ }
44
+ /**
45
+ * Detect special tokens from vocabulary
46
+ */
47
+ detectSpecialTokens() {
48
+ // XLM-RoBERTa style
49
+ if (this.vocab.has('<s>')) {
50
+ this.clsToken = '<s>';
51
+ this.clsId = this.vocab.get('<s>') ?? 0;
52
+ this.sepToken = '</s>';
53
+ this.sepId = this.vocab.get('</s>') ?? 2;
54
+ this.padToken = '<pad>';
55
+ this.padId = this.vocab.get('<pad>') ?? 1;
56
+ this.unkToken = '<unk>';
57
+ this.unkId = this.vocab.get('<unk>') ?? 3;
58
+ }
59
+ // BERT style
60
+ else if (this.vocab.has('[CLS]')) {
61
+ this.clsToken = '[CLS]';
62
+ this.clsId = this.vocab.get('[CLS]') ?? 101;
63
+ this.sepToken = '[SEP]';
64
+ this.sepId = this.vocab.get('[SEP]') ?? 102;
65
+ this.padToken = '[PAD]';
66
+ this.padId = this.vocab.get('[PAD]') ?? 0;
67
+ this.unkToken = '[UNK]';
68
+ this.unkId = this.vocab.get('[UNK]') ?? 100;
69
+ }
44
70
  }
45
71
  /**
46
72
  * Tokenizes text into tokens with offset tracking
@@ -48,10 +74,10 @@ export class WordPieceTokenizer {
48
74
  tokenize(text) {
49
75
  const tokens = [];
50
76
  const tokenToCharSpan = [];
51
- // Add [CLS] token
77
+ // Add CLS token
52
78
  tokens.push({
53
79
  id: this.clsId,
54
- token: this.config.clsToken,
80
+ token: this.clsToken,
55
81
  start: 0,
56
82
  end: 0,
57
83
  isContinuation: false,
@@ -59,21 +85,33 @@ export class WordPieceTokenizer {
59
85
  });
60
86
  tokenToCharSpan.push(null);
61
87
  // Preprocess text
62
- const processedText = this.preprocess(text);
63
- // Split into words by whitespace
64
- const wordSpans = this.splitIntoWords(processedText, text);
65
- // Tokenize each word
66
- for (const { word, start, end } of wordSpans) {
67
- const wordTokens = this.tokenizeWord(word, start, end);
68
- tokens.push(...wordTokens);
69
- for (const t of wordTokens) {
70
- tokenToCharSpan.push([t.start, t.end]);
88
+ const processedText = this.config.doLowerCase ? text.toLowerCase() : text;
89
+ // Tokenize using greedy longest-match
90
+ let pos = 0;
91
+ while (pos < processedText.length) {
92
+ // Skip whitespace
93
+ if (/\s/.test(processedText[pos])) {
94
+ pos++;
95
+ continue;
71
96
  }
97
+ // Find the longest matching token starting at this position
98
+ const { token, id, length } = this.findBestToken(processedText, pos);
99
+ const isFirstOfWord = pos === 0 || /\s/.test(processedText[pos - 1]);
100
+ tokens.push({
101
+ id,
102
+ token,
103
+ start: pos,
104
+ end: pos + length,
105
+ isContinuation: !isFirstOfWord && !token.startsWith('▁'),
106
+ isSpecial: false,
107
+ });
108
+ tokenToCharSpan.push([pos, pos + length]);
109
+ pos += length;
72
110
  }
73
- // Add [SEP] token
111
+ // Add SEP token
74
112
  tokens.push({
75
113
  id: this.sepId,
76
- token: this.config.sepToken,
114
+ token: this.sepToken,
77
115
  start: text.length,
78
116
  end: text.length,
79
117
  isContinuation: false,
@@ -85,10 +123,9 @@ export class WordPieceTokenizer {
85
123
  if (tokens.length > maxTokens) {
86
124
  tokens.length = maxTokens - 1;
87
125
  tokenToCharSpan.length = maxTokens - 1;
88
- // Add [SEP] at end
89
126
  tokens.push({
90
127
  id: this.sepId,
91
- token: this.config.sepToken,
128
+ token: this.sepToken,
92
129
  start: text.length,
93
130
  end: text.length,
94
131
  isContinuation: false,
@@ -109,161 +146,66 @@ export class WordPieceTokenizer {
109
146
  };
110
147
  }
111
148
  /**
112
- * Preprocesses text (lowercase, accent stripping)
113
- */
114
- preprocess(text) {
115
- let processed = text;
116
- if (this.config.doLowerCase) {
117
- processed = processed.toLowerCase();
118
- }
119
- if (this.config.stripAccents) {
120
- processed = this.stripAccents(processed);
121
- }
122
- return processed;
123
- }
124
- /**
125
- * Strips accents from text
126
- */
127
- stripAccents(text) {
128
- return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
129
- }
130
- /**
131
- * Splits text into words while tracking character offsets
132
- */
133
- splitIntoWords(processedText, originalText) {
134
- const words = [];
135
- // Split on whitespace and punctuation while keeping track of positions
136
- const wordPattern = /\S+/g;
137
- let match;
138
- while ((match = wordPattern.exec(processedText)) !== null) {
139
- // Find corresponding position in original text
140
- // Since we may have lowercased, we need to map positions
141
- const start = match.index;
142
- const end = start + match[0].length;
143
- words.push({
144
- word: match[0],
145
- start,
146
- end,
147
- });
148
- }
149
- return words;
150
- }
151
- /**
152
- * Tokenizes a single word using WordPiece algorithm
153
- */
154
- tokenizeWord(word, startOffset, endOffset) {
155
- const tokens = [];
156
- // Handle punctuation separately
157
- const subwords = this.splitWordIntoPieces(word);
158
- let currentOffset = startOffset;
159
- for (let i = 0; i < subwords.length; i++) {
160
- let subword = subwords[i];
161
- const isContinuation = i > 0;
162
- // For continuation tokens, add ## prefix for vocab lookup
163
- const vocabKey = isContinuation ? '##' + subword : subword;
164
- // Look up in vocabulary
165
- let tokenId = this.vocab.get(vocabKey);
166
- // If not found, try to find longest matching prefix
167
- if (tokenId === undefined) {
168
- const { id, token } = this.findLongestMatch(subword, isContinuation);
169
- tokenId = id;
170
- subword = token;
171
- }
172
- const tokenLength = subword.length;
173
- const tokenEnd = Math.min(currentOffset + tokenLength, endOffset);
174
- tokens.push({
175
- id: tokenId,
176
- token: isContinuation ? '##' + subword : subword,
177
- start: currentOffset,
178
- end: tokenEnd,
179
- isContinuation,
180
- isSpecial: false,
181
- });
182
- currentOffset = tokenEnd;
183
- }
184
- return tokens;
185
- }
186
- /**
187
- * Splits a word into pieces, handling punctuation
149
+ * Find the best matching token using greedy longest-match
188
150
  */
189
- splitWordIntoPieces(word) {
190
- const pieces = [];
191
- let current = '';
192
- for (const char of word) {
193
- if (this.isPunctuation(char)) {
194
- if (current.length > 0) {
195
- pieces.push(current);
196
- current = '';
151
+ findBestToken(text, startPos) {
152
+ const remaining = text.slice(startPos);
153
+ // Check if this starts a new word (preceded by space or start)
154
+ const isWordStart = startPos === 0 || /\s/.test(text[startPos - 1]);
155
+ // For SentencePiece models, word-initial tokens start with ▁
156
+ if (isWordStart) {
157
+ // Try with ▁ prefix first
158
+ const withPrefix = '' + remaining;
159
+ for (const [vocabToken, id] of this.sortedVocab) {
160
+ if (withPrefix.startsWith(vocabToken)) {
161
+ // Return the match length without the ▁ since that's not in original text
162
+ return {
163
+ token: vocabToken,
164
+ id,
165
+ length: vocabToken.length - 1 // Subtract 1 for the ▁
166
+ };
197
167
  }
198
- pieces.push(char);
199
168
  }
200
- else {
201
- current += char;
202
- }
203
- }
204
- if (current.length > 0) {
205
- pieces.push(current);
206
169
  }
207
- return pieces;
208
- }
209
- /**
210
- * Checks if a character is punctuation
211
- */
212
- isPunctuation(char) {
213
- const code = char.charCodeAt(0);
214
- // ASCII punctuation and some Unicode punctuation
215
- return ((code >= 33 && code <= 47) ||
216
- (code >= 58 && code <= 64) ||
217
- (code >= 91 && code <= 96) ||
218
- (code >= 123 && code <= 126) ||
219
- /[\u2000-\u206F]/.test(char) || // General punctuation
220
- /[\u3000-\u303F]/.test(char) // CJK punctuation
221
- );
222
- }
223
- /**
224
- * Finds the longest matching token in vocabulary
225
- */
226
- findLongestMatch(word, isContinuation) {
227
- const prefix = isContinuation ? '##' : '';
228
- // Try progressively shorter substrings
229
- for (let end = word.length; end > 0; end--) {
230
- const subword = word.slice(0, end);
231
- const vocabKey = prefix + subword;
232
- const id = this.vocab.get(vocabKey);
233
- if (id !== undefined) {
234
- return { id, token: subword };
170
+ // Try exact match without prefix
171
+ for (const [vocabToken, id] of this.sortedVocab) {
172
+ // Skip special tokens and tokens starting with ▁ for non-word-start positions
173
+ if (vocabToken.startsWith('<') || vocabToken.startsWith('['))
174
+ continue;
175
+ if (!isWordStart && vocabToken.startsWith('▁'))
176
+ continue;
177
+ if (remaining.startsWith(vocabToken.replace(/^▁/, ''))) {
178
+ const matchLength = vocabToken.replace(/^▁/, '').length;
179
+ if (matchLength > 0) {
180
+ return { token: vocabToken, id, length: matchLength };
181
+ }
235
182
  }
236
183
  }
237
- // Fall back to unknown token
238
- return { id: this.unkId, token: word };
184
+ // Single character fallback
185
+ const char = remaining[0];
186
+ const charId = this.vocab.get(char) ?? this.vocab.get('▁' + char) ?? this.unkId;
187
+ return { token: char, id: charId, length: 1 };
239
188
  }
240
189
  /**
241
190
  * Decodes token IDs back to text
242
191
  */
243
192
  decode(tokenIds) {
244
- const tokens = [];
193
+ const parts = [];
245
194
  for (const id of tokenIds) {
246
195
  const token = this.inverseVocab.get(id);
247
196
  if (token === undefined)
248
197
  continue;
249
- // Skip special tokens
250
- if (token === this.config.clsToken ||
251
- token === this.config.sepToken ||
252
- token === this.config.padToken) {
198
+ if (token === this.clsToken || token === this.sepToken || token === this.padToken)
253
199
  continue;
254
- }
255
- // Handle continuation tokens
256
- if (token.startsWith('##')) {
257
- tokens.push(token.slice(2));
200
+ // SentencePiece uses ▁ to mark word boundaries
201
+ if (token.startsWith('▁')) {
202
+ parts.push(' ' + token.slice(1));
258
203
  }
259
204
  else {
260
- if (tokens.length > 0) {
261
- tokens.push(' ');
262
- }
263
- tokens.push(token);
205
+ parts.push(token);
264
206
  }
265
207
  }
266
- return tokens.join('');
208
+ return parts.join('').trim();
267
209
  }
268
210
  /**
269
211
  * Gets vocabulary size
@@ -285,15 +227,58 @@ export class WordPieceTokenizer {
285
227
  }
286
228
  }
287
229
  /**
288
- * Loads vocabulary from a text file (one token per line)
230
+ * Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
289
231
  */
290
- export async function loadVocabFromFile(path) {
232
+ export async function loadVocabFromFile(filePath) {
291
233
  const fs = await import('fs/promises');
292
- const content = await fs.readFile(path, 'utf-8');
293
- return parseVocab(content);
234
+ const content = await fs.readFile(filePath, 'utf-8');
235
+ // Detect format
236
+ if (filePath.endsWith('.json') || content.trim().startsWith('{')) {
237
+ return parseHFTokenizerJson(content);
238
+ }
239
+ else {
240
+ return parseVocab(content);
241
+ }
242
+ }
243
+ /**
244
+ * Parses HuggingFace tokenizer.json format
245
+ */
246
+ export function parseHFTokenizerJson(content) {
247
+ const vocab = new Map();
248
+ try {
249
+ const config = JSON.parse(content);
250
+ // Add special tokens first
251
+ if (config.added_tokens) {
252
+ for (const token of config.added_tokens) {
253
+ vocab.set(token.content, token.id);
254
+ }
255
+ }
256
+ // Add model vocabulary
257
+ if (config.model?.vocab) {
258
+ if (Array.isArray(config.model.vocab)) {
259
+ // Unigram format: array of [token, score] pairs
260
+ for (let i = 0; i < config.model.vocab.length; i++) {
261
+ const entry = config.model.vocab[i];
262
+ if (entry && typeof entry[0] === 'string') {
263
+ vocab.set(entry[0], i);
264
+ }
265
+ }
266
+ }
267
+ else {
268
+ // BPE/WordPiece format: object mapping token -> id
269
+ for (const [token, id] of Object.entries(config.model.vocab)) {
270
+ vocab.set(token, id);
271
+ }
272
+ }
273
+ }
274
+ }
275
+ catch (e) {
276
+ throw new Error(`Failed to parse tokenizer.json: ${e}`);
277
+ }
278
+ return vocab;
294
279
  }
295
280
  /**
296
- * Parses vocabulary from string content
281
+ * Parses vocabulary from string content (vocab.txt format)
297
282
  */
298
283
  export function parseVocab(content) {
299
284
  const vocab = new Map();
@@ -311,26 +296,19 @@ export function parseVocab(content) {
311
296
  */
312
297
  export function createTestVocab() {
313
298
  const tokens = [
314
- '[PAD]',
315
- '[UNK]',
316
- '[CLS]',
317
- '[SEP]',
318
- '[MASK]',
319
- 'the',
320
- 'a',
321
- 'is',
322
- 'was',
323
- 'john',
324
- 'smith',
325
- 'berlin',
326
- 'germany',
327
- '##s',
328
- '##ed',
329
- '##ing',
330
- ',',
331
- '.',
299
+ '<s>',
300
+ '<pad>',
301
+ '</s>',
302
+ '<unk>',
303
+ '▁Hello',
304
+ '▁John',
305
+ '▁Smith',
306
+ '▁from',
307
+ '▁Acme',
308
+ '▁Corp',
309
+ '▁in',
310
+ '▁Berlin',
332
311
  '!',
333
- '?',
334
312
  ];
335
313
  const vocab = new Map();
336
314
  tokens.forEach((token, index) => {
@@ -1 +1 @@
1
- {"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AA8DH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,QAAQ,EAAE,OAAO;IACjB,SAAS,EAAE,QAAQ;IACnB,WAAW,EAAE,IAAI;IACjB,YAAY,EAAE,IAAI;CACnB,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IAEhC,oBAAoB;IACZ,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IACd,KAAK,CAAS;IAEtB,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;QACzD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QAE5C,iCAAiC;QACjC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;QAE3D,qBAAqB;QACrB,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,SAAS,EAAE,CAAC;YAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;YAC3B,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;gBAC3B,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACzC,CAAC;QACH,CAAC;QAED,kBAAkB;QAClB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,mBAAmB;YACnB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC3B,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,IAAY;QAC7B,IAAI,SAAS,GAAG,IAAI,CAAC;QAErB,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;YAC5B,SAAS,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC;YAC7B,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY;QAC/B,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,aAAqB,EACrB,YAAoB;QAEpB,MAAM,KAAK,GAAwD,EAAE,CAAC;QAEtE,uEAAuE;QACvE,MAAM,WAAW,GAAG,MAAM,CAAC;QAC3B,IAAI,KAA6B,CAAC;QAElC,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC1D,+CAA+C;YAC/C,yDAAyD;YACzD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,GAAG,GAAG,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YAEpC,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;gBACd,KAAK;gBACL,GAAG;aACJ,CAAC,CAAC;QACL,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY,EAAE,WAAmB,EAAE,SAAiB;QACvE,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,aAAa,GAAG,WAAW,CAAC;QAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,IAAI,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;YAC3B,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;YAE7B,0DAA0D;YAC1D,MAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;YAE3D,wBAAwB;YACxB,IAAI,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAEvC,oDAAoD;YACpD,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC1B,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;gBACrE,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,GAAG,KAAK,CAAC;YAClB,CAAC;YAED,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,aAAa,GAAG,WAAW,EAAE,SAAS,CAAC,CAAC;YAElE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,OAAO;gBACX,KAAK,EAAE,cAAc,CAAC,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO;gBAChD,KAAK,EAAE,aAAa;gBACpB,GAAG,EAAE,QAAQ;gBACb,cAAc;gBACd,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YAEH,aAAa,GAAG,QAAQ,CAAC;QAC3B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,IAAY;QACtC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;QAEjB,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACrB,OAAO,GAAG,EAAE,CAAC;gBACf,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACpB,CAAC;iBAAM,CAAC;gBACN,OAAO,IAAI,IAAI,CAAC;YAClB,CAAC;QACH,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY;QAChC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAChC,iDAAiD;QACjD,OAAO,CACL,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,CAAC;YAC1B,CAAC,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,GAAG,CAAC;YAC5B,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,sBAAsB;YACtD,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,kBAAkB;SAChD,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,gBAAgB,CACtB,IAAY,EACZ,cAAuB;QAEvB,MAAM,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1C,uCAAuC;QACvC,KAAK,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC;YAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACnC,MAAM,QAAQ,GAAG,MAAM,GAAG,OAAO,CAAC;YAElC,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACpC,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;YAChC,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,OAAO,EAAE,EAAE,EAAE,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACzC,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAElC,sBAAsB;YACtB,IACE,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,KAAK,KAAK,IAAI,CAAC,MAAM,CAAC,QAAQ,EAC9B,CAAC;gBACD,SAAS;YACX,CAAC;YAED,6BAA6B;YAC7B,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;gBACD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAAY;IAClD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACjD,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,OAAO;QACP,OAAO;QACP,OAAO;QACP,OAAO;QACP,QAAQ;QACR,KAAK;QACL,GAAG;QACH,IAAI;QACJ,KAAK;QACL,MAAM;QACN,OAAO;QACP,QAAQ;QACR,SAAS;QACT,KAAK;QACL,MAAM;QACN,OAAO;QACP,GAAG;QACH,GAAG;QACH,GAAG;QACH,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
1
+ {"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkEH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,KAAK,EAAE,gCAAgC;CACrD,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IACxB,WAAW,CAA0B;IAE7C,wCAAwC;IAChC,KAAK,GAAW,CAAC,CAAC,CAAE,MAAM;IAC1B,KAAK,GAAW,CAAC,CAAC,CAAE,OAAO;IAC3B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAC5B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAEpC,wBAAwB;IAChB,QAAQ,GAAW,KAAK,CAAC;IACzB,QAAQ,GAAW,MAAM,CAAC;IAC1B,QAAQ,GAAW,OAAO,CAAC;IAC3B,QAAQ,GAAW,OAAO,CAAC;IAEnC,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,iEAAiE;QACjE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEzF,0CAA0C;QAC1C,IAAI,CAAC,mBAAmB,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,oBAAoB;QACpB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;YACtB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC;YACvB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QACD,aAAa;aACR,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;QAC9C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAE1E,sCAAsC;QACtC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,OAAO,GAAG,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;YAClC,kBAAkB;YAClB,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAE,CAAC,EAAE,CAAC;gBACnC,GAAG,EAAE,CAAC;gBACN,SAAS;YACX,CAAC;YAED,4DAA4D;YAC5D,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;YAErE,MAAM,aAAa,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC,CAAC;YAEtE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,KAAK;gBACL,KAAK,EAAE,GAAG;gBACV,GAAG,EAAE,GAAG,GAAG,MAAM;gBACjB,cAAc,EAAE,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;gBACxD,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YAE1C,GAAG,IAAI,MAAM,CAAC;QAChB,CAAC;QAED,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,QAAQ;gBACpB,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,QAAgB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEvC,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAE,CAAC,CAAC;QAErE,6DAA6D;QAC7D,IAAI,WAAW,EAAE,CAAC;YAChB,0BAA0B;YAC1B,MAAM,UAAU,GAAG,GAAG,GAAG,SAAS,CAAC;YACnC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBAChD,IAAI,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;oBACtC,0EAA0E;oBAC1E,OAAO;wBACL,KAAK,EAAE,UAAU;wBACjB,EAAE;wBACF,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,uBAAuB;qBACtD,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAChD,8EAA8E;YAC9E,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YACvE,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEzD,IAAI,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC;gBACxD,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;oBACpB,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;gBACxD,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC;QAChF,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAClC,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAE5F,+CAA+C;YAC/C,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB;IACtD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAErD,gBAAgB;IAChB,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAsB,CAAC;QAExD,2BAA2B;QAC3B,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YACxB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACxC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC;YACxB,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,gDAAgD;gBAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,QAAQ,EAAE,CAAC;wBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;oBACzB,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,mDAAmD;gBACnD,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7D,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAY,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,KAAK;QACL,OAAO;QACP,MAAM;QACN,OAAO;QACP,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,OAAO;QACP,OAAO;QACP,OAAO;QACP,KAAK;QACL,SAAS;QACT,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@elanlanguages/bridge-anonymization",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "On-device PII anonymization module for high-privacy translation",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -15,8 +15,9 @@
15
15
  "test:run": "vitest run",
16
16
  "lint": "eslint src --ext .ts",
17
17
  "clean": "rm -rf dist",
18
- "setup:ner": "bash scripts/setup-ner-model.sh",
19
- "setup:ner:quantized": "bash scripts/setup-ner-model.sh --quantize",
18
+ "setup:ner": "bash scripts/setup-ner-model.sh --quantize",
19
+ "setup:ner:standard": "bash scripts/setup-ner-model.sh",
20
+ "setup:ner:upload": "bash scripts/setup-ner-model.sh --quantize --upload",
20
21
  "prepublishOnly": "npm run clean && npm run build"
21
22
  },
22
23
  "keywords": [