@elanlanguages/bridge-anonymization 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +12 -12
- package/dist/ner/model-manager.d.ts +2 -2
- package/dist/ner/model-manager.d.ts.map +1 -1
- package/dist/ner/model-manager.js +7 -8
- package/dist/ner/model-manager.js.map +1 -1
- package/dist/ner/ner-model.js +1 -1
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/tokenizer.d.ts +26 -53
- package/dist/ner/tokenizer.d.ts.map +1 -1
- package/dist/ner/tokenizer.js +174 -196
- package/dist/ner/tokenizer.js.map +1 -1
- package/package.json +4 -3
package/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ELAN Languages
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
package/README.md
CHANGED
|
@@ -13,10 +13,10 @@ On-device PII anonymization module for high-privacy translation workflows. Detec
|
|
|
13
13
|
## Installation
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
npm install bridge-anonymization
|
|
16
|
+
npm install @elanlanguages/bridge-anonymization
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
> **Bun users**: Install `onnxruntime-web` additionally: `bun add bridge-anonymization onnxruntime-web`
|
|
19
|
+
> **Bun users**: Install `onnxruntime-web` additionally: `bun add @elanlanguages/bridge-anonymization onnxruntime-web`
|
|
20
20
|
|
|
21
21
|
## Quick Start
|
|
22
22
|
|
|
@@ -25,7 +25,7 @@ npm install bridge-anonymization
|
|
|
25
25
|
For structured PII like emails, phones, IBANs, credit cards:
|
|
26
26
|
|
|
27
27
|
```typescript
|
|
28
|
-
import { anonymizeRegexOnly } from 'bridge-anonymization';
|
|
28
|
+
import { anonymizeRegexOnly } from '@elanlanguages/bridge-anonymization';
|
|
29
29
|
|
|
30
30
|
const result = await anonymizeRegexOnly(
|
|
31
31
|
'Contact john@example.com or call +49 30 123456. IBAN: DE89370400440532013000'
|
|
@@ -40,7 +40,7 @@ console.log(result.anonymizedText);
|
|
|
40
40
|
The NER model is automatically downloaded on first use (~280 MB for quantized):
|
|
41
41
|
|
|
42
42
|
```typescript
|
|
43
|
-
import { createAnonymizer } from 'bridge-anonymization';
|
|
43
|
+
import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
44
44
|
|
|
45
45
|
const anonymizer = createAnonymizer({
|
|
46
46
|
ner: {
|
|
@@ -62,7 +62,7 @@ console.log(result.anonymizedText);
|
|
|
62
62
|
### One-liner with NER
|
|
63
63
|
|
|
64
64
|
```typescript
|
|
65
|
-
import { anonymizeWithNER } from 'bridge-anonymization';
|
|
65
|
+
import { anonymizeWithNER } from '@elanlanguages/bridge-anonymization';
|
|
66
66
|
|
|
67
67
|
const result = await anonymizeWithNER(
|
|
68
68
|
'Contact John Smith at john@example.com',
|
|
@@ -75,7 +75,7 @@ const result = await anonymizeWithNER(
|
|
|
75
75
|
### Configuration Options
|
|
76
76
|
|
|
77
77
|
```typescript
|
|
78
|
-
import { createAnonymizer, InMemoryKeyProvider } from 'bridge-anonymization';
|
|
78
|
+
import { createAnonymizer, InMemoryKeyProvider } from '@elanlanguages/bridge-anonymization';
|
|
79
79
|
|
|
80
80
|
const anonymizer = createAnonymizer({
|
|
81
81
|
// NER configuration
|
|
@@ -211,7 +211,7 @@ interface AnonymizationResult {
|
|
|
211
211
|
### Anonymization Policy
|
|
212
212
|
|
|
213
213
|
```typescript
|
|
214
|
-
import { createAnonymizer, PIIType } from 'bridge-anonymization';
|
|
214
|
+
import { createAnonymizer, PIIType } from '@elanlanguages/bridge-anonymization';
|
|
215
215
|
|
|
216
216
|
const anonymizer = createAnonymizer({
|
|
217
217
|
ner: { mode: 'quantized' },
|
|
@@ -239,7 +239,7 @@ const anonymizer = createAnonymizer({
|
|
|
239
239
|
Add domain-specific patterns:
|
|
240
240
|
|
|
241
241
|
```typescript
|
|
242
|
-
import { createCustomIdRecognizer, PIIType, createAnonymizer } from 'bridge-anonymization';
|
|
242
|
+
import { createCustomIdRecognizer, PIIType, createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
243
243
|
|
|
244
244
|
const customRecognizer = createCustomIdRecognizer([
|
|
245
245
|
{
|
|
@@ -271,7 +271,7 @@ import {
|
|
|
271
271
|
clearModelCache,
|
|
272
272
|
listDownloadedModels,
|
|
273
273
|
getModelCacheDir
|
|
274
|
-
} from 'bridge-anonymization';
|
|
274
|
+
} from '@elanlanguages/bridge-anonymization';
|
|
275
275
|
|
|
276
276
|
// Check if model is downloaded
|
|
277
277
|
const hasModel = await isModelDownloaded('quantized');
|
|
@@ -293,7 +293,7 @@ await clearModelCache('quantized'); // or clearModelCache() for all
|
|
|
293
293
|
The PII map is encrypted using AES-256-GCM:
|
|
294
294
|
|
|
295
295
|
```typescript
|
|
296
|
-
import { createAnonymizer, KeyProvider, generateKey } from 'bridge-anonymization';
|
|
296
|
+
import { createAnonymizer, KeyProvider, generateKey } from '@elanlanguages/bridge-anonymization';
|
|
297
297
|
|
|
298
298
|
class SecureKeyProvider implements KeyProvider {
|
|
299
299
|
async getKey(): Promise<Buffer> {
|
|
@@ -320,13 +320,13 @@ const anonymizer = createAnonymizer({
|
|
|
320
320
|
This library works with [Bun](https://bun.sh). Since `onnxruntime-node` is a native Node.js addon, Bun users need `onnxruntime-web`:
|
|
321
321
|
|
|
322
322
|
```bash
|
|
323
|
-
bun add bridge-anonymization onnxruntime-web
|
|
323
|
+
bun add @elanlanguages/bridge-anonymization onnxruntime-web
|
|
324
324
|
```
|
|
325
325
|
|
|
326
326
|
Usage is identical - the library auto-detects the runtime:
|
|
327
327
|
|
|
328
328
|
```typescript
|
|
329
|
-
import { createAnonymizer } from 'bridge-anonymization';
|
|
329
|
+
import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
330
330
|
|
|
331
331
|
const anonymizer = createAnonymizer({
|
|
332
332
|
ner: { mode: 'quantized' }
|
|
@@ -41,8 +41,8 @@ export interface ModelInfo {
|
|
|
41
41
|
/**
|
|
42
42
|
* Registry of available models hosted on Hugging Face Hub
|
|
43
43
|
*
|
|
44
|
-
* Using
|
|
45
|
-
* https://huggingface.co/
|
|
44
|
+
* Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
|
|
45
|
+
* https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
|
|
46
46
|
*/
|
|
47
47
|
export declare const MODEL_REGISTRY: Record<'standard' | 'quantized', ModelInfo>;
|
|
48
48
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,
|
|
1
|
+
{"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,CAyBtE,CAAC;AAYF;;;GAGG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAYzC;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,MAAM,CAEnE;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,CAkBxF;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,CAAC,QAAQ,EAAE;IAChD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAuEX;;GAEG;AACH,wBAAsB,aAAa,CACjC,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,UAAU,CAAC,EAAE,wBAAwB,EACrC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CAiDjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,wBAAwB,CAAC;IACtC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC,CAiCzE;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,IAAI,CAAC,EAAE,UAAU,GAAG,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAQpF;AAED;;GAEG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CAAC,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,GAAG,WAAW,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC,CAY3H;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAEtE"}
|
|
@@ -8,8 +8,8 @@ import * as os from 'os';
|
|
|
8
8
|
/**
|
|
9
9
|
* Registry of available models hosted on Hugging Face Hub
|
|
10
10
|
*
|
|
11
|
-
* Using
|
|
12
|
-
* https://huggingface.co/
|
|
11
|
+
* Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
|
|
12
|
+
* https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
|
|
13
13
|
*/
|
|
14
14
|
export const MODEL_REGISTRY = {
|
|
15
15
|
standard: {
|
|
@@ -17,25 +17,24 @@ export const MODEL_REGISTRY = {
|
|
|
17
17
|
name: 'XLM-RoBERTa NER (Standard)',
|
|
18
18
|
description: 'Multilingual NER model supporting EN, DE, FR, ES, and more',
|
|
19
19
|
size: '~1.1 GB',
|
|
20
|
-
hfRepo: '
|
|
20
|
+
hfRepo: 'tjruesch/xlm-roberta-base-ner-hrl-onnx',
|
|
21
21
|
hfSubfolder: 'onnx',
|
|
22
22
|
files: [
|
|
23
23
|
{ repoFile: 'model.onnx', localFile: 'model.onnx', required: true },
|
|
24
|
-
{ repoFile: 'model.onnx_data', localFile: 'model.onnx_data', required: false },
|
|
25
24
|
],
|
|
26
|
-
labelMap: ['O', 'B-
|
|
25
|
+
labelMap: ['O', 'B-DATE', 'I-DATE', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'],
|
|
27
26
|
},
|
|
28
27
|
quantized: {
|
|
29
28
|
id: 'xlm-roberta-ner-quantized',
|
|
30
29
|
name: 'XLM-RoBERTa NER (Quantized)',
|
|
31
30
|
description: 'Quantized version, ~4x smaller with minimal accuracy loss',
|
|
32
|
-
size: '~
|
|
33
|
-
hfRepo: '
|
|
31
|
+
size: '~265 MB',
|
|
32
|
+
hfRepo: 'tjruesch/xlm-roberta-base-ner-hrl-onnx',
|
|
34
33
|
hfSubfolder: 'onnx',
|
|
35
34
|
files: [
|
|
36
35
|
{ repoFile: 'model_quantized.onnx', localFile: 'model.onnx', required: true },
|
|
37
36
|
],
|
|
38
|
-
labelMap: ['O', 'B-
|
|
37
|
+
labelMap: ['O', 'B-DATE', 'I-DATE', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'],
|
|
39
38
|
},
|
|
40
39
|
};
|
|
41
40
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"model-manager.js","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AA0CzB;;;;;GAKG;AACH,MAAM,CAAC,MAAM,cAAc,GAAgD;IACzE,QAAQ,EAAE;QACR,EAAE,EAAE,0BAA0B;QAC9B,IAAI,EAAE,4BAA4B;QAClC,WAAW,EAAE,4DAA4D;QACzE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,
|
|
1
|
+
{"version":3,"file":"model-manager.js","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AA0CzB;;;;;GAKG;AACH,MAAM,CAAC,MAAM,cAAc,GAAgD;IACzE,QAAQ,EAAE;QACR,EAAE,EAAE,0BAA0B;QAC9B,IAAI,EAAE,4BAA4B;QAClC,WAAW,EAAE,4DAA4D;QACzE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,wCAAwC;QAChD,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;SACpE;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;IACD,SAAS,EAAE;QACT,EAAE,EAAE,2BAA2B;QAC/B,IAAI,EAAE,6BAA6B;QACnC,WAAW,EAAE,2DAA2D;QACxE,IAAI,EAAE,SAAS;QACf,MAAM,EAAE,wCAAwC;QAChD,WAAW,EAAE,MAAM;QACnB,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,sBAAsB,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE;SAC9E;QACD,QAAQ,EAAE,CAAC,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAAoB;IACvC,EAAE,QAAQ,EAAE,gBAAgB,EAAE,SAAS,EAAE,gBAAgB,EAAE,QAAQ,EAAE,IAAI,EAAE;IAC3E,EAAE,QAAQ,EAAE,uBAAuB,EAAE,SAAS,EAAE,uBAAuB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC1F,EAAE,QAAQ,EAAE,yBAAyB,EAAE,SAAS,EAAE,yBAAyB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC9F,EAAE,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,aAAa,EAAE,QAAQ,EAAE,KAAK,EAAE;CACvE,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB;IAC9B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;IAE7B,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;QACzB,KAAK,QAAQ;YACX,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QACnF,KAAK,OAAO;YACV,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;QAC5H;YACE,oDAAoD;YACpD,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,EAAE,sBAAsB,EAAE,QAAQ,CAAC,CAAC;IACtH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,CAAC,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAA8B;IACpE,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,CAAC;QACH,6BAA6B;QAC7B,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpF,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;QAC5D,CAAC;QAED,4BAA4B;QAC5B,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC,CAAC;QAEvD,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAYD;;GAEG;AACH,SAAS,iBAAiB,CAAC,IAAY,EAAE,QAAgB,EAAE,SAAkB;IAC3E,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,GAAG,SAAS,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;IACnE,OAAO,0BAA0B,IAAI,iBAAiB,QAAQ,EAAE,CAAC;AACnE,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,YAAY,CACzB,GAAW,EACX,QAAgB,EAChB,UAAqC;IAErC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;QAChC,OAAO,EAAE;YACP,YAAY,EAAE,4BAA4B;SAC3C;KACF,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAE3D,0BAA0B;IAC1B,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE5D,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,wDAAwD;IACxD,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,IAAI,eAAe,GAAG,CAAC,CAAC;IAExB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QAE5C,IAAI,IAAI;YAAE,MAAM;QAEhB,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,eAAe,IAAI,KAAK,CAAC,MAAM,CAAC;QAEhC,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC;gBACT,IAAI,EAAE,QAAQ;gBACd,eAAe;gBACf,UAAU,EAAE,KAAK;gBACjB,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI;aACpE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACrC,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;AACvC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,IAA8B,EAC9B,UAAqC,EACrC,QAAmC;IAEnC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAEpC,mBAAmB;IACnB,MAAM,EAAE,CAAC,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9C,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,IAAI,2BAA2B,CAAC,CAAC;IAChE,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAEzC,uBAAuB;IACvB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QAC5E,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,QAAQ,EAAE,CAAC,eAAe,IAAI,CAAC,QAAQ,KAAK,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;YACD,mCAAmC;YACnC,QAAQ,EAAE,CAAC,0BAA0B,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;QACxD,CAAC;IACH,CAAC;IAED,2DAA2D;IAC3D,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;QAErD,IAAI,CAAC;YACH,MAAM,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,oCAAoC,IAAI,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7E,CAAC;QACH,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAC3D,MAAM,EAAE,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEzE,QAAQ,EAAE,CAAC,oBAAoB,CAAC,CAAC;IAEjC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAA8B,EAC9B,UAII,EAAE;IAEN,MAAM,EAAE,YAAY,GAAG,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;IAE9D,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAElC,8BAA8B;IAC9B,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAEnD,IAAI,CAAC,YAAY,EAAE,CAAC;QAClB,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CACb,cAAc,IAAI,kBAAkB,QAAQ,OAAO;gBACnD,mCAAmC;gBACnC,sCAAsC,IAAI,gCAAgC;gBAC1E,2BAA2B;gBAC3B,mDAAmD,CACpD,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,CAAC,IAAI,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IAClD,CAAC;SAAM,CAAC;QACN,QAAQ,EAAE,CAAC,uBAAuB,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,kBAAkB;IAClB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,YAAY,CAAC,CAAC;IAErE,OAAO;QACL,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,IAAI,YAAY,CAAC;QACpE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;QAChD,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KACpD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAA+B;IACnE,IAAI,IAAI,EAAE,CAAC;QACT,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;SAAM,CAAC;QACN,MAAM,QAAQ,GAAG,gBAAgB,EAAE,CAAC;QACpC,MAAM,EAAE,CAAC,EAAE,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB;IACxC,MAAM,MAAM,GAA0E,EAAE,CAAC;IAEzF,KAAK,MAAM,IAAI,IAAI,CAAC,UAAU,EAAE,WAAW,CAAU,EAAE,CAAC;QACtD,IAAI,MAAM,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC;YAClC,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAA8B;IACzD,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
|
package/dist/ner/ner-model.js
CHANGED
|
@@ -205,7 +205,7 @@ export function createNERModel(config) {
|
|
|
205
205
|
vocabPath: config.vocabPath,
|
|
206
206
|
labelMap: config.labelMap ?? DEFAULT_LABEL_MAP,
|
|
207
207
|
maxLength: config.maxLength ?? 512,
|
|
208
|
-
doLowerCase: config.doLowerCase ??
|
|
208
|
+
doLowerCase: config.doLowerCase ?? false, // XLM-RoBERTa is cased
|
|
209
209
|
modelVersion: config.modelVersion ?? '1.0.0',
|
|
210
210
|
};
|
|
211
211
|
return new NERModel(fullConfig);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,
|
|
1
|
+
{"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,KAAK,EAAE,uBAAuB;QACjE,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAa,EAAE,OAA6B;QACxD,OAAO;YACL,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO;QACX,QAAQ;IACV,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
|
package/dist/ner/tokenizer.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* HuggingFace Tokenizer
|
|
3
|
+
* Loads and uses tokenizers from HuggingFace's tokenizer.json format
|
|
4
|
+
* Supports Unigram (SentencePiece) and BPE tokenizers
|
|
5
5
|
*/
|
|
6
6
|
/**
|
|
7
7
|
* Token with offset information
|
|
@@ -15,9 +15,9 @@ export interface Token {
|
|
|
15
15
|
start: number;
|
|
16
16
|
/** End character offset in original text */
|
|
17
17
|
end: number;
|
|
18
|
-
/** Whether this is a continuation token
|
|
18
|
+
/** Whether this is a continuation token */
|
|
19
19
|
isContinuation: boolean;
|
|
20
|
-
/** Whether this is a special token
|
|
20
|
+
/** Whether this is a special token */
|
|
21
21
|
isSpecial: boolean;
|
|
22
22
|
}
|
|
23
23
|
/**
|
|
@@ -39,75 +39,44 @@ export interface TokenizationResult {
|
|
|
39
39
|
* Tokenizer configuration
|
|
40
40
|
*/
|
|
41
41
|
export interface TokenizerConfig {
|
|
42
|
-
/** Path to vocabulary file */
|
|
43
|
-
vocabPath?: string;
|
|
44
|
-
/** Vocabulary as a Map */
|
|
45
|
-
vocab?: Map<string, number>;
|
|
46
42
|
/** Maximum sequence length */
|
|
47
43
|
maxLength: number;
|
|
48
|
-
/** Unknown token */
|
|
49
|
-
unkToken: string;
|
|
50
|
-
/** Classification token */
|
|
51
|
-
clsToken: string;
|
|
52
|
-
/** Separator token */
|
|
53
|
-
sepToken: string;
|
|
54
|
-
/** Padding token */
|
|
55
|
-
padToken: string;
|
|
56
|
-
/** Mask token */
|
|
57
|
-
maskToken: string;
|
|
58
44
|
/** Whether to lowercase input */
|
|
59
45
|
doLowerCase: boolean;
|
|
60
|
-
/** Strip accents */
|
|
61
|
-
stripAccents: boolean;
|
|
62
46
|
}
|
|
63
47
|
/**
|
|
64
|
-
* Default tokenizer configuration
|
|
48
|
+
* Default tokenizer configuration
|
|
65
49
|
*/
|
|
66
50
|
export declare const DEFAULT_TOKENIZER_CONFIG: TokenizerConfig;
|
|
67
51
|
/**
|
|
68
|
-
* WordPiece Tokenizer
|
|
52
|
+
* WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
|
|
69
53
|
*/
|
|
70
54
|
export declare class WordPieceTokenizer {
|
|
71
55
|
private vocab;
|
|
72
56
|
private inverseVocab;
|
|
73
57
|
private config;
|
|
74
|
-
private
|
|
58
|
+
private sortedVocab;
|
|
75
59
|
private clsId;
|
|
76
60
|
private sepId;
|
|
77
61
|
private padId;
|
|
62
|
+
private unkId;
|
|
63
|
+
private clsToken;
|
|
64
|
+
private sepToken;
|
|
65
|
+
private padToken;
|
|
66
|
+
private unkToken;
|
|
78
67
|
constructor(vocab: Map<string, number>, config?: Partial<TokenizerConfig>);
|
|
79
68
|
/**
|
|
80
|
-
*
|
|
81
|
-
*/
|
|
82
|
-
tokenize(text: string): TokenizationResult;
|
|
83
|
-
/**
|
|
84
|
-
* Preprocesses text (lowercase, accent stripping)
|
|
85
|
-
*/
|
|
86
|
-
private preprocess;
|
|
87
|
-
/**
|
|
88
|
-
* Strips accents from text
|
|
89
|
-
*/
|
|
90
|
-
private stripAccents;
|
|
91
|
-
/**
|
|
92
|
-
* Splits text into words while tracking character offsets
|
|
93
|
-
*/
|
|
94
|
-
private splitIntoWords;
|
|
95
|
-
/**
|
|
96
|
-
* Tokenizes a single word using WordPiece algorithm
|
|
97
|
-
*/
|
|
98
|
-
private tokenizeWord;
|
|
99
|
-
/**
|
|
100
|
-
* Splits a word into pieces, handling punctuation
|
|
69
|
+
* Detect special tokens from vocabulary
|
|
101
70
|
*/
|
|
102
|
-
private
|
|
71
|
+
private detectSpecialTokens;
|
|
103
72
|
/**
|
|
104
|
-
*
|
|
73
|
+
* Tokenizes text into tokens with offset tracking
|
|
105
74
|
*/
|
|
106
|
-
|
|
75
|
+
tokenize(text: string): TokenizationResult;
|
|
107
76
|
/**
|
|
108
|
-
*
|
|
77
|
+
* Find the best matching token using greedy longest-match
|
|
109
78
|
*/
|
|
110
|
-
private
|
|
79
|
+
private findBestToken;
|
|
111
80
|
/**
|
|
112
81
|
* Decodes token IDs back to text
|
|
113
82
|
*/
|
|
@@ -126,11 +95,15 @@ export declare class WordPieceTokenizer {
|
|
|
126
95
|
getToken(id: number): string | undefined;
|
|
127
96
|
}
|
|
128
97
|
/**
|
|
129
|
-
* Loads vocabulary from a
|
|
98
|
+
* Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
|
|
99
|
+
*/
|
|
100
|
+
export declare function loadVocabFromFile(filePath: string): Promise<Map<string, number>>;
|
|
101
|
+
/**
|
|
102
|
+
* Parses HuggingFace tokenizer.json format
|
|
130
103
|
*/
|
|
131
|
-
export declare function
|
|
104
|
+
export declare function parseHFTokenizerJson(content: string): Map<string, number>;
|
|
132
105
|
/**
|
|
133
|
-
* Parses vocabulary from string content
|
|
106
|
+
* Parses vocabulary from string content (vocab.txt format)
|
|
134
107
|
*/
|
|
135
108
|
export declare function parseVocab(content: string): Map<string, number>;
|
|
136
109
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,2CAA2C;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAsBD;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;CACtB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAGtC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,WAAW,CAA0B;IAG7C,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAG1B,OAAO,CAAC,QAAQ,CAAiB;IACjC,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,QAAQ,CAAmB;IACnC,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAyB3B;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAsF1C;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IAmBlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;CAGzC;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAUtF;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAmCzE;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAuBrD"}
|
package/dist/ner/tokenizer.js
CHANGED
|
@@ -1,33 +1,33 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* HuggingFace Tokenizer
|
|
3
|
+
* Loads and uses tokenizers from HuggingFace's tokenizer.json format
|
|
4
|
+
* Supports Unigram (SentencePiece) and BPE tokenizers
|
|
5
5
|
*/
|
|
6
6
|
/**
|
|
7
|
-
* Default tokenizer configuration
|
|
7
|
+
* Default tokenizer configuration
|
|
8
8
|
*/
|
|
9
9
|
export const DEFAULT_TOKENIZER_CONFIG = {
|
|
10
10
|
maxLength: 512,
|
|
11
|
-
|
|
12
|
-
clsToken: '[CLS]',
|
|
13
|
-
sepToken: '[SEP]',
|
|
14
|
-
padToken: '[PAD]',
|
|
15
|
-
maskToken: '[MASK]',
|
|
16
|
-
doLowerCase: true,
|
|
17
|
-
stripAccents: true,
|
|
11
|
+
doLowerCase: false, // XLM-RoBERTa doesn't lowercase
|
|
18
12
|
};
|
|
19
13
|
/**
|
|
20
|
-
* WordPiece Tokenizer
|
|
14
|
+
* WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
|
|
21
15
|
*/
|
|
22
16
|
export class WordPieceTokenizer {
|
|
23
17
|
vocab;
|
|
24
18
|
inverseVocab;
|
|
25
19
|
config;
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
clsId;
|
|
29
|
-
sepId;
|
|
30
|
-
padId;
|
|
20
|
+
sortedVocab;
|
|
21
|
+
// Special token IDs (XLM-RoBERTa style)
|
|
22
|
+
clsId = 0; // <s>
|
|
23
|
+
sepId = 2; // </s>
|
|
24
|
+
padId = 1; // <pad>
|
|
25
|
+
unkId = 3; // <unk>
|
|
26
|
+
// Special token strings
|
|
27
|
+
clsToken = '<s>';
|
|
28
|
+
sepToken = '</s>';
|
|
29
|
+
padToken = '<pad>';
|
|
30
|
+
unkToken = '<unk>';
|
|
31
31
|
constructor(vocab, config = {}) {
|
|
32
32
|
this.vocab = vocab;
|
|
33
33
|
this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
|
|
@@ -36,11 +36,37 @@ export class WordPieceTokenizer {
|
|
|
36
36
|
for (const [token, id] of vocab) {
|
|
37
37
|
this.inverseVocab.set(id, token);
|
|
38
38
|
}
|
|
39
|
-
//
|
|
40
|
-
this.
|
|
41
|
-
|
|
42
|
-
this.
|
|
43
|
-
|
|
39
|
+
// Sort vocab by token length (longest first) for greedy matching
|
|
40
|
+
this.sortedVocab = Array.from(vocab.entries()).sort((a, b) => b[0].length - a[0].length);
|
|
41
|
+
// Try to detect special tokens from vocab
|
|
42
|
+
this.detectSpecialTokens();
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Detect special tokens from vocabulary
|
|
46
|
+
*/
|
|
47
|
+
detectSpecialTokens() {
|
|
48
|
+
// XLM-RoBERTa style
|
|
49
|
+
if (this.vocab.has('<s>')) {
|
|
50
|
+
this.clsToken = '<s>';
|
|
51
|
+
this.clsId = this.vocab.get('<s>') ?? 0;
|
|
52
|
+
this.sepToken = '</s>';
|
|
53
|
+
this.sepId = this.vocab.get('</s>') ?? 2;
|
|
54
|
+
this.padToken = '<pad>';
|
|
55
|
+
this.padId = this.vocab.get('<pad>') ?? 1;
|
|
56
|
+
this.unkToken = '<unk>';
|
|
57
|
+
this.unkId = this.vocab.get('<unk>') ?? 3;
|
|
58
|
+
}
|
|
59
|
+
// BERT style
|
|
60
|
+
else if (this.vocab.has('[CLS]')) {
|
|
61
|
+
this.clsToken = '[CLS]';
|
|
62
|
+
this.clsId = this.vocab.get('[CLS]') ?? 101;
|
|
63
|
+
this.sepToken = '[SEP]';
|
|
64
|
+
this.sepId = this.vocab.get('[SEP]') ?? 102;
|
|
65
|
+
this.padToken = '[PAD]';
|
|
66
|
+
this.padId = this.vocab.get('[PAD]') ?? 0;
|
|
67
|
+
this.unkToken = '[UNK]';
|
|
68
|
+
this.unkId = this.vocab.get('[UNK]') ?? 100;
|
|
69
|
+
}
|
|
44
70
|
}
|
|
45
71
|
/**
|
|
46
72
|
* Tokenizes text into tokens with offset tracking
|
|
@@ -48,10 +74,10 @@ export class WordPieceTokenizer {
|
|
|
48
74
|
tokenize(text) {
|
|
49
75
|
const tokens = [];
|
|
50
76
|
const tokenToCharSpan = [];
|
|
51
|
-
// Add
|
|
77
|
+
// Add CLS token
|
|
52
78
|
tokens.push({
|
|
53
79
|
id: this.clsId,
|
|
54
|
-
token: this.
|
|
80
|
+
token: this.clsToken,
|
|
55
81
|
start: 0,
|
|
56
82
|
end: 0,
|
|
57
83
|
isContinuation: false,
|
|
@@ -59,21 +85,33 @@ export class WordPieceTokenizer {
|
|
|
59
85
|
});
|
|
60
86
|
tokenToCharSpan.push(null);
|
|
61
87
|
// Preprocess text
|
|
62
|
-
const processedText = this.
|
|
63
|
-
//
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
tokenToCharSpan.push([t.start, t.end]);
|
|
88
|
+
const processedText = this.config.doLowerCase ? text.toLowerCase() : text;
|
|
89
|
+
// Tokenize using greedy longest-match
|
|
90
|
+
let pos = 0;
|
|
91
|
+
while (pos < processedText.length) {
|
|
92
|
+
// Skip whitespace
|
|
93
|
+
if (/\s/.test(processedText[pos])) {
|
|
94
|
+
pos++;
|
|
95
|
+
continue;
|
|
71
96
|
}
|
|
97
|
+
// Find the longest matching token starting at this position
|
|
98
|
+
const { token, id, length } = this.findBestToken(processedText, pos);
|
|
99
|
+
const isFirstOfWord = pos === 0 || /\s/.test(processedText[pos - 1]);
|
|
100
|
+
tokens.push({
|
|
101
|
+
id,
|
|
102
|
+
token,
|
|
103
|
+
start: pos,
|
|
104
|
+
end: pos + length,
|
|
105
|
+
isContinuation: !isFirstOfWord && !token.startsWith('▁'),
|
|
106
|
+
isSpecial: false,
|
|
107
|
+
});
|
|
108
|
+
tokenToCharSpan.push([pos, pos + length]);
|
|
109
|
+
pos += length;
|
|
72
110
|
}
|
|
73
|
-
// Add
|
|
111
|
+
// Add SEP token
|
|
74
112
|
tokens.push({
|
|
75
113
|
id: this.sepId,
|
|
76
|
-
token: this.
|
|
114
|
+
token: this.sepToken,
|
|
77
115
|
start: text.length,
|
|
78
116
|
end: text.length,
|
|
79
117
|
isContinuation: false,
|
|
@@ -85,10 +123,9 @@ export class WordPieceTokenizer {
|
|
|
85
123
|
if (tokens.length > maxTokens) {
|
|
86
124
|
tokens.length = maxTokens - 1;
|
|
87
125
|
tokenToCharSpan.length = maxTokens - 1;
|
|
88
|
-
// Add [SEP] at end
|
|
89
126
|
tokens.push({
|
|
90
127
|
id: this.sepId,
|
|
91
|
-
token: this.
|
|
128
|
+
token: this.sepToken,
|
|
92
129
|
start: text.length,
|
|
93
130
|
end: text.length,
|
|
94
131
|
isContinuation: false,
|
|
@@ -109,161 +146,66 @@ export class WordPieceTokenizer {
|
|
|
109
146
|
};
|
|
110
147
|
}
|
|
111
148
|
/**
|
|
112
|
-
*
|
|
113
|
-
*/
|
|
114
|
-
preprocess(text) {
|
|
115
|
-
let processed = text;
|
|
116
|
-
if (this.config.doLowerCase) {
|
|
117
|
-
processed = processed.toLowerCase();
|
|
118
|
-
}
|
|
119
|
-
if (this.config.stripAccents) {
|
|
120
|
-
processed = this.stripAccents(processed);
|
|
121
|
-
}
|
|
122
|
-
return processed;
|
|
123
|
-
}
|
|
124
|
-
/**
|
|
125
|
-
* Strips accents from text
|
|
126
|
-
*/
|
|
127
|
-
stripAccents(text) {
|
|
128
|
-
return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
|
|
129
|
-
}
|
|
130
|
-
/**
|
|
131
|
-
* Splits text into words while tracking character offsets
|
|
132
|
-
*/
|
|
133
|
-
splitIntoWords(processedText, originalText) {
|
|
134
|
-
const words = [];
|
|
135
|
-
// Split on whitespace and punctuation while keeping track of positions
|
|
136
|
-
const wordPattern = /\S+/g;
|
|
137
|
-
let match;
|
|
138
|
-
while ((match = wordPattern.exec(processedText)) !== null) {
|
|
139
|
-
// Find corresponding position in original text
|
|
140
|
-
// Since we may have lowercased, we need to map positions
|
|
141
|
-
const start = match.index;
|
|
142
|
-
const end = start + match[0].length;
|
|
143
|
-
words.push({
|
|
144
|
-
word: match[0],
|
|
145
|
-
start,
|
|
146
|
-
end,
|
|
147
|
-
});
|
|
148
|
-
}
|
|
149
|
-
return words;
|
|
150
|
-
}
|
|
151
|
-
/**
|
|
152
|
-
* Tokenizes a single word using WordPiece algorithm
|
|
153
|
-
*/
|
|
154
|
-
tokenizeWord(word, startOffset, endOffset) {
|
|
155
|
-
const tokens = [];
|
|
156
|
-
// Handle punctuation separately
|
|
157
|
-
const subwords = this.splitWordIntoPieces(word);
|
|
158
|
-
let currentOffset = startOffset;
|
|
159
|
-
for (let i = 0; i < subwords.length; i++) {
|
|
160
|
-
let subword = subwords[i];
|
|
161
|
-
const isContinuation = i > 0;
|
|
162
|
-
// For continuation tokens, add ## prefix for vocab lookup
|
|
163
|
-
const vocabKey = isContinuation ? '##' + subword : subword;
|
|
164
|
-
// Look up in vocabulary
|
|
165
|
-
let tokenId = this.vocab.get(vocabKey);
|
|
166
|
-
// If not found, try to find longest matching prefix
|
|
167
|
-
if (tokenId === undefined) {
|
|
168
|
-
const { id, token } = this.findLongestMatch(subword, isContinuation);
|
|
169
|
-
tokenId = id;
|
|
170
|
-
subword = token;
|
|
171
|
-
}
|
|
172
|
-
const tokenLength = subword.length;
|
|
173
|
-
const tokenEnd = Math.min(currentOffset + tokenLength, endOffset);
|
|
174
|
-
tokens.push({
|
|
175
|
-
id: tokenId,
|
|
176
|
-
token: isContinuation ? '##' + subword : subword,
|
|
177
|
-
start: currentOffset,
|
|
178
|
-
end: tokenEnd,
|
|
179
|
-
isContinuation,
|
|
180
|
-
isSpecial: false,
|
|
181
|
-
});
|
|
182
|
-
currentOffset = tokenEnd;
|
|
183
|
-
}
|
|
184
|
-
return tokens;
|
|
185
|
-
}
|
|
186
|
-
/**
|
|
187
|
-
* Splits a word into pieces, handling punctuation
|
|
149
|
+
* Find the best matching token using greedy longest-match
|
|
188
150
|
*/
|
|
189
|
-
|
|
190
|
-
const
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
151
|
+
findBestToken(text, startPos) {
|
|
152
|
+
const remaining = text.slice(startPos);
|
|
153
|
+
// Check if this starts a new word (preceded by space or start)
|
|
154
|
+
const isWordStart = startPos === 0 || /\s/.test(text[startPos - 1]);
|
|
155
|
+
// For SentencePiece models, word-initial tokens start with ▁
|
|
156
|
+
if (isWordStart) {
|
|
157
|
+
// Try with ▁ prefix first
|
|
158
|
+
const withPrefix = '▁' + remaining;
|
|
159
|
+
for (const [vocabToken, id] of this.sortedVocab) {
|
|
160
|
+
if (withPrefix.startsWith(vocabToken)) {
|
|
161
|
+
// Return the match length without the ▁ since that's not in original text
|
|
162
|
+
return {
|
|
163
|
+
token: vocabToken,
|
|
164
|
+
id,
|
|
165
|
+
length: vocabToken.length - 1 // Subtract 1 for the ▁
|
|
166
|
+
};
|
|
197
167
|
}
|
|
198
|
-
pieces.push(char);
|
|
199
168
|
}
|
|
200
|
-
else {
|
|
201
|
-
current += char;
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
if (current.length > 0) {
|
|
205
|
-
pieces.push(current);
|
|
206
169
|
}
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
/[\u2000-\u206F]/.test(char) || // General punctuation
|
|
220
|
-
/[\u3000-\u303F]/.test(char) // CJK punctuation
|
|
221
|
-
);
|
|
222
|
-
}
|
|
223
|
-
/**
|
|
224
|
-
* Finds the longest matching token in vocabulary
|
|
225
|
-
*/
|
|
226
|
-
findLongestMatch(word, isContinuation) {
|
|
227
|
-
const prefix = isContinuation ? '##' : '';
|
|
228
|
-
// Try progressively shorter substrings
|
|
229
|
-
for (let end = word.length; end > 0; end--) {
|
|
230
|
-
const subword = word.slice(0, end);
|
|
231
|
-
const vocabKey = prefix + subword;
|
|
232
|
-
const id = this.vocab.get(vocabKey);
|
|
233
|
-
if (id !== undefined) {
|
|
234
|
-
return { id, token: subword };
|
|
170
|
+
// Try exact match without prefix
|
|
171
|
+
for (const [vocabToken, id] of this.sortedVocab) {
|
|
172
|
+
// Skip special tokens and tokens starting with ▁ for non-word-start positions
|
|
173
|
+
if (vocabToken.startsWith('<') || vocabToken.startsWith('['))
|
|
174
|
+
continue;
|
|
175
|
+
if (!isWordStart && vocabToken.startsWith('▁'))
|
|
176
|
+
continue;
|
|
177
|
+
if (remaining.startsWith(vocabToken.replace(/^▁/, ''))) {
|
|
178
|
+
const matchLength = vocabToken.replace(/^▁/, '').length;
|
|
179
|
+
if (matchLength > 0) {
|
|
180
|
+
return { token: vocabToken, id, length: matchLength };
|
|
181
|
+
}
|
|
235
182
|
}
|
|
236
183
|
}
|
|
237
|
-
//
|
|
238
|
-
|
|
184
|
+
// Single character fallback
|
|
185
|
+
const char = remaining[0];
|
|
186
|
+
const charId = this.vocab.get(char) ?? this.vocab.get('▁' + char) ?? this.unkId;
|
|
187
|
+
return { token: char, id: charId, length: 1 };
|
|
239
188
|
}
|
|
240
189
|
/**
|
|
241
190
|
* Decodes token IDs back to text
|
|
242
191
|
*/
|
|
243
192
|
decode(tokenIds) {
|
|
244
|
-
const
|
|
193
|
+
const parts = [];
|
|
245
194
|
for (const id of tokenIds) {
|
|
246
195
|
const token = this.inverseVocab.get(id);
|
|
247
196
|
if (token === undefined)
|
|
248
197
|
continue;
|
|
249
|
-
|
|
250
|
-
if (token === this.config.clsToken ||
|
|
251
|
-
token === this.config.sepToken ||
|
|
252
|
-
token === this.config.padToken) {
|
|
198
|
+
if (token === this.clsToken || token === this.sepToken || token === this.padToken)
|
|
253
199
|
continue;
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
tokens.push(token.slice(2));
|
|
200
|
+
// SentencePiece uses ▁ to mark word boundaries
|
|
201
|
+
if (token.startsWith('▁')) {
|
|
202
|
+
parts.push(' ' + token.slice(1));
|
|
258
203
|
}
|
|
259
204
|
else {
|
|
260
|
-
|
|
261
|
-
tokens.push(' ');
|
|
262
|
-
}
|
|
263
|
-
tokens.push(token);
|
|
205
|
+
parts.push(token);
|
|
264
206
|
}
|
|
265
207
|
}
|
|
266
|
-
return
|
|
208
|
+
return parts.join('').trim();
|
|
267
209
|
}
|
|
268
210
|
/**
|
|
269
211
|
* Gets vocabulary size
|
|
@@ -285,15 +227,58 @@ export class WordPieceTokenizer {
|
|
|
285
227
|
}
|
|
286
228
|
}
|
|
287
229
|
/**
|
|
288
|
-
* Loads vocabulary from a
|
|
230
|
+
* Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
|
|
289
231
|
*/
|
|
290
|
-
export async function loadVocabFromFile(
|
|
232
|
+
export async function loadVocabFromFile(filePath) {
|
|
291
233
|
const fs = await import('fs/promises');
|
|
292
|
-
const content = await fs.readFile(
|
|
293
|
-
|
|
234
|
+
const content = await fs.readFile(filePath, 'utf-8');
|
|
235
|
+
// Detect format
|
|
236
|
+
if (filePath.endsWith('.json') || content.trim().startsWith('{')) {
|
|
237
|
+
return parseHFTokenizerJson(content);
|
|
238
|
+
}
|
|
239
|
+
else {
|
|
240
|
+
return parseVocab(content);
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Parses HuggingFace tokenizer.json format
|
|
245
|
+
*/
|
|
246
|
+
export function parseHFTokenizerJson(content) {
|
|
247
|
+
const vocab = new Map();
|
|
248
|
+
try {
|
|
249
|
+
const config = JSON.parse(content);
|
|
250
|
+
// Add special tokens first
|
|
251
|
+
if (config.added_tokens) {
|
|
252
|
+
for (const token of config.added_tokens) {
|
|
253
|
+
vocab.set(token.content, token.id);
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
// Add model vocabulary
|
|
257
|
+
if (config.model?.vocab) {
|
|
258
|
+
if (Array.isArray(config.model.vocab)) {
|
|
259
|
+
// Unigram format: array of [token, score] pairs
|
|
260
|
+
for (let i = 0; i < config.model.vocab.length; i++) {
|
|
261
|
+
const entry = config.model.vocab[i];
|
|
262
|
+
if (entry && typeof entry[0] === 'string') {
|
|
263
|
+
vocab.set(entry[0], i);
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
else {
|
|
268
|
+
// BPE/WordPiece format: object mapping token -> id
|
|
269
|
+
for (const [token, id] of Object.entries(config.model.vocab)) {
|
|
270
|
+
vocab.set(token, id);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
catch (e) {
|
|
276
|
+
throw new Error(`Failed to parse tokenizer.json: ${e}`);
|
|
277
|
+
}
|
|
278
|
+
return vocab;
|
|
294
279
|
}
|
|
295
280
|
/**
|
|
296
|
-
* Parses vocabulary from string content
|
|
281
|
+
* Parses vocabulary from string content (vocab.txt format)
|
|
297
282
|
*/
|
|
298
283
|
export function parseVocab(content) {
|
|
299
284
|
const vocab = new Map();
|
|
@@ -311,26 +296,19 @@ export function parseVocab(content) {
|
|
|
311
296
|
*/
|
|
312
297
|
export function createTestVocab() {
|
|
313
298
|
const tokens = [
|
|
314
|
-
'
|
|
315
|
-
'
|
|
316
|
-
'
|
|
317
|
-
'
|
|
318
|
-
'
|
|
319
|
-
'
|
|
320
|
-
'
|
|
321
|
-
'
|
|
322
|
-
'
|
|
323
|
-
'
|
|
324
|
-
'
|
|
325
|
-
'
|
|
326
|
-
'germany',
|
|
327
|
-
'##s',
|
|
328
|
-
'##ed',
|
|
329
|
-
'##ing',
|
|
330
|
-
',',
|
|
331
|
-
'.',
|
|
299
|
+
'<s>',
|
|
300
|
+
'<pad>',
|
|
301
|
+
'</s>',
|
|
302
|
+
'<unk>',
|
|
303
|
+
'▁Hello',
|
|
304
|
+
'▁John',
|
|
305
|
+
'▁Smith',
|
|
306
|
+
'▁from',
|
|
307
|
+
'▁Acme',
|
|
308
|
+
'▁Corp',
|
|
309
|
+
'▁in',
|
|
310
|
+
'▁Berlin',
|
|
332
311
|
'!',
|
|
333
|
-
'?',
|
|
334
312
|
];
|
|
335
313
|
const vocab = new Map();
|
|
336
314
|
tokens.forEach((token, index) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkEH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,KAAK,EAAE,gCAAgC;CACrD,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IACxB,WAAW,CAA0B;IAE7C,wCAAwC;IAChC,KAAK,GAAW,CAAC,CAAC,CAAE,MAAM;IAC1B,KAAK,GAAW,CAAC,CAAC,CAAE,OAAO;IAC3B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAC5B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAEpC,wBAAwB;IAChB,QAAQ,GAAW,KAAK,CAAC;IACzB,QAAQ,GAAW,MAAM,CAAC;IAC1B,QAAQ,GAAW,OAAO,CAAC;IAC3B,QAAQ,GAAW,OAAO,CAAC;IAEnC,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,iEAAiE;QACjE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEzF,0CAA0C;QAC1C,IAAI,CAAC,mBAAmB,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,oBAAoB;QACpB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;YACtB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC;YACvB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QACD,aAAa;aACR,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;QAC9C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAE1E,sCAAsC;QACtC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,OAAO,GAAG,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;YAClC,kBAAkB;YAClB,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAE,CAAC,EAAE,CAAC;gBACnC,GAAG,EAAE,CAAC;gBACN,SAAS;YACX,CAAC;YAED,4DAA4D;YAC5D,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;YAErE,MAAM,aAAa,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC,CAAC;YAEtE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,KAAK;gBACL,KAAK,EAAE,GAAG;gBACV,GAAG,EAAE,GAAG,GAAG,MAAM;gBACjB,cAAc,EAAE,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;gBACxD,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YAE1C,GAAG,IAAI,MAAM,CAAC;QAChB,CAAC;QAED,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,QAAQ;gBACpB,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,QAAgB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEvC,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAE,CAAC,CAAC;QAErE,6DAA6D;QAC7D,IAAI,WAAW,EAAE,CAAC;YAChB,0BAA0B;YAC1B,MAAM,UAAU,GAAG,GAAG,GAAG,SAAS,CAAC;YACnC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBAChD,IAAI,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;oBACtC,0EAA0E;oBAC1E,OAAO;wBACL,KAAK,EAAE,UAAU;wBACjB,EAAE;wBACF,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,uBAAuB;qBACtD,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAChD,8EAA8E;YAC9E,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YACvE,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEzD,IAAI,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC;gBACxD,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;oBACpB,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;gBACxD,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC;QAChF,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAClC,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAE5F,+CAA+C;YAC/C,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB;IACtD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAErD,gBAAgB;IAChB,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAsB,CAAC;QAExD,2BAA2B;QAC3B,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YACxB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACxC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC;YACxB,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,gDAAgD;gBAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,QAAQ,EAAE,CAAC;wBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;oBACzB,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,mDAAmD;gBACnD,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7D,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAY,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,KAAK;QACL,OAAO;QACP,MAAM;QACN,OAAO;QACP,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,OAAO;QACP,OAAO;QACP,OAAO;QACP,KAAK;QACL,SAAS;QACT,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elanlanguages/bridge-anonymization",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "On-device PII anonymization module for high-privacy translation",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -15,8 +15,9 @@
|
|
|
15
15
|
"test:run": "vitest run",
|
|
16
16
|
"lint": "eslint src --ext .ts",
|
|
17
17
|
"clean": "rm -rf dist",
|
|
18
|
-
"setup:ner": "bash scripts/setup-ner-model.sh",
|
|
19
|
-
"setup:ner:
|
|
18
|
+
"setup:ner": "bash scripts/setup-ner-model.sh --quantize",
|
|
19
|
+
"setup:ner:standard": "bash scripts/setup-ner-model.sh",
|
|
20
|
+
"setup:ner:upload": "bash scripts/setup-ner-model.sh --quantize --upload",
|
|
20
21
|
"prepublishOnly": "npm run clean && npm run build"
|
|
21
22
|
},
|
|
22
23
|
"keywords": [
|