@elanlanguages/bridge-anonymization 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +382 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +100 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +163 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +173 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +294 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +102 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +253 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +240 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +45 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +99 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +140 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +341 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +9 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +9 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +238 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +74 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +169 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +134 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +69 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/package.json +62 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NER Model Wrapper
|
|
3
|
+
* ONNX Runtime integration for Named Entity Recognition
|
|
4
|
+
* Supports both onnxruntime-node and onnxruntime-web
|
|
5
|
+
*/
|
|
6
|
+
import { SpanMatch, AnonymizationPolicy } from '../types/index.js';
|
|
7
|
+
/**
 * Settings needed to construct an {@link NERModel}.
 */
export interface NERModelConfig {
    /** Location of the ONNX model file handed to the runtime when the session is created. */
    modelPath: string;
    /** Location of the tokenizer vocabulary file. */
    vocabPath: string;
    /** Label names indexed by predicted class (index -> label string). */
    labelMap: string[];
    /** Maximum sequence length passed to the tokenizer. */
    maxLength: number;
    /** Whether the model expects lowercased input. */
    doLowerCase: boolean;
    /** Version identifier reported back with every prediction. */
    modelVersion: string;
}
|
|
24
|
+
/**
 * Outcome of running NER over one piece of text.
 */
export interface NERPrediction {
    /** Entity spans that were detected. */
    spans: SpanMatch[];
    /** Elapsed processing time, in milliseconds. */
    processingTimeMs: number;
    /** Version of the model that produced this prediction. */
    modelVersion: string;
}
|
|
35
|
+
/**
 * CoNLL-style label map used when a configuration does not supply its own.
 */
export declare const DEFAULT_LABEL_MAP: string[];
|
|
39
|
+
/**
 * ONNX-backed Named Entity Recognition model.
 *
 * Call `load()` before `predict()`; `predict()` rejects with an Error when
 * the model has not been loaded.
 */
export declare class NERModel {
    private ort;
    private session;
    private tokenizer;
    private config;
    private isLoaded;
    constructor(config: NERModelConfig);
    /**
     * Loads the ONNX runtime, the model session and the tokenizer.
     * Returns immediately when the model is already loaded.
     */
    load(): Promise<void>;
    /**
     * Detects entities in `text`.
     * When a policy is given, only spans whose type is present in both
     * `enabledTypes` and `nerEnabledTypes` are returned.
     */
    predict(text: string, policy?: AnonymizationPolicy): Promise<NERPrediction>;
    /**
     * Feeds the tokenized input to the ONNX session.
     */
    private runInference;
    /**
     * Turns raw logits into one label and one confidence per token.
     */
    private processLogits;
    /**
     * Resolves the minimum confidence threshold for a policy
     * (0.5 when no policy is supplied).
     */
    private getMinConfidence;
    /**
     * Model version taken from the configuration.
     */
    get version(): string;
    /**
     * True once `load()` has completed and until `dispose()` is called.
     */
    get loaded(): boolean;
    /**
     * Drops the session and tokenizer and marks the model as not loaded.
     */
    dispose(): Promise<void>;
}
|
|
82
|
+
/**
 * Builds an {@link NERModel} from a partial configuration.
 *
 * Only `modelPath` and `vocabPath` are mandatory; omitted fields fall back to
 * `DEFAULT_LABEL_MAP`, a maximum length of 512, lowercasing enabled and model
 * version '1.0.0'.
 */
export declare function createNERModel(config: Partial<NERModelConfig> & {
    modelPath: string;
    vocabPath: string;
}): NERModel;
|
|
89
|
+
/**
 * Placeholder NER model for when no real model is available.
 * Always reports itself as loaded and predicts no spans, which makes it
 * suitable for regex-only mode.
 */
export declare class NERModelStub {
    readonly version = "stub-1.0.0";
    readonly loaded = true;
    /** No-op. */
    load(): Promise<void>;
    /** Resolves to an empty prediction; both arguments are ignored. */
    predict(_text: string, _policy?: AnonymizationPolicy): Promise<NERPrediction>;
    /** No-op. */
    dispose(): Promise<void>;
}
|
|
100
|
+
/**
 * Returns a new {@link NERModelStub} (for tests or regex-only mode).
 */
export declare function createNERModelStub(): NERModelStub;
|
|
104
|
+
/**
 * Contract shared by NER model implementations so that callers can receive
 * either a real model or a stub through dependency injection.
 */
export interface INERModel {
    /** Version identifier of the implementation. */
    readonly version: string;
    /** Whether the model is ready for `predict()`. */
    readonly loaded: boolean;
    /** Prepares the model for use. */
    load(): Promise<void>;
    /** Detects entities in `text`, optionally constrained by a policy. */
    predict(text: string, policy?: AnonymizationPolicy): Promise<NERPrediction>;
    /** Releases whatever the model holds. */
    dispose(): Promise<void>;
}
|
|
114
|
+
//# sourceMappingURL=ner-model.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ner-model.d.ts","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,EAAW,SAAS,EAAmB,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAc7F;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,4CAA4C;IAC5C,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,4CAA4C;IAC5C,WAAW,EAAE,OAAO,CAAC;IACrB,iCAAiC;IACjC,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,EAAE,CAAC;IACnB,4BAA4B;IAC5B,gBAAgB,EAAE,MAAM,CAAC;IACzB,yBAAyB;IACzB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,eAAO,MAAM,iBAAiB,UAU7B,CAAC;AAEF;;GAEG;AACH,qBAAa,QAAQ;IACnB,OAAO,CAAC,GAAG,CAA2B;IACtC,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,SAAS,CAAmC;IACpD,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,QAAQ,CAAS;gBAEb,MAAM,EAAE,cAAc;IAIlC;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAmB3B;;OAEG;IACG,OAAO,CACX,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,mBAAmB,GAC3B,OAAO,CAAC,aAAa,CAAC;IA8CzB;;OAEG;YACW,YAAY;IA+D1B;;OAEG;IACH,OAAO,CAAC,aAAa;IAqCrB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAexB;;OAEG;IACH,IAAI,OAAO,IAAI,MAAM,CAEpB;IAED;;OAEG;IACH,IAAI,MAAM,IAAI,OAAO,CAEpB;IAED;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAQ/B;AAYD;;GAEG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,OAAO,CAAC,cAAc,CAAC,GAAG;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GAAG,QAAQ,CAWnH;AAED;;;GAGG;AACH,qBAAa,YAAY;IACvB,QAAQ,CAAC,OAAO,gBAAgB;IAChC,QAAQ,CAAC,MAAM,QAAQ;IAEjB,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAIrB,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,aAAa,CAAC;IAQ7E,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B;AAED;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,YAAY,CAEjD;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC;IACzB,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACtB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;IAC5E,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B"}
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NER Model Wrapper
|
|
3
|
+
* ONNX Runtime integration for Named Entity Recognition
|
|
4
|
+
* Supports both onnxruntime-node and onnxruntime-web
|
|
5
|
+
*/
|
|
6
|
+
import { loadRuntime } from './onnx-runtime.js';
|
|
7
|
+
import { WordPieceTokenizer, loadVocabFromFile, } from './tokenizer.js';
|
|
8
|
+
import { decodeBIOTags, convertToSpanMatches, cleanupSpanBoundaries, mergeAdjacentSpans, } from './bio-decoder.js';
|
|
9
|
+
/**
 * Default CoNLL-style label map: the outside tag 'O' followed by a
 * begin/inside (B-/I-) pair for each of PER, ORG, LOC and MISC.
 */
export const DEFAULT_LABEL_MAP = [
    'O',
    ...['PER', 'ORG', 'LOC', 'MISC'].flatMap((entity) => [`B-${entity}`, `I-${entity}`]),
];
|
|
23
|
+
/**
 * NER Model wrapper for ONNX inference.
 *
 * Lifecycle: construct with a complete configuration, `await load()`, call
 * `predict()` as often as needed, then `await dispose()`.
 */
export class NERModel {
    /** ONNX runtime module; null until load() has succeeded. */
    ort = null;
    /** Inference session; null while not loaded. */
    session = null;
    /** WordPiece tokenizer; null while not loaded. */
    tokenizer = null;
    config;
    isLoaded = false;
    /** In-flight load shared by concurrent load() callers; null when idle. */
    loadPromise = null;
    constructor(config) {
        this.config = config;
    }
    /**
     * Loads the model and tokenizer.
     *
     * Returns immediately when already loaded. Concurrent calls share a single
     * load, so only one inference session is ever created per load. If loading
     * fails the instance stays unloaded and load() may be called again.
     *
     * @returns {Promise<void>}
     */
    async load() {
        if (this.isLoaded) {
            return;
        }
        this.loadPromise ??= this.loadResources().finally(() => {
            this.loadPromise = null;
        });
        await this.loadPromise;
    }
    /**
     * Performs the actual load and commits instance state only after every
     * step has succeeded, so a failure never leaves a half-initialised model.
     */
    async loadResources() {
        // Load ONNX runtime (auto-detects best runtime for environment)
        const ort = await loadRuntime();
        // Load ONNX model
        const session = await ort.InferenceSession.create(this.config.modelPath);
        let tokenizer;
        try {
            // Load tokenizer vocabulary
            const vocab = await loadVocabFromFile(this.config.vocabPath);
            tokenizer = new WordPieceTokenizer(vocab, {
                maxLength: this.config.maxLength,
                doLowerCase: this.config.doLowerCase,
            });
        }
        catch (err) {
            // Do not leak the freshly created session. A failure while releasing
            // is deliberately ignored so the original error reaches the caller.
            await this.releaseSession(session).catch(() => undefined);
            throw err;
        }
        this.ort = ort;
        this.session = session;
        this.tokenizer = tokenizer;
        this.isLoaded = true;
    }
    /**
     * Predicts entities in text.
     *
     * @param {string} text - Text to analyse.
     * @param {object} [policy] - Optional anonymization policy; when given, the
     *   confidence threshold is derived from it and spans are kept only if
     *   their type is in both `enabledTypes` and `nerEnabledTypes`.
     * @returns {Promise<{spans: Array, processingTimeMs: number, modelVersion: string}>}
     * @throws {Error} When the model has not been loaded.
     */
    async predict(text, policy) {
        const startTime = performance.now();
        if (!this.isLoaded || this.session === null || this.tokenizer === null) {
            throw new Error('Model not loaded. Call load() first.');
        }
        // Tokenize input
        const tokenization = this.tokenizer.tokenize(text);
        // Run inference
        const { labels, confidences } = await this.runInference(tokenization);
        // Decode BIO tags to entities
        const rawEntities = decodeBIOTags(tokenization.tokens, labels, confidences, text);
        // Convert to SpanMatch format with confidence filtering
        const minConfidence = this.getMinConfidence(policy);
        let spans = convertToSpanMatches(rawEntities, minConfidence);
        // Post-process spans
        spans = cleanupSpanBoundaries(spans, text);
        spans = mergeAdjacentSpans(spans, text);
        // Filter by enabled types in policy
        if (policy !== undefined) {
            spans = spans.filter((span) => policy.enabledTypes.has(span.type) && policy.nerEnabledTypes.has(span.type));
        }
        const endTime = performance.now();
        return {
            spans,
            processingTimeMs: endTime - startTime,
            modelVersion: this.config.modelVersion,
        };
    }
    /**
     * Runs ONNX inference for one tokenized input.
     *
     * @param {{inputIds: number[], attentionMask: number[], tokenTypeIds: number[]}} tokenization
     * @returns {Promise<{labels: string[], confidences: number[]}>}
     * @throws {Error} When the session is missing or the model yields no output.
     */
    async runInference(tokenization) {
        if (this.session === null || this.ort === null) {
            throw new Error('Session not initialized');
        }
        const session = this.session;
        const { Tensor } = this.ort;
        const seqLength = tokenization.inputIds.length;
        // Every input is an int64 tensor of shape [1, seqLength].
        const toInt64Tensor = (values) => new Tensor('int64', BigInt64Array.from(values, (value) => BigInt(value)), [1, seqLength]);
        const feeds = {
            input_ids: toInt64Tensor(tokenization.inputIds),
            attention_mask: toInt64Tensor(tokenization.attentionMask),
        };
        // Some models also need token_type_ids; only build the tensor for those.
        if (session.inputNames.includes('token_type_ids')) {
            feeds['token_type_ids'] = toInt64Tensor(tokenization.tokenTypeIds);
        }
        const results = await session.run(feeds);
        // The first declared output is treated as the logits.
        const outputName = session.outputNames[0];
        if (outputName === undefined) {
            throw new Error('No output from model');
        }
        const logits = results[outputName];
        if (logits === undefined) {
            throw new Error('Logits output not found');
        }
        // Process logits to get labels and confidences
        return this.processLogits(logits, seqLength);
    }
    /**
     * Processes model logits to extract one label and confidence per token.
     *
     * The row stride is the model's real class count (last dimension of the
     * output tensor) when it is available, so a label map whose length differs
     * from the model's class count no longer shifts every row after the first.
     * Without usable `dims` the label-map length is used. Classes that have no
     * entry in the label map are reported as 'O'.
     *
     * @param {{data: ArrayLike<number>, dims?: readonly number[]}} logits
     * @param {number} seqLength - Number of tokens to decode.
     * @returns {{labels: string[], confidences: number[]}}
     */
    processLogits(logits, seqLength) {
        const data = logits.data;
        const { labelMap } = this.config;
        const lastDim = logits.dims?.[logits.dims.length - 1];
        const numClasses = Number.isInteger(lastDim) && lastDim > 0 ? lastDim : labelMap.length;
        const labels = [];
        const confidences = [];
        for (let i = 0; i < seqLength; i++) {
            // Get logits for this token
            const rowStart = i * numClasses;
            const tokenLogits = [];
            for (let j = 0; j < numClasses; j++) {
                tokenLogits.push(data[rowStart + j] ?? 0);
            }
            // Apply softmax
            const probs = softmax(tokenLogits);
            // Get argmax (first index wins on ties)
            let maxIdx = 0;
            let maxProb = probs[0] ?? 0;
            for (let j = 1; j < probs.length; j++) {
                if ((probs[j] ?? 0) > maxProb) {
                    maxProb = probs[j] ?? 0;
                    maxIdx = j;
                }
            }
            labels.push(labelMap[maxIdx] ?? 'O');
            confidences.push(maxProb);
        }
        return { labels, confidences };
    }
    /**
     * Gets the minimum confidence threshold from a policy.
     *
     * @param {object} [policy]
     * @returns {number} 0.5 without a policy; otherwise the lowest threshold
     *   over `policy.nerEnabledTypes` (types without a configured threshold
     *   count as 0.5), capped at 1.0.
     */
    getMinConfidence(policy) {
        if (policy === undefined) {
            return 0.5;
        }
        // Get minimum from all NER-enabled types
        let minThreshold = 1.0;
        for (const type of policy.nerEnabledTypes) {
            const threshold = policy.confidenceThresholds.get(type) ?? 0.5;
            if (threshold < minThreshold) {
                minThreshold = threshold;
            }
        }
        return minThreshold;
    }
    /**
     * Gets model version
     */
    get version() {
        return this.config.modelVersion;
    }
    /**
     * Checks if model is loaded
     */
    get loaded() {
        return this.isLoaded;
    }
    /**
     * Disposes of model resources.
     *
     * The instance is marked unloaded first, then the session is released
     * through the runtime's `release()` when the session provides it.
     *
     * @returns {Promise<void>}
     */
    async dispose() {
        const session = this.session;
        this.session = null;
        this.tokenizer = null;
        this.isLoaded = false;
        if (session !== null) {
            await this.releaseSession(session);
        }
    }
    /**
     * Releases a session if it exposes `release()`; a no-op otherwise.
     */
    async releaseSession(session) {
        if (typeof session?.release === 'function') {
            await session.release();
        }
    }
}
|
|
190
|
+
/**
 * Numerically stable softmax.
 *
 * The maximum is found with a loop instead of `Math.max(...logits)`, because
 * spreading a very large array into call arguments can throw a RangeError.
 * Subtracting the maximum before exponentiating keeps `Math.exp` from
 * overflowing.
 *
 * @param {number[]} logits - Raw scores; may be empty.
 * @returns {number[]} Probabilities in the same order (empty for empty input).
 */
function softmax(logits) {
    let maxLogit = -Infinity;
    for (const value of logits) {
        maxLogit = Math.max(maxLogit, value);
    }
    const expLogits = logits.map((value) => Math.exp(value - maxLogit));
    let sumExp = 0;
    for (const value of expLogits) {
        sumExp += value;
    }
    return expLogits.map((value) => value / sumExp);
}
|
|
199
|
+
/**
 * Builds an NERModel from a partial configuration.
 *
 * `modelPath` and `vocabPath` are taken as given; every other field that is
 * null or undefined falls back to its default (DEFAULT_LABEL_MAP, 512, true,
 * '1.0.0').
 */
export function createNERModel(config) {
    const { modelPath, vocabPath } = config;
    return new NERModel({
        modelPath,
        vocabPath,
        labelMap: config.labelMap ?? DEFAULT_LABEL_MAP,
        maxLength: config.maxLength ?? 512,
        doLowerCase: config.doLowerCase ?? true,
        modelVersion: config.modelVersion ?? '1.0.0',
    });
}
|
|
213
|
+
/**
 * Stand-in NER model for when no real model is available.
 * It is always "loaded" and never finds any spans, which is what regex-only
 * mode needs.
 */
export class NERModelStub {
    version = 'stub-1.0.0';
    loaded = true;
    /** Nothing to load. */
    async load() { }
    /** Ignores both arguments and resolves to a fresh, empty prediction. */
    async predict(_text, _policy) {
        const emptyPrediction = {
            spans: [],
            processingTimeMs: 0,
            modelVersion: this.version,
        };
        return emptyPrediction;
    }
    /** Nothing to release. */
    async dispose() { }
}
|
|
234
|
+
/**
 * Returns a new stub NER model (for tests or regex-only mode).
 */
export function createNERModelStub() {
    const stub = new NERModelStub();
    return stub;
}
|
|
240
|
+
//# sourceMappingURL=ner-model.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAA
I,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,
CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC
,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,IAAI;QACvC,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAa,EAAE,OAA6B;QACxD,OAAO;YACL,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO;QACX,QAAQ;IACV,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX Runtime Abstraction
|
|
3
|
+
* Allows switching between onnxruntime-node and onnxruntime-web
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Minimal tensor shape this package relies on from an ONNX runtime.
 */
export interface OrtTensor {
    /** Flat element buffer. */
    data: Float32Array | BigInt64Array | Int32Array;
    /** Size of each dimension. */
    dims: readonly number[];
}
|
|
9
|
+
/**
 * Minimal inference-session shape this package relies on.
 */
export interface OrtSession {
    /** Names of the inputs the model accepts. */
    inputNames: readonly string[];
    /** Names of the outputs the model produces. */
    outputNames: readonly string[];
    /** Runs the model on named input tensors and resolves to named output tensors. */
    run(feeds: Record<string, OrtTensor>): Promise<Record<string, OrtTensor>>;
}
|
|
14
|
+
/**
 * Factory side of the runtime's `InferenceSession`.
 */
export interface OrtInferenceSession {
    /** Creates a session for the model found at `path`. */
    create(path: string, options?: unknown): Promise<OrtSession>;
}
|
|
17
|
+
/**
 * Constructor signature of the runtime's `Tensor` class.
 */
export interface OrtTensorConstructor {
    /** Builds a tensor of element type `type` and shape `dims` from `data`. */
    new (type: string, data: Float32Array | BigInt64Array | Int32Array | number[] | bigint[], dims: number[]): OrtTensor;
}
|
|
20
|
+
/**
 * The subset of an ONNX runtime module (onnxruntime-node or onnxruntime-web)
 * that this package uses.
 */
export interface OrtRuntime {
    /** Session factory. */
    InferenceSession: OrtInferenceSession;
    /** Tensor constructor. */
    Tensor: OrtTensorConstructor;
}
|
|
24
|
+
/**
 * Picks the ONNX runtime flavour best suited to the current environment:
 * 'web' for browser-like and Deno environments, 'node' otherwise
 * (Bun uses 'node' when onnxruntime-node can be resolved).
 */
export declare function detectRuntime(): 'node' | 'web';
|
|
28
|
+
/**
 * Loads an ONNX runtime module.
 * `preferredRuntime` overrides auto-detection when provided.
 */
export declare function loadRuntime(preferredRuntime?: 'node' | 'web'): Promise<OrtRuntime>;
|
|
32
|
+
/**
 * Reports which runtime flavour is currently loaded
 * (presumably null while none has been loaded).
 */
export declare function getRuntimeType(): 'node' | 'web' | null;
|
|
36
|
+
/**
 * Resets the runtime state; intended mainly for tests.
 */
export declare function resetRuntime(): void;
|
|
40
|
+
/**
 * Ambient globals probed by runtime detection. Each is typed as possibly
 * undefined because it only exists in its own environment.
 */
declare global {
    /** Defined when running under Bun. */
    var Bun: unknown | undefined;
    /** Defined when running under Deno. */
    var Deno: unknown | undefined;
    /** Defined in browser-like environments. */
    var window: unknown | undefined;
}
|
|
45
|
+
//# sourceMappingURL=onnx-runtime.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onnx-runtime.d.ts","sourceRoot":"","sources":["../../src/ner/onnx-runtime.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,YAAY,GAAG,aAAa,GAAG,UAAU,CAAC;IAChD,IAAI,EAAE,SAAS,MAAM,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;IAC9B,WAAW,EAAE,SAAS,MAAM,EAAE,CAAC;IAC/B,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC;CAC3E;AAED,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;CAC9D;AAED,MAAM,WAAW,oBAAoB;IACnC,KACE,IAAI,EAAE,MAAM,EACZ,IAAI,EAAE,YAAY,GAAG,aAAa,GAAG,UAAU,GAAG,MAAM,EAAE,GAAG,MAAM,EAAE,EACrE,IAAI,EAAE,MAAM,EAAE,GACb,SAAS,CAAC;CACd;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,EAAE,mBAAmB,CAAC;IACtC,MAAM,EAAE,oBAAoB,CAAC;CAC9B;AAQD;;GAEG;AACH,wBAAgB,aAAa,IAAI,MAAM,GAAG,KAAK,CA4B9C;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,gBAAgB,CAAC,EAAE,MAAM,GAAG,KAAK,GAAG,OAAO,CAAC,UAAU,CAAC,CA8CxF;AAED;;GAEG;AACH,wBAAgB,cAAc,IAAI,MAAM,GAAG,KAAK,GAAG,IAAI,CAEtD;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,IAAI,CAGnC;AAGD,OAAO,CAAC,MAAM,CAAC;IAEb,IAAI,GAAG,EAAE,OAAO,GAAG,SAAS,CAAC;IAE7B,IAAI,IAAI,EAAE,OAAO,GAAG,SAAS,CAAC;IAE9B,IAAI,MAAM,EAAE,OAAO,GAAG,SAAS,CAAC;CACjC"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ONNX Runtime Abstraction
|
|
3
|
+
* Allows switching between onnxruntime-node and onnxruntime-web
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Runtime detection and loading
|
|
7
|
+
*/
|
|
8
|
+
let _runtime = null;
|
|
9
|
+
let _runtimeType = null;
|
|
10
|
+
/**
 * Detects the best ONNX runtime for the current environment.
 *
 * - Browser-like (`window` defined) or Deno: 'web'.
 * - Browser worker (`WorkerGlobalScope` defined, no `window`): 'web'. Workers
 *   cannot load the native Node binding and were previously reported as 'node'.
 *   The probe is skipped under Bun so Bun keeps its node-first behaviour.
 * - Bun: 'node' when onnxruntime-node resolves, otherwise 'web'.
 * - Anything else: 'node'.
 *
 * @returns {'node' | 'web'}
 */
export function detectRuntime() {
    const hasGlobal = (name) => typeof globalThis[name] !== 'undefined';
    const isBun = hasGlobal('Bun');
    if (hasGlobal('window') || hasGlobal('Deno')) {
        return 'web';
    }
    if (!isBun && hasGlobal('WorkerGlobalScope')) {
        return 'web';
    }
    // For Bun, try node first, fall back to web
    if (isBun) {
        try {
            // Quick check if onnxruntime-node is loadable. Any failure here,
            // including `require` being unavailable, means "use web".
            require.resolve('onnxruntime-node');
            return 'node';
        }
        catch {
            return 'web';
        }
    }
    // Default to node for Node.js
    return 'node';
}
|
|
38
|
+
/**
 * Imports the ONNX runtime package for the given runtime type.
 * 'node' maps to `onnxruntime-node`; any other value maps to `onnxruntime-web`.
 */
async function importRuntimeModule(runtimeType) {
    if (runtimeType === 'node') {
        return await import('onnxruntime-node');
    }
    // onnxruntime-web may not be installed; callers handle the rejection.
    return await import('onnxruntime-web');
}
/**
 * Loads the appropriate ONNX runtime
 *
 * The loaded module is cached in `_runtime`; while the cache is populated
 * every call returns it, regardless of `preferredRuntime`.
 *
 * @param preferredRuntime - Optional 'node' or 'web'. When omitted (null or
 *   undefined) the result of detectRuntime() is used.
 * @returns The loaded ONNX runtime module.
 * @throws {Error} If neither the selected runtime nor the other one can be
 *   imported. The first failure is attached as `cause`.
 */
export async function loadRuntime(preferredRuntime) {
    if (_runtime !== null) {
        return _runtime;
    }
    const runtimeType = preferredRuntime ?? detectRuntime();
    let loadedModule;
    let loadedType = runtimeType === 'node' ? 'node' : 'web';
    try {
        loadedModule = await importRuntimeModule(runtimeType);
    }
    catch (e) {
        // If preferred runtime fails, try the other
        loadedType = runtimeType === 'node' ? 'web' : 'node';
        try {
            loadedModule = await importRuntimeModule(loadedType);
        }
        catch {
            // Message kept as-is; `cause` preserves the original error object
            // (and its stack) instead of only its string form.
            throw new Error(`Failed to load ONNX runtime. Install either 'onnxruntime-node' or 'onnxruntime-web'.\n` +
                `Original error: ${e}`, { cause: e });
        }
    }
    _runtime = loadedModule;
    _runtimeType = loadedType;
    return _runtime;
}
|
|
86
|
+
/**
 * Reports which runtime flavour is currently cached.
 *
 * @returns The current value of the module-level `_runtimeType`
 *   ('node' or 'web' once a runtime is loaded, otherwise null).
 */
export function getRuntimeType() {
    const activeType = _runtimeType;
    return activeType;
}
|
|
92
|
+
/**
 * Clears the cached runtime and its type (useful for testing), so the next
 * loadRuntime() call performs a fresh import.
 */
export function resetRuntime() {
    _runtimeType = null;
    _runtime = null;
}
|
|
99
|
+
//# sourceMappingURL=onnx-runtime.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onnx-runtime.js","sourceRoot":"","sources":["../../src/ner/onnx-runtime.ts"],"names":[],"mappings":"AAAA;;;GAGG;AA+BH;;GAEG;AACH,IAAI,QAAQ,GAAsB,IAAI,CAAC;AACvC,IAAI,YAAY,GAA0B,IAAI,CAAC;AAE/C;;GAEG;AACH,MAAM,UAAU,aAAa;IAC3B,wBAAwB;IACxB,MAAM,KAAK,GAAG,OAAO,UAAU,CAAC,GAAG,KAAK,WAAW,CAAC;IAEpD,+CAA+C;IAC/C,uEAAuE;IACvE,MAAM,SAAS,GAAG,OAAO,UAAU,CAAC,MAAM,KAAK,WAAW,CAAC;IAE3D,yBAAyB;IACzB,MAAM,MAAM,GAAG,OAAO,UAAU,CAAC,IAAI,KAAK,WAAW,CAAC;IAEtD,IAAI,SAAS,IAAI,MAAM,EAAE,CAAC;QACxB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,4CAA4C;IAC5C,IAAI,KAAK,EAAE,CAAC;QACV,IAAI,CAAC;YACH,8CAA8C;YAC9C,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;YACpC,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,gBAAiC;IACjE,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;QACtB,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,MAAM,WAAW,GAAG,gBAAgB,IAAI,aAAa,EAAE,CAAC;IAExD,IAAI,CAAC;QACH,IAAI,WAAW,KAAK,MAAM,EAAE,CAAC;YAC3B,sCAAsC;YACtC,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAC;YAC7C,QAAQ,GAAG,GAA4B,CAAC;YACxC,YAAY,GAAG,MAAM,CAAC;QACxB,CAAC;aAAM,CAAC;YACN,qCAAqC;YACrC,6DAA6D;YAC7D,oDAAoD;YACpD,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YAC5C,QAAQ,GAAG,GAA4B,CAAC;YACxC,YAAY,GAAG,KAAK,CAAC;QACvB,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,4CAA4C;QAC5C,MAAM,YAAY,GAAG,WAAW,KAAK,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;QAE7D,IAAI,CAAC;YACH,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAC;gBAC7C,QAAQ,GAAG,GAA4B,CAAC;gBACxC,YAAY,GAAG,MAAM,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,6DAA6D;gBAC7D,oDAAoD;gBACpD,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;gBAC5C,QAAQ,GAAG,GAA4B,CAAC;gBACxC,YAAY,GAAG,KAAK,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,wFAAwF;gBACxF,mBAAmB,CAAC,EAAE,CACvB,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc;IAC5B,OAAO,YAAY,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY;IAC1B,QAAQ,GAAG,IAAI,CAAC;
IAChB,YAAY,GAAG,IAAI,CAAC;AACtB,CAAC"}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WordPiece Tokenizer
|
|
3
|
+
* Tokenizes text into subword tokens while maintaining character offset mapping
|
|
4
|
+
* Compatible with BERT-style models
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Token with offset information: a vocabulary ID and token string together
* with the character offsets that locate the token in the original text.
|
|
8
|
+
*/
|
|
9
|
+
export interface Token {
|
|
10
|
+
/** Numeric ID of the token in the vocabulary */
|
|
11
|
+
id: number;
|
|
12
|
+
/** Token string */
|
|
13
|
+
token: string;
|
|
14
|
+
/** Start character offset in original text */
|
|
15
|
+
start: number;
|
|
16
|
+
/** End character offset in original text (NOTE(review): inclusive vs. exclusive is not visible in this declaration - confirm against the implementation) */
|
|
17
|
+
end: number;
|
|
18
|
+
/** Whether this is a continuation token (starts with ##) */
|
|
19
|
+
isContinuation: boolean;
|
|
20
|
+
/** Whether this is a special token ([CLS], [SEP], etc.) */
|
|
21
|
+
isSpecial: boolean;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Tokenization result with metadata: the tokens plus the numeric arrays
* (input IDs, attention mask, token type IDs) and a token-to-character mapping.
|
|
25
|
+
*/
|
|
26
|
+
export interface TokenizationResult {
|
|
27
|
+
/** Array of tokens */
|
|
28
|
+
tokens: Token[];
|
|
29
|
+
/** Input IDs for model */
|
|
30
|
+
inputIds: number[];
|
|
31
|
+
/** Attention mask */
|
|
32
|
+
attentionMask: number[];
|
|
33
|
+
/** Token type IDs (for BERT-style models) */
|
|
34
|
+
tokenTypeIds: number[];
|
|
35
|
+
/** Mapping from token index to character span [start, end]; an entry may be null (presumably for tokens with no source text, e.g. special tokens - confirm) */
|
|
36
|
+
tokenToCharSpan: Array<[number, number] | null>;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Tokenizer configuration. `vocabPath` and `vocab` are optional; every other
* field is required on this type.
|
|
40
|
+
*/
|
|
41
|
+
export interface TokenizerConfig {
|
|
42
|
+
/** Path to vocabulary file (optional) */
|
|
43
|
+
vocabPath?: string;
|
|
44
|
+
/** Vocabulary as a Map from token string to numeric ID (optional) */
|
|
45
|
+
vocab?: Map<string, number>;
|
|
46
|
+
/** Maximum sequence length */
|
|
47
|
+
maxLength: number;
|
|
48
|
+
/** Unknown token */
|
|
49
|
+
unkToken: string;
|
|
50
|
+
/** Classification token */
|
|
51
|
+
clsToken: string;
|
|
52
|
+
/** Separator token */
|
|
53
|
+
sepToken: string;
|
|
54
|
+
/** Padding token */
|
|
55
|
+
padToken: string;
|
|
56
|
+
/** Mask token */
|
|
57
|
+
maskToken: string;
|
|
58
|
+
/** Whether to lowercase input */
|
|
59
|
+
doLowerCase: boolean;
|
|
60
|
+
/** Whether to strip accents from input */
|
|
61
|
+
stripAccents: boolean;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Default tokenizer configuration for BERT-style models
* (the concrete values live in the implementation file, not in this declaration).
|
|
65
|
+
*/
|
|
66
|
+
export declare const DEFAULT_TOKENIZER_CONFIG: TokenizerConfig;
|
|
67
|
+
/**
|
|
68
|
+
* WordPiece Tokenizer implementation
* Public surface: tokenize(), decode(), vocabSize, getTokenId(), getToken().
* All other members are private implementation details.
|
|
69
|
+
*/
|
|
70
|
+
export declare class WordPieceTokenizer {
|
|
71
|
+
private vocab;
|
|
72
|
+
private inverseVocab;
|
|
73
|
+
private config;
|
|
74
|
+
private unkId;
|
|
75
|
+
private clsId;
|
|
76
|
+
private sepId;
|
|
77
|
+
private padId;
|
|
78
|
+
/**
 * @param vocab - Vocabulary mapping token string to numeric ID
 * @param config - Optional partial configuration (presumably merged over DEFAULT_TOKENIZER_CONFIG - confirm in the implementation)
 */
constructor(vocab: Map<string, number>, config?: Partial<TokenizerConfig>);
|
|
79
|
+
/**
|
|
80
|
+
* Tokenizes text into tokens with offset tracking
* @param text - Text to tokenize
* @returns Tokens together with model input arrays and the token-to-character mapping
|
|
81
|
+
*/
|
|
82
|
+
tokenize(text: string): TokenizationResult;
|
|
83
|
+
/**
|
|
84
|
+
* Preprocesses text (lowercase, accent stripping)
|
|
85
|
+
*/
|
|
86
|
+
private preprocess;
|
|
87
|
+
/**
|
|
88
|
+
* Strips accents from text
|
|
89
|
+
*/
|
|
90
|
+
private stripAccents;
|
|
91
|
+
/**
|
|
92
|
+
* Splits text into words while tracking character offsets
|
|
93
|
+
*/
|
|
94
|
+
private splitIntoWords;
|
|
95
|
+
/**
|
|
96
|
+
* Tokenizes a single word using WordPiece algorithm
|
|
97
|
+
*/
|
|
98
|
+
private tokenizeWord;
|
|
99
|
+
/**
|
|
100
|
+
* Splits a word into pieces, handling punctuation
|
|
101
|
+
*/
|
|
102
|
+
private splitWordIntoPieces;
|
|
103
|
+
/**
|
|
104
|
+
* Checks if a character is punctuation
|
|
105
|
+
*/
|
|
106
|
+
private isPunctuation;
|
|
107
|
+
/**
|
|
108
|
+
* Finds the longest matching token in vocabulary
|
|
109
|
+
*/
|
|
110
|
+
private findLongestMatch;
|
|
111
|
+
/**
|
|
112
|
+
* Decodes token IDs back to text
* @param tokenIds - Token IDs to decode
* @returns The decoded text
|
|
113
|
+
*/
|
|
114
|
+
decode(tokenIds: number[]): string;
|
|
115
|
+
/**
|
|
116
|
+
* Gets vocabulary size
|
|
117
|
+
*/
|
|
118
|
+
get vocabSize(): number;
|
|
119
|
+
/**
|
|
120
|
+
* Gets a token ID by string
* @returns The ID, or undefined (presumably when the token is not in the vocabulary - confirm)
|
|
121
|
+
*/
|
|
122
|
+
getTokenId(token: string): number | undefined;
|
|
123
|
+
/**
|
|
124
|
+
* Gets a token string by ID
* @returns The token string, or undefined (presumably when the ID is unknown - confirm)
|
|
125
|
+
*/
|
|
126
|
+
getToken(id: number): string | undefined;
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Loads vocabulary from a text file (one token per line)
* @param path - Path of the vocabulary file
* @returns Promise resolving to a Map from token string to numeric ID
|
|
130
|
+
*/
|
|
131
|
+
export declare function loadVocabFromFile(path: string): Promise<Map<string, number>>;
|
|
132
|
+
/**
|
|
133
|
+
* Parses vocabulary from string content
* @param content - Vocabulary text (presumably one token per line, as for loadVocabFromFile - confirm)
* @returns Map from token string to numeric ID
|
|
134
|
+
*/
|
|
135
|
+
export declare function parseVocab(content: string): Map<string, number>;
|
|
136
|
+
/**
|
|
137
|
+
* Creates a minimal vocabulary for testing
* @returns Map from token string to numeric ID
|
|
138
|
+
*/
|
|
139
|
+
export declare function createTestVocab(): Map<string, number>;
|
|
140
|
+
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,4DAA4D;IAC5D,cAAc,EAAE,OAAO,CAAC;IACxB,2DAA2D;IAC3D,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0BAA0B;IAC1B,KAAK,CAAC,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC5B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,sBAAsB;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;IACrB,oBAAoB;IACpB,YAAY,EAAE,OAAO,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAStC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAGhC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,KAAK,CAAS;gBAEV,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAwE1C;;OAEG;IACH,OAAO,CAAC,UAAU;IAclB;;OAEG;IACH,OAAO,CAAC,YAAY;IAIpB;;OAEG;IACH,OAAO,CAAC,cAAc;IA0BtB;;OAEG;IACH,OAAO,CAAC,YAAY;IA2CpB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAuB3B;;OAEG;IACH,OAAO,CAAC,aAAa;IAarB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAqBxB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IA8BlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,M
AAM,GAAG,SAAS;CAGzC;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAIlF;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CA8BrD"}
|