rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BIO Tag Decoder
|
|
3
|
+
* Converts BIO-tagged token sequences to entity spans
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, DetectionSource } from '../types/index.js';
|
|
6
|
+
import { getPIITypeFromNERLabel } from '../types/pii-types.js';
|
|
7
|
+
/**
 * BIO tag types.
 *
 * Each member's value is the same single-letter string as its key.
 */
// `var` is retained (instead of `const`) so the exported binding kind is unchanged.
export var BIOTag = {
    /** Beginning of an entity */
    B: 'B',
    /** Inside an entity (continuation) */
    I: 'I',
    /** Outside any entity */
    O: 'O',
};
|
|
19
|
+
/**
 * Parses a BIO label string (e.g., "B-PER", "I-ORG", "O").
 *
 * The label is split at the FIRST hyphen only, so an entity type that itself
 * contains hyphens (e.g. "B-DATE-TIME") keeps its full type ("DATE-TIME")
 * instead of being discarded as an outside tag.
 *
 * @param {string} label - Raw label emitted for a token.
 * @returns {{ tag: string, entityType: (string|null) }} The BIO tag and the
 *   entity type. `"O"`, `"[PAD]"`, `"[CLS]"`, `"[SEP]"`, labels without a
 *   hyphen, labels whose prefix is not `B`/`I` (case-insensitive), and
 *   non-string inputs all yield `{ tag: BIOTag.O, entityType: null }`.
 */
export function parseBIOLabel(label) {
    // A fresh object is built on every call so callers can never share/mutate one result.
    const outside = () => ({ tag: BIOTag.O, entityType: null });
    if (typeof label !== 'string') {
        return outside();
    }
    if (label === 'O' || label === '[PAD]' || label === '[CLS]' || label === '[SEP]') {
        return outside();
    }
    const separatorIndex = label.indexOf('-');
    if (separatorIndex === -1) {
        return outside();
    }
    const prefix = label.slice(0, separatorIndex).toUpperCase();
    // Everything after the first hyphen is the entity type, hyphens included.
    const entityType = label.slice(separatorIndex + 1);
    switch (prefix) {
        case 'B':
            return { tag: BIOTag.B, entityType };
        case 'I':
            return { tag: BIOTag.I, entityType };
        default:
            return outside();
    }
}
|
|
44
|
+
/**
 * Decodes BIO-tagged tokens into entity spans.
 *
 * Tokens are visited in order while at most one entity is kept "open":
 * - a special token or an `O` tag closes the open entity;
 * - a `B` tag closes the open entity and opens a new one;
 * - an `I` tag whose type equals the open entity's type extends it (end offset,
 *   text, token indices, running mean of confidence); any other `I` tag is
 *   treated like a `B` tag.
 *
 * @param {Array<{start: number, end: number, isSpecial?: boolean}>} tokens - Tokens with character offsets.
 * @param {string[]} labels - BIO label per token; a missing entry counts as "O".
 * @param {number[]} confidences - Confidence per token; a missing entry counts as 0.
 * @param {string} originalText - Text the token offsets index into.
 * @returns {Array<{type: string, start: number, end: number, confidence: number, text: string, tokenIndices: number[]}>}
 *   Entities in order of appearance.
 */
export function decodeBIOTags(tokens, labels, confidences, originalText) {
    const found = [];
    let open = null;

    // Moves the entity under construction (if any) into the result list.
    const closeOpenEntity = () => {
        if (open !== null) {
            found.push(open);
            open = null;
        }
    };

    // Closes whatever is open and starts a single-token entity at `index`.
    const openEntity = (index, tok, kind, score) => {
        closeOpenEntity();
        open = {
            type: kind ?? 'UNKNOWN',
            start: tok.start,
            end: tok.end,
            confidence: score,
            text: originalText.slice(tok.start, tok.end),
            tokenIndices: [index],
        };
    };

    for (let index = 0; index < tokens.length; index += 1) {
        const tok = tokens[index];
        const rawLabel = labels[index] ?? 'O';
        const score = confidences[index] ?? 0;

        // Special tokens never belong to an entity; they only terminate one.
        if (tok.isSpecial) {
            closeOpenEntity();
            continue;
        }

        const { tag, entityType } = parseBIOLabel(rawLabel);

        if (tag === BIOTag.B) {
            openEntity(index, tok, entityType, score);
        }
        else if (tag === BIOTag.I) {
            const continuesOpenEntity = open !== null && entityType === open.type;
            if (!continuesOpenEntity) {
                // I tag without matching B tag - treat as new entity (common in some models)
                openEntity(index, tok, entityType, score);
            }
            else {
                open.end = tok.end;
                open.text = originalText.slice(open.start, open.end);
                open.tokenIndices.push(index);
                // Running mean: previous mean weighted by the previous token count.
                const tokenCount = open.tokenIndices.length;
                open.confidence = (open.confidence * (tokenCount - 1) + score) / tokenCount;
            }
        }
        else if (tag === BIOTag.O) {
            closeOpenEntity();
        }
    }

    // An entity still open after the last token must be emitted as well.
    closeOpenEntity();
    return found;
}
|
|
122
|
+
/**
 * Converts raw NER entities to SpanMatch format.
 *
 * An entity is dropped when its confidence is below the threshold or when
 * `getPIITypeFromNERLabel` returns `null` for its type.
 *
 * @param {Iterable<{type: string, start: number, end: number, confidence: number, text: string}>} rawEntities
 *   Entities to convert.
 * @param {number} [confidenceThreshold=0.5] - Minimum confidence (inclusive) to keep an entity.
 * @returns {Array<{type: *, start: number, end: number, confidence: number, source: *, text: string}>}
 *   Span matches tagged with `DetectionSource.NER`, in input order.
 */
export function convertToSpanMatches(rawEntities, confidenceThreshold = 0.5) {
    const matches = [];
    for (const candidate of rawEntities) {
        const isBelowThreshold = candidate.confidence < confidenceThreshold;
        if (isBelowThreshold) {
            continue;
        }
        // The label lookup only runs for entities that passed the confidence filter.
        const mappedType = getPIITypeFromNERLabel(candidate.type);
        if (mappedType !== null) {
            const { start, end, confidence, text } = candidate;
            matches.push({
                type: mappedType,
                start,
                end,
                confidence,
                source: DetectionSource.NER,
                text,
            });
        }
    }
    return matches;
}
|
|
148
|
+
/**
 * Post-processes NER spans to clean up boundaries.
 *
 * Every span has leading/trailing whitespace removed. PERSON and ORG spans
 * additionally have leading/trailing punctuation (`. , ; : ! ? ' " ( )`)
 * removed. For those types whitespace and punctuation are trimmed together in
 * one pass, so whitespace sitting behind a trimmed punctuation mark
 * (e.g. `"( John Smith ) ."`) is removed too instead of being left on the span.
 *
 * @param {Array<{type: *, start: number, end: number, text: string}>} spans - Spans to clean; not mutated.
 * @param {string} originalText - Text the span offsets index into.
 * @returns {Array<object>} One span per input span. A span that would become
 *   empty after trimming is returned unchanged (same object); otherwise a copy
 *   with updated `start`, `end` and `text` is returned.
 */
export function cleanupSpanBoundaries(spans, originalText) {
    // Built once per call rather than once per character test. Neither regex
    // uses the `g`/`y` flag, so `.test` keeps no state between calls.
    const whitespaceOnly = /\s/;
    const whitespaceOrPunctuation = /[\s.,;:!?'"()]/;
    return spans.map((span) => {
        const trimsPunctuation = span.type === PIIType.PERSON || span.type === PIIType.ORG;
        const trimmable = trimsPunctuation ? whitespaceOrPunctuation : whitespaceOnly;
        let { start, end } = span;
        // `?? ''` makes an out-of-range offset test as "not trimmable" and stop the loop.
        while (start < end && trimmable.test(originalText[start] ?? '')) {
            start += 1;
        }
        while (end > start && trimmable.test(originalText[end - 1] ?? '')) {
            end -= 1;
        }
        // If span became empty, return original
        if (start >= end) {
            return span;
        }
        return {
            ...span,
            start,
            end,
            text: originalText.slice(start, end),
        };
    });
}
|
|
183
|
+
/**
 * Merges adjacent spans of the same type.
 *
 * Spans are processed in order of `start`. A span is merged into the running
 * span when both have the same type, the gap between them is at most `maxGap`
 * characters, and the gap contains only whitespace. Overlapping or nested
 * spans of the same type have a non-positive gap and are merged as well; the
 * merged span always extends to the furthest `end` of the two, so a nested
 * span can never shrink the result.
 *
 * The merged confidence is the mean of the running span's confidence and the
 * next span's confidence (applied pairwise at each merge).
 *
 * @param {Array<{type: *, start: number, end: number, confidence: number, text: string}>} spans
 *   Spans to merge; the array and its elements are not mutated.
 * @param {string} originalText - Text the span offsets index into.
 * @param {number} [maxGap=1] - Largest gap, in characters, that may be bridged.
 * @returns {Array<object>} Merged spans sorted by `start`. When `spans` has
 *   zero or one element the input array itself is returned.
 */
export function mergeAdjacentSpans(spans, originalText, maxGap = 1) {
    if (spans.length <= 1) {
        return spans;
    }
    // Sort a copy so the caller's array order is left untouched.
    const sorted = [...spans].sort((a, b) => a.start - b.start);
    const merged = [];
    let current = sorted[0];
    for (let i = 1; i < sorted.length; i += 1) {
        const next = sorted[i];
        const gap = next.start - current.end;
        // For overlapping spans the slice is empty, which counts as whitespace-only.
        const gapText = originalText.slice(current.end, next.start);
        const isOnlyWhitespace = /^\s*$/.test(gapText);
        if (next.type === current.type && gap <= maxGap && isOnlyWhitespace) {
            // `next` may end before `current` does (nested span): keep the larger end.
            const mergedEnd = Math.max(current.end, next.end);
            current = {
                ...current,
                end: mergedEnd,
                text: originalText.slice(current.start, mergedEnd),
                confidence: (current.confidence + next.confidence) / 2,
            };
        }
        else {
            merged.push(current);
            current = next;
        }
    }
    merged.push(current);
    return merged;
}
|
|
216
|
+
//# sourceMappingURL=bio-decoder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bio-decoder.js","sourceRoot":"","sources":["../../src/ner/bio-decoder.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAa,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACxE,OAAO,EAAE,sBAAsB,EAAE,MAAM,uBAAuB,CAAC;AAG/D;;GAEG;AACH,MAAM,CAAN,IAAY,MAOX;AAPD,WAAY,MAAM;IAChB,6BAA6B;IAC7B,iBAAO,CAAA;IACP,sCAAsC;IACtC,iBAAO,CAAA;IACP,yBAAyB;IACzB,iBAAO,CAAA;AACT,CAAC,EAPW,MAAM,KAAN,MAAM,QAOjB;AA8BD;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,KAAa;IACzC,IAAI,KAAK,KAAK,GAAG,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;QACjF,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC/B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,GAAG,KAAK,CAAC;IAEnC,IAAI,GAAW,CAAC;IAChB,QAAQ,MAAM,EAAE,WAAW,EAAE,EAAE,CAAC;QAC9B,KAAK,GAAG;YACN,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YACf,MAAM;QACR,KAAK,GAAG;YACN,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YACf,MAAM;QACR;YACE,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC/C,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,UAAU,IAAI,IAAI,EAAE,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAC3B,MAAe,EACf,MAAgB,EAChB,WAAqB,EACrB,YAAoB;IAEpB,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,IAAI,aAAa,GAAwB,IAAI,CAAC;IAE9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QACzB,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;QAC/B,MAAM,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEvC,sBAAsB;QACtB,IAAI,KAAK,CAAC,SAAS,EAAE,CAAC;YACpB,wCAAwC;YACxC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;gBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;gBAC7B,aAAa,GAAG,IAAI,CAAC;YACvB,CAAC;YACD,SAAS;QACX,CAAC;QAED,MAAM,EAAE,GAAG,EAAE,UAAU,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;QAEjD,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,MAAM,CAAC,CAAC;gBACX,sBAAsB;gBACtB,kCAAkC;gBAClC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;oBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAA
C;gBAC/B,CAAC;gBAED,aAAa,GAAG;oBACd,IAAI,EAAE,UAAU,IAAI,SAAS;oBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;oBACd,UAAU;oBACV,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC;oBAChD,YAAY,EAAE,CAAC,CAAC,CAAC;iBAClB,CAAC;gBACF,MAAM;YAER,KAAK,MAAM,CAAC,CAAC;gBACX,yBAAyB;gBACzB,IAAI,aAAa,KAAK,IAAI,IAAI,UAAU,KAAK,aAAa,CAAC,IAAI,EAAE,CAAC;oBAChE,wBAAwB;oBACxB,aAAa,CAAC,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC;oBAC9B,aAAa,CAAC,IAAI,GAAG,YAAY,CAAC,KAAK,CAAC,aAAa,CAAC,KAAK,EAAE,aAAa,CAAC,GAAG,CAAC,CAAC;oBAChF,aAAa,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBACnC,qBAAqB;oBACrB,aAAa,CAAC,UAAU;wBACtB,CAAC,aAAa,CAAC,UAAU,GAAG,CAAC,aAAa,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC;4BACjF,aAAa,CAAC,YAAY,CAAC,MAAM,CAAC;gBACtC,CAAC;qBAAM,CAAC;oBACN,6EAA6E;oBAC7E,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;wBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC/B,CAAC;oBACD,aAAa,GAAG;wBACd,IAAI,EAAE,UAAU,IAAI,SAAS;wBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;wBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;wBACd,UAAU;wBACV,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC;wBAChD,YAAY,EAAE,CAAC,CAAC,CAAC;qBAClB,CAAC;gBACJ,CAAC;gBACD,MAAM;YAER,KAAK,MAAM,CAAC,CAAC;gBACX,2CAA2C;gBAC3C,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;oBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC7B,aAAa,GAAG,IAAI,CAAC;gBACvB,CAAC;gBACD,MAAM;QACV,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;QAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC/B,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAClC,WAA2B,EAC3B,sBAA8B,GAAG;IAEjC,MAAM,KAAK,GAAgB,EAAE,CAAC;IAE9B,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;QACjC,uBAAuB;QACvB,IAAI,MAAM,CAAC,UAAU,GAAG,mBAAmB,EAAE,CAAC;YAC5C,SAAS;QACX,CAAC;QAED,6BAA6B;QAC7B,MAAM,OAAO,GAAG,sBAAsB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACpD,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,SAAS,CAAC,qBAAqB;QACjC,CAAC;QAED,KAAK,CAAC,IAAI,CAAC;YACT,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,GAAG,EAAE,MAAM,CAAC,GAAG;YACf,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,MAAM,EAAE,eAAe,CAAC,GAAG;YAC3B,IAAI,EAAE,MAAM,CAA
C,IAAI;SAClB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,KAAkB,EAClB,YAAoB;IAEpB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACxB,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QAE1B,0BAA0B;QAC1B,OAAO,KAAK,GAAG,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;YAC3D,KAAK,EAAE,CAAC;QACV,CAAC;QAED,2BAA2B;QAC3B,OAAO,GAAG,GAAG,KAAK,IAAI,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;YAC7D,GAAG,EAAE,CAAC;QACR,CAAC;QAED,yDAAyD;QACzD,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,GAAG,EAAE,CAAC;YAC9D,OAAO,KAAK,GAAG,GAAG,IAAI,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;gBACrE,KAAK,EAAE,CAAC;YACV,CAAC;YACD,OAAO,GAAG,GAAG,KAAK,IAAI,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;gBACvE,GAAG,EAAE,CAAC;YACR,CAAC;QACH,CAAC;QAED,wCAAwC;QACxC,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;YACjB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,GAAG,IAAI;YACP,KAAK;YACL,GAAG;YACH,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC;SACrC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAChC,KAAkB,EAClB,YAAoB,EACpB,SAAiB,CAAC;IAElB,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAEpC,yBAAyB;IACzB,MAAM,MAAM,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,IAAI,OAAO,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QAExB,sCAAsC;QACtC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;QAC5D,MAAM,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE/C,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,IAAI,IAAI,GAAG,IAAI,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACpE,cAAc;YACd,OAAO,GAAG;gBACR,GAAG,OAAO;gBACV,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,IAAI,EAAE,YA
AY,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC;gBACjD,UAAU,EAAE,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;aACvD,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACrB,OAAO,GAAG,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACrB,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NER Module
|
|
3
|
+
* Exports NER model and tokenizer components
|
|
4
|
+
*/
|
|
5
|
+
export * from './tokenizer.js';
|
|
6
|
+
export * from './bio-decoder.js';
|
|
7
|
+
export * from './ner-model.js';
|
|
8
|
+
export { loadRuntime, detectRuntime, getRuntimeType } from './onnx-runtime.js';
|
|
9
|
+
export { type NERModelMode, type ModelInfo, type ModelFileInfo, type DownloadProgressCallback, MODEL_REGISTRY, getModelCacheDir, getModelPath, isModelDownloaded, downloadModel, ensureModel, clearModelCache, listDownloadedModels, getModelInfo, } from './model-manager.js';
|
|
10
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ner/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,gBAAgB,CAAC;AAC/B,cAAc,kBAAkB,CAAC;AACjC,cAAc,gBAAgB,CAAC;AAC/B,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAC/E,OAAO,EACL,KAAK,YAAY,EACjB,KAAK,SAAS,EACd,KAAK,aAAa,EAClB,KAAK,wBAAwB,EAC7B,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,EACjB,aAAa,EACb,WAAW,EACX,eAAe,EACf,oBAAoB,EACpB,YAAY,GACb,MAAM,oBAAoB,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NER Module
|
|
3
|
+
* Exports NER model and tokenizer components
|
|
4
|
+
*/
|
|
5
|
+
export * from './tokenizer.js';
|
|
6
|
+
export * from './bio-decoder.js';
|
|
7
|
+
export * from './ner-model.js';
|
|
8
|
+
export { loadRuntime, detectRuntime, getRuntimeType } from './onnx-runtime.js';
|
|
9
|
+
export { MODEL_REGISTRY, getModelCacheDir, getModelPath, isModelDownloaded, downloadModel, ensureModel, clearModelCache, listDownloadedModels, getModelInfo, } from './model-manager.js';
|
|
10
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/ner/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,gBAAgB,CAAC;AAC/B,cAAc,kBAAkB,CAAC;AACjC,cAAc,gBAAgB,CAAC;AAC/B,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAC/E,OAAO,EAKL,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,EACjB,aAAa,EACb,WAAW,EACX,eAAe,EACf,oBAAoB,EACpB,YAAY,GACb,MAAM,oBAAoB,CAAC"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NER Model Manager
|
|
3
|
+
* Handles automatic downloading and caching of NER models from Hugging Face Hub
|
|
4
|
+
* Browser-compatible using storage abstraction layer
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Available NER model variants.
 *
 * Four string literals are allowed. Note that the model functions declared in
 * this file (and the keys of MODEL_REGISTRY) accept only "standard" and
 * "quantized"; "disabled" and "custom" are not valid arguments for them.
 */
|
|
9
|
+
export type NERModelMode = "standard" | "quantized" | "disabled" | "custom";
|
|
10
|
+
/**
 * Model file info.
 *
 * Describes a single file of a model: its name in the remote repo, the name
 * it has locally, and whether the file is required.
 */
|
|
13
|
+
export interface ModelFileInfo {
|
|
14
|
+
/** Filename in the repo */
|
|
15
|
+
repoFile: string;
|
|
16
|
+
/** Local filename */
|
|
17
|
+
localFile: string;
|
|
18
|
+
/** Whether file is required */
|
|
19
|
+
required: boolean;
|
|
20
|
+
}
|
|
21
|
+
/**
 * Model registry entry.
 *
 * Bundles the descriptive metadata of one model with its Hugging Face
 * location, the list of files to download and the model's label map.
 */
|
|
24
|
+
export interface ModelInfo {
|
|
25
|
+
/** Model identifier */
|
|
26
|
+
id: string;
|
|
27
|
+
/** Human-readable name */
|
|
28
|
+
name: string;
|
|
29
|
+
/** Description */
|
|
30
|
+
description: string;
|
|
31
|
+
/** Approximate size, given as a string (format/units are not specified by this declaration) */
|
|
32
|
+
size: string;
|
|
33
|
+
/** Hugging Face repo ID */
|
|
34
|
+
hfRepo: string;
|
|
35
|
+
/** Optional subfolder in repo (for models with multiple variants) */
|
|
36
|
+
hfSubfolder?: string;
|
|
37
|
+
/** Files to download */
|
|
38
|
+
files: ModelFileInfo[];
|
|
39
|
+
/** Label map for this model, as a list of label strings */
|
|
40
|
+
labelMap: string[];
|
|
41
|
+
}
|
|
42
|
+
/**
 * Registry of available models hosted on Hugging Face Hub.
 *
 * Keyed by model variant: exactly one ModelInfo entry for "standard" and one
 * for "quantized".
 *
 * Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
 * https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
 */
|
|
48
|
+
export declare const MODEL_REGISTRY: Record<"standard" | "quantized", ModelInfo>;
|
|
49
|
+
/**
 * Gets the cache directory for models.
 * Uses platform-specific cache location (or virtual path in browser).
 *
 * @returns Promise resolving to the cache directory as a string.
 */
|
|
53
|
+
export declare function getModelCacheDir(): Promise<string>;
|
|
54
|
+
/**
 * Gets the path to a specific model variant.
 *
 * @param mode - Model variant to look up: "standard" or "quantized".
 * @returns Promise resolving to the path as a string.
 */
|
|
57
|
+
export declare function getModelPath(mode: "standard" | "quantized"): Promise<string>;
|
|
58
|
+
/**
 * Checks if a model is already downloaded.
 *
 * @param mode - Model variant to check: "standard" or "quantized".
 * @returns Promise resolving to a boolean (per the description above, whether
 *   the model is already downloaded).
 */
|
|
61
|
+
export declare function isModelDownloaded(mode: "standard" | "quantized"): Promise<boolean>;
|
|
62
|
+
/**
 * Progress callback for downloads.
 *
 * Receives a single `progress` object and returns nothing. Fields:
 * - `file`: string naming the file (NOTE(review): presumably the file
 *   currently being downloaded; whether it is the repo or local name is not
 *   visible here).
 * - `bytesDownloaded`: number of bytes downloaded.
 * - `totalBytes`: total size, or `null` (NOTE(review): presumably when the
 *   total is unknown; confirm against the implementation).
 * - `percent`: progress percentage, or `null` (NOTE(review): scale 0-1 vs
 *   0-100 is not visible here; confirm against the implementation).
 */
|
|
65
|
+
export type DownloadProgressCallback = (progress: {
|
|
66
|
+
file: string;
|
|
67
|
+
bytesDownloaded: number;
|
|
68
|
+
totalBytes: number | null;
|
|
69
|
+
percent: number | null;
|
|
70
|
+
}) => void;
|
|
71
|
+
/**
 * Downloads a model variant from Hugging Face Hub.
 *
 * @param mode - Model variant to download: "standard" or "quantized".
 * @param onProgress - Optional DownloadProgressCallback.
 * @param onStatus - Optional callback that receives a status string.
 * @returns Promise resolving to a string (NOTE(review): presumably the local
 *   path of the downloaded model; confirm against the implementation).
 */
|
|
74
|
+
export declare function downloadModel(mode: "standard" | "quantized", onProgress?: DownloadProgressCallback, onStatus?: (status: string) => void): Promise<string>;
|
|
75
|
+
/**
 * Gets model paths if available, or downloads if needed.
 *
 * @param mode - Model variant: "standard" or "quantized".
 * @param options - Optional settings object; every field is optional:
 *   - `autoDownload`: boolean (NOTE(review): presumably controls whether a
 *     missing model is downloaded; the default is not visible in this
 *     declaration).
 *   - `onProgress`: DownloadProgressCallback.
 *   - `onStatus`: callback that receives a status string.
 * @returns Promise resolving to an object with three string paths:
 *   `modelPath`, `vocabPath` and `labelMapPath`.
 */
|
|
78
|
+
export declare function ensureModel(mode: "standard" | "quantized", options?: {
|
|
79
|
+
autoDownload?: boolean;
|
|
80
|
+
onProgress?: DownloadProgressCallback;
|
|
81
|
+
onStatus?: (status: string) => void;
|
|
82
|
+
}): Promise<{
|
|
83
|
+
modelPath: string;
|
|
84
|
+
vocabPath: string;
|
|
85
|
+
labelMapPath: string;
|
|
86
|
+
}>;
|
|
87
|
+
/**
 * Clears cached models.
 *
 * @param mode - Optional model variant, "standard" or "quantized"
 *   (NOTE(review): presumably all cached models are cleared when omitted;
 *   confirm against the implementation).
 * @returns Promise that resolves with no value.
 */
|
|
90
|
+
export declare function clearModelCache(mode?: "standard" | "quantized"): Promise<void>;
|
|
91
|
+
/**
 * Lists downloaded models.
 *
 * @returns Promise resolving to an array with one object per listed model,
 *   each holding `mode` ("standard" or "quantized"), `path` (string) and
 *   `size` (string).
 */
|
|
94
|
+
export declare function listDownloadedModels(): Promise<Array<{
|
|
95
|
+
mode: "standard" | "quantized";
|
|
96
|
+
path: string;
|
|
97
|
+
size: string;
|
|
98
|
+
}>>;
|
|
99
|
+
/**
 * Gets info about available models.
 *
 * Synchronous, unlike the other model functions declared in this file.
 *
 * @param mode - Model variant: "standard" or "quantized".
 * @returns The ModelInfo for that variant.
 */
|
|
102
|
+
export declare function getModelInfo(mode: "standard" | "quantized"): ModelInfo;
|
|
103
|
+
/**
 * Reads a model file as ArrayBuffer (for onnxruntime).
 *
 * @param path - Path of the file to read.
 * @returns Promise resolving to the file contents as an ArrayBuffer.
 */
|
|
106
|
+
export declare function readModelFile(path: string): Promise<ArrayBuffer>;
|
|
107
|
+
/**
 * Reads a text file from storage.
 *
 * @param path - Path of the file to read.
 * @returns Promise resolving to the file contents as a string.
 */
|
|
110
|
+
export declare function readTextFile(path: string): Promise<string>;
|
|
111
|
+
//# sourceMappingURL=model-manager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,CAiDtE,CAAC;AAiCF;;;GAGG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,MAAM,CAAC,CAGxD;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,IAAI,EAAE,UAAU,GAAG,WAAW,GAC7B,OAAO,CAAC,MAAM,CAAC,CAGjB;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,UAAU,GAAG,WAAW,GAC7B,OAAO,CAAC,OAAO,CAAC,CAyBlB;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,CAAC,QAAQ,EAAE;IAChD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAsGX;;GAEG;AACH,wBAAsB,aAAa,CACjC,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,UAAU,CAAC,EAAE,wBAAwB,EACrC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CAsDjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,wBAAwB,CAAC;IACtC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC,CAiCzE;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,IAAI,CAAC,EAAE,UAAU,GAAG,WAAW,GAC9B,OAAO,CAAC,IAAI,CAAC,CAUf;AAED;;GAEG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CACnD,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,GAAG,WAAW,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CACtE,CAgBA;AAE
D;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAEtE;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOtE;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAGhE"}
|