rehydra 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
@@ -0,0 +1,216 @@
1
+ /**
2
+ * BIO Tag Decoder
3
+ * Converts BIO-tagged token sequences to entity spans
4
+ */
5
+ import { PIIType, DetectionSource } from '../types/index.js';
6
+ import { getPIITypeFromNERLabel } from '../types/pii-types.js';
7
/**
 * Tags of the BIO (begin / inside / outside) sequence-labelling scheme.
 * Each member's value is the same single letter as its key.
 */
export const BIOTag = {
    /** First token of an entity */
    B: 'B',
    /** Token that continues the entity opened before it */
    I: 'I',
    /** Token that does not belong to any entity */
    O: 'O',
};
19
/**
 * Parses a BIO label string (e.g., "B-PER", "I-ORG", "O").
 *
 * The label is split on its FIRST hyphen only, so an entity type that itself
 * contains hyphens (e.g. "B-DATE-TIME") keeps its full type instead of being
 * discarded as "outside". The B/I prefix is matched case-insensitively.
 *
 * Anything that is not a well-formed "B-<type>" / "I-<type>" label (the literal
 * "O", the special labels "[PAD]", "[CLS]", "[SEP]", a non-string value, a label
 * without a hyphen, an unknown prefix, or an empty entity type) is reported as
 * outside.
 *
 * @param {string} label - Raw label to parse.
 * @returns {{ tag: string, entityType: (string|null) }} The BIO tag and the entity
 *   type; `entityType` is `null` whenever the tag is `BIOTag.O`.
 */
export function parseBIOLabel(label) {
    if (typeof label !== 'string' ||
        label === 'O' || label === '[PAD]' || label === '[CLS]' || label === '[SEP]') {
        return { tag: BIOTag.O, entityType: null };
    }
    // Only the first hyphen separates the prefix from the entity type.
    const separatorIndex = label.indexOf('-');
    if (separatorIndex === -1) {
        return { tag: BIOTag.O, entityType: null };
    }
    const prefix = label.slice(0, separatorIndex).toUpperCase();
    const entityType = label.slice(separatorIndex + 1);
    // "B-" / "I-" carry no usable entity type, so they are treated as malformed.
    if (entityType === '') {
        return { tag: BIOTag.O, entityType: null };
    }
    if (prefix === 'B') {
        return { tag: BIOTag.B, entityType };
    }
    if (prefix === 'I') {
        return { tag: BIOTag.I, entityType };
    }
    return { tag: BIOTag.O, entityType: null };
}
44
/**
 * Walks BIO-labelled tokens and groups them into entity spans.
 *
 * @param {Array<{start: number, end: number, isSpecial: boolean}>} tokens - Tokens;
 *   `start`/`end` are used as slice offsets into `originalText`.
 * @param {string[]} labels - One label per token; a missing label counts as "O".
 * @param {number[]} confidences - One score per token; a missing score counts as 0.
 * @param {string} originalText - Text the token offsets refer to.
 * @returns {Array<object>} Entities with `type`, `start`, `end`, `confidence`
 *   (running mean over the entity's tokens), `text` and `tokenIndices`.
 */
export function decodeBIOTags(tokens, labels, confidences, originalText) {
    const found = [];
    let open = null;
    // Emits the entity under construction, if there is one.
    const closeOpen = () => {
        if (open !== null) {
            found.push(open);
            open = null;
        }
    };
    // Emits any entity under construction and opens a new single-token one.
    const openAt = (index, tok, type, score) => {
        closeOpen();
        open = {
            type: type ?? 'UNKNOWN',
            start: tok.start,
            end: tok.end,
            confidence: score,
            text: originalText.slice(tok.start, tok.end),
            tokenIndices: [index],
        };
    };
    for (const [index, tok] of tokens.entries()) {
        const rawLabel = labels[index] ?? 'O';
        const score = confidences[index] ?? 0;
        // Special tokens never belong to an entity and always end the open one.
        if (tok.isSpecial) {
            closeOpen();
            continue;
        }
        const { tag, entityType } = parseBIOLabel(rawLabel);
        if (tag === BIOTag.B) {
            openAt(index, tok, entityType, score);
        }
        else if (tag === BIOTag.I) {
            const continues = open !== null && entityType === open.type;
            if (!continues) {
                // I tag without a matching B tag: treated as the start of a new entity
                openAt(index, tok, entityType, score);
                continue;
            }
            open.end = tok.end;
            open.text = originalText.slice(open.start, open.end);
            open.tokenIndices.push(index);
            // Running mean of the per-token confidences
            const count = open.tokenIndices.length;
            open.confidence = (open.confidence * (count - 1) + score) / count;
        }
        else if (tag === BIOTag.O) {
            closeOpen();
        }
    }
    // An entity that reaches the final token is still open here
    closeOpen();
    return found;
}
122
/**
 * Turns raw NER entities into SpanMatch objects.
 *
 * An entity is dropped when its confidence is below the threshold or when
 * `getPIITypeFromNERLabel` returns `null` for its type.
 *
 * @param {Iterable<object>} rawEntities - Entities carrying `type`, `start`, `end`,
 *   `confidence` and `text`.
 * @param {number} [confidenceThreshold=0.5] - Minimum confidence to keep an entity.
 * @returns {Array<object>} Spans tagged with `DetectionSource.NER`.
 */
export function convertToSpanMatches(rawEntities, confidenceThreshold = 0.5) {
    const matches = [];
    for (const { type: nerLabel, start, end, confidence, text } of rawEntities) {
        // Written as a negated "<" so the comparison matches the filter exactly
        const meetsThreshold = !(confidence < confidenceThreshold);
        if (meetsThreshold) {
            const mappedType = getPIITypeFromNERLabel(nerLabel);
            // Labels without a PII mapping are ignored
            if (mappedType !== null) {
                matches.push({
                    type: mappedType,
                    start,
                    end,
                    confidence,
                    source: DetectionSource.NER,
                    text,
                });
            }
        }
    }
    return matches;
}
148
/**
 * Post-processes NER spans to clean up boundaries.
 *
 * Leading and trailing whitespace is trimmed from every span. For PERSON and ORG
 * spans, the punctuation characters . , ; : ! ? ' " ( ) are trimmed as well, and
 * both classes are removed in the same pass so that whitespace uncovered by
 * stripping punctuation (e.g. `( John Smith . )`) does not survive at the edges.
 *
 * @param {Array<object>} spans - Spans with `type`, `start` and `end`.
 * @param {string} originalText - Text the span offsets refer to.
 * @returns {Array<object>} One entry per input span: a copy with updated `start`,
 *   `end` and `text`, or the original span object when trimming would leave it empty.
 */
export function cleanupSpanBoundaries(spans, originalText) {
    const whitespaceOnly = /\s/;
    const whitespaceOrPunctuation = /[\s.,;:!?'"()]/;
    return spans.map((span) => {
        const stripsPunctuation = span.type === PIIType.PERSON || span.type === PIIType.ORG;
        const trimmable = stripsPunctuation ? whitespaceOrPunctuation : whitespaceOnly;
        let { start, end } = span;
        // Offsets past the end of the text read as '' and therefore never match.
        while (start < end && trimmable.test(originalText[start] ?? '')) {
            start++;
        }
        while (end > start && trimmable.test(originalText[end - 1] ?? '')) {
            end--;
        }
        // If span became empty, return original
        if (start >= end) {
            return span;
        }
        return {
            ...span,
            start,
            end,
            text: originalText.slice(start, end),
        };
    });
}
183
/**
 * Merges adjacent spans of the same type.
 *
 * Spans are processed in order of `start`. Two consecutive spans are merged when
 * they share a type, the gap between them is at most `maxGap` characters, and the
 * gap contains only whitespace. Overlapping or nested spans have a non-positive
 * gap and are merged too; the merged span always ends at the furthest `end` of the
 * two, so a span nested inside a longer one can never shrink the result.
 *
 * @param {Array<object>} spans - Spans with `type`, `start`, `end`, `confidence`.
 * @param {string} originalText - Text the span offsets refer to.
 * @param {number} [maxGap=1] - Largest gap, in characters, that may be bridged.
 * @returns {Array<object>} Merged spans sorted by `start`. With zero or one span
 *   the input array itself is returned. A merged span takes its remaining
 *   properties from the earlier span and the mean of the two confidences.
 */
export function mergeAdjacentSpans(spans, originalText, maxGap = 1) {
    if (spans.length <= 1) {
        return spans;
    }
    // Sort a copy so the caller's array is left untouched
    const sorted = [...spans].sort((a, b) => a.start - b.start);
    const merged = [];
    let current = sorted[0];
    for (const next of sorted.slice(1)) {
        const gap = next.start - current.end;
        // slice() yields '' when the spans overlap, which counts as whitespace-only
        const gapText = originalText.slice(current.end, next.start);
        const isOnlyWhitespace = /^\s*$/.test(gapText);
        if (next.type === current.type && gap <= maxGap && isOnlyWhitespace) {
            // Never let a nested/overlapping span pull the end backwards
            const end = Math.max(current.end, next.end);
            current = {
                ...current,
                end,
                text: originalText.slice(current.start, end),
                confidence: (current.confidence + next.confidence) / 2,
            };
        }
        else {
            merged.push(current);
            current = next;
        }
    }
    merged.push(current);
    return merged;
}
216
+ //# sourceMappingURL=bio-decoder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"bio-decoder.js","sourceRoot":"","sources":["../../src/ner/bio-decoder.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAa,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACxE,OAAO,EAAE,sBAAsB,EAAE,MAAM,uBAAuB,CAAC;AAG/D;;GAEG;AACH,MAAM,CAAN,IAAY,MAOX;AAPD,WAAY,MAAM;IAChB,6BAA6B;IAC7B,iBAAO,CAAA;IACP,sCAAsC;IACtC,iBAAO,CAAA;IACP,yBAAyB;IACzB,iBAAO,CAAA;AACT,CAAC,EAPW,MAAM,KAAN,MAAM,QAOjB;AA8BD;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,KAAa;IACzC,IAAI,KAAK,KAAK,GAAG,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;QACjF,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC/B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,GAAG,KAAK,CAAC;IAEnC,IAAI,GAAW,CAAC;IAChB,QAAQ,MAAM,EAAE,WAAW,EAAE,EAAE,CAAC;QAC9B,KAAK,GAAG;YACN,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YACf,MAAM;QACR,KAAK,GAAG;YACN,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YACf,MAAM;QACR;YACE,OAAO,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAC/C,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,UAAU,IAAI,IAAI,EAAE,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAC3B,MAAe,EACf,MAAgB,EAChB,WAAqB,EACrB,YAAoB;IAEpB,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,IAAI,aAAa,GAAwB,IAAI,CAAC;IAE9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QACzB,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;QAC/B,MAAM,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEvC,sBAAsB;QACtB,IAAI,KAAK,CAAC,SAAS,EAAE,CAAC;YACpB,wCAAwC;YACxC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;gBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;gBAC7B,aAAa,GAAG,IAAI,CAAC;YACvB,CAAC;YACD,SAAS;QACX,CAAC;QAED,MAAM,EAAE,GAAG,EAAE,UAAU,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;QAEjD,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,MAAM,CAAC,CAAC;gBACX,sBAAsB;gBACtB,kCAAkC;gBAClC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;oBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,C
AAC;gBAC/B,CAAC;gBAED,aAAa,GAAG;oBACd,IAAI,EAAE,UAAU,IAAI,SAAS;oBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;oBACd,UAAU;oBACV,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC;oBAChD,YAAY,EAAE,CAAC,CAAC,CAAC;iBAClB,CAAC;gBACF,MAAM;YAER,KAAK,MAAM,CAAC,CAAC;gBACX,yBAAyB;gBACzB,IAAI,aAAa,KAAK,IAAI,IAAI,UAAU,KAAK,aAAa,CAAC,IAAI,EAAE,CAAC;oBAChE,wBAAwB;oBACxB,aAAa,CAAC,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC;oBAC9B,aAAa,CAAC,IAAI,GAAG,YAAY,CAAC,KAAK,CAAC,aAAa,CAAC,KAAK,EAAE,aAAa,CAAC,GAAG,CAAC,CAAC;oBAChF,aAAa,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBACnC,qBAAqB;oBACrB,aAAa,CAAC,UAAU;wBACtB,CAAC,aAAa,CAAC,UAAU,GAAG,CAAC,aAAa,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC;4BACjF,aAAa,CAAC,YAAY,CAAC,MAAM,CAAC;gBACtC,CAAC;qBAAM,CAAC;oBACN,6EAA6E;oBAC7E,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;wBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC/B,CAAC;oBACD,aAAa,GAAG;wBACd,IAAI,EAAE,UAAU,IAAI,SAAS;wBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;wBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;wBACd,UAAU;wBACV,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC;wBAChD,YAAY,EAAE,CAAC,CAAC,CAAC;qBAClB,CAAC;gBACJ,CAAC;gBACD,MAAM;YAER,KAAK,MAAM,CAAC,CAAC;gBACX,2CAA2C;gBAC3C,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;oBAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC7B,aAAa,GAAG,IAAI,CAAC;gBACvB,CAAC;gBACD,MAAM;QACV,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;QAC3B,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC/B,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAClC,WAA2B,EAC3B,sBAA8B,GAAG;IAEjC,MAAM,KAAK,GAAgB,EAAE,CAAC;IAE9B,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;QACjC,uBAAuB;QACvB,IAAI,MAAM,CAAC,UAAU,GAAG,mBAAmB,EAAE,CAAC;YAC5C,SAAS;QACX,CAAC;QAED,6BAA6B;QAC7B,MAAM,OAAO,GAAG,sBAAsB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACpD,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,SAAS,CAAC,qBAAqB;QACjC,CAAC;QAED,KAAK,CAAC,IAAI,CAAC;YACT,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,GAAG,EAAE,MAAM,CAAC,GAAG;YACf,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,MAAM,EAAE,eAAe,CAAC,GAAG;YAC3B,IAAI,EAAE,MAAM,C
AAC,IAAI;SAClB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,KAAkB,EAClB,YAAoB;IAEpB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACxB,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QAE1B,0BAA0B;QAC1B,OAAO,KAAK,GAAG,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;YAC3D,KAAK,EAAE,CAAC;QACV,CAAC;QAED,2BAA2B;QAC3B,OAAO,GAAG,GAAG,KAAK,IAAI,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;YAC7D,GAAG,EAAE,CAAC;QACR,CAAC;QAED,yDAAyD;QACzD,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,GAAG,EAAE,CAAC;YAC9D,OAAO,KAAK,GAAG,GAAG,IAAI,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;gBACrE,KAAK,EAAE,CAAC;YACV,CAAC;YACD,OAAO,GAAG,GAAG,KAAK,IAAI,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC;gBACvE,GAAG,EAAE,CAAC;YACR,CAAC;QACH,CAAC;QAED,wCAAwC;QACxC,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;YACjB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,GAAG,IAAI;YACP,KAAK;YACL,GAAG;YACH,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC;SACrC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAChC,KAAkB,EAClB,YAAoB,EACpB,SAAiB,CAAC;IAElB,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAEpC,yBAAyB;IACzB,MAAM,MAAM,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,IAAI,OAAO,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QAExB,sCAAsC;QACtC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;QAC5D,MAAM,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE/C,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,IAAI,IAAI,GAAG,IAAI,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACpE,cAAc;YACd,OAAO,GAAG;gBACR,GAAG,OAAO;gBACV,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,IAAI,EAAE,
YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC;gBACjD,UAAU,EAAE,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;aACvD,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACrB,OAAO,GAAG,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACrB,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * NER Module
3
+ * Exports NER model and tokenizer components
4
+ */
5
+ export * from './tokenizer.js';
6
+ export * from './bio-decoder.js';
7
+ export * from './ner-model.js';
8
+ export { loadRuntime, detectRuntime, getRuntimeType } from './onnx-runtime.js';
9
+ export { type NERModelMode, type ModelInfo, type ModelFileInfo, type DownloadProgressCallback, MODEL_REGISTRY, getModelCacheDir, getModelPath, isModelDownloaded, downloadModel, ensureModel, clearModelCache, listDownloadedModels, getModelInfo, } from './model-manager.js';
10
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ner/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,gBAAgB,CAAC;AAC/B,cAAc,kBAAkB,CAAC;AACjC,cAAc,gBAAgB,CAAC;AAC/B,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAC/E,OAAO,EACL,KAAK,YAAY,EACjB,KAAK,SAAS,EACd,KAAK,aAAa,EAClB,KAAK,wBAAwB,EAC7B,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,EACjB,aAAa,EACb,WAAW,EACX,eAAe,EACf,oBAAoB,EACpB,YAAY,GACb,MAAM,oBAAoB,CAAC"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * NER Module
3
+ * Exports NER model and tokenizer components
4
+ */
5
+ export * from './tokenizer.js';
6
+ export * from './bio-decoder.js';
7
+ export * from './ner-model.js';
8
+ export { loadRuntime, detectRuntime, getRuntimeType } from './onnx-runtime.js';
9
+ export { MODEL_REGISTRY, getModelCacheDir, getModelPath, isModelDownloaded, downloadModel, ensureModel, clearModelCache, listDownloadedModels, getModelInfo, } from './model-manager.js';
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/ner/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,gBAAgB,CAAC;AAC/B,cAAc,kBAAkB,CAAC;AACjC,cAAc,gBAAgB,CAAC;AAC/B,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAC/E,OAAO,EAKL,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,EACjB,aAAa,EACb,WAAW,EACX,eAAe,EACf,oBAAoB,EACpB,YAAY,GACb,MAAM,oBAAoB,CAAC"}
@@ -0,0 +1,111 @@
1
+ /**
2
+ * NER Model Manager
3
+ * Handles automatic downloading and caching of NER models from Hugging Face Hub
4
+ * Browser-compatible using storage abstraction layer
5
+ */
6
+ /**
7
+ * Available NER model variants
8
+ */
9
+ export type NERModelMode = "standard" | "quantized" | "disabled" | "custom";
10
+ /**
11
+ * Model file info
12
+ */
13
+ export interface ModelFileInfo {
14
+ /** Filename in the repo */
15
+ repoFile: string;
16
+ /** Local filename */
17
+ localFile: string;
18
+ /** Whether file is required */
19
+ required: boolean;
20
+ }
21
+ /**
22
+ * Model registry entry
23
+ */
24
+ export interface ModelInfo {
25
+ /** Model identifier */
26
+ id: string;
27
+ /** Human-readable name */
28
+ name: string;
29
+ /** Description */
30
+ description: string;
31
+ /** Approximate size */
32
+ size: string;
33
+ /** Hugging Face repo ID */
34
+ hfRepo: string;
35
+ /** Subfolder in repo (for models with multiple variants) */
36
+ hfSubfolder?: string;
37
+ /** Files to download */
38
+ files: ModelFileInfo[];
39
+ /** Label map for this model */
40
+ labelMap: string[];
41
+ }
42
+ /**
43
+ * Registry of available models hosted on Hugging Face Hub
44
+ *
45
+ * Using ELAN's ONNX exports which are optimized for JS/ONNX runtime
46
+ * https://huggingface.co/tjruesch/xlm-roberta-base-ner-hrl-onnx
47
+ */
48
+ export declare const MODEL_REGISTRY: Record<"standard" | "quantized", ModelInfo>;
49
+ /**
50
+ * Gets the cache directory for models
51
+ * Uses platform-specific cache location (or virtual path in browser)
52
+ */
53
+ export declare function getModelCacheDir(): Promise<string>;
54
+ /**
55
+ * Gets the path to a specific model variant
56
+ */
57
+ export declare function getModelPath(mode: "standard" | "quantized"): Promise<string>;
58
+ /**
59
+ * Checks if a model is already downloaded
60
+ */
61
+ export declare function isModelDownloaded(mode: "standard" | "quantized"): Promise<boolean>;
62
+ /**
63
+ * Progress callback for downloads
64
+ */
65
+ export type DownloadProgressCallback = (progress: {
66
+ file: string;
67
+ bytesDownloaded: number;
68
+ totalBytes: number | null;
69
+ percent: number | null;
70
+ }) => void;
71
+ /**
72
+ * Downloads a model variant from Hugging Face Hub
73
+ */
74
+ export declare function downloadModel(mode: "standard" | "quantized", onProgress?: DownloadProgressCallback, onStatus?: (status: string) => void): Promise<string>;
75
+ /**
76
+ * Gets model paths if available, or downloads if needed
77
+ */
78
+ export declare function ensureModel(mode: "standard" | "quantized", options?: {
79
+ autoDownload?: boolean;
80
+ onProgress?: DownloadProgressCallback;
81
+ onStatus?: (status: string) => void;
82
+ }): Promise<{
83
+ modelPath: string;
84
+ vocabPath: string;
85
+ labelMapPath: string;
86
+ }>;
87
+ /**
88
+ * Clears cached models
89
+ */
90
+ export declare function clearModelCache(mode?: "standard" | "quantized"): Promise<void>;
91
+ /**
92
+ * Lists downloaded models
93
+ */
94
+ export declare function listDownloadedModels(): Promise<Array<{
95
+ mode: "standard" | "quantized";
96
+ path: string;
97
+ size: string;
98
+ }>>;
99
+ /**
100
+ * Gets info about available models
101
+ */
102
+ export declare function getModelInfo(mode: "standard" | "quantized"): ModelInfo;
103
+ /**
104
+ * Reads a model file as ArrayBuffer (for onnxruntime)
105
+ */
106
+ export declare function readModelFile(path: string): Promise<ArrayBuffer>;
107
+ /**
108
+ * Reads a text file from storage
109
+ */
110
+ export declare function readTextFile(path: string): Promise<string>;
111
+ //# sourceMappingURL=model-manager.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"model-manager.d.ts","sourceRoot":"","sources":["../../src/ner/model-manager.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,QAAQ,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,qBAAqB;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,CAAC,UAAU,GAAG,WAAW,EAAE,SAAS,CAiDtE,CAAC;AAiCF;;;GAGG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,MAAM,CAAC,CAGxD;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,IAAI,EAAE,UAAU,GAAG,WAAW,GAC7B,OAAO,CAAC,MAAM,CAAC,CAGjB;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,UAAU,GAAG,WAAW,GAC7B,OAAO,CAAC,OAAO,CAAC,CAyBlB;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,CAAC,QAAQ,EAAE;IAChD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAsGX;;GAEG;AACH,wBAAsB,aAAa,CACjC,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,UAAU,CAAC,EAAE,wBAAwB,EACrC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CAsDjB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,UAAU,GAAG,WAAW,EAC9B,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,wBAAwB,CAAC;IACtC,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC,CAiCzE;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,IAAI,CAAC,EAAE,UAAU,GAAG,WAAW,GAC9B,OAAO,CAAC,IAAI,CAAC,CAUf;AAED;;GAEG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CACnD,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,GAAG,WAAW,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CACtE,CAgBA;A
AED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAEtE;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOtE;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAGhE"}