rehydra 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
@@ -0,0 +1,239 @@
1
+ /**
2
+ * Entity Resolver
3
+ * Merges, deduplicates, and resolves overlapping entity detections
4
+ */
5
+ import { PIIType, DetectionSource, DEFAULT_TYPE_PRIORITY, } from '../types/index.js';
6
+ import { spansOverlap, spanLength, sortSpansByPosition } from '../utils/offsets.js';
/**
 * Strategies for choosing a winner when two detected entities overlap.
 * Every member's value is its own name as a string.
 */
export const OverlapStrategy = {
    /** Regex matches always win over NER */
    REGEX_PRIORITY: "REGEX_PRIORITY",
    /** Longer span wins */
    LONGER_SPAN: "LONGER_SPAN",
    /** Higher confidence wins */
    HIGHER_CONFIDENCE: "HIGHER_CONFIDENCE",
    /** Use type priority from policy */
    TYPE_PRIORITY: "TYPE_PRIORITY",
};
/**
 * Baseline resolver settings. resolveEntities spreads the caller's partial
 * config over these values, so any key the caller omits comes from here.
 */
export const DEFAULT_RESOLVER_CONFIG = {
    // Strategy consulted by resolveOverlap after the regex-vs-NER rule
    overlapStrategy: OverlapStrategy.REGEX_PRIORITY,
    // When true, a regex-sourced span beats an overlapping non-regex span
    regexPriority: true,
    // Minimum confidence setting; equal to the 0.5 fallback threshold used
    // when filtering by policy
    minConfidence: 0.5,
};
/**
 * Resolves and merges entity detections from regex and NER.
 *
 * Steps: policy filter -> allowlist filter -> combine -> overlap resolution
 * -> denylist additions -> exact de-duplication -> positional sort.
 *
 * @param regexMatches - spans produced by the regex recognizers
 * @param nerMatches - spans produced by the NER model
 * @param policy - policy object; this function and its helpers read
 *   enabledTypes, confidenceThresholds, allowlistTerms, denylistPatterns
 *   and typePriority from it
 * @param originalText - text the spans refer to; scanned for denylist patterns
 * @param config - partial resolver config, merged over DEFAULT_RESOLVER_CONFIG.
 *   `minConfidence` is the threshold applied to types that have no entry in
 *   policy.confidenceThresholds.
 * @returns the resolved spans, ordered by sortSpansByPosition
 */
export function resolveEntities(regexMatches, nerMatches, policy, originalText, config = {}) {
    const resolverConfig = { ...DEFAULT_RESOLVER_CONFIG, ...config };
    // An explicit `minConfidence: undefined` in the caller's config would
    // overwrite the default during the spread, so fall back again here.
    const fallbackThreshold = resolverConfig.minConfidence ?? DEFAULT_RESOLVER_CONFIG.minConfidence;
    // Step 1: Filter by enabled types and confidence thresholds
    const filteredRegex = filterByPolicy(regexMatches, policy, fallbackThreshold);
    const filteredNER = filterByPolicy(nerMatches, policy, fallbackThreshold);
    // Step 2: Apply allowlist filtering
    const allowlistFilteredRegex = applyAllowlist(filteredRegex, policy, originalText);
    const allowlistFilteredNER = applyAllowlist(filteredNER, policy, originalText);
    // Step 3: Combine all matches
    const allMatches = [...allowlistFilteredRegex, ...allowlistFilteredNER];
    // Step 4: Remove overlaps based on strategy
    const resolved = removeOverlaps(allMatches, policy, resolverConfig);
    // Step 5: Apply denylist patterns (force include)
    const withDenylist = applyDenylist(resolved, policy, originalText);
    // Step 6: Final deduplication
    const deduplicated = deduplicateExact(withDenylist);
    // Step 7: Sort by position
    return sortSpansByPosition(deduplicated);
}
/**
 * Filters matches by policy (enabled types and confidence thresholds).
 *
 * @param matches - candidate spans
 * @param policy - supplies `enabledTypes` (queried with .has) and
 *   `confidenceThresholds` (queried with .get), both keyed by span type
 * @param defaultThreshold - threshold for types without an explicit entry
 *   in `confidenceThresholds`; defaults to 0.5
 * @returns a new array with the spans whose type is enabled and whose
 *   confidence is not below the applicable threshold
 */
function filterByPolicy(matches, policy, defaultThreshold = 0.5) {
    return matches.filter((match) => {
        // Check if type is enabled
        if (!policy.enabledTypes.has(match.type)) {
            return false;
        }
        // Check confidence threshold (per-type entry first, then the fallback)
        const threshold = policy.confidenceThresholds.get(match.type) ?? defaultThreshold;
        return !(match.confidence < threshold);
    });
}
/**
 * Drops matches whose text is an allowlisted (known non-PII) term.
 * The match text is lower-cased and trimmed before the lookup, so the
 * entries of policy.allowlistTerms are compared against that normalised form.
 *
 * @param matches - candidate spans, each carrying a `text` string
 * @param policy - supplies the `allowlistTerms` set
 * @param _originalText - unused
 * @returns `matches` itself when the allowlist is empty, otherwise a new array
 */
function applyAllowlist(matches, policy, _originalText) {
    const { allowlistTerms } = policy;
    if (allowlistTerms.size === 0) {
        return matches;
    }
    const isAllowlisted = (candidate) => {
        const normalized = candidate.text.toLowerCase().trim();
        return allowlistTerms.has(normalized);
    };
    return matches.filter((candidate) => !isAllowlisted(candidate));
}
/**
 * Adds matches for denylist patterns (text that must always be treated as PII).
 *
 * Every non-empty occurrence of each pattern in `originalText` becomes a new
 * span unless an existing match already fully contains it. A denylist hit that
 * only partially overlaps an existing match is still added, so the returned
 * list may contain overlapping spans.
 *
 * Limitation: added spans are always typed PIIType.EMAIL with confidence 1.0
 * and the regex detection source; the type is not configurable here.
 *
 * @param matches - spans that survived overlap resolution
 * @param policy - supplies `denylistPatterns`, an array of RegExp objects
 * @param originalText - text to scan
 * @returns `matches` itself when there are no patterns, otherwise a new array
 *   of the existing matches followed by the denylist additions
 */
function applyDenylist(matches, policy, originalText) {
    if (policy.denylistPatterns.length === 0) {
        return matches;
    }
    const denylistMatches = [];
    for (const pattern of policy.denylistPatterns) {
        // Always scan with a fresh global copy. matchAll starts from the source
        // regex's lastIndex, so reusing a caller-owned /g pattern that has been
        // used before (test/exec) would silently skip the start of the text.
        const flags = pattern.global ? pattern.flags : `${pattern.flags}g`;
        const scanner = new RegExp(pattern.source, flags);
        for (const match of originalText.matchAll(scanner)) {
            const matchedText = match[0];
            // Patterns that can match the empty string would otherwise emit
            // zero-length "PII" spans at every position.
            if (match.index === undefined || matchedText.length === 0) {
                continue;
            }
            const start = match.index;
            const end = start + matchedText.length;
            // Check if this is already covered by existing matches
            const alreadyCovered = matches.some((existing) => existing.start <= start && existing.end >= end);
            if (!alreadyCovered) {
                denylistMatches.push({
                    type: PIIType.EMAIL, // Default type for denylist; could be configurable
                    start,
                    end,
                    confidence: 1.0,
                    source: DetectionSource.REGEX,
                    text: matchedText,
                });
            }
        }
    }
    return [...matches, ...denylistMatches];
}
/**
 * Reduces a list of spans to a set in which overlapping candidates have been
 * settled by resolveOverlap.
 *
 * Candidates are visited in sortSpansByPosition order. Each candidate is
 * compared with the first already-kept span it overlaps: if the candidate wins
 * it takes that span's slot, otherwise it is dropped.
 *
 * @param matches - spans to resolve
 * @param policy - forwarded to resolveOverlap
 * @param config - forwarded to resolveOverlap
 * @returns `matches` itself when it has at most one element, otherwise a new array
 */
function removeOverlaps(matches, policy, config) {
    if (matches.length <= 1) {
        return matches;
    }
    const kept = [];
    for (const candidate of sortSpansByPosition(matches)) {
        const clashAt = kept.findIndex((held) => spansOverlap(candidate, held));
        if (clashAt === -1) {
            // Nothing kept so far collides with this candidate
            kept.push(candidate);
            continue;
        }
        // The kept span is passed first, so it survives a full tie
        const winner = resolveOverlap(kept[clashAt], candidate, policy, config);
        if (winner === candidate) {
            kept[clashAt] = candidate;
        }
    }
    return kept;
}
/**
 * Picks the winner between two overlapping spans.
 *
 * Rules, applied in order until one produces a winner:
 *   1. If `config.regexPriority` is set and exactly one span comes from the
 *      regex source, that span wins.
 *   2. The rule selected by `config.overlapStrategy` (span length, confidence
 *      or type priority). REGEX_PRIORITY and unrecognised values add no rule.
 *   3. Tiebreakers: longer span, then higher confidence, then higher type
 *      priority.
 *   4. On a complete tie, `a` is returned.
 *
 * @param a - first span (kept on a complete tie)
 * @param b - second span
 * @param policy - forwarded to getTypePriority
 * @param config - resolver config (`regexPriority`, `overlapStrategy`)
 * @returns either `a` or `b`
 */
function resolveOverlap(a, b, policy, config) {
    if (config.regexPriority) {
        const aFromRegex = a.source === DetectionSource.REGEX;
        const bFromRegex = b.source === DetectionSource.REGEX;
        if (aFromRegex !== bFromRegex) {
            return aFromRegex ? a : b;
        }
    }
    // Returns the span with the greater score, or undefined when scores tie.
    const prefer = (scoreA, scoreB) => {
        if (scoreA === scoreB) {
            return undefined;
        }
        return scoreA > scoreB ? a : b;
    };
    const byLength = () => prefer(spanLength(a), spanLength(b));
    const byConfidence = () => prefer(a.confidence, b.confidence);
    const byTypePriority = () => prefer(getTypePriority(a.type, policy), getTypePriority(b.type, policy));
    const strategyRule = new Map([
        [OverlapStrategy.LONGER_SPAN, byLength],
        [OverlapStrategy.HIGHER_CONFIDENCE, byConfidence],
        [OverlapStrategy.TYPE_PRIORITY, byTypePriority],
    ]).get(config.overlapStrategy);
    // The configured strategy is consulted first, then the fixed tiebreakers.
    const tiebreakers = [byLength, byConfidence, byTypePriority];
    const rules = strategyRule ? [strategyRule, ...tiebreakers] : tiebreakers;
    for (const rule of rules) {
        const winner = rule();
        if (winner !== undefined) {
            return winner;
        }
    }
    // Final tiebreaker: keep first one
    return a;
}
/**
 * Returns the rank of `type` as its position in the priority list.
 * Callers treat a greater number as more important, so later list entries
 * outrank earlier ones and a type missing from the list ranks lowest (-1).
 * policy.typePriority is used when non-empty, otherwise DEFAULT_TYPE_PRIORITY.
 *
 * @param type - entity type to rank
 * @param policy - supplies `typePriority`
 * @returns the zero-based position of `type`, or -1 when it is not listed
 */
function getTypePriority(type, policy) {
    const { typePriority } = policy;
    const ranking = typePriority.length > 0 ? typePriority : [...DEFAULT_TYPE_PRIORITY];
    const position = ranking.indexOf(type);
    if (position >= 0) {
        return position;
    }
    return -1;
}
/**
 * Drops spans that repeat an earlier span's exact start, end and type.
 * The first occurrence is kept and the input order is preserved.
 *
 * @param matches - spans to de-duplicate
 * @returns a new array without the repeated spans
 */
function deduplicateExact(matches) {
    const seenKeys = new Set();
    return matches.filter((candidate) => {
        const key = `${candidate.start}:${candidate.end}:${candidate.type}`;
        if (seenKeys.has(key)) {
            return false;
        }
        seenKeys.add(key);
        return true;
    });
}
/**
 * Builds the list of protected spans from regex matches: one `{ start, end }`
 * object per match, with every other field dropped.
 * Used to mask regex matches from NER to avoid double-detection.
 *
 * @param regexMatches - regex-detected spans
 * @returns a new array of `{ start, end }` objects in the same order
 */
export function createProtectedSpans(regexMatches) {
    return regexMatches.map((match) => {
        const region = { start: match.start, end: match.end };
        return region;
    });
}
/**
 * Tells whether `span` overlaps at least one of the protected spans,
 * using spansOverlap for the pairwise check.
 *
 * @param span - span to test
 * @param protectedSpans - regions to test against
 * @returns true on the first overlap found, false when there is none
 */
export function isInProtectedSpan(span, protectedSpans) {
    const overlapsSpan = (region) => spansOverlap(span, region);
    return protectedSpans.some(overlapsSpan);
}
239
+ //# sourceMappingURL=resolver.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"resolver.js","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,EAEP,eAAe,EAEf,qBAAqB,GACtB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAEpF;;GAEG;AACH,MAAM,CAAN,IAAY,eASX;AATD,WAAY,eAAe;IACzB,wCAAwC;IACxC,oDAAiC,CAAA;IACjC,uBAAuB;IACvB,8CAA2B,CAAA;IAC3B,6BAA6B;IAC7B,0DAAuC,CAAA;IACvC,oCAAoC;IACpC,kDAA+B,CAAA;AACjC,CAAC,EATW,eAAe,KAAf,eAAe,QAS1B;AAcD;;GAEG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAmB;IACrD,eAAe,EAAE,eAAe,CAAC,cAAc;IAC/C,aAAa,EAAE,IAAI;IACnB,aAAa,EAAE,GAAG;CACnB,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,YAAyB,EACzB,UAAuB,EACvB,MAA2B,EAC3B,YAAoB,EACpB,SAAkC,EAAE;IAEpC,MAAM,cAAc,GAAG,EAAE,GAAG,uBAAuB,EAAE,GAAG,MAAM,EAAE,CAAC;IAEjE,4DAA4D;IAC5D,MAAM,aAAa,GAAG,cAAc,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAC3D,MAAM,WAAW,GAAG,cAAc,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;IAEvD,oCAAoC;IACpC,MAAM,sBAAsB,GAAG,cAAc,CAAC,aAAa,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IACnF,MAAM,oBAAoB,GAAG,cAAc,CAAC,WAAW,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IAE/E,8BAA8B;IAC9B,MAAM,UAAU,GAAG,CAAC,GAAG,sBAAsB,EAAE,GAAG,oBAAoB,CAAC,CAAC;IAExE,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,cAAc,CAAC,UAAU,EAAE,MAAM,EAAE,cAAc,CAAC,CAAC;IAEpE,kDAAkD;IAClD,MAAM,YAAY,GAAG,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IAEnE,8BAA8B;IAC9B,MAAM,YAAY,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAEpD,2BAA2B;IAC3B,OAAO,mBAAmB,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,OAAoB,EAAE,MAA2B;IACvE,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QAC9B,2BAA2B;QAC3B,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;YACzC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,6BAA6B;QAC7B,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;QACrE,IAAI,KAAK,CAAC,UAAU,GAAG,SAAS,EAAE,CAAC;YACjC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,OAAoB,EACpB,MAA2B,EAC3B,aAAqB;IAErB,IAAI,MAAM,CAAC,cAAc,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;QACrC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,OAAO
,OAAO,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QAC9B,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;QAClD,OAAO,CAAC,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CACpB,OAAoB,EACpB,MAA2B,EAC3B,YAAoB;IAEpB,IAAI,MAAM,CAAC,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,MAAM,eAAe,GAAgB,EAAE,CAAC;IAExC,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;QAC9C,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM;YAClC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC;QAEpD,KAAK,MAAM,KAAK,IAAI,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;YACzD,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS;gBAAE,SAAS;YAExC,uDAAuD;YACvD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC;YAC/B,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CACjC,CAAC,QAAQ,EAAE,EAAE,CACX,QAAQ,CAAC,KAAK,IAAI,UAAU;gBAC5B,QAAQ,CAAC,GAAG,IAAI,UAAU,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAC/C,CAAC;YAEF,IAAI,CAAC,cAAc,EAAE,CAAC;gBACpB,eAAe,CAAC,IAAI,CAAC;oBACnB,IAAI,EAAE,OAAO,CAAC,KAAK,EAAE,mDAAmD;oBACxE,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM;oBAClC,UAAU,EAAE,GAAG;oBACf,MAAM,EAAE,eAAe,CAAC,KAAK;oBAC7B,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;iBACf,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,OAAO,EAAE,GAAG,eAAe,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,OAAoB,EACpB,MAA2B,EAC3B,MAAsB;IAEtB,IAAI,OAAO,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,yBAAyB;IACzB,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAC5C,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,qCAAqC;QACrC,MAAM,cAAc,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,YAAY,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC;QAErF,IAAI,cAAc,KAAK,CAAC,CAAC,EAAE,CAAC;YAC1B,2BAA2B;YAC3B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;aAAM,CAAC;YACN,uBAAuB;YACvB,MAAM,QAAQ,GAAG,MAAM,CAAC,cAAc,CAAE,CAAC;YACzC,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;YAE/D,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;gBACrB
,mCAAmC;gBACnC,MAAM,CAAC,cAAc,CAAC,GAAG,KAAK,CAAC;YACjC,CAAC;YACD,uCAAuC;QACzC,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CACrB,CAAY,EACZ,CAAY,EACZ,MAA2B,EAC3B,MAAsB;IAEtB,+CAA+C;IAC/C,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;QACzB,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;YAC7E,OAAO,CAAC,CAAC;QACX,CAAC;QACD,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;YAC7E,OAAO,CAAC,CAAC;QACX,CAAC;IACH,CAAC;IAED,iCAAiC;IACjC,QAAQ,MAAM,CAAC,eAAe,EAAE,CAAC;QAC/B,KAAK,eAAe,CAAC,WAAW,CAAC,CAAC,CAAC;YACjC,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;gBAClB,OAAO,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,iBAAiB,CAAC,CAAC,CAAC;YACvC,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;gBAClC,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7C,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,aAAa,CAAC,CAAC,CAAC;YACnC,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAClD,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAClD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;gBAC5B,OAAO,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACvC,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,cAAc,CAAC;QACpC;YACE,wBAAwB;YACxB,MAAM;IACV,CAAC;IAED,+DAA+D;IAC/D,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;QAClC,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClD,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;QAC5B,OAAO,SAAS,GAAG,SAAS
,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,mCAAmC;IACnC,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,IAAa,EAAE,MAA2B;IACjE,MAAM,YAAY,GAAG,MAAM,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,GAAG,qBAAqB,CAAC,CAAC;IACvG,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IACzC,OAAO,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AACjC,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,OAAoB;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,KAAK,IAAI,KAAK,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACd,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAClC,YAAyB;IAEzB,OAAO,YAAY,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;AAChE,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAC/B,IAAoC,EACpC,cAAqD;IAErD,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC;AAC7E,CAAC"}
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Semantic Data Loader
3
+ * Handles automatic downloading, caching, and parsing of semantic enrichment data.
4
+ * Browser-compatible using storage abstraction layer
5
+ *
6
+ * Data sources:
7
+ * - nam_dict.txt: Name-gender mappings from gender-guesser (~40K names)
8
+ * - cities15000.txt: GeoNames cities with population > 15,000 (~25K cities)
9
+ * - countryInfo.txt: Country names and codes (~250 countries)
10
+ * - admin1CodesASCII.txt: First-level admin divisions (~4K regions)
11
+ *
12
+ * Data is cached using the storage abstraction layer:
13
+ * - Node.js: Platform-specific cache directories
14
+ * - Browser: IndexedDB
15
+ */
16
+ /**
17
+ * Loaded name-gender data: the value type of SemanticData.names, holding a gender label plus optional string-keyed overrides (presumably keyed by locale - confirm against the loader)
18
+ */
19
+ interface NameEntry {
20
+ gender: string;
21
+ localeOverrides?: Record<string, string>;
22
+ }
23
+ /**
24
+ * Loaded location data: the value type of SemanticData.cities, holding a country string (presumably a country code - confirm) and a population figure
25
+ */
26
+ interface CityEntry {
27
+ country: string;
28
+ population: number;
29
+ }
30
+ /** Value type of SemanticData.regions: a country string (presumably a country code - confirm) and the region name */
+ interface RegionEntry {
31
+ country: string;
32
+ name: string;
33
+ }
34
+ /**
35
+ * Semantic data store: four string-keyed lookup maps (names, cities, countries, regions) plus a `loaded` flag; this is the type returned by loadSemanticData, getSemanticData and getSemanticDataSync
36
+ */
37
+ interface SemanticData {
38
+ names: Map<string, NameEntry>;
39
+ cities: Map<string, CityEntry>;
40
+ countries: Map<string, string>;
41
+ regions: Map<string, RegionEntry>;
42
+ loaded: boolean;
43
+ }
44
+ /**
45
+ * Gets the cache directory for semantic data (asynchronous; resolves to a path string)
46
+ * Uses platform-specific cache location (or virtual path in browser)
47
+ */
48
+ export declare function getSemanticDataCacheDir(): Promise<string>;
49
+ /**
50
+ * Gets the path to the data directory; kept as an alias for backwards compatibility (presumably of getSemanticDataCacheDir - the implementation is not visible in this declaration file)
51
+ */
52
+ export declare function getDataDirectory(): Promise<string>;
53
+ /**
54
+ * Semantic data file info: describes one downloadable file; SEMANTIC_DATA_FILES is an array of these
55
+ */
56
+ export interface SemanticDataFileInfo {
57
+ /** Filename */
58
+ filename: string;
59
+ /** Download URL */
60
+ url: string;
61
+ /** Whether file is required */
62
+ required: boolean;
63
+ /** Description */
64
+ description: string;
65
+ /** Approximate size, given as a string rather than a number */
66
+ size: string;
67
+ }
68
+ /**
69
+ * Registry of semantic data files and their download URLs (one SemanticDataFileInfo entry per file)
70
+ */
71
+ export declare const SEMANTIC_DATA_FILES: SemanticDataFileInfo[];
72
+ /**
73
+ * Progress callback for downloads; receives the file name and bytes downloaded so far, while totalBytes and percent may be null (presumably when the total size is unknown - confirm)
74
+ */
75
+ export type SemanticDownloadProgressCallback = (progress: {
76
+ file: string;
77
+ bytesDownloaded: number;
78
+ totalBytes: number | null;
79
+ percent: number | null;
80
+ }) => void;
81
+ /**
82
+ * Checks if semantic data is already downloaded (asynchronous; resolves to a boolean)
83
+ */
84
+ export declare function isSemanticDataDownloaded(): Promise<boolean>;
85
+ /**
86
+ * Checks if the semantic data files are available (NOTE(review): same signature as isSemanticDataDownloaded; how the two differ is not visible in this declaration file)
87
+ * Note: This is now async to work with browser storage
88
+ */
89
+ export declare function isSemanticDataAvailable(): Promise<boolean>;
90
+ /**
91
+ * Downloads all semantic data files; both callbacks are optional (onProgress receives download progress objects, onStatus receives status strings) and the promise resolves to a string (presumably the cache directory - confirm)
92
+ */
93
+ export declare function downloadSemanticData(onProgress?: SemanticDownloadProgressCallback, onStatus?: (status: string) => void): Promise<string>;
94
+ /**
95
+ * Ensures semantic data is available, downloading if needed; all options are optional (autoDownload presumably controls whether a download may be attempted - confirm) and the promise resolves to a string (presumably the data directory - confirm)
96
+ */
97
+ export declare function ensureSemanticData(options?: {
98
+ autoDownload?: boolean;
99
+ onProgress?: SemanticDownloadProgressCallback;
100
+ onStatus?: (status: string) => void;
101
+ }): Promise<string>;
102
+ /**
103
+ * Clears cached semantic data (asynchronous; distinct from the synchronous clearSemanticData, which is documented as clearing the loaded data)
104
+ */
105
+ export declare function clearSemanticDataCache(): Promise<void>;
106
+ /**
107
+ * Gets info about semantic data files: the file descriptors, the cache directory, and a total size given as a string
108
+ */
109
+ export declare function getSemanticDataInfo(): Promise<{
110
+ files: SemanticDataFileInfo[];
111
+ cacheDir: string;
112
+ totalSize: string;
113
+ }>;
114
+ /**
115
+ * Initializes semantic data (downloads if needed, then loads); takes the same optional options shape as ensureSemanticData and resolves with no value
116
+ */
117
+ export declare function initializeSemanticData(options?: {
118
+ autoDownload?: boolean;
119
+ onProgress?: SemanticDownloadProgressCallback;
120
+ onStatus?: (status: string) => void;
121
+ }): Promise<void>;
122
+ /**
123
+ * Loads semantic data from cached files and resolves to a SemanticData store
124
+ * @throws Error if required data files are not available
125
+ */
126
+ export declare function loadSemanticData(): Promise<SemanticData>;
127
+ /**
128
+ * Gets the loaded semantic data (loads if not already loaded); see getSemanticDataSync for the synchronous variant that returns null instead of loading
129
+ * Note: This is now async to work with browser storage
130
+ */
131
+ export declare function getSemanticData(): Promise<SemanticData>;
132
+ /**
133
+ * Gets the loaded semantic data synchronously
134
+ * Returns null if data is not yet loaded, so callers must null-check the result
135
+ * Use this for performance-critical code paths after initial loading
136
+ */
137
+ export declare function getSemanticDataSync(): SemanticData | null;
138
+ /**
139
+ * Clears the loaded semantic data (useful for testing); synchronous, and separate from clearSemanticDataCache, which is documented as clearing the cached data
140
+ */
141
+ export declare function clearSemanticData(): void;
142
+ /**
143
+ * Looks up gender for a name, optionally for a given locale; resolves to a string, or undefined when there is no result (presumably an unknown name - confirm)
144
+ */
145
+ export declare function lookupGender(name: string, locale?: string): Promise<string | undefined>;
146
+ /**
147
+ * Looks up location type (city, country, or region); resolves to undefined when there is no result (presumably an unrecognised location - confirm), and countryCode is optional on a result
148
+ * Priority: country > major city (pop > 500K) > region > other cities
149
+ */
150
+ export declare function lookupLocationType(location: string): Promise<{
151
+ type: "city" | "country" | "region";
152
+ countryCode?: string;
153
+ } | undefined>;
154
+ /**
155
+ * Gets statistics about loaded data (synchronous): one number each for names, cities, countries and regions (presumably entry counts - confirm) plus the loaded flag
156
+ */
157
+ export declare function getDataStats(): {
158
+ names: number;
159
+ cities: number;
160
+ countries: number;
161
+ regions: number;
162
+ loaded: boolean;
163
+ };
164
+ export {};
165
+ //# sourceMappingURL=semantic-data-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic-data-loader.d.ts","sourceRoot":"","sources":["../../src/pipeline/semantic-data-loader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AA6EH;;GAEG;AACH,UAAU,SAAS;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC1C;AAED;;GAEG;AACH,UAAU,SAAS;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,UAAU,WAAW;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC9B,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/B,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC/B,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IAClC,MAAM,EAAE,OAAO,CAAC;CACjB;AAsBD;;;GAGG;AACH,wBAAsB,uBAAuB,IAAI,OAAO,CAAC,MAAM,CAAC,CAG/D;AAED;;GAEG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,MAAM,CAAC,CAExD;AAMD;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,eAAe;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,+BAA+B;IAC/B,QAAQ,EAAE,OAAO,CAAC;IAClB,kBAAkB;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;AAQD;;GAEG;AACH,eAAO,MAAM,mBAAmB,EAAE,oBAAoB,EA6BrD,CAAC;AAMF;;GAEG;AACH,MAAM,MAAM,gCAAgC,GAAG,CAAC,QAAQ,EAAE;IACxD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,CAAC;IACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB,KAAK,IAAI,CAAC;AAEX;;GAEG;AACH,wBAAsB,wBAAwB,IAAI,OAAO,CAAC,OAAO,CAAC,CAgBjE;AAED;;;GAGG;AACH,wBAAsB,uBAAuB,IAAI,OAAO,CAAC,OAAO,CAAC,CAEhE;AAuFD;;GAEG;AACH,wBAAsB,oBAAoB,CACxC,UAAU,CAAC,EAAE,gCAAgC,EAC7C,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,GAClC,OAAO,CAAC,MAAM,CAAC,CA+BjB;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,gCAAgC,CAAC;IAC9C,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC,MAAM,CAAC,CAyBjB;AAED;;GAEG;AACH,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAM5D;AAED;;GAEG;AACH,wBAAsB,mBAAmB,IAAI,OAAO,CAAC;IACnD,KAAK,EAAE,oBAAoB,EAAE,CAAC;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;CACnB,CAAC,CAOD;AAqPD;;GAEG;AAC
H,wBAAsB,sBAAsB,CAC1C,OAAO,GAAE;IACP,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,UAAU,CAAC,EAAE,gCAAgC,CAAC;IAC9C,QAAQ,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC;CAChC,GACL,OAAO,CAAC,IAAI,CAAC,CAMf;AAED;;;GAGG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,YAAY,CAAC,CAqD9D;AAED;;;GAGG;AACH,wBAAsB,eAAe,IAAI,OAAO,CAAC,YAAY,CAAC,CAK7D;AAED;;;;GAIG;AACH,wBAAgB,mBAAmB,IAAI,YAAY,GAAG,IAAI,CAKzD;AAED;;GAEG;AACH,wBAAgB,iBAAiB,IAAI,IAAI,CAExC;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,CAiB7B;AAOD;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,GACf,OAAO,CACR;IAAE,IAAI,EAAE,MAAM,GAAG,SAAS,GAAG,QAAQ,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GAAG,SAAS,CAC1E,CA4BA;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,OAAO,CAAC;CACjB,CAYA"}