rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rehydra Module
|
|
3
|
+
* Main entry point for on-device PII anonymization
|
|
4
|
+
*/
|
|
5
|
+
// Re-export types
|
|
6
|
+
export * from "./types/index.js";
|
|
7
|
+
// Re-export recognizers
|
|
8
|
+
export { RegexRecognizer, RecognizerRegistry, createDefaultRegistry, createRegistry, getGlobalRegistry, emailRecognizer, phoneRecognizer, ibanRecognizer, bicSwiftRecognizer, creditCardRecognizer, ipAddressRecognizer, urlRecognizer, createCustomIdRecognizer, createCaseIdRecognizer, createCustomerIdRecognizer, } from "./recognizers/index.js";
|
|
9
|
+
// Re-export NER components
|
|
10
|
+
export { NERModel, NERModelStub, createNERModel, createNERModelStub, WordPieceTokenizer, loadVocabFromFile, parseVocab, loadRuntime, detectRuntime, getRuntimeType, MODEL_REGISTRY, getModelCacheDir, isModelDownloaded, downloadModel, ensureModel, clearModelCache, listDownloadedModels, } from "./ner/index.js";
|
|
11
|
+
// Re-export pipeline components
|
|
12
|
+
export { prenormalize, resolveEntities, tagEntities, validateOutput, generateTag, parseTag, rehydrate, enrichSemantics, inferGender, classifyLocation, getDatabaseStats, hasName, hasLocation,
|
|
13
|
+
// Semantic data loader exports
|
|
14
|
+
isSemanticDataAvailable, isSemanticDataDownloaded, getSemanticDataCacheDir, getDataDirectory, downloadSemanticData, ensureSemanticData, initializeSemanticData, loadSemanticData, clearSemanticData, clearSemanticDataCache, getSemanticDataInfo, SEMANTIC_DATA_FILES,
|
|
15
|
+
// Title extractor exports
|
|
16
|
+
extractTitle, extractTitlesFromSpans, mergeAdjacentTitleSpans, getTitlesForLanguage, getAllTitles, startsWithTitle, isOnlyTitle, } from "./pipeline/index.js";
|
|
17
|
+
// Re-export crypto
|
|
18
|
+
export { encryptPIIMap, decryptPIIMap, generateKey, deriveKey, generateSalt, InMemoryKeyProvider, ConfigKeyProvider, validateKey, secureCompare, uint8ArrayToBase64, base64ToUint8Array, } from "./crypto/index.js";
|
|
19
|
+
// Re-export storage utilities
|
|
20
|
+
export { getStorageProvider, isNode, isBrowser, resetStorageProvider, setStorageProvider, } from "./utils/storage.js";
|
|
21
|
+
// Re-export path utilities
|
|
22
|
+
export { join as pathJoin, dirname as pathDirname, basename as pathBasename, normalize as pathNormalize, extname as pathExtname, isAbsolute as pathIsAbsolute, } from "./utils/path.js";
|
|
23
|
+
// Main anonymization imports
|
|
24
|
+
import { createDefaultPolicy, } from "./types/index.js";
|
|
25
|
+
/**
 * Layers a partial policy on top of a caller-supplied base policy.
 *
 * `confidenceThresholds` is the only field merged entry by entry: the base map
 * is copied and every entry of the override is written into the copy, so
 * neither input map is mutated. Every other field is taken from `partial`
 * when it is neither null nor undefined, and from `base` otherwise.
 * Unlike the exported mergePolicy, the fallback is the given base rather than
 * the global defaults.
 *
 * @param base - Policy that supplies the fallback value for every field
 * @param partial - Overrides to apply on top of the base
 * @returns A new policy object
 */
function mergePolicyWithBase(base, partial) {
    const thresholdOverrides = partial.confidenceThresholds;
    let confidenceThresholds;
    if (thresholdOverrides === undefined) {
        // No override supplied: share the base map as-is.
        confidenceThresholds = base.confidenceThresholds;
    }
    else {
        confidenceThresholds = new Map(base.confidenceThresholds);
        for (const [type, threshold] of thresholdOverrides) {
            confidenceThresholds.set(type, threshold);
        }
    }
    // Override wins unless it is null/undefined.
    const resolve = (field) => partial[field] ?? base[field];
    return {
        enabledTypes: resolve("enabledTypes"),
        regexEnabledTypes: resolve("regexEnabledTypes"),
        nerEnabledTypes: resolve("nerEnabledTypes"),
        typePriority: resolve("typePriority"),
        confidenceThresholds,
        customIdPatterns: resolve("customIdPatterns"),
        allowlistTerms: resolve("allowlistTerms"),
        denylistPatterns: resolve("denylistPatterns"),
        reuseIdsForRepeatedPII: resolve("reuseIdsForRepeatedPII"),
        enableLeakScan: resolve("enableLeakScan"),
        enableSemanticMasking: resolve("enableSemanticMasking"),
    };
}
|
|
52
|
+
import { createDefaultRegistry, } from "./recognizers/index.js";
|
|
53
|
+
import { NERModelStub, createNERModel, DEFAULT_LABEL_MAP, } from "./ner/index.js";
|
|
54
|
+
import { ensureModel, } from "./ner/model-manager.js";
|
|
55
|
+
import { prenormalize } from "./pipeline/prenormalize.js";
|
|
56
|
+
import { resolveEntities } from "./pipeline/resolver.js";
|
|
57
|
+
import { tagEntities, countEntitiesByType } from "./pipeline/tagger.js";
|
|
58
|
+
import { validateOutput } from "./pipeline/validator.js";
|
|
59
|
+
import { enrichSemantics } from "./pipeline/semantic-enricher.js";
|
|
60
|
+
import { ensureSemanticData, isSemanticDataAvailable, loadSemanticData, } from "./pipeline/semantic-data-loader.js";
|
|
61
|
+
import { extractTitlesFromSpans, mergeAdjacentTitleSpans, } from "./pipeline/title-extractor.js";
|
|
62
|
+
import { encryptPIIMap, generateKey, } from "./crypto/index.js";
|
|
63
|
+
import { getStorageProvider } from "./utils/storage.js";
|
|
64
|
+
/**
 * Anonymizer instance
 *
 * Main class for performing PII anonymization. An instance owns a recognizer
 * registry, an NER model (created during initialization), a default policy,
 * and an optional key provider used when encrypting the PII map.
 */
export class Anonymizer {
    registry;
    nerModel = null;
    nerConfig;
    semanticConfig;
    keyProvider;
    defaultPolicy;
    modelVersion;
    policyVersion;
    initialized = false;
    semanticDataReady = false;
    // Initialization currently in flight, shared by concurrent callers; null when idle.
    initPromise = null;
    /**
     * @param config - Optional settings: `registry`, `keyProvider`,
     *   `defaultPolicy`, `policyVersion`, `modelVersion`, `ner`, `semantic`.
     *   Missing values fall back to a default registry, no key provider, the
     *   default policy, version "1.0.0", NER mode "disabled", and semantic
     *   masking disabled.
     */
    constructor(config = {}) {
        this.registry = config.registry ?? createDefaultRegistry();
        this.keyProvider = config.keyProvider ?? null;
        this.defaultPolicy = config.defaultPolicy ?? createDefaultPolicy();
        this.policyVersion = config.policyVersion ?? "1.0.0";
        // Handle NER configuration
        this.nerConfig = config.ner ?? { mode: "disabled" };
        this.modelVersion = config.modelVersion ?? "1.0.0";
        // Fold per-type NER thresholds into a copy of the default policy's map,
        // so the policy object passed in by the caller is not mutated.
        if (this.nerConfig.thresholds !== undefined) {
            const thresholdsMap = new Map(this.defaultPolicy.confidenceThresholds);
            for (const [type, threshold] of Object.entries(this.nerConfig.thresholds)) {
                if (threshold !== undefined) {
                    thresholdsMap.set(type, threshold);
                }
            }
            this.defaultPolicy = {
                ...this.defaultPolicy,
                confidenceThresholds: thresholdsMap,
            };
        }
        // Handle semantic configuration
        this.semanticConfig = config.semantic ?? { enabled: false };
        // Enabling semantic data on the instance also enables semantic masking
        // in the instance's default policy.
        if (this.semanticConfig.enabled) {
            this.defaultPolicy = {
                ...this.defaultPolicy,
                enableSemanticMasking: true,
            };
        }
    }
    /**
     * Initializes the anonymizer.
     * Sets up the NER model and, when semantic masking is enabled, the
     * semantic data, then marks the instance as initialized.
     *
     * Calls made while an initialization is already running receive the same
     * promise instead of starting a second one; without this, overlapping
     * calls (for example several `anonymize()` calls issued together on a
     * fresh instance) would each create and load their own NER model. After a
     * failed attempt the next call starts over.
     *
     * @returns Promise that settles when initialization has finished
     * @throws Error (as a rejection) when NER mode is 'custom' without
     *   modelPath/vocabPath, or when semantic data is missing and
     *   autoDownload is false
     */
    initialize() {
        if (this.initialized) {
            return Promise.resolve();
        }
        if (this.initPromise === null) {
            const pending = this.runInitialization().finally(() => {
                // Clear only our own marker so a later attempt can start fresh.
                if (this.initPromise === pending) {
                    this.initPromise = null;
                }
            });
            this.initPromise = pending;
        }
        return this.initPromise;
    }
    /**
     * Performs the actual initialization work. Internal: callers should use
     * `initialize()`, which guards against overlapping runs.
     */
    async runInitialization() {
        const mode = this.nerConfig.mode;
        // Handle NER model setup based on mode
        if (mode === "disabled") {
            this.nerModel = new NERModelStub();
        }
        else if (mode === "custom") {
            const { modelPath, vocabPath } = this.nerConfig;
            if (modelPath === undefined ||
                modelPath === "" ||
                vocabPath === undefined ||
                vocabPath === "") {
                throw new Error("NER mode 'custom' requires modelPath and vocabPath");
            }
            this.nerModel = createNERModel({
                modelPath,
                vocabPath,
                modelVersion: this.modelVersion,
            });
        }
        else {
            // 'standard' or 'quantized' - use model manager
            const { modelPath, vocabPath, labelMapPath } = await ensureModel(mode, {
                autoDownload: this.nerConfig.autoDownload ?? true,
                onProgress: this.nerConfig.onDownloadProgress,
                onStatus: this.nerConfig.onStatus,
            });
            // Load label map; any read or parse failure deliberately falls
            // back to DEFAULT_LABEL_MAP.
            let labelMap = DEFAULT_LABEL_MAP;
            try {
                const storage = await getStorageProvider();
                const labelMapContent = await storage.readTextFile(labelMapPath);
                labelMap = JSON.parse(labelMapContent);
            }
            catch {
                // Use default label map
            }
            this.nerModel = createNERModel({
                modelPath,
                vocabPath,
                labelMap,
                modelVersion: this.modelVersion,
            });
        }
        // Load the NER model
        if (!this.nerModel.loaded) {
            this.nerConfig.onStatus?.("Loading NER model...");
            await this.nerModel.load();
            this.nerConfig.onStatus?.("NER model loaded!");
        }
        // Handle semantic data setup if enabled
        if (this.semanticConfig.enabled) {
            const autoDownload = this.semanticConfig.autoDownload ?? true;
            // Check if data is already available
            const dataAvailable = await isSemanticDataAvailable();
            if (dataAvailable) {
                this.semanticConfig.onStatus?.("Semantic data already cached");
            }
            else {
                if (!autoDownload) {
                    throw new Error("Semantic masking is enabled but data files are not available.\n\n" +
                        "To download automatically, use:\n" +
                        " createAnonymizer({ semantic: { enabled: true, autoDownload: true } })\n\n" +
                        "Or disable semantic masking:\n" +
                        " createAnonymizer({ semantic: { enabled: false } })");
                }
                // Download semantic data
                await ensureSemanticData({
                    autoDownload: true,
                    onProgress: this.semanticConfig.onDownloadProgress,
                    onStatus: this.semanticConfig.onStatus,
                });
            }
            // Load data into memory for synchronous access during enrichment
            await loadSemanticData();
            this.semanticDataReady = true;
        }
        // The version reported by the model replaces the configured one.
        this.modelVersion = this.nerModel.version;
        this.initialized = true;
    }
    /**
     * Anonymizes text, replacing PII with placeholder tags.
     * Initializes the instance first when that has not happened yet.
     *
     * @param text - Input text to anonymize
     * @param locale - Optional locale hint (e.g., 'de-DE', 'en-US'); only the
     *   part before the first '-' is passed to semantic enrichment
     * @param policy - Optional policy override, merged over the instance's
     *   default policy
     * @returns Result with `anonymizedText`, `entities` (without their
     *   `original` field), the encrypted `piiMap`, and `stats`
     */
    async anonymize(text, locale, policy) {
        if (!this.initialized) {
            await this.initialize();
        }
        const startTime = performance.now();
        // Merge with the instance defaults (not the global defaults) so that
        // settings applied in the constructor are preserved.
        const effectivePolicy = policy !== undefined
            ? mergePolicyWithBase(this.defaultPolicy, policy)
            : this.defaultPolicy;
        // Step 1: Pre-normalize text
        const normalizedText = prenormalize(text);
        // Step 2: Run regex recognizers
        const regexMatches = this.registry.findAll(normalizedText, effectivePolicy);
        // Step 3: Run NER model
        const nerResult = await this.nerModel.predict(normalizedText, effectivePolicy);
        const nerMatches = nerResult.spans;
        // Step 4: Resolve and merge entities
        const resolvedMatches = resolveEntities(regexMatches, nerMatches, effectivePolicy, normalizedText);
        let enrichedMatches = resolvedMatches;
        if (effectivePolicy.enableSemanticMasking) {
            // Step 4.5: Merge adjacent title+name PERSON spans
            // This fixes NER models that split "Mrs. Smith" into two entities
            const mergedMatches = mergeAdjacentTitleSpans(resolvedMatches, normalizedText);
            // Step 4.6: Extract titles from PERSON entities
            // This strips honorific titles (Dr., Mrs., etc.) so they remain visible for translation
            const titleExtractedMatches = extractTitlesFromSpans(mergedMatches, normalizedText);
            // Step 4.7: Enrich with semantic attributes
            // This adds gender for PERSON and scope for LOCATION entities
            enrichedMatches = enrichSemantics(titleExtractedMatches, {
                locale: locale !== undefined ? locale.split("-")[0] : undefined, // Extract language code
            });
        }
        // Step 5: Tag entities and build PII map
        const { anonymizedText, entities, piiMap } = tagEntities(normalizedText, enrichedMatches, effectivePolicy);
        // Step 6: Validate output
        const validation = validateOutput(anonymizedText, entities, Array.from(piiMap.keys()), effectivePolicy);
        if (!validation.valid) {
            // Log only code and message of each error (don't expose raw PII)
            const safeErrors = validation.errors.map((e) => ({
                code: e.code,
                message: e.message,
            }));
            // eslint-disable-next-line no-console
            console.warn("Validation warnings:", safeErrors);
        }
        // Step 7: Encrypt PII map
        // NOTE(review): without a keyProvider the key comes from generateKey()
        // and is not part of the returned result - confirm whether callers are
        // expected to be able to decrypt the map in that configuration.
        const encryptionKey = this.keyProvider !== null
            ? await this.keyProvider.getKey()
            : generateKey();
        const encryptedPiiMap = await encryptPIIMap(piiMap, encryptionKey);
        // Step 8: Build stats
        const endTime = performance.now();
        const stats = {
            countsByType: countEntitiesByType(entities),
            totalEntities: entities.length,
            modelVersion: this.modelVersion,
            policyVersion: this.policyVersion,
            processingTimeMs: endTime - startTime,
            leakScanPassed: validation.leakScanPassed,
        };
        // Step 9: Build result (without original text in entities)
        const safeEntities = entities.map(({ original: _original, ...rest }) => rest);
        return {
            anonymizedText,
            entities: safeEntities,
            piiMap: encryptedPiiMap,
            stats,
        };
    }
    /**
     * Disposes of resources.
     * Disposes the NER model when one exists and marks the instance as
     * uninitialized, so a later `initialize()` runs the setup again.
     */
    async dispose() {
        if (this.nerModel) {
            await this.nerModel.dispose();
        }
        this.initialized = false;
    }
    /**
     * Gets the recognizer registry
     */
    getRegistry() {
        return this.registry;
    }
    /**
     * Gets the NER model (null until initialization has created one)
     */
    getNERModel() {
        return this.nerModel;
    }
    /**
     * Whether the anonymizer is initialized
     */
    get isInitialized() {
        return this.initialized;
    }
}
|
|
303
|
+
/**
 * Factory for Anonymizer instances.
 * Passes `config` straight to the Anonymizer constructor; the instance is
 * returned without being initialized.
 *
 * @example
 * ```typescript
 * // Regex-only (no NER)
 * const anonymizer = createAnonymizer();
 *
 * // With NER (auto-downloads model on first use)
 * const anonymizer = createAnonymizer({
 *   ner: { mode: 'quantized' }
 * });
 *
 * // With NER and progress callback
 * const anonymizer = createAnonymizer({
 *   ner: {
 *     mode: 'standard',
 *     onStatus: (status) => console.log(status),
 *     onDownloadProgress: (p) => console.log(`${p.file}: ${p.percent}%`)
 *   }
 * });
 * ```
 *
 * @param config - Optional anonymizer configuration
 * @returns A new Anonymizer
 */
export function createAnonymizer(config) {
    const instance = new Anonymizer(config);
    return instance;
}
|
|
329
|
+
/**
 * Convenience function for one-off anonymization.
 * Creates a temporary anonymizer with default settings (regex-only), runs a
 * single anonymization and disposes the anonymizer afterwards.
 *
 * Initialization happens inside the try block so that the anonymizer is
 * disposed even when initialization itself fails.
 *
 * @param text - Input text to anonymize
 * @param locale - Optional locale hint (e.g., 'de-DE', 'en-US')
 * @param policy - Optional policy override
 * @returns The anonymization result of the temporary anonymizer
 */
export async function anonymize(text, locale, policy) {
    const anonymizer = createAnonymizer();
    try {
        await anonymizer.initialize();
        return await anonymizer.anonymize(text, locale, policy);
    }
    finally {
        await anonymizer.dispose();
    }
}
|
|
343
|
+
/**
 * Quick regex-only anonymization (no NER, faster).
 * Copies the given policy, replaces its `nerEnabledTypes` with an empty Set
 * and delegates to `anonymize` without a locale.
 *
 * @param text - Input text to anonymize
 * @param policy - Optional policy override; its `nerEnabledTypes` is ignored
 * @returns The result of `anonymize` for the adjusted policy
 */
export async function anonymizeRegexOnly(text, policy) {
    // An empty set disables every NER type.
    const noNerTypes = new Set();
    const regexOnlyPolicy = { ...policy, nerEnabledTypes: noNerTypes };
    return anonymize(text, undefined, regexOnlyPolicy);
}
|
|
354
|
+
/**
 * Full anonymization with NER.
 * Creates a temporary anonymizer with the given NER configuration, runs a
 * single anonymization and disposes the anonymizer afterwards. The NER mode
 * defaults to 'quantized' when `nerConfig` is omitted or its `mode` is
 * null/undefined.
 *
 * Initialization happens inside the try block so that the anonymizer is
 * disposed even when initialization fails.
 *
 * @example
 * ```typescript
 * const result = await anonymizeWithNER(
 *   'Contact John Smith at john@example.com',
 *   {
 *     mode: 'quantized',
 *     onStatus: console.log
 *   }
 * );
 * ```
 *
 * @param text - Input text to anonymize
 * @param nerConfig - NER configuration (mode, callbacks, ...)
 * @param policy - Optional policy override
 * @returns The anonymization result of the temporary anonymizer
 */
export async function anonymizeWithNER(text, nerConfig, policy) {
    const anonymizer = createAnonymizer({
        ner: {
            ...nerConfig,
            // Assigned after the spread so an explicit `mode: undefined` in
            // nerConfig cannot overwrite the default.
            mode: nerConfig?.mode ?? "quantized",
        },
    });
    try {
        await anonymizer.initialize();
        return await anonymizer.anonymize(text, undefined, policy);
    }
    finally {
        await anonymizer.dispose();
    }
}
|
|
384
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,kBAAkB;AAClB,cAAc,kBAAkB,CAAC;AAEjC,wBAAwB;AACxB,OAAO,EAEL,eAAe,EACf,kBAAkB,EAClB,qBAAqB,EACrB,cAAc,EACd,iBAAiB,EACjB,eAAe,EACf,eAAe,EACf,cAAc,EACd,kBAAkB,EAClB,oBAAoB,EACpB,mBAAmB,EACnB,aAAa,EACb,wBAAwB,EACxB,sBAAsB,EACtB,0BAA0B,GAC3B,MAAM,wBAAwB,CAAC;AAEhC,2BAA2B;AAC3B,OAAO,EACL,QAAQ,EACR,YAAY,EACZ,cAAc,EACd,kBAAkB,EAClB,kBAAkB,EAClB,iBAAiB,EACjB,UAAU,EACV,WAAW,EACX,aAAa,EACb,cAAc,EAMd,cAAc,EACd,gBAAgB,EAChB,iBAAiB,EACjB,aAAa,EACb,WAAW,EACX,eAAe,EACf,oBAAoB,GACrB,MAAM,gBAAgB,CAAC;AAExB,gCAAgC;AAChC,OAAO,EACL,YAAY,EACZ,eAAe,EACf,WAAW,EACX,cAAc,EACd,WAAW,EACX,QAAQ,EACR,SAAS,EACT,eAAe,EACf,WAAW,EACX,gBAAgB,EAChB,gBAAgB,EAChB,OAAO,EACP,WAAW;AACX,+BAA+B;AAC/B,uBAAuB,EACvB,wBAAwB,EACxB,uBAAuB,EACvB,gBAAgB,EAChB,oBAAoB,EACpB,kBAAkB,EAClB,sBAAsB,EACtB,gBAAgB,EAChB,iBAAiB,EACjB,sBAAsB,EACtB,mBAAmB,EACnB,mBAAmB;AACnB,0BAA0B;AAC1B,YAAY,EACZ,sBAAsB,EACtB,uBAAuB,EACvB,oBAAoB,EACpB,YAAY,EACZ,eAAe,EACf,WAAW,GAMZ,MAAM,qBAAqB,CAAC;AAE7B,mBAAmB;AACnB,OAAO,EACL,aAAa,EACb,aAAa,EACb,WAAW,EACX,SAAS,EACT,YAAY,EAEZ,mBAAmB,EACnB,iBAAiB,EACjB,WAAW,EACX,aAAa,EACb,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,mBAAmB,CAAC;AAE3B,8BAA8B;AAC9B,OAAO,EACL,kBAAkB,EAClB,MAAM,EACN,SAAS,EACT,oBAAoB,EACpB,kBAAkB,GAEnB,MAAM,oBAAoB,CAAC;AAE5B,2BAA2B;AAC3B,OAAO,EACL,IAAI,IAAI,QAAQ,EAChB,OAAO,IAAI,WAAW,EACtB,QAAQ,IAAI,YAAY,EACxB,SAAS,IAAI,aAAa,EAC1B,OAAO,IAAI,WAAW,EACtB,UAAU,IAAI,cAAc,GAC7B,MAAM,iBAAiB,CAAC;AAEzB,6BAA6B;AAC7B,OAAO,EAQL,mBAAmB,GACpB,MAAM,kBAAkB,CAAC;AAE1B;;;GAGG;AACH,SAAS,mBAAmB,CAC1B,IAAyB,EACzB,OAAqC;IAErC,sCAAsC;IACtC,IAAI,oBAAoB,GAAG,IAAI,CAAC,oBAAoB,CAAC;IACrD,IAAI,OAAO,CAAC,oBAAoB,KAAK,SAAS,EAAE,CAAC;QAC/C,oBAAoB,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QAC1D,KAAK,MAAM,CAAC,IAAI,EAAE,SAAS,CAAC,IAAI,OAAO,CAAC,oBAAoB,EAAE,CAAC;YAC7D,oBAAoB,CAAC,GAAG,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,OAAO;QACL,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,IAAI,CAAC,YAAY;QACvD,iBAAiB,EAAE,OAAO
,CAAC,iBAAiB,IAAI,IAAI,CAAC,iBAAiB;QACtE,eAAe,EAAE,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC,eAAe;QAChE,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,IAAI,CAAC,YAAY;QACvD,oBAAoB;QACpB,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,gBAAgB;QACnE,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc;QAC7D,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,gBAAgB;QACnE,sBAAsB,EACpB,OAAO,CAAC,sBAAsB,IAAI,IAAI,CAAC,sBAAsB;QAC/D,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc;QAC7D,qBAAqB,EACnB,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAC,qBAAqB;KAC9D,CAAC;AACJ,CAAC;AACD,OAAO,EACL,qBAAqB,GAEtB,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAEL,YAAY,EACZ,cAAc,EACd,iBAAiB,GAClB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EAEL,WAAW,GAEZ,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AACzD,OAAO,EAAE,eAAe,EAAE,MAAM,iCAAiC,CAAC;AAClE,OAAO,EACL,kBAAkB,EAClB,uBAAuB,EACvB,gBAAgB,GACjB,MAAM,oCAAoC,CAAC;AAC5C,OAAO,EACL,sBAAsB,EACtB,uBAAuB,GACxB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EACL,aAAa,EACb,WAAW,GAEZ,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAkFxD;;;GAGG;AACH,MAAM,OAAO,UAAU;IACb,QAAQ,CAAqB;IAC7B,QAAQ,GAAqB,IAAI,CAAC;IAClC,SAAS,CAAY;IACrB,cAAc,CAAiB;IAC/B,WAAW,CAAqB;IAChC,aAAa,CAAsB;IACnC,YAAY,CAAS;IACrB,aAAa,CAAS;IACtB,WAAW,GAAG,KAAK,CAAC;IACpB,iBAAiB,GAAG,KAAK,CAAC;IAElC,YAAY,SAA2B,EAAE;QACvC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,IAAI,qBAAqB,EAAE,CAAC;QAC3D,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,WAAW,IAAI,IAAI,CAAC;QAC9C,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,IAAI,mBAAmB,EAAE,CAAC;QACnE,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,IAAI,OAAO,CAAC;QAErD,2BAA2B;QAC3B,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,GAAG,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACpD,IAAI,CAAC,YAAY,GAAG,MAAM,CAAC,YAAY,IAAI,OAAO,CAAC;QAEnD,uDAAuD;QACvD,IAAI,IAAI,CAAC,SAAS,CAAC,UAAU,KAAK,SAAS,EAAE,CAAC;YAC5C,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,aAAa,CAAC,oBAAoB,CAAC,CAAC;YACvE,KAAK,MAAM,CAAC,IAAI,EAAE,SAAS,CAAC,IAAI,MAAM,CAAC,OAAO,CAC5C,IAAI,CAAC,SAAS,CAAC,UAAU,CAC1B,EAAE,CAAC;gB
ACF,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;oBAC5B,aAAa,CAAC,GAAG,CAAC,IAAe,EAAE,SAAS,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC;YACD,IAAI,CAAC,aAAa,GAAG;gBACnB,GAAG,IAAI,CAAC,aAAa;gBACrB,oBAAoB,EAAE,aAAa;aACpC,CAAC;QACJ,CAAC;QAED,gCAAgC;QAChC,IAAI,CAAC,cAAc,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;QAE5D,+DAA+D;QAC/D,IAAI,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,CAAC;YAChC,IAAI,CAAC,aAAa,GAAG;gBACnB,GAAG,IAAI,CAAC,aAAa;gBACrB,qBAAqB,EAAE,IAAI;aAC5B,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,UAAU;QACd,IAAI,IAAI,CAAC,WAAW;YAAE,OAAO;QAE7B,uCAAuC;QACvC,IAAI,IAAI,CAAC,SAAS,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YACvC,IAAI,CAAC,QAAQ,GAAG,IAAI,YAAY,EAAE,CAAC;QACrC,CAAC;aAAM,IAAI,IAAI,CAAC,SAAS,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC5C,IACE,IAAI,CAAC,SAAS,CAAC,SAAS,KAAK,SAAS;gBACtC,IAAI,CAAC,SAAS,CAAC,SAAS,KAAK,EAAE;gBAC/B,IAAI,CAAC,SAAS,CAAC,SAAS,KAAK,SAAS;gBACtC,IAAI,CAAC,SAAS,CAAC,SAAS,KAAK,EAAE,EAC/B,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,oDAAoD,CAAC,CAAC;YACxE,CAAC;YAED,IAAI,CAAC,QAAQ,GAAG,cAAc,CAAC;gBAC7B,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS;gBACnC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS;gBACnC,YAAY,EAAE,IAAI,CAAC,YAAY;aAChC,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,gDAAgD;YAChD,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,MAAM,WAAW,CAC9D,IAAI,CAAC,SAAS,CAAC,IAAI,EACnB;gBACE,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,YAAY,IAAI,IAAI;gBACjD,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,kBAAkB;gBAC7C,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ;aAClC,CACF,CAAC;YAEF,iBAAiB;YACjB,IAAI,QAAQ,GAAG,iBAAiB,CAAC;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,kBAAkB,EAAE,CAAC;gBAC3C,MAAM,eAAe,GAAG,MAAM,OAAO,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;gBACjE,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAa,CAAC;YACrD,CAAC;YAAC,MAAM,CAAC;gBACP,wBAAwB;YAC1B,CAAC;YAED,IAAI,CAAC,QAAQ,GAAG,cAAc,CAAC;gBAC7B,SAAS;gBACT,SAAS;gBACT,QAAQ;gBACR,YAAY,EAAE,IAAI,CAAC,YAAY;aAChC,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;YAC1B,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC,sBAAsB,CAAC,CAAC;YAClD,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YAC3B,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC,mBA
AmB,CAAC,CAAC;QACjD,CAAC;QAED,wCAAwC;QACxC,IAAI,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,CAAC;YAChC,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,CAAC,YAAY,IAAI,IAAI,CAAC;YAE9D,qCAAqC;YACrC,MAAM,aAAa,GAAG,MAAM,uBAAuB,EAAE,CAAC;YACtD,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,IAAI,CAAC,YAAY,EAAE,CAAC;oBAClB,MAAM,IAAI,KAAK,CACb,mEAAmE;wBACjE,mCAAmC;wBACnC,6EAA6E;wBAC7E,gCAAgC;wBAChC,sDAAsD,CACzD,CAAC;gBACJ,CAAC;gBAED,yBAAyB;gBACzB,MAAM,kBAAkB,CAAC;oBACvB,YAAY,EAAE,IAAI;oBAClB,UAAU,EAAE,IAAI,CAAC,cAAc,CAAC,kBAAkB;oBAClD,QAAQ,EAAE,IAAI,CAAC,cAAc,CAAC,QAAQ;iBACvC,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,CAAC,8BAA8B,CAAC,CAAC;YACjE,CAAC;YAED,iEAAiE;YACjE,MAAM,gBAAgB,EAAE,CAAC;YACzB,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAChC,CAAC;QAED,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC;QAC1C,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;IAC1B,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,SAAS,CACb,IAAY,EACZ,MAAe,EACf,MAAqC;QAErC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QAC1B,CAAC;QAED,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,4DAA4D;QAC5D,6DAA6D;QAC7D,2DAA2D;QAC3D,MAAM,eAAe,GACnB,MAAM,KAAK,SAAS;YAClB,CAAC,CAAC,mBAAmB,CAAC,IAAI,CAAC,aAAa,EAAE,MAAM,CAAC;YACjD,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC;QAEzB,6BAA6B;QAC7B,MAAM,cAAc,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;QAE1C,gCAAgC;QAChC,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;QAE5E,wBAAwB;QACxB,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,QAAS,CAAC,OAAO,CAC5C,cAAc,EACd,eAAe,CAChB,CAAC;QACF,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC;QAEnC,qCAAqC;QACrC,MAAM,eAAe,GAAG,eAAe,CACrC,YAAY,EACZ,UAAU,EACV,eAAe,EACf,cAAc,CACf,CAAC;QAEF,iFAAiF;QACjF,kEAAkE;QAClE,MAAM,aAAa,GACjB,eAAe,CAAC,qBAAqB,KAAK,IAAI;YAC5C,CAAC,CAAC,uBAAuB,CAAC,eAAe,EAAE,cAAc,CAAC;YAC1D,CAAC,CAAC,eAAe,CAAC;QAEtB,8EAA8E;QAC9E,wFAAwF;QACxF,MAAM,qBAAqB,GAAG,eAAe,CAAC,qBAAqB;YACjE,CAAC,CAAC,sBAAsB,CAAC,aAAa,EAAE,cAAc,CAAC;YACvD,CAAC,CAAC,aAAa,CAAC;QAElB,yDAAyD;QACzD,8DAA8D;QAC9D,MAAM,eAAe,GAAG,eAAe,CAAC,qBAAqB;YAC3D,CAAC,CAAC,eAAe,CAAC,qBAAqB,EAAE;gBACrC,MAAM,EAAE,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,MA
AM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,wBAAwB;aAC1F,CAAC;YACJ,CAAC,CAAC,qBAAqB,CAAC;QAE1B,yCAAyC;QACzC,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,WAAW,CACtD,cAAc,EACd,eAAe,EACf,eAAe,CAChB,CAAC;QAEF,0BAA0B;QAC1B,MAAM,UAAU,GAAG,cAAc,CAC/B,cAAc,EACd,QAAQ,EACR,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,EACzB,eAAe,CAChB,CAAC;QAEF,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;YACtB,mDAAmD;YACnD,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC/C,IAAI,EAAE,CAAC,CAAC,IAAI;gBACZ,OAAO,EAAE,CAAC,CAAC,OAAO;aACnB,CAAC,CAAC,CAAC;YACJ,sCAAsC;YACtC,OAAO,CAAC,IAAI,CAAC,sBAAsB,EAAE,UAAU,CAAC,CAAC;QACnD,CAAC;QAED,0BAA0B;QAC1B,MAAM,aAAa,GACjB,IAAI,CAAC,WAAW,KAAK,IAAI;YACvB,CAAC,CAAC,MAAM,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE;YACjC,CAAC,CAAC,WAAW,EAAE,CAAC;QAEpB,MAAM,eAAe,GAAG,MAAM,aAAa,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;QAEnE,sBAAsB;QACtB,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAClC,MAAM,KAAK,GAAuB;YAChC,YAAY,EAAE,mBAAmB,CAAC,QAAQ,CAAC;YAC3C,aAAa,EAAE,QAAQ,CAAC,MAAM;YAC9B,YAAY,EAAE,IAAI,CAAC,YAAY;YAC/B,aAAa,EAAE,IAAI,CAAC,aAAa;YACjC,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,cAAc,EAAE,UAAU,CAAC,cAAc;SAC1C,CAAC;QAEF,2DAA2D;QAC3D,MAAM,YAAY,GAAuC,QAAQ,CAAC,GAAG,CACnE,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAC3C,CAAC;QAEF,OAAO;YACL,cAAc;YACd,QAAQ,EAAE,YAAY;YACtB,MAAM,EAAE,eAAe;YACvB,KAAK;SACN,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;QAChC,CAAC;QACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,WAAW;QACT,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,WAAW;QACT,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;CACF;AAED;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,MAAM,UAAU,gBAAgB,CAAC,MAAyB;IACxD,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;AAChC,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,MAAe,EACf,MAAqC;IAErC,MAAM,UAAU,GAAG,gBAAgB,EAAE,CAAC;IACtC,MAAM,UAAU,CAAC,UAAU,EAAE,CAAC;IAE9B,IAA
I,CAAC;QACH,OAAO,MAAM,UAAU,CAAC,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1D,CAAC;YAAS,CAAC;QACT,MAAM,UAAU,CAAC,OAAO,EAAE,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,IAAY,EACZ,MAAqC;IAErC,kCAAkC;IAClC,MAAM,eAAe,GAAiC;QACpD,GAAG,MAAM;QACT,eAAe,EAAE,IAAI,GAAG,EAAE,EAAE,wBAAwB;KACrD,CAAC;IAEF,OAAO,SAAS,CAAC,IAAI,EAAE,SAAS,EAAE,eAAe,CAAC,CAAC;AACrD,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,IAAY,EACZ,SAAwE,EACxE,MAAqC;IAErC,MAAM,UAAU,GAAG,gBAAgB,CAAC;QAClC,GAAG,EAAE;YACH,IAAI,EAAE,SAAS,CAAC,IAAI,IAAI,WAAW;YACnC,GAAG,SAAS;SACb;KACF,CAAC,CAAC;IAEH,MAAM,UAAU,CAAC,UAAU,EAAE,CAAC;IAE9B,IAAI,CAAC;QACH,OAAO,MAAM,UAAU,CAAC,SAAS,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;IAC7D,CAAC;YAAS,CAAC;QACT,MAAM,UAAU,CAAC,OAAO,EAAE,CAAC;IAC7B,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BIO Tag Decoder
|
|
3
|
+
* Converts BIO-tagged token sequences to entity spans
|
|
4
|
+
*/
|
|
5
|
+
import { SpanMatch } from '../types/index.js';
|
|
6
|
+
import type { Token } from './tokenizer.js';
|
|
7
|
+
/**
|
|
8
|
+
* BIO tag types
|
|
9
|
+
*/
|
|
10
|
+
export declare enum BIOTag {
|
|
11
|
+
/** Beginning of an entity */
|
|
12
|
+
B = "B",
|
|
13
|
+
/** Inside an entity (continuation) */
|
|
14
|
+
I = "I",
|
|
15
|
+
/** Outside any entity */
|
|
16
|
+
O = "O"
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Parsed BIO label
|
|
20
|
+
*/
|
|
21
|
+
export interface ParsedBIOLabel {
|
|
22
|
+
/** BIO tag type */
|
|
23
|
+
tag: BIOTag;
|
|
24
|
+
/** Entity type (null for O tag) */
|
|
25
|
+
entityType: string | null;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Raw entity span from NER (before conversion to SpanMatch)
|
|
29
|
+
*/
|
|
30
|
+
export interface RawNEREntity {
|
|
31
|
+
/** Entity type string from model */
|
|
32
|
+
type: string;
|
|
33
|
+
/** Start character offset */
|
|
34
|
+
start: number;
|
|
35
|
+
/** End character offset */
|
|
36
|
+
end: number;
|
|
37
|
+
/** Combined confidence score */
|
|
38
|
+
confidence: number;
|
|
39
|
+
/** Raw text */
|
|
40
|
+
text: string;
|
|
41
|
+
/** Token indices that make up this entity */
|
|
42
|
+
tokenIndices: number[];
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Parses a BIO label string (e.g., "B-PER", "I-ORG", "O")
|
|
46
|
+
*/
|
|
47
|
+
export declare function parseBIOLabel(label: string): ParsedBIOLabel;
|
|
48
|
+
/**
|
|
49
|
+
* Decodes BIO-tagged tokens into entity spans
|
|
50
|
+
*/
|
|
51
|
+
export declare function decodeBIOTags(tokens: Token[], labels: string[], confidences: number[], originalText: string): RawNEREntity[];
|
|
52
|
+
/**
|
|
53
|
+
* Converts raw NER entities to SpanMatch format
|
|
54
|
+
*/
|
|
55
|
+
export declare function convertToSpanMatches(rawEntities: RawNEREntity[], confidenceThreshold?: number): SpanMatch[];
|
|
56
|
+
/**
|
|
57
|
+
* Post-processes NER spans to clean up boundaries
|
|
58
|
+
*/
|
|
59
|
+
export declare function cleanupSpanBoundaries(spans: SpanMatch[], originalText: string): SpanMatch[];
|
|
60
|
+
/**
|
|
61
|
+
* Merges adjacent spans of the same type
|
|
62
|
+
*/
|
|
63
|
+
export declare function mergeAdjacentSpans(spans: SpanMatch[], originalText: string, maxGap?: number): SpanMatch[];
|
|
64
|
+
//# sourceMappingURL=bio-decoder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bio-decoder.d.ts","sourceRoot":"","sources":["../../src/ner/bio-decoder.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAW,SAAS,EAAmB,MAAM,mBAAmB,CAAC;AAExE,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAE5C;;GAEG;AACH,oBAAY,MAAM;IAChB,6BAA6B;IAC7B,CAAC,MAAM;IACP,sCAAsC;IACtC,CAAC,MAAM;IACP,yBAAyB;IACzB,CAAC,MAAM;CACR;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,mCAAmC;IACnC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,oCAAoC;IACpC,IAAI,EAAE,MAAM,CAAC;IACb,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,2BAA2B;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,gCAAgC;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe;IACf,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;CACxB;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,cAAc,CAyB3D;AAED;;GAEG;AACH,wBAAgB,aAAa,CAC3B,MAAM,EAAE,KAAK,EAAE,EACf,MAAM,EAAE,MAAM,EAAE,EAChB,WAAW,EAAE,MAAM,EAAE,EACrB,YAAY,EAAE,MAAM,GACnB,YAAY,EAAE,CAkFhB;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,WAAW,EAAE,YAAY,EAAE,EAC3B,mBAAmB,GAAE,MAAY,GAChC,SAAS,EAAE,CA0Bb;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAoCb;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,SAAS,EAAE,EAClB,YAAY,EAAE,MAAM,EACpB,MAAM,GAAE,MAAU,GACjB,SAAS,EAAE,CAiCb"}
|