@elanlanguages/bridge-anonymization 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +382 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +100 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +163 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +173 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +294 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +102 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +253 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +240 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +45 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +99 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +140 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +341 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +9 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +9 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +238 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +74 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +169 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +134 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +69 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +8 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +8 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/package.json +62 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity Resolver
|
|
3
|
+
* Merges, deduplicates, and resolves overlapping entity detections
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, DetectionSource, DEFAULT_TYPE_PRIORITY, } from '../types/index.js';
|
|
6
|
+
import { spansOverlap, spanLength, sortSpansByPosition } from '../utils/offsets.js';
|
|
7
|
+
/**
 * Strategies for picking a winner when two detected spans overlap.
 * Every member's value is the same string as its name.
 */
export var OverlapStrategy;
(function (strategy) {
    /** Prefer regex detections over NER detections */
    strategy.REGEX_PRIORITY = "REGEX_PRIORITY";
    /** Prefer the span that covers more text */
    strategy.LONGER_SPAN = "LONGER_SPAN";
    /** Prefer the span with the greater confidence score */
    strategy.HIGHER_CONFIDENCE = "HIGHER_CONFIDENCE";
    /** Prefer the span whose type ranks higher in the policy's type priority */
    strategy.TYPE_PRIORITY = "TYPE_PRIORITY";
})(OverlapStrategy || (OverlapStrategy = {}));
|
|
21
|
+
/**
 * Baseline resolver settings.
 * `resolveEntities` spreads caller options over this object, so any field the
 * caller leaves out keeps the value defined here.
 */
export const DEFAULT_RESOLVER_CONFIG = {
    // Strategy consulted by resolveOverlap when two spans collide
    overlapStrategy: OverlapStrategy.REGEX_PRIORITY,
    // When true, a regex-sourced span beats a non-regex span outright
    regexPriority: true,
    minConfidence: 0.5,
};
|
|
29
|
+
/**
 * Combines regex and NER detections into one position-sorted list.
 *
 * Pipeline: policy filter -> allowlist filter -> merge -> overlap removal ->
 * denylist additions -> exact de-duplication -> sort by position.
 *
 * NOTE(review): `minConfidence` is part of the merged config, but nothing
 * visible in this file reads it; thresholds come from the policy in
 * filterByPolicy. Confirm whether that is intended.
 *
 * @param regexMatches - spans produced by the regex recognizers
 * @param nerMatches - spans produced by the NER model
 * @param policy - anonymization policy (enabled types, thresholds, allow/deny lists)
 * @param originalText - the text the spans refer to
 * @param config - optional overrides for DEFAULT_RESOLVER_CONFIG
 * @returns the resolved spans, sorted by sortSpansByPosition
 */
export function resolveEntities(regexMatches, nerMatches, policy, originalText, config = {}) {
    const settings = { ...DEFAULT_RESOLVER_CONFIG, ...config };
    // Policy filter followed by allowlist filter, applied to each source separately
    const screen = (candidates) => applyAllowlist(filterByPolicy(candidates, policy), policy, originalText);
    const candidates = [...screen(regexMatches), ...screen(nerMatches)];
    // Settle overlaps first, then force-include denylist hits
    const nonOverlapping = removeOverlaps(candidates, policy, settings);
    const withForcedHits = applyDenylist(nonOverlapping, policy, originalText);
    return sortSpansByPosition(deduplicateExact(withForcedHits));
}
|
|
51
|
+
/**
 * Keeps only the matches the policy allows.
 *
 * A match survives when its type is in `policy.enabledTypes` and its
 * confidence is not below the type's entry in `policy.confidenceThresholds`
 * (0.5 when the type has no entry).
 */
function filterByPolicy(matches, policy) {
    const passesPolicy = ({ type, confidence }) => {
        if (!policy.enabledTypes.has(type)) {
            return false;
        }
        const minimum = policy.confidenceThresholds.get(type) ?? 0.5;
        return !(confidence < minimum);
    };
    return matches.filter((candidate) => passesPolicy(candidate));
}
|
|
68
|
+
/**
 * Drops matches whose text is an allowlisted (known non-PII) term.
 *
 * The comparison ignores case and surrounding whitespace on BOTH sides: the
 * match text and every allowlist term are lowercased and trimmed before being
 * compared. Previously only the match text was normalized, so a term stored
 * with uppercase letters or padding could never match.
 *
 * @param matches - candidate spans; each must expose a string `text`
 * @param policy - policy whose `allowlistTerms` is a Set of terms
 * @param _originalText - unused; kept for signature compatibility
 * @returns `matches` itself when the allowlist is empty, otherwise a filtered copy
 */
function applyAllowlist(matches, policy, _originalText) {
    if (policy.allowlistTerms.size === 0) {
        return matches;
    }
    const normalize = (value) => value.toLowerCase().trim();
    const normalizedTerms = new Set();
    for (const term of policy.allowlistTerms) {
        // Non-string entries are kept as-is so they behave exactly as before
        normalizedTerms.add(typeof term === "string" ? normalize(term) : term);
    }
    return matches.filter((match) => !normalizedTerms.has(normalize(match.text)));
}
|
|
80
|
+
/**
 * Adds a span for every denylist pattern hit that no existing match fully covers.
 *
 * Each pattern is cloned into a fresh global regex before scanning. This keeps
 * the caller's RegExp untouched and guarantees the scan starts at offset 0;
 * `matchAll` would otherwise start from the pattern's current `lastIndex`.
 * Zero-length hits (e.g. from `/\d*​/`) are ignored because an empty span
 * carries no text to anonymize.
 *
 * Added spans always get type `PIIType.EMAIL`, confidence 1.0 and source
 * `DetectionSource.REGEX`.
 *
 * NOTE(review): a hit that only partially overlaps an existing match is still
 * added, so the result may contain overlapping spans.
 *
 * @param matches - spans already resolved
 * @param policy - policy whose `denylistPatterns` is an array of RegExp
 * @param originalText - text to scan
 * @returns `matches` itself when there are no patterns, otherwise a new array
 *          with the existing matches followed by the added denylist spans
 */
function applyDenylist(matches, policy, originalText) {
    if (policy.denylistPatterns.length === 0) {
        return matches;
    }
    const denylistMatches = [];
    for (const pattern of policy.denylistPatterns) {
        const flags = pattern.flags.includes('g') ? pattern.flags : pattern.flags + 'g';
        // Fresh instance: lastIndex is 0 and the caller's regex is never mutated
        const scanner = new RegExp(pattern.source, flags);
        for (const hit of originalText.matchAll(scanner)) {
            const hitText = hit[0];
            if (hit.index === undefined || hitText.length === 0) {
                continue;
            }
            const start = hit.index;
            const end = start + hitText.length;
            // Skip hits that an existing match already fully contains
            const alreadyCovered = matches.some((existing) => existing.start <= start && existing.end >= end);
            if (!alreadyCovered) {
                denylistMatches.push({
                    type: PIIType.EMAIL, // Default type for denylist; could be configurable
                    start,
                    end,
                    confidence: 1.0,
                    source: DetectionSource.REGEX,
                    text: hitText,
                });
            }
        }
    }
    return [...matches, ...denylistMatches];
}
|
|
112
|
+
/**
 * Reduces a list of spans to one winner per overlap.
 *
 * Spans are visited in sortSpansByPosition order. A span with no overlap
 * against the kept list is appended. Otherwise it is compared, via
 * resolveOverlap, with the first kept span it overlaps and takes that slot
 * only if it wins.
 *
 * @returns the input array unchanged when it has at most one element,
 *          otherwise a new array of surviving spans
 */
function removeOverlaps(matches, policy, config) {
    if (matches.length <= 1) {
        return matches;
    }
    const kept = [];
    for (const candidate of sortSpansByPosition(matches)) {
        const clashAt = kept.findIndex((held) => spansOverlap(candidate, held));
        if (clashAt === -1) {
            kept.push(candidate);
            continue;
        }
        // Replace the held span only when the newcomer wins the comparison
        if (resolveOverlap(kept[clashAt], candidate, policy, config) === candidate) {
            kept[clashAt] = candidate;
        }
    }
    return kept;
}
|
|
142
|
+
/**
 * Chooses which of two overlapping spans to keep.
 *
 * Order of decisions:
 *  1. If `config.regexPriority` is set and exactly one span has source
 *     `DetectionSource.REGEX`, that span wins.
 *  2. The rule selected by `config.overlapStrategy` (LONGER_SPAN,
 *     HIGHER_CONFIDENCE or TYPE_PRIORITY) is tried; other strategy values add
 *     no rule of their own.
 *  3. Tiebreakers, in order: longer span, higher confidence, higher type priority.
 *  4. If everything ties, `a` is returned.
 *
 * @returns either `a` or `b` (the same object reference that was passed in)
 */
function resolveOverlap(a, b, policy, config) {
    if (config.regexPriority) {
        const aIsRegex = a.source === DetectionSource.REGEX;
        const bIsRegex = b.source === DetectionSource.REGEX;
        if (aIsRegex !== bIsRegex) {
            return aIsRegex ? a : b;
        }
    }
    // Each rule yields the winning span, or undefined when it cannot decide
    const pick = (scoreA, scoreB) => (scoreA !== scoreB ? (scoreA > scoreB ? a : b) : undefined);
    const byLength = () => pick(spanLength(a), spanLength(b));
    const byConfidence = () => pick(a.confidence, b.confidence);
    const byTypePriority = () => pick(getTypePriority(a.type, policy), getTypePriority(b.type, policy));
    const strategyRule = new Map([
        [OverlapStrategy.LONGER_SPAN, byLength],
        [OverlapStrategy.HIGHER_CONFIDENCE, byConfidence],
        [OverlapStrategy.TYPE_PRIORITY, byTypePriority],
    ]).get(config.overlapStrategy);
    const tiebreakers = [byLength, byConfidence, byTypePriority];
    const rules = strategyRule === undefined ? tiebreakers : [strategyRule, ...tiebreakers];
    for (const rule of rules) {
        const winner = rule();
        if (winner !== undefined) {
            return winner;
        }
    }
    // Complete tie: the first span stays
    return a;
}
|
|
202
|
+
/**
 * Returns the rank of `type`: its index in the active priority list, or -1
 * when the type is not listed. The active list is `policy.typePriority` when
 * it is non-empty, otherwise a copy of DEFAULT_TYPE_PRIORITY.
 *
 * Callers treat a larger number as more important.
 * NOTE(review): that means entries LATER in the list outrank earlier ones —
 * confirm this matches how the priority lists are authored.
 */
function getTypePriority(type, policy) {
    let ranking;
    if (policy.typePriority.length > 0) {
        ranking = policy.typePriority;
    }
    else {
        ranking = [...DEFAULT_TYPE_PRIORITY];
    }
    // indexOf already yields -1 for an unlisted type
    return ranking.indexOf(type);
}
|
|
210
|
+
/**
 * Drops spans that repeat an earlier span's `start`, `end` and `type`.
 * The first occurrence is kept and input order is preserved.
 */
function deduplicateExact(matches) {
    const fingerprints = new Set();
    const unique = [];
    for (const candidate of matches) {
        const sizeBefore = fingerprints.size;
        fingerprints.add(`${candidate.start}:${candidate.end}:${candidate.type}`);
        // The set only grows when this fingerprint is new
        if (fingerprints.size !== sizeBefore) {
            unique.push(candidate);
        }
    }
    return unique;
}
|
|
225
|
+
/**
 * Projects regex matches down to bare `{ start, end }` spans.
 * Per the original note, these are used to mask regex matches from NER so the
 * same text is not detected twice.
 */
export function createProtectedSpans(regexMatches) {
    return regexMatches.map((match) => ({ start: match.start, end: match.end }));
}
|
|
232
|
+
/**
 * Reports whether `span` overlaps at least one of `protectedSpans`,
 * as judged by spansOverlap.
 */
export function isInProtectedSpan(span, protectedSpans) {
    const overlapsSpan = (candidate) => spansOverlap(span, candidate);
    return protectedSpans.some(overlapsSpan);
}
|
|
238
|
+
//# sourceMappingURL=resolver.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolver.js","sourceRoot":"","sources":["../../src/pipeline/resolver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,EAEP,eAAe,EAEf,qBAAqB,GACtB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAEpF;;GAEG;AACH,MAAM,CAAN,IAAY,eASX;AATD,WAAY,eAAe;IACzB,wCAAwC;IACxC,oDAAiC,CAAA;IACjC,uBAAuB;IACvB,8CAA2B,CAAA;IAC3B,6BAA6B;IAC7B,0DAAuC,CAAA;IACvC,oCAAoC;IACpC,kDAA+B,CAAA;AACjC,CAAC,EATW,eAAe,KAAf,eAAe,QAS1B;AAcD;;GAEG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAmB;IACrD,eAAe,EAAE,eAAe,CAAC,cAAc;IAC/C,aAAa,EAAE,IAAI;IACnB,aAAa,EAAE,GAAG;CACnB,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,YAAyB,EACzB,UAAuB,EACvB,MAA2B,EAC3B,YAAoB,EACpB,SAAkC,EAAE;IAEpC,MAAM,cAAc,GAAG,EAAE,GAAG,uBAAuB,EAAE,GAAG,MAAM,EAAE,CAAC;IAEjE,4DAA4D;IAC5D,MAAM,aAAa,GAAG,cAAc,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAC3D,MAAM,WAAW,GAAG,cAAc,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;IAEvD,oCAAoC;IACpC,MAAM,sBAAsB,GAAG,cAAc,CAAC,aAAa,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IACnF,MAAM,oBAAoB,GAAG,cAAc,CAAC,WAAW,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IAE/E,8BAA8B;IAC9B,MAAM,UAAU,GAAG,CAAC,GAAG,sBAAsB,EAAE,GAAG,oBAAoB,CAAC,CAAC;IAExE,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,cAAc,CAAC,UAAU,EAAE,MAAM,EAAE,cAAc,CAAC,CAAC;IAEpE,kDAAkD;IAClD,MAAM,YAAY,GAAG,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;IAEnE,8BAA8B;IAC9B,MAAM,YAAY,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAEpD,2BAA2B;IAC3B,OAAO,mBAAmB,CAAC,YAAY,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,OAAoB,EAAE,MAA2B;IACvE,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QAC9B,2BAA2B;QAC3B,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;YACzC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,6BAA6B;QAC7B,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;QACrE,IAAI,KAAK,CAAC,UAAU,GAAG,SAAS,EAAE,CAAC;YACjC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,OAAoB,EACpB,MAA2B,EAC3B,aAAqB;IAErB,IAAI,MAAM,CAAC,cAAc,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;QACrC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,OAAO,O
AAO,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QAC9B,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;QAClD,OAAO,CAAC,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CACpB,OAAoB,EACpB,MAA2B,EAC3B,YAAoB;IAEpB,IAAI,MAAM,CAAC,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,MAAM,eAAe,GAAgB,EAAE,CAAC;IAExC,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;QAC9C,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM;YAClC,CAAC,CAAC,OAAO;YACT,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC;QAEpD,KAAK,MAAM,KAAK,IAAI,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;YACzD,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS;gBAAE,SAAS;YAExC,uDAAuD;YACvD,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CACjC,CAAC,QAAQ,EAAE,EAAE,CACX,QAAQ,CAAC,KAAK,IAAI,KAAK,CAAC,KAAM;gBAC9B,QAAQ,CAAC,GAAG,IAAI,KAAK,CAAC,KAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CACjD,CAAC;YAEF,IAAI,CAAC,cAAc,EAAE,CAAC;gBACpB,eAAe,CAAC,IAAI,CAAC;oBACnB,IAAI,EAAE,OAAO,CAAC,KAAK,EAAE,mDAAmD;oBACxE,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM;oBAClC,UAAU,EAAE,GAAG;oBACf,MAAM,EAAE,eAAe,CAAC,KAAK;oBAC7B,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;iBACf,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,OAAO,EAAE,GAAG,eAAe,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,OAAoB,EACpB,MAA2B,EAC3B,MAAsB;IAEtB,IAAI,OAAO,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,yBAAyB;IACzB,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAC5C,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,qCAAqC;QACrC,MAAM,cAAc,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,YAAY,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC;QAErF,IAAI,cAAc,KAAK,CAAC,CAAC,EAAE,CAAC;YAC1B,2BAA2B;YAC3B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;aAAM,CAAC;YACN,uBAAuB;YACvB,MAAM,QAAQ,GAAG,MAAM,CAAC,cAAc,CAAE,CAAC;YACzC,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;YAE/D,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;gBACrB,mCAAmC;gBACnC,MAAM,CAA
C,cAAc,CAAC,GAAG,KAAK,CAAC;YACjC,CAAC;YACD,uCAAuC;QACzC,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CACrB,CAAY,EACZ,CAAY,EACZ,MAA2B,EAC3B,MAAsB;IAEtB,+CAA+C;IAC/C,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;QACzB,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;YAC7E,OAAO,CAAC,CAAC;QACX,CAAC;QACD,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;YAC7E,OAAO,CAAC,CAAC;QACX,CAAC;IACH,CAAC;IAED,iCAAiC;IACjC,QAAQ,MAAM,CAAC,eAAe,EAAE,CAAC;QAC/B,KAAK,eAAe,CAAC,WAAW,CAAC,CAAC,CAAC;YACjC,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;gBAClB,OAAO,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,iBAAiB,CAAC,CAAC,CAAC;YACvC,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;gBAClC,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7C,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,aAAa,CAAC,CAAC,CAAC;YACnC,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAClD,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAClD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;gBAC5B,OAAO,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACvC,CAAC;YACD,MAAM;QACR,CAAC;QAED,KAAK,eAAe,CAAC,cAAc,CAAC;QACpC;YACE,wBAAwB;YACxB,MAAM;IACV,CAAC;IAED,+DAA+D;IAC/D,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;QAClC,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClD,MAAM,SAAS,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAClD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;QAC5B,OAAO,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CA
AC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,mCAAmC;IACnC,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,IAAa,EAAE,MAA2B;IACjE,MAAM,YAAY,GAAG,MAAM,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,GAAG,qBAAqB,CAAC,CAAC;IACvG,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IACzC,OAAO,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AACjC,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,OAAoB;IAC5C,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,KAAK,IAAI,KAAK,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACd,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAClC,YAAyB;IAEzB,OAAO,YAAY,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;AAChE,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAC/B,IAAoC,EACpC,cAAqD;IAErD,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC;AAC7E,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Replacement Tagger
|
|
3
|
+
* Replaces PII spans with placeholder tags and builds the PII map
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, SpanMatch, DetectedEntity, AnonymizationPolicy } from '../types/index.js';
|
|
6
|
+
/**
 * A single PII map record in its plain (not yet encrypted) form.
 */
export interface PIIMapEntry {
    /** Category of the PII value */
    type: PIIType;
    /** Identifier assigned to the entity */
    id: number;
    /** The text as it appeared before anonymization */
    original: string;
}
|
|
17
|
+
/**
 * Unencrypted PII map: string key -> original text.
 * The tagger builds keys with createPIIMapKey (`TYPE_ID`).
 */
export type RawPIIMap = Map<string, string>;
|
|
21
|
+
/**
 * Everything tagEntities produces for one input text.
 */
export interface TaggingResult {
    /** The input text with each PII span replaced by a placeholder tag */
    anonymizedText: string;
    /** The detected entities, each carrying its assigned ID */
    entities: DetectedEntity[];
    /** Unencrypted map from `TYPE_ID` key to the original text */
    piiMap: RawPIIMap;
}
|
|
32
|
+
/**
 * Builds the placeholder tag for an entity.
 * Output format: <PII type="TYPE" id="N"/>
 */
export declare function generateTag(type: PIIType, id: number): string;
|
|
37
|
+
/**
 * Reads the type and id out of a single PII tag.
 * Yields null when the string is not a valid tag.
 */
export declare function parseTag(tag: string): { type: PIIType; id: number } | null;
|
|
45
|
+
/**
 * Builds the PII map key for a type/id pair.
 */
export declare function createPIIMapKey(type: PIIType, id: number): string;
|
|
49
|
+
/**
 * Replaces the given PII spans in `text` with placeholder tags and returns
 * the anonymized text together with the entities and the PII map.
 */
export declare function tagEntities(text: string, matches: SpanMatch[], policy: AnonymizationPolicy): TaggingResult;
|
|
53
|
+
/**
 * Tells whether a string is a well-formed PII tag.
 */
export declare function isValidTag(tag: string): boolean;
|
|
57
|
+
/**
 * Finds every PII tag in anonymized text.
 * Each result carries the tag's type, id, and `position`.
 */
export declare function extractTags(anonymizedText: string): Array<{ type: PIIType; id: number; position: number }>;
|
|
65
|
+
/**
 * Tallies how many entities there are of each PII type.
 */
export declare function countEntitiesByType(entities: DetectedEntity[]): Record<PIIType, number>;
|
|
69
|
+
/**
 * Restores the original text by substituting PII tags with their PII map values.
 * (Intended for testing/debugging only - not part of the anonymization pipeline)
 */
export declare function rehydrate(anonymizedText: string, piiMap: RawPIIMap): string;
|
|
74
|
+
//# sourceMappingURL=tagger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tagger.d.ts","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,EACP,SAAS,EACT,cAAc,EAEd,mBAAmB,EACpB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,eAAe;IACf,IAAI,EAAE,OAAO,CAAC;IACd,gBAAgB;IAChB,EAAE,EAAE,MAAM,CAAC;IACX,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AAE5C;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,4CAA4C;IAC5C,cAAc,EAAE,MAAM,CAAC;IACvB,kDAAkD;IAClD,QAAQ,EAAE,cAAc,EAAE,CAAC;IAC3B,wCAAwC;IACxC,MAAM,EAAE,SAAS,CAAC;CACnB;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,GAAG,MAAM,CAE7D;AAED;;;GAGG;AACH,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,CAoB1E;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,GAAG,MAAM,CAEjE;AAED;;GAEG;AACH,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,SAAS,EAAE,EACpB,MAAM,EAAE,mBAAmB,GAC1B,aAAa,CAyEf;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAE/C;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,cAAc,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,EAAE,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,CAoB1G;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,cAAc,EAAE,GAAG,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,CAcvF;AAED;;;GAGG;AACH,wBAAgB,SAAS,CAAC,cAAc,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,GAAG,MAAM,CAkB3E"}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Replacement Tagger
|
|
3
|
+
* Replaces PII spans with placeholder tags and builds the PII map
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, } from '../types/index.js';
|
|
6
|
+
import { sortSpansByPosition } from '../utils/offsets.js';
|
|
7
|
+
/**
 * Builds the placeholder tag for an entity.
 * Output format: <PII type="TYPE" id="N"/>
 */
export function generateTag(type, id) {
    const attributes = `type="${type}" id="${id}"`;
    return `<PII ${attributes}/>`;
}
|
|
14
|
+
/**
 * Reads the type and id out of a single PII tag.
 *
 * The whole string must be one tag of the form <PII type="TYPE" id="N"/>,
 * where TYPE consists of uppercase letters/underscores and is one of the
 * PIIType values, and N is a run of digits (parsed as base 10).
 *
 * @returns `{ type, id }`, or null when the string is not a valid tag
 */
export function parseTag(tag) {
    const parts = tag.match(/^<PII\s+type="([A-Z_]+)"\s+id="(\d+)"\s*\/>$/);
    if (parts === null) {
        return null;
    }
    const typeStr = parts[1];
    const idStr = parts[2];
    if (typeStr === undefined || idStr === undefined) {
        return null;
    }
    // Reject type names that are not members of PIIType
    const isKnownType = Object.values(PIIType).includes(typeStr);
    return isKnownType ? { type: typeStr, id: parseInt(idStr, 10) } : null;
}
|
|
35
|
+
/**
 * Builds the PII map key for a type/id pair: the type, an underscore, then the id.
 */
export function createPIIMapKey(type, id) {
    return "".concat(type, "_", id);
}
|
|
41
|
+
/**
 * Replaces PII spans in `text` with placeholder tags and builds the PII map.
 *
 * IDs start at 1 and are handed out in sortSpansByPosition order. When
 * `policy.reuseIdsForRepeatedPII` is truthy, matches with the same type and
 * text share one ID. Replacement runs from the highest start offset to the
 * lowest so earlier offsets stay valid.
 *
 * NOTE(review): offsets are applied as-is; overlapping input spans are
 * presumably resolved before this is called — confirm against the caller.
 *
 * @param text - original text the spans index into
 * @param matches - spans to replace (`type`, `start`, `end`, `confidence`, `source`, `text`)
 * @param policy - only `reuseIdsForRepeatedPII` is read here
 * @returns `{ anonymizedText, entities, piiMap }`; for an empty `matches`
 *          the text is returned untouched with an empty list and map
 */
export function tagEntities(text, matches, policy) {
    if (matches.length === 0) {
        return { anonymizedText: text, entities: [], piiMap: new Map() };
    }
    const idByTypeAndText = new Map(); // "type:text" -> id
    let issued = 0;
    const allocateId = (match) => {
        if (!policy.reuseIdsForRepeatedPII) {
            return ++issued;
        }
        const lookup = `${match.type}:${match.text}`;
        if (!idByTypeAndText.has(lookup)) {
            idByTypeAndText.set(lookup, ++issued);
        }
        return idByTypeAndText.get(lookup);
    };
    // Number the matches in position order
    const numbered = [];
    for (const match of sortSpansByPosition(matches)) {
        const id = allocateId(match);
        numbered.push({ ...match, id });
    }
    // Map each TYPE_ID key to the matched text
    const piiMap = new Map();
    numbered.forEach((entity) => {
        piiMap.set(createPIIMapKey(entity.type, entity.id), entity.text);
    });
    // Splice tags in from the back so untouched offsets remain correct
    const anonymizedText = [...numbered]
        .sort((left, right) => right.start - left.start)
        .reduce((current, entity) => current.slice(0, entity.start) +
        generateTag(entity.type, entity.id) +
        current.slice(entity.end), text);
    const entities = numbered.map(({ type, id, start, end, confidence, source, text: original }) => ({
        type,
        id,
        start,
        end,
        confidence,
        source,
        original,
    }));
    return {
        anonymizedText,
        entities: sortSpansByPosition(entities),
        piiMap,
    };
}
|
|
109
|
+
/**
 * Tells whether a string is a well-formed PII tag, i.e. whether parseTag
 * returns something other than null for it.
 */
export function isValidTag(tag) {
    const parsed = parseTag(tag);
    return parsed !== null;
}
|
|
115
|
+
/**
 * Finds every PII tag in anonymized text.
 *
 * Tags whose type is not a PIIType value are skipped.
 *
 * @returns one `{ type, id, position }` per tag in order of appearance, where
 *          `position` is the index at which the tag starts
 */
export function extractTags(anonymizedText) {
    const found = [];
    const tagPattern = /<PII\s+type="([A-Z_]+)"\s+id="(\d+)"\s*\/>/g;
    for (let hit = tagPattern.exec(anonymizedText); hit !== null; hit = tagPattern.exec(anonymizedText)) {
        const [, typeStr, idStr] = hit;
        if (typeStr === undefined || idStr === undefined) {
            continue;
        }
        const id = parseInt(idStr, 10);
        if (Object.values(PIIType).includes(typeStr)) {
            found.push({ type: typeStr, id, position: hit.index });
        }
    }
    return found;
}
|
|
135
|
+
/**
 * Tallies how many entities there are of each PII type.
 * Every PIIType value is present in the result, starting from 0.
 */
export function countEntitiesByType(entities) {
    const tally = {};
    // Seed a zero for every known type
    for (const knownType of Object.values(PIIType)) {
        tally[knownType] = 0;
    }
    for (const { type } of entities) {
        const soFar = tally[type] ?? 0;
        tally[type] = soFar + 1;
    }
    return tally;
}
|
|
150
|
+
/**
 * Rehydrates anonymized text using the PII map
 * (For testing/debugging only - not part of the anonymization pipeline)
 *
 * Each tag found by `extractTags` is looked up in `piiMap` under
 * `createPIIMapKey(type, id)` and replaced by the stored original value.
 * Tags with no entry in the map are left in the text unchanged.
 *
 * @param anonymizedText - Text containing PII tags.
 * @param piiMap - Lookup exposing `get(key)`, which returns the original
 *   text for a key or `undefined` when the key is unknown.
 * @returns The text with every resolvable tag replaced by its original value.
 */
export function rehydrate(anonymizedText, piiMap) {
    // Sticky pattern with the same tag syntax extractTags scans for. It is used
    // to measure the tag as it is actually written at a given position: tags
    // may carry extra whitespace or leading-zero ids, so their length can
    // differ from the canonical generateTag() form.
    const tagAtPosition = /<PII\s+type="[A-Z_]+"\s+id="\d+"\s*\/>/y;
    let result = anonymizedText;
    const tags = extractTags(anonymizedText);
    // Sort by position descending: replacing from the end keeps the offsets of
    // earlier tags valid in `result`.
    tags.sort((a, b) => b.position - a.position);
    for (const { type, id, position } of tags) {
        const key = createPIIMapKey(type, id);
        const original = piiMap.get(key);
        if (original === undefined) {
            continue;
        }
        // Positions refer to the untouched input, so measure the tag there.
        tagAtPosition.lastIndex = position;
        const written = tagAtPosition.exec(anonymizedText);
        // Fall back to the canonical tag length if the re-match fails.
        const tagLength = written !== null ? written[0].length : generateTag(type, id).length;
        result = result.slice(0, position) + original + result.slice(position + tagLength);
    }
    return result;
}
|
|
169
|
+
//# sourceMappingURL=tagger.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tagger.js","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,GAKR,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,mBAAmB,EAAiC,MAAM,qBAAqB,CAAC;AA+BzF;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,IAAa,EAAE,EAAU;IACnD,OAAO,cAAc,IAAI,SAAS,EAAE,KAAK,CAAC;AAC5C,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,QAAQ,CAAC,GAAW;IAClC,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAC;IACxE,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,CAAC,EAAE,OAAO,EAAE,KAAK,CAAC,GAAG,KAAK,CAAC;IACjC,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QACjD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,IAAI,GAAG,OAAkB,CAAC;IAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAE/B,mCAAmC;IACnC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3C,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAAC,IAAa,EAAE,EAAU;IACvD,OAAO,GAAG,IAAI,IAAI,EAAE,EAAE,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAoB,EACpB,MAA2B;IAE3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO;YACL,cAAc,EAAE,IAAI;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,IAAI,GAAG,EAAE;SAClB,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,MAAM,eAAe,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAErD,aAAa;IACb,MAAM,eAAe,GAAsC,EAAE,CAAC;IAC9D,IAAI,MAAM,GAAG,CAAC,CAAC;IAEf,4CAA4C;IAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,aAAa;IAEzD,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,EAAU,CAAC;QAEf,IAAI,MAAM,CAAC,sBAAsB,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACrC,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;gBAC7B,EAAE,GAAG,UAAU,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,EAAE,GAAG,MAAM,EAAE,CAAC;gBACd,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,MAAM,EAAE,CAAC;QAChB,CAAC;QAED,eAAe,CAAC,IAAI,CAAC,EAAE,GAAG,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,gBAAgB;IAChB,MAAM,MAAM,GAAc,IAAI,GAAG,EAAE,CAAC;
IACpC,KAAK,MAAM,MAAM,IAAI,eAAe,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;QACpD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IAED,oDAAoD;IACpD,0DAA0D;IAC1D,MAAM,gBAAgB,GAAG,CAAC,GAAG,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAEhF,uBAAuB;IACvB,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACtC,MAAM,GAAG,GAAG,WAAW,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;QAChD,cAAc;YACZ,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IACnF,CAAC;IAED,iDAAiD;IACjD,MAAM,QAAQ,GAAqB,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7D,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,IAAI;KACjB,CAAC,CAAC,CAAC;IAEJ,OAAO;QACL,cAAc;QACd,QAAQ,EAAE,mBAAmB,CAAC,QAAQ,CAAqB;QAC3D,MAAM;KACP,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,OAAO,QAAQ,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,cAAsB;IAChD,MAAM,IAAI,GAA2D,EAAE,CAAC;IACxE,MAAM,UAAU,GAAG,6CAA6C,CAAC;IAEjE,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1D,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACzB,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEvB,IAAI,OAAO,KAAK,SAAS,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACjD,MAAM,IAAI,GAAG,OAAkB,CAAC;YAChC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE/B,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1C,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,QAA0B;IAC5D,MAAM,MAAM,GAA4B,EAA6B,CAAC;IAEtE,4BAA4B;IAC5B,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,C
AAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;IAED,iBAAiB;IACjB,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,cAAsB,EAAE,MAAiB;IACjE,IAAI,MAAM,GAAG,cAAc,CAAC;IAC5B,MAAM,IAAI,GAAG,WAAW,CAAC,cAAc,CAAC,CAAC;IAEzC,8CAA8C;IAC9C,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;IAE7C,KAAK,MAAM,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE,IAAI,IAAI,EAAE,CAAC;QAC1C,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjC,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,MAAM,GAAG,GAAG,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAClC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,GAAG,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;QACtF,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Output Validator
|
|
3
|
+
* Validates anonymized output and performs leak scan
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, DetectedEntity, AnonymizationPolicy } from '../types/index.js';
|
|
6
|
+
/**
 * Outcome of validating anonymized output.
 */
export interface ValidationResult {
    /** `true` when validation passed */
    valid: boolean;
    /** Validation errors that were found */
    errors: Array<ValidationError>;
    /** Outcome of the leak scan; set only if a leak scan was performed */
    leakScanPassed?: boolean;
    /** Potential leaks reported by the leak scan */
    potentialLeaks?: Array<LeakScanMatch>;
}
|
|
19
|
+
/**
 * A single problem reported by validation.
 */
export interface ValidationError {
    /** Code identifying the kind of error */
    code: ValidationErrorCode;
    /** Message meant for human readers */
    message: string;
    /** Optional extra details, as a string-keyed bag of values */
    details?: {
        [key: string]: unknown;
    };
}
|
|
30
|
+
/**
 * Codes used in `ValidationError.code`.
 *
 * This is a string enum: the value of every member is identical to its name.
 * Members are listed alphabetically.
 */
export declare enum ValidationErrorCode {
    DUPLICATE_IDS = "DUPLICATE_IDS",
    ID_MISMATCH = "ID_MISMATCH",
    MALFORMED_TAG = "MALFORMED_TAG",
    MISSING_IN_MAP = "MISSING_IN_MAP",
    OVERLAPPING_ENTITIES = "OVERLAPPING_ENTITIES",
    POTENTIAL_PII_LEAK = "POTENTIAL_PII_LEAK"
}
|
|
41
|
+
/**
 * One potential leak found by the leak scan.
 */
export interface LeakScanMatch {
    /** PII type of the potential leak */
    type: PIIType;
    /** The text that matched */
    text: string;
    /** Where the match is located in the anonymized text */
    position: number;
    /** The pattern that produced the match */
    pattern: string;
}
|
|
54
|
+
/**
 * Validates the output of an anonymization run.
 *
 * @param anonymizedText - The anonymized text.
 * @param entities - The detected entities.
 * @param piiMapKeys - Keys of the PII map.
 * @param policy - The anonymization policy.
 * @returns The validation result.
 */
export declare function validateOutput(
    anonymizedText: string,
    entities: DetectedEntity[],
    piiMapKeys: string[],
    policy: AnonymizationPolicy
): ValidationResult;
|
|
58
|
+
/**
 * Fast check that no overlaps exist among the given entities.
 *
 * @param entities - Entities described by their `start` and `end` offsets.
 * @returns The boolean result of the check.
 */
export declare function hasNoOverlaps(entities: { start: number; end: number }[]): boolean;
|
|
65
|
+
//# sourceMappingURL=validator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"validator.d.ts","sourceRoot":"","sources":["../../src/pipeline/validator.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAIjF;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,gCAAgC;IAChC,KAAK,EAAE,OAAO,CAAC;IACf,gCAAgC;IAChC,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,8CAA8C;IAC9C,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,yCAAyC;IACzC,cAAc,CAAC,EAAE,aAAa,EAAE,CAAC;CAClC;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,iBAAiB;IACjB,IAAI,EAAE,mBAAmB,CAAC;IAC1B,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,yBAAyB;IACzB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED;;GAEG;AACH,oBAAY,mBAAmB;IAC7B,oBAAoB,yBAAyB;IAC7C,aAAa,kBAAkB;IAC/B,aAAa,kBAAkB;IAC/B,WAAW,gBAAgB;IAC3B,cAAc,mBAAmB;IACjC,kBAAkB,uBAAuB;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,6BAA6B;IAC7B,IAAI,EAAE,OAAO,CAAC;IACd,mBAAmB;IACnB,IAAI,EAAE,MAAM,CAAC;IACb,kCAAkC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,2BAA2B;IAC3B,OAAO,EAAE,MAAM,CAAC;CACjB;AAkCD;;GAEG;AACH,wBAAgB,cAAc,CAC5B,cAAc,EAAE,MAAM,EACtB,QAAQ,EAAE,cAAc,EAAE,EAC1B,UAAU,EAAE,MAAM,EAAE,EACpB,MAAM,EAAE,mBAAmB,GAC1B,gBAAgB,CA+ClB;AAkMD;;GAEG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAA;CAAE,CAAC,GAAG,OAAO,CAYtF"}
|