mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/core/fpe.ts CHANGED
@@ -10,7 +10,6 @@ import * as crypto from 'crypto';
10
10
  import { config } from '../config';
11
11
  import { getKeyProvider } from './key_provider';
12
12
  import { MaskSecurityError } from './exceptions';
13
- import { looksLikeToken } from './fpe_utils';
14
13
 
15
14
  // Master key management
16
15
 
@@ -52,13 +51,12 @@ export function resetMasterKey(): void {
52
51
  // Detectors — order matters: first match wins
53
52
 
54
53
  const _EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
55
- const _PHONE_RE = /^\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}$|^\d{3}[\s\-.]?\d{4}$/;
54
+ const _PHONE_RE = /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/;
55
+ const _PHONE_INTL_RE = /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/;
56
56
  const _SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
57
57
  const _CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
58
58
  const _ROUTING_RE = /^\d{9}$/;
59
- const _TCID_RE = /^[1-9]\d{9}[02468]$/;
60
- const _SAUDI_NID_RE = /^1\d{9}$/;
61
- const _UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
59
+ const _ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
62
60
  const _IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
63
61
 
64
62
  // Deterministic helpers (HMAC-based)
@@ -96,54 +94,109 @@ async function _hmacDigits(plaintext: string, n: number, offset: number = 0): Pr
96
94
 
97
95
  // Public API
98
96
 
97
// Word lists from which the surrogate values for person, location and
// organization entities are drawn.
const _FIRST_NAMES = [
  "Taylor", "Jordan", "Casey", "Morgan", "Riley",
  "Avery", "Rowan", "Quinn", "Charlie", "Peyton",
  "Blake", "Dakota", "Reese", "Skyler", "Finley",
  "Eden", "Harley", "Rory", "Emerson", "Remi",
];
const _LAST_NAMES = [
  "Smith", "Johnson", "Williams", "Brown", "Jones",
  "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
  "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson",
  "Thomas", "Taylor", "Moore", "Jackson", "Martin",
];
const _CITIES = [
  "London", "Paris", "Berlin", "Tokyo", "Rome",
  "Madrid", "Vienna", "Sydney", "Toronto", "Chicago",
  "Seattle", "Austin", "Boston", "Denver", "Dallas",
  "Miami", "Seoul", "Dubai", "Mumbai", "Cairo",
];
101
+
102
/**
 * Select one entry of `array` for the given plaintext.
 *
 * The index is the 8-character result of `_hmacDigits(plaintext, 8)` parsed
 * as a base-10 integer, reduced modulo the array length.
 * (presumably `_hmacDigits` yields a decimal digit string — confirm.)
 *
 * @param plaintext - Value the selection is derived from.
 * @param array - Candidate entries to choose from.
 * @returns The entry at the derived index.
 */
async function _pickFromArray(plaintext: string, array: string[]): Promise<string> {
  const index = parseInt(await _hmacDigits(plaintext, 8), 10) % array.length;
  return array[index];
}
108
+
109
/**
 * Compute the Luhn check digit to append to `partialNum`.
 *
 * Walking from the rightmost character, every other digit (starting with the
 * rightmost itself) is doubled, with 9 subtracted when the doubled value
 * exceeds 9. The check digit brings the running total to a multiple of 10.
 *
 * @param partialNum - Digit string without its check digit.
 * @returns The check digit as a one-character string.
 */
function _computeLuhnDigit(partialNum: string): string {
  const total = partialNum
    .split("")
    .reverse()
    .reduce((acc, ch, pos) => {
      const value = Number(ch);
      // Odd positions from the right are taken as-is.
      if (pos % 2 !== 0) return acc + value;
      const doubled = value * 2;
      return acc + (doubled > 9 ? doubled - 9 : doubled);
    }, 0);
  return ((10 - (total % 10)) % 10).toString();
}
125
+
126
+
127
+
128
/**
 * Look up the control letter for a Spanish ID number.
 *
 * @param num - Numeric part of the identifier.
 * @returns The character at position `num % 23` of the control-letter table.
 */
function _computeEsIdCheck(num: number): string {
  const controlLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
  const remainder = num % 23;
  return controlLetters[remainder];
}
131
+
132
+ // Public API
133
+
99
134
  /**
100
- * Return a **deterministic**, format-preserving token for rawText.
135
+ * Return a **deterministic**, format-preserving token for rawText using its entityType.
101
136
  */
102
- export async function generateFPEToken(rawText: string): Promise<string> {
137
+ export async function generateFPEToken(rawText: string, entityType: string = 'UNKNOWN'): Promise<string> {
103
138
  const text = rawText.trim();
139
+ let type = (entityType || "UNKNOWN").toUpperCase();
140
+
141
+ if (type === "UNKNOWN") {
142
+ if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
143
+ else if (_SSN_RE.test(text)) type = "US_SSN";
144
+ else if (_CC_RE.test(text)) type = "CREDIT_CARD";
145
+ else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
146
+ else if (_ES_ID_RE.test(text)) type = "ES_DNI";
147
+ else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
148
+ else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
149
+ }
104
150
 
105
- if (_EMAIL_RE.test(text)) {
106
- return `tkn-${await _hmacHex(text)}@email.com`;
151
+ if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
152
+ const parts = text.split("@");
153
+ const domain = parts.length === 2 ? parts[1] : "email.com";
154
+ return `tkn-${await _hmacHex(text)}@${domain}`;
107
155
  }
108
156
 
109
- if (_PHONE_RE.test(text)) {
110
- return `+1-555-${await _hmacDigits(text, 7)}`;
157
+ if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
158
+ const m = text.match(/^\+([1-9]\d{0,3})/);
159
+ const cc = m ? m[1] : "1";
160
+ return `+${cc}-555-${await _hmacDigits(text, 7)}`;
111
161
  }
112
162
 
113
- if (_SSN_RE.test(text)) {
163
+ if (type === "US_SSN") {
114
164
  return `000-00-${await _hmacDigits(text, 4)}`;
115
165
  }
116
166
 
117
- if (_CC_RE.test(text)) {
118
- return `4000-0000-0000-${await _hmacDigits(text, 4)}`;
167
+ if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
168
+ const base = `400000000000${await _hmacDigits(text, 3)}`;
169
+ const checkDig = _computeLuhnDigit(base);
170
+ const full = base + checkDig;
171
+ return `${full.slice(0,4)}-${full.slice(4,8)}-${full.slice(8,12)}-${full.slice(12,16)}`;
119
172
  }
120
173
 
121
- if (_ROUTING_RE.test(text)) {
174
+ if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
122
175
  return `000000${await _hmacDigits(text, 3)}`;
123
176
  }
124
177
 
125
- // Turkish TC Kimlik No (format: 990000 + XXXX + even digit)
126
- if (_TCID_RE.test(text)) {
127
- const tail = await _hmacDigits(text, 5);
128
- let lastD = parseInt(tail[tail.length - 1], 10);
129
- if (lastD % 2 !== 0) lastD = (lastD + 1) % 10;
130
- return `990000${tail.slice(0, 4)}${lastD}`;
178
+ if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
179
+ const countryCode = (text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2))) ? text.slice(0, 2).toUpperCase() : "US";
180
+ return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
131
181
  }
132
182
 
133
- // Saudi National ID (format: 100000XXXX)
134
- if (_SAUDI_NID_RE.test(text)) {
135
- return `100000${await _hmacDigits(text, 4)}`;
183
+ if (type === "ES_DNI") {
184
+ const digits = `000${await _hmacDigits(text, 5)}`;
185
+ return digits + _computeEsIdCheck(parseInt(digits, 10));
136
186
  }
137
187
 
138
- // UAE Emirates ID (format: 784-0000-XXXXXXX-X)
139
- if (_UAE_EID_RE.test(text)) {
140
- return `784-0000-${await _hmacDigits(text, 7)}-${await _hmacDigits(text, 1, 20)}`;
188
+ if (type === "PERSON" || type === "PERSON_NAME") {
189
+ const f = await _pickFromArray(text, _FIRST_NAMES);
190
+ const l = await _pickFromArray(text + "last", _LAST_NAMES);
191
+ return `<PER:${f}_${l}>`;
141
192
  }
142
-
143
- // IBAN (format: XX00-XXXX... — preserve country code, zero check digits)
144
- if (_IBAN_RE.test(text)) {
145
- const countryCode = text.slice(0, 2);
146
- return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
193
+ if (type === "LOCATION" || type === "PHYS_ADDRESS") {
194
+ const c = await _pickFromArray(text, _CITIES);
195
+ return `<LOC:${c}>`;
196
+ }
197
+ if (type === "ORGANIZATION") {
198
+ const c = await _pickFromArray(text, _LAST_NAMES);
199
+ return `<ORG:${c}_Inc>`;
147
200
  }
148
201
 
149
202
  return `[TKN-${await _hmacHex(text)}]`;
@@ -10,15 +10,14 @@
10
10
  * Used for sub-string detokenization (finding tokens inside paragraphs).
11
11
  */
12
12
  export const TOKEN_PATTERN = new RegExp(
13
- "tkn-[a-f0-9]{8,64}@email\\.com" + // Email
14
- "|\\+1-555-\\d{7}" + // Phone
13
+ "tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}" + // Email
14
+ "|\\+[1-9]\\d{0,3}-555-\\d{7}" + // Phone
15
15
  "|000-00-\\d{4}" + // SSN
16
16
  "|4000-0000-0000-\\d{4}" + // CC
17
17
  "|000000\\d{3}" + // Routing
18
- "|990000\\d{4}[02468]" + // Turkish TCID token
19
- "|100000\\d{4}" + // Saudi NID token
20
- "|784-0000-\\d{7}-\\d" + // UAE EID token
18
+ "|000\\d{5}[A-Z]" + // Spanish DNI token
21
19
  "|[A-Z]{2}00[A-F0-9]{4,16}" + // IBAN token
20
+ "|<(?:PER|LOC|ORG):[^>]+>" + // NLP Semantic tokens
22
21
  "|\\[TKN-[a-f0-9]{8,64}\\]", // Opaque
23
22
  "g"
24
23
  );
@@ -30,13 +29,16 @@ export function looksLikeToken(value: string | any): boolean {
30
29
  if (typeof value !== 'string') return false;
31
30
  const v = value.trim();
32
31
 
33
- // Email tokens: tkn-<hex>@email.com
34
- if (v.startsWith("tkn-") && v.includes("@email.com")) {
35
- return true;
32
+ // Email tokens: tkn-<hex>@domain.com
33
+ if (v.startsWith("tkn-") && v.includes("@")) {
34
+ const parts = v.split("@");
35
+ if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
36
+ return true;
37
+ }
36
38
  }
37
39
 
38
- // Phone tokens: +1-555-XXXXXXX
39
- if (v.startsWith("+1-555-") && v.length === 14) {
40
+ // Phone tokens: +CC-555-XXXXXXX
41
+ if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v)) {
40
42
  return true;
41
43
  }
42
44
 
@@ -55,18 +57,10 @@ export function looksLikeToken(value: string | any): boolean {
55
57
  return true;
56
58
  }
57
59
 
58
- // UAE Emirates ID tokens: 784-0000-XXXXXXX-X
59
- if (v.startsWith("784-0000-") && v.length === 18) {
60
- return true;
61
- }
62
60
 
63
- // Turkish TCID tokens: 990000XXXX(even)
64
- if (v.length === 11 && v.startsWith("990000") && /^\d+$/.test(v) && parseInt(v[v.length - 1], 10) % 2 === 0) {
65
- return true;
66
- }
67
61
 
68
- // Saudi NID tokens: 100000XXXX
69
- if (v.length === 10 && v.startsWith("100000") && /^\d+$/.test(v)) {
62
+ // Spanish ID tokens: 000XXXXX[A-Z]
63
+ if (v.length === 9 && v.startsWith("000") && /[A-Z]$/.test(v)) {
70
64
  return true;
71
65
  }
72
66
 
@@ -75,6 +69,11 @@ export function looksLikeToken(value: string | any): boolean {
75
69
  return true;
76
70
  }
77
71
 
72
+ // Semantic NLP tokens: <PER:Taylor_Morgan>
73
+ if (/^<(PER|LOC|ORG):[^>]+>$/.test(v)) {
74
+ return true;
75
+ }
76
+
78
77
  // Opaque fallback tokens: [TKN-<hex>]
79
78
  if (v.startsWith("[TKN-") && v.endsWith("]")) {
80
79
  return true;
@@ -83,3 +82,4 @@ export function looksLikeToken(value: string | any): boolean {
83
82
  return false;
84
83
  }
85
84
 
85
+
@@ -19,6 +19,7 @@ import { LanguageContextResolver } from './dlp/assessor';
19
19
  import { DLPPatternRegistry } from './dlp/registry';
20
20
  import { DLPValidationEngine } from './dlp/handlers';
21
21
  import { DLPConfidenceScorer } from './dlp/scorer';
22
+ import { Span, resolveOverlaps, reconstruct } from './span';
22
23
 
23
24
  // Module-level DLP singletons (created once, reused for all scans)
24
25
  const _dlpLanguageResolver = new LanguageContextResolver();
@@ -29,11 +30,11 @@ const _dlpConfidenceScorer = new DLPConfidenceScorer();
29
30
  /** Regex patterns for Tier 1 deterministic detection */
30
31
  export const REGEX_PATTERNS: Record<string, RegExp> = {
31
32
  "EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
32
- "PHONE_NUMBER": /\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4}/g,
33
- "PHONE_NUMBER_INTL": /\+(?:44|33|49)[\s\-.]?\(?\d{1,5}\)?(?:[\s\-.]?\d{2,4}){2,4}/g,
34
- "US_SSN": /\d{3}-\d{2}-\d{4}/g,
35
- "CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
36
- "US_ROUTING_NUMBER": /\b\d{9}\b/g,
33
+ "PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
34
+ "PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
35
+ "US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
36
+ "CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
37
+ "US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
37
38
  "US_PASSPORT": /\b[A-Z]\d{8}\b/g,
38
39
  "DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g,
39
40
  };
@@ -83,177 +84,163 @@ export class BaseScanner {
83
84
  return checksum % 10 === 0;
84
85
  }
85
86
 
86
- protected async _tier0Dlp(
87
/**
 * Tier 0: gather DLP heuristic spans without modifying the text.
 *
 * Runs three passes over `text` — the registry's category regexes, then the
 * name patterns and address patterns for the resolved language — and returns
 * every candidate span found. Overlaps are not resolved here.
 *
 * @param text - Text to scan.
 * @param confidenceThreshold - Minimum confidence for spans from the first
 *   two passes. Address spans are added with a fixed 0.55 confidence and are
 *   not compared against it.
 * @returns All collected spans, tagged with method `dlp_heuristic` and the
 *   resolved language.
 */
protected async _tier0CollectSpans(
  text: string,
  confidenceThreshold: number,
): Promise<Span[]> {
  const language = _dlpLanguageResolver.resolve(text);
  const collected: Span[] = [];
  const buckets = _dlpPatternRegistry.getCategoryRegexesMap();

  // Every span from this tier shares the same method and language.
  const record = (start: number, value: string, entityType: string, confidence: number): void => {
    collected.push({
      start,
      end: start + value.length,
      entityType,
      originalValue: value,
      confidence,
      method: 'dlp_heuristic',
      language,
    });
  };

  // Pass 1: one combined regex per category; the entity type is the first
  // name in typeOrder whose named group took part in the match.
  for (const [, { re, typeOrder }] of buckets.entries()) {
    // Fresh RegExp so the shared instance's lastIndex is never touched.
    const scanner = new RegExp(re.source, re.flags);
    for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
      const named = hit.groups ?? {};
      let typeTag: string | undefined;
      for (const candidate of typeOrder) {
        if (named[candidate] !== undefined) {
          typeTag = candidate;
          break;
        }
      }
      if (!typeTag) continue;

      const value = hit[0];
      if (looksLikeToken(value)) continue;

      const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
      if (!descriptor) continue;

      const validated = _dlpValidationEngine.run(descriptor.validatorTag, value);
      // A failed validator drops the match unless the type is high-entropy.
      if (validated === false && !descriptor.isHighEntropy) continue;

      // High-entropy types that fail validation get a fixed 0.85 instead of
      // a scored confidence.
      const confidence = validated === false
        ? 0.85
        : _dlpConfidenceScorer.score({
            baseRisk: descriptor.baseRisk,
            matchStart: hit.index,
            matchEnd: hit.index + value.length,
            fullText: text,
            proximityTerms: descriptor.proximityTerms,
            validatorPassed: validated,
          });

      if (confidence >= confidenceThreshold) record(hit.index, value, typeTag, confidence);
    }
  }

  // Pass 2: name patterns for the resolved language.
  const nameProximity = new Set(['name', 'contact', 'person', 'nom', 'isim', '\u0627\u0633\u0645']);
  for (const namePattern of _dlpPatternRegistry.namePatternsFor(language)) {
    const scanner = new RegExp(namePattern.source, namePattern.flags);
    for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
      const value = hit[0];
      if (looksLikeToken(value)) continue;
      const confidence = _dlpConfidenceScorer.score({
        baseRisk: 0.50,
        matchStart: hit.index,
        matchEnd: hit.index + value.length,
        fullText: text,
        proximityTerms: nameProximity,
        validatorPassed: null,
      });
      if (confidence >= confidenceThreshold) record(hit.index, value, 'PERSON_NAME', confidence);
    }
  }

  // Pass 3: address patterns for the resolved language (fixed confidence).
  for (const addressPattern of _dlpPatternRegistry.addressPatternsFor(language)) {
    const scanner = new RegExp(addressPattern.source, addressPattern.flags);
    for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
      const value = hit[0];
      if (looksLikeToken(value)) continue;
      record(hit.index, value, 'PHYS_ADDRESS', 0.55);
    }
  }

  return collected;
}
159
174
 
160
- // Replace right-to-left to preserve offsets
175
/**
 * Backward-compatible wrapper around the span-based Tier 0 flow.
 *
 * Collects DLP spans, resolves overlaps, encodes each surviving span and
 * rebuilds the text through `reconstruct`.
 *
 * @param text - Text to scan.
 * @param encodeFn - Produces the masked value for a raw value; it receives
 *   the span's entity type in the options argument.
 * @param confidenceThreshold - Threshold forwarded to span collection.
 * @returns A tuple of the reconstructed text and one entity record per
 *   resolved span, in the same order as the resolved spans.
 */
protected async _tier0Dlp(
  text: string,
  encodeFn: (val: string, options?: any) => Promise<string>,
  confidenceThreshold: number,
): Promise<[string, any[]]> {
  const spans = await this._tier0CollectSpans(text, confidenceThreshold);
  const resolved = resolveOverlaps(spans);
  // Take the records from Promise.all's result array instead of pushing from
  // inside the callbacks: the result array follows input order, whereas
  // push-on-completion ordered entities by whichever encode finished first.
  const entities: any[] = await Promise.all(resolved.map(async (span) => {
    span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
    return {
      type: span.entityType,
      value: span.originalValue,
      method: span.method,
      confidence: span.confidence,
      masked_value: span.maskedValue,
      language: span.language,
    };
  }));
  return [reconstruct(text, resolved), entities];
}
178
192
 
179
- protected async _tier1Regex(
193
/**
 * Tier 1: gather spans for every pattern in REGEX_PATTERNS without modifying
 * the text.
 *
 * @param text - Text to scan.
 * @param boostEntities - Entity names (lower-case, spaces instead of
 *   underscores) whose matches get confidence 1.0.
 * @param aggressive - When true, every match gets confidence 1.0.
 * @param confidenceThreshold - Minimum confidence for a span to be kept.
 * @returns All kept spans, tagged with method `regex`. Overlaps are not
 *   resolved here.
 */
protected async _tier1CollectSpans(
  text: string,
  boostEntities: Set<string>,
  aggressive: boolean,
  confidenceThreshold: number,
): Promise<Span[]> {
  const found: Span[] = [];
  for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
    // The starting confidence depends only on the entity type.
    const boosted = aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, ' '));
    const baseline = boosted ? 1.0 : 0.95;
    // Fresh RegExp so the shared global pattern's lastIndex is never touched.
    const scanner = new RegExp(pattern.source, pattern.flags);
    for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
      const value = hit[0];
      if (looksLikeToken(value)) continue;
      // Routing numbers must pass the ABA checksum to be reported at all.
      if (entityType === 'US_ROUTING_NUMBER' && !BaseScanner._abaChecksum(value)) continue;
      // A Luhn-valid card number is raised to at least 0.99.
      const confidence = entityType === 'CREDIT_CARD' && BaseScanner._luhnChecksum(value)
        ? Math.max(baseline, 0.99)
        : baseline;
      if (confidence >= confidenceThreshold) {
        found.push({
          start: hit.index,
          end: hit.index + value.length,
          entityType,
          originalValue: value,
          confidence,
          method: 'regex',
        });
      }
    }
  }
  return found;
}
242
217
 
243
- return [excised, entities];
218
/**
 * Backward-compatible wrapper around the span-based Tier 1 flow.
 *
 * Collects regex spans, resolves overlaps, encodes each surviving span and
 * rebuilds the text through `reconstruct`.
 *
 * @param text - Text to scan.
 * @param encodeFn - Produces the masked value for a raw value; it receives
 *   the span's entity type in the options argument.
 * @param boostEntities - Forwarded to span collection.
 * @param aggressive - Forwarded to span collection.
 * @param confidenceThreshold - Forwarded to span collection.
 * @returns A tuple of the reconstructed text and one entity record per
 *   resolved span, in the same order as the resolved spans.
 */
protected async _tier1Regex(
  text: string,
  encodeFn: (val: string, options?: any) => Promise<string>,
  boostEntities: Set<string>,
  aggressive: boolean,
  confidenceThreshold: number,
): Promise<[string, any[]]> {
  const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
  const resolved = resolveOverlaps(spans);
  // Take the records from Promise.all's result array instead of pushing from
  // inside the callbacks: the result array follows input order, whereas
  // push-on-completion ordered entities by whichever encode finished first.
  const entities: any[] = await Promise.all(resolved.map(async (span) => {
    span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
    return {
      type: span.entityType,
      value: span.originalValue,
      method: span.method,
      confidence: span.confidence,
      masked_value: span.maskedValue,
    };
  }));
  return [reconstruct(text, resolved), entities];
}
245
236
 
246
237
  protected async _tier2Nlp(
247
238
  text: string,
248
- encodeFn: (val: string) => Promise<string>,
239
+ encodeFn: (val: string, options?: any) => Promise<string>,
249
240
  boostEntities: Set<string>,
250
241
  aggressive: boolean,
251
242
  confidenceThreshold: number,
252
243
  ): Promise<[string, any[]]> {
253
- /**
254
- * Base implementation is a no-op. Override in LocalTransformersScanner
255
- * to enable NLP-based detection.
256
- */
257
244
  return [text, []];
258
245
  }
259
246
 
@@ -270,7 +257,7 @@ export class BaseScanner {
270
257
  async scanAndTokenize(
271
258
  text: string,
272
259
  options: {
273
- encodeFn?: (val: string) => Promise<string>;
260
+ encodeFn?: (val: string, options?: any) => Promise<string>;
274
261
  pipeline?: string[];
275
262
  confidenceThreshold?: number;
276
263
  context?: string | null;
@@ -279,25 +266,30 @@ export class BaseScanner {
279
266
  ): Promise<string> {
280
267
  if (!text || typeof text !== 'string') return text;
281
268
 
282
- const pipeline = options.pipeline || ["dlp", "regex", "checksum", "nlp"];
269
+ const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
283
270
  const _encode = options.encodeFn || encode;
284
271
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
285
272
  const boost = this._resolveBoost(options.context);
286
273
 
287
- let currentText = text;
274
+ // ── Span-accumulation phase (no string mutation) ─────────────────────
275
+ const allSpans: Span[] = [];
288
276
 
289
- // --- Tier 0: DLP Heuristic (multilingual, 50+ types) ---
290
- if (pipeline.includes("dlp")) {
291
- [currentText] = await this._tier0Dlp(currentText, _encode, confidenceThreshold);
277
+ if (pipeline.includes('dlp')) {
278
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
292
279
  }
293
-
294
- // --- Tier 1: Deterministic ---
295
- if (pipeline.includes("regex") || pipeline.includes("checksum")) {
296
- [currentText] = await this._tier1Regex(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
280
+ if (pipeline.includes('regex') || pipeline.includes('checksum')) {
281
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
297
282
  }
298
283
 
299
- // --- Tier 2: Probabilistic ---
300
- if (pipeline.includes("nlp")) {
284
+ // ── Single-pass resolve + reconstruct ────────────────────────────────
285
+ const resolved = resolveOverlaps(allSpans);
286
+ await Promise.all(resolved.map(async (span) => {
287
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
288
+ }));
289
+ let currentText = reconstruct(text, resolved);
290
+
291
+ // ── Tier 2: Probabilistic NLP (on already-masked text) ───────────────
292
+ if (pipeline.includes('nlp')) {
301
293
  [currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
302
294
  }
303
295
 
@@ -307,7 +299,7 @@ export class BaseScanner {
307
299
  async scanAndReturnEntities(
308
300
  text: string,
309
301
  options: {
310
- encodeFn?: (val: string) => Promise<string>;
302
+ encodeFn?: (val: string, options?: any) => Promise<string>;
311
303
  pipeline?: string[];
312
304
  confidenceThreshold?: number;
313
305
  context?: string | null;
@@ -316,30 +308,33 @@ export class BaseScanner {
316
308
  ): Promise<any[]> {
317
309
  if (!text || typeof text !== 'string') return [];
318
310
 
319
- const pipeline = options.pipeline || ["dlp", "regex", "checksum", "nlp"];
311
+ const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
320
312
  const _encode = options.encodeFn || encode;
321
313
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
322
314
  const boost = this._resolveBoost(options.context);
323
- let allEntities: any[] = [];
324
- let remaining = text;
325
-
326
- // --- Tier 0: DLP Heuristic ---
327
- if (pipeline.includes("dlp")) {
328
- const [newText, tier0] = await this._tier0Dlp(remaining, _encode, confidenceThreshold);
329
- remaining = newText;
330
- allEntities.push(...tier0);
331
- }
315
+ const allEntities: any[] = [];
332
316
 
333
- // --- Tier 1: Deterministic ---
334
- if (pipeline.includes("regex") || pipeline.includes("checksum")) {
335
- const [newText, tier1] = await this._tier1Regex(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
336
- remaining = newText;
337
- allEntities.push(...tier1);
317
+ // ── Span-accumulation phase ──────────────────────────────────────────
318
+ const allSpans: Span[] = [];
319
+ if (pipeline.includes('dlp')) {
320
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
321
+ }
322
+ if (pipeline.includes('regex') || pipeline.includes('checksum')) {
323
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
338
324
  }
339
325
 
340
- // --- Tier 2: Probabilistic ---
341
- if (pipeline.includes("nlp")) {
342
- const [_newText, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
326
+ const resolved = resolveOverlaps(allSpans);
327
+ await Promise.all(resolved.map(async (span) => {
328
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
329
+ allEntities.push({ type: span.entityType, value: span.originalValue,
330
+ method: span.method, confidence: span.confidence,
331
+ masked_value: span.maskedValue, language: span.language });
332
+ }));
333
+
334
+ const remaining = reconstruct(text, resolved);
335
+
336
+ if (pipeline.includes('nlp')) {
337
+ const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
343
338
  allEntities.push(...tier2);
344
339
  }
345
340