mask-privacy 3.0.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -17
- package/dist/index.d.mts +58 -27
- package/dist/index.d.ts +58 -27
- package/dist/index.js +394 -310
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +394 -310
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/core/dlp/assessor.ts +3 -26
- package/src/core/dlp/handlers.ts +44 -31
- package/src/core/dlp/index.ts +0 -2
- package/src/core/dlp/registry.ts +119 -107
- package/src/core/dlp/scorer.ts +4 -4
- package/src/core/fpe.ts +85 -32
- package/src/core/fpe_utils.ts +20 -20
- package/src/core/scanner.ts +146 -151
- package/src/core/span.ts +76 -0
- package/src/core/transformers_scanner.ts +2 -2
- package/src/core/vault.ts +2 -1
- package/tests/async.test.ts +2 -2
- package/tests/dlp_hardened.test.ts +21 -0
- package/tests/fpe.test.ts +4 -4
- package/tests/hooks.test.ts +2 -2
- package/tests/langchain.test.ts +2 -2
- package/tests/llamaindex.test.ts +1 -1
- package/tests/scanner.test.ts +0 -1
- package/tests/substring.test.ts +1 -1
- package/tests/vault.test.ts +1 -1
package/src/core/fpe.ts
CHANGED
|
@@ -10,7 +10,6 @@ import * as crypto from 'crypto';
|
|
|
10
10
|
import { config } from '../config';
|
|
11
11
|
import { getKeyProvider } from './key_provider';
|
|
12
12
|
import { MaskSecurityError } from './exceptions';
|
|
13
|
-
import { looksLikeToken } from './fpe_utils';
|
|
14
13
|
|
|
15
14
|
// Master key management
|
|
16
15
|
|
|
@@ -52,13 +51,12 @@ export function resetMasterKey(): void {
|
|
|
52
51
|
// Detectors — order matters: first match wins
|
|
53
52
|
|
|
54
53
|
const _EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
|
|
55
|
-
const _PHONE_RE =
|
|
54
|
+
/**
 * Whole-string matcher for North-American style phone numbers, used when
 * auto-detecting the entity type of a single value.
 *
 * Accepts an optional "+", an optional leading "1", and 3-3-4 digit groups
 * separated by whitespace, "-" or "." (area code optionally parenthesised),
 * or a bare 7-digit local number. The pattern is anchored with ^ and $ like
 * the other detectors in this list, so a sentence that merely contains a
 * phone-like substring is not classified as a phone number.
 */
const _PHONE_RE = /^(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})$/;
|
|
55
|
+
const _PHONE_INTL_RE = /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/;
|
|
56
56
|
const _SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
|
|
57
57
|
const _CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
|
|
58
58
|
const _ROUTING_RE = /^\d{9}$/;
|
|
59
|
-
const
|
|
60
|
-
const _SAUDI_NID_RE = /^1\d{9}$/;
|
|
61
|
-
const _UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
|
|
59
|
+
const _ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
|
|
62
60
|
const _IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
|
|
63
61
|
|
|
64
62
|
// Deterministic helpers (HMAC-based)
|
|
@@ -96,54 +94,109 @@ async function _hmacDigits(plaintext: string, n: number, offset: number = 0): Pr
|
|
|
96
94
|
|
|
97
95
|
// Private helpers for token generation
|
|
98
96
|
|
|
97
|
+
// Word lists from which deterministic surrogate values are picked for PERSON, LOCATION and ORGANIZATION entities
|
|
98
|
+
const _FIRST_NAMES = ["Taylor", "Jordan", "Casey", "Morgan", "Riley", "Avery", "Rowan", "Quinn", "Charlie", "Peyton", "Blake", "Dakota", "Reese", "Skyler", "Finley", "Eden", "Harley", "Rory", "Emerson", "Remi"];
|
|
99
|
+
const _LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"];
|
|
100
|
+
const _CITIES = ["London", "Paris", "Berlin", "Tokyo", "Rome", "Madrid", "Vienna", "Sydney", "Toronto", "Chicago", "Seattle", "Austin", "Boston", "Denver", "Dallas", "Miami", "Seoul", "Dubai", "Mumbai", "Cairo"];
|
|
101
|
+
|
|
102
|
+
/**
 * Select one entry of `array` based on `plaintext`.
 *
 * Eight decimal digits are requested from `_hmacDigits` for the plaintext,
 * read as a base-10 integer and reduced modulo the array length to obtain
 * the index. The choice is therefore as repeatable as `_hmacDigits` itself.
 *
 * @param plaintext Value that drives the selection.
 * @param array Candidate strings; callers in this file pass non-empty lists.
 * @returns The entry at the derived index.
 */
async function _pickFromArray(plaintext: string, array: string[]): Promise<string> {
    const index = Number.parseInt(await _hmacDigits(plaintext, 8), 10) % array.length;
    return array[index];
}
|
|
108
|
+
|
|
109
|
+
/**
 * Compute the Luhn check digit to append to `partialNum`.
 *
 * Walking the payload from its rightmost digit, every other digit (starting
 * with the rightmost) is doubled, with 9 subtracted when the doubled value
 * exceeds 9. The check digit is whatever brings the total to a multiple of 10.
 *
 * @param partialNum Payload digits without the check digit. Spaces and
 *   hyphens are ignored so grouped numbers such as "4000-0000-0000-123" work.
 * @returns A single character "0" to "9".
 * @throws RangeError if the payload contains anything other than digits,
 *   whitespace or hyphens. Previously whitespace was silently counted as a
 *   zero digit (shifting the doubling parity) and other characters produced
 *   the string "NaN".
 */
function _computeLuhnDigit(partialNum: string): string {
    // Drop grouping separators before any positional arithmetic.
    const payload = partialNum.replace(/[\s-]/g, "");
    if (!/^\d*$/.test(payload)) {
        throw new RangeError("Luhn payload must contain only digits, spaces or hyphens");
    }

    let sum = 0;
    // The rightmost payload digit sits next to the (future) check digit, so it is doubled.
    let shouldDouble = true;
    for (let i = payload.length - 1; i >= 0; i--) {
        let digit = payload.charCodeAt(i) - 48; // ASCII "0" is 48
        if (shouldDouble) {
            digit *= 2;
            if (digit > 9) digit -= 9;
        }
        sum += digit;
        shouldDouble = !shouldDouble;
    }
    return ((10 - (sum % 10)) % 10).toString();
}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
/**
 * Return the control letter for the numeric part of a Spanish ID token.
 *
 * The letter is looked up in a fixed 23-character table at index `num % 23`.
 *
 * @param num Non-negative integer formed by the ID's digits.
 * @returns One uppercase letter from the table.
 * @throws RangeError if `num` is negative, fractional, NaN or infinite.
 *   Such values previously indexed outside the table and returned
 *   `undefined`, which callers would concatenate as the text "undefined".
 */
function _computeEsIdCheck(num: number): string {
    if (!Number.isInteger(num) || num < 0) {
        throw new RangeError("Spanish ID number must be a non-negative integer");
    }
    return "TRWAGMYFPDXBNJZSQVHLCKE".charAt(num % 23);
}
|
|
131
|
+
|
|
132
|
+
// Public API
|
|
133
|
+
|
|
99
134
|
/**
|
|
100
|
-
* Return a **deterministic**, format-preserving token for rawText.
|
|
135
|
+
* Return a **deterministic**, format-preserving token for rawText using its entityType.
|
|
101
136
|
*/
|
|
102
|
-
export async function generateFPEToken(rawText: string): Promise<string> {
|
|
137
|
+
export async function generateFPEToken(rawText: string, entityType: string = 'UNKNOWN'): Promise<string> {
|
|
103
138
|
const text = rawText.trim();
|
|
139
|
+
let type = (entityType || "UNKNOWN").toUpperCase();
|
|
140
|
+
|
|
141
|
+
if (type === "UNKNOWN") {
|
|
142
|
+
if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
|
|
143
|
+
else if (_SSN_RE.test(text)) type = "US_SSN";
|
|
144
|
+
else if (_CC_RE.test(text)) type = "CREDIT_CARD";
|
|
145
|
+
else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
|
|
146
|
+
else if (_ES_ID_RE.test(text)) type = "ES_DNI";
|
|
147
|
+
else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
|
|
148
|
+
else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
|
|
149
|
+
}
|
|
104
150
|
|
|
105
|
-
if (
|
|
106
|
-
|
|
151
|
+
if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
|
|
152
|
+
const parts = text.split("@");
|
|
153
|
+
const domain = parts.length === 2 ? parts[1] : "email.com";
|
|
154
|
+
return `tkn-${await _hmacHex(text)}@${domain}`;
|
|
107
155
|
}
|
|
108
156
|
|
|
109
|
-
if (
|
|
110
|
-
|
|
157
|
+
if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
|
|
158
|
+
const m = text.match(/^\+([1-9]\d{0,3})/);
|
|
159
|
+
const cc = m ? m[1] : "1";
|
|
160
|
+
return `+${cc}-555-${await _hmacDigits(text, 7)}`;
|
|
111
161
|
}
|
|
112
162
|
|
|
113
|
-
if (
|
|
163
|
+
if (type === "US_SSN") {
|
|
114
164
|
return `000-00-${await _hmacDigits(text, 4)}`;
|
|
115
165
|
}
|
|
116
166
|
|
|
117
|
-
if (
|
|
118
|
-
|
|
167
|
+
if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
|
|
168
|
+
const base = `400000000000${await _hmacDigits(text, 3)}`;
|
|
169
|
+
const checkDig = _computeLuhnDigit(base);
|
|
170
|
+
const full = base + checkDig;
|
|
171
|
+
return `${full.slice(0,4)}-${full.slice(4,8)}-${full.slice(8,12)}-${full.slice(12,16)}`;
|
|
119
172
|
}
|
|
120
173
|
|
|
121
|
-
if (
|
|
174
|
+
if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
|
|
122
175
|
return `000000${await _hmacDigits(text, 3)}`;
|
|
123
176
|
}
|
|
124
177
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
let lastD = parseInt(tail[tail.length - 1], 10);
|
|
129
|
-
if (lastD % 2 !== 0) lastD = (lastD + 1) % 10;
|
|
130
|
-
return `990000${tail.slice(0, 4)}${lastD}`;
|
|
178
|
+
if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
|
|
179
|
+
const countryCode = (text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2))) ? text.slice(0, 2).toUpperCase() : "US";
|
|
180
|
+
return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
|
|
131
181
|
}
|
|
132
182
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
return
|
|
183
|
+
if (type === "ES_DNI") {
|
|
184
|
+
const digits = `000${await _hmacDigits(text, 5)}`;
|
|
185
|
+
return digits + _computeEsIdCheck(parseInt(digits, 10));
|
|
136
186
|
}
|
|
137
187
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
188
|
+
if (type === "PERSON" || type === "PERSON_NAME") {
|
|
189
|
+
const f = await _pickFromArray(text, _FIRST_NAMES);
|
|
190
|
+
const l = await _pickFromArray(text + "last", _LAST_NAMES);
|
|
191
|
+
return `<PER:${f}_${l}>`;
|
|
141
192
|
}
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
193
|
+
if (type === "LOCATION" || type === "PHYS_ADDRESS") {
|
|
194
|
+
const c = await _pickFromArray(text, _CITIES);
|
|
195
|
+
return `<LOC:${c}>`;
|
|
196
|
+
}
|
|
197
|
+
if (type === "ORGANIZATION") {
|
|
198
|
+
const c = await _pickFromArray(text, _LAST_NAMES);
|
|
199
|
+
return `<ORG:${c}_Inc>`;
|
|
147
200
|
}
|
|
148
201
|
|
|
149
202
|
return `[TKN-${await _hmacHex(text)}]`;
|
package/src/core/fpe_utils.ts
CHANGED
|
@@ -10,15 +10,14 @@
|
|
|
10
10
|
* Used for sub-string detokenization (finding tokens inside paragraphs).
|
|
11
11
|
*/
|
|
12
12
|
export const TOKEN_PATTERN = new RegExp(
|
|
13
|
-
"tkn-[a-f0-9]{8,64}@
|
|
14
|
-
"|\\+1-555-\\d{7}" + // Phone
|
|
13
|
+
"tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}" + // Email
|
|
14
|
+
"|\\+[1-9]\\d{0,3}-555-\\d{7}" + // Phone
|
|
15
15
|
"|000-00-\\d{4}" + // SSN
|
|
16
16
|
"|4000-0000-0000-\\d{4}" + // CC
|
|
17
17
|
"|000000\\d{3}" + // Routing
|
|
18
|
-
"|
|
|
19
|
-
"|100000\\d{4}" + // Saudi NID token
|
|
20
|
-
"|784-0000-\\d{7}-\\d" + // UAE EID token
|
|
18
|
+
"|000\\d{5}[A-Z]" + // Spanish DNI token
|
|
21
19
|
"|[A-Z]{2}00[A-F0-9]{4,16}" + // IBAN token
|
|
20
|
+
"|<(?:PER|LOC|ORG):[^>]+>" + // NLP Semantic tokens
|
|
22
21
|
"|\\[TKN-[a-f0-9]{8,64}\\]", // Opaque
|
|
23
22
|
"g"
|
|
24
23
|
);
|
|
@@ -30,13 +29,16 @@ export function looksLikeToken(value: string | any): boolean {
|
|
|
30
29
|
if (typeof value !== 'string') return false;
|
|
31
30
|
const v = value.trim();
|
|
32
31
|
|
|
33
|
-
// Email tokens: tkn-<hex>@
|
|
34
|
-
if (v.startsWith("tkn-") && v.includes("@
|
|
35
|
-
|
|
32
|
+
// Email tokens: tkn-<hex>@domain.com
|
|
33
|
+
if (v.startsWith("tkn-") && v.includes("@")) {
|
|
34
|
+
const parts = v.split("@");
|
|
35
|
+
if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
|
|
36
|
+
return true;
|
|
37
|
+
}
|
|
36
38
|
}
|
|
37
39
|
|
|
38
|
-
// Phone tokens: +
|
|
39
|
-
if (
|
|
40
|
+
// Phone tokens: +CC-555-XXXXXXX
|
|
41
|
+
if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v)) {
|
|
40
42
|
return true;
|
|
41
43
|
}
|
|
42
44
|
|
|
@@ -55,18 +57,10 @@ export function looksLikeToken(value: string | any): boolean {
|
|
|
55
57
|
return true;
|
|
56
58
|
}
|
|
57
59
|
|
|
58
|
-
// UAE Emirates ID tokens: 784-0000-XXXXXXX-X
|
|
59
|
-
if (v.startsWith("784-0000-") && v.length === 18) {
|
|
60
|
-
return true;
|
|
61
|
-
}
|
|
62
60
|
|
|
63
|
-
// Turkish TCID tokens: 990000XXXX(even)
|
|
64
|
-
if (v.length === 11 && v.startsWith("990000") && /^\d+$/.test(v) && parseInt(v[v.length - 1], 10) % 2 === 0) {
|
|
65
|
-
return true;
|
|
66
|
-
}
|
|
67
61
|
|
|
68
|
-
//
|
|
69
|
-
if (v.length ===
|
|
62
|
+
// Spanish ID tokens: 000XXXXX[A-Z]
|
|
63
|
+
if (v.length === 9 && v.startsWith("000") && /[A-Z]$/.test(v)) {
|
|
70
64
|
return true;
|
|
71
65
|
}
|
|
72
66
|
|
|
@@ -75,6 +69,11 @@ export function looksLikeToken(value: string | any): boolean {
|
|
|
75
69
|
return true;
|
|
76
70
|
}
|
|
77
71
|
|
|
72
|
+
// Semantic NLP tokens: <PER:Taylor_Morgan>
|
|
73
|
+
if (/^<(PER|LOC|ORG):[^>]+>$/.test(v)) {
|
|
74
|
+
return true;
|
|
75
|
+
}
|
|
76
|
+
|
|
78
77
|
// Opaque fallback tokens: [TKN-<hex>]
|
|
79
78
|
if (v.startsWith("[TKN-") && v.endsWith("]")) {
|
|
80
79
|
return true;
|
|
@@ -83,3 +82,4 @@ export function looksLikeToken(value: string | any): boolean {
|
|
|
83
82
|
return false;
|
|
84
83
|
}
|
|
85
84
|
|
|
85
|
+
|
package/src/core/scanner.ts
CHANGED
|
@@ -19,6 +19,7 @@ import { LanguageContextResolver } from './dlp/assessor';
|
|
|
19
19
|
import { DLPPatternRegistry } from './dlp/registry';
|
|
20
20
|
import { DLPValidationEngine } from './dlp/handlers';
|
|
21
21
|
import { DLPConfidenceScorer } from './dlp/scorer';
|
|
22
|
+
import { Span, resolveOverlaps, reconstruct } from './span';
|
|
22
23
|
|
|
23
24
|
// Module-level DLP singletons (created once, reused for all scans)
|
|
24
25
|
const _dlpLanguageResolver = new LanguageContextResolver();
|
|
@@ -29,11 +30,11 @@ const _dlpConfidenceScorer = new DLPConfidenceScorer();
|
|
|
29
30
|
/** Regex patterns for Tier 1 deterministic detection */
|
|
30
31
|
export const REGEX_PATTERNS: Record<string, RegExp> = {
|
|
31
32
|
"EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
|
|
32
|
-
"PHONE_NUMBER":
|
|
33
|
-
"PHONE_NUMBER_INTL":
|
|
34
|
-
"US_SSN":
|
|
35
|
-
"CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
|
|
36
|
-
"US_ROUTING_NUMBER":
|
|
33
|
+
"PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
|
|
34
|
+
"PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
|
|
35
|
+
"US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
|
|
36
|
+
"CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
|
|
37
|
+
"US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
|
|
37
38
|
"US_PASSPORT": /\b[A-Z]\d{8}\b/g,
|
|
38
39
|
"DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g,
|
|
39
40
|
};
|
|
@@ -83,177 +84,163 @@ export class BaseScanner {
|
|
|
83
84
|
return checksum % 10 === 0;
|
|
84
85
|
}
|
|
85
86
|
|
|
86
|
-
protected async
|
|
87
|
+
protected async _tier0CollectSpans(
|
|
87
88
|
text: string,
|
|
88
|
-
encodeFn: (val: string) => Promise<string>,
|
|
89
89
|
confidenceThreshold: number,
|
|
90
|
-
): Promise<[
|
|
90
|
+
): Promise<Span[]> {
|
|
91
91
|
const detectedLanguage = _dlpLanguageResolver.resolve(text);
|
|
92
|
+
const spans: Span[] = [];
|
|
93
|
+
const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
|
|
92
94
|
|
|
93
|
-
|
|
94
|
-
const
|
|
95
|
-
|
|
96
|
-
// Pass 1: Structured patterns from the registry
|
|
97
|
-
for (const [typeTag, descriptor] of _dlpPatternRegistry.iterDescriptors()) {
|
|
98
|
-
const re = new RegExp(descriptor.compiledRe.source, descriptor.compiledRe.flags);
|
|
95
|
+
// Pass 1: Category Mega-Regexes (O(text) per category bucket)
|
|
96
|
+
for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
|
|
97
|
+
const megaRe = new RegExp(re.source, re.flags);
|
|
99
98
|
let m: RegExpExecArray | null;
|
|
100
|
-
while ((m =
|
|
99
|
+
while ((m = megaRe.exec(text)) !== null) {
|
|
100
|
+
// Identify which named group matched
|
|
101
|
+
const groups = m.groups ?? {};
|
|
102
|
+
let typeTag: string | undefined;
|
|
103
|
+
for (const name of typeOrder) {
|
|
104
|
+
if (groups[name] !== undefined) { typeTag = name; break; }
|
|
105
|
+
}
|
|
106
|
+
if (!typeTag) continue;
|
|
101
107
|
const matchedStr = m[0];
|
|
102
108
|
if (looksLikeToken(matchedStr)) continue;
|
|
109
|
+
const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
|
|
110
|
+
if (!descriptor) continue;
|
|
111
|
+
|
|
103
112
|
const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
113
|
+
|
|
114
|
+
let conf: number;
|
|
115
|
+
// FUZZY FAIL-SAFE logic
|
|
116
|
+
if (validatorResult === false) {
|
|
117
|
+
if (descriptor.isHighEntropy) {
|
|
118
|
+
conf = 0.85; // Boosted to prioritize over generic types
|
|
119
|
+
} else {
|
|
120
|
+
continue;
|
|
121
|
+
}
|
|
122
|
+
} else {
|
|
123
|
+
conf = _dlpConfidenceScorer.score({
|
|
124
|
+
baseRisk: descriptor.baseRisk,
|
|
125
|
+
matchStart: m.index,
|
|
126
|
+
matchEnd: m.index + matchedStr.length,
|
|
127
|
+
fullText: text,
|
|
128
|
+
proximityTerms: descriptor.proximityTerms,
|
|
129
|
+
validatorPassed: validatorResult,
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
|
|
112
133
|
if (conf >= confidenceThreshold) {
|
|
113
|
-
|
|
134
|
+
spans.push({ start: m.index, end: m.index + matchedStr.length,
|
|
135
|
+
entityType: typeTag, originalValue: matchedStr,
|
|
136
|
+
confidence: conf, method: 'dlp_heuristic', language: detectedLanguage });
|
|
114
137
|
}
|
|
115
138
|
}
|
|
116
139
|
}
|
|
117
140
|
|
|
118
|
-
// Pass 2: Locale-tuned name patterns
|
|
119
|
-
const nameProximity = new Set([
|
|
141
|
+
// Pass 2: Locale-tuned name patterns (JIT)
|
|
142
|
+
const nameProximity = new Set(['name', 'contact', 'person', 'nom', 'isim', '\u0627\u0633\u0645']);
|
|
120
143
|
for (const nameRe of _dlpPatternRegistry.namePatternsFor(detectedLanguage)) {
|
|
121
144
|
const re = new RegExp(nameRe.source, nameRe.flags);
|
|
122
145
|
let m: RegExpExecArray | null;
|
|
123
146
|
while ((m = re.exec(text)) !== null) {
|
|
124
147
|
if (looksLikeToken(m[0])) continue;
|
|
125
148
|
const conf = _dlpConfidenceScorer.score({
|
|
126
|
-
baseRisk: 0.50,
|
|
127
|
-
|
|
128
|
-
matchEnd: m.index + m[0].length,
|
|
129
|
-
fullText: text,
|
|
130
|
-
proximityTerms: nameProximity,
|
|
131
|
-
validatorPassed: null,
|
|
149
|
+
baseRisk: 0.50, matchStart: m.index, matchEnd: m.index + m[0].length,
|
|
150
|
+
fullText: text, proximityTerms: nameProximity, validatorPassed: null,
|
|
132
151
|
});
|
|
133
152
|
if (conf >= confidenceThreshold) {
|
|
134
|
-
|
|
153
|
+
spans.push({ start: m.index, end: m.index + m[0].length,
|
|
154
|
+
entityType: 'PERSON_NAME', originalValue: m[0],
|
|
155
|
+
confidence: conf, method: 'dlp_heuristic', language: detectedLanguage });
|
|
135
156
|
}
|
|
136
157
|
}
|
|
137
158
|
}
|
|
138
159
|
|
|
139
|
-
// Pass 3: Locale-tuned address patterns
|
|
160
|
+
// Pass 3: Locale-tuned address patterns (JIT)
|
|
140
161
|
for (const addrRe of _dlpPatternRegistry.addressPatternsFor(detectedLanguage)) {
|
|
141
162
|
const re = new RegExp(addrRe.source, addrRe.flags);
|
|
142
163
|
let m: RegExpExecArray | null;
|
|
143
164
|
while ((m = re.exec(text)) !== null) {
|
|
144
165
|
if (looksLikeToken(m[0])) continue;
|
|
145
|
-
|
|
166
|
+
spans.push({ start: m.index, end: m.index + m[0].length,
|
|
167
|
+
entityType: 'PHYS_ADDRESS', originalValue: m[0],
|
|
168
|
+
confidence: 0.55, method: 'dlp_heuristic', language: detectedLanguage });
|
|
146
169
|
}
|
|
147
170
|
}
|
|
148
171
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
const deduped: RawHit[] = [];
|
|
152
|
-
let occupiedEnd = -1;
|
|
153
|
-
for (const hit of rawHits) {
|
|
154
|
-
if (hit.start >= occupiedEnd) {
|
|
155
|
-
deduped.push(hit);
|
|
156
|
-
occupiedEnd = hit.end;
|
|
157
|
-
}
|
|
158
|
-
}
|
|
172
|
+
return spans;
|
|
173
|
+
}
|
|
159
174
|
|
|
160
|
-
|
|
175
|
+
/** Backward-compat wrapper — collects spans then single-pass encodes. */
|
|
176
|
+
protected async _tier0Dlp(
|
|
177
|
+
text: string,
|
|
178
|
+
encodeFn: (val: string, options?: any) => Promise<string>,
|
|
179
|
+
confidenceThreshold: number,
|
|
180
|
+
): Promise<[string, any[]]> {
|
|
181
|
+
const spans = await this._tier0CollectSpans(text, confidenceThreshold);
|
|
182
|
+
const resolved = resolveOverlaps(spans);
|
|
161
183
|
const entities: any[] = [];
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
method: "dlp_heuristic",
|
|
170
|
-
confidence: hit.conf,
|
|
171
|
-
masked_value: token,
|
|
172
|
-
language: detectedLanguage,
|
|
173
|
-
});
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
return [excised, entities];
|
|
184
|
+
await Promise.all(resolved.map(async (span) => {
|
|
185
|
+
span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
|
|
186
|
+
entities.push({ type: span.entityType, value: span.originalValue,
|
|
187
|
+
method: span.method, confidence: span.confidence,
|
|
188
|
+
masked_value: span.maskedValue, language: span.language });
|
|
189
|
+
}));
|
|
190
|
+
return [reconstruct(text, resolved), entities];
|
|
177
191
|
}
|
|
178
192
|
|
|
179
|
-
protected async
|
|
193
|
+
protected async _tier1CollectSpans(
|
|
180
194
|
text: string,
|
|
181
|
-
encodeFn: (val: string) => Promise<string>,
|
|
182
195
|
boostEntities: Set<string>,
|
|
183
196
|
aggressive: boolean,
|
|
184
197
|
confidenceThreshold: number,
|
|
185
|
-
): Promise<[
|
|
186
|
-
|
|
187
|
-
let excised = text;
|
|
188
|
-
|
|
189
|
-
let allMatches: any[] = [];
|
|
190
|
-
|
|
198
|
+
): Promise<Span[]> {
|
|
199
|
+
const spans: Span[] = [];
|
|
191
200
|
for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
|
|
192
|
-
// Create a fresh regex for matchAll
|
|
193
201
|
const re = new RegExp(pattern.source, pattern.flags);
|
|
194
|
-
let match;
|
|
202
|
+
let match: RegExpExecArray | null;
|
|
195
203
|
while ((match = re.exec(text)) !== null) {
|
|
196
|
-
|
|
197
|
-
if (
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
if (entityType ===
|
|
201
|
-
|
|
204
|
+
const val = match[0];
|
|
205
|
+
if (looksLikeToken(val)) continue;
|
|
206
|
+
let confidence = (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, ' '))) ? 1.0 : 0.95;
|
|
207
|
+
if (entityType === 'CREDIT_CARD' && BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
|
|
208
|
+
if (entityType === 'US_ROUTING_NUMBER' && !BaseScanner._abaChecksum(val)) continue;
|
|
209
|
+
if (confidence >= confidenceThreshold) {
|
|
210
|
+
spans.push({ start: match.index, end: match.index + val.length,
|
|
211
|
+
entityType, originalValue: val, confidence, method: 'regex' });
|
|
202
212
|
}
|
|
203
|
-
if (entityType === "US_ROUTING_NUMBER" && !BaseScanner._abaChecksum(match[0])) {
|
|
204
|
-
continue;
|
|
205
|
-
}
|
|
206
|
-
allMatches.push({
|
|
207
|
-
start: match.index,
|
|
208
|
-
end: match.index + match[0].length,
|
|
209
|
-
type: entityType,
|
|
210
|
-
value: match[0],
|
|
211
|
-
confidence
|
|
212
|
-
});
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
// Deduplicate overlapping spans — keep the longest match
|
|
217
|
-
allMatches.sort((a, b) => a.start - b.start || (b.end - b.start) - (a.end - a.start));
|
|
218
|
-
let filtered: any[] = [];
|
|
219
|
-
let lastEnd = -1;
|
|
220
|
-
for (const m of allMatches) {
|
|
221
|
-
if (m.start >= lastEnd) {
|
|
222
|
-
filtered.push(m);
|
|
223
|
-
lastEnd = m.end;
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
// Replace from right to left to preserve offsets
|
|
228
|
-
const sortedFiltered = [...filtered].sort((a, b) => b.start - a.start);
|
|
229
|
-
for (const m of sortedFiltered) {
|
|
230
|
-
if (m.confidence >= confidenceThreshold && !looksLikeToken(m.value)) {
|
|
231
|
-
const token = await encodeFn(m.value);
|
|
232
|
-
excised = excised.slice(0, m.start) + token + excised.slice(m.end);
|
|
233
|
-
entities.push({
|
|
234
|
-
type: m.type,
|
|
235
|
-
value: m.value,
|
|
236
|
-
method: "regex",
|
|
237
|
-
confidence: m.confidence,
|
|
238
|
-
masked_value: token,
|
|
239
|
-
});
|
|
240
213
|
}
|
|
241
214
|
}
|
|
215
|
+
return spans;
|
|
216
|
+
}
|
|
242
217
|
|
|
243
|
-
|
|
218
|
+
/**
 * Backward-compat wrapper around the span-based Tier 1 (regex) pipeline.
 *
 * Collects spans with `_tier1CollectSpans`, passes them through
 * `resolveOverlaps`, encodes each remaining span's original value with
 * `encodeFn`, and rebuilds the text with `reconstruct`.
 *
 * @param text Input text to scan.
 * @param encodeFn Async encoder; receives the matched value and `{ entityType }`.
 * @param boostEntities Forwarded unchanged to `_tier1CollectSpans`.
 * @param aggressive Forwarded unchanged to `_tier1CollectSpans`.
 * @param confidenceThreshold Forwarded unchanged to `_tier1CollectSpans`.
 * @returns A tuple of the reconstructed text and one entity record per
 *   resolved span, in the same order as the resolved spans.
 */
protected async _tier1Regex(
    text: string,
    encodeFn: (val: string, options?: any) => Promise<string>,
    boostEntities: Set<string>,
    aggressive: boolean,
    confidenceThreshold: number,
): Promise<[string, any[]]> {
    const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
    const resolved = resolveOverlaps(spans);

    // Encode concurrently, but gather results positionally: Promise.all keeps
    // input order, so the entity list no longer depends on which encodeFn
    // call happens to settle first.
    const maskedValues = await Promise.all(
        resolved.map((span) => encodeFn(span.originalValue, { entityType: span.entityType })),
    );

    const entities: any[] = [];
    resolved.forEach((span, i) => {
        // Set before reconstruct() is called, as in the original code.
        span.maskedValue = maskedValues[i];
        entities.push({
            type: span.entityType,
            value: span.originalValue,
            method: span.method,
            confidence: span.confidence,
            masked_value: span.maskedValue,
        });
    });

    return [reconstruct(text, resolved), entities];
}
|
|
245
236
|
|
|
246
237
|
protected async _tier2Nlp(
|
|
247
238
|
text: string,
|
|
248
|
-
encodeFn: (val: string) => Promise<string>,
|
|
239
|
+
encodeFn: (val: string, options?: any) => Promise<string>,
|
|
249
240
|
boostEntities: Set<string>,
|
|
250
241
|
aggressive: boolean,
|
|
251
242
|
confidenceThreshold: number,
|
|
252
243
|
): Promise<[string, any[]]> {
|
|
253
|
-
/**
|
|
254
|
-
* Base implementation is a no-op. Override in LocalTransformersScanner
|
|
255
|
-
* to enable NLP-based detection.
|
|
256
|
-
*/
|
|
257
244
|
return [text, []];
|
|
258
245
|
}
|
|
259
246
|
|
|
@@ -270,7 +257,7 @@ export class BaseScanner {
|
|
|
270
257
|
async scanAndTokenize(
|
|
271
258
|
text: string,
|
|
272
259
|
options: {
|
|
273
|
-
encodeFn?: (val: string) => Promise<string>;
|
|
260
|
+
encodeFn?: (val: string, options?: any) => Promise<string>;
|
|
274
261
|
pipeline?: string[];
|
|
275
262
|
confidenceThreshold?: number;
|
|
276
263
|
context?: string | null;
|
|
@@ -279,25 +266,30 @@ export class BaseScanner {
|
|
|
279
266
|
): Promise<string> {
|
|
280
267
|
if (!text || typeof text !== 'string') return text;
|
|
281
268
|
|
|
282
|
-
const pipeline = options.pipeline || [
|
|
269
|
+
const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
|
|
283
270
|
const _encode = options.encodeFn || encode;
|
|
284
271
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
285
272
|
const boost = this._resolveBoost(options.context);
|
|
286
273
|
|
|
287
|
-
|
|
274
|
+
// ── Span-accumulation phase (no string mutation) ─────────────────────
|
|
275
|
+
const allSpans: Span[] = [];
|
|
288
276
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
[currentText] = await this._tier0Dlp(currentText, _encode, confidenceThreshold);
|
|
277
|
+
if (pipeline.includes('dlp')) {
|
|
278
|
+
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
292
279
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
if (pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
296
|
-
[currentText] = await this._tier1Regex(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
280
|
+
if (pipeline.includes('regex') || pipeline.includes('checksum')) {
|
|
281
|
+
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
297
282
|
}
|
|
298
283
|
|
|
299
|
-
//
|
|
300
|
-
|
|
284
|
+
// ── Single-pass resolve + reconstruct ────────────────────────────────
|
|
285
|
+
const resolved = resolveOverlaps(allSpans);
|
|
286
|
+
await Promise.all(resolved.map(async (span) => {
|
|
287
|
+
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
288
|
+
}));
|
|
289
|
+
let currentText = reconstruct(text, resolved);
|
|
290
|
+
|
|
291
|
+
// ── Tier 2: Probabilistic NLP (on already-masked text) ───────────────
|
|
292
|
+
if (pipeline.includes('nlp')) {
|
|
301
293
|
[currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
302
294
|
}
|
|
303
295
|
|
|
@@ -307,7 +299,7 @@ export class BaseScanner {
|
|
|
307
299
|
async scanAndReturnEntities(
|
|
308
300
|
text: string,
|
|
309
301
|
options: {
|
|
310
|
-
encodeFn?: (val: string) => Promise<string>;
|
|
302
|
+
encodeFn?: (val: string, options?: any) => Promise<string>;
|
|
311
303
|
pipeline?: string[];
|
|
312
304
|
confidenceThreshold?: number;
|
|
313
305
|
context?: string | null;
|
|
@@ -316,30 +308,33 @@ export class BaseScanner {
|
|
|
316
308
|
): Promise<any[]> {
|
|
317
309
|
if (!text || typeof text !== 'string') return [];
|
|
318
310
|
|
|
319
|
-
const pipeline = options.pipeline || [
|
|
311
|
+
const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
|
|
320
312
|
const _encode = options.encodeFn || encode;
|
|
321
313
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
322
314
|
const boost = this._resolveBoost(options.context);
|
|
323
|
-
|
|
324
|
-
let remaining = text;
|
|
325
|
-
|
|
326
|
-
// --- Tier 0: DLP Heuristic ---
|
|
327
|
-
if (pipeline.includes("dlp")) {
|
|
328
|
-
const [newText, tier0] = await this._tier0Dlp(remaining, _encode, confidenceThreshold);
|
|
329
|
-
remaining = newText;
|
|
330
|
-
allEntities.push(...tier0);
|
|
331
|
-
}
|
|
315
|
+
const allEntities: any[] = [];
|
|
332
316
|
|
|
333
|
-
//
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
317
|
+
// ── Span-accumulation phase ──────────────────────────────────────────
|
|
318
|
+
const allSpans: Span[] = [];
|
|
319
|
+
if (pipeline.includes('dlp')) {
|
|
320
|
+
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
321
|
+
}
|
|
322
|
+
if (pipeline.includes('regex') || pipeline.includes('checksum')) {
|
|
323
|
+
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
338
324
|
}
|
|
339
325
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
326
|
+
const resolved = resolveOverlaps(allSpans);
|
|
327
|
+
await Promise.all(resolved.map(async (span) => {
|
|
328
|
+
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
329
|
+
allEntities.push({ type: span.entityType, value: span.originalValue,
|
|
330
|
+
method: span.method, confidence: span.confidence,
|
|
331
|
+
masked_value: span.maskedValue, language: span.language });
|
|
332
|
+
}));
|
|
333
|
+
|
|
334
|
+
const remaining = reconstruct(text, resolved);
|
|
335
|
+
|
|
336
|
+
if (pipeline.includes('nlp')) {
|
|
337
|
+
const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
343
338
|
allEntities.push(...tier2);
|
|
344
339
|
}
|
|
345
340
|
|