mask-privacy 3.0.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -17
- package/dist/index.d.mts +58 -27
- package/dist/index.d.ts +58 -27
- package/dist/index.js +394 -310
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +394 -310
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/core/dlp/assessor.ts +3 -26
- package/src/core/dlp/handlers.ts +44 -31
- package/src/core/dlp/index.ts +0 -2
- package/src/core/dlp/registry.ts +119 -107
- package/src/core/dlp/scorer.ts +4 -4
- package/src/core/fpe.ts +85 -32
- package/src/core/fpe_utils.ts +20 -20
- package/src/core/scanner.ts +146 -151
- package/src/core/span.ts +76 -0
- package/src/core/transformers_scanner.ts +2 -2
- package/src/core/vault.ts +2 -1
- package/tests/async.test.ts +2 -2
- package/tests/dlp_hardened.test.ts +21 -0
- package/tests/fpe.test.ts +4 -4
- package/tests/hooks.test.ts +2 -2
- package/tests/langchain.test.ts +2 -2
- package/tests/llamaindex.test.ts +1 -1
- package/tests/scanner.test.ts +0 -1
- package/tests/substring.test.ts +1 -1
- package/tests/vault.test.ts +1 -1
package/package.json
CHANGED
package/src/core/dlp/assessor.ts
CHANGED
|
@@ -8,40 +8,17 @@
|
|
|
8
8
|
* Supported language tags:
|
|
9
9
|
* en — English (default / Latin-only fallback)
|
|
10
10
|
* es — Spanish
|
|
11
|
-
* fr — French
|
|
12
|
-
* de — German
|
|
13
|
-
* tr — Turkish
|
|
14
|
-
* ar — Arabic
|
|
15
|
-
* zh — Chinese
|
|
16
|
-
* ja — Japanese
|
|
17
11
|
*/
|
|
18
12
|
|
|
19
|
-
export type LanguageTag =
|
|
20
|
-
| "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
|
|
13
|
+
export type LanguageTag = "en" | "es";
|
|
21
14
|
|
|
22
15
|
/**
|
|
23
16
|
* Ordered array of script signatures — more specific blocks are checked first
|
|
24
17
|
* to avoid misclassification (e.g. ş/ğ/ı for Turkish before generic accented-Latin).
|
|
25
18
|
*/
|
|
26
19
|
const SCRIPT_SIGNATURES: ReadonlyArray<{ tag: LanguageTag; pattern: RegExp }> = [
|
|
27
|
-
// CJK / East-Asian — checked first because they are unambiguous
|
|
28
|
-
{ tag: "zh", pattern: /[\u4e00-\u9fff\u3400-\u4dbf]/g },
|
|
29
|
-
{ tag: "ja", pattern: /[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff]/g },
|
|
30
|
-
|
|
31
|
-
// Arabic script — covers Standard Arabic, Urdu overlap, etc.
|
|
32
|
-
{ tag: "ar", pattern: /[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]/g },
|
|
33
|
-
|
|
34
|
-
// Turkish — distinguished by dotless-i (ı), soft-g (ğ), ş, and cedilla ç
|
|
35
|
-
{ tag: "tr", pattern: /[ğıİşŞ]/g },
|
|
36
|
-
|
|
37
|
-
// German — umlauts and Eszett
|
|
38
|
-
{ tag: "de", pattern: /[äöüÄÖÜß]/g },
|
|
39
|
-
|
|
40
20
|
// Spanish — ñ and inverted punctuation
|
|
41
21
|
{ tag: "es", pattern: /[ñÑ¡¿]/g },
|
|
42
|
-
|
|
43
|
-
// French — cedilla, accented vowels with circumflex / diaeresis
|
|
44
|
-
{ tag: "fr", pattern: /[àâçéèêëïîôùûüÿœæ]/gi },
|
|
45
22
|
];
|
|
46
23
|
|
|
47
24
|
export interface LanguageBreakdown {
|
|
@@ -57,8 +34,8 @@ export interface LanguageBreakdown {
|
|
|
57
34
|
* @example
|
|
58
35
|
* ```ts
|
|
59
36
|
* const resolver = new LanguageContextResolver();
|
|
60
|
-
* const tag = resolver.resolve("
|
|
61
|
-
* // tag === "
|
|
37
|
+
* const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
|
|
38
|
+
* // tag === "es"
|
|
62
39
|
* ```
|
|
63
40
|
*/
|
|
64
41
|
export class LanguageContextResolver {
|
package/src/core/dlp/handlers.ts
CHANGED
|
@@ -146,44 +146,56 @@ export function checkIpv4Octets(raw: string): boolean {
|
|
|
146
146
|
return true;
|
|
147
147
|
}
|
|
148
148
|
|
|
149
|
-
// ──
|
|
149
|
+
// ── Canadian SIN (Luhn-9) ──────────────────────────────────────────────────
|
|
150
150
|
|
|
151
|
-
export function
|
|
152
|
-
const
|
|
153
|
-
if (
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
151
|
+
/**
 * Validate a Canadian Social Insurance Number with the Luhn mod-10 checksum.
 *
 * Every non-digit character in `raw` is discarded before validation, so
 * "046-454-286", "046 454 286" and "046454286" are treated alike.
 *
 * @param raw - Candidate text, with or without separators.
 * @returns `true` when exactly nine digits remain, they are not all zero,
 *          and their Luhn sum is a multiple of 10; otherwise `false`.
 */
export function checkCaSin(raw: string): boolean {
  const digits = raw.replace(/\D/g, "");
  if (digits.length !== 9) return false;

  // An all-zero string trivially satisfies Luhn (sum === 0); treat it as a
  // placeholder rather than a real number to avoid an obvious false positive.
  if (/^0+$/.test(digits)) return false;

  let total = 0;
  for (let idx = 0; idx < digits.length; idx++) {
    // `digits` contains only ASCII 0-9 here, so the char code offset is exact.
    let val = digits.charCodeAt(idx) - 48;
    // With nine digits, the Luhn "every second digit from the right" rule
    // lands on the 2nd, 4th, 6th and 8th digits, i.e. the odd 0-based indexes.
    if (idx % 2 === 1) {
      val *= 2;
      if (val > 9) val -= 9;
    }
    total += val;
  }
  return total % 10 === 0;
}
|
|
157
166
|
|
|
158
|
-
|
|
159
|
-
const evenSum = d[1] + d[3] + d[5] + d[7];
|
|
160
|
-
const computedD10 = ((oddSum * 7 - evenSum) % 10 + 10) % 10;
|
|
161
|
-
if (computedD10 !== d[9]) return false;
|
|
167
|
+
// ── UK National Insurance Number (NINO) ────────────────────────────────────
|
|
162
168
|
|
|
163
|
-
|
|
164
|
-
if (firstTenSum % 10 !== d[10]) return false;
|
|
169
|
+
/**
 * Format of a UK National Insurance number: two prefix letters, six digits,
 * and a suffix letter A-D.
 *
 * - Neither prefix letter may be D, F, I, Q, U or V.
 * - The second prefix letter additionally may not be O.
 * - The prefixes BG, GB, NK, KN, TN, NT and ZZ are excluded.
 */
const UK_NINO_REGEX =
  /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z][0-9]{6}[A-D]$/;

/**
 * Validate the structure of a UK National Insurance number.
 *
 * Plain space characters are removed and the text is upper-cased before the
 * check, so "ab 12 34 56 c" is accepted in the same way as "AB123456C".
 *
 * @param raw - Candidate text, optionally containing spaces.
 * @returns `true` when the cleaned text is nine characters long and matches
 *          the NINO format rules; otherwise `false`.
 */
export function checkUkNino(raw: string): boolean {
  const cleaned = raw.replace(/ /g, "").toUpperCase();
  if (cleaned.length !== 9) return false;
  return UK_NINO_REGEX.test(cleaned);
}
|
|
168
176
|
|
|
169
|
-
// ──
|
|
177
|
+
// ── Spanish DNI/NIE (8 digits + 1 letter) ───────────────────────────────────
|
|
170
178
|
|
|
171
|
-
export function
|
|
172
|
-
const
|
|
173
|
-
if (
|
|
174
|
-
const d = digitsStr.split("").map(Number);
|
|
175
|
-
if (d[0] !== 1) return false;
|
|
179
|
+
/**
 * Validate a Spanish DNI or NIE by its control letter.
 *
 * Whitespace and hyphens are stripped and the text is upper-cased. A DNI is
 * eight digits plus a letter; a NIE starts with X, Y or Z (read as 0, 1, 2),
 * followed by seven digits and a letter. The control letter is looked up by
 * the numeric part modulo 23.
 *
 * @param raw - Candidate text, optionally containing spaces or hyphens.
 * @returns `true` when the cleaned text has nine characters and its final
 *          letter matches the computed control letter; otherwise `false`.
 */
export function checkEsId(raw: string): boolean {
  const normalized = raw.replace(/[\s-]/g, "").toUpperCase();
  if (normalized.length !== 9) return false;

  const niePrefixDigit: Record<string, string> = { X: "0", Y: "1", Z: "2" };
  const lead = normalized.charAt(0);

  let numericPart: string;
  if (/^\d$/.test(lead)) {
    // DNI: the first eight characters are the number itself.
    numericPart = normalized.slice(0, 8);
  } else if (lead in niePrefixDigit) {
    // NIE: substitute the leading letter with its digit, keep the next seven.
    numericPart = niePrefixDigit[lead] + normalized.slice(1, 8);
  } else {
    return false;
  }

  // Any stray non-digit in the numeric part disqualifies the candidate.
  if (/\D/.test(numericPart)) return false;

  const controlLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
  const expected = controlLetters.charAt(Number(numericPart) % 23);
  return normalized.charAt(8) === expected;
}
|
|
188
200
|
|
|
189
201
|
// ── Dispatcher ─────────────────────────────────────────────────────────────
|
|
@@ -198,8 +210,9 @@ const VALIDATOR_DISPATCH: Record<string, ValidatorFn> = {
|
|
|
198
210
|
vin_format: checkVinFormat,
|
|
199
211
|
btc_format: checkBtcFormat,
|
|
200
212
|
ipv4: checkIpv4Octets,
|
|
201
|
-
|
|
202
|
-
|
|
213
|
+
ca_sin: checkCaSin,
|
|
214
|
+
uk_nino: checkUkNino,
|
|
215
|
+
es_id: checkEsId,
|
|
203
216
|
};
|
|
204
217
|
|
|
205
218
|
/**
|
package/src/core/dlp/index.ts
CHANGED
package/src/core/dlp/registry.ts
CHANGED
|
@@ -33,40 +33,20 @@ export interface PatternDescriptor {
|
|
|
33
33
|
baseRisk: number;
|
|
34
34
|
category: SensitiveCategory;
|
|
35
35
|
validatorTag: string | null;
|
|
36
|
+
isHighEntropy: boolean;
|
|
37
|
+
supportedLocales: string[];
|
|
36
38
|
}
|
|
37
39
|
|
|
38
40
|
// ── Locale-specific auxiliary patterns ──────────────────────────────────────
|
|
39
41
|
|
|
40
42
|
export const LOCALE_NAME_RULES: Record<string, RegExp[]> = {
|
|
41
43
|
en: [
|
|
42
|
-
/\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b/g,
|
|
43
|
-
/\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\b/g,
|
|
44
|
+
/\b[A-Z][a-z\-\']+ [A-Z][a-z\-\']+(?:\s+[A-Z][a-z\-\']+)?\b/g,
|
|
45
|
+
/\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z\-\']+\b/g,
|
|
44
46
|
],
|
|
45
47
|
es: [
|
|
46
|
-
/\b[A-Z][a-
|
|
47
|
-
/\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-
|
|
48
|
-
],
|
|
49
|
-
fr: [
|
|
50
|
-
/\b[A-Z][a-zàâçéèêëïîôùûü]+ [A-Z][a-zàâçéèêëïîôùûü]+\b/g,
|
|
51
|
-
/\b(?:M|Mme|Mlle)\.?\s+[A-Z][a-zàâçéèêëïîôùûü]+\b/g,
|
|
52
|
-
],
|
|
53
|
-
de: [
|
|
54
|
-
/\b[A-Z][a-zäöüß]+ [A-Z][a-zäöüß]+\b/g,
|
|
55
|
-
/\b(?:Herr|Frau)\.?\s+[A-Z][a-zäöüß]+\b/g,
|
|
56
|
-
],
|
|
57
|
-
tr: [
|
|
58
|
-
/\b[A-ZÇĞİÖŞÜ][a-zçğıöşü]+ [A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
|
|
59
|
-
/\b(?:Bay|Bayan|Sayın)\.?\s+[A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
|
|
60
|
-
],
|
|
61
|
-
ar: [
|
|
62
|
-
/[\u0621-\u064a][\u0600-\u06ff]+ [\u0621-\u064a][\u0600-\u06ff]+/g,
|
|
63
|
-
/(?:أبو|أم|ابن|بنت)\s+[\u0621-\u064a][\u0600-\u06ff]+/gi,
|
|
64
|
-
],
|
|
65
|
-
ja: [
|
|
66
|
-
/\b[A-Z][a-z]+(?:moto|yama|kawa|mura|ta|da|shi|no)\s+[A-Z][a-z]+\b/g,
|
|
67
|
-
],
|
|
68
|
-
zh: [
|
|
69
|
-
/\b[A-Z][a-z]{1,3}\s+[A-Z][a-z]+\b/g,
|
|
48
|
+
/\b[A-Z][a-záéíóúñ\-\']+ [A-Z][a-záéíóúñ\-\']+(?:\s+[A-Z][a-záéíóúñ\-\']+)?\b/g,
|
|
49
|
+
/\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ\-\']+\b/g,
|
|
70
50
|
],
|
|
71
51
|
};
|
|
72
52
|
|
|
@@ -75,26 +55,8 @@ export const LOCALE_ADDRESS_RULES: Record<string, RegExp[]> = {
|
|
|
75
55
|
/\b\d{1,5}\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b/g,
|
|
76
56
|
/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?\b/g,
|
|
77
57
|
],
|
|
78
|
-
|
|
79
|
-
/\b
|
|
80
|
-
],
|
|
81
|
-
de: [
|
|
82
|
-
/\b[A-ZÄÖÜa-zäöüß]+(?:straße|strasse|weg|gasse|platz)\s+\d{1,4}\b/g,
|
|
83
|
-
],
|
|
84
|
-
tr: [
|
|
85
|
-
/\b[A-ZÇĞİÖŞÜa-zçğıöşü]+\s+(?:Cad|Sok|Mah)\.?\s+/gi,
|
|
86
|
-
/\b\d{5}\s+[A-ZÇĞİÖŞÜa-zçğıöşü]+\/[A-ZÇĞİÖŞÜa-zçğıöşü]+\b/g,
|
|
87
|
-
],
|
|
88
|
-
ar: [
|
|
89
|
-
/شارع\s+[\u0600-\u06ff]+/g,
|
|
90
|
-
/حي\s+[\u0600-\u06ff]+/g,
|
|
91
|
-
/(?:ص\.ب|P\.?O\.?\s*Box)\s*\d{3,6}/gi,
|
|
92
|
-
],
|
|
93
|
-
uk_postcode: [
|
|
94
|
-
/\b[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\b/g,
|
|
95
|
-
],
|
|
96
|
-
ca_postal: [
|
|
97
|
-
/\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b/g,
|
|
58
|
+
es: [
|
|
59
|
+
/\b(?:Calle|Carrera|Avenida|Paseo|Plaza)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi,
|
|
98
60
|
],
|
|
99
61
|
};
|
|
100
62
|
|
|
@@ -102,117 +64,106 @@ export const LOCALE_ADDRESS_RULES: Record<string, RegExp[]> = {
|
|
|
102
64
|
|
|
103
65
|
type RawEntry = [
|
|
104
66
|
typeName: string,
|
|
105
|
-
|
|
106
|
-
flags: string,
|
|
67
|
+
regexSource: string | RegExp,
|
|
107
68
|
terms: string[],
|
|
108
69
|
risk: number,
|
|
109
|
-
|
|
110
|
-
|
|
70
|
+
category: SensitiveCategory,
|
|
71
|
+
validatorTag: string | null,
|
|
72
|
+
isHighEntropy?: boolean,
|
|
73
|
+
supportedLocales?: string[],
|
|
111
74
|
];
|
|
112
75
|
|
|
113
76
|
const RAW_PATTERNS: RawEntry[] = [
|
|
114
77
|
// ── FINANCIAL ──────────────────────────────────────────────────────
|
|
115
|
-
["US_SSN", "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
|
|
78
|
+
["US_SSN", "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
|
|
116
79
|
["ssn", "social security", "tax id", "taxpayer"], 0.95, SensitiveCategory.FINANCIAL, "ssn_area"],
|
|
117
80
|
|
|
118
|
-
["CREDIT_CARD_NUMBER", "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
|
|
81
|
+
["CREDIT_CARD_NUMBER", "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
|
|
119
82
|
["card", "credit", "visa", "mastercard", "amex", "payment"], 0.97, SensitiveCategory.FINANCIAL, "luhn"],
|
|
120
83
|
|
|
121
|
-
["INTL_BANK_IBAN", "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
|
|
84
|
+
["INTL_BANK_IBAN", "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
|
|
122
85
|
["iban", "swift", "sepa", "wire", "bank transfer"], 0.96, SensitiveCategory.FINANCIAL, "iban"],
|
|
123
86
|
|
|
124
|
-
["CRYPTO_BTC", "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
|
|
87
|
+
["CRYPTO_BTC", "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
|
|
125
88
|
["bitcoin", "btc", "wallet", "crypto"], 0.94, SensitiveCategory.FINANCIAL, "btc_format"],
|
|
126
89
|
|
|
127
|
-
["CRYPTO_ETH", "\\b0x[a-fA-F0-9]{40}\\b",
|
|
90
|
+
["CRYPTO_ETH", "\\b0x[a-fA-F0-9]{40}\\b",
|
|
128
91
|
["ethereum", "eth", "wallet", "0x"], 0.93, SensitiveCategory.FINANCIAL, null],
|
|
129
92
|
|
|
130
|
-
["US_ABA_ROUTING",
|
|
93
|
+
["US_ABA_ROUTING", /(?<!\d)\d{9}(?!\d)/,
|
|
131
94
|
["routing", "aba", "wire", "bank"], 0.88, SensitiveCategory.FINANCIAL, "aba_check"],
|
|
132
95
|
|
|
133
|
-
["BANK_ACCT_NUM",
|
|
134
|
-
["account", "checking", "savings", "deposit", "bank"], 0.
|
|
96
|
+
["BANK_ACCT_NUM", /(?<!\d)\d{8,17}(?!\d)/,
|
|
97
|
+
["account", "checking", "savings", "deposit", "bank"], 0.50, SensitiveCategory.FINANCIAL, "luhn_soft"],
|
|
135
98
|
|
|
136
|
-
["SWIFT_BIC", "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
|
|
99
|
+
["SWIFT_BIC", "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
|
|
137
100
|
["swift", "bic", "bank code", "transfer"], 0.60, SensitiveCategory.FINANCIAL, null],
|
|
138
101
|
|
|
139
102
|
// ── CONTACT ────────────────────────────────────────────────────────
|
|
140
|
-
["EMAIL_ADDR", "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
|
|
103
|
+
["EMAIL_ADDR", "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
|
|
141
104
|
["email", "mail", "contact", "address"], 0.99, SensitiveCategory.CONTACT, null],
|
|
142
105
|
|
|
143
|
-
["PHONE_NUM",
|
|
144
|
-
["phone", "call", "mobile", "tel", "whatsapp", "number"], 0.
|
|
106
|
+
["PHONE_NUM", /(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
|
|
107
|
+
["phone", "call", "mobile", "tel", "whatsapp", "number"], 0.80, SensitiveCategory.CONTACT, null],
|
|
145
108
|
|
|
146
|
-
["PHONE_NUM_INTL",
|
|
147
|
-
["phone", "call", "mobile", "tel"], 0.
|
|
109
|
+
["PHONE_NUM_INTL", /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/,
|
|
110
|
+
["phone", "call", "mobile", "tel"], 0.80, SensitiveCategory.CONTACT, null],
|
|
148
111
|
|
|
149
|
-
["IPV4_ADDR", "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
|
|
112
|
+
["IPV4_ADDR", "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
|
|
150
113
|
["ip", "server", "host", "network", "address"], 0.94, SensitiveCategory.CONTACT, "ipv4"],
|
|
151
114
|
|
|
152
|
-
["IPV6_ADDR", "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
|
|
115
|
+
["IPV6_ADDR", "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
|
|
153
116
|
["ipv6", "ip", "network", "server"], 0.93, SensitiveCategory.CONTACT, null],
|
|
154
117
|
|
|
155
|
-
["HW_MAC_ADDR", "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
|
|
118
|
+
["HW_MAC_ADDR", "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
|
|
156
119
|
["mac", "hardware", "network", "device"], 0.91, SensitiveCategory.CONTACT, null],
|
|
157
120
|
|
|
158
121
|
// ── PERSONAL ───────────────────────────────────────────────────────
|
|
159
|
-
["BIRTH_DATE", "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
|
|
122
|
+
["BIRTH_DATE", "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
|
|
160
123
|
["birth", "dob", "born", "birthday", "date of birth"], 0.88, SensitiveCategory.PERSONAL, null],
|
|
161
124
|
|
|
162
|
-
["US_DRIVERS_LIC", "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
|
|
163
|
-
["driver", "license", "licence", "dl", "dmv"], 0.
|
|
125
|
+
["US_DRIVERS_LIC", "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
|
|
126
|
+
["driver", "license", "licence", "dl", "dmv"], 0.55, SensitiveCategory.PERSONAL, null],
|
|
164
127
|
|
|
165
|
-
["US_PASSPORT_NUM", "\\b[A-Z]\\d{8}\\b",
|
|
128
|
+
["US_PASSPORT_NUM", "\\b[A-Z]\\d{8}\\b",
|
|
166
129
|
["passport", "travel", "visa", "immigration"], 0.87, SensitiveCategory.PERSONAL, null],
|
|
167
130
|
|
|
168
131
|
// ── VEHICLE ────────────────────────────────────────────────────────
|
|
169
|
-
["VEHICLE_VIN", "\\b[A-HJ-NPR-Z0-9]{17}\\b",
|
|
132
|
+
["VEHICLE_VIN", "\\b[A-HJ-NPR-Z0-9]{17}\\b",
|
|
170
133
|
["vin", "vehicle", "chassis", "automobile"], 0.92, SensitiveCategory.VEHICLE, "vin_format"],
|
|
171
134
|
|
|
172
|
-
["VEHICLE_PLATE", "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
|
|
135
|
+
["VEHICLE_PLATE", "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
|
|
173
136
|
["plate", "registration", "vehicle", "plaka"], 0.45, SensitiveCategory.VEHICLE, null],
|
|
174
137
|
|
|
175
138
|
// ── HEALTHCARE ─────────────────────────────────────────────────────
|
|
176
|
-
["MED_RECORD_ID", "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
|
|
139
|
+
["MED_RECORD_ID", "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
|
|
177
140
|
["patient", "medical", "record", "mrn", "hospital"], 0.96, SensitiveCategory.HEALTHCARE, null],
|
|
178
141
|
|
|
179
|
-
["US_MEDICARE_ID", "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
|
|
142
|
+
["US_MEDICARE_ID", "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
|
|
180
143
|
["medicare", "cms", "beneficiary", "health insurance"], 0.91, SensitiveCategory.HEALTHCARE, null],
|
|
181
144
|
|
|
182
|
-
["US_DEA_NUM", "\\b[A-Z]{2}\\d{7}\\b",
|
|
145
|
+
["US_DEA_NUM", "\\b[A-Z]{2}\\d{7}\\b",
|
|
183
146
|
["dea", "prescriber", "drug", "enforcement"], 0.89, SensitiveCategory.HEALTHCARE, null],
|
|
184
147
|
|
|
185
|
-
["US_NPI_NUM", "\\b\\d{10}\\b",
|
|
148
|
+
["US_NPI_NUM", "\\b\\d{10}\\b",
|
|
186
149
|
["npi", "provider", "national provider", "healthcare"], 0.87, SensitiveCategory.HEALTHCARE, null],
|
|
187
150
|
|
|
188
151
|
// ── IDENTITY_US ────────────────────────────────────────────────────
|
|
189
|
-
["US_EIN_TAX", "\\b\\d{2}-\\d{7}\\b",
|
|
152
|
+
["US_EIN_TAX", "\\b\\d{2}-\\d{7}\\b",
|
|
190
153
|
["ein", "federal", "employer", "tax id"], 0.89, SensitiveCategory.IDENTITY_US, null],
|
|
191
154
|
|
|
192
155
|
// ── IDENTITY_INTL ──────────────────────────────────────────────────
|
|
193
|
-
["UK_NATL_INS", "\\b[A-Z]{2}\\d{6}[A-Z]\\b",
|
|
194
|
-
["nino", "national insurance", "ni number", "uk"], 0.90, SensitiveCategory.IDENTITY_INTL,
|
|
195
|
-
|
|
196
|
-
["CA_SOCIAL_INS", "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b", "g",
|
|
197
|
-
["sin", "social insurance", "canada", "canadian"], 0.89, SensitiveCategory.IDENTITY_INTL, null],
|
|
198
|
-
|
|
199
|
-
["FR_INSEE_NUM", "\\b[12]\\d{2}[01]\\d\\d{8}\\d{2}\\b", "g",
|
|
200
|
-
["insee", "sécurité sociale", "france", "numéro"], 0.88, SensitiveCategory.IDENTITY_INTL, null],
|
|
156
|
+
["UK_NATL_INS", "\\b[A-Z]{2}\\d{6}[A-Z]\\b",
|
|
157
|
+
["nino", "national insurance", "ni number", "uk"], 0.90, SensitiveCategory.IDENTITY_INTL, "uk_nino"],
|
|
201
158
|
|
|
202
|
-
["
|
|
203
|
-
["
|
|
159
|
+
["CA_SOCIAL_INS", "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b",
|
|
160
|
+
["sin", "social insurance", "canada", "canadian"], 0.89, SensitiveCategory.IDENTITY_INTL, "ca_sin"],
|
|
204
161
|
|
|
205
|
-
["
|
|
206
|
-
["
|
|
207
|
-
|
|
208
|
-
["SA_NATIONAL_ID", "\\b1\\d{9}\\b", "g",
|
|
209
|
-
["هوية", "رقم الهوية", "saudi", "وطنية", "identity"], 0.91, SensitiveCategory.IDENTITY_INTL, "saudi_nid"],
|
|
210
|
-
|
|
211
|
-
["UAE_EMIRATES_ID", "\\b784-\\d{4}-\\d{7}-\\d\\b", "g",
|
|
212
|
-
["emirates", "هوية", "uae", "emirati", "identity"], 0.93, SensitiveCategory.IDENTITY_INTL, "luhn"],
|
|
162
|
+
["ES_DNI", "(?:\\d{8}[A-Z]|[XYZ]\\d{7}[A-Z])",
|
|
163
|
+
["dni", "nie", "identidad", "nif", "spain"], 0.94, SensitiveCategory.IDENTITY_INTL, "es_id", true, ["*", "es"]],
|
|
213
164
|
|
|
214
165
|
// ── CORPORATE ──────────────────────────────────────────────────────
|
|
215
|
-
["CORP_EMPLOYEE_ID", "
|
|
166
|
+
["CORP_EMPLOYEE_ID", "(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}",
|
|
216
167
|
["employee", "staff", "personnel", "worker"], 0.55, SensitiveCategory.CORPORATE, null],
|
|
217
168
|
];
|
|
218
169
|
|
|
@@ -220,18 +171,16 @@ const RAW_PATTERNS: RawEntry[] = [
|
|
|
220
171
|
|
|
221
172
|
/**
|
|
222
173
|
* Immutable catalogue of sensitive-data regex signatures.
|
|
223
|
-
*
|
|
224
|
-
* @example
|
|
225
|
-
* ```ts
|
|
226
|
-
* const reg = new DLPPatternRegistry(); // load everything
|
|
227
|
-
* const reg = new DLPPatternRegistry(new Set([SensitiveCategory.FINANCIAL]));
|
|
228
|
-
* ```
|
|
229
174
|
*/
|
|
230
175
|
export class DLPPatternRegistry {
|
|
231
176
|
private readonly catalogue: Map<string, PatternDescriptor> = new Map();
|
|
177
|
+
private readonly localeCategoryRegexMap: Map<string, Map<string, { re: RegExp; typeOrder: string[] }>> = new Map();
|
|
232
178
|
|
|
233
179
|
constructor(loadGroups?: ReadonlySet<SensitiveCategory>) {
|
|
234
180
|
this.buildCatalogue(loadGroups ?? null);
|
|
181
|
+
for (const loc of ["*", "en", "es"]) {
|
|
182
|
+
this.compileForLocale(loc);
|
|
183
|
+
}
|
|
235
184
|
}
|
|
236
185
|
|
|
237
186
|
get typeNames(): string[] {
|
|
@@ -247,25 +196,88 @@ export class DLPPatternRegistry {
|
|
|
247
196
|
return this.catalogue.get(typeName);
|
|
248
197
|
}
|
|
249
198
|
|
|
250
|
-
/** Return locale-tuned name regexes, falling back to English. */
|
|
251
199
|
namePatternsFor(lang: LanguageTag | string): RegExp[] {
|
|
252
200
|
return LOCALE_NAME_RULES[lang] ?? LOCALE_NAME_RULES["en"];
|
|
253
201
|
}
|
|
254
202
|
|
|
255
|
-
/** Return locale-tuned address regexes, falling back to English. */
|
|
256
203
|
addressPatternsFor(lang: LanguageTag | string): RegExp[] {
|
|
257
204
|
return LOCALE_ADDRESS_RULES[lang] ?? LOCALE_ADDRESS_RULES["en"];
|
|
258
205
|
}
|
|
259
206
|
|
|
207
|
+
/**
 * Return the per-category combined regexes for a locale, compiling and
 * caching them first if this locale has not been compiled yet.
 *
 * @param locale - Locale key to look up; defaults to "en".
 * @returns Map from category key to its combined regex and the ordered list
 *          of type names backing that regex's named groups.
 */
getCategoryRegexesMap(locale: string = "en"): Map<string, { re: RegExp; typeOrder: string[] }> {
  let compiled = this.localeCategoryRegexMap.get(locale);
  if (compiled === undefined) {
    // compileForLocale always stores an entry for `locale`, so the second
    // lookup is guaranteed to succeed.
    this.compileForLocale(locale);
    compiled = this.localeCategoryRegexMap.get(locale);
  }
  return compiled!;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Return the ordered type names that make up one category's combined regex
 * for a locale.
 *
 * The lookup goes through getCategoryRegexesMap so that a locale which has
 * not been compiled yet is compiled on demand; both accessors therefore
 * always describe the same compiled state.
 *
 * @param categoryName - Category key to look up.
 * @param locale - Locale key; defaults to "en".
 * @returns The type names in group order, or an empty array when the
 *          category has no compiled regex for that locale.
 */
getCategoryTypeMap(categoryName: string, locale: string = "en"): string[] {
  return this.getCategoryRegexesMap(locale).get(categoryName)?.typeOrder ?? [];
}
|
|
217
|
+
|
|
218
|
+
/**
 * Build and cache one combined regex per category for the given locale.
 *
 * A descriptor takes part when its supportedLocales contains "*" or the
 * locale itself. Within a category, descriptors that have a validator tag
 * come first, then longer regex sources before shorter ones. Each descriptor
 * becomes a named group (named after its type) in an alternation. The
 * combined regex is global, and case-insensitive if any member regex is.
 * A category whose combined regex fails to compile is logged and left out.
 *
 * @param locale - Locale key to compile for.
 */
private compileForLocale(locale: string): void {
  // Bucket the applicable descriptors by category, in catalogue order.
  const byCategory = new Map<string, [string, PatternDescriptor][]>();
  for (const [typeName, desc] of this.catalogue.entries()) {
    const applies = desc.supportedLocales.includes("*") || desc.supportedLocales.includes(locale);
    if (!applies) continue;

    const catKey = desc.category;
    let bucket = byCategory.get(catKey);
    if (bucket === undefined) {
      bucket = [];
      byCategory.set(catKey, bucket);
    }
    bucket.push([typeName, desc]);
  }

  const compiled = new Map<string, { re: RegExp; typeOrder: string[] }>();
  for (const [catKey, members] of byCategory.entries()) {
    // Validator-backed patterns first; ties broken by longer source first.
    members.sort(([, left], [, right]) => {
      const leftRank = left.validatorTag ? 0 : 1;
      const rightRank = right.validatorTag ? 0 : 1;
      return leftRank !== rightRank
        ? leftRank - rightRank
        : right.compiledRe.source.length - left.compiledRe.source.length;
    });

    const typeOrder: string[] = members.map(([typeName]) => typeName);
    const combinedSource = members
      .map(([typeName, desc]) => `(?<${typeName}>${desc.compiledRe.source})`)
      .join('|');
    const caseInsensitive = members.some(([, d]) => d.compiledRe.flags.includes('i'));
    const flags = caseInsensitive ? 'gi' : 'g';

    try {
      compiled.set(catKey, { re: new RegExp(combinedSource, flags), typeOrder });
    } catch (err) {
      console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err);
    }
  }

  this.localeCategoryRegexMap.set(locale, compiled);
}
|
|
260
|
+
|
|
260
261
|
private buildCatalogue(restrict: ReadonlySet<SensitiveCategory> | null): void {
|
|
261
|
-
for (const
|
|
262
|
+
for (const entry of RAW_PATTERNS) {
|
|
263
|
+
const [typeName, regexSource, terms, risk, cat, vtag, isHighEntropy, supportedLocales] = entry;
|
|
262
264
|
if (restrict !== null && !restrict.has(cat)) continue;
|
|
265
|
+
|
|
266
|
+
let re: RegExp;
|
|
267
|
+
if (regexSource instanceof RegExp) {
|
|
268
|
+
re = regexSource;
|
|
269
|
+
} else {
|
|
270
|
+
re = new RegExp(regexSource, "g");
|
|
271
|
+
}
|
|
272
|
+
|
|
263
273
|
this.catalogue.set(typeName, {
|
|
264
|
-
compiledRe:
|
|
274
|
+
compiledRe: re,
|
|
265
275
|
proximityTerms: new Set(terms),
|
|
266
276
|
baseRisk: risk,
|
|
267
277
|
category: cat,
|
|
268
278
|
validatorTag: vtag,
|
|
279
|
+
isHighEntropy: isHighEntropy ?? (vtag !== null),
|
|
280
|
+
supportedLocales: supportedLocales ?? ["*"],
|
|
269
281
|
});
|
|
270
282
|
}
|
|
271
283
|
}
|
package/src/core/dlp/scorer.ts
CHANGED
|
@@ -23,7 +23,7 @@ const DEFAULT_CONFIG: Required<ScorerConfig> = {
|
|
|
23
23
|
keywordBoost: 0.10,
|
|
24
24
|
validatorOverride: 0.99,
|
|
25
25
|
maxConfidence: 0.99,
|
|
26
|
-
penaltyFactor: 0.
|
|
26
|
+
penaltyFactor: 0.99, // Renamed functionally to validator failure penalty subtraction
|
|
27
27
|
};
|
|
28
28
|
|
|
29
29
|
export interface ScoreInput {
|
|
@@ -45,8 +45,8 @@ export interface ScoreInput {
|
|
|
45
45
|
* baseRisk: 0.92,
|
|
46
46
|
* matchStart: 10,
|
|
47
47
|
* matchEnd: 21,
|
|
48
|
-
* fullText: "
|
|
49
|
-
* proximityTerms: new Set(["
|
|
48
|
+
* fullText: "Mi número de DNI es 12345678Z",
|
|
49
|
+
* proximityTerms: new Set(["dni", "número"]),
|
|
50
50
|
* validatorPassed: true,
|
|
51
51
|
* });
|
|
52
52
|
* // score === 0.99 (validator override)
|
|
@@ -77,7 +77,7 @@ export class DLPConfidenceScorer {
|
|
|
77
77
|
// Hard-validator short-circuits
|
|
78
78
|
if (input.validatorPassed === true) return this.valOverride;
|
|
79
79
|
if (input.validatorPassed === false) {
|
|
80
|
-
return Math.
|
|
80
|
+
return Math.max(0.0, input.baseRisk - this.penalty);
|
|
81
81
|
}
|
|
82
82
|
|
|
83
83
|
// Extract the context window around the match
|