mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mask-privacy",
3
- "version": "3.0.0",
3
+ "version": "3.2.0",
4
4
  "description": "Enterprise-grade AI Data Loss Prevention (DLP) SDK for TypeScript",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -8,40 +8,17 @@
8
8
  * Supported language tags:
9
9
  * en — English (default / Latin-only fallback)
10
10
  * es — Spanish
11
- * fr — French
12
- * de — German
13
- * tr — Turkish
14
- * ar — Arabic
15
- * zh — Chinese
16
- * ja — Japanese
17
11
  */
18
12
 
19
- export type LanguageTag =
20
- | "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
13
+ export type LanguageTag = "en" | "es";
21
14
 
22
15
  /**
23
16
  * Ordered array of script signatures — more specific blocks are checked first
24
17
  * to avoid misclassification (e.g. ş/ğ/ı for Turkish before generic accented-Latin).
25
18
  */
26
19
  const SCRIPT_SIGNATURES: ReadonlyArray<{ tag: LanguageTag; pattern: RegExp }> = [
27
- // CJK / East-Asian — checked first because they are unambiguous
28
- { tag: "zh", pattern: /[\u4e00-\u9fff\u3400-\u4dbf]/g },
29
- { tag: "ja", pattern: /[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff]/g },
30
-
31
- // Arabic script — covers Standard Arabic, Urdu overlap, etc.
32
- { tag: "ar", pattern: /[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]/g },
33
-
34
- // Turkish — distinguished by dotless-i (ı), soft-g (ğ), ş, and cedilla ç
35
- { tag: "tr", pattern: /[ğıİşŞ]/g },
36
-
37
- // German — umlauts and Eszett
38
- { tag: "de", pattern: /[äöüÄÖÜß]/g },
39
-
40
20
  // Spanish — ñ and inverted punctuation
41
21
  { tag: "es", pattern: /[ñÑ¡¿]/g },
42
-
43
- // French — cedilla, accented vowels with circumflex / diaeresis
44
- { tag: "fr", pattern: /[àâçéèêëïîôùûüÿœæ]/gi },
45
22
  ];
46
23
 
47
24
  export interface LanguageBreakdown {
@@ -57,8 +34,8 @@ export interface LanguageBreakdown {
57
34
  * @example
58
35
  * ```ts
59
36
  * const resolver = new LanguageContextResolver();
60
- * const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
61
- * // tag === "tr"
37
+ * const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
38
+ * // tag === "es"
62
39
  * ```
63
40
  */
64
41
  export class LanguageContextResolver {
@@ -146,44 +146,56 @@ export function checkIpv4Octets(raw: string): boolean {
146
146
  return true;
147
147
  }
148
148
 
149
- // ── Turkish TCID (11-digit Kimlik No) ──────────────────────────────────────
149
+ // ── Canadian SIN (Luhn-9) ──────────────────────────────────────────────────
150
150
 
151
- export function checkTcidNumber(raw: string): boolean {
152
- const digitsStr = raw.replace(/\D/g, "");
153
- if (digitsStr.length !== 11) return false;
154
- const d = digitsStr.split("").map(Number);
155
- if (d[0] === 0) return false;
156
- if (d[10] % 2 !== 0) return false;
151
/**
 * Validate a Canadian Social Insurance Number (SIN) with the Luhn checksum.
 *
 * Every non-digit character (spaces, dashes, ...) is discarded first; exactly
 * nine digits must remain. Counting from the left, the 2nd, 4th, 6th and 8th
 * digits are doubled (subtracting 9 when the product exceeds 9) and the total
 * of all nine values must be a multiple of 10.
 *
 * @param raw - Candidate text, e.g. `"046 454 286"` or `"046454286"`.
 * @returns `true` when the nine digits are not all zero and pass the Luhn check.
 */
export function checkCaSin(raw: string): boolean {
  const digits = raw.replace(/\D/g, "");
  if (digits.length !== 9) return false;

  // "000000000" sums to 0 and therefore satisfies Luhn trivially, but it is a
  // placeholder value rather than a real identifier.
  if (digits === "000000000") return false;

  let total = 0;
  for (let idx = 0; idx < digits.length; idx++) {
    // `digits` only contains ASCII 0-9 at this point, so the code-unit offset
    // from "0" (48) is the digit value.
    let val = digits.charCodeAt(idx) - 48;
    if (idx % 2 === 1) {
      // Odd 0-based index == every second digit (2nd, 4th, 6th, 8th).
      val *= 2;
      if (val > 9) val -= 9;
    }
    total += val;
  }
  return total % 10 === 0;
}
157
166
 
158
- const oddSum = d[0] + d[2] + d[4] + d[6] + d[8];
159
- const evenSum = d[1] + d[3] + d[5] + d[7];
160
- const computedD10 = ((oddSum * 7 - evenSum) % 10 + 10) % 10;
161
- if (computedD10 !== d[9]) return false;
167
+ // ── UK National Insurance Number (NINO) ────────────────────────────────────
162
168
 
163
- const firstTenSum = d.slice(0, 10).reduce((a, b) => a + b, 0);
164
- if (firstTenSum % 10 !== d[10]) return false;
169
/**
 * Shape of a UK National Insurance Number after whitespace removal and
 * upper-casing: two prefix letters, six digits, one suffix letter A-D.
 *
 * Prefix rules encoded here:
 *  - D, F, I, Q, U and V are not used in either prefix position;
 *  - O is additionally not used in the second position;
 *  - the prefixes BG, GB, NK, KN, TN, NT and ZZ are not allocated.
 */
const UK_NINO_REGEX = /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z][0-9]{6}[A-D]$/;
165
170
 
166
- return true;
171
/**
 * Validate a UK National Insurance Number (NINO).
 *
 * The input is normalised by removing every whitespace character (plain
 * spaces, tabs, non-breaking spaces, ...) and upper-casing, so both
 * `"AB123456C"` and `"ab 12 34 56 c"` are accepted. The normalised value must
 * be exactly nine characters long and match `UK_NINO_REGEX`.
 *
 * @param raw - Candidate text as found in the scanned content.
 * @returns `true` when the normalised value matches `UK_NINO_REGEX`.
 */
export function checkUkNino(raw: string): boolean {
  // `\s` rather than a literal space: pasted or typeset NINOs are often
  // grouped with tabs or non-breaking spaces.
  const cleaned = raw.replace(/\s/g, "").toUpperCase();
  if (cleaned.length !== 9) return false;
  return UK_NINO_REGEX.test(cleaned);
}
168
176
 
169
- // ── Saudi National ID (10-digit, starts with 1) ────────────────────────────
177
+ // ── Spanish DNI/NIE (8 digits + 1 letter) ───────────────────────────────────
170
178
 
171
- export function checkSaudiNid(raw: string): boolean {
172
- const digitsStr = raw.replace(/\D/g, "");
173
- if (digitsStr.length !== 10) return false;
174
- const d = digitsStr.split("").map(Number);
175
- if (d[0] !== 1) return false;
179
+ export function checkEsId(raw: string): boolean {
180
+ const cleaned = raw.replace(/[\s-]/g, "").toUpperCase();
181
+ if (cleaned.length !== 9) return false;
176
182
 
177
- let total = 0;
178
- for (let idx = 0; idx < 10; idx++) {
179
- let val = d[idx];
180
- if (idx % 2 === 0) { // 0-indexed odd positions
181
- val *= 2;
182
- if (val > 9) val -= 9;
183
- }
184
- total += val;
183
+ const mapping: Record<string, string> = { X: "0", Y: "1", Z: "2" };
184
+ const firstChar = cleaned[0];
185
+ let numStr: string;
186
+
187
+ if (firstChar in mapping) {
188
+ numStr = mapping[firstChar] + cleaned.slice(1, 8);
189
+ } else if (/^\d$/.test(firstChar)) {
190
+ numStr = cleaned.slice(0, 8);
191
+ } else {
192
+ return false;
185
193
  }
186
- return total % 10 === 0;
194
+
195
+ if (!/^\d+$/.test(numStr)) return false;
196
+ const num = parseInt(numStr, 10);
197
+ const validLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
198
+ return cleaned[8] === validLetters[num % 23];
187
199
  }
188
200
 
189
201
  // ── Dispatcher ─────────────────────────────────────────────────────────────
@@ -198,8 +210,9 @@ const VALIDATOR_DISPATCH: Record<string, ValidatorFn> = {
198
210
  vin_format: checkVinFormat,
199
211
  btc_format: checkBtcFormat,
200
212
  ipv4: checkIpv4Octets,
201
- tcid: checkTcidNumber,
202
- saudi_nid: checkSaudiNid,
213
+ ca_sin: checkCaSin,
214
+ uk_nino: checkUkNino,
215
+ es_id: checkEsId,
203
216
  };
204
217
 
205
218
  /**
@@ -21,8 +21,6 @@ export {
21
21
  checkVinFormat,
22
22
  checkBtcFormat,
23
23
  checkIpv4Octets,
24
- checkTcidNumber,
25
- checkSaudiNid,
26
24
  } from "./handlers";
27
25
 
28
26
  export { DLPConfidenceScorer } from "./scorer";
@@ -33,40 +33,20 @@ export interface PatternDescriptor {
33
33
  baseRisk: number;
34
34
  category: SensitiveCategory;
35
35
  validatorTag: string | null;
36
+ isHighEntropy: boolean;
37
+ supportedLocales: string[];
36
38
  }
37
39
 
38
40
  // ── Locale-specific auxiliary patterns ──────────────────────────────────────
39
41
 
40
42
  export const LOCALE_NAME_RULES: Record<string, RegExp[]> = {
41
43
  en: [
42
- /\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b/g,
43
- /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\b/g,
44
+ /\b[A-Z][a-z\-\']+ [A-Z][a-z\-\']+(?:\s+[A-Z][a-z\-\']+)?\b/g,
45
+ /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z\-\']+\b/g,
44
46
  ],
45
47
  es: [
46
- /\b[A-Z][a-záéíóúñ]+ [A-Z][a-záéíóúñ]+(?:\s+[A-Z][a-záéíóúñ]+)?\b/g,
47
- /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ]+\b/g,
48
- ],
49
- fr: [
50
- /\b[A-Z][a-zàâçéèêëïîôùûü]+ [A-Z][a-zàâçéèêëïîôùûü]+\b/g,
51
- /\b(?:M|Mme|Mlle)\.?\s+[A-Z][a-zàâçéèêëïîôùûü]+\b/g,
52
- ],
53
- de: [
54
- /\b[A-Z][a-zäöüß]+ [A-Z][a-zäöüß]+\b/g,
55
- /\b(?:Herr|Frau)\.?\s+[A-Z][a-zäöüß]+\b/g,
56
- ],
57
- tr: [
58
- /\b[A-ZÇĞİÖŞÜ][a-zçğıöşü]+ [A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
59
- /\b(?:Bay|Bayan|Sayın)\.?\s+[A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
60
- ],
61
- ar: [
62
- /[\u0621-\u064a][\u0600-\u06ff]+ [\u0621-\u064a][\u0600-\u06ff]+/g,
63
- /(?:أبو|أم|ابن|بنت)\s+[\u0621-\u064a][\u0600-\u06ff]+/gi,
64
- ],
65
- ja: [
66
- /\b[A-Z][a-z]+(?:moto|yama|kawa|mura|ta|da|shi|no)\s+[A-Z][a-z]+\b/g,
67
- ],
68
- zh: [
69
- /\b[A-Z][a-z]{1,3}\s+[A-Z][a-z]+\b/g,
48
+ /\b[A-Z][a-záéíóúñ\-\']+ [A-Z][a-záéíóúñ\-\']+(?:\s+[A-Z][a-záéíóúñ\-\']+)?\b/g,
49
+ /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ\-\']+\b/g,
70
50
  ],
71
51
  };
72
52
 
@@ -75,26 +55,8 @@ export const LOCALE_ADDRESS_RULES: Record<string, RegExp[]> = {
75
55
  /\b\d{1,5}\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b/g,
76
56
  /\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?\b/g,
77
57
  ],
78
- fr: [
79
- /\b\d{1,4}\s+(?:rue|avenue|boulevard|place|chemin)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi,
80
- ],
81
- de: [
82
- /\b[A-ZÄÖÜa-zäöüß]+(?:straße|strasse|weg|gasse|platz)\s+\d{1,4}\b/g,
83
- ],
84
- tr: [
85
- /\b[A-ZÇĞİÖŞÜa-zçğıöşü]+\s+(?:Cad|Sok|Mah)\.?\s+/gi,
86
- /\b\d{5}\s+[A-ZÇĞİÖŞÜa-zçğıöşü]+\/[A-ZÇĞİÖŞÜa-zçğıöşü]+\b/g,
87
- ],
88
- ar: [
89
- /شارع\s+[\u0600-\u06ff]+/g,
90
- /حي\s+[\u0600-\u06ff]+/g,
91
- /(?:ص\.ب|P\.?O\.?\s*Box)\s*\d{3,6}/gi,
92
- ],
93
- uk_postcode: [
94
- /\b[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\b/g,
95
- ],
96
- ca_postal: [
97
- /\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b/g,
58
+ es: [
59
+ /\b(?:Calle|Carrera|Avenida|Paseo|Plaza)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi,
98
60
  ],
99
61
  };
100
62
 
@@ -102,117 +64,106 @@ export const LOCALE_ADDRESS_RULES: Record<string, RegExp[]> = {
102
64
 
103
65
  type RawEntry = [
104
66
  typeName: string,
105
- regexStr: string,
106
- flags: string,
67
+ regexSource: string | RegExp,
107
68
  terms: string[],
108
69
  risk: number,
109
- cat: SensitiveCategory,
110
- vtag: string | null,
70
+ category: SensitiveCategory,
71
+ validatorTag: string | null,
72
+ isHighEntropy?: boolean,
73
+ supportedLocales?: string[],
111
74
  ];
112
75
 
113
76
  const RAW_PATTERNS: RawEntry[] = [
114
77
  // ── FINANCIAL ──────────────────────────────────────────────────────
115
- ["US_SSN", "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b", "g",
78
+ ["US_SSN", "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
116
79
  ["ssn", "social security", "tax id", "taxpayer"], 0.95, SensitiveCategory.FINANCIAL, "ssn_area"],
117
80
 
118
- ["CREDIT_CARD_NUMBER", "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b", "g",
81
+ ["CREDIT_CARD_NUMBER", "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
119
82
  ["card", "credit", "visa", "mastercard", "amex", "payment"], 0.97, SensitiveCategory.FINANCIAL, "luhn"],
120
83
 
121
- ["INTL_BANK_IBAN", "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b", "g",
84
+ ["INTL_BANK_IBAN", "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
122
85
  ["iban", "swift", "sepa", "wire", "bank transfer"], 0.96, SensitiveCategory.FINANCIAL, "iban"],
123
86
 
124
- ["CRYPTO_BTC", "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b", "g",
87
+ ["CRYPTO_BTC", "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
125
88
  ["bitcoin", "btc", "wallet", "crypto"], 0.94, SensitiveCategory.FINANCIAL, "btc_format"],
126
89
 
127
- ["CRYPTO_ETH", "\\b0x[a-fA-F0-9]{40}\\b", "g",
90
+ ["CRYPTO_ETH", "\\b0x[a-fA-F0-9]{40}\\b",
128
91
  ["ethereum", "eth", "wallet", "0x"], 0.93, SensitiveCategory.FINANCIAL, null],
129
92
 
130
- ["US_ABA_ROUTING", "\\b\\d{9}\\b", "g",
93
+ ["US_ABA_ROUTING", /(?<!\d)\d{9}(?!\d)/,
131
94
  ["routing", "aba", "wire", "bank"], 0.88, SensitiveCategory.FINANCIAL, "aba_check"],
132
95
 
133
- ["BANK_ACCT_NUM", "\\b\\d{8,17}\\b", "g",
134
- ["account", "checking", "savings", "deposit", "bank"], 0.83, SensitiveCategory.FINANCIAL, null],
96
+ ["BANK_ACCT_NUM", /(?<!\d)\d{8,17}(?!\d)/,
97
+ ["account", "checking", "savings", "deposit", "bank"], 0.50, SensitiveCategory.FINANCIAL, "luhn_soft"],
135
98
 
136
- ["SWIFT_BIC", "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b", "gi",
99
+ ["SWIFT_BIC", "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
137
100
  ["swift", "bic", "bank code", "transfer"], 0.60, SensitiveCategory.FINANCIAL, null],
138
101
 
139
102
  // ── CONTACT ────────────────────────────────────────────────────────
140
- ["EMAIL_ADDR", "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b", "g",
103
+ ["EMAIL_ADDR", "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
141
104
  ["email", "mail", "contact", "address"], 0.99, SensitiveCategory.CONTACT, null],
142
105
 
143
- ["PHONE_NUM", "(?:\\+?[1-9]\\d{0,3}[-.\\s]?)?\\(?\\d{1,4}\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}", "g",
144
- ["phone", "call", "mobile", "tel", "whatsapp", "number"], 0.92, SensitiveCategory.CONTACT, null],
106
+ ["PHONE_NUM", /(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
107
+ ["phone", "call", "mobile", "tel", "whatsapp", "number"], 0.80, SensitiveCategory.CONTACT, null],
145
108
 
146
- ["PHONE_NUM_INTL", "\\+(?:44|33|49|90|966|971)[-.\\s]?\\(?\\d{1,5}\\)?(?:[-.\\s]?\\d{2,4}){2,4}", "g",
147
- ["phone", "call", "mobile", "tel"], 0.93, SensitiveCategory.CONTACT, null],
109
+ ["PHONE_NUM_INTL", /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/,
110
+ ["phone", "call", "mobile", "tel"], 0.80, SensitiveCategory.CONTACT, null],
148
111
 
149
- ["IPV4_ADDR", "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b", "g",
112
+ ["IPV4_ADDR", "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
150
113
  ["ip", "server", "host", "network", "address"], 0.94, SensitiveCategory.CONTACT, "ipv4"],
151
114
 
152
- ["IPV6_ADDR", "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b", "g",
115
+ ["IPV6_ADDR", "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
153
116
  ["ipv6", "ip", "network", "server"], 0.93, SensitiveCategory.CONTACT, null],
154
117
 
155
- ["HW_MAC_ADDR", "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b", "g",
118
+ ["HW_MAC_ADDR", "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
156
119
  ["mac", "hardware", "network", "device"], 0.91, SensitiveCategory.CONTACT, null],
157
120
 
158
121
  // ── PERSONAL ───────────────────────────────────────────────────────
159
- ["BIRTH_DATE", "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b", "g",
122
+ ["BIRTH_DATE", "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
160
123
  ["birth", "dob", "born", "birthday", "date of birth"], 0.88, SensitiveCategory.PERSONAL, null],
161
124
 
162
- ["US_DRIVERS_LIC", "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b", "g",
163
- ["driver", "license", "licence", "dl", "dmv"], 0.85, SensitiveCategory.PERSONAL, null],
125
+ ["US_DRIVERS_LIC", "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
126
+ ["driver", "license", "licence", "dl", "dmv"], 0.55, SensitiveCategory.PERSONAL, null],
164
127
 
165
- ["US_PASSPORT_NUM", "\\b[A-Z]\\d{8}\\b", "g",
128
+ ["US_PASSPORT_NUM", "\\b[A-Z]\\d{8}\\b",
166
129
  ["passport", "travel", "visa", "immigration"], 0.87, SensitiveCategory.PERSONAL, null],
167
130
 
168
131
  // ── VEHICLE ────────────────────────────────────────────────────────
169
- ["VEHICLE_VIN", "\\b[A-HJ-NPR-Z0-9]{17}\\b", "g",
132
+ ["VEHICLE_VIN", "\\b[A-HJ-NPR-Z0-9]{17}\\b",
170
133
  ["vin", "vehicle", "chassis", "automobile"], 0.92, SensitiveCategory.VEHICLE, "vin_format"],
171
134
 
172
- ["VEHICLE_PLATE", "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b", "g",
135
+ ["VEHICLE_PLATE", "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
173
136
  ["plate", "registration", "vehicle", "plaka"], 0.45, SensitiveCategory.VEHICLE, null],
174
137
 
175
138
  // ── HEALTHCARE ─────────────────────────────────────────────────────
176
- ["MED_RECORD_ID", "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b", "g",
139
+ ["MED_RECORD_ID", "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
177
140
  ["patient", "medical", "record", "mrn", "hospital"], 0.96, SensitiveCategory.HEALTHCARE, null],
178
141
 
179
- ["US_MEDICARE_ID", "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b", "g",
142
+ ["US_MEDICARE_ID", "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
180
143
  ["medicare", "cms", "beneficiary", "health insurance"], 0.91, SensitiveCategory.HEALTHCARE, null],
181
144
 
182
- ["US_DEA_NUM", "\\b[A-Z]{2}\\d{7}\\b", "g",
145
+ ["US_DEA_NUM", "\\b[A-Z]{2}\\d{7}\\b",
183
146
  ["dea", "prescriber", "drug", "enforcement"], 0.89, SensitiveCategory.HEALTHCARE, null],
184
147
 
185
- ["US_NPI_NUM", "\\b\\d{10}\\b", "g",
148
+ ["US_NPI_NUM", "\\b\\d{10}\\b",
186
149
  ["npi", "provider", "national provider", "healthcare"], 0.87, SensitiveCategory.HEALTHCARE, null],
187
150
 
188
151
  // ── IDENTITY_US ────────────────────────────────────────────────────
189
- ["US_EIN_TAX", "\\b\\d{2}-\\d{7}\\b", "g",
152
+ ["US_EIN_TAX", "\\b\\d{2}-\\d{7}\\b",
190
153
  ["ein", "federal", "employer", "tax id"], 0.89, SensitiveCategory.IDENTITY_US, null],
191
154
 
192
155
  // ── IDENTITY_INTL ──────────────────────────────────────────────────
193
- ["UK_NATL_INS", "\\b[A-Z]{2}\\d{6}[A-Z]\\b", "g",
194
- ["nino", "national insurance", "ni number", "uk"], 0.90, SensitiveCategory.IDENTITY_INTL, null],
195
-
196
- ["CA_SOCIAL_INS", "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b", "g",
197
- ["sin", "social insurance", "canada", "canadian"], 0.89, SensitiveCategory.IDENTITY_INTL, null],
198
-
199
- ["FR_INSEE_NUM", "\\b[12]\\d{2}[01]\\d\\d{8}\\d{2}\\b", "g",
200
- ["insee", "sécurité sociale", "france", "numéro"], 0.88, SensitiveCategory.IDENTITY_INTL, null],
156
+ ["UK_NATL_INS", "\\b[A-Z]{2}\\d{6}[A-Z]\\b",
157
+ ["nino", "national insurance", "ni number", "uk"], 0.90, SensitiveCategory.IDENTITY_INTL, "uk_nino"],
201
158
 
202
- ["DE_STEUER_ID", "\\b\\d{2}\\s?\\d{3}\\s?\\d{3}\\s?\\d{3}\\b", "g",
203
- ["steuer", "steuernummer", "finanzamt", "deutschland"], 0.87, SensitiveCategory.IDENTITY_INTL, null],
159
+ ["CA_SOCIAL_INS", "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b",
160
+ ["sin", "social insurance", "canada", "canadian"], 0.89, SensitiveCategory.IDENTITY_INTL, "ca_sin"],
204
161
 
205
- ["TR_TCID", "\\b[1-9]\\d{9}[02468]\\b", "g",
206
- ["tc", "kimlik", "vatandaşlık", "nüfus", "türkiye"], 0.92, SensitiveCategory.IDENTITY_INTL, "tcid"],
207
-
208
- ["SA_NATIONAL_ID", "\\b1\\d{9}\\b", "g",
209
- ["هوية", "رقم الهوية", "saudi", "وطنية", "identity"], 0.91, SensitiveCategory.IDENTITY_INTL, "saudi_nid"],
210
-
211
- ["UAE_EMIRATES_ID", "\\b784-\\d{4}-\\d{7}-\\d\\b", "g",
212
- ["emirates", "هوية", "uae", "emirati", "identity"], 0.93, SensitiveCategory.IDENTITY_INTL, "luhn"],
162
+ ["ES_DNI", "(?:\\d{8}[A-Z]|[XYZ]\\d{7}[A-Z])",
163
+ ["dni", "nie", "identidad", "nif", "spain"], 0.94, SensitiveCategory.IDENTITY_INTL, "es_id", true, ["*", "es"]],
213
164
 
214
165
  // ── CORPORATE ──────────────────────────────────────────────────────
215
- ["CORP_EMPLOYEE_ID", "\\b(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}\\b", "gi",
166
+ ["CORP_EMPLOYEE_ID", "(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}",
216
167
  ["employee", "staff", "personnel", "worker"], 0.55, SensitiveCategory.CORPORATE, null],
217
168
  ];
218
169
 
@@ -220,18 +171,16 @@ const RAW_PATTERNS: RawEntry[] = [
220
171
 
221
172
  /**
222
173
  * Immutable catalogue of sensitive-data regex signatures.
223
- *
224
- * @example
225
- * ```ts
226
- * const reg = new DLPPatternRegistry(); // load everything
227
- * const reg = new DLPPatternRegistry(new Set([SensitiveCategory.FINANCIAL]));
228
- * ```
229
174
  */
230
175
  export class DLPPatternRegistry {
231
176
  private readonly catalogue: Map<string, PatternDescriptor> = new Map();
177
+ private readonly localeCategoryRegexMap: Map<string, Map<string, { re: RegExp; typeOrder: string[] }>> = new Map();
232
178
 
233
179
  constructor(loadGroups?: ReadonlySet<SensitiveCategory>) {
234
180
  this.buildCatalogue(loadGroups ?? null);
181
+ for (const loc of ["*", "en", "es"]) {
182
+ this.compileForLocale(loc);
183
+ }
235
184
  }
236
185
 
237
186
  get typeNames(): string[] {
@@ -247,25 +196,88 @@ export class DLPPatternRegistry {
247
196
  return this.catalogue.get(typeName);
248
197
  }
249
198
 
250
- /** Return locale-tuned name regexes, falling back to English. */
251
199
  namePatternsFor(lang: LanguageTag | string): RegExp[] {
252
200
  return LOCALE_NAME_RULES[lang] ?? LOCALE_NAME_RULES["en"];
253
201
  }
254
202
 
255
- /** Return locale-tuned address regexes, falling back to English. */
256
203
  addressPatternsFor(lang: LanguageTag | string): RegExp[] {
257
204
  return LOCALE_ADDRESS_RULES[lang] ?? LOCALE_ADDRESS_RULES["en"];
258
205
  }
259
206
 
207
/**
 * Return the per-category combined regexes for a locale.
 *
 * If nothing is cached for `locale` yet, `compileForLocale` is run first and
 * its result is then read back from the cache.
 *
 * @param locale - Locale key to look up; defaults to `"en"`.
 * @returns Map from category name to its combined regex and the order of the
 *          type names that were joined into it.
 */
getCategoryRegexesMap(locale: string = "en"): Map<string, { re: RegExp; typeOrder: string[] }> {
  let cached = this.localeCategoryRegexMap.get(locale);
  if (cached === undefined) {
    this.compileForLocale(locale);
    cached = this.localeCategoryRegexMap.get(locale);
  }
  return cached!;
}
213
+
214
/**
 * Return the ordered type names that make up one category's combined regex.
 *
 * The lookup goes through `getCategoryRegexesMap`, so a locale that has not
 * been compiled yet is compiled on demand instead of silently yielding an
 * empty list.
 *
 * @param categoryName - Category key as used in the compiled map.
 * @param locale - Locale key to look up; defaults to `"en"`.
 * @returns The type names in alternation order, or `[]` when the category has
 *          no compiled regex for that locale.
 */
getCategoryTypeMap(categoryName: string, locale: string = "en"): string[] {
  return this.getCategoryRegexesMap(locale).get(categoryName)?.typeOrder ?? [];
}
217
+
218
/**
 * Build and cache, for one locale, a single alternation regex per category.
 *
 * Each catalogue entry whose `supportedLocales` contains `"*"` or `locale` is
 * grouped by category. Within a category, entries with a validator tag come
 * first and ties are broken by longer regex source first; every entry is
 * wrapped in a named capture group carrying its type name. A category whose
 * combined regex fails to compile is logged and left out of the result.
 *
 * @param locale - Locale key under which the compiled map is stored.
 */
private compileForLocale(locale: string): void {
  // Group the applicable descriptors by category, keeping catalogue order.
  const byCategory = new Map<string, [string, PatternDescriptor][]>();
  for (const [typeName, desc] of this.catalogue.entries()) {
    const applies = desc.supportedLocales.includes("*") || desc.supportedLocales.includes(locale);
    if (!applies) continue;

    const bucket = byCategory.get(desc.category);
    if (bucket === undefined) {
      byCategory.set(desc.category, [[typeName, desc]]);
    } else {
      bucket.push([typeName, desc]);
    }
  }

  const compiled = new Map<string, { re: RegExp; typeOrder: string[] }>();
  for (const [catKey, entries] of byCategory.entries()) {
    // Alternation order decides which branch wins at a given position:
    // validator-backed patterns first, then longer sources before shorter ones.
    entries.sort(([, left], [, right]) => {
      const leftRank = left.validatorTag ? 0 : 1;
      const rightRank = right.validatorTag ? 0 : 1;
      return leftRank !== rightRank
        ? leftRank - rightRank
        : right.compiledRe.source.length - left.compiledRe.source.length;
    });

    const typeOrder = entries.map(([typeName]) => typeName);
    const combinedSource = entries
      .map(([typeName, desc]) => `(?<${typeName}>${desc.compiledRe.source})`)
      .join('|');

    // A single case-insensitive member makes the whole alternation case-insensitive.
    const flags = entries.some(([, d]) => d.compiledRe.flags.includes('i')) ? 'gi' : 'g';

    try {
      compiled.set(catKey, { re: new RegExp(combinedSource, flags), typeOrder });
    } catch (err) {
      console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err);
    }
  }

  this.localeCategoryRegexMap.set(locale, compiled);
}
260
+
260
261
  private buildCatalogue(restrict: ReadonlySet<SensitiveCategory> | null): void {
261
- for (const [typeName, regexStr, flags, terms, risk, cat, vtag] of RAW_PATTERNS) {
262
+ for (const entry of RAW_PATTERNS) {
263
+ const [typeName, regexSource, terms, risk, cat, vtag, isHighEntropy, supportedLocales] = entry;
262
264
  if (restrict !== null && !restrict.has(cat)) continue;
265
+
266
+ let re: RegExp;
267
+ if (regexSource instanceof RegExp) {
268
+ re = regexSource;
269
+ } else {
270
+ re = new RegExp(regexSource, "g");
271
+ }
272
+
263
273
  this.catalogue.set(typeName, {
264
- compiledRe: new RegExp(regexStr, flags),
274
+ compiledRe: re,
265
275
  proximityTerms: new Set(terms),
266
276
  baseRisk: risk,
267
277
  category: cat,
268
278
  validatorTag: vtag,
279
+ isHighEntropy: isHighEntropy ?? (vtag !== null),
280
+ supportedLocales: supportedLocales ?? ["*"],
269
281
  });
270
282
  }
271
283
  }
@@ -23,7 +23,7 @@ const DEFAULT_CONFIG: Required<ScorerConfig> = {
23
23
  keywordBoost: 0.10,
24
24
  validatorOverride: 0.99,
25
25
  maxConfidence: 0.99,
26
- penaltyFactor: 0.65,
26
+ penaltyFactor: 0.99, // Renamed functionally to validator failure penalty subtraction
27
27
  };
28
28
 
29
29
  export interface ScoreInput {
@@ -45,8 +45,8 @@ export interface ScoreInput {
45
45
  * baseRisk: 0.92,
46
46
  * matchStart: 10,
47
47
  * matchEnd: 21,
48
- * fullText: "TC Kimlik No: 10000000146",
49
- * proximityTerms: new Set(["kimlik", "tc"]),
48
+ * fullText: "Mi número de DNI es 12345678Z",
49
+ * proximityTerms: new Set(["dni", "número"]),
50
50
  * validatorPassed: true,
51
51
  * });
52
52
  * // score === 0.99 (validator override)
@@ -77,7 +77,7 @@ export class DLPConfidenceScorer {
77
77
  // Hard-validator short-circuits
78
78
  if (input.validatorPassed === true) return this.valOverride;
79
79
  if (input.validatorPassed === false) {
80
- return Math.min(this.ceil, input.baseRisk * this.penalty);
80
+ return Math.max(0.0, input.baseRisk - this.penalty);
81
81
  }
82
82
 
83
83
  // Extract the context window around the match