mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -231,10 +231,13 @@ var init_exceptions = __esm({
231
231
  function looksLikeToken(value) {
232
232
  if (typeof value !== "string") return false;
233
233
  const v7 = value.trim();
234
- if (v7.startsWith("tkn-") && v7.includes("@email.com")) {
235
- return true;
234
+ if (v7.startsWith("tkn-") && v7.includes("@")) {
235
+ const parts = v7.split("@");
236
+ if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
237
+ return true;
238
+ }
236
239
  }
237
- if (v7.startsWith("+1-555-") && v7.length === 14) {
240
+ if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v7)) {
238
241
  return true;
239
242
  }
240
243
  if (v7.startsWith("000-00-") && v7.length === 11) {
@@ -246,16 +249,13 @@ function looksLikeToken(value) {
246
249
  if (v7.startsWith("000000") && v7.length === 9) {
247
250
  return true;
248
251
  }
249
- if (v7.startsWith("784-0000-") && v7.length === 18) {
250
- return true;
251
- }
252
- if (v7.length === 11 && v7.startsWith("990000") && /^\d+$/.test(v7) && parseInt(v7[v7.length - 1], 10) % 2 === 0) {
252
+ if (v7.length === 9 && v7.startsWith("000") && /[A-Z]$/.test(v7)) {
253
253
  return true;
254
254
  }
255
- if (v7.length === 10 && v7.startsWith("100000") && /^\d+$/.test(v7)) {
255
+ if (/^[A-Z]{2}00[A-F0-9]{4,16}$/.test(v7)) {
256
256
  return true;
257
257
  }
258
- if (/^[A-Z]{2}00[A-F0-9]{4,16}$/.test(v7)) {
258
+ if (/^<(PER|LOC|ORG):[^>]+>$/.test(v7)) {
259
259
  return true;
260
260
  }
261
261
  if (v7.startsWith("[TKN-") && v7.endsWith("]")) {
@@ -267,7 +267,7 @@ var TOKEN_PATTERN;
267
267
  var init_fpe_utils = __esm({
268
268
  "src/core/fpe_utils.ts"() {
269
269
  TOKEN_PATTERN = new RegExp(
270
- "tkn-[a-f0-9]{8,64}@email\\.com|\\+1-555-\\d{7}|000-00-\\d{4}|4000-0000-0000-\\d{4}|000000\\d{3}|990000\\d{4}[02468]|100000\\d{4}|784-0000-\\d{7}-\\d|[A-Z]{2}00[A-F0-9]{4,16}|\\[TKN-[a-f0-9]{8,64}\\]",
270
+ "tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}|\\+[1-9]\\d{0,3}-555-\\d{7}|000-00-\\d{4}|4000-0000-0000-\\d{4}|000000\\d{3}|000\\d{5}[A-Z]|[A-Z]{2}00[A-F0-9]{4,16}|<(?:PER|LOC|ORG):[^>]+>|\\[TKN-[a-f0-9]{8,64}\\]",
271
271
  // Opaque
272
272
  "g"
273
273
  );
@@ -316,42 +316,87 @@ async function _hmacDigits(plaintext, n6, offset = 0) {
316
316
  }
317
317
  return result.join("");
318
318
  }
319
- async function generateFPEToken(rawText) {
320
- const text = rawText.trim();
321
- if (_EMAIL_RE.test(text)) {
322
- return `tkn-${await _hmacHex(text)}@email.com`;
323
- }
324
- if (_PHONE_RE.test(text)) {
325
- return `+1-555-${await _hmacDigits(text, 7)}`;
319
+ async function _pickFromArray(plaintext, array) {
320
+ const digits = await _hmacDigits(plaintext, 8);
321
+ const num = parseInt(digits, 10);
322
+ return array[num % array.length];
323
+ }
324
+ function _computeLuhnDigit(partialNum) {
325
+ const digits = partialNum.split("").map(Number);
326
+ let sum = 0;
327
+ let shouldDouble = true;
328
+ for (let i6 = digits.length - 1; i6 >= 0; i6--) {
329
+ let digit = digits[i6];
330
+ if (shouldDouble) {
331
+ digit *= 2;
332
+ if (digit > 9) digit -= 9;
333
+ }
334
+ sum += digit;
335
+ shouldDouble = !shouldDouble;
326
336
  }
327
- if (_SSN_RE.test(text)) {
337
+ return ((10 - sum % 10) % 10).toString();
338
+ }
339
+ function _computeEsIdCheck(num) {
340
+ return "TRWAGMYFPDXBNJZSQVHLCKE"[num % 23];
341
+ }
342
+ async function generateFPEToken(rawText, entityType = "UNKNOWN") {
343
+ const text = rawText.trim();
344
+ let type = (entityType || "UNKNOWN").toUpperCase();
345
+ if (type === "UNKNOWN") {
346
+ if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
347
+ else if (_SSN_RE.test(text)) type = "US_SSN";
348
+ else if (_CC_RE.test(text)) type = "CREDIT_CARD";
349
+ else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
350
+ else if (_ES_ID_RE.test(text)) type = "ES_DNI";
351
+ else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
352
+ else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
353
+ }
354
+ if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
355
+ const parts = text.split("@");
356
+ const domain = parts.length === 2 ? parts[1] : "email.com";
357
+ return `tkn-${await _hmacHex(text)}@${domain}`;
358
+ }
359
+ if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
360
+ const m6 = text.match(/^\+([1-9]\d{0,3})/);
361
+ const cc = m6 ? m6[1] : "1";
362
+ return `+${cc}-555-${await _hmacDigits(text, 7)}`;
363
+ }
364
+ if (type === "US_SSN") {
328
365
  return `000-00-${await _hmacDigits(text, 4)}`;
329
366
  }
330
- if (_CC_RE.test(text)) {
331
- return `4000-0000-0000-${await _hmacDigits(text, 4)}`;
367
+ if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
368
+ const base = `400000000000${await _hmacDigits(text, 3)}`;
369
+ const checkDig = _computeLuhnDigit(base);
370
+ const full = base + checkDig;
371
+ return `${full.slice(0, 4)}-${full.slice(4, 8)}-${full.slice(8, 12)}-${full.slice(12, 16)}`;
332
372
  }
333
- if (_ROUTING_RE.test(text)) {
373
+ if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
334
374
  return `000000${await _hmacDigits(text, 3)}`;
335
375
  }
336
- if (_TCID_RE.test(text)) {
337
- const tail = await _hmacDigits(text, 5);
338
- let lastD = parseInt(tail[tail.length - 1], 10);
339
- if (lastD % 2 !== 0) lastD = (lastD + 1) % 10;
340
- return `990000${tail.slice(0, 4)}${lastD}`;
376
+ if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
377
+ const countryCode = text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2)) ? text.slice(0, 2).toUpperCase() : "US";
378
+ return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
379
+ }
380
+ if (type === "ES_DNI") {
381
+ const digits = `000${await _hmacDigits(text, 5)}`;
382
+ return digits + _computeEsIdCheck(parseInt(digits, 10));
341
383
  }
342
- if (_SAUDI_NID_RE.test(text)) {
343
- return `100000${await _hmacDigits(text, 4)}`;
384
+ if (type === "PERSON" || type === "PERSON_NAME") {
385
+ const f6 = await _pickFromArray(text, _FIRST_NAMES);
386
+ const l6 = await _pickFromArray(text + "last", _LAST_NAMES);
387
+ return `<PER:${f6}_${l6}>`;
344
388
  }
345
- if (_UAE_EID_RE.test(text)) {
346
- return `784-0000-${await _hmacDigits(text, 7)}-${await _hmacDigits(text, 1, 20)}`;
389
+ if (type === "LOCATION" || type === "PHYS_ADDRESS") {
390
+ const c6 = await _pickFromArray(text, _CITIES);
391
+ return `<LOC:${c6}>`;
347
392
  }
348
- if (_IBAN_RE.test(text)) {
349
- const countryCode = text.slice(0, 2);
350
- return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
393
+ if (type === "ORGANIZATION") {
394
+ const c6 = await _pickFromArray(text, _LAST_NAMES);
395
+ return `<ORG:${c6}_Inc>`;
351
396
  }
352
397
  return `[TKN-${await _hmacHex(text)}]`;
353
398
  }
354
- var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE, _TCID_RE, _SAUDI_NID_RE, _UAE_EID_RE, _IBAN_RE;
399
+ var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE, _ES_ID_RE, _IBAN_RE, _FIRST_NAMES, _LAST_NAMES, _CITIES;
355
400
  var init_fpe = __esm({
356
401
  "src/core/fpe.ts"() {
357
402
  init_config();
@@ -360,14 +405,15 @@ var init_fpe = __esm({
360
405
  init_fpe_utils();
361
406
  _masterKey = null;
362
407
  _EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
363
- _PHONE_RE = /^\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}$|^\d{3}[\s\-.]?\d{4}$/;
408
+ _PHONE_RE = /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/;
364
409
  _SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
365
410
  _CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
366
411
  _ROUTING_RE = /^\d{9}$/;
367
- _TCID_RE = /^[1-9]\d{9}[02468]$/;
368
- _SAUDI_NID_RE = /^1\d{9}$/;
369
- _UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
412
+ _ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
370
413
  _IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
414
+ _FIRST_NAMES = ["Taylor", "Jordan", "Casey", "Morgan", "Riley", "Avery", "Rowan", "Quinn", "Charlie", "Peyton", "Blake", "Dakota", "Reese", "Skyler", "Finley", "Eden", "Harley", "Rory", "Emerson", "Remi"];
415
+ _LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"];
416
+ _CITIES = ["London", "Paris", "Berlin", "Tokyo", "Rome", "Madrid", "Vienna", "Sydney", "Toronto", "Chicago", "Seattle", "Austin", "Boston", "Denver", "Dallas", "Miami", "Seoul", "Dubai", "Mumbai", "Cairo"];
371
417
  }
372
418
  });
373
419
 
@@ -42968,7 +43014,7 @@ async function encode(rawText, options = {}) {
42968
43014
  return existingToken;
42969
43015
  }
42970
43016
  }
42971
- const token = await generateFPEToken(text);
43017
+ const token = await generateFPEToken(text, options.entityType || "UNKNOWN");
42972
43018
  const ciphertext = cryptoEngine.encrypt(text);
42973
43019
  const ttl = options.ttl || DEFAULT_TTL;
42974
43020
  await vault.store(token, ciphertext, ttl, ptHash);
@@ -43420,19 +43466,8 @@ var SCRIPT_SIGNATURES, LanguageContextResolver;
43420
43466
  var init_assessor = __esm({
43421
43467
  "src/core/dlp/assessor.ts"() {
43422
43468
  SCRIPT_SIGNATURES = [
43423
- // CJK / East-Asian — checked first because they are unambiguous
43424
- { tag: "zh", pattern: /[\u4e00-\u9fff\u3400-\u4dbf]/g },
43425
- { tag: "ja", pattern: /[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff]/g },
43426
- // Arabic script — covers Standard Arabic, Urdu overlap, etc.
43427
- { tag: "ar", pattern: /[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]/g },
43428
- // Turkish — distinguished by dotless-i (ı), soft-g (ğ), ş, and cedilla ç
43429
- { tag: "tr", pattern: /[ğıİşŞ]/g },
43430
- // German — umlauts and Eszett
43431
- { tag: "de", pattern: /[äöüÄÖÜß]/g },
43432
43469
  // Spanish — ñ and inverted punctuation
43433
- { tag: "es", pattern: /[ñÑ¡¿]/g },
43434
- // French — cedilla, accented vowels with circumflex / diaeresis
43435
- { tag: "fr", pattern: /[àâçéèêëïîôùûüÿœæ]/gi }
43470
+ { tag: "es", pattern: /[ñÑ¡¿]/g }
43436
43471
  ];
43437
43472
  LanguageContextResolver = class {
43438
43473
  constructor(charThreshold = 1) {
@@ -43490,34 +43525,12 @@ var init_registry = __esm({
43490
43525
  })(SensitiveCategory || {});
43491
43526
  LOCALE_NAME_RULES = {
43492
43527
  en: [
43493
- /\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b/g,
43494
- /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\b/g
43528
+ /\b[A-Z][a-z\-\']+ [A-Z][a-z\-\']+(?:\s+[A-Z][a-z\-\']+)?\b/g,
43529
+ /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z\-\']+\b/g
43495
43530
  ],
43496
43531
  es: [
43497
- /\b[A-Z][a-záéíóúñ]+ [A-Z][a-záéíóúñ]+(?:\s+[A-Z][a-záéíóúñ]+)?\b/g,
43498
- /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ]+\b/g
43499
- ],
43500
- fr: [
43501
- /\b[A-Z][a-zàâçéèêëïîôùûü]+ [A-Z][a-zàâçéèêëïîôùûü]+\b/g,
43502
- /\b(?:M|Mme|Mlle)\.?\s+[A-Z][a-zàâçéèêëïîôùûü]+\b/g
43503
- ],
43504
- de: [
43505
- /\b[A-Z][a-zäöüß]+ [A-Z][a-zäöüß]+\b/g,
43506
- /\b(?:Herr|Frau)\.?\s+[A-Z][a-zäöüß]+\b/g
43507
- ],
43508
- tr: [
43509
- /\b[A-ZÇĞİÖŞÜ][a-zçğıöşü]+ [A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
43510
- /\b(?:Bay|Bayan|Sayın)\.?\s+[A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g
43511
- ],
43512
- ar: [
43513
- /[\u0621-\u064a][\u0600-\u06ff]+ [\u0621-\u064a][\u0600-\u06ff]+/g,
43514
- /(?:أبو|أم|ابن|بنت)\s+[\u0621-\u064a][\u0600-\u06ff]+/gi
43515
- ],
43516
- ja: [
43517
- /\b[A-Z][a-z]+(?:moto|yama|kawa|mura|ta|da|shi|no)\s+[A-Z][a-z]+\b/g
43518
- ],
43519
- zh: [
43520
- /\b[A-Z][a-z]{1,3}\s+[A-Z][a-z]+\b/g
43532
+ /\b[A-Z][a-záéíóúñ\-\']+ [A-Z][a-záéíóúñ\-\']+(?:\s+[A-Z][a-záéíóúñ\-\']+)?\b/g,
43533
+ /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ\-\']+\b/g
43521
43534
  ]
43522
43535
  };
43523
43536
  LOCALE_ADDRESS_RULES = {
@@ -43525,26 +43538,8 @@ var init_registry = __esm({
43525
43538
  /\b\d{1,5}\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b/g,
43526
43539
  /\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?\b/g
43527
43540
  ],
43528
- fr: [
43529
- /\b\d{1,4}\s+(?:rue|avenue|boulevard|place|chemin)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi
43530
- ],
43531
- de: [
43532
- /\b[A-ZÄÖÜa-zäöüß]+(?:straße|strasse|weg|gasse|platz)\s+\d{1,4}\b/g
43533
- ],
43534
- tr: [
43535
- /\b[A-ZÇĞİÖŞÜa-zçğıöşü]+\s+(?:Cad|Sok|Mah)\.?\s+/gi,
43536
- /\b\d{5}\s+[A-ZÇĞİÖŞÜa-zçğıöşü]+\/[A-ZÇĞİÖŞÜa-zçğıöşü]+\b/g
43537
- ],
43538
- ar: [
43539
- /شارع\s+[\u0600-\u06ff]+/g,
43540
- /حي\s+[\u0600-\u06ff]+/g,
43541
- /(?:ص\.ب|P\.?O\.?\s*Box)\s*\d{3,6}/gi
43542
- ],
43543
- uk_postcode: [
43544
- /\b[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\b/g
43545
- ],
43546
- ca_postal: [
43547
- /\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b/g
43541
+ es: [
43542
+ /\b(?:Calle|Carrera|Avenida|Paseo|Plaza)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi
43548
43543
  ]
43549
43544
  };
43550
43545
  RAW_PATTERNS = [
@@ -43552,7 +43547,6 @@ var init_registry = __esm({
43552
43547
  [
43553
43548
  "US_SSN",
43554
43549
  "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
43555
- "g",
43556
43550
  ["ssn", "social security", "tax id", "taxpayer"],
43557
43551
  0.95,
43558
43552
  "FINANCIAL" /* FINANCIAL */,
@@ -43561,7 +43555,6 @@ var init_registry = __esm({
43561
43555
  [
43562
43556
  "CREDIT_CARD_NUMBER",
43563
43557
  "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
43564
- "g",
43565
43558
  ["card", "credit", "visa", "mastercard", "amex", "payment"],
43566
43559
  0.97,
43567
43560
  "FINANCIAL" /* FINANCIAL */,
@@ -43570,7 +43563,6 @@ var init_registry = __esm({
43570
43563
  [
43571
43564
  "INTL_BANK_IBAN",
43572
43565
  "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
43573
- "g",
43574
43566
  ["iban", "swift", "sepa", "wire", "bank transfer"],
43575
43567
  0.96,
43576
43568
  "FINANCIAL" /* FINANCIAL */,
@@ -43579,7 +43571,6 @@ var init_registry = __esm({
43579
43571
  [
43580
43572
  "CRYPTO_BTC",
43581
43573
  "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
43582
- "g",
43583
43574
  ["bitcoin", "btc", "wallet", "crypto"],
43584
43575
  0.94,
43585
43576
  "FINANCIAL" /* FINANCIAL */,
@@ -43588,7 +43579,6 @@ var init_registry = __esm({
43588
43579
  [
43589
43580
  "CRYPTO_ETH",
43590
43581
  "\\b0x[a-fA-F0-9]{40}\\b",
43591
- "g",
43592
43582
  ["ethereum", "eth", "wallet", "0x"],
43593
43583
  0.93,
43594
43584
  "FINANCIAL" /* FINANCIAL */,
@@ -43596,8 +43586,7 @@ var init_registry = __esm({
43596
43586
  ],
43597
43587
  [
43598
43588
  "US_ABA_ROUTING",
43599
- "\\b\\d{9}\\b",
43600
- "g",
43589
+ /(?<!\d)\d{9}(?!\d)/,
43601
43590
  ["routing", "aba", "wire", "bank"],
43602
43591
  0.88,
43603
43592
  "FINANCIAL" /* FINANCIAL */,
@@ -43605,17 +43594,15 @@ var init_registry = __esm({
43605
43594
  ],
43606
43595
  [
43607
43596
  "BANK_ACCT_NUM",
43608
- "\\b\\d{8,17}\\b",
43609
- "g",
43597
+ /(?<!\d)\d{8,17}(?!\d)/,
43610
43598
  ["account", "checking", "savings", "deposit", "bank"],
43611
- 0.83,
43599
+ 0.5,
43612
43600
  "FINANCIAL" /* FINANCIAL */,
43613
- null
43601
+ "luhn_soft"
43614
43602
  ],
43615
43603
  [
43616
43604
  "SWIFT_BIC",
43617
43605
  "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
43618
- "gi",
43619
43606
  ["swift", "bic", "bank code", "transfer"],
43620
43607
  0.6,
43621
43608
  "FINANCIAL" /* FINANCIAL */,
@@ -43625,7 +43612,6 @@ var init_registry = __esm({
43625
43612
  [
43626
43613
  "EMAIL_ADDR",
43627
43614
  "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
43628
- "g",
43629
43615
  ["email", "mail", "contact", "address"],
43630
43616
  0.99,
43631
43617
  "CONTACT" /* CONTACT */,
@@ -43633,26 +43619,23 @@ var init_registry = __esm({
43633
43619
  ],
43634
43620
  [
43635
43621
  "PHONE_NUM",
43636
- "(?:\\+?[1-9]\\d{0,3}[-.\\s]?)?\\(?\\d{1,4}\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}",
43637
- "g",
43622
+ /(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
43638
43623
  ["phone", "call", "mobile", "tel", "whatsapp", "number"],
43639
- 0.92,
43624
+ 0.8,
43640
43625
  "CONTACT" /* CONTACT */,
43641
43626
  null
43642
43627
  ],
43643
43628
  [
43644
43629
  "PHONE_NUM_INTL",
43645
- "\\+(?:44|33|49|90|966|971)[-.\\s]?\\(?\\d{1,5}\\)?(?:[-.\\s]?\\d{2,4}){2,4}",
43646
- "g",
43630
+ /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/,
43647
43631
  ["phone", "call", "mobile", "tel"],
43648
- 0.93,
43632
+ 0.8,
43649
43633
  "CONTACT" /* CONTACT */,
43650
43634
  null
43651
43635
  ],
43652
43636
  [
43653
43637
  "IPV4_ADDR",
43654
43638
  "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
43655
- "g",
43656
43639
  ["ip", "server", "host", "network", "address"],
43657
43640
  0.94,
43658
43641
  "CONTACT" /* CONTACT */,
@@ -43661,7 +43644,6 @@ var init_registry = __esm({
43661
43644
  [
43662
43645
  "IPV6_ADDR",
43663
43646
  "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
43664
- "g",
43665
43647
  ["ipv6", "ip", "network", "server"],
43666
43648
  0.93,
43667
43649
  "CONTACT" /* CONTACT */,
@@ -43670,7 +43652,6 @@ var init_registry = __esm({
43670
43652
  [
43671
43653
  "HW_MAC_ADDR",
43672
43654
  "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
43673
- "g",
43674
43655
  ["mac", "hardware", "network", "device"],
43675
43656
  0.91,
43676
43657
  "CONTACT" /* CONTACT */,
@@ -43680,7 +43661,6 @@ var init_registry = __esm({
43680
43661
  [
43681
43662
  "BIRTH_DATE",
43682
43663
  "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
43683
- "g",
43684
43664
  ["birth", "dob", "born", "birthday", "date of birth"],
43685
43665
  0.88,
43686
43666
  "PERSONAL" /* PERSONAL */,
@@ -43689,16 +43669,14 @@ var init_registry = __esm({
43689
43669
  [
43690
43670
  "US_DRIVERS_LIC",
43691
43671
  "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
43692
- "g",
43693
43672
  ["driver", "license", "licence", "dl", "dmv"],
43694
- 0.85,
43673
+ 0.55,
43695
43674
  "PERSONAL" /* PERSONAL */,
43696
43675
  null
43697
43676
  ],
43698
43677
  [
43699
43678
  "US_PASSPORT_NUM",
43700
43679
  "\\b[A-Z]\\d{8}\\b",
43701
- "g",
43702
43680
  ["passport", "travel", "visa", "immigration"],
43703
43681
  0.87,
43704
43682
  "PERSONAL" /* PERSONAL */,
@@ -43708,7 +43686,6 @@ var init_registry = __esm({
43708
43686
  [
43709
43687
  "VEHICLE_VIN",
43710
43688
  "\\b[A-HJ-NPR-Z0-9]{17}\\b",
43711
- "g",
43712
43689
  ["vin", "vehicle", "chassis", "automobile"],
43713
43690
  0.92,
43714
43691
  "VEHICLE" /* VEHICLE */,
@@ -43717,7 +43694,6 @@ var init_registry = __esm({
43717
43694
  [
43718
43695
  "VEHICLE_PLATE",
43719
43696
  "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
43720
- "g",
43721
43697
  ["plate", "registration", "vehicle", "plaka"],
43722
43698
  0.45,
43723
43699
  "VEHICLE" /* VEHICLE */,
@@ -43727,7 +43703,6 @@ var init_registry = __esm({
43727
43703
  [
43728
43704
  "MED_RECORD_ID",
43729
43705
  "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
43730
- "g",
43731
43706
  ["patient", "medical", "record", "mrn", "hospital"],
43732
43707
  0.96,
43733
43708
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43736,7 +43711,6 @@ var init_registry = __esm({
43736
43711
  [
43737
43712
  "US_MEDICARE_ID",
43738
43713
  "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
43739
- "g",
43740
43714
  ["medicare", "cms", "beneficiary", "health insurance"],
43741
43715
  0.91,
43742
43716
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43745,7 +43719,6 @@ var init_registry = __esm({
43745
43719
  [
43746
43720
  "US_DEA_NUM",
43747
43721
  "\\b[A-Z]{2}\\d{7}\\b",
43748
- "g",
43749
43722
  ["dea", "prescriber", "drug", "enforcement"],
43750
43723
  0.89,
43751
43724
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43754,7 +43727,6 @@ var init_registry = __esm({
43754
43727
  [
43755
43728
  "US_NPI_NUM",
43756
43729
  "\\b\\d{10}\\b",
43757
- "g",
43758
43730
  ["npi", "provider", "national provider", "healthcare"],
43759
43731
  0.87,
43760
43732
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43764,7 +43736,6 @@ var init_registry = __esm({
43764
43736
  [
43765
43737
  "US_EIN_TAX",
43766
43738
  "\\b\\d{2}-\\d{7}\\b",
43767
- "g",
43768
43739
  ["ein", "federal", "employer", "tax id"],
43769
43740
  0.89,
43770
43741
  "IDENTITY_US" /* IDENTITY_US */,
@@ -43774,71 +43745,33 @@ var init_registry = __esm({
43774
43745
  [
43775
43746
  "UK_NATL_INS",
43776
43747
  "\\b[A-Z]{2}\\d{6}[A-Z]\\b",
43777
- "g",
43778
43748
  ["nino", "national insurance", "ni number", "uk"],
43779
43749
  0.9,
43780
43750
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43781
- null
43751
+ "uk_nino"
43782
43752
  ],
43783
43753
  [
43784
43754
  "CA_SOCIAL_INS",
43785
43755
  "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b",
43786
- "g",
43787
43756
  ["sin", "social insurance", "canada", "canadian"],
43788
43757
  0.89,
43789
43758
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43790
- null
43759
+ "ca_sin"
43791
43760
  ],
43792
43761
  [
43793
- "FR_INSEE_NUM",
43794
- "\\b[12]\\d{2}[01]\\d\\d{8}\\d{2}\\b",
43795
- "g",
43796
- ["insee", "s\xE9curit\xE9 sociale", "france", "num\xE9ro"],
43797
- 0.88,
43798
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43799
- null
43800
- ],
43801
- [
43802
- "DE_STEUER_ID",
43803
- "\\b\\d{2}\\s?\\d{3}\\s?\\d{3}\\s?\\d{3}\\b",
43804
- "g",
43805
- ["steuer", "steuernummer", "finanzamt", "deutschland"],
43806
- 0.87,
43807
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43808
- null
43809
- ],
43810
- [
43811
- "TR_TCID",
43812
- "\\b[1-9]\\d{9}[02468]\\b",
43813
- "g",
43814
- ["tc", "kimlik", "vatanda\u015Fl\u0131k", "n\xFCfus", "t\xFCrkiye"],
43815
- 0.92,
43816
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43817
- "tcid"
43818
- ],
43819
- [
43820
- "SA_NATIONAL_ID",
43821
- "\\b1\\d{9}\\b",
43822
- "g",
43823
- ["\u0647\u0648\u064A\u0629", "\u0631\u0642\u0645 \u0627\u0644\u0647\u0648\u064A\u0629", "saudi", "\u0648\u0637\u0646\u064A\u0629", "identity"],
43824
- 0.91,
43825
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43826
- "saudi_nid"
43827
- ],
43828
- [
43829
- "UAE_EMIRATES_ID",
43830
- "\\b784-\\d{4}-\\d{7}-\\d\\b",
43831
- "g",
43832
- ["emirates", "\u0647\u0648\u064A\u0629", "uae", "emirati", "identity"],
43833
- 0.93,
43762
+ "ES_DNI",
43763
+ "(?:\\d{8}[A-Z]|[XYZ]\\d{7}[A-Z])",
43764
+ ["dni", "nie", "identidad", "nif", "spain"],
43765
+ 0.94,
43834
43766
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43835
- "luhn"
43767
+ "es_id",
43768
+ true,
43769
+ ["*", "es"]
43836
43770
  ],
43837
43771
  // ── CORPORATE ──────────────────────────────────────────────────────
43838
43772
  [
43839
43773
  "CORP_EMPLOYEE_ID",
43840
- "\\b(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}\\b",
43841
- "gi",
43774
+ "(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}",
43842
43775
  ["employee", "staff", "personnel", "worker"],
43843
43776
  0.55,
43844
43777
  "CORPORATE" /* CORPORATE */,
@@ -43848,7 +43781,11 @@ var init_registry = __esm({
43848
43781
  DLPPatternRegistry = class {
43849
43782
  constructor(loadGroups) {
43850
43783
  this.catalogue = /* @__PURE__ */ new Map();
43784
+ this.localeCategoryRegexMap = /* @__PURE__ */ new Map();
43851
43785
  this.buildCatalogue(loadGroups ?? null);
43786
+ for (const loc of ["*", "en", "es"]) {
43787
+ this.compileForLocale(loc);
43788
+ }
43852
43789
  }
43853
43790
  get typeNames() {
43854
43791
  return [...this.catalogue.keys()];
@@ -43860,23 +43797,74 @@ var init_registry = __esm({
43860
43797
  descriptorFor(typeName) {
43861
43798
  return this.catalogue.get(typeName);
43862
43799
  }
43863
- /** Return locale-tuned name regexes, falling back to English. */
43864
43800
  namePatternsFor(lang) {
43865
43801
  return LOCALE_NAME_RULES[lang] ?? LOCALE_NAME_RULES["en"];
43866
43802
  }
43867
- /** Return locale-tuned address regexes, falling back to English. */
43868
43803
  addressPatternsFor(lang) {
43869
43804
  return LOCALE_ADDRESS_RULES[lang] ?? LOCALE_ADDRESS_RULES["en"];
43870
43805
  }
43806
+ getCategoryRegexesMap(locale = "en") {
43807
+ if (!this.localeCategoryRegexMap.has(locale)) {
43808
+ this.compileForLocale(locale);
43809
+ }
43810
+ return this.localeCategoryRegexMap.get(locale);
43811
+ }
43812
+ getCategoryTypeMap(categoryName, locale = "en") {
43813
+ return this.localeCategoryRegexMap.get(locale)?.get(categoryName)?.typeOrder ?? [];
43814
+ }
43815
+ compileForLocale(locale) {
43816
+ const localePool = /* @__PURE__ */ new Map();
43817
+ for (const [typeName, desc] of this.catalogue.entries()) {
43818
+ if (desc.supportedLocales.includes("*") || desc.supportedLocales.includes(locale)) {
43819
+ const catKey = desc.category;
43820
+ if (!localePool.has(catKey)) localePool.set(catKey, []);
43821
+ localePool.get(catKey).push([typeName, desc]);
43822
+ }
43823
+ }
43824
+ const categoryMap = /* @__PURE__ */ new Map();
43825
+ for (const [catKey, entries] of localePool.entries()) {
43826
+ entries.sort(([, a6], [, b6]) => {
43827
+ const aVal = a6.validatorTag ? 0 : 1;
43828
+ const bVal = b6.validatorTag ? 0 : 1;
43829
+ if (aVal !== bVal) return aVal - bVal;
43830
+ return b6.compiledRe.source.length - a6.compiledRe.source.length;
43831
+ });
43832
+ const parts = [];
43833
+ const typeOrder = [];
43834
+ for (const [typeName, desc] of entries) {
43835
+ parts.push(`(?<${typeName}>${desc.compiledRe.source})`);
43836
+ typeOrder.push(typeName);
43837
+ }
43838
+ const combinedSource = parts.join("|");
43839
+ const needsI = entries.some(([, d6]) => d6.compiledRe.flags.includes("i"));
43840
+ const flags = needsI ? "gi" : "g";
43841
+ try {
43842
+ const re = new RegExp(combinedSource, flags);
43843
+ categoryMap.set(catKey, { re, typeOrder });
43844
+ } catch (err2) {
43845
+ console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err2);
43846
+ }
43847
+ }
43848
+ this.localeCategoryRegexMap.set(locale, categoryMap);
43849
+ }
43871
43850
  buildCatalogue(restrict) {
43872
- for (const [typeName, regexStr, flags, terms, risk, cat, vtag] of RAW_PATTERNS) {
43851
+ for (const entry of RAW_PATTERNS) {
43852
+ const [typeName, regexSource, terms, risk, cat, vtag, isHighEntropy, supportedLocales] = entry;
43873
43853
  if (restrict !== null && !restrict.has(cat)) continue;
43854
+ let re;
43855
+ if (regexSource instanceof RegExp) {
43856
+ re = regexSource;
43857
+ } else {
43858
+ re = new RegExp(regexSource, "g");
43859
+ }
43874
43860
  this.catalogue.set(typeName, {
43875
- compiledRe: new RegExp(regexStr, flags),
43861
+ compiledRe: re,
43876
43862
  proximityTerms: new Set(terms),
43877
43863
  baseRisk: risk,
43878
43864
  category: cat,
43879
- validatorTag: vtag
43865
+ validatorTag: vtag,
43866
+ isHighEntropy: isHighEntropy ?? vtag !== null,
43867
+ supportedLocales: supportedLocales ?? ["*"]
43880
43868
  });
43881
43869
  }
43882
43870
  }
@@ -43973,29 +43961,13 @@ function checkIpv4Octets(raw) {
43973
43961
  }
43974
43962
  return true;
43975
43963
  }
43976
- function checkTcidNumber(raw) {
43977
- const digitsStr = raw.replace(/\D/g, "");
43978
- if (digitsStr.length !== 11) return false;
43979
- const d6 = digitsStr.split("").map(Number);
43980
- if (d6[0] === 0) return false;
43981
- if (d6[10] % 2 !== 0) return false;
43982
- const oddSum = d6[0] + d6[2] + d6[4] + d6[6] + d6[8];
43983
- const evenSum = d6[1] + d6[3] + d6[5] + d6[7];
43984
- const computedD10 = ((oddSum * 7 - evenSum) % 10 + 10) % 10;
43985
- if (computedD10 !== d6[9]) return false;
43986
- const firstTenSum = d6.slice(0, 10).reduce((a6, b6) => a6 + b6, 0);
43987
- if (firstTenSum % 10 !== d6[10]) return false;
43988
- return true;
43989
- }
43990
- function checkSaudiNid(raw) {
43991
- const digitsStr = raw.replace(/\D/g, "");
43992
- if (digitsStr.length !== 10) return false;
43993
- const d6 = digitsStr.split("").map(Number);
43994
- if (d6[0] !== 1) return false;
43964
+ function checkCaSin(raw) {
43965
+ const digits = raw.replace(/\D/g, "");
43966
+ if (digits.length !== 9) return false;
43995
43967
  let total = 0;
43996
- for (let idx = 0; idx < 10; idx++) {
43997
- let val = d6[idx];
43998
- if (idx % 2 === 0) {
43968
+ for (let idx = 0; idx < digits.length; idx++) {
43969
+ let val = parseInt(digits[idx], 10);
43970
+ if (idx % 2 === 1) {
43999
43971
  val *= 2;
44000
43972
  if (val > 9) val -= 9;
44001
43973
  }
@@ -44003,7 +43975,30 @@ function checkSaudiNid(raw) {
44003
43975
  }
44004
43976
  return total % 10 === 0;
44005
43977
  }
44006
- var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, VALIDATOR_DISPATCH, DLPValidationEngine;
43978
+ function checkUkNino(raw) {
43979
+ const cleaned = raw.replace(/ /g, "").toUpperCase();
43980
+ if (cleaned.length !== 9) return false;
43981
+ return UK_NINO_REGEX.test(cleaned);
43982
+ }
43983
+ function checkEsId(raw) {
43984
+ const cleaned = raw.replace(/[\s-]/g, "").toUpperCase();
43985
+ if (cleaned.length !== 9) return false;
43986
+ const mapping = { X: "0", Y: "1", Z: "2" };
43987
+ const firstChar = cleaned[0];
43988
+ let numStr;
43989
+ if (firstChar in mapping) {
43990
+ numStr = mapping[firstChar] + cleaned.slice(1, 8);
43991
+ } else if (/^\d$/.test(firstChar)) {
43992
+ numStr = cleaned.slice(0, 8);
43993
+ } else {
43994
+ return false;
43995
+ }
43996
+ if (!/^\d+$/.test(numStr)) return false;
43997
+ const num = parseInt(numStr, 10);
43998
+ const validLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
43999
+ return cleaned[8] === validLetters[num % 23];
44000
+ }
44001
+ var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, UK_NINO_REGEX, VALIDATOR_DISPATCH, DLPValidationEngine;
44007
44002
  var init_handlers = __esm({
44008
44003
  "src/core/dlp/handlers.ts"() {
44009
44004
  IBAN_COUNTRY_LENGTHS = {
@@ -44106,6 +44101,7 @@ var init_handlers = __esm({
44106
44101
  Z: 9
44107
44102
  };
44108
44103
  VIN_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2];
44104
+ UK_NINO_REGEX = /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z]{2}[0-9]{6}[A-D]$/;
44109
44105
  VALIDATOR_DISPATCH = {
44110
44106
  luhn: checkLuhn,
44111
44107
  ssn_area: checkSsnArea,
@@ -44114,8 +44110,9 @@ var init_handlers = __esm({
44114
44110
  vin_format: checkVinFormat,
44115
44111
  btc_format: checkBtcFormat,
44116
44112
  ipv4: checkIpv4Octets,
44117
- tcid: checkTcidNumber,
44118
- saudi_nid: checkSaudiNid
44113
+ ca_sin: checkCaSin,
44114
+ uk_nino: checkUkNino,
44115
+ es_id: checkEsId
44119
44116
  };
44120
44117
  DLPValidationEngine = class {
44121
44118
  /**
@@ -44152,7 +44149,8 @@ var init_scorer = __esm({
44152
44149
  keywordBoost: 0.1,
44153
44150
  validatorOverride: 0.99,
44154
44151
  maxConfidence: 0.99,
44155
- penaltyFactor: 0.65
44152
+ penaltyFactor: 0.99
44153
+ // Renamed functionally to validator failure penalty subtraction
44156
44154
  };
44157
44155
  DLPConfidenceScorer = class {
44158
44156
  constructor(overrides = {}) {
@@ -44171,7 +44169,7 @@ var init_scorer = __esm({
44171
44169
  score(input) {
44172
44170
  if (input.validatorPassed === true) return this.valOverride;
44173
44171
  if (input.validatorPassed === false) {
44174
- return Math.min(this.ceil, input.baseRisk * this.penalty);
44172
+ return Math.max(0, input.baseRisk - this.penalty);
44175
44173
  }
44176
44174
  const windowLo = Math.max(0, input.matchStart - this.window);
44177
44175
  const windowHi = Math.min(input.fullText.length, input.matchEnd + this.window);
@@ -44197,6 +44195,47 @@ var init_scorer = __esm({
44197
44195
  }
44198
44196
  });
44199
44197
 
44198
+ // src/core/span.ts
44199
+ function resolveOverlaps(spans) {
44200
+ if (spans.length === 0) return [];
44201
+ const sorted = [...spans].sort((a6, b6) => {
44202
+ if (a6.start !== b6.start) return a6.start - b6.start;
44203
+ const lenDiff = b6.end - b6.start - (a6.end - a6.start);
44204
+ if (lenDiff !== 0) return lenDiff;
44205
+ return b6.confidence - a6.confidence;
44206
+ });
44207
+ const resolved = [];
44208
+ let occupiedEnd = -1;
44209
+ for (const span of sorted) {
44210
+ if (span.start >= occupiedEnd) {
44211
+ resolved.push(span);
44212
+ occupiedEnd = span.end;
44213
+ } else if (span.end <= occupiedEnd) {
44214
+ continue;
44215
+ } else {
44216
+ const last = resolved[resolved.length - 1];
44217
+ if (span.confidence > last.confidence) {
44218
+ resolved.pop();
44219
+ resolved.push(span);
44220
+ occupiedEnd = span.end;
44221
+ }
44222
+ }
44223
+ }
44224
+ return resolved.sort((a6, b6) => b6.start - a6.start);
44225
+ }
44226
+ function reconstruct(text, resolvedSpans) {
44227
+ let result = text;
44228
+ for (const span of resolvedSpans) {
44229
+ if (span.maskedValue == null) continue;
44230
+ result = result.slice(0, span.start) + span.maskedValue + result.slice(span.end);
44231
+ }
44232
+ return result;
44233
+ }
44234
+ var init_span = __esm({
44235
+ "src/core/span.ts"() {
44236
+ }
44237
+ });
44238
+
44200
44239
  // node_modules/delayed-stream/lib/delayed_stream.js
44201
44240
  var require_delayed_stream = __commonJS({
44202
44241
  "node_modules/delayed-stream/lib/delayed_stream.js"(exports2, module) {
@@ -58867,7 +58906,7 @@ var init_transformers_scanner = __esm({
58867
58906
  confidence = Math.min(1, confidence + 0.2);
58868
58907
  }
58869
58908
  if (confidence >= confidenceThreshold && !looksLikeToken(val) && val.length > 1) {
58870
- const token = await encodeFn(val);
58909
+ const token = await encodeFn(val, { entityType });
58871
58910
  entities.push({
58872
58911
  type: entityType,
58873
58912
  value: val,
@@ -58974,17 +59013,18 @@ var init_scanner = __esm({
58974
59013
  init_registry();
58975
59014
  init_handlers();
58976
59015
  init_scorer();
59016
+ init_span();
58977
59017
  _dlpLanguageResolver = new LanguageContextResolver();
58978
59018
  _dlpPatternRegistry = new DLPPatternRegistry();
58979
59019
  _dlpValidationEngine = new DLPValidationEngine();
58980
59020
  _dlpConfidenceScorer = new DLPConfidenceScorer();
58981
59021
  REGEX_PATTERNS = {
58982
59022
  "EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
58983
- "PHONE_NUMBER": /\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4}/g,
58984
- "PHONE_NUMBER_INTL": /\+(?:44|33|49)[\s\-.]?\(?\d{1,5}\)?(?:[\s\-.]?\d{2,4}){2,4}/g,
58985
- "US_SSN": /\d{3}-\d{2}-\d{4}/g,
58986
- "CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
58987
- "US_ROUTING_NUMBER": /\b\d{9}\b/g,
59023
+ "PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
59024
+ "PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
59025
+ "US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
59026
+ "CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
59027
+ "US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
58988
59028
  "US_PASSPORT": /\b[A-Z]\d{8}\b/g,
58989
59029
  "DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g
58990
59030
  };
@@ -59039,26 +59079,55 @@ var init_scanner = __esm({
59039
59079
  const checksum = 3 * (d6[0] + d6[3] + d6[6]) + 7 * (d6[1] + d6[4] + d6[7]) + (d6[2] + d6[5] + d6[8]);
59040
59080
  return checksum % 10 === 0;
59041
59081
  }
59042
- async _tier0Dlp(text, encodeFn, confidenceThreshold) {
59082
+ async _tier0CollectSpans(text, confidenceThreshold) {
59043
59083
  const detectedLanguage = _dlpLanguageResolver.resolve(text);
59044
- const rawHits = [];
59045
- for (const [typeTag, descriptor] of _dlpPatternRegistry.iterDescriptors()) {
59046
- const re = new RegExp(descriptor.compiledRe.source, descriptor.compiledRe.flags);
59084
+ const spans = [];
59085
+ const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
59086
+ for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
59087
+ const megaRe = new RegExp(re.source, re.flags);
59047
59088
  let m6;
59048
- while ((m6 = re.exec(text)) !== null) {
59089
+ while ((m6 = megaRe.exec(text)) !== null) {
59090
+ const groups = m6.groups ?? {};
59091
+ let typeTag;
59092
+ for (const name of typeOrder) {
59093
+ if (groups[name] !== void 0) {
59094
+ typeTag = name;
59095
+ break;
59096
+ }
59097
+ }
59098
+ if (!typeTag) continue;
59049
59099
  const matchedStr = m6[0];
59050
59100
  if (looksLikeToken(matchedStr)) continue;
59101
+ const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
59102
+ if (!descriptor) continue;
59051
59103
  const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
59052
- const conf = _dlpConfidenceScorer.score({
59053
- baseRisk: descriptor.baseRisk,
59054
- matchStart: m6.index,
59055
- matchEnd: m6.index + matchedStr.length,
59056
- fullText: text,
59057
- proximityTerms: descriptor.proximityTerms,
59058
- validatorPassed: validatorResult
59059
- });
59104
+ let conf;
59105
+ if (validatorResult === false) {
59106
+ if (descriptor.isHighEntropy) {
59107
+ conf = 0.85;
59108
+ } else {
59109
+ continue;
59110
+ }
59111
+ } else {
59112
+ conf = _dlpConfidenceScorer.score({
59113
+ baseRisk: descriptor.baseRisk,
59114
+ matchStart: m6.index,
59115
+ matchEnd: m6.index + matchedStr.length,
59116
+ fullText: text,
59117
+ proximityTerms: descriptor.proximityTerms,
59118
+ validatorPassed: validatorResult
59119
+ });
59120
+ }
59060
59121
  if (conf >= confidenceThreshold) {
59061
- rawHits.push({ start: m6.index, end: m6.index + matchedStr.length, tag: typeTag, val: matchedStr, conf });
59122
+ spans.push({
59123
+ start: m6.index,
59124
+ end: m6.index + matchedStr.length,
59125
+ entityType: typeTag,
59126
+ originalValue: matchedStr,
59127
+ confidence: conf,
59128
+ method: "dlp_heuristic",
59129
+ language: detectedLanguage
59130
+ });
59062
59131
  }
59063
59132
  }
59064
59133
  }
@@ -59077,7 +59146,15 @@ var init_scanner = __esm({
59077
59146
  validatorPassed: null
59078
59147
  });
59079
59148
  if (conf >= confidenceThreshold) {
59080
- rawHits.push({ start: m6.index, end: m6.index + m6[0].length, tag: "PERSON_NAME", val: m6[0], conf });
59149
+ spans.push({
59150
+ start: m6.index,
59151
+ end: m6.index + m6[0].length,
59152
+ entityType: "PERSON_NAME",
59153
+ originalValue: m6[0],
59154
+ confidence: conf,
59155
+ method: "dlp_heuristic",
59156
+ language: detectedLanguage
59157
+ });
59081
59158
  }
59082
59159
  }
59083
59160
  }
@@ -59086,85 +59163,78 @@ var init_scanner = __esm({
59086
59163
  let m6;
59087
59164
  while ((m6 = re.exec(text)) !== null) {
59088
59165
  if (looksLikeToken(m6[0])) continue;
59089
- rawHits.push({ start: m6.index, end: m6.index + m6[0].length, tag: "PHYS_ADDRESS", val: m6[0], conf: 0.55 });
59090
- }
59091
- }
59092
- rawHits.sort((a6, b6) => a6.start - b6.start || b6.end - b6.start - (a6.end - a6.start) || b6.conf - a6.conf);
59093
- const deduped = [];
59094
- let occupiedEnd = -1;
59095
- for (const hit of rawHits) {
59096
- if (hit.start >= occupiedEnd) {
59097
- deduped.push(hit);
59098
- occupiedEnd = hit.end;
59166
+ spans.push({
59167
+ start: m6.index,
59168
+ end: m6.index + m6[0].length,
59169
+ entityType: "PHYS_ADDRESS",
59170
+ originalValue: m6[0],
59171
+ confidence: 0.55,
59172
+ method: "dlp_heuristic",
59173
+ language: detectedLanguage
59174
+ });
59099
59175
  }
59100
59176
  }
59177
+ return spans;
59178
+ }
59179
+ /** Backward-compat wrapper — collects spans then single-pass encodes. */
59180
+ async _tier0Dlp(text, encodeFn, confidenceThreshold) {
59181
+ const spans = await this._tier0CollectSpans(text, confidenceThreshold);
59182
+ const resolved = resolveOverlaps(spans);
59101
59183
  const entities = [];
59102
- let excised = text;
59103
- for (const hit of [...deduped].reverse()) {
59104
- const token = await encodeFn(hit.val);
59105
- excised = excised.slice(0, hit.start) + token + excised.slice(hit.end);
59184
+ await Promise.all(resolved.map(async (span) => {
59185
+ span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
59106
59186
  entities.push({
59107
- type: hit.tag,
59108
- value: hit.val,
59109
- method: "dlp_heuristic",
59110
- confidence: hit.conf,
59111
- masked_value: token,
59112
- language: detectedLanguage
59187
+ type: span.entityType,
59188
+ value: span.originalValue,
59189
+ method: span.method,
59190
+ confidence: span.confidence,
59191
+ masked_value: span.maskedValue,
59192
+ language: span.language
59113
59193
  });
59114
- }
59115
- return [excised, entities];
59194
+ }));
59195
+ return [reconstruct(text, resolved), entities];
59116
59196
  }
59117
- async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59118
- let entities = [];
59119
- let excised = text;
59120
- let allMatches = [];
59197
+ async _tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold) {
59198
+ const spans = [];
59121
59199
  for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
59122
59200
  const re = new RegExp(pattern.source, pattern.flags);
59123
59201
  let match;
59124
59202
  while ((match = re.exec(text)) !== null) {
59125
- let confidence = 0.95;
59126
- if (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " "))) {
59127
- confidence = 1;
59128
- }
59129
- if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(match[0])) {
59130
- confidence = Math.max(confidence, 0.99);
59131
- }
59132
- if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(match[0])) {
59133
- continue;
59203
+ const val = match[0];
59204
+ if (looksLikeToken(val)) continue;
59205
+ let confidence = aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " ")) ? 1 : 0.95;
59206
+ if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
59207
+ if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(val)) continue;
59208
+ if (confidence >= confidenceThreshold) {
59209
+ spans.push({
59210
+ start: match.index,
59211
+ end: match.index + val.length,
59212
+ entityType,
59213
+ originalValue: val,
59214
+ confidence,
59215
+ method: "regex"
59216
+ });
59134
59217
  }
59135
- allMatches.push({
59136
- start: match.index,
59137
- end: match.index + match[0].length,
59138
- type: entityType,
59139
- value: match[0],
59140
- confidence
59141
- });
59142
- }
59143
- }
59144
- allMatches.sort((a6, b6) => a6.start - b6.start || b6.end - b6.start - (a6.end - a6.start));
59145
- let filtered = [];
59146
- let lastEnd = -1;
59147
- for (const m6 of allMatches) {
59148
- if (m6.start >= lastEnd) {
59149
- filtered.push(m6);
59150
- lastEnd = m6.end;
59151
59218
  }
59152
59219
  }
59153
- const sortedFiltered = [...filtered].sort((a6, b6) => b6.start - a6.start);
59154
- for (const m6 of sortedFiltered) {
59155
- if (m6.confidence >= confidenceThreshold && !looksLikeToken(m6.value)) {
59156
- const token = await encodeFn(m6.value);
59157
- excised = excised.slice(0, m6.start) + token + excised.slice(m6.end);
59158
- entities.push({
59159
- type: m6.type,
59160
- value: m6.value,
59161
- method: "regex",
59162
- confidence: m6.confidence,
59163
- masked_value: token
59164
- });
59165
- }
59166
- }
59167
- return [excised, entities];
59220
+ return spans;
59221
+ }
59222
+ /** Backward-compat wrapper. */
59223
+ async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59224
+ const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
59225
+ const resolved = resolveOverlaps(spans);
59226
+ const entities = [];
59227
+ await Promise.all(resolved.map(async (span) => {
59228
+ span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
59229
+ entities.push({
59230
+ type: span.entityType,
59231
+ value: span.originalValue,
59232
+ method: span.method,
59233
+ confidence: span.confidence,
59234
+ masked_value: span.maskedValue
59235
+ });
59236
+ }));
59237
+ return [reconstruct(text, resolved), entities];
59168
59238
  }
59169
59239
  async _tier2Nlp(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59170
59240
  return [text, []];
@@ -59184,13 +59254,18 @@ var init_scanner = __esm({
59184
59254
  const _encode = options.encodeFn || encode;
59185
59255
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
59186
59256
  const boost = this._resolveBoost(options.context);
59187
- let currentText = text;
59257
+ const allSpans = [];
59188
59258
  if (pipeline.includes("dlp")) {
59189
- [currentText] = await this._tier0Dlp(currentText, _encode, confidenceThreshold);
59259
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59190
59260
  }
59191
59261
  if (pipeline.includes("regex") || pipeline.includes("checksum")) {
59192
- [currentText] = await this._tier1Regex(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
59262
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
59193
59263
  }
59264
+ const resolved = resolveOverlaps(allSpans);
59265
+ await Promise.all(resolved.map(async (span) => {
59266
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59267
+ }));
59268
+ let currentText = reconstruct(text, resolved);
59194
59269
  if (pipeline.includes("nlp")) {
59195
59270
  [currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
59196
59271
  }
@@ -59202,20 +59277,29 @@ var init_scanner = __esm({
59202
59277
  const _encode = options.encodeFn || encode;
59203
59278
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
59204
59279
  const boost = this._resolveBoost(options.context);
59205
- let allEntities = [];
59206
- let remaining = text;
59280
+ const allEntities = [];
59281
+ const allSpans = [];
59207
59282
  if (pipeline.includes("dlp")) {
59208
- const [newText, tier0] = await this._tier0Dlp(remaining, _encode, confidenceThreshold);
59209
- remaining = newText;
59210
- allEntities.push(...tier0);
59283
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59211
59284
  }
59212
59285
  if (pipeline.includes("regex") || pipeline.includes("checksum")) {
59213
- const [newText, tier1] = await this._tier1Regex(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59214
- remaining = newText;
59215
- allEntities.push(...tier1);
59216
- }
59286
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
59287
+ }
59288
+ const resolved = resolveOverlaps(allSpans);
59289
+ await Promise.all(resolved.map(async (span) => {
59290
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59291
+ allEntities.push({
59292
+ type: span.entityType,
59293
+ value: span.originalValue,
59294
+ method: span.method,
59295
+ confidence: span.confidence,
59296
+ masked_value: span.maskedValue,
59297
+ language: span.language
59298
+ });
59299
+ }));
59300
+ const remaining = reconstruct(text, resolved);
59217
59301
  if (pipeline.includes("nlp")) {
59218
- const [_newText, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59302
+ const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59219
59303
  allEntities.push(...tier2);
59220
59304
  }
59221
59305
  return allEntities;