mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -256,10 +256,13 @@ var init_exceptions = __esm({
256
256
  function looksLikeToken(value) {
257
257
  if (typeof value !== "string") return false;
258
258
  const v7 = value.trim();
259
- if (v7.startsWith("tkn-") && v7.includes("@email.com")) {
260
- return true;
259
+ if (v7.startsWith("tkn-") && v7.includes("@")) {
260
+ const parts = v7.split("@");
261
+ if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
262
+ return true;
263
+ }
261
264
  }
262
- if (v7.startsWith("+1-555-") && v7.length === 14) {
265
+ if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v7)) {
263
266
  return true;
264
267
  }
265
268
  if (v7.startsWith("000-00-") && v7.length === 11) {
@@ -271,16 +274,13 @@ function looksLikeToken(value) {
271
274
  if (v7.startsWith("000000") && v7.length === 9) {
272
275
  return true;
273
276
  }
274
- if (v7.startsWith("784-0000-") && v7.length === 18) {
275
- return true;
276
- }
277
- if (v7.length === 11 && v7.startsWith("990000") && /^\d+$/.test(v7) && parseInt(v7[v7.length - 1], 10) % 2 === 0) {
277
+ if (v7.length === 9 && v7.startsWith("000") && /[A-Z]$/.test(v7)) {
278
278
  return true;
279
279
  }
280
- if (v7.length === 10 && v7.startsWith("100000") && /^\d+$/.test(v7)) {
280
+ if (/^[A-Z]{2}00[A-F0-9]{4,16}$/.test(v7)) {
281
281
  return true;
282
282
  }
283
- if (/^[A-Z]{2}00[A-F0-9]{4,16}$/.test(v7)) {
283
+ if (/^<(PER|LOC|ORG):[^>]+>$/.test(v7)) {
284
284
  return true;
285
285
  }
286
286
  if (v7.startsWith("[TKN-") && v7.endsWith("]")) {
@@ -292,7 +292,7 @@ var TOKEN_PATTERN;
292
292
  var init_fpe_utils = __esm({
293
293
  "src/core/fpe_utils.ts"() {
294
294
  TOKEN_PATTERN = new RegExp(
295
- "tkn-[a-f0-9]{8,64}@email\\.com|\\+1-555-\\d{7}|000-00-\\d{4}|4000-0000-0000-\\d{4}|000000\\d{3}|990000\\d{4}[02468]|100000\\d{4}|784-0000-\\d{7}-\\d|[A-Z]{2}00[A-F0-9]{4,16}|\\[TKN-[a-f0-9]{8,64}\\]",
295
+ "tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}|\\+[1-9]\\d{0,3}-555-\\d{7}|000-00-\\d{4}|4000-0000-0000-\\d{4}|000000\\d{3}|000\\d{5}[A-Z]|[A-Z]{2}00[A-F0-9]{4,16}|<(?:PER|LOC|ORG):[^>]+>|\\[TKN-[a-f0-9]{8,64}\\]",
296
296
  // Opaque
297
297
  "g"
298
298
  );
@@ -341,42 +341,87 @@ async function _hmacDigits(plaintext, n6, offset = 0) {
341
341
  }
342
342
  return result.join("");
343
343
  }
344
- async function generateFPEToken(rawText) {
345
- const text = rawText.trim();
346
- if (_EMAIL_RE.test(text)) {
347
- return `tkn-${await _hmacHex(text)}@email.com`;
348
- }
349
- if (_PHONE_RE.test(text)) {
350
- return `+1-555-${await _hmacDigits(text, 7)}`;
344
+ async function _pickFromArray(plaintext, array) {
345
+ const digits = await _hmacDigits(plaintext, 8);
346
+ const num = parseInt(digits, 10);
347
+ return array[num % array.length];
348
+ }
349
+ function _computeLuhnDigit(partialNum) {
350
+ const digits = partialNum.split("").map(Number);
351
+ let sum = 0;
352
+ let shouldDouble = true;
353
+ for (let i6 = digits.length - 1; i6 >= 0; i6--) {
354
+ let digit = digits[i6];
355
+ if (shouldDouble) {
356
+ digit *= 2;
357
+ if (digit > 9) digit -= 9;
358
+ }
359
+ sum += digit;
360
+ shouldDouble = !shouldDouble;
351
361
  }
352
- if (_SSN_RE.test(text)) {
362
+ return ((10 - sum % 10) % 10).toString();
363
+ }
364
+ function _computeEsIdCheck(num) {
365
+ return "TRWAGMYFPDXBNJZSQVHLCKE"[num % 23];
366
+ }
367
+ async function generateFPEToken(rawText, entityType = "UNKNOWN") {
368
+ const text = rawText.trim();
369
+ let type = (entityType || "UNKNOWN").toUpperCase();
370
+ if (type === "UNKNOWN") {
371
+ if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
372
+ else if (_SSN_RE.test(text)) type = "US_SSN";
373
+ else if (_CC_RE.test(text)) type = "CREDIT_CARD";
374
+ else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
375
+ else if (_ES_ID_RE.test(text)) type = "ES_DNI";
376
+ else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
377
+ else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
378
+ }
379
+ if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
380
+ const parts = text.split("@");
381
+ const domain = parts.length === 2 ? parts[1] : "email.com";
382
+ return `tkn-${await _hmacHex(text)}@${domain}`;
383
+ }
384
+ if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
385
+ const m6 = text.match(/^\+([1-9]\d{0,3})/);
386
+ const cc = m6 ? m6[1] : "1";
387
+ return `+${cc}-555-${await _hmacDigits(text, 7)}`;
388
+ }
389
+ if (type === "US_SSN") {
353
390
  return `000-00-${await _hmacDigits(text, 4)}`;
354
391
  }
355
- if (_CC_RE.test(text)) {
356
- return `4000-0000-0000-${await _hmacDigits(text, 4)}`;
392
+ if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
393
+ const base = `400000000000${await _hmacDigits(text, 3)}`;
394
+ const checkDig = _computeLuhnDigit(base);
395
+ const full = base + checkDig;
396
+ return `${full.slice(0, 4)}-${full.slice(4, 8)}-${full.slice(8, 12)}-${full.slice(12, 16)}`;
357
397
  }
358
- if (_ROUTING_RE.test(text)) {
398
+ if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
359
399
  return `000000${await _hmacDigits(text, 3)}`;
360
400
  }
361
- if (_TCID_RE.test(text)) {
362
- const tail = await _hmacDigits(text, 5);
363
- let lastD = parseInt(tail[tail.length - 1], 10);
364
- if (lastD % 2 !== 0) lastD = (lastD + 1) % 10;
365
- return `990000${tail.slice(0, 4)}${lastD}`;
401
+ if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
402
+ const countryCode = text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2)) ? text.slice(0, 2).toUpperCase() : "US";
403
+ return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
404
+ }
405
+ if (type === "ES_DNI") {
406
+ const digits = `000${await _hmacDigits(text, 5)}`;
407
+ return digits + _computeEsIdCheck(parseInt(digits, 10));
366
408
  }
367
- if (_SAUDI_NID_RE.test(text)) {
368
- return `100000${await _hmacDigits(text, 4)}`;
409
+ if (type === "PERSON" || type === "PERSON_NAME") {
410
+ const f6 = await _pickFromArray(text, _FIRST_NAMES);
411
+ const l6 = await _pickFromArray(text + "last", _LAST_NAMES);
412
+ return `<PER:${f6}_${l6}>`;
369
413
  }
370
- if (_UAE_EID_RE.test(text)) {
371
- return `784-0000-${await _hmacDigits(text, 7)}-${await _hmacDigits(text, 1, 20)}`;
414
+ if (type === "LOCATION" || type === "PHYS_ADDRESS") {
415
+ const c6 = await _pickFromArray(text, _CITIES);
416
+ return `<LOC:${c6}>`;
372
417
  }
373
- if (_IBAN_RE.test(text)) {
374
- const countryCode = text.slice(0, 2);
375
- return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
418
+ if (type === "ORGANIZATION") {
419
+ const c6 = await _pickFromArray(text, _LAST_NAMES);
420
+ return `<ORG:${c6}_Inc>`;
376
421
  }
377
422
  return `[TKN-${await _hmacHex(text)}]`;
378
423
  }
379
- var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE, _TCID_RE, _SAUDI_NID_RE, _UAE_EID_RE, _IBAN_RE;
424
+ var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE, _ES_ID_RE, _IBAN_RE, _FIRST_NAMES, _LAST_NAMES, _CITIES;
380
425
  var init_fpe = __esm({
381
426
  "src/core/fpe.ts"() {
382
427
  init_config();
@@ -385,14 +430,15 @@ var init_fpe = __esm({
385
430
  init_fpe_utils();
386
431
  _masterKey = null;
387
432
  _EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
388
- _PHONE_RE = /^\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}$|^\d{3}[\s\-.]?\d{4}$/;
433
+ _PHONE_RE = /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/;
389
434
  _SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
390
435
  _CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
391
436
  _ROUTING_RE = /^\d{9}$/;
392
- _TCID_RE = /^[1-9]\d{9}[02468]$/;
393
- _SAUDI_NID_RE = /^1\d{9}$/;
394
- _UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
437
+ _ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
395
438
  _IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
439
+ _FIRST_NAMES = ["Taylor", "Jordan", "Casey", "Morgan", "Riley", "Avery", "Rowan", "Quinn", "Charlie", "Peyton", "Blake", "Dakota", "Reese", "Skyler", "Finley", "Eden", "Harley", "Rory", "Emerson", "Remi"];
440
+ _LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"];
441
+ _CITIES = ["London", "Paris", "Berlin", "Tokyo", "Rome", "Madrid", "Vienna", "Sydney", "Toronto", "Chicago", "Seattle", "Austin", "Boston", "Denver", "Dallas", "Miami", "Seoul", "Dubai", "Mumbai", "Cairo"];
396
442
  }
397
443
  });
398
444
 
@@ -42993,7 +43039,7 @@ async function encode(rawText, options = {}) {
42993
43039
  return existingToken;
42994
43040
  }
42995
43041
  }
42996
- const token = await generateFPEToken(text);
43042
+ const token = await generateFPEToken(text, options.entityType || "UNKNOWN");
42997
43043
  const ciphertext = cryptoEngine.encrypt(text);
42998
43044
  const ttl = options.ttl || DEFAULT_TTL;
42999
43045
  await vault.store(token, ciphertext, ttl, ptHash);
@@ -43445,19 +43491,8 @@ var SCRIPT_SIGNATURES; exports.LanguageContextResolver = void 0;
43445
43491
  var init_assessor = __esm({
43446
43492
  "src/core/dlp/assessor.ts"() {
43447
43493
  SCRIPT_SIGNATURES = [
43448
- // CJK / East-Asian — checked first because they are unambiguous
43449
- { tag: "zh", pattern: /[\u4e00-\u9fff\u3400-\u4dbf]/g },
43450
- { tag: "ja", pattern: /[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff]/g },
43451
- // Arabic script — covers Standard Arabic, Urdu overlap, etc.
43452
- { tag: "ar", pattern: /[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]/g },
43453
- // Turkish — distinguished by dotless-i (ı), soft-g (ğ), ş, and cedilla ç
43454
- { tag: "tr", pattern: /[ğıİşŞ]/g },
43455
- // German — umlauts and Eszett
43456
- { tag: "de", pattern: /[äöüÄÖÜß]/g },
43457
43494
  // Spanish — ñ and inverted punctuation
43458
- { tag: "es", pattern: /[ñÑ¡¿]/g },
43459
- // French — cedilla, accented vowels with circumflex / diaeresis
43460
- { tag: "fr", pattern: /[àâçéèêëïîôùûüÿœæ]/gi }
43495
+ { tag: "es", pattern: /[ñÑ¡¿]/g }
43461
43496
  ];
43462
43497
  exports.LanguageContextResolver = class {
43463
43498
  constructor(charThreshold = 1) {
@@ -43515,34 +43550,12 @@ var init_registry = __esm({
43515
43550
  })(exports.SensitiveCategory || {});
43516
43551
  LOCALE_NAME_RULES = {
43517
43552
  en: [
43518
- /\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b/g,
43519
- /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\b/g
43553
+ /\b[A-Z][a-z\-\']+ [A-Z][a-z\-\']+(?:\s+[A-Z][a-z\-\']+)?\b/g,
43554
+ /\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z\-\']+\b/g
43520
43555
  ],
43521
43556
  es: [
43522
- /\b[A-Z][a-záéíóúñ]+ [A-Z][a-záéíóúñ]+(?:\s+[A-Z][a-záéíóúñ]+)?\b/g,
43523
- /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ]+\b/g
43524
- ],
43525
- fr: [
43526
- /\b[A-Z][a-zàâçéèêëïîôùûü]+ [A-Z][a-zàâçéèêëïîôùûü]+\b/g,
43527
- /\b(?:M|Mme|Mlle)\.?\s+[A-Z][a-zàâçéèêëïîôùûü]+\b/g
43528
- ],
43529
- de: [
43530
- /\b[A-Z][a-zäöüß]+ [A-Z][a-zäöüß]+\b/g,
43531
- /\b(?:Herr|Frau)\.?\s+[A-Z][a-zäöüß]+\b/g
43532
- ],
43533
- tr: [
43534
- /\b[A-ZÇĞİÖŞÜ][a-zçğıöşü]+ [A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
43535
- /\b(?:Bay|Bayan|Sayın)\.?\s+[A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g
43536
- ],
43537
- ar: [
43538
- /[\u0621-\u064a][\u0600-\u06ff]+ [\u0621-\u064a][\u0600-\u06ff]+/g,
43539
- /(?:أبو|أم|ابن|بنت)\s+[\u0621-\u064a][\u0600-\u06ff]+/gi
43540
- ],
43541
- ja: [
43542
- /\b[A-Z][a-z]+(?:moto|yama|kawa|mura|ta|da|shi|no)\s+[A-Z][a-z]+\b/g
43543
- ],
43544
- zh: [
43545
- /\b[A-Z][a-z]{1,3}\s+[A-Z][a-z]+\b/g
43557
+ /\b[A-Z][a-záéíóúñ\-\']+ [A-Z][a-záéíóúñ\-\']+(?:\s+[A-Z][a-záéíóúñ\-\']+)?\b/g,
43558
+ /\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ\-\']+\b/g
43546
43559
  ]
43547
43560
  };
43548
43561
  LOCALE_ADDRESS_RULES = {
@@ -43550,26 +43563,8 @@ var init_registry = __esm({
43550
43563
  /\b\d{1,5}\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b/g,
43551
43564
  /\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?\b/g
43552
43565
  ],
43553
- fr: [
43554
- /\b\d{1,4}\s+(?:rue|avenue|boulevard|place|chemin)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi
43555
- ],
43556
- de: [
43557
- /\b[A-ZÄÖÜa-zäöüß]+(?:straße|strasse|weg|gasse|platz)\s+\d{1,4}\b/g
43558
- ],
43559
- tr: [
43560
- /\b[A-ZÇĞİÖŞÜa-zçğıöşü]+\s+(?:Cad|Sok|Mah)\.?\s+/gi,
43561
- /\b\d{5}\s+[A-ZÇĞİÖŞÜa-zçğıöşü]+\/[A-ZÇĞİÖŞÜa-zçğıöşü]+\b/g
43562
- ],
43563
- ar: [
43564
- /شارع\s+[\u0600-\u06ff]+/g,
43565
- /حي\s+[\u0600-\u06ff]+/g,
43566
- /(?:ص\.ب|P\.?O\.?\s*Box)\s*\d{3,6}/gi
43567
- ],
43568
- uk_postcode: [
43569
- /\b[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\b/g
43570
- ],
43571
- ca_postal: [
43572
- /\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b/g
43566
+ es: [
43567
+ /\b(?:Calle|Carrera|Avenida|Paseo|Plaza)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi
43573
43568
  ]
43574
43569
  };
43575
43570
  RAW_PATTERNS = [
@@ -43577,7 +43572,6 @@ var init_registry = __esm({
43577
43572
  [
43578
43573
  "US_SSN",
43579
43574
  "\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
43580
- "g",
43581
43575
  ["ssn", "social security", "tax id", "taxpayer"],
43582
43576
  0.95,
43583
43577
  "FINANCIAL" /* FINANCIAL */,
@@ -43586,7 +43580,6 @@ var init_registry = __esm({
43586
43580
  [
43587
43581
  "CREDIT_CARD_NUMBER",
43588
43582
  "\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
43589
- "g",
43590
43583
  ["card", "credit", "visa", "mastercard", "amex", "payment"],
43591
43584
  0.97,
43592
43585
  "FINANCIAL" /* FINANCIAL */,
@@ -43595,7 +43588,6 @@ var init_registry = __esm({
43595
43588
  [
43596
43589
  "INTL_BANK_IBAN",
43597
43590
  "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
43598
- "g",
43599
43591
  ["iban", "swift", "sepa", "wire", "bank transfer"],
43600
43592
  0.96,
43601
43593
  "FINANCIAL" /* FINANCIAL */,
@@ -43604,7 +43596,6 @@ var init_registry = __esm({
43604
43596
  [
43605
43597
  "CRYPTO_BTC",
43606
43598
  "\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
43607
- "g",
43608
43599
  ["bitcoin", "btc", "wallet", "crypto"],
43609
43600
  0.94,
43610
43601
  "FINANCIAL" /* FINANCIAL */,
@@ -43613,7 +43604,6 @@ var init_registry = __esm({
43613
43604
  [
43614
43605
  "CRYPTO_ETH",
43615
43606
  "\\b0x[a-fA-F0-9]{40}\\b",
43616
- "g",
43617
43607
  ["ethereum", "eth", "wallet", "0x"],
43618
43608
  0.93,
43619
43609
  "FINANCIAL" /* FINANCIAL */,
@@ -43621,8 +43611,7 @@ var init_registry = __esm({
43621
43611
  ],
43622
43612
  [
43623
43613
  "US_ABA_ROUTING",
43624
- "\\b\\d{9}\\b",
43625
- "g",
43614
+ /(?<!\d)\d{9}(?!\d)/,
43626
43615
  ["routing", "aba", "wire", "bank"],
43627
43616
  0.88,
43628
43617
  "FINANCIAL" /* FINANCIAL */,
@@ -43630,17 +43619,15 @@ var init_registry = __esm({
43630
43619
  ],
43631
43620
  [
43632
43621
  "BANK_ACCT_NUM",
43633
- "\\b\\d{8,17}\\b",
43634
- "g",
43622
+ /(?<!\d)\d{8,17}(?!\d)/,
43635
43623
  ["account", "checking", "savings", "deposit", "bank"],
43636
- 0.83,
43624
+ 0.5,
43637
43625
  "FINANCIAL" /* FINANCIAL */,
43638
- null
43626
+ "luhn_soft"
43639
43627
  ],
43640
43628
  [
43641
43629
  "SWIFT_BIC",
43642
43630
  "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
43643
- "gi",
43644
43631
  ["swift", "bic", "bank code", "transfer"],
43645
43632
  0.6,
43646
43633
  "FINANCIAL" /* FINANCIAL */,
@@ -43650,7 +43637,6 @@ var init_registry = __esm({
43650
43637
  [
43651
43638
  "EMAIL_ADDR",
43652
43639
  "\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
43653
- "g",
43654
43640
  ["email", "mail", "contact", "address"],
43655
43641
  0.99,
43656
43642
  "CONTACT" /* CONTACT */,
@@ -43658,26 +43644,23 @@ var init_registry = __esm({
43658
43644
  ],
43659
43645
  [
43660
43646
  "PHONE_NUM",
43661
- "(?:\\+?[1-9]\\d{0,3}[-.\\s]?)?\\(?\\d{1,4}\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}",
43662
- "g",
43647
+ /(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
43663
43648
  ["phone", "call", "mobile", "tel", "whatsapp", "number"],
43664
- 0.92,
43649
+ 0.8,
43665
43650
  "CONTACT" /* CONTACT */,
43666
43651
  null
43667
43652
  ],
43668
43653
  [
43669
43654
  "PHONE_NUM_INTL",
43670
- "\\+(?:44|33|49|90|966|971)[-.\\s]?\\(?\\d{1,5}\\)?(?:[-.\\s]?\\d{2,4}){2,4}",
43671
- "g",
43655
+ /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/,
43672
43656
  ["phone", "call", "mobile", "tel"],
43673
- 0.93,
43657
+ 0.8,
43674
43658
  "CONTACT" /* CONTACT */,
43675
43659
  null
43676
43660
  ],
43677
43661
  [
43678
43662
  "IPV4_ADDR",
43679
43663
  "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
43680
- "g",
43681
43664
  ["ip", "server", "host", "network", "address"],
43682
43665
  0.94,
43683
43666
  "CONTACT" /* CONTACT */,
@@ -43686,7 +43669,6 @@ var init_registry = __esm({
43686
43669
  [
43687
43670
  "IPV6_ADDR",
43688
43671
  "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
43689
- "g",
43690
43672
  ["ipv6", "ip", "network", "server"],
43691
43673
  0.93,
43692
43674
  "CONTACT" /* CONTACT */,
@@ -43695,7 +43677,6 @@ var init_registry = __esm({
43695
43677
  [
43696
43678
  "HW_MAC_ADDR",
43697
43679
  "\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
43698
- "g",
43699
43680
  ["mac", "hardware", "network", "device"],
43700
43681
  0.91,
43701
43682
  "CONTACT" /* CONTACT */,
@@ -43705,7 +43686,6 @@ var init_registry = __esm({
43705
43686
  [
43706
43687
  "BIRTH_DATE",
43707
43688
  "\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
43708
- "g",
43709
43689
  ["birth", "dob", "born", "birthday", "date of birth"],
43710
43690
  0.88,
43711
43691
  "PERSONAL" /* PERSONAL */,
@@ -43714,16 +43694,14 @@ var init_registry = __esm({
43714
43694
  [
43715
43695
  "US_DRIVERS_LIC",
43716
43696
  "\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
43717
- "g",
43718
43697
  ["driver", "license", "licence", "dl", "dmv"],
43719
- 0.85,
43698
+ 0.55,
43720
43699
  "PERSONAL" /* PERSONAL */,
43721
43700
  null
43722
43701
  ],
43723
43702
  [
43724
43703
  "US_PASSPORT_NUM",
43725
43704
  "\\b[A-Z]\\d{8}\\b",
43726
- "g",
43727
43705
  ["passport", "travel", "visa", "immigration"],
43728
43706
  0.87,
43729
43707
  "PERSONAL" /* PERSONAL */,
@@ -43733,7 +43711,6 @@ var init_registry = __esm({
43733
43711
  [
43734
43712
  "VEHICLE_VIN",
43735
43713
  "\\b[A-HJ-NPR-Z0-9]{17}\\b",
43736
- "g",
43737
43714
  ["vin", "vehicle", "chassis", "automobile"],
43738
43715
  0.92,
43739
43716
  "VEHICLE" /* VEHICLE */,
@@ -43742,7 +43719,6 @@ var init_registry = __esm({
43742
43719
  [
43743
43720
  "VEHICLE_PLATE",
43744
43721
  "\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
43745
- "g",
43746
43722
  ["plate", "registration", "vehicle", "plaka"],
43747
43723
  0.45,
43748
43724
  "VEHICLE" /* VEHICLE */,
@@ -43752,7 +43728,6 @@ var init_registry = __esm({
43752
43728
  [
43753
43729
  "MED_RECORD_ID",
43754
43730
  "\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
43755
- "g",
43756
43731
  ["patient", "medical", "record", "mrn", "hospital"],
43757
43732
  0.96,
43758
43733
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43761,7 +43736,6 @@ var init_registry = __esm({
43761
43736
  [
43762
43737
  "US_MEDICARE_ID",
43763
43738
  "\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
43764
- "g",
43765
43739
  ["medicare", "cms", "beneficiary", "health insurance"],
43766
43740
  0.91,
43767
43741
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43770,7 +43744,6 @@ var init_registry = __esm({
43770
43744
  [
43771
43745
  "US_DEA_NUM",
43772
43746
  "\\b[A-Z]{2}\\d{7}\\b",
43773
- "g",
43774
43747
  ["dea", "prescriber", "drug", "enforcement"],
43775
43748
  0.89,
43776
43749
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43779,7 +43752,6 @@ var init_registry = __esm({
43779
43752
  [
43780
43753
  "US_NPI_NUM",
43781
43754
  "\\b\\d{10}\\b",
43782
- "g",
43783
43755
  ["npi", "provider", "national provider", "healthcare"],
43784
43756
  0.87,
43785
43757
  "HEALTHCARE" /* HEALTHCARE */,
@@ -43789,7 +43761,6 @@ var init_registry = __esm({
43789
43761
  [
43790
43762
  "US_EIN_TAX",
43791
43763
  "\\b\\d{2}-\\d{7}\\b",
43792
- "g",
43793
43764
  ["ein", "federal", "employer", "tax id"],
43794
43765
  0.89,
43795
43766
  "IDENTITY_US" /* IDENTITY_US */,
@@ -43799,71 +43770,33 @@ var init_registry = __esm({
43799
43770
  [
43800
43771
  "UK_NATL_INS",
43801
43772
  "\\b[A-Z]{2}\\d{6}[A-Z]\\b",
43802
- "g",
43803
43773
  ["nino", "national insurance", "ni number", "uk"],
43804
43774
  0.9,
43805
43775
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43806
- null
43776
+ "uk_nino"
43807
43777
  ],
43808
43778
  [
43809
43779
  "CA_SOCIAL_INS",
43810
43780
  "\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b",
43811
- "g",
43812
43781
  ["sin", "social insurance", "canada", "canadian"],
43813
43782
  0.89,
43814
43783
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43815
- null
43784
+ "ca_sin"
43816
43785
  ],
43817
43786
  [
43818
- "FR_INSEE_NUM",
43819
- "\\b[12]\\d{2}[01]\\d\\d{8}\\d{2}\\b",
43820
- "g",
43821
- ["insee", "s\xE9curit\xE9 sociale", "france", "num\xE9ro"],
43822
- 0.88,
43823
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43824
- null
43825
- ],
43826
- [
43827
- "DE_STEUER_ID",
43828
- "\\b\\d{2}\\s?\\d{3}\\s?\\d{3}\\s?\\d{3}\\b",
43829
- "g",
43830
- ["steuer", "steuernummer", "finanzamt", "deutschland"],
43831
- 0.87,
43832
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43833
- null
43834
- ],
43835
- [
43836
- "TR_TCID",
43837
- "\\b[1-9]\\d{9}[02468]\\b",
43838
- "g",
43839
- ["tc", "kimlik", "vatanda\u015Fl\u0131k", "n\xFCfus", "t\xFCrkiye"],
43840
- 0.92,
43841
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43842
- "tcid"
43843
- ],
43844
- [
43845
- "SA_NATIONAL_ID",
43846
- "\\b1\\d{9}\\b",
43847
- "g",
43848
- ["\u0647\u0648\u064A\u0629", "\u0631\u0642\u0645 \u0627\u0644\u0647\u0648\u064A\u0629", "saudi", "\u0648\u0637\u0646\u064A\u0629", "identity"],
43849
- 0.91,
43850
- "IDENTITY_INTL" /* IDENTITY_INTL */,
43851
- "saudi_nid"
43852
- ],
43853
- [
43854
- "UAE_EMIRATES_ID",
43855
- "\\b784-\\d{4}-\\d{7}-\\d\\b",
43856
- "g",
43857
- ["emirates", "\u0647\u0648\u064A\u0629", "uae", "emirati", "identity"],
43858
- 0.93,
43787
+ "ES_DNI",
43788
+ "(?:\\d{8}[A-Z]|[XYZ]\\d{7}[A-Z])",
43789
+ ["dni", "nie", "identidad", "nif", "spain"],
43790
+ 0.94,
43859
43791
  "IDENTITY_INTL" /* IDENTITY_INTL */,
43860
- "luhn"
43792
+ "es_id",
43793
+ true,
43794
+ ["*", "es"]
43861
43795
  ],
43862
43796
  // ── CORPORATE ──────────────────────────────────────────────────────
43863
43797
  [
43864
43798
  "CORP_EMPLOYEE_ID",
43865
- "\\b(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}\\b",
43866
- "gi",
43799
+ "(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}",
43867
43800
  ["employee", "staff", "personnel", "worker"],
43868
43801
  0.55,
43869
43802
  "CORPORATE" /* CORPORATE */,
@@ -43873,7 +43806,11 @@ var init_registry = __esm({
43873
43806
  exports.DLPPatternRegistry = class {
43874
43807
  constructor(loadGroups) {
43875
43808
  this.catalogue = /* @__PURE__ */ new Map();
43809
+ this.localeCategoryRegexMap = /* @__PURE__ */ new Map();
43876
43810
  this.buildCatalogue(loadGroups ?? null);
43811
+ for (const loc of ["*", "en", "es"]) {
43812
+ this.compileForLocale(loc);
43813
+ }
43877
43814
  }
43878
43815
  get typeNames() {
43879
43816
  return [...this.catalogue.keys()];
@@ -43885,23 +43822,74 @@ var init_registry = __esm({
43885
43822
  descriptorFor(typeName) {
43886
43823
  return this.catalogue.get(typeName);
43887
43824
  }
43888
- /** Return locale-tuned name regexes, falling back to English. */
43889
43825
  namePatternsFor(lang) {
43890
43826
  return LOCALE_NAME_RULES[lang] ?? LOCALE_NAME_RULES["en"];
43891
43827
  }
43892
- /** Return locale-tuned address regexes, falling back to English. */
43893
43828
  addressPatternsFor(lang) {
43894
43829
  return LOCALE_ADDRESS_RULES[lang] ?? LOCALE_ADDRESS_RULES["en"];
43895
43830
  }
43831
+ getCategoryRegexesMap(locale = "en") {
43832
+ if (!this.localeCategoryRegexMap.has(locale)) {
43833
+ this.compileForLocale(locale);
43834
+ }
43835
+ return this.localeCategoryRegexMap.get(locale);
43836
+ }
43837
+ getCategoryTypeMap(categoryName, locale = "en") {
43838
+ return this.localeCategoryRegexMap.get(locale)?.get(categoryName)?.typeOrder ?? [];
43839
+ }
43840
+ compileForLocale(locale) {
43841
+ const localePool = /* @__PURE__ */ new Map();
43842
+ for (const [typeName, desc] of this.catalogue.entries()) {
43843
+ if (desc.supportedLocales.includes("*") || desc.supportedLocales.includes(locale)) {
43844
+ const catKey = desc.category;
43845
+ if (!localePool.has(catKey)) localePool.set(catKey, []);
43846
+ localePool.get(catKey).push([typeName, desc]);
43847
+ }
43848
+ }
43849
+ const categoryMap = /* @__PURE__ */ new Map();
43850
+ for (const [catKey, entries] of localePool.entries()) {
43851
+ entries.sort(([, a6], [, b6]) => {
43852
+ const aVal = a6.validatorTag ? 0 : 1;
43853
+ const bVal = b6.validatorTag ? 0 : 1;
43854
+ if (aVal !== bVal) return aVal - bVal;
43855
+ return b6.compiledRe.source.length - a6.compiledRe.source.length;
43856
+ });
43857
+ const parts = [];
43858
+ const typeOrder = [];
43859
+ for (const [typeName, desc] of entries) {
43860
+ parts.push(`(?<${typeName}>${desc.compiledRe.source})`);
43861
+ typeOrder.push(typeName);
43862
+ }
43863
+ const combinedSource = parts.join("|");
43864
+ const needsI = entries.some(([, d6]) => d6.compiledRe.flags.includes("i"));
43865
+ const flags = needsI ? "gi" : "g";
43866
+ try {
43867
+ const re = new RegExp(combinedSource, flags);
43868
+ categoryMap.set(catKey, { re, typeOrder });
43869
+ } catch (err2) {
43870
+ console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err2);
43871
+ }
43872
+ }
43873
+ this.localeCategoryRegexMap.set(locale, categoryMap);
43874
+ }
43896
43875
  buildCatalogue(restrict) {
43897
- for (const [typeName, regexStr, flags, terms, risk, cat, vtag] of RAW_PATTERNS) {
43876
+ for (const entry of RAW_PATTERNS) {
43877
+ const [typeName, regexSource, terms, risk, cat, vtag, isHighEntropy, supportedLocales] = entry;
43898
43878
  if (restrict !== null && !restrict.has(cat)) continue;
43879
+ let re;
43880
+ if (regexSource instanceof RegExp) {
43881
+ re = regexSource;
43882
+ } else {
43883
+ re = new RegExp(regexSource, "g");
43884
+ }
43899
43885
  this.catalogue.set(typeName, {
43900
- compiledRe: new RegExp(regexStr, flags),
43886
+ compiledRe: re,
43901
43887
  proximityTerms: new Set(terms),
43902
43888
  baseRisk: risk,
43903
43889
  category: cat,
43904
- validatorTag: vtag
43890
+ validatorTag: vtag,
43891
+ isHighEntropy: isHighEntropy ?? vtag !== null,
43892
+ supportedLocales: supportedLocales ?? ["*"]
43905
43893
  });
43906
43894
  }
43907
43895
  }
@@ -43998,29 +43986,13 @@ function checkIpv4Octets(raw) {
43998
43986
  }
43999
43987
  return true;
44000
43988
  }
44001
- function checkTcidNumber(raw) {
44002
- const digitsStr = raw.replace(/\D/g, "");
44003
- if (digitsStr.length !== 11) return false;
44004
- const d6 = digitsStr.split("").map(Number);
44005
- if (d6[0] === 0) return false;
44006
- if (d6[10] % 2 !== 0) return false;
44007
- const oddSum = d6[0] + d6[2] + d6[4] + d6[6] + d6[8];
44008
- const evenSum = d6[1] + d6[3] + d6[5] + d6[7];
44009
- const computedD10 = ((oddSum * 7 - evenSum) % 10 + 10) % 10;
44010
- if (computedD10 !== d6[9]) return false;
44011
- const firstTenSum = d6.slice(0, 10).reduce((a6, b6) => a6 + b6, 0);
44012
- if (firstTenSum % 10 !== d6[10]) return false;
44013
- return true;
44014
- }
44015
- function checkSaudiNid(raw) {
44016
- const digitsStr = raw.replace(/\D/g, "");
44017
- if (digitsStr.length !== 10) return false;
44018
- const d6 = digitsStr.split("").map(Number);
44019
- if (d6[0] !== 1) return false;
43989
+ function checkCaSin(raw) {
43990
+ const digits = raw.replace(/\D/g, "");
43991
+ if (digits.length !== 9) return false;
44020
43992
  let total = 0;
44021
- for (let idx = 0; idx < 10; idx++) {
44022
- let val = d6[idx];
44023
- if (idx % 2 === 0) {
43993
+ for (let idx = 0; idx < digits.length; idx++) {
43994
+ let val = parseInt(digits[idx], 10);
43995
+ if (idx % 2 === 1) {
44024
43996
  val *= 2;
44025
43997
  if (val > 9) val -= 9;
44026
43998
  }
@@ -44028,7 +44000,30 @@ function checkSaudiNid(raw) {
44028
44000
  }
44029
44001
  return total % 10 === 0;
44030
44002
  }
44031
- var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, VALIDATOR_DISPATCH; exports.DLPValidationEngine = void 0;
44003
+ function checkUkNino(raw) {
44004
+ const cleaned = raw.replace(/ /g, "").toUpperCase();
44005
+ if (cleaned.length !== 9) return false;
44006
+ return UK_NINO_REGEX.test(cleaned);
44007
+ }
44008
+ function checkEsId(raw) {
44009
+ const cleaned = raw.replace(/[\s-]/g, "").toUpperCase();
44010
+ if (cleaned.length !== 9) return false;
44011
+ const mapping = { X: "0", Y: "1", Z: "2" };
44012
+ const firstChar = cleaned[0];
44013
+ let numStr;
44014
+ if (firstChar in mapping) {
44015
+ numStr = mapping[firstChar] + cleaned.slice(1, 8);
44016
+ } else if (/^\d$/.test(firstChar)) {
44017
+ numStr = cleaned.slice(0, 8);
44018
+ } else {
44019
+ return false;
44020
+ }
44021
+ if (!/^\d+$/.test(numStr)) return false;
44022
+ const num = parseInt(numStr, 10);
44023
+ const validLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
44024
+ return cleaned[8] === validLetters[num % 23];
44025
+ }
44026
+ var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, UK_NINO_REGEX, VALIDATOR_DISPATCH; exports.DLPValidationEngine = void 0;
44032
44027
  var init_handlers = __esm({
44033
44028
  "src/core/dlp/handlers.ts"() {
44034
44029
  IBAN_COUNTRY_LENGTHS = {
@@ -44131,6 +44126,7 @@ var init_handlers = __esm({
44131
44126
  Z: 9
44132
44127
  };
44133
44128
  VIN_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2];
44129
+ UK_NINO_REGEX = /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z]{2}[0-9]{6}[A-D]$/;
44134
44130
  VALIDATOR_DISPATCH = {
44135
44131
  luhn: checkLuhn,
44136
44132
  ssn_area: checkSsnArea,
@@ -44139,8 +44135,9 @@ var init_handlers = __esm({
44139
44135
  vin_format: checkVinFormat,
44140
44136
  btc_format: checkBtcFormat,
44141
44137
  ipv4: checkIpv4Octets,
44142
- tcid: checkTcidNumber,
44143
- saudi_nid: checkSaudiNid
44138
+ ca_sin: checkCaSin,
44139
+ uk_nino: checkUkNino,
44140
+ es_id: checkEsId
44144
44141
  };
44145
44142
  exports.DLPValidationEngine = class {
44146
44143
  /**
@@ -44177,7 +44174,8 @@ var init_scorer = __esm({
44177
44174
  keywordBoost: 0.1,
44178
44175
  validatorOverride: 0.99,
44179
44176
  maxConfidence: 0.99,
44180
- penaltyFactor: 0.65
44177
+ penaltyFactor: 0.99
44178
+ // Renamed functionally to validator failure penalty subtraction
44181
44179
  };
44182
44180
  exports.DLPConfidenceScorer = class {
44183
44181
  constructor(overrides = {}) {
@@ -44196,7 +44194,7 @@ var init_scorer = __esm({
44196
44194
  score(input) {
44197
44195
  if (input.validatorPassed === true) return this.valOverride;
44198
44196
  if (input.validatorPassed === false) {
44199
- return Math.min(this.ceil, input.baseRisk * this.penalty);
44197
+ return Math.max(0, input.baseRisk - this.penalty);
44200
44198
  }
44201
44199
  const windowLo = Math.max(0, input.matchStart - this.window);
44202
44200
  const windowHi = Math.min(input.fullText.length, input.matchEnd + this.window);
@@ -44222,6 +44220,47 @@ var init_scorer = __esm({
44222
44220
  }
44223
44221
  });
44224
44222
 
44223
+ // src/core/span.ts
44224
/**
 * Reduce a list of candidate spans to a set of non-overlapping spans.
 *
 * Candidates are ordered by `start` ascending, then by length descending,
 * then by `confidence` descending, and swept left to right:
 *   - a span that starts at or after the end of the last accepted span is accepted;
 *   - a span fully inside the last accepted span is set aside (the longer span wins);
 *   - a span that partially overlaps the last accepted span replaces it only
 *     when its `confidence` is strictly higher, otherwise it is dropped.
 *
 * When a replacement happens, spans that had been set aside only because they
 * were nested in the evicted span are re-admitted if they end at or before the
 * replacement's start. Without this, evicting a span would also discard
 * detections that do not touch the replacement at all.
 *
 * The input array is not modified.
 *
 * @param {Array<{start: number, end: number, confidence: number}>} spans
 *   Candidate spans; `end` is exclusive.
 * @returns {Array<object>} A new array of the surviving span objects in
 *   descending `start` order (right-to-left), so replacements can be spliced
 *   into the text without invalidating the offsets of spans still to be processed.
 */
function resolveOverlaps(spans) {
  if (spans.length === 0) return [];

  const sorted = [...spans].sort((a, b) => {
    if (a.start !== b.start) return a.start - b.start;
    const lenDiff = (b.end - b.start) - (a.end - a.start);
    if (lenDiff !== 0) return lenDiff;
    return b.confidence - a.confidence;
  });

  const resolved = [];
  // Spans set aside because they lie entirely inside the last accepted span.
  let nested = [];
  let occupiedEnd = -1;

  for (const span of sorted) {
    if (span.start >= occupiedEnd) {
      resolved.push(span);
      occupiedEnd = span.end;
      nested = [];
      continue;
    }
    if (span.end <= occupiedEnd) {
      nested.push(span);
      continue;
    }

    // Partial overlap with the last accepted span: the higher confidence wins.
    const last = resolved[resolved.length - 1];
    if (span.confidence <= last.confidence) continue;

    resolved.pop();
    // Re-admit detections that were hidden by the evicted span and that end
    // before the replacement begins; `cursor` keeps them mutually disjoint.
    let cursor = resolved.length > 0 ? resolved[resolved.length - 1].end : -1;
    for (const inner of nested) {
      if (inner.start >= cursor && inner.end <= span.start) {
        resolved.push(inner);
        cursor = inner.end;
      }
    }
    resolved.push(span);
    occupiedEnd = span.end;
    nested = [];
  }

  return resolved.sort((a, b) => b.start - a.start);
}
44251
/**
 * Build the masked text by substituting each span's `maskedValue` for the
 * characters `[start, end)` of `text`.
 *
 * Offsets always refer to the ORIGINAL `text`. The output is assembled in a
 * single left-to-right pass over a sorted copy of the spans, so the result
 * does not depend on the order in which the spans are supplied and the input
 * array is not modified.
 *
 * Spans whose `maskedValue` is `null` or `undefined` are ignored, leaving
 * their original text in place. Spans are expected to be non-overlapping; if
 * they do overlap, every token is still emitted and the union of the ranges
 * is removed, so no original characters from a masked range are copied out.
 *
 * @param {string} text - Original, unmasked text.
 * @param {Array<{start: number, end: number, maskedValue?: *}>} resolvedSpans
 *   Spans to apply; `end` is exclusive.
 * @returns {string} The text with every masked span replaced.
 */
function reconstruct(text, resolvedSpans) {
  // `filter` returns a fresh array, so sorting it leaves the caller's array alone.
  const ordered = resolvedSpans
    .filter((span) => span.maskedValue != null)
    .sort((a, b) => a.start - b.start);

  const pieces = [];
  let cursor = 0;
  for (const span of ordered) {
    if (span.start > cursor) {
      pieces.push(text.slice(cursor, span.start));
    }
    pieces.push(span.maskedValue);
    cursor = Math.max(cursor, span.end);
  }
  pieces.push(text.slice(cursor));
  return pieces.join("");
}
44259
+ var init_span = __esm({
44260
+ "src/core/span.ts"() {
44261
+ }
44262
+ });
44263
+
44225
44264
  // node_modules/delayed-stream/lib/delayed_stream.js
44226
44265
  var require_delayed_stream = __commonJS({
44227
44266
  "node_modules/delayed-stream/lib/delayed_stream.js"(exports2, module) {
@@ -58892,7 +58931,7 @@ var init_transformers_scanner = __esm({
58892
58931
  confidence = Math.min(1, confidence + 0.2);
58893
58932
  }
58894
58933
  if (confidence >= confidenceThreshold && !looksLikeToken(val) && val.length > 1) {
58895
- const token = await encodeFn(val);
58934
+ const token = await encodeFn(val, { entityType });
58896
58935
  entities.push({
58897
58936
  type: entityType,
58898
58937
  value: val,
@@ -58999,17 +59038,18 @@ var init_scanner = __esm({
58999
59038
  init_registry();
59000
59039
  init_handlers();
59001
59040
  init_scorer();
59041
+ init_span();
59002
59042
  _dlpLanguageResolver = new exports.LanguageContextResolver();
59003
59043
  _dlpPatternRegistry = new exports.DLPPatternRegistry();
59004
59044
  _dlpValidationEngine = new exports.DLPValidationEngine();
59005
59045
  _dlpConfidenceScorer = new exports.DLPConfidenceScorer();
59006
59046
  REGEX_PATTERNS = {
59007
59047
  "EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
59008
- "PHONE_NUMBER": /\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4}/g,
59009
- "PHONE_NUMBER_INTL": /\+(?:44|33|49)[\s\-.]?\(?\d{1,5}\)?(?:[\s\-.]?\d{2,4}){2,4}/g,
59010
- "US_SSN": /\d{3}-\d{2}-\d{4}/g,
59011
- "CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
59012
- "US_ROUTING_NUMBER": /\b\d{9}\b/g,
59048
+ "PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
59049
+ "PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
59050
+ "US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
59051
+ "CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
59052
+ "US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
59013
59053
  "US_PASSPORT": /\b[A-Z]\d{8}\b/g,
59014
59054
  "DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g
59015
59055
  };
@@ -59064,26 +59104,55 @@ var init_scanner = __esm({
59064
59104
  const checksum = 3 * (d6[0] + d6[3] + d6[6]) + 7 * (d6[1] + d6[4] + d6[7]) + (d6[2] + d6[5] + d6[8]);
59065
59105
  return checksum % 10 === 0;
59066
59106
  }
59067
- async _tier0Dlp(text, encodeFn, confidenceThreshold) {
59107
+ async _tier0CollectSpans(text, confidenceThreshold) {
59068
59108
  const detectedLanguage = _dlpLanguageResolver.resolve(text);
59069
- const rawHits = [];
59070
- for (const [typeTag, descriptor] of _dlpPatternRegistry.iterDescriptors()) {
59071
- const re = new RegExp(descriptor.compiledRe.source, descriptor.compiledRe.flags);
59109
+ const spans = [];
59110
+ const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
59111
+ for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
59112
+ const megaRe = new RegExp(re.source, re.flags);
59072
59113
  let m6;
59073
- while ((m6 = re.exec(text)) !== null) {
59114
+ while ((m6 = megaRe.exec(text)) !== null) {
59115
+ const groups = m6.groups ?? {};
59116
+ let typeTag;
59117
+ for (const name of typeOrder) {
59118
+ if (groups[name] !== void 0) {
59119
+ typeTag = name;
59120
+ break;
59121
+ }
59122
+ }
59123
+ if (!typeTag) continue;
59074
59124
  const matchedStr = m6[0];
59075
59125
  if (looksLikeToken(matchedStr)) continue;
59126
+ const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
59127
+ if (!descriptor) continue;
59076
59128
  const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
59077
- const conf = _dlpConfidenceScorer.score({
59078
- baseRisk: descriptor.baseRisk,
59079
- matchStart: m6.index,
59080
- matchEnd: m6.index + matchedStr.length,
59081
- fullText: text,
59082
- proximityTerms: descriptor.proximityTerms,
59083
- validatorPassed: validatorResult
59084
- });
59129
+ let conf;
59130
+ if (validatorResult === false) {
59131
+ if (descriptor.isHighEntropy) {
59132
+ conf = 0.85;
59133
+ } else {
59134
+ continue;
59135
+ }
59136
+ } else {
59137
+ conf = _dlpConfidenceScorer.score({
59138
+ baseRisk: descriptor.baseRisk,
59139
+ matchStart: m6.index,
59140
+ matchEnd: m6.index + matchedStr.length,
59141
+ fullText: text,
59142
+ proximityTerms: descriptor.proximityTerms,
59143
+ validatorPassed: validatorResult
59144
+ });
59145
+ }
59085
59146
  if (conf >= confidenceThreshold) {
59086
- rawHits.push({ start: m6.index, end: m6.index + matchedStr.length, tag: typeTag, val: matchedStr, conf });
59147
+ spans.push({
59148
+ start: m6.index,
59149
+ end: m6.index + matchedStr.length,
59150
+ entityType: typeTag,
59151
+ originalValue: matchedStr,
59152
+ confidence: conf,
59153
+ method: "dlp_heuristic",
59154
+ language: detectedLanguage
59155
+ });
59087
59156
  }
59088
59157
  }
59089
59158
  }
@@ -59102,7 +59171,15 @@ var init_scanner = __esm({
59102
59171
  validatorPassed: null
59103
59172
  });
59104
59173
  if (conf >= confidenceThreshold) {
59105
- rawHits.push({ start: m6.index, end: m6.index + m6[0].length, tag: "PERSON_NAME", val: m6[0], conf });
59174
+ spans.push({
59175
+ start: m6.index,
59176
+ end: m6.index + m6[0].length,
59177
+ entityType: "PERSON_NAME",
59178
+ originalValue: m6[0],
59179
+ confidence: conf,
59180
+ method: "dlp_heuristic",
59181
+ language: detectedLanguage
59182
+ });
59106
59183
  }
59107
59184
  }
59108
59185
  }
@@ -59111,85 +59188,78 @@ var init_scanner = __esm({
59111
59188
  let m6;
59112
59189
  while ((m6 = re.exec(text)) !== null) {
59113
59190
  if (looksLikeToken(m6[0])) continue;
59114
- rawHits.push({ start: m6.index, end: m6.index + m6[0].length, tag: "PHYS_ADDRESS", val: m6[0], conf: 0.55 });
59115
- }
59116
- }
59117
- rawHits.sort((a6, b6) => a6.start - b6.start || b6.end - b6.start - (a6.end - a6.start) || b6.conf - a6.conf);
59118
- const deduped = [];
59119
- let occupiedEnd = -1;
59120
- for (const hit of rawHits) {
59121
- if (hit.start >= occupiedEnd) {
59122
- deduped.push(hit);
59123
- occupiedEnd = hit.end;
59191
+ spans.push({
59192
+ start: m6.index,
59193
+ end: m6.index + m6[0].length,
59194
+ entityType: "PHYS_ADDRESS",
59195
+ originalValue: m6[0],
59196
+ confidence: 0.55,
59197
+ method: "dlp_heuristic",
59198
+ language: detectedLanguage
59199
+ });
59124
59200
  }
59125
59201
  }
59202
+ return spans;
59203
+ }
59204
+ /** Backward-compat wrapper — collects spans then single-pass encodes. */
59205
+ async _tier0Dlp(text, encodeFn, confidenceThreshold) {
59206
+ const spans = await this._tier0CollectSpans(text, confidenceThreshold);
59207
+ const resolved = resolveOverlaps(spans);
59126
59208
  const entities = [];
59127
- let excised = text;
59128
- for (const hit of [...deduped].reverse()) {
59129
- const token = await encodeFn(hit.val);
59130
- excised = excised.slice(0, hit.start) + token + excised.slice(hit.end);
59209
+ await Promise.all(resolved.map(async (span) => {
59210
+ span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
59131
59211
  entities.push({
59132
- type: hit.tag,
59133
- value: hit.val,
59134
- method: "dlp_heuristic",
59135
- confidence: hit.conf,
59136
- masked_value: token,
59137
- language: detectedLanguage
59212
+ type: span.entityType,
59213
+ value: span.originalValue,
59214
+ method: span.method,
59215
+ confidence: span.confidence,
59216
+ masked_value: span.maskedValue,
59217
+ language: span.language
59138
59218
  });
59139
- }
59140
- return [excised, entities];
59219
+ }));
59220
+ return [reconstruct(text, resolved), entities];
59141
59221
  }
59142
- async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59143
- let entities = [];
59144
- let excised = text;
59145
- let allMatches = [];
59222
+ async _tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold) {
59223
+ const spans = [];
59146
59224
  for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
59147
59225
  const re = new RegExp(pattern.source, pattern.flags);
59148
59226
  let match;
59149
59227
  while ((match = re.exec(text)) !== null) {
59150
- let confidence = 0.95;
59151
- if (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " "))) {
59152
- confidence = 1;
59153
- }
59154
- if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(match[0])) {
59155
- confidence = Math.max(confidence, 0.99);
59156
- }
59157
- if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(match[0])) {
59158
- continue;
59228
+ const val = match[0];
59229
+ if (looksLikeToken(val)) continue;
59230
+ let confidence = aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " ")) ? 1 : 0.95;
59231
+ if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
59232
+ if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(val)) continue;
59233
+ if (confidence >= confidenceThreshold) {
59234
+ spans.push({
59235
+ start: match.index,
59236
+ end: match.index + val.length,
59237
+ entityType,
59238
+ originalValue: val,
59239
+ confidence,
59240
+ method: "regex"
59241
+ });
59159
59242
  }
59160
- allMatches.push({
59161
- start: match.index,
59162
- end: match.index + match[0].length,
59163
- type: entityType,
59164
- value: match[0],
59165
- confidence
59166
- });
59167
- }
59168
- }
59169
- allMatches.sort((a6, b6) => a6.start - b6.start || b6.end - b6.start - (a6.end - a6.start));
59170
- let filtered = [];
59171
- let lastEnd = -1;
59172
- for (const m6 of allMatches) {
59173
- if (m6.start >= lastEnd) {
59174
- filtered.push(m6);
59175
- lastEnd = m6.end;
59176
59243
  }
59177
59244
  }
59178
- const sortedFiltered = [...filtered].sort((a6, b6) => b6.start - a6.start);
59179
- for (const m6 of sortedFiltered) {
59180
- if (m6.confidence >= confidenceThreshold && !looksLikeToken(m6.value)) {
59181
- const token = await encodeFn(m6.value);
59182
- excised = excised.slice(0, m6.start) + token + excised.slice(m6.end);
59183
- entities.push({
59184
- type: m6.type,
59185
- value: m6.value,
59186
- method: "regex",
59187
- confidence: m6.confidence,
59188
- masked_value: token
59189
- });
59190
- }
59191
- }
59192
- return [excised, entities];
59245
+ return spans;
59246
+ }
59247
+ /** Backward-compat wrapper. */
59248
+ async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59249
+ const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
59250
+ const resolved = resolveOverlaps(spans);
59251
+ const entities = [];
59252
+ await Promise.all(resolved.map(async (span) => {
59253
+ span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
59254
+ entities.push({
59255
+ type: span.entityType,
59256
+ value: span.originalValue,
59257
+ method: span.method,
59258
+ confidence: span.confidence,
59259
+ masked_value: span.maskedValue
59260
+ });
59261
+ }));
59262
+ return [reconstruct(text, resolved), entities];
59193
59263
  }
59194
59264
  async _tier2Nlp(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
59195
59265
  return [text, []];
@@ -59209,13 +59279,18 @@ var init_scanner = __esm({
59209
59279
  const _encode = options.encodeFn || encode;
59210
59280
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
59211
59281
  const boost = this._resolveBoost(options.context);
59212
- let currentText = text;
59282
+ const allSpans = [];
59213
59283
  if (pipeline.includes("dlp")) {
59214
- [currentText] = await this._tier0Dlp(currentText, _encode, confidenceThreshold);
59284
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59215
59285
  }
59216
59286
  if (pipeline.includes("regex") || pipeline.includes("checksum")) {
59217
- [currentText] = await this._tier1Regex(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
59287
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
59218
59288
  }
59289
+ const resolved = resolveOverlaps(allSpans);
59290
+ await Promise.all(resolved.map(async (span) => {
59291
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59292
+ }));
59293
+ let currentText = reconstruct(text, resolved);
59219
59294
  if (pipeline.includes("nlp")) {
59220
59295
  [currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
59221
59296
  }
@@ -59227,20 +59302,29 @@ var init_scanner = __esm({
59227
59302
  const _encode = options.encodeFn || encode;
59228
59303
  const confidenceThreshold = options.confidenceThreshold ?? 0.7;
59229
59304
  const boost = this._resolveBoost(options.context);
59230
- let allEntities = [];
59231
- let remaining = text;
59305
+ const allEntities = [];
59306
+ const allSpans = [];
59232
59307
  if (pipeline.includes("dlp")) {
59233
- const [newText, tier0] = await this._tier0Dlp(remaining, _encode, confidenceThreshold);
59234
- remaining = newText;
59235
- allEntities.push(...tier0);
59308
+ allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59236
59309
  }
59237
59310
  if (pipeline.includes("regex") || pipeline.includes("checksum")) {
59238
- const [newText, tier1] = await this._tier1Regex(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59239
- remaining = newText;
59240
- allEntities.push(...tier1);
59241
- }
59311
+ allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
59312
+ }
59313
+ const resolved = resolveOverlaps(allSpans);
59314
+ await Promise.all(resolved.map(async (span) => {
59315
+ span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59316
+ allEntities.push({
59317
+ type: span.entityType,
59318
+ value: span.originalValue,
59319
+ method: span.method,
59320
+ confidence: span.confidence,
59321
+ masked_value: span.maskedValue,
59322
+ language: span.language
59323
+ });
59324
+ }));
59325
+ const remaining = reconstruct(text, resolved);
59242
59326
  if (pipeline.includes("nlp")) {
59243
- const [_newText, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59327
+ const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
59244
59328
  allEntities.push(...tier2);
59245
59329
  }
59246
59330
  return allEntities;