npm - mask-privacy - Versions diffs - 3.0.0 → 3.2.0 - Mend

mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

package/README.md +5 -17
package/dist/index.d.mts +58 -27
package/dist/index.d.ts +58 -27
package/dist/index.js +394 -310
package/dist/index.js.map +1 -1
package/dist/index.mjs +394 -310
package/dist/index.mjs.map +1 -1
package/package.json +1 -1
package/src/core/dlp/assessor.ts +3 -26
package/src/core/dlp/handlers.ts +44 -31
package/src/core/dlp/index.ts +0 -2
package/src/core/dlp/registry.ts +119 -107
package/src/core/dlp/scorer.ts +4 -4
package/src/core/fpe.ts +85 -32
package/src/core/fpe_utils.ts +20 -20
package/src/core/scanner.ts +146 -151
package/src/core/span.ts +76 -0
package/src/core/transformers_scanner.ts +2 -2
package/src/core/vault.ts +2 -1
package/tests/async.test.ts +2 -2
package/tests/dlp_hardened.test.ts +21 -0
package/tests/fpe.test.ts +4 -4
package/tests/hooks.test.ts +2 -2
package/tests/langchain.test.ts +2 -2
package/tests/llamaindex.test.ts +1 -1
package/tests/scanner.test.ts +0 -1
package/tests/substring.test.ts +1 -1
package/tests/vault.test.ts +1 -1

package/src/core/fpe.ts CHANGED

@@ -10,7 +10,6 @@ import * as crypto from 'crypto';
 import { config } from '../config';
 import { getKeyProvider } from './key_provider';
 import { MaskSecurityError } from './exceptions';
-import { looksLikeToken } from './fpe_utils';
 // Master key management
@@ -52,13 +51,12 @@ export function resetMasterKey(): void {
 // Detectors — order matters: first match wins
 const _EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
-const _PHONE_RE = /^\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}$|^\d{3}[\s\-.]?\d{4}$/;
+const _PHONE_RE = /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/;
+const _PHONE_INTL_RE = /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/;
 const _SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
 const _CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
 const _ROUTING_RE = /^\d{9}$/;
-const _TCID_RE = /^[1-9]\d{9}[02468]$/;
-const _SAUDI_NID_RE = /^1\d{9}$/;
-const _UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
+const _ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
 const _IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
 // Deterministic helpers (HMAC-based)
@@ -96,54 +94,109 @@ async function _hmacDigits(plaintext: string, n: number, offset: number = 0): Pr
 // Public API
+// Dictionary for Semantic NLP Faker Generation
+const _FIRST_NAMES = ["Taylor", "Jordan", "Casey", "Morgan", "Riley", "Avery", "Rowan", "Quinn", "Charlie", "Peyton", "Blake", "Dakota", "Reese", "Skyler", "Finley", "Eden", "Harley", "Rory", "Emerson", "Remi"];
+const _LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"];
+const _CITIES = ["London", "Paris", "Berlin", "Tokyo", "Rome", "Madrid", "Vienna", "Sydney", "Toronto", "Chicago", "Seattle", "Austin", "Boston", "Denver", "Dallas", "Miami", "Seoul", "Dubai", "Mumbai", "Cairo"];
+/** Return a deterministic item from an array. */
+async function _pickFromArray(plaintext: string, array: string[]): Promise<string> {
+   const digits = await _hmacDigits(plaintext, 8);
+   const num = parseInt(digits, 10);
+   return array[num % array.length];
+}
+/** Compute Luhn check digit */
+function _computeLuhnDigit(partialNum: string): string {
+    const digits = partialNum.split("").map(Number);
+    let sum = 0;
+    let shouldDouble = true;
+    for (let i = digits.length - 1; i >= 0; i--) {
+        let digit = digits[i];
+        if (shouldDouble) {
+            digit *= 2;
+            if (digit > 9) digit -= 9;
+        }
+        sum += digit;
+        shouldDouble = !shouldDouble;
+    }
+    return ((10 - (sum % 10)) % 10).toString();
+}
+function _computeEsIdCheck(num: number): string {
+  return "TRWAGMYFPDXBNJZSQVHLCKE"[num % 23];
+}
+// Public API
 /**
- * Return a **deterministic**, format-preserving token for rawText.
+ * Return a **deterministic**, format-preserving token for rawText using its entityType.
  */
-export async function generateFPEToken(rawText: string): Promise<string> {
+export async function generateFPEToken(rawText: string, entityType: string = 'UNKNOWN'): Promise<string> {
   const text = rawText.trim();
+  let type = (entityType || "UNKNOWN").toUpperCase();
+  if (type === "UNKNOWN") {
+    if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
+    else if (_SSN_RE.test(text)) type = "US_SSN";
+    else if (_CC_RE.test(text)) type = "CREDIT_CARD";
+    else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
+    else if (_ES_ID_RE.test(text)) type = "ES_DNI";
+    else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
+    else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
+  }
-  if (_EMAIL_RE.test(text)) {
-    return `tkn-${await _hmacHex(text)}@email.com`;
+  if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
+    const parts = text.split("@");
+    const domain = parts.length === 2 ? parts[1] : "email.com";
+    return `tkn-${await _hmacHex(text)}@${domain}`;
   }
-  if (_PHONE_RE.test(text)) {
-    return `+1-555-${await _hmacDigits(text, 7)}`;
+  if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
+    const m = text.match(/^\+([1-9]\d{0,3})/);
+    const cc = m ? m[1] : "1";
+    return `+${cc}-555-${await _hmacDigits(text, 7)}`;
   }
-  if (_SSN_RE.test(text)) {
+  if (type === "US_SSN") {
     return `000-00-${await _hmacDigits(text, 4)}`;
   }
-  if (_CC_RE.test(text)) {
-    return `4000-0000-0000-${await _hmacDigits(text, 4)}`;
+  if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
+    const base = `400000000000${await _hmacDigits(text, 3)}`;
+    const checkDig = _computeLuhnDigit(base);
+    const full = base + checkDig;
+    return `${full.slice(0,4)}-${full.slice(4,8)}-${full.slice(8,12)}-${full.slice(12,16)}`;
   }
-  if (_ROUTING_RE.test(text)) {
+  if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
     return `000000${await _hmacDigits(text, 3)}`;
   }
-  // Turkish TC Kimlik No (format: 990000 + XXXX + even digit)
-  if (_TCID_RE.test(text)) {
-    const tail = await _hmacDigits(text, 5);
-    let lastD = parseInt(tail[tail.length - 1], 10);
-    if (lastD % 2 !== 0) lastD = (lastD + 1) % 10;
-    return `990000${tail.slice(0, 4)}${lastD}`;
+  if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
+    const countryCode = (text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2))) ? text.slice(0, 2).toUpperCase() : "US";
+    return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
   }
-  // Saudi National ID (format: 100000XXXX)
-  if (_SAUDI_NID_RE.test(text)) {
-    return `100000${await _hmacDigits(text, 4)}`;
+  if (type === "ES_DNI") {
+    const digits = `000${await _hmacDigits(text, 5)}`;
+    return digits + _computeEsIdCheck(parseInt(digits, 10));
   }
-  // UAE Emirates ID (format: 784-0000-XXXXXXX-X)
-  if (_UAE_EID_RE.test(text)) {
-    return `784-0000-${await _hmacDigits(text, 7)}-${await _hmacDigits(text, 1, 20)}`;
+  if (type === "PERSON" || type === "PERSON_NAME") {
+      const f = await _pickFromArray(text, _FIRST_NAMES);
+      const l = await _pickFromArray(text + "last", _LAST_NAMES);
+      return `<PER:${f}_${l}>`;
   }
-  // IBAN (format: XX00-XXXX... — preserve country code, zero check digits)
-  if (_IBAN_RE.test(text)) {
-    const countryCode = text.slice(0, 2);
-    return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
+  if (type === "LOCATION" || type === "PHYS_ADDRESS") {
+      const c = await _pickFromArray(text, _CITIES);
+      return `<LOC:${c}>`;
+  }
+  if (type === "ORGANIZATION") {
+      const c = await _pickFromArray(text, _LAST_NAMES);
+      return `<ORG:${c}_Inc>`;
   }
   return `[TKN-${await _hmacHex(text)}]`;

package/src/core/fpe_utils.ts CHANGED

@@ -10,15 +10,14 @@
  * Used for sub-string detokenization (finding tokens inside paragraphs).
  */
 export const TOKEN_PATTERN = new RegExp(
-  "tkn-[a-f0-9]{8,64}@email\\.com" +            // Email
-  "|\\+1-555-\\d{7}" +                           // Phone
+  "tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}" +            // Email
+  "|\\+[1-9]\\d{0,3}-555-\\d{7}" +                           // Phone
   "|000-00-\\d{4}" +                            // SSN
   "|4000-0000-0000-\\d{4}" +                    // CC
   "|000000\\d{3}" +                             // Routing
-  "|990000\\d{4}[02468]" +                      // Turkish TCID token
-  "|100000\\d{4}" +                             // Saudi NID token
-  "|784-0000-\\d{7}-\\d" +                       // UAE EID token
+  "|000\\d{5}[A-Z]" +                           // Spanish DNI token
   "|[A-Z]{2}00[A-F0-9]{4,16}" +                // IBAN token
+  "|<(?:PER|LOC|ORG):[^>]+>" +                 // NLP Semantic tokens
   "|\\[TKN-[a-f0-9]{8,64}\\]",                  // Opaque
   "g"
 );
@@ -30,13 +29,16 @@ export function looksLikeToken(value: string | any): boolean {
   if (typeof value !== 'string') return false;
   const v = value.trim();
-  // Email tokens: tkn-<hex>@email.com
-  if (v.startsWith("tkn-") && v.includes("@email.com")) {
-    return true;
+  // Email tokens: tkn-<hex>@domain.com
+  if (v.startsWith("tkn-") && v.includes("@")) {
+    const parts = v.split("@");
+    if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
+      return true;
+    }
   }
-  // Phone tokens: +1-555-XXXXXXX
-  if (v.startsWith("+1-555-") && v.length === 14) {
+  // Phone tokens: +CC-555-XXXXXXX
+  if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v)) {
     return true;
   }
@@ -55,18 +57,10 @@ export function looksLikeToken(value: string | any): boolean {
     return true;
   }
-  // UAE Emirates ID tokens: 784-0000-XXXXXXX-X
-  if (v.startsWith("784-0000-") && v.length === 18) {
-    return true;
-  }
-  // Turkish TCID tokens: 990000XXXX(even)
-  if (v.length === 11 && v.startsWith("990000") && /^\d+$/.test(v) && parseInt(v[v.length - 1], 10) % 2 === 0) {
-    return true;
-  }
-  // Saudi NID tokens: 100000XXXX
-  if (v.length === 10 && v.startsWith("100000") && /^\d+$/.test(v)) {
+  // Spanish ID tokens: 000XXXXX[A-Z]
+  if (v.length === 9 && v.startsWith("000") && /[A-Z]$/.test(v)) {
     return true;
   }
@@ -75,6 +69,11 @@ export function looksLikeToken(value: string | any): boolean {
     return true;
   }
+  // Semantic NLP tokens: <PER:Taylor_Morgan>
+  if (/^<(PER|LOC|ORG):[^>]+>$/.test(v)) {
+    return true;
+  }
   // Opaque fallback tokens: [TKN-<hex>]
   if (v.startsWith("[TKN-") && v.endsWith("]")) {
     return true;
@@ -83,3 +82,4 @@ export function looksLikeToken(value: string | any): boolean {
   return false;
 }

package/src/core/scanner.ts CHANGED

@@ -19,6 +19,7 @@ import { LanguageContextResolver } from './dlp/assessor';
 import { DLPPatternRegistry } from './dlp/registry';
 import { DLPValidationEngine } from './dlp/handlers';
 import { DLPConfidenceScorer } from './dlp/scorer';
+import { Span, resolveOverlaps, reconstruct } from './span';
 // Module-level DLP singletons (created once, reused for all scans)
 const _dlpLanguageResolver = new LanguageContextResolver();
@@ -29,11 +30,11 @@ const _dlpConfidenceScorer = new DLPConfidenceScorer();
 /** Regex patterns for Tier 1 deterministic detection */
 export const REGEX_PATTERNS: Record<string, RegExp> = {
   "EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
-  "PHONE_NUMBER": /\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4}/g,
-  "PHONE_NUMBER_INTL": /\+(?:44|33|49)[\s\-.]?\(?\d{1,5}\)?(?:[\s\-.]?\d{2,4}){2,4}/g,
-  "US_SSN": /\d{3}-\d{2}-\d{4}/g,
-  "CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
-  "US_ROUTING_NUMBER": /\b\d{9}\b/g,
+  "PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
+  "PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
+  "US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
+  "CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
+  "US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
   "US_PASSPORT": /\b[A-Z]\d{8}\b/g,
   "DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g,
 };
@@ -83,177 +84,163 @@ export class BaseScanner {
     return checksum % 10 === 0;
   }
-  protected async _tier0Dlp(
+  protected async _tier0CollectSpans(
     text: string,
-    encodeFn: (val: string) => Promise<string>,
     confidenceThreshold: number,
-  ): Promise<[string, any[]]> {
+  ): Promise<Span[]> {
     const detectedLanguage = _dlpLanguageResolver.resolve(text);
+    const spans: Span[] = [];
+    const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
-    type RawHit = { start: number; end: number; tag: string; val: string; conf: number };
-    const rawHits: RawHit[] = [];
-    // Pass 1: Structured patterns from the registry
-    for (const [typeTag, descriptor] of _dlpPatternRegistry.iterDescriptors()) {
-      const re = new RegExp(descriptor.compiledRe.source, descriptor.compiledRe.flags);
+    // Pass 1: Category Mega-Regexes (O(text) per category bucket)
+    for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
+      const megaRe = new RegExp(re.source, re.flags);
       let m: RegExpExecArray | null;
-      while ((m = re.exec(text)) !== null) {
+      while ((m = megaRe.exec(text)) !== null) {
+        // Identify which named group matched
+        const groups = m.groups ?? {};
+        let typeTag: string | undefined;
+        for (const name of typeOrder) {
+          if (groups[name] !== undefined) { typeTag = name; break; }
+        }
+        if (!typeTag) continue;
         const matchedStr = m[0];
         if (looksLikeToken(matchedStr)) continue;
+        const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
+        if (!descriptor) continue;
         const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
-        const conf = _dlpConfidenceScorer.score({
-          baseRisk: descriptor.baseRisk,
-          matchStart: m.index,
-          matchEnd: m.index + matchedStr.length,
-          fullText: text,
-          proximityTerms: descriptor.proximityTerms,
-          validatorPassed: validatorResult,
-        });
+        let conf: number;
+        // FUZZY FAIL-SAFE logic
+        if (validatorResult === false) {
+          if (descriptor.isHighEntropy) {
+            conf = 0.85; // Boosted to prioritize over generic types
+          } else {
+            continue;
+          }
+        } else {
+          conf = _dlpConfidenceScorer.score({
+            baseRisk: descriptor.baseRisk,
+            matchStart: m.index,
+            matchEnd: m.index + matchedStr.length,
+            fullText: text,
+            proximityTerms: descriptor.proximityTerms,
+            validatorPassed: validatorResult,
+          });
+        }
         if (conf >= confidenceThreshold) {
-          rawHits.push({ start: m.index, end: m.index + matchedStr.length, tag: typeTag, val: matchedStr, conf });
+          spans.push({ start: m.index, end: m.index + matchedStr.length,
+            entityType: typeTag, originalValue: matchedStr,
+            confidence: conf, method: 'dlp_heuristic', language: detectedLanguage });
         }
       }
     }
-    // Pass 2: Locale-tuned name patterns
-    const nameProximity = new Set(["name", "contact", "person", "nom", "isim", "اسم"]);
+    // Pass 2: Locale-tuned name patterns (JIT)
+    const nameProximity = new Set(['name', 'contact', 'person', 'nom', 'isim', '\u0627\u0633\u0645']);
     for (const nameRe of _dlpPatternRegistry.namePatternsFor(detectedLanguage)) {
       const re = new RegExp(nameRe.source, nameRe.flags);
       let m: RegExpExecArray | null;
       while ((m = re.exec(text)) !== null) {
         if (looksLikeToken(m[0])) continue;
         const conf = _dlpConfidenceScorer.score({
-          baseRisk: 0.50,
-          matchStart: m.index,
-          matchEnd: m.index + m[0].length,
-          fullText: text,
-          proximityTerms: nameProximity,
-          validatorPassed: null,
+          baseRisk: 0.50, matchStart: m.index, matchEnd: m.index + m[0].length,
+          fullText: text, proximityTerms: nameProximity, validatorPassed: null,
         });
         if (conf >= confidenceThreshold) {
-          rawHits.push({ start: m.index, end: m.index + m[0].length, tag: "PERSON_NAME", val: m[0], conf });
+          spans.push({ start: m.index, end: m.index + m[0].length,
+            entityType: 'PERSON_NAME', originalValue: m[0],
+            confidence: conf, method: 'dlp_heuristic', language: detectedLanguage });
         }
       }
     }
-    // Pass 3: Locale-tuned address patterns
+    // Pass 3: Locale-tuned address patterns (JIT)
     for (const addrRe of _dlpPatternRegistry.addressPatternsFor(detectedLanguage)) {
       const re = new RegExp(addrRe.source, addrRe.flags);
       let m: RegExpExecArray | null;
       while ((m = re.exec(text)) !== null) {
         if (looksLikeToken(m[0])) continue;
-        rawHits.push({ start: m.index, end: m.index + m[0].length, tag: "PHYS_ADDRESS", val: m[0], conf: 0.55 });
+        spans.push({ start: m.index, end: m.index + m[0].length,
+          entityType: 'PHYS_ADDRESS', originalValue: m[0],
+          confidence: 0.55, method: 'dlp_heuristic', language: detectedLanguage });
       }
     }
-    // De-duplicate overlapping spans — keep longer / higher-confidence match
-    rawHits.sort((a, b) => a.start - b.start || (b.end - b.start) - (a.end - a.start) || b.conf - a.conf);
-    const deduped: RawHit[] = [];
-    let occupiedEnd = -1;
-    for (const hit of rawHits) {
-      if (hit.start >= occupiedEnd) {
-        deduped.push(hit);
-        occupiedEnd = hit.end;
-      }
-    }
+    return spans;
+  }
-    // Replace right-to-left to preserve offsets
+  /** Backward-compat wrapper — collects spans then single-pass encodes. */
+  protected async _tier0Dlp(
+    text: string,
+    encodeFn: (val: string, options?: any) => Promise<string>,
+    confidenceThreshold: number,
+  ): Promise<[string, any[]]> {
+    const spans = await this._tier0CollectSpans(text, confidenceThreshold);
+    const resolved = resolveOverlaps(spans);
     const entities: any[] = [];
-    let excised = text;
-    for (const hit of [...deduped].reverse()) {
-      const token = await encodeFn(hit.val);
-      excised = excised.slice(0, hit.start) + token + excised.slice(hit.end);
-      entities.push({
-        type: hit.tag,
-        value: hit.val,
-        method: "dlp_heuristic",
-        confidence: hit.conf,
-        masked_value: token,
-        language: detectedLanguage,
-      });
-    }
-    return [excised, entities];
+    await Promise.all(resolved.map(async (span) => {
+      span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
+      entities.push({ type: span.entityType, value: span.originalValue,
+        method: span.method, confidence: span.confidence,
+        masked_value: span.maskedValue, language: span.language });
+    }));
+    return [reconstruct(text, resolved), entities];
   }
-  protected async _tier1Regex(
+  protected async _tier1CollectSpans(
     text: string,
-    encodeFn: (val: string) => Promise<string>,
     boostEntities: Set<string>,
     aggressive: boolean,
     confidenceThreshold: number,
-  ): Promise<[string, any[]]> {
-    let entities: any[] = [];
-    let excised = text;
-    let allMatches: any[] = [];
+  ): Promise<Span[]> {
+    const spans: Span[] = [];
     for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
-      // Create a fresh regex for matchAll
       const re = new RegExp(pattern.source, pattern.flags);
-      let match;
+      let match: RegExpExecArray | null;
       while ((match = re.exec(text)) !== null) {
-        let confidence = 0.95;
-        if (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " "))) {
-          confidence = 1.0;
-        }
-        if (entityType === "CREDIT_CARD" && BaseScanner._luhnChecksum(match[0])) {
-          confidence = Math.max(confidence, 0.99);
+        const val = match[0];
+        if (looksLikeToken(val)) continue;
+        let confidence = (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, ' '))) ? 1.0 : 0.95;
+        if (entityType === 'CREDIT_CARD' && BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
+        if (entityType === 'US_ROUTING_NUMBER' && !BaseScanner._abaChecksum(val)) continue;
+        if (confidence >= confidenceThreshold) {
+          spans.push({ start: match.index, end: match.index + val.length,
+            entityType, originalValue: val, confidence, method: 'regex' });
         }
-        if (entityType === "US_ROUTING_NUMBER" && !BaseScanner._abaChecksum(match[0])) {
-          continue;
-        }
-        allMatches.push({
-          start: match.index,
-          end: match.index + match[0].length,
-          type: entityType,
-          value: match[0],
-          confidence
-        });
-      }
-    }
-    // Deduplicate overlapping spans — keep the longest match
-    allMatches.sort((a, b) => a.start - b.start || (b.end - b.start) - (a.end - a.start));
-    let filtered: any[] = [];
-    let lastEnd = -1;
-    for (const m of allMatches) {
-      if (m.start >= lastEnd) {
-        filtered.push(m);
-        lastEnd = m.end;
-      }
-    }
-    // Replace from right to left to preserve offsets
-    const sortedFiltered = [...filtered].sort((a, b) => b.start - a.start);
-    for (const m of sortedFiltered) {
-      if (m.confidence >= confidenceThreshold && !looksLikeToken(m.value)) {
-        const token = await encodeFn(m.value);
-        excised = excised.slice(0, m.start) + token + excised.slice(m.end);
-        entities.push({
-          type: m.type,
-          value: m.value,
-          method: "regex",
-          confidence: m.confidence,
-          masked_value: token,
-        });
       }
     }
+    return spans;
+  }
-    return [excised, entities];
+  /** Backward-compat wrapper. */
+  protected async _tier1Regex(
+    text: string,
+    encodeFn: (val: string, options?: any) => Promise<string>,
+    boostEntities: Set<string>,
+    aggressive: boolean,
+    confidenceThreshold: number,
+  ): Promise<[string, any[]]> {
+    const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
+    const resolved = resolveOverlaps(spans);
+    const entities: any[] = [];
+    await Promise.all(resolved.map(async (span) => {
+      span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
+      entities.push({ type: span.entityType, value: span.originalValue,
+        method: span.method, confidence: span.confidence, masked_value: span.maskedValue });
+    }));
+    return [reconstruct(text, resolved), entities];
   }
   protected async _tier2Nlp(
     text: string,
-    encodeFn: (val: string) => Promise<string>,
+    encodeFn: (val: string, options?: any) => Promise<string>,
     boostEntities: Set<string>,
     aggressive: boolean,
     confidenceThreshold: number,
   ): Promise<[string, any[]]> {
-    /**
-     * Base implementation is a no-op. Override in LocalTransformersScanner
-     * to enable NLP-based detection.
-     */
     return [text, []];
   }
@@ -270,7 +257,7 @@ export class BaseScanner {
   async scanAndTokenize(
     text: string,
     options: {
-      encodeFn?: (val: string) => Promise<string>;
+      encodeFn?: (val: string, options?: any) => Promise<string>;
       pipeline?: string[];
       confidenceThreshold?: number;
       context?: string | null;
@@ -279,25 +266,30 @@ export class BaseScanner {
   ): Promise<string> {
     if (!text || typeof text !== 'string') return text;
-    const pipeline = options.pipeline || ["dlp", "regex", "checksum", "nlp"];
+    const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
     const _encode = options.encodeFn || encode;
     const confidenceThreshold = options.confidenceThreshold ?? 0.7;
     const boost = this._resolveBoost(options.context);
-    let currentText = text;
+    // ── Span-accumulation phase (no string mutation) ─────────────────────
+    const allSpans: Span[] = [];
-    // --- Tier 0: DLP Heuristic (multilingual, 50+ types) ---
-    if (pipeline.includes("dlp")) {
-      [currentText] = await this._tier0Dlp(currentText, _encode, confidenceThreshold);
+    if (pipeline.includes('dlp')) {
+      allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
     }
-    // --- Tier 1: Deterministic ---
-    if (pipeline.includes("regex") || pipeline.includes("checksum")) {
-      [currentText] = await this._tier1Regex(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
+    if (pipeline.includes('regex') || pipeline.includes('checksum')) {
+      allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
     }
-    // --- Tier 2: Probabilistic ---
-    if (pipeline.includes("nlp")) {
+    // ── Single-pass resolve + reconstruct ────────────────────────────────
+    const resolved = resolveOverlaps(allSpans);
+    await Promise.all(resolved.map(async (span) => {
+      span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
+    }));
+    let currentText = reconstruct(text, resolved);
+    // ── Tier 2: Probabilistic NLP (on already-masked text) ───────────────
+    if (pipeline.includes('nlp')) {
       [currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
     }
@@ -307,7 +299,7 @@ export class BaseScanner {
   async scanAndReturnEntities(
     text: string,
     options: {
-      encodeFn?: (val: string) => Promise<string>;
+      encodeFn?: (val: string, options?: any) => Promise<string>;
       pipeline?: string[];
       confidenceThreshold?: number;
       context?: string | null;
@@ -316,30 +308,33 @@ export class BaseScanner {
   ): Promise<any[]> {
     if (!text || typeof text !== 'string') return [];
-    const pipeline = options.pipeline || ["dlp", "regex", "checksum", "nlp"];
+    const pipeline = options.pipeline || ['dlp', 'regex', 'checksum', 'nlp'];
     const _encode = options.encodeFn || encode;
     const confidenceThreshold = options.confidenceThreshold ?? 0.7;
     const boost = this._resolveBoost(options.context);
-    let allEntities: any[] = [];
-    let remaining = text;
-    // --- Tier 0: DLP Heuristic ---
-    if (pipeline.includes("dlp")) {
-      const [newText, tier0] = await this._tier0Dlp(remaining, _encode, confidenceThreshold);
-      remaining = newText;
-      allEntities.push(...tier0);
-    }
+    const allEntities: any[] = [];
-    // --- Tier 1: Deterministic ---
-    if (pipeline.includes("regex") || pipeline.includes("checksum")) {
-      const [newText, tier1] = await this._tier1Regex(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
-      remaining = newText;
-      allEntities.push(...tier1);
+    // ── Span-accumulation phase ──────────────────────────────────────────
+    const allSpans: Span[] = [];
+    if (pipeline.includes('dlp')) {
+      allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
+    }
+    if (pipeline.includes('regex') || pipeline.includes('checksum')) {
+      allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
     }
-    // --- Tier 2: Probabilistic ---
-    if (pipeline.includes("nlp")) {
-      const [_newText, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
+    const resolved = resolveOverlaps(allSpans);
+    await Promise.all(resolved.map(async (span) => {
+      span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
+      allEntities.push({ type: span.entityType, value: span.originalValue,
+        method: span.method, confidence: span.confidence,
+        masked_value: span.maskedValue, language: span.language });
+    }));
+    const remaining = reconstruct(text, resolved);
+    if (pipeline.includes('nlp')) {
+      const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
       allEntities.push(...tier2);
     }