mask-privacy 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -87,12 +87,6 @@ Mask prevents the misidentification of real data as tokens by using universally
87
87
 
88
88
  This prefix-based approach ensures that the SDK does not inadvertently process valid PII as an existing token.
89
89
 
90
- Additional collision-proof prefixes for international identifiers:
91
- * Turkish TCID tokens use the `990000` prefix (no valid Kimlik number starts with `99`).
92
- * Saudi NID tokens use the `100000` prefix (length-constrained to avoid overlap with real IDs).
93
- * UAE Emirates ID tokens use the `784-0000-` prefix (zeroed sub-fields are structurally invalid).
94
- * IBAN tokens zero the check digits (`XX00...`), which always fails ISO 7064 Mod-97 verification.
95
-
96
90
  ### 4. Enterprise Async Support
97
91
  Mask is built from the ground up for high-concurrency Node.js environments. All core operations are asynchronous and promise-based. Calling `encode()`, `decode()`, or `scanAndTokenize()` allows your event loop to remain unblocked while handling PII tokenization tasks.
98
92
 
@@ -135,7 +129,7 @@ Mask includes the ability to detokenize PII embedded within larger text blocks (
135
129
 
136
130
  ## Multilingual PII Detection (Waterfall Pipeline)
137
131
 
138
- Mask is built for the global enterprise. While many privacy tools are English-centric, the TypeScript SDK implements a **3-Tier Waterfall Detection** strategy designed for high-performance PII detection across 8 major languages using local ONNX models.
132
+ Mask is built for the global enterprise. The TypeScript SDK implements a **3-Tier Waterfall Detection** strategy for high-precision PII detection in **English and Spanish** using local ONNX models.
139
133
 
140
134
  ### Supported Language Matrix
141
135
 
@@ -145,12 +139,6 @@ Mask provides first-class support for the following languages:
145
139
  | :--- | :--- | :--- | :--- |
146
140
  | **English** | `en` | ✅ Full | DistilBERT (Simple) |
147
141
  | **Spanish** | `es` | ✅ Full | BERT Multilingual |
148
- | **French** | `fr` | ✅ Full | BERT Multilingual |
149
- | **German** | `de` | ✅ Full | BERT Multilingual |
150
- | **Turkish** | `tr` | ✅ Full | BERT Multilingual |
151
- | **Arabic** | `ar` | ✅ Full | BERT Multilingual |
152
- | **Japanese** | `ja` | ✅ Full | BERT Multilingual |
153
- | **Chinese** | `zh` | ✅ Full | BERT Multilingual |
154
142
 
155
143
  ### How the Waterfall Works: The Excising Mechanism
156
144
 
@@ -165,11 +153,11 @@ To maintain high performance, the TypeScript SDK does not simply run three separ
165
153
 
166
154
  ### Configuration & Environment Variables
167
155
 
168
- Configure your multilingual environment using standard variables. These are parsed at runtime when the `LocalTransformersScanner` is initialized.
156
+ Configure your language environment using standard variables.
169
157
 
170
158
  | Variable | Default | Description |
171
159
  | :--- | :--- | :--- |
172
- | `MASK_LANGUAGES` | `en` | Comma-separated list of languages (e.g., `en,es,fr,ar`). |
160
+ | `MASK_LANGUAGES` | `en` | Comma-separated language codes. Supported: `en`, `es`. |
173
161
  | `MASK_NLP_MODEL` | *(varies)* | Override the default model (e.g., `Xenova/bert-base-multilingual-cased-ner-hrl`). |
174
162
  | `MASK_MODEL_CACHE_DIR` | `~/.cache` | Local directory for storing serialized ONNX models. |
175
163
  | `MASK_NLP_MAX_WORKERS` | `4` | Number of worker processes/threads for NLP analysis. |
@@ -221,8 +209,8 @@ The TypeScript SDK manages AI models automatically via **Transformers.js**. For
221
209
  ```bash
222
210
  npm install @huggingface/transformers # Required extra
223
211
 
224
- # Pre-cache models for your required languages
225
- export MASK_LANGUAGES="en,es,fr"
212
+ # Pre-cache models for English and Spanish
213
+ export MASK_LANGUAGES="en,es"
226
214
  npx mask-privacy cache-models
227
215
  ```
228
216
 
package/dist/index.d.mts CHANGED
@@ -14,6 +14,7 @@ type EncodeOptions = {
14
14
  ttl?: number;
15
15
  searchBuckets?: ('year' | 'month' | 'day' | 'numeric')[];
16
16
  searchBucketSize?: number;
17
+ entityType?: string;
17
18
  };
18
19
  /**
19
20
  * Tokenise rawText, encrypt it, store in vault, return the FPE token.
@@ -49,9 +50,40 @@ declare function looksLikeToken(value: string | any): boolean;
49
50
  /** Clear the cached master key. Useful in tests. */
50
51
  declare function resetMasterKey(): void;
51
52
  /**
52
- * Return a **deterministic**, format-preserving token for rawText.
53
+ * Return a **deterministic**, format-preserving token for rawText using its entityType.
54
+ */
55
+ declare function generateFPEToken(rawText: string, entityType?: string): Promise<string>;
56
+
57
+ /**
58
+ * Span Resolution Engine — Sweep-Line Overlap Resolver (TypeScript).
59
+ *
60
+ * All detection tiers now return Span objects instead of mutating the text.
61
+ * resolveOverlaps() chooses the winning span in every conflicting region,
62
+ * and reconstruct() rebuilds the string exactly once.
63
+ */
64
+ interface Span {
65
+ start: number;
66
+ end: number;
67
+ entityType: string;
68
+ originalValue: string;
69
+ confidence: number;
70
+ method: string;
71
+ language?: string;
72
+ maskedValue?: string;
73
+ }
74
+
75
+ /**
76
+ * Entity Detection Scanner — Tiered Waterfall Pipeline.
77
+ *
78
+ * Scans unstructured text to identify PII (Emails, Phones, SSNs, Credit Cards,
79
+ * Names) and replaces them in-place with Format-Preserving Encryption (FPE)
80
+ * tokens.
81
+ *
82
+ * Detection Architecture (Waterfall):
83
+ * Tier 0 — DLP Heuristic: Multilingual, 50+ types, checksum validators
84
+ * Tier 1 — Deterministic: Regex + Checksum (fast, provable, auditable)
85
+ * Tier 2 — Probabilistic: Local NLP via Transformers (catches names/orgs)
53
86
  */
54
- declare function generateFPEToken(rawText: string): Promise<string>;
55
87
 
56
88
  declare class BaseScanner {
57
89
  protected _supportedEntities: string[];
@@ -61,19 +93,23 @@ declare class BaseScanner {
61
93
  protected static _luhnChecksum(ccNumber: string): boolean;
62
94
  /** Validate a US ABA routing number using the checksum algorithm. */
63
95
  protected static _abaChecksum(routingNumber: string): boolean;
64
- protected _tier0Dlp(text: string, encodeFn: (val: string) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
65
- protected _tier1Regex(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
66
- protected _tier2Nlp(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
96
+ protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
97
+ /** Backward-compat wrapper collects spans then single-pass encodes. */
98
+ protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
99
+ protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
100
+ /** Backward-compat wrapper. */
101
+ protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
102
+ protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
67
103
  protected _resolveBoost(context?: string | null): Set<string>;
68
104
  scanAndTokenize(text: string, options?: {
69
- encodeFn?: (val: string) => Promise<string>;
105
+ encodeFn?: (val: string, options?: any) => Promise<string>;
70
106
  pipeline?: string[];
71
107
  confidenceThreshold?: number;
72
108
  context?: string | null;
73
109
  aggressive?: boolean;
74
110
  }): Promise<string>;
75
111
  scanAndReturnEntities(text: string, options?: {
76
- encodeFn?: (val: string) => Promise<string>;
112
+ encodeFn?: (val: string, options?: any) => Promise<string>;
77
113
  pipeline?: string[];
78
114
  confidenceThreshold?: number;
79
115
  context?: string | null;
@@ -107,7 +143,7 @@ declare class LocalTransformersScanner extends BaseScanner {
107
143
  * Map Transformer entity types to Mask internal entity types.
108
144
  */
109
145
  private _mapEntityType;
110
- protected _tier2Nlp(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
146
+ protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
111
147
  /**
112
148
  * Merges sub-tokens and entities of the same type while precisely tracking
113
149
  * offsets in the original text.
@@ -276,14 +312,8 @@ declare class MaskClient {
276
312
  * Supported language tags:
277
313
  * en — English (default / Latin-only fallback)
278
314
  * es — Spanish
279
- * fr — French
280
- * de — German
281
- * tr — Turkish
282
- * ar — Arabic
283
- * zh — Chinese
284
- * ja — Japanese
285
315
  */
286
- type LanguageTag = "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
316
+ type LanguageTag = "en" | "es";
287
317
  interface LanguageBreakdown {
288
318
  language: LanguageTag;
289
319
  breakdown: Record<string, number>;
@@ -296,8 +326,8 @@ interface LanguageBreakdown {
296
326
  * @example
297
327
  * ```ts
298
328
  * const resolver = new LanguageContextResolver();
299
- * const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
300
- * // tag === "tr"
329
+ * const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
330
+ * // tag === "es"
301
331
  * ```
302
332
  */
303
333
  declare class LanguageContextResolver {
@@ -338,27 +368,28 @@ interface PatternDescriptor {
338
368
  baseRisk: number;
339
369
  category: SensitiveCategory;
340
370
  validatorTag: string | null;
371
+ isHighEntropy: boolean;
372
+ supportedLocales: string[];
341
373
  }
342
374
  /**
343
375
  * Immutable catalogue of sensitive-data regex signatures.
344
- *
345
- * @example
346
- * ```ts
347
- * const reg = new DLPPatternRegistry(); // load everything
348
- * const reg = new DLPPatternRegistry(new Set([SensitiveCategory.FINANCIAL]));
349
- * ```
350
376
  */
351
377
  declare class DLPPatternRegistry {
352
378
  private readonly catalogue;
379
+ private readonly localeCategoryRegexMap;
353
380
  constructor(loadGroups?: ReadonlySet<SensitiveCategory>);
354
381
  get typeNames(): string[];
355
382
  /** Yield [typeName, descriptor] pairs. */
356
383
  iterDescriptors(): IterableIterator<[string, PatternDescriptor]>;
357
384
  descriptorFor(typeName: string): PatternDescriptor | undefined;
358
- /** Return locale-tuned name regexes, falling back to English. */
359
385
  namePatternsFor(lang: LanguageTag | string): RegExp[];
360
- /** Return locale-tuned address regexes, falling back to English. */
361
386
  addressPatternsFor(lang: LanguageTag | string): RegExp[];
387
+ getCategoryRegexesMap(locale?: string): Map<string, {
388
+ re: RegExp;
389
+ typeOrder: string[];
390
+ }>;
391
+ getCategoryTypeMap(categoryName: string, locale?: string): string[];
392
+ private compileForLocale;
362
393
  private buildCatalogue;
363
394
  }
364
395
 
@@ -420,8 +451,8 @@ interface ScoreInput {
420
451
  * baseRisk: 0.92,
421
452
  * matchStart: 10,
422
453
  * matchEnd: 21,
423
- * fullText: "TC Kimlik No: 10000000146",
424
- * proximityTerms: new Set(["kimlik", "tc"]),
454
+ * fullText: "Mi número de DNI es 12345678Z",
455
+ * proximityTerms: new Set(["dni", "número"]),
425
456
  * validatorPassed: true,
426
457
  * });
427
458
  * // score === 0.99 (validator override)
package/dist/index.d.ts CHANGED
@@ -14,6 +14,7 @@ type EncodeOptions = {
14
14
  ttl?: number;
15
15
  searchBuckets?: ('year' | 'month' | 'day' | 'numeric')[];
16
16
  searchBucketSize?: number;
17
+ entityType?: string;
17
18
  };
18
19
  /**
19
20
  * Tokenise rawText, encrypt it, store in vault, return the FPE token.
@@ -49,9 +50,40 @@ declare function looksLikeToken(value: string | any): boolean;
49
50
  /** Clear the cached master key. Useful in tests. */
50
51
  declare function resetMasterKey(): void;
51
52
  /**
52
- * Return a **deterministic**, format-preserving token for rawText.
53
+ * Return a **deterministic**, format-preserving token for rawText using its entityType.
54
+ */
55
+ declare function generateFPEToken(rawText: string, entityType?: string): Promise<string>;
56
+
57
+ /**
58
+ * Span Resolution Engine — Sweep-Line Overlap Resolver (TypeScript).
59
+ *
60
+ * All detection tiers now return Span objects instead of mutating the text.
61
+ * resolveOverlaps() chooses the winning span in every conflicting region,
62
+ * and reconstruct() rebuilds the string exactly once.
63
+ */
64
+ interface Span {
65
+ start: number;
66
+ end: number;
67
+ entityType: string;
68
+ originalValue: string;
69
+ confidence: number;
70
+ method: string;
71
+ language?: string;
72
+ maskedValue?: string;
73
+ }
74
+
75
+ /**
76
+ * Entity Detection Scanner — Tiered Waterfall Pipeline.
77
+ *
78
+ * Scans unstructured text to identify PII (Emails, Phones, SSNs, Credit Cards,
79
+ * Names) and replaces them in-place with Format-Preserving Encryption (FPE)
80
+ * tokens.
81
+ *
82
+ * Detection Architecture (Waterfall):
83
+ * Tier 0 — DLP Heuristic: Multilingual, 50+ types, checksum validators
84
+ * Tier 1 — Deterministic: Regex + Checksum (fast, provable, auditable)
85
+ * Tier 2 — Probabilistic: Local NLP via Transformers (catches names/orgs)
53
86
  */
54
- declare function generateFPEToken(rawText: string): Promise<string>;
55
87
 
56
88
  declare class BaseScanner {
57
89
  protected _supportedEntities: string[];
@@ -61,19 +93,23 @@ declare class BaseScanner {
61
93
  protected static _luhnChecksum(ccNumber: string): boolean;
62
94
  /** Validate a US ABA routing number using the checksum algorithm. */
63
95
  protected static _abaChecksum(routingNumber: string): boolean;
64
- protected _tier0Dlp(text: string, encodeFn: (val: string) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
65
- protected _tier1Regex(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
66
- protected _tier2Nlp(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
96
+ protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
97
+ /** Backward-compat wrapper collects spans then single-pass encodes. */
98
+ protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
99
+ protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
100
+ /** Backward-compat wrapper. */
101
+ protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
102
+ protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
67
103
  protected _resolveBoost(context?: string | null): Set<string>;
68
104
  scanAndTokenize(text: string, options?: {
69
- encodeFn?: (val: string) => Promise<string>;
105
+ encodeFn?: (val: string, options?: any) => Promise<string>;
70
106
  pipeline?: string[];
71
107
  confidenceThreshold?: number;
72
108
  context?: string | null;
73
109
  aggressive?: boolean;
74
110
  }): Promise<string>;
75
111
  scanAndReturnEntities(text: string, options?: {
76
- encodeFn?: (val: string) => Promise<string>;
112
+ encodeFn?: (val: string, options?: any) => Promise<string>;
77
113
  pipeline?: string[];
78
114
  confidenceThreshold?: number;
79
115
  context?: string | null;
@@ -107,7 +143,7 @@ declare class LocalTransformersScanner extends BaseScanner {
107
143
  * Map Transformer entity types to Mask internal entity types.
108
144
  */
109
145
  private _mapEntityType;
110
- protected _tier2Nlp(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
146
+ protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
111
147
  /**
112
148
  * Merges sub-tokens and entities of the same type while precisely tracking
113
149
  * offsets in the original text.
@@ -276,14 +312,8 @@ declare class MaskClient {
276
312
  * Supported language tags:
277
313
  * en — English (default / Latin-only fallback)
278
314
  * es — Spanish
279
- * fr — French
280
- * de — German
281
- * tr — Turkish
282
- * ar — Arabic
283
- * zh — Chinese
284
- * ja — Japanese
285
315
  */
286
- type LanguageTag = "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
316
+ type LanguageTag = "en" | "es";
287
317
  interface LanguageBreakdown {
288
318
  language: LanguageTag;
289
319
  breakdown: Record<string, number>;
@@ -296,8 +326,8 @@ interface LanguageBreakdown {
296
326
  * @example
297
327
  * ```ts
298
328
  * const resolver = new LanguageContextResolver();
299
- * const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
300
- * // tag === "tr"
329
+ * const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
330
+ * // tag === "es"
301
331
  * ```
302
332
  */
303
333
  declare class LanguageContextResolver {
@@ -338,27 +368,28 @@ interface PatternDescriptor {
338
368
  baseRisk: number;
339
369
  category: SensitiveCategory;
340
370
  validatorTag: string | null;
371
+ isHighEntropy: boolean;
372
+ supportedLocales: string[];
341
373
  }
342
374
  /**
343
375
  * Immutable catalogue of sensitive-data regex signatures.
344
- *
345
- * @example
346
- * ```ts
347
- * const reg = new DLPPatternRegistry(); // load everything
348
- * const reg = new DLPPatternRegistry(new Set([SensitiveCategory.FINANCIAL]));
349
- * ```
350
376
  */
351
377
  declare class DLPPatternRegistry {
352
378
  private readonly catalogue;
379
+ private readonly localeCategoryRegexMap;
353
380
  constructor(loadGroups?: ReadonlySet<SensitiveCategory>);
354
381
  get typeNames(): string[];
355
382
  /** Yield [typeName, descriptor] pairs. */
356
383
  iterDescriptors(): IterableIterator<[string, PatternDescriptor]>;
357
384
  descriptorFor(typeName: string): PatternDescriptor | undefined;
358
- /** Return locale-tuned name regexes, falling back to English. */
359
385
  namePatternsFor(lang: LanguageTag | string): RegExp[];
360
- /** Return locale-tuned address regexes, falling back to English. */
361
386
  addressPatternsFor(lang: LanguageTag | string): RegExp[];
387
+ getCategoryRegexesMap(locale?: string): Map<string, {
388
+ re: RegExp;
389
+ typeOrder: string[];
390
+ }>;
391
+ getCategoryTypeMap(categoryName: string, locale?: string): string[];
392
+ private compileForLocale;
362
393
  private buildCatalogue;
363
394
  }
364
395
 
@@ -420,8 +451,8 @@ interface ScoreInput {
420
451
  * baseRisk: 0.92,
421
452
  * matchStart: 10,
422
453
  * matchEnd: 21,
423
- * fullText: "TC Kimlik No: 10000000146",
424
- * proximityTerms: new Set(["kimlik", "tc"]),
454
+ * fullText: "Mi número de DNI es 12345678Z",
455
+ * proximityTerms: new Set(["dni", "número"]),
425
456
  * validatorPassed: true,
426
457
  * });
427
458
  * // score === 0.99 (validator override)