mask-privacy 3.1.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -87,12 +87,6 @@ Mask prevents the misidentification of real data as tokens by using universally
87
87
 
88
88
  This prefix-based approach ensures that the SDK does not inadvertently process valid PII as an existing token.
89
89
 
90
- Additional collision-proof prefixes for international identifiers:
91
- * Turkish TCID tokens use the `990000` prefix (no valid Kimlik number starts with `99`).
92
- * Saudi NID tokens use the `100000` prefix (length-constrained to avoid overlap with real IDs).
93
- * UAE Emirates ID tokens use the `784-0000-` prefix (zeroed sub-fields are structurally invalid).
94
- * IBAN tokens zero the check digits (`XX00...`), which always fails ISO 7064 Mod-97 verification.
95
-
96
90
  ### 4. Enterprise Async Support
97
91
  Mask is built from the ground up for high-concurrency Node.js environments. All core operations are asynchronous and promise-based. Calling `encode()`, `decode()`, or `scanAndTokenize()` allows your event loop to remain unblocked while handling PII tokenization tasks.
98
92
 
@@ -135,7 +129,7 @@ Mask includes the ability to detokenize PII embedded within larger text blocks (
135
129
 
136
130
  ## Multilingual PII Detection (Waterfall Pipeline)
137
131
 
138
- Mask is built for the global enterprise. While many privacy tools are English-centric, the TypeScript SDK implements a **3-Tier Waterfall Detection** strategy designed for high-performance PII detection across 8 major languages using local ONNX models.
132
+ Mask is built for the global enterprise. The TypeScript SDK implements a **3-Tier Waterfall Detection** strategy for high-precision PII detection in **English and Spanish** using local ONNX models.
139
133
 
140
134
  ### Supported Language Matrix
141
135
 
@@ -145,12 +139,6 @@ Mask provides first-class support for the following languages:
145
139
  | :--- | :--- | :--- | :--- |
146
140
  | **English** | `en` | ✅ Full | DistilBERT (Simple) |
147
141
  | **Spanish** | `es` | ✅ Full | BERT Multilingual |
148
- | **French** | `fr` | ✅ Full | BERT Multilingual |
149
- | **German** | `de` | ✅ Full | BERT Multilingual |
150
- | **Turkish** | `tr` | ✅ Full | BERT Multilingual |
151
- | **Arabic** | `ar` | ✅ Full | BERT Multilingual |
152
- | **Japanese** | `ja` | ✅ Full | BERT Multilingual |
153
- | **Chinese** | `zh` | ✅ Full | BERT Multilingual |
154
142
 
155
143
  ### How the Waterfall Works: The Excising Mechanism
156
144
 
@@ -165,11 +153,11 @@ To maintain high performance, the TypeScript SDK does not simply run three separ
165
153
 
166
154
  ### Configuration & Environment Variables
167
155
 
168
- Configure your multilingual environment using standard variables. These are parsed at runtime when the `LocalTransformersScanner` is initialized.
156
+ Configure your language environment using standard variables.
169
157
 
170
158
  | Variable | Default | Description |
171
159
  | :--- | :--- | :--- |
172
- | `MASK_LANGUAGES` | `en` | Comma-separated list of languages (e.g., `en,es,fr,ar`). |
160
+ | `MASK_LANGUAGES` | `en` | Comma-separated language codes. Supported: `en`, `es`. |
173
161
  | `MASK_NLP_MODEL` | *(varies)* | Override the default model (e.g., `Xenova/bert-base-multilingual-cased-ner-hrl`). |
174
162
  | `MASK_MODEL_CACHE_DIR` | `~/.cache` | Local directory for storing serialized ONNX models. |
175
163
  | `MASK_NLP_MAX_WORKERS` | `4` | Number of worker processes/threads for NLP analysis. |
@@ -221,8 +209,8 @@ The TypeScript SDK manages AI models automatically via **Transformers.js**. For
221
209
  ```bash
222
210
  npm install @huggingface/transformers # Required extra
223
211
 
224
- # Pre-cache models for your required languages
225
- export MASK_LANGUAGES="en,es,fr"
212
+ # Pre-cache models for English and Spanish
213
+ export MASK_LANGUAGES="en,es"
226
214
  npx mask-privacy cache-models
227
215
  ```
228
216
 
package/dist/index.d.mts CHANGED
@@ -96,8 +96,9 @@ declare class BaseScanner {
96
96
  protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
97
97
  /** Backward-compat wrapper — collects spans then single-pass encodes. */
98
98
  protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
99
+ /** Tier 1 — Deterministic detection (Legacy: Redirected to DLP) */
99
100
  protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
100
- /** Backward-compat wrapper. */
101
+ /** Backward-compat wrapper. Redirected to DLP. */
101
102
  protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
102
103
  protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
103
104
  protected _resolveBoost(context?: string | null): Set<string>;
@@ -312,14 +313,8 @@ declare class MaskClient {
312
313
  * Supported language tags:
313
314
  * en — English (default / Latin-only fallback)
314
315
  * es — Spanish
315
- * fr — French
316
- * de — German
317
- * tr — Turkish
318
- * ar — Arabic
319
- * zh — Chinese
320
- * ja — Japanese
321
316
  */
322
- type LanguageTag = "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
317
+ type LanguageTag = "en" | "es";
323
318
  interface LanguageBreakdown {
324
319
  language: LanguageTag;
325
320
  breakdown: Record<string, number>;
@@ -332,8 +327,8 @@ interface LanguageBreakdown {
332
327
  * @example
333
328
  * ```ts
334
329
  * const resolver = new LanguageContextResolver();
335
- * const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
336
- * // tag === "tr"
330
+ * const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
331
+ * // tag === "es"
337
332
  * ```
338
333
  */
339
334
  declare class LanguageContextResolver {
@@ -457,8 +452,8 @@ interface ScoreInput {
457
452
  * baseRisk: 0.92,
458
453
  * matchStart: 10,
459
454
  * matchEnd: 21,
460
- * fullText: "TC Kimlik No: 10000000146",
461
- * proximityTerms: new Set(["kimlik", "tc"]),
455
+ * fullText: "Mi número de DNI es 12345678Z",
456
+ * proximityTerms: new Set(["dni", "número"]),
462
457
  * validatorPassed: true,
463
458
  * });
464
459
  * // score === 0.99 (validator override)
@@ -486,7 +481,7 @@ declare class DLPConfidenceScorer {
486
481
  * Provides format-preserving encryption, local/distributed vaulting,
487
482
  * and framework-agnostic tool interception hooks.
488
483
  */
489
- declare const VERSION = "2.0.0";
484
+ declare const VERSION = "3.3.0";
490
485
 
491
486
  /**
492
487
  * Detect PII entities in text and return a list of objects with metadata.
package/dist/index.d.ts CHANGED
@@ -96,8 +96,9 @@ declare class BaseScanner {
96
96
  protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
97
97
  /** Backward-compat wrapper — collects spans then single-pass encodes. */
98
98
  protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
99
+ /** Tier 1 — Deterministic detection (Legacy: Redirected to DLP) */
99
100
  protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
100
- /** Backward-compat wrapper. */
101
+ /** Backward-compat wrapper. Redirected to DLP. */
101
102
  protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
102
103
  protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
103
104
  protected _resolveBoost(context?: string | null): Set<string>;
@@ -312,14 +313,8 @@ declare class MaskClient {
312
313
  * Supported language tags:
313
314
  * en — English (default / Latin-only fallback)
314
315
  * es — Spanish
315
- * fr — French
316
- * de — German
317
- * tr — Turkish
318
- * ar — Arabic
319
- * zh — Chinese
320
- * ja — Japanese
321
316
  */
322
- type LanguageTag = "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
317
+ type LanguageTag = "en" | "es";
323
318
  interface LanguageBreakdown {
324
319
  language: LanguageTag;
325
320
  breakdown: Record<string, number>;
@@ -332,8 +327,8 @@ interface LanguageBreakdown {
332
327
  * @example
333
328
  * ```ts
334
329
  * const resolver = new LanguageContextResolver();
335
- * const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
336
- * // tag === "tr"
330
+ * const tag = resolver.resolve("Hola, mi DNI es 12345678Z");
331
+ * // tag === "es"
337
332
  * ```
338
333
  */
339
334
  declare class LanguageContextResolver {
@@ -457,8 +452,8 @@ interface ScoreInput {
457
452
  * baseRisk: 0.92,
458
453
  * matchStart: 10,
459
454
  * matchEnd: 21,
460
- * fullText: "TC Kimlik No: 10000000146",
461
- * proximityTerms: new Set(["kimlik", "tc"]),
455
+ * fullText: "Mi número de DNI es 12345678Z",
456
+ * proximityTerms: new Set(["dni", "número"]),
462
457
  * validatorPassed: true,
463
458
  * });
464
459
  * // score === 0.99 (validator override)
@@ -486,7 +481,7 @@ declare class DLPConfidenceScorer {
486
481
  * Provides format-preserving encryption, local/distributed vaulting,
487
482
  * and framework-agnostic tool interception hooks.
488
483
  */
489
- declare const VERSION = "2.0.0";
484
+ declare const VERSION = "3.3.0";
490
485
 
491
486
  /**
492
487
  * Detect PII entities in text and return a list of objects with metadata.