mask-privacy 2.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -14
- package/dist/index.d.mts +222 -8
- package/dist/index.d.ts +222 -8
- package/dist/index.js +2308 -913
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +2303 -908
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
- package/src/cli.ts +79 -0
- package/src/config.ts +75 -0
- package/src/core/crypto.ts +5 -4
- package/src/core/dlp/assessor.ts +109 -0
- package/src/core/dlp/handlers.ts +348 -0
- package/src/core/dlp/index.ts +29 -0
- package/src/core/dlp/registry.ts +348 -0
- package/src/core/dlp/scorer.ts +109 -0
- package/src/core/fpe.ts +154 -14
- package/src/core/fpe_utils.ts +60 -7
- package/src/core/key_provider.ts +8 -10
- package/src/core/remote_scanner.ts +3 -2
- package/src/core/scanner.ts +192 -77
- package/src/core/span.ts +76 -0
- package/src/core/transformers_scanner.ts +20 -7
- package/src/core/vault.ts +24 -19
- package/src/index.ts +10 -0
- package/src/telemetry/audit_logger.ts +4 -4
- package/tests/async.test.ts +2 -2
- package/tests/dlp_hardened.test.ts +38 -0
- package/tests/fail_strategy.test.ts +2 -2
- package/tests/fpe.test.ts +4 -4
- package/tests/hooks.test.ts +2 -2
- package/tests/jest.setup.ts +36 -0
- package/tests/langchain.test.ts +2 -2
- package/tests/llamaindex.test.ts +1 -1
- package/tests/scanner.test.ts +0 -1
- package/tests/substring.test.ts +1 -1
- package/tests/vault.test.ts +1 -1
package/README.md
CHANGED
|
@@ -87,6 +87,12 @@ Mask prevents the misidentification of real data as tokens by using universally
|
|
|
87
87
|
|
|
88
88
|
This prefix-based approach ensures that the SDK does not inadvertently process valid PII as an existing token.
|
|
89
89
|
|
|
90
|
+
Additional collision-proof prefixes for international identifiers:
|
|
91
|
+
* Turkish TCID tokens use the `990000` prefix (no valid Kimlik number starts with `99`).
|
|
92
|
+
* Saudi NID tokens use the `100000` prefix (length-constrained to avoid overlap with real IDs).
|
|
93
|
+
* UAE Emirates ID tokens use the `784-0000-` prefix (zeroed sub-fields are structurally invalid).
|
|
94
|
+
* IBAN tokens zero the check digits (`XX00...`), which always fails ISO 7064 Mod-97 verification.
|
|
95
|
+
|
|
90
96
|
### 4. Enterprise Async Support
|
|
91
97
|
Mask is built from the ground up for high-concurrency Node.js environments. All core operations are asynchronous and promise-based. Calling `encode()`, `decode()`, or `scanAndTokenize()` allows your event loop to remain unblocked while handling PII tokenization tasks.
|
|
92
98
|
|
|
@@ -127,19 +133,97 @@ Performance-sensitive deployments utilize the built-in `LocalTransformersScanner
|
|
|
127
133
|
### 7. Sub-string Detokenization
|
|
128
134
|
Mask includes the ability to detokenize PII embedded within larger text blocks (like email bodies or chat messages). `detokenizeText()` uses high-performance regex to find and restore all tokens within a paragraph before they hit your tools.
|
|
129
135
|
|
|
130
|
-
##
|
|
136
|
+
## Multilingual PII Detection (Waterfall Pipeline)
|
|
137
|
+
|
|
138
|
+
Mask is built for the global enterprise. While many privacy tools are English-centric, the TypeScript SDK implements a **3-Tier Waterfall Detection** strategy designed for high-performance PII detection across 8 major languages using local ONNX models.
|
|
139
|
+
|
|
140
|
+
### Supported Language Matrix
|
|
141
|
+
|
|
142
|
+
Mask provides first-class support for the following languages:
|
|
143
|
+
|
|
144
|
+
| Language | Code | Tier 0 (DLP) | Tier 2 (NLP Engine) |
|
|
145
|
+
| :--- | :--- | :--- | :--- |
|
|
146
|
+
| **English** | `en` | ✅ Full | DistilBERT (Simple) |
|
|
147
|
+
| **Spanish** | `es` | ✅ Full | BERT Multilingual |
|
|
148
|
+
| **French** | `fr` | ✅ Full | BERT Multilingual |
|
|
149
|
+
| **German** | `de` | ✅ Full | BERT Multilingual |
|
|
150
|
+
| **Turkish** | `tr` | ✅ Full | BERT Multilingual |
|
|
151
|
+
| **Arabic** | `ar` | ✅ Full | BERT Multilingual |
|
|
152
|
+
| **Japanese** | `ja` | ✅ Full | BERT Multilingual |
|
|
153
|
+
| **Chinese** | `zh` | ✅ Full | BERT Multilingual |
|
|
154
|
+
|
|
155
|
+
### How the Waterfall Works: The Excising Mechanism
|
|
156
|
+
|
|
157
|
+
To maintain high performance, the TypeScript SDK does not simply run three separate scans. It uses a **Sequential Mutation** strategy:
|
|
158
|
+
|
|
159
|
+
1. **Tier 0 & 1 (The Scouts):** The SDK first runs the high-speed DLP and Regex engines synchronously in the main thread.
|
|
160
|
+
2. **Immediate Tokenization:** Any PII found by these tiers is **immediately replaced** by a token in the string buffer.
|
|
161
|
+
3. **Tier 2 (The Heavy Infantry):** The expensive NLP engine (Transformers.js) only scans the *remaining* text. Because the PII has already been "excised" (cut out and replaced with tokens), the NLP engine doesn't waste compute on data already identified.
|
|
162
|
+
4. **Bypass Logic:** All tiers are "token-aware." If a scan encounters a string that is already a Mask token, it skips it entirely, preventing redundant processing or "double-tokenization."
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
### Configuration & Environment Variables
|
|
167
|
+
|
|
168
|
+
Configure your multilingual environment using standard variables. These are parsed at runtime when the `LocalTransformersScanner` is initialized.
|
|
169
|
+
|
|
170
|
+
| Variable | Default | Description |
|
|
171
|
+
| :--- | :--- | :--- |
|
|
172
|
+
| `MASK_LANGUAGES` | `en` | Comma-separated list of languages (e.g., `en,es,fr,ar`). |
|
|
173
|
+
| `MASK_NLP_MODEL` | *(varies)* | Override the default model (e.g., `Xenova/bert-base-multilingual-cased-ner-hrl`). |
|
|
174
|
+
| `MASK_MODEL_CACHE_DIR` | `~/.cache` | Local directory for storing serialized ONNX models. |
|
|
175
|
+
| `MASK_NLP_MAX_WORKERS` | `4` | Number of worker processes/threads for NLP analysis. |
|
|
176
|
+
| `MASK_NLP_TIMEOUT_SECONDS` | `60` | Max seconds for a scan before timing out. |
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
### Automatic Model Management
|
|
181
|
+
|
|
182
|
+
The TypeScript SDK manages AI models automatically via the **Transformers.js** runtime.
|
|
183
|
+
|
|
184
|
+
#### 1. Automatic Downloads
|
|
185
|
+
When you set `MASK_LANGUAGES` to include non-English languages, the scanner will automatically download the multilingual BERT model from Hugging Face on the first execution and cache it locally.
|
|
186
|
+
|
|
187
|
+
#### 2. Pre-Warming (Performance)
|
|
188
|
+
Upon initialization, the `LocalTransformersScanner` starts a worker pool and "pre-warms" the models. This ensures that the first real request doesn't suffer from high "cold-start" latency.
|
|
131
189
|
|
|
132
|
-
|
|
190
|
+
#### 3. Air-Gapped / Offline Environments
|
|
191
|
+
For high-security environments, you can pre-cache models. Run this script in your build pipeline:
|
|
133
192
|
```bash
|
|
134
|
-
|
|
193
|
+
# Set a custom cache directory
|
|
194
|
+
export MASK_MODEL_CACHE_DIR="./models"
|
|
195
|
+
|
|
196
|
+
# Run a dummy scan to trigger the download
|
|
197
|
+
node -e "require('mask-privacy').getScanner().scanAndTokenize('John Doe')"
|
|
198
|
+
|
|
199
|
+
# Bundle the './models' folder with your container
|
|
135
200
|
```
|
|
136
201
|
|
|
137
|
-
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
### Performance & Latency Benchmarks
|
|
205
|
+
|
|
206
|
+
*Measured on 4-vCPU 8GB RAM Instance (Node.js 20+)*
|
|
207
|
+
|
|
208
|
+
| Tier | Engine | Avg. Latency | Rationale |
|
|
209
|
+
| :--- | :--- | :--- | :--- |
|
|
210
|
+
| Tier 0 | DLP (Heuristic) | ~2ms | Main-thread synchronous regex |
|
|
211
|
+
| Tier 1 | Regex (Deterministic) | ~1ms | Main-thread synchronous regex |
|
|
212
|
+
| Tier 2 | Transformers (Local) | 300ms - 800ms | Offloaded to Worker Threads (Piscina) |
|
|
213
|
+
|
|
214
|
+
**Total Overhead:** Usually **< 400ms** for standard chat lengths. Mask uses an **Excising Mechanism** to ensure that text already identified in Tiers 0/1 is removed from the NLP buffer, significantly accelerating the heavy Transformer inference.
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
### Installing AI Models (Production Ready)
|
|
219
|
+
The TypeScript SDK manages AI models automatically via **Transformers.js**. For production air-gapped environments or to avoid "cold-start" latency, we recommend using the pre-caching CLI:
|
|
220
|
+
|
|
138
221
|
```bash
|
|
139
|
-
npm install
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
222
|
+
npm install @huggingface/transformers # Required extra
|
|
223
|
+
|
|
224
|
+
# Pre-cache models for your required languages
|
|
225
|
+
export MASK_LANGUAGES="en,es,fr"
|
|
226
|
+
npx mask-privacy cache-models
|
|
143
227
|
```
|
|
144
228
|
|
|
145
229
|
### Framework Support
|
|
@@ -152,6 +236,24 @@ Mask supports major AI frameworks via built-in hooks:
|
|
|
152
236
|
|
|
153
237
|
Before running your agents, Mask requires an encryption key and a vault backend selection.
|
|
154
238
|
|
|
239
|
+
#### Where to set these?
|
|
240
|
+
Select the method that best fits your deployment:
|
|
241
|
+
|
|
242
|
+
1. **In a `.env` file (Recommended)**: Create a file in your project root.
|
|
243
|
+
```env
|
|
244
|
+
MASK_LANGUAGES="es,en"
|
|
245
|
+
MASK_ENCRYPTION_KEY="your-key"
|
|
246
|
+
```
|
|
247
|
+
Then load it using a library like `dotenv`.
|
|
248
|
+
2. **In your Terminal**:
|
|
249
|
+
* **Bash**: `export MASK_LANGUAGES="es,en"`
|
|
250
|
+
* **PowerShell**: `$env:MASK_LANGUAGES="es,en"`
|
|
251
|
+
3. **Directly in TypeScript/Node.js**:
|
|
252
|
+
```typescript
|
|
253
|
+
process.env.MASK_LANGUAGES = "es,en";
|
|
254
|
+
// Ensure this happens BEFORE initializing the MaskClient
|
|
255
|
+
```
|
|
256
|
+
|
|
155
257
|
#### 1. Configure Keys
|
|
156
258
|
By default, Mask reads from environment variables.
|
|
157
259
|
```bash
|
|
@@ -181,15 +283,24 @@ export MASK_DYNAMODB_REGION=us-east-1
|
|
|
181
283
|
export MASK_MEMCACHED_HOST=localhost
|
|
182
284
|
export MASK_MEMCACHED_PORT=11211
|
|
183
285
|
|
|
184
|
-
#### 4. Security
|
|
185
|
-
# Enable strict mode to refuse startup without MASK_ENCRYPTION_KEY
|
|
186
|
-
export MASK_STRICT_PROD=true
|
|
286
|
+
#### 4. Security Guardrails: Fail-Shut by Default
|
|
187
287
|
|
|
188
|
-
|
|
189
|
-
export MASK_BLIND_INDEX_SALT="custom-salt-here"
|
|
288
|
+
To prevent accidental data leakage, Mask defaults to a **Fail-Shut** strategy. If the Vault or Key Provider is unreachable, the SDK will throw a `MaskVaultConnectionError`.
|
|
190
289
|
|
|
191
290
|
> [!IMPORTANT]
|
|
192
|
-
> **
|
|
291
|
+
> **Environment Modes:**
|
|
292
|
+
> - **Production (Default):** Fail-Shut enabled. Strictly protects PII.
|
|
293
|
+
> - **Development:** Set `MASK_ENV=dev` to allow "Fail-Open" behavior (PII is returned as-is if the vault fails).
|
|
294
|
+
|
|
295
|
+
#### 5. Model Pre-caching CLI
|
|
296
|
+
|
|
297
|
+
For production air-gapped environments or to avoid "cold-start" latency, use the model pre-caching tool:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
# Cache English and Spanish models
|
|
301
|
+
export MASK_MODEL_CACHE_DIR="./models"
|
|
302
|
+
npx ts-node src/cli.ts cache-models --languages en,es
|
|
303
|
+
```
|
|
193
304
|
|
|
194
305
|
# Configure MemoryVault cleanup aggressiveness (default: 0.01)
|
|
195
306
|
export MASK_VAULT_CLEANUP_FREQUENCY=0.05
|
package/dist/index.d.mts
CHANGED
|
@@ -14,6 +14,7 @@ type EncodeOptions = {
|
|
|
14
14
|
ttl?: number;
|
|
15
15
|
searchBuckets?: ('year' | 'month' | 'day' | 'numeric')[];
|
|
16
16
|
searchBucketSize?: number;
|
|
17
|
+
entityType?: string;
|
|
17
18
|
};
|
|
18
19
|
/**
|
|
19
20
|
* Tokenise rawText, encrypt it, store in vault, return the FPE token.
|
|
@@ -49,9 +50,40 @@ declare function looksLikeToken(value: string | any): boolean;
|
|
|
49
50
|
/** Clear the cached master key. Useful in tests. */
|
|
50
51
|
declare function resetMasterKey(): void;
|
|
51
52
|
/**
|
|
52
|
-
* Return a **deterministic**, format-preserving token for rawText.
|
|
53
|
+
* Return a **deterministic**, format-preserving token for rawText using its entityType.
|
|
54
|
+
*/
|
|
55
|
+
declare function generateFPEToken(rawText: string, entityType?: string): Promise<string>;
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Span Resolution Engine — Sweep-Line Overlap Resolver (TypeScript).
|
|
59
|
+
*
|
|
60
|
+
* All detection tiers now return Span objects instead of mutating the text.
|
|
61
|
+
* resolveOverlaps() chooses the winning span in every conflicting region,
|
|
62
|
+
* and reconstruct() rebuilds the string exactly once.
|
|
63
|
+
*/
|
|
64
|
+
interface Span {
|
|
65
|
+
start: number;
|
|
66
|
+
end: number;
|
|
67
|
+
entityType: string;
|
|
68
|
+
originalValue: string;
|
|
69
|
+
confidence: number;
|
|
70
|
+
method: string;
|
|
71
|
+
language?: string;
|
|
72
|
+
maskedValue?: string;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Entity Detection Scanner — Tiered Waterfall Pipeline.
|
|
77
|
+
*
|
|
78
|
+
* Scans unstructured text to identify PII (Emails, Phones, SSNs, Credit Cards,
|
|
79
|
+
* Names) and replaces them in-place with Format-Preserving Encryption (FPE)
|
|
80
|
+
* tokens.
|
|
81
|
+
*
|
|
82
|
+
* Detection Architecture (Waterfall):
|
|
83
|
+
* Tier 0 — DLP Heuristic: Multilingual, 50+ types, checksum validators
|
|
84
|
+
* Tier 1 — Deterministic: Regex + Checksum (fast, provable, auditable)
|
|
85
|
+
* Tier 2 — Probabilistic: Local NLP via Transformers (catches names/orgs)
|
|
53
86
|
*/
|
|
54
|
-
declare function generateFPEToken(rawText: string): Promise<string>;
|
|
55
87
|
|
|
56
88
|
declare class BaseScanner {
|
|
57
89
|
protected _supportedEntities: string[];
|
|
@@ -61,18 +93,23 @@ declare class BaseScanner {
|
|
|
61
93
|
protected static _luhnChecksum(ccNumber: string): boolean;
|
|
62
94
|
/** Validate a US ABA routing number using the checksum algorithm. */
|
|
63
95
|
protected static _abaChecksum(routingNumber: string): boolean;
|
|
64
|
-
protected
|
|
65
|
-
|
|
96
|
+
protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
|
|
97
|
+
/** Backward-compat wrapper — collects spans then single-pass encodes. */
|
|
98
|
+
protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
99
|
+
protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
|
|
100
|
+
/** Backward-compat wrapper. */
|
|
101
|
+
protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
102
|
+
protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
66
103
|
protected _resolveBoost(context?: string | null): Set<string>;
|
|
67
104
|
scanAndTokenize(text: string, options?: {
|
|
68
|
-
encodeFn?: (val: string) => Promise<string>;
|
|
105
|
+
encodeFn?: (val: string, options?: any) => Promise<string>;
|
|
69
106
|
pipeline?: string[];
|
|
70
107
|
confidenceThreshold?: number;
|
|
71
108
|
context?: string | null;
|
|
72
109
|
aggressive?: boolean;
|
|
73
110
|
}): Promise<string>;
|
|
74
111
|
scanAndReturnEntities(text: string, options?: {
|
|
75
|
-
encodeFn?: (val: string) => Promise<string>;
|
|
112
|
+
encodeFn?: (val: string, options?: any) => Promise<string>;
|
|
76
113
|
pipeline?: string[];
|
|
77
114
|
confidenceThreshold?: number;
|
|
78
115
|
context?: string | null;
|
|
@@ -106,7 +143,7 @@ declare class LocalTransformersScanner extends BaseScanner {
|
|
|
106
143
|
* Map Transformer entity types to Mask internal entity types.
|
|
107
144
|
*/
|
|
108
145
|
private _mapEntityType;
|
|
109
|
-
protected _tier2Nlp(text: string, encodeFn: (val: string) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
146
|
+
protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
110
147
|
/**
|
|
111
148
|
* Merges sub-tokens and entities of the same type while precisely tracking
|
|
112
149
|
* offsets in the original text.
|
|
@@ -265,6 +302,183 @@ declare class MaskClient {
|
|
|
265
302
|
close(): Promise<void>;
|
|
266
303
|
}
|
|
267
304
|
|
|
305
|
+
/**
|
|
306
|
+
* Language Context Resolver — Unicode-block heuristic for multilingual DLP.
|
|
307
|
+
*
|
|
308
|
+
* Examines the character distribution of an input buffer to infer the
|
|
309
|
+
* dominant script / language. The resolved language tag is consumed by
|
|
310
|
+
* the DLPPatternRegistry to prioritise locale-specific regex groups.
|
|
311
|
+
*
|
|
312
|
+
* Supported language tags:
|
|
313
|
+
* en — English (default / Latin-only fallback)
|
|
314
|
+
* es — Spanish
|
|
315
|
+
* fr — French
|
|
316
|
+
* de — German
|
|
317
|
+
* tr — Turkish
|
|
318
|
+
* ar — Arabic
|
|
319
|
+
* zh — Chinese
|
|
320
|
+
* ja — Japanese
|
|
321
|
+
*/
|
|
322
|
+
type LanguageTag = "en" | "es" | "fr" | "de" | "tr" | "ar" | "zh" | "ja";
|
|
323
|
+
interface LanguageBreakdown {
|
|
324
|
+
language: LanguageTag;
|
|
325
|
+
breakdown: Record<string, number>;
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Determine the dominant language of a text buffer.
|
|
329
|
+
*
|
|
330
|
+
* The resolver is stateless and safe for concurrent use.
|
|
331
|
+
*
|
|
332
|
+
* @example
|
|
333
|
+
* ```ts
|
|
334
|
+
* const resolver = new LanguageContextResolver();
|
|
335
|
+
* const tag = resolver.resolve("Merhaba, TC Kimlik Numaram 12345678901");
|
|
336
|
+
* // tag === "tr"
|
|
337
|
+
* ```
|
|
338
|
+
*/
|
|
339
|
+
declare class LanguageContextResolver {
|
|
340
|
+
/** Minimum number of script-specific characters required. */
|
|
341
|
+
private readonly charThreshold;
|
|
342
|
+
constructor(charThreshold?: number);
|
|
343
|
+
/** Return an ISO-639-1 language tag for the given text. Falls back to "en". */
|
|
344
|
+
resolve(text: string): LanguageTag;
|
|
345
|
+
/** Return the language tag together with per-script hit counts. */
|
|
346
|
+
resolveWithDetail(text: string): LanguageBreakdown;
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* DLP Pattern Registry — Centralised catalogue of 50+ sensitive-data signatures.
|
|
351
|
+
*
|
|
352
|
+
* Each entry bundles a compiled regex, a list of proximity keywords (used by
|
|
353
|
+
* the scorer for context boosting), a base leakage-risk probability, and an
|
|
354
|
+
* optional hard-validator tag that tells the DLPValidationEngine which
|
|
355
|
+
* checksum to run after the initial pattern match.
|
|
356
|
+
*
|
|
357
|
+
* Patterns are organised into SensitiveCategory groups so that callers can
|
|
358
|
+
* selectively load only the groups relevant to their compliance scope.
|
|
359
|
+
*/
|
|
360
|
+
|
|
361
|
+
declare enum SensitiveCategory {
|
|
362
|
+
FINANCIAL = "FINANCIAL",
|
|
363
|
+
CONTACT = "CONTACT",
|
|
364
|
+
PERSONAL = "PERSONAL",
|
|
365
|
+
HEALTHCARE = "HEALTHCARE",
|
|
366
|
+
IDENTITY_US = "IDENTITY_US",
|
|
367
|
+
IDENTITY_INTL = "IDENTITY_INTL",
|
|
368
|
+
VEHICLE = "VEHICLE",
|
|
369
|
+
CORPORATE = "CORPORATE"
|
|
370
|
+
}
|
|
371
|
+
interface PatternDescriptor {
|
|
372
|
+
compiledRe: RegExp;
|
|
373
|
+
proximityTerms: ReadonlySet<string>;
|
|
374
|
+
baseRisk: number;
|
|
375
|
+
category: SensitiveCategory;
|
|
376
|
+
validatorTag: string | null;
|
|
377
|
+
isHighEntropy: boolean;
|
|
378
|
+
supportedLocales: string[];
|
|
379
|
+
}
|
|
380
|
+
/**
|
|
381
|
+
* Immutable catalogue of sensitive-data regex signatures.
|
|
382
|
+
*/
|
|
383
|
+
declare class DLPPatternRegistry {
|
|
384
|
+
private readonly catalogue;
|
|
385
|
+
private readonly localeCategoryRegexMap;
|
|
386
|
+
constructor(loadGroups?: ReadonlySet<SensitiveCategory>);
|
|
387
|
+
get typeNames(): string[];
|
|
388
|
+
/** Yield [typeName, descriptor] pairs. */
|
|
389
|
+
iterDescriptors(): IterableIterator<[string, PatternDescriptor]>;
|
|
390
|
+
descriptorFor(typeName: string): PatternDescriptor | undefined;
|
|
391
|
+
namePatternsFor(lang: LanguageTag | string): RegExp[];
|
|
392
|
+
addressPatternsFor(lang: LanguageTag | string): RegExp[];
|
|
393
|
+
getCategoryRegexesMap(locale?: string): Map<string, {
|
|
394
|
+
re: RegExp;
|
|
395
|
+
typeOrder: string[];
|
|
396
|
+
}>;
|
|
397
|
+
getCategoryTypeMap(categoryName: string, locale?: string): string[];
|
|
398
|
+
private compileForLocale;
|
|
399
|
+
private buildCatalogue;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
/**
|
|
403
|
+
* Run the appropriate hard-validator for a given validator tag.
|
|
404
|
+
*
|
|
405
|
+
* @example
|
|
406
|
+
* ```ts
|
|
407
|
+
* const engine = new DLPValidationEngine();
|
|
408
|
+
* const passed = engine.run("luhn", "4111111111111111");
|
|
409
|
+
* ```
|
|
410
|
+
*/
|
|
411
|
+
declare class DLPValidationEngine {
|
|
412
|
+
/**
|
|
413
|
+
* Execute the validator identified by tag.
|
|
414
|
+
*
|
|
415
|
+
* @returns `true` — value passed checksum → confidence override.
|
|
416
|
+
* @returns `false` — value failed → confidence penalty.
|
|
417
|
+
* @returns `null` — no validator registered for tag.
|
|
418
|
+
*/
|
|
419
|
+
run(tag: string | null | undefined, rawValue: string): boolean | null;
|
|
420
|
+
/** Return all registered validator tag names. */
|
|
421
|
+
static availableTags(): string[];
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
/**
|
|
425
|
+
* DLP Confidence Scorer — Proximity-weighted scoring for sensitive data matches.
|
|
426
|
+
*
|
|
427
|
+
* Combines three independent signals into a single confidence value:
|
|
428
|
+
* 1. Base risk — intrinsic leakage probability of the data type.
|
|
429
|
+
* 2. Proximity boost — logarithmic-decay bonus for each context keyword
|
|
430
|
+
* found near the match within a configurable window.
|
|
431
|
+
* 3. Validator override — hard-validator pass forces confidence to 0.99.
|
|
432
|
+
*
|
|
433
|
+
* The scorer is stateless and safe for concurrent use.
|
|
434
|
+
*/
|
|
435
|
+
interface ScorerConfig {
|
|
436
|
+
contextWindow?: number;
|
|
437
|
+
keywordBoost?: number;
|
|
438
|
+
validatorOverride?: number;
|
|
439
|
+
maxConfidence?: number;
|
|
440
|
+
penaltyFactor?: number;
|
|
441
|
+
}
|
|
442
|
+
interface ScoreInput {
|
|
443
|
+
baseRisk: number;
|
|
444
|
+
matchStart: number;
|
|
445
|
+
matchEnd: number;
|
|
446
|
+
fullText: string;
|
|
447
|
+
proximityTerms: ReadonlySet<string>;
|
|
448
|
+
validatorPassed: boolean | null;
|
|
449
|
+
}
|
|
450
|
+
/**
|
|
451
|
+
* Calculate a weighted confidence score for a single regex hit.
|
|
452
|
+
*
|
|
453
|
+
* @example
|
|
454
|
+
* ```ts
|
|
455
|
+
* const scorer = new DLPConfidenceScorer();
|
|
456
|
+
* const score = scorer.score({
|
|
457
|
+
* baseRisk: 0.92,
|
|
458
|
+
* matchStart: 14,
|
|
459
|
+
* matchEnd: 25,
|
|
460
|
+
* fullText: "TC Kimlik No: 10000000146",
|
|
461
|
+
* proximityTerms: new Set(["kimlik", "tc"]),
|
|
462
|
+
* validatorPassed: true,
|
|
463
|
+
* });
|
|
464
|
+
* // score === 0.99 (validator override)
|
|
465
|
+
* ```
|
|
466
|
+
*/
|
|
467
|
+
declare class DLPConfidenceScorer {
|
|
468
|
+
private readonly window;
|
|
469
|
+
private readonly kwBoost;
|
|
470
|
+
private readonly valOverride;
|
|
471
|
+
private readonly ceil;
|
|
472
|
+
private readonly penalty;
|
|
473
|
+
constructor(overrides?: ScorerConfig);
|
|
474
|
+
/**
|
|
475
|
+
* Compute the final confidence for one candidate match.
|
|
476
|
+
*
|
|
477
|
+
* @returns Confidence in [0.0, maxConfidence].
|
|
478
|
+
*/
|
|
479
|
+
score(input: ScoreInput): number;
|
|
480
|
+
}
|
|
481
|
+
|
|
268
482
|
/**
|
|
269
483
|
* Mask Privacy SDK
|
|
270
484
|
* Just-In-Time Privacy Middleware for AI Agents.
|
|
@@ -298,4 +512,4 @@ declare function ascanAndTokenize(text: string, options?: {
|
|
|
298
512
|
*/
|
|
299
513
|
declare function secureTool(...args: any[]): any;
|
|
300
514
|
|
|
301
|
-
export { BaseScanner, LocalTransformersScanner, MaskClient, MaskDecryptionError, MaskError, MaskNLPTimeout, MaskSecurityError, MaskVaultConnectionError, PresidioScanner, VERSION, adecode, adetokenizeText, aencode, ascanAndTokenize, decode, detectEntitiesWithConfidence, detokenizeText, encode, generateFPEToken, getScanner, getVault, looksLikeToken, resetMasterKey, secureTool };
|
|
515
|
+
export { BaseScanner, DLPConfidenceScorer, DLPPatternRegistry, DLPValidationEngine, LanguageContextResolver, LocalTransformersScanner, MaskClient, MaskDecryptionError, MaskError, MaskNLPTimeout, MaskSecurityError, MaskVaultConnectionError, PresidioScanner, SensitiveCategory, VERSION, adecode, adetokenizeText, aencode, ascanAndTokenize, decode, detectEntitiesWithConfidence, detokenizeText, encode, generateFPEToken, getScanner, getVault, looksLikeToken, resetMasterKey, secureTool };
|