mask-privacy 3.3.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -16
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +90 -63
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +90 -63
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/core/dlp/registry.ts +41 -16
- package/src/core/scanner.ts +67 -47
- package/src/index.ts +1 -1
package/README.md
CHANGED
|
@@ -127,27 +127,18 @@ Performance-sensitive deployments utilize the built-in `LocalTransformersScanner
|
|
|
127
127
|
### 7. Sub-string Detokenization
|
|
128
128
|
Mask includes the ability to detokenize PII embedded within larger text blocks (like email bodies or chat messages). `detokenizeText()` uses high-performance regex to find and restore all tokens within a paragraph before they hit your tools.
|
|
129
129
|
|
|
130
|
-
## Multilingual PII Detection (Waterfall
|
|
130
|
+
## Multilingual PII Detection (2-Tier Waterfall)
|
|
131
131
|
|
|
132
|
-
Mask is built for the global enterprise. The TypeScript SDK implements a **
|
|
133
|
-
|
|
134
|
-
### Supported Language Matrix
|
|
135
|
-
|
|
136
|
-
Mask provides first-class support for the following languages:
|
|
137
|
-
|
|
138
|
-
| Language | Code | Tier 0 (DLP) | Tier 2 (NLP Engine) |
|
|
139
|
-
| :--- | :--- | :--- | :--- |
|
|
140
|
-
| **English** | `en` | ✅ Full | DistilBERT (Simple) |
|
|
141
|
-
| **Spanish** | `es` | ✅ Full | BERT Multilingual |
|
|
132
|
+
Mask is built for the global enterprise. The TypeScript SDK implements a **2-Tier Model-Augmented Waterfall** strategy for high-precision PII detection in **English and Spanish**.
|
|
142
133
|
|
|
143
134
|
### How the Waterfall Works: The Excising Mechanism
|
|
144
135
|
|
|
145
|
-
To maintain high performance, the TypeScript SDK does not simply run
|
|
136
|
+
To maintain high performance, the TypeScript SDK does not simply run multiple separate scans. It uses a **Sequential Mutation** strategy:
|
|
146
137
|
|
|
147
|
-
1. **Tier 0
|
|
148
|
-
2. **Immediate Tokenization:** Any PII found by
|
|
149
|
-
3. **Tier
|
|
150
|
-
4. **Bypass Logic:** All tiers are "token-aware." If a scan
|
|
138
|
+
1. **Tier 0: Deterministic (The Registry):** The SDK first runs the high-speed DLP and Registry engines. These use regex + checksums (Luhn, Mod-97, Mod-11) + Proximity Keywords to identify structured PII (Bank Accounts, SSNs, DNI, NUSS, etc.) with 100% precision.
|
|
139
|
+
2. **Immediate Tokenization:** Any PII found by Tier 0 is **immediately replaced** by a token in the string buffer.
|
|
140
|
+
3. **Tier 1: Probabilistic (Neural NER):** The expensive NLP engine (Transformers.js) only scans the *remaining* text for unstructured entities: **PERSON**, **LOCATION**, and **ORGANIZATION**. Because Tier 0 PII has already been "excised", the NLP engine doesn't waste compute on data already identified, and entity collisions are avoided.
|
|
141
|
+
4. **Bypass Logic:** All tiers are "token-aware." If a scan encounters a string that is already a Mask token, it skips it entirely.
|
|
151
142
|
|
|
152
143
|
---
|
|
153
144
|
|
package/dist/index.d.mts
CHANGED
|
@@ -388,7 +388,7 @@ declare class DLPPatternRegistry {
|
|
|
388
388
|
getCategoryRegexesMap(locale?: string): Map<string, {
|
|
389
389
|
re: RegExp;
|
|
390
390
|
typeOrder: string[];
|
|
391
|
-
}>;
|
|
391
|
+
}[]>;
|
|
392
392
|
getCategoryTypeMap(categoryName: string, locale?: string): string[];
|
|
393
393
|
private compileForLocale;
|
|
394
394
|
private buildCatalogue;
|
|
@@ -481,7 +481,7 @@ declare class DLPConfidenceScorer {
|
|
|
481
481
|
* Provides format-preserving encryption, local/distributed vaulting,
|
|
482
482
|
* and framework-agnostic tool interception hooks.
|
|
483
483
|
*/
|
|
484
|
-
declare const VERSION = "3.
|
|
484
|
+
declare const VERSION = "3.5.0";
|
|
485
485
|
|
|
486
486
|
/**
|
|
487
487
|
* Detect PII entities in text and return a list of objects with metadata.
|
package/dist/index.d.ts
CHANGED
|
@@ -388,7 +388,7 @@ declare class DLPPatternRegistry {
|
|
|
388
388
|
getCategoryRegexesMap(locale?: string): Map<string, {
|
|
389
389
|
re: RegExp;
|
|
390
390
|
typeOrder: string[];
|
|
391
|
-
}>;
|
|
391
|
+
}[]>;
|
|
392
392
|
getCategoryTypeMap(categoryName: string, locale?: string): string[];
|
|
393
393
|
private compileForLocale;
|
|
394
394
|
private buildCatalogue;
|
|
@@ -481,7 +481,7 @@ declare class DLPConfidenceScorer {
|
|
|
481
481
|
* Provides format-preserving encryption, local/distributed vaulting,
|
|
482
482
|
* and framework-agnostic tool interception hooks.
|
|
483
483
|
*/
|
|
484
|
-
declare const VERSION = "3.
|
|
484
|
+
declare const VERSION = "3.5.0";
|
|
485
485
|
|
|
486
486
|
/**
|
|
487
487
|
* Detect PII entities in text and return a list of objects with metadata.
|
package/dist/index.js
CHANGED
|
@@ -43855,7 +43855,8 @@ var init_registry = __esm({
|
|
|
43855
43855
|
return this.localeCategoryRegexMap.get(locale);
|
|
43856
43856
|
}
|
|
43857
43857
|
getCategoryTypeMap(categoryName, locale = "en") {
|
|
43858
|
-
|
|
43858
|
+
const groups = this.localeCategoryRegexMap.get(locale)?.get(categoryName) ?? [];
|
|
43859
|
+
return groups.flatMap((g6) => g6.typeOrder);
|
|
43859
43860
|
}
|
|
43860
43861
|
compileForLocale(locale) {
|
|
43861
43862
|
const localePool = /* @__PURE__ */ new Map();
|
|
@@ -43874,20 +43875,38 @@ var init_registry = __esm({
|
|
|
43874
43875
|
if (aVal !== bVal) return aVal - bVal;
|
|
43875
43876
|
return b6.compiledRe.source.length - a6.compiledRe.source.length;
|
|
43876
43877
|
});
|
|
43877
|
-
const
|
|
43878
|
-
const
|
|
43878
|
+
const csParts = [];
|
|
43879
|
+
const csOrder = [];
|
|
43880
|
+
const ciParts = [];
|
|
43881
|
+
const ciOrder = [];
|
|
43879
43882
|
for (const [typeName, desc] of entries) {
|
|
43880
|
-
|
|
43881
|
-
|
|
43883
|
+
const named = `(?<${typeName}>${desc.compiledRe.source})`;
|
|
43884
|
+
if (desc.compiledRe.flags.includes("i")) {
|
|
43885
|
+
ciParts.push(named);
|
|
43886
|
+
ciOrder.push(typeName);
|
|
43887
|
+
} else {
|
|
43888
|
+
csParts.push(named);
|
|
43889
|
+
csOrder.push(typeName);
|
|
43890
|
+
}
|
|
43882
43891
|
}
|
|
43883
|
-
const
|
|
43884
|
-
const
|
|
43885
|
-
|
|
43886
|
-
|
|
43887
|
-
|
|
43888
|
-
|
|
43889
|
-
|
|
43890
|
-
|
|
43892
|
+
const groups = [];
|
|
43893
|
+
const subGroups = [
|
|
43894
|
+
[csParts, csOrder, "g"],
|
|
43895
|
+
[ciParts, ciOrder, "gi"]
|
|
43896
|
+
];
|
|
43897
|
+
for (const [parts, order, flags] of subGroups) {
|
|
43898
|
+
if (parts.length === 0) continue;
|
|
43899
|
+
try {
|
|
43900
|
+
groups.push({ re: new RegExp(parts.join("|"), flags), typeOrder: order });
|
|
43901
|
+
} catch (err2) {
|
|
43902
|
+
console.error(
|
|
43903
|
+
`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] (${flags}) failed:`,
|
|
43904
|
+
err2
|
|
43905
|
+
);
|
|
43906
|
+
}
|
|
43907
|
+
}
|
|
43908
|
+
if (groups.length > 0) {
|
|
43909
|
+
categoryMap.set(catKey, groups);
|
|
43891
43910
|
}
|
|
43892
43911
|
}
|
|
43893
43912
|
this.localeCategoryRegexMap.set(locale, categoryMap);
|
|
@@ -59071,6 +59090,11 @@ var init_transformers_scanner = __esm({
|
|
|
59071
59090
|
});
|
|
59072
59091
|
|
|
59073
59092
|
// src/core/scanner.ts
|
|
59093
|
+
async function chunkEncode(items, fn) {
|
|
59094
|
+
for (let i6 = 0; i6 < items.length; i6 += CHUNK_SIZE) {
|
|
59095
|
+
await Promise.all(items.slice(i6, i6 + CHUNK_SIZE).map(fn));
|
|
59096
|
+
}
|
|
59097
|
+
}
|
|
59074
59098
|
function getScanner() {
|
|
59075
59099
|
if (scannerInstance === null) {
|
|
59076
59100
|
const scannerType = config.MASK_SCANNER_TYPE;
|
|
@@ -59084,7 +59108,7 @@ function getScanner() {
|
|
|
59084
59108
|
}
|
|
59085
59109
|
return scannerInstance;
|
|
59086
59110
|
}
|
|
59087
|
-
var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer; exports.BaseScanner = void 0; exports.PresidioScanner = void 0; var scannerInstance;
|
|
59111
|
+
var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer, CHUNK_SIZE; exports.BaseScanner = void 0; exports.PresidioScanner = void 0; var scannerInstance;
|
|
59088
59112
|
var init_scanner = __esm({
|
|
59089
59113
|
"src/core/scanner.ts"() {
|
|
59090
59114
|
init_config();
|
|
@@ -59099,6 +59123,7 @@ var init_scanner = __esm({
|
|
|
59099
59123
|
_dlpPatternRegistry = new exports.DLPPatternRegistry();
|
|
59100
59124
|
_dlpValidationEngine = new exports.DLPValidationEngine();
|
|
59101
59125
|
_dlpConfidenceScorer = new exports.DLPConfidenceScorer();
|
|
59126
|
+
CHUNK_SIZE = 50;
|
|
59102
59127
|
exports.BaseScanner = class {
|
|
59103
59128
|
constructor() {
|
|
59104
59129
|
this._supportedEntities = [
|
|
@@ -59137,51 +59162,53 @@ var init_scanner = __esm({
|
|
|
59137
59162
|
const detectedLanguage = _dlpLanguageResolver.resolve(text);
|
|
59138
59163
|
const spans = [];
|
|
59139
59164
|
const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
|
|
59140
|
-
for (const [catKey,
|
|
59141
|
-
const
|
|
59142
|
-
|
|
59143
|
-
|
|
59144
|
-
|
|
59145
|
-
|
|
59146
|
-
|
|
59147
|
-
|
|
59148
|
-
|
|
59149
|
-
|
|
59165
|
+
for (const [catKey, groups] of categoryMap.entries()) {
|
|
59166
|
+
for (const { re, typeOrder } of groups) {
|
|
59167
|
+
const megaRe = new RegExp(re.source, re.flags);
|
|
59168
|
+
let m6;
|
|
59169
|
+
while ((m6 = megaRe.exec(text)) !== null) {
|
|
59170
|
+
const groups2 = m6.groups ?? {};
|
|
59171
|
+
let typeTag;
|
|
59172
|
+
for (const name of typeOrder) {
|
|
59173
|
+
if (groups2[name] !== void 0) {
|
|
59174
|
+
typeTag = name;
|
|
59175
|
+
break;
|
|
59176
|
+
}
|
|
59150
59177
|
}
|
|
59151
|
-
|
|
59152
|
-
|
|
59153
|
-
|
|
59154
|
-
|
|
59155
|
-
|
|
59156
|
-
|
|
59157
|
-
|
|
59158
|
-
|
|
59159
|
-
|
|
59160
|
-
|
|
59161
|
-
|
|
59178
|
+
if (!typeTag) continue;
|
|
59179
|
+
const matchedStr = m6[0];
|
|
59180
|
+
if (looksLikeToken(matchedStr)) continue;
|
|
59181
|
+
const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
|
|
59182
|
+
if (!descriptor) continue;
|
|
59183
|
+
const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
|
|
59184
|
+
let conf;
|
|
59185
|
+
if (validatorResult === false) {
|
|
59186
|
+
if (descriptor.isHighEntropy) {
|
|
59187
|
+
conf = 0.85;
|
|
59188
|
+
} else {
|
|
59189
|
+
continue;
|
|
59190
|
+
}
|
|
59162
59191
|
} else {
|
|
59163
|
-
|
|
59192
|
+
conf = _dlpConfidenceScorer.score({
|
|
59193
|
+
baseRisk: descriptor.baseRisk,
|
|
59194
|
+
matchStart: m6.index,
|
|
59195
|
+
matchEnd: m6.index + matchedStr.length,
|
|
59196
|
+
fullText: text,
|
|
59197
|
+
proximityTerms: descriptor.proximityTerms,
|
|
59198
|
+
validatorPassed: validatorResult
|
|
59199
|
+
});
|
|
59200
|
+
}
|
|
59201
|
+
if (conf >= confidenceThreshold) {
|
|
59202
|
+
spans.push({
|
|
59203
|
+
start: m6.index,
|
|
59204
|
+
end: m6.index + matchedStr.length,
|
|
59205
|
+
entityType: typeTag,
|
|
59206
|
+
originalValue: matchedStr,
|
|
59207
|
+
confidence: conf,
|
|
59208
|
+
method: "dlp_heuristic",
|
|
59209
|
+
language: detectedLanguage
|
|
59210
|
+
});
|
|
59164
59211
|
}
|
|
59165
|
-
} else {
|
|
59166
|
-
conf = _dlpConfidenceScorer.score({
|
|
59167
|
-
baseRisk: descriptor.baseRisk,
|
|
59168
|
-
matchStart: m6.index,
|
|
59169
|
-
matchEnd: m6.index + matchedStr.length,
|
|
59170
|
-
fullText: text,
|
|
59171
|
-
proximityTerms: descriptor.proximityTerms,
|
|
59172
|
-
validatorPassed: validatorResult
|
|
59173
|
-
});
|
|
59174
|
-
}
|
|
59175
|
-
if (conf >= confidenceThreshold) {
|
|
59176
|
-
spans.push({
|
|
59177
|
-
start: m6.index,
|
|
59178
|
-
end: m6.index + matchedStr.length,
|
|
59179
|
-
entityType: typeTag,
|
|
59180
|
-
originalValue: matchedStr,
|
|
59181
|
-
confidence: conf,
|
|
59182
|
-
method: "dlp_heuristic",
|
|
59183
|
-
language: detectedLanguage
|
|
59184
|
-
});
|
|
59185
59212
|
}
|
|
59186
59213
|
}
|
|
59187
59214
|
}
|
|
@@ -59261,11 +59288,11 @@ var init_scanner = __esm({
|
|
|
59261
59288
|
}
|
|
59262
59289
|
_resolveBoost(context) {
|
|
59263
59290
|
if (!context) return /* @__PURE__ */ new Set();
|
|
59264
|
-
const lowered = context.toLowerCase();
|
|
59265
59291
|
const boosted = /* @__PURE__ */ new Set();
|
|
59266
59292
|
for (const [, desc] of _dlpPatternRegistry.iterDescriptors()) {
|
|
59267
59293
|
for (const term of desc.proximityTerms) {
|
|
59268
|
-
|
|
59294
|
+
const pattern = new RegExp("\\b" + term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i");
|
|
59295
|
+
if (pattern.test(context)) {
|
|
59269
59296
|
boosted.add(desc.category.toLowerCase());
|
|
59270
59297
|
break;
|
|
59271
59298
|
}
|
|
@@ -59284,9 +59311,9 @@ var init_scanner = __esm({
|
|
|
59284
59311
|
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59285
59312
|
}
|
|
59286
59313
|
const resolved = resolveOverlaps(allSpans);
|
|
59287
|
-
await
|
|
59314
|
+
await chunkEncode(resolved, async (span) => {
|
|
59288
59315
|
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
59289
|
-
})
|
|
59316
|
+
});
|
|
59290
59317
|
let currentText = reconstruct(text, resolved);
|
|
59291
59318
|
if (pipeline.includes("nlp")) {
|
|
59292
59319
|
[currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
@@ -59305,7 +59332,7 @@ var init_scanner = __esm({
|
|
|
59305
59332
|
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59306
59333
|
}
|
|
59307
59334
|
const resolved = resolveOverlaps(allSpans);
|
|
59308
|
-
await
|
|
59335
|
+
await chunkEncode(resolved, async (span) => {
|
|
59309
59336
|
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
59310
59337
|
allEntities.push({
|
|
59311
59338
|
type: span.entityType,
|
|
@@ -59315,7 +59342,7 @@ var init_scanner = __esm({
|
|
|
59315
59342
|
masked_value: span.maskedValue,
|
|
59316
59343
|
language: span.language
|
|
59317
59344
|
});
|
|
59318
|
-
})
|
|
59345
|
+
});
|
|
59319
59346
|
const remaining = reconstruct(text, resolved);
|
|
59320
59347
|
if (pipeline.includes("nlp")) {
|
|
59321
59348
|
const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
@@ -59619,7 +59646,7 @@ init_handlers();
|
|
|
59619
59646
|
init_scorer();
|
|
59620
59647
|
|
|
59621
59648
|
// src/index.ts
|
|
59622
|
-
var VERSION = "3.
|
|
59649
|
+
var VERSION = "3.5.0";
|
|
59623
59650
|
async function detectEntitiesWithConfidence(text, options = {}) {
|
|
59624
59651
|
const scanner = getScanner();
|
|
59625
59652
|
return await scanner.scanAndReturnEntities(text, options);
|