mask-privacy 3.2.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -16
- package/dist/index.d.mts +3 -2
- package/dist/index.d.ts +3 -2
- package/dist/index.js +85 -91
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +85 -91
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/core/dlp/handlers.ts +48 -0
- package/src/core/dlp/registry.ts +13 -7
- package/src/core/scanner.ts +17 -59
- package/src/core/transformers_scanner.ts +2 -0
- package/src/index.ts +1 -1
- package/tests/scanner.test.ts +16 -11
- package/tests/test_cross.ts +1 -1
- package/tsconfig.json +2 -2
package/README.md
CHANGED
|
@@ -127,27 +127,18 @@ Performance-sensitive deployments utilize the built-in `LocalTransformersScanner
|
|
|
127
127
|
### 7. Sub-string Detokenization
|
|
128
128
|
Mask can detokenize PII embedded within larger text blocks (such as email bodies or chat messages). `detokenizeText()` uses a high-performance regular expression to find every token in a block of text and restore the original values before the text reaches your tools.
|
|
129
129
|
|
|
130
|
-
## Multilingual PII Detection (Waterfall
|
|
130
|
+
## Multilingual PII Detection (2-Tier Waterfall)
|
|
131
131
|
|
|
132
|
-
Mask is built for the global enterprise. The TypeScript SDK implements a **
|
|
133
|
-
|
|
134
|
-
### Supported Language Matrix
|
|
135
|
-
|
|
136
|
-
Mask provides first-class support for the following languages:
|
|
137
|
-
|
|
138
|
-
| Language | Code | Tier 0 (DLP) | Tier 2 (NLP Engine) |
|
|
139
|
-
| :--- | :--- | :--- | :--- |
|
|
140
|
-
| **English** | `en` | ✅ Full | DistilBERT (Simple) |
|
|
141
|
-
| **Spanish** | `es` | ✅ Full | BERT Multilingual |
|
|
132
|
+
Mask is built for the global enterprise. The TypeScript SDK implements a **2-Tier Model-Augmented Waterfall** strategy for high-precision PII detection in **English and Spanish**.
|
|
142
133
|
|
|
143
134
|
### How the Waterfall Works: The Excising Mechanism
|
|
144
135
|
|
|
145
|
-
To maintain high performance, the TypeScript SDK does not simply run
|
|
136
|
+
To maintain high performance, the TypeScript SDK does not simply run multiple separate scans. It uses a **Sequential Mutation** strategy:
|
|
146
137
|
|
|
147
|
-
1. **Tier 0
|
|
148
|
-
2. **Immediate Tokenization:** Any PII found by
|
|
149
|
-
3. **Tier
|
|
150
|
-
4. **Bypass Logic:** All tiers are "token-aware." If a scan
|
|
138
|
+
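1. **Tier 0: Deterministic (The Registry):** The SDK first runs the high-speed DLP and Registry engines. These combine regular expressions, checksum validation (Luhn, Mod-97, Mod-11), and proximity keywords to identify structured PII (bank accounts, SSNs, DNI, NUSS, etc.) with very high precision. Each match still carries a confidence score that is compared against the configured confidence threshold.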
|
|
139
|
+
2. **Immediate Tokenization:** Any PII found by Tier 0 is **immediately replaced** by a token in the string buffer.
|
|
140
|
+
3. **Tier 1: Probabilistic (Neural NER):** The expensive NLP engine (Transformers.js) only scans the *remaining* text for unstructured entities: **PERSON**, **LOCATION**, and **ORGANIZATION**. Because Tier 0 PII has already been "excised", the NLP engine doesn't waste compute on data already identified, and entity collisions are avoided.
|
|
141
|
+
4. **Bypass Logic:** All tiers are "token-aware." If a scan encounters a string that is already a Mask token, it skips it entirely.
|
|
151
142
|
|
|
152
143
|
---
|
|
153
144
|
|
package/dist/index.d.mts
CHANGED
|
@@ -96,8 +96,9 @@ declare class BaseScanner {
|
|
|
96
96
|
protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
|
|
97
97
|
/** Backward-compat wrapper — collects spans then single-pass encodes. */
|
|
98
98
|
protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
99
|
+
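/** Legacy Tier 1 span collector, kept for backward compatibility. The base implementation delegates to the Tier 0 DLP collector (_tier0CollectSpans) and ignores boostEntities and aggressive. */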
|
|
99
100
|
protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
|
|
100
|
-
/** Backward-compat wrapper. */
|
|
101
|
+
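/** Backward-compat wrapper for the legacy Tier 1 regex pass. The base implementation delegates to _tier0Dlp and ignores boostEntities and aggressive. */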
|
|
101
102
|
protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
102
103
|
protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
103
104
|
protected _resolveBoost(context?: string | null): Set<string>;
|
|
@@ -480,7 +481,7 @@ declare class DLPConfidenceScorer {
|
|
|
480
481
|
* Provides format-preserving encryption, local/distributed vaulting,
|
|
481
482
|
* and framework-agnostic tool interception hooks.
|
|
482
483
|
*/
|
|
483
|
-
declare const VERSION = "
|
|
484
|
+
declare const VERSION = "3.4.0";
|
|
484
485
|
|
|
485
486
|
/**
|
|
486
487
|
* Detect PII entities in text and return a list of objects with metadata.
|
package/dist/index.d.ts
CHANGED
|
@@ -96,8 +96,9 @@ declare class BaseScanner {
|
|
|
96
96
|
protected _tier0CollectSpans(text: string, confidenceThreshold: number): Promise<Span[]>;
|
|
97
97
|
/** Backward-compat wrapper — collects spans then single-pass encodes. */
|
|
98
98
|
protected _tier0Dlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
99
|
+
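/** Legacy Tier 1 span collector, kept for backward compatibility. The base implementation delegates to the Tier 0 DLP collector (_tier0CollectSpans) and ignores boostEntities and aggressive. */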
|
|
99
100
|
protected _tier1CollectSpans(text: string, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<Span[]>;
|
|
100
|
-
/** Backward-compat wrapper. */
|
|
101
|
+
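/** Backward-compat wrapper for the legacy Tier 1 regex pass. The base implementation delegates to _tier0Dlp and ignores boostEntities and aggressive. */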
|
|
101
102
|
protected _tier1Regex(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
102
103
|
protected _tier2Nlp(text: string, encodeFn: (val: string, options?: any) => Promise<string>, boostEntities: Set<string>, aggressive: boolean, confidenceThreshold: number): Promise<[string, any[]]>;
|
|
103
104
|
protected _resolveBoost(context?: string | null): Set<string>;
|
|
@@ -480,7 +481,7 @@ declare class DLPConfidenceScorer {
|
|
|
480
481
|
* Provides format-preserving encryption, local/distributed vaulting,
|
|
481
482
|
* and framework-agnostic tool interception hooks.
|
|
482
483
|
*/
|
|
483
|
-
declare const VERSION = "
|
|
484
|
+
declare const VERSION = "3.4.0";
|
|
484
485
|
|
|
485
486
|
/**
|
|
486
487
|
* Detect PII entities in text and return a list of objects with metadata.
|
package/dist/index.js
CHANGED
|
@@ -43580,7 +43580,7 @@ var init_registry = __esm({
|
|
|
43580
43580
|
[
|
|
43581
43581
|
"CREDIT_CARD_NUMBER",
|
|
43582
43582
|
"\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
|
|
43583
|
-
["card", "credit", "visa", "mastercard", "amex", "payment"],
|
|
43583
|
+
["card", "credit", "visa", "mastercard", "amex", "payment", "tarjeta", "credito", "debito", "pago"],
|
|
43584
43584
|
0.97,
|
|
43585
43585
|
"FINANCIAL" /* FINANCIAL */,
|
|
43586
43586
|
"luhn"
|
|
@@ -43588,7 +43588,7 @@ var init_registry = __esm({
|
|
|
43588
43588
|
[
|
|
43589
43589
|
"INTL_BANK_IBAN",
|
|
43590
43590
|
"\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
|
|
43591
|
-
["iban", "swift", "sepa", "wire", "bank transfer"],
|
|
43591
|
+
["iban", "swift", "sepa", "wire", "bank transfer", "cuenta", "banco", "transferencia"],
|
|
43592
43592
|
0.96,
|
|
43593
43593
|
"FINANCIAL" /* FINANCIAL */,
|
|
43594
43594
|
"iban"
|
|
@@ -43625,6 +43625,16 @@ var init_registry = __esm({
|
|
|
43625
43625
|
"FINANCIAL" /* FINANCIAL */,
|
|
43626
43626
|
"luhn_soft"
|
|
43627
43627
|
],
|
|
43628
|
+
[
|
|
43629
|
+
"ES_CCC",
|
|
43630
|
+
"\\b\\d{4}[-\\s]?\\d{4}[-\\s]?\\d{2}[-\\s]?\\d{10}\\b",
|
|
43631
|
+
["cuenta", "ccc", "banco", "sucursal", "entidad", "codigo cuenta cliente"],
|
|
43632
|
+
0.9,
|
|
43633
|
+
"FINANCIAL" /* FINANCIAL */,
|
|
43634
|
+
"es_ccc",
|
|
43635
|
+
true,
|
|
43636
|
+
["*", "es"]
|
|
43637
|
+
],
|
|
43628
43638
|
[
|
|
43629
43639
|
"SWIFT_BIC",
|
|
43630
43640
|
"\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
|
|
@@ -43637,15 +43647,15 @@ var init_registry = __esm({
|
|
|
43637
43647
|
[
|
|
43638
43648
|
"EMAIL_ADDR",
|
|
43639
43649
|
"\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
|
|
43640
|
-
["email", "mail", "contact", "address"],
|
|
43650
|
+
["email", "mail", "contact", "address", "correo", "electronico"],
|
|
43641
43651
|
0.99,
|
|
43642
43652
|
"CONTACT" /* CONTACT */,
|
|
43643
43653
|
null
|
|
43644
43654
|
],
|
|
43645
43655
|
[
|
|
43646
43656
|
"PHONE_NUM",
|
|
43647
|
-
/(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
|
|
43648
|
-
["phone", "call", "mobile", "tel", "whatsapp", "number"],
|
|
43657
|
+
/(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
|
|
43658
|
+
["phone", "call", "mobile", "tel", "whatsapp", "number", "tel\xE9fono", "telefono", "movil", "celular", "llamada"],
|
|
43649
43659
|
0.8,
|
|
43650
43660
|
"CONTACT" /* CONTACT */,
|
|
43651
43661
|
null
|
|
@@ -43685,8 +43695,8 @@ var init_registry = __esm({
|
|
|
43685
43695
|
// ── PERSONAL ───────────────────────────────────────────────────────
|
|
43686
43696
|
[
|
|
43687
43697
|
"BIRTH_DATE",
|
|
43688
|
-
"\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
|
|
43689
|
-
["birth", "dob", "born", "birthday", "date of birth"],
|
|
43698
|
+
"\\b(?:(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}|(?:19|20)\\d{2}[/-](?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01]))\\b",
|
|
43699
|
+
["birth", "dob", "born", "birthday", "date of birth", "nacimiento", "fecha", "cumplea\xF1os"],
|
|
43690
43700
|
0.88,
|
|
43691
43701
|
"PERSONAL" /* PERSONAL */,
|
|
43692
43702
|
null
|
|
@@ -43793,6 +43803,16 @@ var init_registry = __esm({
|
|
|
43793
43803
|
true,
|
|
43794
43804
|
["*", "es"]
|
|
43795
43805
|
],
|
|
43806
|
+
[
|
|
43807
|
+
"ES_NUSS",
|
|
43808
|
+
"\\b\\d{2}[-\\s]?\\d{8}[-\\s]?\\d{2}\\b",
|
|
43809
|
+
["seguridad social", "nuss", "naf", "afiliacion"],
|
|
43810
|
+
0.9,
|
|
43811
|
+
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43812
|
+
"es_nuss",
|
|
43813
|
+
true,
|
|
43814
|
+
["*", "es"]
|
|
43815
|
+
],
|
|
43796
43816
|
// ── CORPORATE ──────────────────────────────────────────────────────
|
|
43797
43817
|
[
|
|
43798
43818
|
"CORP_EMPLOYEE_ID",
|
|
@@ -44023,6 +44043,38 @@ function checkEsId(raw) {
|
|
|
44023
44043
|
const validLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
|
|
44024
44044
|
return cleaned[8] === validLetters[num % 23];
|
|
44025
44045
|
}
|
|
44046
|
+
function checkEsNuss(raw) {
|
|
44047
|
+
const digits = raw.replace(/\D/g, "");
|
|
44048
|
+
if (digits.length !== 12) return false;
|
|
44049
|
+
const a6 = parseInt(digits.slice(0, 2), 10);
|
|
44050
|
+
const b6 = parseInt(digits.slice(2, 10), 10);
|
|
44051
|
+
const c6 = parseInt(digits.slice(10), 10);
|
|
44052
|
+
let check;
|
|
44053
|
+
if (b6 < 1e7) {
|
|
44054
|
+
check = (a6 * 1e7 + b6) % 97;
|
|
44055
|
+
} else {
|
|
44056
|
+
check = Number(BigInt(digits.slice(0, 10)) % 97n);
|
|
44057
|
+
}
|
|
44058
|
+
return check === c6;
|
|
44059
|
+
}
|
|
44060
|
+
function checkEsCcc(raw) {
|
|
44061
|
+
const digits = raw.replace(/\D/g, "");
|
|
44062
|
+
if (digits.length !== 20) return false;
|
|
44063
|
+
const weights = [1, 2, 4, 8, 5, 10, 9, 7, 3, 6];
|
|
44064
|
+
const calcDigit = (block) => {
|
|
44065
|
+
let s6 = 0;
|
|
44066
|
+
for (let i6 = 0; i6 < block.length; i6++) {
|
|
44067
|
+
s6 += parseInt(block[i6], 10) * weights[i6];
|
|
44068
|
+
}
|
|
44069
|
+
let rem = 11 - s6 % 11;
|
|
44070
|
+
if (rem === 10) return 1;
|
|
44071
|
+
if (rem === 11) return 0;
|
|
44072
|
+
return rem;
|
|
44073
|
+
};
|
|
44074
|
+
const d1 = calcDigit("00" + digits.slice(0, 8));
|
|
44075
|
+
const d22 = calcDigit(digits.slice(10));
|
|
44076
|
+
return parseInt(digits[8], 10) === d1 && parseInt(digits[9], 10) === d22;
|
|
44077
|
+
}
|
|
44026
44078
|
var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, UK_NINO_REGEX, VALIDATOR_DISPATCH; exports.DLPValidationEngine = void 0;
|
|
44027
44079
|
var init_handlers = __esm({
|
|
44028
44080
|
"src/core/dlp/handlers.ts"() {
|
|
@@ -44129,6 +44181,7 @@ var init_handlers = __esm({
|
|
|
44129
44181
|
UK_NINO_REGEX = /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z]{2}[0-9]{6}[A-D]$/;
|
|
44130
44182
|
VALIDATOR_DISPATCH = {
|
|
44131
44183
|
luhn: checkLuhn,
|
|
44184
|
+
luhn_soft: checkLuhn,
|
|
44132
44185
|
ssn_area: checkSsnArea,
|
|
44133
44186
|
iban: checkIbanStructure,
|
|
44134
44187
|
aba_check: checkAbaRouting,
|
|
@@ -44137,7 +44190,9 @@ var init_handlers = __esm({
|
|
|
44137
44190
|
ipv4: checkIpv4Octets,
|
|
44138
44191
|
ca_sin: checkCaSin,
|
|
44139
44192
|
uk_nino: checkUkNino,
|
|
44140
|
-
es_id: checkEsId
|
|
44193
|
+
es_id: checkEsId,
|
|
44194
|
+
es_nuss: checkEsNuss,
|
|
44195
|
+
es_ccc: checkEsCcc
|
|
44141
44196
|
};
|
|
44142
44197
|
exports.DLPValidationEngine = class {
|
|
44143
44198
|
/**
|
|
@@ -58926,6 +58981,7 @@ var init_transformers_scanner = __esm({
|
|
|
58926
58981
|
const end = r6.end;
|
|
58927
58982
|
const val = text.slice(start, end);
|
|
58928
58983
|
const entityType = this._mapEntityType(r6.entity);
|
|
58984
|
+
if (!this._supportedEntities.includes(entityType)) continue;
|
|
58929
58985
|
let confidence = r6.score || 0.7;
|
|
58930
58986
|
if (aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " "))) {
|
|
58931
58987
|
confidence = Math.min(1, confidence + 0.2);
|
|
@@ -59028,7 +59084,7 @@ function getScanner() {
|
|
|
59028
59084
|
}
|
|
59029
59085
|
return scannerInstance;
|
|
59030
59086
|
}
|
|
59031
|
-
var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer
|
|
59087
|
+
var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer; exports.BaseScanner = void 0; exports.PresidioScanner = void 0; var scannerInstance;
|
|
59032
59088
|
var init_scanner = __esm({
|
|
59033
59089
|
"src/core/scanner.ts"() {
|
|
59034
59090
|
init_config();
|
|
@@ -59043,39 +59099,12 @@ var init_scanner = __esm({
|
|
|
59043
59099
|
_dlpPatternRegistry = new exports.DLPPatternRegistry();
|
|
59044
59100
|
_dlpValidationEngine = new exports.DLPValidationEngine();
|
|
59045
59101
|
_dlpConfidenceScorer = new exports.DLPConfidenceScorer();
|
|
59046
|
-
|
|
59047
|
-
"EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
|
|
59048
|
-
"PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
|
|
59049
|
-
"PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
|
|
59050
|
-
"US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
|
|
59051
|
-
"CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
|
|
59052
|
-
"US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
|
|
59053
|
-
"US_PASSPORT": /\b[A-Z]\d{8}\b/g,
|
|
59054
|
-
"DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g
|
|
59055
|
-
};
|
|
59056
|
-
CONTEXT_KEYWORDS = /* @__PURE__ */ new Set([
|
|
59057
|
-
"account number",
|
|
59058
|
-
"ssn",
|
|
59059
|
-
"phone",
|
|
59060
|
-
"credit card",
|
|
59061
|
-
"iban",
|
|
59062
|
-
"bank",
|
|
59063
|
-
"email",
|
|
59064
|
-
"pii",
|
|
59065
|
-
"personal info"
|
|
59066
|
-
]);
|
|
59067
|
-
exports.BaseScanner = class _BaseScanner {
|
|
59102
|
+
exports.BaseScanner = class {
|
|
59068
59103
|
constructor() {
|
|
59069
59104
|
this._supportedEntities = [
|
|
59070
|
-
"
|
|
59071
|
-
"
|
|
59072
|
-
"
|
|
59073
|
-
"CREDIT_CARD",
|
|
59074
|
-
"US_BANK_NUMBER",
|
|
59075
|
-
"CRYPTO",
|
|
59076
|
-
"IBAN_CODE",
|
|
59077
|
-
"IP_ADDRESS",
|
|
59078
|
-
"PERSON"
|
|
59105
|
+
"PERSON",
|
|
59106
|
+
"LOCATION",
|
|
59107
|
+
"ORGANIZATION"
|
|
59079
59108
|
];
|
|
59080
59109
|
}
|
|
59081
59110
|
setSupportedEntities(entities) {
|
|
@@ -59219,47 +59248,13 @@ var init_scanner = __esm({
|
|
|
59219
59248
|
}));
|
|
59220
59249
|
return [reconstruct(text, resolved), entities];
|
|
59221
59250
|
}
|
|
59251
|
+
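/** Legacy Tier 1 span collector, kept for backward compatibility. Delegates to the Tier 0 DLP collector (_tier0CollectSpans); boostEntities and aggressive are ignored. */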
|
|
59222
59252
|
async _tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold) {
|
|
59223
|
-
|
|
59224
|
-
for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
|
|
59225
|
-
const re = new RegExp(pattern.source, pattern.flags);
|
|
59226
|
-
let match;
|
|
59227
|
-
while ((match = re.exec(text)) !== null) {
|
|
59228
|
-
const val = match[0];
|
|
59229
|
-
if (looksLikeToken(val)) continue;
|
|
59230
|
-
let confidence = aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " ")) ? 1 : 0.95;
|
|
59231
|
-
if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
|
|
59232
|
-
if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(val)) continue;
|
|
59233
|
-
if (confidence >= confidenceThreshold) {
|
|
59234
|
-
spans.push({
|
|
59235
|
-
start: match.index,
|
|
59236
|
-
end: match.index + val.length,
|
|
59237
|
-
entityType,
|
|
59238
|
-
originalValue: val,
|
|
59239
|
-
confidence,
|
|
59240
|
-
method: "regex"
|
|
59241
|
-
});
|
|
59242
|
-
}
|
|
59243
|
-
}
|
|
59244
|
-
}
|
|
59245
|
-
return spans;
|
|
59253
|
+
return this._tier0CollectSpans(text, confidenceThreshold);
|
|
59246
59254
|
}
|
|
59247
|
-
/** Backward-compat wrapper. */
|
|
59255
|
+
/** Backward-compat wrapper for the legacy Tier 1 regex pass. Delegates to _tier0Dlp; boostEntities and aggressive are ignored. */
|
|
59248
59256
|
async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
|
|
59249
|
-
|
|
59250
|
-
const resolved = resolveOverlaps(spans);
|
|
59251
|
-
const entities = [];
|
|
59252
|
-
await Promise.all(resolved.map(async (span) => {
|
|
59253
|
-
span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
|
|
59254
|
-
entities.push({
|
|
59255
|
-
type: span.entityType,
|
|
59256
|
-
value: span.originalValue,
|
|
59257
|
-
method: span.method,
|
|
59258
|
-
confidence: span.confidence,
|
|
59259
|
-
masked_value: span.maskedValue
|
|
59260
|
-
});
|
|
59261
|
-
}));
|
|
59262
|
-
return [reconstruct(text, resolved), entities];
|
|
59257
|
+
return this._tier0Dlp(text, encodeFn, confidenceThreshold);
|
|
59263
59258
|
}
|
|
59264
59259
|
async _tier2Nlp(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
|
|
59265
59260
|
return [text, []];
|
|
@@ -59268,24 +59263,26 @@ var init_scanner = __esm({
|
|
|
59268
59263
|
if (!context) return /* @__PURE__ */ new Set();
|
|
59269
59264
|
const lowered = context.toLowerCase();
|
|
59270
59265
|
const boosted = /* @__PURE__ */ new Set();
|
|
59271
|
-
for (const
|
|
59272
|
-
|
|
59266
|
+
for (const [, desc] of _dlpPatternRegistry.iterDescriptors()) {
|
|
59267
|
+
for (const term of desc.proximityTerms) {
|
|
59268
|
+
if (lowered.includes(term)) {
|
|
59269
|
+
boosted.add(desc.category.toLowerCase());
|
|
59270
|
+
break;
|
|
59271
|
+
}
|
|
59272
|
+
}
|
|
59273
59273
|
}
|
|
59274
59274
|
return boosted;
|
|
59275
59275
|
}
|
|
59276
59276
|
async scanAndTokenize(text, options = {}) {
|
|
59277
59277
|
if (!text || typeof text !== "string") return text;
|
|
59278
|
-
const pipeline = options.pipeline || ["dlp", "
|
|
59278
|
+
const pipeline = options.pipeline || ["dlp", "nlp"];
|
|
59279
59279
|
const _encode = options.encodeFn || encode;
|
|
59280
59280
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
59281
59281
|
const boost = this._resolveBoost(options.context);
|
|
59282
59282
|
const allSpans = [];
|
|
59283
|
-
if (pipeline.includes("dlp")) {
|
|
59283
|
+
if (pipeline.includes("dlp") || pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59284
59284
|
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59285
59285
|
}
|
|
59286
|
-
if (pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59287
|
-
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
59288
|
-
}
|
|
59289
59286
|
const resolved = resolveOverlaps(allSpans);
|
|
59290
59287
|
await Promise.all(resolved.map(async (span) => {
|
|
59291
59288
|
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
@@ -59298,18 +59295,15 @@ var init_scanner = __esm({
|
|
|
59298
59295
|
}
|
|
59299
59296
|
async scanAndReturnEntities(text, options = {}) {
|
|
59300
59297
|
if (!text || typeof text !== "string") return [];
|
|
59301
|
-
const pipeline = options.pipeline || ["dlp", "
|
|
59298
|
+
const pipeline = options.pipeline || ["dlp", "nlp"];
|
|
59302
59299
|
const _encode = options.encodeFn || encode;
|
|
59303
59300
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
59304
59301
|
const boost = this._resolveBoost(options.context);
|
|
59305
59302
|
const allEntities = [];
|
|
59306
59303
|
const allSpans = [];
|
|
59307
|
-
if (pipeline.includes("dlp")) {
|
|
59304
|
+
if (pipeline.includes("dlp") || pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59308
59305
|
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59309
59306
|
}
|
|
59310
|
-
if (pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59311
|
-
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
59312
|
-
}
|
|
59313
59307
|
const resolved = resolveOverlaps(allSpans);
|
|
59314
59308
|
await Promise.all(resolved.map(async (span) => {
|
|
59315
59309
|
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
@@ -59625,7 +59619,7 @@ init_handlers();
|
|
|
59625
59619
|
init_scorer();
|
|
59626
59620
|
|
|
59627
59621
|
// src/index.ts
|
|
59628
|
-
var VERSION = "
|
|
59622
|
+
var VERSION = "3.4.0";
|
|
59629
59623
|
async function detectEntitiesWithConfidence(text, options = {}) {
|
|
59630
59624
|
const scanner = getScanner();
|
|
59631
59625
|
return await scanner.scanAndReturnEntities(text, options);
|