mask-privacy 3.0.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -17
- package/dist/index.d.mts +58 -27
- package/dist/index.d.ts +58 -27
- package/dist/index.js +394 -310
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +394 -310
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/core/dlp/assessor.ts +3 -26
- package/src/core/dlp/handlers.ts +44 -31
- package/src/core/dlp/index.ts +0 -2
- package/src/core/dlp/registry.ts +119 -107
- package/src/core/dlp/scorer.ts +4 -4
- package/src/core/fpe.ts +85 -32
- package/src/core/fpe_utils.ts +20 -20
- package/src/core/scanner.ts +146 -151
- package/src/core/span.ts +76 -0
- package/src/core/transformers_scanner.ts +2 -2
- package/src/core/vault.ts +2 -1
- package/tests/async.test.ts +2 -2
- package/tests/dlp_hardened.test.ts +21 -0
- package/tests/fpe.test.ts +4 -4
- package/tests/hooks.test.ts +2 -2
- package/tests/langchain.test.ts +2 -2
- package/tests/llamaindex.test.ts +1 -1
- package/tests/scanner.test.ts +0 -1
- package/tests/substring.test.ts +1 -1
- package/tests/vault.test.ts +1 -1
package/dist/index.mjs
CHANGED
|
@@ -231,10 +231,13 @@ var init_exceptions = __esm({
|
|
|
231
231
|
function looksLikeToken(value) {
|
|
232
232
|
if (typeof value !== "string") return false;
|
|
233
233
|
const v7 = value.trim();
|
|
234
|
-
if (v7.startsWith("tkn-") && v7.includes("@
|
|
235
|
-
|
|
234
|
+
if (v7.startsWith("tkn-") && v7.includes("@")) {
|
|
235
|
+
const parts = v7.split("@");
|
|
236
|
+
if (parts.length === 2 && parts[0].length >= 12 && parts[1].includes(".")) {
|
|
237
|
+
return true;
|
|
238
|
+
}
|
|
236
239
|
}
|
|
237
|
-
if (
|
|
240
|
+
if (/^\+[1-9]\d{0,3}-555-\d{7}$/.test(v7)) {
|
|
238
241
|
return true;
|
|
239
242
|
}
|
|
240
243
|
if (v7.startsWith("000-00-") && v7.length === 11) {
|
|
@@ -246,16 +249,13 @@ function looksLikeToken(value) {
|
|
|
246
249
|
if (v7.startsWith("000000") && v7.length === 9) {
|
|
247
250
|
return true;
|
|
248
251
|
}
|
|
249
|
-
if (v7.startsWith("
|
|
250
|
-
return true;
|
|
251
|
-
}
|
|
252
|
-
if (v7.length === 11 && v7.startsWith("990000") && /^\d+$/.test(v7) && parseInt(v7[v7.length - 1], 10) % 2 === 0) {
|
|
252
|
+
if (v7.length === 9 && v7.startsWith("000") && /[A-Z]$/.test(v7)) {
|
|
253
253
|
return true;
|
|
254
254
|
}
|
|
255
|
-
if (
|
|
255
|
+
if (/^[A-Z]{2}00[A-F0-9]{4,16}$/.test(v7)) {
|
|
256
256
|
return true;
|
|
257
257
|
}
|
|
258
|
-
if (
|
|
258
|
+
if (/^<(PER|LOC|ORG):[^>]+>$/.test(v7)) {
|
|
259
259
|
return true;
|
|
260
260
|
}
|
|
261
261
|
if (v7.startsWith("[TKN-") && v7.endsWith("]")) {
|
|
@@ -267,7 +267,7 @@ var TOKEN_PATTERN;
|
|
|
267
267
|
var init_fpe_utils = __esm({
|
|
268
268
|
"src/core/fpe_utils.ts"() {
|
|
269
269
|
TOKEN_PATTERN = new RegExp(
|
|
270
|
-
"tkn-[a-f0-9]{8,64}@
|
|
270
|
+
"tkn-[a-f0-9]{8,64}@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}|\\+[1-9]\\d{0,3}-555-\\d{7}|000-00-\\d{4}|4000-0000-0000-\\d{4}|000000\\d{3}|000\\d{5}[A-Z]|[A-Z]{2}00[A-F0-9]{4,16}|<(?:PER|LOC|ORG):[^>]+>|\\[TKN-[a-f0-9]{8,64}\\]",
|
|
271
271
|
// Opaque
|
|
272
272
|
"g"
|
|
273
273
|
);
|
|
@@ -316,42 +316,87 @@ async function _hmacDigits(plaintext, n6, offset = 0) {
|
|
|
316
316
|
}
|
|
317
317
|
return result.join("");
|
|
318
318
|
}
|
|
319
|
-
async function
|
|
320
|
-
const
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
319
|
+
async function _pickFromArray(plaintext, array) {
|
|
320
|
+
const digits = await _hmacDigits(plaintext, 8);
|
|
321
|
+
const num = parseInt(digits, 10);
|
|
322
|
+
return array[num % array.length];
|
|
323
|
+
}
|
|
324
|
+
function _computeLuhnDigit(partialNum) {
|
|
325
|
+
const digits = partialNum.split("").map(Number);
|
|
326
|
+
let sum = 0;
|
|
327
|
+
let shouldDouble = true;
|
|
328
|
+
for (let i6 = digits.length - 1; i6 >= 0; i6--) {
|
|
329
|
+
let digit = digits[i6];
|
|
330
|
+
if (shouldDouble) {
|
|
331
|
+
digit *= 2;
|
|
332
|
+
if (digit > 9) digit -= 9;
|
|
333
|
+
}
|
|
334
|
+
sum += digit;
|
|
335
|
+
shouldDouble = !shouldDouble;
|
|
326
336
|
}
|
|
327
|
-
|
|
337
|
+
return ((10 - sum % 10) % 10).toString();
|
|
338
|
+
}
|
|
339
|
+
function _computeEsIdCheck(num) {
|
|
340
|
+
return "TRWAGMYFPDXBNJZSQVHLCKE"[num % 23];
|
|
341
|
+
}
|
|
342
|
+
async function generateFPEToken(rawText, entityType = "UNKNOWN") {
|
|
343
|
+
const text = rawText.trim();
|
|
344
|
+
let type = (entityType || "UNKNOWN").toUpperCase();
|
|
345
|
+
if (type === "UNKNOWN") {
|
|
346
|
+
if (_EMAIL_RE.test(text)) type = "EMAIL_ADDRESS";
|
|
347
|
+
else if (_SSN_RE.test(text)) type = "US_SSN";
|
|
348
|
+
else if (_CC_RE.test(text)) type = "CREDIT_CARD";
|
|
349
|
+
else if (_ROUTING_RE.test(text)) type = "US_ROUTING_NUMBER";
|
|
350
|
+
else if (_ES_ID_RE.test(text)) type = "ES_DNI";
|
|
351
|
+
else if (_IBAN_RE.test(text)) type = "INTL_BANK_IBAN";
|
|
352
|
+
else if (_PHONE_RE.test(text)) type = "PHONE_NUMBER";
|
|
353
|
+
}
|
|
354
|
+
if (type === "EMAIL_ADDRESS" || type === "EMAIL_ADDR") {
|
|
355
|
+
const parts = text.split("@");
|
|
356
|
+
const domain = parts.length === 2 ? parts[1] : "email.com";
|
|
357
|
+
return `tkn-${await _hmacHex(text)}@${domain}`;
|
|
358
|
+
}
|
|
359
|
+
if (type === "PHONE_NUMBER" || type === "PHONE_NUM" || type === "PHONE_NUM_INTL") {
|
|
360
|
+
const m6 = text.match(/^\+([1-9]\d{0,3})/);
|
|
361
|
+
const cc = m6 ? m6[1] : "1";
|
|
362
|
+
return `+${cc}-555-${await _hmacDigits(text, 7)}`;
|
|
363
|
+
}
|
|
364
|
+
if (type === "US_SSN") {
|
|
328
365
|
return `000-00-${await _hmacDigits(text, 4)}`;
|
|
329
366
|
}
|
|
330
|
-
if (
|
|
331
|
-
|
|
367
|
+
if (type === "CREDIT_CARD" || type === "CREDIT_CARD_NUMBER") {
|
|
368
|
+
const base = `400000000000${await _hmacDigits(text, 3)}`;
|
|
369
|
+
const checkDig = _computeLuhnDigit(base);
|
|
370
|
+
const full = base + checkDig;
|
|
371
|
+
return `${full.slice(0, 4)}-${full.slice(4, 8)}-${full.slice(8, 12)}-${full.slice(12, 16)}`;
|
|
332
372
|
}
|
|
333
|
-
if (
|
|
373
|
+
if (type === "US_ROUTING_NUMBER" || type === "US_ABA_ROUTING") {
|
|
334
374
|
return `000000${await _hmacDigits(text, 3)}`;
|
|
335
375
|
}
|
|
336
|
-
if (
|
|
337
|
-
const
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
376
|
+
if (type === "INTL_BANK_IBAN" || type === "IBAN_CODE") {
|
|
377
|
+
const countryCode = text.length >= 2 && /[a-zA-Z]{2}/.test(text.slice(0, 2)) ? text.slice(0, 2).toUpperCase() : "US";
|
|
378
|
+
return `${countryCode}00${(await _hmacHex(text, 8)).toUpperCase()}`;
|
|
379
|
+
}
|
|
380
|
+
if (type === "ES_DNI") {
|
|
381
|
+
const digits = `000${await _hmacDigits(text, 5)}`;
|
|
382
|
+
return digits + _computeEsIdCheck(parseInt(digits, 10));
|
|
341
383
|
}
|
|
342
|
-
if (
|
|
343
|
-
|
|
384
|
+
if (type === "PERSON" || type === "PERSON_NAME") {
|
|
385
|
+
const f6 = await _pickFromArray(text, _FIRST_NAMES);
|
|
386
|
+
const l6 = await _pickFromArray(text + "last", _LAST_NAMES);
|
|
387
|
+
return `<PER:${f6}_${l6}>`;
|
|
344
388
|
}
|
|
345
|
-
if (
|
|
346
|
-
|
|
389
|
+
if (type === "LOCATION" || type === "PHYS_ADDRESS") {
|
|
390
|
+
const c6 = await _pickFromArray(text, _CITIES);
|
|
391
|
+
return `<LOC:${c6}>`;
|
|
347
392
|
}
|
|
348
|
-
if (
|
|
349
|
-
const
|
|
350
|
-
return
|
|
393
|
+
if (type === "ORGANIZATION") {
|
|
394
|
+
const c6 = await _pickFromArray(text, _LAST_NAMES);
|
|
395
|
+
return `<ORG:${c6}_Inc>`;
|
|
351
396
|
}
|
|
352
397
|
return `[TKN-${await _hmacHex(text)}]`;
|
|
353
398
|
}
|
|
354
|
-
var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE,
|
|
399
|
+
var _masterKey, _EMAIL_RE, _PHONE_RE, _SSN_RE, _CC_RE, _ROUTING_RE, _ES_ID_RE, _IBAN_RE, _FIRST_NAMES, _LAST_NAMES, _CITIES;
|
|
355
400
|
var init_fpe = __esm({
|
|
356
401
|
"src/core/fpe.ts"() {
|
|
357
402
|
init_config();
|
|
@@ -360,14 +405,15 @@ var init_fpe = __esm({
|
|
|
360
405
|
init_fpe_utils();
|
|
361
406
|
_masterKey = null;
|
|
362
407
|
_EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
|
|
363
|
-
_PHONE_RE =
|
|
408
|
+
_PHONE_RE = /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/;
|
|
364
409
|
_SSN_RE = /^\d{3}-\d{2}-\d{4}$/;
|
|
365
410
|
_CC_RE = /^(?:\d{4}[ \-]?){3}\d{4}$/;
|
|
366
411
|
_ROUTING_RE = /^\d{9}$/;
|
|
367
|
-
|
|
368
|
-
_SAUDI_NID_RE = /^1\d{9}$/;
|
|
369
|
-
_UAE_EID_RE = /^784-\d{4}-\d{7}-\d$/;
|
|
412
|
+
_ES_ID_RE = /^(?:\d{8}[A-Z]|[XYZ]\d{7}[A-Z])$/;
|
|
370
413
|
_IBAN_RE = /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/;
|
|
414
|
+
_FIRST_NAMES = ["Taylor", "Jordan", "Casey", "Morgan", "Riley", "Avery", "Rowan", "Quinn", "Charlie", "Peyton", "Blake", "Dakota", "Reese", "Skyler", "Finley", "Eden", "Harley", "Rory", "Emerson", "Remi"];
|
|
415
|
+
_LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"];
|
|
416
|
+
_CITIES = ["London", "Paris", "Berlin", "Tokyo", "Rome", "Madrid", "Vienna", "Sydney", "Toronto", "Chicago", "Seattle", "Austin", "Boston", "Denver", "Dallas", "Miami", "Seoul", "Dubai", "Mumbai", "Cairo"];
|
|
371
417
|
}
|
|
372
418
|
});
|
|
373
419
|
|
|
@@ -42968,7 +43014,7 @@ async function encode(rawText, options = {}) {
|
|
|
42968
43014
|
return existingToken;
|
|
42969
43015
|
}
|
|
42970
43016
|
}
|
|
42971
|
-
const token = await generateFPEToken(text);
|
|
43017
|
+
const token = await generateFPEToken(text, options.entityType || "UNKNOWN");
|
|
42972
43018
|
const ciphertext = cryptoEngine.encrypt(text);
|
|
42973
43019
|
const ttl = options.ttl || DEFAULT_TTL;
|
|
42974
43020
|
await vault.store(token, ciphertext, ttl, ptHash);
|
|
@@ -43420,19 +43466,8 @@ var SCRIPT_SIGNATURES, LanguageContextResolver;
|
|
|
43420
43466
|
var init_assessor = __esm({
|
|
43421
43467
|
"src/core/dlp/assessor.ts"() {
|
|
43422
43468
|
SCRIPT_SIGNATURES = [
|
|
43423
|
-
// CJK / East-Asian — checked first because they are unambiguous
|
|
43424
|
-
{ tag: "zh", pattern: /[\u4e00-\u9fff\u3400-\u4dbf]/g },
|
|
43425
|
-
{ tag: "ja", pattern: /[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff]/g },
|
|
43426
|
-
// Arabic script — covers Standard Arabic, Urdu overlap, etc.
|
|
43427
|
-
{ tag: "ar", pattern: /[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]/g },
|
|
43428
|
-
// Turkish — distinguished by dotless-i (ı), soft-g (ğ), ş, and cedilla ç
|
|
43429
|
-
{ tag: "tr", pattern: /[ğıİşŞ]/g },
|
|
43430
|
-
// German — umlauts and Eszett
|
|
43431
|
-
{ tag: "de", pattern: /[äöüÄÖÜß]/g },
|
|
43432
43469
|
// Spanish — ñ and inverted punctuation
|
|
43433
|
-
{ tag: "es", pattern: /[ñÑ¡¿]/g }
|
|
43434
|
-
// French — cedilla, accented vowels with circumflex / diaeresis
|
|
43435
|
-
{ tag: "fr", pattern: /[àâçéèêëïîôùûüÿœæ]/gi }
|
|
43470
|
+
{ tag: "es", pattern: /[ñÑ¡¿]/g }
|
|
43436
43471
|
];
|
|
43437
43472
|
LanguageContextResolver = class {
|
|
43438
43473
|
constructor(charThreshold = 1) {
|
|
@@ -43490,34 +43525,12 @@ var init_registry = __esm({
|
|
|
43490
43525
|
})(SensitiveCategory || {});
|
|
43491
43526
|
LOCALE_NAME_RULES = {
|
|
43492
43527
|
en: [
|
|
43493
|
-
/\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b/g,
|
|
43494
|
-
/\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\b/g
|
|
43528
|
+
/\b[A-Z][a-z\-\']+ [A-Z][a-z\-\']+(?:\s+[A-Z][a-z\-\']+)?\b/g,
|
|
43529
|
+
/\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z\-\']+\b/g
|
|
43495
43530
|
],
|
|
43496
43531
|
es: [
|
|
43497
|
-
/\b[A-Z][a-
|
|
43498
|
-
/\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-
|
|
43499
|
-
],
|
|
43500
|
-
fr: [
|
|
43501
|
-
/\b[A-Z][a-zàâçéèêëïîôùûü]+ [A-Z][a-zàâçéèêëïîôùûü]+\b/g,
|
|
43502
|
-
/\b(?:M|Mme|Mlle)\.?\s+[A-Z][a-zàâçéèêëïîôùûü]+\b/g
|
|
43503
|
-
],
|
|
43504
|
-
de: [
|
|
43505
|
-
/\b[A-Z][a-zäöüß]+ [A-Z][a-zäöüß]+\b/g,
|
|
43506
|
-
/\b(?:Herr|Frau)\.?\s+[A-Z][a-zäöüß]+\b/g
|
|
43507
|
-
],
|
|
43508
|
-
tr: [
|
|
43509
|
-
/\b[A-ZÇĞİÖŞÜ][a-zçğıöşü]+ [A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g,
|
|
43510
|
-
/\b(?:Bay|Bayan|Sayın)\.?\s+[A-ZÇĞİÖŞÜ][a-zçğıöşü]+\b/g
|
|
43511
|
-
],
|
|
43512
|
-
ar: [
|
|
43513
|
-
/[\u0621-\u064a][\u0600-\u06ff]+ [\u0621-\u064a][\u0600-\u06ff]+/g,
|
|
43514
|
-
/(?:أبو|أم|ابن|بنت)\s+[\u0621-\u064a][\u0600-\u06ff]+/gi
|
|
43515
|
-
],
|
|
43516
|
-
ja: [
|
|
43517
|
-
/\b[A-Z][a-z]+(?:moto|yama|kawa|mura|ta|da|shi|no)\s+[A-Z][a-z]+\b/g
|
|
43518
|
-
],
|
|
43519
|
-
zh: [
|
|
43520
|
-
/\b[A-Z][a-z]{1,3}\s+[A-Z][a-z]+\b/g
|
|
43532
|
+
/\b[A-Z][a-záéíóúñ\-\']+ [A-Z][a-záéíóúñ\-\']+(?:\s+[A-Z][a-záéíóúñ\-\']+)?\b/g,
|
|
43533
|
+
/\b(?:Sr|Sra|Srta)\.?\s+[A-Z][a-záéíóúñ\-\']+\b/g
|
|
43521
43534
|
]
|
|
43522
43535
|
};
|
|
43523
43536
|
LOCALE_ADDRESS_RULES = {
|
|
@@ -43525,26 +43538,8 @@ var init_registry = __esm({
|
|
|
43525
43538
|
/\b\d{1,5}\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way)\b/g,
|
|
43526
43539
|
/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?\b/g
|
|
43527
43540
|
],
|
|
43528
|
-
|
|
43529
|
-
/\b
|
|
43530
|
-
],
|
|
43531
|
-
de: [
|
|
43532
|
-
/\b[A-ZÄÖÜa-zäöüß]+(?:straße|strasse|weg|gasse|platz)\s+\d{1,4}\b/g
|
|
43533
|
-
],
|
|
43534
|
-
tr: [
|
|
43535
|
-
/\b[A-ZÇĞİÖŞÜa-zçğıöşü]+\s+(?:Cad|Sok|Mah)\.?\s+/gi,
|
|
43536
|
-
/\b\d{5}\s+[A-ZÇĞİÖŞÜa-zçğıöşü]+\/[A-ZÇĞİÖŞÜa-zçğıöşü]+\b/g
|
|
43537
|
-
],
|
|
43538
|
-
ar: [
|
|
43539
|
-
/شارع\s+[\u0600-\u06ff]+/g,
|
|
43540
|
-
/حي\s+[\u0600-\u06ff]+/g,
|
|
43541
|
-
/(?:ص\.ب|P\.?O\.?\s*Box)\s*\d{3,6}/gi
|
|
43542
|
-
],
|
|
43543
|
-
uk_postcode: [
|
|
43544
|
-
/\b[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\b/g
|
|
43545
|
-
],
|
|
43546
|
-
ca_postal: [
|
|
43547
|
-
/\b[A-Z]\d[A-Z]\s*\d[A-Z]\d\b/g
|
|
43541
|
+
es: [
|
|
43542
|
+
/\b(?:Calle|Carrera|Avenida|Paseo|Plaza)\s+[A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+\b/gi
|
|
43548
43543
|
]
|
|
43549
43544
|
};
|
|
43550
43545
|
RAW_PATTERNS = [
|
|
@@ -43552,7 +43547,6 @@ var init_registry = __esm({
|
|
|
43552
43547
|
[
|
|
43553
43548
|
"US_SSN",
|
|
43554
43549
|
"\\b(?!000|666|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0000)\\d{4}\\b",
|
|
43555
|
-
"g",
|
|
43556
43550
|
["ssn", "social security", "tax id", "taxpayer"],
|
|
43557
43551
|
0.95,
|
|
43558
43552
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43561,7 +43555,6 @@ var init_registry = __esm({
|
|
|
43561
43555
|
[
|
|
43562
43556
|
"CREDIT_CARD_NUMBER",
|
|
43563
43557
|
"\\b(?:4\\d{3}|5[1-5]\\d{2}|3[47]\\d{2}|6(?:011|5\\d{2}))[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}\\b",
|
|
43564
|
-
"g",
|
|
43565
43558
|
["card", "credit", "visa", "mastercard", "amex", "payment"],
|
|
43566
43559
|
0.97,
|
|
43567
43560
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43570,7 +43563,6 @@ var init_registry = __esm({
|
|
|
43570
43563
|
[
|
|
43571
43564
|
"INTL_BANK_IBAN",
|
|
43572
43565
|
"\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}[A-Z0-9]{0,16}\\b",
|
|
43573
|
-
"g",
|
|
43574
43566
|
["iban", "swift", "sepa", "wire", "bank transfer"],
|
|
43575
43567
|
0.96,
|
|
43576
43568
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43579,7 +43571,6 @@ var init_registry = __esm({
|
|
|
43579
43571
|
[
|
|
43580
43572
|
"CRYPTO_BTC",
|
|
43581
43573
|
"\\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-z0-9]{39,59})\\b",
|
|
43582
|
-
"g",
|
|
43583
43574
|
["bitcoin", "btc", "wallet", "crypto"],
|
|
43584
43575
|
0.94,
|
|
43585
43576
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43588,7 +43579,6 @@ var init_registry = __esm({
|
|
|
43588
43579
|
[
|
|
43589
43580
|
"CRYPTO_ETH",
|
|
43590
43581
|
"\\b0x[a-fA-F0-9]{40}\\b",
|
|
43591
|
-
"g",
|
|
43592
43582
|
["ethereum", "eth", "wallet", "0x"],
|
|
43593
43583
|
0.93,
|
|
43594
43584
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43596,8 +43586,7 @@ var init_registry = __esm({
|
|
|
43596
43586
|
],
|
|
43597
43587
|
[
|
|
43598
43588
|
"US_ABA_ROUTING",
|
|
43599
|
-
|
|
43600
|
-
"g",
|
|
43589
|
+
/(?<!\d)\d{9}(?!\d)/,
|
|
43601
43590
|
["routing", "aba", "wire", "bank"],
|
|
43602
43591
|
0.88,
|
|
43603
43592
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43605,17 +43594,15 @@ var init_registry = __esm({
|
|
|
43605
43594
|
],
|
|
43606
43595
|
[
|
|
43607
43596
|
"BANK_ACCT_NUM",
|
|
43608
|
-
|
|
43609
|
-
"g",
|
|
43597
|
+
/(?<!\d)\d{8,17}(?!\d)/,
|
|
43610
43598
|
["account", "checking", "savings", "deposit", "bank"],
|
|
43611
|
-
0.
|
|
43599
|
+
0.5,
|
|
43612
43600
|
"FINANCIAL" /* FINANCIAL */,
|
|
43613
|
-
|
|
43601
|
+
"luhn_soft"
|
|
43614
43602
|
],
|
|
43615
43603
|
[
|
|
43616
43604
|
"SWIFT_BIC",
|
|
43617
43605
|
"\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
|
|
43618
|
-
"gi",
|
|
43619
43606
|
["swift", "bic", "bank code", "transfer"],
|
|
43620
43607
|
0.6,
|
|
43621
43608
|
"FINANCIAL" /* FINANCIAL */,
|
|
@@ -43625,7 +43612,6 @@ var init_registry = __esm({
|
|
|
43625
43612
|
[
|
|
43626
43613
|
"EMAIL_ADDR",
|
|
43627
43614
|
"\\b[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}\\b",
|
|
43628
|
-
"g",
|
|
43629
43615
|
["email", "mail", "contact", "address"],
|
|
43630
43616
|
0.99,
|
|
43631
43617
|
"CONTACT" /* CONTACT */,
|
|
@@ -43633,26 +43619,23 @@ var init_registry = __esm({
|
|
|
43633
43619
|
],
|
|
43634
43620
|
[
|
|
43635
43621
|
"PHONE_NUM",
|
|
43636
|
-
|
|
43637
|
-
"g",
|
|
43622
|
+
/(?<!\d)(?:\+?[1-9]\d{0,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)/,
|
|
43638
43623
|
["phone", "call", "mobile", "tel", "whatsapp", "number"],
|
|
43639
|
-
0.
|
|
43624
|
+
0.8,
|
|
43640
43625
|
"CONTACT" /* CONTACT */,
|
|
43641
43626
|
null
|
|
43642
43627
|
],
|
|
43643
43628
|
[
|
|
43644
43629
|
"PHONE_NUM_INTL",
|
|
43645
|
-
|
|
43646
|
-
"g",
|
|
43630
|
+
/(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/,
|
|
43647
43631
|
["phone", "call", "mobile", "tel"],
|
|
43648
|
-
0.
|
|
43632
|
+
0.8,
|
|
43649
43633
|
"CONTACT" /* CONTACT */,
|
|
43650
43634
|
null
|
|
43651
43635
|
],
|
|
43652
43636
|
[
|
|
43653
43637
|
"IPV4_ADDR",
|
|
43654
43638
|
"\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b",
|
|
43655
|
-
"g",
|
|
43656
43639
|
["ip", "server", "host", "network", "address"],
|
|
43657
43640
|
0.94,
|
|
43658
43641
|
"CONTACT" /* CONTACT */,
|
|
@@ -43661,7 +43644,6 @@ var init_registry = __esm({
|
|
|
43661
43644
|
[
|
|
43662
43645
|
"IPV6_ADDR",
|
|
43663
43646
|
"\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b",
|
|
43664
|
-
"g",
|
|
43665
43647
|
["ipv6", "ip", "network", "server"],
|
|
43666
43648
|
0.93,
|
|
43667
43649
|
"CONTACT" /* CONTACT */,
|
|
@@ -43670,7 +43652,6 @@ var init_registry = __esm({
|
|
|
43670
43652
|
[
|
|
43671
43653
|
"HW_MAC_ADDR",
|
|
43672
43654
|
"\\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\\b",
|
|
43673
|
-
"g",
|
|
43674
43655
|
["mac", "hardware", "network", "device"],
|
|
43675
43656
|
0.91,
|
|
43676
43657
|
"CONTACT" /* CONTACT */,
|
|
@@ -43680,7 +43661,6 @@ var init_registry = __esm({
|
|
|
43680
43661
|
[
|
|
43681
43662
|
"BIRTH_DATE",
|
|
43682
43663
|
"\\b(?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12]\\d|3[01])[/-](?:19|20)\\d{2}\\b",
|
|
43683
|
-
"g",
|
|
43684
43664
|
["birth", "dob", "born", "birthday", "date of birth"],
|
|
43685
43665
|
0.88,
|
|
43686
43666
|
"PERSONAL" /* PERSONAL */,
|
|
@@ -43689,16 +43669,14 @@ var init_registry = __esm({
|
|
|
43689
43669
|
[
|
|
43690
43670
|
"US_DRIVERS_LIC",
|
|
43691
43671
|
"\\b(?:[A-Z]\\d{7,12}|\\d{7,12}[A-Z]?)\\b",
|
|
43692
|
-
"g",
|
|
43693
43672
|
["driver", "license", "licence", "dl", "dmv"],
|
|
43694
|
-
0.
|
|
43673
|
+
0.55,
|
|
43695
43674
|
"PERSONAL" /* PERSONAL */,
|
|
43696
43675
|
null
|
|
43697
43676
|
],
|
|
43698
43677
|
[
|
|
43699
43678
|
"US_PASSPORT_NUM",
|
|
43700
43679
|
"\\b[A-Z]\\d{8}\\b",
|
|
43701
|
-
"g",
|
|
43702
43680
|
["passport", "travel", "visa", "immigration"],
|
|
43703
43681
|
0.87,
|
|
43704
43682
|
"PERSONAL" /* PERSONAL */,
|
|
@@ -43708,7 +43686,6 @@ var init_registry = __esm({
|
|
|
43708
43686
|
[
|
|
43709
43687
|
"VEHICLE_VIN",
|
|
43710
43688
|
"\\b[A-HJ-NPR-Z0-9]{17}\\b",
|
|
43711
|
-
"g",
|
|
43712
43689
|
["vin", "vehicle", "chassis", "automobile"],
|
|
43713
43690
|
0.92,
|
|
43714
43691
|
"VEHICLE" /* VEHICLE */,
|
|
@@ -43717,7 +43694,6 @@ var init_registry = __esm({
|
|
|
43717
43694
|
[
|
|
43718
43695
|
"VEHICLE_PLATE",
|
|
43719
43696
|
"\\b[A-Z0-9]{1,3}[\\-\\s][A-Z0-9]{1,4}[\\-\\s][A-Z0-9]{1,4}\\b",
|
|
43720
|
-
"g",
|
|
43721
43697
|
["plate", "registration", "vehicle", "plaka"],
|
|
43722
43698
|
0.45,
|
|
43723
43699
|
"VEHICLE" /* VEHICLE */,
|
|
@@ -43727,7 +43703,6 @@ var init_registry = __esm({
|
|
|
43727
43703
|
[
|
|
43728
43704
|
"MED_RECORD_ID",
|
|
43729
43705
|
"\\b(?:MRN|Patient ID|Medical Record)[:\\s]*[A-Z0-9]{6,10}\\b",
|
|
43730
|
-
"g",
|
|
43731
43706
|
["patient", "medical", "record", "mrn", "hospital"],
|
|
43732
43707
|
0.96,
|
|
43733
43708
|
"HEALTHCARE" /* HEALTHCARE */,
|
|
@@ -43736,7 +43711,6 @@ var init_registry = __esm({
|
|
|
43736
43711
|
[
|
|
43737
43712
|
"US_MEDICARE_ID",
|
|
43738
43713
|
"\\b\\d{3}-\\d{2}-\\d{4}[A-Z]\\b",
|
|
43739
|
-
"g",
|
|
43740
43714
|
["medicare", "cms", "beneficiary", "health insurance"],
|
|
43741
43715
|
0.91,
|
|
43742
43716
|
"HEALTHCARE" /* HEALTHCARE */,
|
|
@@ -43745,7 +43719,6 @@ var init_registry = __esm({
|
|
|
43745
43719
|
[
|
|
43746
43720
|
"US_DEA_NUM",
|
|
43747
43721
|
"\\b[A-Z]{2}\\d{7}\\b",
|
|
43748
|
-
"g",
|
|
43749
43722
|
["dea", "prescriber", "drug", "enforcement"],
|
|
43750
43723
|
0.89,
|
|
43751
43724
|
"HEALTHCARE" /* HEALTHCARE */,
|
|
@@ -43754,7 +43727,6 @@ var init_registry = __esm({
|
|
|
43754
43727
|
[
|
|
43755
43728
|
"US_NPI_NUM",
|
|
43756
43729
|
"\\b\\d{10}\\b",
|
|
43757
|
-
"g",
|
|
43758
43730
|
["npi", "provider", "national provider", "healthcare"],
|
|
43759
43731
|
0.87,
|
|
43760
43732
|
"HEALTHCARE" /* HEALTHCARE */,
|
|
@@ -43764,7 +43736,6 @@ var init_registry = __esm({
|
|
|
43764
43736
|
[
|
|
43765
43737
|
"US_EIN_TAX",
|
|
43766
43738
|
"\\b\\d{2}-\\d{7}\\b",
|
|
43767
|
-
"g",
|
|
43768
43739
|
["ein", "federal", "employer", "tax id"],
|
|
43769
43740
|
0.89,
|
|
43770
43741
|
"IDENTITY_US" /* IDENTITY_US */,
|
|
@@ -43774,71 +43745,33 @@ var init_registry = __esm({
|
|
|
43774
43745
|
[
|
|
43775
43746
|
"UK_NATL_INS",
|
|
43776
43747
|
"\\b[A-Z]{2}\\d{6}[A-Z]\\b",
|
|
43777
|
-
"g",
|
|
43778
43748
|
["nino", "national insurance", "ni number", "uk"],
|
|
43779
43749
|
0.9,
|
|
43780
43750
|
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43781
|
-
|
|
43751
|
+
"uk_nino"
|
|
43782
43752
|
],
|
|
43783
43753
|
[
|
|
43784
43754
|
"CA_SOCIAL_INS",
|
|
43785
43755
|
"\\b\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{3}\\b",
|
|
43786
|
-
"g",
|
|
43787
43756
|
["sin", "social insurance", "canada", "canadian"],
|
|
43788
43757
|
0.89,
|
|
43789
43758
|
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43790
|
-
|
|
43759
|
+
"ca_sin"
|
|
43791
43760
|
],
|
|
43792
43761
|
[
|
|
43793
|
-
"
|
|
43794
|
-
"
|
|
43795
|
-
"
|
|
43796
|
-
|
|
43797
|
-
0.88,
|
|
43798
|
-
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43799
|
-
null
|
|
43800
|
-
],
|
|
43801
|
-
[
|
|
43802
|
-
"DE_STEUER_ID",
|
|
43803
|
-
"\\b\\d{2}\\s?\\d{3}\\s?\\d{3}\\s?\\d{3}\\b",
|
|
43804
|
-
"g",
|
|
43805
|
-
["steuer", "steuernummer", "finanzamt", "deutschland"],
|
|
43806
|
-
0.87,
|
|
43807
|
-
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43808
|
-
null
|
|
43809
|
-
],
|
|
43810
|
-
[
|
|
43811
|
-
"TR_TCID",
|
|
43812
|
-
"\\b[1-9]\\d{9}[02468]\\b",
|
|
43813
|
-
"g",
|
|
43814
|
-
["tc", "kimlik", "vatanda\u015Fl\u0131k", "n\xFCfus", "t\xFCrkiye"],
|
|
43815
|
-
0.92,
|
|
43816
|
-
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43817
|
-
"tcid"
|
|
43818
|
-
],
|
|
43819
|
-
[
|
|
43820
|
-
"SA_NATIONAL_ID",
|
|
43821
|
-
"\\b1\\d{9}\\b",
|
|
43822
|
-
"g",
|
|
43823
|
-
["\u0647\u0648\u064A\u0629", "\u0631\u0642\u0645 \u0627\u0644\u0647\u0648\u064A\u0629", "saudi", "\u0648\u0637\u0646\u064A\u0629", "identity"],
|
|
43824
|
-
0.91,
|
|
43825
|
-
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43826
|
-
"saudi_nid"
|
|
43827
|
-
],
|
|
43828
|
-
[
|
|
43829
|
-
"UAE_EMIRATES_ID",
|
|
43830
|
-
"\\b784-\\d{4}-\\d{7}-\\d\\b",
|
|
43831
|
-
"g",
|
|
43832
|
-
["emirates", "\u0647\u0648\u064A\u0629", "uae", "emirati", "identity"],
|
|
43833
|
-
0.93,
|
|
43762
|
+
"ES_DNI",
|
|
43763
|
+
"(?:\\d{8}[A-Z]|[XYZ]\\d{7}[A-Z])",
|
|
43764
|
+
["dni", "nie", "identidad", "nif", "spain"],
|
|
43765
|
+
0.94,
|
|
43834
43766
|
"IDENTITY_INTL" /* IDENTITY_INTL */,
|
|
43835
|
-
"
|
|
43767
|
+
"es_id",
|
|
43768
|
+
true,
|
|
43769
|
+
["*", "es"]
|
|
43836
43770
|
],
|
|
43837
43771
|
// ── CORPORATE ──────────────────────────────────────────────────────
|
|
43838
43772
|
[
|
|
43839
43773
|
"CORP_EMPLOYEE_ID",
|
|
43840
|
-
"
|
|
43841
|
-
"gi",
|
|
43774
|
+
"(?:EMP|EMPLOYEE|ID)[:\\s]?[A-Z0-9]{5,10}",
|
|
43842
43775
|
["employee", "staff", "personnel", "worker"],
|
|
43843
43776
|
0.55,
|
|
43844
43777
|
"CORPORATE" /* CORPORATE */,
|
|
@@ -43848,7 +43781,11 @@ var init_registry = __esm({
|
|
|
43848
43781
|
DLPPatternRegistry = class {
|
|
43849
43782
|
constructor(loadGroups) {
|
|
43850
43783
|
this.catalogue = /* @__PURE__ */ new Map();
|
|
43784
|
+
this.localeCategoryRegexMap = /* @__PURE__ */ new Map();
|
|
43851
43785
|
this.buildCatalogue(loadGroups ?? null);
|
|
43786
|
+
for (const loc of ["*", "en", "es"]) {
|
|
43787
|
+
this.compileForLocale(loc);
|
|
43788
|
+
}
|
|
43852
43789
|
}
|
|
43853
43790
|
get typeNames() {
|
|
43854
43791
|
return [...this.catalogue.keys()];
|
|
@@ -43860,23 +43797,74 @@ var init_registry = __esm({
|
|
|
43860
43797
|
descriptorFor(typeName) {
|
|
43861
43798
|
return this.catalogue.get(typeName);
|
|
43862
43799
|
}
|
|
43863
|
-
/** Return locale-tuned name regexes, falling back to English. */
|
|
43864
43800
|
namePatternsFor(lang) {
|
|
43865
43801
|
return LOCALE_NAME_RULES[lang] ?? LOCALE_NAME_RULES["en"];
|
|
43866
43802
|
}
|
|
43867
|
-
/** Return locale-tuned address regexes, falling back to English. */
|
|
43868
43803
|
addressPatternsFor(lang) {
|
|
43869
43804
|
return LOCALE_ADDRESS_RULES[lang] ?? LOCALE_ADDRESS_RULES["en"];
|
|
43870
43805
|
}
|
|
43806
|
+
getCategoryRegexesMap(locale = "en") {
|
|
43807
|
+
if (!this.localeCategoryRegexMap.has(locale)) {
|
|
43808
|
+
this.compileForLocale(locale);
|
|
43809
|
+
}
|
|
43810
|
+
return this.localeCategoryRegexMap.get(locale);
|
|
43811
|
+
}
|
|
43812
|
+
getCategoryTypeMap(categoryName, locale = "en") {
|
|
43813
|
+
return this.localeCategoryRegexMap.get(locale)?.get(categoryName)?.typeOrder ?? [];
|
|
43814
|
+
}
|
|
43815
|
+
compileForLocale(locale) {
|
|
43816
|
+
const localePool = /* @__PURE__ */ new Map();
|
|
43817
|
+
for (const [typeName, desc] of this.catalogue.entries()) {
|
|
43818
|
+
if (desc.supportedLocales.includes("*") || desc.supportedLocales.includes(locale)) {
|
|
43819
|
+
const catKey = desc.category;
|
|
43820
|
+
if (!localePool.has(catKey)) localePool.set(catKey, []);
|
|
43821
|
+
localePool.get(catKey).push([typeName, desc]);
|
|
43822
|
+
}
|
|
43823
|
+
}
|
|
43824
|
+
const categoryMap = /* @__PURE__ */ new Map();
|
|
43825
|
+
for (const [catKey, entries] of localePool.entries()) {
|
|
43826
|
+
entries.sort(([, a6], [, b6]) => {
|
|
43827
|
+
const aVal = a6.validatorTag ? 0 : 1;
|
|
43828
|
+
const bVal = b6.validatorTag ? 0 : 1;
|
|
43829
|
+
if (aVal !== bVal) return aVal - bVal;
|
|
43830
|
+
return b6.compiledRe.source.length - a6.compiledRe.source.length;
|
|
43831
|
+
});
|
|
43832
|
+
const parts = [];
|
|
43833
|
+
const typeOrder = [];
|
|
43834
|
+
for (const [typeName, desc] of entries) {
|
|
43835
|
+
parts.push(`(?<${typeName}>${desc.compiledRe.source})`);
|
|
43836
|
+
typeOrder.push(typeName);
|
|
43837
|
+
}
|
|
43838
|
+
const combinedSource = parts.join("|");
|
|
43839
|
+
const needsI = entries.some(([, d6]) => d6.compiledRe.flags.includes("i"));
|
|
43840
|
+
const flags = needsI ? "gi" : "g";
|
|
43841
|
+
try {
|
|
43842
|
+
const re = new RegExp(combinedSource, flags);
|
|
43843
|
+
categoryMap.set(catKey, { re, typeOrder });
|
|
43844
|
+
} catch (err2) {
|
|
43845
|
+
console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err2);
|
|
43846
|
+
}
|
|
43847
|
+
}
|
|
43848
|
+
this.localeCategoryRegexMap.set(locale, categoryMap);
|
|
43849
|
+
}
|
|
43871
43850
|
buildCatalogue(restrict) {
|
|
43872
|
-
for (const
|
|
43851
|
+
for (const entry of RAW_PATTERNS) {
|
|
43852
|
+
const [typeName, regexSource, terms, risk, cat, vtag, isHighEntropy, supportedLocales] = entry;
|
|
43873
43853
|
if (restrict !== null && !restrict.has(cat)) continue;
|
|
43854
|
+
let re;
|
|
43855
|
+
if (regexSource instanceof RegExp) {
|
|
43856
|
+
re = regexSource;
|
|
43857
|
+
} else {
|
|
43858
|
+
re = new RegExp(regexSource, "g");
|
|
43859
|
+
}
|
|
43874
43860
|
this.catalogue.set(typeName, {
|
|
43875
|
-
compiledRe:
|
|
43861
|
+
compiledRe: re,
|
|
43876
43862
|
proximityTerms: new Set(terms),
|
|
43877
43863
|
baseRisk: risk,
|
|
43878
43864
|
category: cat,
|
|
43879
|
-
validatorTag: vtag
|
|
43865
|
+
validatorTag: vtag,
|
|
43866
|
+
isHighEntropy: isHighEntropy ?? vtag !== null,
|
|
43867
|
+
supportedLocales: supportedLocales ?? ["*"]
|
|
43880
43868
|
});
|
|
43881
43869
|
}
|
|
43882
43870
|
}
|
|
@@ -43973,29 +43961,13 @@ function checkIpv4Octets(raw) {
|
|
|
43973
43961
|
}
|
|
43974
43962
|
return true;
|
|
43975
43963
|
}
|
|
43976
|
-
function
|
|
43977
|
-
const
|
|
43978
|
-
if (
|
|
43979
|
-
const d6 = digitsStr.split("").map(Number);
|
|
43980
|
-
if (d6[0] === 0) return false;
|
|
43981
|
-
if (d6[10] % 2 !== 0) return false;
|
|
43982
|
-
const oddSum = d6[0] + d6[2] + d6[4] + d6[6] + d6[8];
|
|
43983
|
-
const evenSum = d6[1] + d6[3] + d6[5] + d6[7];
|
|
43984
|
-
const computedD10 = ((oddSum * 7 - evenSum) % 10 + 10) % 10;
|
|
43985
|
-
if (computedD10 !== d6[9]) return false;
|
|
43986
|
-
const firstTenSum = d6.slice(0, 10).reduce((a6, b6) => a6 + b6, 0);
|
|
43987
|
-
if (firstTenSum % 10 !== d6[10]) return false;
|
|
43988
|
-
return true;
|
|
43989
|
-
}
|
|
43990
|
-
function checkSaudiNid(raw) {
|
|
43991
|
-
const digitsStr = raw.replace(/\D/g, "");
|
|
43992
|
-
if (digitsStr.length !== 10) return false;
|
|
43993
|
-
const d6 = digitsStr.split("").map(Number);
|
|
43994
|
-
if (d6[0] !== 1) return false;
|
|
43964
|
+
function checkCaSin(raw) {
|
|
43965
|
+
const digits = raw.replace(/\D/g, "");
|
|
43966
|
+
if (digits.length !== 9) return false;
|
|
43995
43967
|
let total = 0;
|
|
43996
|
-
for (let idx = 0; idx <
|
|
43997
|
-
let val =
|
|
43998
|
-
if (idx % 2 ===
|
|
43968
|
+
for (let idx = 0; idx < digits.length; idx++) {
|
|
43969
|
+
let val = parseInt(digits[idx], 10);
|
|
43970
|
+
if (idx % 2 === 1) {
|
|
43999
43971
|
val *= 2;
|
|
44000
43972
|
if (val > 9) val -= 9;
|
|
44001
43973
|
}
|
|
@@ -44003,7 +43975,30 @@ function checkSaudiNid(raw) {
|
|
|
44003
43975
|
}
|
|
44004
43976
|
return total % 10 === 0;
|
|
44005
43977
|
}
|
|
44006
|
-
|
|
43978
|
+
function checkUkNino(raw) {
|
|
43979
|
+
const cleaned = raw.replace(/ /g, "").toUpperCase();
|
|
43980
|
+
if (cleaned.length !== 9) return false;
|
|
43981
|
+
return UK_NINO_REGEX.test(cleaned);
|
|
43982
|
+
}
|
|
43983
|
+
/**
 * Validate a Spanish DNI/NIE by recomputing its check letter.
 *
 * Accepted shapes after removing whitespace and hyphens and upper-casing:
 *   - 8 digits followed by a letter
 *   - X, Y or Z followed by 7 digits and a letter (the prefix is mapped to
 *     0, 1 or 2 before the check letter is computed)
 *
 * The check letter is the character at index (number % 23) of the
 * 23-letter table below.
 *
 * @param {string} raw - Candidate text as matched by the scanner.
 * @returns {boolean} true when the shape is valid and the check letter
 *   matches; false otherwise, including for non-string input.
 */
function checkEsId(raw) {
  // Guard first: a validator should reject bad input, not throw on it.
  if (typeof raw !== "string") return false;

  const cleaned = raw.replace(/[\s-]/g, "").toUpperCase();
  if (cleaned.length !== 9) return false;

  // One structural check: leading digit or X/Y/Z, then seven digits.
  // The ninth character is verified against the table below.
  if (!/^[XYZ\d]\d{7}/.test(cleaned)) return false;

  const prefixDigits = { X: "0", Y: "1", Z: "2" };
  const firstChar = cleaned[0];
  // Own-key lookup only; never consult the prototype chain.
  const leading = Object.prototype.hasOwnProperty.call(prefixDigits, firstChar)
    ? prefixDigits[firstChar]
    : firstChar;

  const num = Number.parseInt(leading + cleaned.slice(1, 8), 10);
  const validLetters = "TRWAGMYFPDXBNJZSQVHLCKE";
  return cleaned[8] === validLetters[num % 23];
}
|
|
44001
|
+
var IBAN_COUNTRY_LENGTHS, VIN_TRANSLITERATION, VIN_WEIGHTS, UK_NINO_REGEX, VALIDATOR_DISPATCH, DLPValidationEngine;
|
|
44007
44002
|
var init_handlers = __esm({
|
|
44008
44003
|
"src/core/dlp/handlers.ts"() {
|
|
44009
44004
|
IBAN_COUNTRY_LENGTHS = {
|
|
@@ -44106,6 +44101,7 @@ var init_handlers = __esm({
|
|
|
44106
44101
|
Z: 9
|
|
44107
44102
|
};
|
|
44108
44103
|
VIN_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2];
|
|
44104
|
+
UK_NINO_REGEX = /^(?!BG|GB|NK|KN|TN|NT|ZZ)[A-CEGHJ-PR-TW-Z]{2}[0-9]{6}[A-D]$/;
|
|
44109
44105
|
VALIDATOR_DISPATCH = {
|
|
44110
44106
|
luhn: checkLuhn,
|
|
44111
44107
|
ssn_area: checkSsnArea,
|
|
@@ -44114,8 +44110,9 @@ var init_handlers = __esm({
|
|
|
44114
44110
|
vin_format: checkVinFormat,
|
|
44115
44111
|
btc_format: checkBtcFormat,
|
|
44116
44112
|
ipv4: checkIpv4Octets,
|
|
44117
|
-
|
|
44118
|
-
|
|
44113
|
+
ca_sin: checkCaSin,
|
|
44114
|
+
uk_nino: checkUkNino,
|
|
44115
|
+
es_id: checkEsId
|
|
44119
44116
|
};
|
|
44120
44117
|
DLPValidationEngine = class {
|
|
44121
44118
|
/**
|
|
@@ -44152,7 +44149,8 @@ var init_scorer = __esm({
|
|
|
44152
44149
|
keywordBoost: 0.1,
|
|
44153
44150
|
validatorOverride: 0.99,
|
|
44154
44151
|
maxConfidence: 0.99,
|
|
44155
|
-
penaltyFactor: 0.
|
|
44152
|
+
penaltyFactor: 0.99
|
|
44153
|
+
// Renamed functionally to validator failure penalty subtraction
|
|
44156
44154
|
};
|
|
44157
44155
|
DLPConfidenceScorer = class {
|
|
44158
44156
|
constructor(overrides = {}) {
|
|
@@ -44171,7 +44169,7 @@ var init_scorer = __esm({
|
|
|
44171
44169
|
score(input) {
|
|
44172
44170
|
if (input.validatorPassed === true) return this.valOverride;
|
|
44173
44171
|
if (input.validatorPassed === false) {
|
|
44174
|
-
return Math.
|
|
44172
|
+
return Math.max(0, input.baseRisk - this.penalty);
|
|
44175
44173
|
}
|
|
44176
44174
|
const windowLo = Math.max(0, input.matchStart - this.window);
|
|
44177
44175
|
const windowHi = Math.min(input.fullText.length, input.matchEnd + this.window);
|
|
@@ -44197,6 +44195,47 @@ var init_scorer = __esm({
|
|
|
44197
44195
|
}
|
|
44198
44196
|
});
|
|
44199
44197
|
|
|
44198
|
+
// src/core/span.ts
|
|
44199
|
+
/**
 * Reduce a list of detected spans to a non-overlapping subset.
 *
 * Candidates are visited by ascending `start`; ties go to the longer span,
 * then to the higher confidence. For each candidate:
 *   - if it starts at or after the end of the last kept span, it is kept;
 *   - if it lies entirely inside the already-covered range, it is dropped;
 *   - if it partially overlaps the last kept span, it replaces that span
 *     only when its confidence is strictly higher.
 *
 * The input array is not modified. Spans whose `start`/`end` are not finite
 * numbers, or whose `end` is before `start`, are ignored. A missing or
 * non-finite `confidence` counts as 0 so the ordering stays well defined.
 *
 * @param {Array<{start: number, end: number, confidence: number}>} spans
 * @returns {Array} The kept span objects (same references), sorted by
 *   `start` in descending order. An empty array for empty or non-array input.
 */
function resolveOverlaps(spans) {
  if (!Array.isArray(spans) || spans.length === 0) return [];

  const confidenceOf = (span) => (Number.isFinite(span.confidence) ? span.confidence : 0);

  // filter() already yields a fresh array, so sorting it leaves the caller's array untouched.
  const candidates = spans.filter(
    (span) =>
      span != null &&
      Number.isFinite(span.start) &&
      Number.isFinite(span.end) &&
      span.end >= span.start
  );

  candidates.sort((a, b) => {
    if (a.start !== b.start) return a.start - b.start;
    const lengthDiff = (b.end - b.start) - (a.end - a.start);
    if (lengthDiff !== 0) return lengthDiff;
    return confidenceOf(b) - confidenceOf(a);
  });

  const kept = [];
  let occupiedEnd = -1;
  for (const span of candidates) {
    if (span.start >= occupiedEnd) {
      kept.push(span);
      occupiedEnd = span.end;
      continue;
    }
    // Entirely inside the covered range: the earlier span wins.
    if (span.end <= occupiedEnd) continue;

    // Partial overlap: the more confident of the two survives.
    const last = kept[kept.length - 1];
    if (confidenceOf(span) > confidenceOf(last)) {
      kept[kept.length - 1] = span;
      occupiedEnd = span.end;
    }
  }

  // Descending order lets a consumer splice replacements from the end of the
  // text backwards without invalidating the offsets of earlier spans.
  return kept.sort((a, b) => b.start - a.start);
}
|
|
44226
|
+
/**
 * Rebuild `text` with each span's `[start, end)` range replaced by its
 * `maskedValue`.
 *
 * Replacements are applied from the highest `start` to the lowest so that a
 * replacement of a different length never shifts the offsets of spans that
 * are still to be applied. The spans are sorted here on a copy, so the
 * result no longer depends on the order the caller supplies them in. The
 * sort is stable, so input already in descending order is processed exactly
 * as given. Spans are assumed not to overlap.
 *
 * @param {string} text - Original text the span offsets refer to.
 * @param {Array<{start: number, end: number, maskedValue?: string}>} resolvedSpans
 *   Spans to apply; spans whose `maskedValue` is null/undefined are skipped.
 * @returns {string} The text with all replacements applied, or `text`
 *   unchanged when there is nothing to apply.
 */
function reconstruct(text, resolvedSpans) {
  if (!Array.isArray(resolvedSpans) || resolvedSpans.length === 0) return text;

  const ordered = [...resolvedSpans].sort((a, b) => b.start - a.start);

  let result = text;
  for (const span of ordered) {
    if (span.maskedValue == null) continue;
    result = result.slice(0, span.start) + span.maskedValue + result.slice(span.end);
  }
  return result;
}
|
|
44234
|
+
var init_span = __esm({
|
|
44235
|
+
"src/core/span.ts"() {
|
|
44236
|
+
}
|
|
44237
|
+
});
|
|
44238
|
+
|
|
44200
44239
|
// node_modules/delayed-stream/lib/delayed_stream.js
|
|
44201
44240
|
var require_delayed_stream = __commonJS({
|
|
44202
44241
|
"node_modules/delayed-stream/lib/delayed_stream.js"(exports2, module) {
|
|
@@ -58867,7 +58906,7 @@ var init_transformers_scanner = __esm({
|
|
|
58867
58906
|
confidence = Math.min(1, confidence + 0.2);
|
|
58868
58907
|
}
|
|
58869
58908
|
if (confidence >= confidenceThreshold && !looksLikeToken(val) && val.length > 1) {
|
|
58870
|
-
const token = await encodeFn(val);
|
|
58909
|
+
const token = await encodeFn(val, { entityType });
|
|
58871
58910
|
entities.push({
|
|
58872
58911
|
type: entityType,
|
|
58873
58912
|
value: val,
|
|
@@ -58974,17 +59013,18 @@ var init_scanner = __esm({
|
|
|
58974
59013
|
init_registry();
|
|
58975
59014
|
init_handlers();
|
|
58976
59015
|
init_scorer();
|
|
59016
|
+
init_span();
|
|
58977
59017
|
_dlpLanguageResolver = new LanguageContextResolver();
|
|
58978
59018
|
_dlpPatternRegistry = new DLPPatternRegistry();
|
|
58979
59019
|
_dlpValidationEngine = new DLPValidationEngine();
|
|
58980
59020
|
_dlpConfidenceScorer = new DLPConfidenceScorer();
|
|
58981
59021
|
REGEX_PATTERNS = {
|
|
58982
59022
|
"EMAIL_ADDRESS": /[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+/g,
|
|
58983
|
-
"PHONE_NUMBER":
|
|
58984
|
-
"PHONE_NUMBER_INTL":
|
|
58985
|
-
"US_SSN":
|
|
58986
|
-
"CREDIT_CARD": /(?:\d{4}[ \-]?){3}\d{4}/g,
|
|
58987
|
-
"US_ROUTING_NUMBER":
|
|
59023
|
+
"PHONE_NUMBER": /(?<!\d)(?:\+?1?[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}|\d{3}[\s\-.]?\d{4})(?!\d)/g,
|
|
59024
|
+
"PHONE_NUMBER_INTL": /(?<!\d)\+(?:[1-9]\d{0,3})[-.\s]?\(?\d{1,5}\)?(?:[-.\s]?\d{2,4}){2,4}(?!\d)/g,
|
|
59025
|
+
"US_SSN": /(?<!\d)\d{3}-\d{2}-\d{4}(?!\d)/g,
|
|
59026
|
+
"CREDIT_CARD": /(?<!\d)(?:\d{4}[ \-]?){3}\d{4}(?!\d)/g,
|
|
59027
|
+
"US_ROUTING_NUMBER": /(?<!\d)\d{9}(?!\d)/g,
|
|
58988
59028
|
"US_PASSPORT": /\b[A-Z]\d{8}\b/g,
|
|
58989
59029
|
"DATE_OF_BIRTH": /\b(?:0[1-9]|1[0-2])\/(?:0[1-9]|[12]\d|3[01])\/(?:19|20)\d{2}\b|\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])\b/g
|
|
58990
59030
|
};
|
|
@@ -59039,26 +59079,55 @@ var init_scanner = __esm({
|
|
|
59039
59079
|
const checksum = 3 * (d6[0] + d6[3] + d6[6]) + 7 * (d6[1] + d6[4] + d6[7]) + (d6[2] + d6[5] + d6[8]);
|
|
59040
59080
|
return checksum % 10 === 0;
|
|
59041
59081
|
}
|
|
59042
|
-
async
|
|
59082
|
+
async _tier0CollectSpans(text, confidenceThreshold) {
|
|
59043
59083
|
const detectedLanguage = _dlpLanguageResolver.resolve(text);
|
|
59044
|
-
const
|
|
59045
|
-
|
|
59046
|
-
|
|
59084
|
+
const spans = [];
|
|
59085
|
+
const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
|
|
59086
|
+
for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
|
|
59087
|
+
const megaRe = new RegExp(re.source, re.flags);
|
|
59047
59088
|
let m6;
|
|
59048
|
-
while ((m6 =
|
|
59089
|
+
while ((m6 = megaRe.exec(text)) !== null) {
|
|
59090
|
+
const groups = m6.groups ?? {};
|
|
59091
|
+
let typeTag;
|
|
59092
|
+
for (const name of typeOrder) {
|
|
59093
|
+
if (groups[name] !== void 0) {
|
|
59094
|
+
typeTag = name;
|
|
59095
|
+
break;
|
|
59096
|
+
}
|
|
59097
|
+
}
|
|
59098
|
+
if (!typeTag) continue;
|
|
59049
59099
|
const matchedStr = m6[0];
|
|
59050
59100
|
if (looksLikeToken(matchedStr)) continue;
|
|
59101
|
+
const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
|
|
59102
|
+
if (!descriptor) continue;
|
|
59051
59103
|
const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
|
|
59052
|
-
|
|
59053
|
-
|
|
59054
|
-
|
|
59055
|
-
|
|
59056
|
-
|
|
59057
|
-
|
|
59058
|
-
|
|
59059
|
-
}
|
|
59104
|
+
let conf;
|
|
59105
|
+
if (validatorResult === false) {
|
|
59106
|
+
if (descriptor.isHighEntropy) {
|
|
59107
|
+
conf = 0.85;
|
|
59108
|
+
} else {
|
|
59109
|
+
continue;
|
|
59110
|
+
}
|
|
59111
|
+
} else {
|
|
59112
|
+
conf = _dlpConfidenceScorer.score({
|
|
59113
|
+
baseRisk: descriptor.baseRisk,
|
|
59114
|
+
matchStart: m6.index,
|
|
59115
|
+
matchEnd: m6.index + matchedStr.length,
|
|
59116
|
+
fullText: text,
|
|
59117
|
+
proximityTerms: descriptor.proximityTerms,
|
|
59118
|
+
validatorPassed: validatorResult
|
|
59119
|
+
});
|
|
59120
|
+
}
|
|
59060
59121
|
if (conf >= confidenceThreshold) {
|
|
59061
|
-
|
|
59122
|
+
spans.push({
|
|
59123
|
+
start: m6.index,
|
|
59124
|
+
end: m6.index + matchedStr.length,
|
|
59125
|
+
entityType: typeTag,
|
|
59126
|
+
originalValue: matchedStr,
|
|
59127
|
+
confidence: conf,
|
|
59128
|
+
method: "dlp_heuristic",
|
|
59129
|
+
language: detectedLanguage
|
|
59130
|
+
});
|
|
59062
59131
|
}
|
|
59063
59132
|
}
|
|
59064
59133
|
}
|
|
@@ -59077,7 +59146,15 @@ var init_scanner = __esm({
|
|
|
59077
59146
|
validatorPassed: null
|
|
59078
59147
|
});
|
|
59079
59148
|
if (conf >= confidenceThreshold) {
|
|
59080
|
-
|
|
59149
|
+
spans.push({
|
|
59150
|
+
start: m6.index,
|
|
59151
|
+
end: m6.index + m6[0].length,
|
|
59152
|
+
entityType: "PERSON_NAME",
|
|
59153
|
+
originalValue: m6[0],
|
|
59154
|
+
confidence: conf,
|
|
59155
|
+
method: "dlp_heuristic",
|
|
59156
|
+
language: detectedLanguage
|
|
59157
|
+
});
|
|
59081
59158
|
}
|
|
59082
59159
|
}
|
|
59083
59160
|
}
|
|
@@ -59086,85 +59163,78 @@ var init_scanner = __esm({
|
|
|
59086
59163
|
let m6;
|
|
59087
59164
|
while ((m6 = re.exec(text)) !== null) {
|
|
59088
59165
|
if (looksLikeToken(m6[0])) continue;
|
|
59089
|
-
|
|
59090
|
-
|
|
59091
|
-
|
|
59092
|
-
|
|
59093
|
-
|
|
59094
|
-
|
|
59095
|
-
|
|
59096
|
-
|
|
59097
|
-
|
|
59098
|
-
occupiedEnd = hit.end;
|
|
59166
|
+
spans.push({
|
|
59167
|
+
start: m6.index,
|
|
59168
|
+
end: m6.index + m6[0].length,
|
|
59169
|
+
entityType: "PHYS_ADDRESS",
|
|
59170
|
+
originalValue: m6[0],
|
|
59171
|
+
confidence: 0.55,
|
|
59172
|
+
method: "dlp_heuristic",
|
|
59173
|
+
language: detectedLanguage
|
|
59174
|
+
});
|
|
59099
59175
|
}
|
|
59100
59176
|
}
|
|
59177
|
+
return spans;
|
|
59178
|
+
}
|
|
59179
|
+
/** Backward-compat wrapper — collects spans then single-pass encodes. */
|
|
59180
|
+
async _tier0Dlp(text, encodeFn, confidenceThreshold) {
|
|
59181
|
+
const spans = await this._tier0CollectSpans(text, confidenceThreshold);
|
|
59182
|
+
const resolved = resolveOverlaps(spans);
|
|
59101
59183
|
const entities = [];
|
|
59102
|
-
|
|
59103
|
-
|
|
59104
|
-
const token = await encodeFn(hit.val);
|
|
59105
|
-
excised = excised.slice(0, hit.start) + token + excised.slice(hit.end);
|
|
59184
|
+
await Promise.all(resolved.map(async (span) => {
|
|
59185
|
+
span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
|
|
59106
59186
|
entities.push({
|
|
59107
|
-
type:
|
|
59108
|
-
value:
|
|
59109
|
-
method:
|
|
59110
|
-
confidence:
|
|
59111
|
-
masked_value:
|
|
59112
|
-
language:
|
|
59187
|
+
type: span.entityType,
|
|
59188
|
+
value: span.originalValue,
|
|
59189
|
+
method: span.method,
|
|
59190
|
+
confidence: span.confidence,
|
|
59191
|
+
masked_value: span.maskedValue,
|
|
59192
|
+
language: span.language
|
|
59113
59193
|
});
|
|
59114
|
-
}
|
|
59115
|
-
return [
|
|
59194
|
+
}));
|
|
59195
|
+
return [reconstruct(text, resolved), entities];
|
|
59116
59196
|
}
|
|
59117
|
-
async
|
|
59118
|
-
|
|
59119
|
-
let excised = text;
|
|
59120
|
-
let allMatches = [];
|
|
59197
|
+
async _tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold) {
|
|
59198
|
+
const spans = [];
|
|
59121
59199
|
for (const [entityType, pattern] of Object.entries(REGEX_PATTERNS)) {
|
|
59122
59200
|
const re = new RegExp(pattern.source, pattern.flags);
|
|
59123
59201
|
let match;
|
|
59124
59202
|
while ((match = re.exec(text)) !== null) {
|
|
59125
|
-
|
|
59126
|
-
if (
|
|
59127
|
-
|
|
59128
|
-
|
|
59129
|
-
if (entityType === "
|
|
59130
|
-
|
|
59131
|
-
|
|
59132
|
-
|
|
59133
|
-
|
|
59203
|
+
const val = match[0];
|
|
59204
|
+
if (looksLikeToken(val)) continue;
|
|
59205
|
+
let confidence = aggressive || boostEntities.has(entityType.toLowerCase().replace(/_/g, " ")) ? 1 : 0.95;
|
|
59206
|
+
if (entityType === "CREDIT_CARD" && _BaseScanner._luhnChecksum(val)) confidence = Math.max(confidence, 0.99);
|
|
59207
|
+
if (entityType === "US_ROUTING_NUMBER" && !_BaseScanner._abaChecksum(val)) continue;
|
|
59208
|
+
if (confidence >= confidenceThreshold) {
|
|
59209
|
+
spans.push({
|
|
59210
|
+
start: match.index,
|
|
59211
|
+
end: match.index + val.length,
|
|
59212
|
+
entityType,
|
|
59213
|
+
originalValue: val,
|
|
59214
|
+
confidence,
|
|
59215
|
+
method: "regex"
|
|
59216
|
+
});
|
|
59134
59217
|
}
|
|
59135
|
-
allMatches.push({
|
|
59136
|
-
start: match.index,
|
|
59137
|
-
end: match.index + match[0].length,
|
|
59138
|
-
type: entityType,
|
|
59139
|
-
value: match[0],
|
|
59140
|
-
confidence
|
|
59141
|
-
});
|
|
59142
|
-
}
|
|
59143
|
-
}
|
|
59144
|
-
allMatches.sort((a6, b6) => a6.start - b6.start || b6.end - b6.start - (a6.end - a6.start));
|
|
59145
|
-
let filtered = [];
|
|
59146
|
-
let lastEnd = -1;
|
|
59147
|
-
for (const m6 of allMatches) {
|
|
59148
|
-
if (m6.start >= lastEnd) {
|
|
59149
|
-
filtered.push(m6);
|
|
59150
|
-
lastEnd = m6.end;
|
|
59151
59218
|
}
|
|
59152
59219
|
}
|
|
59153
|
-
|
|
59154
|
-
|
|
59155
|
-
|
|
59156
|
-
|
|
59157
|
-
|
|
59158
|
-
|
|
59159
|
-
|
|
59160
|
-
|
|
59161
|
-
|
|
59162
|
-
|
|
59163
|
-
|
|
59164
|
-
|
|
59165
|
-
|
|
59166
|
-
|
|
59167
|
-
|
|
59220
|
+
return spans;
|
|
59221
|
+
}
|
|
59222
|
+
/** Backward-compat wrapper. */
|
|
59223
|
+
async _tier1Regex(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
|
|
59224
|
+
const spans = await this._tier1CollectSpans(text, boostEntities, aggressive, confidenceThreshold);
|
|
59225
|
+
const resolved = resolveOverlaps(spans);
|
|
59226
|
+
const entities = [];
|
|
59227
|
+
await Promise.all(resolved.map(async (span) => {
|
|
59228
|
+
span.maskedValue = await encodeFn(span.originalValue, { entityType: span.entityType });
|
|
59229
|
+
entities.push({
|
|
59230
|
+
type: span.entityType,
|
|
59231
|
+
value: span.originalValue,
|
|
59232
|
+
method: span.method,
|
|
59233
|
+
confidence: span.confidence,
|
|
59234
|
+
masked_value: span.maskedValue
|
|
59235
|
+
});
|
|
59236
|
+
}));
|
|
59237
|
+
return [reconstruct(text, resolved), entities];
|
|
59168
59238
|
}
|
|
59169
59239
|
async _tier2Nlp(text, encodeFn, boostEntities, aggressive, confidenceThreshold) {
|
|
59170
59240
|
return [text, []];
|
|
@@ -59184,13 +59254,18 @@ var init_scanner = __esm({
|
|
|
59184
59254
|
const _encode = options.encodeFn || encode;
|
|
59185
59255
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
59186
59256
|
const boost = this._resolveBoost(options.context);
|
|
59187
|
-
|
|
59257
|
+
const allSpans = [];
|
|
59188
59258
|
if (pipeline.includes("dlp")) {
|
|
59189
|
-
|
|
59259
|
+
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59190
59260
|
}
|
|
59191
59261
|
if (pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59192
|
-
|
|
59262
|
+
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
59193
59263
|
}
|
|
59264
|
+
const resolved = resolveOverlaps(allSpans);
|
|
59265
|
+
await Promise.all(resolved.map(async (span) => {
|
|
59266
|
+
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
59267
|
+
}));
|
|
59268
|
+
let currentText = reconstruct(text, resolved);
|
|
59194
59269
|
if (pipeline.includes("nlp")) {
|
|
59195
59270
|
[currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
59196
59271
|
}
|
|
@@ -59202,20 +59277,29 @@ var init_scanner = __esm({
|
|
|
59202
59277
|
const _encode = options.encodeFn || encode;
|
|
59203
59278
|
const confidenceThreshold = options.confidenceThreshold ?? 0.7;
|
|
59204
59279
|
const boost = this._resolveBoost(options.context);
|
|
59205
|
-
|
|
59206
|
-
|
|
59280
|
+
const allEntities = [];
|
|
59281
|
+
const allSpans = [];
|
|
59207
59282
|
if (pipeline.includes("dlp")) {
|
|
59208
|
-
|
|
59209
|
-
remaining = newText;
|
|
59210
|
-
allEntities.push(...tier0);
|
|
59283
|
+
allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
|
|
59211
59284
|
}
|
|
59212
59285
|
if (pipeline.includes("regex") || pipeline.includes("checksum")) {
|
|
59213
|
-
|
|
59214
|
-
|
|
59215
|
-
|
|
59216
|
-
|
|
59286
|
+
allSpans.push(...await this._tier1CollectSpans(text, boost, !!options.aggressive, confidenceThreshold));
|
|
59287
|
+
}
|
|
59288
|
+
const resolved = resolveOverlaps(allSpans);
|
|
59289
|
+
await Promise.all(resolved.map(async (span) => {
|
|
59290
|
+
span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
|
|
59291
|
+
allEntities.push({
|
|
59292
|
+
type: span.entityType,
|
|
59293
|
+
value: span.originalValue,
|
|
59294
|
+
method: span.method,
|
|
59295
|
+
confidence: span.confidence,
|
|
59296
|
+
masked_value: span.maskedValue,
|
|
59297
|
+
language: span.language
|
|
59298
|
+
});
|
|
59299
|
+
}));
|
|
59300
|
+
const remaining = reconstruct(text, resolved);
|
|
59217
59301
|
if (pipeline.includes("nlp")) {
|
|
59218
|
-
const [
|
|
59302
|
+
const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
|
|
59219
59303
|
allEntities.push(...tier2);
|
|
59220
59304
|
}
|
|
59221
59305
|
return allEntities;
|