mask-privacy 3.4.0 → 3.5.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -388,7 +388,7 @@ declare class DLPPatternRegistry {
388
388
  getCategoryRegexesMap(locale?: string): Map<string, {
389
389
  re: RegExp;
390
390
  typeOrder: string[];
391
- }>;
391
+ }[]>;
392
392
  getCategoryTypeMap(categoryName: string, locale?: string): string[];
393
393
  private compileForLocale;
394
394
  private buildCatalogue;
@@ -481,7 +481,7 @@ declare class DLPConfidenceScorer {
481
481
  * Provides format-preserving encryption, local/distributed vaulting,
482
482
  * and framework-agnostic tool interception hooks.
483
483
  */
484
- declare const VERSION = "3.4.0";
484
+ declare const VERSION = "3.5.0";
485
485
 
486
486
  /**
487
487
  * Detect PII entities in text and return a list of objects with metadata.
package/dist/index.d.ts CHANGED
@@ -388,7 +388,7 @@ declare class DLPPatternRegistry {
388
388
  getCategoryRegexesMap(locale?: string): Map<string, {
389
389
  re: RegExp;
390
390
  typeOrder: string[];
391
- }>;
391
+ }[]>;
392
392
  getCategoryTypeMap(categoryName: string, locale?: string): string[];
393
393
  private compileForLocale;
394
394
  private buildCatalogue;
@@ -481,7 +481,7 @@ declare class DLPConfidenceScorer {
481
481
  * Provides format-preserving encryption, local/distributed vaulting,
482
482
  * and framework-agnostic tool interception hooks.
483
483
  */
484
- declare const VERSION = "3.4.0";
484
+ declare const VERSION = "3.5.0";
485
485
 
486
486
  /**
487
487
  * Detect PII entities in text and return a list of objects with metadata.
package/dist/index.js CHANGED
@@ -43855,7 +43855,8 @@ var init_registry = __esm({
43855
43855
  return this.localeCategoryRegexMap.get(locale);
43856
43856
  }
43857
43857
  getCategoryTypeMap(categoryName, locale = "en") {
43858
- return this.localeCategoryRegexMap.get(locale)?.get(categoryName)?.typeOrder ?? [];
43858
+ const groups = this.localeCategoryRegexMap.get(locale)?.get(categoryName) ?? [];
43859
+ return groups.flatMap((g6) => g6.typeOrder);
43859
43860
  }
43860
43861
  compileForLocale(locale) {
43861
43862
  const localePool = /* @__PURE__ */ new Map();
@@ -43874,20 +43875,38 @@ var init_registry = __esm({
43874
43875
  if (aVal !== bVal) return aVal - bVal;
43875
43876
  return b6.compiledRe.source.length - a6.compiledRe.source.length;
43876
43877
  });
43877
- const parts = [];
43878
- const typeOrder = [];
43878
+ const csParts = [];
43879
+ const csOrder = [];
43880
+ const ciParts = [];
43881
+ const ciOrder = [];
43879
43882
  for (const [typeName, desc] of entries) {
43880
- parts.push(`(?<${typeName}>${desc.compiledRe.source})`);
43881
- typeOrder.push(typeName);
43883
+ const named = `(?<${typeName}>${desc.compiledRe.source})`;
43884
+ if (desc.compiledRe.flags.includes("i")) {
43885
+ ciParts.push(named);
43886
+ ciOrder.push(typeName);
43887
+ } else {
43888
+ csParts.push(named);
43889
+ csOrder.push(typeName);
43890
+ }
43882
43891
  }
43883
- const combinedSource = parts.join("|");
43884
- const needsI = entries.some(([, d6]) => d6.compiledRe.flags.includes("i"));
43885
- const flags = needsI ? "gi" : "g";
43886
- try {
43887
- const re = new RegExp(combinedSource, flags);
43888
- categoryMap.set(catKey, { re, typeOrder });
43889
- } catch (err2) {
43890
- console.error(`[DLPPatternRegistry] Locale [${locale}] category [${catKey}] failed:`, err2);
43892
+ const groups = [];
43893
+ const subGroups = [
43894
+ [csParts, csOrder, "g"],
43895
+ [ciParts, ciOrder, "gi"]
43896
+ ];
43897
+ for (const [parts, order, flags] of subGroups) {
43898
+ if (parts.length === 0) continue;
43899
+ try {
43900
+ groups.push({ re: new RegExp(parts.join("|"), flags), typeOrder: order });
43901
+ } catch (err2) {
43902
+ console.error(
43903
+ `[DLPPatternRegistry] Locale [${locale}] category [${catKey}] (${flags}) failed:`,
43904
+ err2
43905
+ );
43906
+ }
43907
+ }
43908
+ if (groups.length > 0) {
43909
+ categoryMap.set(catKey, groups);
43891
43910
  }
43892
43911
  }
43893
43912
  this.localeCategoryRegexMap.set(locale, categoryMap);
@@ -59071,6 +59090,11 @@ var init_transformers_scanner = __esm({
59071
59090
  });
59072
59091
 
59073
59092
  // src/core/scanner.ts
59093
+ async function chunkEncode(items, fn) {
59094
+ for (let i6 = 0; i6 < items.length; i6 += CHUNK_SIZE) {
59095
+ await Promise.all(items.slice(i6, i6 + CHUNK_SIZE).map(fn));
59096
+ }
59097
+ }
59074
59098
  function getScanner() {
59075
59099
  if (scannerInstance === null) {
59076
59100
  const scannerType = config.MASK_SCANNER_TYPE;
@@ -59084,7 +59108,7 @@ function getScanner() {
59084
59108
  }
59085
59109
  return scannerInstance;
59086
59110
  }
59087
- var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer; exports.BaseScanner = void 0; exports.PresidioScanner = void 0; var scannerInstance;
59111
+ var _dlpLanguageResolver, _dlpPatternRegistry, _dlpValidationEngine, _dlpConfidenceScorer, CHUNK_SIZE; exports.BaseScanner = void 0; exports.PresidioScanner = void 0; var scannerInstance;
59088
59112
  var init_scanner = __esm({
59089
59113
  "src/core/scanner.ts"() {
59090
59114
  init_config();
@@ -59099,6 +59123,7 @@ var init_scanner = __esm({
59099
59123
  _dlpPatternRegistry = new exports.DLPPatternRegistry();
59100
59124
  _dlpValidationEngine = new exports.DLPValidationEngine();
59101
59125
  _dlpConfidenceScorer = new exports.DLPConfidenceScorer();
59126
+ CHUNK_SIZE = 50;
59102
59127
  exports.BaseScanner = class {
59103
59128
  constructor() {
59104
59129
  this._supportedEntities = [
@@ -59137,51 +59162,53 @@ var init_scanner = __esm({
59137
59162
  const detectedLanguage = _dlpLanguageResolver.resolve(text);
59138
59163
  const spans = [];
59139
59164
  const categoryMap = _dlpPatternRegistry.getCategoryRegexesMap();
59140
- for (const [catKey, { re, typeOrder }] of categoryMap.entries()) {
59141
- const megaRe = new RegExp(re.source, re.flags);
59142
- let m6;
59143
- while ((m6 = megaRe.exec(text)) !== null) {
59144
- const groups = m6.groups ?? {};
59145
- let typeTag;
59146
- for (const name of typeOrder) {
59147
- if (groups[name] !== void 0) {
59148
- typeTag = name;
59149
- break;
59165
+ for (const [catKey, groups] of categoryMap.entries()) {
59166
+ for (const { re, typeOrder } of groups) {
59167
+ const megaRe = new RegExp(re.source, re.flags);
59168
+ let m6;
59169
+ while ((m6 = megaRe.exec(text)) !== null) {
59170
+ const groups2 = m6.groups ?? {};
59171
+ let typeTag;
59172
+ for (const name of typeOrder) {
59173
+ if (groups2[name] !== void 0) {
59174
+ typeTag = name;
59175
+ break;
59176
+ }
59150
59177
  }
59151
- }
59152
- if (!typeTag) continue;
59153
- const matchedStr = m6[0];
59154
- if (looksLikeToken(matchedStr)) continue;
59155
- const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
59156
- if (!descriptor) continue;
59157
- const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
59158
- let conf;
59159
- if (validatorResult === false) {
59160
- if (descriptor.isHighEntropy) {
59161
- conf = 0.85;
59178
+ if (!typeTag) continue;
59179
+ const matchedStr = m6[0];
59180
+ if (looksLikeToken(matchedStr)) continue;
59181
+ const descriptor = _dlpPatternRegistry.descriptorFor(typeTag);
59182
+ if (!descriptor) continue;
59183
+ const validatorResult = _dlpValidationEngine.run(descriptor.validatorTag, matchedStr);
59184
+ let conf;
59185
+ if (validatorResult === false) {
59186
+ if (descriptor.isHighEntropy) {
59187
+ conf = 0.85;
59188
+ } else {
59189
+ continue;
59190
+ }
59162
59191
  } else {
59163
- continue;
59192
+ conf = _dlpConfidenceScorer.score({
59193
+ baseRisk: descriptor.baseRisk,
59194
+ matchStart: m6.index,
59195
+ matchEnd: m6.index + matchedStr.length,
59196
+ fullText: text,
59197
+ proximityTerms: descriptor.proximityTerms,
59198
+ validatorPassed: validatorResult
59199
+ });
59200
+ }
59201
+ if (conf >= confidenceThreshold) {
59202
+ spans.push({
59203
+ start: m6.index,
59204
+ end: m6.index + matchedStr.length,
59205
+ entityType: typeTag,
59206
+ originalValue: matchedStr,
59207
+ confidence: conf,
59208
+ method: "dlp_heuristic",
59209
+ language: detectedLanguage
59210
+ });
59164
59211
  }
59165
- } else {
59166
- conf = _dlpConfidenceScorer.score({
59167
- baseRisk: descriptor.baseRisk,
59168
- matchStart: m6.index,
59169
- matchEnd: m6.index + matchedStr.length,
59170
- fullText: text,
59171
- proximityTerms: descriptor.proximityTerms,
59172
- validatorPassed: validatorResult
59173
- });
59174
- }
59175
- if (conf >= confidenceThreshold) {
59176
- spans.push({
59177
- start: m6.index,
59178
- end: m6.index + matchedStr.length,
59179
- entityType: typeTag,
59180
- originalValue: matchedStr,
59181
- confidence: conf,
59182
- method: "dlp_heuristic",
59183
- language: detectedLanguage
59184
- });
59185
59212
  }
59186
59213
  }
59187
59214
  }
@@ -59261,11 +59288,11 @@ var init_scanner = __esm({
59261
59288
  }
59262
59289
  _resolveBoost(context) {
59263
59290
  if (!context) return /* @__PURE__ */ new Set();
59264
- const lowered = context.toLowerCase();
59265
59291
  const boosted = /* @__PURE__ */ new Set();
59266
59292
  for (const [, desc] of _dlpPatternRegistry.iterDescriptors()) {
59267
59293
  for (const term of desc.proximityTerms) {
59268
- if (lowered.includes(term)) {
59294
+ const pattern = new RegExp("\\b" + term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i");
59295
+ if (pattern.test(context)) {
59269
59296
  boosted.add(desc.category.toLowerCase());
59270
59297
  break;
59271
59298
  }
@@ -59284,9 +59311,9 @@ var init_scanner = __esm({
59284
59311
  allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59285
59312
  }
59286
59313
  const resolved = resolveOverlaps(allSpans);
59287
- await Promise.all(resolved.map(async (span) => {
59314
+ await chunkEncode(resolved, async (span) => {
59288
59315
  span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59289
- }));
59316
+ });
59290
59317
  let currentText = reconstruct(text, resolved);
59291
59318
  if (pipeline.includes("nlp")) {
59292
59319
  [currentText] = await this._tier2Nlp(currentText, _encode, boost, !!options.aggressive, confidenceThreshold);
@@ -59305,7 +59332,7 @@ var init_scanner = __esm({
59305
59332
  allSpans.push(...await this._tier0CollectSpans(text, confidenceThreshold));
59306
59333
  }
59307
59334
  const resolved = resolveOverlaps(allSpans);
59308
- await Promise.all(resolved.map(async (span) => {
59335
+ await chunkEncode(resolved, async (span) => {
59309
59336
  span.maskedValue = await _encode(span.originalValue, { entityType: span.entityType });
59310
59337
  allEntities.push({
59311
59338
  type: span.entityType,
@@ -59315,7 +59342,7 @@ var init_scanner = __esm({
59315
59342
  masked_value: span.maskedValue,
59316
59343
  language: span.language
59317
59344
  });
59318
- }));
59345
+ });
59319
59346
  const remaining = reconstruct(text, resolved);
59320
59347
  if (pipeline.includes("nlp")) {
59321
59348
  const [, tier2] = await this._tier2Nlp(remaining, _encode, boost, !!options.aggressive, confidenceThreshold);
@@ -59619,7 +59646,7 @@ init_handlers();
59619
59646
  init_scorer();
59620
59647
 
59621
59648
  // src/index.ts
59622
- var VERSION = "3.4.0";
59649
+ var VERSION = "3.5.0";
59623
59650
  async function detectEntitiesWithConfidence(text, options = {}) {
59624
59651
  const scanner = getScanner();
59625
59652
  return await scanner.scanAndReturnEntities(text, options);