@flexorch/audit 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -26,6 +26,7 @@ __export(index_exports, {
26
26
  detectPii: () => detectPii,
27
27
  mask: () => mask,
28
28
  noiseMetrics: () => noiseMetrics,
29
+ noiseRatio: () => noiseRatio,
29
30
  qualityMetrics: () => qualityMetrics,
30
31
  version: () => version
31
32
  });
@@ -33,8 +34,9 @@ module.exports = __toCommonJS(index_exports);
33
34
 
34
35
  // src/pii.ts
35
36
  var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
36
- var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
37
+ var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
37
38
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
39
+ var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
38
40
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
39
41
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
40
42
  var _H = "[0-9a-fA-F]{1,4}";
@@ -45,6 +47,102 @@ var IPV6_RE = new RegExp(
45
47
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
46
48
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
47
49
  var VKN_RE = /\b([1-9]\d{9})\b/g;
50
+ var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
51
+ var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
52
+ var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
53
+ var COMPANY_NAME_TR_RE = new RegExp(
54
+ `(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
55
+ "gu"
56
+ );
57
+ var MERSIS_RE = /\b([1-9]\d{15})\b/g;
58
+ var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
59
+ var _TR_PROVINCES_SORTED = [
60
+ "Afyonkarahisar",
61
+ "Kahramanmara\u015F",
62
+ "K\u0131r\u0131kkale",
63
+ "K\u0131rklareli",
64
+ "Diyarbak\u0131r",
65
+ "Gaziantep",
66
+ "\u015Eanl\u0131urfa",
67
+ "Nev\u015Fehir",
68
+ "Kastamonu",
69
+ "G\xFCm\xFC\u015Fhane",
70
+ "Eski\u015Fehir",
71
+ "Erzincan",
72
+ "Erzurum",
73
+ "Denizli",
74
+ "\xC7anakkale",
75
+ "Ad\u0131yaman",
76
+ "Zonguldak",
77
+ "Tekirda\u011F",
78
+ "Trabzon",
79
+ "Tunceli",
80
+ "Karaman",
81
+ "Karab\xFCk",
82
+ "Aksaray",
83
+ "Antalya",
84
+ "K\u0131r\u015Fehir",
85
+ "Osmaniye",
86
+ "Kocaeli",
87
+ "Sakarya",
88
+ "Bart\u0131n",
89
+ "Bayburt",
90
+ "Ardahan",
91
+ "Yozgat",
92
+ "Ankara",
93
+ "Amasya",
94
+ "Artvin",
95
+ "Bal\u0131kesir",
96
+ "Bilecik",
97
+ "Bing\xF6l",
98
+ "Bitlis",
99
+ "Burdur",
100
+ "\xC7ank\u0131r\u0131",
101
+ "Edirne",
102
+ "Elaz\u0131\u011F",
103
+ "Giresun",
104
+ "Hakkari",
105
+ "Isparta",
106
+ "\u0130stanbul",
107
+ "\u0130zmir",
108
+ "Kayseri",
109
+ "K\xFCtahya",
110
+ "Malatya",
111
+ "Manisa",
112
+ "Mardin",
113
+ "Samsun",
114
+ "\u015E\u0131rnak",
115
+ "Sinop",
116
+ "Tokat",
117
+ "Hatay",
118
+ "Konya",
119
+ "Mu\u011Fla",
120
+ "Ni\u011Fde",
121
+ "Rize",
122
+ "Siirt",
123
+ "Sivas",
124
+ "Adana",
125
+ "Ayd\u0131n",
126
+ "Bursa",
127
+ "\xC7orum",
128
+ "I\u011Fd\u0131r",
129
+ "Kilis",
130
+ "Mersin",
131
+ "Batman",
132
+ "Yalova",
133
+ "D\xFCzce",
134
+ "Ordu",
135
+ "Kars",
136
+ "A\u011Fr\u0131",
137
+ "Bolu",
138
+ "Van",
139
+ "U\u015Fak",
140
+ "Mu\u015F"
141
+ ].sort((a, b) => b.length - a.length);
142
+ var PROVINCE_TR_RE = new RegExp(
143
+ `(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
144
+ "gu"
145
+ );
48
146
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
49
147
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
50
148
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -52,7 +150,64 @@ var NAME_RE = new RegExp(
52
150
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
53
151
  "gu"
54
152
  );
153
+ var _IBAN_INTL_LENGTHS = {
154
+ AT: 20,
155
+ BE: 16,
156
+ BG: 22,
157
+ HR: 21,
158
+ CY: 28,
159
+ CZ: 24,
160
+ DK: 18,
161
+ EE: 20,
162
+ FI: 18,
163
+ FR: 27,
164
+ DE: 22,
165
+ GR: 27,
166
+ HU: 28,
167
+ IE: 22,
168
+ IT: 27,
169
+ LV: 21,
170
+ LT: 20,
171
+ LU: 20,
172
+ MT: 31,
173
+ NL: 18,
174
+ PL: 28,
175
+ PT: 25,
176
+ RO: 24,
177
+ SK: 24,
178
+ SI: 19,
179
+ ES: 24,
180
+ SE: 24,
181
+ GB: 22,
182
+ CH: 21,
183
+ NO: 15
184
+ };
185
+ var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
186
+ var _UC = "[A-Z\xC0-\u024F]";
187
+ var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
188
+ var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
189
+ var COMPANY_NAME_INTL_RE = new RegExp(
190
+ `(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
191
+ "gu"
192
+ );
55
193
  var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
194
+ var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
195
+ var ITIN_US_RE = /\b(9\d{2}-(?:7[0-9]|8[0-8]|9[0-24-9])-\d{4})\b/g;
196
+ var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
197
+ var SVNR_DE_RE = /\b(\d{4}[01]\d[0-3]\d[A-Z]\d{4})\b/g;
198
+ var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
199
+ var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
200
+ var INSEE_FR_RE = /\b([12]\d{14})\b/g;
201
+ var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
202
+ var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
203
+ var BSN_NL_RE = /\b(\d{9})\b/g;
204
+ var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
205
+ var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
206
+ var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
207
+ var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
208
+ var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
209
+ var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
210
+ var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
56
211
  function validTckn(s) {
57
212
  if (s.length !== 11 || s[0] === "0") return false;
58
213
  const d = s.split("").map(Number);
@@ -103,19 +258,109 @@ function validIban(s) {
103
258
  }
104
259
  return remainder === 1;
105
260
  }
261
/**
 * Validates a non-Turkish IBAN candidate.
 *
 * The candidate is rejected when its two-letter prefix is "TR" or is not a
 * key of `_IBAN_INTL_LENGTHS`, or when its total length differs from the
 * length registered for that prefix. Otherwise the result of the shared
 * `validIban` check is returned.
 *
 * @param s Candidate IBAN string (no separators expected).
 * @returns `false` for unsupported or wrongly sized input, else `validIban(s)`.
 */
function validIbanIntl(s) {
  const countryCode = s.slice(0, 2);
  const supported = countryCode !== "TR" && countryCode in _IBAN_INTL_LENGTHS;
  if (!supported) return false;
  const expectedLength = _IBAN_INTL_LENGTHS[countryCode];
  if (s.length !== expectedLength) return false;
  return validIban(s);
}
267
/**
 * Decides whether a raw international phone match is plausible.
 *
 * All non-digit characters are stripped first; the remaining digit string
 * must be 7 to 15 digits long and must not begin with "90".
 * NOTE(review): the "90" exclusion presumably leaves Turkish numbers to the
 * dedicated `phone_tr` detector — confirm against the detector design.
 *
 * @param raw Matched text, including "+", spaces, dashes, dots or parentheses.
 * @returns `true` when the digit count and prefix rules are satisfied.
 */
function validPhoneIntl(raw) {
  const onlyDigits = raw.replace(/\D/g, "");
  const count = onlyDigits.length;
  if (count < 7 || count > 15) return false;
  return !onlyDigits.startsWith("90");
}
271
/**
 * Validates a German tax identification number (Steuer-ID candidate).
 *
 * Three checks are applied, in order:
 *  1. Shape: exactly 11 ASCII digits, the first of which is not "0".
 *  2. Structure: among the first ten digits exactly one digit value occurs
 *     more than once, and it occurs two or three times. A checksum-only
 *     test accepts roughly one in ten arbitrary 11-digit numbers, so this
 *     rule is what keeps unrelated 11-digit identifiers from being reported.
 *  3. Check digit: the iterative "mod 11, 10" scheme over the first ten
 *     digits must reproduce the eleventh digit.
 *
 * @param s Candidate string.
 * @returns `true` only when all three checks pass.
 */
function validSteuerIdDe(s) {
  if (!/^[1-9]\d{10}$/.test(s)) return false;

  // Count occurrences of each digit in the ten payload digits.
  const occurrences = new Map();
  for (const ch of s.slice(0, 10)) {
    occurrences.set(ch, (occurrences.get(ch) ?? 0) + 1);
  }
  const repeated = [...occurrences.values()].filter((n) => n > 1);
  if (repeated.length !== 1 || repeated[0] > 3) return false;

  // Iterative mod 11,10 check digit.
  let product = 10;
  for (let i = 0; i < 10; i++) {
    let total = (Number(s[i]) + product) % 10;
    if (total === 0) total = 10;
    product = (total * 2) % 11;
  }
  // product is always in 1..10 here, so 11 - product is in 1..10; 10 maps to 0.
  const check = (11 - product) % 10;
  return check === Number(s[10]);
}
283
/**
 * Validates an Italian Partita IVA candidate (11 digits).
 *
 * The first ten digits are summed Luhn-style: digits at even indices
 * (0, 2, 4, ...) are added as-is, digits at odd indices are doubled and
 * reduced by 9 when the doubled value reaches 10. The eleventh digit must
 * equal `(10 - sum % 10) % 10`.
 *
 * The all-zero string satisfies that arithmetic trivially, so it is
 * rejected explicitly; otherwise any run of eleven zeros in a document
 * would be reported as a tax ID.
 *
 * @param s Candidate string.
 * @returns `true` when `s` is 11 digits, not all zeros, and the check digit matches.
 */
function validPartitaIvaIt(s) {
  if (!/^\d{11}$/.test(s)) return false;
  if (s === "00000000000") return false;

  let sum = 0;
  for (let i = 0; i < 10; i++) {
    const digit = Number(s[i]);
    if (i % 2 === 0) {
      sum += digit;
    } else {
      const doubled = digit * 2;
      sum += doubled < 10 ? doubled : doubled - 9;
    }
  }
  return (10 - (sum % 10)) % 10 === Number(s[10]);
}
294
/**
 * Applies the weighted "11-test" to a 9-digit Dutch BSN candidate.
 *
 * The first eight digits are weighted 9 down to 2 and summed, the ninth
 * digit is subtracted, and the result must be a positive multiple of 11
 * (the positivity requirement rules out the all-zero string).
 *
 * @param s Candidate string.
 * @returns `true` when `s` is exactly nine digits and passes the test.
 */
function validBsnNl(s) {
  const wellFormed = s.length === 9 && /^\d+$/.test(s);
  if (!wellFormed) return false;

  const digits = Array.from(s, (ch) => parseInt(ch));
  const weighted = digits
    .slice(0, 8)
    .reduce((acc, digit, idx) => acc + (9 - idx) * digit, 0);
  const balance = weighted - digits[8];
  return balance > 0 && balance % 11 === 0;
}
301
/**
 * Checks the control letter of a Spanish DNI candidate.
 *
 * The candidate must be nine characters long and start with eight digits.
 * The numeric part modulo 23 indexes `_DNI_LETTER_TABLE`; the letter found
 * there must equal the ninth character.
 *
 * @param s Candidate string (8 digits followed by one letter).
 * @returns `true` when the trailing letter matches the table entry.
 */
function validDniEs(s) {
  const wellFormed = s.length === 9 && /^\d{8}/.test(s);
  if (!wellFormed) return false;
  const numericPart = parseInt(s.slice(0, 8));
  const expectedLetter = _DNI_LETTER_TABLE[numericPart % 23];
  return expectedLetter === s[8];
}
305
/**
 * Checks the control letter of a Spanish NIE candidate.
 *
 * The leading letter X, Y or Z is replaced by the digit 0, 1 or 2
 * respectively, the seven following characters are appended, and the
 * resulting number modulo 23 indexes `_DNI_LETTER_TABLE`. The letter found
 * there must equal the ninth character.
 *
 * @param s Candidate string (X/Y/Z, seven digits, one letter).
 * @returns `true` when the length, leading letter and control letter all match.
 */
function validNieEs(s) {
  if (s.length !== 9) return false;
  // Position in "XYZ" doubles as the substitute digit (X -> 0, Y -> 1, Z -> 2).
  const leadingDigit = "XYZ".indexOf(s[0]);
  if (leadingDigit < 0) return false;
  const numericPart = parseInt(String(leadingDigit) + s.slice(1, 8));
  return _DNI_LETTER_TABLE[numericPart % 23] === s[8];
}
310
// Two-letter prefixes that disqualify a UK National Insurance number match.
var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set([
  "BG",
  "GB",
  "KN",
  "NK",
  "NT",
  "TN",
  "ZZ"
]);

/**
 * Rejects National Insurance number candidates whose first two letters
 * (compared case-insensitively) appear in `_NI_UK_FORBIDDEN`.
 *
 * @param s Candidate string; only its first two characters are inspected.
 * @returns `true` when the prefix is not in the forbidden set.
 */
function validNiUk(s) {
  const prefix = s.slice(0, 2).toUpperCase();
  return _NI_UK_FORBIDDEN.has(prefix) === false;
}
314
// Two-digit EIN prefixes that cause a candidate to be rejected.
// NOTE(review): presumably these are prefixes never assigned by the IRS — confirm against the IRS prefix list.
var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
  "00", "07", "08", "09",
  "17", "18", "19",
  "28", "29",
  "49",
  "69",
  "70", "78", "79",
  "89",
  "96", "97"
]);

/**
 * Accepts a US EIN candidate unless its first two characters are listed in
 * `_EIN_INVALID_PREFIXES`.
 *
 * @param s Candidate string in `NN-NNNNNNN` form; only the prefix is inspected.
 * @returns `true` when the prefix is not in the invalid set.
 */
function validEinUs(s) {
  const prefix = s.slice(0, 2);
  if (_EIN_INVALID_PREFIXES.has(prefix)) return false;
  return true;
}
106
336
  var LOCALE_DETECTORS = {
107
- tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
108
- us: /* @__PURE__ */ new Set(["ssn", "phone"]),
109
- eu: /* @__PURE__ */ new Set(["phone"])
337
+ tr: /* @__PURE__ */ new Set([
338
+ "national_id_tr",
339
+ "tax_id_tr",
340
+ "phone_tr",
341
+ "name",
342
+ "iban_tr",
343
+ "company_name_tr",
344
+ "mersis_no",
345
+ "postal_code_tr",
346
+ "province_tr"
347
+ ]),
348
+ us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
349
+ eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
350
+ de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
351
+ fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
352
+ it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
353
+ nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
354
+ es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
355
+ uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
110
356
  };
111
357
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
112
358
  function activeDetectors(locale) {
113
- if (locale === "all") {
359
+ if (locale === "all" || locale === "und") {
114
360
  const active2 = new Set(UNIVERSAL);
115
361
  for (const detectors of Object.values(LOCALE_DETECTORS)) {
116
362
  detectors.forEach((d) => active2.add(d));
117
363
  }
118
- if (active2.has("phone_tr")) active2.delete("phone");
119
364
  return active2;
120
365
  }
121
366
  const active = new Set(UNIVERSAL);
@@ -131,18 +376,18 @@ function findAll(re, text, type) {
131
376
  }
132
377
  return results;
133
378
  }
134
- function detectPii(text, locale = "tr") {
379
+ function detectPii(text, locale = "und") {
135
380
  const active = activeDetectors(locale);
136
381
  const t = text ?? "";
137
- const findings = [];
382
+ let findings = [];
138
383
  if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
139
- if (active.has("phone")) {
384
+ if (active.has("phone_intl")) {
140
385
  PHONE_INTL_RE.lastIndex = 0;
141
386
  let m;
142
387
  while ((m = PHONE_INTL_RE.exec(t)) !== null) {
143
- const digits = m[0].replace(/\D/g, "").length;
144
- if (digits >= 10) {
145
- findings.push({ type: "phone", value: m[0], start: m.index, end: m.index + m[0].length });
388
+ const candidate = m[1];
389
+ if (validPhoneIntl(candidate)) {
390
+ findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
146
391
  }
147
392
  }
148
393
  }
@@ -195,8 +440,180 @@ function detectPii(text, locale = "tr") {
195
440
  findings.push({ type: "name", value, start, end: start + value.length });
196
441
  }
197
442
  }
443
+ if (active.has("iban_tr")) {
444
+ IBAN_TR_RE.lastIndex = 0;
445
+ let m;
446
+ while ((m = IBAN_TR_RE.exec(t)) !== null) {
447
+ if (validIban(m[0])) {
448
+ findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
449
+ }
450
+ }
451
+ }
452
+ if (active.has("company_name_tr")) {
453
+ COMPANY_NAME_TR_RE.lastIndex = 0;
454
+ let m;
455
+ while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
456
+ findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
457
+ }
458
+ }
459
+ if (active.has("mersis_no")) {
460
+ MERSIS_RE.lastIndex = 0;
461
+ let m;
462
+ while ((m = MERSIS_RE.exec(t)) !== null) {
463
+ findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
464
+ }
465
+ }
466
+ if (active.has("postal_code_tr")) {
467
+ POSTAL_CODE_TR_RE.lastIndex = 0;
468
+ let m;
469
+ while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
470
+ findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
471
+ }
472
+ }
473
+ if (active.has("province_tr")) {
474
+ PROVINCE_TR_RE.lastIndex = 0;
475
+ let m;
476
+ while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
477
+ findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
478
+ }
479
+ }
198
480
  if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
199
- return findings.sort((a, b) => a.start - b.start);
481
+ if (active.has("tax_id_us")) {
482
+ EIN_US_RE.lastIndex = 0;
483
+ let m;
484
+ while ((m = EIN_US_RE.exec(t)) !== null) {
485
+ if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
486
+ }
487
+ }
488
+ if (active.has("national_id_us")) {
489
+ ITIN_US_RE.lastIndex = 0;
490
+ let m;
491
+ while ((m = ITIN_US_RE.exec(t)) !== null) {
492
+ findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
493
+ }
494
+ }
495
+ if (active.has("tax_id_de")) {
496
+ STEUER_ID_DE_RE.lastIndex = 0;
497
+ let m;
498
+ while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
499
+ if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
500
+ }
501
+ }
502
+ if (active.has("social_id_de")) {
503
+ SVNR_DE_RE.lastIndex = 0;
504
+ let m;
505
+ while ((m = SVNR_DE_RE.exec(t)) !== null) {
506
+ findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
507
+ }
508
+ }
509
+ if (active.has("siret_fr")) {
510
+ SIRET_FR_RE.lastIndex = 0;
511
+ let m;
512
+ while ((m = SIRET_FR_RE.exec(t)) !== null) {
513
+ findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
514
+ }
515
+ }
516
+ if (active.has("company_id_fr")) {
517
+ SIREN_FR_RE.lastIndex = 0;
518
+ let m;
519
+ while ((m = SIREN_FR_RE.exec(t)) !== null) {
520
+ findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
521
+ }
522
+ }
523
+ if (active.has("social_id_fr")) {
524
+ INSEE_FR_RE.lastIndex = 0;
525
+ let m;
526
+ while ((m = INSEE_FR_RE.exec(t)) !== null) {
527
+ findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
528
+ }
529
+ }
530
+ if (active.has("national_id_it")) {
531
+ CODICE_FISCALE_IT_RE.lastIndex = 0;
532
+ let m;
533
+ while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
534
+ findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
535
+ }
536
+ }
537
+ if (active.has("tax_id_it")) {
538
+ PARTITA_IVA_IT_RE.lastIndex = 0;
539
+ let m;
540
+ while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
541
+ if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
542
+ }
543
+ }
544
+ if (active.has("national_id_nl")) {
545
+ BSN_NL_RE.lastIndex = 0;
546
+ let m;
547
+ while ((m = BSN_NL_RE.exec(t)) !== null) {
548
+ if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
549
+ }
550
+ }
551
+ if (active.has("company_id_nl")) {
552
+ KVK_NL_RE.lastIndex = 0;
553
+ let m;
554
+ while ((m = KVK_NL_RE.exec(t)) !== null) {
555
+ findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
556
+ }
557
+ }
558
+ if (active.has("national_id_es")) {
559
+ DNI_ES_RE.lastIndex = 0;
560
+ let m;
561
+ while ((m = DNI_ES_RE.exec(t)) !== null) {
562
+ if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
563
+ }
564
+ NIE_ES_RE.lastIndex = 0;
565
+ while ((m = NIE_ES_RE.exec(t)) !== null) {
566
+ if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
567
+ }
568
+ }
569
+ if (active.has("tax_id_es")) {
570
+ CIF_ES_RE.lastIndex = 0;
571
+ let m;
572
+ while ((m = CIF_ES_RE.exec(t)) !== null) {
573
+ findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
574
+ }
575
+ }
576
+ if (active.has("social_id_uk")) {
577
+ NI_UK_RE.lastIndex = 0;
578
+ let m;
579
+ while ((m = NI_UK_RE.exec(t)) !== null) {
580
+ if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
581
+ }
582
+ }
583
+ if (active.has("tax_id_uk")) {
584
+ UTR_UK_RE.lastIndex = 0;
585
+ let m;
586
+ while ((m = UTR_UK_RE.exec(t)) !== null) {
587
+ findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
588
+ }
589
+ }
590
+ if (active.has("iban_intl")) {
591
+ IBAN_INTL_RE.lastIndex = 0;
592
+ let m;
593
+ while ((m = IBAN_INTL_RE.exec(t)) !== null) {
594
+ const candidate = m[1];
595
+ if (validIbanIntl(candidate)) {
596
+ findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
597
+ }
598
+ }
599
+ }
600
+ if (active.has("company_name_intl")) {
601
+ COMPANY_NAME_INTL_RE.lastIndex = 0;
602
+ let m;
603
+ while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
604
+ findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
605
+ }
606
+ }
607
+ findings.sort((a, b) => a.start - b.start);
608
+ const specificIbanSpans = new Set(
609
+ findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
610
+ );
611
+ if (specificIbanSpans.size > 0) {
612
+ findings = findings.filter(
613
+ (f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
614
+ );
615
+ }
616
+ return findings;
200
617
  }
201
618
 
202
619
  // src/quality.ts
@@ -218,6 +635,15 @@ function isGarbage(ch) {
218
635
  return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
219
636
  cp >= 55296 && cp <= 57343;
220
637
  }
638
// A line is "noisy" when it contains three or more consecutive symbols from this set.
var LINE_NOISE_RE = /[@#!~*=]{3,}/;

/**
 * Computes the fraction of lines that are blank (empty after trimming) or
 * contain a run matched by `LINE_NOISE_RE`.
 *
 * Lines are obtained by splitting on "\n", so text ending in a newline
 * contributes one trailing empty line that counts as blank. The result is
 * rounded to four decimal places.
 *
 * @param text Input text; any falsy value yields 0.
 * @returns A ratio between 0 and 1.
 */
function noiseRatio(text) {
  if (!text) return 0;

  // split() always yields at least one element, so the divisor is never zero.
  let lineCount = 0;
  let noisyCount = 0;
  for (const line of text.split("\n")) {
    lineCount += 1;
    if (line.trim() === "" || LINE_NOISE_RE.test(line)) noisyCount += 1;
  }
  return Math.round((noisyCount / lineCount) * 1e4) / 1e4;
}
221
647
  function noiseMetrics(text) {
222
648
  if (!text) return { garbage_ratio: 0, encoding_ok: true };
223
649
  const n = text.length;
@@ -274,7 +700,7 @@ function applyMask(text, findings, strategy = "redact") {
274
700
  }
275
701
 
276
702
  // src/index.ts
277
- var version = "0.3.1";
703
+ var version = "0.5.0";
278
704
  function computeQualityScore(completeness, avgLength, garbageRatio) {
279
705
  const lengthScore = Math.min(avgLength / 500, 1);
280
706
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
@@ -287,10 +713,11 @@ function computeQualityGrade(score) {
287
713
  return "D";
288
714
  }
289
715
  function audit(text, options = {}) {
290
- const locale = options.locale ?? "tr";
716
+ const locale = options.locale ?? "und";
291
717
  const pii = detectPii(text, locale);
292
718
  const quality = qualityMetrics(text);
293
719
  const noise = noiseMetrics(text);
720
+ const noise_ratio = noiseRatio(text);
294
721
  const quality_score = computeQualityScore(
295
722
  quality.completeness,
296
723
  quality.avg_length,
@@ -300,7 +727,7 @@ function audit(text, options = {}) {
300
727
  const counts = /* @__PURE__ */ new Map();
301
728
  for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
302
729
  const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
303
- return { quality_grade, quality_score, pii_summary, pii, quality, noise };
730
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
304
731
  }
305
732
  function auditBatch(texts, options = {}) {
306
733
  if (texts.length === 0) {
@@ -332,6 +759,7 @@ function mask(text, findings, options = {}) {
332
759
  detectPii,
333
760
  mask,
334
761
  noiseMetrics,
762
+ noiseRatio,
335
763
  qualityMetrics,
336
764
  version
337
765
  });
package/dist/index.d.cts CHANGED
@@ -17,6 +17,12 @@ interface NoiseMetrics {
17
17
  garbage_ratio: number;
18
18
  encoding_ok: boolean;
19
19
  }
20
+ /**
21
+ * Fraction of lines that are blank or contain symbol noise (`[@#!~*=]{3,}`).
22
+ * Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
23
+ * indicate a document likely to reduce extraction quality.
24
+ */
25
+ declare function noiseRatio(text: string): number;
20
26
  declare function noiseMetrics(text: string): NoiseMetrics;
21
27
 
22
28
  type MaskStrategy = "redact" | "replace" | "token" | "hash";
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
30
36
  * import { readFileSync } from "fs"
31
37
  *
32
38
  * const text = readFileSync("contract.txt", "utf8")
33
- * const result = audit(text, { locale: "tr" })
39
+ * const result = audit(text) // locale defaults to "und" (all detectors)
40
+ * const trResult = audit(text, { locale: "tr" }) // Turkish-only detectors
34
41
  *
35
- * result.quality_grade // "A"
36
- * result.quality_score // 0.91
37
- * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
42
+ * result.quality_grade // "A"
43
+ * result.quality_score // 0.91
44
+ * result.noise_ratio // 0.03 (line-level noise fraction)
45
+ * result.detected_language // "und" (locale passed in — caller controls language)
46
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
47
  *
39
48
  * // Raw findings and metrics also available:
40
49
  * result.pii // [{ type, value, start, end }, ...]
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
54
  * // "Contact: [REDACTED_EMAIL]"
46
55
  */
47
56
 
48
- declare const version = "0.3.1";
57
+ declare const version = "0.5.0";
49
58
  type QualityGrade = "A" | "B" | "C" | "D";
50
59
  interface PiiSummaryEntry {
51
60
  type: string;
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
54
63
  interface AuditOptions {
55
64
  /**
56
65
  * Active locale-specific detectors.
57
- * - "tr" Turkish: TCKN, VKN, phone_tr, name (default)
58
- * - "us" US: SSN, E.164 phone
59
- * - "eu" — EU: E.164 phone
60
- * - "all" All detectors
66
+ * - "und" All detectors combined (default; use when language is unknown)
67
+ * - "all" Alias for "und"
68
+ * - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
69
+ * - "us" US: SSN, EIN, ITIN, E.164 phone, company_name_intl
70
+ * - "eu" — EU: E.164 phone, iban_intl, company_name_intl
71
+ * - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
61
72
  *
62
73
  * Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
63
74
  */
@@ -74,6 +85,10 @@ interface AuditResult {
74
85
  pii: PiiFinding[];
75
86
  quality: QualityMetrics;
76
87
  noise: NoiseMetrics;
88
+ /** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
89
+ noise_ratio: number;
90
+ /** The locale value passed to audit() — caller-controlled language selection. */
91
+ detected_language: string;
77
92
  }
78
93
  interface MaskOptions {
79
94
  /** @default "redact" */
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
102
117
  */
103
118
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
104
119
 
105
- export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
120
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };