@flexorch/audit 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -17,6 +17,12 @@ interface NoiseMetrics {
17
17
  garbage_ratio: number;
18
18
  encoding_ok: boolean;
19
19
  }
20
+ /**
21
+ * Fraction of lines that are blank or contain symbol noise (`[@#!~*=]{3,}`).
22
+ * Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
23
+ * indicate a document likely to reduce extraction quality.
24
+ */
25
+ declare function noiseRatio(text: string): number;
20
26
  declare function noiseMetrics(text: string): NoiseMetrics;
21
27
 
22
28
  type MaskStrategy = "redact" | "replace" | "token" | "hash";
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
30
36
  * import { readFileSync } from "fs"
31
37
  *
32
38
  * const text = readFileSync("contract.txt", "utf8")
33
- * const result = audit(text, { locale: "tr" })
39
+ * const result = audit(text) // locale defaults to "und" (all detectors)
40
+ * // or: audit(text, { locale: "tr" }) to run Turkish-only detectors
34
41
  *
35
- * result.quality_grade // "A"
36
- * result.quality_score // 0.91
37
- * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
42
+ * result.quality_grade // "A"
43
+ * result.quality_score // 0.91
44
+ * result.noise_ratio // 0.03 (line-level noise fraction)
45
+ * result.detected_language // "und" (locale passed in — caller controls language)
46
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
47
  *
39
48
  * // Raw findings and metrics also available:
40
49
  * result.pii // [{ type, value, start, end }, ...]
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
54
  * // "Contact: [REDACTED_EMAIL]"
46
55
  */
47
56
 
48
- declare const version = "0.3.1";
57
+ declare const version = "0.5.0";
49
58
  type QualityGrade = "A" | "B" | "C" | "D";
50
59
  interface PiiSummaryEntry {
51
60
  type: string;
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
54
63
  interface AuditOptions {
55
64
  /**
56
65
  * Active locale-specific detectors.
57
- * - "tr" Turkish: TCKN, VKN, phone_tr, name (default)
58
- * - "us" US: SSN, E.164 phone
59
- * - "eu" — EU: E.164 phone
60
- * - "all" All detectors
66
+ * - "und" All detectors combined (default; use when language is unknown)
67
+ * - "all" Alias for "und"
68
+ * - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
69
+ * - "us" US: SSN, EIN, ITIN, E.164 phone, company_name_intl
70
+ * - "eu" — EU: E.164 phone, iban_intl, company_name_intl
71
+ * - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
61
72
  *
62
73
  * Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
63
74
  */
@@ -74,6 +85,10 @@ interface AuditResult {
74
85
  pii: PiiFinding[];
75
86
  quality: QualityMetrics;
76
87
  noise: NoiseMetrics;
88
+ /** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
89
+ noise_ratio: number;
90
+ /** The locale value passed to audit(), or "und" when omitted — caller-controlled; no language detection is performed. */
91
+ detected_language: string;
77
92
  }
78
93
  interface MaskOptions {
79
94
  /** @default "redact" */
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
102
117
  */
103
118
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
104
119
 
105
- export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
120
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
package/dist/index.js CHANGED
@@ -1,7 +1,8 @@
1
1
  // src/pii.ts
2
2
  var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
3
- var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
3
+ var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
4
4
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
5
+ var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
5
6
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
6
7
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
7
8
  var _H = "[0-9a-fA-F]{1,4}";
@@ -12,6 +13,102 @@ var IPV6_RE = new RegExp(
12
13
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
13
14
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
14
15
  var VKN_RE = /\b([1-9]\d{9})\b/g;
16
+ var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
17
+ var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
18
+ var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
19
+ var COMPANY_NAME_TR_RE = new RegExp(
20
+ `(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
21
+ "gu"
22
+ );
23
+ var MERSIS_RE = /\b([1-9]\d{15})\b/g;
24
+ var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
25
+ var _TR_PROVINCES_SORTED = [
26
+ "Afyonkarahisar",
27
+ "Kahramanmara\u015F",
28
+ "K\u0131r\u0131kkale",
29
+ "K\u0131rklareli",
30
+ "Diyarbak\u0131r",
31
+ "Gaziantep",
32
+ "\u015Eanl\u0131urfa",
33
+ "Nev\u015Fehir",
34
+ "Kastamonu",
35
+ "G\xFCm\xFC\u015Fhane",
36
+ "Eski\u015Fehir",
37
+ "Erzincan",
38
+ "Erzurum",
39
+ "Denizli",
40
+ "\xC7anakkale",
41
+ "Ad\u0131yaman",
42
+ "Zonguldak",
43
+ "Tekirda\u011F",
44
+ "Trabzon",
45
+ "Tunceli",
46
+ "Karaman",
47
+ "Karab\xFCk",
48
+ "Aksaray",
49
+ "Antalya",
50
+ "K\u0131r\u015Fehir",
51
+ "Osmaniye",
52
+ "Kocaeli",
53
+ "Sakarya",
54
+ "Bart\u0131n",
55
+ "Bayburt",
56
+ "Ardahan",
57
+ "Yozgat",
58
+ "Ankara",
59
+ "Amasya",
60
+ "Artvin",
61
+ "Bal\u0131kesir",
62
+ "Bilecik",
63
+ "Bing\xF6l",
64
+ "Bitlis",
65
+ "Burdur",
66
+ "\xC7ank\u0131r\u0131",
67
+ "Edirne",
68
+ "Elaz\u0131\u011F",
69
+ "Giresun",
70
+ "Hakkari",
71
+ "Isparta",
72
+ "\u0130stanbul",
73
+ "\u0130zmir",
74
+ "Kayseri",
75
+ "K\xFCtahya",
76
+ "Malatya",
77
+ "Manisa",
78
+ "Mardin",
79
+ "Samsun",
80
+ "\u015E\u0131rnak",
81
+ "Sinop",
82
+ "Tokat",
83
+ "Hatay",
84
+ "Konya",
85
+ "Mu\u011Fla",
86
+ "Ni\u011Fde",
87
+ "Rize",
88
+ "Siirt",
89
+ "Sivas",
90
+ "Adana",
91
+ "Ayd\u0131n",
92
+ "Bursa",
93
+ "\xC7orum",
94
+ "I\u011Fd\u0131r",
95
+ "Kilis",
96
+ "Mersin",
97
+ "Batman",
98
+ "Yalova",
99
+ "D\xFCzce",
100
+ "Ordu",
101
+ "Kars",
102
+ "A\u011Fr\u0131",
103
+ "Bolu",
104
+ "Van",
105
+ "U\u015Fak",
106
+ "Mu\u015F"
107
+ ].sort((a, b) => b.length - a.length);
108
+ var PROVINCE_TR_RE = new RegExp(
109
+ `(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
110
+ "gu"
111
+ );
15
112
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
16
113
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
17
114
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -19,7 +116,64 @@ var NAME_RE = new RegExp(
19
116
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
20
117
  "gu"
21
118
  );
119
+ var _IBAN_INTL_LENGTHS = {
120
+ AT: 20,
121
+ BE: 16,
122
+ BG: 22,
123
+ HR: 21,
124
+ CY: 28,
125
+ CZ: 24,
126
+ DK: 18,
127
+ EE: 20,
128
+ FI: 18,
129
+ FR: 27,
130
+ DE: 22,
131
+ GR: 27,
132
+ HU: 28,
133
+ IE: 22,
134
+ IT: 27,
135
+ LV: 21,
136
+ LT: 20,
137
+ LU: 20,
138
+ MT: 31,
139
+ NL: 18,
140
+ PL: 28,
141
+ PT: 25,
142
+ RO: 24,
143
+ SK: 24,
144
+ SI: 19,
145
+ ES: 24,
146
+ SE: 24,
147
+ GB: 22,
148
+ CH: 21,
149
+ NO: 15
150
+ };
151
+ var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
152
+ var _UC = "[A-Z\xC0-\u024F]";
153
+ var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
154
+ var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
155
+ var COMPANY_NAME_INTL_RE = new RegExp(
156
+ `(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
157
+ "gu"
158
+ );
22
159
  var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
160
+ var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
161
+ var ITIN_US_RE = /\b(9\d{2}-(?:7[0-9]|8[0-8]|9[0-24-9])-\d{4})\b/g;
162
+ var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
163
+ var SVNR_DE_RE = /\b(\d{4}[01]\d[0-3]\d[A-Z]\d{4})\b/g;
164
+ var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
165
+ var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
166
+ var INSEE_FR_RE = /\b([12]\d{14})\b/g;
167
+ var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
168
+ var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
169
+ var BSN_NL_RE = /\b(\d{9})\b/g;
170
+ var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
171
+ var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
172
+ var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
173
+ var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
174
+ var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
175
+ var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
176
+ var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
23
177
  function validTckn(s) {
24
178
  if (s.length !== 11 || s[0] === "0") return false;
25
179
  const d = s.split("").map(Number);
@@ -70,19 +224,109 @@ function validIban(s) {
70
224
  }
71
225
  return remainder === 1;
72
226
  }
227
+ function validIbanIntl(s) {
228
+ const country = s.slice(0, 2);
229
+ if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
230
+ if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
231
+ return validIban(s);
232
+ }
233
+ function validPhoneIntl(raw) {
234
+ const digits = raw.replace(/\D/g, "");
235
+ return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
236
+ }
237
+ function validSteuerIdDe(s) {
238
+ if (s.length !== 11 || s[0] === "0") return false;
239
+ let product = 10;
240
+ for (let i = 0; i < 10; i++) {
241
+ let total = (parseInt(s[i]) + product) % 10;
242
+ if (total === 0) total = 10;
243
+ product = total * 2 % 11;
244
+ }
245
+ let check = 11 - product;
246
+ if (check === 10) check = 0;
247
+ return check === parseInt(s[10]);
248
+ }
249
+ function validPartitaIvaIt(s) {
250
+ if (s.length !== 11 || !/^\d+$/.test(s)) return false;
251
+ let oddSum = 0;
252
+ let evenSum = 0;
253
+ for (let i = 0; i < 10; i += 2) oddSum += parseInt(s[i]);
254
+ for (let i = 1; i < 10; i += 2) {
255
+ let v = parseInt(s[i]) * 2;
256
+ evenSum += v < 10 ? v : v - 9;
257
+ }
258
+ return (10 - (oddSum + evenSum) % 10) % 10 === parseInt(s[10]);
259
+ }
260
+ function validBsnNl(s) {
261
+ if (s.length !== 9 || !/^\d+$/.test(s)) return false;
262
+ let total = 0;
263
+ for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
264
+ total -= parseInt(s[8]);
265
+ return total > 0 && total % 11 === 0;
266
+ }
267
+ function validDniEs(s) {
268
+ if (s.length !== 9 || !/^\d{8}/.test(s)) return false;
269
+ return _DNI_LETTER_TABLE[parseInt(s.slice(0, 8)) % 23] === s[8];
270
+ }
271
+ function validNieEs(s) {
272
+ if (s.length !== 9 || !"XYZ".includes(s[0])) return false;
273
+ const prefix = { X: "0", Y: "1", Z: "2" }[s[0]];
274
+ return _DNI_LETTER_TABLE[parseInt(prefix + s.slice(1, 8)) % 23] === s[8];
275
+ }
276
+ var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set(["BG", "GB", "KN", "NK", "NT", "TN", "ZZ"]);
277
+ function validNiUk(s) {
278
+ return !_NI_UK_FORBIDDEN.has(s.slice(0, 2).toUpperCase());
279
+ }
280
+ var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
281
+ "00",
282
+ "07",
283
+ "08",
284
+ "09",
285
+ "17",
286
+ "18",
287
+ "19",
288
+ "28",
289
+ "29",
290
+ "49",
291
+ "69",
292
+ "70",
293
+ "78",
294
+ "79",
295
+ "89",
296
+ "96",
297
+ "97"
298
+ ]);
299
+ function validEinUs(s) {
300
+ return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
301
+ }
73
302
  var LOCALE_DETECTORS = {
74
- tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
75
- us: /* @__PURE__ */ new Set(["ssn", "phone"]),
76
- eu: /* @__PURE__ */ new Set(["phone"])
303
+ tr: /* @__PURE__ */ new Set([
304
+ "national_id_tr",
305
+ "tax_id_tr",
306
+ "phone_tr",
307
+ "name",
308
+ "iban_tr",
309
+ "company_name_tr",
310
+ "mersis_no",
311
+ "postal_code_tr",
312
+ "province_tr"
313
+ ]),
314
+ us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
315
+ eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
316
+ de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
317
+ fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
318
+ it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
319
+ nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
320
+ es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
321
+ uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
77
322
  };
78
323
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
79
324
  function activeDetectors(locale) {
80
- if (locale === "all") {
325
+ if (locale === "all" || locale === "und") {
81
326
  const active2 = new Set(UNIVERSAL);
82
327
  for (const detectors of Object.values(LOCALE_DETECTORS)) {
83
328
  detectors.forEach((d) => active2.add(d));
84
329
  }
85
- if (active2.has("phone_tr")) active2.delete("phone");
86
330
  return active2;
87
331
  }
88
332
  const active = new Set(UNIVERSAL);
@@ -98,18 +342,18 @@ function findAll(re, text, type) {
98
342
  }
99
343
  return results;
100
344
  }
101
- function detectPii(text, locale = "tr") {
345
+ function detectPii(text, locale = "und") {
102
346
  const active = activeDetectors(locale);
103
347
  const t = text ?? "";
104
- const findings = [];
348
+ let findings = [];
105
349
  if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
106
- if (active.has("phone")) {
350
+ if (active.has("phone_intl")) {
107
351
  PHONE_INTL_RE.lastIndex = 0;
108
352
  let m;
109
353
  while ((m = PHONE_INTL_RE.exec(t)) !== null) {
110
- const digits = m[0].replace(/\D/g, "").length;
111
- if (digits >= 10) {
112
- findings.push({ type: "phone", value: m[0], start: m.index, end: m.index + m[0].length });
354
+ const candidate = m[1];
355
+ if (validPhoneIntl(candidate)) {
356
+ findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
113
357
  }
114
358
  }
115
359
  }
@@ -162,8 +406,180 @@ function detectPii(text, locale = "tr") {
162
406
  findings.push({ type: "name", value, start, end: start + value.length });
163
407
  }
164
408
  }
409
+ if (active.has("iban_tr")) {
410
+ IBAN_TR_RE.lastIndex = 0;
411
+ let m;
412
+ while ((m = IBAN_TR_RE.exec(t)) !== null) {
413
+ if (validIban(m[0])) {
414
+ findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
415
+ }
416
+ }
417
+ }
418
+ if (active.has("company_name_tr")) {
419
+ COMPANY_NAME_TR_RE.lastIndex = 0;
420
+ let m;
421
+ while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
422
+ findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
423
+ }
424
+ }
425
+ if (active.has("mersis_no")) {
426
+ MERSIS_RE.lastIndex = 0;
427
+ let m;
428
+ while ((m = MERSIS_RE.exec(t)) !== null) {
429
+ findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
430
+ }
431
+ }
432
+ if (active.has("postal_code_tr")) {
433
+ POSTAL_CODE_TR_RE.lastIndex = 0;
434
+ let m;
435
+ while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
436
+ findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
437
+ }
438
+ }
439
+ if (active.has("province_tr")) {
440
+ PROVINCE_TR_RE.lastIndex = 0;
441
+ let m;
442
+ while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
443
+ findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
444
+ }
445
+ }
165
446
  if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
166
- return findings.sort((a, b) => a.start - b.start);
447
+ if (active.has("tax_id_us")) {
448
+ EIN_US_RE.lastIndex = 0;
449
+ let m;
450
+ while ((m = EIN_US_RE.exec(t)) !== null) {
451
+ if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
452
+ }
453
+ }
454
+ if (active.has("national_id_us")) {
455
+ ITIN_US_RE.lastIndex = 0;
456
+ let m;
457
+ while ((m = ITIN_US_RE.exec(t)) !== null) {
458
+ findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
459
+ }
460
+ }
461
+ if (active.has("tax_id_de")) {
462
+ STEUER_ID_DE_RE.lastIndex = 0;
463
+ let m;
464
+ while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
465
+ if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
466
+ }
467
+ }
468
+ if (active.has("social_id_de")) {
469
+ SVNR_DE_RE.lastIndex = 0;
470
+ let m;
471
+ while ((m = SVNR_DE_RE.exec(t)) !== null) {
472
+ findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
473
+ }
474
+ }
475
+ if (active.has("siret_fr")) {
476
+ SIRET_FR_RE.lastIndex = 0;
477
+ let m;
478
+ while ((m = SIRET_FR_RE.exec(t)) !== null) {
479
+ findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
480
+ }
481
+ }
482
+ if (active.has("company_id_fr")) {
483
+ SIREN_FR_RE.lastIndex = 0;
484
+ let m;
485
+ while ((m = SIREN_FR_RE.exec(t)) !== null) {
486
+ findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
487
+ }
488
+ }
489
+ if (active.has("social_id_fr")) {
490
+ INSEE_FR_RE.lastIndex = 0;
491
+ let m;
492
+ while ((m = INSEE_FR_RE.exec(t)) !== null) {
493
+ findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
494
+ }
495
+ }
496
+ if (active.has("national_id_it")) {
497
+ CODICE_FISCALE_IT_RE.lastIndex = 0;
498
+ let m;
499
+ while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
500
+ findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
501
+ }
502
+ }
503
+ if (active.has("tax_id_it")) {
504
+ PARTITA_IVA_IT_RE.lastIndex = 0;
505
+ let m;
506
+ while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
507
+ if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
508
+ }
509
+ }
510
+ if (active.has("national_id_nl")) {
511
+ BSN_NL_RE.lastIndex = 0;
512
+ let m;
513
+ while ((m = BSN_NL_RE.exec(t)) !== null) {
514
+ if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
515
+ }
516
+ }
517
+ if (active.has("company_id_nl")) {
518
+ KVK_NL_RE.lastIndex = 0;
519
+ let m;
520
+ while ((m = KVK_NL_RE.exec(t)) !== null) {
521
+ findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
522
+ }
523
+ }
524
+ if (active.has("national_id_es")) {
525
+ DNI_ES_RE.lastIndex = 0;
526
+ let m;
527
+ while ((m = DNI_ES_RE.exec(t)) !== null) {
528
+ if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
529
+ }
530
+ NIE_ES_RE.lastIndex = 0;
531
+ while ((m = NIE_ES_RE.exec(t)) !== null) {
532
+ if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
533
+ }
534
+ }
535
+ if (active.has("tax_id_es")) {
536
+ CIF_ES_RE.lastIndex = 0;
537
+ let m;
538
+ while ((m = CIF_ES_RE.exec(t)) !== null) {
539
+ findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
540
+ }
541
+ }
542
+ if (active.has("social_id_uk")) {
543
+ NI_UK_RE.lastIndex = 0;
544
+ let m;
545
+ while ((m = NI_UK_RE.exec(t)) !== null) {
546
+ if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
547
+ }
548
+ }
549
+ if (active.has("tax_id_uk")) {
550
+ UTR_UK_RE.lastIndex = 0;
551
+ let m;
552
+ while ((m = UTR_UK_RE.exec(t)) !== null) {
553
+ findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
554
+ }
555
+ }
556
+ if (active.has("iban_intl")) {
557
+ IBAN_INTL_RE.lastIndex = 0;
558
+ let m;
559
+ while ((m = IBAN_INTL_RE.exec(t)) !== null) {
560
+ const candidate = m[1];
561
+ if (validIbanIntl(candidate)) {
562
+ findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
563
+ }
564
+ }
565
+ }
566
+ if (active.has("company_name_intl")) {
567
+ COMPANY_NAME_INTL_RE.lastIndex = 0;
568
+ let m;
569
+ while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
570
+ findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
571
+ }
572
+ }
573
+ findings.sort((a, b) => a.start - b.start);
574
+ const specificIbanSpans = new Set(
575
+ findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
576
+ );
577
+ if (specificIbanSpans.size > 0) {
578
+ findings = findings.filter(
579
+ (f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
580
+ );
581
+ }
582
+ return findings;
167
583
  }
168
584
 
169
585
  // src/quality.ts
@@ -185,6 +601,15 @@ function isGarbage(ch) {
185
601
  return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
186
602
  cp >= 55296 && cp <= 57343;
187
603
  }
604
+ var LINE_NOISE_RE = /[@#!~*=]{3,}/;
605
+ function noiseRatio(text) {
606
+ if (!text) return 0;
607
+ const lines = text.split("\n");
608
+ const total = lines.length;
609
+ if (total === 0) return 0;
610
+ const noisy = lines.filter((line) => !line.trim() || LINE_NOISE_RE.test(line)).length;
611
+ return Math.round(noisy / total * 1e4) / 1e4;
612
+ }
188
613
  function noiseMetrics(text) {
189
614
  if (!text) return { garbage_ratio: 0, encoding_ok: true };
190
615
  const n = text.length;
@@ -241,7 +666,7 @@ function applyMask(text, findings, strategy = "redact") {
241
666
  }
242
667
 
243
668
  // src/index.ts
244
- var version = "0.3.1";
669
+ var version = "0.5.0";
245
670
  function computeQualityScore(completeness, avgLength, garbageRatio) {
246
671
  const lengthScore = Math.min(avgLength / 500, 1);
247
672
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
@@ -254,10 +679,11 @@ function computeQualityGrade(score) {
254
679
  return "D";
255
680
  }
256
681
  function audit(text, options = {}) {
257
- const locale = options.locale ?? "tr";
682
+ const locale = options.locale ?? "und";
258
683
  const pii = detectPii(text, locale);
259
684
  const quality = qualityMetrics(text);
260
685
  const noise = noiseMetrics(text);
686
+ const noise_ratio = noiseRatio(text);
261
687
  const quality_score = computeQualityScore(
262
688
  quality.completeness,
263
689
  quality.avg_length,
@@ -267,7 +693,7 @@ function audit(text, options = {}) {
267
693
  const counts = /* @__PURE__ */ new Map();
268
694
  for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
269
695
  const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
270
- return { quality_grade, quality_score, pii_summary, pii, quality, noise };
696
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
271
697
  }
272
698
  function auditBatch(texts, options = {}) {
273
699
  if (texts.length === 0) {
@@ -298,6 +724,7 @@ export {
298
724
  detectPii,
299
725
  mask,
300
726
  noiseMetrics,
727
+ noiseRatio,
301
728
  qualityMetrics,
302
729
  version
303
730
  };