@flexorch/audit 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +275 -148
- package/dist/index.cjs +444 -16
- package/dist/index.d.cts +25 -10
- package/dist/index.d.ts +25 -10
- package/dist/index.js +443 -16
- package/package.json +49 -49
package/dist/index.cjs
CHANGED
|
@@ -26,6 +26,7 @@ __export(index_exports, {
|
|
|
26
26
|
detectPii: () => detectPii,
|
|
27
27
|
mask: () => mask,
|
|
28
28
|
noiseMetrics: () => noiseMetrics,
|
|
29
|
+
noiseRatio: () => noiseRatio,
|
|
29
30
|
qualityMetrics: () => qualityMetrics,
|
|
30
31
|
version: () => version
|
|
31
32
|
});
|
|
@@ -33,8 +34,9 @@ module.exports = __toCommonJS(index_exports);
|
|
|
33
34
|
|
|
34
35
|
// src/pii.ts
|
|
35
36
|
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
|
|
36
|
-
var PHONE_INTL_RE =
|
|
37
|
+
// International phone candidate: a "+" followed by a non-zero digit, then 5-18
// characters drawn from digits, whitespace, "-", "." and parentheses, ending on
// a digit. The lookbehind rejects a match preceded by "+" or a digit and the
// lookahead rejects one followed by a digit, so a match never starts or stops
// in the middle of a longer number. Capture group 1 is the whole candidate.
var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
|
|
37
38
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
39
|
+
var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
|
|
38
40
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
39
41
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
40
42
|
var _H = "[0-9a-fA-F]{1,4}";
|
|
@@ -45,6 +47,102 @@ var IPV6_RE = new RegExp(
|
|
|
45
47
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
46
48
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
47
49
|
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
50
|
+
var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
|
|
51
|
+
var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
|
|
52
|
+
var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
|
|
53
|
+
var COMPANY_NAME_TR_RE = new RegExp(
|
|
54
|
+
`(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
|
|
55
|
+
"gu"
|
|
56
|
+
);
|
|
57
|
+
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
58
|
+
// Five-digit postal code whose first two digits are in the range 01-81
// (01-09, 10-79, 80, 81), followed by any three digits.
// NOTE(review): the 01-81 prefix presumably corresponds to Turkish province
// codes - confirm. No further validation is applied to matches.
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
59
|
+
var _TR_PROVINCES_SORTED = [
|
|
60
|
+
"Afyonkarahisar",
|
|
61
|
+
"Kahramanmara\u015F",
|
|
62
|
+
"K\u0131r\u0131kkale",
|
|
63
|
+
"K\u0131rklareli",
|
|
64
|
+
"Diyarbak\u0131r",
|
|
65
|
+
"Gaziantep",
|
|
66
|
+
"\u015Eanl\u0131urfa",
|
|
67
|
+
"Nev\u015Fehir",
|
|
68
|
+
"Kastamonu",
|
|
69
|
+
"G\xFCm\xFC\u015Fhane",
|
|
70
|
+
"Eski\u015Fehir",
|
|
71
|
+
"Erzincan",
|
|
72
|
+
"Erzurum",
|
|
73
|
+
"Denizli",
|
|
74
|
+
"\xC7anakkale",
|
|
75
|
+
"Ad\u0131yaman",
|
|
76
|
+
"Zonguldak",
|
|
77
|
+
"Tekirda\u011F",
|
|
78
|
+
"Trabzon",
|
|
79
|
+
"Tunceli",
|
|
80
|
+
"Karaman",
|
|
81
|
+
"Karab\xFCk",
|
|
82
|
+
"Aksaray",
|
|
83
|
+
"Antalya",
|
|
84
|
+
"K\u0131r\u015Fehir",
|
|
85
|
+
"Osmaniye",
|
|
86
|
+
"Kocaeli",
|
|
87
|
+
"Sakarya",
|
|
88
|
+
"Bart\u0131n",
|
|
89
|
+
"Bayburt",
|
|
90
|
+
"Ardahan",
|
|
91
|
+
"Yozgat",
|
|
92
|
+
"Ankara",
|
|
93
|
+
"Amasya",
|
|
94
|
+
"Artvin",
|
|
95
|
+
"Bal\u0131kesir",
|
|
96
|
+
"Bilecik",
|
|
97
|
+
"Bing\xF6l",
|
|
98
|
+
"Bitlis",
|
|
99
|
+
"Burdur",
|
|
100
|
+
"\xC7ank\u0131r\u0131",
|
|
101
|
+
"Edirne",
|
|
102
|
+
"Elaz\u0131\u011F",
|
|
103
|
+
"Giresun",
|
|
104
|
+
"Hakkari",
|
|
105
|
+
"Isparta",
|
|
106
|
+
"\u0130stanbul",
|
|
107
|
+
"\u0130zmir",
|
|
108
|
+
"Kayseri",
|
|
109
|
+
"K\xFCtahya",
|
|
110
|
+
"Malatya",
|
|
111
|
+
"Manisa",
|
|
112
|
+
"Mardin",
|
|
113
|
+
"Samsun",
|
|
114
|
+
"\u015E\u0131rnak",
|
|
115
|
+
"Sinop",
|
|
116
|
+
"Tokat",
|
|
117
|
+
"Hatay",
|
|
118
|
+
"Konya",
|
|
119
|
+
"Mu\u011Fla",
|
|
120
|
+
"Ni\u011Fde",
|
|
121
|
+
"Rize",
|
|
122
|
+
"Siirt",
|
|
123
|
+
"Sivas",
|
|
124
|
+
"Adana",
|
|
125
|
+
"Ayd\u0131n",
|
|
126
|
+
"Bursa",
|
|
127
|
+
"\xC7orum",
|
|
128
|
+
"I\u011Fd\u0131r",
|
|
129
|
+
"Kilis",
|
|
130
|
+
"Mersin",
|
|
131
|
+
"Batman",
|
|
132
|
+
"Yalova",
|
|
133
|
+
"D\xFCzce",
|
|
134
|
+
"Ordu",
|
|
135
|
+
"Kars",
|
|
136
|
+
"A\u011Fr\u0131",
|
|
137
|
+
"Bolu",
|
|
138
|
+
"Van",
|
|
139
|
+
"U\u015Fak",
|
|
140
|
+
"Mu\u015F"
|
|
141
|
+
].sort((a, b) => b.length - a.length);
|
|
142
|
+
var PROVINCE_TR_RE = new RegExp(
|
|
143
|
+
`(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
|
|
144
|
+
"gu"
|
|
145
|
+
);
|
|
48
146
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
49
147
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
50
148
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -52,7 +150,64 @@ var NAME_RE = new RegExp(
|
|
|
52
150
|
`(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
|
|
53
151
|
"gu"
|
|
54
152
|
);
|
|
153
|
+
var _IBAN_INTL_LENGTHS = {
|
|
154
|
+
AT: 20,
|
|
155
|
+
BE: 16,
|
|
156
|
+
BG: 22,
|
|
157
|
+
HR: 21,
|
|
158
|
+
CY: 28,
|
|
159
|
+
CZ: 24,
|
|
160
|
+
DK: 18,
|
|
161
|
+
EE: 20,
|
|
162
|
+
FI: 18,
|
|
163
|
+
FR: 27,
|
|
164
|
+
DE: 22,
|
|
165
|
+
GR: 27,
|
|
166
|
+
HU: 28,
|
|
167
|
+
IE: 22,
|
|
168
|
+
IT: 27,
|
|
169
|
+
LV: 21,
|
|
170
|
+
LT: 20,
|
|
171
|
+
LU: 20,
|
|
172
|
+
MT: 31,
|
|
173
|
+
NL: 18,
|
|
174
|
+
PL: 28,
|
|
175
|
+
PT: 25,
|
|
176
|
+
RO: 24,
|
|
177
|
+
SK: 24,
|
|
178
|
+
SI: 19,
|
|
179
|
+
ES: 24,
|
|
180
|
+
SE: 24,
|
|
181
|
+
GB: 22,
|
|
182
|
+
CH: 21,
|
|
183
|
+
NO: 15
|
|
184
|
+
};
|
|
185
|
+
var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
|
|
186
|
+
var _UC = "[A-Z\xC0-\u024F]";
|
|
187
|
+
var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
|
|
188
|
+
var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
|
|
189
|
+
var COMPANY_NAME_INTL_RE = new RegExp(
|
|
190
|
+
`(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
|
|
191
|
+
"gu"
|
|
192
|
+
);
|
|
55
193
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
194
|
+
var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
|
|
195
|
+
// US ITIN: 9XX-GG-XXXX. The area always starts with 9 and the middle group GG
// must fall in one of the IRS-issued ranges 50-65, 70-88, 90-92 or 94-99.
// Capture group 1 is the whole identifier.
var ITIN_US_RE = /\b(9\d{2}-(?:5\d|6[0-5]|7\d|8[0-8]|9[0-24-9])-\d{4})\b/g;
|
|
196
|
+
var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
|
|
197
|
+
var SVNR_DE_RE = /\b(\d{4}[01]\d[0-3]\d[A-Z]\d{4})\b/g;
|
|
198
|
+
var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
|
|
199
|
+
var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
|
|
200
|
+
var INSEE_FR_RE = /\b([12]\d{14})\b/g;
|
|
201
|
+
var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
|
|
202
|
+
var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
|
|
203
|
+
var BSN_NL_RE = /\b(\d{9})\b/g;
|
|
204
|
+
var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
|
|
205
|
+
var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
|
|
206
|
+
var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
|
|
207
|
+
var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
|
|
208
|
+
var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
|
|
209
|
+
var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
|
|
210
|
+
var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
|
|
56
211
|
function validTckn(s) {
|
|
57
212
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
58
213
|
const d = s.split("").map(Number);
|
|
@@ -103,19 +258,109 @@ function validIban(s) {
|
|
|
103
258
|
}
|
|
104
259
|
return remainder === 1;
|
|
105
260
|
}
|
|
261
|
+
/**
 * Decides whether an IBAN candidate is a valid non-Turkish IBAN.
 *
 * @param s Candidate string; the first two characters are read as the country code.
 * @returns true only when the country is listed in _IBAN_INTL_LENGTHS, the
 *   string has exactly the length registered for that country, and validIban(s)
 *   accepts it.
 */
function validIbanIntl(s) {
|
|
262
|
+
const country = s.slice(0, 2);
|
|
263
|
+
// "TR" is rejected explicitly even before the table lookup.
// NOTE(review): presumably Turkish IBANs are reported by the separate iban_tr detector - confirm.
if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
|
|
264
|
+
// Each listed country has one fixed total IBAN length.
if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
|
|
265
|
+
// Final decision is delegated to the shared IBAN validator defined earlier in the file.
return validIban(s);
|
|
266
|
+
}
|
|
267
|
+
/**
 * Sanity-checks an international phone candidate by its digit count.
 *
 * @param raw Matched text; may contain "+", spaces, dashes, dots and parentheses.
 * @returns true when the candidate contains 7 to 15 digits and those digits do
 *   not begin with "90".
 */
function validPhoneIntl(raw) {
|
|
268
|
+
// Strip everything that is not a digit (including the leading "+").
const digits = raw.replace(/\D/g, "");
|
|
269
|
+
// NOTE(review): the "90" exclusion presumably leaves Turkish (+90) numbers to the phone_tr detector - confirm.
return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
|
|
270
|
+
}
|
|
271
|
+
/**
 * Verifies the check digit of an 11-character German tax ID candidate.
 *
 * The first ten digits are folded into a running "product"; the expected check
 * digit derived from it is compared with the eleventh digit.
 * NOTE(review): the procedure appears to be the iterative ISO 7064 MOD 11,10
 * scheme - confirm against the official Steuer-ID specification.
 *
 * @param s Candidate string (the calling regex supplies 11 digits).
 * @returns true when the length is 11, the first character is not "0" and the
 *   computed check digit equals the last digit.
 */
function validSteuerIdDe(s) {
|
|
272
|
+
if (s.length !== 11 || s[0] === "0") return false;
|
|
273
|
+
let product = 10;
|
|
274
|
+
for (let i = 0; i < 10; i++) {
|
|
275
|
+
let total = (parseInt(s[i]) + product) % 10;
|
|
276
|
+
// A zero sum is replaced by 10 so the running product can never become 0.
if (total === 0) total = 10;
|
|
277
|
+
product = total * 2 % 11;
|
|
278
|
+
}
|
|
279
|
+
let check = 11 - product;
|
|
280
|
+
// A computed value of 10 is represented by the digit 0.
if (check === 10) check = 0;
|
|
281
|
+
return check === parseInt(s[10]);
|
|
282
|
+
}
|
|
283
|
+
/**
 * Validates an Italian Partita IVA (VAT number) candidate.
 *
 * The number must be exactly 11 digits, its 7-digit company identifier (the
 * leading seven digits) must not be all zeros, and the eleventh digit must be
 * the Luhn check digit of the first ten.
 *
 * The all-zero identifier check matters because the Luhn sum of
 * "00000000000" is 0, so the checksum alone would accept a run of zeros and
 * report it as a tax ID.
 *
 * @param s Candidate string.
 * @returns true when the candidate passes all three checks.
 */
function validPartitaIvaIt(s) {
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
  // The company identifier can never be 0000000.
  if (/^0{7}/.test(s)) return false;
  let oddSum = 0;
  let evenSum = 0;
  // Digits at 1-based odd positions (indices 0, 2, 4, 6, 8) are added as-is.
  for (let i = 0; i < 10; i += 2) oddSum += parseInt(s[i], 10);
  // Digits at 1-based even positions are doubled; two-digit results are
  // reduced by 9 (equivalent to summing their digits).
  for (let i = 1; i < 10; i += 2) {
    const doubled = parseInt(s[i], 10) * 2;
    evenSum += doubled < 10 ? doubled : doubled - 9;
  }
  return (10 - (oddSum + evenSum) % 10) % 10 === parseInt(s[10], 10);
}
|
|
294
|
+
/**
 * Applies a weighted mod-11 check to a 9-digit Dutch BSN candidate.
 *
 * @param s Candidate string.
 * @returns true when s is exactly nine digits and the weighted total is
 *   positive and divisible by 11.
 */
function validBsnNl(s) {
|
|
295
|
+
if (s.length !== 9 || !/^\d+$/.test(s)) return false;
|
|
296
|
+
let total = 0;
|
|
297
|
+
// The first eight digits carry weights 9, 8, ..., 2.
for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
|
|
298
|
+
// The ninth digit carries weight -1.
total -= parseInt(s[8]);
|
|
299
|
+
// total > 0 additionally rejects inputs whose weighted sum is zero or negative (e.g. all zeros).
return total > 0 && total % 11 === 0;
|
|
300
|
+
}
|
|
301
|
+
/**
 * Verifies the control letter of a Spanish DNI candidate (8 digits + 1 letter).
 *
 * @param s Candidate string.
 * @returns true when s has length 9, starts with eight digits, and its ninth
 *   character equals _DNI_LETTER_TABLE[number % 23].
 */
function validDniEs(s) {
|
|
302
|
+
if (s.length !== 9 || !/^\d{8}/.test(s)) return false;
|
|
303
|
+
// The control letter is looked up by the 8-digit number modulo 23.
return _DNI_LETTER_TABLE[parseInt(s.slice(0, 8)) % 23] === s[8];
|
|
304
|
+
}
|
|
305
|
+
/**
 * Verifies the control letter of a Spanish NIE candidate (X/Y/Z + 7 digits + letter).
 *
 * @param s Candidate string.
 * @returns true when s has length 9, starts with X, Y or Z, and its last
 *   character matches the _DNI_LETTER_TABLE entry for the derived number.
 */
function validNieEs(s) {
|
|
306
|
+
if (s.length !== 9 || !"XYZ".includes(s[0])) return false;
|
|
307
|
+
// The leading letter is replaced by a digit (X -> 0, Y -> 1, Z -> 2) so the
// DNI modulo-23 lookup can be reused on the resulting 8-digit number.
const prefix = { X: "0", Y: "1", Z: "2" }[s[0]];
|
|
308
|
+
// NOTE(review): parseInt stops at the first non-digit, so this relies on the
// caller (NIE_ES_RE) having already guaranteed seven digits - confirm no other callers.
return _DNI_LETTER_TABLE[parseInt(prefix + s.slice(1, 8)) % 23] === s[8];
|
|
309
|
+
}
|
|
310
|
+
// Two-letter prefixes that validNiUk rejects for a UK National Insurance number candidate.
var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set(["BG", "GB", "KN", "NK", "NT", "TN", "ZZ"]);
|
|
311
|
+
/**
 * Rejects UK National Insurance number candidates with a forbidden prefix.
 *
 * Only the first two characters are inspected (upper-cased before the lookup);
 * the overall shape is not re-checked here.
 *
 * @param s Candidate string.
 * @returns true unless the first two characters are in _NI_UK_FORBIDDEN.
 */
function validNiUk(s) {
|
|
312
|
+
return !_NI_UK_FORBIDDEN.has(s.slice(0, 2).toUpperCase());
|
|
313
|
+
}
|
|
314
|
+
var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
|
|
315
|
+
"00",
|
|
316
|
+
"07",
|
|
317
|
+
"08",
|
|
318
|
+
"09",
|
|
319
|
+
"17",
|
|
320
|
+
"18",
|
|
321
|
+
"19",
|
|
322
|
+
"28",
|
|
323
|
+
"29",
|
|
324
|
+
"49",
|
|
325
|
+
"69",
|
|
326
|
+
"70",
|
|
327
|
+
"78",
|
|
328
|
+
"79",
|
|
329
|
+
"89",
|
|
330
|
+
"96",
|
|
331
|
+
"97"
|
|
332
|
+
]);
|
|
333
|
+
/**
 * Rejects US EIN candidates whose two-digit prefix is listed as invalid.
 *
 * Only the first two characters are inspected; the "NN-NNNNNNN" shape is not
 * re-checked here.
 *
 * @param s Candidate string.
 * @returns true unless the first two characters are in _EIN_INVALID_PREFIXES.
 */
function validEinUs(s) {
|
|
334
|
+
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
335
|
+
}
|
|
106
336
|
var LOCALE_DETECTORS = {
|
|
107
|
-
tr: /* @__PURE__ */ new Set([
|
|
108
|
-
|
|
109
|
-
|
|
337
|
+
tr: /* @__PURE__ */ new Set([
|
|
338
|
+
"national_id_tr",
|
|
339
|
+
"tax_id_tr",
|
|
340
|
+
"phone_tr",
|
|
341
|
+
"name",
|
|
342
|
+
"iban_tr",
|
|
343
|
+
"company_name_tr",
|
|
344
|
+
"mersis_no",
|
|
345
|
+
"postal_code_tr",
|
|
346
|
+
"province_tr"
|
|
347
|
+
]),
|
|
348
|
+
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
349
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
350
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
351
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
352
|
+
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
353
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
354
|
+
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
355
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
110
356
|
};
|
|
111
357
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
112
358
|
function activeDetectors(locale) {
|
|
113
|
-
if (locale === "all") {
|
|
359
|
+
if (locale === "all" || locale === "und") {
|
|
114
360
|
const active2 = new Set(UNIVERSAL);
|
|
115
361
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
116
362
|
detectors.forEach((d) => active2.add(d));
|
|
117
363
|
}
|
|
118
|
-
if (active2.has("phone_tr")) active2.delete("phone");
|
|
119
364
|
return active2;
|
|
120
365
|
}
|
|
121
366
|
const active = new Set(UNIVERSAL);
|
|
@@ -131,18 +376,18 @@ function findAll(re, text, type) {
|
|
|
131
376
|
}
|
|
132
377
|
return results;
|
|
133
378
|
}
|
|
134
|
-
function detectPii(text, locale = "
|
|
379
|
+
function detectPii(text, locale = "und") {
|
|
135
380
|
const active = activeDetectors(locale);
|
|
136
381
|
const t = text ?? "";
|
|
137
|
-
|
|
382
|
+
let findings = [];
|
|
138
383
|
if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
|
|
139
|
-
if (active.has("
|
|
384
|
+
if (active.has("phone_intl")) {
|
|
140
385
|
PHONE_INTL_RE.lastIndex = 0;
|
|
141
386
|
let m;
|
|
142
387
|
while ((m = PHONE_INTL_RE.exec(t)) !== null) {
|
|
143
|
-
const
|
|
144
|
-
if (
|
|
145
|
-
findings.push({ type: "
|
|
388
|
+
const candidate = m[1];
|
|
389
|
+
if (validPhoneIntl(candidate)) {
|
|
390
|
+
findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
146
391
|
}
|
|
147
392
|
}
|
|
148
393
|
}
|
|
@@ -195,8 +440,180 @@ function detectPii(text, locale = "tr") {
|
|
|
195
440
|
findings.push({ type: "name", value, start, end: start + value.length });
|
|
196
441
|
}
|
|
197
442
|
}
|
|
443
|
+
if (active.has("iban_tr")) {
|
|
444
|
+
IBAN_TR_RE.lastIndex = 0;
|
|
445
|
+
let m;
|
|
446
|
+
while ((m = IBAN_TR_RE.exec(t)) !== null) {
|
|
447
|
+
if (validIban(m[0])) {
|
|
448
|
+
findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
if (active.has("company_name_tr")) {
|
|
453
|
+
COMPANY_NAME_TR_RE.lastIndex = 0;
|
|
454
|
+
let m;
|
|
455
|
+
while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
|
|
456
|
+
findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
if (active.has("mersis_no")) {
|
|
460
|
+
MERSIS_RE.lastIndex = 0;
|
|
461
|
+
let m;
|
|
462
|
+
while ((m = MERSIS_RE.exec(t)) !== null) {
|
|
463
|
+
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
if (active.has("postal_code_tr")) {
|
|
467
|
+
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
468
|
+
let m;
|
|
469
|
+
while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
|
|
470
|
+
findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
if (active.has("province_tr")) {
|
|
474
|
+
PROVINCE_TR_RE.lastIndex = 0;
|
|
475
|
+
let m;
|
|
476
|
+
while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
|
|
477
|
+
findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
478
|
+
}
|
|
479
|
+
}
|
|
198
480
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
199
|
-
|
|
481
|
+
if (active.has("tax_id_us")) {
|
|
482
|
+
EIN_US_RE.lastIndex = 0;
|
|
483
|
+
let m;
|
|
484
|
+
while ((m = EIN_US_RE.exec(t)) !== null) {
|
|
485
|
+
if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
if (active.has("national_id_us")) {
|
|
489
|
+
ITIN_US_RE.lastIndex = 0;
|
|
490
|
+
let m;
|
|
491
|
+
while ((m = ITIN_US_RE.exec(t)) !== null) {
|
|
492
|
+
findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
if (active.has("tax_id_de")) {
|
|
496
|
+
STEUER_ID_DE_RE.lastIndex = 0;
|
|
497
|
+
let m;
|
|
498
|
+
while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
|
|
499
|
+
if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
if (active.has("social_id_de")) {
|
|
503
|
+
SVNR_DE_RE.lastIndex = 0;
|
|
504
|
+
let m;
|
|
505
|
+
while ((m = SVNR_DE_RE.exec(t)) !== null) {
|
|
506
|
+
findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
if (active.has("siret_fr")) {
|
|
510
|
+
SIRET_FR_RE.lastIndex = 0;
|
|
511
|
+
let m;
|
|
512
|
+
while ((m = SIRET_FR_RE.exec(t)) !== null) {
|
|
513
|
+
findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
if (active.has("company_id_fr")) {
|
|
517
|
+
SIREN_FR_RE.lastIndex = 0;
|
|
518
|
+
let m;
|
|
519
|
+
while ((m = SIREN_FR_RE.exec(t)) !== null) {
|
|
520
|
+
findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
if (active.has("social_id_fr")) {
|
|
524
|
+
INSEE_FR_RE.lastIndex = 0;
|
|
525
|
+
let m;
|
|
526
|
+
while ((m = INSEE_FR_RE.exec(t)) !== null) {
|
|
527
|
+
findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
if (active.has("national_id_it")) {
|
|
531
|
+
CODICE_FISCALE_IT_RE.lastIndex = 0;
|
|
532
|
+
let m;
|
|
533
|
+
while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
|
|
534
|
+
findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
if (active.has("tax_id_it")) {
|
|
538
|
+
PARTITA_IVA_IT_RE.lastIndex = 0;
|
|
539
|
+
let m;
|
|
540
|
+
while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
|
|
541
|
+
if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
if (active.has("national_id_nl")) {
|
|
545
|
+
BSN_NL_RE.lastIndex = 0;
|
|
546
|
+
let m;
|
|
547
|
+
while ((m = BSN_NL_RE.exec(t)) !== null) {
|
|
548
|
+
if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
if (active.has("company_id_nl")) {
|
|
552
|
+
KVK_NL_RE.lastIndex = 0;
|
|
553
|
+
let m;
|
|
554
|
+
while ((m = KVK_NL_RE.exec(t)) !== null) {
|
|
555
|
+
findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
if (active.has("national_id_es")) {
|
|
559
|
+
DNI_ES_RE.lastIndex = 0;
|
|
560
|
+
let m;
|
|
561
|
+
while ((m = DNI_ES_RE.exec(t)) !== null) {
|
|
562
|
+
if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
563
|
+
}
|
|
564
|
+
NIE_ES_RE.lastIndex = 0;
|
|
565
|
+
while ((m = NIE_ES_RE.exec(t)) !== null) {
|
|
566
|
+
if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
if (active.has("tax_id_es")) {
|
|
570
|
+
CIF_ES_RE.lastIndex = 0;
|
|
571
|
+
let m;
|
|
572
|
+
while ((m = CIF_ES_RE.exec(t)) !== null) {
|
|
573
|
+
findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
if (active.has("social_id_uk")) {
|
|
577
|
+
NI_UK_RE.lastIndex = 0;
|
|
578
|
+
let m;
|
|
579
|
+
while ((m = NI_UK_RE.exec(t)) !== null) {
|
|
580
|
+
if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
if (active.has("tax_id_uk")) {
|
|
584
|
+
UTR_UK_RE.lastIndex = 0;
|
|
585
|
+
let m;
|
|
586
|
+
while ((m = UTR_UK_RE.exec(t)) !== null) {
|
|
587
|
+
findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
if (active.has("iban_intl")) {
|
|
591
|
+
IBAN_INTL_RE.lastIndex = 0;
|
|
592
|
+
let m;
|
|
593
|
+
while ((m = IBAN_INTL_RE.exec(t)) !== null) {
|
|
594
|
+
const candidate = m[1];
|
|
595
|
+
if (validIbanIntl(candidate)) {
|
|
596
|
+
findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
if (active.has("company_name_intl")) {
|
|
601
|
+
COMPANY_NAME_INTL_RE.lastIndex = 0;
|
|
602
|
+
let m;
|
|
603
|
+
while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
|
|
604
|
+
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
findings.sort((a, b) => a.start - b.start);
|
|
608
|
+
const specificIbanSpans = new Set(
|
|
609
|
+
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
610
|
+
);
|
|
611
|
+
if (specificIbanSpans.size > 0) {
|
|
612
|
+
findings = findings.filter(
|
|
613
|
+
(f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
|
|
614
|
+
);
|
|
615
|
+
}
|
|
616
|
+
return findings;
|
|
200
617
|
}
|
|
201
618
|
|
|
202
619
|
// src/quality.ts
|
|
@@ -218,6 +635,15 @@ function isGarbage(ch) {
|
|
|
218
635
|
return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
|
|
219
636
|
cp >= 55296 && cp <= 57343;
|
|
220
637
|
}
|
|
638
|
+
// A line counts as "symbol noise" when it contains a run of three or more
// characters from the set @ # ! ~ * = anywhere in the line.
var LINE_NOISE_RE = /[@#!~*=]{3,}/;
|
|
639
|
+
/**
 * Computes the fraction of lines that are blank or contain symbol noise.
 *
 * @param text Input text; any falsy value (empty string, null, undefined) yields 0.
 * @returns noisy-line count divided by total line count, rounded to four decimals.
 */
function noiseRatio(text) {
|
|
640
|
+
if (!text) return 0;
|
|
641
|
+
// Splitting on "\n" only: text ending in a newline produces a final empty
// element, which is counted as a blank (noisy) line below.
const lines = text.split("\n");
|
|
642
|
+
const total = lines.length;
|
|
643
|
+
// Defensive guard: split on a non-empty string always yields at least one
// element, so this branch is not expected to run.
if (total === 0) return 0;
|
|
644
|
+
// trim() treats whitespace-only lines (including a lone "\r") as blank.
const noisy = lines.filter((line) => !line.trim() || LINE_NOISE_RE.test(line)).length;
|
|
645
|
+
// Round to 4 decimal places.
return Math.round(noisy / total * 1e4) / 1e4;
|
|
646
|
+
}
|
|
221
647
|
function noiseMetrics(text) {
|
|
222
648
|
if (!text) return { garbage_ratio: 0, encoding_ok: true };
|
|
223
649
|
const n = text.length;
|
|
@@ -274,7 +700,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
274
700
|
}
|
|
275
701
|
|
|
276
702
|
// src/index.ts
|
|
277
|
-
var version = "0.
|
|
703
|
+
var version = "0.5.0";
|
|
278
704
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
279
705
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
280
706
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -287,10 +713,11 @@ function computeQualityGrade(score) {
|
|
|
287
713
|
return "D";
|
|
288
714
|
}
|
|
289
715
|
function audit(text, options = {}) {
|
|
290
|
-
const locale = options.locale ?? "
|
|
716
|
+
const locale = options.locale ?? "und";
|
|
291
717
|
const pii = detectPii(text, locale);
|
|
292
718
|
const quality = qualityMetrics(text);
|
|
293
719
|
const noise = noiseMetrics(text);
|
|
720
|
+
const noise_ratio = noiseRatio(text);
|
|
294
721
|
const quality_score = computeQualityScore(
|
|
295
722
|
quality.completeness,
|
|
296
723
|
quality.avg_length,
|
|
@@ -300,7 +727,7 @@ function audit(text, options = {}) {
|
|
|
300
727
|
const counts = /* @__PURE__ */ new Map();
|
|
301
728
|
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
302
729
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
303
|
-
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
730
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
|
|
304
731
|
}
|
|
305
732
|
function auditBatch(texts, options = {}) {
|
|
306
733
|
if (texts.length === 0) {
|
|
@@ -332,6 +759,7 @@ function mask(text, findings, options = {}) {
|
|
|
332
759
|
detectPii,
|
|
333
760
|
mask,
|
|
334
761
|
noiseMetrics,
|
|
762
|
+
noiseRatio,
|
|
335
763
|
qualityMetrics,
|
|
336
764
|
version
|
|
337
765
|
});
|
package/dist/index.d.cts
CHANGED
|
@@ -17,6 +17,12 @@ interface NoiseMetrics {
|
|
|
17
17
|
garbage_ratio: number;
|
|
18
18
|
encoding_ok: boolean;
|
|
19
19
|
}
|
|
20
|
+
/**
|
|
21
|
+
* Fraction of lines that are blank or contain symbol noise (a run of three or more of `@#!~*=`, i.e. `/[@#!~*=]{3,}/`).
|
|
22
|
+
* Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
|
|
23
|
+
* indicate a document likely to reduce extraction quality.
|
|
24
|
+
*/
|
|
25
|
+
declare function noiseRatio(text: string): number;
|
|
20
26
|
declare function noiseMetrics(text: string): NoiseMetrics;
|
|
21
27
|
|
|
22
28
|
type MaskStrategy = "redact" | "replace" | "token" | "hash";
|
|
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
30
36
|
* import { readFileSync } from "fs"
|
|
31
37
|
*
|
|
32
38
|
* const text = readFileSync("contract.txt", "utf8")
|
|
33
|
-
* const result = audit(text
|
|
39
|
+
* const result = audit(text) // locale defaults to "und" (all detectors)
|
|
40
|
+
* const result = audit(text, { locale: "tr" }) // Turkish-only detectors
|
|
34
41
|
*
|
|
35
|
-
* result.quality_grade
|
|
36
|
-
* result.quality_score
|
|
37
|
-
* result.
|
|
42
|
+
* result.quality_grade // "A"
|
|
43
|
+
* result.quality_score // 0.91
|
|
44
|
+
* result.noise_ratio // 0.03 (line-level noise fraction)
|
|
45
|
+
* result.detected_language // "und" (locale passed in — caller controls language)
|
|
46
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
47
|
*
|
|
39
48
|
* // Raw findings and metrics also available:
|
|
40
49
|
* result.pii // [{ type, value, start, end }, ...]
|
|
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
55
|
*/
|
|
47
56
|
|
|
48
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.5.0";
|
|
49
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
59
|
interface PiiSummaryEntry {
|
|
51
60
|
type: string;
|
|
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
|
|
|
54
63
|
interface AuditOptions {
|
|
55
64
|
/**
|
|
56
65
|
* Active locale-specific detectors.
|
|
57
|
-
* - "
|
|
58
|
-
* - "
|
|
59
|
-
* - "
|
|
60
|
-
* - "
|
|
66
|
+
* - "und" — All detectors combined (default; use when language is unknown)
|
|
67
|
+
* - "all" — Alias for "und"
|
|
68
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
|
|
69
|
+
* - "us" — US: SSN, EIN, ITIN, E.164 phone, company_name_intl
|
|
70
|
+
* - "eu" — EU: E.164 phone, iban_intl, company_name_intl
|
|
71
|
+
* - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
|
|
61
72
|
*
|
|
62
73
|
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
74
|
*/
|
|
@@ -74,6 +85,10 @@ interface AuditResult {
|
|
|
74
85
|
pii: PiiFinding[];
|
|
75
86
|
quality: QualityMetrics;
|
|
76
87
|
noise: NoiseMetrics;
|
|
88
|
+
/** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
|
|
89
|
+
noise_ratio: number;
|
|
90
|
+
/** The locale value passed to audit() — caller-controlled language selection. */
|
|
91
|
+
detected_language: string;
|
|
77
92
|
}
|
|
78
93
|
interface MaskOptions {
|
|
79
94
|
/** @default "redact" */
|
|
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
102
117
|
*/
|
|
103
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
104
119
|
|
|
105
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
120
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|