@flexorch/audit 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +275 -148
- package/dist/index.cjs +444 -16
- package/dist/index.d.cts +25 -10
- package/dist/index.d.ts +25 -10
- package/dist/index.js +443 -16
- package/package.json +49 -49
package/dist/index.d.ts
CHANGED
|
@@ -17,6 +17,12 @@ interface NoiseMetrics {
|
|
|
17
17
|
garbage_ratio: number;
|
|
18
18
|
encoding_ok: boolean;
|
|
19
19
|
}
|
|
20
|
+
/**
|
|
21
|
+
* Fraction of lines that are blank or contain symbol noise (`[@#!~*=]{3,}`).
|
|
22
|
+
* Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
|
|
23
|
+
* indicate a document likely to reduce extraction quality.
|
|
24
|
+
*/
|
|
25
|
+
declare function noiseRatio(text: string): number;
|
|
20
26
|
declare function noiseMetrics(text: string): NoiseMetrics;
|
|
21
27
|
|
|
22
28
|
type MaskStrategy = "redact" | "replace" | "token" | "hash";
|
|
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
30
36
|
* import { readFileSync } from "fs"
|
|
31
37
|
*
|
|
32
38
|
* const text = readFileSync("contract.txt", "utf8")
|
|
33
|
-
* const result = audit(text
|
|
39
|
+
* const result = audit(text) // locale defaults to "und" (all detectors)
|
|
40
|
+
* const result = audit(text, { locale: "tr" }) // Turkish-only detectors
|
|
34
41
|
*
|
|
35
|
-
* result.quality_grade
|
|
36
|
-
* result.quality_score
|
|
37
|
-
* result.
|
|
42
|
+
* result.quality_grade // "A"
|
|
43
|
+
* result.quality_score // 0.91
|
|
44
|
+
* result.noise_ratio // 0.03 (line-level noise fraction)
|
|
45
|
+
* result.detected_language // "und" (locale passed in — caller controls language)
|
|
46
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
47
|
*
|
|
39
48
|
* // Raw findings and metrics also available:
|
|
40
49
|
* result.pii // [{ type, value, start, end }, ...]
|
|
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
55
|
*/
|
|
47
56
|
|
|
48
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.5.0";
|
|
49
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
59
|
interface PiiSummaryEntry {
|
|
51
60
|
type: string;
|
|
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
|
|
|
54
63
|
interface AuditOptions {
|
|
55
64
|
/**
|
|
56
65
|
* Active locale-specific detectors.
|
|
57
|
-
* - "
|
|
58
|
-
* - "
|
|
59
|
-
* - "
|
|
60
|
-
* - "
|
|
66
|
+
* - "und" — All detectors combined (default; use when language is unknown)
|
|
67
|
+
* - "all" — Alias for "und"
|
|
68
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
|
|
69
|
+
* - "us" — US: SSN, EIN, ITIN, E.164 phone, company_name_intl
|
|
70
|
+
* - "eu" — EU: E.164 phone, iban_intl, company_name_intl
|
|
71
|
+
* - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
|
|
61
72
|
*
|
|
62
73
|
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
74
|
*/
|
|
@@ -74,6 +85,10 @@ interface AuditResult {
|
|
|
74
85
|
pii: PiiFinding[];
|
|
75
86
|
quality: QualityMetrics;
|
|
76
87
|
noise: NoiseMetrics;
|
|
88
|
+
/** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
|
|
89
|
+
noise_ratio: number;
|
|
90
|
+
/** The locale value passed to audit() — caller-controlled language selection. */
|
|
91
|
+
detected_language: string;
|
|
77
92
|
}
|
|
78
93
|
interface MaskOptions {
|
|
79
94
|
/** @default "redact" */
|
|
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
102
117
|
*/
|
|
103
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
104
119
|
|
|
105
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
120
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
// src/pii.ts
|
|
2
2
|
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
|
|
3
|
-
var PHONE_INTL_RE =
|
|
3
|
+
var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
|
|
4
4
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
5
|
+
var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
|
|
5
6
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
6
7
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
7
8
|
var _H = "[0-9a-fA-F]{1,4}";
|
|
@@ -12,6 +13,102 @@ var IPV6_RE = new RegExp(
|
|
|
12
13
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
13
14
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
14
15
|
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
16
|
+
var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
|
|
17
|
+
var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
|
|
18
|
+
var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
|
|
19
|
+
var COMPANY_NAME_TR_RE = new RegExp(
|
|
20
|
+
`(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
|
|
21
|
+
"gu"
|
|
22
|
+
);
|
|
23
|
+
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
24
|
+
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
25
|
+
var _TR_PROVINCES_SORTED = [
|
|
26
|
+
"Afyonkarahisar",
|
|
27
|
+
"Kahramanmara\u015F",
|
|
28
|
+
"K\u0131r\u0131kkale",
|
|
29
|
+
"K\u0131rklareli",
|
|
30
|
+
"Diyarbak\u0131r",
|
|
31
|
+
"Gaziantep",
|
|
32
|
+
"\u015Eanl\u0131urfa",
|
|
33
|
+
"Nev\u015Fehir",
|
|
34
|
+
"Kastamonu",
|
|
35
|
+
"G\xFCm\xFC\u015Fhane",
|
|
36
|
+
"Eski\u015Fehir",
|
|
37
|
+
"Erzincan",
|
|
38
|
+
"Erzurum",
|
|
39
|
+
"Denizli",
|
|
40
|
+
"\xC7anakkale",
|
|
41
|
+
"Ad\u0131yaman",
|
|
42
|
+
"Zonguldak",
|
|
43
|
+
"Tekirda\u011F",
|
|
44
|
+
"Trabzon",
|
|
45
|
+
"Tunceli",
|
|
46
|
+
"Karaman",
|
|
47
|
+
"Karab\xFCk",
|
|
48
|
+
"Aksaray",
|
|
49
|
+
"Antalya",
|
|
50
|
+
"K\u0131r\u015Fehir",
|
|
51
|
+
"Osmaniye",
|
|
52
|
+
"Kocaeli",
|
|
53
|
+
"Sakarya",
|
|
54
|
+
"Bart\u0131n",
|
|
55
|
+
"Bayburt",
|
|
56
|
+
"Ardahan",
|
|
57
|
+
"Yozgat",
|
|
58
|
+
"Ankara",
|
|
59
|
+
"Amasya",
|
|
60
|
+
"Artvin",
|
|
61
|
+
"Bal\u0131kesir",
|
|
62
|
+
"Bilecik",
|
|
63
|
+
"Bing\xF6l",
|
|
64
|
+
"Bitlis",
|
|
65
|
+
"Burdur",
|
|
66
|
+
"\xC7ank\u0131r\u0131",
|
|
67
|
+
"Edirne",
|
|
68
|
+
"Elaz\u0131\u011F",
|
|
69
|
+
"Giresun",
|
|
70
|
+
"Hakkari",
|
|
71
|
+
"Isparta",
|
|
72
|
+
"\u0130stanbul",
|
|
73
|
+
"\u0130zmir",
|
|
74
|
+
"Kayseri",
|
|
75
|
+
"K\xFCtahya",
|
|
76
|
+
"Malatya",
|
|
77
|
+
"Manisa",
|
|
78
|
+
"Mardin",
|
|
79
|
+
"Samsun",
|
|
80
|
+
"\u015E\u0131rnak",
|
|
81
|
+
"Sinop",
|
|
82
|
+
"Tokat",
|
|
83
|
+
"Hatay",
|
|
84
|
+
"Konya",
|
|
85
|
+
"Mu\u011Fla",
|
|
86
|
+
"Ni\u011Fde",
|
|
87
|
+
"Rize",
|
|
88
|
+
"Siirt",
|
|
89
|
+
"Sivas",
|
|
90
|
+
"Adana",
|
|
91
|
+
"Ayd\u0131n",
|
|
92
|
+
"Bursa",
|
|
93
|
+
"\xC7orum",
|
|
94
|
+
"I\u011Fd\u0131r",
|
|
95
|
+
"Kilis",
|
|
96
|
+
"Mersin",
|
|
97
|
+
"Batman",
|
|
98
|
+
"Yalova",
|
|
99
|
+
"D\xFCzce",
|
|
100
|
+
"Ordu",
|
|
101
|
+
"Kars",
|
|
102
|
+
"A\u011Fr\u0131",
|
|
103
|
+
"Bolu",
|
|
104
|
+
"Van",
|
|
105
|
+
"U\u015Fak",
|
|
106
|
+
"Mu\u015F"
|
|
107
|
+
].sort((a, b) => b.length - a.length);
|
|
108
|
+
var PROVINCE_TR_RE = new RegExp(
|
|
109
|
+
`(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
|
|
110
|
+
"gu"
|
|
111
|
+
);
|
|
15
112
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
16
113
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
17
114
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -19,7 +116,64 @@ var NAME_RE = new RegExp(
|
|
|
19
116
|
`(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
|
|
20
117
|
"gu"
|
|
21
118
|
);
|
|
119
|
+
var _IBAN_INTL_LENGTHS = {
|
|
120
|
+
AT: 20,
|
|
121
|
+
BE: 16,
|
|
122
|
+
BG: 22,
|
|
123
|
+
HR: 21,
|
|
124
|
+
CY: 28,
|
|
125
|
+
CZ: 24,
|
|
126
|
+
DK: 18,
|
|
127
|
+
EE: 20,
|
|
128
|
+
FI: 18,
|
|
129
|
+
FR: 27,
|
|
130
|
+
DE: 22,
|
|
131
|
+
GR: 27,
|
|
132
|
+
HU: 28,
|
|
133
|
+
IE: 22,
|
|
134
|
+
IT: 27,
|
|
135
|
+
LV: 21,
|
|
136
|
+
LT: 20,
|
|
137
|
+
LU: 20,
|
|
138
|
+
MT: 31,
|
|
139
|
+
NL: 18,
|
|
140
|
+
PL: 28,
|
|
141
|
+
PT: 25,
|
|
142
|
+
RO: 24,
|
|
143
|
+
SK: 24,
|
|
144
|
+
SI: 19,
|
|
145
|
+
ES: 24,
|
|
146
|
+
SE: 24,
|
|
147
|
+
GB: 22,
|
|
148
|
+
CH: 21,
|
|
149
|
+
NO: 15
|
|
150
|
+
};
|
|
151
|
+
var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
|
|
152
|
+
var _UC = "[A-Z\xC0-\u024F]";
|
|
153
|
+
var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
|
|
154
|
+
var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
|
|
155
|
+
var COMPANY_NAME_INTL_RE = new RegExp(
|
|
156
|
+
`(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
|
|
157
|
+
"gu"
|
|
158
|
+
);
|
|
22
159
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
160
|
+
var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
|
|
161
|
+
var ITIN_US_RE = /\b(9\d{2}-(?:7[0-9]|8[0-8]|9[0-24-9])-\d{4})\b/g;
|
|
162
|
+
var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
|
|
163
|
+
// German pension insurance number (Versicherungsnummer), 12 characters:
// 2-digit area code, birth date as DDMMYY, one uppercase initial,
// 2-digit serial number and 1 check digit. The previous pattern required
// 13 characters, so well-formed numbers could never match.
// NOTE(review): day/month are only loosely range-checked ([0-3]d / [01]d); confirm
// whether special day values above 39 need to be accepted.
var SVNR_DE_RE = /\b(\d{2}[0-3]\d[01]\d\d{2}[A-Z]\d{3})\b/g;
|
|
164
|
+
var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
|
|
165
|
+
var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
|
|
166
|
+
var INSEE_FR_RE = /\b([12]\d{14})\b/g;
|
|
167
|
+
var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
|
|
168
|
+
var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
|
|
169
|
+
var BSN_NL_RE = /\b(\d{9})\b/g;
|
|
170
|
+
var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
|
|
171
|
+
var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
|
|
172
|
+
var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
|
|
173
|
+
var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
|
|
174
|
+
var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
|
|
175
|
+
var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
|
|
176
|
+
var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
|
|
23
177
|
function validTckn(s) {
|
|
24
178
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
25
179
|
const d = s.split("").map(Number);
|
|
@@ -70,19 +224,109 @@ function validIban(s) {
|
|
|
70
224
|
}
|
|
71
225
|
return remainder === 1;
|
|
72
226
|
}
|
|
227
|
+
/**
 * Checks a candidate IBAN from a country listed in `_IBAN_INTL_LENGTHS`.
 *
 * Returns false for Turkish ("TR") IBANs and for country codes missing from
 * the length table; otherwise requires the exact per-country length and
 * defers the final decision to `validIban`.
 *
 * @param s Candidate IBAN; the first two characters are read as the country code.
 * @returns true only when country, length and `validIban` all accept the value.
 */
function validIbanIntl(s) {
  const countryCode = s.slice(0, 2);
  const isListedForeign = countryCode !== "TR" && countryCode in _IBAN_INTL_LENGTHS;
  if (!isListedForeign) return false;
  const expectedLength = _IBAN_INTL_LENGTHS[countryCode];
  return s.length === expectedLength ? validIban(s) : false;
}
|
|
233
|
+
/**
 * Decides whether a raw phone match is kept as an international number.
 *
 * All non-digit characters are stripped first; the remaining digit string
 * must be 7 to 15 digits long and must not begin with "90".
 *
 * @param raw Matched text, possibly containing "+", spaces, dots, dashes or parentheses.
 * @returns true when the digit count is in range and the number does not start with 90.
 */
function validPhoneIntl(raw) {
  const digitsOnly = raw.replace(/\D/g, "");
  const count = digitsOnly.length;
  if (count < 7 || count > 15) return false;
  return !digitsOnly.startsWith("90");
}
|
|
237
|
+
/**
 * Check-digit test for an 11-character German tax ID candidate.
 *
 * The first ten characters are folded into a running value (start 10; add the
 * digit, reduce mod 10 with 0 replaced by 10, double, reduce mod 11). The
 * expected check digit is 11 minus that value, with 10 mapped to 0, and is
 * compared against the eleventh character.
 *
 * @param s Candidate string; must have length 11 and must not start with "0".
 * @returns true when the computed check digit equals the last digit.
 */
function validSteuerIdDe(s) {
  const wellFormed = s.length === 11 && s[0] !== "0";
  if (!wellFormed) return false;
  let carry = 10;
  for (const ch of s.slice(0, 10)) {
    const sum = (parseInt(ch) + carry) % 10;
    // A zero sum is treated as 10 before doubling.
    carry = ((sum === 0 ? 10 : sum) * 2) % 11;
  }
  const diff = 11 - carry;
  const expected = diff === 10 ? 0 : diff;
  return expected === parseInt(s[10]);
}
|
|
249
|
+
/**
 * Validates an Italian Partita IVA candidate (11 digits, Luhn check digit).
 *
 * Digits at even zero-based positions 0..8 are summed as-is; digits at odd
 * positions 1..9 are doubled and reduced by 9 when the result exceeds 9. The
 * eleventh digit must equal (10 - sum mod 10) mod 10.
 *
 * Candidates whose first seven digits (the company number) are all zero are
 * rejected: zero-filled strings such as "00000000000" satisfy the Luhn check
 * but are padding, not a real identifier.
 *
 * @param s Candidate string.
 * @returns true when the value is 11 digits, has a non-zero company number
 *          and a matching check digit; false otherwise (including non-string input).
 */
function validPartitaIvaIt(s) {
  if (typeof s !== "string" || !/^\d{11}$/.test(s)) return false;
  if (/^0{7}/.test(s)) return false;
  let sum = 0;
  for (let i = 0; i < 10; i++) {
    const digit = Number(s[i]);
    if (i % 2 === 0) {
      sum += digit;
    } else {
      const doubled = digit * 2;
      sum += doubled < 10 ? doubled : doubled - 9;
    }
  }
  return (10 - (sum % 10)) % 10 === Number(s[10]);
}
|
|
260
|
+
/**
 * Weighted-sum check for a 9-digit Dutch BSN candidate.
 *
 * The first eight digits are weighted 9 down to 2 and summed, the ninth digit
 * is subtracted, and the result must be positive and divisible by 11.
 *
 * @param s Candidate string; must be exactly nine digits.
 * @returns true when the weighted balance is > 0 and a multiple of 11.
 */
function validBsnNl(s) {
  if (s.length !== 9) return false;
  if (!/^\d+$/.test(s)) return false;
  const digits = Array.from(s, (ch) => parseInt(ch));
  const weighted = digits
    .slice(0, 8)
    .reduce((acc, digit, pos) => acc + (9 - pos) * digit, 0);
  const balance = weighted - digits[8];
  // The positivity requirement excludes the all-zero string.
  return balance > 0 && balance % 11 === 0;
}
|
|
267
|
+
/**
 * Control-letter check for a Spanish DNI candidate (8 digits + 1 letter).
 *
 * The numeric part modulo 23 indexes `_DNI_LETTER_TABLE`; the letter found
 * there must equal the ninth character.
 *
 * @param s Candidate string of length 9 starting with eight digits.
 * @returns true when the ninth character matches the table letter.
 */
function validDniEs(s) {
  const hasShape = s.length === 9 && /^\d{8}/.test(s);
  if (!hasShape) return false;
  const numericPart = parseInt(s.slice(0, 8), 10);
  const expectedLetter = _DNI_LETTER_TABLE[numericPart % 23];
  return expectedLetter === s[8];
}
|
|
271
|
+
/**
 * Control-letter check for a Spanish NIE candidate (X/Y/Z + 7 digits + letter).
 *
 * The leading letter is replaced by 0, 1 or 2 (X, Y, Z respectively), the
 * resulting 8-digit number modulo 23 indexes `_DNI_LETTER_TABLE`, and the
 * letter found there must equal the ninth character.
 *
 * The seven characters after the prefix are required to be digits. Without
 * that check `parseInt` would silently stop at the first non-digit and a
 * malformed string could still validate.
 *
 * @param s Candidate string of length 9.
 * @returns true when the shape is valid and the control letter matches.
 */
function validNieEs(s) {
  if (s.length !== 9 || !/^[XYZ]\d{7}/.test(s)) return false;
  // Position of the prefix inside "XYZ" is exactly its numeric replacement.
  const prefixDigit = "XYZ".indexOf(s[0]);
  const numericPart = parseInt(`${prefixDigit}${s.slice(1, 8)}`, 10);
  return _DNI_LETTER_TABLE[numericPart % 23] === s[8];
}
|
|
276
|
+
// Two-letter prefixes that validNiUk refuses.
var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set(["BG", "GB", "KN", "NK", "NT", "TN", "ZZ"]);
/**
 * Prefix filter for a UK National Insurance number candidate.
 *
 * Only the first two characters are inspected (case-insensitively); the rest
 * of the string is not checked here.
 *
 * @param s Candidate string.
 * @returns false when the upper-cased two-character prefix is in
 *          `_NI_UK_FORBIDDEN`, true otherwise.
 */
function validNiUk(s) {
  const prefix = s.slice(0, 2).toUpperCase();
  if (_NI_UK_FORBIDDEN.has(prefix)) return false;
  return true;
}
|
|
280
|
+
// Two-digit prefixes that validEinUs refuses.
// NOTE(review): presumably the prefixes never issued by the IRS — verify against the IRS list.
var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
  "00",
  "07",
  "08",
  "09",
  "17",
  "18",
  "19",
  "28",
  "29",
  "49",
  "69",
  "70",
  "78",
  "79",
  "89",
  "96",
  "97"
]);
/**
 * Prefix filter for a US EIN candidate in "NN-NNNNNNN" form.
 *
 * Only the first two characters are inspected.
 *
 * @param s Candidate string.
 * @returns false when the two-character prefix is in `_EIN_INVALID_PREFIXES`,
 *          true otherwise.
 */
function validEinUs(s) {
  const prefix = s.slice(0, 2);
  if (_EIN_INVALID_PREFIXES.has(prefix)) return false;
  return true;
}
|
|
73
302
|
var LOCALE_DETECTORS = {
|
|
74
|
-
tr: /* @__PURE__ */ new Set([
|
|
75
|
-
|
|
76
|
-
|
|
303
|
+
tr: /* @__PURE__ */ new Set([
|
|
304
|
+
"national_id_tr",
|
|
305
|
+
"tax_id_tr",
|
|
306
|
+
"phone_tr",
|
|
307
|
+
"name",
|
|
308
|
+
"iban_tr",
|
|
309
|
+
"company_name_tr",
|
|
310
|
+
"mersis_no",
|
|
311
|
+
"postal_code_tr",
|
|
312
|
+
"province_tr"
|
|
313
|
+
]),
|
|
314
|
+
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
315
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
316
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
317
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
318
|
+
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
319
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
320
|
+
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
321
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
77
322
|
};
|
|
78
323
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
79
324
|
function activeDetectors(locale) {
|
|
80
|
-
if (locale === "all") {
|
|
325
|
+
if (locale === "all" || locale === "und") {
|
|
81
326
|
const active2 = new Set(UNIVERSAL);
|
|
82
327
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
83
328
|
detectors.forEach((d) => active2.add(d));
|
|
84
329
|
}
|
|
85
|
-
if (active2.has("phone_tr")) active2.delete("phone");
|
|
86
330
|
return active2;
|
|
87
331
|
}
|
|
88
332
|
const active = new Set(UNIVERSAL);
|
|
@@ -98,18 +342,18 @@ function findAll(re, text, type) {
|
|
|
98
342
|
}
|
|
99
343
|
return results;
|
|
100
344
|
}
|
|
101
|
-
function detectPii(text, locale = "
|
|
345
|
+
function detectPii(text, locale = "und") {
|
|
102
346
|
const active = activeDetectors(locale);
|
|
103
347
|
const t = text ?? "";
|
|
104
|
-
|
|
348
|
+
let findings = [];
|
|
105
349
|
if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
|
|
106
|
-
if (active.has("
|
|
350
|
+
if (active.has("phone_intl")) {
|
|
107
351
|
PHONE_INTL_RE.lastIndex = 0;
|
|
108
352
|
let m;
|
|
109
353
|
while ((m = PHONE_INTL_RE.exec(t)) !== null) {
|
|
110
|
-
const
|
|
111
|
-
if (
|
|
112
|
-
findings.push({ type: "
|
|
354
|
+
const candidate = m[1];
|
|
355
|
+
if (validPhoneIntl(candidate)) {
|
|
356
|
+
findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
113
357
|
}
|
|
114
358
|
}
|
|
115
359
|
}
|
|
@@ -162,8 +406,180 @@ function detectPii(text, locale = "tr") {
|
|
|
162
406
|
findings.push({ type: "name", value, start, end: start + value.length });
|
|
163
407
|
}
|
|
164
408
|
}
|
|
409
|
+
if (active.has("iban_tr")) {
|
|
410
|
+
IBAN_TR_RE.lastIndex = 0;
|
|
411
|
+
let m;
|
|
412
|
+
while ((m = IBAN_TR_RE.exec(t)) !== null) {
|
|
413
|
+
if (validIban(m[0])) {
|
|
414
|
+
findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
if (active.has("company_name_tr")) {
|
|
419
|
+
COMPANY_NAME_TR_RE.lastIndex = 0;
|
|
420
|
+
let m;
|
|
421
|
+
while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
|
|
422
|
+
findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
if (active.has("mersis_no")) {
|
|
426
|
+
MERSIS_RE.lastIndex = 0;
|
|
427
|
+
let m;
|
|
428
|
+
while ((m = MERSIS_RE.exec(t)) !== null) {
|
|
429
|
+
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
if (active.has("postal_code_tr")) {
|
|
433
|
+
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
434
|
+
let m;
|
|
435
|
+
while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
|
|
436
|
+
findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
if (active.has("province_tr")) {
|
|
440
|
+
PROVINCE_TR_RE.lastIndex = 0;
|
|
441
|
+
let m;
|
|
442
|
+
while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
|
|
443
|
+
findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
444
|
+
}
|
|
445
|
+
}
|
|
165
446
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
166
|
-
|
|
447
|
+
if (active.has("tax_id_us")) {
|
|
448
|
+
EIN_US_RE.lastIndex = 0;
|
|
449
|
+
let m;
|
|
450
|
+
while ((m = EIN_US_RE.exec(t)) !== null) {
|
|
451
|
+
if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (active.has("national_id_us")) {
|
|
455
|
+
ITIN_US_RE.lastIndex = 0;
|
|
456
|
+
let m;
|
|
457
|
+
while ((m = ITIN_US_RE.exec(t)) !== null) {
|
|
458
|
+
findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
if (active.has("tax_id_de")) {
|
|
462
|
+
STEUER_ID_DE_RE.lastIndex = 0;
|
|
463
|
+
let m;
|
|
464
|
+
while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
|
|
465
|
+
if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
if (active.has("social_id_de")) {
|
|
469
|
+
SVNR_DE_RE.lastIndex = 0;
|
|
470
|
+
let m;
|
|
471
|
+
while ((m = SVNR_DE_RE.exec(t)) !== null) {
|
|
472
|
+
findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
if (active.has("siret_fr")) {
|
|
476
|
+
SIRET_FR_RE.lastIndex = 0;
|
|
477
|
+
let m;
|
|
478
|
+
while ((m = SIRET_FR_RE.exec(t)) !== null) {
|
|
479
|
+
findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
if (active.has("company_id_fr")) {
|
|
483
|
+
SIREN_FR_RE.lastIndex = 0;
|
|
484
|
+
let m;
|
|
485
|
+
while ((m = SIREN_FR_RE.exec(t)) !== null) {
|
|
486
|
+
findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
if (active.has("social_id_fr")) {
|
|
490
|
+
INSEE_FR_RE.lastIndex = 0;
|
|
491
|
+
let m;
|
|
492
|
+
while ((m = INSEE_FR_RE.exec(t)) !== null) {
|
|
493
|
+
findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
if (active.has("national_id_it")) {
|
|
497
|
+
CODICE_FISCALE_IT_RE.lastIndex = 0;
|
|
498
|
+
let m;
|
|
499
|
+
while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
|
|
500
|
+
findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
if (active.has("tax_id_it")) {
|
|
504
|
+
PARTITA_IVA_IT_RE.lastIndex = 0;
|
|
505
|
+
let m;
|
|
506
|
+
while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
|
|
507
|
+
if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
if (active.has("national_id_nl")) {
|
|
511
|
+
BSN_NL_RE.lastIndex = 0;
|
|
512
|
+
let m;
|
|
513
|
+
while ((m = BSN_NL_RE.exec(t)) !== null) {
|
|
514
|
+
if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
if (active.has("company_id_nl")) {
|
|
518
|
+
KVK_NL_RE.lastIndex = 0;
|
|
519
|
+
let m;
|
|
520
|
+
while ((m = KVK_NL_RE.exec(t)) !== null) {
|
|
521
|
+
findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
if (active.has("national_id_es")) {
|
|
525
|
+
DNI_ES_RE.lastIndex = 0;
|
|
526
|
+
let m;
|
|
527
|
+
while ((m = DNI_ES_RE.exec(t)) !== null) {
|
|
528
|
+
if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
529
|
+
}
|
|
530
|
+
NIE_ES_RE.lastIndex = 0;
|
|
531
|
+
while ((m = NIE_ES_RE.exec(t)) !== null) {
|
|
532
|
+
if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
if (active.has("tax_id_es")) {
|
|
536
|
+
CIF_ES_RE.lastIndex = 0;
|
|
537
|
+
let m;
|
|
538
|
+
while ((m = CIF_ES_RE.exec(t)) !== null) {
|
|
539
|
+
findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
if (active.has("social_id_uk")) {
|
|
543
|
+
NI_UK_RE.lastIndex = 0;
|
|
544
|
+
let m;
|
|
545
|
+
while ((m = NI_UK_RE.exec(t)) !== null) {
|
|
546
|
+
if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
if (active.has("tax_id_uk")) {
|
|
550
|
+
UTR_UK_RE.lastIndex = 0;
|
|
551
|
+
let m;
|
|
552
|
+
while ((m = UTR_UK_RE.exec(t)) !== null) {
|
|
553
|
+
findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
if (active.has("iban_intl")) {
|
|
557
|
+
IBAN_INTL_RE.lastIndex = 0;
|
|
558
|
+
let m;
|
|
559
|
+
while ((m = IBAN_INTL_RE.exec(t)) !== null) {
|
|
560
|
+
const candidate = m[1];
|
|
561
|
+
if (validIbanIntl(candidate)) {
|
|
562
|
+
findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
if (active.has("company_name_intl")) {
|
|
567
|
+
COMPANY_NAME_INTL_RE.lastIndex = 0;
|
|
568
|
+
let m;
|
|
569
|
+
while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
|
|
570
|
+
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
findings.sort((a, b) => a.start - b.start);
|
|
574
|
+
const specificIbanSpans = new Set(
|
|
575
|
+
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
576
|
+
);
|
|
577
|
+
if (specificIbanSpans.size > 0) {
|
|
578
|
+
findings = findings.filter(
|
|
579
|
+
(f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
|
|
580
|
+
);
|
|
581
|
+
}
|
|
582
|
+
return findings;
|
|
167
583
|
}
|
|
168
584
|
|
|
169
585
|
// src/quality.ts
|
|
@@ -185,6 +601,15 @@ function isGarbage(ch) {
|
|
|
185
601
|
return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
|
|
186
602
|
cp >= 55296 && cp <= 57343;
|
|
187
603
|
}
|
|
604
|
+
// A line counts as symbol noise when it contains three or more consecutive
// characters from the set @ # ! ~ * =.
var LINE_NOISE_RE = /[@#!~*=]{3,}/;
/**
 * Fraction of lines that are blank (empty after trimming) or match
 * `LINE_NOISE_RE`, rounded to four decimal places.
 *
 * Lines are obtained by splitting on "\n" only.
 * NOTE(review): a trailing newline therefore yields one extra empty line that
 * is counted as noisy — confirm this is intended.
 *
 * @param text Input text; any falsy value yields 0.
 * @returns A number between 0 and 1.
 */
function noiseRatio(text) {
  if (!text) return 0;
  const rows = text.split("\n");
  if (rows.length === 0) return 0;
  let noisyCount = 0;
  for (const row of rows) {
    if (row.trim() === "" || LINE_NOISE_RE.test(row)) noisyCount += 1;
  }
  return Math.round((noisyCount / rows.length) * 1e4) / 1e4;
}
|
|
188
613
|
function noiseMetrics(text) {
|
|
189
614
|
if (!text) return { garbage_ratio: 0, encoding_ok: true };
|
|
190
615
|
const n = text.length;
|
|
@@ -241,7 +666,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
241
666
|
}
|
|
242
667
|
|
|
243
668
|
// src/index.ts
|
|
244
|
-
var version = "0.
|
|
669
|
+
var version = "0.5.0";
|
|
245
670
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
246
671
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
247
672
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -254,10 +679,11 @@ function computeQualityGrade(score) {
|
|
|
254
679
|
return "D";
|
|
255
680
|
}
|
|
256
681
|
function audit(text, options = {}) {
|
|
257
|
-
const locale = options.locale ?? "
|
|
682
|
+
const locale = options.locale ?? "und";
|
|
258
683
|
const pii = detectPii(text, locale);
|
|
259
684
|
const quality = qualityMetrics(text);
|
|
260
685
|
const noise = noiseMetrics(text);
|
|
686
|
+
const noise_ratio = noiseRatio(text);
|
|
261
687
|
const quality_score = computeQualityScore(
|
|
262
688
|
quality.completeness,
|
|
263
689
|
quality.avg_length,
|
|
@@ -267,7 +693,7 @@ function audit(text, options = {}) {
|
|
|
267
693
|
const counts = /* @__PURE__ */ new Map();
|
|
268
694
|
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
269
695
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
270
|
-
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
696
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
|
|
271
697
|
}
|
|
272
698
|
function auditBatch(texts, options = {}) {
|
|
273
699
|
if (texts.length === 0) {
|
|
@@ -298,6 +724,7 @@ export {
|
|
|
298
724
|
detectPii,
|
|
299
725
|
mask,
|
|
300
726
|
noiseMetrics,
|
|
727
|
+
noiseRatio,
|
|
301
728
|
qualityMetrics,
|
|
302
729
|
version
|
|
303
730
|
};
|