@flexorch/audit 0.3.0 → 0.4.0

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
package/README.md CHANGED
@@ -1,13 +1,38 @@
1
1
  # @flexorch/audit
2
2
 
3
- Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
3
+ [![npm](https://img.shields.io/npm/v/@flexorch/audit)](https://www.npmjs.com/package/@flexorch/audit)
4
+ [![Node](https://img.shields.io/node/v/@flexorch/audit)](https://www.npmjs.com/package/@flexorch/audit)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
4
6
 
5
- - **Quality grade** A/B/C/D score that signals LLM-readiness at a glance
6
- - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
7
- - **Quality metrics** — completeness, average length, duplicate ratio
8
- - **Noise metrics** — garbage character ratio, encoding health
9
- - **Masking** redact / replace / token / hash strategies
7
+ Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
8
+
9
+ ## Why
10
+
11
+ Before feeding documents into an LLM pipeline you need to answer three questions:
12
+
13
+ 1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
14
+ 2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
15
+ 3. **How bad is the noise?** Garbled encodings and control characters degrade model output silently.
16
+
17
+ Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
18
+
19
+ ## Features
20
+
21
+ - **Quality grade** — A/B/C/D composite score that shows at a glance whether the text is LLM-ready
22
+ - **PII detection** — email, phone (TR mobile + E.164), credit card (Luhn), IPv4, IPv6, TCKN, VKN, IBAN (mod-97 validated), SSN, label-prefixed names
23
+ - **Batch audit** — `auditBatch()` aggregates duplicate ratio and PII counts across an entire dataset in one call
24
+ - **Noise metrics** — garbage character ratio, encoding health check
25
+ - **Masking** — four strategies: redact, replace (synthetic), token, hash
10
26
  - **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
27
+ - **TypeScript-first** — full type definitions, no `@types/` package needed
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ npm install @flexorch/audit
33
+ ```
34
+
35
+ ## Quick start
11
36
 
12
37
  ```ts
13
38
  import { audit, mask } from "@flexorch/audit"
@@ -20,7 +45,6 @@ result.quality_grade // "A"
20
45
  result.quality_score // 0.91 (0.0–1.0 composite)
21
46
  result.pii_summary // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
22
47
 
23
- // Raw findings and metrics — also available:
24
48
  result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
25
49
  result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
26
50
  result.noise // { garbage_ratio: 0.0, encoding_ok: true }
@@ -29,21 +53,31 @@ const clean = mask(text, result.pii, { strategy: "redact" })
29
53
  // "Contact: [REDACTED_EMAIL]"
30
54
  ```
31
55
 
32
- ## Install
56
+ ![demo](assets/demo.svg)
33
57
 
34
- ```bash
35
- npm install @flexorch/audit
36
- ```
58
+ ## Batch audit
37
59
 
38
- ![demo](assets/demo.svg)
60
+ Use `auditBatch()` to audit an entire dataset and get aggregate metrics including `duplicate_ratio`:
61
+
62
+ ```ts
63
+ import { auditBatch } from "@flexorch/audit"
64
+
65
+ const texts = dataset.map((r) => r.text)
66
+ const batch = auditBatch(texts, { locale: "tr" })
67
+
68
+ batch.duplicate_ratio // 0.12 — fraction of exact-duplicate records
69
+ batch.avg_quality_score // 0.78
70
+ batch.pii_summary // [{ type: "email", count: 47 }, ...]
71
+ batch.results // AuditResult[], one per text
72
+ ```
39
73
 
40
74
  ## Locale support
41
75
 
42
76
  | `locale` | Active detectors |
43
77
  |----------|-----------------|
44
- | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
45
- | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
46
- | `"eu"` | email, iban, credit_card, ip + E.164 phone |
78
+ | `"tr"` (default) | email, iban, credit_card, ip, ip_v6 + TCKN, VKN, phone_tr, name |
79
+ | `"us"` | email, iban, credit_card, ip, ip_v6 + SSN, E.164 phone |
80
+ | `"eu"` | email, iban, credit_card, ip, ip_v6 + E.164 phone |
47
81
  | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
48
82
 
49
83
  ## PII types
@@ -51,11 +85,13 @@ npm install @flexorch/audit
51
85
  | Type | Description | Locale |
52
86
  |------|-------------|--------|
53
87
  | `email` | RFC-5321 address | all |
54
- | `iban` | ISO 13616 IBAN (any country) | all |
88
+ | `iban` | ISO 13616 IBAN, mod-97 checksum validated | all |
55
89
  | `credit_card` | 16-digit groups, Luhn-validated | all |
56
90
  | `ip` | IPv4 address | all |
91
+ | `ip_v6` | IPv6 address (full, compressed, loopback) | all |
57
92
  | `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
58
93
  | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
94
+ | `tax_id_tr` | VKN — 10-digit Luhn-variant checksum | tr |
59
95
  | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
60
96
  | `phone` | E.164 international phone | us, eu |
61
97
  | `ssn` | US Social Security Number (###-##-####) | us |
@@ -65,53 +101,37 @@ npm install @flexorch/audit
65
101
  | Strategy | Example output |
66
102
  |----------|----------------|
67
103
  | `redact` (default) | `[REDACTED_EMAIL]` |
68
- | `replace` | `user@example.com` (realistic synthetic) |
69
- | `token` | `<PII_EMAIL_1>` (unique per type) |
104
+ | `replace` | `user@example.com` (static synthetic) |
105
+ | `token` | `<PII_EMAIL_1>` (unique per type per call) |
70
106
  | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
71
107
 
72
108
  ## TypeScript
73
109
 
74
- Full type definitions included. No `@types/` package needed.
75
-
76
110
  ```ts
77
- import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
111
+ import {
112
+ audit, auditBatch, mask,
113
+ type AuditResult, type BatchAuditResult, type PiiFinding,
114
+ } from "@flexorch/audit"
78
115
  ```
79
116
 
80
117
  ## Quality grade
81
118
 
82
- The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions:
119
+ `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals:
83
120
 
84
- | Grade | Score | Meaning |
85
- |-------|-------|---------|
121
+ | Grade | Score | Signal |
122
+ |-------|-------|--------|
86
123
  | A | ≥ 0.85 | Ready for LLM training or RAG |
87
124
  | B | ≥ 0.65 | Usable with minor cleanup |
88
- | C | ≥ 0.40 | Needs review before use |
125
+ | C | ≥ 0.40 | Review before use |
89
126
  | D | < 0.40 | Not suitable — empty, too short, or high noise |
90
127
 
91
- Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
92
- where `lengthScore = Math.min(charCount / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`.
128
+ Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
129
+ `lengthScore = Math.min(charCount / 500, 1.0)` · `noiseScore = Math.max(0, 1 − garbageRatio × 10)`
93
130
 
94
- ## Quality & noise
95
-
96
- `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
97
-
98
- ```ts
99
- const texts = dataset.map((r) => r.text)
100
- const seen = new Set<string>()
101
- let duplicates = 0
102
- for (const t of texts) {
103
- if (seen.has(t)) duplicates++
104
- else seen.add(t)
105
- }
106
- const duplicateRatio = duplicates / texts.length
107
- ```
108
-
109
- ## Limitations (v0.2)
131
+ ## Limitations (v0.4)
110
132
 
111
133
  - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
112
- - `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).
113
- - IPv6 not detected.
114
- - IBAN format-only check; mod-97 validation not performed.
134
+ - The `replace` masking strategy uses static synthetic values; locale-aware realistic synthesis is not yet implemented.
115
135
 
116
136
  ## Also available for Python
117
137
 
@@ -119,6 +139,10 @@ const duplicateRatio = duplicates / texts.length
119
139
  pip install flexorch-audit
120
140
  ```
121
141
 
142
+ ## Contributing
143
+
144
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
145
+
122
146
  ## License
123
147
 
124
148
  MIT
package/dist/index.cjs CHANGED
@@ -33,8 +33,9 @@ module.exports = __toCommonJS(index_exports);
33
33
 
34
34
  // src/pii.ts
35
35
  var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
36
- var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
36
+ var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
37
37
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
38
+ var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
38
39
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
39
40
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
40
41
  var _H = "[0-9a-fA-F]{1,4}";
@@ -45,6 +46,102 @@ var IPV6_RE = new RegExp(
45
46
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
46
47
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
47
48
  var VKN_RE = /\b([1-9]\d{9})\b/g;
49
+ var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
50
+ var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
51
+ var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
52
+ var COMPANY_NAME_TR_RE = new RegExp(
53
+ `(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
54
+ "gu"
55
+ );
56
+ var MERSIS_RE = /\b([1-9]\d{15})\b/g;
57
+ var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
58
+ var _TR_PROVINCES_SORTED = [
59
+ "Afyonkarahisar",
60
+ "Kahramanmara\u015F",
61
+ "K\u0131r\u0131kkale",
62
+ "K\u0131rklareli",
63
+ "Diyarbak\u0131r",
64
+ "Gaziantep",
65
+ "\u015Eanl\u0131urfa",
66
+ "Nev\u015Fehir",
67
+ "Kastamonu",
68
+ "G\xFCm\xFC\u015Fhane",
69
+ "Eski\u015Fehir",
70
+ "Erzincan",
71
+ "Erzurum",
72
+ "Denizli",
73
+ "\xC7anakkale",
74
+ "Ad\u0131yaman",
75
+ "Zonguldak",
76
+ "Tekirda\u011F",
77
+ "Trabzon",
78
+ "Tunceli",
79
+ "Karaman",
80
+ "Karab\xFCk",
81
+ "Aksaray",
82
+ "Antalya",
83
+ "K\u0131r\u015Fehir",
84
+ "Osmaniye",
85
+ "Kocaeli",
86
+ "Sakarya",
87
+ "Bart\u0131n",
88
+ "Bayburt",
89
+ "Ardahan",
90
+ "Yozgat",
91
+ "Ankara",
92
+ "Amasya",
93
+ "Artvin",
94
+ "Bal\u0131kesir",
95
+ "Bilecik",
96
+ "Bing\xF6l",
97
+ "Bitlis",
98
+ "Burdur",
99
+ "\xC7ank\u0131r\u0131",
100
+ "Edirne",
101
+ "Elaz\u0131\u011F",
102
+ "Giresun",
103
+ "Hakkari",
104
+ "Isparta",
105
+ "\u0130stanbul",
106
+ "\u0130zmir",
107
+ "Kayseri",
108
+ "K\xFCtahya",
109
+ "Malatya",
110
+ "Manisa",
111
+ "Mardin",
112
+ "Samsun",
113
+ "\u015E\u0131rnak",
114
+ "Sinop",
115
+ "Tokat",
116
+ "Hatay",
117
+ "Konya",
118
+ "Mu\u011Fla",
119
+ "Ni\u011Fde",
120
+ "Rize",
121
+ "Siirt",
122
+ "Sivas",
123
+ "Adana",
124
+ "Ayd\u0131n",
125
+ "Bursa",
126
+ "\xC7orum",
127
+ "I\u011Fd\u0131r",
128
+ "Kilis",
129
+ "Mersin",
130
+ "Batman",
131
+ "Yalova",
132
+ "D\xFCzce",
133
+ "Ordu",
134
+ "Kars",
135
+ "A\u011Fr\u0131",
136
+ "Bolu",
137
+ "Van",
138
+ "U\u015Fak",
139
+ "Mu\u015F"
140
+ ].sort((a, b) => b.length - a.length);
141
+ var PROVINCE_TR_RE = new RegExp(
142
+ `(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
143
+ "gu"
144
+ );
48
145
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
49
146
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
50
147
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -52,6 +149,46 @@ var NAME_RE = new RegExp(
52
149
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
53
150
  "gu"
54
151
  );
152
+ var _IBAN_INTL_LENGTHS = {
153
+ AT: 20,
154
+ BE: 16,
155
+ BG: 22,
156
+ HR: 21,
157
+ CY: 28,
158
+ CZ: 24,
159
+ DK: 18,
160
+ EE: 20,
161
+ FI: 18,
162
+ FR: 27,
163
+ DE: 22,
164
+ GR: 27,
165
+ HU: 28,
166
+ IE: 22,
167
+ IT: 27,
168
+ LV: 21,
169
+ LT: 20,
170
+ LU: 20,
171
+ MT: 31,
172
+ NL: 18,
173
+ PL: 28,
174
+ PT: 25,
175
+ RO: 24,
176
+ SK: 24,
177
+ SI: 19,
178
+ ES: 24,
179
+ SE: 24,
180
+ GB: 22,
181
+ CH: 21,
182
+ NO: 15
183
+ };
184
+ var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
185
+ var _UC = "[A-Z\xC0-\u024F]";
186
+ var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
187
+ var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
188
+ var COMPANY_NAME_INTL_RE = new RegExp(
189
+ `(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
190
+ "gu"
191
+ );
55
192
  var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
56
193
  function validTckn(s) {
57
194
  if (s.length !== 11 || s[0] === "0") return false;
@@ -103,10 +240,30 @@ function validIban(s) {
103
240
  }
104
241
  return remainder === 1;
105
242
  }
243
+ function validIbanIntl(s) {
244
+ const country = s.slice(0, 2);
245
+ if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
246
+ if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
247
+ return validIban(s);
248
+ }
249
+ function validPhoneIntl(raw) {
250
+ const digits = raw.replace(/\D/g, "");
251
+ return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
252
+ }
106
253
  var LOCALE_DETECTORS = {
107
- tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
108
- us: /* @__PURE__ */ new Set(["ssn", "phone"]),
109
- eu: /* @__PURE__ */ new Set(["phone"])
254
+ tr: /* @__PURE__ */ new Set([
255
+ "national_id_tr",
256
+ "tax_id_tr",
257
+ "phone_tr",
258
+ "name",
259
+ "iban_tr",
260
+ "company_name_tr",
261
+ "mersis_no",
262
+ "postal_code_tr",
263
+ "province_tr"
264
+ ]),
265
+ us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
266
+ eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
110
267
  };
111
268
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
112
269
  function activeDetectors(locale) {
@@ -115,7 +272,6 @@ function activeDetectors(locale) {
115
272
  for (const detectors of Object.values(LOCALE_DETECTORS)) {
116
273
  detectors.forEach((d) => active2.add(d));
117
274
  }
118
- if (active2.has("phone_tr")) active2.delete("phone");
119
275
  return active2;
120
276
  }
121
277
  const active = new Set(UNIVERSAL);
@@ -134,15 +290,15 @@ function findAll(re, text, type) {
134
290
  function detectPii(text, locale = "tr") {
135
291
  const active = activeDetectors(locale);
136
292
  const t = text ?? "";
137
- const findings = [];
293
+ let findings = [];
138
294
  if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
139
- if (active.has("phone")) {
295
+ if (active.has("phone_intl")) {
140
296
  PHONE_INTL_RE.lastIndex = 0;
141
297
  let m;
142
298
  while ((m = PHONE_INTL_RE.exec(t)) !== null) {
143
- const digits = m[0].replace(/\D/g, "").length;
144
- if (digits >= 10) {
145
- findings.push({ type: "phone", value: m[0], start: m.index, end: m.index + m[0].length });
299
+ const candidate = m[1];
300
+ if (validPhoneIntl(candidate)) {
301
+ findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
146
302
  }
147
303
  }
148
304
  }
@@ -195,8 +351,71 @@ function detectPii(text, locale = "tr") {
195
351
  findings.push({ type: "name", value, start, end: start + value.length });
196
352
  }
197
353
  }
354
+ if (active.has("iban_tr")) {
355
+ IBAN_TR_RE.lastIndex = 0;
356
+ let m;
357
+ while ((m = IBAN_TR_RE.exec(t)) !== null) {
358
+ if (validIban(m[0])) {
359
+ findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
360
+ }
361
+ }
362
+ }
363
+ if (active.has("company_name_tr")) {
364
+ COMPANY_NAME_TR_RE.lastIndex = 0;
365
+ let m;
366
+ while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
367
+ findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
368
+ }
369
+ }
370
+ if (active.has("mersis_no")) {
371
+ MERSIS_RE.lastIndex = 0;
372
+ let m;
373
+ while ((m = MERSIS_RE.exec(t)) !== null) {
374
+ findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
375
+ }
376
+ }
377
+ if (active.has("postal_code_tr")) {
378
+ POSTAL_CODE_TR_RE.lastIndex = 0;
379
+ let m;
380
+ while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
381
+ findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
382
+ }
383
+ }
384
+ if (active.has("province_tr")) {
385
+ PROVINCE_TR_RE.lastIndex = 0;
386
+ let m;
387
+ while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
388
+ findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
389
+ }
390
+ }
198
391
  if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
199
- return findings.sort((a, b) => a.start - b.start);
392
+ if (active.has("iban_intl")) {
393
+ IBAN_INTL_RE.lastIndex = 0;
394
+ let m;
395
+ while ((m = IBAN_INTL_RE.exec(t)) !== null) {
396
+ const candidate = m[1];
397
+ if (validIbanIntl(candidate)) {
398
+ findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
399
+ }
400
+ }
401
+ }
402
+ if (active.has("company_name_intl")) {
403
+ COMPANY_NAME_INTL_RE.lastIndex = 0;
404
+ let m;
405
+ while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
406
+ findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
407
+ }
408
+ }
409
+ findings.sort((a, b) => a.start - b.start);
410
+ const specificIbanSpans = new Set(
411
+ findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
412
+ );
413
+ if (specificIbanSpans.size > 0) {
414
+ findings = findings.filter(
415
+ (f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
416
+ );
417
+ }
418
+ return findings;
200
419
  }
201
420
 
202
421
  // src/quality.ts
@@ -274,7 +493,7 @@ function applyMask(text, findings, strategy = "redact") {
274
493
  }
275
494
 
276
495
  // src/index.ts
277
- var version = "0.3.0";
496
+ var version = "0.3.1";
278
497
  function computeQualityScore(completeness, avgLength, garbageRatio) {
279
498
  const lengthScore = Math.min(avgLength / 500, 1);
280
499
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/dist/index.d.cts CHANGED
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
45
  * // "Contact: [REDACTED_EMAIL]"
46
46
  */
47
47
 
48
- declare const version = "0.3.0";
48
+ declare const version = "0.3.1";
49
49
  type QualityGrade = "A" | "B" | "C" | "D";
50
50
  interface PiiSummaryEntry {
51
51
  type: string;
package/dist/index.d.ts CHANGED
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
45
  * // "Contact: [REDACTED_EMAIL]"
46
46
  */
47
47
 
48
- declare const version = "0.3.0";
48
+ declare const version = "0.3.1";
49
49
  type QualityGrade = "A" | "B" | "C" | "D";
50
50
  interface PiiSummaryEntry {
51
51
  type: string;
package/dist/index.js CHANGED
@@ -1,7 +1,8 @@
1
1
  // src/pii.ts
2
2
  var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
3
- var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
3
+ var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
4
4
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
5
+ var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
5
6
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
6
7
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
7
8
  var _H = "[0-9a-fA-F]{1,4}";
@@ -12,6 +13,102 @@ var IPV6_RE = new RegExp(
12
13
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
13
14
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
14
15
  var VKN_RE = /\b([1-9]\d{9})\b/g;
16
+ var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
17
+ var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
18
+ var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
19
+ var COMPANY_NAME_TR_RE = new RegExp(
20
+ `(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
21
+ "gu"
22
+ );
23
+ var MERSIS_RE = /\b([1-9]\d{15})\b/g;
24
+ var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
25
+ var _TR_PROVINCES_SORTED = [
26
+ "Afyonkarahisar",
27
+ "Kahramanmara\u015F",
28
+ "K\u0131r\u0131kkale",
29
+ "K\u0131rklareli",
30
+ "Diyarbak\u0131r",
31
+ "Gaziantep",
32
+ "\u015Eanl\u0131urfa",
33
+ "Nev\u015Fehir",
34
+ "Kastamonu",
35
+ "G\xFCm\xFC\u015Fhane",
36
+ "Eski\u015Fehir",
37
+ "Erzincan",
38
+ "Erzurum",
39
+ "Denizli",
40
+ "\xC7anakkale",
41
+ "Ad\u0131yaman",
42
+ "Zonguldak",
43
+ "Tekirda\u011F",
44
+ "Trabzon",
45
+ "Tunceli",
46
+ "Karaman",
47
+ "Karab\xFCk",
48
+ "Aksaray",
49
+ "Antalya",
50
+ "K\u0131r\u015Fehir",
51
+ "Osmaniye",
52
+ "Kocaeli",
53
+ "Sakarya",
54
+ "Bart\u0131n",
55
+ "Bayburt",
56
+ "Ardahan",
57
+ "Yozgat",
58
+ "Ankara",
59
+ "Amasya",
60
+ "Artvin",
61
+ "Bal\u0131kesir",
62
+ "Bilecik",
63
+ "Bing\xF6l",
64
+ "Bitlis",
65
+ "Burdur",
66
+ "\xC7ank\u0131r\u0131",
67
+ "Edirne",
68
+ "Elaz\u0131\u011F",
69
+ "Giresun",
70
+ "Hakkari",
71
+ "Isparta",
72
+ "\u0130stanbul",
73
+ "\u0130zmir",
74
+ "Kayseri",
75
+ "K\xFCtahya",
76
+ "Malatya",
77
+ "Manisa",
78
+ "Mardin",
79
+ "Samsun",
80
+ "\u015E\u0131rnak",
81
+ "Sinop",
82
+ "Tokat",
83
+ "Hatay",
84
+ "Konya",
85
+ "Mu\u011Fla",
86
+ "Ni\u011Fde",
87
+ "Rize",
88
+ "Siirt",
89
+ "Sivas",
90
+ "Adana",
91
+ "Ayd\u0131n",
92
+ "Bursa",
93
+ "\xC7orum",
94
+ "I\u011Fd\u0131r",
95
+ "Kilis",
96
+ "Mersin",
97
+ "Batman",
98
+ "Yalova",
99
+ "D\xFCzce",
100
+ "Ordu",
101
+ "Kars",
102
+ "A\u011Fr\u0131",
103
+ "Bolu",
104
+ "Van",
105
+ "U\u015Fak",
106
+ "Mu\u015F"
107
+ ].sort((a, b) => b.length - a.length);
108
+ var PROVINCE_TR_RE = new RegExp(
109
+ `(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
110
+ "gu"
111
+ );
15
112
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
16
113
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
17
114
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -19,6 +116,46 @@ var NAME_RE = new RegExp(
19
116
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
20
117
  "gu"
21
118
  );
119
+ var _IBAN_INTL_LENGTHS = {
120
+ AT: 20,
121
+ BE: 16,
122
+ BG: 22,
123
+ HR: 21,
124
+ CY: 28,
125
+ CZ: 24,
126
+ DK: 18,
127
+ EE: 20,
128
+ FI: 18,
129
+ FR: 27,
130
+ DE: 22,
131
+ GR: 27,
132
+ HU: 28,
133
+ IE: 22,
134
+ IT: 27,
135
+ LV: 21,
136
+ LT: 20,
137
+ LU: 20,
138
+ MT: 31,
139
+ NL: 18,
140
+ PL: 28,
141
+ PT: 25,
142
+ RO: 24,
143
+ SK: 24,
144
+ SI: 19,
145
+ ES: 24,
146
+ SE: 24,
147
+ GB: 22,
148
+ CH: 21,
149
+ NO: 15
150
+ };
151
+ var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
152
+ var _UC = "[A-Z\xC0-\u024F]";
153
+ var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
154
+ var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
155
+ var COMPANY_NAME_INTL_RE = new RegExp(
156
+ `(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
157
+ "gu"
158
+ );
22
159
  var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
23
160
  function validTckn(s) {
24
161
  if (s.length !== 11 || s[0] === "0") return false;
@@ -70,10 +207,30 @@ function validIban(s) {
70
207
  }
71
208
  return remainder === 1;
72
209
  }
210
+ function validIbanIntl(s) {
211
+ const country = s.slice(0, 2);
212
+ if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
213
+ if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
214
+ return validIban(s);
215
+ }
216
+ function validPhoneIntl(raw) {
217
+ const digits = raw.replace(/\D/g, "");
218
+ return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
219
+ }
73
220
  var LOCALE_DETECTORS = {
74
- tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
75
- us: /* @__PURE__ */ new Set(["ssn", "phone"]),
76
- eu: /* @__PURE__ */ new Set(["phone"])
221
+ tr: /* @__PURE__ */ new Set([
222
+ "national_id_tr",
223
+ "tax_id_tr",
224
+ "phone_tr",
225
+ "name",
226
+ "iban_tr",
227
+ "company_name_tr",
228
+ "mersis_no",
229
+ "postal_code_tr",
230
+ "province_tr"
231
+ ]),
232
+ us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
233
+ eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
77
234
  };
78
235
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
79
236
  function activeDetectors(locale) {
@@ -82,7 +239,6 @@ function activeDetectors(locale) {
82
239
  for (const detectors of Object.values(LOCALE_DETECTORS)) {
83
240
  detectors.forEach((d) => active2.add(d));
84
241
  }
85
- if (active2.has("phone_tr")) active2.delete("phone");
86
242
  return active2;
87
243
  }
88
244
  const active = new Set(UNIVERSAL);
@@ -101,15 +257,15 @@ function findAll(re, text, type) {
101
257
  function detectPii(text, locale = "tr") {
102
258
  const active = activeDetectors(locale);
103
259
  const t = text ?? "";
104
- const findings = [];
260
+ let findings = [];
105
261
  if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
106
- if (active.has("phone")) {
262
+ if (active.has("phone_intl")) {
107
263
  PHONE_INTL_RE.lastIndex = 0;
108
264
  let m;
109
265
  while ((m = PHONE_INTL_RE.exec(t)) !== null) {
110
- const digits = m[0].replace(/\D/g, "").length;
111
- if (digits >= 10) {
112
- findings.push({ type: "phone", value: m[0], start: m.index, end: m.index + m[0].length });
266
+ const candidate = m[1];
267
+ if (validPhoneIntl(candidate)) {
268
+ findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
113
269
  }
114
270
  }
115
271
  }
@@ -162,8 +318,71 @@ function detectPii(text, locale = "tr") {
162
318
  findings.push({ type: "name", value, start, end: start + value.length });
163
319
  }
164
320
  }
321
+ if (active.has("iban_tr")) {
322
+ IBAN_TR_RE.lastIndex = 0;
323
+ let m;
324
+ while ((m = IBAN_TR_RE.exec(t)) !== null) {
325
+ if (validIban(m[0])) {
326
+ findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
327
+ }
328
+ }
329
+ }
330
+ if (active.has("company_name_tr")) {
331
+ COMPANY_NAME_TR_RE.lastIndex = 0;
332
+ let m;
333
+ while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
334
+ findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
335
+ }
336
+ }
337
+ if (active.has("mersis_no")) {
338
+ MERSIS_RE.lastIndex = 0;
339
+ let m;
340
+ while ((m = MERSIS_RE.exec(t)) !== null) {
341
+ findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
342
+ }
343
+ }
344
+ if (active.has("postal_code_tr")) {
345
+ POSTAL_CODE_TR_RE.lastIndex = 0;
346
+ let m;
347
+ while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
348
+ findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
349
+ }
350
+ }
351
+ if (active.has("province_tr")) {
352
+ PROVINCE_TR_RE.lastIndex = 0;
353
+ let m;
354
+ while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
355
+ findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
356
+ }
357
+ }
165
358
  if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
166
- return findings.sort((a, b) => a.start - b.start);
359
+ if (active.has("iban_intl")) {
360
+ IBAN_INTL_RE.lastIndex = 0;
361
+ let m;
362
+ while ((m = IBAN_INTL_RE.exec(t)) !== null) {
363
+ const candidate = m[1];
364
+ if (validIbanIntl(candidate)) {
365
+ findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
366
+ }
367
+ }
368
+ }
369
+ if (active.has("company_name_intl")) {
370
+ COMPANY_NAME_INTL_RE.lastIndex = 0;
371
+ let m;
372
+ while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
373
+ findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
374
+ }
375
+ }
376
+ findings.sort((a, b) => a.start - b.start);
377
+ const specificIbanSpans = new Set(
378
+ findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
379
+ );
380
+ if (specificIbanSpans.size > 0) {
381
+ findings = findings.filter(
382
+ (f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
383
+ );
384
+ }
385
+ return findings;
167
386
  }
168
387
 
169
388
  // src/quality.ts
@@ -241,7 +460,7 @@ function applyMask(text, findings, strategy = "redact") {
241
460
  }
242
461
 
243
462
  // src/index.ts
244
- var version = "0.3.0";
463
+ var version = "0.3.1";
245
464
  function computeQualityScore(completeness, avgLength, garbageRatio) {
246
465
  const lengthScore = Math.min(avgLength / 500, 1);
247
466
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",