@flexorch/audit 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
4
4
 
5
+ - **Quality grade** — A/B/C/D score that signals LLM-readiness at a glance
5
6
  - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP (v4 + v6), TCKN, VKN, IBAN (mod-97 checksum), SSN, label-prefixed names
6
7
  - **Quality metrics** — completeness, average length, duplicate ratio
7
8
  - **Noise metrics** — garbage character ratio, encoding health
@@ -10,13 +11,19 @@ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one questi
10
11
 
11
12
  ```ts
12
13
  import { audit, mask } from "@flexorch/audit"
14
+ import { readFileSync } from "fs"
13
15
 
16
+ const text = readFileSync("contract.txt", "utf8")
14
17
  const result = audit(text, { locale: "tr" })
15
- // {
16
- // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
17
- // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
18
- // noise: { garbage_ratio: 0.0, encoding_ok: true },
19
- // }
18
+
19
+ result.quality_grade // "A"
20
+ result.quality_score // 0.91 (0.0–1.0 composite)
21
+ result.pii_summary // [{ type: "email", count: 1 }, { type: "national_id_tr", count: 3 }] (sorted by type name)
22
+
23
+ // Raw findings and metrics — also available:
24
+ result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
25
+ result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
26
+ result.noise // { garbage_ratio: 0.0, encoding_ok: true }
20
27
 
21
28
  const clean = mask(text, result.pii, { strategy: "redact" })
22
29
  // "Contact: [REDACTED_EMAIL]"
@@ -28,6 +35,8 @@ const clean = mask(text, result.pii, { strategy: "redact" })
28
35
  npm install @flexorch/audit
29
36
  ```
30
37
 
38
+ ![demo](assets/demo.svg)
39
+
31
40
  ## Locale support
32
41
 
33
42
  | `locale` | Active detectors |
@@ -68,6 +77,20 @@ Full type definitions included. No `@types/` package needed.
68
77
  import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
69
78
  ```
70
79
 
80
+ ## Quality grade
81
+
82
+ The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions — completeness, noise, and length. The score maps to a grade as follows:
83
+
84
+ | Grade | Score | Meaning |
85
+ |-------|-------|---------|
86
+ | A | ≥ 0.85 | Ready for LLM training or RAG |
87
+ | B | ≥ 0.65 | Usable with minor cleanup |
88
+ | C | ≥ 0.40 | Needs review before use |
89
+ | D | < 0.40 | Not suitable — empty, too short, or high noise |
90
+
91
+ Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
92
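+ where `completeness` is `quality.completeness`, `lengthScore = Math.min(quality.avg_length / 500, 1.0)` and `noiseScore = Math.max(0, 1 − noise.garbage_ratio × 10)`. The result is rounded to four decimal places.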
93
+
71
94
  ## Quality & noise
72
95
 
73
96
  `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
@@ -83,7 +106,7 @@ for (const t of texts) {
83
106
  const duplicateRatio = duplicates / texts.length
84
107
  ```
85
108
 
86
- ## Limitations (v0.1)
109
+ ## Limitations (v0.3)
87
110
 
88
111
  - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
89
112
  - `duplicate_ratio` from `audit()` is per-call (`null` for a single string); use `auditBatch(texts)` for a dataset-level ratio, or aggregate across your dataset manually (see above).
package/dist/index.cjs CHANGED
@@ -22,6 +22,7 @@ var index_exports = {};
22
22
  __export(index_exports, {
23
23
  applyMask: () => applyMask,
24
24
  audit: () => audit,
25
+ auditBatch: () => auditBatch,
25
26
  detectPii: () => detectPii,
26
27
  mask: () => mask,
27
28
  noiseMetrics: () => noiseMetrics,
@@ -36,8 +37,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
36
37
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
37
38
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
38
39
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
40
+ var _H = "[0-9a-fA-F]{1,4}";
41
+ var IPV6_RE = new RegExp(
42
+ `(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
43
+ "gi"
44
+ );
39
45
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
40
46
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
47
+ var VKN_RE = /\b([1-9]\d{9})\b/g;
41
48
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
42
49
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
43
50
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -54,6 +61,20 @@ function validTckn(s) {
54
61
  if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
55
62
  return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
56
63
  }
64
+ function validVkn(s) {
65
+ if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
66
+ const d = s.split("").map(Number);
67
+ let total = 0;
68
+ for (let i = 0; i < 9; i++) {
69
+ const x = (d[i] + (9 - i)) % 10;
70
+ if (x !== 0) {
71
+ let y = x * Math.pow(2, 9 - i) % 9;
72
+ if (y === 0) y = 9;
73
+ total += y;
74
+ }
75
+ }
76
+ return (10 - total % 10) % 10 === d[9];
77
+ }
57
78
  function luhn(number) {
58
79
  const digits = number.replace(/\D/g, "");
59
80
  if (digits.length < 13 || digits.length > 19) return false;
@@ -68,12 +89,26 @@ function luhn(number) {
68
89
  }
69
90
  return total % 10 === 0;
70
91
  }
92
+ function validIban(s) {
93
+ const rearranged = s.slice(4) + s.slice(0, 4);
94
+ const numeric = rearranged.toUpperCase().split("").map((c) => {
95
+ const code = c.charCodeAt(0);
96
+ return code >= 65 && code <= 90 ? String(code - 55) : c;
97
+ }).join("");
98
+ let remainder = 0;
99
+ for (let i = 0; i < numeric.length; i += 9) {
100
+ const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
101
+ if (!Number.isFinite(chunk)) return false;
102
+ remainder = chunk % 97;
103
+ }
104
+ return remainder === 1;
105
+ }
71
106
  var LOCALE_DETECTORS = {
72
- tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
107
+ tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
73
108
  us: /* @__PURE__ */ new Set(["ssn", "phone"]),
74
109
  eu: /* @__PURE__ */ new Set(["phone"])
75
110
  };
76
- var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
111
+ var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
77
112
  function activeDetectors(locale) {
78
113
  if (locale === "all") {
79
114
  const active2 = new Set(UNIVERSAL);
@@ -111,7 +146,15 @@ function detectPii(text, locale = "tr") {
111
146
  }
112
147
  }
113
148
  }
114
- if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
149
+ if (active.has("iban")) {
150
+ IBAN_RE.lastIndex = 0;
151
+ let m;
152
+ while ((m = IBAN_RE.exec(t)) !== null) {
153
+ if (validIban(m[0])) {
154
+ findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
155
+ }
156
+ }
157
+ }
115
158
  if (active.has("credit_card")) {
116
159
  CC_RE.lastIndex = 0;
117
160
  let m;
@@ -122,6 +165,7 @@ function detectPii(text, locale = "tr") {
122
165
  }
123
166
  }
124
167
  if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
168
+ if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
125
169
  if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
126
170
  if (active.has("national_id_tr")) {
127
171
  TCKN_RE.lastIndex = 0;
@@ -132,6 +176,15 @@ function detectPii(text, locale = "tr") {
132
176
  }
133
177
  }
134
178
  }
179
+ if (active.has("tax_id_tr")) {
180
+ VKN_RE.lastIndex = 0;
181
+ let m;
182
+ while ((m = VKN_RE.exec(t)) !== null) {
183
+ if (validVkn(m[1])) {
184
+ findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
185
+ }
186
+ }
187
+ }
135
188
  if (active.has("name")) {
136
189
  NAME_RE.lastIndex = 0;
137
190
  let m;
@@ -221,14 +274,52 @@ function applyMask(text, findings, strategy = "redact") {
221
274
  }
222
275
 
223
276
  // src/index.ts
224
- var version = "0.1.0";
277
+ var version = "0.3.0";
278
+ function computeQualityScore(completeness, avgLength, garbageRatio) {
279
+ const lengthScore = Math.min(avgLength / 500, 1);
280
+ const noiseScore = Math.max(0, 1 - garbageRatio * 10);
281
+ return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
282
+ }
283
+ function computeQualityGrade(score) {
284
+ if (score >= 0.85) return "A";
285
+ if (score >= 0.65) return "B";
286
+ if (score >= 0.4) return "C";
287
+ return "D";
288
+ }
225
289
  function audit(text, options = {}) {
226
290
  const locale = options.locale ?? "tr";
227
- return {
228
- pii: detectPii(text, locale),
229
- quality: qualityMetrics(text),
230
- noise: noiseMetrics(text)
231
- };
291
+ const pii = detectPii(text, locale);
292
+ const quality = qualityMetrics(text);
293
+ const noise = noiseMetrics(text);
294
+ const quality_score = computeQualityScore(
295
+ quality.completeness,
296
+ quality.avg_length,
297
+ noise.garbage_ratio
298
+ );
299
+ const quality_grade = computeQualityGrade(quality_score);
300
+ const counts = /* @__PURE__ */ new Map();
301
+ for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
302
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
303
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise };
304
+ }
305
+ function auditBatch(texts, options = {}) {
306
+ if (texts.length === 0) {
307
+ return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
308
+ }
309
+ const results = texts.map((t) => audit(t, options));
310
+ const seen = /* @__PURE__ */ new Set();
311
+ let dupCount = 0;
312
+ for (const t of texts) {
313
+ if (seen.has(t)) dupCount++;
314
+ else seen.add(t);
315
+ }
316
+ const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
317
+ const allPii = results.flatMap((r) => r.pii);
318
+ const counts = /* @__PURE__ */ new Map();
319
+ for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
320
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
321
+ const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
322
+ return { results, duplicate_ratio, pii_summary, avg_quality_score };
232
323
  }
233
324
  function mask(text, findings, options = {}) {
234
325
  return applyMask(text, findings, options.strategy ?? "redact");
@@ -237,6 +328,7 @@ function mask(text, findings, options = {}) {
237
328
  0 && (module.exports = {
238
329
  applyMask,
239
330
  audit,
331
+ auditBatch,
240
332
  detectPii,
241
333
  mask,
242
334
  noiseMetrics,
package/dist/index.d.cts CHANGED
@@ -27,32 +27,50 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
27
27
  *
28
28
  * @example
29
29
  * import { audit, mask } from "@flexorch/audit"
30
+ * import { readFileSync } from "fs"
30
31
  *
32
+ * const text = readFileSync("contract.txt", "utf8")
31
33
  * const result = audit(text, { locale: "tr" })
32
- * // {
33
- * // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
34
- * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
- * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
- * // }
34
+ *
35
+ * result.quality_grade // "A"
36
+ * result.quality_score // 0.91
37
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
+ *
39
+ * // Raw findings and metrics also available:
40
+ * result.pii // [{ type, value, start, end }, ...]
41
+ * result.quality // { completeness, avg_length, duplicate_ratio }
42
+ * result.noise // { garbage_ratio, encoding_ok }
37
43
  *
38
44
  * const clean = mask(text, result.pii, { strategy: "redact" })
39
45
  * // "Contact: [REDACTED_EMAIL]"
40
46
  */
41
47
 
42
- declare const version = "0.1.0";
48
+ declare const version = "0.3.0";
49
+ type QualityGrade = "A" | "B" | "C" | "D";
50
+ interface PiiSummaryEntry {
51
+ type: string;
52
+ count: number;
53
+ }
43
54
  interface AuditOptions {
44
55
  /**
45
56
  * Active locale-specific detectors.
46
- * - "tr" — Turkish: TCKN, phone_tr, name (default)
57
+ * - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
47
58
  * - "us" — US: SSN, E.164 phone
48
59
  * - "eu" — EU: E.164 phone
49
60
  * - "all" — All detectors
50
61
  *
51
- * Universal detectors (email, iban, credit_card, ip) are always active.
62
+ * Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
52
63
  */
53
64
  locale?: string;
54
65
  }
55
66
  interface AuditResult {
67
+ /** A/B/C/D overall LLM-readiness grade. */
68
+ quality_grade: QualityGrade;
69
+ /** 0.0–1.0 composite score (completeness + length + noise). */
70
+ quality_score: number;
71
+ /** PII findings aggregated by type: [{ type, count }]. */
72
+ pii_summary: PiiSummaryEntry[];
73
+ /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
56
74
  pii: PiiFinding[];
57
75
  quality: QualityMetrics;
58
76
  noise: NoiseMetrics;
@@ -65,9 +83,23 @@ interface MaskOptions {
65
83
  * Audit *text* for LLM dataset readiness.
66
84
  */
67
85
  declare function audit(text: string, options?: AuditOptions): AuditResult;
86
+ interface BatchAuditResult {
87
+ /** One AuditResult per input text, in order. */
88
+ results: AuditResult[];
89
+ /** Fraction of texts that are exact duplicates (0.0–1.0). */
90
+ duplicate_ratio: number;
91
+ /** PII counts aggregated across all texts. */
92
+ pii_summary: PiiSummaryEntry[];
93
+ /** Mean quality_score across all texts. */
94
+ avg_quality_score: number;
95
+ }
96
+ /**
97
+ * Audit a list of texts and aggregate metrics — including duplicate_ratio.
98
+ */
99
+ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
68
100
  /**
69
101
  * Apply masking to PII findings in *text*.
70
102
  */
71
103
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
104
 
73
- export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
105
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
package/dist/index.d.ts CHANGED
@@ -27,32 +27,50 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
27
27
  *
28
28
  * @example
29
29
  * import { audit, mask } from "@flexorch/audit"
30
+ * import { readFileSync } from "fs"
30
31
  *
32
+ * const text = readFileSync("contract.txt", "utf8")
31
33
  * const result = audit(text, { locale: "tr" })
32
- * // {
33
- * // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
34
- * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
- * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
- * // }
34
+ *
35
+ * result.quality_grade // "A"
36
+ * result.quality_score // 0.91
37
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
+ *
39
+ * // Raw findings and metrics also available:
40
+ * result.pii // [{ type, value, start, end }, ...]
41
+ * result.quality // { completeness, avg_length, duplicate_ratio }
42
+ * result.noise // { garbage_ratio, encoding_ok }
37
43
  *
38
44
  * const clean = mask(text, result.pii, { strategy: "redact" })
39
45
  * // "Contact: [REDACTED_EMAIL]"
40
46
  */
41
47
 
42
- declare const version = "0.1.0";
48
+ declare const version = "0.3.0";
49
+ type QualityGrade = "A" | "B" | "C" | "D";
50
+ interface PiiSummaryEntry {
51
+ type: string;
52
+ count: number;
53
+ }
43
54
  interface AuditOptions {
44
55
  /**
45
56
  * Active locale-specific detectors.
46
- * - "tr" — Turkish: TCKN, phone_tr, name (default)
57
+ * - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
47
58
  * - "us" — US: SSN, E.164 phone
48
59
  * - "eu" — EU: E.164 phone
49
60
  * - "all" — All detectors
50
61
  *
51
- * Universal detectors (email, iban, credit_card, ip) are always active.
62
+ * Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
52
63
  */
53
64
  locale?: string;
54
65
  }
55
66
  interface AuditResult {
67
+ /** A/B/C/D overall LLM-readiness grade. */
68
+ quality_grade: QualityGrade;
69
+ /** 0.0–1.0 composite score (completeness + length + noise). */
70
+ quality_score: number;
71
+ /** PII findings aggregated by type: [{ type, count }]. */
72
+ pii_summary: PiiSummaryEntry[];
73
+ /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
56
74
  pii: PiiFinding[];
57
75
  quality: QualityMetrics;
58
76
  noise: NoiseMetrics;
@@ -65,9 +83,23 @@ interface MaskOptions {
65
83
  * Audit *text* for LLM dataset readiness.
66
84
  */
67
85
  declare function audit(text: string, options?: AuditOptions): AuditResult;
86
+ interface BatchAuditResult {
87
+ /** One AuditResult per input text, in order. */
88
+ results: AuditResult[];
89
+ /** Fraction of texts that are exact duplicates (0.0–1.0). */
90
+ duplicate_ratio: number;
91
+ /** PII counts aggregated across all texts. */
92
+ pii_summary: PiiSummaryEntry[];
93
+ /** Mean quality_score across all texts. */
94
+ avg_quality_score: number;
95
+ }
96
+ /**
97
+ * Audit a list of texts and aggregate metrics — including duplicate_ratio.
98
+ */
99
+ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
68
100
  /**
69
101
  * Apply masking to PII findings in *text*.
70
102
  */
71
103
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
104
 
73
- export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
105
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
package/dist/index.js CHANGED
@@ -4,8 +4,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
4
4
  var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
5
5
  var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
6
6
  var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
7
+ var _H = "[0-9a-fA-F]{1,4}";
8
+ var IPV6_RE = new RegExp(
9
+ `(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
10
+ "gi"
11
+ );
7
12
  var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
8
13
  var TCKN_RE = /\b([1-9]\d{10})\b/g;
14
+ var VKN_RE = /\b([1-9]\d{9})\b/g;
9
15
  var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
10
16
  var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
11
17
  var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
@@ -22,6 +28,20 @@ function validTckn(s) {
22
28
  if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
23
29
  return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
24
30
  }
31
+ function validVkn(s) {
32
+ if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
33
+ const d = s.split("").map(Number);
34
+ let total = 0;
35
+ for (let i = 0; i < 9; i++) {
36
+ const x = (d[i] + (9 - i)) % 10;
37
+ if (x !== 0) {
38
+ let y = x * Math.pow(2, 9 - i) % 9;
39
+ if (y === 0) y = 9;
40
+ total += y;
41
+ }
42
+ }
43
+ return (10 - total % 10) % 10 === d[9];
44
+ }
25
45
  function luhn(number) {
26
46
  const digits = number.replace(/\D/g, "");
27
47
  if (digits.length < 13 || digits.length > 19) return false;
@@ -36,12 +56,26 @@ function luhn(number) {
36
56
  }
37
57
  return total % 10 === 0;
38
58
  }
59
+ function validIban(s) {
60
+ const rearranged = s.slice(4) + s.slice(0, 4);
61
+ const numeric = rearranged.toUpperCase().split("").map((c) => {
62
+ const code = c.charCodeAt(0);
63
+ return code >= 65 && code <= 90 ? String(code - 55) : c;
64
+ }).join("");
65
+ let remainder = 0;
66
+ for (let i = 0; i < numeric.length; i += 9) {
67
+ const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
68
+ if (!Number.isFinite(chunk)) return false;
69
+ remainder = chunk % 97;
70
+ }
71
+ return remainder === 1;
72
+ }
39
73
  var LOCALE_DETECTORS = {
40
- tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
74
+ tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
41
75
  us: /* @__PURE__ */ new Set(["ssn", "phone"]),
42
76
  eu: /* @__PURE__ */ new Set(["phone"])
43
77
  };
44
- var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
78
+ var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
45
79
  function activeDetectors(locale) {
46
80
  if (locale === "all") {
47
81
  const active2 = new Set(UNIVERSAL);
@@ -79,7 +113,15 @@ function detectPii(text, locale = "tr") {
79
113
  }
80
114
  }
81
115
  }
82
- if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
116
+ if (active.has("iban")) {
117
+ IBAN_RE.lastIndex = 0;
118
+ let m;
119
+ while ((m = IBAN_RE.exec(t)) !== null) {
120
+ if (validIban(m[0])) {
121
+ findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
122
+ }
123
+ }
124
+ }
83
125
  if (active.has("credit_card")) {
84
126
  CC_RE.lastIndex = 0;
85
127
  let m;
@@ -90,6 +132,7 @@ function detectPii(text, locale = "tr") {
90
132
  }
91
133
  }
92
134
  if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
135
+ if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
93
136
  if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
94
137
  if (active.has("national_id_tr")) {
95
138
  TCKN_RE.lastIndex = 0;
@@ -100,6 +143,15 @@ function detectPii(text, locale = "tr") {
100
143
  }
101
144
  }
102
145
  }
146
+ if (active.has("tax_id_tr")) {
147
+ VKN_RE.lastIndex = 0;
148
+ let m;
149
+ while ((m = VKN_RE.exec(t)) !== null) {
150
+ if (validVkn(m[1])) {
151
+ findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
152
+ }
153
+ }
154
+ }
103
155
  if (active.has("name")) {
104
156
  NAME_RE.lastIndex = 0;
105
157
  let m;
@@ -189,14 +241,52 @@ function applyMask(text, findings, strategy = "redact") {
189
241
  }
190
242
 
191
243
  // src/index.ts
192
- var version = "0.1.0";
244
+ var version = "0.3.0";
245
+ function computeQualityScore(completeness, avgLength, garbageRatio) {
246
+ const lengthScore = Math.min(avgLength / 500, 1);
247
+ const noiseScore = Math.max(0, 1 - garbageRatio * 10);
248
+ return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
249
+ }
250
+ function computeQualityGrade(score) {
251
+ if (score >= 0.85) return "A";
252
+ if (score >= 0.65) return "B";
253
+ if (score >= 0.4) return "C";
254
+ return "D";
255
+ }
193
256
  function audit(text, options = {}) {
194
257
  const locale = options.locale ?? "tr";
195
- return {
196
- pii: detectPii(text, locale),
197
- quality: qualityMetrics(text),
198
- noise: noiseMetrics(text)
199
- };
258
+ const pii = detectPii(text, locale);
259
+ const quality = qualityMetrics(text);
260
+ const noise = noiseMetrics(text);
261
+ const quality_score = computeQualityScore(
262
+ quality.completeness,
263
+ quality.avg_length,
264
+ noise.garbage_ratio
265
+ );
266
+ const quality_grade = computeQualityGrade(quality_score);
267
+ const counts = /* @__PURE__ */ new Map();
268
+ for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
269
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
270
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise };
271
+ }
272
+ function auditBatch(texts, options = {}) {
273
+ if (texts.length === 0) {
274
+ return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
275
+ }
276
+ const results = texts.map((t) => audit(t, options));
277
+ const seen = /* @__PURE__ */ new Set();
278
+ let dupCount = 0;
279
+ for (const t of texts) {
280
+ if (seen.has(t)) dupCount++;
281
+ else seen.add(t);
282
+ }
283
+ const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
284
+ const allPii = results.flatMap((r) => r.pii);
285
+ const counts = /* @__PURE__ */ new Map();
286
+ for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
287
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
288
+ const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
289
+ return { results, duplicate_ratio, pii_summary, avg_quality_score };
200
290
  }
201
291
  function mask(text, findings, options = {}) {
202
292
  return applyMask(text, findings, options.strategy ?? "redact");
@@ -204,6 +294,7 @@ function mask(text, findings, options = {}) {
204
294
  export {
205
295
  applyMask,
206
296
  audit,
297
+ auditBatch,
207
298
  detectPii,
208
299
  mask,
209
300
  noiseMetrics,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.1.0",
3
+ "version": "0.3.0",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",
@@ -17,7 +17,7 @@
17
17
  "homepage": "https://github.com/flexorch/flexorch-audit-js",
18
18
  "repository": {
19
19
  "type": "git",
20
- "url": "https://github.com/flexorch/flexorch-audit-js.git"
20
+ "url": "git+https://github.com/flexorch/flexorch-audit-js.git"
21
21
  },
22
22
  "bugs": {
23
23
  "url": "https://github.com/flexorch/flexorch-audit-js/issues"