@flexorch/audit 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -6
- package/dist/index.cjs +101 -9
- package/dist/index.d.cts +41 -9
- package/dist/index.d.ts +41 -9
- package/dist/index.js +100 -9
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
|
|
4
4
|
|
|
5
|
+
- **Quality grade** — A/B/C/D score that signals LLM-readiness at a glance
|
|
5
6
|
- **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
|
|
6
7
|
- **Quality metrics** — completeness, average length, duplicate ratio
|
|
7
8
|
- **Noise metrics** — garbage character ratio, encoding health
|
|
@@ -10,13 +11,19 @@ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one questi
|
|
|
10
11
|
|
|
11
12
|
```ts
|
|
12
13
|
import { audit, mask } from "@flexorch/audit"
|
|
14
|
+
import { readFileSync } from "fs"
|
|
13
15
|
|
|
16
|
+
const text = readFileSync("contract.txt", "utf8")
|
|
14
17
|
const result = audit(text, { locale: "tr" })
|
|
15
|
-
|
|
16
|
-
//
|
|
17
|
-
//
|
|
18
|
-
//
|
|
19
|
-
|
|
18
|
+
|
|
19
|
+
result.quality_grade // "A"
|
|
20
|
+
result.quality_score // 0.91 (0.0–1.0 composite)
|
|
21
|
+
result.pii_summary // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
|
|
22
|
+
|
|
23
|
+
// Raw findings and metrics — also available:
|
|
24
|
+
result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
|
|
25
|
+
result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
|
|
26
|
+
result.noise // { garbage_ratio: 0.0, encoding_ok: true }
|
|
20
27
|
|
|
21
28
|
const clean = mask(text, result.pii, { strategy: "redact" })
|
|
22
29
|
// "Contact: [REDACTED_EMAIL]"
|
|
@@ -28,6 +35,8 @@ const clean = mask(text, result.pii, { strategy: "redact" })
|
|
|
28
35
|
npm install @flexorch/audit
|
|
29
36
|
```
|
|
30
37
|
|
|
38
|
+

|
|
39
|
+
|
|
31
40
|
## Locale support
|
|
32
41
|
|
|
33
42
|
| `locale` | Active detectors |
|
|
@@ -68,6 +77,20 @@ Full type definitions included. No `@types/` package needed.
|
|
|
68
77
|
import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
|
|
69
78
|
```
|
|
70
79
|
|
|
80
|
+
## Quality grade
|
|
81
|
+
|
|
82
|
+
The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions:
|
|
83
|
+
|
|
84
|
+
| Grade | Score | Meaning |
|
|
85
|
+
|-------|-------|---------|
|
|
86
|
+
| A | ≥ 0.85 | Ready for LLM training or RAG |
|
|
87
|
+
| B | ≥ 0.65 | Usable with minor cleanup |
|
|
88
|
+
| C | ≥ 0.40 | Needs review before use |
|
|
89
|
+
| D | < 0.40 | Not suitable — empty, too short, or high noise |
|
|
90
|
+
|
|
91
|
+
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
92
|
+
where `lengthScore = Math.min(charCount / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`.
|
|
93
|
+
|
|
71
94
|
## Quality & noise
|
|
72
95
|
|
|
73
96
|
`duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
|
|
@@ -83,7 +106,7 @@ for (const t of texts) {
|
|
|
83
106
|
const duplicateRatio = duplicates / texts.length
|
|
84
107
|
```
|
|
85
108
|
|
|
86
|
-
## Limitations (v0.1)
|
|
109
|
+
## Limitations (v0.2)
|
|
87
110
|
|
|
88
111
|
- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
|
|
89
112
|
- `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).
|
package/dist/index.cjs
CHANGED
|
@@ -22,6 +22,7 @@ var index_exports = {};
|
|
|
22
22
|
__export(index_exports, {
|
|
23
23
|
applyMask: () => applyMask,
|
|
24
24
|
audit: () => audit,
|
|
25
|
+
auditBatch: () => auditBatch,
|
|
25
26
|
detectPii: () => detectPii,
|
|
26
27
|
mask: () => mask,
|
|
27
28
|
noiseMetrics: () => noiseMetrics,
|
|
@@ -36,8 +37,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
|
|
|
36
37
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
37
38
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
38
39
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
40
|
+
var _H = "[0-9a-fA-F]{1,4}";
|
|
41
|
+
var IPV6_RE = new RegExp(
|
|
42
|
+
`(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
|
|
43
|
+
"gi"
|
|
44
|
+
);
|
|
39
45
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
40
46
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
47
|
+
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
41
48
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
42
49
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
43
50
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -54,6 +61,20 @@ function validTckn(s) {
|
|
|
54
61
|
if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
|
|
55
62
|
return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
|
|
56
63
|
}
|
|
64
|
+
function validVkn(s) {
|
|
65
|
+
if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
66
|
+
const d = s.split("").map(Number);
|
|
67
|
+
let total = 0;
|
|
68
|
+
for (let i = 0; i < 9; i++) {
|
|
69
|
+
const x = (d[i] + (9 - i)) % 10;
|
|
70
|
+
if (x !== 0) {
|
|
71
|
+
let y = x * Math.pow(2, 9 - i) % 9;
|
|
72
|
+
if (y === 0) y = 9;
|
|
73
|
+
total += y;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
return (10 - total % 10) % 10 === d[9];
|
|
77
|
+
}
|
|
57
78
|
function luhn(number) {
|
|
58
79
|
const digits = number.replace(/\D/g, "");
|
|
59
80
|
if (digits.length < 13 || digits.length > 19) return false;
|
|
@@ -68,12 +89,26 @@ function luhn(number) {
|
|
|
68
89
|
}
|
|
69
90
|
return total % 10 === 0;
|
|
70
91
|
}
|
|
92
|
+
function validIban(s) {
|
|
93
|
+
const rearranged = s.slice(4) + s.slice(0, 4);
|
|
94
|
+
const numeric = rearranged.toUpperCase().split("").map((c) => {
|
|
95
|
+
const code = c.charCodeAt(0);
|
|
96
|
+
return code >= 65 && code <= 90 ? String(code - 55) : c;
|
|
97
|
+
}).join("");
|
|
98
|
+
let remainder = 0;
|
|
99
|
+
for (let i = 0; i < numeric.length; i += 9) {
|
|
100
|
+
const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
|
|
101
|
+
if (!Number.isFinite(chunk)) return false;
|
|
102
|
+
remainder = chunk % 97;
|
|
103
|
+
}
|
|
104
|
+
return remainder === 1;
|
|
105
|
+
}
|
|
71
106
|
var LOCALE_DETECTORS = {
|
|
72
|
-
tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
|
|
107
|
+
tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
|
|
73
108
|
us: /* @__PURE__ */ new Set(["ssn", "phone"]),
|
|
74
109
|
eu: /* @__PURE__ */ new Set(["phone"])
|
|
75
110
|
};
|
|
76
|
-
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
|
|
111
|
+
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
77
112
|
function activeDetectors(locale) {
|
|
78
113
|
if (locale === "all") {
|
|
79
114
|
const active2 = new Set(UNIVERSAL);
|
|
@@ -111,7 +146,15 @@ function detectPii(text, locale = "tr") {
|
|
|
111
146
|
}
|
|
112
147
|
}
|
|
113
148
|
}
|
|
114
|
-
if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
|
|
149
|
+
if (active.has("iban")) {
|
|
150
|
+
IBAN_RE.lastIndex = 0;
|
|
151
|
+
let m;
|
|
152
|
+
while ((m = IBAN_RE.exec(t)) !== null) {
|
|
153
|
+
if (validIban(m[0])) {
|
|
154
|
+
findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}
|
|
115
158
|
if (active.has("credit_card")) {
|
|
116
159
|
CC_RE.lastIndex = 0;
|
|
117
160
|
let m;
|
|
@@ -122,6 +165,7 @@ function detectPii(text, locale = "tr") {
|
|
|
122
165
|
}
|
|
123
166
|
}
|
|
124
167
|
if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
|
|
168
|
+
if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
|
|
125
169
|
if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
|
|
126
170
|
if (active.has("national_id_tr")) {
|
|
127
171
|
TCKN_RE.lastIndex = 0;
|
|
@@ -132,6 +176,15 @@ function detectPii(text, locale = "tr") {
|
|
|
132
176
|
}
|
|
133
177
|
}
|
|
134
178
|
}
|
|
179
|
+
if (active.has("tax_id_tr")) {
|
|
180
|
+
VKN_RE.lastIndex = 0;
|
|
181
|
+
let m;
|
|
182
|
+
while ((m = VKN_RE.exec(t)) !== null) {
|
|
183
|
+
if (validVkn(m[1])) {
|
|
184
|
+
findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
135
188
|
if (active.has("name")) {
|
|
136
189
|
NAME_RE.lastIndex = 0;
|
|
137
190
|
let m;
|
|
@@ -221,14 +274,52 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
221
274
|
}
|
|
222
275
|
|
|
223
276
|
// src/index.ts
|
|
224
|
-
var version = "0.1.0";
|
|
277
|
+
var version = "0.3.0";
|
|
278
|
+
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
279
|
+
const lengthScore = Math.min(avgLength / 500, 1);
|
|
280
|
+
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
281
|
+
return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
|
|
282
|
+
}
|
|
283
|
+
function computeQualityGrade(score) {
|
|
284
|
+
if (score >= 0.85) return "A";
|
|
285
|
+
if (score >= 0.65) return "B";
|
|
286
|
+
if (score >= 0.4) return "C";
|
|
287
|
+
return "D";
|
|
288
|
+
}
|
|
225
289
|
function audit(text, options = {}) {
|
|
226
290
|
const locale = options.locale ?? "tr";
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
291
|
+
const pii = detectPii(text, locale);
|
|
292
|
+
const quality = qualityMetrics(text);
|
|
293
|
+
const noise = noiseMetrics(text);
|
|
294
|
+
const quality_score = computeQualityScore(
|
|
295
|
+
quality.completeness,
|
|
296
|
+
quality.avg_length,
|
|
297
|
+
noise.garbage_ratio
|
|
298
|
+
);
|
|
299
|
+
const quality_grade = computeQualityGrade(quality_score);
|
|
300
|
+
const counts = /* @__PURE__ */ new Map();
|
|
301
|
+
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
302
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
303
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
304
|
+
}
|
|
305
|
+
function auditBatch(texts, options = {}) {
|
|
306
|
+
if (texts.length === 0) {
|
|
307
|
+
return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
|
|
308
|
+
}
|
|
309
|
+
const results = texts.map((t) => audit(t, options));
|
|
310
|
+
const seen = /* @__PURE__ */ new Set();
|
|
311
|
+
let dupCount = 0;
|
|
312
|
+
for (const t of texts) {
|
|
313
|
+
if (seen.has(t)) dupCount++;
|
|
314
|
+
else seen.add(t);
|
|
315
|
+
}
|
|
316
|
+
const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
|
|
317
|
+
const allPii = results.flatMap((r) => r.pii);
|
|
318
|
+
const counts = /* @__PURE__ */ new Map();
|
|
319
|
+
for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
320
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
321
|
+
const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
|
|
322
|
+
return { results, duplicate_ratio, pii_summary, avg_quality_score };
|
|
232
323
|
}
|
|
233
324
|
function mask(text, findings, options = {}) {
|
|
234
325
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
@@ -237,6 +328,7 @@ function mask(text, findings, options = {}) {
|
|
|
237
328
|
0 && (module.exports = {
|
|
238
329
|
applyMask,
|
|
239
330
|
audit,
|
|
331
|
+
auditBatch,
|
|
240
332
|
detectPii,
|
|
241
333
|
mask,
|
|
242
334
|
noiseMetrics,
|
package/dist/index.d.cts
CHANGED
|
@@ -27,32 +27,50 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
27
27
|
*
|
|
28
28
|
* @example
|
|
29
29
|
* import { audit, mask } from "@flexorch/audit"
|
|
30
|
+
* import { readFileSync } from "fs"
|
|
30
31
|
*
|
|
32
|
+
* const text = readFileSync("contract.txt", "utf8")
|
|
31
33
|
* const result = audit(text, { locale: "tr" })
|
|
32
|
-
*
|
|
33
|
-
* //
|
|
34
|
-
* //
|
|
35
|
-
* //
|
|
36
|
-
*
|
|
34
|
+
*
|
|
35
|
+
* result.quality_grade // "A"
|
|
36
|
+
* result.quality_score // 0.91
|
|
37
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
|
+
*
|
|
39
|
+
* // Raw findings and metrics also available:
|
|
40
|
+
* result.pii // [{ type, value, start, end }, ...]
|
|
41
|
+
* result.quality // { completeness, avg_length, duplicate_ratio }
|
|
42
|
+
* result.noise // { garbage_ratio, encoding_ok }
|
|
37
43
|
*
|
|
38
44
|
* const clean = mask(text, result.pii, { strategy: "redact" })
|
|
39
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
40
46
|
*/
|
|
41
47
|
|
|
42
|
-
declare const version = "0.1.0";
|
|
48
|
+
declare const version = "0.3.0";
|
|
49
|
+
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
|
+
interface PiiSummaryEntry {
|
|
51
|
+
type: string;
|
|
52
|
+
count: number;
|
|
53
|
+
}
|
|
43
54
|
interface AuditOptions {
|
|
44
55
|
/**
|
|
45
56
|
* Active locale-specific detectors.
|
|
46
|
-
* - "tr" — Turkish: TCKN, phone_tr, name (default)
|
|
57
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
|
|
47
58
|
* - "us" — US: SSN, E.164 phone
|
|
48
59
|
* - "eu" — EU: E.164 phone
|
|
49
60
|
* - "all" — All detectors
|
|
50
61
|
*
|
|
51
|
-
* Universal detectors (email, iban, credit_card, ip) are always active.
|
|
62
|
+
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
52
63
|
*/
|
|
53
64
|
locale?: string;
|
|
54
65
|
}
|
|
55
66
|
interface AuditResult {
|
|
67
|
+
/** A/B/C/D overall LLM-readiness grade. */
|
|
68
|
+
quality_grade: QualityGrade;
|
|
69
|
+
/** 0.0–1.0 composite score (completeness + length + noise). */
|
|
70
|
+
quality_score: number;
|
|
71
|
+
/** PII findings aggregated by type: [{ type, count }]. */
|
|
72
|
+
pii_summary: PiiSummaryEntry[];
|
|
73
|
+
/** Raw PII findings sorted by position: [{ type, value, start, end }]. */
|
|
56
74
|
pii: PiiFinding[];
|
|
57
75
|
quality: QualityMetrics;
|
|
58
76
|
noise: NoiseMetrics;
|
|
@@ -65,9 +83,23 @@ interface MaskOptions {
|
|
|
65
83
|
* Audit *text* for LLM dataset readiness.
|
|
66
84
|
*/
|
|
67
85
|
declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
86
|
+
interface BatchAuditResult {
|
|
87
|
+
/** One AuditResult per input text, in order. */
|
|
88
|
+
results: AuditResult[];
|
|
89
|
+
/** Fraction of texts that are exact duplicates (0.0–1.0). */
|
|
90
|
+
duplicate_ratio: number;
|
|
91
|
+
/** PII counts aggregated across all texts. */
|
|
92
|
+
pii_summary: PiiSummaryEntry[];
|
|
93
|
+
/** Mean quality_score across all texts. */
|
|
94
|
+
avg_quality_score: number;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Audit a list of texts and aggregate metrics — including duplicate_ratio.
|
|
98
|
+
*/
|
|
99
|
+
declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
|
|
68
100
|
/**
|
|
69
101
|
* Apply masking to PII findings in *text*.
|
|
70
102
|
*/
|
|
71
103
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
72
104
|
|
|
73
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
105
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -27,32 +27,50 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
27
27
|
*
|
|
28
28
|
* @example
|
|
29
29
|
* import { audit, mask } from "@flexorch/audit"
|
|
30
|
+
* import { readFileSync } from "fs"
|
|
30
31
|
*
|
|
32
|
+
* const text = readFileSync("contract.txt", "utf8")
|
|
31
33
|
* const result = audit(text, { locale: "tr" })
|
|
32
|
-
*
|
|
33
|
-
* //
|
|
34
|
-
* //
|
|
35
|
-
* //
|
|
36
|
-
*
|
|
34
|
+
*
|
|
35
|
+
* result.quality_grade // "A"
|
|
36
|
+
* result.quality_score // 0.91
|
|
37
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
|
+
*
|
|
39
|
+
* // Raw findings and metrics also available:
|
|
40
|
+
* result.pii // [{ type, value, start, end }, ...]
|
|
41
|
+
* result.quality // { completeness, avg_length, duplicate_ratio }
|
|
42
|
+
* result.noise // { garbage_ratio, encoding_ok }
|
|
37
43
|
*
|
|
38
44
|
* const clean = mask(text, result.pii, { strategy: "redact" })
|
|
39
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
40
46
|
*/
|
|
41
47
|
|
|
42
|
-
declare const version = "0.1.0";
|
|
48
|
+
declare const version = "0.3.0";
|
|
49
|
+
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
|
+
interface PiiSummaryEntry {
|
|
51
|
+
type: string;
|
|
52
|
+
count: number;
|
|
53
|
+
}
|
|
43
54
|
interface AuditOptions {
|
|
44
55
|
/**
|
|
45
56
|
* Active locale-specific detectors.
|
|
46
|
-
* - "tr" — Turkish: TCKN, phone_tr, name (default)
|
|
57
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
|
|
47
58
|
* - "us" — US: SSN, E.164 phone
|
|
48
59
|
* - "eu" — EU: E.164 phone
|
|
49
60
|
* - "all" — All detectors
|
|
50
61
|
*
|
|
51
|
-
* Universal detectors (email, iban, credit_card, ip) are always active.
|
|
62
|
+
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
52
63
|
*/
|
|
53
64
|
locale?: string;
|
|
54
65
|
}
|
|
55
66
|
interface AuditResult {
|
|
67
|
+
/** A/B/C/D overall LLM-readiness grade. */
|
|
68
|
+
quality_grade: QualityGrade;
|
|
69
|
+
/** 0.0–1.0 composite score (completeness + length + noise). */
|
|
70
|
+
quality_score: number;
|
|
71
|
+
/** PII findings aggregated by type: [{ type, count }]. */
|
|
72
|
+
pii_summary: PiiSummaryEntry[];
|
|
73
|
+
/** Raw PII findings sorted by position: [{ type, value, start, end }]. */
|
|
56
74
|
pii: PiiFinding[];
|
|
57
75
|
quality: QualityMetrics;
|
|
58
76
|
noise: NoiseMetrics;
|
|
@@ -65,9 +83,23 @@ interface MaskOptions {
|
|
|
65
83
|
* Audit *text* for LLM dataset readiness.
|
|
66
84
|
*/
|
|
67
85
|
declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
86
|
+
interface BatchAuditResult {
|
|
87
|
+
/** One AuditResult per input text, in order. */
|
|
88
|
+
results: AuditResult[];
|
|
89
|
+
/** Fraction of texts that are exact duplicates (0.0–1.0). */
|
|
90
|
+
duplicate_ratio: number;
|
|
91
|
+
/** PII counts aggregated across all texts. */
|
|
92
|
+
pii_summary: PiiSummaryEntry[];
|
|
93
|
+
/** Mean quality_score across all texts. */
|
|
94
|
+
avg_quality_score: number;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Audit a list of texts and aggregate metrics — including duplicate_ratio.
|
|
98
|
+
*/
|
|
99
|
+
declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
|
|
68
100
|
/**
|
|
69
101
|
* Apply masking to PII findings in *text*.
|
|
70
102
|
*/
|
|
71
103
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
72
104
|
|
|
73
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
105
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.js
CHANGED
|
@@ -4,8 +4,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
|
|
|
4
4
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
5
5
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
6
6
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
7
|
+
var _H = "[0-9a-fA-F]{1,4}";
|
|
8
|
+
var IPV6_RE = new RegExp(
|
|
9
|
+
`(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
|
|
10
|
+
"gi"
|
|
11
|
+
);
|
|
7
12
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
8
13
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
14
|
+
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
9
15
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
10
16
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
11
17
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -22,6 +28,20 @@ function validTckn(s) {
|
|
|
22
28
|
if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
|
|
23
29
|
return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
|
|
24
30
|
}
|
|
31
|
+
function validVkn(s) {
|
|
32
|
+
if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
33
|
+
const d = s.split("").map(Number);
|
|
34
|
+
let total = 0;
|
|
35
|
+
for (let i = 0; i < 9; i++) {
|
|
36
|
+
const x = (d[i] + (9 - i)) % 10;
|
|
37
|
+
if (x !== 0) {
|
|
38
|
+
let y = x * Math.pow(2, 9 - i) % 9;
|
|
39
|
+
if (y === 0) y = 9;
|
|
40
|
+
total += y;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
return (10 - total % 10) % 10 === d[9];
|
|
44
|
+
}
|
|
25
45
|
function luhn(number) {
|
|
26
46
|
const digits = number.replace(/\D/g, "");
|
|
27
47
|
if (digits.length < 13 || digits.length > 19) return false;
|
|
@@ -36,12 +56,26 @@ function luhn(number) {
|
|
|
36
56
|
}
|
|
37
57
|
return total % 10 === 0;
|
|
38
58
|
}
|
|
59
|
+
function validIban(s) {
|
|
60
|
+
const rearranged = s.slice(4) + s.slice(0, 4);
|
|
61
|
+
const numeric = rearranged.toUpperCase().split("").map((c) => {
|
|
62
|
+
const code = c.charCodeAt(0);
|
|
63
|
+
return code >= 65 && code <= 90 ? String(code - 55) : c;
|
|
64
|
+
}).join("");
|
|
65
|
+
let remainder = 0;
|
|
66
|
+
for (let i = 0; i < numeric.length; i += 9) {
|
|
67
|
+
const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
|
|
68
|
+
if (!Number.isFinite(chunk)) return false;
|
|
69
|
+
remainder = chunk % 97;
|
|
70
|
+
}
|
|
71
|
+
return remainder === 1;
|
|
72
|
+
}
|
|
39
73
|
var LOCALE_DETECTORS = {
|
|
40
|
-
tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
|
|
74
|
+
tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
|
|
41
75
|
us: /* @__PURE__ */ new Set(["ssn", "phone"]),
|
|
42
76
|
eu: /* @__PURE__ */ new Set(["phone"])
|
|
43
77
|
};
|
|
44
|
-
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
|
|
78
|
+
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
45
79
|
function activeDetectors(locale) {
|
|
46
80
|
if (locale === "all") {
|
|
47
81
|
const active2 = new Set(UNIVERSAL);
|
|
@@ -79,7 +113,15 @@ function detectPii(text, locale = "tr") {
|
|
|
79
113
|
}
|
|
80
114
|
}
|
|
81
115
|
}
|
|
82
|
-
if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
|
|
116
|
+
if (active.has("iban")) {
|
|
117
|
+
IBAN_RE.lastIndex = 0;
|
|
118
|
+
let m;
|
|
119
|
+
while ((m = IBAN_RE.exec(t)) !== null) {
|
|
120
|
+
if (validIban(m[0])) {
|
|
121
|
+
findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
83
125
|
if (active.has("credit_card")) {
|
|
84
126
|
CC_RE.lastIndex = 0;
|
|
85
127
|
let m;
|
|
@@ -90,6 +132,7 @@ function detectPii(text, locale = "tr") {
|
|
|
90
132
|
}
|
|
91
133
|
}
|
|
92
134
|
if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
|
|
135
|
+
if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
|
|
93
136
|
if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
|
|
94
137
|
if (active.has("national_id_tr")) {
|
|
95
138
|
TCKN_RE.lastIndex = 0;
|
|
@@ -100,6 +143,15 @@ function detectPii(text, locale = "tr") {
|
|
|
100
143
|
}
|
|
101
144
|
}
|
|
102
145
|
}
|
|
146
|
+
if (active.has("tax_id_tr")) {
|
|
147
|
+
VKN_RE.lastIndex = 0;
|
|
148
|
+
let m;
|
|
149
|
+
while ((m = VKN_RE.exec(t)) !== null) {
|
|
150
|
+
if (validVkn(m[1])) {
|
|
151
|
+
findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
103
155
|
if (active.has("name")) {
|
|
104
156
|
NAME_RE.lastIndex = 0;
|
|
105
157
|
let m;
|
|
@@ -189,14 +241,52 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
189
241
|
}
|
|
190
242
|
|
|
191
243
|
// src/index.ts
|
|
192
|
-
var version = "0.1.0";
|
|
244
|
+
var version = "0.3.0";
|
|
245
|
+
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
246
|
+
const lengthScore = Math.min(avgLength / 500, 1);
|
|
247
|
+
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
248
|
+
return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
|
|
249
|
+
}
|
|
250
|
+
function computeQualityGrade(score) {
|
|
251
|
+
if (score >= 0.85) return "A";
|
|
252
|
+
if (score >= 0.65) return "B";
|
|
253
|
+
if (score >= 0.4) return "C";
|
|
254
|
+
return "D";
|
|
255
|
+
}
|
|
193
256
|
function audit(text, options = {}) {
|
|
194
257
|
const locale = options.locale ?? "tr";
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
258
|
+
const pii = detectPii(text, locale);
|
|
259
|
+
const quality = qualityMetrics(text);
|
|
260
|
+
const noise = noiseMetrics(text);
|
|
261
|
+
const quality_score = computeQualityScore(
|
|
262
|
+
quality.completeness,
|
|
263
|
+
quality.avg_length,
|
|
264
|
+
noise.garbage_ratio
|
|
265
|
+
);
|
|
266
|
+
const quality_grade = computeQualityGrade(quality_score);
|
|
267
|
+
const counts = /* @__PURE__ */ new Map();
|
|
268
|
+
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
269
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
270
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
271
|
+
}
|
|
272
|
+
function auditBatch(texts, options = {}) {
|
|
273
|
+
if (texts.length === 0) {
|
|
274
|
+
return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
|
|
275
|
+
}
|
|
276
|
+
const results = texts.map((t) => audit(t, options));
|
|
277
|
+
const seen = /* @__PURE__ */ new Set();
|
|
278
|
+
let dupCount = 0;
|
|
279
|
+
for (const t of texts) {
|
|
280
|
+
if (seen.has(t)) dupCount++;
|
|
281
|
+
else seen.add(t);
|
|
282
|
+
}
|
|
283
|
+
const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
|
|
284
|
+
const allPii = results.flatMap((r) => r.pii);
|
|
285
|
+
const counts = /* @__PURE__ */ new Map();
|
|
286
|
+
for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
287
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
288
|
+
const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
|
|
289
|
+
return { results, duplicate_ratio, pii_summary, avg_quality_score };
|
|
200
290
|
}
|
|
201
291
|
function mask(text, findings, options = {}) {
|
|
202
292
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
@@ -204,6 +294,7 @@ function mask(text, findings, options = {}) {
|
|
|
204
294
|
export {
|
|
205
295
|
applyMask,
|
|
206
296
|
audit,
|
|
297
|
+
auditBatch,
|
|
207
298
|
detectPii,
|
|
208
299
|
mask,
|
|
209
300
|
noiseMetrics,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@flexorch/audit",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pii",
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"homepage": "https://github.com/flexorch/flexorch-audit-js",
|
|
18
18
|
"repository": {
|
|
19
19
|
"type": "git",
|
|
20
|
-
"url": "https://github.com/flexorch/flexorch-audit-js.git"
|
|
20
|
+
"url": "git+https://github.com/flexorch/flexorch-audit-js.git"
|
|
21
21
|
},
|
|
22
22
|
"bugs": {
|
|
23
23
|
"url": "https://github.com/flexorch/flexorch-audit-js/issues"
|