@flexorch/audit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 FlexOrch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,101 @@
1
+ # @flexorch/audit
2
+
3
+ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
4
+
5
+ - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
6
+ - **Quality metrics** — completeness, average length, duplicate ratio
7
+ - **Noise metrics** — garbage character ratio, encoding health
8
+ - **Masking** — redact / replace / token / hash strategies
9
+ - **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
10
+
11
+ ```ts
12
+ import { audit, mask } from "@flexorch/audit"
13
+
14
+ const result = audit(text, { locale: "tr" })
15
+ // {
16
+ // pii: [{ type: "email", value: "ali@example.com", start: 9, end: 24 }],
17
+ // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
18
+ // noise: { garbage_ratio: 0.0, encoding_ok: true },
19
+ // }
20
+
21
+ const clean = mask(text, result.pii, { strategy: "redact" })
22
+ // "Contact: [REDACTED_EMAIL]"
23
+ ```
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ npm install @flexorch/audit
29
+ ```
30
+
31
+ ## Locale support
32
+
33
+ | `locale` | Active detectors |
34
+ |----------|-----------------|
35
+ | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
36
+ | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
37
+ | `"eu"` | email, iban, credit_card, ip + E.164 phone |
38
+ | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
39
+
40
+ ## PII types
41
+
42
+ | Type | Description | Locale |
43
+ |------|-------------|--------|
44
+ | `email` | Email address (simplified RFC 5321-style pattern) | all |
45
+ | `iban` | ISO 13616 IBAN (any country) | all |
46
+ | `credit_card` | 16 digits in four groups separated by spaces or hyphens, Luhn-validated | all |
47
+ | `ip` | IPv4 address | all |
48
+ | `phone_tr` | Turkish mobile (optional +90/0 prefix + 10 digits starting with 5) | tr |
49
+ | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
50
+ | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
51
+ | `phone` | E.164 international phone | us, eu |
52
+ | `ssn` | US Social Security Number (###-##-####) | us |
53
+
54
+ ## Masking strategies
55
+
56
+ | Strategy | Example output |
57
+ |----------|----------------|
58
+ | `redact` (default) | `[REDACTED_EMAIL]` |
59
+ | `replace` | `user@example.com` (realistic synthetic) |
60
+ | `token` | `<PII_EMAIL_1>` (separate counter per PII type) |
61
+ | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
62
+
63
+ ## TypeScript
64
+
65
+ Full type definitions included. No `@types/` package needed.
66
+
67
+ ```ts
68
+ import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
69
+ ```
70
+
71
+ ## Quality & noise
72
+
73
+ `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
74
+
75
+ ```ts
76
+ const texts = dataset.map((r) => r.text)
77
+ const seen = new Set<string>()
78
+ let duplicates = 0
79
+ for (const t of texts) {
80
+ if (seen.has(t)) duplicates++
81
+ else seen.add(t)
82
+ }
83
+ const duplicateRatio = duplicates / texts.length
84
+ ```
85
+
86
+ ## Limitations (v0.1)
87
+
88
+ - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
89
+ - `duplicate_ratio` is always `null` for a single call; compute it across your dataset manually (see above).
90
+ - IPv6 not detected.
91
+ - IBAN format-only check; mod-97 validation not performed.
92
+
93
+ ## Also available for Python
94
+
95
+ ```bash
96
+ pip install flexorch-audit
97
+ ```
98
+
99
+ ## License
100
+
101
+ MIT
package/dist/index.cjs ADDED
@@ -0,0 +1,245 @@
1
+ "use strict";
2
// esbuild CommonJS interop helpers: short aliases for the Object reflection
// built-ins, followed by the three functions that wire up module.exports.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines every entry of `all` on `target` as an enumerable getter.
var __export = function (target, all) {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Mirrors the own properties of `from` onto `to` as live getters, skipping
// keys `to` already owns and the single key named by `except`.
var __copyProps = function (to, from, except, desc) {
  var copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (let key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

// Builds a fresh object flagged with a non-enumerable `__esModule: true`
// and copies the properties of `mod` onto it.
var __toCommonJS = function (mod) {
  var flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
19
+
20
+ // src/index.ts
21
+ var index_exports = {};
22
+ __export(index_exports, {
23
+ applyMask: () => applyMask,
24
+ audit: () => audit,
25
+ detectPii: () => detectPii,
26
+ mask: () => mask,
27
+ noiseMetrics: () => noiseMetrics,
28
+ qualityMetrics: () => qualityMetrics,
29
+ version: () => version
30
+ });
31
+ module.exports = __toCommonJS(index_exports);
32
+
33
+ // src/pii.ts
34
// Detector patterns. All of them carry the `g` flag, so callers reset
// `lastIndex` before each scan.
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
// The previous form `\b(?:\+90|0)?…` could not capture "+90" after a space,
// a colon or at the start of the text, because there is no word boundary
// between a non-word character and "+". The country code was therefore left
// out of the finding. The word boundary is now required only on the
// alternatives that begin with a digit.
var PHONE_TR_RE = /(?:\+90|\b0|\b)\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
var TCKN_RE = /\b([1-9]\d{10})\b/g;
// Label prefixes that introduce a name (Turkish and English variants).
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
// One to three capitalised words; this is the only capturing group in NAME_RE.
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
var NAME_RE = new RegExp(
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
  "gu"
);
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
49
/**
 * Validates the two check digits of an 11-digit Turkish national ID (TCKN).
 *
 * @param {string} s - Candidate string of digits.
 * @returns {boolean} True when the length, the leading digit and both check
 *   digits are consistent.
 */
function validTckn(s) {
  if (s.length !== 11 || s[0] === "0") return false;
  const d = s.split("").map(Number);
  const sumOdd = d[0] + d[2] + d[4] + d[6] + d[8];
  const sumEven = d[1] + d[3] + d[5] + d[7];
  // JavaScript's `%` keeps the sign of the dividend, so a negative
  // `sumOdd * 7 - sumEven` would give a negative remainder that can never
  // equal a digit. Normalise it into 0..9 first.
  const tenth = (((sumOdd * 7 - sumEven) % 10) + 10) % 10;
  if (tenth !== d[9]) return false;
  const eleventh = d.slice(0, 10).reduce((a, b) => a + b, 0) % 10;
  return eleventh === d[10];
}
57
/**
 * Luhn checksum test for a card-number candidate.
 *
 * @param {string} number - Candidate text; every non-digit character is ignored.
 * @returns {boolean} True when 13 to 19 digits remain and their Luhn sum is a
 *   multiple of 10.
 */
function luhn(number) {
  const digits = number.replace(/\D/g, "");
  if (digits.length < 13 || digits.length > 19) return false;
  let total = 0;
  // Walk from the rightmost digit; every second digit is doubled.
  for (let i = 0; i < digits.length; i++) {
    let d = Number.parseInt(digits[digits.length - 1 - i], 10);
    if (i % 2 === 1) {
      d *= 2;
      if (d > 9) d -= 9;
    }
    total += d;
  }
  return total % 10 === 0;
}
71
// Detector names that are switched on per locale key.
var LOCALE_DETECTORS = {
  tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
  us: /* @__PURE__ */ new Set(["ssn", "phone"]),
  eu: /* @__PURE__ */ new Set(["phone"])
};
// Detector names that are active for every locale.
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);

/**
 * Resolves the set of detector names to run for a locale.
 *
 * @param {string} locale - "tr", "us", "eu" or "all". Any other value yields
 *   only the universal detectors.
 * @returns {Set<string>} A fresh set the caller may mutate.
 */
function activeDetectors(locale) {
  const active = new Set(UNIVERSAL);
  if (locale === "all") {
    for (const detectors of Object.values(LOCALE_DETECTORS)) {
      for (const name of detectors) active.add(name);
    }
    // With phone_tr present, the generic phone detector is dropped entirely.
    if (active.has("phone_tr")) active.delete("phone");
    return active;
  }
  // Own-property check: a plain `LOCALE_DETECTORS[locale]` lookup would also
  // resolve inherited keys such as "constructor" and crash on `.forEach`.
  if (Object.prototype.hasOwnProperty.call(LOCALE_DETECTORS, locale)) {
    for (const name of LOCALE_DETECTORS[locale]) active.add(name);
  }
  return active;
}
90
/**
 * Collects every match of `re` in `text` as a finding record.
 *
 * @param {RegExp} re - Pattern to apply. A pattern without the `g` flag is
 *   scanned through a global copy so the loop always terminates.
 * @param {string} text - Text to scan.
 * @param {string} type - Label stored on every finding.
 * @returns {{type: string, value: string, start: number, end: number}[]}
 */
function findAll(re, text, type) {
  // exec() on a non-global regex returns the first match forever.
  const scanner = re.global ? re : new RegExp(re.source, `${re.flags}g`);
  const results = [];
  scanner.lastIndex = 0;
  let m;
  while ((m = scanner.exec(text)) !== null) {
    results.push({ type, value: m[0], start: m.index, end: m.index + m[0].length });
    // A zero-length match leaves lastIndex where it was; step past it.
    if (m[0].length === 0) scanner.lastIndex++;
  }
  return results;
}
99
/**
 * Scans `text` for PII with the detectors enabled for `locale`.
 *
 * @param {string} text - Input to scan; null/undefined is treated as "".
 * @param {string} [locale="tr"] - Locale key passed to activeDetectors().
 * @returns {{type: string, value: string, start: number, end: number}[]}
 *   Findings ordered by ascending start offset.
 */
function detectPii(text, locale = "tr") {
  const enabled = activeDetectors(locale);
  const input = text ?? "";
  const found = [];
  const record = (type, value, start, length) => {
    found.push({ type, value, start, end: start + length });
  };

  if (enabled.has("email")) {
    found.push(...findAll(EMAIL_RE, input, "email"));
  }

  if (enabled.has("phone")) {
    // Candidates with fewer than 10 digits are discarded.
    PHONE_INTL_RE.lastIndex = 0;
    for (let hit = PHONE_INTL_RE.exec(input); hit !== null; hit = PHONE_INTL_RE.exec(input)) {
      if (hit[0].replace(/\D/g, "").length >= 10) {
        record("phone", hit[0], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("iban")) {
    found.push(...findAll(IBAN_RE, input, "iban"));
  }

  if (enabled.has("credit_card")) {
    // Only Luhn-valid candidates are reported.
    CC_RE.lastIndex = 0;
    for (let hit = CC_RE.exec(input); hit !== null; hit = CC_RE.exec(input)) {
      if (luhn(hit[0])) {
        record("credit_card", hit[0], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("ip")) {
    found.push(...findAll(IPV4_RE, input, "ip"));
  }

  if (enabled.has("phone_tr")) {
    found.push(...findAll(PHONE_TR_RE, input, "phone_tr"));
  }

  if (enabled.has("national_id_tr")) {
    // Only candidates that pass the TCKN checksum are reported.
    TCKN_RE.lastIndex = 0;
    for (let hit = TCKN_RE.exec(input); hit !== null; hit = TCKN_RE.exec(input)) {
      if (validTckn(hit[1])) {
        record("national_id_tr", hit[1], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("name")) {
    NAME_RE.lastIndex = 0;
    for (let hit = NAME_RE.exec(input); hit !== null; hit = NAME_RE.exec(input)) {
      // The last capture group holds the name; the label prefix is excluded
      // from the reported span.
      const value = hit[hit.length - 1];
      const start = hit.index + hit[0].lastIndexOf(value);
      record("name", value, start, value.length);
    }
  }

  if (enabled.has("ssn")) {
    found.push(...findAll(SSN_RE, input, "ssn"));
  }

  return found.sort((left, right) => left.start - right.start);
}
148
+
149
+ // src/quality.ts
150
/**
 * Basic quality figures for a single text.
 *
 * @param {string} text - Input text; null/undefined is treated as "".
 * @returns {{completeness: number, avg_length: number, duplicate_ratio: null}}
 *   `completeness` is 1 when the trimmed text is non-empty and 0 otherwise,
 *   `avg_length` is the trimmed length, and `duplicate_ratio` is always null.
 */
function qualityMetrics(text) {
  const { length } = (text ?? "").trim();
  const completeness = length > 0 ? 1 : 0;
  return { completeness, avg_length: length, duplicate_ratio: null };
}
158
+
159
+ // src/noise.ts
160
// Whitespace that is never counted as garbage. Tab is listed explicitly;
// without it, tab characters (code point 9) would fall into the control
// range below.
var SAFE_WHITESPACE = /* @__PURE__ */ new Set([" ", "\t", "\n", "\r", "\v", "\f"]);
var REPLACEMENT_CHAR = "\uFFFD";

/**
 * Tells whether a single character counts as noise.
 *
 * @param {string} ch - One character, as produced by iterating a string.
 * @returns {boolean} True for U+FFFD, C0/C1 control characters, private-use
 *   characters (U+E000–U+F8FF) and lone surrogates (U+D800–U+DFFF).
 */
function isGarbage(ch) {
  if (SAFE_WHITESPACE.has(ch)) return false;
  const cp = ch.codePointAt(0) ?? 0;
  return (
    ch === REPLACEMENT_CHAR ||
    cp <= 31 ||
    (cp >= 127 && cp <= 159) ||
    (cp >= 0xe000 && cp <= 0xf8ff) || // private use area
    (cp >= 0xd800 && cp <= 0xdfff) // surrogate halves
  );
}

/**
 * Noise figures for a single text.
 *
 * @param {string} text - Input text; any falsy value reports a clean result.
 * @returns {{garbage_ratio: number, encoding_ok: boolean}} `garbage_ratio` is
 *   the share of garbage characters rounded to four decimals; `encoding_ok`
 *   is false when the text contains U+FFFD.
 */
function noiseMetrics(text) {
  if (!text) return { garbage_ratio: 0, encoding_ok: true };
  // Count both totals in the same unit (code points). `text.length` counts
  // UTF-16 units and would understate the ratio for astral characters.
  let total = 0;
  let garbage = 0;
  for (const ch of text) {
    total++;
    if (isGarbage(ch)) garbage++;
  }
  return {
    garbage_ratio: Math.round(garbage / total * 1e4) / 1e4,
    encoding_ok: !text.includes(REPLACEMENT_CHAR)
  };
}
180
+
181
+ // src/mask.ts
182
+ var import_crypto = require("crypto");
183
// Fixed stand-in values used by the "replace" strategy, keyed by finding type.
var SYNTHETIC = {
  email: "user@example.com",
  phone: "+1 000 000 0000",
  phone_tr: "0500 000 00 00",
  national_id_tr: "00000000000",
  ssn: "000-00-0000",
  iban: "XX00 0000 0000 0000 0000 00",
  credit_card: "0000 0000 0000 0000",
  ip: "0.0.0.0",
  name: "AD SOYAD"
};
var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);

// Returns copies of `findings` in ascending start order with overlapping
// spans fused into one; the fused span keeps the type of its first member.
function mergeOverlappingFindings(text, findings) {
  const ordered = [...findings].sort((a, b) => a.start - b.start || b.end - a.end);
  const merged = [];
  for (const finding of ordered) {
    const last = merged[merged.length - 1];
    if (last !== undefined && finding.start < last.end) {
      if (finding.end > last.end) {
        last.end = finding.end;
        // The span grew, so its value must cover the whole fused range.
        last.value = text.slice(last.start, last.end);
      }
      continue;
    }
    merged.push({ ...finding });
  }
  return merged;
}

/**
 * Replaces every finding span in `text` according to `strategy`.
 *
 * Overlapping findings are fused and masked once. Replacing them one by one
 * with offsets taken from the original text garbles the output and can leave
 * part of the original value behind.
 *
 * @param {string} text - Text the findings refer to.
 * @param {{type: string, value: string, start: number, end: number}[]} findings
 * @param {"redact"|"replace"|"token"|"hash"} [strategy="redact"]
 * @returns {string} The masked text ("" when `text` is null/undefined).
 * @throws {Error} When `strategy` is not one of the four known names.
 */
function applyMask(text, findings, strategy = "redact") {
  if (!VALID_STRATEGIES.has(strategy)) {
    throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
  }
  if (!text || findings.length === 0) return text ?? "";
  let result = text;
  const counter = new Map();
  // Work from the end of the text so earlier offsets stay valid. As a side
  // effect, token numbers count up from the last finding towards the first.
  const spans = mergeOverlappingFindings(text, findings).reverse();
  for (const { type, value, start, end } of spans) {
    const ordinal = (counter.get(type) ?? 0) + 1;
    counter.set(type, ordinal);
    const tag = type.toUpperCase();
    let replacement;
    if (strategy === "redact") {
      replacement = `[REDACTED_${tag}]`;
    } else if (strategy === "replace") {
      replacement = Object.prototype.hasOwnProperty.call(SYNTHETIC, type) ? SYNTHETIC[type] : `[${tag}]`;
    } else if (strategy === "token") {
      replacement = `<PII_${tag}_${ordinal}>`;
    } else {
      const h = (0, import_crypto.createHash)("sha256").update(value).digest("hex").slice(0, 16);
      replacement = `[${h}]`;
    }
    result = result.slice(0, start) + replacement + result.slice(end);
  }
  return result;
}
222
+
223
+ // src/index.ts
224
+ var version = "0.1.0";
225
/**
 * Runs the PII, quality and noise checks on one text.
 *
 * @param {string} text - Text to audit.
 * @param {{locale?: string}} [options] - `locale` defaults to "tr".
 * @returns {{pii: object[], quality: object, noise: object}}
 */
function audit(text, options = {}) {
  const pii = detectPii(text, options.locale ?? "tr");
  const quality = qualityMetrics(text);
  const noise = noiseMetrics(text);
  return { pii, quality, noise };
}
233
/**
 * Masks the given findings in `text`.
 *
 * @param {string} text - Text the findings refer to.
 * @param {object[]} findings - Findings as returned by detectPii()/audit().
 * @param {{strategy?: string}} [options] - `strategy` defaults to "redact".
 * @returns {string} The masked text.
 */
function mask(text, findings, options = {}) {
  const strategy = options.strategy ?? "redact";
  return applyMask(text, findings, strategy);
}
236
+ // Annotate the CommonJS export names for ESM import in node:
237
+ 0 && (module.exports = {
238
+ applyMask,
239
+ audit,
240
+ detectPii,
241
+ mask,
242
+ noiseMetrics,
243
+ qualityMetrics,
244
+ version
245
+ });
@@ -0,0 +1,73 @@
1
+ interface PiiFinding {
2
+ type: string;
3
+ value: string;
4
+ start: number;
5
+ end: number;
6
+ }
7
+ declare function detectPii(text: string, locale?: string): PiiFinding[];
8
+
9
+ interface QualityMetrics {
10
+ completeness: number;
11
+ avg_length: number;
12
+ duplicate_ratio: null;
13
+ }
14
+ declare function qualityMetrics(text: string): QualityMetrics;
15
+
16
+ interface NoiseMetrics {
17
+ garbage_ratio: number;
18
+ encoding_ok: boolean;
19
+ }
20
+ declare function noiseMetrics(text: string): NoiseMetrics;
21
+
22
+ type MaskStrategy = "redact" | "replace" | "token" | "hash";
23
+ declare function applyMask(text: string, findings: PiiFinding[], strategy?: MaskStrategy): string;
24
+
25
+ /**
26
+ * @flexorch/audit — zero-dependency PII + quality + noise audit for LLM datasets.
27
+ *
28
+ * @example
29
+ * import { audit, mask } from "@flexorch/audit"
30
+ *
31
+ * const result = audit(text, { locale: "tr" })
32
+ * // {
33
+ * // pii: [{ type: "email", value: "ali@example.com", start: 9, end: 24 }],
34
+ * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
+ * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
+ * // }
37
+ *
38
+ * const clean = mask(text, result.pii, { strategy: "redact" })
39
+ * // "Contact: [REDACTED_EMAIL]"
40
+ */
41
+
42
+ declare const version = "0.1.0";
43
+ interface AuditOptions {
44
+ /**
45
+ * Active locale-specific detectors.
46
+ * - "tr" — Turkish: TCKN, phone_tr, name (default)
47
+ * - "us" — US: SSN, E.164 phone
48
+ * - "eu" — EU: E.164 phone
49
+ * - "all" — All detectors, except that phone_tr replaces the generic E.164 phone detector
50
+ *
51
+ * Universal detectors (email, iban, credit_card, ip) are always active.
52
+ */
53
+ locale?: string;
54
+ }
55
+ interface AuditResult {
56
+ pii: PiiFinding[];
57
+ quality: QualityMetrics;
58
+ noise: NoiseMetrics;
59
+ }
60
+ interface MaskOptions {
61
+ /** @default "redact" */
62
+ strategy?: MaskStrategy;
63
+ }
64
+ /**
65
+ * Audit *text* for LLM dataset readiness.
66
+ */
67
+ declare function audit(text: string, options?: AuditOptions): AuditResult;
68
+ /**
69
+ * Apply masking to PII findings in *text*.
70
+ */
71
+ declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
+
73
+ export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
@@ -0,0 +1,73 @@
1
+ interface PiiFinding {
2
+ type: string;
3
+ value: string;
4
+ start: number;
5
+ end: number;
6
+ }
7
+ declare function detectPii(text: string, locale?: string): PiiFinding[];
8
+
9
+ interface QualityMetrics {
10
+ completeness: number;
11
+ avg_length: number;
12
+ duplicate_ratio: null;
13
+ }
14
+ declare function qualityMetrics(text: string): QualityMetrics;
15
+
16
+ interface NoiseMetrics {
17
+ garbage_ratio: number;
18
+ encoding_ok: boolean;
19
+ }
20
+ declare function noiseMetrics(text: string): NoiseMetrics;
21
+
22
+ type MaskStrategy = "redact" | "replace" | "token" | "hash";
23
+ declare function applyMask(text: string, findings: PiiFinding[], strategy?: MaskStrategy): string;
24
+
25
+ /**
26
+ * @flexorch/audit — zero-dependency PII + quality + noise audit for LLM datasets.
27
+ *
28
+ * @example
29
+ * import { audit, mask } from "@flexorch/audit"
30
+ *
31
+ * const result = audit(text, { locale: "tr" })
32
+ * // {
33
+ * // pii: [{ type: "email", value: "ali@example.com", start: 9, end: 24 }],
34
+ * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
+ * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
+ * // }
37
+ *
38
+ * const clean = mask(text, result.pii, { strategy: "redact" })
39
+ * // "Contact: [REDACTED_EMAIL]"
40
+ */
41
+
42
+ declare const version = "0.1.0";
43
+ interface AuditOptions {
44
+ /**
45
+ * Active locale-specific detectors.
46
+ * - "tr" — Turkish: TCKN, phone_tr, name (default)
47
+ * - "us" — US: SSN, E.164 phone
48
+ * - "eu" — EU: E.164 phone
49
+ * - "all" — All detectors, except that phone_tr replaces the generic E.164 phone detector
50
+ *
51
+ * Universal detectors (email, iban, credit_card, ip) are always active.
52
+ */
53
+ locale?: string;
54
+ }
55
+ interface AuditResult {
56
+ pii: PiiFinding[];
57
+ quality: QualityMetrics;
58
+ noise: NoiseMetrics;
59
+ }
60
+ interface MaskOptions {
61
+ /** @default "redact" */
62
+ strategy?: MaskStrategy;
63
+ }
64
+ /**
65
+ * Audit *text* for LLM dataset readiness.
66
+ */
67
+ declare function audit(text: string, options?: AuditOptions): AuditResult;
68
+ /**
69
+ * Apply masking to PII findings in *text*.
70
+ */
71
+ declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
+
73
+ export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
package/dist/index.js ADDED
@@ -0,0 +1,212 @@
1
+ // src/pii.ts
2
// Detector patterns. All of them carry the `g` flag, so callers reset
// `lastIndex` before each scan.
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\b/g;
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
// The previous form `\b(?:\+90|0)?…` could not capture "+90" after a space,
// a colon or at the start of the text, because there is no word boundary
// between a non-word character and "+". The country code was therefore left
// out of the finding. The word boundary is now required only on the
// alternatives that begin with a digit.
var PHONE_TR_RE = /(?:\+90|\b0|\b)\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
var TCKN_RE = /\b([1-9]\d{10})\b/g;
// Label prefixes that introduce a name (Turkish and English variants).
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
// One to three capitalised words; this is the only capturing group in NAME_RE.
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
var NAME_RE = new RegExp(
  `(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
  "gu"
);
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
17
/**
 * Validates the two check digits of an 11-digit Turkish national ID (TCKN).
 *
 * @param {string} s - Candidate string of digits.
 * @returns {boolean} True when the length, the leading digit and both check
 *   digits are consistent.
 */
function validTckn(s) {
  if (s.length !== 11 || s[0] === "0") return false;
  const d = s.split("").map(Number);
  const sumOdd = d[0] + d[2] + d[4] + d[6] + d[8];
  const sumEven = d[1] + d[3] + d[5] + d[7];
  // JavaScript's `%` keeps the sign of the dividend, so a negative
  // `sumOdd * 7 - sumEven` would give a negative remainder that can never
  // equal a digit. Normalise it into 0..9 first.
  const tenth = (((sumOdd * 7 - sumEven) % 10) + 10) % 10;
  if (tenth !== d[9]) return false;
  const eleventh = d.slice(0, 10).reduce((a, b) => a + b, 0) % 10;
  return eleventh === d[10];
}
25
/**
 * Luhn checksum test for a card-number candidate.
 *
 * @param {string} number - Candidate text; every non-digit character is ignored.
 * @returns {boolean} True when 13 to 19 digits remain and their Luhn sum is a
 *   multiple of 10.
 */
function luhn(number) {
  const digits = number.replace(/\D/g, "");
  if (digits.length < 13 || digits.length > 19) return false;
  let total = 0;
  // Walk from the rightmost digit; every second digit is doubled.
  for (let i = 0; i < digits.length; i++) {
    let d = Number.parseInt(digits[digits.length - 1 - i], 10);
    if (i % 2 === 1) {
      d *= 2;
      if (d > 9) d -= 9;
    }
    total += d;
  }
  return total % 10 === 0;
}
39
// Detector names that are switched on per locale key.
var LOCALE_DETECTORS = {
  tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
  us: /* @__PURE__ */ new Set(["ssn", "phone"]),
  eu: /* @__PURE__ */ new Set(["phone"])
};
// Detector names that are active for every locale.
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);

/**
 * Resolves the set of detector names to run for a locale.
 *
 * @param {string} locale - "tr", "us", "eu" or "all". Any other value yields
 *   only the universal detectors.
 * @returns {Set<string>} A fresh set the caller may mutate.
 */
function activeDetectors(locale) {
  const active = new Set(UNIVERSAL);
  if (locale === "all") {
    for (const detectors of Object.values(LOCALE_DETECTORS)) {
      for (const name of detectors) active.add(name);
    }
    // With phone_tr present, the generic phone detector is dropped entirely.
    if (active.has("phone_tr")) active.delete("phone");
    return active;
  }
  // Own-property check: a plain `LOCALE_DETECTORS[locale]` lookup would also
  // resolve inherited keys such as "constructor" and crash on `.forEach`.
  if (Object.prototype.hasOwnProperty.call(LOCALE_DETECTORS, locale)) {
    for (const name of LOCALE_DETECTORS[locale]) active.add(name);
  }
  return active;
}
58
/**
 * Collects every match of `re` in `text` as a finding record.
 *
 * @param {RegExp} re - Pattern to apply. A pattern without the `g` flag is
 *   scanned through a global copy so the loop always terminates.
 * @param {string} text - Text to scan.
 * @param {string} type - Label stored on every finding.
 * @returns {{type: string, value: string, start: number, end: number}[]}
 */
function findAll(re, text, type) {
  // exec() on a non-global regex returns the first match forever.
  const scanner = re.global ? re : new RegExp(re.source, `${re.flags}g`);
  const results = [];
  scanner.lastIndex = 0;
  let m;
  while ((m = scanner.exec(text)) !== null) {
    results.push({ type, value: m[0], start: m.index, end: m.index + m[0].length });
    // A zero-length match leaves lastIndex where it was; step past it.
    if (m[0].length === 0) scanner.lastIndex++;
  }
  return results;
}
67
/**
 * Scans `text` for PII with the detectors enabled for `locale`.
 *
 * @param {string} text - Input to scan; null/undefined is treated as "".
 * @param {string} [locale="tr"] - Locale key passed to activeDetectors().
 * @returns {{type: string, value: string, start: number, end: number}[]}
 *   Findings ordered by ascending start offset.
 */
function detectPii(text, locale = "tr") {
  const enabled = activeDetectors(locale);
  const input = text ?? "";
  const found = [];
  const record = (type, value, start, length) => {
    found.push({ type, value, start, end: start + length });
  };

  if (enabled.has("email")) {
    found.push(...findAll(EMAIL_RE, input, "email"));
  }

  if (enabled.has("phone")) {
    // Candidates with fewer than 10 digits are discarded.
    PHONE_INTL_RE.lastIndex = 0;
    for (let hit = PHONE_INTL_RE.exec(input); hit !== null; hit = PHONE_INTL_RE.exec(input)) {
      if (hit[0].replace(/\D/g, "").length >= 10) {
        record("phone", hit[0], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("iban")) {
    found.push(...findAll(IBAN_RE, input, "iban"));
  }

  if (enabled.has("credit_card")) {
    // Only Luhn-valid candidates are reported.
    CC_RE.lastIndex = 0;
    for (let hit = CC_RE.exec(input); hit !== null; hit = CC_RE.exec(input)) {
      if (luhn(hit[0])) {
        record("credit_card", hit[0], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("ip")) {
    found.push(...findAll(IPV4_RE, input, "ip"));
  }

  if (enabled.has("phone_tr")) {
    found.push(...findAll(PHONE_TR_RE, input, "phone_tr"));
  }

  if (enabled.has("national_id_tr")) {
    // Only candidates that pass the TCKN checksum are reported.
    TCKN_RE.lastIndex = 0;
    for (let hit = TCKN_RE.exec(input); hit !== null; hit = TCKN_RE.exec(input)) {
      if (validTckn(hit[1])) {
        record("national_id_tr", hit[1], hit.index, hit[0].length);
      }
    }
  }

  if (enabled.has("name")) {
    NAME_RE.lastIndex = 0;
    for (let hit = NAME_RE.exec(input); hit !== null; hit = NAME_RE.exec(input)) {
      // The last capture group holds the name; the label prefix is excluded
      // from the reported span.
      const value = hit[hit.length - 1];
      const start = hit.index + hit[0].lastIndexOf(value);
      record("name", value, start, value.length);
    }
  }

  if (enabled.has("ssn")) {
    found.push(...findAll(SSN_RE, input, "ssn"));
  }

  return found.sort((left, right) => left.start - right.start);
}
116
+
117
+ // src/quality.ts
118
/**
 * Basic quality figures for a single text.
 *
 * @param {string} text - Input text; null/undefined is treated as "".
 * @returns {{completeness: number, avg_length: number, duplicate_ratio: null}}
 *   `completeness` is 1 when the trimmed text is non-empty and 0 otherwise,
 *   `avg_length` is the trimmed length, and `duplicate_ratio` is always null.
 */
function qualityMetrics(text) {
  const { length } = (text ?? "").trim();
  const completeness = length > 0 ? 1 : 0;
  return { completeness, avg_length: length, duplicate_ratio: null };
}
126
+
127
+ // src/noise.ts
128
// Whitespace that is never counted as garbage. Tab is listed explicitly;
// without it, tab characters (code point 9) would fall into the control
// range below.
var SAFE_WHITESPACE = /* @__PURE__ */ new Set([" ", "\t", "\n", "\r", "\v", "\f"]);
var REPLACEMENT_CHAR = "\uFFFD";

/**
 * Tells whether a single character counts as noise.
 *
 * @param {string} ch - One character, as produced by iterating a string.
 * @returns {boolean} True for U+FFFD, C0/C1 control characters, private-use
 *   characters (U+E000–U+F8FF) and lone surrogates (U+D800–U+DFFF).
 */
function isGarbage(ch) {
  if (SAFE_WHITESPACE.has(ch)) return false;
  const cp = ch.codePointAt(0) ?? 0;
  return (
    ch === REPLACEMENT_CHAR ||
    cp <= 31 ||
    (cp >= 127 && cp <= 159) ||
    (cp >= 0xe000 && cp <= 0xf8ff) || // private use area
    (cp >= 0xd800 && cp <= 0xdfff) // surrogate halves
  );
}

/**
 * Noise figures for a single text.
 *
 * @param {string} text - Input text; any falsy value reports a clean result.
 * @returns {{garbage_ratio: number, encoding_ok: boolean}} `garbage_ratio` is
 *   the share of garbage characters rounded to four decimals; `encoding_ok`
 *   is false when the text contains U+FFFD.
 */
function noiseMetrics(text) {
  if (!text) return { garbage_ratio: 0, encoding_ok: true };
  // Count both totals in the same unit (code points). `text.length` counts
  // UTF-16 units and would understate the ratio for astral characters.
  let total = 0;
  let garbage = 0;
  for (const ch of text) {
    total++;
    if (isGarbage(ch)) garbage++;
  }
  return {
    garbage_ratio: Math.round(garbage / total * 1e4) / 1e4,
    encoding_ok: !text.includes(REPLACEMENT_CHAR)
  };
}
148
+
149
+ // src/mask.ts
150
+ import { createHash } from "crypto";
151
// Fixed stand-in values used by the "replace" strategy, keyed by finding type.
var SYNTHETIC = {
  email: "user@example.com",
  phone: "+1 000 000 0000",
  phone_tr: "0500 000 00 00",
  national_id_tr: "00000000000",
  ssn: "000-00-0000",
  iban: "XX00 0000 0000 0000 0000 00",
  credit_card: "0000 0000 0000 0000",
  ip: "0.0.0.0",
  name: "AD SOYAD"
};
var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);

// Returns copies of `findings` in ascending start order with overlapping
// spans fused into one; the fused span keeps the type of its first member.
function mergeOverlappingFindings(text, findings) {
  const ordered = [...findings].sort((a, b) => a.start - b.start || b.end - a.end);
  const merged = [];
  for (const finding of ordered) {
    const last = merged[merged.length - 1];
    if (last !== undefined && finding.start < last.end) {
      if (finding.end > last.end) {
        last.end = finding.end;
        // The span grew, so its value must cover the whole fused range.
        last.value = text.slice(last.start, last.end);
      }
      continue;
    }
    merged.push({ ...finding });
  }
  return merged;
}

/**
 * Replaces every finding span in `text` according to `strategy`.
 *
 * Overlapping findings are fused and masked once. Replacing them one by one
 * with offsets taken from the original text garbles the output and can leave
 * part of the original value behind.
 *
 * @param {string} text - Text the findings refer to.
 * @param {{type: string, value: string, start: number, end: number}[]} findings
 * @param {"redact"|"replace"|"token"|"hash"} [strategy="redact"]
 * @returns {string} The masked text ("" when `text` is null/undefined).
 * @throws {Error} When `strategy` is not one of the four known names.
 */
function applyMask(text, findings, strategy = "redact") {
  if (!VALID_STRATEGIES.has(strategy)) {
    throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
  }
  if (!text || findings.length === 0) return text ?? "";
  let result = text;
  const counter = new Map();
  // Work from the end of the text so earlier offsets stay valid. As a side
  // effect, token numbers count up from the last finding towards the first.
  const spans = mergeOverlappingFindings(text, findings).reverse();
  for (const { type, value, start, end } of spans) {
    const ordinal = (counter.get(type) ?? 0) + 1;
    counter.set(type, ordinal);
    const tag = type.toUpperCase();
    let replacement;
    if (strategy === "redact") {
      replacement = `[REDACTED_${tag}]`;
    } else if (strategy === "replace") {
      replacement = Object.prototype.hasOwnProperty.call(SYNTHETIC, type) ? SYNTHETIC[type] : `[${tag}]`;
    } else if (strategy === "token") {
      replacement = `<PII_${tag}_${ordinal}>`;
    } else {
      const h = createHash("sha256").update(value).digest("hex").slice(0, 16);
      replacement = `[${h}]`;
    }
    result = result.slice(0, start) + replacement + result.slice(end);
  }
  return result;
}
190
+
191
+ // src/index.ts
192
+ var version = "0.1.0";
193
/**
 * Runs the PII, quality and noise checks on one text.
 *
 * @param {string} text - Text to audit.
 * @param {{locale?: string}} [options] - `locale` defaults to "tr".
 * @returns {{pii: object[], quality: object, noise: object}}
 */
function audit(text, options = {}) {
  const pii = detectPii(text, options.locale ?? "tr");
  const quality = qualityMetrics(text);
  const noise = noiseMetrics(text);
  return { pii, quality, noise };
}
201
/**
 * Masks the given findings in `text`.
 *
 * @param {string} text - Text the findings refer to.
 * @param {object[]} findings - Findings as returned by detectPii()/audit().
 * @param {{strategy?: string}} [options] - `strategy` defaults to "redact".
 * @returns {string} The masked text.
 */
function mask(text, findings, options = {}) {
  const strategy = options.strategy ?? "redact";
  return applyMask(text, findings, strategy);
}
204
+ export {
205
+ applyMask,
206
+ audit,
207
+ detectPii,
208
+ mask,
209
+ noiseMetrics,
210
+ qualityMetrics,
211
+ version
212
+ };
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@flexorch/audit",
3
+ "version": "0.1.0",
4
+ "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
+ "keywords": [
6
+ "pii",
7
+ "privacy",
8
+ "llm",
9
+ "dataset",
10
+ "audit",
11
+ "tckn",
12
+ "kvkk",
13
+ "gdpr"
14
+ ],
15
+ "license": "MIT",
16
+ "author": "FlexOrch",
17
+ "homepage": "https://github.com/flexorch/flexorch-audit-js",
18
+ "repository": {
19
+ "type": "git",
20
+ "url": "https://github.com/flexorch/flexorch-audit-js.git"
21
+ },
22
+ "bugs": {
23
+ "url": "https://github.com/flexorch/flexorch-audit-js/issues"
24
+ },
25
+ "type": "module",
26
+ "main": "./dist/index.cjs",
27
+ "module": "./dist/index.js",
28
+ "types": "./dist/index.d.ts",
29
+ "exports": {
30
+ ".": {
31
+ "types": "./dist/index.d.ts",
32
+ "import": "./dist/index.js",
33
+ "require": "./dist/index.cjs"
34
+ }
35
+ },
36
+ "files": [
37
+ "dist"
38
+ ],
39
+ "scripts": {
40
+ "build": "tsup src/index.ts --format cjs,esm --dts --clean",
41
+ "test": "node --test tests/*.test.js",
42
+ "prepublishOnly": "npm run build && npm test"
43
+ },
44
+ "devDependencies": {
45
+ "@types/node": "^25.6.0",
46
+ "tsup": "^8.0.0",
47
+ "typescript": "^5.0.0"
48
+ }
49
+ }