@flexorch/audit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.cjs +77 -4
- package/dist/index.d.cts +18 -4
- package/dist/index.d.ts +18 -4
- package/dist/index.js +76 -4
- package/package.json +2 -2
package/README.md
CHANGED
package/dist/index.cjs
CHANGED
|
@@ -22,6 +22,7 @@ var index_exports = {};
|
|
|
22
22
|
__export(index_exports, {
|
|
23
23
|
applyMask: () => applyMask,
|
|
24
24
|
audit: () => audit,
|
|
25
|
+
auditBatch: () => auditBatch,
|
|
25
26
|
detectPii: () => detectPii,
|
|
26
27
|
mask: () => mask,
|
|
27
28
|
noiseMetrics: () => noiseMetrics,
|
|
@@ -36,8 +37,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
|
|
|
36
37
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
37
38
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
38
39
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
40
|
+
var _H = "[0-9a-fA-F]{1,4}";
|
|
41
|
+
var IPV6_RE = new RegExp(
|
|
42
|
+
`(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
|
|
43
|
+
"gi"
|
|
44
|
+
);
|
|
39
45
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
40
46
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
47
|
+
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
41
48
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
42
49
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
43
50
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -54,6 +61,20 @@ function validTckn(s) {
|
|
|
54
61
|
if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
|
|
55
62
|
return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
|
|
56
63
|
}
|
|
64
|
+
function validVkn(s) {
|
|
65
|
+
if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
66
|
+
const d = s.split("").map(Number);
|
|
67
|
+
let total = 0;
|
|
68
|
+
for (let i = 0; i < 9; i++) {
|
|
69
|
+
const x = (d[i] + (9 - i)) % 10;
|
|
70
|
+
if (x !== 0) {
|
|
71
|
+
let y = x * Math.pow(2, 9 - i) % 9;
|
|
72
|
+
if (y === 0) y = 9;
|
|
73
|
+
total += y;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
return (10 - total % 10) % 10 === d[9];
|
|
77
|
+
}
|
|
57
78
|
function luhn(number) {
|
|
58
79
|
const digits = number.replace(/\D/g, "");
|
|
59
80
|
if (digits.length < 13 || digits.length > 19) return false;
|
|
@@ -68,12 +89,26 @@ function luhn(number) {
|
|
|
68
89
|
}
|
|
69
90
|
return total % 10 === 0;
|
|
70
91
|
}
|
|
92
|
+
function validIban(s) {
|
|
93
|
+
const rearranged = s.slice(4) + s.slice(0, 4);
|
|
94
|
+
const numeric = rearranged.toUpperCase().split("").map((c) => {
|
|
95
|
+
const code = c.charCodeAt(0);
|
|
96
|
+
return code >= 65 && code <= 90 ? String(code - 55) : c;
|
|
97
|
+
}).join("");
|
|
98
|
+
let remainder = 0;
|
|
99
|
+
for (let i = 0; i < numeric.length; i += 9) {
|
|
100
|
+
const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
|
|
101
|
+
if (!Number.isFinite(chunk)) return false;
|
|
102
|
+
remainder = chunk % 97;
|
|
103
|
+
}
|
|
104
|
+
return remainder === 1;
|
|
105
|
+
}
|
|
71
106
|
var LOCALE_DETECTORS = {
|
|
72
|
-
tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
|
|
107
|
+
tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
|
|
73
108
|
us: /* @__PURE__ */ new Set(["ssn", "phone"]),
|
|
74
109
|
eu: /* @__PURE__ */ new Set(["phone"])
|
|
75
110
|
};
|
|
76
|
-
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
|
|
111
|
+
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
77
112
|
function activeDetectors(locale) {
|
|
78
113
|
if (locale === "all") {
|
|
79
114
|
const active2 = new Set(UNIVERSAL);
|
|
@@ -111,7 +146,15 @@ function detectPii(text, locale = "tr") {
|
|
|
111
146
|
}
|
|
112
147
|
}
|
|
113
148
|
}
|
|
114
|
-
if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
|
|
149
|
+
if (active.has("iban")) {
|
|
150
|
+
IBAN_RE.lastIndex = 0;
|
|
151
|
+
let m;
|
|
152
|
+
while ((m = IBAN_RE.exec(t)) !== null) {
|
|
153
|
+
if (validIban(m[0])) {
|
|
154
|
+
findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}
|
|
115
158
|
if (active.has("credit_card")) {
|
|
116
159
|
CC_RE.lastIndex = 0;
|
|
117
160
|
let m;
|
|
@@ -122,6 +165,7 @@ function detectPii(text, locale = "tr") {
|
|
|
122
165
|
}
|
|
123
166
|
}
|
|
124
167
|
if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
|
|
168
|
+
if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
|
|
125
169
|
if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
|
|
126
170
|
if (active.has("national_id_tr")) {
|
|
127
171
|
TCKN_RE.lastIndex = 0;
|
|
@@ -132,6 +176,15 @@ function detectPii(text, locale = "tr") {
|
|
|
132
176
|
}
|
|
133
177
|
}
|
|
134
178
|
}
|
|
179
|
+
if (active.has("tax_id_tr")) {
|
|
180
|
+
VKN_RE.lastIndex = 0;
|
|
181
|
+
let m;
|
|
182
|
+
while ((m = VKN_RE.exec(t)) !== null) {
|
|
183
|
+
if (validVkn(m[1])) {
|
|
184
|
+
findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
135
188
|
if (active.has("name")) {
|
|
136
189
|
NAME_RE.lastIndex = 0;
|
|
137
190
|
let m;
|
|
@@ -221,7 +274,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
221
274
|
}
|
|
222
275
|
|
|
223
276
|
// src/index.ts
|
|
224
|
-
var version = "0.2.0";
|
|
277
|
+
var version = "0.3.0";
|
|
225
278
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
226
279
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
227
280
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -249,6 +302,25 @@ function audit(text, options = {}) {
|
|
|
249
302
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
250
303
|
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
251
304
|
}
|
|
305
|
+
function auditBatch(texts, options = {}) {
|
|
306
|
+
if (texts.length === 0) {
|
|
307
|
+
return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
|
|
308
|
+
}
|
|
309
|
+
const results = texts.map((t) => audit(t, options));
|
|
310
|
+
const seen = /* @__PURE__ */ new Set();
|
|
311
|
+
let dupCount = 0;
|
|
312
|
+
for (const t of texts) {
|
|
313
|
+
if (seen.has(t)) dupCount++;
|
|
314
|
+
else seen.add(t);
|
|
315
|
+
}
|
|
316
|
+
const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
|
|
317
|
+
const allPii = results.flatMap((r) => r.pii);
|
|
318
|
+
const counts = /* @__PURE__ */ new Map();
|
|
319
|
+
for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
320
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
321
|
+
const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
|
|
322
|
+
return { results, duplicate_ratio, pii_summary, avg_quality_score };
|
|
323
|
+
}
|
|
252
324
|
function mask(text, findings, options = {}) {
|
|
253
325
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
254
326
|
}
|
|
@@ -256,6 +328,7 @@ function mask(text, findings, options = {}) {
|
|
|
256
328
|
0 && (module.exports = {
|
|
257
329
|
applyMask,
|
|
258
330
|
audit,
|
|
331
|
+
auditBatch,
|
|
259
332
|
detectPii,
|
|
260
333
|
mask,
|
|
261
334
|
noiseMetrics,
|
package/dist/index.d.cts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.2.0";
|
|
48
|
+
declare const version = "0.3.0";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
|
@@ -54,12 +54,12 @@ interface PiiSummaryEntry {
|
|
|
54
54
|
interface AuditOptions {
|
|
55
55
|
/**
|
|
56
56
|
* Active locale-specific detectors.
|
|
57
|
-
* - "tr" — Turkish: TCKN, phone_tr, name (default)
|
|
57
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
|
|
58
58
|
* - "us" — US: SSN, E.164 phone
|
|
59
59
|
* - "eu" — EU: E.164 phone
|
|
60
60
|
* - "all" — All detectors
|
|
61
61
|
*
|
|
62
|
-
* Universal detectors (email, iban, credit_card, ip) are always active.
|
|
62
|
+
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
63
|
*/
|
|
64
64
|
locale?: string;
|
|
65
65
|
}
|
|
@@ -83,9 +83,23 @@ interface MaskOptions {
|
|
|
83
83
|
* Audit *text* for LLM dataset readiness.
|
|
84
84
|
*/
|
|
85
85
|
declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
86
|
+
interface BatchAuditResult {
|
|
87
|
+
/** One AuditResult per input text, in order. */
|
|
88
|
+
results: AuditResult[];
|
|
89
|
+
/** Fraction of texts that are exact duplicates (0.0–1.0). */
|
|
90
|
+
duplicate_ratio: number;
|
|
91
|
+
/** PII counts aggregated across all texts. */
|
|
92
|
+
pii_summary: PiiSummaryEntry[];
|
|
93
|
+
/** Mean quality_score across all texts. */
|
|
94
|
+
avg_quality_score: number;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Audit a list of texts and aggregate metrics — including duplicate_ratio.
|
|
98
|
+
*/
|
|
99
|
+
declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
|
|
86
100
|
/**
|
|
87
101
|
* Apply masking to PII findings in *text*.
|
|
88
102
|
*/
|
|
89
103
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
90
104
|
|
|
91
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
105
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.2.0";
|
|
48
|
+
declare const version = "0.3.0";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
|
@@ -54,12 +54,12 @@ interface PiiSummaryEntry {
|
|
|
54
54
|
interface AuditOptions {
|
|
55
55
|
/**
|
|
56
56
|
* Active locale-specific detectors.
|
|
57
|
-
* - "tr" — Turkish: TCKN, phone_tr, name (default)
|
|
57
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name (default)
|
|
58
58
|
* - "us" — US: SSN, E.164 phone
|
|
59
59
|
* - "eu" — EU: E.164 phone
|
|
60
60
|
* - "all" — All detectors
|
|
61
61
|
*
|
|
62
|
-
* Universal detectors (email, iban, credit_card, ip) are always active.
|
|
62
|
+
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
63
|
*/
|
|
64
64
|
locale?: string;
|
|
65
65
|
}
|
|
@@ -83,9 +83,23 @@ interface MaskOptions {
|
|
|
83
83
|
* Audit *text* for LLM dataset readiness.
|
|
84
84
|
*/
|
|
85
85
|
declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
86
|
+
interface BatchAuditResult {
|
|
87
|
+
/** One AuditResult per input text, in order. */
|
|
88
|
+
results: AuditResult[];
|
|
89
|
+
/** Fraction of texts that are exact duplicates (0.0–1.0). */
|
|
90
|
+
duplicate_ratio: number;
|
|
91
|
+
/** PII counts aggregated across all texts. */
|
|
92
|
+
pii_summary: PiiSummaryEntry[];
|
|
93
|
+
/** Mean quality_score across all texts. */
|
|
94
|
+
avg_quality_score: number;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Audit a list of texts and aggregate metrics — including duplicate_ratio.
|
|
98
|
+
*/
|
|
99
|
+
declare function auditBatch(texts: string[], options?: AuditOptions): BatchAuditResult;
|
|
86
100
|
/**
|
|
87
101
|
* Apply masking to PII findings in *text*.
|
|
88
102
|
*/
|
|
89
103
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
90
104
|
|
|
91
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
105
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.js
CHANGED
|
@@ -4,8 +4,14 @@ var PHONE_INTL_RE = /\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{4}\
|
|
|
4
4
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
5
5
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
6
6
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
7
|
+
var _H = "[0-9a-fA-F]{1,4}";
|
|
8
|
+
var IPV6_RE = new RegExp(
|
|
9
|
+
`(?<![:\\.\\w])(?:(?:${_H}:){7}${_H}|(?:${_H}:){1,7}:|::(?:(?:${_H}:){0,6}${_H})?|(?:${_H}:){1,6}:${_H}|(?:${_H}:){1,5}(?::${_H}){1,2}|(?:${_H}:){1,4}(?::${_H}){1,3}|(?:${_H}:){1,3}(?::${_H}){1,4}|(?:${_H}:){1,2}(?::${_H}){1,5}|${_H}:(?::${_H}){1,6})(?![:\\.\\w])`,
|
|
10
|
+
"gi"
|
|
11
|
+
);
|
|
7
12
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
8
13
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
14
|
+
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
9
15
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
10
16
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
11
17
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -22,6 +28,20 @@ function validTckn(s) {
|
|
|
22
28
|
if ((sumOdd * 7 - sumEven) % 10 !== d[9]) return false;
|
|
23
29
|
return d.slice(0, 10).reduce((a, b) => a + b, 0) % 10 === d[10];
|
|
24
30
|
}
|
|
31
|
+
function validVkn(s) {
|
|
32
|
+
if (s.length !== 10 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
33
|
+
const d = s.split("").map(Number);
|
|
34
|
+
let total = 0;
|
|
35
|
+
for (let i = 0; i < 9; i++) {
|
|
36
|
+
const x = (d[i] + (9 - i)) % 10;
|
|
37
|
+
if (x !== 0) {
|
|
38
|
+
let y = x * Math.pow(2, 9 - i) % 9;
|
|
39
|
+
if (y === 0) y = 9;
|
|
40
|
+
total += y;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
return (10 - total % 10) % 10 === d[9];
|
|
44
|
+
}
|
|
25
45
|
function luhn(number) {
|
|
26
46
|
const digits = number.replace(/\D/g, "");
|
|
27
47
|
if (digits.length < 13 || digits.length > 19) return false;
|
|
@@ -36,12 +56,26 @@ function luhn(number) {
|
|
|
36
56
|
}
|
|
37
57
|
return total % 10 === 0;
|
|
38
58
|
}
|
|
59
|
+
function validIban(s) {
|
|
60
|
+
const rearranged = s.slice(4) + s.slice(0, 4);
|
|
61
|
+
const numeric = rearranged.toUpperCase().split("").map((c) => {
|
|
62
|
+
const code = c.charCodeAt(0);
|
|
63
|
+
return code >= 65 && code <= 90 ? String(code - 55) : c;
|
|
64
|
+
}).join("");
|
|
65
|
+
let remainder = 0;
|
|
66
|
+
for (let i = 0; i < numeric.length; i += 9) {
|
|
67
|
+
const chunk = Number(String(remainder) + numeric.slice(i, i + 9));
|
|
68
|
+
if (!Number.isFinite(chunk)) return false;
|
|
69
|
+
remainder = chunk % 97;
|
|
70
|
+
}
|
|
71
|
+
return remainder === 1;
|
|
72
|
+
}
|
|
39
73
|
var LOCALE_DETECTORS = {
|
|
40
|
-
tr: /* @__PURE__ */ new Set(["national_id_tr", "phone_tr", "name"]),
|
|
74
|
+
tr: /* @__PURE__ */ new Set(["national_id_tr", "tax_id_tr", "phone_tr", "name"]),
|
|
41
75
|
us: /* @__PURE__ */ new Set(["ssn", "phone"]),
|
|
42
76
|
eu: /* @__PURE__ */ new Set(["phone"])
|
|
43
77
|
};
|
|
44
|
-
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip"]);
|
|
78
|
+
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
45
79
|
function activeDetectors(locale) {
|
|
46
80
|
if (locale === "all") {
|
|
47
81
|
const active2 = new Set(UNIVERSAL);
|
|
@@ -79,7 +113,15 @@ function detectPii(text, locale = "tr") {
|
|
|
79
113
|
}
|
|
80
114
|
}
|
|
81
115
|
}
|
|
82
|
-
if (active.has("iban")) findings.push(...findAll(IBAN_RE, t, "iban"));
|
|
116
|
+
if (active.has("iban")) {
|
|
117
|
+
IBAN_RE.lastIndex = 0;
|
|
118
|
+
let m;
|
|
119
|
+
while ((m = IBAN_RE.exec(t)) !== null) {
|
|
120
|
+
if (validIban(m[0])) {
|
|
121
|
+
findings.push({ type: "iban", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
83
125
|
if (active.has("credit_card")) {
|
|
84
126
|
CC_RE.lastIndex = 0;
|
|
85
127
|
let m;
|
|
@@ -90,6 +132,7 @@ function detectPii(text, locale = "tr") {
|
|
|
90
132
|
}
|
|
91
133
|
}
|
|
92
134
|
if (active.has("ip")) findings.push(...findAll(IPV4_RE, t, "ip"));
|
|
135
|
+
if (active.has("ip_v6")) findings.push(...findAll(IPV6_RE, t, "ip_v6"));
|
|
93
136
|
if (active.has("phone_tr")) findings.push(...findAll(PHONE_TR_RE, t, "phone_tr"));
|
|
94
137
|
if (active.has("national_id_tr")) {
|
|
95
138
|
TCKN_RE.lastIndex = 0;
|
|
@@ -100,6 +143,15 @@ function detectPii(text, locale = "tr") {
|
|
|
100
143
|
}
|
|
101
144
|
}
|
|
102
145
|
}
|
|
146
|
+
if (active.has("tax_id_tr")) {
|
|
147
|
+
VKN_RE.lastIndex = 0;
|
|
148
|
+
let m;
|
|
149
|
+
while ((m = VKN_RE.exec(t)) !== null) {
|
|
150
|
+
if (validVkn(m[1])) {
|
|
151
|
+
findings.push({ type: "tax_id_tr", value: m[1], start: m.index, end: m.index + m[0].length });
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
103
155
|
if (active.has("name")) {
|
|
104
156
|
NAME_RE.lastIndex = 0;
|
|
105
157
|
let m;
|
|
@@ -189,7 +241,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
189
241
|
}
|
|
190
242
|
|
|
191
243
|
// src/index.ts
|
|
192
|
-
var version = "0.2.0";
|
|
244
|
+
var version = "0.3.0";
|
|
193
245
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
194
246
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
195
247
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -217,12 +269,32 @@ function audit(text, options = {}) {
|
|
|
217
269
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
218
270
|
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
219
271
|
}
|
|
272
|
+
function auditBatch(texts, options = {}) {
|
|
273
|
+
if (texts.length === 0) {
|
|
274
|
+
return { results: [], duplicate_ratio: 0, pii_summary: [], avg_quality_score: 0 };
|
|
275
|
+
}
|
|
276
|
+
const results = texts.map((t) => audit(t, options));
|
|
277
|
+
const seen = /* @__PURE__ */ new Set();
|
|
278
|
+
let dupCount = 0;
|
|
279
|
+
for (const t of texts) {
|
|
280
|
+
if (seen.has(t)) dupCount++;
|
|
281
|
+
else seen.add(t);
|
|
282
|
+
}
|
|
283
|
+
const duplicate_ratio = Math.round(dupCount / texts.length * 1e4) / 1e4;
|
|
284
|
+
const allPii = results.flatMap((r) => r.pii);
|
|
285
|
+
const counts = /* @__PURE__ */ new Map();
|
|
286
|
+
for (const f of allPii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
287
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
288
|
+
const avg_quality_score = Math.round(results.reduce((sum, r) => sum + r.quality_score, 0) / results.length * 1e4) / 1e4;
|
|
289
|
+
return { results, duplicate_ratio, pii_summary, avg_quality_score };
|
|
290
|
+
}
|
|
220
291
|
function mask(text, findings, options = {}) {
|
|
221
292
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
222
293
|
}
|
|
223
294
|
export {
|
|
224
295
|
applyMask,
|
|
225
296
|
audit,
|
|
297
|
+
auditBatch,
|
|
226
298
|
detectPii,
|
|
227
299
|
mask,
|
|
228
300
|
noiseMetrics,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@flexorch/audit",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pii",
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"homepage": "https://github.com/flexorch/flexorch-audit-js",
|
|
18
18
|
"repository": {
|
|
19
19
|
"type": "git",
|
|
20
|
-
"url": "https://github.com/flexorch/flexorch-audit-js.git"
|
|
20
|
+
"url": "git+https://github.com/flexorch/flexorch-audit-js.git"
|
|
21
21
|
},
|
|
22
22
|
"bugs": {
|
|
23
23
|
"url": "https://github.com/flexorch/flexorch-audit-js/issues"
|