@flexorch/audit 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -6
- package/dist/index.cjs +25 -6
- package/dist/index.d.cts +25 -7
- package/dist/index.d.ts +25 -7
- package/dist/index.js +25 -6
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
|
|
4
4
|
|
|
5
|
+
- **Quality grade** — A/B/C/D score that signals LLM-readiness at a glance
|
|
5
6
|
- **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
|
|
6
7
|
- **Quality metrics** — completeness, average length, duplicate ratio
|
|
7
8
|
- **Noise metrics** — garbage character ratio, encoding health
|
|
@@ -10,13 +11,19 @@ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one questi
|
|
|
10
11
|
|
|
11
12
|
```ts
|
|
12
13
|
import { audit, mask } from "@flexorch/audit"
|
|
14
|
+
import { readFileSync } from "fs"
|
|
13
15
|
|
|
16
|
+
const text = readFileSync("contract.txt", "utf8")
|
|
14
17
|
const result = audit(text, { locale: "tr" })
|
|
15
|
-
|
|
16
|
-
//
|
|
17
|
-
//
|
|
18
|
-
//
|
|
19
|
-
|
|
18
|
+
|
|
19
|
+
result.quality_grade // "A"
|
|
20
|
+
result.quality_score // 0.91 (0.0–1.0 composite)
|
|
21
|
+
result.pii_summary // [{ type: "email", count: 1 }, { type: "national_id_tr", count: 3 }] (sorted by type)
|
|
22
|
+
|
|
23
|
+
// Raw findings and metrics — also available:
|
|
24
|
+
result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
|
|
25
|
+
result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
|
|
26
|
+
result.noise // { garbage_ratio: 0.0, encoding_ok: true }
|
|
20
27
|
|
|
21
28
|
const clean = mask(text, result.pii, { strategy: "redact" })
|
|
22
29
|
// "Contact: [REDACTED_EMAIL]"
|
|
@@ -68,6 +75,20 @@ Full type definitions included. No `@types/` package needed.
|
|
|
68
75
|
import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
|
|
69
76
|
```
|
|
70
77
|
|
|
78
|
+
## Quality grade
|
|
79
|
+
|
|
80
|
+
The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions — completeness, text length, and noise. The score maps to a grade as follows:
|
|
81
|
+
|
|
82
|
+
| Grade | Score | Meaning |
|
|
83
|
+
|-------|-------|---------|
|
|
84
|
+
| A | ≥ 0.85 | Ready for LLM training or RAG |
|
|
85
|
+
| B | ≥ 0.65 | Usable with minor cleanup |
|
|
86
|
+
| C | ≥ 0.40 | Needs review before use |
|
|
87
|
+
| D | < 0.40 | Not suitable — empty, too short, or high noise |
|
|
88
|
+
|
|
89
|
+
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
90
|
+
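where `lengthScore = Math.min(avg_length / 500, 1.0)` (`avg_length` is the character-length metric from `result.quality`) and `noiseScore = Math.max(0, 1 − garbage_ratio × 10)` (`garbage_ratio` from `result.noise`). The result is rounded to 4 decimal places.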
|
|
91
|
+
|
|
71
92
|
## Quality & noise
|
|
72
93
|
|
|
73
94
|
`duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
|
|
@@ -83,7 +104,7 @@ for (const t of texts) {
|
|
|
83
104
|
const duplicateRatio = duplicates / texts.length
|
|
84
105
|
```
|
|
85
106
|
|
|
86
|
-
## Limitations (v0.1)
|
|
107
|
+
## Limitations (v0.2)
|
|
87
108
|
|
|
88
109
|
- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
|
|
89
110
|
- `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).
|
package/dist/index.cjs
CHANGED
|
@@ -221,14 +221,33 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
221
221
|
}
|
|
222
222
|
|
|
223
223
|
// src/index.ts
|
|
224
|
-
var version = "0.1.0";
|
|
224
|
+
var version = "0.2.0";
|
|
225
|
+
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
226
|
+
const lengthScore = Math.min(avgLength / 500, 1);
|
|
227
|
+
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
228
|
+
return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
|
|
229
|
+
}
|
|
230
|
+
function computeQualityGrade(score) {
|
|
231
|
+
if (score >= 0.85) return "A";
|
|
232
|
+
if (score >= 0.65) return "B";
|
|
233
|
+
if (score >= 0.4) return "C";
|
|
234
|
+
return "D";
|
|
235
|
+
}
|
|
225
236
|
function audit(text, options = {}) {
|
|
226
237
|
const locale = options.locale ?? "tr";
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
238
|
+
const pii = detectPii(text, locale);
|
|
239
|
+
const quality = qualityMetrics(text);
|
|
240
|
+
const noise = noiseMetrics(text);
|
|
241
|
+
const quality_score = computeQualityScore(
|
|
242
|
+
quality.completeness,
|
|
243
|
+
quality.avg_length,
|
|
244
|
+
noise.garbage_ratio
|
|
245
|
+
);
|
|
246
|
+
const quality_grade = computeQualityGrade(quality_score);
|
|
247
|
+
const counts = /* @__PURE__ */ new Map();
|
|
248
|
+
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
249
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
250
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
232
251
|
}
|
|
233
252
|
function mask(text, findings, options = {}) {
|
|
234
253
|
return applyMask(text, findings, options.strategy ?? "redact");
|
package/dist/index.d.cts
CHANGED
|
@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
27
27
|
*
|
|
28
28
|
* @example
|
|
29
29
|
* import { audit, mask } from "@flexorch/audit"
|
|
30
|
+
* import { readFileSync } from "fs"
|
|
30
31
|
*
|
|
32
|
+
* const text = readFileSync("contract.txt", "utf8")
|
|
31
33
|
* const result = audit(text, { locale: "tr" })
|
|
32
|
-
*
|
|
33
|
-
* //
|
|
34
|
-
* //
|
|
35
|
-
* //
|
|
36
|
-
*
|
|
34
|
+
*
|
|
35
|
+
* result.quality_grade // "A"
|
|
36
|
+
* result.quality_score // 0.91
|
|
37
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
|
+
*
|
|
39
|
+
* // Raw findings and metrics also available:
|
|
40
|
+
* result.pii // [{ type, value, start, end }, ...]
|
|
41
|
+
* result.quality // { completeness, avg_length, duplicate_ratio }
|
|
42
|
+
* result.noise // { garbage_ratio, encoding_ok }
|
|
37
43
|
*
|
|
38
44
|
* const clean = mask(text, result.pii, { strategy: "redact" })
|
|
39
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
40
46
|
*/
|
|
41
47
|
|
|
42
|
-
declare const version = "0.1.0";
|
|
48
|
+
declare const version = "0.2.0";
|
|
49
|
+
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
|
+
interface PiiSummaryEntry {
|
|
51
|
+
type: string;
|
|
52
|
+
count: number;
|
|
53
|
+
}
|
|
43
54
|
interface AuditOptions {
|
|
44
55
|
/**
|
|
45
56
|
* Active locale-specific detectors.
|
|
@@ -53,6 +64,13 @@ interface AuditOptions {
|
|
|
53
64
|
locale?: string;
|
|
54
65
|
}
|
|
55
66
|
interface AuditResult {
|
|
67
|
+
/** A/B/C/D overall LLM-readiness grade. */
|
|
68
|
+
quality_grade: QualityGrade;
|
|
69
|
+
/** 0.0–1.0 composite score: completeness × (0.4 × noise score + 0.4 × length score + 0.2). */
|
|
70
|
+
quality_score: number;
|
|
71
|
+
/** PII findings aggregated by type: [{ type, count }]. */
|
|
72
|
+
pii_summary: PiiSummaryEntry[];
|
|
73
|
+
/** Raw PII findings sorted by position: [{ type, value, start, end }]. */
|
|
56
74
|
pii: PiiFinding[];
|
|
57
75
|
quality: QualityMetrics;
|
|
58
76
|
noise: NoiseMetrics;
|
|
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
|
70
88
|
*/
|
|
71
89
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
72
90
|
|
|
73
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
91
|
+
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
27
27
|
*
|
|
28
28
|
* @example
|
|
29
29
|
* import { audit, mask } from "@flexorch/audit"
|
|
30
|
+
* import { readFileSync } from "fs"
|
|
30
31
|
*
|
|
32
|
+
* const text = readFileSync("contract.txt", "utf8")
|
|
31
33
|
* const result = audit(text, { locale: "tr" })
|
|
32
|
-
*
|
|
33
|
-
* //
|
|
34
|
-
* //
|
|
35
|
-
* //
|
|
36
|
-
*
|
|
34
|
+
*
|
|
35
|
+
* result.quality_grade // "A"
|
|
36
|
+
* result.quality_score // 0.91
|
|
37
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
|
+
*
|
|
39
|
+
* // Raw findings and metrics also available:
|
|
40
|
+
* result.pii // [{ type, value, start, end }, ...]
|
|
41
|
+
* result.quality // { completeness, avg_length, duplicate_ratio }
|
|
42
|
+
* result.noise // { garbage_ratio, encoding_ok }
|
|
37
43
|
*
|
|
38
44
|
* const clean = mask(text, result.pii, { strategy: "redact" })
|
|
39
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
40
46
|
*/
|
|
41
47
|
|
|
42
|
-
declare const version = "0.1.0";
|
|
48
|
+
declare const version = "0.2.0";
|
|
49
|
+
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
|
+
interface PiiSummaryEntry {
|
|
51
|
+
type: string;
|
|
52
|
+
count: number;
|
|
53
|
+
}
|
|
43
54
|
interface AuditOptions {
|
|
44
55
|
/**
|
|
45
56
|
* Active locale-specific detectors.
|
|
@@ -53,6 +64,13 @@ interface AuditOptions {
|
|
|
53
64
|
locale?: string;
|
|
54
65
|
}
|
|
55
66
|
interface AuditResult {
|
|
67
|
+
/** A/B/C/D overall LLM-readiness grade. */
|
|
68
|
+
quality_grade: QualityGrade;
|
|
69
|
+
/** 0.0–1.0 composite score: completeness × (0.4 × noise score + 0.4 × length score + 0.2). */
|
|
70
|
+
quality_score: number;
|
|
71
|
+
/** PII findings aggregated by type: [{ type, count }]. */
|
|
72
|
+
pii_summary: PiiSummaryEntry[];
|
|
73
|
+
/** Raw PII findings sorted by position: [{ type, value, start, end }]. */
|
|
56
74
|
pii: PiiFinding[];
|
|
57
75
|
quality: QualityMetrics;
|
|
58
76
|
noise: NoiseMetrics;
|
|
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
|
|
|
70
88
|
*/
|
|
71
89
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
72
90
|
|
|
73
|
-
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
91
|
+
export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
package/dist/index.js
CHANGED
|
@@ -189,14 +189,33 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
189
189
|
}
|
|
190
190
|
|
|
191
191
|
// src/index.ts
|
|
192
|
-
var version = "0.1.0";
|
|
192
|
+
var version = "0.2.0";
|
|
193
|
+
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
194
|
+
const lengthScore = Math.min(avgLength / 500, 1);
|
|
195
|
+
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
196
|
+
return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
|
|
197
|
+
}
|
|
198
|
+
function computeQualityGrade(score) {
|
|
199
|
+
if (score >= 0.85) return "A";
|
|
200
|
+
if (score >= 0.65) return "B";
|
|
201
|
+
if (score >= 0.4) return "C";
|
|
202
|
+
return "D";
|
|
203
|
+
}
|
|
193
204
|
function audit(text, options = {}) {
|
|
194
205
|
const locale = options.locale ?? "tr";
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
206
|
+
const pii = detectPii(text, locale);
|
|
207
|
+
const quality = qualityMetrics(text);
|
|
208
|
+
const noise = noiseMetrics(text);
|
|
209
|
+
const quality_score = computeQualityScore(
|
|
210
|
+
quality.completeness,
|
|
211
|
+
quality.avg_length,
|
|
212
|
+
noise.garbage_ratio
|
|
213
|
+
);
|
|
214
|
+
const quality_grade = computeQualityGrade(quality_score);
|
|
215
|
+
const counts = /* @__PURE__ */ new Map();
|
|
216
|
+
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
217
|
+
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
218
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
200
219
|
}
|
|
201
220
|
function mask(text, findings, options = {}) {
|
|
202
221
|
return applyMask(text, findings, options.strategy ?? "redact");
|