@flexorch/audit 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
4
4
 
5
+ - **Quality grade** — A/B/C/D score that signals LLM-readiness at a glance
5
6
  - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
6
7
  - **Quality metrics** — completeness, average length, duplicate ratio
7
8
  - **Noise metrics** — garbage character ratio, encoding health
@@ -10,13 +11,19 @@ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one questi
10
11
 
11
12
  ```ts
12
13
  import { audit, mask } from "@flexorch/audit"
14
+ import { readFileSync } from "fs"
13
15
 
16
+ const text = readFileSync("contract.txt", "utf8")
14
17
  const result = audit(text, { locale: "tr" })
15
- // {
16
- // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
17
- // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
18
- // noise: { garbage_ratio: 0.0, encoding_ok: true },
19
- // }
18
+
19
+ result.quality_grade // "A"
20
+ result.quality_score // 0.91 (0.0–1.0 composite)
21
+ result.pii_summary // [{ type: "email", count: 1 }, { type: "national_id_tr", count: 3 }] (sorted by type)
22
+
23
+ // Raw findings and metrics — also available:
24
+ result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
25
+ result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
26
+ result.noise // { garbage_ratio: 0.0, encoding_ok: true }
20
27
 
21
28
  const clean = mask(text, result.pii, { strategy: "redact" })
22
29
  // "Contact: [REDACTED_EMAIL]"
@@ -68,6 +75,20 @@ Full type definitions included. No `@types/` package needed.
68
75
  import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
69
76
  ```
70
77
 
78
+ ## Quality grade
79
+
80
+ The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions — completeness, text length, and noise:
81
+
82
+ | Grade | Score | Meaning |
83
+ |-------|-------|---------|
84
+ | A | ≥ 0.85 | Ready for LLM training or RAG |
85
+ | B | ≥ 0.65 | Usable with minor cleanup |
86
+ | C | ≥ 0.40 | Needs review before use |
87
+ | D | < 0.40 | Not suitable — empty, too short, or high noise |
88
+
89
+ Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
90
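+ where `lengthScore = Math.min(avgLength / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`; `completeness`, `avgLength`, and `garbageRatio` are the reported `quality.completeness`, `quality.avg_length`, and `noise.garbage_ratio`. The result is rounded to four decimal places.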
91
+
71
92
  ## Quality & noise
72
93
 
73
94
  `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
@@ -83,7 +104,7 @@ for (const t of texts) {
83
104
  const duplicateRatio = duplicates / texts.length
84
105
  ```
85
106
 
86
- ## Limitations (v0.1)
107
+ ## Limitations (v0.2)
87
108
 
88
109
  - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
89
110
  - `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).
package/dist/index.cjs CHANGED
@@ -221,14 +221,33 @@ function applyMask(text, findings, strategy = "redact") {
221
221
  }
222
222
 
223
223
  // src/index.ts
224
- var version = "0.1.0";
224
+ var version = "0.2.0";
225
+ function computeQualityScore(completeness, avgLength, garbageRatio) {
226
+ const lengthScore = Math.min(avgLength / 500, 1);
227
+ const noiseScore = Math.max(0, 1 - garbageRatio * 10);
228
+ return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
229
+ }
230
+ function computeQualityGrade(score) {
231
+ if (score >= 0.85) return "A";
232
+ if (score >= 0.65) return "B";
233
+ if (score >= 0.4) return "C";
234
+ return "D";
235
+ }
225
236
  function audit(text, options = {}) {
226
237
  const locale = options.locale ?? "tr";
227
- return {
228
- pii: detectPii(text, locale),
229
- quality: qualityMetrics(text),
230
- noise: noiseMetrics(text)
231
- };
238
+ const pii = detectPii(text, locale);
239
+ const quality = qualityMetrics(text);
240
+ const noise = noiseMetrics(text);
241
+ const quality_score = computeQualityScore(
242
+ quality.completeness,
243
+ quality.avg_length,
244
+ noise.garbage_ratio
245
+ );
246
+ const quality_grade = computeQualityGrade(quality_score);
247
+ const counts = /* @__PURE__ */ new Map();
248
+ for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
249
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
250
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise };
232
251
  }
233
252
  function mask(text, findings, options = {}) {
234
253
  return applyMask(text, findings, options.strategy ?? "redact");
package/dist/index.d.cts CHANGED
@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
27
27
  *
28
28
  * @example
29
29
  * import { audit, mask } from "@flexorch/audit"
30
+ * import { readFileSync } from "fs"
30
31
  *
32
+ * const text = readFileSync("contract.txt", "utf8")
31
33
  * const result = audit(text, { locale: "tr" })
32
- * // {
33
- * // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
34
- * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
- * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
- * // }
34
+ *
35
+ * result.quality_grade // "A"
36
+ * result.quality_score // 0.91
37
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
+ *
39
+ * // Raw findings and metrics also available:
40
+ * result.pii // [{ type, value, start, end }, ...]
41
+ * result.quality // { completeness, avg_length, duplicate_ratio }
42
+ * result.noise // { garbage_ratio, encoding_ok }
37
43
  *
38
44
  * const clean = mask(text, result.pii, { strategy: "redact" })
39
45
  * // "Contact: [REDACTED_EMAIL]"
40
46
  */
41
47
 
42
- declare const version = "0.1.0";
48
+ declare const version = "0.2.0";
49
+ type QualityGrade = "A" | "B" | "C" | "D";
50
+ interface PiiSummaryEntry {
51
+ type: string;
52
+ count: number;
53
+ }
43
54
  interface AuditOptions {
44
55
  /**
45
56
  * Active locale-specific detectors.
@@ -53,6 +64,13 @@ interface AuditOptions {
53
64
  locale?: string;
54
65
  }
55
66
  interface AuditResult {
67
+ /** A/B/C/D overall LLM-readiness grade. */
68
+ quality_grade: QualityGrade;
69
+ /** 0.0–1.0 composite score: completeness × (0.4 × noise score + 0.4 × length score + 0.2). */
70
+ quality_score: number;
71
+ /** PII findings aggregated by type: [{ type, count }]. */
72
+ pii_summary: PiiSummaryEntry[];
73
+ /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
56
74
  pii: PiiFinding[];
57
75
  quality: QualityMetrics;
58
76
  noise: NoiseMetrics;
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
70
88
  */
71
89
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
90
 
73
- export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
91
+ export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
package/dist/index.d.ts CHANGED
@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
27
27
  *
28
28
  * @example
29
29
  * import { audit, mask } from "@flexorch/audit"
30
+ * import { readFileSync } from "fs"
30
31
  *
32
+ * const text = readFileSync("contract.txt", "utf8")
31
33
  * const result = audit(text, { locale: "tr" })
32
- * // {
33
- * // pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
34
- * // quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
35
- * // noise: { garbage_ratio: 0.0, encoding_ok: true },
36
- * // }
34
+ *
35
+ * result.quality_grade // "A"
36
+ * result.quality_score // 0.91
37
+ * result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
38
+ *
39
+ * // Raw findings and metrics also available:
40
+ * result.pii // [{ type, value, start, end }, ...]
41
+ * result.quality // { completeness, avg_length, duplicate_ratio }
42
+ * result.noise // { garbage_ratio, encoding_ok }
37
43
  *
38
44
  * const clean = mask(text, result.pii, { strategy: "redact" })
39
45
  * // "Contact: [REDACTED_EMAIL]"
40
46
  */
41
47
 
42
- declare const version = "0.1.0";
48
+ declare const version = "0.2.0";
49
+ type QualityGrade = "A" | "B" | "C" | "D";
50
+ interface PiiSummaryEntry {
51
+ type: string;
52
+ count: number;
53
+ }
43
54
  interface AuditOptions {
44
55
  /**
45
56
  * Active locale-specific detectors.
@@ -53,6 +64,13 @@ interface AuditOptions {
53
64
  locale?: string;
54
65
  }
55
66
  interface AuditResult {
67
+ /** A/B/C/D overall LLM-readiness grade. */
68
+ quality_grade: QualityGrade;
69
+ /** 0.0–1.0 composite score: completeness × (0.4 × noise score + 0.4 × length score + 0.2). */
70
+ quality_score: number;
71
+ /** PII findings aggregated by type: [{ type, count }]. */
72
+ pii_summary: PiiSummaryEntry[];
73
+ /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
56
74
  pii: PiiFinding[];
57
75
  quality: QualityMetrics;
58
76
  noise: NoiseMetrics;
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
70
88
  */
71
89
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
72
90
 
73
- export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
91
+ export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
package/dist/index.js CHANGED
@@ -189,14 +189,33 @@ function applyMask(text, findings, strategy = "redact") {
189
189
  }
190
190
 
191
191
  // src/index.ts
192
- var version = "0.1.0";
192
+ var version = "0.2.0";
193
+ function computeQualityScore(completeness, avgLength, garbageRatio) {
194
+ const lengthScore = Math.min(avgLength / 500, 1);
195
+ const noiseScore = Math.max(0, 1 - garbageRatio * 10);
196
+ return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
197
+ }
198
+ function computeQualityGrade(score) {
199
+ if (score >= 0.85) return "A";
200
+ if (score >= 0.65) return "B";
201
+ if (score >= 0.4) return "C";
202
+ return "D";
203
+ }
193
204
  function audit(text, options = {}) {
194
205
  const locale = options.locale ?? "tr";
195
- return {
196
- pii: detectPii(text, locale),
197
- quality: qualityMetrics(text),
198
- noise: noiseMetrics(text)
199
- };
206
+ const pii = detectPii(text, locale);
207
+ const quality = qualityMetrics(text);
208
+ const noise = noiseMetrics(text);
209
+ const quality_score = computeQualityScore(
210
+ quality.completeness,
211
+ quality.avg_length,
212
+ noise.garbage_ratio
213
+ );
214
+ const quality_grade = computeQualityGrade(quality_score);
215
+ const counts = /* @__PURE__ */ new Map();
216
+ for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
217
+ const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
218
+ return { quality_grade, quality_score, pii_summary, pii, quality, noise };
200
219
  }
201
220
  function mask(text, findings, options = {}) {
202
221
  return applyMask(text, findings, options.strategy ?? "redact");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",