npm - @flexorch/audit - Versions diffs - 0.1.0 → 0.2.0 - Mend

@flexorch/audit 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md CHANGED Viewed

@@ -2,6 +2,7 @@
 Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
+- **Quality grade** — A/B/C/D score that signals LLM-readiness at a glance
 - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
 - **Quality metrics** — completeness, average length, duplicate ratio
 - **Noise metrics** — garbage character ratio, encoding health
@@ -10,13 +11,19 @@ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one questi
 ```ts
 import { audit, mask } from "@flexorch/audit"
+import { readFileSync } from "fs"
+const text = readFileSync("contract.txt", "utf8")
 const result = audit(text, { locale: "tr" })
-// {
-//   pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
-//   quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
-//   noise: { garbage_ratio: 0.0, encoding_ok: true },
-// }
+result.quality_grade   // "A"
+result.quality_score   // 0.91  (0.0–1.0 composite)
+result.pii_summary     // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
+// Raw findings and metrics — also available:
+result.pii             // [{ type: "email", value: "...", start: 8, end: 23 }]
+result.quality         // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
+result.noise           // { garbage_ratio: 0.0, encoding_ok: true }
 const clean = mask(text, result.pii, { strategy: "redact" })
 // "Contact: [REDACTED_EMAIL]"
@@ -68,6 +75,20 @@ Full type definitions included. No `@types/` package needed.
 import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
 ```
+## Quality grade
+The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions:
+| Grade | Score | Meaning |
+|-------|-------|---------|
+| A | ≥ 0.85 | Ready for LLM training or RAG |
+| B | ≥ 0.65 | Usable with minor cleanup |
+| C | ≥ 0.40 | Needs review before use |
+| D | < 0.40 | Not suitable — empty, too short, or high noise |
+Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
+where `lengthScore = Math.min(charCount / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`.
 ## Quality & noise
 `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
@@ -83,7 +104,7 @@ for (const t of texts) {
 const duplicateRatio = duplicates / texts.length
 ```
-## Limitations (v0.1)
+## Limitations (v0.2)
 - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
 - `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).

package/dist/index.cjs CHANGED Viewed

@@ -221,14 +221,33 @@ function applyMask(text, findings, strategy = "redact") {
 }
 // src/index.ts
-var version = "0.1.0";
+var version = "0.2.0";
+function computeQualityScore(completeness, avgLength, garbageRatio) {
+  const lengthScore = Math.min(avgLength / 500, 1);
+  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
+  return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
+}
+function computeQualityGrade(score) {
+  if (score >= 0.85) return "A";
+  if (score >= 0.65) return "B";
+  if (score >= 0.4) return "C";
+  return "D";
+}
 function audit(text, options = {}) {
   const locale = options.locale ?? "tr";
-  return {
-    pii: detectPii(text, locale),
-    quality: qualityMetrics(text),
-    noise: noiseMetrics(text)
-  };
+  const pii = detectPii(text, locale);
+  const quality = qualityMetrics(text);
+  const noise = noiseMetrics(text);
+  const quality_score = computeQualityScore(
+    quality.completeness,
+    quality.avg_length,
+    noise.garbage_ratio
+  );
+  const quality_grade = computeQualityGrade(quality_score);
+  const counts = /* @__PURE__ */ new Map();
+  for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
+  const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
+  return { quality_grade, quality_score, pii_summary, pii, quality, noise };
 }
 function mask(text, findings, options = {}) {
   return applyMask(text, findings, options.strategy ?? "redact");

package/dist/index.d.cts CHANGED Viewed

@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
  *
  * @example
  * import { audit, mask } from "@flexorch/audit"
+ * import { readFileSync } from "fs"
  *
+ * const text = readFileSync("contract.txt", "utf8")
  * const result = audit(text, { locale: "tr" })
- * // {
- * //   pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
- * //   quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
- * //   noise: { garbage_ratio: 0.0, encoding_ok: true },
- * // }
+ *
+ * result.quality_grade   // "A"
+ * result.quality_score   // 0.91
+ * result.pii_summary     // [{ type: "national_id_tr", count: 3 }, ...]
+ *
+ * // Raw findings and metrics also available:
+ * result.pii             // [{ type, value, start, end }, ...]
+ * result.quality         // { completeness, avg_length, duplicate_ratio }
+ * result.noise           // { garbage_ratio, encoding_ok }
  *
  * const clean = mask(text, result.pii, { strategy: "redact" })
  * // "Contact: [REDACTED_EMAIL]"
  */
-declare const version = "0.1.0";
+declare const version = "0.2.0";
+type QualityGrade = "A" | "B" | "C" | "D";
+interface PiiSummaryEntry {
+    type: string;
+    count: number;
+}
 interface AuditOptions {
     /**
      * Active locale-specific detectors.
@@ -53,6 +64,13 @@ interface AuditOptions {
     locale?: string;
 }
 interface AuditResult {
+    /** A/B/C/D overall LLM-readiness grade. */
+    quality_grade: QualityGrade;
+    /** 0.0–1.0 composite score (completeness + length + noise). */
+    quality_score: number;
+    /** PII findings aggregated by type: [{ type, count }]. */
+    pii_summary: PiiSummaryEntry[];
+    /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
     pii: PiiFinding[];
     quality: QualityMetrics;
     noise: NoiseMetrics;
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
  */
 declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
-export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
+export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };

package/dist/index.d.ts CHANGED Viewed

@@ -27,19 +27,30 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
  *
  * @example
  * import { audit, mask } from "@flexorch/audit"
+ * import { readFileSync } from "fs"
  *
+ * const text = readFileSync("contract.txt", "utf8")
  * const result = audit(text, { locale: "tr" })
- * // {
- * //   pii: [{ type: "email", value: "ali@example.com", start: 8, end: 23 }],
- * //   quality: { completeness: 1.0, avg_length: 342, duplicate_ratio: null },
- * //   noise: { garbage_ratio: 0.0, encoding_ok: true },
- * // }
+ *
+ * result.quality_grade   // "A"
+ * result.quality_score   // 0.91
+ * result.pii_summary     // [{ type: "national_id_tr", count: 3 }, ...]
+ *
+ * // Raw findings and metrics also available:
+ * result.pii             // [{ type, value, start, end }, ...]
+ * result.quality         // { completeness, avg_length, duplicate_ratio }
+ * result.noise           // { garbage_ratio, encoding_ok }
  *
  * const clean = mask(text, result.pii, { strategy: "redact" })
  * // "Contact: [REDACTED_EMAIL]"
  */
-declare const version = "0.1.0";
+declare const version = "0.2.0";
+type QualityGrade = "A" | "B" | "C" | "D";
+interface PiiSummaryEntry {
+    type: string;
+    count: number;
+}
 interface AuditOptions {
     /**
      * Active locale-specific detectors.
@@ -53,6 +64,13 @@ interface AuditOptions {
     locale?: string;
 }
 interface AuditResult {
+    /** A/B/C/D overall LLM-readiness grade. */
+    quality_grade: QualityGrade;
+    /** 0.0–1.0 composite score (completeness + length + noise). */
+    quality_score: number;
+    /** PII findings aggregated by type: [{ type, count }]. */
+    pii_summary: PiiSummaryEntry[];
+    /** Raw PII findings sorted by position: [{ type, value, start, end }]. */
     pii: PiiFinding[];
     quality: QualityMetrics;
     noise: NoiseMetrics;
@@ -70,4 +88,4 @@ declare function audit(text: string, options?: AuditOptions): AuditResult;
  */
 declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
-export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };
+export { type AuditOptions, type AuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, detectPii, mask, noiseMetrics, qualityMetrics, version };

package/dist/index.js CHANGED Viewed

@@ -189,14 +189,33 @@ function applyMask(text, findings, strategy = "redact") {
 }
 // src/index.ts
-var version = "0.1.0";
+var version = "0.2.0";
+function computeQualityScore(completeness, avgLength, garbageRatio) {
+  const lengthScore = Math.min(avgLength / 500, 1);
+  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
+  return Math.round(completeness * (0.4 * noiseScore + 0.4 * lengthScore + 0.2) * 1e4) / 1e4;
+}
+function computeQualityGrade(score) {
+  if (score >= 0.85) return "A";
+  if (score >= 0.65) return "B";
+  if (score >= 0.4) return "C";
+  return "D";
+}
 function audit(text, options = {}) {
   const locale = options.locale ?? "tr";
-  return {
-    pii: detectPii(text, locale),
-    quality: qualityMetrics(text),
-    noise: noiseMetrics(text)
-  };
+  const pii = detectPii(text, locale);
+  const quality = qualityMetrics(text);
+  const noise = noiseMetrics(text);
+  const quality_score = computeQualityScore(
+    quality.completeness,
+    quality.avg_length,
+    noise.garbage_ratio
+  );
+  const quality_grade = computeQualityGrade(quality_score);
+  const counts = /* @__PURE__ */ new Map();
+  for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
+  const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
+  return { quality_grade, quality_score, pii_summary, pii, quality, noise };
 }
 function mask(text, findings, options = {}) {
   return applyMask(text, findings, options.strategy ?? "redact");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@flexorch/audit",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
   "keywords": [
     "pii",