@flexorch/audit 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,13 +1,38 @@
1
1
  # @flexorch/audit
2
2
 
3
- Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
3
+ [![npm](https://img.shields.io/npm/v/@flexorch/audit)](https://www.npmjs.com/package/@flexorch/audit)
4
+ [![Node](https://img.shields.io/node/v/@flexorch/audit)](https://www.npmjs.com/package/@flexorch/audit)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
4
6
 
5
- - **Quality grade** A/B/C/D score that signals LLM-readiness at a glance
6
- - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
7
- - **Quality metrics** — completeness, average length, duplicate ratio
8
- - **Noise metrics** — garbage character ratio, encoding health
9
- - **Masking** redact / replace / token / hash strategies
7
+ Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
8
+
9
+ ## Why
10
+
11
+ Before feeding documents into an LLM pipeline you need to answer three questions:
12
+
13
+ 1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
14
+ 2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
15
+ 3. **How bad is the noise?** Garbled encodings and control characters degrade model output silently.
16
+
17
+ Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
18
+
19
+ ## Features
20
+
21
+ - **Quality grade** — A/B/C/D composite score that shows at a glance whether the text is LLM-ready
22
+ - **PII detection** — email, phone (TR mobile + E.164), credit card (Luhn), IPv4, IPv6, TCKN, VKN, IBAN (mod-97 validated), SSN, label-prefixed names
23
+ - **Batch audit** — `auditBatch()` aggregates duplicate ratio and PII counts across an entire dataset in one call
24
+ - **Noise metrics** — garbage character ratio, encoding health check
25
+ - **Masking** — four strategies: redact, replace (synthetic), token, hash
10
26
  - **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
27
+ - **TypeScript-first** — full type definitions, no `@types/` package needed
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ npm install @flexorch/audit
33
+ ```
34
+
35
+ ## Quick start
11
36
 
12
37
  ```ts
13
38
  import { audit, mask } from "@flexorch/audit"
@@ -20,7 +45,6 @@ result.quality_grade // "A"
20
45
  result.quality_score // 0.91 (0.0–1.0 composite)
21
46
  result.pii_summary // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
22
47
 
23
- // Raw findings and metrics — also available:
24
48
  result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
25
49
  result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
26
50
  result.noise // { garbage_ratio: 0.0, encoding_ok: true }
@@ -29,21 +53,31 @@ const clean = mask(text, result.pii, { strategy: "redact" })
29
53
  // "Contact: [REDACTED_EMAIL]"
30
54
  ```
31
55
 
32
- ## Install
56
+ ![demo](assets/demo.svg)
33
57
 
34
- ```bash
35
- npm install @flexorch/audit
36
- ```
58
+ ## Batch audit
37
59
 
38
- ![demo](assets/demo.svg)
60
+ Use `auditBatch()` to audit an entire dataset and get aggregate metrics including `duplicate_ratio`:
61
+
62
+ ```ts
63
+ import { auditBatch } from "@flexorch/audit"
64
+
65
+ const texts = dataset.map((r) => r.text)
66
+ const batch = auditBatch(texts, { locale: "tr" })
67
+
68
+ batch.duplicate_ratio // 0.12 — fraction of exact-duplicate records
69
+ batch.avg_quality_score // 0.78
70
+ batch.pii_summary // [{ type: "email", count: 47 }, ...]
71
+ batch.results // AuditResult[], one per text
72
+ ```
39
73
 
40
74
  ## Locale support
41
75
 
42
76
  | `locale` | Active detectors |
43
77
  |----------|-----------------|
44
- | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
45
- | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
46
- | `"eu"` | email, iban, credit_card, ip + E.164 phone |
78
+ | `"tr"` (default) | email, iban, credit_card, ip, ip_v6 + TCKN, VKN, phone_tr, name |
79
+ | `"us"` | email, iban, credit_card, ip, ip_v6 + SSN, E.164 phone |
80
+ | `"eu"` | email, iban, credit_card, ip, ip_v6 + E.164 phone |
47
81
  | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
48
82
 
49
83
  ## PII types
@@ -51,11 +85,13 @@ npm install @flexorch/audit
51
85
  | Type | Description | Locale |
52
86
  |------|-------------|--------|
53
87
  | `email` | RFC-5321 address | all |
54
- | `iban` | ISO 13616 IBAN (any country) | all |
88
+ | `iban` | ISO 13616 IBAN (mod-97 checksum validated) | all |
55
89
  | `credit_card` | 16-digit groups, Luhn-validated | all |
56
90
  | `ip` | IPv4 address | all |
91
+ | `ip_v6` | IPv6 address (full, compressed, loopback) | all |
57
92
  | `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
58
93
  | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
94
+ | `tax_id_tr` | VKN — 10-digit Luhn-variant checksum | tr |
59
95
  | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
60
96
  | `phone` | E.164 international phone | us, eu |
61
97
  | `ssn` | US Social Security Number (###-##-####) | us |
@@ -65,53 +101,37 @@ npm install @flexorch/audit
65
101
  | Strategy | Example output |
66
102
  |----------|----------------|
67
103
  | `redact` (default) | `[REDACTED_EMAIL]` |
68
- | `replace` | `user@example.com` (realistic synthetic) |
69
- | `token` | `<PII_EMAIL_1>` (unique per type) |
104
+ | `replace` | `user@example.com` (static synthetic) |
105
+ | `token` | `<PII_EMAIL_1>` (unique per type per call) |
70
106
  | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
71
107
 
72
108
  ## TypeScript
73
109
 
74
- Full type definitions included. No `@types/` package needed.
75
-
76
110
  ```ts
77
- import { audit, mask, type AuditResult, type PiiFinding } from "@flexorch/audit"
111
+ import {
112
+ audit, auditBatch, mask,
113
+ type AuditResult, type BatchAuditResult, type PiiFinding,
114
+ } from "@flexorch/audit"
78
115
  ```
79
116
 
80
117
  ## Quality grade
81
118
 
82
- The `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals derived from three dimensions:
119
+ `quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals:
83
120
 
84
- | Grade | Score | Meaning |
85
- |-------|-------|---------|
121
+ | Grade | Score | Signal |
122
+ |-------|-------|--------|
86
123
  | A | ≥ 0.85 | Ready for LLM training or RAG |
87
124
  | B | ≥ 0.65 | Usable with minor cleanup |
88
- | C | ≥ 0.40 | Needs review before use |
125
+ | C | ≥ 0.40 | Review before use |
89
126
  | D | < 0.40 | Not suitable — empty, too short, or high noise |
90
127
 
91
- Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
92
- where `lengthScore = Math.min(charCount / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`.
128
+ Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
129
+ where `lengthScore = Math.min(charCount / 500, 1.0)` and `noiseScore = Math.max(0, 1 − garbageRatio × 10)`.
93
130
 
94
- ## Quality & noise
95
-
96
- `duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
97
-
98
- ```ts
99
- const texts = dataset.map((r) => r.text)
100
- const seen = new Set<string>()
101
- let duplicates = 0
102
- for (const t of texts) {
103
- if (seen.has(t)) duplicates++
104
- else seen.add(t)
105
- }
106
- const duplicateRatio = duplicates / texts.length
107
- ```
108
-
109
- ## Limitations (v0.2)
131
+ ## Limitations (v0.3)
110
132
 
111
133
  - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
112
- - `duplicate_ratio` is per-call; aggregate across your dataset manually (see above).
113
- - IPv6 not detected.
114
- - IBAN format-only check; mod-97 validation not performed.
134
+ - `replace` masking strategy uses static synthetic values; locale-aware realistic synthesis is not yet implemented.
115
135
 
116
136
  ## Also available for Python
117
137
 
@@ -119,6 +139,10 @@ const duplicateRatio = duplicates / texts.length
119
139
  pip install flexorch-audit
120
140
  ```
121
141
 
142
+ ## Contributing
143
+
144
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
145
+
122
146
  ## License
123
147
 
124
148
  MIT
package/dist/index.cjs CHANGED
@@ -274,7 +274,7 @@ function applyMask(text, findings, strategy = "redact") {
274
274
  }
275
275
 
276
276
  // src/index.ts
277
- var version = "0.3.0";
277
+ var version = "0.3.1";
278
278
  function computeQualityScore(completeness, avgLength, garbageRatio) {
279
279
  const lengthScore = Math.min(avgLength / 500, 1);
280
280
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/dist/index.d.cts CHANGED
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
45
  * // "Contact: [REDACTED_EMAIL]"
46
46
  */
47
47
 
48
- declare const version = "0.3.0";
48
+ declare const version = "0.3.1";
49
49
  type QualityGrade = "A" | "B" | "C" | "D";
50
50
  interface PiiSummaryEntry {
51
51
  type: string;
package/dist/index.d.ts CHANGED
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
45
45
  * // "Contact: [REDACTED_EMAIL]"
46
46
  */
47
47
 
48
- declare const version = "0.3.0";
48
+ declare const version = "0.3.1";
49
49
  type QualityGrade = "A" | "B" | "C" | "D";
50
50
  interface PiiSummaryEntry {
51
51
  type: string;
package/dist/index.js CHANGED
@@ -241,7 +241,7 @@ function applyMask(text, findings, strategy = "redact") {
241
241
  }
242
242
 
243
243
  // src/index.ts
244
- var version = "0.3.0";
244
+ var version = "0.3.1";
245
245
  function computeQualityScore(completeness, avgLength, garbageRatio) {
246
246
  const lengthScore = Math.min(avgLength / 500, 1);
247
247
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",