@flexorch/audit 0.3.0 → 0.3.1
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +70 -46
- package/dist/index.cjs +1 -1
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,13 +1,38 @@
|
|
|
1
1
|
# @flexorch/audit
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[npm version](https://www.npmjs.com/package/@flexorch/audit)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/@flexorch/audit)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
4
6
|
|
|
5
|
-
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
|
|
8
|
+
|
|
9
|
+
## Why
|
|
10
|
+
|
|
11
|
+
Before feeding documents into an LLM pipeline you need to answer three questions:
|
|
12
|
+
|
|
13
|
+
1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
|
|
14
|
+
2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
|
|
15
|
+
3. **How bad is the noise?** Garbled encodings and control characters degrade model output silently.
|
|
16
|
+
|
|
17
|
+
Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Quality grade** — A/B/C/D composite score: is this text LLM-ready at a glance?
|
|
22
|
+
- **PII detection** — email, phone (TR mobile + E.164), credit card (Luhn), IPv4, IPv6, TCKN, VKN, IBAN (mod-97 validated), SSN, label-prefixed names
|
|
23
|
+
- **Batch audit** — `auditBatch()` aggregates duplicate ratio and PII counts across an entire dataset in one call
|
|
24
|
+
- **Noise metrics** — garbage character ratio, encoding health check
|
|
25
|
+
- **Masking** — four strategies: redact, replace (synthetic), token, hash
|
|
10
26
|
- **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
|
|
27
|
+
- **TypeScript-first** — full type definitions, no `@types/` package needed
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npm install @flexorch/audit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick start
|
|
11
36
|
|
|
12
37
|
```ts
|
|
13
38
|
import { audit, mask } from "@flexorch/audit"
|
|
@@ -20,7 +45,6 @@ result.quality_grade // "A"
|
|
|
20
45
|
result.quality_score // 0.91 (0.0–1.0 composite)
|
|
21
46
|
result.pii_summary // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
|
|
22
47
|
|
|
23
|
-
// Raw findings and metrics — also available:
|
|
24
48
|
result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
|
|
25
49
|
result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
|
|
26
50
|
result.noise // { garbage_ratio: 0.0, encoding_ok: true }
|
|
@@ -29,21 +53,31 @@ const clean = mask(text, result.pii, { strategy: "redact" })
|
|
|
29
53
|
// "Contact: [REDACTED_EMAIL]"
|
|
30
54
|
```
|
|
31
55
|
|
|
32
|
-
|
|
56
|
+

|
|
33
57
|
|
|
34
|
-
|
|
35
|
-
npm install @flexorch/audit
|
|
36
|
-
```
|
|
58
|
+
## Batch audit
|
|
37
59
|
|
|
38
|
-
|
|
60
|
+
Use `auditBatch()` to audit an entire dataset and get aggregate metrics including `duplicate_ratio`:
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
import { auditBatch } from "@flexorch/audit"
|
|
64
|
+
|
|
65
|
+
const texts = dataset.map((r) => r.text)
|
|
66
|
+
const batch = auditBatch(texts, { locale: "tr" })
|
|
67
|
+
|
|
68
|
+
batch.duplicate_ratio // 0.12 — fraction of exact-duplicate records
|
|
69
|
+
batch.avg_quality_score // 0.78
|
|
70
|
+
batch.pii_summary // [{ type: "email", count: 47 }, ...]
|
|
71
|
+
batch.results // AuditResult[], one per text
|
|
72
|
+
```
|
|
39
73
|
|
|
40
74
|
## Locale support
|
|
41
75
|
|
|
42
76
|
| `locale` | Active detectors |
|
|
43
77
|
|----------|-----------------|
|
|
44
|
-
| `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
|
|
45
|
-
| `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
|
|
46
|
-
| `"eu"` | email, iban, credit_card, ip + E.164 phone |
|
|
78
|
+
| `"tr"` (default) | email, iban, credit_card, ip, ip_v6 + TCKN, VKN, phone_tr, name |
|
|
79
|
+
| `"us"` | email, iban, credit_card, ip, ip_v6 + SSN, E.164 phone |
|
|
80
|
+
| `"eu"` | email, iban, credit_card, ip, ip_v6 + E.164 phone |
|
|
47
81
|
| `"all"` | All of the above (phone_tr takes precedence over generic phone) |
|
|
48
82
|
|
|
49
83
|
## PII types
|
|
@@ -51,11 +85,13 @@ npm install @flexorch/audit
|
|
|
51
85
|
| Type | Description | Locale |
|
|
52
86
|
|------|-------------|--------|
|
|
53
87
|
| `email` | RFC-5321 address | all |
|
|
54
|
-
| `iban` | ISO 13616 IBAN
|
|
88
|
+
| `iban` | ISO 13616 IBAN — mod-97 checksum validated | all |
|
|
55
89
|
| `credit_card` | 16-digit groups, Luhn-validated | all |
|
|
56
90
|
| `ip` | IPv4 address | all |
|
|
91
|
+
| `ip_v6` | IPv6 address (full, compressed, loopback) | all |
|
|
57
92
|
| `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
|
|
58
93
|
| `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
|
|
94
|
+
| `tax_id_tr` | VKN — 10-digit Luhn-variant checksum | tr |
|
|
59
95
|
| `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
|
|
60
96
|
| `phone` | E.164 international phone | us, eu |
|
|
61
97
|
| `ssn` | US Social Security Number (###-##-####) | us |
|
|
@@ -65,53 +101,37 @@ npm install @flexorch/audit
|
|
|
65
101
|
| Strategy | Example output |
|
|
66
102
|
|----------|----------------|
|
|
67
103
|
| `redact` (default) | `[REDACTED_EMAIL]` |
|
|
68
|
-
| `replace` | `user@example.com` (
|
|
69
|
-
| `token` | `<PII_EMAIL_1>` (unique per type) |
|
|
104
|
+
| `replace` | `user@example.com` (static synthetic) |
|
|
105
|
+
| `token` | `<PII_EMAIL_1>` (unique per type per call) |
|
|
70
106
|
| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
|
|
71
107
|
|
|
72
108
|
## TypeScript
|
|
73
109
|
|
|
74
|
-
Full type definitions included. No `@types/` package needed.
|
|
75
|
-
|
|
76
110
|
```ts
|
|
77
|
-
import {
|
|
111
|
+
import {
|
|
112
|
+
audit, auditBatch, mask,
|
|
113
|
+
type AuditResult, type BatchAuditResult, type PiiFinding,
|
|
114
|
+
} from "@flexorch/audit"
|
|
78
115
|
```
|
|
79
116
|
|
|
80
117
|
## Quality grade
|
|
81
118
|
|
|
82
|
-
|
|
119
|
+
`quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals:
|
|
83
120
|
|
|
84
|
-
| Grade | Score |
|
|
85
|
-
|
|
121
|
+
| Grade | Score | Signal |
|
|
122
|
+
|-------|-------|--------|
|
|
86
123
|
| A | ≥ 0.85 | Ready for LLM training or RAG |
|
|
87
124
|
| B | ≥ 0.65 | Usable with minor cleanup |
|
|
88
|
-
| C | ≥ 0.40 |
|
|
125
|
+
| C | ≥ 0.40 | Review before use |
|
|
89
126
|
| D | < 0.40 | Not suitable — empty, too short, or high noise |
|
|
90
127
|
|
|
91
|
-
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
92
|
-
|
|
128
|
+
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
129
|
+
`lengthScore = Math.min(charCount / 500, 1.0)` · `noiseScore = Math.max(0, 1 − garbageRatio × 10)`
|
|
93
130
|
|
|
94
|
-
##
|
|
95
|
-
|
|
96
|
-
`duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
|
|
97
|
-
|
|
98
|
-
```ts
|
|
99
|
-
const texts = dataset.map((r) => r.text)
|
|
100
|
-
const seen = new Set<string>()
|
|
101
|
-
let duplicates = 0
|
|
102
|
-
for (const t of texts) {
|
|
103
|
-
if (seen.has(t)) duplicates++
|
|
104
|
-
else seen.add(t)
|
|
105
|
-
}
|
|
106
|
-
const duplicateRatio = duplicates / texts.length
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
## Limitations (v0.2)
|
|
131
|
+
## Limitations (v0.4)
|
|
110
132
|
|
|
111
133
|
- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
|
|
112
|
-
- `
|
|
113
|
-
- IPv6 not detected.
|
|
114
|
-
- IBAN format-only check; mod-97 validation not performed.
|
|
134
|
+
- `replace` masking strategy uses static synthetic values; locale-aware realistic synthesis is not yet implemented.
|
|
115
135
|
|
|
116
136
|
## Also available for Python
|
|
117
137
|
|
|
@@ -119,6 +139,10 @@ const duplicateRatio = duplicates / texts.length
|
|
|
119
139
|
pip install flexorch-audit
|
|
120
140
|
```
|
|
121
141
|
|
|
142
|
+
## Contributing
|
|
143
|
+
|
|
144
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
145
|
+
|
|
122
146
|
## License
|
|
123
147
|
|
|
124
148
|
MIT
|
package/dist/index.cjs
CHANGED
|
@@ -274,7 +274,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
274
274
|
}
|
|
275
275
|
|
|
276
276
|
// src/index.ts
|
|
277
|
-
var version = "0.3.0";
|
|
277
|
+
var version = "0.3.1";
|
|
278
278
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
279
279
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
280
280
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
package/dist/index.d.cts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.3.0";
|
|
48
|
+
declare const version = "0.3.1";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
package/dist/index.d.ts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.3.0";
|
|
48
|
+
declare const version = "0.3.1";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
package/dist/index.js
CHANGED
|
@@ -241,7 +241,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
241
241
|
}
|
|
242
242
|
|
|
243
243
|
// src/index.ts
|
|
244
|
-
var version = "0.3.0";
|
|
244
|
+
var version = "0.3.1";
|
|
245
245
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
246
246
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
247
247
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|