@flexorch/audit 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +275 -148
- package/dist/index.cjs +216 -7
- package/dist/index.d.cts +25 -10
- package/dist/index.d.ts +25 -10
- package/dist/index.js +215 -7
- package/package.json +49 -49
package/README.md
CHANGED
|
@@ -1,148 +1,275 @@
|
|
|
1
|
-
# @flexorch/audit
|
|
2
|
-
|
|
3
|
-
[](https://www.npmjs.com/package/@flexorch/audit)
|
|
4
|
-
[](https://www.npmjs.com/package/@flexorch/audit)
|
|
5
|
-
[](LICENSE)
|
|
6
|
-
|
|
7
|
-
Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
|
|
8
|
-
|
|
9
|
-
## Why
|
|
10
|
-
|
|
11
|
-
Before feeding documents into an LLM pipeline you need to answer three questions:
|
|
12
|
-
|
|
13
|
-
1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
|
|
14
|
-
2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
|
|
15
|
-
3. **How bad is the noise?** Garbled encodings and
|
|
16
|
-
|
|
17
|
-
Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
|
|
18
|
-
|
|
19
|
-
## Features
|
|
20
|
-
|
|
21
|
-
- **Quality grade** — A/B/C/D composite score: is this text LLM-ready at a glance?
|
|
22
|
-
- **
|
|
23
|
-
- **
|
|
24
|
-
- **
|
|
25
|
-
- **Masking** — four strategies: redact, replace (synthetic), token, hash
|
|
26
|
-
- **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
|
|
27
|
-
- **TypeScript-first** — full type definitions, no `@types/` package needed
|
|
28
|
-
|
|
29
|
-
## Install
|
|
30
|
-
|
|
31
|
-
```bash
|
|
32
|
-
npm install @flexorch/audit
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
## Quick start
|
|
36
|
-
|
|
37
|
-
```ts
|
|
38
|
-
import { audit, mask } from "@flexorch/audit"
|
|
39
|
-
import { readFileSync } from "fs"
|
|
40
|
-
|
|
41
|
-
const text = readFileSync("contract.txt", "utf8")
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
result.
|
|
47
|
-
|
|
48
|
-
result.
|
|
49
|
-
result.
|
|
50
|
-
result.
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
//
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
batch
|
|
69
|
-
|
|
70
|
-
batch.
|
|
71
|
-
batch.
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
| `
|
|
79
|
-
|
|
80
|
-
| `"
|
|
81
|
-
| `"all"` |
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
|
86
|
-
|
|
87
|
-
| `
|
|
88
|
-
| `
|
|
89
|
-
| `
|
|
90
|
-
| `
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
|
102
|
-
|
|
103
|
-
| `
|
|
104
|
-
| `
|
|
105
|
-
| `
|
|
106
|
-
| `
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
`
|
|
120
|
-
|
|
121
|
-
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
1
|
+
# @flexorch/audit
|
|
2
|
+
|
|
3
|
+
[](https://www.npmjs.com/package/@flexorch/audit)
|
|
4
|
+
[](https://www.npmjs.com/package/@flexorch/audit)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
|
|
7
|
+
Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
|
|
8
|
+
|
|
9
|
+
## Why
|
|
10
|
+
|
|
11
|
+
Before feeding documents into an LLM pipeline you need to answer three questions:
|
|
12
|
+
|
|
13
|
+
1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
|
|
14
|
+
2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
|
|
15
|
+
3. **How bad is the noise?** Garbled encodings and symbol clutter degrade model output silently.
|
|
16
|
+
|
|
17
|
+
Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Quality grade** — A/B/C/D composite score: is this text LLM-ready at a glance?
|
|
22
|
+
- **Noise ratio** — line-level symbol clutter detection (`noise_ratio`); values above 0.20 indicate likely extraction artifacts
|
|
23
|
+
- **PII detection** — 30+ types across 8 countries (TR/DE/FR/IT/NL/ES/UK/US) + universal types; all regex-based with checksum validation
|
|
24
|
+
- **Batch audit** — `auditBatch()` aggregates duplicate ratio and PII counts across an entire dataset in one call
|
|
25
|
+
- **Masking** — four strategies: redact, replace (synthetic), token, hash
|
|
26
|
+
- **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
|
|
27
|
+
- **TypeScript-first** — full type definitions, no `@types/` package needed
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npm install @flexorch/audit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick start
|
|
36
|
+
|
|
37
|
+
```ts
|
|
38
|
+
import { audit, mask } from "@flexorch/audit"
|
|
39
|
+
import { readFileSync } from "fs"
|
|
40
|
+
|
|
41
|
+
const text = readFileSync("contract.txt", "utf8") // extract from PDF/DOCX first
|
|
42
|
+
|
|
43
|
+
const result = audit(text) // "und" by default — all detectors active
|
|
44
|
+
// const result = audit(text, { locale: "tr" }) // restrict to TR-only detectors
|
|
45
|
+
|
|
46
|
+
result.quality_grade // "B"
|
|
47
|
+
result.quality_score // 0.73 (0.0–1.0 composite)
|
|
48
|
+
result.noise_ratio // 0.04 (fraction of blank/garbage lines; >0.20 = low quality)
|
|
49
|
+
result.detected_language // "und" (locale you passed in; caller controls language)
|
|
50
|
+
result.pii_summary // [{ type: "email", count: 2 }, { type: "national_id_tr", count: 1 }]
|
|
51
|
+
|
|
52
|
+
result.pii // [{ type: "email", value: "ali@example.com", start: 8, end: 23 }]
|
|
53
|
+
result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
|
|
54
|
+
result.noise // { garbage_ratio: 0.0, encoding_ok: true }
|
|
55
|
+
|
|
56
|
+
const clean = mask(text, result.pii, { strategy: "redact" })
|
|
57
|
+
// "Contact: [REDACTED_EMAIL]"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+

|
|
61
|
+
|
|
62
|
+
## Batch audit
|
|
63
|
+
|
|
64
|
+
```ts
|
|
65
|
+
import { auditBatch } from "@flexorch/audit"
|
|
66
|
+
|
|
67
|
+
const texts = dataset.map((r) => r.text)
|
|
68
|
+
const batch = auditBatch(texts) // locale: "und" by default
|
|
69
|
+
|
|
70
|
+
batch.duplicate_ratio // 0.12 — fraction of exact-duplicate records
|
|
71
|
+
batch.avg_quality_score // 0.78
|
|
72
|
+
batch.pii_summary // [{ type: "email", count: 47 }, ...]
|
|
73
|
+
batch.results // AuditResult[], one per text
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Country coverage
|
|
77
|
+
|
|
78
|
+
| `locale` | Detectors activated |
|
|
79
|
+
|----------|---------------------|
|
|
80
|
+
| `"und"` **(default)** | All locales combined — use when document language is unknown |
|
|
81
|
+
| `"all"` | Alias for `"und"` |
|
|
82
|
+
| `"tr"` | TCKN · VKN · phone_tr · name · IBAN_TR · company_name_tr · MERSIS · postal_code_tr · province_tr |
|
|
83
|
+
| `"de"` | Steueridentifikationsnummer · Sozialversicherungsnummer |
|
|
84
|
+
| `"fr"` | SIREN · SIRET · INSEE/NIR |
|
|
85
|
+
| `"it"` | Codice Fiscale · Partita IVA |
|
|
86
|
+
| `"nl"` | BSN · KvK |
|
|
87
|
+
| `"es"` | DNI/NIE · CIF |
|
|
88
|
+
| `"uk"` | NI number · UTR |
|
|
89
|
+
| `"us"` | SSN · EIN · ITIN |
|
|
90
|
+
| `"eu"` | E.164 phone · IBAN (EU+GB+CH+NO) · company name |
|
|
91
|
+
|
|
92
|
+
Universal detectors (always active regardless of locale): `email` · `iban` · `credit_card` · `ip` · `ip_v6`
|
|
93
|
+
|
|
94
|
+
> **Language detection:** `@flexorch/audit` is zero-dependency — no language detection library is included.
|
|
95
|
+
> Pass the correct `locale` yourself, or use `"und"` (default) to activate all detectors.
|
|
96
|
+
|
|
97
|
+
## PII types
|
|
98
|
+
|
|
99
|
+
### Universal
|
|
100
|
+
|
|
101
|
+
| Type | Description |
|
|
102
|
+
|------|-------------|
|
|
103
|
+
| `email` | RFC-5321 email address |
|
|
104
|
+
| `iban` | ISO 13616 IBAN — mod-97 validated; suppressed when `iban_tr` or `iban_intl` fires on same span |
|
|
105
|
+
| `credit_card` | 16-digit groups, Luhn-validated |
|
|
106
|
+
| `ip` | IPv4 address |
|
|
107
|
+
| `ip_v6` | IPv6 — full, compressed `::`, loopback forms |
|
|
108
|
+
|
|
109
|
+
### Turkey (`locale="tr"`)
|
|
110
|
+
|
|
111
|
+
| Type | Description |
|
|
112
|
+
|------|-------------|
|
|
113
|
+
| `national_id_tr` | TCKN — 11-digit, modular arithmetic checksum |
|
|
114
|
+
| `tax_id_tr` | VKN — 10-digit, Luhn-variant checksum |
|
|
115
|
+
| `phone_tr` | Turkish mobile: `+90`/`0` prefix + 10 digits |
|
|
116
|
+
| `name` | Label-prefixed name: `Adı:`, `Full Name:`, `Customer Name:`, etc. |
|
|
117
|
+
| `iban_tr` | Turkish IBAN (`TR` + 24 chars), mod-97 validated |
|
|
118
|
+
| `company_name_tr` | Company with TR legal suffix: A.Ş. · Ltd.Şti. · Koll.Şti. · Koop. · T.A.Ş. |
|
|
119
|
+
| `mersis_no` | MERSIS — 16-digit company registry number |
|
|
120
|
+
| `postal_code_tr` | Turkish postal code (province plate 01–81) |
|
|
121
|
+
| `province_tr` | All 81 Turkish provinces |
|
|
122
|
+
|
|
123
|
+
### Germany (`locale="de"`)
|
|
124
|
+
|
|
125
|
+
| Type | Description |
|
|
126
|
+
|------|-------------|
|
|
127
|
+
| `tax_id_de` | Steueridentifikationsnummer — 11 digits, ISO 7064 MOD 11,2 checksum |
|
|
128
|
+
| `social_id_de` | Sozialversicherungsnummer — area + DOB + letter + serial |
|
|
129
|
+
|
|
130
|
+
### France (`locale="fr"`)
|
|
131
|
+
|
|
132
|
+
| Type | Description |
|
|
133
|
+
|------|-------------|
|
|
134
|
+
| `siret_fr` | SIRET — 14 digits, label-prefix gated |
|
|
135
|
+
| `company_id_fr` | SIREN — 9 digits, label-prefix gated |
|
|
136
|
+
| `social_id_fr` | INSEE/NIR — 15 digits, starts with `1` or `2` |
|
|
137
|
+
|
|
138
|
+
### Italy (`locale="it"`)
|
|
139
|
+
|
|
140
|
+
| Type | Description |
|
|
141
|
+
|------|-------------|
|
|
142
|
+
| `national_id_it` | Codice Fiscale — 16 chars alphanumeric, uppercase normalized |
|
|
143
|
+
| `tax_id_it` | Partita IVA — 11 digits, Agenzia delle Entrate checksum |
|
|
144
|
+
|
|
145
|
+
### Netherlands (`locale="nl"`)
|
|
146
|
+
|
|
147
|
+
| Type | Description |
|
|
148
|
+
|------|-------------|
|
|
149
|
+
| `national_id_nl` | BSN — 9 digits, 11-check (weighted sum mod 11) |
|
|
150
|
+
| `company_id_nl` | KvK — 8 digits, label-prefix gated |
|
|
151
|
+
|
|
152
|
+
### Spain (`locale="es"`)
|
|
153
|
+
|
|
154
|
+
| Type | Description |
|
|
155
|
+
|------|-------------|
|
|
156
|
+
| `national_id_es` | DNI (8 digits + letter, mod-23) and NIE (X/Y/Z prefix, same check) |
|
|
157
|
+
| `tax_id_es` | CIF — letter prefix + 7 digits + control character |
|
|
158
|
+
|
|
159
|
+
### United Kingdom (`locale="uk"`)
|
|
160
|
+
|
|
161
|
+
| Type | Description |
|
|
162
|
+
|------|-------------|
|
|
163
|
+
| `social_id_uk` | NI number — 2 letters + 6 digits + A/B/C/D; HMRC forbidden prefixes excluded |
|
|
164
|
+
| `tax_id_uk` | UTR — 10 digits, label-prefix gated |
|
|
165
|
+
|
|
166
|
+
### United States (`locale="us"`)
|
|
167
|
+
|
|
168
|
+
| Type | Description |
|
|
169
|
+
|------|-------------|
|
|
170
|
+
| `ssn` | SSN — `###-##-####`, invalid prefixes (000/666/9xx) excluded |
|
|
171
|
+
| `tax_id_us` | EIN — `XX-XXXXXXX`, IRS invalid area prefixes excluded |
|
|
172
|
+
| `national_id_us` | ITIN — `9XX-7X/8X/9X-XXXX` middle group validated |
|
|
173
|
+
|
|
174
|
+
### EU / International (`locale="eu"`)
|
|
175
|
+
|
|
176
|
+
| Type | Description |
|
|
177
|
+
|------|-------------|
|
|
178
|
+
| `phone_intl` | E.164 international phone — 7–15 digits, TR (+90) excluded |
|
|
179
|
+
| `iban_intl` | IBAN for EU+GB+CH+NO — ISO 13616 country+length table + mod-97 |
|
|
180
|
+
| `company_name_intl` | Company with international suffix: GmbH · LLC · S.r.l. · B.V. · SAS · Inc. · Ltd. etc. |
|
|
181
|
+
|
|
182
|
+
## Noise detection
|
|
183
|
+
|
|
184
|
+
`noise_ratio` measures the fraction of lines that are blank or contain symbol clutter:
|
|
185
|
+
|
|
186
|
+
```ts
|
|
187
|
+
const result = audit("clean line\n@@@garbage\n\nclean")
|
|
188
|
+
result.noise_ratio // 0.5 (2 noisy lines out of 4)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
A line is "noisy" when it is blank (after trim) or contains 3+ consecutive characters from `@ # ! ~ * =`.
|
|
192
|
+
|
|
193
|
+
| `noise_ratio` | Signal |
|
|
194
|
+
|---------------|--------|
|
|
195
|
+
| `< 0.05` | Clean — likely well-extracted text |
|
|
196
|
+
| `0.05–0.20` | Acceptable — minor formatting artifacts |
|
|
197
|
+
| `> 0.20` | Low quality — likely OCR noise or extraction failure |
|
|
198
|
+
|
|
199
|
+
## Masking strategies
|
|
200
|
+
|
|
201
|
+
```ts
|
|
202
|
+
const clean = mask(text, result.pii) // redact (default)
|
|
203
|
+
const clean = mask(text, result.pii, { strategy: "token" })
|
|
204
|
+
const clean = mask(text, result.pii, { strategy: "hash" })
|
|
205
|
+
const clean = mask(text, result.pii, { strategy: "replace" })
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
| Strategy | Example output |
|
|
209
|
+
|----------|----------------|
|
|
210
|
+
| `redact` (default) | `[REDACTED_EMAIL]` |
|
|
211
|
+
| `replace` | `user@example.com` (static synthetic) |
|
|
212
|
+
| `token` | `<PII_EMAIL_1>` (unique per type per call) |
|
|
213
|
+
| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
|
|
214
|
+
|
|
215
|
+
## TypeScript
|
|
216
|
+
|
|
217
|
+
Full type definitions — no `@types/` package needed:
|
|
218
|
+
|
|
219
|
+
```ts
|
|
220
|
+
import {
|
|
221
|
+
audit, auditBatch, mask,
|
|
222
|
+
type AuditResult, type BatchAuditResult,
|
|
223
|
+
type PiiFinding, type AuditOptions,
|
|
224
|
+
} from "@flexorch/audit"
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
`AuditResult` includes:
|
|
228
|
+
|
|
229
|
+
```ts
|
|
230
|
+
interface AuditResult {
|
|
231
|
+
quality_grade: "A" | "B" | "C" | "D"
|
|
232
|
+
quality_score: number
|
|
233
|
+
noise_ratio: number
|
|
234
|
+
detected_language: string
|
|
235
|
+
pii_summary: { type: string; count: number }[]
|
|
236
|
+
pii: { type: string; value: string; start: number; end: number }[]
|
|
237
|
+
quality: { completeness: number; avg_length: number; duplicate_ratio: number | null }
|
|
238
|
+
noise: { garbage_ratio: number; encoding_ok: boolean }
|
|
239
|
+
}
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## Quality grade
|
|
243
|
+
|
|
244
|
+
`quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals:
|
|
245
|
+
|
|
246
|
+
| Grade | Score | Signal |
|
|
247
|
+
|-------|-------|--------|
|
|
248
|
+
| A | ≥ 0.85 | Ready for LLM training or RAG |
|
|
249
|
+
| B | ≥ 0.65 | Usable with minor cleanup |
|
|
250
|
+
| C | ≥ 0.40 | Review before use |
|
|
251
|
+
| D | < 0.40 | Not suitable — empty, too short, or high noise |
|
|
252
|
+
|
|
253
|
+
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
254
|
+
`lengthScore = Math.min(charCount / 500, 1.0)` · `noiseScore = Math.max(0, 1 − garbageRatio × 10)`
|
|
255
|
+
|
|
256
|
+
## Limitations
|
|
257
|
+
|
|
258
|
+
- **No automatic language detection** — `@flexorch/audit` has zero dependencies. Pass `locale` explicitly, or use the default `"und"` to activate all detectors.
|
|
259
|
+
- **Free-standing name detection** (without a label prefix) requires NLP/NER — not included.
|
|
260
|
+
- `replace` masking uses static synthetic values; locale-aware realistic synthesis is not implemented.
|
|
261
|
+
- The library audits plain text. PDF/DOCX parsing, e-invoice extraction, and pipeline orchestration are out of scope.
|
|
262
|
+
|
|
263
|
+
## Also available for Python
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
pip install flexorch-audit
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Contributing
|
|
270
|
+
|
|
271
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
272
|
+
|
|
273
|
+
## License
|
|
274
|
+
|
|
275
|
+
MIT
|
package/dist/index.cjs
CHANGED
|
@@ -26,6 +26,7 @@ __export(index_exports, {
|
|
|
26
26
|
detectPii: () => detectPii,
|
|
27
27
|
mask: () => mask,
|
|
28
28
|
noiseMetrics: () => noiseMetrics,
|
|
29
|
+
noiseRatio: () => noiseRatio,
|
|
29
30
|
qualityMetrics: () => qualityMetrics,
|
|
30
31
|
version: () => version
|
|
31
32
|
});
|
|
@@ -190,6 +191,23 @@ var COMPANY_NAME_INTL_RE = new RegExp(
|
|
|
190
191
|
"gu"
|
|
191
192
|
);
|
|
192
193
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
194
|
+
var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
|
|
195
|
+
var ITIN_US_RE = /\b(9\d{2}-(?:7[0-9]|8[0-8]|9[0-24-9])-\d{4})\b/g;
|
|
196
|
+
var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
|
|
197
|
+
var SVNR_DE_RE = /\b(\d{4}[01]\d[0-3]\d[A-Z]\d{4})\b/g;
|
|
198
|
+
var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
|
|
199
|
+
var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
|
|
200
|
+
var INSEE_FR_RE = /\b([12]\d{14})\b/g;
|
|
201
|
+
var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
|
|
202
|
+
var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
|
|
203
|
+
var BSN_NL_RE = /\b(\d{9})\b/g;
|
|
204
|
+
var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
|
|
205
|
+
var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
|
|
206
|
+
var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
|
|
207
|
+
var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
|
|
208
|
+
var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
|
|
209
|
+
var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
|
|
210
|
+
var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
|
|
193
211
|
function validTckn(s) {
|
|
194
212
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
195
213
|
const d = s.split("").map(Number);
|
|
@@ -250,6 +268,71 @@ function validPhoneIntl(raw) {
|
|
|
250
268
|
const digits = raw.replace(/\D/g, "");
|
|
251
269
|
return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
|
|
252
270
|
}
|
|
271
|
+
function validSteuerIdDe(s) {
|
|
272
|
+
if (s.length !== 11 || s[0] === "0") return false;
|
|
273
|
+
let product = 10;
|
|
274
|
+
for (let i = 0; i < 10; i++) {
|
|
275
|
+
let total = (parseInt(s[i]) + product) % 10;
|
|
276
|
+
if (total === 0) total = 10;
|
|
277
|
+
product = total * 2 % 11;
|
|
278
|
+
}
|
|
279
|
+
let check = 11 - product;
|
|
280
|
+
if (check === 10) check = 0;
|
|
281
|
+
return check === parseInt(s[10]);
|
|
282
|
+
}
|
|
283
|
+
function validPartitaIvaIt(s) {
|
|
284
|
+
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
285
|
+
let oddSum = 0;
|
|
286
|
+
let evenSum = 0;
|
|
287
|
+
for (let i = 0; i < 10; i += 2) oddSum += parseInt(s[i]);
|
|
288
|
+
for (let i = 1; i < 10; i += 2) {
|
|
289
|
+
let v = parseInt(s[i]) * 2;
|
|
290
|
+
evenSum += v < 10 ? v : v - 9;
|
|
291
|
+
}
|
|
292
|
+
return (10 - (oddSum + evenSum) % 10) % 10 === parseInt(s[10]);
|
|
293
|
+
}
|
|
294
|
+
function validBsnNl(s) {
|
|
295
|
+
if (s.length !== 9 || !/^\d+$/.test(s)) return false;
|
|
296
|
+
let total = 0;
|
|
297
|
+
for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
|
|
298
|
+
total -= parseInt(s[8]);
|
|
299
|
+
return total > 0 && total % 11 === 0;
|
|
300
|
+
}
|
|
301
|
+
function validDniEs(s) {
|
|
302
|
+
if (s.length !== 9 || !/^\d{8}/.test(s)) return false;
|
|
303
|
+
return _DNI_LETTER_TABLE[parseInt(s.slice(0, 8)) % 23] === s[8];
|
|
304
|
+
}
|
|
305
|
+
function validNieEs(s) {
|
|
306
|
+
if (s.length !== 9 || !"XYZ".includes(s[0])) return false;
|
|
307
|
+
const prefix = { X: "0", Y: "1", Z: "2" }[s[0]];
|
|
308
|
+
return _DNI_LETTER_TABLE[parseInt(prefix + s.slice(1, 8)) % 23] === s[8];
|
|
309
|
+
}
|
|
310
|
+
var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set(["BG", "GB", "KN", "NK", "NT", "TN", "ZZ"]);
|
|
311
|
+
function validNiUk(s) {
|
|
312
|
+
return !_NI_UK_FORBIDDEN.has(s.slice(0, 2).toUpperCase());
|
|
313
|
+
}
|
|
314
|
+
var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
|
|
315
|
+
"00",
|
|
316
|
+
"07",
|
|
317
|
+
"08",
|
|
318
|
+
"09",
|
|
319
|
+
"17",
|
|
320
|
+
"18",
|
|
321
|
+
"19",
|
|
322
|
+
"28",
|
|
323
|
+
"29",
|
|
324
|
+
"49",
|
|
325
|
+
"69",
|
|
326
|
+
"70",
|
|
327
|
+
"78",
|
|
328
|
+
"79",
|
|
329
|
+
"89",
|
|
330
|
+
"96",
|
|
331
|
+
"97"
|
|
332
|
+
]);
|
|
333
|
+
function validEinUs(s) {
|
|
334
|
+
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
335
|
+
}
|
|
253
336
|
var LOCALE_DETECTORS = {
|
|
254
337
|
tr: /* @__PURE__ */ new Set([
|
|
255
338
|
"national_id_tr",
|
|
@@ -262,12 +345,18 @@ var LOCALE_DETECTORS = {
|
|
|
262
345
|
"postal_code_tr",
|
|
263
346
|
"province_tr"
|
|
264
347
|
]),
|
|
265
|
-
us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
|
|
266
|
-
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
|
|
348
|
+
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
349
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
350
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
351
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
352
|
+
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
353
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
354
|
+
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
355
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
267
356
|
};
|
|
268
357
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
269
358
|
function activeDetectors(locale) {
|
|
270
|
-
if (locale === "all") {
|
|
359
|
+
if (locale === "all" || locale === "und") {
|
|
271
360
|
const active2 = new Set(UNIVERSAL);
|
|
272
361
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
273
362
|
detectors.forEach((d) => active2.add(d));
|
|
@@ -287,7 +376,7 @@ function findAll(re, text, type) {
|
|
|
287
376
|
}
|
|
288
377
|
return results;
|
|
289
378
|
}
|
|
290
|
-
function detectPii(text, locale = "
|
|
379
|
+
function detectPii(text, locale = "und") {
|
|
291
380
|
const active = activeDetectors(locale);
|
|
292
381
|
const t = text ?? "";
|
|
293
382
|
let findings = [];
|
|
@@ -389,6 +478,115 @@ function detectPii(text, locale = "tr") {
|
|
|
389
478
|
}
|
|
390
479
|
}
|
|
391
480
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
481
|
+
if (active.has("tax_id_us")) {
|
|
482
|
+
EIN_US_RE.lastIndex = 0;
|
|
483
|
+
let m;
|
|
484
|
+
while ((m = EIN_US_RE.exec(t)) !== null) {
|
|
485
|
+
if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
if (active.has("national_id_us")) {
|
|
489
|
+
ITIN_US_RE.lastIndex = 0;
|
|
490
|
+
let m;
|
|
491
|
+
while ((m = ITIN_US_RE.exec(t)) !== null) {
|
|
492
|
+
findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
if (active.has("tax_id_de")) {
|
|
496
|
+
STEUER_ID_DE_RE.lastIndex = 0;
|
|
497
|
+
let m;
|
|
498
|
+
while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
|
|
499
|
+
if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
if (active.has("social_id_de")) {
|
|
503
|
+
SVNR_DE_RE.lastIndex = 0;
|
|
504
|
+
let m;
|
|
505
|
+
while ((m = SVNR_DE_RE.exec(t)) !== null) {
|
|
506
|
+
findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
if (active.has("siret_fr")) {
|
|
510
|
+
SIRET_FR_RE.lastIndex = 0;
|
|
511
|
+
let m;
|
|
512
|
+
while ((m = SIRET_FR_RE.exec(t)) !== null) {
|
|
513
|
+
findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
if (active.has("company_id_fr")) {
|
|
517
|
+
SIREN_FR_RE.lastIndex = 0;
|
|
518
|
+
let m;
|
|
519
|
+
while ((m = SIREN_FR_RE.exec(t)) !== null) {
|
|
520
|
+
findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
if (active.has("social_id_fr")) {
|
|
524
|
+
INSEE_FR_RE.lastIndex = 0;
|
|
525
|
+
let m;
|
|
526
|
+
while ((m = INSEE_FR_RE.exec(t)) !== null) {
|
|
527
|
+
findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
if (active.has("national_id_it")) {
|
|
531
|
+
CODICE_FISCALE_IT_RE.lastIndex = 0;
|
|
532
|
+
let m;
|
|
533
|
+
while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
|
|
534
|
+
findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
if (active.has("tax_id_it")) {
|
|
538
|
+
PARTITA_IVA_IT_RE.lastIndex = 0;
|
|
539
|
+
let m;
|
|
540
|
+
while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
|
|
541
|
+
if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
if (active.has("national_id_nl")) {
|
|
545
|
+
BSN_NL_RE.lastIndex = 0;
|
|
546
|
+
let m;
|
|
547
|
+
while ((m = BSN_NL_RE.exec(t)) !== null) {
|
|
548
|
+
if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
if (active.has("company_id_nl")) {
|
|
552
|
+
KVK_NL_RE.lastIndex = 0;
|
|
553
|
+
let m;
|
|
554
|
+
while ((m = KVK_NL_RE.exec(t)) !== null) {
|
|
555
|
+
findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
if (active.has("national_id_es")) {
|
|
559
|
+
DNI_ES_RE.lastIndex = 0;
|
|
560
|
+
let m;
|
|
561
|
+
while ((m = DNI_ES_RE.exec(t)) !== null) {
|
|
562
|
+
if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
563
|
+
}
|
|
564
|
+
NIE_ES_RE.lastIndex = 0;
|
|
565
|
+
while ((m = NIE_ES_RE.exec(t)) !== null) {
|
|
566
|
+
if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
if (active.has("tax_id_es")) {
|
|
570
|
+
CIF_ES_RE.lastIndex = 0;
|
|
571
|
+
let m;
|
|
572
|
+
while ((m = CIF_ES_RE.exec(t)) !== null) {
|
|
573
|
+
findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
if (active.has("social_id_uk")) {
|
|
577
|
+
NI_UK_RE.lastIndex = 0;
|
|
578
|
+
let m;
|
|
579
|
+
while ((m = NI_UK_RE.exec(t)) !== null) {
|
|
580
|
+
if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
if (active.has("tax_id_uk")) {
|
|
584
|
+
UTR_UK_RE.lastIndex = 0;
|
|
585
|
+
let m;
|
|
586
|
+
while ((m = UTR_UK_RE.exec(t)) !== null) {
|
|
587
|
+
findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
588
|
+
}
|
|
589
|
+
}
|
|
392
590
|
if (active.has("iban_intl")) {
|
|
393
591
|
IBAN_INTL_RE.lastIndex = 0;
|
|
394
592
|
let m;
|
|
@@ -437,6 +635,15 @@ function isGarbage(ch) {
|
|
|
437
635
|
return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
|
|
438
636
|
cp >= 55296 && cp <= 57343;
|
|
439
637
|
}
|
|
638
|
+
var LINE_NOISE_RE = /[@#!~*=]{3,}/;
|
|
639
|
+
function noiseRatio(text) {
|
|
640
|
+
if (!text) return 0;
|
|
641
|
+
const lines = text.split("\n");
|
|
642
|
+
const total = lines.length;
|
|
643
|
+
if (total === 0) return 0;
|
|
644
|
+
const noisy = lines.filter((line) => !line.trim() || LINE_NOISE_RE.test(line)).length;
|
|
645
|
+
return Math.round(noisy / total * 1e4) / 1e4;
|
|
646
|
+
}
|
|
440
647
|
function noiseMetrics(text) {
|
|
441
648
|
if (!text) return { garbage_ratio: 0, encoding_ok: true };
|
|
442
649
|
const n = text.length;
|
|
@@ -493,7 +700,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
493
700
|
}
|
|
494
701
|
|
|
495
702
|
// src/index.ts
|
|
496
|
-
var version = "0.
|
|
703
|
+
var version = "0.5.1";
|
|
497
704
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
498
705
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
499
706
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -506,10 +713,11 @@ function computeQualityGrade(score) {
|
|
|
506
713
|
return "D";
|
|
507
714
|
}
|
|
508
715
|
function audit(text, options = {}) {
|
|
509
|
-
const locale = options.locale ?? "
|
|
716
|
+
const locale = options.locale ?? "und";
|
|
510
717
|
const pii = detectPii(text, locale);
|
|
511
718
|
const quality = qualityMetrics(text);
|
|
512
719
|
const noise = noiseMetrics(text);
|
|
720
|
+
const noise_ratio = noiseRatio(text);
|
|
513
721
|
const quality_score = computeQualityScore(
|
|
514
722
|
quality.completeness,
|
|
515
723
|
quality.avg_length,
|
|
@@ -519,7 +727,7 @@ function audit(text, options = {}) {
|
|
|
519
727
|
const counts = /* @__PURE__ */ new Map();
|
|
520
728
|
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
521
729
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
522
|
-
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
730
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
|
|
523
731
|
}
|
|
524
732
|
function auditBatch(texts, options = {}) {
|
|
525
733
|
if (texts.length === 0) {
|
|
@@ -551,6 +759,7 @@ function mask(text, findings, options = {}) {
|
|
|
551
759
|
detectPii,
|
|
552
760
|
mask,
|
|
553
761
|
noiseMetrics,
|
|
762
|
+
noiseRatio,
|
|
554
763
|
qualityMetrics,
|
|
555
764
|
version
|
|
556
765
|
});
|
package/dist/index.d.cts
CHANGED
|
@@ -17,6 +17,12 @@ interface NoiseMetrics {
|
|
|
17
17
|
garbage_ratio: number;
|
|
18
18
|
encoding_ok: boolean;
|
|
19
19
|
}
|
|
20
|
+
/**
|
|
21
|
+
* Fraction of lines that are blank or contain symbol noise (`[@#!~*=]{3+}`).
|
|
22
|
+
* Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
|
|
23
|
+
* indicate a document likely to reduce extraction quality.
|
|
24
|
+
*/
|
|
25
|
+
declare function noiseRatio(text: string): number;
|
|
20
26
|
declare function noiseMetrics(text: string): NoiseMetrics;
|
|
21
27
|
|
|
22
28
|
type MaskStrategy = "redact" | "replace" | "token" | "hash";
|
|
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
30
36
|
* import { readFileSync } from "fs"
|
|
31
37
|
*
|
|
32
38
|
* const text = readFileSync("contract.txt", "utf8")
|
|
33
|
-
* const result = audit(text
|
|
39
|
+
* const result = audit(text) // locale defaults to "und" (all detectors)
|
|
40
|
+
* const result = audit(text, { locale: "tr" }) // Turkish-only detectors
|
|
34
41
|
*
|
|
35
|
-
* result.quality_grade
|
|
36
|
-
* result.quality_score
|
|
37
|
-
* result.
|
|
42
|
+
* result.quality_grade // "A"
|
|
43
|
+
* result.quality_score // 0.91
|
|
44
|
+
* result.noise_ratio // 0.03 (line-level noise fraction)
|
|
45
|
+
* result.detected_language // "und" (locale passed in — caller controls language)
|
|
46
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
47
|
*
|
|
39
48
|
* // Raw findings and metrics also available:
|
|
40
49
|
* result.pii // [{ type, value, start, end }, ...]
|
|
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
55
|
*/
|
|
47
56
|
|
|
48
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.5.1";
|
|
49
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
59
|
interface PiiSummaryEntry {
|
|
51
60
|
type: string;
|
|
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
|
|
|
54
63
|
interface AuditOptions {
|
|
55
64
|
/**
|
|
56
65
|
* Active locale-specific detectors.
|
|
57
|
-
* - "
|
|
58
|
-
* - "
|
|
59
|
-
* - "
|
|
60
|
-
* - "
|
|
66
|
+
* - "und" — All detectors combined (default; use when language is unknown)
|
|
67
|
+
* - "all" — Alias for "und"
|
|
68
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
|
|
69
|
+
* - "us" — US: SSN, EIN, ITIN, E.164 phone, company_name_intl
|
|
70
|
+
* - "eu" — EU: E.164 phone, iban_intl, company_name_intl
|
|
71
|
+
* - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
|
|
61
72
|
*
|
|
62
73
|
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
74
|
*/
|
|
@@ -74,6 +85,10 @@ interface AuditResult {
|
|
|
74
85
|
pii: PiiFinding[];
|
|
75
86
|
quality: QualityMetrics;
|
|
76
87
|
noise: NoiseMetrics;
|
|
88
|
+
/** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
|
|
89
|
+
noise_ratio: number;
|
|
90
|
+
/** The locale value passed to audit() — caller-controlled language selection. */
|
|
91
|
+
detected_language: string;
|
|
77
92
|
}
|
|
78
93
|
interface MaskOptions {
|
|
79
94
|
/** @default "redact" */
|
|
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
102
117
|
*/
|
|
103
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
104
119
|
|
|
105
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
120
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -17,6 +17,12 @@ interface NoiseMetrics {
|
|
|
17
17
|
garbage_ratio: number;
|
|
18
18
|
encoding_ok: boolean;
|
|
19
19
|
}
|
|
20
|
+
/**
|
|
21
|
+
* Fraction of lines that are blank or contain symbol noise (`[@#!~*=]{3,}`).
|
|
22
|
+
* Mirrors the FlexOrch pipeline quality-step threshold — values above 0.20
|
|
23
|
+
* indicate a document likely to reduce extraction quality.
|
|
24
|
+
*/
|
|
25
|
+
declare function noiseRatio(text: string): number;
|
|
20
26
|
declare function noiseMetrics(text: string): NoiseMetrics;
|
|
21
27
|
|
|
22
28
|
type MaskStrategy = "redact" | "replace" | "token" | "hash";
|
|
@@ -30,11 +36,14 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
30
36
|
* import { readFileSync } from "fs"
|
|
31
37
|
*
|
|
32
38
|
* const text = readFileSync("contract.txt", "utf8")
|
|
33
|
-
* const result = audit(text
|
|
39
|
+
* const result = audit(text) // locale defaults to "und" (all detectors)
|
|
40
|
+
* const result = audit(text, { locale: "tr" }) // Turkish-only detectors
|
|
34
41
|
*
|
|
35
|
-
* result.quality_grade
|
|
36
|
-
* result.quality_score
|
|
37
|
-
* result.
|
|
42
|
+
* result.quality_grade // "A"
|
|
43
|
+
* result.quality_score // 0.91
|
|
44
|
+
* result.noise_ratio // 0.03 (line-level noise fraction)
|
|
45
|
+
* result.detected_language // "und" (locale passed in — caller controls language)
|
|
46
|
+
* result.pii_summary // [{ type: "national_id_tr", count: 3 }, ...]
|
|
38
47
|
*
|
|
39
48
|
* // Raw findings and metrics also available:
|
|
40
49
|
* result.pii // [{ type, value, start, end }, ...]
|
|
@@ -45,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
55
|
*/
|
|
47
56
|
|
|
48
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.5.1";
|
|
49
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
59
|
interface PiiSummaryEntry {
|
|
51
60
|
type: string;
|
|
@@ -54,10 +63,12 @@ interface PiiSummaryEntry {
|
|
|
54
63
|
interface AuditOptions {
|
|
55
64
|
/**
|
|
56
65
|
* Active locale-specific detectors.
|
|
57
|
-
* - "
|
|
58
|
-
* - "
|
|
59
|
-
* - "
|
|
60
|
-
* - "
|
|
66
|
+
* - "und" — All detectors combined (default; use when language is unknown)
|
|
67
|
+
* - "all" — Alias for "und"
|
|
68
|
+
* - "tr" — Turkish: TCKN, VKN, phone_tr, name, iban_tr, company_name_tr, mersis_no, postal_code_tr, province_tr
|
|
69
|
+
* - "us" — US: SSN, EIN, ITIN, E.164 phone, company_name_intl
|
|
70
|
+
* - "eu" — EU: E.164 phone, iban_intl, company_name_intl
|
|
71
|
+
* - "de" / "fr" / "it" / "nl" / "es" / "uk" — country-specific detectors
|
|
61
72
|
*
|
|
62
73
|
* Universal detectors (email, iban, credit_card, ip, ip_v6) are always active.
|
|
63
74
|
*/
|
|
@@ -74,6 +85,10 @@ interface AuditResult {
|
|
|
74
85
|
pii: PiiFinding[];
|
|
75
86
|
quality: QualityMetrics;
|
|
76
87
|
noise: NoiseMetrics;
|
|
88
|
+
/** Fraction of lines that are blank or contain symbol noise (>0.20 = low quality). */
|
|
89
|
+
noise_ratio: number;
|
|
90
|
+
/** The locale value passed to audit() — caller-controlled language selection. */
|
|
91
|
+
detected_language: string;
|
|
77
92
|
}
|
|
78
93
|
interface MaskOptions {
|
|
79
94
|
/** @default "redact" */
|
|
@@ -102,4 +117,4 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
102
117
|
*/
|
|
103
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
104
119
|
|
|
105
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, qualityMetrics, version };
|
|
120
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
package/dist/index.js
CHANGED
|
@@ -157,6 +157,23 @@ var COMPANY_NAME_INTL_RE = new RegExp(
|
|
|
157
157
|
"gu"
|
|
158
158
|
);
|
|
159
159
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
160
|
+
var EIN_US_RE = /\b(\d{2}-\d{7})\b/g;
|
|
161
|
+
// US ITIN: 9XX-GG-NNNN where the group GG is 50-65, 70-88, 90-92 or 94-99.
// Capture group 1 is the full number. The regex is global, so reset lastIndex before reuse.
var ITIN_US_RE = /\b(9\d{2}-(?:5\d|6[0-5]|7\d|8[0-8]|9[0-24-9])-\d{4})\b/g;
|
|
162
|
+
var STEUER_ID_DE_RE = /\b([1-9]\d{10})\b/g;
|
|
163
|
+
// German social insurance number (Versicherungsnummer), 12 characters:
// 2-digit area code, birth date as DDMMYY, one uppercase letter, then 3 digits
// (serial number plus check digit). Capture group 1 is the full number.
// The regex is global, so reset lastIndex before reuse.
var SVNR_DE_RE = /\b(\d{2}[0-3]\d[01]\d\d{2}[A-Z]\d{3})\b/g;
|
|
164
|
+
var SIRET_FR_RE = /(?:SIRET|N°\s*SIRET|Num[eé]ro\s+SIRET|RCS)\s*[:#]*\s*(\d{14})\b/gi;
|
|
165
|
+
var SIREN_FR_RE = /(?:SIREN|N°\s*SIREN|Num[eé]ro\s+SIREN)\s*[:#]*\s*(\d{9})\b/gi;
|
|
166
|
+
var INSEE_FR_RE = /\b([12]\d{14})\b/g;
|
|
167
|
+
var CODICE_FISCALE_IT_RE = /\b([A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z])\b/gi;
|
|
168
|
+
var PARTITA_IVA_IT_RE = /\b(\d{11})\b/g;
|
|
169
|
+
var BSN_NL_RE = /\b(\d{9})\b/g;
|
|
170
|
+
var KVK_NL_RE = /(?:KVK|KvK|Handelsregister(?:nummer)?)\s*[:#]*\s*(\d{8})\b/gi;
|
|
171
|
+
var _DNI_LETTER_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
|
|
172
|
+
var DNI_ES_RE = /\b(\d{8}[A-Z])\b/g;
|
|
173
|
+
var NIE_ES_RE = /\b([XYZ]\d{7}[A-Z])\b/g;
|
|
174
|
+
var CIF_ES_RE = /\b([ABCDEFGHJKLMNPQRSUVW]\d{7}[0-9A-J])\b/g;
|
|
175
|
+
var NI_UK_RE = /\b([A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[ABCD])\b/g;
|
|
176
|
+
var UTR_UK_RE = /(?:UTR|Unique\s+Taxpayer(?:\s+Reference)?)\s*[:#]*\s*(\d{10})\b/gi;
|
|
160
177
|
function validTckn(s) {
|
|
161
178
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
162
179
|
const d = s.split("").map(Number);
|
|
@@ -217,6 +234,71 @@ function validPhoneIntl(raw) {
|
|
|
217
234
|
const digits = raw.replace(/\D/g, "");
|
|
218
235
|
return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
|
|
219
236
|
}
|
|
237
|
+
/**
 * Checks the check digit of a German tax identification number (Steuer-ID).
 *
 * The first ten digits are folded through an iterative mod-10 / mod-11
 * product, and the result is compared with the eleventh digit.
 *
 * @param {string} s - Candidate value; must be exactly 11 digits and not start with 0.
 * @returns {boolean} True when the computed check digit equals the last digit.
 */
function validSteuerIdDe(s) {
  // Reject anything that is not 11 digits up front instead of relying on NaN arithmetic.
  if (typeof s !== "string" || !/^[1-9]\d{10}$/.test(s)) return false;
  let product = 10;
  for (let i = 0; i < 10; i++) {
    let total = (Number.parseInt(s[i], 10) + product) % 10;
    if (total === 0) total = 10;
    product = (total * 2) % 11;
  }
  // product is never 0 here, so 11 - product lies in 1..10; 10 is encoded as check digit 0.
  const check = (11 - product) % 10;
  return check === Number.parseInt(s[10], 10);
}
|
|
249
|
+
/**
 * Checks the check digit of an Italian VAT number (Partita IVA).
 *
 * Luhn-style checksum over the first ten digits: digits at even zero-based
 * indexes are added as-is, digits at odd indexes are doubled (minus 9 when the
 * double exceeds 9). The eleventh digit must bring the total to a multiple of 10.
 *
 * @param {string} s - Candidate value; must be exactly 11 digits.
 * @returns {boolean} True when the checksum matches the last digit.
 */
function validPartitaIvaIt(s) {
  if (typeof s !== "string" || !/^\d{11}$/.test(s)) return false;
  let sum = 0;
  for (let i = 0; i < 10; i++) {
    const digit = Number.parseInt(s[i], 10);
    if (i % 2 === 0) {
      sum += digit;
    } else {
      const doubled = digit * 2;
      sum += doubled < 10 ? doubled : doubled - 9;
    }
  }
  return (10 - (sum % 10)) % 10 === Number.parseInt(s[10], 10);
}
|
|
260
|
+
/**
 * Checks a Dutch citizen service number (BSN) with the weighted mod-11 test.
 *
 * The first eight digits are weighted 9 down to 2, the ninth digit is
 * subtracted, and the total must be a positive multiple of 11.
 *
 * @param {string} s - Candidate value; must be exactly 9 digits.
 * @returns {boolean} True when the weighted sum is positive and divisible by 11.
 */
function validBsnNl(s) {
  if (typeof s !== "string" || !/^\d{9}$/.test(s)) return false;
  let total = 0;
  for (let i = 0; i < 8; i++) {
    total += (9 - i) * Number.parseInt(s[i], 10);
  }
  total -= Number.parseInt(s[8], 10);
  // total > 0 rejects the all-zero string, which would otherwise pass the mod-11 test.
  return total > 0 && total % 11 === 0;
}
|
|
267
|
+
/**
 * Checks the control letter of a Spanish DNI (8 digits followed by a letter).
 *
 * The letter is looked up in _DNI_LETTER_TABLE at index (number mod 23).
 *
 * @param {string} s - Candidate value, e.g. "12345678Z"; the letter must be uppercase.
 * @returns {boolean} True when the trailing letter matches the computed control letter.
 */
function validDniEs(s) {
  if (typeof s !== "string" || !/^\d{8}[A-Z]$/.test(s)) return false;
  const number = Number.parseInt(s.slice(0, 8), 10);
  return _DNI_LETTER_TABLE[number % 23] === s[8];
}
|
|
271
|
+
/**
 * Checks the control letter of a Spanish NIE (X/Y/Z, 7 digits, then a letter).
 *
 * The leading X, Y or Z is replaced by 0, 1 or 2, and the resulting 8-digit
 * number is checked against _DNI_LETTER_TABLE at index (number mod 23).
 *
 * @param {string} s - Candidate value, e.g. "X1234567L"; letters must be uppercase.
 * @returns {boolean} True when the trailing letter matches the computed control letter.
 */
function validNieEs(s) {
  // Require seven real digits: parseInt would otherwise silently parse only a
  // numeric prefix and let malformed input such as "X12a4567N" validate.
  if (typeof s !== "string" || !/^[XYZ]\d{7}[A-Z]$/.test(s)) return false;
  const prefix = { X: "0", Y: "1", Z: "2" }[s[0]];
  const number = Number.parseInt(prefix + s.slice(1, 8), 10);
  return _DNI_LETTER_TABLE[number % 23] === s[8];
}
|
|
276
|
+
// Two-letter prefixes that validNiUk rejects even when the overall shape is valid.
var _NI_UK_FORBIDDEN = /* @__PURE__ */ new Set(["BG", "GB", "KN", "NK", "NT", "TN", "ZZ"]);
/**
 * Checks a UK National Insurance number candidate.
 *
 * The value must have the NI shape (two prefix letters, six digits, suffix
 * A-D) and its prefix must not be in _NI_UK_FORBIDDEN. Matching is
 * case-insensitive.
 *
 * @param {string} s - Candidate value, e.g. "AB123456C".
 * @returns {boolean} False for non-strings, malformed values and forbidden prefixes.
 */
function validNiUk(s) {
  // Without this guard an empty or malformed string would pass, because its
  // "prefix" is simply absent from the forbidden set.
  if (typeof s !== "string" || !/^[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\d{6}[A-D]$/i.test(s)) {
    return false;
  }
  return !_NI_UK_FORBIDDEN.has(s.slice(0, 2).toUpperCase());
}
|
|
280
|
+
// Two-digit EIN prefixes that validEinUs rejects.
var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
  "00",
  "07",
  "08",
  "09",
  "17",
  "18",
  "19",
  "28",
  "29",
  "49",
  "69",
  "70",
  "78",
  "79",
  "89",
  "96",
  "97"
]);
/**
 * Checks a US Employer Identification Number candidate.
 *
 * The value must have the NN-NNNNNNN shape and its two-digit prefix must not
 * be in _EIN_INVALID_PREFIXES.
 *
 * @param {string} s - Candidate value, e.g. "12-3456789".
 * @returns {boolean} False for non-strings, malformed values and rejected prefixes.
 */
function validEinUs(s) {
  // Without this guard an empty or malformed string would pass, because its
  // "prefix" is simply absent from the invalid set.
  if (typeof s !== "string" || !/^\d{2}-\d{7}$/.test(s)) return false;
  return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
}
|
|
220
302
|
var LOCALE_DETECTORS = {
|
|
221
303
|
tr: /* @__PURE__ */ new Set([
|
|
222
304
|
"national_id_tr",
|
|
@@ -229,12 +311,18 @@ var LOCALE_DETECTORS = {
|
|
|
229
311
|
"postal_code_tr",
|
|
230
312
|
"province_tr"
|
|
231
313
|
]),
|
|
232
|
-
us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
|
|
233
|
-
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
|
|
314
|
+
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
315
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
316
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
317
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
318
|
+
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
319
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
320
|
+
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
321
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
234
322
|
};
|
|
235
323
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
236
324
|
function activeDetectors(locale) {
|
|
237
|
-
if (locale === "all") {
|
|
325
|
+
if (locale === "all" || locale === "und") {
|
|
238
326
|
const active2 = new Set(UNIVERSAL);
|
|
239
327
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
240
328
|
detectors.forEach((d) => active2.add(d));
|
|
@@ -254,7 +342,7 @@ function findAll(re, text, type) {
|
|
|
254
342
|
}
|
|
255
343
|
return results;
|
|
256
344
|
}
|
|
257
|
-
function detectPii(text, locale = "
|
|
345
|
+
function detectPii(text, locale = "und") {
|
|
258
346
|
const active = activeDetectors(locale);
|
|
259
347
|
const t = text ?? "";
|
|
260
348
|
let findings = [];
|
|
@@ -356,6 +444,115 @@ function detectPii(text, locale = "tr") {
|
|
|
356
444
|
}
|
|
357
445
|
}
|
|
358
446
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
447
|
+
if (active.has("tax_id_us")) {
|
|
448
|
+
EIN_US_RE.lastIndex = 0;
|
|
449
|
+
let m;
|
|
450
|
+
while ((m = EIN_US_RE.exec(t)) !== null) {
|
|
451
|
+
if (validEinUs(m[1])) findings.push({ type: "tax_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (active.has("national_id_us")) {
|
|
455
|
+
ITIN_US_RE.lastIndex = 0;
|
|
456
|
+
let m;
|
|
457
|
+
while ((m = ITIN_US_RE.exec(t)) !== null) {
|
|
458
|
+
findings.push({ type: "national_id_us", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
if (active.has("tax_id_de")) {
|
|
462
|
+
STEUER_ID_DE_RE.lastIndex = 0;
|
|
463
|
+
let m;
|
|
464
|
+
while ((m = STEUER_ID_DE_RE.exec(t)) !== null) {
|
|
465
|
+
if (validSteuerIdDe(m[1])) findings.push({ type: "tax_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
if (active.has("social_id_de")) {
|
|
469
|
+
SVNR_DE_RE.lastIndex = 0;
|
|
470
|
+
let m;
|
|
471
|
+
while ((m = SVNR_DE_RE.exec(t)) !== null) {
|
|
472
|
+
findings.push({ type: "social_id_de", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
if (active.has("siret_fr")) {
|
|
476
|
+
SIRET_FR_RE.lastIndex = 0;
|
|
477
|
+
let m;
|
|
478
|
+
while ((m = SIRET_FR_RE.exec(t)) !== null) {
|
|
479
|
+
findings.push({ type: "siret_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
if (active.has("company_id_fr")) {
|
|
483
|
+
SIREN_FR_RE.lastIndex = 0;
|
|
484
|
+
let m;
|
|
485
|
+
while ((m = SIREN_FR_RE.exec(t)) !== null) {
|
|
486
|
+
findings.push({ type: "company_id_fr", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
if (active.has("social_id_fr")) {
|
|
490
|
+
INSEE_FR_RE.lastIndex = 0;
|
|
491
|
+
let m;
|
|
492
|
+
while ((m = INSEE_FR_RE.exec(t)) !== null) {
|
|
493
|
+
findings.push({ type: "social_id_fr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
if (active.has("national_id_it")) {
|
|
497
|
+
CODICE_FISCALE_IT_RE.lastIndex = 0;
|
|
498
|
+
let m;
|
|
499
|
+
while ((m = CODICE_FISCALE_IT_RE.exec(t)) !== null) {
|
|
500
|
+
findings.push({ type: "national_id_it", value: m[1].toUpperCase(), start: m.index, end: m.index + m[1].length });
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
if (active.has("tax_id_it")) {
|
|
504
|
+
PARTITA_IVA_IT_RE.lastIndex = 0;
|
|
505
|
+
let m;
|
|
506
|
+
while ((m = PARTITA_IVA_IT_RE.exec(t)) !== null) {
|
|
507
|
+
if (validPartitaIvaIt(m[1])) findings.push({ type: "tax_id_it", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
if (active.has("national_id_nl")) {
|
|
511
|
+
BSN_NL_RE.lastIndex = 0;
|
|
512
|
+
let m;
|
|
513
|
+
while ((m = BSN_NL_RE.exec(t)) !== null) {
|
|
514
|
+
if (validBsnNl(m[1])) findings.push({ type: "national_id_nl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
if (active.has("company_id_nl")) {
|
|
518
|
+
KVK_NL_RE.lastIndex = 0;
|
|
519
|
+
let m;
|
|
520
|
+
while ((m = KVK_NL_RE.exec(t)) !== null) {
|
|
521
|
+
findings.push({ type: "company_id_nl", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
if (active.has("national_id_es")) {
|
|
525
|
+
DNI_ES_RE.lastIndex = 0;
|
|
526
|
+
let m;
|
|
527
|
+
while ((m = DNI_ES_RE.exec(t)) !== null) {
|
|
528
|
+
if (validDniEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
529
|
+
}
|
|
530
|
+
NIE_ES_RE.lastIndex = 0;
|
|
531
|
+
while ((m = NIE_ES_RE.exec(t)) !== null) {
|
|
532
|
+
if (validNieEs(m[1])) findings.push({ type: "national_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
if (active.has("tax_id_es")) {
|
|
536
|
+
CIF_ES_RE.lastIndex = 0;
|
|
537
|
+
let m;
|
|
538
|
+
while ((m = CIF_ES_RE.exec(t)) !== null) {
|
|
539
|
+
findings.push({ type: "tax_id_es", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
if (active.has("social_id_uk")) {
|
|
543
|
+
NI_UK_RE.lastIndex = 0;
|
|
544
|
+
let m;
|
|
545
|
+
while ((m = NI_UK_RE.exec(t)) !== null) {
|
|
546
|
+
if (validNiUk(m[1])) findings.push({ type: "social_id_uk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
if (active.has("tax_id_uk")) {
|
|
550
|
+
UTR_UK_RE.lastIndex = 0;
|
|
551
|
+
let m;
|
|
552
|
+
while ((m = UTR_UK_RE.exec(t)) !== null) {
|
|
553
|
+
findings.push({ type: "tax_id_uk", value: m[1], start: m.index + m[0].indexOf(m[1]), end: m.index + m[0].indexOf(m[1]) + m[1].length });
|
|
554
|
+
}
|
|
555
|
+
}
|
|
359
556
|
if (active.has("iban_intl")) {
|
|
360
557
|
IBAN_INTL_RE.lastIndex = 0;
|
|
361
558
|
let m;
|
|
@@ -404,6 +601,15 @@ function isGarbage(ch) {
|
|
|
404
601
|
return ch === REPLACEMENT_CHAR || cp <= 31 || cp >= 127 && cp <= 159 || cp >= 57344 && cp <= 63743 || // private use area
|
|
405
602
|
cp >= 55296 && cp <= 57343;
|
|
406
603
|
}
|
|
604
|
+
// A line is symbol noise when it contains a run of three or more of these characters.
// The regex is deliberately not global, so .test() carries no lastIndex state.
var LINE_NOISE_RE = /[@#!~*=]{3,}/;
/**
 * Computes the fraction of lines that are blank or contain symbol noise.
 *
 * The text is split on "\n". A line is noisy when it is empty after trimming
 * or matches LINE_NOISE_RE. The result is rounded to four decimal places.
 *
 * @param {string} text - Text to inspect; falsy input (empty string, null, undefined) yields 0.
 * @returns {number} Ratio in the range 0..1.
 */
function noiseRatio(text) {
  if (!text) return 0;
  // split() always returns at least one element for a non-empty string,
  // so the divisor below can never be zero.
  const lines = text.split("\n");
  let noisy = 0;
  for (const line of lines) {
    if (line.trim() === "" || LINE_NOISE_RE.test(line)) noisy += 1;
  }
  return Math.round((noisy / lines.length) * 1e4) / 1e4;
}
|
|
407
613
|
function noiseMetrics(text) {
|
|
408
614
|
if (!text) return { garbage_ratio: 0, encoding_ok: true };
|
|
409
615
|
const n = text.length;
|
|
@@ -460,7 +666,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
460
666
|
}
|
|
461
667
|
|
|
462
668
|
// src/index.ts
|
|
463
|
-
var version = "0.
|
|
669
|
+
var version = "0.5.1";
|
|
464
670
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
465
671
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
466
672
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -473,10 +679,11 @@ function computeQualityGrade(score) {
|
|
|
473
679
|
return "D";
|
|
474
680
|
}
|
|
475
681
|
function audit(text, options = {}) {
|
|
476
|
-
const locale = options.locale ?? "
|
|
682
|
+
const locale = options.locale ?? "und";
|
|
477
683
|
const pii = detectPii(text, locale);
|
|
478
684
|
const quality = qualityMetrics(text);
|
|
479
685
|
const noise = noiseMetrics(text);
|
|
686
|
+
const noise_ratio = noiseRatio(text);
|
|
480
687
|
const quality_score = computeQualityScore(
|
|
481
688
|
quality.completeness,
|
|
482
689
|
quality.avg_length,
|
|
@@ -486,7 +693,7 @@ function audit(text, options = {}) {
|
|
|
486
693
|
const counts = /* @__PURE__ */ new Map();
|
|
487
694
|
for (const f of pii) counts.set(f.type, (counts.get(f.type) ?? 0) + 1);
|
|
488
695
|
const pii_summary = Array.from(counts.entries()).sort(([a], [b]) => a.localeCompare(b)).map(([type, count]) => ({ type, count }));
|
|
489
|
-
return { quality_grade, quality_score, pii_summary, pii, quality, noise };
|
|
696
|
+
return { quality_grade, quality_score, pii_summary, pii, quality, noise, noise_ratio, detected_language: locale };
|
|
490
697
|
}
|
|
491
698
|
function auditBatch(texts, options = {}) {
|
|
492
699
|
if (texts.length === 0) {
|
|
@@ -517,6 +724,7 @@ export {
|
|
|
517
724
|
detectPii,
|
|
518
725
|
mask,
|
|
519
726
|
noiseMetrics,
|
|
727
|
+
noiseRatio,
|
|
520
728
|
qualityMetrics,
|
|
521
729
|
version
|
|
522
730
|
};
|
package/package.json
CHANGED
|
@@ -1,49 +1,49 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "@flexorch/audit",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
|
|
5
|
-
"keywords": [
|
|
6
|
-
"pii",
|
|
7
|
-
"privacy",
|
|
8
|
-
"llm",
|
|
9
|
-
"dataset",
|
|
10
|
-
"audit",
|
|
11
|
-
"tckn",
|
|
12
|
-
"kvkk",
|
|
13
|
-
"gdpr"
|
|
14
|
-
],
|
|
15
|
-
"license": "MIT",
|
|
16
|
-
"author": "FlexOrch",
|
|
17
|
-
"homepage": "https://github.com/flexorch/flexorch-audit-js",
|
|
18
|
-
"repository": {
|
|
19
|
-
"type": "git",
|
|
20
|
-
"url": "git+https://github.com/flexorch/flexorch-audit-js.git"
|
|
21
|
-
},
|
|
22
|
-
"bugs": {
|
|
23
|
-
"url": "https://github.com/flexorch/flexorch-audit-js/issues"
|
|
24
|
-
},
|
|
25
|
-
"type": "module",
|
|
26
|
-
"main": "./dist/index.cjs",
|
|
27
|
-
"module": "./dist/index.js",
|
|
28
|
-
"types": "./dist/index.d.ts",
|
|
29
|
-
"exports": {
|
|
30
|
-
".": {
|
|
31
|
-
"types": "./dist/index.d.ts",
|
|
32
|
-
"import": "./dist/index.js",
|
|
33
|
-
"require": "./dist/index.cjs"
|
|
34
|
-
}
|
|
35
|
-
},
|
|
36
|
-
"files": [
|
|
37
|
-
"dist"
|
|
38
|
-
],
|
|
39
|
-
"scripts": {
|
|
40
|
-
"build": "tsup src/index.ts --format cjs,esm --dts --clean",
|
|
41
|
-
"test": "node --test tests/*.test.js",
|
|
42
|
-
"prepublishOnly": "npm run build && npm test"
|
|
43
|
-
},
|
|
44
|
-
"devDependencies": {
|
|
45
|
-
"@types/node": "^25.6.0",
|
|
46
|
-
"tsup": "^8.0.0",
|
|
47
|
-
"typescript": "^5.0.0"
|
|
48
|
-
}
|
|
49
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "@flexorch/audit",
|
|
3
|
+
"version": "0.5.1",
|
|
4
|
+
"description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"pii",
|
|
7
|
+
"privacy",
|
|
8
|
+
"llm",
|
|
9
|
+
"dataset",
|
|
10
|
+
"audit",
|
|
11
|
+
"tckn",
|
|
12
|
+
"kvkk",
|
|
13
|
+
"gdpr"
|
|
14
|
+
],
|
|
15
|
+
"license": "MIT",
|
|
16
|
+
"author": "FlexOrch",
|
|
17
|
+
"homepage": "https://github.com/flexorch/flexorch-audit-js",
|
|
18
|
+
"repository": {
|
|
19
|
+
"type": "git",
|
|
20
|
+
"url": "git+https://github.com/flexorch/flexorch-audit-js.git"
|
|
21
|
+
},
|
|
22
|
+
"bugs": {
|
|
23
|
+
"url": "https://github.com/flexorch/flexorch-audit-js/issues"
|
|
24
|
+
},
|
|
25
|
+
"type": "module",
|
|
26
|
+
"main": "./dist/index.cjs",
|
|
27
|
+
"module": "./dist/index.js",
|
|
28
|
+
"types": "./dist/index.d.ts",
|
|
29
|
+
"exports": {
|
|
30
|
+
".": {
|
|
31
|
+
"types": "./dist/index.d.ts",
|
|
32
|
+
"import": "./dist/index.js",
|
|
33
|
+
"require": "./dist/index.cjs"
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"files": [
|
|
37
|
+
"dist"
|
|
38
|
+
],
|
|
39
|
+
"scripts": {
|
|
40
|
+
"build": "tsup src/index.ts --format cjs,esm --dts --clean",
|
|
41
|
+
"test": "node --test tests/*.test.js",
|
|
42
|
+
"prepublishOnly": "npm run build && npm test"
|
|
43
|
+
},
|
|
44
|
+
"devDependencies": {
|
|
45
|
+
"@types/node": "^25.6.0",
|
|
46
|
+
"tsup": "^8.0.0",
|
|
47
|
+
"typescript": "^5.0.0"
|
|
48
|
+
}
|
|
49
|
+
}
|