@flexorch/audit 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -46
- package/dist/index.cjs +231 -12
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +231 -12
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,13 +1,38 @@
|
|
|
1
1
|
# @flexorch/audit
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[npm version](https://www.npmjs.com/package/@flexorch/audit)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/@flexorch/audit)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
4
6
|
|
|
5
|
-
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
Zero-dependency PII detection, quality grading, and noise audit for LLM datasets — in a single function call.
|
|
8
|
+
|
|
9
|
+
## Why
|
|
10
|
+
|
|
11
|
+
Before feeding documents into an LLM pipeline you need to answer three questions:
|
|
12
|
+
|
|
13
|
+
1. **Does this text contain personal data?** Sending PII to a language model is a compliance risk.
|
|
14
|
+
2. **Is the text quality high enough?** Short, noisy, or duplicate records hurt fine-tuning and RAG retrieval.
|
|
15
|
+
3. **How bad is the noise?** Garbled encodings and control characters degrade model output silently.
|
|
16
|
+
|
|
17
|
+
Most tools that answer these questions require heavy NLP frameworks, model weights, or cloud APIs. `@flexorch/audit` answers all three with one call — using only regex and Node.js built-ins. No model weights, no network calls, no external packages.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Quality grade** — A/B/C/D composite score: is this text LLM-ready at a glance?
|
|
22
|
+
- **PII detection** — email, phone (TR mobile + E.164), credit card (Luhn), IPv4, IPv6, TCKN, VKN, IBAN (mod-97 validated), SSN, label-prefixed names
|
|
23
|
+
- **Batch audit** — `auditBatch()` aggregates duplicate ratio and PII counts across an entire dataset in one call
|
|
24
|
+
- **Noise metrics** — garbage character ratio, encoding health check
|
|
25
|
+
- **Masking** — four strategies: redact, replace (synthetic), token, hash
|
|
10
26
|
- **Zero runtime dependencies** — pure Node.js built-ins, Node 18+
|
|
27
|
+
- **TypeScript-first** — full type definitions, no `@types/` package needed
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npm install @flexorch/audit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick start
|
|
11
36
|
|
|
12
37
|
```ts
|
|
13
38
|
import { audit, mask } from "@flexorch/audit"
|
|
@@ -20,7 +45,6 @@ result.quality_grade // "A"
|
|
|
20
45
|
result.quality_score // 0.91 (0.0–1.0 composite)
|
|
21
46
|
result.pii_summary // [{ type: "national_id_tr", count: 3 }, { type: "email", count: 1 }]
|
|
22
47
|
|
|
23
|
-
// Raw findings and metrics — also available:
|
|
24
48
|
result.pii // [{ type: "email", value: "...", start: 8, end: 23 }]
|
|
25
49
|
result.quality // { completeness: 1.0, avg_length: 342, duplicate_ratio: null }
|
|
26
50
|
result.noise // { garbage_ratio: 0.0, encoding_ok: true }
|
|
@@ -29,21 +53,31 @@ const clean = mask(text, result.pii, { strategy: "redact" })
|
|
|
29
53
|
// "Contact: [REDACTED_EMAIL]"
|
|
30
54
|
```
|
|
31
55
|
|
|
32
|
-
|
|
56
|
+

|
|
33
57
|
|
|
34
|
-
|
|
35
|
-
npm install @flexorch/audit
|
|
36
|
-
```
|
|
58
|
+
## Batch audit
|
|
37
59
|
|
|
38
|
-
|
|
60
|
+
Use `auditBatch()` to audit an entire dataset and get aggregate metrics including `duplicate_ratio`:
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
import { auditBatch } from "@flexorch/audit"
|
|
64
|
+
|
|
65
|
+
const texts = dataset.map((r) => r.text)
|
|
66
|
+
const batch = auditBatch(texts, { locale: "tr" })
|
|
67
|
+
|
|
68
|
+
batch.duplicate_ratio // 0.12 — fraction of exact-duplicate records
|
|
69
|
+
batch.avg_quality_score // 0.78
|
|
70
|
+
batch.pii_summary // [{ type: "email", count: 47 }, ...]
|
|
71
|
+
batch.results // AuditResult[], one per text
|
|
72
|
+
```
|
|
39
73
|
|
|
40
74
|
## Locale support
|
|
41
75
|
|
|
42
76
|
| `locale` | Active detectors |
|
|
43
77
|
|----------|-----------------|
|
|
44
|
-
| `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
|
|
45
|
-
| `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
|
|
46
|
-
| `"eu"` | email, iban, credit_card, ip + E.164 phone |
|
|
78
|
+
| `"tr"` (default) | email, iban, credit_card, ip, ip_v6 + TCKN, VKN, phone_tr, name |
|
|
79
|
+
| `"us"` | email, iban, credit_card, ip, ip_v6 + SSN, E.164 phone |
|
|
80
|
+
| `"eu"` | email, iban, credit_card, ip, ip_v6 + E.164 phone |
|
|
47
81
|
| `"all"` | All of the above (phone_tr takes precedence over generic phone) |
|
|
48
82
|
|
|
49
83
|
## PII types
|
|
@@ -51,11 +85,13 @@ npm install @flexorch/audit
|
|
|
51
85
|
| Type | Description | Locale |
|
|
52
86
|
|------|-------------|--------|
|
|
53
87
|
| `email` | RFC-5321 address | all |
|
|
54
|
-
| `iban` | ISO 13616 IBAN
|
|
88
|
+
| `iban` | ISO 13616 IBAN — mod-97 checksum validated | all |
|
|
55
89
|
| `credit_card` | 16-digit groups, Luhn-validated | all |
|
|
56
90
|
| `ip` | IPv4 address | all |
|
|
91
|
+
| `ip_v6` | IPv6 address (full, compressed, loopback) | all |
|
|
57
92
|
| `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
|
|
58
93
|
| `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
|
|
94
|
+
| `tax_id_tr` | VKN — 10-digit Luhn-variant checksum | tr |
|
|
59
95
|
| `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
|
|
60
96
|
| `phone` | E.164 international phone | us, eu |
|
|
61
97
|
| `ssn` | US Social Security Number (###-##-####) | us |
|
|
@@ -65,53 +101,37 @@ npm install @flexorch/audit
|
|
|
65
101
|
| Strategy | Example output |
|
|
66
102
|
|----------|----------------|
|
|
67
103
|
| `redact` (default) | `[REDACTED_EMAIL]` |
|
|
68
|
-
| `replace` | `user@example.com` (
|
|
69
|
-
| `token` | `<PII_EMAIL_1>` (unique per type) |
|
|
104
|
+
| `replace` | `user@example.com` (static synthetic) |
|
|
105
|
+
| `token` | `<PII_EMAIL_1>` (unique per type per call) |
|
|
70
106
|
| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
|
|
71
107
|
|
|
72
108
|
## TypeScript
|
|
73
109
|
|
|
74
|
-
Full type definitions included. No `@types/` package needed.
|
|
75
|
-
|
|
76
110
|
```ts
|
|
77
|
-
import {
|
|
111
|
+
import {
|
|
112
|
+
audit, auditBatch, mask,
|
|
113
|
+
type AuditResult, type BatchAuditResult, type PiiFinding,
|
|
114
|
+
} from "@flexorch/audit"
|
|
78
115
|
```
|
|
79
116
|
|
|
80
117
|
## Quality grade
|
|
81
118
|
|
|
82
|
-
|
|
119
|
+
`quality_grade` (A–D) and `quality_score` (0.0–1.0) are composite signals:
|
|
83
120
|
|
|
84
|
-
| Grade | Score |
|
|
85
|
-
|
|
121
|
+
| Grade | Score | Signal |
|
|
122
|
+
|-------|-------|--------|
|
|
86
123
|
| A | ≥ 0.85 | Ready for LLM training or RAG |
|
|
87
124
|
| B | ≥ 0.65 | Usable with minor cleanup |
|
|
88
|
-
| C | ≥ 0.40 |
|
|
125
|
+
| C | ≥ 0.40 | Review before use |
|
|
89
126
|
| D | < 0.40 | Not suitable — empty, too short, or high noise |
|
|
90
127
|
|
|
91
|
-
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
92
|
-
|
|
128
|
+
Score formula: `completeness × (0.4 × noiseScore + 0.4 × lengthScore + 0.2)`
|
|
129
|
+
`lengthScore = Math.min(charCount / 500, 1.0)` · `noiseScore = Math.max(0, 1 − garbageRatio × 10)`
|
|
93
130
|
|
|
94
|
-
##
|
|
95
|
-
|
|
96
|
-
`duplicate_ratio` is `null` for single-string input. Compute it across your dataset:
|
|
97
|
-
|
|
98
|
-
```ts
|
|
99
|
-
const texts = dataset.map((r) => r.text)
|
|
100
|
-
const seen = new Set<string>()
|
|
101
|
-
let duplicates = 0
|
|
102
|
-
for (const t of texts) {
|
|
103
|
-
if (seen.has(t)) duplicates++
|
|
104
|
-
else seen.add(t)
|
|
105
|
-
}
|
|
106
|
-
const duplicateRatio = duplicates / texts.length
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
## Limitations (v0.2)
|
|
131
|
+
## Limitations (v0.4)
|
|
110
132
|
|
|
111
133
|
- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
|
|
112
|
-
- `
|
|
113
|
-
- IPv6 not detected.
|
|
114
|
-
- IBAN format-only check; mod-97 validation not performed.
|
|
134
|
+
- `replace` masking strategy uses static synthetic values; locale-aware realistic synthesis is not yet implemented.
|
|
115
135
|
|
|
116
136
|
## Also available for Python
|
|
117
137
|
|
|
@@ -119,6 +139,10 @@ const duplicateRatio = duplicates / texts.length
|
|
|
119
139
|
pip install flexorch-audit
|
|
120
140
|
```
|
|
121
141
|
|
|
142
|
+
## Contributing
|
|
143
|
+
|
|
144
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
145
|
+
|
|
122
146
|
## License
|
|
123
147
|
|
|
124
148
|
MIT
|
package/dist/index.cjs
CHANGED
|
@@ -33,8 +33,9 @@ module.exports = __toCommonJS(index_exports);
|
|
|
33
33
|
|
|
34
34
|
// src/pii.ts
|
|
35
35
|
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
|
|
36
|
-
var PHONE_INTL_RE =
|
|
36
|
+
var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
|
|
37
37
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
38
|
+
var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
|
|
38
39
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
39
40
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
40
41
|
var _H = "[0-9a-fA-F]{1,4}";
|
|
@@ -45,6 +46,102 @@ var IPV6_RE = new RegExp(
|
|
|
45
46
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
46
47
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
47
48
|
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
49
|
+
var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
|
|
50
|
+
var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
|
|
51
|
+
var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
|
|
52
|
+
var COMPANY_NAME_TR_RE = new RegExp(
|
|
53
|
+
`(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
|
|
54
|
+
"gu"
|
|
55
|
+
);
|
|
56
|
+
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
57
|
+
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
58
|
+
var _TR_PROVINCES_SORTED = [
|
|
59
|
+
"Afyonkarahisar",
|
|
60
|
+
"Kahramanmara\u015F",
|
|
61
|
+
"K\u0131r\u0131kkale",
|
|
62
|
+
"K\u0131rklareli",
|
|
63
|
+
"Diyarbak\u0131r",
|
|
64
|
+
"Gaziantep",
|
|
65
|
+
"\u015Eanl\u0131urfa",
|
|
66
|
+
"Nev\u015Fehir",
|
|
67
|
+
"Kastamonu",
|
|
68
|
+
"G\xFCm\xFC\u015Fhane",
|
|
69
|
+
"Eski\u015Fehir",
|
|
70
|
+
"Erzincan",
|
|
71
|
+
"Erzurum",
|
|
72
|
+
"Denizli",
|
|
73
|
+
"\xC7anakkale",
|
|
74
|
+
"Ad\u0131yaman",
|
|
75
|
+
"Zonguldak",
|
|
76
|
+
"Tekirda\u011F",
|
|
77
|
+
"Trabzon",
|
|
78
|
+
"Tunceli",
|
|
79
|
+
"Karaman",
|
|
80
|
+
"Karab\xFCk",
|
|
81
|
+
"Aksaray",
|
|
82
|
+
"Antalya",
|
|
83
|
+
"K\u0131r\u015Fehir",
|
|
84
|
+
"Osmaniye",
|
|
85
|
+
"Kocaeli",
|
|
86
|
+
"Sakarya",
|
|
87
|
+
"Bart\u0131n",
|
|
88
|
+
"Bayburt",
|
|
89
|
+
"Ardahan",
|
|
90
|
+
"Yozgat",
|
|
91
|
+
"Ankara",
|
|
92
|
+
"Amasya",
|
|
93
|
+
"Artvin",
|
|
94
|
+
"Bal\u0131kesir",
|
|
95
|
+
"Bilecik",
|
|
96
|
+
"Bing\xF6l",
|
|
97
|
+
"Bitlis",
|
|
98
|
+
"Burdur",
|
|
99
|
+
"\xC7ank\u0131r\u0131",
|
|
100
|
+
"Edirne",
|
|
101
|
+
"Elaz\u0131\u011F",
|
|
102
|
+
"Giresun",
|
|
103
|
+
"Hakkari",
|
|
104
|
+
"Isparta",
|
|
105
|
+
"\u0130stanbul",
|
|
106
|
+
"\u0130zmir",
|
|
107
|
+
"Kayseri",
|
|
108
|
+
"K\xFCtahya",
|
|
109
|
+
"Malatya",
|
|
110
|
+
"Manisa",
|
|
111
|
+
"Mardin",
|
|
112
|
+
"Samsun",
|
|
113
|
+
"\u015E\u0131rnak",
|
|
114
|
+
"Sinop",
|
|
115
|
+
"Tokat",
|
|
116
|
+
"Hatay",
|
|
117
|
+
"Konya",
|
|
118
|
+
"Mu\u011Fla",
|
|
119
|
+
"Ni\u011Fde",
|
|
120
|
+
"Rize",
|
|
121
|
+
"Siirt",
|
|
122
|
+
"Sivas",
|
|
123
|
+
"Adana",
|
|
124
|
+
"Ayd\u0131n",
|
|
125
|
+
"Bursa",
|
|
126
|
+
"\xC7orum",
|
|
127
|
+
"I\u011Fd\u0131r",
|
|
128
|
+
"Kilis",
|
|
129
|
+
"Mersin",
|
|
130
|
+
"Batman",
|
|
131
|
+
"Yalova",
|
|
132
|
+
"D\xFCzce",
|
|
133
|
+
"Ordu",
|
|
134
|
+
"Kars",
|
|
135
|
+
"A\u011Fr\u0131",
|
|
136
|
+
"Bolu",
|
|
137
|
+
"Van",
|
|
138
|
+
"U\u015Fak",
|
|
139
|
+
"Mu\u015F"
|
|
140
|
+
].sort((a, b) => b.length - a.length);
|
|
141
|
+
var PROVINCE_TR_RE = new RegExp(
|
|
142
|
+
`(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
|
|
143
|
+
"gu"
|
|
144
|
+
);
|
|
48
145
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
49
146
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
50
147
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -52,6 +149,46 @@ var NAME_RE = new RegExp(
|
|
|
52
149
|
`(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
|
|
53
150
|
"gu"
|
|
54
151
|
);
|
|
152
|
+
var _IBAN_INTL_LENGTHS = {
|
|
153
|
+
AT: 20,
|
|
154
|
+
BE: 16,
|
|
155
|
+
BG: 22,
|
|
156
|
+
HR: 21,
|
|
157
|
+
CY: 28,
|
|
158
|
+
CZ: 24,
|
|
159
|
+
DK: 18,
|
|
160
|
+
EE: 20,
|
|
161
|
+
FI: 18,
|
|
162
|
+
FR: 27,
|
|
163
|
+
DE: 22,
|
|
164
|
+
GR: 27,
|
|
165
|
+
HU: 28,
|
|
166
|
+
IE: 22,
|
|
167
|
+
IT: 27,
|
|
168
|
+
LV: 21,
|
|
169
|
+
LT: 20,
|
|
170
|
+
LU: 20,
|
|
171
|
+
MT: 31,
|
|
172
|
+
NL: 18,
|
|
173
|
+
PL: 28,
|
|
174
|
+
PT: 25,
|
|
175
|
+
RO: 24,
|
|
176
|
+
SK: 24,
|
|
177
|
+
SI: 19,
|
|
178
|
+
ES: 24,
|
|
179
|
+
SE: 24,
|
|
180
|
+
GB: 22,
|
|
181
|
+
CH: 21,
|
|
182
|
+
NO: 15
|
|
183
|
+
};
|
|
184
|
+
var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
|
|
185
|
+
var _UC = "[A-Z\xC0-\u024F]";
|
|
186
|
+
var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
|
|
187
|
+
var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
|
|
188
|
+
var COMPANY_NAME_INTL_RE = new RegExp(
|
|
189
|
+
`(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
|
|
190
|
+
"gu"
|
|
191
|
+
);
|
|
55
192
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
56
193
|
function validTckn(s) {
|
|
57
194
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
@@ -103,10 +240,30 @@ function validIban(s) {
|
|
|
103
240
|
}
|
|
104
241
|
return remainder === 1;
|
|
105
242
|
}
|
|
243
|
+
function validIbanIntl(s) {
|
|
244
|
+
const country = s.slice(0, 2);
|
|
245
|
+
if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
|
|
246
|
+
if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
|
|
247
|
+
return validIban(s);
|
|
248
|
+
}
|
|
249
|
+
function validPhoneIntl(raw) {
|
|
250
|
+
const digits = raw.replace(/\D/g, "");
|
|
251
|
+
return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
|
|
252
|
+
}
|
|
106
253
|
var LOCALE_DETECTORS = {
|
|
107
|
-
tr: /* @__PURE__ */ new Set([
|
|
108
|
-
|
|
109
|
-
|
|
254
|
+
tr: /* @__PURE__ */ new Set([
|
|
255
|
+
"national_id_tr",
|
|
256
|
+
"tax_id_tr",
|
|
257
|
+
"phone_tr",
|
|
258
|
+
"name",
|
|
259
|
+
"iban_tr",
|
|
260
|
+
"company_name_tr",
|
|
261
|
+
"mersis_no",
|
|
262
|
+
"postal_code_tr",
|
|
263
|
+
"province_tr"
|
|
264
|
+
]),
|
|
265
|
+
us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
|
|
266
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
|
|
110
267
|
};
|
|
111
268
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
112
269
|
function activeDetectors(locale) {
|
|
@@ -115,7 +272,6 @@ function activeDetectors(locale) {
|
|
|
115
272
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
116
273
|
detectors.forEach((d) => active2.add(d));
|
|
117
274
|
}
|
|
118
|
-
if (active2.has("phone_tr")) active2.delete("phone");
|
|
119
275
|
return active2;
|
|
120
276
|
}
|
|
121
277
|
const active = new Set(UNIVERSAL);
|
|
@@ -134,15 +290,15 @@ function findAll(re, text, type) {
|
|
|
134
290
|
function detectPii(text, locale = "tr") {
|
|
135
291
|
const active = activeDetectors(locale);
|
|
136
292
|
const t = text ?? "";
|
|
137
|
-
|
|
293
|
+
let findings = [];
|
|
138
294
|
if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
|
|
139
|
-
if (active.has("
|
|
295
|
+
if (active.has("phone_intl")) {
|
|
140
296
|
PHONE_INTL_RE.lastIndex = 0;
|
|
141
297
|
let m;
|
|
142
298
|
while ((m = PHONE_INTL_RE.exec(t)) !== null) {
|
|
143
|
-
const
|
|
144
|
-
if (
|
|
145
|
-
findings.push({ type: "
|
|
299
|
+
const candidate = m[1];
|
|
300
|
+
if (validPhoneIntl(candidate)) {
|
|
301
|
+
findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
146
302
|
}
|
|
147
303
|
}
|
|
148
304
|
}
|
|
@@ -195,8 +351,71 @@ function detectPii(text, locale = "tr") {
|
|
|
195
351
|
findings.push({ type: "name", value, start, end: start + value.length });
|
|
196
352
|
}
|
|
197
353
|
}
|
|
354
|
+
if (active.has("iban_tr")) {
|
|
355
|
+
IBAN_TR_RE.lastIndex = 0;
|
|
356
|
+
let m;
|
|
357
|
+
while ((m = IBAN_TR_RE.exec(t)) !== null) {
|
|
358
|
+
if (validIban(m[0])) {
|
|
359
|
+
findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
if (active.has("company_name_tr")) {
|
|
364
|
+
COMPANY_NAME_TR_RE.lastIndex = 0;
|
|
365
|
+
let m;
|
|
366
|
+
while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
|
|
367
|
+
findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
if (active.has("mersis_no")) {
|
|
371
|
+
MERSIS_RE.lastIndex = 0;
|
|
372
|
+
let m;
|
|
373
|
+
while ((m = MERSIS_RE.exec(t)) !== null) {
|
|
374
|
+
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
if (active.has("postal_code_tr")) {
|
|
378
|
+
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
379
|
+
let m;
|
|
380
|
+
while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
|
|
381
|
+
findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
if (active.has("province_tr")) {
|
|
385
|
+
PROVINCE_TR_RE.lastIndex = 0;
|
|
386
|
+
let m;
|
|
387
|
+
while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
|
|
388
|
+
findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
389
|
+
}
|
|
390
|
+
}
|
|
198
391
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
199
|
-
|
|
392
|
+
if (active.has("iban_intl")) {
|
|
393
|
+
IBAN_INTL_RE.lastIndex = 0;
|
|
394
|
+
let m;
|
|
395
|
+
while ((m = IBAN_INTL_RE.exec(t)) !== null) {
|
|
396
|
+
const candidate = m[1];
|
|
397
|
+
if (validIbanIntl(candidate)) {
|
|
398
|
+
findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
if (active.has("company_name_intl")) {
|
|
403
|
+
COMPANY_NAME_INTL_RE.lastIndex = 0;
|
|
404
|
+
let m;
|
|
405
|
+
while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
|
|
406
|
+
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
findings.sort((a, b) => a.start - b.start);
|
|
410
|
+
const specificIbanSpans = new Set(
|
|
411
|
+
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
412
|
+
);
|
|
413
|
+
if (specificIbanSpans.size > 0) {
|
|
414
|
+
findings = findings.filter(
|
|
415
|
+
(f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
|
|
416
|
+
);
|
|
417
|
+
}
|
|
418
|
+
return findings;
|
|
200
419
|
}
|
|
201
420
|
|
|
202
421
|
// src/quality.ts
|
|
@@ -274,7 +493,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
274
493
|
}
|
|
275
494
|
|
|
276
495
|
// src/index.ts
|
|
277
|
-
var version = "0.3.
|
|
496
|
+
var version = "0.3.1";
|
|
278
497
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
279
498
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
280
499
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
package/dist/index.d.cts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.3.
|
|
48
|
+
declare const version = "0.3.1";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
package/dist/index.d.ts
CHANGED
|
@@ -45,7 +45,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
45
45
|
* // "Contact: [REDACTED_EMAIL]"
|
|
46
46
|
*/
|
|
47
47
|
|
|
48
|
-
declare const version = "0.3.
|
|
48
|
+
declare const version = "0.3.1";
|
|
49
49
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
50
50
|
interface PiiSummaryEntry {
|
|
51
51
|
type: string;
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
// src/pii.ts
|
|
2
2
|
var EMAIL_RE = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
|
|
3
|
-
var PHONE_INTL_RE =
|
|
3
|
+
var PHONE_INTL_RE = /(?<![+\d])(\+[1-9][\d\s\-.()]{5,18}\d)(?!\d)/g;
|
|
4
4
|
var IBAN_RE = /\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b/g;
|
|
5
|
+
var IBAN_INTL_RE = /\b([A-Z]{2}\d{2}[0-9A-Z]{11,30})\b/g;
|
|
5
6
|
var CC_RE = /\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b/g;
|
|
6
7
|
var IPV4_RE = /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g;
|
|
7
8
|
var _H = "[0-9a-fA-F]{1,4}";
|
|
@@ -12,6 +13,102 @@ var IPV6_RE = new RegExp(
|
|
|
12
13
|
var PHONE_TR_RE = /\b(?:\+90|0)?\s*5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b/g;
|
|
13
14
|
var TCKN_RE = /\b([1-9]\d{10})\b/g;
|
|
14
15
|
var VKN_RE = /\b([1-9]\d{9})\b/g;
|
|
16
|
+
var IBAN_TR_RE = /\bTR\d{2}[0-9A-Z]{22}\b/g;
|
|
17
|
+
var _TR_COMPANY_SUFFIX = "(?:A\\.\u015E\\.|Ltd\\.\\s*\u015Eti\\.|Koll\\.\\s*\u015Eti\\.|Koop\\.|T\\.A\\.\u015E\\.)";
|
|
18
|
+
var _TR_NAME_TOKEN = "(?:ve|ile|[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*\\.?)";
|
|
19
|
+
var COMPANY_NAME_TR_RE = new RegExp(
|
|
20
|
+
`(?<![A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC])([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][A-Za-z\xC7\u011E\u0130\xD6\u015E\xDC\xE7\u011F\u0131\u015F\xF6\u015F\xFC]*(?:\\s+${_TR_NAME_TOKEN}){0,6}\\s+${_TR_COMPANY_SUFFIX})`,
|
|
21
|
+
"gu"
|
|
22
|
+
);
|
|
23
|
+
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
24
|
+
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
25
|
+
var _TR_PROVINCES_SORTED = [
|
|
26
|
+
"Afyonkarahisar",
|
|
27
|
+
"Kahramanmara\u015F",
|
|
28
|
+
"K\u0131r\u0131kkale",
|
|
29
|
+
"K\u0131rklareli",
|
|
30
|
+
"Diyarbak\u0131r",
|
|
31
|
+
"Gaziantep",
|
|
32
|
+
"\u015Eanl\u0131urfa",
|
|
33
|
+
"Nev\u015Fehir",
|
|
34
|
+
"Kastamonu",
|
|
35
|
+
"G\xFCm\xFC\u015Fhane",
|
|
36
|
+
"Eski\u015Fehir",
|
|
37
|
+
"Erzincan",
|
|
38
|
+
"Erzurum",
|
|
39
|
+
"Denizli",
|
|
40
|
+
"\xC7anakkale",
|
|
41
|
+
"Ad\u0131yaman",
|
|
42
|
+
"Zonguldak",
|
|
43
|
+
"Tekirda\u011F",
|
|
44
|
+
"Trabzon",
|
|
45
|
+
"Tunceli",
|
|
46
|
+
"Karaman",
|
|
47
|
+
"Karab\xFCk",
|
|
48
|
+
"Aksaray",
|
|
49
|
+
"Antalya",
|
|
50
|
+
"K\u0131r\u015Fehir",
|
|
51
|
+
"Osmaniye",
|
|
52
|
+
"Kocaeli",
|
|
53
|
+
"Sakarya",
|
|
54
|
+
"Bart\u0131n",
|
|
55
|
+
"Bayburt",
|
|
56
|
+
"Ardahan",
|
|
57
|
+
"Yozgat",
|
|
58
|
+
"Ankara",
|
|
59
|
+
"Amasya",
|
|
60
|
+
"Artvin",
|
|
61
|
+
"Bal\u0131kesir",
|
|
62
|
+
"Bilecik",
|
|
63
|
+
"Bing\xF6l",
|
|
64
|
+
"Bitlis",
|
|
65
|
+
"Burdur",
|
|
66
|
+
"\xC7ank\u0131r\u0131",
|
|
67
|
+
"Edirne",
|
|
68
|
+
"Elaz\u0131\u011F",
|
|
69
|
+
"Giresun",
|
|
70
|
+
"Hakkari",
|
|
71
|
+
"Isparta",
|
|
72
|
+
"\u0130stanbul",
|
|
73
|
+
"\u0130zmir",
|
|
74
|
+
"Kayseri",
|
|
75
|
+
"K\xFCtahya",
|
|
76
|
+
"Malatya",
|
|
77
|
+
"Manisa",
|
|
78
|
+
"Mardin",
|
|
79
|
+
"Samsun",
|
|
80
|
+
"\u015E\u0131rnak",
|
|
81
|
+
"Sinop",
|
|
82
|
+
"Tokat",
|
|
83
|
+
"Hatay",
|
|
84
|
+
"Konya",
|
|
85
|
+
"Mu\u011Fla",
|
|
86
|
+
"Ni\u011Fde",
|
|
87
|
+
"Rize",
|
|
88
|
+
"Siirt",
|
|
89
|
+
"Sivas",
|
|
90
|
+
"Adana",
|
|
91
|
+
"Ayd\u0131n",
|
|
92
|
+
"Bursa",
|
|
93
|
+
"\xC7orum",
|
|
94
|
+
"I\u011Fd\u0131r",
|
|
95
|
+
"Kilis",
|
|
96
|
+
"Mersin",
|
|
97
|
+
"Batman",
|
|
98
|
+
"Yalova",
|
|
99
|
+
"D\xFCzce",
|
|
100
|
+
"Ordu",
|
|
101
|
+
"Kars",
|
|
102
|
+
"A\u011Fr\u0131",
|
|
103
|
+
"Bolu",
|
|
104
|
+
"Van",
|
|
105
|
+
"U\u015Fak",
|
|
106
|
+
"Mu\u015F"
|
|
107
|
+
].sort((a, b) => b.length - a.length);
|
|
108
|
+
var PROVINCE_TR_RE = new RegExp(
|
|
109
|
+
`(?<!\\w)(${_TR_PROVINCES_SORTED.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")})(?!\\w)`,
|
|
110
|
+
"gu"
|
|
111
|
+
);
|
|
15
112
|
var NAME_PREFIX_TR = "(?:Ad[\u0131i]\\s*(?:Soyad[\u0131i])?|Soyad[\u0131i]|\u0130sim|M\xFC\u015Fteri\\s+Ad[\u0131i]|Yetkili(?:\\s+Ki\u015Fi)?|\xC7al\u0131\u015Fan\\s+Ad[\u0131i]|Personel\\s+Ad[\u0131i]|Ki\u015Fi\\s+Ad[\u0131i]|Sat\u0131c\u0131\\s+Ad[\u0131i]|Al\u0131c\u0131\\s+Ad[\u0131i]|\u0130lgili\\s+Ki\u015Fi|Hesap\\s+Sahibi)";
|
|
16
113
|
var NAME_PREFIX_EN = "(?:Full\\s+Name|Customer\\s+Name|Employee\\s+Name|Contact\\s+Name|Authorized\\s+(?:By|Person)|Account\\s+Holder|(?<!\\bUser\\s)Name)";
|
|
17
114
|
var NAME_VALUE = "([A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+(?:\\s+[A-Z\xC7\u011E\u0130\xD6\u015E\xDC][a-z\xE7\u011F\u0131\u015F\xF6\u015F\xFC]+){0,2})";
|
|
@@ -19,6 +116,46 @@ var NAME_RE = new RegExp(
|
|
|
19
116
|
`(?:${NAME_PREFIX_TR}|${NAME_PREFIX_EN})\\s*[:\\-]\\s*${NAME_VALUE}`,
|
|
20
117
|
"gu"
|
|
21
118
|
);
|
|
119
|
+
var _IBAN_INTL_LENGTHS = {
|
|
120
|
+
AT: 20,
|
|
121
|
+
BE: 16,
|
|
122
|
+
BG: 22,
|
|
123
|
+
HR: 21,
|
|
124
|
+
CY: 28,
|
|
125
|
+
CZ: 24,
|
|
126
|
+
DK: 18,
|
|
127
|
+
EE: 20,
|
|
128
|
+
FI: 18,
|
|
129
|
+
FR: 27,
|
|
130
|
+
DE: 22,
|
|
131
|
+
GR: 27,
|
|
132
|
+
HU: 28,
|
|
133
|
+
IE: 22,
|
|
134
|
+
IT: 27,
|
|
135
|
+
LV: 21,
|
|
136
|
+
LT: 20,
|
|
137
|
+
LU: 20,
|
|
138
|
+
MT: 31,
|
|
139
|
+
NL: 18,
|
|
140
|
+
PL: 28,
|
|
141
|
+
PT: 25,
|
|
142
|
+
RO: 24,
|
|
143
|
+
SK: 24,
|
|
144
|
+
SI: 19,
|
|
145
|
+
ES: 24,
|
|
146
|
+
SE: 24,
|
|
147
|
+
GB: 22,
|
|
148
|
+
CH: 21,
|
|
149
|
+
NO: 15
|
|
150
|
+
};
|
|
151
|
+
var _INTL_SUFFIX = "(?:KGaA|GmbH|OHG|GbR|SARL|EURL|S\\.p\\.A\\.|S\\.r\\.l\\.|S\\.n\\.c\\.|S\\.a\\.s\\.|B\\.V\\.|N\\.V\\.|S\\.A\\.|S\\.L\\.|Corp\\.|Inc\\.|Ltd\\.|LLP|LLC|PLC|SpA|Srl|SNC|SAS|BV|NV|SL|SA|Corp|Inc|Ltd|KG|AG|UG)";
|
|
152
|
+
var _UC = "[A-Z\xC0-\u024F]";
|
|
153
|
+
var _WC = "[A-Za-z0-9\xC0-\u024F\\-]";
|
|
154
|
+
var _INTL_NAME_TOKEN = `(?:and|&|${_UC}${_WC}*\\.?)`;
|
|
155
|
+
var COMPANY_NAME_INTL_RE = new RegExp(
|
|
156
|
+
`(?<![A-Za-z\xC0-\u024F])(${_UC}${_WC}*(?:\\s+${_INTL_NAME_TOKEN}){0,6}\\s+${_INTL_SUFFIX})`,
|
|
157
|
+
"gu"
|
|
158
|
+
);
|
|
22
159
|
var SSN_RE = /\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b/g;
|
|
23
160
|
function validTckn(s) {
|
|
24
161
|
if (s.length !== 11 || s[0] === "0") return false;
|
|
@@ -70,10 +207,30 @@ function validIban(s) {
|
|
|
70
207
|
}
|
|
71
208
|
return remainder === 1;
|
|
72
209
|
}
|
|
210
|
+
function validIbanIntl(s) {
|
|
211
|
+
const country = s.slice(0, 2);
|
|
212
|
+
if (country === "TR" || !(country in _IBAN_INTL_LENGTHS)) return false;
|
|
213
|
+
if (s.length !== _IBAN_INTL_LENGTHS[country]) return false;
|
|
214
|
+
return validIban(s);
|
|
215
|
+
}
|
|
216
|
+
function validPhoneIntl(raw) {
|
|
217
|
+
const digits = raw.replace(/\D/g, "");
|
|
218
|
+
return digits.length >= 7 && digits.length <= 15 && digits.slice(0, 2) !== "90";
|
|
219
|
+
}
|
|
73
220
|
var LOCALE_DETECTORS = {
|
|
74
|
-
tr: /* @__PURE__ */ new Set([
|
|
75
|
-
|
|
76
|
-
|
|
221
|
+
tr: /* @__PURE__ */ new Set([
|
|
222
|
+
"national_id_tr",
|
|
223
|
+
"tax_id_tr",
|
|
224
|
+
"phone_tr",
|
|
225
|
+
"name",
|
|
226
|
+
"iban_tr",
|
|
227
|
+
"company_name_tr",
|
|
228
|
+
"mersis_no",
|
|
229
|
+
"postal_code_tr",
|
|
230
|
+
"province_tr"
|
|
231
|
+
]),
|
|
232
|
+
us: /* @__PURE__ */ new Set(["ssn", "phone_intl", "company_name_intl"]),
|
|
233
|
+
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"])
|
|
77
234
|
};
|
|
78
235
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
79
236
|
function activeDetectors(locale) {
|
|
@@ -82,7 +239,6 @@ function activeDetectors(locale) {
|
|
|
82
239
|
for (const detectors of Object.values(LOCALE_DETECTORS)) {
|
|
83
240
|
detectors.forEach((d) => active2.add(d));
|
|
84
241
|
}
|
|
85
|
-
if (active2.has("phone_tr")) active2.delete("phone");
|
|
86
242
|
return active2;
|
|
87
243
|
}
|
|
88
244
|
const active = new Set(UNIVERSAL);
|
|
@@ -101,15 +257,15 @@ function findAll(re, text, type) {
|
|
|
101
257
|
function detectPii(text, locale = "tr") {
|
|
102
258
|
const active = activeDetectors(locale);
|
|
103
259
|
const t = text ?? "";
|
|
104
|
-
|
|
260
|
+
let findings = [];
|
|
105
261
|
if (active.has("email")) findings.push(...findAll(EMAIL_RE, t, "email"));
|
|
106
|
-
if (active.has("
|
|
262
|
+
if (active.has("phone_intl")) {
|
|
107
263
|
PHONE_INTL_RE.lastIndex = 0;
|
|
108
264
|
let m;
|
|
109
265
|
while ((m = PHONE_INTL_RE.exec(t)) !== null) {
|
|
110
|
-
const
|
|
111
|
-
if (
|
|
112
|
-
findings.push({ type: "
|
|
266
|
+
const candidate = m[1];
|
|
267
|
+
if (validPhoneIntl(candidate)) {
|
|
268
|
+
findings.push({ type: "phone_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
113
269
|
}
|
|
114
270
|
}
|
|
115
271
|
}
|
|
@@ -162,8 +318,71 @@ function detectPii(text, locale = "tr") {
|
|
|
162
318
|
findings.push({ type: "name", value, start, end: start + value.length });
|
|
163
319
|
}
|
|
164
320
|
}
|
|
321
|
+
if (active.has("iban_tr")) {
|
|
322
|
+
IBAN_TR_RE.lastIndex = 0;
|
|
323
|
+
let m;
|
|
324
|
+
while ((m = IBAN_TR_RE.exec(t)) !== null) {
|
|
325
|
+
if (validIban(m[0])) {
|
|
326
|
+
findings.push({ type: "iban_tr", value: m[0], start: m.index, end: m.index + m[0].length });
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
if (active.has("company_name_tr")) {
|
|
331
|
+
COMPANY_NAME_TR_RE.lastIndex = 0;
|
|
332
|
+
let m;
|
|
333
|
+
while ((m = COMPANY_NAME_TR_RE.exec(t)) !== null) {
|
|
334
|
+
findings.push({ type: "company_name_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
if (active.has("mersis_no")) {
|
|
338
|
+
MERSIS_RE.lastIndex = 0;
|
|
339
|
+
let m;
|
|
340
|
+
while ((m = MERSIS_RE.exec(t)) !== null) {
|
|
341
|
+
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
if (active.has("postal_code_tr")) {
|
|
345
|
+
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
346
|
+
let m;
|
|
347
|
+
while ((m = POSTAL_CODE_TR_RE.exec(t)) !== null) {
|
|
348
|
+
findings.push({ type: "postal_code_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
if (active.has("province_tr")) {
|
|
352
|
+
PROVINCE_TR_RE.lastIndex = 0;
|
|
353
|
+
let m;
|
|
354
|
+
while ((m = PROVINCE_TR_RE.exec(t)) !== null) {
|
|
355
|
+
findings.push({ type: "province_tr", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
356
|
+
}
|
|
357
|
+
}
|
|
165
358
|
if (active.has("ssn")) findings.push(...findAll(SSN_RE, t, "ssn"));
|
|
166
|
-
|
|
359
|
+
if (active.has("iban_intl")) {
|
|
360
|
+
IBAN_INTL_RE.lastIndex = 0;
|
|
361
|
+
let m;
|
|
362
|
+
while ((m = IBAN_INTL_RE.exec(t)) !== null) {
|
|
363
|
+
const candidate = m[1];
|
|
364
|
+
if (validIbanIntl(candidate)) {
|
|
365
|
+
findings.push({ type: "iban_intl", value: candidate, start: m.index, end: m.index + candidate.length });
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
if (active.has("company_name_intl")) {
|
|
370
|
+
COMPANY_NAME_INTL_RE.lastIndex = 0;
|
|
371
|
+
let m;
|
|
372
|
+
while ((m = COMPANY_NAME_INTL_RE.exec(t)) !== null) {
|
|
373
|
+
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
findings.sort((a, b) => a.start - b.start);
|
|
377
|
+
const specificIbanSpans = new Set(
|
|
378
|
+
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
379
|
+
);
|
|
380
|
+
if (specificIbanSpans.size > 0) {
|
|
381
|
+
findings = findings.filter(
|
|
382
|
+
(f) => !(f.type === "iban" && specificIbanSpans.has(`${f.start}:${f.end}`))
|
|
383
|
+
);
|
|
384
|
+
}
|
|
385
|
+
return findings;
|
|
167
386
|
}
|
|
168
387
|
|
|
169
388
|
// src/quality.ts
|
|
@@ -241,7 +460,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
241
460
|
}
|
|
242
461
|
|
|
243
462
|
// src/index.ts
|
|
244
|
-
var version = "0.3.
|
|
463
|
+
var version = "0.3.1";
|
|
245
464
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
246
465
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
247
466
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|