haechi 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +46 -11
- package/README.md +46 -11
- package/SECURITY.md +7 -1
- package/docs/README.md +2 -0
- package/docs/current/compliance-mapping.ko.md +53 -0
- package/docs/current/compliance-mapping.md +53 -0
- package/docs/current/config-version.ko.md +30 -0
- package/docs/current/config-version.md +51 -0
- package/docs/current/configuration.ko.md +165 -9
- package/docs/current/configuration.md +165 -9
- package/docs/current/operations-runbook.ko.md +155 -0
- package/docs/current/operations-runbook.md +241 -0
- package/docs/current/release-process.ko.md +5 -1
- package/docs/current/release-process.md +5 -1
- package/docs/current/risk-register-release-gate.ko.md +5 -3
- package/docs/current/risk-register-release-gate.md +13 -3
- package/docs/current/security-whitepaper.ko.md +102 -0
- package/docs/current/security-whitepaper.md +102 -0
- package/docs/current/shared-responsibility.ko.md +2 -2
- package/docs/current/shared-responsibility.md +2 -2
- package/docs/current/threat-model.ko.md +4 -2
- package/docs/current/threat-model.md +4 -2
- package/examples/local-proxy-demo/README.md +51 -0
- package/examples/local-proxy-demo/demo.mjs +144 -0
- package/examples/local-proxy-demo/demo.tape +19 -0
- package/examples/local-proxy-demo/live-demo.mjs +121 -0
- package/examples/local-proxy-demo/live-demo.tape +25 -0
- package/haechi.config.example.json +20 -3
- package/package.json +7 -2
- package/packages/audit/index.mjs +26 -2
- package/packages/cli/bin/haechi.mjs +57 -10
- package/packages/cli/runtime.mjs +402 -10
- package/packages/core/index.mjs +143 -8
- package/packages/filter/index.mjs +975 -12
- package/packages/metrics/index.mjs +181 -0
- package/packages/privacy-profiles/index.mjs +72 -3
- package/packages/protocol-adapters/index.mjs +99 -1
- package/packages/proxy/index.mjs +525 -40
- package/packages/stream-filter/index.mjs +69 -7
|
@@ -1,3 +1,52 @@
|
|
|
1
|
+
import { isUtf8 } from "node:buffer";
|
|
2
|
+
|
|
3
|
+
// The hard-block detection types: a leak of one of these is a load-bearing
|
|
4
|
+
// fail-closed concern, so the WS2c precision dials (filters.minConfidence,
|
|
5
|
+
// filters.allowlist) may NOT suppress a detection of any of them. minConfidence
|
|
6
|
+
// trims only the precision-risky SOFT types; the allowlist's per-value/per-path
|
|
7
|
+
// exceptions are ignored for these types (the detection still fires). Exported
|
|
8
|
+
// so the core detect→decide path enforces the same exemption set the docs pin.
|
|
9
|
+
//
|
|
10
|
+
// Hard-block types are sensitive AND have a STRONG enough anchor that a match is
|
|
11
|
+
// effectively a true positive by construction, so the precision dials
|
|
12
|
+
// (filters.minConfidence / filters.allowlist) can never suppress them:
|
|
13
|
+
// - kr_rrn / card — checksum + constrained format
|
|
14
|
+
// - fr_nir (mod-97 over a long structured 15-digit run) and es_dni (mod-23 plus
|
|
15
|
+
// a required check LETTER suffix) — a random same-shaped value almost never
|
|
16
|
+
// passes, and the shapes are rare in ordinary payloads.
|
|
17
|
+
// - it_codice_fiscale — a 16-char MIXED alpha+digit shape with a mod-26 check
|
|
18
|
+
// CHARACTER. The non-numeric structural anchor (the rigid letter/digit layout)
|
|
19
|
+
// makes a benign 16-char `[A-Z]{6}\d{2}[A-Z]…` run in an ordinary payload
|
|
20
|
+
// implausible (measured collision ~3.8% over an already-rare shape), so a match
|
|
21
|
+
// is effectively a true positive.
|
|
22
|
+
// - sg_nric — a LETTER prefix ([STFGM]) + 7 digits + a CHECK LETTER. Two
|
|
23
|
+
// non-numeric anchors (prefix letter + checksum letter) over a rare shape
|
|
24
|
+
// (measured collision ~3.9% over the prefix+letter shape) make a benign FP
|
|
25
|
+
// implausible. Both anchored, un-allowlistable.
|
|
26
|
+
// DELIBERATELY DIAL-ELIGIBLE (NOT hard-block) — bare-digit runs whose only guard is
|
|
27
|
+
// a single numeric checksum over a COMMON digit length, so a benign id/counter FP is
|
|
28
|
+
// plausible and the operator needs the allowlist/minConfidence escape hatch (the
|
|
29
|
+
// jp_mynumber precedent). They still detect + (per profile) block by default:
|
|
30
|
+
// - jp_mynumber — a bare 12-digit run + a SINGLE mod-11 check digit (measured
|
|
31
|
+
// ~9% of random 12-digit numbers pass), and 12-digit ids/counters are common.
|
|
32
|
+
// - uk_nino — NO checksum exists (format + invalid-prefix exclusions only), the
|
|
33
|
+
// largest FP surface.
|
|
34
|
+
// - in_aadhaar — a bare 12-digit run + the Verhoeff checksum (measured ~9.9% of
|
|
35
|
+
// random 12-digit runs pass, the same order as jp_mynumber's mod-11). Aadhaar
|
|
36
|
+
// is extremely sensitive, but Verhoeff over the COMMON 12-digit shape is exactly
|
|
37
|
+
// the jp_mynumber footgun (a 12-digit id/counter is common), so it stays
|
|
38
|
+
// allowlist-clearable rather than un-suppressable.
|
|
39
|
+
// - de_steuer_id — a bare 11-digit run + ISO 7064 MOD 11,10 plus a structural
|
|
40
|
+
// "exactly one repeated digit" test. The combined guard is strong (measured
|
|
41
|
+
// ~0.37% collision), BUT there is NO non-numeric anchor and 11-digit ids are
|
|
42
|
+
// common in payloads, so per the jp_mynumber discipline (a bare-digit shape over
|
|
43
|
+
// a common length) it stays DIAL-ELIGIBLE so an operator can clear an 11-digit-id
|
|
44
|
+
// FP. It still detects + blocks by default.
|
|
45
|
+
// - nl_bsn — a bare 9-digit run + the "11-proef" weighted mod-11 (measured ~9.1%
|
|
46
|
+
// of random 9-digit runs pass). 9 bare digits is VERY common, so this is the
|
|
47
|
+
// clearest dial-eligible case.
|
|
48
|
+
// Detection types exempt from the precision dials (filters.minConfidence /
// filters.allowlist): a match of any of these is never suppressed.
export const HARD_BLOCK_TYPES = new Set([
  "secret",
  "api_key",
  "kr_rrn",
  "card",
  "fr_nir",
  "es_dni",
  "it_codice_fiscale",
  "sg_nric"
]);
|
|
49
|
+
|
|
1
50
|
const DEFAULT_RULES = [
|
|
2
51
|
{
|
|
3
52
|
id: "email",
|
|
@@ -9,9 +58,15 @@ const DEFAULT_RULES = [
|
|
|
9
58
|
{
|
|
10
59
|
// KR mobile numbers (01[016789] prefixes); landlines are out of scope.
|
|
11
60
|
// krPhoneValid keeps a bare separator-less run from matching a timestamp/id.
|
|
61
|
+
// The leading `(?<![\w+-])` / trailing `(?![\w-])` boundaries (WS2c) stop the
|
|
62
|
+
// rule from matching a phone-shaped digit run that is a SUBSTRING of a longer
|
|
63
|
+
// hex/alnum/dashed run — e.g. the `…a716-446655440000` tail of a UUID, where
|
|
64
|
+
// the inner `16-44665544` otherwise mis-fired as a phone. The boundaries
|
|
65
|
+
// never affect a real number: a KR mobile sits on a word/space/punctuation
|
|
66
|
+
// edge, and a `+82` form starts at its own `+` (part of the match, so the lookbehind is tested before it).
|
|
12
67
|
id: "kr-phone",
|
|
13
68
|
type: "phone",
|
|
14
|
-
pattern: "(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}",
|
|
69
|
+
pattern: "(?<![\\w+-])(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}(?![\\w-])",
|
|
15
70
|
flags: "g",
|
|
16
71
|
confidence: 0.9,
|
|
17
72
|
validate: krPhoneValid
|
|
@@ -40,6 +95,146 @@ const DEFAULT_RULES = [
|
|
|
40
95
|
confidence: 0.95
|
|
41
96
|
},
|
|
42
97
|
{
|
|
98
|
+
// AWS access key id: a long-lived (AKIA) or temporary (ASIA) key id is a
|
|
99
|
+
// hard-anchored prefix + EXACTLY 16 uppercase-alphanumeric chars. The fixed
|
|
100
|
+
// prefix + fixed length is what makes this high-precision (no bare base64).
|
|
101
|
+
id: "aws-access-key-id",
|
|
102
|
+
type: "api_key",
|
|
103
|
+
pattern: "\\b(?:AKIA|ASIA)[0-9A-Z]{16}\\b",
|
|
104
|
+
flags: "g",
|
|
105
|
+
confidence: 0.95
|
|
106
|
+
},
|
|
107
|
+
{
|
|
108
|
+
// GitHub token: pat (ghp_), oauth (gho_), user-to-server (ghu_), server-to-
|
|
109
|
+
// server (ghs_), refresh (ghr_). Anchored prefix + a long base64-ish body.
|
|
110
|
+
// GitHub's own format is 36 chars after the prefix; we allow >=36 (the
|
|
111
|
+
// corpus fixture is 38) and cap to keep the match bounded.
|
|
112
|
+
id: "github-token",
|
|
113
|
+
type: "secret",
|
|
114
|
+
pattern: "\\bgh[pousr]_[A-Za-z0-9]{36,255}\\b",
|
|
115
|
+
flags: "g",
|
|
116
|
+
confidence: 0.95
|
|
117
|
+
},
|
|
118
|
+
{
|
|
119
|
+
// Google API key: anchored AIza + exactly 35 chars from the URL-safe
|
|
120
|
+
// alphabet. Fixed prefix + fixed length = high precision.
|
|
121
|
+
id: "google-api-key",
|
|
122
|
+
type: "api_key",
|
|
123
|
+
pattern: "\\bAIza[0-9A-Za-z_-]{35}\\b",
|
|
124
|
+
flags: "g",
|
|
125
|
+
confidence: 0.9
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
// Slack token: bot (xoxb-), user (xoxa/xoxp-), refresh (xoxr-), legacy
|
|
129
|
+
// (xoxs-). Anchored xox[baprs]- + a >=10-char body. The corpus value is a
|
|
130
|
+
// deliberately low-entropy placeholder, so the rule anchors on the prefix +
|
|
131
|
+
// body shape, not entropy.
|
|
132
|
+
id: "slack-token",
|
|
133
|
+
type: "secret",
|
|
134
|
+
pattern: "\\bxox[baprs]-[0-9A-Za-z-]{10,}\\b",
|
|
135
|
+
flags: "g",
|
|
136
|
+
confidence: 0.9
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
// Anthropic API key: `sk-ant-` + a long body. Ordered BEFORE the OpenAI rule
|
|
140
|
+
// below so a Claude key is attributed to its own rule (both emit `secret`, so
|
|
141
|
+
// removeOverlaps collapsing the shared span to either is type-identical — the
|
|
142
|
+
// ordering is for ruleId attribution, not for the scored type). The HYPHEN
|
|
143
|
+
// after `sk` is load-bearing: it keeps this OFF the underscore-based Stripe/
|
|
144
|
+
// OpenAI-platform `sk_` rule (openai-like-key, `api_key`), so the two never
|
|
145
|
+
// collide on the same span.
|
|
146
|
+
id: "anthropic-api-key",
|
|
147
|
+
type: "secret",
|
|
148
|
+
pattern: "\\bsk-ant-[A-Za-z0-9_-]{16,}\\b",
|
|
149
|
+
flags: "g",
|
|
150
|
+
confidence: 0.95
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
// OpenAI API key: `sk-` (and project keys `sk-proj-`) + a long base62-ish
|
|
154
|
+
// body. The HYPHEN is the disambiguator from Stripe/OpenAI-platform `sk_`
|
|
155
|
+
// (underscore — handled by openai-like-key as `api_key`); this rule never
|
|
156
|
+
// matches an underscore form, so the two prefixes do not overlap. A >=20-char
|
|
157
|
+
// body keeps a bare `sk-foo` slug from firing. The Anthropic `sk-ant-` rule
|
|
158
|
+
// above is a stricter sibling that runs first.
|
|
159
|
+
id: "openai-api-key",
|
|
160
|
+
type: "secret",
|
|
161
|
+
pattern: "\\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\\b",
|
|
162
|
+
flags: "g",
|
|
163
|
+
confidence: 0.9
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
// Google OAuth client secret: anchored `GOCSPX-` + exactly 28 chars from the
|
|
167
|
+
// URL-safe alphabet. Fixed prefix + fixed length = high precision (this is the
|
|
168
|
+
// OAuth client secret, distinct from the `AIza` API key above).
|
|
169
|
+
id: "google-oauth-client-secret",
|
|
170
|
+
type: "secret",
|
|
171
|
+
pattern: "\\bGOCSPX-[A-Za-z0-9_-]{28}\\b",
|
|
172
|
+
flags: "g",
|
|
173
|
+
confidence: 0.95
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
// SendGrid API key: `SG.` + 22 URL-safe chars + `.` + 43 URL-safe chars. The
|
|
177
|
+
// two fixed-length dotted segments after the `SG.` prefix are what make this
|
|
178
|
+
// high-precision — a bare `SG.`-prefixed string of the wrong shape is rejected.
|
|
179
|
+
id: "sendgrid-api-key",
|
|
180
|
+
type: "api_key",
|
|
181
|
+
pattern: "\\bSG\\.[A-Za-z0-9_-]{22}\\.[A-Za-z0-9_-]{43}\\b",
|
|
182
|
+
flags: "g",
|
|
183
|
+
confidence: 0.95
|
|
184
|
+
},
|
|
185
|
+
{
|
|
186
|
+
// Twilio Account SID (AC…) / API Key SID (SK…): the fixed `AC`/`SK` prefix +
|
|
187
|
+
// EXACTLY 32 HEX chars. The hex-only body (not base62) is the precision guard:
|
|
188
|
+
// a random alphanumeric run of the same length carries non-hex letters and is
|
|
189
|
+
// rejected, and the `SK` form does not collide with the underscore/hyphen
|
|
190
|
+
// `sk_`/`sk-` rules. Twilio's bare 32-hex AUTH TOKEN is deliberately NOT a
|
|
191
|
+
// standalone rule (a prefix-less 32-hex run is indistinguishable from an MD5
|
|
192
|
+
// hash / id) — it is caught via the `<key> = <value>` assignment vocabulary.
|
|
193
|
+
id: "twilio-sid",
|
|
194
|
+
type: "api_key",
|
|
195
|
+
pattern: "\\b(?:AC|SK)[0-9a-fA-F]{32}\\b",
|
|
196
|
+
flags: "g",
|
|
197
|
+
confidence: 0.9
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
// npm access token: anchored `npm_` + exactly 36 base62 chars. Fixed prefix +
|
|
201
|
+
// fixed length = high precision.
|
|
202
|
+
id: "npm-token",
|
|
203
|
+
type: "secret",
|
|
204
|
+
pattern: "\\bnpm_[A-Za-z0-9]{36}\\b",
|
|
205
|
+
flags: "g",
|
|
206
|
+
confidence: 0.95
|
|
207
|
+
},
|
|
208
|
+
{
|
|
209
|
+
// JWT: three dot-separated base64url segments where the FIRST starts with
|
|
210
|
+
// `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
|
|
211
|
+
// on `eyJ` + two more base64url groups keeps this from matching arbitrary
|
|
212
|
+
// dotted tokens (a bare base64 triplet without the JSON header is not a JWT).
|
|
213
|
+
id: "jwt",
|
|
214
|
+
type: "secret",
|
|
215
|
+
pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
|
|
216
|
+
flags: "g",
|
|
217
|
+
confidence: 0.9
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
// PEM private key: the armored header. We match the header line itself
|
|
221
|
+
// (`-----BEGIN [...] PRIVATE KEY-----`) — its presence is the credential
|
|
222
|
+
// signal; the body is high-entropy base64 we do not need to span. Covers
|
|
223
|
+
// RSA/EC/OPENSSH/DSA/ENCRYPTED variants and the bare `PRIVATE KEY` form.
|
|
224
|
+
id: "pem-private-key",
|
|
225
|
+
type: "secret",
|
|
226
|
+
pattern: "-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
|
|
227
|
+
flags: "g",
|
|
228
|
+
confidence: 0.98
|
|
229
|
+
},
|
|
230
|
+
{
|
|
231
|
+
// Bearer credential. Deliberately NOT context-anchored to `Authorization:`:
|
|
232
|
+
// detection runs PER STRING LEAF, and a real payload carries the credential
|
|
233
|
+
// as its own leaf (`{"Authorization": "Bearer <token>"}` walks to the bare
|
|
234
|
+
// value `"Bearer <token>"`), so a lookbehind requiring the header key in the
|
|
235
|
+
// same string would MISS the realistic case — a recall regression on a
|
|
236
|
+
// hard-block (`secret`) type. `secret` is fail-closed: a `Bearer …` prose
|
|
237
|
+
// false positive is the accepted cost of never missing a leaked token.
|
|
43
238
|
id: "bearer-token",
|
|
44
239
|
type: "secret",
|
|
45
240
|
pattern: "\\bBearer\\s+[A-Za-z0-9._~+/-]{16,}\\b",
|
|
@@ -50,11 +245,193 @@ const DEFAULT_RULES = [
|
|
|
50
245
|
id: "assignment-secret",
|
|
51
246
|
type: "secret",
|
|
52
247
|
// Lookbehind keeps the key name out of the match so transforms replace
|
|
53
|
-
// only the secret value, not the assignment prefix.
|
|
54
|
-
|
|
248
|
+
// only the secret value, not the assignment prefix. The key vocabulary
|
|
249
|
+
// covers the common credential-assignment names (cloud secrets, OAuth
|
|
250
|
+
// client secrets, PEM/private keys, access/refresh tokens) so a
|
|
251
|
+
// `<key> = <value>` leak is caught even when the value itself has no
|
|
252
|
+
// self-describing prefix (e.g. an AWS secret access key is bare base64).
|
|
253
|
+
// `accountkey` catches the Azure Storage connection-string `AccountKey=<88-
|
|
254
|
+
// char base64>=` segment — an un-anchored 88-char base64 rule would false-fire
|
|
255
|
+
// on any blob, so the `AccountKey=` assignment context is the precision anchor.
|
|
256
|
+
// `auth[_-]?token` catches the Twilio auth token (a bare 32-hex run with no
|
|
257
|
+
// self-describing prefix) when it is leaked as a `<key> = <value>` pair.
|
|
258
|
+
pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|auth[_-]?token|accountkey|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
|
|
55
259
|
flags: "gi",
|
|
56
260
|
confidence: 0.85
|
|
57
261
|
},
|
|
262
|
+
{
|
|
263
|
+
// US SSN: AAA-GG-SSSS. The format alone collides with 9-digit ids, so a
|
|
264
|
+
// validator rejects the SSA-invalid ranges (area 000/666/900-999, group 00,
|
|
265
|
+
// serial 0000). The separators are required by the pattern — a bare 9-digit
|
|
266
|
+
// run is intentionally NOT matched (it is indistinguishable from an id).
|
|
267
|
+
id: "us-ssn",
|
|
268
|
+
type: "us_ssn",
|
|
269
|
+
pattern: "(?<![\\w-])\\d{3}-\\d{2}-\\d{4}(?![\\w-])",
|
|
270
|
+
flags: "g",
|
|
271
|
+
confidence: 0.85,
|
|
272
|
+
validate: usSsnValid
|
|
273
|
+
},
|
|
274
|
+
{
|
|
275
|
+
// IBAN: country(2 alpha) + 2 check digits + BBAN. The mod-97 checksum is
|
|
276
|
+
// what makes this high-precision — a random alnum run of the right shape
|
|
277
|
+
// almost never satisfies mod-97 == 1. Length 15-34 per ISO 13616.
|
|
278
|
+
id: "iban",
|
|
279
|
+
type: "iban",
|
|
280
|
+
pattern: "(?<![A-Z0-9])[A-Z]{2}\\d{2}[A-Z0-9]{11,30}(?![A-Z0-9])",
|
|
281
|
+
flags: "g",
|
|
282
|
+
confidence: 0.9,
|
|
283
|
+
validate: ibanValid
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
// Japan My Number (個人番号): EXACTLY 12 digits with the official mod-11
|
|
287
|
+
// weighted check digit over the first 11. A bare 12-digit run is ambiguous
|
|
288
|
+
// (an id/timestamp), so jpMyNumberValid is the precision guard — only a run
|
|
289
|
+
// whose 12th digit equals the prescribed check digit fires. The leading/
|
|
290
|
+
// trailing boundaries (`(?<![\d-])`/`(?![\d-])`) stop the rule from matching
|
|
291
|
+
// a 12-digit window inside a longer digit/dashed run. NOT hard-block: a single
|
|
292
|
+
// mod-11 check digit only rejects ~10/11 of random 12-digit runs, and such
|
|
293
|
+
// runs are common (ids/counters), so a benign FP is plausible and the operator
|
|
294
|
+
// keeps the allowlist escape hatch (it still detects + blocks by default).
|
|
295
|
+
id: "jp-mynumber",
|
|
296
|
+
type: "jp_mynumber",
|
|
297
|
+
pattern: "(?<![\\d-])\\d{12}(?![\\d-])",
|
|
298
|
+
flags: "g",
|
|
299
|
+
confidence: 0.9,
|
|
300
|
+
validate: jpMyNumberValid
|
|
301
|
+
},
|
|
302
|
+
{
|
|
303
|
+
// France NIR / INSEE social-security: 15 chars where the department field may
|
|
304
|
+
// carry the Corsica `2A`/`2B` letters, validated by the control key
|
|
305
|
+
// `97 - (first13 mod 97) == last2` (Corsica 2A→19, 2B→18 before the mod).
|
|
306
|
+
// The control key is the precision guard — a wrong key is rejected. The
|
|
307
|
+
// department alpha is optional so the pure-numeric form also matches. Anchored
|
|
308
|
+
// on `(?<![\w-])` / `(?![\w-])` lookaround boundaries (not `\b`); hard-block (checksummed).
|
|
309
|
+
id: "fr-nir",
|
|
310
|
+
type: "fr_nir",
|
|
311
|
+
pattern: "(?<![\\w-])[12]\\d{2}(?:0[1-9]|1[0-2]|20)(?:\\d{2}|2[AB])\\d{6}\\d{2}(?![\\w-])",
|
|
312
|
+
flags: "g",
|
|
313
|
+
confidence: 0.9,
|
|
314
|
+
validate: frNirValid
|
|
315
|
+
},
|
|
316
|
+
{
|
|
317
|
+
// Spain DNI/NIE: 8 digits (DNI) or a leading X/Y/Z + 7 digits (NIE) + a check
|
|
318
|
+
// letter from the mod-23 table (NIE maps X/Y/Z→0/1/2 before the mod). The
|
|
319
|
+
// check letter is the precision guard — a wrong letter is rejected. The
|
|
320
|
+
// letters that can never appear (I/O/U) are excluded from the suffix class so
|
|
321
|
+
// an ordinary `<8-digit><letter>` token rarely even reaches the validator.
|
|
322
|
+
// Hard-block (checksummed).
|
|
323
|
+
id: "es-dni-nie",
|
|
324
|
+
type: "es_dni",
|
|
325
|
+
pattern: "(?<![\\w-])[XYZ]?\\d{7,8}[A-HJ-NP-TV-Z](?![\\w-])",
|
|
326
|
+
flags: "gi",
|
|
327
|
+
confidence: 0.85,
|
|
328
|
+
validate: esDniValid
|
|
329
|
+
},
|
|
330
|
+
{
|
|
331
|
+
// Italy Codice Fiscale: 16 chars in the rigid layout
|
|
332
|
+
// [A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z] (surname, name, year, month-letter,
|
|
333
|
+
// day, place code, check character). The 16th char is the official check
|
|
334
|
+
// character: sum the odd/even-position table values over the first 15 chars
|
|
335
|
+
// and map (sum mod 26) to a letter. The mixed alpha+digit structure + the
|
|
336
|
+
// mod-26 check character are the precision guard. Hard-block (strong
|
|
337
|
+
// non-numeric anchor over a rare shape).
|
|
338
|
+
id: "it-codice-fiscale",
|
|
339
|
+
type: "it_codice_fiscale",
|
|
340
|
+
pattern: "(?<![A-Z0-9])[A-Z]{6}\\d{2}[A-Z]\\d{2}[A-Z]\\d{3}[A-Z](?![A-Z0-9])",
|
|
341
|
+
flags: "gi",
|
|
342
|
+
confidence: 0.9,
|
|
343
|
+
validate: itCodiceFiscaleValid
|
|
344
|
+
},
|
|
345
|
+
{
|
|
346
|
+
// Singapore NRIC/FIN: a series LETTER ([STFGM]) + 7 digits + a CHECK LETTER.
|
|
347
|
+
// The check letter is a weighted sum (weights 2,7,6,5,4,3,2) plus a per-prefix
|
|
348
|
+
// offset (T/G +4, M +3), mod 11, looked up in the per-series letter table. The
|
|
349
|
+
// prefix letter + check letter are the precision guard. Hard-block (two
|
|
350
|
+
// non-numeric anchors over a rare shape).
|
|
351
|
+
id: "sg-nric",
|
|
352
|
+
type: "sg_nric",
|
|
353
|
+
pattern: "(?<![A-Z0-9])[STFGMstfgm]\\d{7}[A-Za-z](?![A-Z0-9])",
|
|
354
|
+
flags: "g",
|
|
355
|
+
confidence: 0.9,
|
|
356
|
+
validate: sgNricValid
|
|
357
|
+
},
|
|
358
|
+
{
|
|
359
|
+
// India Aadhaar: 12 digits (never starting 0 or 1) with the Verhoeff checksum
|
|
360
|
+
// over all 12. A bare 12-digit run is ambiguous (an id/timestamp), so the
|
|
361
|
+
// Verhoeff check is the precision guard. NOT hard-block: Verhoeff over the
|
|
362
|
+
// COMMON 12-digit shape passes ~1/10 of random runs (the jp_mynumber footgun),
|
|
363
|
+
// so it stays dial-eligible (still detects + blocks by default). The leading/
|
|
364
|
+
// trailing boundaries stop the rule from matching a 12-digit window inside a
|
|
365
|
+
// longer digit/dashed run.
|
|
366
|
+
id: "in-aadhaar",
|
|
367
|
+
type: "in_aadhaar",
|
|
368
|
+
pattern: "(?<![\\d-])[2-9]\\d{11}(?![\\d-])",
|
|
369
|
+
flags: "g",
|
|
370
|
+
confidence: 0.85,
|
|
371
|
+
validate: inAadhaarValid
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
// Germany tax ID (Steuer-Identifikationsnummer): 11 digits with the ISO 7064
|
|
375
|
+
// MOD 11,10 check digit over the first 10, plus the structural rule that the
|
|
376
|
+
// first 10 digits contain exactly one repeated digit (one value appears 2 or 3
|
|
377
|
+
// times, the rest once). The combined guard is strong, but it is a BARE-DIGIT
|
|
378
|
+
// run with no non-numeric anchor over a common 11-digit length, so per the
|
|
379
|
+
// jp_mynumber discipline it stays dial-eligible (the operator can clear an
|
|
380
|
+
// 11-digit-id FP).
|
|
381
|
+
id: "de-steuer-id",
|
|
382
|
+
type: "de_steuer_id",
|
|
383
|
+
pattern: "(?<![\\d-])\\d{11}(?![\\d-])",
|
|
384
|
+
flags: "g",
|
|
385
|
+
confidence: 0.85,
|
|
386
|
+
validate: deSteuerIdValid
|
|
387
|
+
},
|
|
388
|
+
{
|
|
389
|
+
// Netherlands BSN (Burgerservicenummer): 9 digits validated by the "11-proef"
|
|
390
|
+
// weighted mod-11 (Σ digit_i · weight_i ≡ 0 mod 11, with the last weight -1).
|
|
391
|
+
// 9 bare digits is VERY common, so the 11-proef passes ~1/11 of random runs —
|
|
392
|
+
// the clearest dial-eligible case (still detects + blocks by default; the
|
|
393
|
+
// operator keeps the allowlist escape hatch).
|
|
394
|
+
id: "nl-bsn",
|
|
395
|
+
type: "nl_bsn",
|
|
396
|
+
pattern: "(?<![\\d-])\\d{9}(?![\\d-])",
|
|
397
|
+
flags: "g",
|
|
398
|
+
confidence: 0.8,
|
|
399
|
+
validate: nlBsnValid
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
// UK National Insurance Number: two prefix letters + 6 digits + a suffix
|
|
403
|
+
// A-D. There is NO checksum, so this is FORMAT-ONLY and stays OUT of
|
|
404
|
+
// HARD_BLOCK_TYPES (dial-eligible). The pattern bakes in the documented
|
|
405
|
+
// invalid-prefix exclusions: 1st letter never D/F/I/Q/U/V, 2nd letter never
|
|
406
|
+
// D/F/I/O/Q/U/V, and the disallowed pairs BG/GB/NK/KN/TN/NT/ZZ are rejected
|
|
407
|
+
// by ukNinoValid (a negative-set the regex can't express cleanly).
|
|
408
|
+
id: "uk-nino",
|
|
409
|
+
type: "uk_nino",
|
|
410
|
+
pattern: "(?<![\\w-])[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D](?![\\w-])",
|
|
411
|
+
flags: "g",
|
|
412
|
+
confidence: 0.7,
|
|
413
|
+
validate: ukNinoValid
|
|
414
|
+
},
|
|
415
|
+
{
|
|
416
|
+
// E.164 international phone: ONLY with a leading `+` (a bare digit run is an
|
|
417
|
+
// id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
|
|
418
|
+
id: "e164-phone",
|
|
419
|
+
type: "phone",
|
|
420
|
+
pattern: "(?<![\\w+])\\+[1-9]\\d{6,14}(?![\\w])",
|
|
421
|
+
flags: "g",
|
|
422
|
+
confidence: 0.8
|
|
423
|
+
},
|
|
424
|
+
{
|
|
425
|
+
// US national phone: ONLY with separators — `(NXX) NXX-XXXX` or
|
|
426
|
+
// `NXX-NXX-XXXX`. A separator-less 10-digit run is deliberately NOT matched
|
|
427
|
+
// (it collides with ids/timestamps; the kr-phone rule already guards bare
|
|
428
|
+
// runs). Conservative by design — phone is the highest false-positive risk.
|
|
429
|
+
id: "us-phone",
|
|
430
|
+
type: "phone",
|
|
431
|
+
pattern: "(?<![\\w-])(?:\\(\\d{3}\\)\\s?|\\d{3}-)\\d{3}-\\d{4}(?![\\w-])",
|
|
432
|
+
flags: "g",
|
|
433
|
+
confidence: 0.75
|
|
434
|
+
},
|
|
58
435
|
// Indirect prompt injection heuristics. Response/tool-result direction only,
|
|
59
436
|
// and the policy default for the injection type is `allow` (report-only):
|
|
60
437
|
// detections are audited regardless of action, and false-positive blocks
|
|
@@ -101,8 +478,13 @@ const DEFAULT_RULES = [
|
|
|
101
478
|
}
|
|
102
479
|
];
|
|
103
480
|
|
|
104
|
-
export function createDefaultFilterEngine({ customRules = [] } = {}) {
|
|
481
|
+
export function createDefaultFilterEngine({ customRules = [], decodeAndRescan = false } = {}) {
|
|
105
482
|
const rules = DEFAULT_RULES.concat(customRules.map(normalizeCustomRule));
|
|
483
|
+
// The opt-in base64/percent decode-and-rescan pass (WS2d residual). Default OFF
|
|
484
|
+
// => byte-identical to prior behavior. Held in the engine CLOSURE, NOT threaded
|
|
485
|
+
// through the protect `context`: the request context is data and must not carry
|
|
486
|
+
// this control flag (it would pollute tokenize AAD / audit).
|
|
487
|
+
const decodeOptions = { decodeAndRescan: decodeAndRescan === true };
|
|
106
488
|
|
|
107
489
|
return {
|
|
108
490
|
id: "haechi.filter.default",
|
|
@@ -112,12 +494,32 @@ export function createDefaultFilterEngine({ customRules = [] } = {}) {
|
|
|
112
494
|
networkEgress: false
|
|
113
495
|
},
|
|
114
496
|
async detect({ entries, context }) {
|
|
115
|
-
return entries.flatMap((entry) => detectEntry(entry, rules, context));
|
|
497
|
+
return entries.flatMap((entry) => detectEntry(entry, rules, context, decodeOptions));
|
|
116
498
|
}
|
|
117
499
|
};
|
|
118
500
|
}
|
|
119
501
|
|
|
120
|
-
export function detectEntry(entry, rules, context = {}) {
|
|
502
|
+
/**
 * Detect sensitive spans in a single entry.
 *
 * Always runs the regular scan (`scanEntry`). When `options.decodeAndRescan` is
 * strictly `true`, additionally runs `decodeAndRescanEntry` and, if it produced
 * anything, appends those detections after the base ones.
 *
 * @param {object} entry - Entry to scan; forwarded unchanged to both scanners.
 * @param {Array<object>} rules - Rule list forwarded to both scanners.
 * @param {object} [context={}] - Request context forwarded to both scanners.
 * @param {object} [options={}] - Control options; only `decodeAndRescan` is read here.
 * @returns {Array} The base detections, followed by any decode-and-rescan
 *   detections when that pass is enabled and non-empty.
 */
export function detectEntry(entry, rules, context = {}, options = {}) {
|
|
503
|
+
const baseDetections = scanEntry(entry, rules, context);
|
|
504
|
+
// WS2d residual — opt-in (default OFF) base64/percent decode-and-rescan. After
|
|
505
|
+
// the normal NFKC scan above, if the flag is on, attempt to decode the leaf and
|
|
506
|
+
// rescan the decoded text. A decoded hit has NO valid offset in the encoded leaf
|
|
507
|
+
// (decoding remaps everything), so it fails closed to a WHOLE-LEAF detection of
|
|
508
|
+
// the original encoded leaf — and only fires for a validator-backed/hard-block
|
|
509
|
+
// hit so random base64 never false-positives. See decodeAndRescanEntry.
|
|
510
|
+
if (options?.decodeAndRescan === true) {
|
|
511
|
+
const decoded = decodeAndRescanEntry(entry, rules, context);
|
|
512
|
+
if (decoded.length > 0) {
|
|
513
|
+
return baseDetections.concat(decoded);
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
return baseDetections;
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
// The original per-leaf NFKC scan (WS2d), unchanged. Extracted from detectEntry so
|
|
520
|
+
// the opt-in decode-and-rescan pass wraps it without touching the byte-identical
|
|
521
|
+
// default path.
|
|
522
|
+
function scanEntry(entry, rules, context = {}) {
|
|
121
523
|
const detections = [];
|
|
122
524
|
// On the RESPONSE direction, a bare JSON NUMBER leaf is inference-server
|
|
123
525
|
// metadata (a nanosecond `*_duration`, a token count, a numeric id/timestamp) —
|
|
@@ -139,8 +541,62 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
139
541
|
// marker-shaped string is NOT Haechi output (Haechi hasn't transformed it yet),
|
|
140
542
|
// so it is scanned normally — otherwise an attacker could wrap a real secret in
|
|
141
543
|
// a fake `[TOKEN:…]` to evade request-side detection.
|
|
544
|
+
// Markers are pure ASCII and NFKC-stable, so their spans are computed on the
|
|
545
|
+
// ORIGINAL value exactly as before — they line up with the same-length
|
|
546
|
+
// normalized scan (Case 2 below) and are irrelevant to the whole-leaf scan
|
|
547
|
+
// (Case 3).
|
|
142
548
|
const markerSpans = context?.direction === "response" ? haechiMarkerSpans(entry.value) : [];
|
|
143
549
|
|
|
550
|
+
// WS2d — Unicode evasion via NFKC normalization. A client can defeat every
|
|
551
|
+
// regex rule by sending PII/secrets in a Unicode form that folds to ASCII
|
|
552
|
+
// (full-width digits `４２４２…`, full-width `＠`, mathematical/enclosed
|
|
553
|
+
// alphanumerics). NFKC normalization maps those to their compatibility ASCII
|
|
554
|
+
// form so the rules match. The crux is OFFSET INTEGRITY: detections carry
|
|
555
|
+
// {start,end} into entry.value, but the transform slices the ORIGINAL string
|
|
556
|
+
// (packages/core transformString). Three cases keep offsets valid:
|
|
557
|
+
const value = entry.value;
|
|
558
|
+
const normalized = value.normalize("NFKC");
|
|
559
|
+
if (normalized === value) {
|
|
560
|
+
// Case 1 (~99%): nothing folded. Detect on the original exactly as before —
|
|
561
|
+
// byte-identical behavior, zero regression.
|
|
562
|
+
return removeOverlaps(scanForDetections(value, rules, context, markerSpans, entry, value));
|
|
563
|
+
}
|
|
564
|
+
if (isPositionStableNfkc(value, normalized)) {
|
|
565
|
+
// Case 2: every codepoint folded to the SAME UTF-16 length and the per-
|
|
566
|
+
// codepoint folds reconstruct the whole normalization, so each original
|
|
567
|
+
// character occupies the SAME offsets in `normalized` as in `value` (e.g.
|
|
568
|
+
// full-width→ASCII digits/letters). A match's {start,end} on `normalized` are
|
|
569
|
+
// therefore valid on the ORIGINAL value — exact-span redaction of the evaded
|
|
570
|
+
// value, with the recorded `value` taken from the original slice so
|
|
571
|
+
// tokenize/AAD/audit see the real bytes. A bare `normalized.length ===
|
|
572
|
+
// value.length` check is UNSOUND: a length-contracting codepoint before the
|
|
573
|
+
// PII compensated by a length-expanding one after it keeps the total length
|
|
574
|
+
// equal yet shifts every interior offset (redacting the wrong bytes), so such
|
|
575
|
+
// inputs must fall through to the Case 3 whole-leaf path. Validators still run
|
|
576
|
+
// on the normalized match text (Luhn/RRN need ASCII digits).
|
|
577
|
+
return removeOverlaps(scanForDetections(normalized, rules, context, markerSpans, entry, value));
|
|
578
|
+
}
|
|
579
|
+
// Case 3: the fold is NOT position-stable (a length-changing decomposition, or a
|
|
580
|
+
// compensating contraction+expansion that shifts interior offsets). Offsets on
|
|
581
|
+
// the normalized copy do NOT map back to the original, so we CANNOT do exact-span
|
|
582
|
+
// redaction.
|
|
583
|
+
// FAIL CLOSED: emit ONE detection per matched type covering the WHOLE leaf so
|
|
584
|
+
// the transform redacts/blocks the entire leaf. Over-redacting an evasion
|
|
585
|
+
// attempt is the safe failure. removeOverlaps is intentionally skipped — every
|
|
586
|
+
// detection spans the whole leaf so they all "overlap"; the transform collapses
|
|
587
|
+
// them to a single whole-leaf replacement via its cursor, and any `block` among
|
|
588
|
+
// them blocks the payload, while preserving per-type detection reporting.
|
|
589
|
+
return wholeLeafDetections(normalized, rules, context, entry, value);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
// Run every applicable rule over `scanText` (the original value, or its
|
|
593
|
+
// same-length NFKC normalization). Offsets index `scanText`, which is positionally
|
|
594
|
+
// 1:1 with `originalValue` (Case 1: identical; Case 2: same UTF-16 length), so the
|
|
595
|
+
// {start,end} are valid on `originalValue`. The recorded `value` is the ORIGINAL
|
|
596
|
+
// slice (never the normalized form). Marker spans (response-only) are computed on
|
|
597
|
+
// the original and align under both cases.
|
|
598
|
+
function scanForDetections(scanText, rules, context, markerSpans, entry, originalValue) {
|
|
599
|
+
const detections = [];
|
|
144
600
|
for (const rule of rules) {
|
|
145
601
|
// Direction-scoped rules (e.g. injection heuristics) only run on the
|
|
146
602
|
// matching traffic direction; rules without a direction run everywhere.
|
|
@@ -148,13 +604,13 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
148
604
|
continue;
|
|
149
605
|
}
|
|
150
606
|
const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
|
|
151
|
-
for (const match of
|
|
152
|
-
const
|
|
153
|
-
if (rule.validate && !rule.validate(
|
|
607
|
+
for (const match of scanText.matchAll(regex)) {
|
|
608
|
+
const matchText = match[0];
|
|
609
|
+
if (rule.validate && !rule.validate(matchText)) {
|
|
154
610
|
continue;
|
|
155
611
|
}
|
|
156
612
|
const start = match.index;
|
|
157
|
-
const end = match.index +
|
|
613
|
+
const end = match.index + matchText.length;
|
|
158
614
|
if (overlapsAny(start, end, markerSpans)) {
|
|
159
615
|
continue;
|
|
160
616
|
}
|
|
@@ -167,12 +623,205 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
167
623
|
start,
|
|
168
624
|
end,
|
|
169
625
|
confidence: rule.confidence,
|
|
170
|
-
value
|
|
626
|
+
value: originalValue.slice(start, end)
|
|
171
627
|
});
|
|
172
628
|
}
|
|
173
629
|
}
|
|
630
|
+
return detections;
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
// Case 3 fail-closed scan: discover which types the NFKC-normalized text matches,
|
|
634
|
+
// then emit one whole-leaf detection per distinct type (start:0, end:value.length,
|
|
635
|
+
// value: the whole original leaf). The response-direction marker skip does NOT
|
|
636
|
+
// apply here: a length-divergent leaf cannot BE a Haechi marker (markers are ASCII
|
|
637
|
+
// and NFKC-stable), so an evasion attempt can never masquerade as one.
|
|
638
|
+
// Fail-closed whole-leaf scan. `normalized` is the text the rules are run on;
// every reported detection spans the entire `originalValue` (start 0, end
// originalValue.length, value = originalValue). At most one detection is emitted
// per rule.type: the first eligible rule of a type that matches wins.
//
// A rule is skipped when it declares a `direction` that differs from
// `context?.direction`, or when the optional `ruleFilter` predicate rejects it.
// A match only counts when the rule has no `validate` or `validate` accepts the
// matched text.
function wholeLeafDetections(normalized, rules, context, entry, originalValue, ruleFilter = null) {
  // True as soon as one match of the rule's pattern passes its validator.
  const firesOn = (rule) => {
    const flags = rule.flags.includes("g") ? rule.flags : `${rule.flags}g`;
    for (const [text] of normalized.matchAll(new RegExp(rule.pattern, flags))) {
      if (!rule.validate || rule.validate(text)) {
        return true;
      }
    }
    return false;
  };

  // Insertion-ordered, keyed by type, so the result keeps rule order.
  const byType = new Map();
  for (const rule of rules) {
    const wrongDirection = Boolean(rule.direction) && rule.direction !== context?.direction;
    if (wrongDirection) {
      continue;
    }
    if (ruleFilter && !ruleFilter(rule)) {
      continue;
    }
    if (byType.has(rule.type) || !firesOn(rule)) {
      continue;
    }
    byType.set(rule.type, {
      type: rule.type,
      ruleId: rule.id,
      path: entry.path,
      pathText: entry.pathText,
      kind: entry.kind ?? "value",
      start: 0,
      end: originalValue.length,
      confidence: rule.confidence,
      value: originalValue
    });
  }
  return [...byType.values()];
}
|
|
680
|
+
|
|
681
|
+
// WS2d residual — opt-in base64/percent decode-and-rescan (default OFF). An
|
|
682
|
+
// always-on decode is false-positive-prone (random base64 decodes to bytes that
|
|
683
|
+
// can shape-match a soft rule), so this is gated behind `filters.decodeAndRescan`
|
|
684
|
+
// AND a precision guard: a decoded hit only fires when it is VALIDATOR-BACKED or a
|
|
685
|
+
// HARD-BLOCK type (a Luhn-passing card, a checksum kr_rrn/us_ssn, an IBAN mod-97,
|
|
686
|
+
// or a secret/api_key on its anchored rule). A decoded soft-type-without-validator
|
|
687
|
+
// match (a bare phone-shaped run in random decoded bytes) does NOT fire — requiring
|
|
688
|
+
// validators keeps precision ~100% (random base64 Luhn-passing as a 16-digit card
|
|
689
|
+
// is astronomically unlikely).
|
|
690
|
+
//
|
|
691
|
+
// OFFSET HANDLING (fail closed): a detection found in the DECODED text has no valid
|
|
692
|
+
// offset in the original encoded leaf (decoding remaps everything), so we emit a
|
|
693
|
+
// WHOLE-LEAF detection per matched type (start:0, end:leaf.length, value: the whole
|
|
694
|
+
// original encoded leaf) — exactly the WS2d Case-3 path. The transform then
|
|
695
|
+
// redacts/blocks the entire encoded leaf. We never map a decoded offset back.
|
|
696
|
+
// Decode-and-rescan for one leaf. Number leaves are never decoded. Otherwise the
// leaf value is handed to decodeLeaf; when that yields text, the rules (limited
// to those accepted by isDecodeEligibleRule) are run over the decoded text and
// any hit is reported as a whole-leaf detection over the ORIGINAL entry.value.
// Returns [] when the leaf is a number or does not decode.
function decodeAndRescanEntry(entry, rules, context) {
  const isNumberLeaf = entry.kind === "number";
  const plaintext = isNumberLeaf ? null : decodeLeaf(entry.value);
  if (plaintext === null) {
    return [];
  }
  return wholeLeafDetections(plaintext, rules, context, entry, entry.value, isDecodeEligibleRule);
}
|
|
712
|
+
|
|
713
|
+
// A decoded whole-leaf detection only fires for a "meaningful" hit: a hard-block
|
|
714
|
+
// type (secret/api_key/kr_rrn/card) on its anchored rule, OR a checksum-validated
|
|
715
|
+
// type. The `phone` type is excluded even though kr-phone carries a `validate`
|
|
716
|
+
// helper — that helper is a trunk-prefix heuristic, not a checksum, so a phone-
|
|
717
|
+
// shaped run in random decoded bytes must NOT fire (the spec's named exclusion).
|
|
718
|
+
// Precision filter for rules that may fire on decoded text: any rule whose type
// is in HARD_BLOCK_TYPES qualifies; otherwise the rule must carry a `validate`
// function and must not be of type "phone".
function isDecodeEligibleRule(rule) {
  if (HARD_BLOCK_TYPES.has(rule.type)) {
    return true;
  }
  if (rule.type === "phone") {
    return false;
  }
  return typeof rule.validate === "function";
}
|
|
724
|
+
|
|
725
|
+
// Attempt to decode a string leaf to UTF-8 text, returning the decoded string or
|
|
726
|
+
// null when the leaf does not look like (or does not cleanly round-trip as) an
|
|
727
|
+
// encoded value. Two encodings, each precision-guarded so a benign value is skipped
|
|
728
|
+
// rather than mis-decoded:
|
|
729
|
+
// - base64 / base64url: the leaf must LOOK like base64 (no spaces, the base64 or
|
|
730
|
+
// base64url alphabet, a valid length for that variant) within bounds, decode to
|
|
731
|
+
// VALID UTF-8, and RE-ENCODE back to exactly the leaf (rejects the bytes that
|
|
732
|
+
// Buffer.from leniently accepts but are not the canonical encoding of the leaf).
|
|
733
|
+
// - percent-encoding: only when the leaf actually contains a `%XX` escape;
|
|
734
|
+
// decodeURIComponent in a try/catch (a malformed escape → skip, never throws).
|
|
735
|
+
// base64 is tried first (a `%`-bearing string is not base64), then percent.
|
|
736
|
+
// Length bounds (in UTF-16 units) outside which a leaf is never decoded.
const DECODE_MIN_LEN = 16;
const DECODE_MAX_LEN = 8192;
// Anchored alphabets: standard base64 (optional `=` padding) and base64url.
const BASE64_STD = /^[A-Za-z0-9+/]+={0,2}$/;
const BASE64_URL = /^[A-Za-z0-9_-]+$/;

// Try to decode a string leaf: base64/base64url first, percent-encoding second.
// Returns the decoded text, or null when the value is not a string, is outside
// the length bounds, or neither decoder accepts it.
function decodeLeaf(value) {
  if (typeof value !== "string") {
    return null;
  }
  const withinBounds = value.length >= DECODE_MIN_LEN && value.length <= DECODE_MAX_LEN;
  if (!withinBounds) {
    return null;
  }
  // Both decoders signal "not applicable" with null.
  return decodeBase64Leaf(value) ?? decodePercentLeaf(value);
}
|
|
174
751
|
|
|
175
|
-
|
|
752
|
+
// Decode `value` as base64 or base64url and return the result as UTF-8 text.
// Returns null when the value fits neither variant, decodes to zero bytes, does
// not re-encode to exactly `value`, or the bytes are not valid UTF-8.
function decodeBase64Leaf(value) {
  const remainder = value.length % 4;
  // Standard base64 needs a length that is a multiple of 4; base64url only rules
  // out a remainder of 1.
  let encoding = null;
  if (remainder === 0 && BASE64_STD.test(value)) {
    encoding = "base64";
  } else if (remainder !== 1 && BASE64_URL.test(value)) {
    encoding = "base64url";
  }
  if (encoding === null) {
    return null;
  }

  let bytes;
  try {
    bytes = Buffer.from(value, encoding);
  } catch {
    return null;
  }

  // Round-trip guard: the decoded bytes must re-encode to the exact leaf.
  const canonical = bytes.length > 0 && bytes.toString(encoding) === value;
  if (!canonical || !isUtf8(bytes)) {
    return null;
  }
  return bytes.toString("utf8");
}
|
|
787
|
+
|
|
788
|
+
// Percent-decode `value` with decodeURIComponent. Returns null when the value has
// no `%XX` escape, when decoding throws (malformed escape), or when decoding
// leaves the string unchanged; otherwise returns the decoded string.
function decodePercentLeaf(value) {
  const hasEscape = /%[0-9A-Fa-f]{2}/.test(value);
  if (!hasEscape) {
    return null;
  }
  try {
    const text = decodeURIComponent(value);
    return text === value ? null : text;
  } catch {
    // A malformed escape is treated as "not decodable" rather than an error.
    return null;
  }
}
|
|
806
|
+
|
|
807
|
+
// Sound precondition for Case 2: a match's {start,end} on the NFKC-normalized
|
|
808
|
+
// text map 1:1 onto the ORIGINAL value. True only when EVERY codepoint folds to
|
|
809
|
+
// the same number of UTF-16 units (so no interior offset shifts) AND the per-
|
|
810
|
+
// codepoint folds concatenate to the whole normalization (so no cross-boundary
|
|
811
|
+
// composition moved content). The bare `normalized.length === value.length` check
|
|
812
|
+
// is unsound — a contraction before the PII compensated by an expansion after it
|
|
813
|
+
// keeps the total length equal while shifting every interior offset, redacting the
|
|
814
|
+
// wrong bytes. Runs only on a leaf that actually folded (normalized !== value).
|
|
815
|
+
// True when offsets on `normalized` can be used directly on `value`: every code
// point of `value` NFKC-folds to the same number of UTF-16 units, and the
// per-code-point folds, concatenated, equal `normalized`.
function isPositionStableNfkc(value, normalized) {
  const pieces = [];
  for (const codepoint of value) {
    const piece = codepoint.normalize("NFKC");
    if (piece.length !== codepoint.length) {
      // A length-changing fold shifts every later offset.
      return false;
    }
    pieces.push(piece);
  }
  // Catches folds that only happen across code-point boundaries (composition).
  return pieces.join("") === normalized;
}
|
|
177
826
|
|
|
178
827
|
// Spans of Haechi's own transform markers in a string, so detection can skip
|
|
@@ -291,3 +940,317 @@ function krRrnValid(value) {
|
|
|
291
940
|
const check = (11 - (sum % 11)) % 10;
|
|
292
941
|
return check === Number(digits[12]);
|
|
293
942
|
}
|
|
943
|
+
|
|
944
|
+
// US SSN structural validity (SSA allocation rules). The format `AAA-GG-SSSS`
|
|
945
|
+
// alone collides with arbitrary 9-digit ids, so we reject the never-issued
|
|
946
|
+
// ranges: area 000, 666, and 900-999; group 00; serial 0000. This is what turns
|
|
947
|
+
// the loose shape into a high-precision detection.
|
|
948
|
+
// Structural check for a hyphenated `AAA-GG-SSSS` value. Rejects anything not in
// that exact shape, area 000 / 666 / 900 and above, group 00, and serial 0000.
function usSsnValid(value) {
  const parts = /^(\d{3})-(\d{2})-(\d{4})$/.exec(value);
  if (parts === null) {
    return false;
  }
  const [area, group, serial] = parts.slice(1).map(Number);
  const areaAllowed = area !== 0 && area !== 666 && area < 900;
  return areaAllowed && group !== 0 && serial !== 0;
}
|
|
967
|
+
|
|
968
|
+
// IBAN mod-97 checksum (ISO 7064 / ISO 13616). Move the first four chars to the
|
|
969
|
+
// end, map letters to 10-35, and the resulting integer must be congruent to 1
|
|
970
|
+
// mod 97. Computed digit-by-digit so the big integer never overflows. This
|
|
971
|
+
// checksum is the precision guarantee — random alnum runs almost never pass.
|
|
972
|
+
// IBAN mod-97 check. Whitespace is stripped and the value upper-cased; the first
// four characters are moved to the end, letters are expanded to two-digit
// numbers (A=10 … Z=35), and the resulting digit string must be ≡ 1 (mod 97).
// The remainder is folded digit by digit, so no big integer is ever built.
function ibanValid(value) {
  const iban = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{2}[A-Z0-9]{11,30}$/.test(iban)) {
    return false;
  }
  const rotated = `${iban.slice(4)}${iban.slice(0, 4)}`;
  const expanded = Array.from(rotated, (symbol) =>
    /\d/.test(symbol) ? symbol : String(symbol.charCodeAt(0) - 55)
  ).join("");
  const remainder = [...expanded].reduce((acc, digit) => (acc * 10 + Number(digit)) % 97, 0);
  return remainder === 1;
}
|
|
987
|
+
|
|
988
|
+
// Japan My Number (個人番号) check digit. The official scheme: over the first 11
|
|
989
|
+
// digits, P = 11 - (Σ n_i · Q_i mod 11), where n_i is the i-th digit FROM THE
|
|
990
|
+
// RIGHT of the 11-digit prefix and Q_i = i+1 for 1≤i≤6, i-5 for 7≤i≤11. When the
|
|
991
|
+
// remainder is 0 or 1 the check digit is 0. The 12th digit must equal P. This
|
|
992
|
+
// check digit is the precision guarantee — a random 12-digit id passes only 1
|
|
993
|
+
// time in 10, and the corpus hard-negative (a valid-shape, wrong-check value)
|
|
994
|
+
// proves the rejection.
|
|
995
|
+
// Check-digit test for a 12-digit number. Non-digits are stripped first. The
// first 11 digits are weighted from the right (positions 1-6 get weights 2-7,
// positions 7-11 get weights 2-6); the expected 12th digit is 0 when the
// weighted sum mod 11 is 0 or 1, otherwise 11 minus that remainder.
function jpMyNumberValid(value) {
  const digits = value.replace(/\D/g, "");
  if (digits.length !== 12) {
    return false;
  }
  const fromRight = [...digits.slice(0, 11)].reverse();
  const weighted = fromRight.reduce((acc, digit, offset) => {
    const position = offset + 1;
    const weight = position <= 6 ? position + 1 : position - 5;
    return acc + Number(digit) * weight;
  }, 0);
  const remainder = weighted % 11;
  const expected = remainder <= 1 ? 0 : 11 - remainder;
  return expected === Number(digits[11]);
}
|
|
1010
|
+
|
|
1011
|
+
// France NIR / INSEE social-security control key. The first 13 chars are the
|
|
1012
|
+
// body (sex, birth year/month, department, commune, order); the last 2 are the
|
|
1013
|
+
// control key, which must equal `97 - (body mod 97)`. The Corsica department is
|
|
1014
|
+
// written 2A/2B; the official rule substitutes 2A→19 and 2B→18 in the body
|
|
1015
|
+
// BEFORE the mod (the rest of the body is numeric). The control key is the
|
|
1016
|
+
// precision guarantee — a wrong key is rejected (corpus hard-negative).
|
|
1017
|
+
// Control-key check for a 15-character value (13-character body + 2-digit key).
// Whitespace, dots and hyphens are stripped and the value upper-cased. A `2A` /
// `2B` in the body is replaced by 19 / 18 before the body is reduced mod 97; the
// key must equal 97 minus that remainder.
function frNirValid(value) {
  const compact = value.replace(/[\s.-]/g, "").toUpperCase();
  const shapeOk = /^[12]\d{2}(?:\d{2}|0[1-9]|1[0-2]|20)(?:\d{2}|2[AB])\d{6}\d{2}$/.test(compact);
  if (!shapeOk) {
    return false;
  }
  const body = compact.slice(0, 13).replace("2A", "19").replace("2B", "18");
  if (!/^\d{13}$/.test(body)) {
    return false;
  }
  // Fold digit by digit so the 13-digit body never has to be a single number.
  const remainder = [...body].reduce((acc, digit) => (acc * 10 + Number(digit)) % 97, 0);
  return 97 - remainder === Number(compact.slice(13));
}
|
|
1036
|
+
|
|
1037
|
+
// Spain DNI/NIE check letter (mod-23 table). DNI is 8 digits + a letter; NIE is
|
|
1038
|
+
// a leading X/Y/Z (mapped to 0/1/2) + 7 digits + a letter. The letter is
|
|
1039
|
+
// `table[number mod 23]` where table = "TRWAGMYFPDXBNJZSQVHLCKE". The letter is
|
|
1040
|
+
// the precision guarantee — a structurally valid but wrong letter is rejected
|
|
1041
|
+
// (corpus hard-negative).
|
|
1042
|
+
// Check letters indexed by (number mod 23).
const ES_DNI_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
// Leading NIE letter → the digit it stands for in the checksum.
const ES_NIE_PREFIX = { X: "0", Y: "1", Z: "2" };

// Check-letter test for an 8-digit + letter value, or an X/Y/Z + 7-digit +
// letter value. Whitespace and hyphens are stripped and the value upper-cased.
function esDniValid(value) {
  const compact = value.replace(/[\s-]/g, "").toUpperCase();
  const isDni = /^\d{8}[A-Z]$/.test(compact);
  const isNie = !isDni && /^[XYZ]\d{7}[A-Z]$/.test(compact);
  if (!isDni && !isNie) {
    return false;
  }
  // Both shapes are 9 characters: 8 body characters, then the check letter.
  const numeric = isDni
    ? compact.slice(0, 8)
    : ES_NIE_PREFIX[compact[0]] + compact.slice(1, 8);
  return ES_DNI_TABLE[Number(numeric) % 23] === compact[8];
}
|
|
1059
|
+
|
|
1060
|
+
// Italy Codice Fiscale check character. Over the first 15 chars: odd positions
|
|
1061
|
+
// (1st, 3rd, … counting from 1) use the ODD table, even positions use the EVEN
|
|
1062
|
+
// table; sum the mapped values and the (sum mod 26)-th letter (A=0) must equal the
|
|
1063
|
+
// 16th char. The mixed alpha+digit structure + the mod-26 check character are the
|
|
1064
|
+
// precision guard — a structurally valid but wrong check char is rejected (corpus
|
|
1065
|
+
// hard-negative). Hard-block.
|
|
1066
|
+
// Value of a character at an ODD position (1st, 3rd, … counting from 1).
const IT_CF_ODD = {
  "0": 1, "1": 0, "2": 5, "3": 7, "4": 9, "5": 13, "6": 15, "7": 17, "8": 19, "9": 21,
  A: 1, B: 0, C: 5, D: 7, E: 9, F: 13, G: 15, H: 17, I: 19, J: 21, K: 2, L: 4, M: 18, N: 20,
  O: 11, P: 3, Q: 6, R: 8, S: 12, T: 14, U: 16, V: 10, W: 22, X: 25, Y: 24, Z: 23
};
// Value of a character at an EVEN position.
const IT_CF_EVEN = {
  "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
  A: 0, B: 1, C: 2, D: 3, E: 4, F: 5, G: 6, H: 7, I: 8, J: 9, K: 10, L: 11, M: 12, N: 13,
  O: 14, P: 15, Q: 16, R: 17, S: 18, T: 19, U: 20, V: 21, W: 22, X: 23, Y: 24, Z: 25
};
// Check character indexed by (sum mod 26).
const IT_CF_REMAINDER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Check-character test for a 16-character value. Whitespace is stripped and the
// value upper-cased; the first 15 characters are mapped through the odd/even
// tables, summed, and (sum mod 26) must select the 16th character.
function itCodiceFiscaleValid(value) {
  const cf = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]$/.test(cf)) {
    return false;
  }
  let total = 0;
  [...cf.slice(0, 15)].forEach((symbol, offset) => {
    // offset 0 is position 1, i.e. an odd position.
    const table = offset % 2 === 0 ? IT_CF_ODD : IT_CF_EVEN;
    total += table[symbol];
  });
  return IT_CF_REMAINDER.charAt(total % 26) === cf.charAt(15);
}
|
|
1090
|
+
|
|
1091
|
+
// Singapore NRIC/FIN check letter. Weighted sum (weights 2,7,6,5,4,3,2) over the
|
|
1092
|
+
// 7 digits, plus a per-prefix offset (T/G +4, M +3), mod 11, mapped through the
|
|
1093
|
+
// per-series letter table. S/T (citizen/PR), F/G (foreigner FIN), and M (post-2022
|
|
1094
|
+
// FIN) each have their own table. The prefix letter + check letter are the
|
|
1095
|
+
// precision guard — a wrong letter is rejected (corpus hard-negative). Hard-block.
|
|
1096
|
+
// Per-digit weights for the seven digits, left to right.
const SG_NRIC_WEIGHTS = [2, 7, 6, 5, 4, 3, 2];
// Check-letter tables, indexed by (weighted sum + prefix offset) mod 11.
const SG_NRIC_TABLE_ST = ["J", "Z", "I", "H", "G", "F", "E", "D", "C", "B", "A"];
const SG_NRIC_TABLE_FG = ["X", "W", "U", "T", "R", "Q", "P", "N", "M", "L", "K"];
// NOTE(review): some public reference implementations index this M-series table
// with (10 - remainder) rather than the remainder itself — confirm against a
// known-valid M-prefixed number.
const SG_NRIC_TABLE_M = ["K", "L", "J", "N", "P", "Q", "R", "T", "U", "W", "X"];

// Check-letter test for a prefix letter (S/T/F/G/M) + 7 digits + letter value.
// Whitespace is stripped and the value upper-cased. T and G add 4 to the
// weighted sum, M adds 3; the remainder mod 11 selects the expected letter from
// the table for that prefix.
function sgNricValid(value) {
  const v = value.replace(/\s/g, "").toUpperCase();
  if (!/^[STFGM]\d{7}[A-Z]$/.test(v)) {
    return false;
  }
  const prefix = v[0];
  const offsetByPrefix = { T: 4, G: 4, M: 3 };
  const tableByPrefix = {
    S: SG_NRIC_TABLE_ST,
    T: SG_NRIC_TABLE_ST,
    F: SG_NRIC_TABLE_FG,
    G: SG_NRIC_TABLE_FG,
    M: SG_NRIC_TABLE_M
  };
  const total = SG_NRIC_WEIGHTS.reduce(
    (acc, weight, offset) => acc + Number(v[offset + 1]) * weight,
    offsetByPrefix[prefix] ?? 0
  );
  return tableByPrefix[prefix][total % 11] === v[8];
}
|
|
1126
|
+
|
|
1127
|
+
// India Aadhaar Verhoeff checksum. The Verhoeff scheme runs the dihedral-group
|
|
1128
|
+
// multiplication (VERHOEFF_D) over each digit permuted by position (VERHOEFF_P)
|
|
1129
|
+
// from the right; the running value must be 0 for a valid full number. Aadhaar is
|
|
1130
|
+
// 12 digits and never starts 0 or 1. The Verhoeff check is the precision guard — a
|
|
1131
|
+
// wrong check digit is rejected (corpus hard-negative). Dial-eligible: Verhoeff
|
|
1132
|
+
// over a common 12-digit shape passes ~1/10 of random runs (the jp_mynumber
|
|
1133
|
+
// footgun), so it stays allowlist-clearable.
|
|
1134
|
+
// Verhoeff multiplication table: VERHOEFF_D[running][permutedDigit].
const VERHOEFF_D = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
  [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
  [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
  [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
  [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
  [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
  [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
  [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
];
// Verhoeff permutation table: VERHOEFF_P[positionFromRight % 8][digit].
const VERHOEFF_P = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
  [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
  [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
  [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
  [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
  [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
  [7, 0, 4, 6, 9, 1, 3, 2, 5, 8]
];

// True when the digit string (check digit included) reduces to 0 under the
// Verhoeff scheme. Digits are consumed right to left; `digits` is expected to
// contain decimal digits only.
function verhoeffValid(digits) {
  let state = 0;
  for (let fromRight = 0; fromRight < digits.length; fromRight += 1) {
    const digit = Number(digits[digits.length - 1 - fromRight]);
    const permuted = VERHOEFF_P[fromRight % 8][digit];
    state = VERHOEFF_D[state][permuted];
  }
  return state === 0;
}
|
|
1164
|
+
// 12-digit check: whitespace and hyphens are stripped, the first digit must be
// 2-9, and the whole number must pass verhoeffValid.
function inAadhaarValid(value) {
  const digits = value.replace(/[\s-]/g, "");
  const shapeOk = /^[2-9]\d{11}$/.test(digits);
  return shapeOk ? verhoeffValid(digits) : false;
}
|
|
1171
|
+
|
|
1172
|
+
// Germany tax ID (Steuer-Identifikationsnummer). Two guards: (1) the structural
|
|
1173
|
+
// rule that the first 10 digits contain exactly one repeated digit (one value
|
|
1174
|
+
// appears 2 or 3 times, the rest once), and (2) the ISO 7064 MOD 11,10 check digit
|
|
1175
|
+
// over the first 10 must equal the 11th. Dial-eligible: a bare 11-digit run with no
|
|
1176
|
+
// non-numeric anchor over a common length (the jp_mynumber discipline), even though
|
|
1177
|
+
// the combined guard is strong.
|
|
1178
|
+
// Structural rule on the leading digits: exactly one distinct character occurs
// more than once, and it occurs at most three times.
function deSteuerIdStructural(first10) {
  const tally = new Map();
  for (const symbol of first10) {
    const seen = tally.get(symbol) ?? 0;
    tally.set(symbol, seen + 1);
  }
  const repeated = [];
  for (const count of tally.values()) {
    if (count >= 2) {
      repeated.push(count);
    }
  }
  return repeated.length === 1 && repeated[0] <= 3;
}
|
|
1187
|
+
// Compute the check digit over the first ten digits with the iterative
// "mod 11,10" recurrence: the running product starts at 10; each step adds the
// digit mod 10 (0 is treated as 10), doubles, and reduces mod 11. The result is
// 11 minus the final product, with 10 mapped to 0.
function deSteuerIdCheckDigit(first10) {
  const leading = Array.from({ length: 10 }, (_, position) => Number(first10[position]));
  const product = leading.reduce((carry, digit) => {
    const sum = (digit + carry) % 10;
    return ((sum === 0 ? 10 : sum) * 2) % 11;
  }, 10);
  const check = 11 - product;
  return check === 10 ? 0 : check;
}
|
|
1202
|
+
// 11-digit check: whitespace and slashes are stripped; the first ten digits must
// satisfy deSteuerIdStructural and the eleventh must equal deSteuerIdCheckDigit
// of the first ten.
function deSteuerIdValid(value) {
  const digits = value.replace(/[\s/]/g, "");
  if (!/^\d{11}$/.test(digits)) {
    return false;
  }
  const leading = digits.slice(0, 10);
  const declared = Number(digits[10]);
  return deSteuerIdStructural(leading) && deSteuerIdCheckDigit(leading) === declared;
}
|
|
1213
|
+
|
|
1214
|
+
// Netherlands BSN "11-proef": Σ (digit_i · weight_i) ≡ 0 mod 11 over the 9 digits,
|
|
1215
|
+
// where the weights run 9,8,…,2 for the first eight and -1 for the last. The
|
|
1216
|
+
// all-zero number is rejected. Dial-eligible: 9 bare digits is very common, so the
|
|
1217
|
+
// 11-proef passes ~1/11 of random runs (the clearest jp_mynumber-style footgun).
|
|
1218
|
+
// 9-digit "11-proef": whitespace and dots are stripped, the all-zero number is
// rejected, and the digits weighted 9,8,7,6,5,4,3,2,-1 must sum to a multiple
// of 11.
function nlBsnValid(value) {
  const digits = value.replace(/[\s.]/g, "");
  if (!/^\d{9}$/.test(digits)) {
    return false;
  }
  if (/^0{9}$/.test(digits)) {
    return false;
  }
  const weights = [9, 8, 7, 6, 5, 4, 3, 2, -1];
  const total = weights.reduce((acc, weight, position) => acc + Number(digits[position]) * weight, 0);
  return total % 11 === 0;
}
|
|
1233
|
+
|
|
1234
|
+
// UK National Insurance Number — FORMAT-ONLY (no checksum exists), which is why
|
|
1235
|
+
// uk_nino stays OUT of HARD_BLOCK_TYPES (dial-eligible). The regex already
|
|
1236
|
+
// excludes the disallowed individual letters; this validator rejects the
|
|
1237
|
+
// documented invalid PREFIX PAIRS (BG, GB, NK, KN, TN, NT, ZZ) that the regex
|
|
1238
|
+
// cannot express as a negative set, plus the `O`-as-second-letter case (belt-and-
|
|
1239
|
+
// braces with the regex class). The administrative `TN`/`NT` and the temporary
|
|
1240
|
+
// `OO`/the suspended `BG` etc. are never issued, so excluding them lifts precision.
|
|
1241
|
+
// Two-letter prefixes that are rejected outright.
const UK_NINO_INVALID_PREFIXES = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);

// Format-only check for a two-letter + six-digit + A-D suffix value. Whitespace
// is stripped and the value upper-cased. Rejects the listed prefix pairs, a
// first letter in D/F/I/Q/U/V, and a second letter in D/F/I/O/Q/U/V.
function ukNinoValid(value) {
  const compact = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{6}[A-D]$/.test(compact)) {
    return false;
  }
  const [first, second] = compact;
  if (UK_NINO_INVALID_PREFIXES.has(`${first}${second}`)) {
    return false;
  }
  const firstAllowed = !/[DFIQUV]/.test(first);
  const secondAllowed = !/[DFIOQUV]/.test(second);
  return firstAllowed && secondAllowed;
}
|