haechi 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +57 -11
- package/README.md +57 -11
- package/docs/current/code-review-risk-register-2026-06-16.ko.md +377 -0
- package/docs/current/code-review-risk-register-2026-06-16.md +377 -0
- package/docs/current/config-version.ko.md +2 -2
- package/docs/current/config-version.md +2 -2
- package/docs/current/configuration.ko.md +28 -11
- package/docs/current/configuration.md +28 -11
- package/docs/current/operations-runbook.ko.md +36 -2
- package/docs/current/operations-runbook.md +39 -2
- package/docs/current/release-process.ko.md +5 -1
- package/docs/current/release-process.md +5 -1
- package/docs/current/risk-register-release-gate.ko.md +34 -8
- package/docs/current/risk-register-release-gate.md +34 -8
- package/docs/current/shared-responsibility.ko.md +12 -3
- package/docs/current/shared-responsibility.md +12 -3
- package/docs/current/threat-model.ko.md +7 -3
- package/docs/current/threat-model.md +7 -3
- package/examples/local-proxy-demo/README.md +51 -0
- package/examples/local-proxy-demo/demo.mjs +144 -0
- package/examples/local-proxy-demo/demo.tape +19 -0
- package/examples/local-proxy-demo/live-demo.mjs +121 -0
- package/examples/local-proxy-demo/live-demo.tape +25 -0
- package/haechi.config.example.json +2 -1
- package/package.json +3 -1
- package/packages/cli/bin/haechi.mjs +95 -5
- package/packages/cli/runtime.mjs +61 -1
- package/packages/core/index.mjs +15 -0
- package/packages/crypto/index.mjs +42 -20
- package/packages/filter/index.mjs +679 -6
- package/packages/privacy-profiles/index.mjs +72 -3
- package/packages/protocol-adapters/index.mjs +99 -1
- package/packages/proxy/index.mjs +270 -29
- package/packages/ssrf/index.mjs +60 -4
- package/packages/stream-filter/index.mjs +194 -17
|
@@ -1,10 +1,51 @@
|
|
|
1
|
+
import { isUtf8 } from "node:buffer";
|
|
2
|
+
|
|
1
3
|
// The hard-block detection types: a leak of one of these is a load-bearing
|
|
2
4
|
// fail-closed concern, so the WS2c precision dials (filters.minConfidence,
|
|
3
5
|
// filters.allowlist) may NOT suppress a detection of any of them. minConfidence
|
|
4
6
|
// trims only the precision-risky SOFT types; the allowlist's per-value/per-path
|
|
5
7
|
// exceptions are ignored for these types (the detection still fires). Exported
|
|
6
8
|
// so the core detect→decide path enforces the same exemption set the docs pin.
|
|
7
|
-
|
|
9
|
+
//
|
|
10
|
+
// Hard-block types are sensitive AND have a STRONG enough anchor that a match is
|
|
11
|
+
// effectively a true positive by construction, so the precision dials
|
|
12
|
+
// (filters.minConfidence / filters.allowlist) can never suppress them:
|
|
13
|
+
// - kr_rrn / card — checksum + constrained format
|
|
14
|
+
// - fr_nir (mod-97 over a long structured 15-digit run) and es_dni (mod-23 plus
|
|
15
|
+
// a required check LETTER suffix) — a random same-shaped value almost never
|
|
16
|
+
// passes, and the shapes are rare in ordinary payloads.
|
|
17
|
+
// - it_codice_fiscale — a 16-char MIXED alpha+digit shape with a mod-26 check
|
|
18
|
+
// CHARACTER. The non-numeric structural anchor (the rigid letter/digit layout)
|
|
19
|
+
// makes a benign 16-char `[A-Z]{6}\d{2}[A-Z]…` run in an ordinary payload
|
|
20
|
+
// implausible (measured collision ~3.8% over an already-rare shape), so a match
|
|
21
|
+
// is effectively a true positive.
|
|
22
|
+
// - sg_nric — a LETTER prefix ([STFGM]) + 7 digits + a CHECK LETTER. Two
|
|
23
|
+
// non-numeric anchors (prefix letter + checksum letter) over a rare shape
|
|
24
|
+
// (measured collision ~3.9% over the prefix+letter shape) make a benign FP
|
|
25
|
+
// implausible. Both anchored, un-allowlistable.
|
|
26
|
+
// DELIBERATELY DIAL-ELIGIBLE (NOT hard-block) — bare-digit runs whose only guard is
|
|
27
|
+
// a single numeric checksum over a COMMON digit length, so a benign id/counter FP is
|
|
28
|
+
// plausible and the operator needs the allowlist/minConfidence escape hatch (the
|
|
29
|
+
// jp_mynumber precedent). They still detect + (per profile) block by default:
|
|
30
|
+
// - jp_mynumber — a bare 12-digit run + a SINGLE mod-11 check digit (measured
|
|
31
|
+
// ~9% of random 12-digit numbers pass), and 12-digit ids/counters are common.
|
|
32
|
+
// - uk_nino — NO checksum exists (format + invalid-prefix exclusions only), the
|
|
33
|
+
// largest FP surface.
|
|
34
|
+
// - in_aadhaar — a bare 12-digit run + the Verhoeff checksum (measured ~9.9% of
|
|
35
|
+
// random 12-digit runs pass, the same order as jp_mynumber's mod-11). Aadhaar
|
|
36
|
+
// is extremely sensitive, but Verhoeff over the COMMON 12-digit shape is exactly
|
|
37
|
+
// the jp_mynumber footgun (a 12-digit id/counter is common), so it stays
|
|
38
|
+
// allowlist-clearable rather than un-suppressable.
|
|
39
|
+
// - de_steuer_id — a bare 11-digit run + ISO 7064 MOD 11,10 plus a structural
|
|
40
|
+
// "exactly one repeated digit" test. The combined guard is strong (measured
|
|
41
|
+
// ~0.37% collision), BUT there is NO non-numeric anchor and 11-digit ids are
|
|
42
|
+
// common in payloads, so per the jp_mynumber discipline (a bare-digit shape over
|
|
43
|
+
// a common length) it stays DIAL-ELIGIBLE so an operator can clear an 11-digit-id
|
|
44
|
+
// FP. It still detects + blocks by default.
|
|
45
|
+
// - nl_bsn — a bare 9-digit run + the "11-proef" weighted mod-11 (measured ~9.1%
|
|
46
|
+
// of random 9-digit runs pass). 9 bare digits is VERY common, so this is the
|
|
47
|
+
// clearest dial-eligible case.
|
|
48
|
+
// Detection types that the precision dials (filters.minConfidence /
// filters.allowlist) are never allowed to suppress.
export const HARD_BLOCK_TYPES = new Set([
  "secret",
  "api_key",
  "kr_rrn",
  "card",
  "fr_nir",
  "es_dni",
  "it_codice_fiscale",
  "sg_nric"
]);
|
|
8
49
|
|
|
9
50
|
const DEFAULT_RULES = [
|
|
10
51
|
{
|
|
@@ -94,6 +135,76 @@ const DEFAULT_RULES = [
|
|
|
94
135
|
flags: "g",
|
|
95
136
|
confidence: 0.9
|
|
96
137
|
},
|
|
138
|
+
{
|
|
139
|
+
// Anthropic API key: `sk-ant-` + a long body. Ordered BEFORE the OpenAI rule
|
|
140
|
+
// below so a Claude key is attributed to its own rule (both emit `secret`, so
|
|
141
|
+
// removeOverlaps collapsing the shared span to either is type-identical — the
|
|
142
|
+
// ordering is for ruleId attribution, not for the scored type). The HYPHEN
|
|
143
|
+
// after `sk` is load-bearing: it keeps this OFF the underscore-based Stripe/
|
|
144
|
+
// OpenAI-platform `sk_` rule (openai-like-key, `api_key`), so the two never
|
|
145
|
+
// collide on the same span.
|
|
146
|
+
id: "anthropic-api-key",
|
|
147
|
+
type: "secret",
|
|
148
|
+
pattern: "\\bsk-ant-[A-Za-z0-9_-]{16,}\\b",
|
|
149
|
+
flags: "g",
|
|
150
|
+
confidence: 0.95
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
// OpenAI API key: `sk-` (and project keys `sk-proj-`) + a long base62-ish
|
|
154
|
+
// body. The HYPHEN is the disambiguator from Stripe/OpenAI-platform `sk_`
|
|
155
|
+
// (underscore — handled by openai-like-key as `api_key`); this rule never
|
|
156
|
+
// matches an underscore form, so the two prefixes do not overlap. A >=20-char
|
|
157
|
+
// body keeps a bare `sk-foo` slug from firing. The Anthropic `sk-ant-` rule
|
|
158
|
+
// above is a stricter sibling that runs first.
|
|
159
|
+
id: "openai-api-key",
|
|
160
|
+
type: "secret",
|
|
161
|
+
pattern: "\\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\\b",
|
|
162
|
+
flags: "g",
|
|
163
|
+
confidence: 0.9
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
// Google OAuth client secret: anchored `GOCSPX-` + exactly 28 chars from the
|
|
167
|
+
// URL-safe alphabet. Fixed prefix + fixed length = high precision (this is the
|
|
168
|
+
// OAuth client secret, distinct from the `AIza` API key above).
|
|
169
|
+
id: "google-oauth-client-secret",
|
|
170
|
+
type: "secret",
|
|
171
|
+
pattern: "\\bGOCSPX-[A-Za-z0-9_-]{28}\\b",
|
|
172
|
+
flags: "g",
|
|
173
|
+
confidence: 0.95
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
// SendGrid API key: `SG.` + 22 URL-safe chars + `.` + 43 URL-safe chars. The
|
|
177
|
+
// two fixed-length dotted segments after the `SG.` prefix are what make this
|
|
178
|
+
// high-precision — a bare `SG.`-prefixed string of the wrong shape is rejected.
|
|
179
|
+
id: "sendgrid-api-key",
|
|
180
|
+
type: "api_key",
|
|
181
|
+
pattern: "\\bSG\\.[A-Za-z0-9_-]{22}\\.[A-Za-z0-9_-]{43}\\b",
|
|
182
|
+
flags: "g",
|
|
183
|
+
confidence: 0.95
|
|
184
|
+
},
|
|
185
|
+
{
|
|
186
|
+
// Twilio Account SID (AC…) / API Key SID (SK…): the fixed `AC`/`SK` prefix +
|
|
187
|
+
// EXACTLY 32 HEX chars. The hex-only body (not base62) is the precision guard:
|
|
188
|
+
// a random alphanumeric run of the same length carries non-hex letters and is
|
|
189
|
+
// rejected, and the `SK` form does not collide with the underscore/hyphen
|
|
190
|
+
// `sk_`/`sk-` rules. Twilio's bare 32-hex AUTH TOKEN is deliberately NOT a
|
|
191
|
+
// standalone rule (a prefix-less 32-hex run is indistinguishable from an MD5
|
|
192
|
+
// hash / id) — it is caught via the `<key> = <value>` assignment vocabulary.
|
|
193
|
+
id: "twilio-sid",
|
|
194
|
+
type: "api_key",
|
|
195
|
+
pattern: "\\b(?:AC|SK)[0-9a-fA-F]{32}\\b",
|
|
196
|
+
flags: "g",
|
|
197
|
+
confidence: 0.9
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
// npm access token: anchored `npm_` + exactly 36 base62 chars. Fixed prefix +
|
|
201
|
+
// fixed length = high precision.
|
|
202
|
+
id: "npm-token",
|
|
203
|
+
type: "secret",
|
|
204
|
+
pattern: "\\bnpm_[A-Za-z0-9]{36}\\b",
|
|
205
|
+
flags: "g",
|
|
206
|
+
confidence: 0.95
|
|
207
|
+
},
|
|
97
208
|
{
|
|
98
209
|
// JWT: three dot-separated base64url segments where the FIRST starts with
|
|
99
210
|
// `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
|
|
@@ -139,7 +250,12 @@ const DEFAULT_RULES = [
|
|
|
139
250
|
// client secrets, PEM/private keys, access/refresh tokens) so a
|
|
140
251
|
// `<key> = <value>` leak is caught even when the value itself has no
|
|
141
252
|
// self-describing prefix (e.g. an AWS secret access key is bare base64).
|
|
142
|
-
|
|
253
|
+
// `accountkey` catches the Azure Storage connection-string `AccountKey=<88-
|
|
254
|
+
// char base64>=` segment — an un-anchored 88-char base64 rule would false-fire
|
|
255
|
+
// on any blob, so the `AccountKey=` assignment context is the precision anchor.
|
|
256
|
+
// `auth[_-]?token` catches the Twilio auth token (a bare 32-hex run with no
|
|
257
|
+
// self-describing prefix) when it is leaked as a `<key> = <value>` pair.
|
|
258
|
+
pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|auth[_-]?token|accountkey|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
|
|
143
259
|
flags: "gi",
|
|
144
260
|
confidence: 0.85
|
|
145
261
|
},
|
|
@@ -166,6 +282,136 @@ const DEFAULT_RULES = [
|
|
|
166
282
|
confidence: 0.9,
|
|
167
283
|
validate: ibanValid
|
|
168
284
|
},
|
|
285
|
+
{
|
|
286
|
+
// Japan My Number (個人番号): EXACTLY 12 digits with the official mod-11
|
|
287
|
+
// weighted check digit over the first 11. A bare 12-digit run is ambiguous
|
|
288
|
+
// (an id/timestamp), so jpMyNumberValid is the precision guard — only a run
|
|
289
|
+
// whose 12th digit equals the prescribed check digit fires. The leading/
|
|
290
|
+
// trailing boundaries (`(?<![\d-])`/`(?![\d-])`) stop the rule from matching
|
|
291
|
+
// a 12-digit window inside a longer digit/dashed run. NOT hard-block: a single
|
|
292
|
+
// mod-11 check digit only rejects ~10/11 of random 12-digit runs, and such
|
|
293
|
+
// runs are common (ids/counters), so a benign FP is plausible and the operator
|
|
294
|
+
// keeps the allowlist escape hatch (it still detects + blocks by default).
|
|
295
|
+
id: "jp-mynumber",
|
|
296
|
+
type: "jp_mynumber",
|
|
297
|
+
pattern: "(?<![\\d-])\\d{12}(?![\\d-])",
|
|
298
|
+
flags: "g",
|
|
299
|
+
confidence: 0.9,
|
|
300
|
+
validate: jpMyNumberValid
|
|
301
|
+
},
|
|
302
|
+
{
|
|
303
|
+
// France NIR / INSEE social-security: 15 chars where the department field may
|
|
304
|
+
// carry the Corsica `2A`/`2B` letters, validated by the control key
|
|
305
|
+
// `97 - (first13 mod 97) == last2` (Corsica 2A→19, 2B→18 before the mod).
|
|
306
|
+
// The control key is the precision guard — a wrong key is rejected. The
|
|
307
|
+
// department alpha is optional so the pure-numeric form also matches. Anchored
|
|
308
|
+
// on word boundaries; hard-block (checksummed).
|
|
309
|
+
id: "fr-nir",
|
|
310
|
+
type: "fr_nir",
|
|
311
|
+
pattern: "(?<![\\w-])[12]\\d{2}(?:0[1-9]|1[0-2]|20)(?:\\d{2}|2[AB])\\d{6}\\d{2}(?![\\w-])",
|
|
312
|
+
flags: "g",
|
|
313
|
+
confidence: 0.9,
|
|
314
|
+
validate: frNirValid
|
|
315
|
+
},
|
|
316
|
+
{
|
|
317
|
+
// Spain DNI/NIE: 8 digits (DNI) or a leading X/Y/Z + 7 digits (NIE) + a check
|
|
318
|
+
// letter from the mod-23 table (NIE maps X/Y/Z→0/1/2 before the mod). The
|
|
319
|
+
// check letter is the precision guard — a wrong letter is rejected. The
|
|
320
|
+
// letters that can never appear (I/O/U) are excluded from the suffix class so
|
|
321
|
+
// an ordinary `<8-digit><letter>` token rarely even reaches the validator.
|
|
322
|
+
// Hard-block (checksummed).
|
|
323
|
+
id: "es-dni-nie",
|
|
324
|
+
type: "es_dni",
|
|
325
|
+
pattern: "(?<![\\w-])[XYZ]?\\d{7,8}[A-HJ-NP-TV-Z](?![\\w-])",
|
|
326
|
+
flags: "gi",
|
|
327
|
+
confidence: 0.85,
|
|
328
|
+
validate: esDniValid
|
|
329
|
+
},
|
|
330
|
+
{
|
|
331
|
+
// Italy Codice Fiscale: 16 chars in the rigid layout
|
|
332
|
+
// [A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z] (surname, name, year, month-letter,
|
|
333
|
+
// day, place code, check character). The 16th char is the official check
|
|
334
|
+
// character: sum the odd/even-position table values over the first 15 chars
|
|
335
|
+
// and map (sum mod 26) to a letter. The mixed alpha+digit structure + the
|
|
336
|
+
// mod-26 check character are the precision guard. Hard-block (strong
|
|
337
|
+
// non-numeric anchor over a rare shape).
|
|
338
|
+
id: "it-codice-fiscale",
|
|
339
|
+
type: "it_codice_fiscale",
|
|
340
|
+
pattern: "(?<![A-Z0-9])[A-Z]{6}\\d{2}[A-Z]\\d{2}[A-Z]\\d{3}[A-Z](?![A-Z0-9])",
|
|
341
|
+
flags: "gi",
|
|
342
|
+
confidence: 0.9,
|
|
343
|
+
validate: itCodiceFiscaleValid
|
|
344
|
+
},
|
|
345
|
+
{
|
|
346
|
+
// Singapore NRIC/FIN: a series LETTER ([STFGM]) + 7 digits + a CHECK LETTER.
|
|
347
|
+
// The check letter is a weighted sum (weights 2,7,6,5,4,3,2) plus a per-prefix
|
|
348
|
+
// offset (T/G +4, M +3), mod 11, looked up in the per-series letter table. The
|
|
349
|
+
// prefix letter + check letter are the precision guard. Hard-block (two
|
|
350
|
+
// non-numeric anchors over a rare shape).
|
|
351
|
+
id: "sg-nric",
|
|
352
|
+
type: "sg_nric",
|
|
353
|
+
pattern: "(?<![A-Z0-9])[STFGMstfgm]\\d{7}[A-Za-z](?![A-Z0-9])",
|
|
354
|
+
flags: "g",
|
|
355
|
+
confidence: 0.9,
|
|
356
|
+
validate: sgNricValid
|
|
357
|
+
},
|
|
358
|
+
{
|
|
359
|
+
// India Aadhaar: 12 digits (never starting 0 or 1) with the Verhoeff checksum
|
|
360
|
+
// over all 12. A bare 12-digit run is ambiguous (an id/timestamp), so the
|
|
361
|
+
// Verhoeff check is the precision guard. NOT hard-block: Verhoeff over the
|
|
362
|
+
// COMMON 12-digit shape passes ~1/10 of random runs (the jp_mynumber footgun),
|
|
363
|
+
// so it stays dial-eligible (still detects + blocks by default). The leading/
|
|
364
|
+
// trailing boundaries stop the rule from matching a 12-digit window inside a
|
|
365
|
+
// longer digit/dashed run.
|
|
366
|
+
id: "in-aadhaar",
|
|
367
|
+
type: "in_aadhaar",
|
|
368
|
+
pattern: "(?<![\\d-])[2-9]\\d{11}(?![\\d-])",
|
|
369
|
+
flags: "g",
|
|
370
|
+
confidence: 0.85,
|
|
371
|
+
validate: inAadhaarValid
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
// Germany tax ID (Steuer-Identifikationsnummer): 11 digits with the ISO 7064
|
|
375
|
+
// MOD 11,10 check digit over the first 10, plus the structural rule that the
|
|
376
|
+
// first 10 digits contain exactly one repeated digit (one value appears 2 or 3
|
|
377
|
+
// times, the rest once). The combined guard is strong, but it is a BARE-DIGIT
|
|
378
|
+
// run with no non-numeric anchor over a common 11-digit length, so per the
|
|
379
|
+
// jp_mynumber discipline it stays dial-eligible (the operator can clear an
|
|
380
|
+
// 11-digit-id FP).
|
|
381
|
+
id: "de-steuer-id",
|
|
382
|
+
type: "de_steuer_id",
|
|
383
|
+
pattern: "(?<![\\d-])\\d{11}(?![\\d-])",
|
|
384
|
+
flags: "g",
|
|
385
|
+
confidence: 0.85,
|
|
386
|
+
validate: deSteuerIdValid
|
|
387
|
+
},
|
|
388
|
+
{
|
|
389
|
+
// Netherlands BSN (Burgerservicenummer): 9 digits validated by the "11-proef"
|
|
390
|
+
// weighted mod-11 (Σ digit_i · weight_i ≡ 0 mod 11, with the last weight -1).
|
|
391
|
+
// 9 bare digits is VERY common, so the 11-proef passes ~1/11 of random runs —
|
|
392
|
+
// the clearest dial-eligible case (still detects + blocks by default; the
|
|
393
|
+
// operator keeps the allowlist escape hatch).
|
|
394
|
+
id: "nl-bsn",
|
|
395
|
+
type: "nl_bsn",
|
|
396
|
+
pattern: "(?<![\\d-])\\d{9}(?![\\d-])",
|
|
397
|
+
flags: "g",
|
|
398
|
+
confidence: 0.8,
|
|
399
|
+
validate: nlBsnValid
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
// UK National Insurance Number: two prefix letters + 6 digits + a suffix
|
|
403
|
+
// A-D. There is NO checksum, so this is FORMAT-ONLY and stays OUT of
|
|
404
|
+
// HARD_BLOCK_TYPES (dial-eligible). The pattern bakes in the documented
|
|
405
|
+
// invalid-prefix exclusions: 1st letter never D/F/I/Q/U/V, 2nd letter never
|
|
406
|
+
// D/F/I/O/Q/U/V, and the disallowed pairs BG/GB/NK/KN/TN/NT/ZZ are rejected
|
|
407
|
+
// by ukNinoValid (a negative-set the regex can't express cleanly).
|
|
408
|
+
id: "uk-nino",
|
|
409
|
+
type: "uk_nino",
|
|
410
|
+
pattern: "(?<![\\w-])[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D](?![\\w-])",
|
|
411
|
+
flags: "g",
|
|
412
|
+
confidence: 0.7,
|
|
413
|
+
validate: ukNinoValid
|
|
414
|
+
},
|
|
169
415
|
{
|
|
170
416
|
// E.164 international phone: ONLY with a leading `+` (a bare digit run is an
|
|
171
417
|
// id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
|
|
@@ -232,8 +478,13 @@ const DEFAULT_RULES = [
|
|
|
232
478
|
}
|
|
233
479
|
];
|
|
234
480
|
|
|
235
|
-
export function createDefaultFilterEngine({ customRules = [] } = {}) {
|
|
481
|
+
export function createDefaultFilterEngine({ customRules = [], decodeAndRescan = false } = {}) {
|
|
236
482
|
const rules = DEFAULT_RULES.concat(customRules.map(normalizeCustomRule));
|
|
483
|
+
// The opt-in base64/percent decode-and-rescan pass (WS2d residual). Default OFF
|
|
484
|
+
// => byte-identical to prior behavior. Held in the engine CLOSURE, NOT threaded
|
|
485
|
+
// through the protect `context`: the request context is data and must not carry
|
|
486
|
+
// this control flag (it would pollute tokenize AAD / audit).
|
|
487
|
+
const decodeOptions = { decodeAndRescan: decodeAndRescan === true };
|
|
237
488
|
|
|
238
489
|
return {
|
|
239
490
|
id: "haechi.filter.default",
|
|
@@ -243,12 +494,32 @@ export function createDefaultFilterEngine({ customRules = [] } = {}) {
|
|
|
243
494
|
networkEgress: false
|
|
244
495
|
},
|
|
245
496
|
async detect({ entries, context }) {
|
|
246
|
-
return entries.flatMap((entry) => detectEntry(entry, rules, context));
|
|
497
|
+
return entries.flatMap((entry) => detectEntry(entry, rules, context, decodeOptions));
|
|
247
498
|
}
|
|
248
499
|
};
|
|
249
500
|
}
|
|
250
501
|
|
|
251
|
-
export function detectEntry(entry, rules, context = {}) {
|
|
502
|
+
/**
 * Scan one leaf entry and, when opted in, additionally rescan its decoded form.
 *
 * The base scan always runs. Only when `options.decodeAndRescan` is strictly
 * `true` is the decode-and-rescan pass attempted; any detections it yields are
 * appended after the base detections.
 *
 * @param {object} entry - The leaf being scanned.
 * @param {Array<object>} rules - Rules to evaluate.
 * @param {object} [context] - Request context forwarded to both passes.
 * @param {object} [options] - Engine-level flags; only `decodeAndRescan` is read.
 * @returns {Array<object>} Base detections, plus decoded detections when any fired.
 */
export function detectEntry(entry, rules, context = {}, options = {}) {
  const scanned = scanEntry(entry, rules, context);

  // Default path: flag absent or not exactly `true` → the base scan result as-is.
  const rescanEnabled = options?.decodeAndRescan === true;
  if (!rescanEnabled) {
    return scanned;
  }

  const decodedHits = decodeAndRescanEntry(entry, rules, context);
  // Hand back the base array itself when the decode pass found nothing.
  return decodedHits.length > 0 ? scanned.concat(decodedHits) : scanned;
}
|
|
518
|
+
|
|
519
|
+
// The original per-leaf NFKC scan (WS2d), unchanged. Extracted from detectEntry so
|
|
520
|
+
// the opt-in decode-and-rescan pass wraps it without touching the byte-identical
|
|
521
|
+
// default path.
|
|
522
|
+
function scanEntry(entry, rules, context = {}) {
|
|
252
523
|
const detections = [];
|
|
253
524
|
// On the RESPONSE direction, a bare JSON NUMBER leaf is inference-server
|
|
254
525
|
// metadata (a nanosecond `*_duration`, a token count, a numeric id/timestamp) —
|
|
@@ -364,13 +635,19 @@ function scanForDetections(scanText, rules, context, markerSpans, entry, origina
|
|
|
364
635
|
// value: the whole original leaf). The response-direction marker skip does NOT
|
|
365
636
|
// apply here: a length-divergent leaf cannot BE a Haechi marker (markers are ASCII
|
|
366
637
|
// and NFKC-stable), so an evasion attempt can never masquerade as one.
|
|
367
|
-
function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
|
|
638
|
+
function wholeLeafDetections(normalized, rules, context, entry, originalValue, ruleFilter = null) {
|
|
368
639
|
const seenTypes = new Set();
|
|
369
640
|
const detections = [];
|
|
370
641
|
for (const rule of rules) {
|
|
371
642
|
if (rule.direction && rule.direction !== context?.direction) {
|
|
372
643
|
continue;
|
|
373
644
|
}
|
|
645
|
+
// The decode-and-rescan caller passes a precision filter so only validator-
|
|
646
|
+
// backed / hard-block rules can fire on decoded text (random base64 guard).
|
|
647
|
+
// The Case-3 NFKC caller passes nothing → every rule is eligible (unchanged).
|
|
648
|
+
if (ruleFilter && !ruleFilter(rule)) {
|
|
649
|
+
continue;
|
|
650
|
+
}
|
|
374
651
|
if (seenTypes.has(rule.type)) {
|
|
375
652
|
continue;
|
|
376
653
|
}
|
|
@@ -401,6 +678,132 @@ function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
|
|
|
401
678
|
return detections;
|
|
402
679
|
}
|
|
403
680
|
|
|
681
|
+
// WS2d residual — opt-in base64/percent decode-and-rescan (default OFF). An
|
|
682
|
+
// always-on decode is false-positive-prone (random base64 decodes to bytes that
|
|
683
|
+
// can shape-match a soft rule), so this is gated behind `filters.decodeAndRescan`
|
|
684
|
+
// AND a precision guard: a decoded hit only fires when it is VALIDATOR-BACKED or a
|
|
685
|
+
// HARD-BLOCK type (a Luhn-passing card, a checksum kr_rrn/us_ssn, an IBAN mod-97,
|
|
686
|
+
// or a secret/api_key on its anchored rule). A decoded soft-type-without-validator
|
|
687
|
+
// match (a bare phone-shaped run in random decoded bytes) does NOT fire — requiring
|
|
688
|
+
// validators keeps precision ~100% (random base64 Luhn-passing as a 16-digit card
|
|
689
|
+
// is astronomically unlikely).
|
|
690
|
+
//
|
|
691
|
+
// OFFSET HANDLING (fail closed): a detection found in the DECODED text has no valid
|
|
692
|
+
// offset in the original encoded leaf (decoding remaps everything), so we emit a
|
|
693
|
+
// WHOLE-LEAF detection per matched type (start:0, end:leaf.length, value: the whole
|
|
694
|
+
// original encoded leaf) — exactly the WS2d Case-3 path. The transform then
|
|
695
|
+
// redacts/blocks the entire encoded leaf. We never map a decoded offset back.
|
|
696
|
+
/**
 * Decode a leaf (base64 / percent) and rescan the decoded text.
 *
 * Any hit is reported through the whole-leaf path against the ORIGINAL encoded
 * value (`entry.value`), restricted to rules accepted by `isDecodeEligibleRule`.
 *
 * @param {object} entry - Leaf entry; `entry.kind` and `entry.value` are read.
 * @param {Array<object>} rules - Candidate rules.
 * @param {object} context - Request context forwarded to the whole-leaf scan.
 * @returns {Array<object>} Whole-leaf detections, or `[]` when nothing decodes.
 */
function decodeAndRescanEntry(entry, rules, context) {
  // Number leaves are never decoded; everything else is offered to decodeLeaf,
  // which itself answers null for anything that is not a decodable string.
  const decodedText = entry.kind === "number" ? null : decodeLeaf(entry.value);
  if (decodedText === null) {
    return [];
  }
  // Scan the decoded text, but record the detection over the encoded leaf.
  return wholeLeafDetections(decodedText, rules, context, entry, entry.value, isDecodeEligibleRule);
}
|
|
712
|
+
|
|
713
|
+
// A decoded whole-leaf detection only fires for a "meaningful" hit: a hard-block
|
|
714
|
+
// type (any member of HARD_BLOCK_TYPES: secret/api_key/kr_rrn/card/…) on its anchored rule, OR a validator-backed
|
|
715
|
+
// type. The `phone` type is excluded even though kr-phone carries a `validate`
|
|
716
|
+
// helper — that helper is a trunk-prefix heuristic, not a checksum, so a phone-
|
|
717
|
+
// shaped run in random decoded bytes must NOT fire (the spec's named exclusion).
|
|
718
|
+
/**
 * Precision filter for the decode-and-rescan pass.
 *
 * A rule may fire on decoded text when its type is in HARD_BLOCK_TYPES, or when
 * it carries a `validate` function and its type is not `phone`.
 *
 * NOTE(review): any `validate` function qualifies, not only checksum validators;
 * confirm that format-only validators are meant to be eligible here.
 *
 * @param {object} rule - Rule descriptor; `type` and `validate` are read.
 * @returns {boolean} `true` when the rule is allowed to fire on decoded text.
 */
function isDecodeEligibleRule(rule) {
  return (
    HARD_BLOCK_TYPES.has(rule.type) ||
    (typeof rule.validate === "function" && rule.type !== "phone")
  );
}
|
|
724
|
+
|
|
725
|
+
// Attempt to decode a string leaf to UTF-8 text, returning the decoded string or
|
|
726
|
+
// null when the leaf does not look like (or does not cleanly round-trip as) an
|
|
727
|
+
// encoded value. Two encodings, each precision-guarded so a benign value is skipped
|
|
728
|
+
// rather than mis-decoded:
|
|
729
|
+
// - base64 / base64url: the leaf must LOOK like base64 (no spaces, the base64 or
|
|
730
|
+
// base64url alphabet, a valid length for that variant) within bounds, decode to
|
|
731
|
+
// VALID UTF-8, and RE-ENCODE back to exactly the leaf (rejects the bytes that
|
|
732
|
+
// Buffer.from leniently accepts but are not the canonical encoding of the leaf).
|
|
733
|
+
// - percent-encoding: only when the leaf actually contains a `%XX` escape;
|
|
734
|
+
// decodeURIComponent in a try/catch (a malformed escape → skip, never throws).
|
|
735
|
+
// base64 is tried first (a `%`-bearing string is not base64), then percent.
|
|
736
|
+
// Only leaves whose length lies within [DECODE_MIN_LEN, DECODE_MAX_LEN] are
// considered for decoding.
const DECODE_MIN_LEN = 16;
const DECODE_MAX_LEN = 8192;
// Alphabet shapes. BASE64_STD tolerates up to two trailing `=`; decodeBase64Leaf
// applies both to the padding-stripped body, so there they test the alphabet only.
const BASE64_STD = /^[A-Za-z0-9+/]+={0,2}$/;
const BASE64_URL = /^[A-Za-z0-9_-]+$/;

/**
 * Try to decode a string leaf to text: base64/base64url first, then
 * percent-encoding.
 *
 * @param {*} value - Leaf value; anything that is not a string within the
 *   length bounds yields `null`.
 * @returns {string|null} The decoded text, or `null` when the leaf does not decode.
 */
function decodeLeaf(value) {
  if (typeof value !== "string" || value.length < DECODE_MIN_LEN || value.length > DECODE_MAX_LEN) {
    return null;
  }
  const base64 = decodeBase64Leaf(value);
  if (base64 !== null) {
    return base64;
  }
  return decodePercentLeaf(value);
}

/**
 * Decode a leaf that is a canonical base64 or base64url encoding of UTF-8 text.
 *
 * Both alphabets are accepted with OR without `=` padding: padded base64url is
 * the RFC 4648 default (and what e.g. Python's `urlsafe_b64encode` emits), and
 * unpadded standard base64 is common too. When padding is present it must be
 * exactly the amount the body length requires. The precision guards are kept:
 * the body must re-encode to itself (canonical form) and the bytes must be
 * valid UTF-8.
 *
 * @param {*} value - Candidate encoded leaf.
 * @returns {string|null} Decoded UTF-8 text, or `null` when the leaf is not a
 *   canonical base64/base64url encoding of UTF-8 text.
 */
function decodeBase64Leaf(value) {
  if (typeof value !== "string") {
    return null;
  }

  // Split into body + trailing padding. Anything after the first `=` must be
  // padding only, and at most two characters of it.
  const padStart = value.indexOf("=");
  const body = padStart === -1 ? value : value.slice(0, padStart);
  const padding = value.length - body.length;
  if (padding > 2 || value.slice(body.length) !== "=".repeat(padding)) {
    return null;
  }

  // The body must be drawn from exactly one alphabet (a `%` or whitespace fails
  // both anchored regexes). Alphanumeric-only bodies decode identically either way.
  let encoding;
  if (BASE64_STD.test(body)) {
    encoding = "base64";
  } else if (BASE64_URL.test(body)) {
    encoding = "base64url";
  } else {
    return null;
  }

  // A body length of 1 mod 4 is impossible for any byte run. Padding is
  // optional, but if present it must complete the final quantum exactly.
  const remainder = body.length % 4;
  if (remainder === 1) {
    return null;
  }
  const requiredPadding = (4 - remainder) % 4;
  if (padding !== 0 && padding !== requiredPadding) {
    return null;
  }

  let bytes;
  try {
    bytes = Buffer.from(body, encoding);
  } catch {
    return null;
  }
  if (bytes.length === 0) {
    return null;
  }

  // Round-trip guard: Buffer.from is lenient (it ignores stray chars and
  // non-zero trailing bits), so a non-canonical string can "decode".
  // Re-encoding must reproduce the exact body, compared without padding.
  const reencoded = encoding === "base64"
    ? bytes.toString("base64").replace(/=+$/, "")
    : bytes.toString("base64url");
  if (reencoded !== body) {
    return null;
  }

  // The decoded bytes must be valid UTF-8 text; random base64 usually is not,
  // which makes this a cheap false-positive filter before any rule runs.
  if (!isUtf8(bytes)) {
    return null;
  }
  return bytes.toString("utf8");
}
|
|
787
|
+
|
|
788
|
+
/**
 * Percent-decode a leaf with `decodeURIComponent`.
 *
 * @param {string} value - Candidate leaf.
 * @returns {string|null} The decoded text, or `null` when the leaf has no `%XX`
 *   escape, contains a malformed escape, or decodes to itself.
 */
function decodePercentLeaf(value) {
  // Without at least one real `%XX` escape there is nothing to decode.
  const hasEscape = /%[0-9A-Fa-f]{2}/.test(value);
  if (!hasEscape) {
    return null;
  }
  try {
    const text = decodeURIComponent(value);
    // An unchanged string would only be rescanned for nothing.
    return text === value ? null : text;
  } catch {
    // Malformed escape (e.g. a bare `%` or `%zz`) → skip rather than throw.
    return null;
  }
}
|
|
806
|
+
|
|
404
807
|
// Sound precondition for Case 2: a match's {start,end} on the NFKC-normalized
|
|
405
808
|
// text map 1:1 onto the ORIGINAL value. True only when EVERY codepoint folds to
|
|
406
809
|
// the same number of UTF-16 units (so no interior offset shifts) AND the per-
|
|
@@ -581,3 +984,273 @@ function ibanValid(value) {
|
|
|
581
984
|
}
|
|
582
985
|
return remainder === 1;
|
|
583
986
|
}
|
|
987
|
+
|
|
988
|
+
// Japan My Number (個人番号) check digit. The official scheme: over the first 11
|
|
989
|
+
// digits, P = 11 - (Σ n_i · Q_i mod 11), where n_i is the i-th digit FROM THE
|
|
990
|
+
// RIGHT of the 11-digit prefix and Q_i = i+1 for 1≤i≤6, i-5 for 7≤i≤11. When the
|
|
991
|
+
// remainder is 0 or 1 the check digit is 0. The 12th digit must equal P. This
|
|
992
|
+
// check digit is the precision guarantee — a random 12-digit id passes only 1
|
|
993
|
+
// time in 10, and the corpus hard-negative (a valid-shape, wrong-check value)
|
|
994
|
+
// proves the rejection.
|
|
995
|
+
/**
 * Validate the Japan My Number check digit.
 *
 * Non-digit characters are dropped first; exactly 12 digits must remain. The
 * first 11 digits, read right-to-left, are weighted 2..7 then 2..6; the check
 * digit is 0 when the weighted sum mod 11 is 0 or 1, otherwise 11 minus it.
 *
 * @param {string} value - Matched candidate text.
 * @returns {boolean} `true` when the 12th digit equals the computed check digit.
 */
function jpMyNumberValid(value) {
  const onlyDigits = value.replace(/\D/g, "");
  if (onlyDigits.length !== 12) {
    return false;
  }

  // Position 1 is the right-most digit of the 11-digit prefix.
  const prefixFromRight = [...onlyDigits.slice(0, 11)].reverse();
  const weightedSum = prefixFromRight.reduce((total, ch, idx) => {
    const position = idx + 1;
    const weight = position <= 6 ? position + 1 : position - 5;
    return total + Number(ch) * weight;
  }, 0);

  const mod = weightedSum % 11;
  const expected = mod <= 1 ? 0 : 11 - mod;
  return Number(onlyDigits[11]) === expected;
}
|
|
1010
|
+
|
|
1011
|
+
// France NIR / INSEE social-security control key. The first 13 chars are the
|
|
1012
|
+
// body (sex, birth year/month, department, commune, order); the last 2 are the
|
|
1013
|
+
// control key, which must equal `97 - (body mod 97)`. The Corsica department is
|
|
1014
|
+
// written 2A/2B; the official rule substitutes 2A→19 and 2B→18 in the body
|
|
1015
|
+
// BEFORE the mod (the rest of the body is numeric). The control key is the
|
|
1016
|
+
// precision guarantee — a wrong key is rejected (corpus hard-negative).
|
|
1017
|
+
/**
 * Validate the France NIR control key.
 *
 * Whitespace, dots and hyphens are removed and the text upper-cased. The first
 * 13 characters form the body (Corsica `2A`/`2B` become `19`/`18`), and the
 * final two digits must equal `97 - (body mod 97)`.
 *
 * @param {string} value - Matched candidate text.
 * @returns {boolean} `true` when the shape matches and the control key agrees.
 */
function frNirValid(value) {
  const normalized = value.replace(/[\s.-]/g, "").toUpperCase();
  const shapeOk = /^[12]\d{2}(?:\d{2}|0[1-9]|1[0-2]|20)(?:\d{2}|2[AB])\d{6}\d{2}$/.test(normalized);
  if (!shapeOk) {
    return false;
  }

  // Corsica substitution before the modulus; only the department can be alpha.
  const numericBody = normalized.slice(0, 13).replace("2A", "19").replace("2B", "18");
  if (!/^\d{13}$/.test(numericBody)) {
    return false;
  }

  // Digit-by-digit mod 97 keeps every intermediate value small.
  const mod97 = [...numericBody].reduce((acc, ch) => (acc * 10 + Number(ch)) % 97, 0);
  const declaredKey = Number(normalized.slice(13));
  return declaredKey === 97 - mod97;
}
|
|
1036
|
+
|
|
1037
|
+
// Spain DNI/NIE check letter (mod-23 table). DNI is 8 digits + a letter; NIE is
|
|
1038
|
+
// a leading X/Y/Z (mapped to 0/1/2) + 7 digits + a letter. The letter is
|
|
1039
|
+
// `table[number mod 23]` where table = "TRWAGMYFPDXBNJZSQVHLCKE". The letter is
|
|
1040
|
+
// the precision guarantee — a structurally valid but wrong letter is rejected
|
|
1041
|
+
// (corpus hard-negative).
|
|
1042
|
+
// Check letters indexed by (number mod 23).
const ES_DNI_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
// Leading NIE letter and the digit that stands in for it in the modulo.
const ES_NIE_PREFIX = { X: "0", Y: "1", Z: "2" };

/**
 * Check the trailing letter of a Spanish DNI or NIE.
 *
 * Whitespace and hyphens are removed and the text is upper-cased. A DNI is
 * 8 digits plus a letter; an NIE is X/Y/Z plus 7 digits plus a letter, with
 * the leading letter replaced by 0/1/2 before the modulo.
 *
 * @param {string} value - Candidate text, separators allowed.
 * @returns {boolean} True when the letter equals ES_DNI_TABLE[number mod 23].
 */
function esDniValid(value) {
  const compact = value.replace(/[\s-]/g, "").toUpperCase();
  const looksLikeDni = /^\d{8}[A-Z]$/.test(compact);
  const looksLikeNie = !looksLikeDni && /^[XYZ]\d{7}[A-Z]$/.test(compact);
  if (!looksLikeDni && !looksLikeNie) {
    return false;
  }
  // Both shapes are 9 characters: 8 numeric-equivalent characters + letter.
  const leadingDigit = looksLikeNie ? ES_NIE_PREFIX[compact[0]] : compact[0];
  const numericPart = Number(`${leadingDigit}${compact.slice(1, 8)}`);
  const expectedLetter = ES_DNI_TABLE[numericPart % 23];
  return expectedLetter === compact[8];
}
|
|
1059
|
+
|
|
1060
|
+
// Italy Codice Fiscale check character. Over the first 15 chars: odd positions
|
|
1061
|
+
// (1st, 3rd, … counting from 1) use the ODD table, even positions use the EVEN
|
|
1062
|
+
// table; sum the mapped values and the (sum mod 26)-th letter (A=0) must equal the
|
|
1063
|
+
// 16th char. The mixed alpha+digit structure + the mod-26 check character are the
|
|
1064
|
+
// precision guard — a structurally valid but wrong check char is rejected (corpus
|
|
1065
|
+
// hard-negative). Hard-block.
|
|
1066
|
+
// Value of each character when it sits at an odd position (1st, 3rd, ...).
const IT_CF_ODD = {
  "0": 1, "1": 0, "2": 5, "3": 7, "4": 9,
  "5": 13, "6": 15, "7": 17, "8": 19, "9": 21,
  A: 1, B: 0, C: 5, D: 7, E: 9, F: 13, G: 15, H: 17, I: 19, J: 21,
  K: 2, L: 4, M: 18, N: 20, O: 11, P: 3, Q: 6, R: 8, S: 12, T: 14,
  U: 16, V: 10, W: 22, X: 25, Y: 24, Z: 23
};
// Value of each character when it sits at an even position (2nd, 4th, ...).
const IT_CF_EVEN = {
  "0": 0, "1": 1, "2": 2, "3": 3, "4": 4,
  "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
  A: 0, B: 1, C: 2, D: 3, E: 4, F: 5, G: 6, H: 7, I: 8, J: 9,
  K: 10, L: 11, M: 12, N: 13, O: 14, P: 15, Q: 16, R: 17, S: 18, T: 19,
  U: 20, V: 21, W: 22, X: 23, Y: 24, Z: 25
};
// Check characters indexed by (sum mod 26).
const IT_CF_REMAINDER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

/**
 * Check the 16th (control) character of an Italian Codice Fiscale.
 *
 * Whitespace is removed and the text is upper-cased; it must then match the
 * 6 letters / 2 digits / letter / 2 digits / letter / 3 digits / letter
 * layout. The first 15 characters are mapped through the odd or even table
 * according to their 1-based position and summed.
 *
 * @param {string} value - Candidate text, whitespace allowed.
 * @returns {boolean} True when IT_CF_REMAINDER[sum mod 26] is the 16th char.
 */
function itCodiceFiscaleValid(value) {
  const cf = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]$/.test(cf)) {
    return false;
  }
  const total = cf
    .slice(0, 15)
    .split("")
    .reduce((running, char, index) => {
      // Index 0 is position 1, so even indexes use the ODD table.
      const table = index % 2 === 0 ? IT_CF_ODD : IT_CF_EVEN;
      return running + table[char];
    }, 0);
  return cf[15] === IT_CF_REMAINDER[total % 26];
}
|
|
1090
|
+
|
|
1091
|
+
// Singapore NRIC/FIN check letter. Weighted sum (weights 2,7,6,5,4,3,2) over the
|
|
1092
|
+
// 7 digits, plus a per-prefix offset (T/G +4, M +3), mod 11, mapped through the
|
|
1093
|
+
// per-series letter table. S/T (citizen/PR), F/G (foreigner FIN), and M (post-2022
|
|
1094
|
+
// FIN) each have their own table. The prefix letter + check letter are the
|
|
1095
|
+
// precision guard — a wrong letter is rejected (corpus hard-negative). Hard-block.
|
|
1096
|
+
// Per-digit weights applied left to right over the 7 digits.
const SG_NRIC_WEIGHTS = [2, 7, 6, 5, 4, 3, 2];
// Check letters for the S/T and F/G series, indexed by (sum mod 11).
const SG_NRIC_TABLE_ST = ["J", "Z", "I", "H", "G", "F", "E", "D", "C", "B", "A"];
const SG_NRIC_TABLE_FG = ["X", "W", "U", "T", "R", "Q", "P", "N", "M", "L", "K"];
// Check letters for the M series. Unlike the two tables above this one is
// stored in forward order, so it is indexed by 10 - (sum mod 11).
const SG_NRIC_TABLE_M = ["K", "L", "J", "N", "P", "Q", "R", "T", "U", "W", "X"];

/**
 * Check the trailing letter of a Singapore NRIC/FIN.
 *
 * Whitespace is removed and the text is upper-cased; it must then be one of
 * S/T/F/G/M, 7 digits and a letter. The digits are weighted with
 * SG_NRIC_WEIGHTS, a per-prefix offset is added (T/G +4, M +3) and the sum
 * is reduced mod 11 to pick the expected letter from the series table.
 *
 * The M-series table is looked up with the inverted index (10 - remainder).
 * Indexing it with the raw remainder, as this function previously did,
 * accepts the wrong letter for M-series FINs and rejects the right one.
 *
 * @param {string} value - Candidate text, whitespace allowed.
 * @returns {boolean} True when the check letter matches the computed one.
 */
function sgNricValid(value) {
  const v = value.replace(/\s/g, "").toUpperCase();
  if (!/^[STFGM]\d{7}[A-Z]$/.test(v)) {
    return false;
  }
  const prefix = v[0];
  let sum = 0;
  for (let index = 0; index < 7; index += 1) {
    sum += Number(v[index + 1]) * SG_NRIC_WEIGHTS[index];
  }
  if (prefix === "T" || prefix === "G") {
    sum += 4;
  } else if (prefix === "M") {
    sum += 3;
  }
  const remainder = sum % 11;
  let expected;
  if (prefix === "S" || prefix === "T") {
    expected = SG_NRIC_TABLE_ST[remainder];
  } else if (prefix === "F" || prefix === "G") {
    expected = SG_NRIC_TABLE_FG[remainder];
  } else {
    // M series: forward-ordered table, so invert the index.
    expected = SG_NRIC_TABLE_M[10 - remainder];
  }
  return expected === v[8];
}
|
|
1126
|
+
|
|
1127
|
+
// India Aadhaar Verhoeff checksum. The Verhoeff scheme runs the dihedral-group
|
|
1128
|
+
// multiplication (VERHOEFF_D) over each digit permuted by position (VERHOEFF_P)
|
|
1129
|
+
// from the right; the running value must be 0 for a valid full number. Aadhaar is
|
|
1130
|
+
// 12 digits and never starts 0 or 1. The Verhoeff check is the precision guard — a
|
|
1131
|
+
// wrong check digit is rejected (corpus hard-negative). Dial-eligible: Verhoeff
|
|
1132
|
+
// over a common 12-digit shape passes ~1/10 of random runs (the jp_mynumber
|
|
1133
|
+
// footgun), so it stays allowlist-clearable.
|
|
1134
|
+
// Verhoeff multiplication table: VERHOEFF_D[a][b] combines two digits.
const VERHOEFF_D = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
  [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
  [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
  [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
  [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
  [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
  [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
  [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
];
// Verhoeff permutation table: the row is chosen by (position from right) mod 8.
const VERHOEFF_P = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
  [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
  [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
  [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
  [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
  [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
  [7, 0, 4, 6, 9, 1, 3, 2, 5, 8]
];

/**
 * Run the Verhoeff checksum over a full number (check digit included).
 *
 * Characters are consumed from the right. Each one is permuted by its
 * position and folded into the running value through VERHOEFF_D.
 *
 * @param {string} digits - String of decimal digits; callers in this file
 *   validate the shape before calling.
 * @returns {boolean} True when the running value ends at 0.
 */
function verhoeffValid(digits) {
  const chars = digits.split("");
  const lastIndex = chars.length - 1;
  const finalState = chars.reduceRight((state, char, index) => {
    const positionFromRight = lastIndex - index;
    const permuted = VERHOEFF_P[positionFromRight % 8][Number(char)];
    return VERHOEFF_D[state][permuted];
  }, 0);
  return finalState === 0;
}
|
|
1164
|
+
/**
 * Validate an Indian Aadhaar number.
 *
 * Whitespace and hyphens are removed. The remainder must be 12 digits whose
 * first digit is 2-9, and the whole number must pass verhoeffValid.
 *
 * @param {string} value - Candidate text, separators allowed.
 * @returns {boolean} True when both the shape and the checksum are accepted.
 */
function inAadhaarValid(value) {
  const digits = value.replace(/[\s-]/g, "");
  const shapeOk = /^[2-9]\d{11}$/.test(digits);
  // The checksum is only evaluated for well-shaped input.
  return shapeOk ? verhoeffValid(digits) : false;
}
|
|
1171
|
+
|
|
1172
|
+
// Germany tax ID (Steuer-Identifikationsnummer). Two guards: (1) the structural
|
|
1173
|
+
// rule that the first 10 digits contain exactly one repeated digit (one value
|
|
1174
|
+
// appears 2 or 3 times, the rest once), and (2) the ISO 7064 MOD 11,10 check digit
|
|
1175
|
+
// over the first 10 must equal the 11th. Dial-eligible: a bare 11-digit run with no
|
|
1176
|
+
// non-numeric anchor over a common length (the jp_mynumber discipline), even though
|
|
1177
|
+
// the combined guard is strong.
|
|
1178
|
+
/**
 * Structural rule for the first ten digits of a German tax ID.
 *
 * Counts how often each character occurs and requires that exactly one
 * character occurs more than once, and that it occurs at most three times.
 *
 * @param {string} first10 - The leading ten digits of the candidate.
 * @returns {boolean} True when exactly one digit repeats, 2 or 3 times.
 */
function deSteuerIdStructural(first10) {
  const chars = [...first10];
  const repeatedCounts = [...new Set(chars)]
    .map((symbol) => chars.filter((other) => other === symbol).length)
    .filter((occurrences) => occurrences >= 2);
  if (repeatedCounts.length !== 1) {
    return false;
  }
  // A digit may appear twice or three times, never more.
  return repeatedCounts[0] <= 3;
}
|
|
1187
|
+
/**
 * Compute the ISO 7064 MOD 11,10 check digit for a German tax ID.
 *
 * Starting from a running product of 10, each of the ten digits is added
 * mod 10 (a result of 0 counts as 10), doubled and reduced mod 11. The check
 * digit is 11 minus the final product, with 10 mapped to 0.
 *
 * @param {string} first10 - The leading ten digits of the candidate.
 * @returns {number} The expected 11th digit.
 */
function deSteuerIdCheckDigit(first10) {
  const positions = Array.from({ length: 10 }, (_, index) => index);
  const finalProduct = positions.reduce((product, index) => {
    const raw = (Number(first10[index]) + product) % 10;
    const adjusted = raw === 0 ? 10 : raw;
    return (adjusted * 2) % 11;
  }, 10);
  const candidate = 11 - finalProduct;
  return candidate === 10 ? 0 : candidate;
}
|
|
1202
|
+
/**
 * Validate a German tax ID (Steuer-Identifikationsnummer).
 *
 * Whitespace and slashes are removed; the remainder must be exactly 11
 * digits. The first ten must satisfy deSteuerIdStructural, and the 11th must
 * equal the value returned by deSteuerIdCheckDigit for those ten.
 *
 * @param {string} value - Candidate text, separators allowed.
 * @returns {boolean} True when shape, structure and check digit all agree.
 */
function deSteuerIdValid(value) {
  const digits = value.replace(/[\s/]/g, "");
  if (!/^\d{11}$/.test(digits)) {
    return false;
  }
  const payload = digits.slice(0, 10);
  const declaredCheck = Number(digits[10]);
  // The check digit is only computed once the structural rule has passed.
  return deSteuerIdStructural(payload) && deSteuerIdCheckDigit(payload) === declaredCheck;
}
|
|
1213
|
+
|
|
1214
|
+
// Netherlands BSN "11-proef": Σ (digit_i · weight_i) ≡ 0 mod 11 over the 9 digits,
|
|
1215
|
+
// where the weights run 9,8,…,2 for the first eight and -1 for the last. The
|
|
1216
|
+
// all-zero number is rejected. Dial-eligible: 9 bare digits is very common, so the
|
|
1217
|
+
// 11-proef passes ~1/11 of random runs (the clearest jp_mynumber-style footgun).
|
|
1218
|
+
/**
 * Validate a Dutch BSN with the "11-proef".
 *
 * Whitespace and dots are removed; the remainder must be exactly 9 digits
 * and not all zeros. The digits are weighted 9, 8, ..., 2 for the first
 * eight and -1 for the last, and the weighted sum must be divisible by 11.
 *
 * @param {string} value - Candidate text, separators allowed.
 * @returns {boolean} True when the weighted sum is a multiple of 11.
 */
function nlBsnValid(value) {
  const digits = value.replace(/[\s.]/g, "");
  if (!/^\d{9}$/.test(digits) || /^0{9}$/.test(digits)) {
    return false;
  }
  const weights = [9, 8, 7, 6, 5, 4, 3, 2, -1];
  const weightedSum = weights.reduce(
    (total, weight, index) => total + Number(digits[index]) * weight,
    0
  );
  return weightedSum % 11 === 0;
}
|
|
1233
|
+
|
|
1234
|
+
// UK National Insurance Number — FORMAT-ONLY (no checksum exists), which is why
|
|
1235
|
+
// uk_nino stays OUT of HARD_BLOCK_TYPES (dial-eligible). The regex already
|
|
1236
|
+
// excludes the disallowed individual letters; this validator rejects the
|
|
1237
|
+
// documented invalid PREFIX PAIRS (BG, GB, NK, KN, TN, NT, ZZ) that the regex
|
|
1238
|
+
// cannot express as a negative set, plus the `O`-as-second-letter case (belt-and-
|
|
1239
|
+
// braces with the regex class). The administrative `TN`/`NT` and the temporary
|
|
1240
|
+
// `OO`/the suspended `BG` etc. are never issued, so excluding them lifts precision.
|
|
1241
|
+
// Two-letter prefixes that are rejected outright.
const UK_NINO_INVALID_PREFIXES = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);

/**
 * Format check for a UK National Insurance Number (there is no checksum).
 *
 * Whitespace is removed and the text is upper-cased; it must then be two
 * letters, six digits and a suffix letter A-D. The prefix is rejected when it
 * is in UK_NINO_INVALID_PREFIXES, when its first letter is one of D/F/I/Q/U/V,
 * or when its second letter is one of D/F/I/O/Q/U/V.
 *
 * @param {string} value - Candidate text, whitespace allowed.
 * @returns {boolean} True when the format and prefix rules all pass.
 */
function ukNinoValid(value) {
  const compact = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{6}[A-D]$/.test(compact)) {
    return false;
  }
  const first = compact[0];
  const second = compact[1];
  const bannedPair = UK_NINO_INVALID_PREFIXES.has(`${first}${second}`);
  const bannedFirst = /[DFIQUV]/.test(first);
  const bannedSecond = /[DFIOQUV]/.test(second);
  return !(bannedPair || bannedFirst || bannedSecond);
}
|