haechi 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,51 @@
1
+ import { isUtf8 } from "node:buffer";
2
+
1
3
  // The hard-block detection types: a leak of one of these is a load-bearing
2
4
  // fail-closed concern, so the WS2c precision dials (filters.minConfidence,
3
5
  // filters.allowlist) may NOT suppress a detection of any of them. minConfidence
4
6
  // trims only the precision-risky SOFT types; the allowlist's per-value/per-path
5
7
  // exceptions are ignored for these types (the detection still fires). Exported
6
8
  // so the core detect→decide path enforces the same exemption set the docs pin.
7
- export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card"]);
9
+ //
10
+ // Hard-block types are sensitive AND have a STRONG enough anchor that a match is
11
+ // effectively a true positive by construction, so the precision dials
12
+ // (filters.minConfidence / filters.allowlist) can never suppress them:
13
+ // - kr_rrn / card — checksum + constrained format
14
+ // - fr_nir (mod-97 over a long structured 15-digit run) and es_dni (mod-23 plus
15
+ // a required check LETTER suffix) — a random same-shaped value almost never
16
+ // passes, and the shapes are rare in ordinary payloads.
17
+ // - it_codice_fiscale — a 16-char MIXED alpha+digit shape with a mod-26 check
18
+ // CHARACTER. The non-numeric structural anchor (the rigid letter/digit layout)
19
+ // makes a benign 16-char `[A-Z]{6}\d{2}[A-Z]…` run in an ordinary payload
20
+ // implausible (measured collision ~3.8% over an already-rare shape), so a match
21
+ // is effectively a true positive.
22
+ // - sg_nric — a LETTER prefix ([STFGM]) + 7 digits + a CHECK LETTER. Two
23
+ // non-numeric anchors (prefix letter + checksum letter) over a rare shape
24
+ // (measured collision ~3.9% over the prefix+letter shape) make a benign FP
25
+ // implausible. Both anchored, un-allowlistable.
26
+ // DELIBERATELY DIAL-ELIGIBLE (NOT hard-block) — bare-digit runs whose only guard is
27
+ // a single numeric checksum over a COMMON digit length, so a benign id/counter FP is
28
+ // plausible and the operator needs the allowlist/minConfidence escape hatch (the
29
+ // jp_mynumber precedent). They still detect + (per profile) block by default:
30
+ // - jp_mynumber — a bare 12-digit run + a SINGLE mod-11 check digit (measured
31
+ // ~9% of random 12-digit numbers pass), and 12-digit ids/counters are common.
32
+ // - uk_nino — NO checksum exists (format + invalid-prefix exclusions only), the
33
+ // largest FP surface.
34
+ // - in_aadhaar — a bare 12-digit run + the Verhoeff checksum (measured ~9.9% of
35
+ // random 12-digit runs pass, the same order as jp_mynumber's mod-11). Aadhaar
36
+ // is extremely sensitive, but Verhoeff over the COMMON 12-digit shape is exactly
37
+ // the jp_mynumber footgun (a 12-digit id/counter is common), so it stays
38
+ // allowlist-clearable rather than un-suppressable.
39
+ // - de_steuer_id — a bare 11-digit run + ISO 7064 MOD 11,10 plus a structural
40
+ // "exactly one repeated digit" test. The combined guard is strong (measured
41
+ // ~0.37% collision), BUT there is NO non-numeric anchor and 11-digit ids are
42
+ // common in payloads, so per the jp_mynumber discipline (a bare-digit shape over
43
+ // a common length) it stays DIAL-ELIGIBLE so an operator can clear an 11-digit-id
44
+ // FP. It still detects + blocks by default.
45
+ // - nl_bsn — a bare 9-digit run + the "11-proef" weighted mod-11 (measured ~9.1%
46
+ // of random 9-digit runs pass). 9 bare digits is VERY common, so this is the
47
+ // clearest dial-eligible case.
48
+ export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card", "fr_nir", "es_dni", "it_codice_fiscale", "sg_nric"]);
8
49
 
9
50
  const DEFAULT_RULES = [
10
51
  {
@@ -94,6 +135,76 @@ const DEFAULT_RULES = [
94
135
  flags: "g",
95
136
  confidence: 0.9
96
137
  },
138
+ {
139
+ // Anthropic API key: `sk-ant-` + a long body. Ordered BEFORE the OpenAI rule
140
+ // below so a Claude key is attributed to its own rule (both emit `secret`, so
141
+ // removeOverlaps collapsing the shared span to either is type-identical — the
142
+ // ordering is for ruleId attribution, not for the scored type). The HYPHEN
143
+ // after `sk` is load-bearing: it keeps this OFF the underscore-based Stripe/
144
+ // OpenAI-platform `sk_` rule (openai-like-key, `api_key`), so the two never
145
+ // collide on the same span.
146
+ id: "anthropic-api-key",
147
+ type: "secret",
148
+ pattern: "\\bsk-ant-[A-Za-z0-9_-]{16,}\\b",
149
+ flags: "g",
150
+ confidence: 0.95
151
+ },
152
+ {
153
+ // OpenAI API key: `sk-` (and project keys `sk-proj-`) + a long base62-ish
154
+ // body. The HYPHEN is the disambiguator from Stripe/OpenAI-platform `sk_`
155
+ // (underscore — handled by openai-like-key as `api_key`); this rule never
156
+ // matches an underscore form, so the two prefixes do not overlap. A >=20-char
157
+ // body keeps a bare `sk-foo` slug from firing. The Anthropic `sk-ant-` rule
158
+ // above is a stricter sibling that runs first.
159
+ id: "openai-api-key",
160
+ type: "secret",
161
+ pattern: "\\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\\b",
162
+ flags: "g",
163
+ confidence: 0.9
164
+ },
165
+ {
166
+ // Google OAuth client secret: anchored `GOCSPX-` + exactly 28 chars from the
167
+ // URL-safe alphabet. Fixed prefix + fixed length = high precision (this is the
168
+ // OAuth client secret, distinct from the `AIza` API key above).
169
+ id: "google-oauth-client-secret",
170
+ type: "secret",
171
+ pattern: "\\bGOCSPX-[A-Za-z0-9_-]{28}\\b",
172
+ flags: "g",
173
+ confidence: 0.95
174
+ },
175
+ {
176
+ // SendGrid API key: `SG.` + 22 URL-safe chars + `.` + 43 URL-safe chars. The
177
+ // two fixed-length dotted segments after the `SG.` prefix are what make this
178
+ // high-precision — a bare `SG.`-prefixed string of the wrong shape is rejected.
179
+ id: "sendgrid-api-key",
180
+ type: "api_key",
181
+ pattern: "\\bSG\\.[A-Za-z0-9_-]{22}\\.[A-Za-z0-9_-]{43}\\b",
182
+ flags: "g",
183
+ confidence: 0.95
184
+ },
185
+ {
186
+ // Twilio Account SID (AC…) / API Key SID (SK…): the fixed `AC`/`SK` prefix +
187
+ // EXACTLY 32 HEX chars. The hex-only body (not base62) is the precision guard:
188
+ // a random alphanumeric run of the same length carries non-hex letters and is
189
+ // rejected, and the `SK` form does not collide with the underscore/hyphen
190
+ // `sk_`/`sk-` rules. Twilio's bare 32-hex AUTH TOKEN is deliberately NOT a
191
+ // standalone rule (a prefix-less 32-hex run is indistinguishable from an MD5
192
+ // hash / id) — it is caught via the `<key> = <value>` assignment vocabulary.
193
+ id: "twilio-sid",
194
+ type: "api_key",
195
+ pattern: "\\b(?:AC|SK)[0-9a-fA-F]{32}\\b",
196
+ flags: "g",
197
+ confidence: 0.9
198
+ },
199
+ {
200
+ // npm access token: anchored `npm_` + exactly 36 base62 chars. Fixed prefix +
201
+ // fixed length = high precision.
202
+ id: "npm-token",
203
+ type: "secret",
204
+ pattern: "\\bnpm_[A-Za-z0-9]{36}\\b",
205
+ flags: "g",
206
+ confidence: 0.95
207
+ },
97
208
  {
98
209
  // JWT: three dot-separated base64url segments where the FIRST starts with
99
210
  // `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
@@ -139,7 +250,12 @@ const DEFAULT_RULES = [
139
250
  // client secrets, PEM/private keys, access/refresh tokens) so a
140
251
  // `<key> = <value>` leak is caught even when the value itself has no
141
252
  // self-describing prefix (e.g. an AWS secret access key is bare base64).
142
- pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
253
+ // `accountkey` catches the Azure Storage connection-string `AccountKey=<88-
254
+ // char base64>=` segment — an un-anchored 88-char base64 rule would false-fire
255
+ // on any blob, so the `AccountKey=` assignment context is the precision anchor.
256
+ // `auth[_-]?token` catches the Twilio auth token (a bare 32-hex run with no
257
+ // self-describing prefix) when it is leaked as a `<key> = <value>` pair.
258
+ pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|auth[_-]?token|accountkey|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
143
259
  flags: "gi",
144
260
  confidence: 0.85
145
261
  },
@@ -166,6 +282,136 @@ const DEFAULT_RULES = [
166
282
  confidence: 0.9,
167
283
  validate: ibanValid
168
284
  },
285
+ {
286
+ // Japan My Number (個人番号): EXACTLY 12 digits with the official mod-11
287
+ // weighted check digit over the first 11. A bare 12-digit run is ambiguous
288
+ // (an id/timestamp), so jpMyNumberValid is the precision guard — only a run
289
+ // whose 12th digit equals the prescribed check digit fires. The leading/
290
+ // trailing boundaries (`(?<![\d-])`/`(?![\d-])`) stop the rule from matching
291
+ // a 12-digit window inside a longer digit/dashed run. NOT hard-block: a single
292
+ // mod-11 check digit only rejects ~9/10 of random 12-digit runs, and such
293
+ // runs are common (ids/counters), so a benign FP is plausible and the operator
294
+ // keeps the allowlist escape hatch (it still detects + blocks by default).
295
+ id: "jp-mynumber",
296
+ type: "jp_mynumber",
297
+ pattern: "(?<![\\d-])\\d{12}(?![\\d-])",
298
+ flags: "g",
299
+ confidence: 0.9,
300
+ validate: jpMyNumberValid
301
+ },
302
+ {
303
+ // France NIR / INSEE social-security: 15 chars where the department field may
304
+ // carry the Corsica `2A`/`2B` letters, validated by the control key
305
+ // `97 - (first13 mod 97) == last2` (Corsica 2A→19, 2B→18 before the mod).
306
+ // The control key is the precision guard — a wrong key is rejected. The
307
+ // department alpha is optional so the pure-numeric form also matches. Bounded
308
+ // by `(?<![\w-])`/`(?![\w-])` lookarounds (stricter than `\b`); hard-block (checksummed).
309
+ id: "fr-nir",
310
+ type: "fr_nir",
311
+ pattern: "(?<![\\w-])[12]\\d{2}(?:0[1-9]|1[0-2]|20)(?:\\d{2}|2[AB])\\d{6}\\d{2}(?![\\w-])",
312
+ flags: "g",
313
+ confidence: 0.9,
314
+ validate: frNirValid
315
+ },
316
+ {
317
+ // Spain DNI/NIE: 8 digits (DNI) or a leading X/Y/Z + 7 digits (NIE) + a check
318
+ // letter from the mod-23 table (NIE maps X/Y/Z→0/1/2 before the mod). The
319
+ // check letter is the precision guard — a wrong letter is rejected. The
320
+ // letters that can never appear (I/O/U) are excluded from the suffix class so
321
+ // an ordinary `<8-digit><letter>` token rarely even reaches the validator.
322
+ // Hard-block (checksummed).
323
+ id: "es-dni-nie",
324
+ type: "es_dni",
325
+ pattern: "(?<![\\w-])[XYZ]?\\d{7,8}[A-HJ-NP-TV-Z](?![\\w-])",
326
+ flags: "gi",
327
+ confidence: 0.85,
328
+ validate: esDniValid
329
+ },
330
+ {
331
+ // Italy Codice Fiscale: 16 chars in the rigid layout
332
+ // [A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z] (surname, name, year, month-letter,
333
+ // day, place code, check character). The 16th char is the official check
334
+ // character: sum the odd/even-position table values over the first 15 chars
335
+ // and map (sum mod 26) to a letter. The mixed alpha+digit structure + the
336
+ // mod-26 check character are the precision guard. Hard-block (strong
337
+ // non-numeric anchor over a rare shape).
338
+ id: "it-codice-fiscale",
339
+ type: "it_codice_fiscale",
340
+ pattern: "(?<![A-Z0-9])[A-Z]{6}\\d{2}[A-Z]\\d{2}[A-Z]\\d{3}[A-Z](?![A-Z0-9])",
341
+ flags: "gi",
342
+ confidence: 0.9,
343
+ validate: itCodiceFiscaleValid
344
+ },
345
+ {
346
+ // Singapore NRIC/FIN: a series LETTER ([STFGM]) + 7 digits + a CHECK LETTER.
347
+ // The check letter is a weighted sum (weights 2,7,6,5,4,3,2) plus a per-prefix
348
+ // offset (T/G +4, M +3), mod 11, looked up in the per-series letter table. The
349
+ // prefix letter + check letter are the precision guard. Hard-block (two
350
+ // non-numeric anchors over a rare shape).
351
+ id: "sg-nric",
352
+ type: "sg_nric",
353
+ pattern: "(?<![A-Z0-9])[STFGMstfgm]\\d{7}[A-Za-z](?![A-Z0-9])",
354
+ flags: "g",
355
+ confidence: 0.9,
356
+ validate: sgNricValid
357
+ },
358
+ {
359
+ // India Aadhaar: 12 digits (never starting 0 or 1) with the Verhoeff checksum
360
+ // over all 12. A bare 12-digit run is ambiguous (an id/timestamp), so the
361
+ // Verhoeff check is the precision guard. NOT hard-block: Verhoeff over the
362
+ // COMMON 12-digit shape passes ~1/10 of random runs (the jp_mynumber footgun),
363
+ // so it stays dial-eligible (still detects + blocks by default). The leading/
364
+ // trailing boundaries stop the rule from matching a 12-digit window inside a
365
+ // longer digit/dashed run.
366
+ id: "in-aadhaar",
367
+ type: "in_aadhaar",
368
+ pattern: "(?<![\\d-])[2-9]\\d{11}(?![\\d-])",
369
+ flags: "g",
370
+ confidence: 0.85,
371
+ validate: inAadhaarValid
372
+ },
373
+ {
374
+ // Germany tax ID (Steuer-Identifikationsnummer): 11 digits with the ISO 7064
375
+ // MOD 11,10 check digit over the first 10, plus the structural rule that the
376
+ // first 10 digits contain exactly one repeated digit (one value appears 2 or 3
377
+ // times, the rest once). The combined guard is strong, but it is a BARE-DIGIT
378
+ // run with no non-numeric anchor over a common 11-digit length, so per the
379
+ // jp_mynumber discipline it stays dial-eligible (the operator can clear an
380
+ // 11-digit-id FP).
381
+ id: "de-steuer-id",
382
+ type: "de_steuer_id",
383
+ pattern: "(?<![\\d-])\\d{11}(?![\\d-])",
384
+ flags: "g",
385
+ confidence: 0.85,
386
+ validate: deSteuerIdValid
387
+ },
388
+ {
389
+ // Netherlands BSN (Burgerservicenummer): 9 digits validated by the "11-proef"
390
+ // weighted mod-11 (Σ digit_i · weight_i ≡ 0 mod 11, with the last weight -1).
391
+ // 9 bare digits is VERY common and the 11-proef passes ~1/11 of random runs —
392
+ // the clearest dial-eligible case (still detects + blocks by default; the
393
+ // operator keeps the allowlist escape hatch).
394
+ id: "nl-bsn",
395
+ type: "nl_bsn",
396
+ pattern: "(?<![\\d-])\\d{9}(?![\\d-])",
397
+ flags: "g",
398
+ confidence: 0.8,
399
+ validate: nlBsnValid
400
+ },
401
+ {
402
+ // UK National Insurance Number: two prefix letters + 6 digits + a suffix
403
+ // A-D. There is NO checksum, so this is FORMAT-ONLY and stays OUT of
404
+ // HARD_BLOCK_TYPES (dial-eligible). The pattern bakes in the documented
405
+ // invalid-prefix exclusions: 1st letter never D/F/I/Q/U/V, 2nd letter never
406
+ // D/F/I/O/Q/U/V, and the disallowed pairs BG/GB/NK/KN/TN/NT/ZZ are rejected
407
+ // by ukNinoValid (a negative-set the regex can't express cleanly).
408
+ id: "uk-nino",
409
+ type: "uk_nino",
410
+ pattern: "(?<![\\w-])[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D](?![\\w-])",
411
+ flags: "g",
412
+ confidence: 0.7,
413
+ validate: ukNinoValid
414
+ },
169
415
  {
170
416
  // E.164 international phone: ONLY with a leading `+` (a bare digit run is an
171
417
  // id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
@@ -232,8 +478,13 @@ const DEFAULT_RULES = [
232
478
  }
233
479
  ];
234
480
 
235
- export function createDefaultFilterEngine({ customRules = [] } = {}) {
481
+ export function createDefaultFilterEngine({ customRules = [], decodeAndRescan = false } = {}) {
236
482
  const rules = DEFAULT_RULES.concat(customRules.map(normalizeCustomRule));
483
+ // The opt-in base64/percent decode-and-rescan pass (WS2d residual). Default OFF
484
+ // => byte-identical to prior behavior. Held in the engine CLOSURE, NOT threaded
485
+ // through the protect `context`: the request context is data and must not carry
486
+ // this control flag (it would pollute tokenize AAD / audit).
487
+ const decodeOptions = { decodeAndRescan: decodeAndRescan === true };
237
488
 
238
489
  return {
239
490
  id: "haechi.filter.default",
@@ -243,12 +494,32 @@ export function createDefaultFilterEngine({ customRules = [] } = {}) {
243
494
  networkEgress: false
244
495
  },
245
496
  async detect({ entries, context }) {
246
- return entries.flatMap((entry) => detectEntry(entry, rules, context));
497
+ return entries.flatMap((entry) => detectEntry(entry, rules, context, decodeOptions));
247
498
  }
248
499
  };
249
500
  }
250
501
 
251
- export function detectEntry(entry, rules, context = {}) {
502
+ export function detectEntry(entry, rules, context = {}, options = {}) {
503
+ const baseDetections = scanEntry(entry, rules, context);
504
+ // WS2d residual — opt-in (default OFF) base64/percent decode-and-rescan. After
505
+ // the normal NFKC scan above, if the flag is on, attempt to decode the leaf and
506
+ // rescan the decoded text. A decoded hit has NO valid offset in the encoded leaf
507
+ // (decoding remaps everything), so it fails closed to a WHOLE-LEAF detection of
508
+ // the original encoded leaf — and only fires for a validator-backed/hard-block
509
+ // hit so random base64 never false-positives. See decodeAndRescanEntry.
510
+ if (options?.decodeAndRescan === true) {
511
+ const decoded = decodeAndRescanEntry(entry, rules, context);
512
+ if (decoded.length > 0) {
513
+ return baseDetections.concat(decoded);
514
+ }
515
+ }
516
+ return baseDetections;
517
+ }
518
+
519
+ // The original per-leaf NFKC scan (WS2d), unchanged. Extracted from detectEntry so
520
+ // the opt-in decode-and-rescan pass wraps it without touching the byte-identical
521
+ // default path.
522
+ function scanEntry(entry, rules, context = {}) {
252
523
  const detections = [];
253
524
  // On the RESPONSE direction, a bare JSON NUMBER leaf is inference-server
254
525
  // metadata (a nanosecond `*_duration`, a token count, a numeric id/timestamp) —
@@ -364,13 +635,19 @@ function scanForDetections(scanText, rules, context, markerSpans, entry, origina
364
635
  // value: the whole original leaf). The response-direction marker skip does NOT
365
636
  // apply here: a length-divergent leaf cannot BE a Haechi marker (markers are ASCII
366
637
  // and NFKC-stable), so an evasion attempt can never masquerade as one.
367
- function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
638
+ function wholeLeafDetections(normalized, rules, context, entry, originalValue, ruleFilter = null) {
368
639
  const seenTypes = new Set();
369
640
  const detections = [];
370
641
  for (const rule of rules) {
371
642
  if (rule.direction && rule.direction !== context?.direction) {
372
643
  continue;
373
644
  }
645
+ // The decode-and-rescan caller passes a precision filter so only validator-
646
+ // backed / hard-block rules can fire on decoded text (random base64 guard).
647
+ // The Case-3 NFKC caller passes nothing → every rule is eligible (unchanged).
648
+ if (ruleFilter && !ruleFilter(rule)) {
649
+ continue;
650
+ }
374
651
  if (seenTypes.has(rule.type)) {
375
652
  continue;
376
653
  }
@@ -401,6 +678,132 @@ function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
401
678
  return detections;
402
679
  }
403
680
 
681
+ // WS2d residual — opt-in base64/percent decode-and-rescan (default OFF). An
682
+ // always-on decode is false-positive-prone (random base64 decodes to bytes that
683
+ // can shape-match a soft rule), so this is gated behind `filters.decodeAndRescan`
684
+ // AND a precision guard: a decoded hit only fires when it is VALIDATOR-BACKED or a
685
+ // HARD-BLOCK type (a Luhn-passing card, a checksum kr_rrn/us_ssn, an IBAN mod-97,
686
+ // or a secret/api_key on its anchored rule). A decoded soft-type-without-validator
687
+ // match (a bare phone-shaped run in random decoded bytes) does NOT fire — requiring
688
+ // validators keeps precision ~100% (random base64 Luhn-passing as a 16-digit card
689
+ // is astronomically unlikely).
690
+ //
691
+ // OFFSET HANDLING (fail closed): a detection found in the DECODED text has no valid
692
+ // offset in the original encoded leaf (decoding remaps everything), so we emit a
693
+ // WHOLE-LEAF detection per matched type (start:0, end:leaf.length, value: the whole
694
+ // original encoded leaf) — exactly the WS2d Case-3 path. The transform then
695
+ // redacts/blocks the entire encoded leaf. We never map a decoded offset back.
696
+ function decodeAndRescanEntry(entry, rules, context) {
697
+ // Only string leaves carry an encoded value; a number/boolean leaf cannot be a
698
+ // base64/percent blob (and the response-direction number skip already applies in
699
+ // the base scan).
700
+ if (entry.kind === "number") {
701
+ return [];
702
+ }
703
+ const decoded = decodeLeaf(entry.value);
704
+ if (decoded === null) {
705
+ return [];
706
+ }
707
+ // Reuse the Case-3 whole-leaf path, but restricted to precision-eligible rules so
708
+ // random base64 never false-positives. `decoded` supplies the scan text; the
709
+ // recorded detection still spans the ORIGINAL encoded leaf (entry.value).
710
+ return wholeLeafDetections(decoded, rules, context, entry, entry.value, isDecodeEligibleRule);
711
+ }
712
+
713
+ // A decoded whole-leaf detection only fires for a "meaningful" hit: a hard-block
714
+ // type (secret/api_key/kr_rrn/card) on its anchored rule, OR a checksum-validated
715
+ // type. The `phone` type is excluded even though kr-phone carries a `validate`
716
+ // helper — that helper is a trunk-prefix heuristic, not a checksum, so a phone-
717
+ // shaped run in random decoded bytes must NOT fire (the spec's named exclusion).
718
+ function isDecodeEligibleRule(rule) {
719
+ if (HARD_BLOCK_TYPES.has(rule.type)) {
720
+ return true;
721
+ }
722
+ return typeof rule.validate === "function" && rule.type !== "phone";
723
+ }
724
+
725
+ // Attempt to decode a string leaf to UTF-8 text, returning the decoded string or
726
+ // null when the leaf does not look like (or does not cleanly round-trip as) an
727
+ // encoded value. Two encodings, each precision-guarded so a benign value is skipped
728
+ // rather than mis-decoded:
729
+ // - base64 / base64url: the leaf must LOOK like base64 (no spaces, the base64 or
730
+ // base64url alphabet, a valid length for that variant) within bounds, decode to
731
+ // VALID UTF-8, and RE-ENCODE back to exactly the leaf (rejects the bytes that
732
+ // Buffer.from leniently accepts but are not the canonical encoding of the leaf).
733
+ // - percent-encoding: only when the leaf actually contains a `%XX` escape;
734
+ // decodeURIComponent in a try/catch (a malformed escape → skip, never throws).
735
+ // base64 is tried first (a `%`-bearing string is not base64), then percent.
736
+ const DECODE_MIN_LEN = 16;
737
+ const DECODE_MAX_LEN = 8192;
738
+ const BASE64_STD = /^[A-Za-z0-9+/]+={0,2}$/;
739
+ const BASE64_URL = /^[A-Za-z0-9_-]+$/;
740
+
741
+ function decodeLeaf(value) {
742
+ if (typeof value !== "string" || value.length < DECODE_MIN_LEN || value.length > DECODE_MAX_LEN) {
743
+ return null;
744
+ }
745
+ const base64 = decodeBase64Leaf(value);
746
+ if (base64 !== null) {
747
+ return base64;
748
+ }
749
+ return decodePercentLeaf(value);
750
+ }
751
+
752
+ function decodeBase64Leaf(value) {
753
+ // Standard base64: length must be a multiple of 4. base64url: length mod 4 may be
754
+ // 0, 2, or 3 (1 is impossible for any byte run) and the alphabet is `-_` not `+/`.
755
+ // A `%` or whitespace disqualifies it (handled by the anchored alphabet regexes).
756
+ let encoding = null;
757
+ if (BASE64_STD.test(value) && value.length % 4 === 0) {
758
+ encoding = "base64";
759
+ } else if (BASE64_URL.test(value) && value.length % 4 !== 1) {
760
+ encoding = "base64url";
761
+ } else {
762
+ return null;
763
+ }
764
+ let bytes;
765
+ try {
766
+ bytes = Buffer.from(value, encoding);
767
+ } catch {
768
+ return null;
769
+ }
770
+ if (bytes.length === 0) {
771
+ return null;
772
+ }
773
+ // Round-trip guard: Buffer.from is lenient (it ignores stray chars / bad padding),
774
+ // so a non-canonical string can "decode". Re-encoding the bytes must reproduce the
775
+ // EXACT leaf — otherwise the leaf was not really this base64 value.
776
+ if (bytes.toString(encoding) !== value) {
777
+ return null;
778
+ }
779
+ // The decoded bytes must be valid UTF-8 text; a card/RRN/secret is text. Random
780
+ // base64 usually decodes to non-UTF-8 bytes, which we skip here (a cheap, strong
781
+ // false-positive filter before we even run the rules).
782
+ if (!isUtf8(bytes)) {
783
+ return null;
784
+ }
785
+ return bytes.toString("utf8");
786
+ }
787
+
788
+ function decodePercentLeaf(value) {
789
+ // Only attempt when there is an actual `%XX` escape — otherwise decodeURIComponent
790
+ // is a no-op and we would needlessly rescan an identical string.
791
+ if (!/%[0-9A-Fa-f]{2}/.test(value)) {
792
+ return null;
793
+ }
794
+ let decoded;
795
+ try {
796
+ decoded = decodeURIComponent(value);
797
+ } catch {
798
+ // Malformed percent-escape (e.g. a bare `%` or `%zz`) → skip, never throw.
799
+ return null;
800
+ }
801
+ if (decoded === value) {
802
+ return null;
803
+ }
804
+ return decoded;
805
+ }
806
+
404
807
  // Sound precondition for Case 2: a match's {start,end} on the NFKC-normalized
405
808
  // text map 1:1 onto the ORIGINAL value. True only when EVERY codepoint folds to
406
809
  // the same number of UTF-16 units (so no interior offset shifts) AND the per-
@@ -581,3 +984,273 @@ function ibanValid(value) {
581
984
  }
582
985
  return remainder === 1;
583
986
  }
987
+
988
// Japan My Number (個人番号) check-digit validator. All non-digits are stripped
// first and exactly 12 digits must remain. Over the 11-digit prefix, the n-th
// digit counted FROM THE RIGHT is weighted n+1 for n ≤ 6 and n-5 for n ≥ 7; with
// r = (weighted sum) mod 11 the expected check digit is 0 when r ≤ 1 and 11 - r
// otherwise. The 12th digit must equal that value. The check digit is the
// precision guarantee: a random 12-digit run passes only about 1 time in 10, and
// a valid-shape value with the wrong check digit is rejected.
function jpMyNumberValid(value) {
  const digits = value.replace(/\D/g, "");
  if (digits.length !== 12) {
    return false;
  }
  // Walk the 11-digit prefix right-to-left so the array index is (position - 1).
  const prefixFromRight = [...digits.slice(0, 11)].reverse();
  const weighted = prefixFromRight.reduce((acc, ch, i) => {
    const position = i + 1;
    const weight = position <= 6 ? position + 1 : position - 5;
    return acc + Number(ch) * weight;
  }, 0);
  const mod = weighted % 11;
  const expected = mod <= 1 ? 0 : 11 - mod;
  return Number(digits[11]) === expected;
}
1010
+
1011
// France NIR / INSEE social-security number validator. After dropping
// whitespace, dots and hyphens the value must be 15 characters: a 13-character
// body (sex, birth year/month, department, commune, order) followed by a
// 2-digit control key equal to `97 - (body mod 97)`. Only the department field
// may be alphabetic (Corsica, written 2A/2B); it is substituted 2A→19 / 2B→18
// BEFORE the modulus is taken. The control key is the precision guarantee — a
// wrong key is rejected.
function frNirValid(value) {
  const nir = value.replace(/[\s.-]/g, "").toUpperCase();
  const shapeOk = /^[12]\d{2}(?:\d{2}|0[1-9]|1[0-2]|20)(?:\d{2}|2[AB])\d{6}\d{2}$/.test(nir);
  if (!shapeOk) {
    return false;
  }
  // Corsica substitution so the body becomes purely numeric.
  const numericBody = nir.slice(0, 13).replace("2A", "19").replace("2B", "18");
  if (!/^\d{13}$/.test(numericBody)) {
    return false;
  }
  // Digit-by-digit modulus keeps every intermediate value small.
  const mod97 = [...numericBody].reduce((acc, digit) => (acc * 10 + Number(digit)) % 97, 0);
  const declaredKey = Number(nir.slice(13));
  return declaredKey === 97 - mod97;
}
1036
+
1037
// Spain DNI/NIE check-letter validator (mod-23 table). A DNI is 8 digits plus a
// letter; a NIE is a leading X/Y/Z (standing for 0/1/2) plus 7 digits plus a
// letter. In both cases the trailing letter must equal
// `ES_DNI_TABLE[number mod 23]`. The letter is the precision guarantee — a
// structurally valid value with the wrong letter is rejected.
const ES_DNI_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
const ES_NIE_PREFIX = { X: "0", Y: "1", Z: "2" };
function esDniValid(value) {
  // Whitespace and hyphens are separators only; case is irrelevant.
  const id = value.replace(/[\s-]/g, "").toUpperCase();
  const isDni = /^\d{8}[A-Z]$/.test(id);
  const isNie = !isDni && /^[XYZ]\d{7}[A-Z]$/.test(id);
  if (!isDni && !isNie) {
    return false;
  }
  // For a NIE the prefix letter is replaced by its digit to form the 8-digit number.
  const numeric = isDni ? id.slice(0, 8) : ES_NIE_PREFIX[id[0]] + id.slice(1, 8);
  return id[8] === ES_DNI_TABLE[Number(numeric) % 23];
}
1059
+
1060
// Italy Codice Fiscale check-character validator. Each of the first 15
// characters is mapped through IT_CF_ODD when it sits at an odd position
// (1st, 3rd, … counting from 1) and through IT_CF_EVEN otherwise; the mapped
// values are summed and the letter at index (sum mod 26) of the alphabet (A=0)
// must equal the 16th character. The rigid letter/digit layout plus the mod-26
// check character form the precision guard — a structurally valid value with
// the wrong check character is rejected. Hard-block.
const IT_CF_ODD = {
  "0": 1, "1": 0, "2": 5, "3": 7, "4": 9,
  "5": 13, "6": 15, "7": 17, "8": 19, "9": 21,
  A: 1, B: 0, C: 5, D: 7, E: 9, F: 13, G: 15, H: 17, I: 19, J: 21,
  K: 2, L: 4, M: 18, N: 20, O: 11, P: 3, Q: 6, R: 8, S: 12, T: 14,
  U: 16, V: 10, W: 22, X: 25, Y: 24, Z: 23
};
const IT_CF_EVEN = {
  "0": 0, "1": 1, "2": 2, "3": 3, "4": 4,
  "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
  A: 0, B: 1, C: 2, D: 3, E: 4, F: 5, G: 6, H: 7, I: 8, J: 9,
  K: 10, L: 11, M: 12, N: 13, O: 14, P: 15, Q: 16, R: 17, S: 18, T: 19,
  U: 20, V: 21, W: 22, X: 23, Y: 24, Z: 25
};
const IT_CF_REMAINDER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
function itCodiceFiscaleValid(value) {
  const code = value.replace(/\s/g, "").toUpperCase();
  const shapeOk = /^[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]$/.test(code);
  if (!shapeOk) {
    return false;
  }
  let total = 0;
  [...code.slice(0, 15)].forEach((ch, i) => {
    // Index 0 is position 1, i.e. an ODD position; the tables alternate from there.
    const table = i % 2 === 0 ? IT_CF_ODD : IT_CF_EVEN;
    total += table[ch];
  });
  return code[15] === IT_CF_REMAINDER[total % 26];
}
1090
+
1091
// Singapore NRIC/FIN check-letter validator. The 7 digits are weighted
// 2,7,6,5,4,3,2 and summed, a per-prefix offset is added (T/G +4, M +3, S/F +0),
// and the result mod 11 indexes the letter table of the series: S/T
// (citizen/PR), F/G (foreigner FIN) and M (post-2022 FIN) each have their own
// table. The prefix letter plus the check letter are the precision guard — a
// wrong letter is rejected. Hard-block.
const SG_NRIC_WEIGHTS = [2, 7, 6, 5, 4, 3, 2];
const SG_NRIC_TABLE_ST = ["J", "Z", "I", "H", "G", "F", "E", "D", "C", "B", "A"];
const SG_NRIC_TABLE_FG = ["X", "W", "U", "T", "R", "Q", "P", "N", "M", "L", "K"];
const SG_NRIC_TABLE_M = ["K", "L", "J", "N", "P", "Q", "R", "T", "U", "W", "X"];
function sgNricValid(value) {
  const id = value.replace(/\s/g, "").toUpperCase();
  if (!/^[STFGM]\d{7}[A-Z]$/.test(id)) {
    return false;
  }
  const weighted = SG_NRIC_WEIGHTS.reduce(
    (acc, weight, i) => acc + weight * Number(id[i + 1]),
    0
  );
  // Pick the series offset and letter table from the prefix letter.
  let offset = 0;
  let letters;
  switch (id[0]) {
    case "S":
      letters = SG_NRIC_TABLE_ST;
      break;
    case "T":
      offset = 4;
      letters = SG_NRIC_TABLE_ST;
      break;
    case "F":
      letters = SG_NRIC_TABLE_FG;
      break;
    case "G":
      offset = 4;
      letters = SG_NRIC_TABLE_FG;
      break;
    default:
      // Only "M" can reach here: the shape regex admits no other prefix.
      offset = 3;
      letters = SG_NRIC_TABLE_M;
  }
  return id[8] === letters[(weighted + offset) % 11];
}
1126
+
1127
// India Aadhaar Verhoeff checksum. The Verhoeff scheme runs the dihedral-group
// multiplication (VERHOEFF_D) over each digit permuted by its position
// (VERHOEFF_P, cycling every 8 positions) counted from the right; the running
// value must end at 0 for a valid full number (payload + check digit). Aadhaar
// is 12 digits and never starts 0 or 1. The Verhoeff check is the precision
// guard — a wrong check digit is rejected. Dial-eligible: Verhoeff over a common
// 12-digit shape passes ~1/10 of random runs (the jp_mynumber footgun), so it
// stays allowlist-clearable.
const VERHOEFF_D = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
  [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
  [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
  [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
  [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
  [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
  [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
  [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
];
const VERHOEFF_P = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
  [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
  [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
  [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
  [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
  [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
  [7, 0, 4, 6, 9, 1, 3, 2, 5, 8]
];
// Returns true when `digits` (a non-empty string of ASCII digits, check digit
// last) passes the Verhoeff check. Anything else — an empty string, a string
// containing a non-digit, or a non-string — returns false. Without this guard an
// empty string would trivially "pass" (the running value starts at 0), and a
// non-digit would turn the running value into `undefined` and make the next
// table lookup throw; a validator must reject such input, never throw.
function verhoeffValid(digits) {
  if (typeof digits !== "string" || !/^\d+$/.test(digits)) {
    return false;
  }
  let check = 0;
  const reversed = digits.split("").reverse();
  for (let index = 0; index < reversed.length; index += 1) {
    // The permutation table repeats with period 8, hence `index % 8`.
    check = VERHOEFF_D[check][VERHOEFF_P[index % 8][Number(reversed[index])]];
  }
  return check === 0;
}
1164
// India Aadhaar validator: after dropping whitespace and hyphens the value must
// be exactly 12 digits with a first digit of 2-9, and the full number must pass
// the Verhoeff check.
function inAadhaarValid(value) {
  const compact = value.replace(/[\s-]/g, "");
  const shapeOk = /^[2-9]\d{11}$/.test(compact);
  // Only run the checksum once the shape is known to be 12 plain digits.
  return shapeOk && verhoeffValid(compact);
}
1171
+
1172
// Germany tax ID (Steuer-Identifikationsnummer). Two guards apply: (1) the
// structural rule that the first 10 digits contain exactly one repeated digit
// (one value appears 2 or 3 times, every other value at most once), and (2) the
// ISO 7064 MOD 11,10 check digit over the first 10, which must equal the 11th.
// Dial-eligible: a bare 11-digit run with no non-numeric anchor over a common
// length (the jp_mynumber discipline), even though the combined guard is strong.
//
// This helper implements guard (1) for the 10-character prefix.
function deSteuerIdStructural(first10) {
  // Tally how often each character occurs.
  const tally = new Map();
  [...first10].forEach((ch) => {
    tally.set(ch, 1 + (tally.get(ch) ?? 0));
  });
  const repeated = Array.from(tally.values()).filter((n) => n > 1);
  if (repeated.length !== 1) {
    // Either nothing repeats or more than one value repeats.
    return false;
  }
  // The single repeated value may occur 2 or 3 times, never more.
  return repeated[0] < 4;
}
1187
// Computes the ISO 7064 MOD 11,10 check digit for the first 10 digits of a
// German tax ID. Starting from a running product of 10, each digit is added to
// the product mod 10 (a result of 0 counts as 10), then doubled mod 11. The
// check digit is 11 minus the final product, with 10 folded to 0.
function deSteuerIdCheckDigit(first10) {
  let carry = 10;
  for (let i = 0; i < 10; i += 1) {
    const raw = (Number(first10[i]) + carry) % 10;
    const partial = raw === 0 ? 10 : raw;
    carry = (2 * partial) % 11;
  }
  const digit = 11 - carry;
  return digit === 10 ? 0 : digit;
}
1202
// German tax ID validator: after dropping whitespace and slashes the value must
// be exactly 11 digits, the first 10 must satisfy the structural repeat rule,
// and the 11th must equal the computed check digit.
function deSteuerIdValid(value) {
  const compact = value.replace(/[\s/]/g, "");
  if (!/^\d{11}$/.test(compact)) {
    return false;
  }
  const payload = compact.slice(0, 10);
  const declared = Number(compact[10]);
  // Structural guard first; the checksum is only computed when it passes.
  return deSteuerIdStructural(payload) && deSteuerIdCheckDigit(payload) === declared;
}
1213
+
1214
// Netherlands BSN "11-proef": Σ (digit_i · weight_i) ≡ 0 mod 11 over the 9
// digits, with weights 9,8,…,2 for the first eight digits and -1 for the last.
// Whitespace and dots are treated as separators; the all-zero number is
// rejected. Dial-eligible: 9 bare digits is very common, so the 11-proef passes
// ~1/11 of random runs (the clearest jp_mynumber-style footgun).
function nlBsnValid(value) {
  const bsn = value.replace(/[\s.]/g, "");
  const shapeOk = /^\d{9}$/.test(bsn);
  if (!shapeOk || /^0{9}$/.test(bsn)) {
    return false;
  }
  const weights = [9, 8, 7, 6, 5, 4, 3, 2, -1];
  const total = weights.reduce((acc, weight, i) => acc + weight * Number(bsn[i]), 0);
  return total % 11 === 0;
}
1233
+
1234
// UK National Insurance Number — FORMAT-ONLY (no checksum exists), which is why
// uk_nino stays OUT of HARD_BLOCK_TYPES (dial-eligible). After stripping
// whitespace the value must be two letters, six digits and a suffix letter A-D.
// On top of that shape this validator rejects the documented invalid PREFIX
// PAIRS (BG, GB, NK, KN, TN, NT, ZZ), a first letter of D/F/I/Q/U/V, and a
// second letter of D/F/I/O/Q/U/V (which also covers `OO`). Those prefixes are
// never issued, so excluding them lifts precision.
const UK_NINO_INVALID_PREFIXES = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);
function ukNinoValid(value) {
  const nino = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{6}[A-D]$/.test(nino)) {
    return false;
  }
  const [first, second] = nino;
  if (UK_NINO_INVALID_PREFIXES.has(first + second)) {
    return false;
  }
  // First letter never D/F/I/Q/U/V; second letter never D/F/I/O/Q/U/V.
  const firstAllowed = !/[DFIQUV]/.test(first);
  const secondAllowed = !/[DFIOQUV]/.test(second);
  return firstAllowed && secondAllowed;
}