haechi 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.ko.md +46 -11
  2. package/README.md +46 -11
  3. package/SECURITY.md +7 -1
  4. package/docs/README.md +2 -0
  5. package/docs/current/compliance-mapping.ko.md +53 -0
  6. package/docs/current/compliance-mapping.md +53 -0
  7. package/docs/current/config-version.ko.md +30 -0
  8. package/docs/current/config-version.md +51 -0
  9. package/docs/current/configuration.ko.md +165 -9
  10. package/docs/current/configuration.md +165 -9
  11. package/docs/current/operations-runbook.ko.md +155 -0
  12. package/docs/current/operations-runbook.md +241 -0
  13. package/docs/current/release-process.ko.md +5 -1
  14. package/docs/current/release-process.md +5 -1
  15. package/docs/current/risk-register-release-gate.ko.md +5 -3
  16. package/docs/current/risk-register-release-gate.md +13 -3
  17. package/docs/current/security-whitepaper.ko.md +102 -0
  18. package/docs/current/security-whitepaper.md +102 -0
  19. package/docs/current/shared-responsibility.ko.md +2 -2
  20. package/docs/current/shared-responsibility.md +2 -2
  21. package/docs/current/threat-model.ko.md +4 -2
  22. package/docs/current/threat-model.md +4 -2
  23. package/examples/local-proxy-demo/README.md +51 -0
  24. package/examples/local-proxy-demo/demo.mjs +144 -0
  25. package/examples/local-proxy-demo/demo.tape +19 -0
  26. package/examples/local-proxy-demo/live-demo.mjs +121 -0
  27. package/examples/local-proxy-demo/live-demo.tape +25 -0
  28. package/haechi.config.example.json +20 -3
  29. package/package.json +7 -2
  30. package/packages/audit/index.mjs +26 -2
  31. package/packages/cli/bin/haechi.mjs +57 -10
  32. package/packages/cli/runtime.mjs +402 -10
  33. package/packages/core/index.mjs +143 -8
  34. package/packages/filter/index.mjs +975 -12
  35. package/packages/metrics/index.mjs +181 -0
  36. package/packages/privacy-profiles/index.mjs +72 -3
  37. package/packages/protocol-adapters/index.mjs +99 -1
  38. package/packages/proxy/index.mjs +525 -40
  39. package/packages/stream-filter/index.mjs +69 -7
@@ -1,3 +1,52 @@
1
+ import { isUtf8 } from "node:buffer";
2
+
3
+ // The hard-block detection types: a leak of one of these is a load-bearing
4
+ // fail-closed concern, so the WS2c precision dials (filters.minConfidence,
5
+ // filters.allowlist) may NOT suppress a detection of any of them. minConfidence
6
+ // trims only the precision-risky SOFT types; the allowlist's per-value/per-path
7
+ // exceptions are ignored for these types (the detection still fires). Exported
8
+ // so the core detect→decide path enforces the same exemption set the docs pin.
9
+ //
10
+ // Hard-block types are sensitive AND have a STRONG enough anchor that a match is
11
+ // effectively a true positive by construction, so the precision dials
12
+ // (filters.minConfidence / filters.allowlist) can never suppress them:
13
+ // - kr_rrn / card — checksum + constrained format
14
+ // - fr_nir (mod-97 over a long structured 15-digit run) and es_dni (mod-23 plus
15
+ // a required check LETTER suffix) — a random same-shaped value almost never
16
+ // passes, and the shapes are rare in ordinary payloads.
17
+ // - it_codice_fiscale — a 16-char MIXED alpha+digit shape with a mod-26 check
18
+ // CHARACTER. The non-numeric structural anchor (the rigid letter/digit layout)
19
+ // makes a benign 16-char `[A-Z]{6}\d{2}[A-Z]…` run in an ordinary payload
20
+ // implausible (measured collision ~3.8% over an already-rare shape), so a match
21
+ // is effectively a true positive.
22
+ // - sg_nric — a LETTER prefix ([STFGM]) + 7 digits + a CHECK LETTER. Two
23
+ // non-numeric anchors (prefix letter + checksum letter) over a rare shape
24
+ // (measured collision ~3.9% over the prefix+letter shape) make a benign FP
25
+ // implausible. Both anchored, un-allowlistable.
26
+ // DELIBERATELY DIAL-ELIGIBLE (NOT hard-block) — bare-digit runs whose only guard is
27
+ // a single numeric checksum over a COMMON digit length, so a benign id/counter FP is
28
+ // plausible and the operator needs the allowlist/minConfidence escape hatch (the
29
+ // jp_mynumber precedent). They still detect + (per profile) block by default:
30
+ // - jp_mynumber — a bare 12-digit run + a SINGLE mod-11 check digit (measured
31
+ // ~9% of random 12-digit numbers pass), and 12-digit ids/counters are common.
32
+ // - uk_nino — NO checksum exists (format + invalid-prefix exclusions only), the
33
+ // largest FP surface.
34
+ // - in_aadhaar — a bare 12-digit run + the Verhoeff checksum (measured ~9.9% of
35
+ // random 12-digit runs pass, the same order as jp_mynumber's mod-11). Aadhaar
36
+ // is extremely sensitive, but Verhoeff over the COMMON 12-digit shape is exactly
37
+ // the jp_mynumber footgun (a 12-digit id/counter is common), so it stays
38
+ // allowlist-clearable rather than un-suppressable.
39
+ // - de_steuer_id — a bare 11-digit run + ISO 7064 MOD 11,10 plus a structural
40
+ // "exactly one repeated digit" test. The combined guard is strong (measured
41
+ // ~0.37% collision), BUT there is NO non-numeric anchor and 11-digit ids are
42
+ // common in payloads, so per the jp_mynumber discipline (a bare-digit shape over
43
+ // a common length) it stays DIAL-ELIGIBLE so an operator can clear an 11-digit-id
44
+ // FP. It still detects + blocks by default.
45
+ // - nl_bsn — a bare 9-digit run + the "11-proef" weighted mod-11 (measured ~9.1%
46
+ // of random 9-digit runs pass). 9 bare digits is VERY common, so this is the
47
+ // clearest dial-eligible case.
48
+ export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card", "fr_nir", "es_dni", "it_codice_fiscale", "sg_nric"]);
49
+
1
50
  const DEFAULT_RULES = [
2
51
  {
3
52
  id: "email",
@@ -9,9 +58,15 @@ const DEFAULT_RULES = [
9
58
  {
10
59
  // KR mobile numbers (01[016789] prefixes); landlines are out of scope.
11
60
  // krPhoneValid keeps a bare separator-less run from matching a timestamp/id.
61
+ // The leading `(?<![\w+-])` / trailing `(?![\w-])` boundaries (WS2c) stop the
62
+ // rule from matching a phone-shaped digit run that is a SUBSTRING of a longer
63
+ // hex/alnum/dashed run — e.g. the `…a716-446655440000` tail of a UUID, where
64
+ // the inner `16-44665544` otherwise mis-fired as a phone. The boundaries
65
+ // never affect a real number: a KR mobile sits on a word/space/punctuation
66
+ // edge and `+82` starts on the `+` (allowed before the boundary).
12
67
  id: "kr-phone",
13
68
  type: "phone",
14
- pattern: "(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}",
69
+ pattern: "(?<![\\w+-])(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}(?![\\w-])",
15
70
  flags: "g",
16
71
  confidence: 0.9,
17
72
  validate: krPhoneValid
@@ -40,6 +95,146 @@ const DEFAULT_RULES = [
40
95
  confidence: 0.95
41
96
  },
42
97
  {
98
+ // AWS access key id: a long-lived (AKIA) or temporary (ASIA) key id is a
99
+ // hard-anchored prefix + EXACTLY 16 uppercase-alphanumeric chars. The fixed
100
+ // prefix + fixed length is what makes this high-precision (no bare base64).
101
+ id: "aws-access-key-id",
102
+ type: "api_key",
103
+ pattern: "\\b(?:AKIA|ASIA)[0-9A-Z]{16}\\b",
104
+ flags: "g",
105
+ confidence: 0.95
106
+ },
107
+ {
108
+ // GitHub token: pat (ghp_), oauth (gho_), user-to-server (ghu_), server-to-
109
+ // server (ghs_), refresh (ghr_). Anchored prefix + a long base64-ish body.
110
+ // GitHub's own format is 36 chars after the prefix; we allow >=36 (the
111
+ // corpus fixture is 38) and cap to keep the match bounded.
112
+ id: "github-token",
113
+ type: "secret",
114
+ pattern: "\\bgh[pousr]_[A-Za-z0-9]{36,255}\\b",
115
+ flags: "g",
116
+ confidence: 0.95
117
+ },
118
+ {
119
+ // Google API key: anchored AIza + exactly 35 chars from the URL-safe
120
+ // alphabet. Fixed prefix + fixed length = high precision.
121
+ id: "google-api-key",
122
+ type: "api_key",
123
+ pattern: "\\bAIza[0-9A-Za-z_-]{35}\\b",
124
+ flags: "g",
125
+ confidence: 0.9
126
+ },
127
+ {
128
+ // Slack token: bot (xoxb-), user (xoxa/xoxp-), refresh (xoxr-), legacy
129
+ // (xoxs-). Anchored xox[baprs]- + a >=10-char body. The corpus value is a
130
+ // deliberately low-entropy placeholder, so the rule anchors on the prefix +
131
+ // body shape, not entropy.
132
+ id: "slack-token",
133
+ type: "secret",
134
+ pattern: "\\bxox[baprs]-[0-9A-Za-z-]{10,}\\b",
135
+ flags: "g",
136
+ confidence: 0.9
137
+ },
138
+ {
139
+ // Anthropic API key: `sk-ant-` + a long body. Ordered BEFORE the OpenAI rule
140
+ // below so a Claude key is attributed to its own rule (both emit `secret`, so
141
+ // removeOverlaps collapsing the shared span to either is type-identical — the
142
+ // ordering is for ruleId attribution, not for the scored type). The HYPHEN
143
+ // after `sk` is load-bearing: it keeps this OFF the underscore-based Stripe/
144
+ // OpenAI-platform `sk_` rule (openai-like-key, `api_key`), so the two never
145
+ // collide on the same span.
146
+ id: "anthropic-api-key",
147
+ type: "secret",
148
+ pattern: "\\bsk-ant-[A-Za-z0-9_-]{16,}\\b",
149
+ flags: "g",
150
+ confidence: 0.95
151
+ },
152
+ {
153
+ // OpenAI API key: `sk-` (and project keys `sk-proj-`) + a long base62-ish
154
+ // body. The HYPHEN is the disambiguator from Stripe/OpenAI-platform `sk_`
155
+ // (underscore — handled by openai-like-key as `api_key`); this rule never
156
+ // matches an underscore form, so the two prefixes do not overlap. A >=20-char
157
+ // body keeps a bare `sk-foo` slug from firing. The Anthropic `sk-ant-` rule
158
+ // above is a stricter sibling that runs first.
159
+ id: "openai-api-key",
160
+ type: "secret",
161
+ pattern: "\\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\\b",
162
+ flags: "g",
163
+ confidence: 0.9
164
+ },
165
+ {
166
+ // Google OAuth client secret: anchored `GOCSPX-` + exactly 28 chars from the
167
+ // URL-safe alphabet. Fixed prefix + fixed length = high precision (this is the
168
+ // OAuth client secret, distinct from the `AIza` API key above).
169
+ id: "google-oauth-client-secret",
170
+ type: "secret",
171
+ pattern: "\\bGOCSPX-[A-Za-z0-9_-]{28}\\b",
172
+ flags: "g",
173
+ confidence: 0.95
174
+ },
175
+ {
176
+ // SendGrid API key: `SG.` + 22 URL-safe chars + `.` + 43 URL-safe chars. The
177
+ // two fixed-length dotted segments after the `SG.` prefix are what make this
178
+ // high-precision — a bare `SG.`-prefixed string of the wrong shape is rejected.
179
+ id: "sendgrid-api-key",
180
+ type: "api_key",
181
+ pattern: "\\bSG\\.[A-Za-z0-9_-]{22}\\.[A-Za-z0-9_-]{43}\\b",
182
+ flags: "g",
183
+ confidence: 0.95
184
+ },
185
+ {
186
+ // Twilio Account SID (AC…) / API Key SID (SK…): the fixed `AC`/`SK` prefix +
187
+ // EXACTLY 32 HEX chars. The hex-only body (not base62) is the precision guard:
188
+ // a random alphanumeric run of the same length carries non-hex letters and is
189
+ // rejected, and the `SK` form does not collide with the underscore/hyphen
190
+ // `sk_`/`sk-` rules. Twilio's bare 32-hex AUTH TOKEN is deliberately NOT a
191
+ // standalone rule (a prefix-less 32-hex run is indistinguishable from an MD5
192
+ // hash / id) — it is caught via the `<key> = <value>` assignment vocabulary.
193
+ id: "twilio-sid",
194
+ type: "api_key",
195
+ pattern: "\\b(?:AC|SK)[0-9a-fA-F]{32}\\b",
196
+ flags: "g",
197
+ confidence: 0.9
198
+ },
199
+ {
200
+ // npm access token: anchored `npm_` + exactly 36 base62 chars. Fixed prefix +
201
+ // fixed length = high precision.
202
+ id: "npm-token",
203
+ type: "secret",
204
+ pattern: "\\bnpm_[A-Za-z0-9]{36}\\b",
205
+ flags: "g",
206
+ confidence: 0.95
207
+ },
208
+ {
209
+ // JWT: three dot-separated base64url segments where the FIRST starts with
210
+ // `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
211
+ // on `eyJ` + two more base64url groups keeps this from matching arbitrary
212
+ // dotted tokens (a bare base64 triplet without the JSON header is not a JWT).
213
+ id: "jwt",
214
+ type: "secret",
215
+ pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
216
+ flags: "g",
217
+ confidence: 0.9
218
+ },
219
+ {
220
+ // PEM private key: the armored header. We match the header line itself
221
+ // (`-----BEGIN [...] PRIVATE KEY-----`) — its presence is the credential
222
+ // signal; the body is high-entropy base64 we do not need to span. Covers
223
+ // RSA/EC/OPENSSH/DSA/ENCRYPTED variants and the bare `PRIVATE KEY` form.
224
+ id: "pem-private-key",
225
+ type: "secret",
226
+ pattern: "-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
227
+ flags: "g",
228
+ confidence: 0.98
229
+ },
230
+ {
231
+ // Bearer credential. Deliberately NOT context-anchored to `Authorization:`:
232
+ // detection runs PER STRING LEAF, and a real payload carries the credential
233
+ // as its own leaf (`{"Authorization": "Bearer <token>"}` walks to the bare
234
+ // value `"Bearer <token>"`), so a lookbehind requiring the header key in the
235
+ // same string would MISS the realistic case — a recall regression on a
236
+ // hard-block (`secret`) type. `secret` is fail-closed: a `Bearer …` prose
237
+ // false positive is the accepted cost of never missing a leaked token.
43
238
  id: "bearer-token",
44
239
  type: "secret",
45
240
  pattern: "\\bBearer\\s+[A-Za-z0-9._~+/-]{16,}\\b",
@@ -50,11 +245,193 @@ const DEFAULT_RULES = [
50
245
  id: "assignment-secret",
51
246
  type: "secret",
52
247
  // Lookbehind keeps the key name out of the match so transforms replace
53
- // only the secret value, not the assignment prefix.
54
- pattern: "(?<=\\b(?:api[_-]?key|secret|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
248
+ // only the secret value, not the assignment prefix. The key vocabulary
249
+ // covers the common credential-assignment names (cloud secrets, OAuth
250
+ // client secrets, PEM/private keys, access/refresh tokens) so a
251
+ // `<key> = <value>` leak is caught even when the value itself has no
252
+ // self-describing prefix (e.g. an AWS secret access key is bare base64).
253
+ // `accountkey` catches the Azure Storage connection-string `AccountKey=<88-
254
+ // char base64>=` segment — an un-anchored 88-char base64 rule would false-fire
255
+ // on any blob, so the `AccountKey=` assignment context is the precision anchor.
256
+ // `auth[_-]?token` catches the Twilio auth token (a bare 32-hex run with no
257
+ // self-describing prefix) when it is leaked as a `<key> = <value>` pair.
258
+ pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|auth[_-]?token|accountkey|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
55
259
  flags: "gi",
56
260
  confidence: 0.85
57
261
  },
262
+ {
263
+ // US SSN: AAA-GG-SSSS. The format alone collides with 9-digit ids, so a
264
+ // validator rejects the SSA-invalid ranges (area 000/666/900-999, group 00,
265
+ // serial 0000). The separators are required by the pattern — a bare 9-digit
266
+ // run is intentionally NOT matched (it is indistinguishable from an id).
267
+ id: "us-ssn",
268
+ type: "us_ssn",
269
+ pattern: "(?<![\\w-])\\d{3}-\\d{2}-\\d{4}(?![\\w-])",
270
+ flags: "g",
271
+ confidence: 0.85,
272
+ validate: usSsnValid
273
+ },
274
+ {
275
+ // IBAN: country(2 alpha) + 2 check digits + BBAN. The mod-97 checksum is
276
+ // what makes this high-precision — a random alnum run of the right shape
277
+ // almost never satisfies mod-97 == 1. Length 15-34 per ISO 13616.
278
+ id: "iban",
279
+ type: "iban",
280
+ pattern: "(?<![A-Z0-9])[A-Z]{2}\\d{2}[A-Z0-9]{11,30}(?![A-Z0-9])",
281
+ flags: "g",
282
+ confidence: 0.9,
283
+ validate: ibanValid
284
+ },
285
+ {
286
+ // Japan My Number (個人番号): EXACTLY 12 digits with the official mod-11
287
+ // weighted check digit over the first 11. A bare 12-digit run is ambiguous
288
+ // (an id/timestamp), so jpMyNumberValid is the precision guard — only a run
289
+ // whose 12th digit equals the prescribed check digit fires. The leading/
290
+ // trailing boundaries (`(?<![\d-])`/`(?![\d-])`) stop the rule from matching
291
+ // a 12-digit window inside a longer digit/dashed run. NOT hard-block: a single
292
+ // mod-11 check digit only rejects ~10/11 of random 12-digit runs, and such
293
+ // runs are common (ids/counters), so a benign FP is plausible and the operator
294
+ // keeps the allowlist escape hatch (it still detects + blocks by default).
295
+ id: "jp-mynumber",
296
+ type: "jp_mynumber",
297
+ pattern: "(?<![\\d-])\\d{12}(?![\\d-])",
298
+ flags: "g",
299
+ confidence: 0.9,
300
+ validate: jpMyNumberValid
301
+ },
302
+ {
303
+ // France NIR / INSEE social-security: 15 chars where the department field may
304
+ // carry the Corsica `2A`/`2B` letters, validated by the control key
305
+ // `97 - (first13 mod 97) == last2` (Corsica 2A→19, 2B→18 before the mod).
306
+ // The control key is the precision guard — a wrong key is rejected. The
307
+ // department alpha is optional so the pure-numeric form also matches. Anchored
308
+ // on word boundaries; hard-block (checksummed).
309
+ id: "fr-nir",
310
+ type: "fr_nir",
311
+ pattern: "(?<![\\w-])[12]\\d{2}(?:0[1-9]|1[0-2]|20)(?:\\d{2}|2[AB])\\d{6}\\d{2}(?![\\w-])",
312
+ flags: "g",
313
+ confidence: 0.9,
314
+ validate: frNirValid
315
+ },
316
+ {
317
+ // Spain DNI/NIE: 8 digits (DNI) or a leading X/Y/Z + 7 digits (NIE) + a check
318
+ // letter from the mod-23 table (NIE maps X/Y/Z→0/1/2 before the mod). The
319
+ // check letter is the precision guard — a wrong letter is rejected. The
320
+ // letters that can never appear (I/O/U) are excluded from the suffix class so
321
+ // an ordinary `<8-digit><letter>` token rarely even reaches the validator.
322
+ // Hard-block (checksummed).
323
+ id: "es-dni-nie",
324
+ type: "es_dni",
325
+ pattern: "(?<![\\w-])[XYZ]?\\d{7,8}[A-HJ-NP-TV-Z](?![\\w-])",
326
+ flags: "gi",
327
+ confidence: 0.85,
328
+ validate: esDniValid
329
+ },
330
+ {
331
+ // Italy Codice Fiscale: 16 chars in the rigid layout
332
+ // [A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z] (surname, name, year, month-letter,
333
+ // day, place code, check character). The 16th char is the official check
334
+ // character: sum the odd/even-position table values over the first 15 chars
335
+ // and map (sum mod 26) to a letter. The mixed alpha+digit structure + the
336
+ // mod-26 check character are the precision guard. Hard-block (strong
337
+ // non-numeric anchor over a rare shape).
338
+ id: "it-codice-fiscale",
339
+ type: "it_codice_fiscale",
340
+ pattern: "(?<![A-Z0-9])[A-Z]{6}\\d{2}[A-Z]\\d{2}[A-Z]\\d{3}[A-Z](?![A-Z0-9])",
341
+ flags: "gi",
342
+ confidence: 0.9,
343
+ validate: itCodiceFiscaleValid
344
+ },
345
+ {
346
+ // Singapore NRIC/FIN: a series LETTER ([STFGM]) + 7 digits + a CHECK LETTER.
347
+ // The check letter is a weighted sum (weights 2,7,6,5,4,3,2) plus a per-prefix
348
+ // offset (T/G +4, M +3), mod 11, looked up in the per-series letter table. The
349
+ // prefix letter + check letter are the precision guard. Hard-block (two
350
+ // non-numeric anchors over a rare shape).
351
+ id: "sg-nric",
352
+ type: "sg_nric",
353
+ pattern: "(?<![A-Z0-9])[STFGMstfgm]\\d{7}[A-Za-z](?![A-Z0-9])",
354
+ flags: "g",
355
+ confidence: 0.9,
356
+ validate: sgNricValid
357
+ },
358
+ {
359
+ // India Aadhaar: 12 digits (never starting 0 or 1) with the Verhoeff checksum
360
+ // over all 12. A bare 12-digit run is ambiguous (an id/timestamp), so the
361
+ // Verhoeff check is the precision guard. NOT hard-block: Verhoeff over the
362
+ // COMMON 12-digit shape passes ~1/10 of random runs (the jp_mynumber footgun),
363
+ // so it stays dial-eligible (still detects + blocks by default). The leading/
364
+ // trailing boundaries stop the rule from matching a 12-digit window inside a
365
+ // longer digit/dashed run.
366
+ id: "in-aadhaar",
367
+ type: "in_aadhaar",
368
+ pattern: "(?<![\\d-])[2-9]\\d{11}(?![\\d-])",
369
+ flags: "g",
370
+ confidence: 0.85,
371
+ validate: inAadhaarValid
372
+ },
373
+ {
374
+ // Germany tax ID (Steuer-Identifikationsnummer): 11 digits with the ISO 7064
375
+ // MOD 11,10 check digit over the first 10, plus the structural rule that the
376
+ // first 10 digits contain exactly one repeated digit (one value appears 2 or 3
377
+ // times, the rest once). The combined guard is strong, but it is a BARE-DIGIT
378
+ // run with no non-numeric anchor over a common 11-digit length, so per the
379
+ // jp_mynumber discipline it stays dial-eligible (the operator can clear an
380
+ // 11-digit-id FP).
381
+ id: "de-steuer-id",
382
+ type: "de_steuer_id",
383
+ pattern: "(?<![\\d-])\\d{11}(?![\\d-])",
384
+ flags: "g",
385
+ confidence: 0.85,
386
+ validate: deSteuerIdValid
387
+ },
388
+ {
389
+ // Netherlands BSN (Burgerservicenummer): 9 digits validated by the "11-proef"
390
+ // weighted mod-11 (Σ digit_i · weight_i ≡ 0 mod 11, with the last weight -1).
391
+ // 9 bare digits is VERY common, so the 11-proef passes ~1/11 of random runs —
392
+ // the clearest dial-eligible case (still detects + blocks by default; the
393
+ // operator keeps the allowlist escape hatch).
394
+ id: "nl-bsn",
395
+ type: "nl_bsn",
396
+ pattern: "(?<![\\d-])\\d{9}(?![\\d-])",
397
+ flags: "g",
398
+ confidence: 0.8,
399
+ validate: nlBsnValid
400
+ },
401
+ {
402
+ // UK National Insurance Number: two prefix letters + 6 digits + a suffix
403
+ // A-D. There is NO checksum, so this is FORMAT-ONLY and stays OUT of
404
+ // HARD_BLOCK_TYPES (dial-eligible). The pattern bakes in the documented
405
+ // invalid-prefix exclusions: 1st letter never D/F/I/Q/U/V, 2nd letter never
406
+ // D/F/I/O/Q/U/V, and the disallowed pairs BG/GB/NK/KN/TN/NT/ZZ are rejected
407
+ // by ukNinoValid (a negative-set the regex can't express cleanly).
408
+ id: "uk-nino",
409
+ type: "uk_nino",
410
+ pattern: "(?<![\\w-])[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D](?![\\w-])",
411
+ flags: "g",
412
+ confidence: 0.7,
413
+ validate: ukNinoValid
414
+ },
415
+ {
416
+ // E.164 international phone: ONLY with a leading `+` (a bare digit run is an
417
+ // id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
418
+ id: "e164-phone",
419
+ type: "phone",
420
+ pattern: "(?<![\\w+])\\+[1-9]\\d{6,14}(?![\\w])",
421
+ flags: "g",
422
+ confidence: 0.8
423
+ },
424
+ {
425
+ // US national phone: ONLY with separators — `(NXX) NXX-XXXX` or
426
+ // `NXX-NXX-XXXX`. A separator-less 10-digit run is deliberately NOT matched
427
+ // (it collides with ids/timestamps; the kr-phone rule already guards bare
428
+ // runs). Conservative by design — phone is the highest false-positive risk.
429
+ id: "us-phone",
430
+ type: "phone",
431
+ pattern: "(?<![\\w-])(?:\\(\\d{3}\\)\\s?|\\d{3}-)\\d{3}-\\d{4}(?![\\w-])",
432
+ flags: "g",
433
+ confidence: 0.75
434
+ },
58
435
  // Indirect prompt injection heuristics. Response/tool-result direction only,
59
436
  // and the policy default for the injection type is `allow` (report-only):
60
437
  // detections are audited regardless of action, and false-positive blocks
@@ -101,8 +478,13 @@ const DEFAULT_RULES = [
101
478
  }
102
479
  ];
103
480
 
104
- export function createDefaultFilterEngine({ customRules = [] } = {}) {
481
+ export function createDefaultFilterEngine({ customRules = [], decodeAndRescan = false } = {}) {
105
482
  const rules = DEFAULT_RULES.concat(customRules.map(normalizeCustomRule));
483
+ // The opt-in base64/percent decode-and-rescan pass (WS2d residual). Default OFF
484
+ // => byte-identical to prior behavior. Held in the engine CLOSURE, NOT threaded
485
+ // through the protect `context`: the request context is data and must not carry
486
+ // this control flag (it would pollute tokenize AAD / audit).
487
+ const decodeOptions = { decodeAndRescan: decodeAndRescan === true };
106
488
 
107
489
  return {
108
490
  id: "haechi.filter.default",
@@ -112,12 +494,32 @@ export function createDefaultFilterEngine({ customRules = [] } = {}) {
112
494
  networkEgress: false
113
495
  },
114
496
  async detect({ entries, context }) {
115
- return entries.flatMap((entry) => detectEntry(entry, rules, context));
497
+ return entries.flatMap((entry) => detectEntry(entry, rules, context, decodeOptions));
116
498
  }
117
499
  };
118
500
  }
119
501
 
120
- export function detectEntry(entry, rules, context = {}) {
502
+ export function detectEntry(entry, rules, context = {}, options = {}) {
503
+ const baseDetections = scanEntry(entry, rules, context);
504
+ // WS2d residual — opt-in (default OFF) base64/percent decode-and-rescan. After
505
+ // the normal NFKC scan above, if the flag is on, attempt to decode the leaf and
506
+ // rescan the decoded text. A decoded hit has NO valid offset in the encoded leaf
507
+ // (decoding remaps everything), so it fails closed to a WHOLE-LEAF detection of
508
+ // the original encoded leaf — and only fires for a validator-backed/hard-block
509
+ // hit so random base64 never false-positives. See decodeAndRescanEntry.
510
+ if (options?.decodeAndRescan === true) {
511
+ const decoded = decodeAndRescanEntry(entry, rules, context);
512
+ if (decoded.length > 0) {
513
+ return baseDetections.concat(decoded);
514
+ }
515
+ }
516
+ return baseDetections;
517
+ }
518
+
519
+ // The original per-leaf NFKC scan (WS2d), unchanged. Extracted from detectEntry so
520
+ // the opt-in decode-and-rescan pass wraps it without touching the byte-identical
521
+ // default path.
522
+ function scanEntry(entry, rules, context = {}) {
121
523
  const detections = [];
122
524
  // On the RESPONSE direction, a bare JSON NUMBER leaf is inference-server
123
525
  // metadata (a nanosecond `*_duration`, a token count, a numeric id/timestamp) —
@@ -139,8 +541,62 @@ export function detectEntry(entry, rules, context = {}) {
139
541
  // marker-shaped string is NOT Haechi output (Haechi hasn't transformed it yet),
140
542
  // so it is scanned normally — otherwise an attacker could wrap a real secret in
141
543
  // a fake `[TOKEN:…]` to evade request-side detection.
544
+ // Markers are pure ASCII and NFKC-stable, so their spans are computed on the
545
+ // ORIGINAL value exactly as before — they line up with the position-stable
546
+ // normalized scan (Case 2 below) and are irrelevant to the whole-leaf scan
547
+ // (Case 3).
142
548
  const markerSpans = context?.direction === "response" ? haechiMarkerSpans(entry.value) : [];
143
549
 
550
+ // WS2d — Unicode evasion via NFKC normalization. A client can defeat every
551
+ // regex rule by sending PII/secrets in a Unicode form that folds to ASCII
552
+ // (full-width digits `４２４２…`, full-width `＠`, mathematical/enclosed
553
+ // alphanumerics). NFKC normalization maps those to their compatibility ASCII
554
+ // form so the rules match. The crux is OFFSET INTEGRITY: detections carry
555
+ // {start,end} into entry.value, but the transform slices the ORIGINAL string
556
+ // (packages/core transformString). Three cases keep offsets valid:
557
+ const value = entry.value;
558
+ const normalized = value.normalize("NFKC");
559
+ if (normalized === value) {
560
+ // Case 1 (~99%): nothing folded. Detect on the original exactly as before —
561
+ // byte-identical behavior, zero regression.
562
+ return removeOverlaps(scanForDetections(value, rules, context, markerSpans, entry, value));
563
+ }
564
+ if (isPositionStableNfkc(value, normalized)) {
565
+ // Case 2: every codepoint folded to the SAME UTF-16 length and the per-
566
+ // codepoint folds reconstruct the whole normalization, so each original
567
+ // character occupies the SAME offsets in `normalized` as in `value` (e.g.
568
+ // full-width→ASCII digits/letters). A match's {start,end} on `normalized` are
569
+ // therefore valid on the ORIGINAL value — exact-span redaction of the evaded
570
+ // value, with the recorded `value` taken from the original slice so
571
+ // tokenize/AAD/audit see the real bytes. A bare `normalized.length ===
572
+ // value.length` check is UNSOUND: a length-contracting codepoint before the
573
+ // PII compensated by a length-expanding one after it keeps the total length
574
+ // equal yet shifts every interior offset (redacting the wrong bytes), so such
575
+ // inputs must fall through to the Case 3 whole-leaf path. Validators still run
576
+ // on the normalized match text (Luhn/RRN need ASCII digits).
577
+ return removeOverlaps(scanForDetections(normalized, rules, context, markerSpans, entry, value));
578
+ }
579
+ // Case 3: the fold is NOT position-stable (a length-changing decomposition, or a
580
+ // compensating contraction+expansion that shifts interior offsets). Offsets on
581
+ // the normalized copy do NOT map back to the original, so we CANNOT do exact-span
582
+ // redaction.
583
+ // FAIL CLOSED: emit ONE detection per matched type covering the WHOLE leaf so
584
+ // the transform redacts/blocks the entire leaf. Over-redacting an evasion
585
+ // attempt is the safe failure. removeOverlaps is intentionally skipped — every
586
+ // detection spans the whole leaf so they all "overlap"; the transform collapses
587
+ // them to a single whole-leaf replacement via its cursor, and any `block` among
588
+ // them blocks the payload, while preserving per-type detection reporting.
589
+ return wholeLeafDetections(normalized, rules, context, entry, value);
590
+ }
591
+
592
+ // Run every applicable rule over `scanText` (the original value, or its
593
+ // position-stable NFKC normalization). Offsets index `scanText`, which is positionally
594
+ // 1:1 with `originalValue` (Case 1: identical; Case 2: position-stable per-codepoint fold), so the
595
+ // {start,end} are valid on `originalValue`. The recorded `value` is the ORIGINAL
596
+ // slice (never the normalized form). Marker spans (response-only) are computed on
597
+ // the original and align under both cases.
598
+ function scanForDetections(scanText, rules, context, markerSpans, entry, originalValue) {
599
+ const detections = [];
144
600
  for (const rule of rules) {
145
601
  // Direction-scoped rules (e.g. injection heuristics) only run on the
146
602
  // matching traffic direction; rules without a direction run everywhere.
@@ -148,13 +604,13 @@ export function detectEntry(entry, rules, context = {}) {
148
604
  continue;
149
605
  }
150
606
  const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
151
- for (const match of entry.value.matchAll(regex)) {
152
- const value = match[0];
153
- if (rule.validate && !rule.validate(value)) {
607
+ for (const match of scanText.matchAll(regex)) {
608
+ const matchText = match[0];
609
+ if (rule.validate && !rule.validate(matchText)) {
154
610
  continue;
155
611
  }
156
612
  const start = match.index;
157
- const end = match.index + value.length;
613
+ const end = match.index + matchText.length;
158
614
  if (overlapsAny(start, end, markerSpans)) {
159
615
  continue;
160
616
  }
@@ -167,12 +623,205 @@ export function detectEntry(entry, rules, context = {}) {
167
623
  start,
168
624
  end,
169
625
  confidence: rule.confidence,
170
- value
626
+ value: originalValue.slice(start, end)
171
627
  });
172
628
  }
173
629
  }
630
+ return detections;
631
+ }
632
+
633
// Fail-closed whole-leaf scan. `normalized` is the text the rules are matched
// against (the caller's folded or decoded form of the leaf). Any hit is reported
// against the ORIGINAL leaf as one span covering all of it (start 0, end
// originalValue.length, value originalValue), because offsets found in the
// scanned text are not mapped back onto the original.
//
// At most one detection is emitted per rule type: the first eligible rule of a
// type that yields a match (one that passes `rule.validate` when the rule has a
// validator) wins. Rules scoped to another traffic direction are skipped, and so
// are rules rejected by the optional `ruleFilter` predicate (null → every rule is
// eligible). No marker-span skipping is performed on this path.
function wholeLeafDetections(normalized, rules, context, entry, originalValue, ruleFilter = null) {
  // True when `rule` has at least one match in the scan text that survives its
  // validator. matchAll is lazy, so scanning stops at the first accepted match.
  const firesOnScanText = (rule) => {
    const flags = rule.flags.includes("g") ? rule.flags : `${rule.flags}g`;
    const matcher = new RegExp(rule.pattern, flags);
    for (const hit of normalized.matchAll(matcher)) {
      if (!rule.validate || rule.validate(hit[0])) {
        return true;
      }
    }
    return false;
  };

  // Keyed by rule type; Map insertion order keeps the output in rule order.
  const firstHitByType = new Map();
  for (const rule of rules) {
    const wrongDirection = rule.direction && rule.direction !== context?.direction;
    if (wrongDirection) {
      continue;
    }
    const filteredOut = ruleFilter && !ruleFilter(rule);
    if (filteredOut) {
      continue;
    }
    if (firstHitByType.has(rule.type) || !firesOnScanText(rule)) {
      continue;
    }
    firstHitByType.set(rule.type, {
      type: rule.type,
      ruleId: rule.id,
      path: entry.path,
      pathText: entry.pathText,
      kind: entry.kind ?? "value",
      start: 0,
      end: originalValue.length,
      confidence: rule.confidence,
      value: originalValue
    });
  }
  return [...firstHitByType.values()];
}
680
+
681
// Opt-in decode-and-rescan step for a single leaf. The leaf value is handed to
// decodeLeaf; when that yields decoded text, the text is scanned through
// wholeLeafDetections restricted to rules accepted by isDecodeEligibleRule.
//
// Offsets found in decoded text have no meaning in the encoded leaf, so nothing is
// mapped back: every hit is reported as a whole-leaf detection whose span and
// `value` are the ORIGINAL encoded leaf (entry.value).
//
// Returns an empty array when the leaf is a number-kind entry or when decodeLeaf
// returns null.
function decodeAndRescanEntry(entry, rules, context) {
  const { kind, value: encodedLeaf } = entry;
  // Number leaves are never treated as encoded blobs.
  const plainText = kind === "number" ? null : decodeLeaf(encodedLeaf);
  return plainText === null
    ? []
    : wholeLeafDetections(plainText, rules, context, entry, encodedLeaf, isDecodeEligibleRule);
}
712
+
713
// Precision filter for decoded-text hits. A rule may fire on decoded text when
// its type is listed in HARD_BLOCK_TYPES, or when it carries a `validate`
// function and its type is not "phone". The phone exclusion is deliberate: its
// validator is described as a trunk-prefix heuristic rather than a checksum, so a
// phone-shaped run in decoded bytes must not fire.
function isDecodeEligibleRule(rule) {
  const { type, validate } = rule;
  const isHardBlock = HARD_BLOCK_TYPES.has(type);
  return isHardBlock || (typeof validate === "function" && type !== "phone");
}
724
+
725
// Leaf decoding for the decode-and-rescan step. decodeLeaf returns the decoded
// text of an encoded-looking string leaf, or null when the leaf should be skipped.
//   - Only strings whose length lies within [DECODE_MIN_LEN, DECODE_MAX_LEN] are
//     considered at all.
//   - base64 / base64url is attempted first (decodeBase64Leaf); a leaf containing
//     `%` cannot match either base64 alphabet, so the two decoders do not compete.
//   - percent-decoding (decodePercentLeaf) is the fallback.
// BASE64_STD / BASE64_URL are the anchored alphabet checks for the two base64
// variants: standard allows up to two trailing `=`, URL-safe allows no padding.
const DECODE_MIN_LEN = 16;
const DECODE_MAX_LEN = 8192;
const BASE64_STD = /^[A-Za-z0-9+/]+={0,2}$/;
const BASE64_URL = /^[A-Za-z0-9_-]+$/;

function decodeLeaf(value) {
  const withinBounds =
    typeof value === "string" && value.length >= DECODE_MIN_LEN && value.length <= DECODE_MAX_LEN;
  if (!withinBounds) {
    return null;
  }
  const fromBase64 = decodeBase64Leaf(value);
  return fromBase64 !== null ? fromBase64 : decodePercentLeaf(value);
}
174
751
 
175
- return removeOverlaps(detections);
752
// Decode a leaf as base64 or base64url, returning the UTF-8 text or null.
//
// Variant selection: standard base64 needs the BASE64_STD alphabet and a length
// that is a multiple of 4; otherwise base64url needs the BASE64_URL alphabet and a
// length whose remainder mod 4 is not 1 (no byte run encodes to that length).
//
// Three guards then apply, any of which yields null:
//   - the decode produced no bytes;
//   - re-encoding the bytes does not reproduce the leaf exactly (Buffer.from is
//     lenient about stray characters and padding, so this rejects non-canonical
//     input);
//   - the bytes are not valid UTF-8 (the payloads of interest are text).
function decodeBase64Leaf(value) {
  const looksStandard = BASE64_STD.test(value) && value.length % 4 === 0;
  const looksUrlSafe = !looksStandard && BASE64_URL.test(value) && value.length % 4 !== 1;
  if (!looksStandard && !looksUrlSafe) {
    return null;
  }
  const encoding = looksStandard ? "base64" : "base64url";

  let raw;
  try {
    raw = Buffer.from(value, encoding);
  } catch {
    return null;
  }

  const canonical = raw.length !== 0 && raw.toString(encoding) === value;
  if (!canonical || !isUtf8(raw)) {
    return null;
  }
  return raw.toString("utf8");
}
787
+
788
// Percent-decode a leaf, returning the decoded text or null.
// Null is returned when the leaf holds no `%XX` escape (decoding would be a
// no-op), when decodeURIComponent rejects a malformed escape (it throws; that is
// deliberately turned into a skip), or when decoding leaves the text unchanged.
function decodePercentLeaf(value) {
  const hasEscape = /%[0-9A-Fa-f]{2}/.test(value);
  if (!hasEscape) {
    return null;
  }
  try {
    const plain = decodeURIComponent(value);
    return plain === value ? null : plain;
  } catch {
    // Malformed escape somewhere in the leaf: skip rather than propagate.
    return null;
  }
}
806
+
807
// Decides whether offsets found on the NFKC-normalized text can be applied 1:1 to
// the original value. Two conditions must both hold:
//   1. every code point folds to the same number of UTF-16 units it started with
//      (so no interior offset shifts), and
//   2. the per-code-point folds, concatenated, equal the whole-string
//      normalization (so nothing composed across code point boundaries).
// Comparing only total lengths is not enough: a contraction before a match can be
// offset by an expansion after it, keeping the length equal while moving every
// interior offset.
function isPositionStableNfkc(value, normalized) {
  const codePoints = Array.from(value);
  const folds = codePoints.map((codePoint) => codePoint.normalize("NFKC"));
  const widthsPreserved = folds.every((fold, index) => fold.length === codePoints[index].length);
  return widthsPreserved && folds.join("") === normalized;
}
177
826
 
178
827
  // Spans of Haechi's own transform markers in a string, so detection can skip
@@ -291,3 +940,317 @@ function krRrnValid(value) {
291
940
  const check = (11 - (sum % 11)) % 10;
292
941
  return check === Number(digits[12]);
293
942
  }
943
+
944
// Structural validity of a US Social Security number written `AAA-GG-SSSS`.
// The bare shape collides with arbitrary 9-digit identifiers, so the ranges that
// are never issued are rejected: area 000, 666 and 900-999; group 00; serial 0000.
// Input that is not exactly the dashed shape is rejected as well.
function usSsnValid(value) {
  const parts = /^(\d{3})-(\d{2})-(\d{4})$/.exec(value);
  if (parts === null) {
    return false;
  }
  const [area, group, serial] = parts.slice(1).map(Number);
  const areaIssuable = area !== 0 && area !== 666 && area < 900;
  return areaIssuable && group !== 0 && serial !== 0;
}
967
+
968
// IBAN validity: ISO 13616 shape plus the ISO 7064 MOD 97-10 checksum.
//
// Whitespace is stripped and letters upper-cased first. The compact form must be
// two letters, two check digits, then 11-30 alphanumerics. The first four
// characters are moved to the end, letters are mapped to 10-35, and the resulting
// integer must be congruent to 1 mod 97. The remainder is folded in one character
// at a time so the big integer is never materialised.
//
// Check digits are generated as 98 - (n mod 97), so only 02..98 can ever be
// issued. 00, 01 and 99 are congruent to 97, 98 and 02 respectively; without an
// explicit range check they would pass the mod-97 test as aliases of a genuine
// IBAN, so they are rejected up front.
//
// Returns true only when both the shape and the checksum hold.
function ibanValid(value) {
  const iban = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{2}[A-Z0-9]{11,30}$/.test(iban)) {
    return false;
  }
  const checkDigits = Number(iban.slice(2, 4));
  if (checkDigits < 2 || checkDigits > 98) {
    return false;
  }
  const rearranged = `${iban.slice(4)}${iban.slice(0, 4)}`;
  let remainder = 0;
  for (const char of rearranged) {
    // Base-36 parsing maps 0-9 to themselves and A-Z to 10-35.
    const code = Number.parseInt(char, 36);
    // A two-digit code shifts the running value by two decimal places.
    remainder = (remainder * (code < 10 ? 10 : 100) + code) % 97;
  }
  return remainder === 1;
}
987
+
988
// Japan My Number (Individual Number) check digit.
// Every non-digit is stripped and exactly 12 digits must remain. Over the first
// 11 digits, counted FROM THE RIGHT as n = 1..11, each digit is weighted n+1 for
// n <= 6 and n-5 for n >= 7. With r = (weighted sum) mod 11, the check digit is 0
// when r is 0 or 1 and 11 - r otherwise; it must equal the 12th digit.
function jpMyNumberValid(value) {
  const digits = value.replace(/\D/g, "");
  if (digits.length !== 12) {
    return false;
  }
  const prefixFromRight = digits.slice(0, 11).split("").reverse();
  const weightedSum = prefixFromRight.reduce((total, digit, offset) => {
    const n = offset + 1;
    const weight = n <= 6 ? n + 1 : n - 5;
    return total + Number(digit) * weight;
  }, 0);
  const remainder = weightedSum % 11;
  const expected = remainder <= 1 ? 0 : 11 - remainder;
  return Number(digits[11]) === expected;
}
1010
+
1011
// France NIR / INSEE number control key.
// Whitespace, dots and hyphens are stripped and letters upper-cased. The first 13
// characters are the body and the last 2 are the control key, which must equal
// 97 - (body mod 97). Only the department field may be alphabetic: the Corsican
// codes 2A and 2B are replaced by 19 and 18 before the modulus is taken.
function frNirValid(value) {
  const compact = value.replace(/[\s.-]/g, "").toUpperCase();
  const shapeOk = /^[12]\d{2}(?:\d{2}|0[1-9]|1[0-2]|20)(?:\d{2}|2[AB])\d{6}\d{2}$/.test(compact);
  if (!shapeOk) {
    return false;
  }
  // Corsica substitution, then the body must be purely numeric.
  const numericBody = compact.slice(0, 13).replace("2A", "19").replace("2B", "18");
  if (!/^\d{13}$/.test(numericBody)) {
    return false;
  }
  // 13 decimal digits sit well inside the exact-integer range of a Number.
  const expectedKey = 97 - (Number(numericBody) % 97);
  return Number(compact.slice(13)) === expectedKey;
}
1036
+
1037
// Spain DNI / NIE check letter (mod-23 table).
// A DNI is 8 digits plus a letter. A NIE is a leading X/Y/Z (standing for 0/1/2),
// 7 digits, and a letter. After whitespace and hyphens are stripped and letters
// upper-cased, the final letter must equal ES_DNI_TABLE[number mod 23].
const ES_DNI_TABLE = "TRWAGMYFPDXBNJZSQVHLCKE";
const ES_NIE_PREFIX = { X: "0", Y: "1", Z: "2" };
function esDniValid(value) {
  const compact = value.replace(/[\s-]/g, "").toUpperCase();
  const isDni = /^\d{8}[A-Z]$/.test(compact);
  const isNie = !isDni && /^[XYZ]\d{7}[A-Z]$/.test(compact);
  if (!isDni && !isNie) {
    return false;
  }
  // For a NIE the leading letter is swapped for its digit to form the number.
  const numeric = isDni
    ? compact.slice(0, 8)
    : `${ES_NIE_PREFIX[compact[0]]}${compact.slice(1, 8)}`;
  return ES_DNI_TABLE.charAt(Number(numeric) % 23) === compact.charAt(8);
}
1059
+
1060
// Italy Codice Fiscale check character.
// Over the first 15 characters, those in odd positions (1st, 3rd, ... counting
// from 1) are mapped through IT_CF_ODD and those in even positions through
// IT_CF_EVEN. The mapped values are summed, and the letter at index (sum mod 26),
// with A = 0, must equal the 16th character. Whitespace is stripped and letters
// upper-cased before the shape check.
const IT_CF_ODD = {
  "0": 1, "1": 0, "2": 5, "3": 7, "4": 9, "5": 13, "6": 15, "7": 17, "8": 19, "9": 21,
  A: 1, B: 0, C: 5, D: 7, E: 9, F: 13, G: 15, H: 17, I: 19, J: 21, K: 2, L: 4, M: 18, N: 20,
  O: 11, P: 3, Q: 6, R: 8, S: 12, T: 14, U: 16, V: 10, W: 22, X: 25, Y: 24, Z: 23
};
const IT_CF_EVEN = {
  "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
  A: 0, B: 1, C: 2, D: 3, E: 4, F: 5, G: 6, H: 7, I: 8, J: 9, K: 10, L: 11, M: 12, N: 13,
  O: 14, P: 15, Q: 16, R: 17, S: 18, T: 19, U: 20, V: 21, W: 22, X: 23, Y: 24, Z: 25
};
const IT_CF_REMAINDER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
function itCodiceFiscaleValid(value) {
  const cf = value.replace(/\s/g, "").toUpperCase();
  const shapeOk = /^[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]$/.test(cf);
  if (!shapeOk) {
    return false;
  }
  // Index 0 is position 1 (odd), so even indices use the ODD table.
  const total = cf
    .slice(0, 15)
    .split("")
    .reduce((acc, char, index) => acc + (index % 2 === 0 ? IT_CF_ODD : IT_CF_EVEN)[char], 0);
  return IT_CF_REMAINDER.charAt(total % 26) === cf.charAt(15);
}
1090
+
1091
// Singapore NRIC / FIN check letter.
// The 7 digits are weighted 2,7,6,5,4,3,2 and summed; a per-prefix offset is added
// (T and G: +4, M: +3, S and F: none); the result mod 11 indexes the letter table
// for the series. S/T, F/G and M each have their own table. Whitespace is stripped
// and letters upper-cased before the shape check.
const SG_NRIC_WEIGHTS = [2, 7, 6, 5, 4, 3, 2];
const SG_NRIC_TABLE_ST = ["J", "Z", "I", "H", "G", "F", "E", "D", "C", "B", "A"];
const SG_NRIC_TABLE_FG = ["X", "W", "U", "T", "R", "Q", "P", "N", "M", "L", "K"];
const SG_NRIC_TABLE_M = ["K", "L", "J", "N", "P", "Q", "R", "T", "U", "W", "X"];
function sgNricValid(value) {
  const v = value.replace(/\s/g, "").toUpperCase();
  if (!/^[STFGM]\d{7}[A-Z]$/.test(v)) {
    return false;
  }
  const series = v.charAt(0);

  const weighted = SG_NRIC_WEIGHTS.reduce(
    (acc, weight, index) => acc + weight * Number(v[index + 1]),
    0
  );

  let offset = 0;
  let letters;
  switch (series) {
    case "S":
      letters = SG_NRIC_TABLE_ST;
      break;
    case "T":
      offset = 4;
      letters = SG_NRIC_TABLE_ST;
      break;
    case "F":
      letters = SG_NRIC_TABLE_FG;
      break;
    case "G":
      offset = 4;
      letters = SG_NRIC_TABLE_FG;
      break;
    default:
      // Only "M" can reach here; the shape check admits no other prefix.
      offset = 3;
      letters = SG_NRIC_TABLE_M;
      break;
  }
  return letters[(weighted + offset) % 11] === v.charAt(8);
}
1126
+
1127
// India Aadhaar validation via the Verhoeff checksum.
// VERHOEFF_D is the multiplication table and VERHOEFF_P the position-dependent
// permutation table (8 rows, cycled by position). verhoeffValid walks the digits
// from the right, permuting each by its position and folding it into a running
// value through VERHOEFF_D; a full number including its check digit is valid when
// the running value ends at 0. inAadhaarValid strips whitespace and hyphens,
// requires exactly 12 digits whose first is 2-9, then applies the checksum.
const VERHOEFF_D = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
  [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
  [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
  [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
  [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
  [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
  [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
  [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
];
const VERHOEFF_P = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
  [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
  [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
  [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
  [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
  [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
  [7, 0, 4, 6, 9, 1, 3, 2, 5, 8]
];
function verhoeffValid(digits) {
  const units = digits.split("");
  const lastIndex = units.length - 1;
  // reduceRight visits the rightmost digit first; its distance from the right end
  // selects the permutation row.
  const finalState = units.reduceRight((state, unit, index) => {
    const position = lastIndex - index;
    return VERHOEFF_D[state][VERHOEFF_P[position % 8][Number(unit)]];
  }, 0);
  return finalState === 0;
}
function inAadhaarValid(value) {
  const digits = value.replace(/[\s-]/g, "");
  return /^[2-9]\d{11}$/.test(digits) ? verhoeffValid(digits) : false;
}
1171
+
1172
// Germany tax ID (Steuer-Identifikationsnummer), 11 digits. Two guards:
//   - deSteuerIdStructural: among the first 10 digits exactly one digit value
//     occurs more than once, and it occurs 2 or 3 times;
//   - deSteuerIdCheckDigit: the ISO 7064 MOD 11,10 check digit computed over the
//     first 10 digits must equal the 11th.
// deSteuerIdValid strips whitespace and `/` before requiring exactly 11 digits.
function deSteuerIdStructural(first10) {
  const tally = new Map();
  for (const char of first10) {
    tally.set(char, 1 + (tally.get(char) ?? 0));
  }
  const repeated = Array.from(tally.values()).filter((count) => count > 1);
  // One repeating digit only, seen at most three times.
  return repeated.length === 1 && repeated[0] < 4;
}
function deSteuerIdCheckDigit(first10) {
  const finalProduct = Array.from({ length: 10 }, (_, index) => Number(first10[index])).reduce(
    (product, digit) => {
      const sum = (digit + product) % 10;
      // A zero sum counts as 10 in the MOD 11,10 scheme.
      const folded = sum === 0 ? 10 : sum;
      return (folded * 2) % 11;
    },
    10
  );
  const check = 11 - finalProduct;
  return check === 10 ? 0 : check;
}
function deSteuerIdValid(value) {
  const digits = value.replace(/[\s/]/g, "");
  if (!/^\d{11}$/.test(digits)) {
    return false;
  }
  const body = digits.slice(0, 10);
  return deSteuerIdStructural(body) && deSteuerIdCheckDigit(body) === Number(digits[10]);
}
1213
+
1214
// Netherlands BSN "11-proef".
// Whitespace and dots are stripped and exactly 9 digits must remain. The digits
// are weighted 9,8,...,2 for the first eight and -1 for the last; the weighted sum
// must be divisible by 11. The all-zero number is rejected explicitly, since its
// sum of 0 would otherwise pass.
function nlBsnValid(value) {
  const digits = value.replace(/[\s.]/g, "");
  const nineDigits = /^\d{9}$/.test(digits);
  if (!nineDigits || /^0{9}$/.test(digits)) {
    return false;
  }
  const weights = [9, 8, 7, 6, 5, 4, 3, 2, -1];
  const weightedSum = weights.reduce((total, weight, index) => total + weight * Number(digits[index]), 0);
  return weightedSum % 11 === 0;
}
1233
+
1234
// UK National Insurance Number: format-only validation (the scheme has no
// checksum). After whitespace is stripped and letters upper-cased the value must
// be two letters, six digits and a suffix letter A-D. On top of that shape:
//   - the prefix pairs listed in UK_NINO_INVALID_PREFIXES are rejected;
//   - the first letter may not be D, F, I, Q, U or V;
//   - the second letter may not be D, F, I, O, Q, U or V.
const UK_NINO_INVALID_PREFIXES = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);
function ukNinoValid(value) {
  const compact = value.replace(/\s/g, "").toUpperCase();
  const wellFormed = /^[A-Z]{2}\d{6}[A-D]$/.test(compact);
  if (!wellFormed) {
    return false;
  }
  const first = compact.charAt(0);
  const second = compact.charAt(1);
  const pairBanned = UK_NINO_INVALID_PREFIXES.has(`${first}${second}`);
  const letterBanned = /[DFIQUV]/.test(first) || /[DFIOQUV]/.test(second);
  return !(pairBanned || letterBanned);
}