haechi 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { createHash, randomUUID } from "node:crypto";
2
+ import { HARD_BLOCK_TYPES } from "../filter/index.mjs";
2
3
 
3
4
  const NO_ENFORCE_MODES = new Set(["dry-run", "report-only"]);
4
5
 
@@ -10,7 +11,7 @@ const NO_ENFORCE_MODES = new Set(["dry-run", "report-only"]);
10
11
  // limits.maxNestingDepth through createHaechi → protectJson instead.
11
12
  export const DEFAULT_MAX_NESTING_DEPTH = 256;
12
13
 
13
- export function createHaechi({ filterEngine, policyEngine, cryptoProvider, auditSink, tokenVault = null, mode = "dry-run", limits = {} }) {
14
+ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, auditSink, tokenVault = null, mode = "dry-run", limits = {}, precision = {} }) {
14
15
  if (!filterEngine || !policyEngine || !cryptoProvider || !auditSink) {
15
16
  throw new Error("Haechi requires filterEngine, policyEngine, cryptoProvider, and auditSink");
16
17
  }
@@ -20,6 +21,15 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
20
21
  ? limits.maxNestingDepth
21
22
  : DEFAULT_MAX_NESTING_DEPTH;
22
23
 
24
+ // WS2c precision controls, resolved once. `minConfidence` is the precision dial
25
+ // (drop a detection below the threshold) and `allowlist` is the operator FP
26
+ // exception set. Both are bounded so they cannot weaken protection: they may only TRIM
27
+ // precision-risky soft-type detections and can NEVER suppress a hard-block type
28
+ // (secret/api_key/kr_rrn/card) — that load-bearing exemption is enforced in
29
+ // applyPrecisionControls, not trusted to config. Default {} = current behavior.
30
+ const minConfidence = Number.isFinite(precision.minConfidence) ? precision.minConfidence : 0;
31
+ const allowlist = compileAllowlist(precision.allowlist);
32
+
23
33
  async function protectJson(payload, rawContext = {}) {
24
34
  // A per-request policy engine (a named profile selected from identity)
25
35
  // overrides the default. It is a control object, NOT data: strip it before
@@ -35,7 +45,13 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
35
45
  // `context.direction` ("request" | "response") gates direction-scoped rules
36
46
  // (injection) and the response-only marker exclusion in the filter engine.
37
47
  // The proxy sets it per direction; do not drop it here.
38
- const detections = await filterEngine.detect({ entries, context });
48
+ const rawDetections = await filterEngine.detect({ entries, context });
49
+ // WS2c precision controls run AFTER detect and BEFORE decide: drop a low-
50
+ // confidence soft-type detection (minConfidence) and suppress an allowlisted
51
+ // soft-type detection — never a hard-block type. `precisionAudit` carries the
52
+ // per-type counts of what was suppressed/dropped so the audit event records
53
+ // it (counts/types only, never the raw value). See applyPrecisionControls.
54
+ const { detections, precisionAudit } = applyPrecisionControls(rawDetections, { minConfidence, allowlist });
39
55
  const decisions = [];
40
56
 
41
57
  for (const detection of detections) {
@@ -62,7 +78,8 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
62
78
  blocked,
63
79
  payload,
64
80
  detections,
65
- decisions
81
+ decisions,
82
+ precisionAudit
66
83
  });
67
84
 
68
85
  await auditSink.record(auditEvent);
@@ -70,7 +87,7 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
70
87
  return {
71
88
  payload: protectedPayload,
72
89
  blocked,
73
- summary: summarize(detections, decisions),
90
+ summary: summarize(detections, decisions, precisionAudit),
74
91
  auditEvent,
75
92
  issuedTokens: [...issuedTokens]
76
93
  };
@@ -274,7 +291,7 @@ export function shapeOnly(value) {
274
291
  return { type: value === null ? "null" : typeof value };
275
292
  }
276
293
 
277
- export function summarize(detections, decisions) {
294
+ export function summarize(detections, decisions, precisionAudit = null) {
278
295
  const byType = {};
279
296
  const byAction = {};
280
297
 
@@ -286,11 +303,121 @@ export function summarize(detections, decisions) {
286
303
  byAction[decision.action] = (byAction[decision.action] ?? 0) + 1;
287
304
  }
288
305
 
289
- return {
306
+ const summary = {
290
307
  detectionCount: detections.length,
291
308
  byType,
292
309
  byAction
293
310
  };
311
+
312
+ // WS2c: additively record how many detections the precision controls removed
313
+ // before decide — `suppressedCount`/`suppressedByType` for allowlist FP
314
+ // exceptions and `droppedCount`/`droppedByType` for sub-minConfidence drops.
315
+ // Counts and types only; the matched value is NEVER recorded (no-plaintext-in-
316
+ // audit). Omitted entirely when nothing was removed, so 1.1 events are byte-
317
+ // identical and the audit hash-chain canonicalization is unaffected.
318
+ if (precisionAudit && precisionAudit.suppressedCount > 0) {
319
+ summary.suppressedCount = precisionAudit.suppressedCount;
320
+ summary.suppressedByType = precisionAudit.suppressedByType;
321
+ }
322
+ if (precisionAudit && precisionAudit.droppedCount > 0) {
323
+ summary.droppedCount = precisionAudit.droppedCount;
324
+ summary.droppedByType = precisionAudit.droppedByType;
325
+ }
326
+
327
+ return summary;
328
+ }
329
+
330
+ // Compile the configured allowlist into fast lookup sets. An entry is either a
331
+ // bare string (an exact matched-VALUE exception) or an object { value?, path? }
332
+ // (value exception, JSON-path exception via the PII-safe pathText, or both —
333
+ // when both are present BOTH must match). Returns null when there is nothing to
334
+ // allowlist so the hot path can skip the work entirely.
335
+ function compileAllowlist(allowlist) {
336
+ if (!Array.isArray(allowlist) || allowlist.length === 0) {
337
+ return null;
338
+ }
339
+ const values = new Set();
340
+ const paths = new Set();
341
+ const pairs = [];
342
+ for (const entry of allowlist) {
343
+ if (typeof entry === "string") {
344
+ values.add(entry);
345
+ continue;
346
+ }
347
+ const hasValue = typeof entry.value === "string";
348
+ const hasPath = typeof entry.path === "string";
349
+ if (hasValue && hasPath) {
350
+ pairs.push({ value: entry.value, path: entry.path });
351
+ } else if (hasValue) {
352
+ values.add(entry.value);
353
+ } else if (hasPath) {
354
+ paths.add(entry.path);
355
+ }
356
+ }
357
+ return { values, paths, pairs };
358
+ }
359
+
360
+ // Does this detection's matched value / JSON path match an allowlist entry? The
361
+ // path comparison uses the PII-safe `pathText` (the same hashed path the audit
362
+ // records), so an operator allowlists `key_<hash>.…` — never a raw key name.
363
+ function isAllowlisted(detection, allowlist) {
364
+ if (!allowlist) {
365
+ return false;
366
+ }
367
+ const { values, paths, pairs } = allowlist;
368
+ if (typeof detection.value === "string" && values.has(detection.value)) {
369
+ return true;
370
+ }
371
+ if (typeof detection.pathText === "string" && paths.has(detection.pathText)) {
372
+ return true;
373
+ }
374
+ for (const pair of pairs) {
375
+ if (detection.value === pair.value && detection.pathText === pair.path) {
376
+ return true;
377
+ }
378
+ }
379
+ return false;
380
+ }
381
+
382
+ // WS2c precision controls — run AFTER detect, BEFORE decide. Returns the kept
383
+ // detections plus a precisionAudit of what was removed (counts/types only).
384
+ //
385
+ // HARD-BLOCK INVARIANT (load-bearing, fail-closed): a detection whose type is in
386
+ // HARD_BLOCK_TYPES (secret/api_key/kr_rrn/card) is NEVER removed here — neither a
387
+ // low confidence nor an allowlist entry can suppress it. minConfidence trims only
388
+ // the precision-risky SOFT types; an allowlist entry that would suppress a hard-
389
+ // block type is ignored and the detection still fires. This guard lives in core
390
+ // (not trusted to config) so the invariant holds for every caller.
391
+ export function applyPrecisionControls(detections, { minConfidence = 0, allowlist = null } = {}) {
392
+ const kept = [];
393
+ const suppressedByType = {};
394
+ const droppedByType = {};
395
+ let suppressedCount = 0;
396
+ let droppedCount = 0;
397
+
398
+ for (const detection of detections) {
399
+ const hardBlock = HARD_BLOCK_TYPES.has(detection.type);
400
+ // Allowlist suppression first (an operator-declared FP exception), but never
401
+ // for a hard-block type.
402
+ if (!hardBlock && isAllowlisted(detection, allowlist)) {
403
+ suppressedByType[detection.type] = (suppressedByType[detection.type] ?? 0) + 1;
404
+ suppressedCount += 1;
405
+ continue;
406
+ }
407
+ // minConfidence drop — only for soft types. A low-confidence hard-block
408
+ // detection (e.g. a card at confidence 0.75) is kept and acted on.
409
+ if (!hardBlock && Number.isFinite(detection.confidence) && detection.confidence < minConfidence) {
410
+ droppedByType[detection.type] = (droppedByType[detection.type] ?? 0) + 1;
411
+ droppedCount += 1;
412
+ continue;
413
+ }
414
+ kept.push(detection);
415
+ }
416
+
417
+ return {
418
+ detections: kept,
419
+ precisionAudit: { suppressedCount, suppressedByType, droppedCount, droppedByType }
420
+ };
294
421
  }
295
422
 
296
423
  async function transformPayload(payload, detections, decisions, { context, cryptoProvider, tokenVault, enforced, issuedTokens = null }) {
@@ -424,7 +551,7 @@ async function replacementFor(segment, detection, decision, { context, cryptoPro
424
551
  }
425
552
  }
426
553
 
427
- function buildAuditEvent({ context, mode, enforced, blocked, payload, detections, decisions }) {
554
+ function buildAuditEvent({ context, mode, enforced, blocked, payload, detections, decisions, precisionAudit = null }) {
428
555
  return {
429
556
  // Reader-facing audit-event schema version (frozen as part of the 1.0 API
430
557
  // contract — see docs/current/api-stability.md). Additive-only: a new field
@@ -433,6 +560,14 @@ function buildAuditEvent({ context, mode, enforced, blocked, payload, detections
433
560
  // and so is self-consistent for hash-chain verification of new events.
434
561
  schemaVersion: "1",
435
562
  id: randomUUID(),
563
+ // Per-REQUEST correlation id (WS4-A). Additive top-level field: the proxy
564
+ // generates one randomUUID() per request and threads it into the protect
565
+ // context, so the request- and response-direction events of ONE request
566
+ // share it (and it appears in the structured error log for the same request).
567
+ // It is null when no context.correlationId is set, preserving the existing
568
+ // non-proxy protectJson() behavior and keeping the api-contract subset green.
569
+ // It is a UUID — never a payload/identity/PII value.
570
+ correlationId: context.correlationId ?? null,
436
571
  timestamp: new Date().toISOString(),
437
572
  protocol: context.protocol ?? "custom",
438
573
  operation: context.operation ?? "protect",
@@ -463,7 +598,7 @@ function buildAuditEvent({ context, mode, enforced, blocked, payload, detections
463
598
  action: decisions[index]?.action ?? "unknown",
464
599
  enforced
465
600
  })),
466
- summary: summarize(detections, decisions)
601
+ summary: summarize(detections, decisions, precisionAudit)
467
602
  };
468
603
  }
469
604
 
@@ -1,3 +1,11 @@
1
+ // The hard-block detection types: a leak of one of these is a load-bearing
2
+ // fail-closed concern, so the WS2c precision dials (filters.minConfidence,
3
+ // filters.allowlist) may NOT suppress a detection of any of them. minConfidence
4
+ // trims only the precision-risky SOFT types; the allowlist's per-value/per-path
5
+ // exceptions are ignored for these types (the detection still fires). Exported
6
+ // so the core detect→decide path enforces the same exemption set the docs pin.
7
+ export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card"]);
8
+
1
9
  const DEFAULT_RULES = [
2
10
  {
3
11
  id: "email",
@@ -9,9 +17,15 @@ const DEFAULT_RULES = [
9
17
  {
10
18
  // KR mobile numbers (01[016789] prefixes); landlines are out of scope.
11
19
  // krPhoneValid keeps a bare separator-less run from matching a timestamp/id.
20
+ // The leading `(?<![\w+-])` / trailing `(?![\w-])` boundaries (WS2c) stop the
21
+ // rule from matching a phone-shaped digit run that is a SUBSTRING of a longer
22
+ // hex/alnum/dashed run — e.g. the `…a716-446655440000` tail of a UUID, where
23
+ // the inner `16-44665544` otherwise mis-fired as a phone. The boundaries
24
+ // never affect a real number: a KR mobile sits on a word/space/punctuation
25
+ // edge and `+82` starts on the `+` (allowed before the boundary).
12
26
  id: "kr-phone",
13
27
  type: "phone",
14
- pattern: "(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}",
28
+ pattern: "(?<![\\w+-])(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}(?![\\w-])",
15
29
  flags: "g",
16
30
  confidence: 0.9,
17
31
  validate: krPhoneValid
@@ -40,6 +54,76 @@ const DEFAULT_RULES = [
40
54
  confidence: 0.95
41
55
  },
42
56
  {
57
+ // AWS access key id: a long-lived (AKIA) or temporary (ASIA) key id is a
58
+ // hard-anchored prefix + EXACTLY 16 uppercase-alphanumeric chars. The fixed
59
+ // prefix + fixed length is what makes this high-precision (no bare base64).
60
+ id: "aws-access-key-id",
61
+ type: "api_key",
62
+ pattern: "\\b(?:AKIA|ASIA)[0-9A-Z]{16}\\b",
63
+ flags: "g",
64
+ confidence: 0.95
65
+ },
66
+ {
67
+ // GitHub token: pat (ghp_), oauth (gho_), user-to-server (ghu_), server-to-
68
+ // server (ghs_), refresh (ghr_). Anchored prefix + a long base64-ish body.
69
+ // GitHub's own format is 36 chars after the prefix; we allow >=36 (the
70
+ // corpus fixture is 38) and cap to keep the match bounded.
71
+ id: "github-token",
72
+ type: "secret",
73
+ pattern: "\\bgh[pousr]_[A-Za-z0-9]{36,255}\\b",
74
+ flags: "g",
75
+ confidence: 0.95
76
+ },
77
+ {
78
+ // Google API key: anchored AIza + exactly 35 chars from the URL-safe
79
+ // alphabet. Fixed prefix + fixed length = high precision.
80
+ id: "google-api-key",
81
+ type: "api_key",
82
+ pattern: "\\bAIza[0-9A-Za-z_-]{35}\\b",
83
+ flags: "g",
84
+ confidence: 0.9
85
+ },
86
+ {
87
+ // Slack token: bot (xoxb-), user (xoxa/xoxp-), refresh (xoxr-), legacy
88
+ // (xoxs-). Anchored xox[baprs]- + a >=10-char body. The corpus value is a
89
+ // deliberately low-entropy placeholder, so the rule anchors on the prefix +
90
+ // body shape, not entropy.
91
+ id: "slack-token",
92
+ type: "secret",
93
+ pattern: "\\bxox[baprs]-[0-9A-Za-z-]{10,}\\b",
94
+ flags: "g",
95
+ confidence: 0.9
96
+ },
97
+ {
98
+ // JWT: three dot-separated base64url segments where the FIRST starts with
99
+ // `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
100
+ // on `eyJ` + two more base64url groups keeps this from matching arbitrary
101
+ // dotted tokens (a bare base64 triplet without the JSON header is not a JWT).
102
+ id: "jwt",
103
+ type: "secret",
104
+ pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
105
+ flags: "g",
106
+ confidence: 0.9
107
+ },
108
+ {
109
+ // PEM private key: the armored header. We match the header line itself
110
+ // (`-----BEGIN [...] PRIVATE KEY-----`) — its presence is the credential
111
+ // signal; the body is high-entropy base64 we do not need to span. Covers
112
+ // RSA/EC/OPENSSH/DSA/ENCRYPTED variants and the bare `PRIVATE KEY` form.
113
+ id: "pem-private-key",
114
+ type: "secret",
115
+ pattern: "-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
116
+ flags: "g",
117
+ confidence: 0.98
118
+ },
119
+ {
120
+ // Bearer credential. Deliberately NOT context-anchored to `Authorization:`:
121
+ // detection runs PER STRING LEAF, and a real payload carries the credential
122
+ // as its own leaf (`{"Authorization": "Bearer <token>"}` walks to the bare
123
+ // value `"Bearer <token>"`), so a lookbehind requiring the header key in the
124
+ // same string would MISS the realistic case — a recall regression on a
125
+ // hard-block (`secret`) type. `secret` is fail-closed: a `Bearer …` prose
126
+ // false positive is the accepted cost of never missing a leaked token.
43
127
  id: "bearer-token",
44
128
  type: "secret",
45
129
  pattern: "\\bBearer\\s+[A-Za-z0-9._~+/-]{16,}\\b",
@@ -50,11 +134,58 @@ const DEFAULT_RULES = [
50
134
  id: "assignment-secret",
51
135
  type: "secret",
52
136
  // Lookbehind keeps the key name out of the match so transforms replace
53
- // only the secret value, not the assignment prefix.
54
- pattern: "(?<=\\b(?:api[_-]?key|secret|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
137
+ // only the secret value, not the assignment prefix. The key vocabulary
138
+ // covers the common credential-assignment names (cloud secrets, OAuth
139
+ // client secrets, PEM/private keys, access/refresh tokens) so a
140
+ // `<key> = <value>` leak is caught even when the value itself has no
141
+ // self-describing prefix (e.g. an AWS secret access key is bare base64).
142
+ pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
55
143
  flags: "gi",
56
144
  confidence: 0.85
57
145
  },
146
+ {
147
+ // US SSN: AAA-GG-SSSS. The format alone collides with 9-digit ids, so a
148
+ // validator rejects the SSA-invalid ranges (area 000/666/900-999, group 00,
149
+ // serial 0000). The separators are required by the pattern — a bare 9-digit
150
+ // run is intentionally NOT matched (it is indistinguishable from an id).
151
+ id: "us-ssn",
152
+ type: "us_ssn",
153
+ pattern: "(?<![\\w-])\\d{3}-\\d{2}-\\d{4}(?![\\w-])",
154
+ flags: "g",
155
+ confidence: 0.85,
156
+ validate: usSsnValid
157
+ },
158
+ {
159
+ // IBAN: country(2 alpha) + 2 check digits + BBAN. The mod-97 checksum is
160
+ // what makes this high-precision — a random alnum run of the right shape
161
+ // almost never satisfies mod-97 == 1. Length 15-34 per ISO 13616.
162
+ id: "iban",
163
+ type: "iban",
164
+ pattern: "(?<![A-Z0-9])[A-Z]{2}\\d{2}[A-Z0-9]{11,30}(?![A-Z0-9])",
165
+ flags: "g",
166
+ confidence: 0.9,
167
+ validate: ibanValid
168
+ },
169
+ {
170
+ // E.164 international phone: ONLY with a leading `+` (a bare digit run is an
171
+ // id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
172
+ id: "e164-phone",
173
+ type: "phone",
174
+ pattern: "(?<![\\w+])\\+[1-9]\\d{6,14}(?![\\w])",
175
+ flags: "g",
176
+ confidence: 0.8
177
+ },
178
+ {
179
+ // US national phone: ONLY with separators — `(NXX) NXX-XXXX` or
180
+ // `NXX-NXX-XXXX`. A separator-less 10-digit run is deliberately NOT matched
181
+ // (it collides with ids/timestamps; the kr-phone rule already guards bare
182
+ // runs). Conservative by design — phone is the highest false-positive risk.
183
+ id: "us-phone",
184
+ type: "phone",
185
+ pattern: "(?<![\\w-])(?:\\(\\d{3}\\)\\s?|\\d{3}-)\\d{3}-\\d{4}(?![\\w-])",
186
+ flags: "g",
187
+ confidence: 0.75
188
+ },
58
189
  // Indirect prompt injection heuristics. Response/tool-result direction only,
59
190
  // and the policy default for the injection type is `allow` (report-only):
60
191
  // detections are audited regardless of action, and false-positive blocks
@@ -139,8 +270,62 @@ export function detectEntry(entry, rules, context = {}) {
139
270
  // marker-shaped string is NOT Haechi output (Haechi hasn't transformed it yet),
140
271
  // so it is scanned normally — otherwise an attacker could wrap a real secret in
141
272
  // a fake `[TOKEN:…]` to evade request-side detection.
273
+ // Markers are pure ASCII and NFKC-stable, so their spans are computed on the
274
+ // ORIGINAL value exactly as before — they line up with the position-stable
275
+ // normalized scan (Case 2 below) and are irrelevant to the whole-leaf scan
276
+ // (Case 3).
142
277
  const markerSpans = context?.direction === "response" ? haechiMarkerSpans(entry.value) : [];
143
278
 
279
+ // WS2d — Unicode evasion via NFKC normalization. A client can defeat every
280
+ // regex rule by sending PII/secrets in a Unicode form that folds to ASCII
281
+ // (full-width digits `4242…`, full-width `@`, mathematical/enclosed
282
+ // alphanumerics). NFKC normalization maps those to their compatibility ASCII
283
+ // form so the rules match. The crux is OFFSET INTEGRITY: detections carry
284
+ // {start,end} into entry.value, but the transform slices the ORIGINAL string
285
+ // (packages/core transformString). Three cases keep offsets valid:
286
+ const value = entry.value;
287
+ const normalized = value.normalize("NFKC");
288
+ if (normalized === value) {
289
+ // Case 1 (~99%): nothing folded. Detect on the original exactly as before —
290
+ // byte-identical behavior, zero regression.
291
+ return removeOverlaps(scanForDetections(value, rules, context, markerSpans, entry, value));
292
+ }
293
+ if (isPositionStableNfkc(value, normalized)) {
294
+ // Case 2: every codepoint folded to the SAME UTF-16 length and the per-
295
+ // codepoint folds reconstruct the whole normalization, so each original
296
+ // character occupies the SAME offsets in `normalized` as in `value` (e.g.
297
+ // full-width→ASCII digits/letters). A match's {start,end} on `normalized` are
298
+ // therefore valid on the ORIGINAL value — exact-span redaction of the evaded
299
+ // value, with the recorded `value` taken from the original slice so
300
+ // tokenize/AAD/audit see the real bytes. A bare `normalized.length ===
301
+ // value.length` check is UNSOUND: a length-contracting codepoint before the
302
+ // PII compensated by a length-expanding one after it keeps the total length
303
+ // equal yet shifts every interior offset (redacting the wrong bytes), so such
304
+ // inputs must fall through to the Case 3 whole-leaf path. Validators still run
305
+ // on the normalized match text (Luhn/RRN need ASCII digits).
306
+ return removeOverlaps(scanForDetections(normalized, rules, context, markerSpans, entry, value));
307
+ }
308
+ // Case 3: the fold is NOT position-stable (a length-changing decomposition, or a
309
+ // compensating contraction+expansion that shifts interior offsets). Offsets on
310
+ // the normalized copy do NOT map back to the original, so we CANNOT do exact-span
311
+ // redaction.
312
+ // FAIL CLOSED: emit ONE detection per matched type covering the WHOLE leaf so
313
+ // the transform redacts/blocks the entire leaf. Over-redacting an evasion
314
+ // attempt is the safe failure. removeOverlaps is intentionally skipped — every
315
+ // detection spans the whole leaf so they all "overlap"; the transform collapses
316
+ // them to a single whole-leaf replacement via its cursor, and any `block` among
317
+ // them blocks the payload, while preserving per-type detection reporting.
318
+ return wholeLeafDetections(normalized, rules, context, entry, value);
319
+ }
320
+
321
+ // Run every applicable rule over `scanText` (the original value, or its
322
+ // position-stable NFKC normalization). Offsets index `scanText`, which is positionally
323
+ // 1:1 with `originalValue` (Case 1: identical; Case 2: every codepoint folds to the same UTF-16 length), so the
324
+ // {start,end} are valid on `originalValue`. The recorded `value` is the ORIGINAL
325
+ // slice (never the normalized form). Marker spans (response-only) are computed on
326
+ // the original and align under both cases.
327
+ function scanForDetections(scanText, rules, context, markerSpans, entry, originalValue) {
328
+ const detections = [];
144
329
  for (const rule of rules) {
145
330
  // Direction-scoped rules (e.g. injection heuristics) only run on the
146
331
  // matching traffic direction; rules without a direction run everywhere.
@@ -148,13 +333,13 @@ export function detectEntry(entry, rules, context = {}) {
148
333
  continue;
149
334
  }
150
335
  const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
151
- for (const match of entry.value.matchAll(regex)) {
152
- const value = match[0];
153
- if (rule.validate && !rule.validate(value)) {
336
+ for (const match of scanText.matchAll(regex)) {
337
+ const matchText = match[0];
338
+ if (rule.validate && !rule.validate(matchText)) {
154
339
  continue;
155
340
  }
156
341
  const start = match.index;
157
- const end = match.index + value.length;
342
+ const end = match.index + matchText.length;
158
343
  if (overlapsAny(start, end, markerSpans)) {
159
344
  continue;
160
345
  }
@@ -167,12 +352,73 @@ export function detectEntry(entry, rules, context = {}) {
167
352
  start,
168
353
  end,
169
354
  confidence: rule.confidence,
170
- value
355
+ value: originalValue.slice(start, end)
171
356
  });
172
357
  }
173
358
  }
359
+ return detections;
360
+ }
361
+
362
+ // Case 3 fail-closed scan: discover which types the NFKC-normalized text matches,
363
+ // then emit one whole-leaf detection per distinct type (start:0, end:value.length,
364
+ // value: the whole original leaf). The response-direction marker skip does NOT
365
+ // apply here: a leaf whose NFKC fold is not position-stable cannot BE a Haechi marker (markers are ASCII
366
+ // and NFKC-stable), so an evasion attempt can never masquerade as one.
367
+ function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
368
+ const seenTypes = new Set();
369
+ const detections = [];
370
+ for (const rule of rules) {
371
+ if (rule.direction && rule.direction !== context?.direction) {
372
+ continue;
373
+ }
374
+ if (seenTypes.has(rule.type)) {
375
+ continue;
376
+ }
377
+ const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
378
+ let matched = false;
379
+ for (const match of normalized.matchAll(regex)) {
380
+ if (!rule.validate || rule.validate(match[0])) {
381
+ matched = true;
382
+ break;
383
+ }
384
+ }
385
+ if (!matched) {
386
+ continue;
387
+ }
388
+ seenTypes.add(rule.type);
389
+ detections.push({
390
+ type: rule.type,
391
+ ruleId: rule.id,
392
+ path: entry.path,
393
+ pathText: entry.pathText,
394
+ kind: entry.kind ?? "value",
395
+ start: 0,
396
+ end: originalValue.length,
397
+ confidence: rule.confidence,
398
+ value: originalValue
399
+ });
400
+ }
401
+ return detections;
402
+ }
174
403
 
175
- return removeOverlaps(detections);
404
+ // Sound precondition for Case 2: a match's {start,end} on the NFKC-normalized
405
+ // text map 1:1 onto the ORIGINAL value. True only when EVERY codepoint folds to
406
+ // the same number of UTF-16 units (so no interior offset shifts) AND the per-
407
+ // codepoint folds concatenate to the whole normalization (so no cross-boundary
408
+ // composition moved content). The bare `normalized.length === value.length` check
409
+ // is unsound — a contraction before the PII compensated by an expansion after it
410
+ // keeps the total length equal while shifting every interior offset, redacting the
411
+ // wrong bytes. Runs only on a leaf that actually folded (normalized !== value).
412
+ function isPositionStableNfkc(value, normalized) {
413
+ let rebuilt = "";
414
+ for (const ch of value) {
415
+ const folded = ch.normalize("NFKC");
416
+ if (folded.length !== ch.length) {
417
+ return false;
418
+ }
419
+ rebuilt += folded;
420
+ }
421
+ return rebuilt === normalized;
176
422
  }
177
423
 
178
424
  // Spans of Haechi's own transform markers in a string, so detection can skip
@@ -291,3 +537,47 @@ function krRrnValid(value) {
291
537
  const check = (11 - (sum % 11)) % 10;
292
538
  return check === Number(digits[12]);
293
539
  }
540
+
541
+ // US SSN structural validity (SSA allocation rules). The format `AAA-GG-SSSS`
542
+ // alone collides with arbitrary 9-digit ids, so we reject the never-issued
543
+ // ranges: area 000, 666, and 900-999; group 00; serial 0000. This is what turns
544
+ // the loose shape into a high-precision detection.
545
+ function usSsnValid(value) {
546
+ const match = /^(\d{3})-(\d{2})-(\d{4})$/.exec(value);
547
+ if (!match) {
548
+ return false;
549
+ }
550
+ const area = Number(match[1]);
551
+ const group = Number(match[2]);
552
+ const serial = Number(match[3]);
553
+ if (area === 0 || area === 666 || area >= 900) {
554
+ return false;
555
+ }
556
+ if (group === 0) {
557
+ return false;
558
+ }
559
+ if (serial === 0) {
560
+ return false;
561
+ }
562
+ return true;
563
+ }
564
+
565
+ // IBAN mod-97 checksum (ISO 7064 / ISO 13616). Move the first four chars to the
566
+ // end, map letters to 10-35, and the resulting integer must be congruent to 1
567
+ // mod 97. Computed digit-by-digit so the big integer never overflows. This
568
+ // checksum is the precision guarantee — random alnum runs almost never pass.
569
+ function ibanValid(value) {
570
+ const iban = value.replace(/\s/g, "").toUpperCase();
571
+ if (!/^[A-Z]{2}\d{2}[A-Z0-9]{11,30}$/.test(iban)) {
572
+ return false;
573
+ }
574
+ const rearranged = iban.slice(4) + iban.slice(0, 4);
575
+ let remainder = 0;
576
+ for (const char of rearranged) {
577
+ const mapped = /\d/.test(char) ? char : String(char.charCodeAt(0) - 55);
578
+ for (const digit of mapped) {
579
+ remainder = (remainder * 10 + Number(digit)) % 97;
580
+ }
581
+ }
582
+ return remainder === 1;
583
+ }