haechi 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SECURITY.md +7 -1
- package/docs/README.md +2 -0
- package/docs/current/compliance-mapping.ko.md +53 -0
- package/docs/current/compliance-mapping.md +53 -0
- package/docs/current/config-version.ko.md +30 -0
- package/docs/current/config-version.md +51 -0
- package/docs/current/configuration.ko.md +147 -7
- package/docs/current/configuration.md +147 -7
- package/docs/current/operations-runbook.ko.md +121 -0
- package/docs/current/operations-runbook.md +204 -0
- package/docs/current/release-process.ko.md +1 -1
- package/docs/current/release-process.md +1 -1
- package/docs/current/risk-register-release-gate.ko.md +3 -2
- package/docs/current/risk-register-release-gate.md +11 -2
- package/docs/current/security-whitepaper.ko.md +102 -0
- package/docs/current/security-whitepaper.md +102 -0
- package/docs/current/shared-responsibility.ko.md +2 -2
- package/docs/current/shared-responsibility.md +2 -2
- package/docs/current/threat-model.ko.md +3 -2
- package/docs/current/threat-model.md +3 -2
- package/haechi.config.example.json +19 -3
- package/package.json +5 -2
- package/packages/audit/index.mjs +26 -2
- package/packages/cli/bin/haechi.mjs +54 -8
- package/packages/cli/runtime.mjs +391 -10
- package/packages/core/index.mjs +143 -8
- package/packages/filter/index.mjs +299 -9
- package/packages/metrics/index.mjs +181 -0
- package/packages/proxy/index.mjs +518 -39
package/packages/core/index.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { createHash, randomUUID } from "node:crypto";
|
|
2
|
+
import { HARD_BLOCK_TYPES } from "../filter/index.mjs";
|
|
2
3
|
|
|
3
4
|
const NO_ENFORCE_MODES = new Set(["dry-run", "report-only"]);
|
|
4
5
|
|
|
@@ -10,7 +11,7 @@ const NO_ENFORCE_MODES = new Set(["dry-run", "report-only"]);
|
|
|
10
11
|
// limits.maxNestingDepth through createHaechi → protectJson instead.
|
|
11
12
|
export const DEFAULT_MAX_NESTING_DEPTH = 256;
|
|
12
13
|
|
|
13
|
-
export function createHaechi({ filterEngine, policyEngine, cryptoProvider, auditSink, tokenVault = null, mode = "dry-run", limits = {} }) {
|
|
14
|
+
export function createHaechi({ filterEngine, policyEngine, cryptoProvider, auditSink, tokenVault = null, mode = "dry-run", limits = {}, precision = {} }) {
|
|
14
15
|
if (!filterEngine || !policyEngine || !cryptoProvider || !auditSink) {
|
|
15
16
|
throw new Error("Haechi requires filterEngine, policyEngine, cryptoProvider, and auditSink");
|
|
16
17
|
}
|
|
@@ -20,6 +21,15 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
|
|
|
20
21
|
? limits.maxNestingDepth
|
|
21
22
|
: DEFAULT_MAX_NESTING_DEPTH;
|
|
22
23
|
|
|
24
|
+
// WS2c precision controls, resolved once. `minConfidence` is the precision dial
|
|
25
|
+
// (drop a detection below the threshold) and `allowlist` is the operator FP
|
|
26
|
+
// exception set. Both are PROTECTION-PRESERVING (fail-closed): they may only TRIM
|
|
27
|
+
// precision-risky soft-type detections and can NEVER suppress a hard-block type
|
|
28
|
+
// (secret/api_key/kr_rrn/card) — that load-bearing exemption is enforced in
|
|
29
|
+
// applyPrecisionControls, not trusted to config. Default {} = current behavior.
|
|
30
|
+
const minConfidence = Number.isFinite(precision.minConfidence) ? precision.minConfidence : 0;
|
|
31
|
+
const allowlist = compileAllowlist(precision.allowlist);
|
|
32
|
+
|
|
23
33
|
async function protectJson(payload, rawContext = {}) {
|
|
24
34
|
// A per-request policy engine (a named profile selected from identity)
|
|
25
35
|
// overrides the default. It is a control object, NOT data: strip it before
|
|
@@ -35,7 +45,13 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
|
|
|
35
45
|
// `context.direction` ("request" | "response") gates direction-scoped rules
|
|
36
46
|
// (injection) and the response-only marker exclusion in the filter engine.
|
|
37
47
|
// The proxy sets it per direction; do not drop it here.
|
|
38
|
-
const
|
|
48
|
+
const rawDetections = await filterEngine.detect({ entries, context });
|
|
49
|
+
// WS2c precision controls run AFTER detect and BEFORE decide: drop a low-
|
|
50
|
+
// confidence soft-type detection (minConfidence) and suppress an allowlisted
|
|
51
|
+
// soft-type detection — never a hard-block type. `precisionAudit` carries the
|
|
52
|
+
// per-type counts of what was suppressed/dropped so the audit event records
|
|
53
|
+
// it (counts/types only, never the raw value). See applyPrecisionControls.
|
|
54
|
+
const { detections, precisionAudit } = applyPrecisionControls(rawDetections, { minConfidence, allowlist });
|
|
39
55
|
const decisions = [];
|
|
40
56
|
|
|
41
57
|
for (const detection of detections) {
|
|
@@ -62,7 +78,8 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
|
|
|
62
78
|
blocked,
|
|
63
79
|
payload,
|
|
64
80
|
detections,
|
|
65
|
-
decisions
|
|
81
|
+
decisions,
|
|
82
|
+
precisionAudit
|
|
66
83
|
});
|
|
67
84
|
|
|
68
85
|
await auditSink.record(auditEvent);
|
|
@@ -70,7 +87,7 @@ export function createHaechi({ filterEngine, policyEngine, cryptoProvider, audit
|
|
|
70
87
|
return {
|
|
71
88
|
payload: protectedPayload,
|
|
72
89
|
blocked,
|
|
73
|
-
summary: summarize(detections, decisions),
|
|
90
|
+
summary: summarize(detections, decisions, precisionAudit),
|
|
74
91
|
auditEvent,
|
|
75
92
|
issuedTokens: [...issuedTokens]
|
|
76
93
|
};
|
|
@@ -274,7 +291,7 @@ export function shapeOnly(value) {
|
|
|
274
291
|
return { type: value === null ? "null" : typeof value };
|
|
275
292
|
}
|
|
276
293
|
|
|
277
|
-
export function summarize(detections, decisions) {
|
|
294
|
+
export function summarize(detections, decisions, precisionAudit = null) {
|
|
278
295
|
const byType = {};
|
|
279
296
|
const byAction = {};
|
|
280
297
|
|
|
@@ -286,11 +303,121 @@ export function summarize(detections, decisions) {
|
|
|
286
303
|
byAction[decision.action] = (byAction[decision.action] ?? 0) + 1;
|
|
287
304
|
}
|
|
288
305
|
|
|
289
|
-
|
|
306
|
+
const summary = {
|
|
290
307
|
detectionCount: detections.length,
|
|
291
308
|
byType,
|
|
292
309
|
byAction
|
|
293
310
|
};
|
|
311
|
+
|
|
312
|
+
// WS2c: additively record how many detections the precision controls removed
|
|
313
|
+
// before decide — `suppressedCount`/`suppressedByType` for allowlist FP
|
|
314
|
+
// exceptions and `droppedCount`/`droppedByType` for sub-minConfidence drops.
|
|
315
|
+
// Counts and types only; the matched value is NEVER recorded (no-plaintext-in-
|
|
316
|
+
// audit). Omitted entirely when nothing was removed, so 1.1 events are byte-
|
|
317
|
+
// identical and the audit hash-chain canonicalization is unaffected.
|
|
318
|
+
if (precisionAudit && precisionAudit.suppressedCount > 0) {
|
|
319
|
+
summary.suppressedCount = precisionAudit.suppressedCount;
|
|
320
|
+
summary.suppressedByType = precisionAudit.suppressedByType;
|
|
321
|
+
}
|
|
322
|
+
if (precisionAudit && precisionAudit.droppedCount > 0) {
|
|
323
|
+
summary.droppedCount = precisionAudit.droppedCount;
|
|
324
|
+
summary.droppedByType = precisionAudit.droppedByType;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
return summary;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// Compile the configured allowlist into fast lookup structures.
//
// An entry is either a bare string (an exact matched-VALUE exception) or an
// object { value?, path? }: a value exception, a JSON-path exception (compared
// against the PII-safe pathText), or both — when both are present BOTH must
// match, so the entry is stored as a pair instead of in the two sets.
//
// Returns null when there is nothing to allowlist (not an array, empty, or no
// usable entry) so the hot path can skip the work entirely; otherwise returns
// { values: Set<string>, paths: Set<string>, pairs: Array<{ value, path }> }.
function compileAllowlist(allowlist) {
  if (!Array.isArray(allowlist) || allowlist.length === 0) {
    return null;
  }
  const values = new Set();
  const paths = new Set();
  const pairs = [];
  for (const entry of allowlist) {
    if (typeof entry === "string") {
      values.add(entry);
      continue;
    }
    // A null/undefined/primitive entry (e.g. a stray `null` in the JSON config)
    // declares no exception. Skip it rather than throwing on the property reads
    // below — a malformed allowlist entry must never take the engine down.
    if (entry === null || typeof entry !== "object") {
      continue;
    }
    const hasValue = typeof entry.value === "string";
    const hasPath = typeof entry.path === "string";
    if (hasValue && hasPath) {
      pairs.push({ value: entry.value, path: entry.path });
    } else if (hasValue) {
      values.add(entry.value);
    } else if (hasPath) {
      paths.add(entry.path);
    }
  }
  // Every entry was unusable: honor the "null when nothing to allowlist"
  // contract so callers take the same fast path as for an empty config.
  if (values.size === 0 && paths.size === 0 && pairs.length === 0) {
    return null;
  }
  return { values, paths, pairs };
}
|
|
359
|
+
|
|
360
|
+
// Report whether a detection is covered by the compiled allowlist.
//
// A detection is covered when its matched value is a value exception, when its
// PII-safe `pathText` (the hashed path form, e.g. `key_<hash>.…`) is a path
// exception, or when some value+path pair matches on BOTH fields. A falsy
// allowlist (nothing configured) covers nothing.
function isAllowlisted(detection, allowlist) {
  if (!allowlist) {
    return false;
  }
  const matchedValue = detection.value;
  const hashedPath = detection.pathText;

  const valueExcepted = typeof matchedValue === "string" && allowlist.values.has(matchedValue);
  if (valueExcepted) {
    return true;
  }

  const pathExcepted = typeof hashedPath === "string" && allowlist.paths.has(hashedPath);
  if (pathExcepted) {
    return true;
  }

  // Pair entries require the value AND the path to match together.
  return allowlist.pairs.some((rule) => matchedValue === rule.value && hashedPath === rule.path);
}
|
|
381
|
+
|
|
382
|
+
// WS2c precision controls: the filtering step between detect and decide.
//
// Takes the raw detections and returns `{ detections, precisionAudit }`, where
// `detections` are the survivors (original order preserved) and
// `precisionAudit` holds counts and per-type tallies of what was removed —
// never the matched value itself.
//
// HARD-BLOCK INVARIANT (load-bearing, fail-closed): a detection whose type is
// in HARD_BLOCK_TYPES always survives. Neither an allowlist entry nor a low
// confidence can remove it; both dials apply to soft types only. The guard is
// enforced here rather than trusted to config so it holds for every caller.
export function applyPrecisionControls(detections, { minConfidence = 0, allowlist = null } = {}) {
  const survivors = [];
  const suppressed = { count: 0, byType: {} };
  const dropped = { count: 0, byType: {} };

  // Record one removed detection of `type` in the given tally.
  const tally = (bucket, type) => {
    bucket.byType[type] = (bucket.byType[type] ?? 0) + 1;
    bucket.count += 1;
  };

  for (const candidate of detections) {
    // Hard-block types bypass both precision dials unconditionally.
    if (HARD_BLOCK_TYPES.has(candidate.type)) {
      survivors.push(candidate);
      continue;
    }
    // Allowlist first: an operator-declared false-positive exception.
    if (isAllowlisted(candidate, allowlist)) {
      tally(suppressed, candidate.type);
      continue;
    }
    // Then the confidence floor. A detection without a finite confidence is
    // kept rather than dropped.
    const belowFloor = Number.isFinite(candidate.confidence) && candidate.confidence < minConfidence;
    if (belowFloor) {
      tally(dropped, candidate.type);
      continue;
    }
    survivors.push(candidate);
  }

  return {
    detections: survivors,
    precisionAudit: {
      suppressedCount: suppressed.count,
      suppressedByType: suppressed.byType,
      droppedCount: dropped.count,
      droppedByType: dropped.byType
    }
  };
}
|
|
295
422
|
|
|
296
423
|
async function transformPayload(payload, detections, decisions, { context, cryptoProvider, tokenVault, enforced, issuedTokens = null }) {
|
|
@@ -424,7 +551,7 @@ async function replacementFor(segment, detection, decision, { context, cryptoPro
|
|
|
424
551
|
}
|
|
425
552
|
}
|
|
426
553
|
|
|
427
|
-
function buildAuditEvent({ context, mode, enforced, blocked, payload, detections, decisions }) {
|
|
554
|
+
function buildAuditEvent({ context, mode, enforced, blocked, payload, detections, decisions, precisionAudit = null }) {
|
|
428
555
|
return {
|
|
429
556
|
// Reader-facing audit-event schema version (frozen as part of the 1.0 API
|
|
430
557
|
// contract — see docs/current/api-stability.md). Additive-only: a new field
|
|
@@ -433,6 +560,14 @@ function buildAuditEvent({ context, mode, enforced, blocked, payload, detections
|
|
|
433
560
|
// and so is self-consistent for hash-chain verification of new events.
|
|
434
561
|
schemaVersion: "1",
|
|
435
562
|
id: randomUUID(),
|
|
563
|
+
// Per-REQUEST correlation id (WS4-A). Additive top-level field: the proxy
|
|
564
|
+
// generates one randomUUID() per request and threads it into the protect
|
|
565
|
+
// context, so the request- and response-direction events of ONE request
|
|
566
|
+
// share it (and it appears in the structured error log for the same request).
|
|
567
|
+
// It is null when no context.correlationId is set, preserving the existing
|
|
568
|
+
// non-proxy protectJson() behavior and keeping the api-contract subset green.
|
|
569
|
+
// It is a UUID — never a payload/identity/PII value.
|
|
570
|
+
correlationId: context.correlationId ?? null,
|
|
436
571
|
timestamp: new Date().toISOString(),
|
|
437
572
|
protocol: context.protocol ?? "custom",
|
|
438
573
|
operation: context.operation ?? "protect",
|
|
@@ -463,7 +598,7 @@ function buildAuditEvent({ context, mode, enforced, blocked, payload, detections
|
|
|
463
598
|
action: decisions[index]?.action ?? "unknown",
|
|
464
599
|
enforced
|
|
465
600
|
})),
|
|
466
|
-
summary: summarize(detections, decisions)
|
|
601
|
+
summary: summarize(detections, decisions, precisionAudit)
|
|
467
602
|
};
|
|
468
603
|
}
|
|
469
604
|
|
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
// The hard-block detection types: a leak of one of these is a load-bearing
|
|
2
|
+
// fail-closed concern, so the WS2c precision dials (filters.minConfidence,
|
|
3
|
+
// filters.allowlist) may NOT suppress a detection of any of them. minConfidence
|
|
4
|
+
// trims only the precision-risky SOFT types; the allowlist's per-value/per-path
|
|
5
|
+
// exceptions are ignored for these types (the detection still fires). Exported
|
|
6
|
+
// so the core detect→decide path enforces the same exemption set the docs pin.
|
|
7
|
+
export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card"]);
|
|
8
|
+
|
|
1
9
|
const DEFAULT_RULES = [
|
|
2
10
|
{
|
|
3
11
|
id: "email",
|
|
@@ -9,9 +17,15 @@ const DEFAULT_RULES = [
|
|
|
9
17
|
{
|
|
10
18
|
// KR mobile numbers (01[016789] prefixes); landlines are out of scope.
|
|
11
19
|
// krPhoneValid keeps a bare separator-less run from matching a timestamp/id.
|
|
20
|
+
// The leading `(?<![\w+-])` / trailing `(?![\w-])` boundaries (WS2c) stop the
|
|
21
|
+
// rule from matching a phone-shaped digit run that is a SUBSTRING of a longer
|
|
22
|
+
// hex/alnum/dashed run — e.g. the `…a716-446655440000` tail of a UUID, where
|
|
23
|
+
// the inner `16-44665544` otherwise mis-fired as a phone. The boundaries
|
|
24
|
+
// never affect a real number: a KR mobile sits on a word/space/punctuation
|
|
25
|
+
// edge and `+82` starts on the `+` (allowed before the boundary).
|
|
12
26
|
id: "kr-phone",
|
|
13
27
|
type: "phone",
|
|
14
|
-
pattern: "(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}",
|
|
28
|
+
pattern: "(?<![\\w+-])(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}(?![\\w-])",
|
|
15
29
|
flags: "g",
|
|
16
30
|
confidence: 0.9,
|
|
17
31
|
validate: krPhoneValid
|
|
@@ -40,6 +54,76 @@ const DEFAULT_RULES = [
|
|
|
40
54
|
confidence: 0.95
|
|
41
55
|
},
|
|
42
56
|
{
|
|
57
|
+
// AWS access key id: a long-lived (AKIA) or temporary (ASIA) key id is a
|
|
58
|
+
// hard-anchored prefix + EXACTLY 16 uppercase-alphanumeric chars. The fixed
|
|
59
|
+
// prefix + fixed length is what makes this high-precision (no bare base64).
|
|
60
|
+
id: "aws-access-key-id",
|
|
61
|
+
type: "api_key",
|
|
62
|
+
pattern: "\\b(?:AKIA|ASIA)[0-9A-Z]{16}\\b",
|
|
63
|
+
flags: "g",
|
|
64
|
+
confidence: 0.95
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
// GitHub token: pat (ghp_), oauth (gho_), user-to-server (ghu_), server-to-
|
|
68
|
+
// server (ghs_), refresh (ghr_). Anchored prefix + a long base64-ish body.
|
|
69
|
+
// GitHub's own format is 36 chars after the prefix; we allow >=36 (the
|
|
70
|
+
// corpus fixture is 38) and cap to keep the match bounded.
|
|
71
|
+
id: "github-token",
|
|
72
|
+
type: "secret",
|
|
73
|
+
pattern: "\\bgh[pousr]_[A-Za-z0-9]{36,255}\\b",
|
|
74
|
+
flags: "g",
|
|
75
|
+
confidence: 0.95
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
// Google API key: anchored AIza + exactly 35 chars from the URL-safe
|
|
79
|
+
// alphabet. Fixed prefix + fixed length = high precision.
|
|
80
|
+
id: "google-api-key",
|
|
81
|
+
type: "api_key",
|
|
82
|
+
pattern: "\\bAIza[0-9A-Za-z_-]{35}\\b",
|
|
83
|
+
flags: "g",
|
|
84
|
+
confidence: 0.9
|
|
85
|
+
},
|
|
86
|
+
{
|
|
87
|
+
// Slack token: bot (xoxb-), user (xoxa/xoxp-), refresh (xoxr-), legacy
|
|
88
|
+
// (xoxs-). Anchored xox[baprs]- + a >=10-char body. The corpus value is a
|
|
89
|
+
// deliberately low-entropy placeholder, so the rule anchors on the prefix +
|
|
90
|
+
// body shape, not entropy.
|
|
91
|
+
id: "slack-token",
|
|
92
|
+
type: "secret",
|
|
93
|
+
pattern: "\\bxox[baprs]-[0-9A-Za-z-]{10,}\\b",
|
|
94
|
+
flags: "g",
|
|
95
|
+
confidence: 0.9
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
// JWT: three dot-separated base64url segments where the FIRST starts with
|
|
99
|
+
// `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
|
|
100
|
+
// on `eyJ` + two more base64url groups keeps this from matching arbitrary
|
|
101
|
+
// dotted tokens (a bare base64 triplet without the JSON header is not a JWT).
|
|
102
|
+
id: "jwt",
|
|
103
|
+
type: "secret",
|
|
104
|
+
pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
|
|
105
|
+
flags: "g",
|
|
106
|
+
confidence: 0.9
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
// PEM private key: the armored header. We match the header line itself
|
|
110
|
+
// (`-----BEGIN [...] PRIVATE KEY-----`) — its presence is the credential
|
|
111
|
+
// signal; the body is high-entropy base64 we do not need to span. Covers
|
|
112
|
+
// RSA/EC/OPENSSH/DSA/ENCRYPTED variants and the bare `PRIVATE KEY` form.
|
|
113
|
+
id: "pem-private-key",
|
|
114
|
+
type: "secret",
|
|
115
|
+
pattern: "-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
|
|
116
|
+
flags: "g",
|
|
117
|
+
confidence: 0.98
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
// Bearer credential. Deliberately NOT context-anchored to `Authorization:`:
|
|
121
|
+
// detection runs PER STRING LEAF, and a real payload carries the credential
|
|
122
|
+
// as its own leaf (`{"Authorization": "Bearer <token>"}` walks to the bare
|
|
123
|
+
// value `"Bearer <token>"`), so a lookbehind requiring the header key in the
|
|
124
|
+
// same string would MISS the realistic case — a recall regression on a
|
|
125
|
+
// hard-block (`secret`) type. `secret` is fail-closed: a `Bearer …` prose
|
|
126
|
+
// false positive is the accepted cost of never missing a leaked token.
|
|
43
127
|
id: "bearer-token",
|
|
44
128
|
type: "secret",
|
|
45
129
|
pattern: "\\bBearer\\s+[A-Za-z0-9._~+/-]{16,}\\b",
|
|
@@ -50,11 +134,58 @@ const DEFAULT_RULES = [
|
|
|
50
134
|
id: "assignment-secret",
|
|
51
135
|
type: "secret",
|
|
52
136
|
// Lookbehind keeps the key name out of the match so transforms replace
|
|
53
|
-
// only the secret value, not the assignment prefix.
|
|
54
|
-
|
|
137
|
+
// only the secret value, not the assignment prefix. The key vocabulary
|
|
138
|
+
// covers the common credential-assignment names (cloud secrets, OAuth
|
|
139
|
+
// client secrets, PEM/private keys, access/refresh tokens) so a
|
|
140
|
+
// `<key> = <value>` leak is caught even when the value itself has no
|
|
141
|
+
// self-describing prefix (e.g. an AWS secret access key is bare base64).
|
|
142
|
+
pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
|
|
55
143
|
flags: "gi",
|
|
56
144
|
confidence: 0.85
|
|
57
145
|
},
|
|
146
|
+
{
|
|
147
|
+
// US SSN: AAA-GG-SSSS. The format alone collides with 9-digit ids, so a
|
|
148
|
+
// validator rejects the SSA-invalid ranges (area 000/666/900-999, group 00,
|
|
149
|
+
// serial 0000). The separators are required by the pattern — a bare 9-digit
|
|
150
|
+
// run is intentionally NOT matched (it is indistinguishable from an id).
|
|
151
|
+
id: "us-ssn",
|
|
152
|
+
type: "us_ssn",
|
|
153
|
+
pattern: "(?<![\\w-])\\d{3}-\\d{2}-\\d{4}(?![\\w-])",
|
|
154
|
+
flags: "g",
|
|
155
|
+
confidence: 0.85,
|
|
156
|
+
validate: usSsnValid
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
// IBAN: country(2 alpha) + 2 check digits + BBAN. The mod-97 checksum is
|
|
160
|
+
// what makes this high-precision — a random alnum run of the right shape
|
|
161
|
+
// almost never satisfies mod-97 == 1. Length 15-34 per ISO 13616.
|
|
162
|
+
id: "iban",
|
|
163
|
+
type: "iban",
|
|
164
|
+
pattern: "(?<![A-Z0-9])[A-Z]{2}\\d{2}[A-Z0-9]{11,30}(?![A-Z0-9])",
|
|
165
|
+
flags: "g",
|
|
166
|
+
confidence: 0.9,
|
|
167
|
+
validate: ibanValid
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
// E.164 international phone: ONLY with a leading `+` (a bare digit run is an
|
|
171
|
+
// id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
|
|
172
|
+
id: "e164-phone",
|
|
173
|
+
type: "phone",
|
|
174
|
+
pattern: "(?<![\\w+])\\+[1-9]\\d{6,14}(?![\\w])",
|
|
175
|
+
flags: "g",
|
|
176
|
+
confidence: 0.8
|
|
177
|
+
},
|
|
178
|
+
{
|
|
179
|
+
// US national phone: ONLY with separators — `(NXX) NXX-XXXX` or
|
|
180
|
+
// `NXX-NXX-XXXX`. A separator-less 10-digit run is deliberately NOT matched
|
|
181
|
+
// (it collides with ids/timestamps; the kr-phone rule already guards bare
|
|
182
|
+
// runs). Conservative by design — phone is the highest false-positive risk.
|
|
183
|
+
id: "us-phone",
|
|
184
|
+
type: "phone",
|
|
185
|
+
pattern: "(?<![\\w-])(?:\\(\\d{3}\\)\\s?|\\d{3}-)\\d{3}-\\d{4}(?![\\w-])",
|
|
186
|
+
flags: "g",
|
|
187
|
+
confidence: 0.75
|
|
188
|
+
},
|
|
58
189
|
// Indirect prompt injection heuristics. Response/tool-result direction only,
|
|
59
190
|
// and the policy default for the injection type is `allow` (report-only):
|
|
60
191
|
// detections are audited regardless of action, and false-positive blocks
|
|
@@ -139,8 +270,62 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
139
270
|
// marker-shaped string is NOT Haechi output (Haechi hasn't transformed it yet),
|
|
140
271
|
// so it is scanned normally — otherwise an attacker could wrap a real secret in
|
|
141
272
|
// a fake `[TOKEN:…]` to evade request-side detection.
|
|
273
|
+
// Markers are pure ASCII and NFKC-stable, so their spans are computed on the
|
|
274
|
+
// ORIGINAL value exactly as before — they line up with the same-length
|
|
275
|
+
// normalized scan (Case 2 below) and are irrelevant to the whole-leaf scan
|
|
276
|
+
// (Case 3).
|
|
142
277
|
const markerSpans = context?.direction === "response" ? haechiMarkerSpans(entry.value) : [];
|
|
143
278
|
|
|
279
|
+
// WS2d — Unicode evasion via NFKC normalization. A client can defeat every
|
|
280
|
+
// regex rule by sending PII/secrets in a Unicode form that folds to ASCII
|
|
281
|
+
// (full-width digits `４２４２…`, full-width `＠`, mathematical/enclosed
|
|
282
|
+
// alphanumerics). NFKC normalization maps those to their compatibility ASCII
|
|
283
|
+
// form so the rules match. The crux is OFFSET INTEGRITY: detections carry
|
|
284
|
+
// {start,end} into entry.value, but the transform slices the ORIGINAL string
|
|
285
|
+
// (packages/core transformString). Three cases keep offsets valid:
|
|
286
|
+
const value = entry.value;
|
|
287
|
+
const normalized = value.normalize("NFKC");
|
|
288
|
+
if (normalized === value) {
|
|
289
|
+
// Case 1 (~99%): nothing folded. Detect on the original exactly as before —
|
|
290
|
+
// byte-identical behavior, zero regression.
|
|
291
|
+
return removeOverlaps(scanForDetections(value, rules, context, markerSpans, entry, value));
|
|
292
|
+
}
|
|
293
|
+
if (isPositionStableNfkc(value, normalized)) {
|
|
294
|
+
// Case 2: every codepoint folded to the SAME UTF-16 length and the per-
|
|
295
|
+
// codepoint folds reconstruct the whole normalization, so each original
|
|
296
|
+
// character occupies the SAME offsets in `normalized` as in `value` (e.g.
|
|
297
|
+
// full-width→ASCII digits/letters). A match's {start,end} on `normalized` are
|
|
298
|
+
// therefore valid on the ORIGINAL value — exact-span redaction of the evaded
|
|
299
|
+
// value, with the recorded `value` taken from the original slice so
|
|
300
|
+
// tokenize/AAD/audit see the real bytes. A bare `normalized.length ===
|
|
301
|
+
// value.length` check is UNSOUND: a length-contracting codepoint before the
|
|
302
|
+
// PII compensated by a length-expanding one after it keeps the total length
|
|
303
|
+
// equal yet shifts every interior offset (redacting the wrong bytes), so such
|
|
304
|
+
// inputs must fall through to the Case 3 whole-leaf path. Validators still run
|
|
305
|
+
// on the normalized match text (Luhn/RRN need ASCII digits).
|
|
306
|
+
return removeOverlaps(scanForDetections(normalized, rules, context, markerSpans, entry, value));
|
|
307
|
+
}
|
|
308
|
+
// Case 3: the fold is NOT position-stable (a length-changing decomposition, or a
|
|
309
|
+
// compensating contraction+expansion that shifts interior offsets). Offsets on
|
|
310
|
+
// the normalized copy do NOT map back to the original, so we CANNOT do exact-span
|
|
311
|
+
// redaction.
|
|
312
|
+
// FAIL CLOSED: emit ONE detection per matched type covering the WHOLE leaf so
|
|
313
|
+
// the transform redacts/blocks the entire leaf. Over-redacting an evasion
|
|
314
|
+
// attempt is the safe failure. removeOverlaps is intentionally skipped — every
|
|
315
|
+
// detection spans the whole leaf so they all "overlap"; the transform collapses
|
|
316
|
+
// them to a single whole-leaf replacement via its cursor, and any `block` among
|
|
317
|
+
// them blocks the payload, while preserving per-type detection reporting.
|
|
318
|
+
return wholeLeafDetections(normalized, rules, context, entry, value);
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Run every applicable rule over `scanText` (the original value, or its
|
|
322
|
+
// same-length NFKC normalization). Offsets index `scanText`, which is positionally
|
|
323
|
+
// 1:1 with `originalValue` (Case 1: identical; Case 2: same UTF-16 length), so the
|
|
324
|
+
// {start,end} are valid on `originalValue`. The recorded `value` is the ORIGINAL
|
|
325
|
+
// slice (never the normalized form). Marker spans (response-only) are computed on
|
|
326
|
+
// the original and align under both cases.
|
|
327
|
+
function scanForDetections(scanText, rules, context, markerSpans, entry, originalValue) {
|
|
328
|
+
const detections = [];
|
|
144
329
|
for (const rule of rules) {
|
|
145
330
|
// Direction-scoped rules (e.g. injection heuristics) only run on the
|
|
146
331
|
// matching traffic direction; rules without a direction run everywhere.
|
|
@@ -148,13 +333,13 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
148
333
|
continue;
|
|
149
334
|
}
|
|
150
335
|
const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
|
|
151
|
-
for (const match of
|
|
152
|
-
const
|
|
153
|
-
if (rule.validate && !rule.validate(
|
|
336
|
+
for (const match of scanText.matchAll(regex)) {
|
|
337
|
+
const matchText = match[0];
|
|
338
|
+
if (rule.validate && !rule.validate(matchText)) {
|
|
154
339
|
continue;
|
|
155
340
|
}
|
|
156
341
|
const start = match.index;
|
|
157
|
-
const end = match.index +
|
|
342
|
+
const end = match.index + matchText.length;
|
|
158
343
|
if (overlapsAny(start, end, markerSpans)) {
|
|
159
344
|
continue;
|
|
160
345
|
}
|
|
@@ -167,12 +352,73 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
167
352
|
start,
|
|
168
353
|
end,
|
|
169
354
|
confidence: rule.confidence,
|
|
170
|
-
value
|
|
355
|
+
value: originalValue.slice(start, end)
|
|
171
356
|
});
|
|
172
357
|
}
|
|
173
358
|
}
|
|
359
|
+
return detections;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// Case 3 fail-closed scan. Finds which detection TYPES the NFKC-normalized
// text matches and emits one detection per distinct type that spans the whole
// ORIGINAL leaf (start 0, end originalValue.length, value = the whole original
// leaf). Only the first matching rule of a type is reported; later rules of an
// already-reported type are skipped. Direction-scoped rules run only for their
// own direction. No response-direction marker skip is applied in this path.
function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
  // True when some match of the rule's pattern also passes its validator.
  const hasValidMatch = (rule) => {
    const flags = rule.flags.includes("g") ? rule.flags : `${rule.flags}g`;
    const matcher = new RegExp(rule.pattern, flags);
    for (const found of normalized.matchAll(matcher)) {
      if (!rule.validate || rule.validate(found[0])) {
        return true;
      }
    }
    return false;
  };

  const reportedTypes = new Set();
  const wholeLeaf = [];
  for (const rule of rules) {
    const outOfDirection = rule.direction && rule.direction !== context?.direction;
    if (outOfDirection || reportedTypes.has(rule.type)) {
      continue;
    }
    if (hasValidMatch(rule)) {
      reportedTypes.add(rule.type);
      wholeLeaf.push({
        type: rule.type,
        ruleId: rule.id,
        path: entry.path,
        pathText: entry.pathText,
        kind: entry.kind ?? "value",
        start: 0,
        end: originalValue.length,
        confidence: rule.confidence,
        value: originalValue
      });
    }
  }
  return wholeLeaf;
}
|
|
174
403
|
|
|
175
|
-
|
|
404
|
+
// Sound precondition for Case 2: decides whether {start,end} offsets taken on
// the NFKC-normalized text are also valid on the ORIGINAL value.
//
// That holds only when (a) every codepoint folds to the same number of UTF-16
// units, so no interior offset shifts, and (b) the per-codepoint folds, laid
// end to end, reproduce `normalized` exactly, so no cross-boundary composition
// moved content. Comparing total lengths alone is unsound: a contraction before
// the PII offset by an expansion after it keeps the length equal while shifting
// every offset in between.
function isPositionStableNfkc(value, normalized) {
  let offset = 0;
  for (const codepoint of value) {
    const folded = codepoint.normalize("NFKC");
    // (a) this codepoint changed width, or (b) its fold is not what sits at the
    // same position in the whole-string normalization.
    if (folded.length !== codepoint.length || !normalized.startsWith(folded, offset)) {
      return false;
    }
    offset += folded.length;
  }
  // The folds must account for the entire normalized text, not just a prefix.
  return offset === normalized.length;
}
|
|
177
423
|
|
|
178
424
|
// Spans of Haechi's own transform markers in a string, so detection can skip
|
|
@@ -291,3 +537,47 @@ function krRrnValid(value) {
|
|
|
291
537
|
const check = (11 - (sum % 11)) % 10;
|
|
292
538
|
return check === Number(digits[12]);
|
|
293
539
|
}
|
|
540
|
+
|
|
541
|
+
// Structural validity of a US SSN written as `AAA-GG-SSSS`.
//
// The bare shape collides with arbitrary 9-digit ids, so the never-issued
// ranges are rejected: area 000, 666 and 900-999; group 00; serial 0000.
// Returns true only for a string of exactly that shape outside those ranges.
function usSsnValid(value) {
  const parts = /^(\d{3})-(\d{2})-(\d{4})$/.exec(value);
  if (parts === null) {
    return false;
  }
  const [area, group, serial] = parts.slice(1).map((digits) => Number(digits));
  const areaIssued = area !== 0 && area !== 666 && area < 900;
  return areaIssued && group !== 0 && serial !== 0;
}
|
|
564
|
+
|
|
565
|
+
// IBAN mod-97 checksum (ISO 7064 / ISO 13616).
//
// Whitespace is stripped and the text upper-cased, then the shape is checked
// (2 letters, 2 digits, 11-30 alphanumerics). The first four characters are
// moved to the end, letters are read as 10-35, and the resulting decimal
// number must be congruent to 1 mod 97. The remainder is folded character by
// character so the number is never materialized and cannot overflow.
function ibanValid(value) {
  const compact = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{2}[A-Z0-9]{11,30}$/.test(compact)) {
    return false;
  }
  const rotated = `${compact.slice(4)}${compact.slice(0, 4)}`;
  let remainder = 0;
  for (const symbol of rotated) {
    // Base-36 reads 0-9 as themselves and A-Z as 10-35. A letter contributes
    // two decimal digits, so it shifts the running value by 100 instead of 10.
    const numeric = Number.parseInt(symbol, 36);
    const shift = numeric > 9 ? 100 : 10;
    remainder = (remainder * shift + numeric) % 97;
  }
  return remainder === 1;
}
|