haechi 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +97 -97
- package/README.md +2 -2
- package/SECURITY.md +19 -11
- package/docs/README.md +2 -0
- package/docs/current/api-stability.ko.md +26 -26
- package/docs/current/compliance-mapping.ko.md +53 -0
- package/docs/current/compliance-mapping.md +53 -0
- package/docs/current/config-version.ko.md +30 -0
- package/docs/current/config-version.md +51 -0
- package/docs/current/configuration.ko.md +242 -102
- package/docs/current/configuration.md +149 -9
- package/docs/current/operations-runbook.ko.md +121 -0
- package/docs/current/operations-runbook.md +204 -0
- package/docs/current/release-process.ko.md +19 -20
- package/docs/current/release-process.md +1 -2
- package/docs/current/reliability-hardening-track.ko.md +77 -0
- package/docs/current/reliability-hardening-track.md +77 -0
- package/docs/current/risk-register-release-gate.ko.md +26 -27
- package/docs/current/risk-register-release-gate.md +27 -20
- package/docs/current/security-whitepaper.ko.md +102 -0
- package/docs/current/security-whitepaper.md +102 -0
- package/docs/current/shared-responsibility.ko.md +33 -24
- package/docs/current/shared-responsibility.md +12 -3
- package/docs/current/threat-model.ko.md +12 -12
- package/docs/current/threat-model.md +3 -3
- package/haechi.config.example.json +19 -3
- package/package.json +6 -2
- package/packages/audit/index.mjs +26 -2
- package/packages/cli/bin/haechi.mjs +54 -8
- package/packages/cli/runtime.mjs +398 -10
- package/packages/core/index.mjs +189 -15
- package/packages/filter/index.mjs +299 -9
- package/packages/metrics/index.mjs +181 -0
- package/packages/proxy/index.mjs +535 -41
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
// Hard-block detection types. A leak of any of these is a load-bearing
// fail-closed concern, so the WS2c precision dials (filters.minConfidence,
// filters.allowlist) may NOT suppress a detection of one of them:
//   - minConfidence trims only the precision-risky SOFT types;
//   - the allowlist's per-value/per-path exceptions are ignored for these
//     types (the detection still fires).
// Exported so the core detect→decide path enforces the same exemption set the
// docs pin.
// NOTE(review): a Set stays mutable after export (`.add`/`.delete` still work
// for any importer) — presumably consumers treat it as read-only; confirm.
export const HARD_BLOCK_TYPES = new Set(["secret", "api_key", "kr_rrn", "card"]);
|
|
8
|
+
|
|
1
9
|
const DEFAULT_RULES = [
|
|
2
10
|
{
|
|
3
11
|
id: "email",
|
|
@@ -9,9 +17,15 @@ const DEFAULT_RULES = [
|
|
|
9
17
|
{
|
|
10
18
|
// KR mobile numbers (01[016789] prefixes); landlines are out of scope.
|
|
11
19
|
// krPhoneValid keeps a bare separator-less run from matching a timestamp/id.
|
|
20
|
+
// The leading `(?<![\w+-])` / trailing `(?![\w-])` boundaries (WS2c) stop the
|
|
21
|
+
// rule from matching a phone-shaped digit run that is a SUBSTRING of a longer
|
|
22
|
+
// hex/alnum/dashed run — e.g. the `…a716-446655440000` tail of a UUID, where
|
|
23
|
+
// the inner `16-44665544` otherwise mis-fired as a phone. The boundaries
|
|
24
|
+
// never affect a real number: a KR mobile sits on a word/space/punctuation
|
|
25
|
+
// edge and `+82` starts on the `+` (allowed before the boundary).
|
|
12
26
|
id: "kr-phone",
|
|
13
27
|
type: "phone",
|
|
14
|
-
pattern: "(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}",
|
|
28
|
+
pattern: "(?<![\\w+-])(?:\\+82[-\\s]?)?0?1[016789][-.\\s]?\\d{3,4}[-.\\s]?\\d{4}(?![\\w-])",
|
|
15
29
|
flags: "g",
|
|
16
30
|
confidence: 0.9,
|
|
17
31
|
validate: krPhoneValid
|
|
@@ -40,6 +54,76 @@ const DEFAULT_RULES = [
|
|
|
40
54
|
confidence: 0.95
|
|
41
55
|
},
|
|
42
56
|
{
|
|
57
|
+
// AWS access key id: a long-lived (AKIA) or temporary (ASIA) key id is a
|
|
58
|
+
// hard-anchored prefix + EXACTLY 16 uppercase-alphanumeric chars. The fixed
|
|
59
|
+
// prefix + fixed length is what makes this high-precision (no bare base64).
|
|
60
|
+
id: "aws-access-key-id",
|
|
61
|
+
type: "api_key",
|
|
62
|
+
pattern: "\\b(?:AKIA|ASIA)[0-9A-Z]{16}\\b",
|
|
63
|
+
flags: "g",
|
|
64
|
+
confidence: 0.95
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
// GitHub token: pat (ghp_), oauth (gho_), user-to-server (ghu_), server-to-
|
|
68
|
+
// server (ghs_), refresh (ghr_). Anchored prefix + a long base64-ish body.
|
|
69
|
+
// GitHub's own format is 36 chars after the prefix; we allow >=36 (the
|
|
70
|
+
// corpus fixture is 38) and cap to keep the match bounded.
|
|
71
|
+
id: "github-token",
|
|
72
|
+
type: "secret",
|
|
73
|
+
pattern: "\\bgh[pousr]_[A-Za-z0-9]{36,255}\\b",
|
|
74
|
+
flags: "g",
|
|
75
|
+
confidence: 0.95
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
// Google API key: anchored AIza + exactly 35 chars from the URL-safe
|
|
79
|
+
// alphabet. Fixed prefix + fixed length = high precision.
|
|
80
|
+
id: "google-api-key",
|
|
81
|
+
type: "api_key",
|
|
82
|
+
pattern: "\\bAIza[0-9A-Za-z_-]{35}\\b",
|
|
83
|
+
flags: "g",
|
|
84
|
+
confidence: 0.9
|
|
85
|
+
},
|
|
86
|
+
{
|
|
87
|
+
// Slack token: bot (xoxb-), user (xoxa/xoxp-), refresh (xoxr-), legacy
|
|
88
|
+
// (xoxs-). Anchored xox[baprs]- + a >=10-char body. The corpus value is a
|
|
89
|
+
// deliberately low-entropy placeholder, so the rule anchors on the prefix +
|
|
90
|
+
// body shape, not entropy.
|
|
91
|
+
id: "slack-token",
|
|
92
|
+
type: "secret",
|
|
93
|
+
pattern: "\\bxox[baprs]-[0-9A-Za-z-]{10,}\\b",
|
|
94
|
+
flags: "g",
|
|
95
|
+
confidence: 0.9
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
// JWT: three dot-separated base64url segments where the FIRST starts with
|
|
99
|
+
// `eyJ` — the base64 of `{"`, i.e. the opening of the JSON header. Anchoring
|
|
100
|
+
// on `eyJ` + two more base64url groups keeps this from matching arbitrary
|
|
101
|
+
// dotted tokens (a bare base64 triplet without the JSON header is not a JWT).
|
|
102
|
+
id: "jwt",
|
|
103
|
+
type: "secret",
|
|
104
|
+
pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
|
|
105
|
+
flags: "g",
|
|
106
|
+
confidence: 0.9
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
// PEM private key: the armored header. We match the header line itself
|
|
110
|
+
// (`-----BEGIN [...] PRIVATE KEY-----`) — its presence is the credential
|
|
111
|
+
// signal; the body is high-entropy base64 we do not need to span. Covers
|
|
112
|
+
// RSA/EC/OPENSSH/DSA/ENCRYPTED variants and the bare `PRIVATE KEY` form.
|
|
113
|
+
id: "pem-private-key",
|
|
114
|
+
type: "secret",
|
|
115
|
+
pattern: "-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
|
|
116
|
+
flags: "g",
|
|
117
|
+
confidence: 0.98
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
// Bearer credential. Deliberately NOT context-anchored to `Authorization:`:
|
|
121
|
+
// detection runs PER STRING LEAF, and a real payload carries the credential
|
|
122
|
+
// as its own leaf (`{"Authorization": "Bearer <token>"}` walks to the bare
|
|
123
|
+
// value `"Bearer <token>"`), so a lookbehind requiring the header key in the
|
|
124
|
+
// same string would MISS the realistic case — a recall regression on a
|
|
125
|
+
// hard-block (`secret`) type. `secret` is fail-closed: a `Bearer …` prose
|
|
126
|
+
// false positive is the accepted cost of never missing a leaked token.
|
|
43
127
|
id: "bearer-token",
|
|
44
128
|
type: "secret",
|
|
45
129
|
pattern: "\\bBearer\\s+[A-Za-z0-9._~+/-]{16,}\\b",
|
|
@@ -50,11 +134,58 @@ const DEFAULT_RULES = [
|
|
|
50
134
|
id: "assignment-secret",
|
|
51
135
|
type: "secret",
|
|
52
136
|
// Lookbehind keeps the key name out of the match so transforms replace
|
|
53
|
-
// only the secret value, not the assignment prefix.
|
|
54
|
-
|
|
137
|
+
// only the secret value, not the assignment prefix. The key vocabulary
|
|
138
|
+
// covers the common credential-assignment names (cloud secrets, OAuth
|
|
139
|
+
// client secrets, PEM/private keys, access/refresh tokens) so a
|
|
140
|
+
// `<key> = <value>` leak is caught even when the value itself has no
|
|
141
|
+
// self-describing prefix (e.g. an AWS secret access key is bare base64).
|
|
142
|
+
pattern: "(?<=\\b(?:api[_-]?key|api[_-]?secret|secret[_-]?key|secret|aws[_-]?secret[_-]?access[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?token|refresh[_-]?token|token|password)\\s*[:=]\\s*['\\\"]?)[A-Za-z0-9._~+/-]{12,}",
|
|
55
143
|
flags: "gi",
|
|
56
144
|
confidence: 0.85
|
|
57
145
|
},
|
|
146
|
+
{
|
|
147
|
+
// US SSN: AAA-GG-SSSS. The format alone collides with 9-digit ids, so a
|
|
148
|
+
// validator rejects the SSA-invalid ranges (area 000/666/900-999, group 00,
|
|
149
|
+
// serial 0000). The separators are required by the pattern — a bare 9-digit
|
|
150
|
+
// run is intentionally NOT matched (it is indistinguishable from an id).
|
|
151
|
+
id: "us-ssn",
|
|
152
|
+
type: "us_ssn",
|
|
153
|
+
pattern: "(?<![\\w-])\\d{3}-\\d{2}-\\d{4}(?![\\w-])",
|
|
154
|
+
flags: "g",
|
|
155
|
+
confidence: 0.85,
|
|
156
|
+
validate: usSsnValid
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
// IBAN: country(2 alpha) + 2 check digits + BBAN. The mod-97 checksum is
|
|
160
|
+
// what makes this high-precision — a random alnum run of the right shape
|
|
161
|
+
// almost never satisfies mod-97 == 1. Length 15-34 per ISO 13616.
|
|
162
|
+
id: "iban",
|
|
163
|
+
type: "iban",
|
|
164
|
+
pattern: "(?<![A-Z0-9])[A-Z]{2}\\d{2}[A-Z0-9]{11,30}(?![A-Z0-9])",
|
|
165
|
+
flags: "g",
|
|
166
|
+
confidence: 0.9,
|
|
167
|
+
validate: ibanValid
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
// E.164 international phone: ONLY with a leading `+` (a bare digit run is an
|
|
171
|
+
// id/timestamp, never matched here). `+` country digit (1-9) then 6-14 more.
|
|
172
|
+
id: "e164-phone",
|
|
173
|
+
type: "phone",
|
|
174
|
+
pattern: "(?<![\\w+])\\+[1-9]\\d{6,14}(?![\\w])",
|
|
175
|
+
flags: "g",
|
|
176
|
+
confidence: 0.8
|
|
177
|
+
},
|
|
178
|
+
{
|
|
179
|
+
// US national phone: ONLY with separators — `(NXX) NXX-XXXX` or
|
|
180
|
+
// `NXX-NXX-XXXX`. A separator-less 10-digit run is deliberately NOT matched
|
|
181
|
+
// (it collides with ids/timestamps; the kr-phone rule already guards bare
|
|
182
|
+
// runs). Conservative by design — phone is the highest false-positive risk.
|
|
183
|
+
id: "us-phone",
|
|
184
|
+
type: "phone",
|
|
185
|
+
pattern: "(?<![\\w-])(?:\\(\\d{3}\\)\\s?|\\d{3}-)\\d{3}-\\d{4}(?![\\w-])",
|
|
186
|
+
flags: "g",
|
|
187
|
+
confidence: 0.75
|
|
188
|
+
},
|
|
58
189
|
// Indirect prompt injection heuristics. Response/tool-result direction only,
|
|
59
190
|
// and the policy default for the injection type is `allow` (report-only):
|
|
60
191
|
// detections are audited regardless of action, and false-positive blocks
|
|
@@ -139,8 +270,62 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
139
270
|
// marker-shaped string is NOT Haechi output (Haechi hasn't transformed it yet),
|
|
140
271
|
// so it is scanned normally — otherwise an attacker could wrap a real secret in
|
|
141
272
|
// a fake `[TOKEN:…]` to evade request-side detection.
|
|
273
|
+
// Markers are pure ASCII and NFKC-stable, so their spans are computed on the
|
|
274
|
+
// ORIGINAL value exactly as before — they line up with the same-length
|
|
275
|
+
// normalized scan (Case 2 below) and are irrelevant to the whole-leaf scan
|
|
276
|
+
// (Case 3).
|
|
142
277
|
const markerSpans = context?.direction === "response" ? haechiMarkerSpans(entry.value) : [];
|
|
143
278
|
|
|
279
|
+
// WS2d — Unicode evasion via NFKC normalization. A client can defeat every
|
|
280
|
+
// regex rule by sending PII/secrets in a Unicode form that folds to ASCII
|
|
281
|
+
// (full-width digits `４２４２…`, full-width `＠`, mathematical/enclosed
|
|
282
|
+
// alphanumerics). NFKC normalization maps those to their compatibility ASCII
|
|
283
|
+
// form so the rules match. The crux is OFFSET INTEGRITY: detections carry
|
|
284
|
+
// {start,end} into entry.value, but the transform slices the ORIGINAL string
|
|
285
|
+
// (packages/core transformString). Three cases keep offsets valid:
|
|
286
|
+
const value = entry.value;
|
|
287
|
+
const normalized = value.normalize("NFKC");
|
|
288
|
+
if (normalized === value) {
|
|
289
|
+
// Case 1 (~99%): nothing folded. Detect on the original exactly as before —
|
|
290
|
+
// byte-identical behavior, zero regression.
|
|
291
|
+
return removeOverlaps(scanForDetections(value, rules, context, markerSpans, entry, value));
|
|
292
|
+
}
|
|
293
|
+
if (isPositionStableNfkc(value, normalized)) {
|
|
294
|
+
// Case 2: every codepoint folded to the SAME UTF-16 length and the per-
|
|
295
|
+
// codepoint folds reconstruct the whole normalization, so each original
|
|
296
|
+
// character occupies the SAME offsets in `normalized` as in `value` (e.g.
|
|
297
|
+
// full-width→ASCII digits/letters). A match's {start,end} on `normalized` are
|
|
298
|
+
// therefore valid on the ORIGINAL value — exact-span redaction of the evaded
|
|
299
|
+
// value, with the recorded `value` taken from the original slice so
|
|
300
|
+
// tokenize/AAD/audit see the real bytes. A bare `normalized.length ===
|
|
301
|
+
// value.length` check is UNSOUND: a length-contracting codepoint before the
|
|
302
|
+
// PII compensated by a length-expanding one after it keeps the total length
|
|
303
|
+
// equal yet shifts every interior offset (redacting the wrong bytes), so such
|
|
304
|
+
// inputs must fall through to the Case 3 whole-leaf path. Validators still run
|
|
305
|
+
// on the normalized match text (Luhn/RRN need ASCII digits).
|
|
306
|
+
return removeOverlaps(scanForDetections(normalized, rules, context, markerSpans, entry, value));
|
|
307
|
+
}
|
|
308
|
+
// Case 3: the fold is NOT position-stable (a length-changing decomposition, or a
|
|
309
|
+
// compensating contraction+expansion that shifts interior offsets). Offsets on
|
|
310
|
+
// the normalized copy do NOT map back to the original, so we CANNOT do exact-span
|
|
311
|
+
// redaction.
|
|
312
|
+
// FAIL CLOSED: emit ONE detection per matched type covering the WHOLE leaf so
|
|
313
|
+
// the transform redacts/blocks the entire leaf. Over-redacting an evasion
|
|
314
|
+
// attempt is the safe failure. removeOverlaps is intentionally skipped — every
|
|
315
|
+
// detection spans the whole leaf so they all "overlap"; the transform collapses
|
|
316
|
+
// them to a single whole-leaf replacement via its cursor, and any `block` among
|
|
317
|
+
// them blocks the payload, while preserving per-type detection reporting.
|
|
318
|
+
return wholeLeafDetections(normalized, rules, context, entry, value);
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Run every applicable rule over `scanText` (the original value, or its
|
|
322
|
+
// same-length NFKC normalization). Offsets index `scanText`, which is positionally
|
|
323
|
+
// 1:1 with `originalValue` (Case 1: identical; Case 2: same UTF-16 length), so the
|
|
324
|
+
// {start,end} are valid on `originalValue`. The recorded `value` is the ORIGINAL
|
|
325
|
+
// slice (never the normalized form). Marker spans (response-only) are computed on
|
|
326
|
+
// the original and align under both cases.
|
|
327
|
+
function scanForDetections(scanText, rules, context, markerSpans, entry, originalValue) {
|
|
328
|
+
const detections = [];
|
|
144
329
|
for (const rule of rules) {
|
|
145
330
|
// Direction-scoped rules (e.g. injection heuristics) only run on the
|
|
146
331
|
// matching traffic direction; rules without a direction run everywhere.
|
|
@@ -148,13 +333,13 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
148
333
|
continue;
|
|
149
334
|
}
|
|
150
335
|
const regex = new RegExp(rule.pattern, rule.flags.includes("g") ? rule.flags : `${rule.flags}g`);
|
|
151
|
-
for (const match of
|
|
152
|
-
const
|
|
153
|
-
if (rule.validate && !rule.validate(
|
|
336
|
+
for (const match of scanText.matchAll(regex)) {
|
|
337
|
+
const matchText = match[0];
|
|
338
|
+
if (rule.validate && !rule.validate(matchText)) {
|
|
154
339
|
continue;
|
|
155
340
|
}
|
|
156
341
|
const start = match.index;
|
|
157
|
-
const end = match.index +
|
|
342
|
+
const end = match.index + matchText.length;
|
|
158
343
|
if (overlapsAny(start, end, markerSpans)) {
|
|
159
344
|
continue;
|
|
160
345
|
}
|
|
@@ -167,12 +352,73 @@ export function detectEntry(entry, rules, context = {}) {
|
|
|
167
352
|
start,
|
|
168
353
|
end,
|
|
169
354
|
confidence: rule.confidence,
|
|
170
|
-
value
|
|
355
|
+
value: originalValue.slice(start, end)
|
|
171
356
|
});
|
|
172
357
|
}
|
|
173
358
|
}
|
|
359
|
+
return detections;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// Probe helper for the Case 3 scan: true as soon as `rule` produces one match
// on `text` that its optional validator accepts. The rule's flags are forced to
// include "g" because String.prototype.matchAll requires a global regex.
function ruleMatchesWholeLeaf(rule, text) {
  const flags = rule.flags.includes("g") ? rule.flags : `${rule.flags}g`;
  for (const found of text.matchAll(new RegExp(rule.pattern, flags))) {
    if (!rule.validate) {
      return true;
    }
    if (rule.validate(found[0])) {
      return true;
    }
  }
  return false;
}

// Case 3 fail-closed scan. Works out which detection TYPES the NFKC-normalized
// text matches and reports each type once, as a detection that covers the whole
// ORIGINAL leaf (start 0, end originalValue.length, value = the original leaf).
//
//   normalized    - NFKC form of the leaf; the text the rules are run against
//   rules         - rule objects ({ id, type, pattern, flags, confidence,
//                   direction?, validate? })
//   context       - optional; only `context.direction` is read, to skip rules
//                   scoped to the other traffic direction
//   entry         - supplies path / pathText / kind for the detection record
//   originalValue - the un-normalized leaf; determines `end` and `value`
//
// Returns the detections in rule order; the first matching rule of a type
// supplies that type's ruleId and confidence. No marker-span skip is applied on
// this path: per the design notes, Haechi markers are ASCII and NFKC-stable, so
// a leaf whose length diverges under NFKC cannot itself be a marker.
function wholeLeafDetections(normalized, rules, context, entry, originalValue) {
  const direction = context?.direction;
  const firstHitByType = new Map();
  for (const rule of rules) {
    const scopedElsewhere = Boolean(rule.direction) && rule.direction !== direction;
    if (scopedElsewhere || firstHitByType.has(rule.type)) {
      continue;
    }
    if (ruleMatchesWholeLeaf(rule, normalized)) {
      firstHitByType.set(rule.type, {
        type: rule.type,
        ruleId: rule.id,
        path: entry.path,
        pathText: entry.pathText,
        kind: entry.kind ?? "value",
        start: 0,
        end: originalValue.length,
        confidence: rule.confidence,
        value: originalValue
      });
    }
  }
  // Map iteration follows insertion order, i.e. the order types first matched.
  return [...firstHitByType.values()];
}
|
|
174
403
|
|
|
175
|
-
|
|
404
|
+
// Sound precondition for Case 2: decides whether {start,end} offsets found on
// the NFKC-normalized text can be applied unchanged to the ORIGINAL value.
//
// Two conditions must both hold:
//   1. every codepoint of `value` folds to the same number of UTF-16 units it
//      started with, so no interior offset shifts; and
//   2. the per-codepoint folds, concatenated, equal `normalized`, so no
//      cross-boundary composition moved content.
//
// Comparing only `normalized.length === value.length` is unsound: a contraction
// before the PII offset by an expansion after it keeps the total length equal
// while shifting every interior offset, which would redact the wrong bytes.
//
// Returns true when both conditions hold, false otherwise.
function isPositionStableNfkc(value, normalized) {
  // Array.from splits by codepoint (surrogate pairs stay together).
  const codePoints = Array.from(value);
  const folds = codePoints.map((codePoint) => codePoint.normalize("NFKC"));
  const widthPreserved = folds.every((fold, index) => fold.length === codePoints[index].length);
  return widthPreserved && folds.join("") === normalized;
}
|
|
177
423
|
|
|
178
424
|
// Spans of Haechi's own transform markers in a string, so detection can skip
|
|
@@ -291,3 +537,47 @@ function krRrnValid(value) {
|
|
|
291
537
|
const check = (11 - (sum % 11)) % 10;
|
|
292
538
|
return check === Number(digits[12]);
|
|
293
539
|
}
|
|
540
|
+
|
|
541
|
+
// US SSN structural validity (SSA allocation rules). The bare `AAA-GG-SSSS`
// shape collides with arbitrary 9-digit ids, so the never-issued ranges are
// rejected: area 000, 666 and 900-999; group 00; serial 0000.
//
// `value` must be exactly three digits, a dash, two digits, a dash and four
// digits; anything else returns false. Returns true only when the area, group
// and serial are all outside the rejected ranges.
function usSsnValid(value) {
  const parts = /^(\d{3})-(\d{2})-(\d{4})$/.exec(value);
  if (parts === null) {
    return false;
  }
  const [area, group, serial] = parts.slice(1).map(Number);
  const areaIssued = area !== 0 && area !== 666 && area < 900;
  return areaIssued && group !== 0 && serial !== 0;
}
|
|
564
|
+
|
|
565
|
+
// IBAN mod-97 checksum (ISO 7064 / ISO 13616).
//
// Steps: strip whitespace and upper-case the input; require the shape
// `CC` + 2 digits + 11-30 alphanumerics; rotate the first four characters to
// the end; expand each letter to its two-digit value (A=10 … Z=35); then reduce
// the resulting digit string mod 97. The remainder is folded in one digit at a
// time, so the full integer is never materialized and cannot overflow.
//
// Returns true only when the remainder is 1. The checksum is what makes the
// rule precise: a random alphanumeric run of the right shape rarely passes.
function ibanValid(value) {
  const compact = value.replace(/\s/g, "").toUpperCase();
  if (!/^[A-Z]{2}\d{2}[A-Z0-9]{11,30}$/.test(compact)) {
    return false;
  }
  const rotated = `${compact.slice(4)}${compact.slice(0, 4)}`;
  // After the shape check every character is 0-9 or A-Z, and digits sort below
  // "A", so `>= "A"` selects exactly the letters. "A" is char code 65 -> 10.
  const digitString = Array.from(rotated, (symbol) =>
    symbol >= "A" ? String(symbol.charCodeAt(0) - 55) : symbol
  ).join("");
  const checksum = Array.from(digitString).reduce(
    (running, digit) => (running * 10 + Number(digit)) % 97,
    0
  );
  return checksum === 1;
}
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
// WS4-A telemetry seam (reliability-hardening-track §WS4 "Telemetry").
|
|
2
|
+
//
|
|
3
|
+
// A minimal, zero-dependency in-memory metrics collector rendering the
|
|
4
|
+
// Prometheus text exposition format. It is an INJECTABLE collaborator
|
|
5
|
+
// (providers.metrics in createRuntime), mirroring auditSink/rateLimiter; an
|
|
6
|
+
// operator who wants a real metrics backend injects their own object exposing
|
|
7
|
+
// the same { increment, observe, render } contract.
|
|
8
|
+
//
|
|
9
|
+
// HARD INVARIANT (the no-plaintext-in-audit invariant, extended to telemetry):
|
|
10
|
+
// every metric name AND every label value is a BOUNDED ENUM — a route id, a
|
|
11
|
+
// policy mode, or a decision class. It is NEVER an identity id/subject, a token,
|
|
12
|
+
// a detected value, or any other unbounded/PII-bearing string. This module does
|
|
13
|
+
// not — and structurally cannot — accept a payload value: callers pass only
|
|
14
|
+
// pre-classified enum labels, and label values are coerced + length-capped here
|
|
15
|
+
// as defence in depth so an accidental high-cardinality value cannot explode
|
|
16
|
+
// the series set or leak content.
|
|
17
|
+
|
|
18
|
+
// Metric catalogue. Each table maps a metric name to its HELP text; the metric
// type is implied by the table the name lives in (COUNTERS -> counter,
// HISTOGRAMS -> histogram). Keeping the catalogue explicit (rather than letting
// callers invent metric names) bounds the metric-name dimension to this fixed
// set.
const COUNTERS = {
  haechi_requests_total: "Proxy requests by route, mode, and decision class.",
  haechi_blocks_total: "Requests blocked by a policy decision.",
  haechi_auth_denied_total: "Requests denied at authentication.",
  haechi_rate_limited_total: "Requests rejected by the rate limiter.",
  haechi_upstream_timeout_total: "Upstream requests that timed out.",
  haechi_upstream_error_total: "Upstream requests that failed (non-timeout).",
  haechi_response_unprotected_total: "Responses forwarded without protection (size/encoding/parse).",
  haechi_internal_error_total: "Unexpected internal proxy errors.",
  haechi_overloaded_total: "Requests rejected by the max-in-flight backpressure ceiling (503)."
};

// The single histogram in the catalogue (name -> HELP text).
const HISTOGRAMS = {
  haechi_request_duration_seconds: "End-to-end proxy request handling duration in seconds."
};

// Default request-duration histogram buckets (seconds). Bounded, fixed set.
// Used as the default for createMetrics' `buckets` option.
const DEFAULT_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10];
|
|
39
|
+
|
|
40
|
+
// Defence-in-depth label hygiene: a label value must be a short, bounded token.
// We coerce to string, trim surrounding whitespace, cap the length, and collapse
// anything outside a safe charset to "_". This guarantees that even a caller
// mistake cannot place a raw payload value or a long identity string into a
// series label.
const MAX_LABEL_LENGTH = 64;

// Turn an arbitrary label value into a bounded, charset-safe token.
//
//   value - anything; undefined/null map to "none", everything else goes
//           through String().
//
// Returns a string of at most MAX_LABEL_LENGTH characters drawn from
// [A-Za-z0-9_.:/-], or "none" when nothing is left after trimming.
function safeLabelValue(value) {
  if (value === undefined || value === null) {
    return "none";
  }
  // Trim BEFORE capping so padding neither eats into the length budget nor
  // turns into "_" filler: " allow " and "allow" must land on the same series,
  // and a whitespace-only value is treated as absent.
  const text = String(value).trim().slice(0, MAX_LABEL_LENGTH);
  // Allow a conservative identifier charset only (route ids, modes, decisions
  // are all of this shape). Everything else becomes "_".
  return text.replace(/[^A-Za-z0-9_.:/-]/g, "_") || "none";
}
|
|
55
|
+
|
|
56
|
+
// Build the canonical series identifier for a metric name plus label set, in
// Prometheus sample syntax: `name{a="1",b="2"}`, or just `name` when there are
// no labels. Label names are sorted so the same label set always yields the
// same key regardless of insertion order; values are escaped via escapeLabel.
function seriesKey(name, labels) {
  const orderedNames = Object.keys(labels).sort();
  if (orderedNames.length === 0) {
    return name;
  }
  const pairs = [];
  for (const labelName of orderedNames) {
    pairs.push(`${labelName}="${escapeLabel(labels[labelName])}"`);
  }
  return `${name}{${pairs.join(",")}}`;
}
|
|
62
|
+
|
|
63
|
+
// Escape a label value for the Prometheus text format: backslash -> `\\`,
// line feed -> `\n` (backslash + "n"), double quote -> `\"`. The input is
// coerced with String() first. A single pass over the three special characters
// produces the same output as escaping backslashes first and the others after.
function escapeLabel(value) {
  return String(value).replace(/[\\\n"]/g, (special) =>
    special === "\n" ? "\\n" : `\\${special}`
  );
}
|
|
66
|
+
|
|
67
|
+
// Create an in-memory metrics collector exposing { increment, observe, render }.
//
//   buckets - histogram upper bounds in seconds (default DEFAULT_BUCKETS). The
//             list is copied, non-finite / non-number entries are dropped and
//             duplicates collapsed, then it is sorted ascending. The +Inf
//             bucket is always rendered in addition to these.
//
// All state lives in the two Maps below, private to the returned object.
export function createMetrics({ buckets = DEFAULT_BUCKETS } = {}) {
  // counterSeries: metricName -> Map(seriesKey -> { labels, value })
  const counterSeries = new Map();
  // histogramSeries: metricName -> Map(seriesKey -> { labels, bucketCounts, sum, count })
  const histogramSeries = new Map();
  // Sanitised so render() can never emit two bucket lines with the same `le`
  // (duplicates) or a bound that is NaN or infinite.
  const sortedBuckets = [...new Set(buckets)]
    .filter((bound) => typeof bound === "number" && Number.isFinite(bound))
    .sort((a, b) => a - b);

  // Catalogue membership must be an OWN-property test: the `in` operator also
  // finds inherited names ("toString", "constructor", …), which would let an
  // unknown metric name allocate a series that render() never shows.
  function isCatalogued(catalogue, name) {
    return Object.prototype.hasOwnProperty.call(catalogue, name);
  }

  // Sanitise a caller-supplied label object: names are restricted to
  // [A-Za-z0-9_], values go through safeLabelValue.
  function normalizeLabels(labels = {}) {
    const out = {};
    for (const [key, value] of Object.entries(labels)) {
      // Label NAMES are caller-fixed identifiers; coerce defensively anyway.
      const labelName = String(key).replace(/[^A-Za-z0-9_]/g, "_");
      out[labelName] = safeLabelValue(value);
    }
    return out;
  }

  return {
    // Increment a known counter by `amount` (default 1), labelled by a bounded
    // enum set. Fail-soft for telemetry — a metric typo or a bad amount must
    // never break a request path — so the call is silently ignored when:
    //   - `name` is not a catalogued counter, or
    //   - `amount` is not a finite, non-negative number (a counter may only go
    //     up, and NaN or a string would poison the series permanently).
    increment(name, labels = {}, amount = 1) {
      if (!isCatalogued(COUNTERS, name)) {
        return;
      }
      if (typeof amount !== "number" || !Number.isFinite(amount) || amount < 0) {
        return;
      }
      const safe = normalizeLabels(labels);
      const key = seriesKey(name, safe);
      let series = counterSeries.get(name);
      if (!series) {
        series = new Map();
        counterSeries.set(name, series);
      }
      const existing = series.get(key);
      if (existing) {
        existing.value += amount;
      } else {
        series.set(key, { labels: safe, value: amount });
      }
    },

    // Observe a value into a known histogram (request-duration seconds).
    // Ignored when `name` is not a catalogued histogram or `value` is not a
    // finite number.
    observe(name, value, labels = {}) {
      if (!isCatalogued(HISTOGRAMS, name) || typeof value !== "number" || !Number.isFinite(value)) {
        return;
      }
      const safe = normalizeLabels(labels);
      const key = seriesKey(name, safe);
      let series = histogramSeries.get(name);
      if (!series) {
        series = new Map();
        histogramSeries.set(name, series);
      }
      let entry = series.get(key);
      if (!entry) {
        entry = { labels: safe, bucketCounts: new Array(sortedBuckets.length).fill(0), sum: 0, count: 0 };
        series.set(key, entry);
      }
      entry.sum += value;
      entry.count += 1;
      // Every bucket whose bound is >= value is incremented, so bucketCounts
      // is stored already cumulative.
      for (let i = 0; i < sortedBuckets.length; i += 1) {
        if (value <= sortedBuckets[i]) {
          entry.bucketCounts[i] += 1;
        }
      }
    },

    // Render the full Prometheus text exposition. Every declared counter and
    // histogram emits its HELP/TYPE header even with no observations, so the
    // surface is stable for scrapers. Returns a newline-terminated string.
    render() {
      const lines = [];

      for (const [name, help] of Object.entries(COUNTERS)) {
        lines.push(`# HELP ${name} ${help}`);
        lines.push(`# TYPE ${name} counter`);
        const series = counterSeries.get(name);
        if (series) {
          for (const { labels, value } of series.values()) {
            lines.push(`${seriesKey(name, labels)} ${value}`);
          }
        }
      }

      for (const [name, help] of Object.entries(HISTOGRAMS)) {
        lines.push(`# HELP ${name} ${help}`);
        lines.push(`# TYPE ${name} histogram`);
        const series = histogramSeries.get(name);
        if (series) {
          for (const entry of series.values()) {
            // bucketCounts[i] already holds the cumulative count of observations
            // with value <= sortedBuckets[i] (observe() increments every bucket
            // the value falls under), which is exactly the Prometheus le="..."
            // cumulative bucket semantics — emit it directly.
            for (let i = 0; i < sortedBuckets.length; i += 1) {
              const labels = { ...entry.labels, le: String(sortedBuckets[i]) };
              lines.push(`${seriesKey(`${name}_bucket`, labels)} ${entry.bucketCounts[i]}`);
            }
            // The +Inf bucket is the total observation count.
            const infLabels = { ...entry.labels, le: "+Inf" };
            lines.push(`${seriesKey(`${name}_bucket`, infLabels)} ${entry.count}`);
            lines.push(`${seriesKey(`${name}_sum`, entry.labels)} ${entry.sum}`);
            lines.push(`${seriesKey(`${name}_count`, entry.labels)} ${entry.count}`);
          }
        }
      }

      return `${lines.join("\n")}\n`;
    }
  };
}
|
|
176
|
+
|
|
177
|
+
// Exported for tests / operators who want to assert the bounded metric surface.
// Object.freeze is shallow, so the two name arrays are frozen individually as
// well — otherwise an importer could still push to or reorder them and change
// what every other importer sees.
export const METRIC_NAMES = Object.freeze({
  counters: Object.freeze(Object.keys(COUNTERS)),
  histograms: Object.freeze(Object.keys(HISTOGRAMS))
});
|