@bookedsolid/rea 0.31.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.husky/prepare-commit-msg +80 -6
  2. package/MIGRATING.md +24 -15
  3. package/dist/cli/hook.js +60 -22
  4. package/dist/hooks/_lib/halt-check.d.ts +78 -0
  5. package/dist/hooks/_lib/halt-check.js +106 -0
  6. package/dist/hooks/_lib/payload.d.ts +124 -0
  7. package/dist/hooks/_lib/payload.js +245 -0
  8. package/dist/hooks/_lib/segments.d.ts +125 -0
  9. package/dist/hooks/_lib/segments.js +766 -0
  10. package/dist/hooks/architecture-review-gate/index.d.ts +58 -0
  11. package/dist/hooks/architecture-review-gate/index.js +250 -0
  12. package/dist/hooks/attribution-advisory/index.d.ts +72 -0
  13. package/dist/hooks/attribution-advisory/index.js +233 -0
  14. package/dist/hooks/bash-scanner/protected-scan.js +14 -2
  15. package/dist/hooks/changeset-security-gate/index.d.ts +71 -0
  16. package/dist/hooks/changeset-security-gate/index.js +330 -0
  17. package/dist/hooks/dependency-audit-gate/index.d.ts +91 -0
  18. package/dist/hooks/dependency-audit-gate/index.js +294 -0
  19. package/dist/hooks/env-file-protection/index.d.ts +55 -0
  20. package/dist/hooks/env-file-protection/index.js +159 -0
  21. package/dist/hooks/pr-issue-link-gate/index.d.ts +91 -0
  22. package/dist/hooks/pr-issue-link-gate/index.js +127 -0
  23. package/dist/hooks/security-disclosure-gate/index.d.ts +91 -0
  24. package/dist/hooks/security-disclosure-gate/index.js +502 -0
  25. package/hooks/_lib/protected-paths.sh +10 -3
  26. package/hooks/architecture-review-gate.sh +92 -77
  27. package/hooks/attribution-advisory.sh +139 -131
  28. package/hooks/changeset-security-gate.sh +114 -149
  29. package/hooks/dependency-audit-gate.sh +115 -156
  30. package/hooks/env-file-protection.sh +130 -97
  31. package/hooks/pr-issue-link-gate.sh +114 -45
  32. package/hooks/security-disclosure-gate.sh +148 -316
  33. package/hooks/settings-protection.sh +13 -9
  34. package/package.json +1 -1
  35. package/templates/architecture-review-gate.dogfood-staged.sh +116 -0
  36. package/templates/attribution-advisory.dogfood-staged.sh +170 -0
  37. package/templates/changeset-security-gate.dogfood-staged.sh +137 -0
  38. package/templates/dependency-audit-gate.dogfood-staged.sh +138 -0
  39. package/templates/env-file-protection.dogfood-staged.sh +157 -0
  40. package/templates/pr-issue-link-gate.dogfood-staged.sh +134 -0
  41. package/templates/prepare-commit-msg.husky.sh +80 -6
  42. package/templates/security-disclosure-gate.dogfood-staged.sh +171 -0
  43. package/templates/settings-protection.dogfood.patch +58 -0
@@ -0,0 +1,766 @@
1
+ /**
2
+ * Quote-aware shell-segment splitter for the Node-binary hook tier.
3
+ *
4
+ * 0.32.0 — port of the relevant primitives in
5
+ * `hooks/_lib/cmd-segments.sh`. The bash helper is 1002 LOC of
6
+ * defense-in-depth (heredoc unwrapping, nested-shell recursion,
7
+ * env-var-assignment stripping, etc.) — most of those branches exist
8
+ * to defend against bypass attempts in WRITE-tier gates (`dangerous-
9
+ * bash-interceptor`, `dependency-audit-gate`). The Phase 1 pilots
10
+ * landing in 0.32.0 (`security-disclosure-gate`,
11
+ * `attribution-advisory`) only need the SUBSET of segment behavior
12
+ * those two hooks actually exercise:
13
+ *
14
+ * 1. Split the input on shell command separators (`;`, `&&`, `||`,
15
+ * `|`, `&`, newline) while masking separators that appear inside
16
+ * matched `"..."` and `'...'` quote spans.
17
+ * 2. For each segment, strip leading `sudo`, `exec`, `time`, `then`,
18
+ * `do`, `else`, `fi`, and `VAR=value` env-prefixes so the
19
+ * caller's regex can anchor at the segment's actual command head.
20
+ * 3. Expose two query primitives:
21
+ * - `anySegmentStartsWith(cmd, regexHead)`
22
+ * true if any segment's prefix-stripped head matches the
23
+ * head-anchored regex.
24
+ * - `anySegmentMatches(cmd, regex)`
25
+ * true if any segment's raw (non-stripped) text contains a
26
+ * match for the regex (used for content scans like
27
+ * `Co-Authored-By:` markers inside `git commit -m "..."`).
28
+ *
29
+ * Out-of-scope vs. the bash helper:
30
+ *
31
+ * - No heredoc body extraction. The pilots match on the command
32
+ * line, not on heredoc contents. (Body-file resolution in
33
+ * `security-disclosure-gate` is done separately by reading the
34
+ * file path off the command.)
35
+ * - Nested-shell unwrapping (`bash -c 'PAYLOAD'`) was out of scope in
36
+ * 0.32.0: the bash-scanner walker already handles it for the WRITE
37
+ * gates, so the Phase 1 pilots inherit the SECURITY guarantee that
38
+ * any hostile nested shell is refused by the bash-scanner tier BEFORE
39
+ * this advisory tier runs. 0.33.0 added it here (see `splitSegments`).
40
+ * - No backtick/command-substitution recursion.
41
+ *
42
+ * If a future pilot needs those branches, port them here in a
43
+ * subsequent release. The CURRENT pilots' bash counterparts call only
44
+ * `any_segment_starts_with` and `any_segment_matches` against
45
+ * direct-stdin commands.
46
+ *
47
+ * Quote-handling parity with cmd-segments.sh:
48
+ *
49
+ * - Double-quoted spans (`"..."`): `\"` and `\\` are literal escapes;
50
+ * all other characters are literal.
51
+ * - Single-quoted spans (`'...'`): no escape semantics; every
52
+ * character is literal until the next `'`.
53
+ * - Unterminated quote spans extend to end-of-input (caller's bug —
54
+ * we still emit a single segment for it rather than throwing).
55
+ * - Backslash outside quotes escapes the following character (so
56
+ * `git commit \&\& foo` parses as a single segment, matching
57
+ * bash's behavior).
58
+ */
/**
 * Placeholder strings that stand in for shell separators found inside
 * quote spans while the command is being split.
 *
 * Every sentinel has the same shape: an opening 0x1c byte, one
 * `0x10 <letter>` pair per separator character (`S` = `;`, `A` = `&`,
 * `P` = `|`, `N` = newline), and a closing 0x1d byte. Two-character
 * separators (`&&`, `||`) therefore carry two tagged letters.
 *
 * The sentinels are internal: they are inserted by the masking pass and
 * turned back into the literal separator by the unmask pass. Input that
 * already contains one of these exact control-character sequences would
 * be rewritten by the unmask pass, so the scheme relies on such
 * sequences not occurring in real command lines.
 */
const MASK = (() => {
  const OPEN = '\x1c';
  const TAG = '\x10';
  const CLOSE = '\x1d';
  // Build OPEN + (TAG + letter)* + CLOSE for the given tag letters.
  const sentinel = (letters) =>
    OPEN + [...letters].map((letter) => TAG + letter).join('') + CLOSE;
  return {
    SEMI: sentinel('S'),
    AMP_AMP: sentinel('AA'),
    PIPE_PIPE: sentinel('PP'),
    PIPE: sentinel('P'),
    AMP: sentinel('A'),
    NEWLINE: sentinel('N'),
  };
})();
/**
 * Hide shell separators that sit inside `"..."` or `'...'` spans by
 * swapping them for `MASK` sentinels, leaving everything else untouched.
 *
 * Quote rules applied while walking the command:
 *   - Outside quotes and inside double quotes, a backslash followed by
 *     another character is copied through as a two-character pair and
 *     the second character is never inspected (so `\"` does not close a
 *     double-quoted span and `\;` is not masked).
 *   - Inside single quotes a backslash has no special meaning.
 *   - A span only closes on the same quote character that opened it; an
 *     unterminated span simply runs to the end of the input.
 *
 * @param cmd Raw command text.
 * @returns The command with in-quote `;`, `&&`, `||`, `|`, `&` and
 *          newline characters replaced by their sentinels.
 */
function maskQuotedSeparators(cmd) {
  const total = cmd.length;
  const pieces = [];
  // `null` while outside any quote span, otherwise the opening quote char.
  let openQuote = null;
  let pos = 0;
  while (pos < total) {
    const c = cmd[pos];
    // Escape pairs are honoured everywhere except inside single quotes.
    if (c === '\\' && openQuote !== "'" && pos + 1 < total) {
      pieces.push(c, cmd[pos + 1]);
      pos += 2;
      continue;
    }
    if (openQuote === null) {
      if (c === '"' || c === "'") {
        openQuote = c;
      }
      pieces.push(c);
      pos += 1;
      continue;
    }
    if (c === openQuote) {
      openQuote = null;
      pieces.push(c);
      pos += 1;
      continue;
    }
    // Inside a quote span: swap separators for their sentinels. The
    // doubled forms are checked before the single-character forms.
    const doubled = cmd[pos + 1] === c;
    if (c === ';') {
      pieces.push(MASK.SEMI);
      pos += 1;
    } else if (c === '&') {
      pieces.push(doubled ? MASK.AMP_AMP : MASK.AMP);
      pos += doubled ? 2 : 1;
    } else if (c === '|') {
      pieces.push(doubled ? MASK.PIPE_PIPE : MASK.PIPE);
      pos += doubled ? 2 : 1;
    } else if (c === '\n') {
      pieces.push(MASK.NEWLINE);
      pos += 1;
    } else {
      pieces.push(c);
      pos += 1;
    }
  }
  return pieces.join('');
}
/**
 * Undo the quote-span masking: every sentinel is turned back into the
 * separator text it replaced, so a segment reads exactly as written.
 *
 * @param text Segment text that may contain sentinels.
 * @returns The text with each sentinel replaced by `;`, `&&`, `||`,
 *          `|`, `&` or a newline respectively.
 */
function unmask(text) {
  // Applied in this order; each entry is [sentinel, literal separator].
  const restorations = [
    ['\x1c\x10S\x1d', ';'],
    ['\x1c\x10A\x10A\x1d', '&&'],
    ['\x1c\x10P\x10P\x1d', '||'],
    ['\x1c\x10P\x1d', '|'],
    ['\x1c\x10A\x1d', '&'],
    ['\x1c\x10N\x1d', '\n'],
  ];
  return restorations.reduce(
    (restored, [sentinel, literal]) => restored.split(sentinel).join(literal),
    text,
  );
}
/**
 * Cut a masked command into segments at every separator that is still
 * visible: `;`, `&&`, `||`, `|`, `&` and newline.
 *
 * The input is expected to have been through the masking pass, so any
 * separator that was inside a quote span is already a sentinel and is
 * not seen here.
 *
 * The walk is character by character rather than regex based so that
 * backslash pairs are counted correctly: a backslash together with the
 * character after it is skipped as one unit. That keeps `\;` inside a
 * segment, while in `echo \\; next` the two backslashes form the pair
 * and the following `;` still splits.
 *
 * @param masked Command text with in-quote separators masked.
 * @returns Trimmed, non-empty segment strings in input order.
 */
function splitOnUnquotedSeparators(masked) {
  const total = masked.length;
  const pieces = [];
  let begin = 0;

  // Record masked[begin, end) as a segment unless it is blank.
  const emit = (end) => {
    const text = masked.slice(begin, end).trim();
    if (text.length > 0) {
      pieces.push(text);
    }
  };

  // Width of the separator starting at `at`, or 0 when there is none.
  // Doubled forms win over their single-character prefixes.
  const separatorWidthAt = (at) => {
    const c = masked[at];
    if ((c === '&' || c === '|') && masked[at + 1] === c) {
      return 2;
    }
    return c === ';' || c === '|' || c === '&' || c === '\n' ? 1 : 0;
  };

  let pos = 0;
  while (pos < total) {
    if (masked[pos] === '\\' && pos + 1 < total) {
      // Escape pair: the second character is never a separator.
      pos += 2;
      continue;
    }
    const width = separatorWidthAt(pos);
    if (width === 0) {
      pos += 1;
      continue;
    }
    emit(pos);
    pos += width;
    begin = pos;
  }
  // Whatever follows the last separator is the final segment.
  emit(total);
  return pieces;
}
/**
 * Words that may sit in front of the real command in a segment and are
 * removed before head matching: privilege/exec wrappers (`sudo`,
 * `exec`, `time`) and shell control-flow keywords (`then`, `do`,
 * `else`, `fi`).
 *
 * The list is tried in this order on every stripping pass, and
 * env-var assignments are only considered when no keyword matched, so
 * shapes such as `sudo VAR=x cmd` are peeled keyword first.
 *
 * Option-style tokens like `--flag=value` are not in this list and are
 * left in place as part of the command.
 */
const LEADING_KEYWORDS = 'sudo exec time then do else fi'.split(' ');
/**
 * Measure a leading `NAME=value` environment assignment (plus the
 * whitespace after it) at the start of a segment.
 *
 * The value is parsed the way a shell reads a single word: any number
 * of adjacent parts, each of which is one of
 *
 *   - `"..."`   double-quoted; a backslash protects the next character
 *   - `'...'`   single-quoted; no escapes
 *   - `$'...'`  ANSI-C quoted; a backslash protects the next character
 *   - `\x`      a backslash-escaped character (including a blank)
 *   - any other non-blank character
 *
 * Treating the value as a sequence of parts matters for matching: with
 * a single-part parser, `FOO="a"b cmd` was not recognised as an
 * assignment at all, and `FOO=a\ b cmd` / `FOO=a"b c" cmd` were cut in
 * the middle of the value, so the real command head never reached the
 * caller's regex.
 *
 * @param seg Segment text, expected to start at the candidate name.
 * @returns Number of characters covered by the assignment and its
 *          trailing blanks, or 0 when `seg` does not start with a
 *          complete assignment that is followed by more text (no
 *          `NAME=` prefix, an unterminated quote, or nothing after the
 *          value).
 */
function matchEnvAssignLength(seg) {
  // POSIX identifier followed by `=`; names cannot start with a digit.
  const namePrefix = /^[A-Za-z_][A-Za-z0-9_]*=/.exec(seg);
  if (namePrefix === null) {
    return 0;
  }
  const n = seg.length;
  let i = namePrefix[0].length;
  if (i >= n) {
    return 0; // `NAME=` with nothing after it is not a prefix.
  }
  const isBlank = (c) => c === ' ' || c === '\t';

  // Consume the value word part by part until an unquoted blank.
  while (i < n && !isBlank(seg[i])) {
    const ch = seg[i];
    if (ch === '\\') {
      // Escaped character belongs to the value even when it is a blank.
      i += i + 1 < n ? 2 : 1;
      continue;
    }
    let closer = null;
    let honorsBackslash = false;
    if (ch === '$' && seg[i + 1] === "'") {
      closer = "'";
      honorsBackslash = true;
      i += 2;
    } else if (ch === '"') {
      closer = '"';
      honorsBackslash = true;
      i += 1;
    } else if (ch === "'") {
      closer = "'";
      i += 1;
    } else {
      i += 1;
      continue;
    }
    while (i < n && seg[i] !== closer) {
      if (honorsBackslash && seg[i] === '\\' && i + 1 < n) {
        i += 2;
        continue;
      }
      i += 1;
    }
    if (i >= n) {
      return 0; // Unterminated quote: not a clean prefix.
    }
    i += 1; // Step over the closing quote.
  }

  // A command must follow, so the value has to end at a blank.
  if (i >= n) {
    return 0;
  }
  while (i < n && isBlank(seg[i])) {
    i += 1;
  }
  return i;
}
/**
 * Peel wrapper keywords and env-var assignments off the front of a
 * segment so that a head-anchored regex sees the real command first.
 *
 * Examples:
 *   `sudo gh pr create`                        → `gh pr create`
 *   `CI=1 pnpm add foo`                        → `pnpm add foo`
 *   `sudo CI=1 pnpm add foo`                   → `pnpm add foo`
 *   `REA_SKIP="urgent fix" gh issue create x`  → `gh issue create x`
 *   `then git push --force`                    → `git push --force`
 *
 * Each pass removes exactly one prefix: the first `LEADING_KEYWORDS`
 * entry that is followed by whitespace, or, when no keyword matches,
 * one env assignment as measured by `matchEnvAssignLength`. Stripping
 * stops when a pass removes nothing or after 32 passes.
 *
 * @param seg Segment text.
 * @returns The segment with its leading prefixes removed.
 */
function stripSegmentPrefix(seg) {
  // One anchored pattern per keyword, in list order.
  const keywordPatterns = LEADING_KEYWORDS.map((kw) => new RegExp(`^${kw}\\s+`));
  let rest = seg;
  let passesLeft = 32;
  while (passesLeft > 0) {
    passesLeft -= 1;
    const keyword = keywordPatterns.find((pattern) => pattern.test(rest));
    if (keyword !== undefined) {
      rest = rest.replace(keyword, '');
      continue;
    }
    const envLen = matchEnvAssignLength(rest);
    if (envLen <= 0) {
      break;
    }
    rest = rest.slice(envLen);
  }
  return rest;
}
/**
 * Break a command line into its shell segments.
 *
 * Runs the quote-aware mask → split → unmask pipeline and returns one
 * `{ raw, head }` record per segment, in the order the segments appear:
 * `raw` is the segment text and `head` is the same text with leading
 * wrapper keywords and env assignments stripped.
 *
 * Since 0.33.0 a segment that is a nested-shell wrapper (a shell name
 * with a `-c`-style flag and a quoted payload) also contributes the
 * segments of its payload, placed directly after the wrapper segment.
 * Unwrapping recurses up to `MAX_NESTED_DEPTH` levels.
 *
 * @param cmd Command text.
 * @returns Array of `{ raw, head }` records; empty for an empty command.
 */
export function splitSegments(cmd) {
  return cmd.length === 0 ? [] : splitSegmentsRecursive(cmd, 0);
}
/** Deepest nesting level at which a wrapper's payload is still unwrapped. */
const MAX_NESTED_DEPTH = 8;
/**
 * Worker behind `splitSegments`: split one command string and, while
 * `depth` is below `MAX_NESTED_DEPTH`, expand nested-shell payloads.
 *
 * For every segment the wrapper record is emitted first and the
 * payload's own segments (split at `depth + 1`) follow immediately, so
 * callers that anchor on the wrapper command still see it.
 *
 * @param cmd   Command text to split.
 * @param depth Current nesting level; 0 for the outermost command.
 * @returns Array of `{ raw, head }` records in emit order.
 */
function splitSegmentsRecursive(cmd, depth) {
  const mayDescend = depth < MAX_NESTED_DEPTH;
  const maskedPieces = splitOnUnquotedSeparators(maskQuotedSeparators(cmd));
  return maskedPieces.flatMap((piece) => {
    const raw = unmask(piece);
    const head = stripSegmentPrefix(raw);
    const record = { raw, head };
    if (!mayDescend) {
      return [record];
    }
    const inner = extractNestedShellPayload(head);
    if (inner === null) {
      return [record];
    }
    return [record, ...splitSegmentsRecursive(inner, depth + 1)];
  });
}
/**
 * Detect a nested-shell wrapper such as `bash -lc 'PAYLOAD'` and return
 * the payload with its quoting removed, or `null` when the text is not
 * a wrapper.
 *
 * Recognition steps:
 *   1. The text must start with one of the shell names bash, sh, zsh,
 *      dash, ksh, mksh, oksh, posh, yash, csh, tcsh or fish
 *      (case-insensitive, word boundary after the name). PowerShell is
 *      deliberately not in the list.
 *   2. Zero or more whitespace-separated flag tokens follow. Each must
 *      look like `-letters` or `--letters`; anything else that starts
 *      with `-` makes the result `null`. At least one flag must contain
 *      the letter `c` (any case), which covers `-c`, `-lc`, `-ic`,
 *      `--c` and split forms like `-l -c`.
 *      NOTE(review): because only the letter is checked, long options
 *      that merely contain a `c` (for example `--norc`) also count as
 *      the introducer.
 *   3. The first token that does not start with `-` is the payload. It
 *      must begin with `'`, `"` or `$'`; an unquoted payload yields
 *      `null`.
 *   4. The payload runs to the matching closing quote, or to the end of
 *      the text when the quote is never closed.
 *
 * Escape handling inside the payload:
 *   - `'...'`  : none.
 *   - `"..."`  : a backslash is dropped and the next character is kept.
 *   - `$'...'` : ANSI-C escapes are decoded: `\n \t \r \\ \' \" \a \b
 *                \e \E \f \v`, octal `\nnn` (1–3 digits, low 8 bits),
 *                `\xHH` (1–2 hex digits), `\uHHHH` (1–4 hex digits) and
 *                `\UHHHHHHHH` (1–8 hex digits, up to U+10FFFF). Decoding
 *                octal and Unicode forms matters because they can spell
 *                whitespace (`\040`, `\u0020`) and would otherwise hide
 *                the inner command from head matching. Unknown escapes,
 *                and `\x`/`\u`/`\U` without usable digits, are kept as
 *                the literal two-character pair.
 *                NOTE(review): `\cX` control-character escapes are not
 *                decoded and fall into the "unknown escape" case.
 *
 * @param head Prefix-stripped segment text.
 * @returns The decoded payload string, or `null` when `head` is not a
 *          recognised wrapper.
 */
function extractNestedShellPayload(head) {
  const trimmed = head.trimStart();
  if (trimmed.length === 0) {
    return null;
  }

  // 1. Shell-name token.
  const shellMatch = /^(bash|sh|zsh|dash|ksh|mksh|oksh|posh|yash|csh|tcsh|fish)\b/i.exec(trimmed);
  if (shellMatch === null) {
    return null;
  }
  const end = trimmed.length;
  let cursor = shellMatch[0].length;

  // 2. Flag tokens. Flags after the `c` flag are tolerated; the walk
  //    only stops at the first token that does not start with `-`.
  let sawCFlag = false;
  while (cursor < end) {
    while (cursor < end && /\s/.test(trimmed[cursor])) {
      cursor += 1;
    }
    if (cursor >= end) {
      return null;
    }
    if (trimmed[cursor] !== '-') {
      break; // Not a flag, so this is the payload argument.
    }
    const flagMatch = /^\S+/.exec(trimmed.slice(cursor));
    if (flagMatch === null) {
      return null;
    }
    const flag = flagMatch[0];
    cursor += flag.length;
    if (!/^--?[A-Za-z]+$/.test(flag)) {
      return null;
    }
    if (/c/i.test(flag.replace(/^--?/, ''))) {
      sawCFlag = true;
    }
  }
  if (!sawCFlag || cursor >= end) {
    return null;
  }
  while (cursor < end && /\s/.test(trimmed[cursor])) {
    cursor += 1;
  }
  if (cursor >= end) {
    return null;
  }

  // 3. Quote introducer.
  const first = trimmed[cursor];
  let quote;
  let isAnsiC = false;
  let payloadStart;
  if (first === '$' && trimmed[cursor + 1] === "'") {
    quote = "'";
    isAnsiC = true;
    payloadStart = cursor + 2;
  } else if (first === "'" || first === '"') {
    quote = first;
    payloadStart = cursor + 1;
  } else {
    return null; // Unquoted payload: do not unwrap.
  }

  // Single-character ANSI-C escapes and the characters they stand for.
  const simpleEscapes = new Map([
    ['n', '\n'],
    ['t', '\t'],
    ['r', '\r'],
    ['\\', '\\'],
    ["'", "'"],
    ['"', '"'],
    ['a', '\x07'],
    ['b', '\x08'],
    ['e', '\x1b'],
    ['E', '\x1b'],
    ['f', '\x0c'],
    ['v', '\x0b'],
  ]);
  // Maximum number of hex digits each hex-style escape may consume.
  const hexWidths = new Map([
    ['x', 2],
    ['u', 4],
    ['U', 8],
  ]);
  // Collect up to `maxCount` characters matching `digitPattern` from `from`.
  const takeDigits = (from, maxCount, digitPattern) => {
    let stop = from;
    while (stop < end && stop - from < maxCount && digitPattern.test(trimmed[stop])) {
      stop += 1;
    }
    return trimmed.slice(from, stop);
  };

  // 4. Walk the payload up to the closing quote.
  let i = payloadStart;
  let payload = '';
  while (i < end) {
    const ch = trimmed[i];
    if (ch === quote) {
      return payload;
    }
    if (isAnsiC && ch === '\\' && i + 1 < end) {
      const nxt = trimmed[i + 1];
      if (/[0-7]/.test(nxt)) {
        // Octal: one to three digits, reduced to an eight-bit value.
        const octal = takeDigits(i + 1, 3, /[0-7]/);
        payload += String.fromCharCode(Number.parseInt(octal, 8) & 0xff);
        i += 1 + octal.length;
        continue;
      }
      if (hexWidths.has(nxt)) {
        const hex = takeDigits(i + 2, hexWidths.get(nxt), /[0-9a-fA-F]/);
        const codePoint = hex.length > 0 ? Number.parseInt(hex, 16) : -1;
        if (codePoint >= 0 && codePoint <= 0x10ffff) {
          payload += String.fromCodePoint(codePoint);
          i += 2 + hex.length;
          continue;
        }
        // No digits (or not a valid code point): keep the literal pair
        // and let any digits that follow be copied as ordinary text.
        payload += `\\${nxt}`;
        i += 2;
        continue;
      }
      payload += simpleEscapes.has(nxt) ? simpleEscapes.get(nxt) : `\\${nxt}`;
      i += 2;
      continue;
    }
    if (!isAnsiC && quote === '"' && ch === '\\' && i + 1 < end) {
      // Double quotes: drop the backslash, keep the escaped character.
      payload += trimmed[i + 1];
      i += 2;
      continue;
    }
    payload += ch;
    i += 1;
  }
  // Unterminated quote: the rest of the text is the payload.
  return payload;
}
/**
 * Report whether any segment of `cmd` begins with the given pattern.
 *
 * The command is split with `splitSegments` and the pattern is tested
 * against each segment's `head` (the text left after leading wrapper
 * keywords and env assignments are stripped).
 *
 * The pattern is compiled case-insensitively and anchored to the start
 * of the head. It is wrapped in a non-capturing group before the `^` is
 * prepended so the anchor applies to the whole pattern: without the
 * group, a top-level alternation such as `a|b` would compile to `^a|b`
 * and `b` would match anywhere in the head.
 *
 * @param cmd         Command text to inspect.
 * @param regexSource Regex source for the command head. A leading `^`
 *                    is not needed; the anchor is added here.
 * @returns `true` when at least one segment head matches at its start.
 * @throws {SyntaxError} When `regexSource` is not a valid pattern.
 */
export function anySegmentStartsWith(cmd, regexSource) {
  const headPattern = new RegExp(`^(?:${regexSource})`, 'i');
  return splitSegments(cmd).some(({ head }) => headPattern.test(head));
}
/**
 * Report whether the pattern occurs anywhere in the raw text of at
 * least one segment of `cmd`.
 *
 * Unlike the head-anchored check, this looks at each segment's `raw`
 * text without stripping or anchoring, which suits content scans such
 * as looking for a `Co-Authored-By:` marker inside a quoted
 * `git commit -m "..."` argument. Matching is case-insensitive.
 *
 * @param cmd         Command text to inspect.
 * @param regexSource Regex source to search for.
 * @returns `true` when some segment's raw text contains a match.
 */
export function anySegmentMatches(cmd, regexSource) {
  const pattern = new RegExp(regexSource, 'i');
  return splitSegments(cmd).some(({ raw }) => pattern.test(raw));
}
/**
 * Report whether a single segment of `cmd` contains matches for both
 * patterns.
 *
 * Both patterns are tested, case-insensitively, against the same
 * segment's `raw` text, so they must co-occur in one segment. A command
 * like `echo "log: cat .env stuff" ; touch foo.env` where each pattern
 * is only satisfied by a different segment does not count; this is what
 * lets `env-file-protection` require a text-reading utility and an
 * `.env*` filename to appear together.
 *
 * @param cmd    Command text to inspect.
 * @param regexA First regex source.
 * @param regexB Second regex source.
 * @returns `true` when one segment's raw text matches both patterns.
 */
export function anySegmentMatchesBoth(cmd, regexA, regexB) {
  const first = new RegExp(regexA, 'i');
  const second = new RegExp(regexB, 'i');
  return splitSegments(cmd).some(({ raw }) => first.test(raw) && second.test(raw));
}