typeclaw 0.32.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/package.json +1 -1
  2. package/scripts/verify-procbind-sandbox.sh +61 -0
  3. package/src/agent/multimodal/look-at.ts +7 -5
  4. package/src/agent/plugin-tools.ts +47 -12
  5. package/src/agent/session-origin.ts +15 -9
  6. package/src/agent/system-prompt.ts +6 -0
  7. package/src/agent/tools/channel-fetch-attachment.ts +8 -7
  8. package/src/agent/tools/channel-history.ts +2 -0
  9. package/src/bundled-plugins/github-cli-auth/gh-command.ts +267 -13
  10. package/src/bundled-plugins/reviewer/skills/code-review.ts +11 -9
  11. package/src/bundled-plugins/security/policies/outbound-secret-scan.ts +1 -0
  12. package/src/channels/adapters/slack-bot-reference.ts +9 -10
  13. package/src/channels/adapters/slack-bot.ts +29 -7
  14. package/src/channels/router.ts +89 -21
  15. package/src/cli/index.ts +42 -2
  16. package/src/cli/init.ts +267 -82
  17. package/src/cli/inspect.ts +5 -2
  18. package/src/cli/model.ts +5 -1
  19. package/src/cli/provider.ts +41 -10
  20. package/src/config/config.ts +23 -11
  21. package/src/config/providers.ts +304 -7
  22. package/src/container/start.ts +12 -7
  23. package/src/init/find-agent-dir.ts +44 -0
  24. package/src/init/index.ts +3 -34
  25. package/src/init/models-dev.ts +2 -0
  26. package/src/init/validate-api-key.ts +13 -0
  27. package/src/inspect/transcript-view.ts +33 -7
  28. package/src/sandbox/availability.ts +354 -2
  29. package/src/sandbox/build.ts +17 -7
  30. package/src/sandbox/index.ts +10 -1
  31. package/src/sandbox/policy.ts +27 -9
  32. package/src/secrets/oauth-xai.ts +342 -0
  33. package/src/secrets/storage.ts +2 -0
  34. package/src/skills/typeclaw-markdown-pdf/SKILL.md +64 -5
  35. package/typeclaw.schema.json +20 -2
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "typeclaw",
3
- "version": "0.32.0",
3
+ "version": "0.33.0",
4
4
  "homepage": "https://github.com/typeclaw/typeclaw#readme",
5
5
  "bugs": {
6
6
  "url": "https://github.com/typeclaw/typeclaw/issues"
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env bash
2
+ # Manual acceptance check for the default 'proc-bind' sandbox strategy
3
+ # (src/sandbox/build.ts). Not a unit test: it needs a Linux container with bwrap,
4
+ # which the macOS dev host cannot provide, so it lives here as an operator-
5
+ # runnable script instead of a skipIf-everywhere test.
6
+ #
7
+ # The point of proc-bind is that it needs NEITHER `unshare --mount-proc` NOR
8
+ # CAP_SYS_ADMIN — so this runs WITHOUT --cap-add (unlike verify-realproc-sandbox).
9
+ # It proves two properties of `bwrap --unshare-all … --ro-bind /proc /proc`:
10
+ # 1. An external package runner (bunx) runs to completion (no Bun "NotDir").
11
+ # 2. A secret in a sibling process's environment is UNREADABLE from the sandbox
12
+ # (the --unshare-all child userns blocks cross-userns /proc/<pid>/environ).
13
+ # The signal boundary (kill/ptrace fail EPERM across the userns) is a corollary
14
+ # of the same userns isolation property (2) proves, so it is not re-tested here.
15
+ #
16
+ # Usage: scripts/verify-procbind-sandbox.sh [image]
17
+ # image defaults to ghcr.io/typeclaw/typeclaw-base:<version-from-package.json>
18
+ set -euo pipefail
19
+
20
+ IMAGE="${1:-}"
21
+ if [ -z "$IMAGE" ]; then
22
+ version="$(node -p "require('./package.json').version" 2>/dev/null || echo latest)"
23
+ IMAGE="ghcr.io/typeclaw/typeclaw-base:${version}"
24
+ fi
25
+
26
+ secret="TYPECLAW_PROCBIND_LEAK_CANARY_$$"
27
+
28
+ inner='
29
+ echo "=== bunx via proc-bind sandbox (no CAP_SYS_ADMIN) ==="
30
+ bunx cowsay "proc-bind ok" 2>&1 | tail -6
31
+ echo "bunx exit=$?"
32
+ echo "=== leak scan (sandbox must NOT read the canary holders env) ==="
33
+ found=0
34
+ for f in /proc/[0-9]*/environ; do
35
+ if tr "\0" "\n" < "$f" 2>/dev/null | grep -q "CANARY_TOKEN"; then
36
+ echo "LEAK:$f"; found=1
37
+ fi
38
+ done
39
+ if [ $found -eq 0 ]; then echo "NO_LEAK_CONFIRMED"; else echo "LEAK_DETECTED"; exit 1; fi
40
+ echo "=== self /proc must be usable (the property that makes bunx work) ==="
41
+ test -r /proc/self/fd && test -r /proc/self/maps && echo "SELF_PROC_OK" || { echo "SELF_PROC_MISSING"; exit 1; }
42
+ '
43
+ inner="${inner//CANARY_TOKEN/$secret}"
44
+
45
+ # The proc-bind argv shape mirrors buildArgv() in src/sandbox/build.ts. Keep in
46
+ # sync if that helper changes. Note: NO `unshare` prefix and NO --cap-add below.
47
+ runner="
48
+ env CANARY=${secret} sleep 120 &
49
+ bwrap --unshare-all \
50
+ --new-session --die-with-parent --clearenv \
51
+ --setenv PATH /usr/local/bin:/usr/bin:/bin --setenv HOME /tmp --setenv LANG C.UTF-8 \
52
+ --ro-bind /usr /usr --ro-bind /etc /etc --dev /dev --tmpfs /tmp \
53
+ --ro-bind-try /bin /bin --ro-bind-try /sbin /sbin --ro-bind-try /lib /lib --ro-bind-try /lib64 /lib64 \
54
+ --share-net \
55
+ --ro-bind /proc /proc \
56
+ bash -c '$inner'
57
+ "
58
+
59
+ echo "Image: $IMAGE"
60
+ docker run --rm --security-opt seccomp=unconfined \
61
+ -e "CANARY=${secret}" "$IMAGE" bash -c "$runner"
@@ -89,8 +89,10 @@ export function createChannelLookAtTool(router: ChannelRouter, origin: ChannelLo
89
89
  name: 'look_at_channel_attachment',
90
90
  label: 'Look at channel attachment',
91
91
  description:
92
- 'View an image attached to the current inbound channel message. Inbound messages show ' +
93
- '`[<Platform> attachment #N: <kind> <metadata>]`; pass `N` as `attachment_id`. Do not invent ids.',
92
+ 'View an image attached to a channel message. Inbound messages show ' +
93
+ '`[<Platform> attachment #N: <kind> <metadata>]`; pass `N` as `attachment_id`. Do not invent ids. ' +
94
+ 'Images on the CURRENT inbound resolve directly; for one from an EARLIER message, call channel_history ' +
95
+ 'first to make it resolvable by the same id.',
94
96
  parameters: Type.Object({
95
97
  attachment_id: Type.Integer({
96
98
  description: 'The number N from the inbound `[<Platform> attachment #N: ...]` placeholder.',
@@ -106,10 +108,10 @@ export function createChannelLookAtTool(router: ChannelRouter, origin: ChannelLo
106
108
  const validIds = router.listInboundAttachmentIds(origin)
107
109
  const validMsg =
108
110
  validIds.length === 0
109
- ? 'no attachments are present in the current turn'
110
- : `valid attachment_ids in this turn: ${validIds.join(', ')}`
111
+ ? 'no attachments are resolvable right now'
112
+ : `resolvable attachment_ids: ${validIds.join(', ')}`
111
113
  return errorResult(
112
- `no attachment with id=${params.attachment_id} in this turn (${validMsg}). Do not call look_at_channel_attachment for attachments that do not appear in the inbound message — they do not exist.`,
114
+ `no attachment with id=${params.attachment_id} (${validMsg}). For an attachment from an earlier message, call channel_history first to make it resolvable; otherwise do not invent ids that are not in the inbound message.`,
113
115
  { count: 0, prompt: params.prompt },
114
116
  )
115
117
  }
@@ -37,6 +37,8 @@ import type {
37
37
  } from '@/plugin'
38
38
  import {
39
39
  buildSandboxedCommand,
40
+ canBindProcSafely,
41
+ canMountRealProc,
40
42
  ensureBwrapAvailable,
41
43
  ensureSessionTmpDir,
42
44
  mapVirtualTmpPath,
@@ -44,6 +46,7 @@ import {
44
46
  resolveProcSelfExe,
45
47
  resolveProtectedZones,
46
48
  resolveWritableZones,
49
+ type SandboxProcStrategy,
47
50
  subtractMasked,
48
51
  } from '@/sandbox'
49
52
 
@@ -599,17 +602,7 @@ async function applyBashSandbox(
599
602
  // bwrap does --clearenv, so the overlay must be re-introduced via env.set or
600
603
  // it would never reach the sandboxed process (the non-sandboxed spawnHook
601
604
  // path does not run when the command is rewritten to a bwrap invocation).
602
- // 'real-proc' gives a sandboxed JS package runner a working /proc/self/{fd,
603
- // maps} so `bunx`/`bun add`/`bun run <pkg>` stop aborting with Bun's NotDir.
604
- // Opt-in (default 'tmpfs') because it makes start.ts grant the container
605
- // CAP_SYS_ADMIN at boot. Read from the boot-time `config` snapshot, NOT live
606
- // getConfig(): sandbox.realProc is restart-required, and the strategy MUST
607
- // track the boot-time capability. A `typeclaw reload` that flips realProc to
608
- // true would otherwise make this emit `unshare --mount-proc` in a container
609
- // booted WITHOUT CAP_SYS_ADMIN, so the mount fails instead of the old tmpfs
610
- // strategy holding until restart. `config` never changes on reload.
611
- // procSelfExe is only consumed by the 'tmpfs' branch.
612
- const realProc = config.sandbox.realProc
605
+ const proc = await resolveProcStrategy()
613
606
  const { commandString } = buildSandboxedCommand(command, {
614
607
  mounts: [
615
608
  { type: 'ro-bind', source: agentDir, dest: agentDir },
@@ -620,13 +613,55 @@ async function applyBashSandbox(
620
613
  protected: protectedZones,
621
614
  network: 'inherit',
622
615
  cwd: agentDir,
623
- proc: realProc ? 'real-proc' : 'tmpfs',
616
+ proc,
624
617
  procSelfExe: resolveProcSelfExe(),
625
618
  ...(envOverlay !== undefined ? { env: { set: envOverlay } } : {}),
626
619
  })
627
620
  mutableArgs.command = commandString
628
621
  }
629
622
 
623
+ // Picks the /proc strategy for a sandboxed bash call. The branch order is:
624
+ // 'real-proc' ONLY when the operator explicitly opted in (sandbox.realProc) AND
625
+ // the kernel permits the mount (canMountRealProc) — it adds PID isolation but
626
+ // needs CAP_SYS_ADMIN (unshare --mount-proc), so it is a deliberate, narrow
627
+ // opt-in; else 'proc-bind' (--ro-bind /proc, NO CAP_SYS_ADMIN) when its userns
628
+ // leak-block is verified safe (canBindProcSafely); else 'tmpfs'. Because
629
+ // sandbox.realProc DEFAULTS FALSE, the first branch is normally skipped and
630
+ // proc-bind is the de-facto default — which is the point: the common path needs
631
+ // no broad outer capability. 'tmpfs' is the last-resort degraded mode where
632
+ // external packages can't run; reached only when BOTH probes fail (e.g. a kernel
633
+ // that would leak cross-userns environ — proc-bind fails closed there).
634
+ //
635
+ // Read from the boot-time `config` snapshot, NOT live getConfig(): sandbox is
636
+ // restart-required, and the strategy MUST track the boot-time CAP_SYS_ADMIN
637
+ // grant. A `typeclaw reload` flipping realProc would otherwise emit `unshare
638
+ // --mount-proc` in a container booted WITHOUT the cap (or vice versa). Both
639
+ // probes are cached process-globally, so this resolves to one spawn per
640
+ // container lifetime regardless of how many bash calls hit it.
641
+ async function resolveProcStrategy(): Promise<SandboxProcStrategy> {
642
+ if (config.sandbox.realProc && (await canMountRealProc())) return 'real-proc'
643
+ if (await canBindProcSafely()) return 'proc-bind'
644
+ // Degraded last resort: no working /proc strategy. External package runners
645
+ // (bunx/bun add/bun run <pkg-bin>) will fail with Bun's opaque "NotDir" because
646
+ // /proc/self/{fd,maps} are absent. Warn once so an operator on such an exotic
647
+ // host (no usable user namespaces at all) gets a diagnostic instead of the bare
648
+ // Bun error. Not gated on parsing the command — that heuristic is fragile (see
649
+ // PR #696); this is a strategy-level notice, fail-closed and command-agnostic.
650
+ warnTmpfsProcFallbackOnce()
651
+ return 'tmpfs'
652
+ }
653
+
654
+ let tmpfsProcFallbackWarned = false
655
+ function warnTmpfsProcFallbackOnce(): void {
656
+ if (tmpfsProcFallbackWarned) return
657
+ tmpfsProcFallbackWarned = true
658
+ console.warn(
659
+ '[sandbox] degraded /proc mode: neither real-proc nor proc-bind is available on this host, ' +
660
+ 'so sandboxed external package runners (bunx / bun add / bun run <pkg-bin>) will fail. ' +
661
+ 'This needs a runtime with working user namespaces.',
662
+ )
663
+ }
664
+
630
665
  // The builtin file tools that take a single filesystem `path` arg. For a
631
666
  // sandboxed role they all run UNSANDBOXED in the main process (only bash is
632
667
  // bwrap-wrapped), so each must apply the same /tmp -> session-dir mapping that
@@ -528,15 +528,21 @@ function renderMembershipSummary(
528
528
  function renderResearchReportDeliveryGuidance(platformInfo: PlatformInfo): string[] {
529
529
  if (!platformInfo.supportsAttachments) return []
530
530
  return [
531
- `**Ship \`researcher\` reports as a PDF by default.** ${platformInfo.displayName} accepts file`,
532
- 'attachments. When you receive a `researcher` subagent result — a',
533
- '`research-<slug>.md` report file path in its `<report>` block — convert that',
534
- 'markdown to a PDF with the `typeclaw-markdown-pdf` skill and deliver it with',
535
- '`channel_send({ ..., attachments: [{ path, filename }] })`, with a one- or',
536
- 'two-line summary as the message text. A downloadable file is what a human',
537
- 'wants for a multi-page report; do not paste the full markdown into chat. Send',
538
- 'the report inline as plain text only if the caller explicitly asked for it in',
539
- 'the message, or the report is short enough that a file would be overkill.',
531
+ `**Ship reports as a PDF by default.** ${platformInfo.displayName} accepts file`,
532
+ 'attachments. When the user asks for a report, document, brief, or "the report"',
533
+ '— or a `researcher` subagent hands you a `research-<slug>.md` file path in its',
534
+ '`<report>` block — convert that markdown to a PDF with the `typeclaw-markdown-pdf`',
535
+ 'skill and deliver it with `channel_send({ ..., attachments: [{ path, filename }] })`,',
536
+ 'with a one- or two-line summary as the message text. A `researcher` `<summary>`',
537
+ 'is a teaser, NOT the deliverable: the deliverable is the report file rendered to',
538
+ 'PDF. Never build the PDF with an ad-hoc library (jsPDF, pdfkit, a raw-text dump) ',
539
+ 'that yields unrendered markdown and mojibake; the skill is the only correct path.',
540
+ "For CJK (Korean/Japanese/Chinese) reports, follow that skill's CJK font gate —",
541
+ 'never ship a tofu-rendered PDF; ask before enabling the opt-in `cjkFonts`.',
542
+ 'A downloadable file is what a human wants for a multi-page report; do not paste',
543
+ 'the full markdown into chat, and do not attach the raw `.md` when asked for a',
544
+ 'report or PDF. Send inline plain text only if the caller explicitly asked for it,',
545
+ 'or the content is short enough that a file would be overkill.',
540
546
  '',
541
547
  ]
542
548
  }
@@ -59,6 +59,12 @@ For any multi-step or long-running task, maintain a todo list with \`todo_write\
59
59
 
60
60
  Do not narrate routine, low-risk tool calls. Just call the tool. Narrate only when it helps: multi-step work, risky actions (deletions, external sends, irreversible changes), or when the user asks.
61
61
 
62
+ ## Delivering reports and documents
63
+
64
+ When the user asks for a *report*, *document*, *brief*, *PDF*, or asks you to *send/show/attach/export* a generated result — anything where the deliverable is a file a human would download, print, or forward — produce a polished file, not a wall of text pasted into chat and not a one-line summary that drops the substance. A summary (yours or a subagent's) is a pointer to the deliverable, never the deliverable itself; when the user asked for the report, ship the report.
65
+
66
+ To turn Markdown into a PDF, use the bundled \`typeclaw-markdown-pdf\` skill — it is the only supported path and it renders Markdown properly (headings, lists, tables). **Never** hand-roll a PDF with an ad-hoc library (jsPDF, pdfkit, a canvas text dump, a headless-browser raw-text print): those produce unrendered raw \`##\`/\`**\` markup and mojibake for non-Latin text. CJK fonts are opt-in, so for Korean/Japanese/Chinese reports follow that skill's CJK gate — never ship a tofu-rendered PDF; ask before enabling opt-in CJK fonts. If a request is plainly satisfied by inline chat — a short answer, a snippet, a quick explanation — stay inline; this rule is for explicit document deliverables, not for every long reply.
67
+
62
68
  ## Long-running and interactive shell work
63
69
 
64
70
  Foreground \`bash\` blocks your turn until exit, so a command that runs for minutes or waits for input (dev server, REPL, watcher, \`docker compose up\`, interactive installer) freezes the conversation. \`tmux\` is in the container — run such programs detached so your turn stays free:
@@ -37,11 +37,12 @@ export function createChannelFetchAttachmentTool({
37
37
  name: 'channel_fetch_attachment',
38
38
  label: 'Channel Fetch Attachment',
39
39
  description:
40
- 'Download a file the user attached to the current inbound channel message and save it to disk. Inbound channel ' +
40
+ 'Download a file attached to a channel message and save it to disk. Inbound channel ' +
41
41
  'messages with attachments show `[<Platform> attachment #N: <kind> <metadata>]` in the text. Pass `N` as ' +
42
- '`attachment_id`; do not invent ids that are not present in the inbound message. The router validates the id ' +
43
- 'against the current turn and resolves the private platform ref itself. On success returns the absolute path ' +
44
- 'of the saved file plus its detected mimetype and size.',
42
+ '`attachment_id`; do not invent ids that are not present in the message. The router resolves the private ' +
43
+ 'platform ref itself. Attachments on the CURRENT inbound message resolve directly; for one from an EARLIER ' +
44
+ 'message, call channel_history first (it makes those attachments resolvable by the same id). On success ' +
45
+ 'returns the absolute path of the saved file plus its detected mimetype and size.',
45
46
  parameters: Type.Object({
46
47
  attachment_id: Type.Integer({
47
48
  description:
@@ -75,10 +76,10 @@ export function createChannelFetchAttachmentTool({
75
76
  })
76
77
  const validMsg =
77
78
  validIds.length === 0
78
- ? 'no attachments are present in the current turn'
79
- : `valid attachment_ids in this turn: ${validIds.join(', ')}`
79
+ ? 'no attachments are resolvable right now'
80
+ : `resolvable attachment_ids: ${validIds.join(', ')}`
80
81
  return errorResult(
81
- `no attachment with id=${params.attachment_id} in this turn (${validMsg}). Do not call channel_fetch_attachment for attachments that do not appear in the inbound message — they do not exist.`,
82
+ `no attachment with id=${params.attachment_id} (${validMsg}). For an attachment from an earlier message, call channel_history first to make it resolvable; otherwise do not invent ids that are not in the inbound message.`,
82
83
  )
83
84
  }
84
85
  if (found.ref === '') {
@@ -94,6 +94,8 @@ export function createChannelHistoryTool({
94
94
  }
95
95
  }
96
96
 
97
+ router.registerHistoryAttachments(origin, result.messages)
98
+
97
99
  const rendered = renderMessages(result.messages)
98
100
  const cursorLine =
99
101
  result.nextCursor !== undefined
@@ -37,12 +37,14 @@ type GhSegmentDecision =
37
37
 
38
38
  const COMPOSITION_REASON =
39
39
  'A repo-targeting `gh` command receives a minted GitHub App token in its process ' +
40
- 'environment, so it must run as a single bare `gh` command — no pipes, `;`, `&&`, ' +
41
- '`||`, `&`, newlines, redirections, command/process substitution, subshells, heredocs, ' +
42
- 'or unquoted `$` expansion (any sibling process or expansion would inherit the token ' +
43
- 'and could exfiltrate it). jq/JSON metacharacters are fine INSIDE single quotes, e.g. ' +
44
- "`gh api repos/o/r --jq '.[] | {id}'`. To feed JSON to `gh api`, write it to a temp " +
45
- 'file and use `gh api --input <file>`.'
40
+ 'environment, so it must run as a single bare `gh` command — no `;`, `&&`, `||`, `&`, ' +
41
+ 'newlines, redirections, command/process substitution, subshells, heredocs, or unquoted ' +
42
+ '`$` expansion (any sibling process or expansion would inherit the token and could ' +
43
+ 'exfiltrate it). One exception is allowed: a trailing reader pipeline `gh … | <reader>` ' +
44
+ 'where every downstream stage is a stdin-only reader (`jq`, `cat`, `wc`, `sort`, `uniq`) ' +
45
+ 'with no file operand — e.g. `gh api repos/o/r | jq .`. jq/JSON metacharacters are also ' +
46
+ "fine INSIDE single quotes, e.g. `gh api repos/o/r --jq '.[] | {id}'`. To feed JSON to " +
47
+ '`gh api`, write it to a temp file and use `gh api --input <file>`.'
46
48
 
47
49
  // Shell-active metacharacters that, OUTSIDE single quotes, either spawn another
48
50
  // process sharing the shell env (where the minted GH_TOKEN lives) or expand
@@ -140,15 +142,267 @@ export function analyzeGhCommand(command: string): GhCommandDecision {
140
142
  const owners = new Set(repoSlugs.map((slug) => slug.split('/')[0]))
141
143
  if (owners.size > 1) return { kind: 'block', reason: MULTI_OWNER_REASON }
142
144
 
143
- // We would inject a token. Enforce the single-bare-`gh` shape: the token
144
- // lands in the shell's env, so any sibling/upstream/downstream process or
145
- // shell expansion would inherit it.
146
- if (!isSingleBareGhCommand(command)) return { kind: 'block', reason: COMPOSITION_REASON }
145
+ const repoSlug = repoSlugs[0] as string
147
146
 
148
- if (stripRepoFlag) {
149
- return { kind: 'inject', repoSlug: repoSlugs[0] as string, rewrittenCommand: stripRepoFlagFromCommand(command) }
147
+ // We would inject a token. The token lands in the shell env, so any sibling/
148
+ // upstream/downstream process or shell expansion would inherit it. The single-
149
+ // bare-`gh` shape is the safe baseline; a trailing reader pipeline (`gh | jq`)
150
+ // is the one exception we allow, under strict conditions (see analyzeReaderPipeline).
151
+ if (isSingleBareGhCommand(command)) {
152
+ if (stripRepoFlag) return { kind: 'inject', repoSlug, rewrittenCommand: stripRepoFlagFromCommand(command) }
153
+ return { kind: 'inject', repoSlug }
150
154
  }
151
- return { kind: 'inject', repoSlug: repoSlugs[0] as string }
155
+
156
+ const piped = analyzeReaderPipeline(command, stripRepoFlag)
157
+ if (piped !== null) return { kind: 'inject', repoSlug, rewrittenCommand: piped }
158
+
159
+ return { kind: 'block', reason: COMPOSITION_REASON }
160
+ }
161
+
162
+ // stdin-only readers whose only sink is stdout (back to the agent, who already
163
+ // has gh's output) — they cannot open their own network/file/process sink, so a
164
+ // `gh <repo> | <reader>` pipeline cannot exfiltrate the minted token to a third
165
+ // party. EXCLUDED on purpose: awk (system()/getline|cmd/inet), sed (GNU `e`
166
+ // shell-exec), tee/xargs (write/spawn), less (`!cmd`), and grep/head/tail (their
167
+ // file-operand forms are too easy to abuse and not worth the parser risk yet).
168
+ const READER_ALLOWLIST = new Set(['jq', 'cat', 'wc', 'sort', 'uniq'])
169
+
170
+ // STRICT per-command flag allowlists. We allow ONLY flags known to be pure
171
+ // stdin-shaping (no file/program operand). This is allow-known-good, not
172
+ // deny-known-bad: coreutils exposes file reads AND code execution as FLAGS, not
173
+ // just operands — `wc --files0-from=F` and `sort --files0-from=F` open a file
174
+ // with no positional, and `sort --compress-program=PROG` execs a helper. Any
175
+ // such flag would let a downstream "reader" open `/proc/<pid>/environ` and
176
+ // recover the sibling token. So an unrecognized flag REJECTS the whole stage.
177
+ // jq is excluded here (its filter is a positional, handled separately).
178
+ const READER_BOOLEAN_FLAGS: Record<string, ReadonlySet<string>> = {
179
+ cat: new Set(['-n', '--number', '-b', '--number-nonblank', '-s', '--squeeze-blank', '-A', '--show-all', '-E', '-T']),
180
+ wc: new Set(['-l', '--lines', '-c', '--bytes', '-m', '--chars', '-w', '--words', '-L', '--max-line-length']),
181
+ sort: new Set(['-r', '--reverse', '-n', '--numeric-sort', '-u', '--unique', '-f', '--ignore-case', '-b', '-g', '-h']),
182
+ uniq: new Set(['-c', '--count', '-d', '--repeated', '-u', '--unique', '-i', '--ignore-case']),
183
+ }
184
+
185
+ // jq is validated allow-known-good, exactly like the coreutils readers: only
186
+ // known stdin-shaping flags pass; anything else rejects the stage. Exact-token
187
+ // deny-listing was unsound — `-f/proc/self/environ`, `-L/proc`, and clustered
188
+ // `-rf/proc/...` short forms slipped past a `Set.has(token)` check and reopened
189
+ // the file-read path. jq accepts NO `--flag=value` form (value flags take the
190
+ // value as a SEPARATE token), so long flags are matched as whole tokens.
191
+
192
+ // Safe boolean LONG flags: output/parse shaping only, no value, no file/module.
193
+ const JQ_SAFE_BOOLEAN_LONG = new Set([
194
+ '--raw-output',
195
+ '--raw-output0',
196
+ '--compact-output',
197
+ '--slurp',
198
+ '--null-input',
199
+ '--exit-status',
200
+ '--ascii-output',
201
+ '--sort-keys',
202
+ '--raw-input',
203
+ '--join-output',
204
+ '--color-output',
205
+ '--monochrome-output',
206
+ '--binary',
207
+ '--tab',
208
+ '--unbuffered',
209
+ '--stream',
210
+ '--stream-errors',
211
+ '--seq',
212
+ ])
213
+
214
+ // Safe LONG flags that consume a fixed number of FOLLOWING tokens, none a file:
215
+ // --arg/--argjson take 2 (name, value), --indent takes 1 (a number).
216
+ const JQ_SAFE_VALUE_LONG: Record<string, number> = {
217
+ '--arg': 2,
218
+ '--argjson': 2,
219
+ '--indent': 1,
220
+ }
221
+
222
+ // Safe boolean SHORT flags (single chars). A clustered short token like `-rc`
223
+ // is allowed iff EVERY char is in this set. `f` (filter-from-file) and `L`
224
+ // (module path) are the fatal ones — and any unknown char also rejects.
225
+ const JQ_SAFE_BOOLEAN_SHORT = new Set(['r', 'c', 's', 'n', 'e', 'a', 'S', 'R', 'j', 'C', 'M', 'b'])
226
+
227
+ // A reader stage is safe only if it is an allowlisted command using ONLY its
228
+ // known stdin-shaping flags, with no file operand. Backslashes are rejected
229
+ // outright: our tokenizer does not model shell backslash escaping, so a
230
+ // `jq \--from-file=…` would be seen as a harmless positional here but reach bash
231
+ // as the forbidden flag — an allowlist-bypass. Rejecting `\` closes that gap.
232
+ function isStdinOnlyReaderStage(stage: string): boolean {
233
+ if (containsShellActiveMetachar(stage)) return false
234
+ if (stage.includes('\\')) return false
235
+ const tokens = splitStageTokens(stage)
236
+ const cmd = tokens[0]
237
+ if (cmd === undefined || !READER_ALLOWLIST.has(cmd)) return false
238
+
239
+ if (cmd === 'jq') return isStdinOnlyJqStage(tokens)
240
+
241
+ const allowedFlags = READER_BOOLEAN_FLAGS[cmd]
242
+ if (allowedFlags === undefined) return false
243
+ for (let i = 1; i < tokens.length; i++) {
244
+ const tok = tokens[i] as string
245
+ if (!tok.startsWith('-')) return false
246
+ if (!allowedFlags.has(tok)) return false
247
+ }
248
+ return true
249
+ }
250
+
251
+ // jq must run pure-stdin: only known stdin-shaping flags, and EXACTLY one
252
+ // positional (the filter). A second positional is an input FILE jq would open
253
+ // (`jq . /proc/self/environ` reads that file), so it is rejected. The filter is
254
+ // additionally screened for `import`/`include`, which load modules from jq's
255
+ // default search path even without `-L` — another file-read vector.
256
+ function isStdinOnlyJqStage(tokens: readonly string[]): boolean {
257
+ let sawFilter = false
258
+ for (let i = 1; i < tokens.length; i++) {
259
+ const tok = tokens[i] as string
260
+ if (tok === '--') return false
261
+ if (tok.startsWith('--')) {
262
+ if (JQ_SAFE_BOOLEAN_LONG.has(tok)) continue
263
+ const consume = JQ_SAFE_VALUE_LONG[tok]
264
+ if (consume === undefined) return false
265
+ i += consume
266
+ continue
267
+ }
268
+ if (tok.startsWith('-') && tok.length > 1) {
269
+ for (const ch of tok.slice(1)) {
270
+ if (!JQ_SAFE_BOOLEAN_SHORT.has(ch)) return false
271
+ }
272
+ continue
273
+ }
274
+ if (sawFilter) return false
275
+ sawFilter = true
276
+ if (jqFilterLoadsModules(tok)) return false
277
+ }
278
+ return true
279
+ }
280
+
281
+ // jq `import`/`include` directives pull a module file from the search path, a
282
+ // file-read vector that `-L` rejection alone does not cover (the default path
283
+ // still applies). Match them as leading directives in the untrusted filter.
284
+ function jqFilterLoadsModules(filter: string): boolean {
285
+ return /(^|[;\s])(import|include)\s/.test(filter)
286
+ }
287
+
288
+ // Splits a single bare `gh ... | reader | reader` pipeline into its stages on
289
+ // TOP-LEVEL `|` only (quote-aware, so a `|` inside a single-quoted jq filter is
290
+ // not a stage boundary), rewriting each downstream reader to run under
291
+ // `/usr/bin/env -u GH_TOKEN`. Returns the rewritten command, or null if the
292
+ // shape is not a leading-`gh` + allowlisted-stdin-readers pipeline. Absolute
293
+ // `/usr/bin/env` (not bare `env`) so the strip can't be defeated by a PATH-
294
+ // shadowed `env`; a missing binary exits 127, failing closed.
295
+ function analyzeReaderPipeline(command: string, stripRepoFlag: boolean): string | null {
296
+ const stages = splitTopLevelPipeStages(command)
297
+ if (stages === null || stages.length < 2) return null
298
+
299
+ const ghStage = (stages[0] as string).trim()
300
+ if (!isSingleBareGhCommand(ghStage)) return null
301
+
302
+ for (let i = 1; i < stages.length; i++) {
303
+ if (!isStdinOnlyReaderStage((stages[i] as string).trim())) return null
304
+ }
305
+
306
+ const rewrittenGh = stripRepoFlag ? stripRepoFlagFromCommand(ghStage) : ghStage
307
+ const rewrittenReaders = stages.slice(1).map((s) => `/usr/bin/env -u GH_TOKEN ${s.trim()}`)
308
+ return [rewrittenGh, ...rewrittenReaders].join(' | ')
309
+ }
310
+
311
+ // Quote-aware split on top-level `|`. Returns null if any OTHER shell-active
312
+ // metachar appears outside single quotes (`;` `&` `<` `>` backtick `$` `(` `)`
313
+ // `{` `}` newline) or if a `||`/`|&` is seen — those are not simple pipelines.
314
+ function splitTopLevelPipeStages(command: string): string[] | null {
315
+ const stages: string[] = []
316
+ let current = ''
317
+ let quote: '"' | "'" | null = null
318
+ for (let i = 0; i < command.length; i++) {
319
+ const ch = command[i] as string
320
+ if (quote === "'") {
321
+ if (ch === "'") quote = null
322
+ current += ch
323
+ continue
324
+ }
325
+ if (quote === '"') {
326
+ if (ch === '$' || ch === '`') return null
327
+ if (ch === '"') quote = null
328
+ current += ch
329
+ continue
330
+ }
331
+ if (ch === "'" || ch === '"') {
332
+ quote = ch
333
+ current += ch
334
+ continue
335
+ }
336
+ if (ch === '|') {
337
+ const next = command[i + 1]
338
+ if (next === '|' || next === '&') return null
339
+ stages.push(current)
340
+ current = ''
341
+ continue
342
+ }
343
+ if (SHELL_ACTIVE_METACHARS.has(ch) && ch !== '|') return null
344
+ current += ch
345
+ }
346
+ if (quote !== null) return null
347
+ stages.push(current)
348
+ return stages
349
+ }
350
+
351
+ function containsShellActiveMetachar(stage: string): boolean {
352
+ let quote: '"' | "'" | null = null
353
+ for (let i = 0; i < stage.length; i++) {
354
+ const ch = stage[i] as string
355
+ if (quote === "'") {
356
+ if (ch === "'") quote = null
357
+ continue
358
+ }
359
+ if (quote === '"') {
360
+ if (ch === '$' || ch === '`') return true
361
+ if (ch === '"') quote = null
362
+ continue
363
+ }
364
+ if (ch === "'" || ch === '"') {
365
+ quote = ch
366
+ continue
367
+ }
368
+ if (SHELL_ACTIVE_METACHARS.has(ch)) return true
369
+ }
370
+ return false
371
+ }
372
+
373
+ // Whitespace-splits a single stage into argv-ish tokens, stripping surrounding
374
+ // quotes so a quoted filter like `'.[] | {id}'` becomes one token. Quote-aware
375
+ // so whitespace inside quotes does not split.
376
+ function splitStageTokens(stage: string): string[] {
377
+ const tokens: string[] = []
378
+ let current = ''
379
+ let has = false
380
+ let quote: '"' | "'" | null = null
381
+ for (let i = 0; i < stage.length; i++) {
382
+ const ch = stage[i] as string
383
+ if (quote !== null) {
384
+ if (ch === quote) quote = null
385
+ else current += ch
386
+ continue
387
+ }
388
+ if (ch === "'" || ch === '"') {
389
+ quote = ch
390
+ has = true
391
+ continue
392
+ }
393
+ if (ch === ' ' || ch === '\t') {
394
+ if (has) {
395
+ tokens.push(current)
396
+ current = ''
397
+ has = false
398
+ }
399
+ continue
400
+ }
401
+ current += ch
402
+ has = true
403
+ }
404
+ if (has) tokens.push(current)
405
+ return tokens
152
406
  }
153
407
 
154
408
  // Removes an unquoted `-R`/`--repo` flag (and its repo-slug value) from a single
@@ -61,13 +61,15 @@ Prioritize in this order:
61
61
 
62
62
  1. **Correctness.** Does the change do what its description claims? Off-by-one errors, missing null/undefined handling, race conditions, incorrect error propagation, broken invariants.
63
63
  2. **Security.** Injection vectors (SQL, shell, HTML), missing authz/authn checks, secret leakage in logs or error messages, unsafe deserialization, SSRF, path traversal, time-of-check-time-of-use. Cite OWASP / CWE / RFC by number when relevant; verify with \`web_search\` or \`web_fetch\` before asserting.
64
- 3. **Architecture fit.** Does the change respect existing layering? Does it introduce a new dependency where the existing pattern would have worked? Does it duplicate logic that already exists elsewhere in the repo?
65
- 4. **Test coverage.** New behavior should have new tests. Edge cases the description names should be tested. If existing tests were deleted or skipped, that is a blocker absent a stated reason. Look past the raw test count, but only flag a redundant case when you can show the *inputs themselves* reach the same path — same branch, same validation rule, same boundary — not merely that the assertion shape is identical. Table-driven and parametrized tests legitimately share one assertion across many inputs while each input exercises a distinct branch, parser, or edge case; that is coverage, not duplication. The finding is "these inputs are indistinguishable to the code under test," and you must name the path they collapse onto — never "the assertions look the same."
66
- 5. **Error handling.** Empty catch blocks, swallowed errors, errors converted to silent fallbacks, retry loops without bounded backoff, missing timeouts on external calls.
67
- 6. **Performance.** Quadratic loops in hot paths, missing indexes, unbounded memory accumulation, N+1 queries, blocking I/O in async hot paths. Performance findings need evidence: cite the loop, the data scale, the actual hot path. "Could be slow" without evidence is not a finding.
68
- 7. **API surface.** Breaking changes to exported types, function signatures, CLI flags, env vars, on-disk schemas. Are they documented? Versioned? Migration noted in CHANGELOG / release notes?
69
- 8. **Naming.** Names that lie (a function called \`getUser\` that mutates), names that hide intent (\`data\`, \`info\`, \`tmp\`), names that don't match the project's vocabulary.
70
- 9. **Change hygiene.** Temporary scaffolding that escaped into the change: \`wip\`/\`fixup!\`/\`squash!\` commits left in the history, debug logging, commented-out code, leftover \`TODO\` markers for work the PR claims to finish. When you flag a stray commit, name the commit it should fold into so the author can squash it — don't just say "this looks temporary".
64
+ 3. **Architecture fit and intent drift.** Does the change respect existing layering? Does it introduce a new dependency where the existing pattern would have worked? Does it duplicate logic that already exists elsewhere in the repo? Beyond local fit, check for **intent drift** — the change technically compiles and passes its own tests, but quietly diverges from the design intent the surrounding code was built on: a "temporary" branch that bypasses an established abstraction, a special-case that erodes an invariant the module exists to protect, a layer reaching past its boundary because that was the shortest path. The diff can be locally correct and still pull the system away from the shape the author (or the codebase's own conventions) intended. When the description states an intent — "without changing the public API", "purely a refactor", "no behavior change" — verify the diff actually holds that line; a refactor that alters observable behavior, or an "internal only" change that shifts an exported contract, is drift even if nothing is strictly broken. Anchor the finding to the line where the divergence enters and name the intent it violates.
65
+ 4. **Regression risk and blast radius.** A change is rarely self-contained. For every function signature, return shape, exported type, default value, thrown-error type, or side-effecting behavior the diff alters, ask **who depended on the old behavior**. \`grep\` for callers of changed exports; trace the call sites that touch a modified invariant. A contract change that is correct *here* can silently break a caller the diff never shows — that caller is the regression, and it will not appear in the test count for this PR. Removed or loosened validation, a narrowed accepted-input range, a changed enum value, an altered ordering guarantee, a default that flipped: each is a regression vector for existing consumers even when the new code reads fine in isolation. State the blast radius explicitly — which call sites, which inputs, which downstream behavior changes — so the author knows whether this is a \`concern\` or a \`blocker\`. "Looks fine in the diff" is not a regression clearance; the diff is exactly where regressions hide their other half.
66
+ 5. **Side effects and ripple.** Watch for effects that reach outside the lines being changed: mutation of shared or global state, a cache that now needs invalidating, an event/log/metric whose shape downstream consumers parse, a config or feature flag whose new value changes behavior elsewhere, a migration that must run in lockstep, a resource (file handle, connection, lock, subscription) opened on a new path and never released. The dangerous side effect is the one whose *consequence* isn't obvious from the changed line alone — a behavior that emerges from the interaction between the changed code and code it touches indirectly. There is still a line that introduces it: anchor the finding to the mutation, lifecycle, or config line where the ripple enters, then name the downstream consumer or shared state that breaks and say what goes wrong when it is not accounted for. If the change touches a shared resource's lifecycle, verify the cleanup path (\`finally\`, \`defer\`, \`using\`, teardown hook) covers the new branch too — a leak introduced on an error path is a side effect that only shows up under load.
67
+ 6. **Test coverage.** New behavior should have new tests. Edge cases the description names should be tested. If existing tests were deleted or skipped, that is a blocker absent a stated reason. Look past the raw test count, but only flag a redundant case when you can show the *inputs themselves* reach the same path — same branch, same validation rule, same boundary — not merely that the assertion shape is identical. Table-driven and parametrized tests legitimately share one assertion across many inputs while each input exercises a distinct branch, parser, or edge case; that is coverage, not duplication. The finding is "these inputs are indistinguishable to the code under test," and you must name the path they collapse onto — never "the assertions look the same."
68
+ 7. **Error handling.** Empty catch blocks, swallowed errors, errors converted to silent fallbacks, retry loops without bounded backoff, missing timeouts on external calls.
69
+ 8. **Performance.** Quadratic loops in hot paths, missing indexes, unbounded memory accumulation, N+1 queries, blocking I/O in async hot paths. Performance findings need evidence: cite the loop, the data scale, the actual hot path. "Could be slow" without evidence is not a finding.
70
+ 9. **API surface.** Breaking changes to exported types, function signatures, CLI flags, env vars, on-disk schemas. Are they documented? Versioned? Migration noted in CHANGELOG / release notes?
71
+ 10. **Naming.** Names that lie (a function called \`getUser\` that mutates), names that hide intent (\`data\`, \`info\`, \`tmp\`), names that don't match the project's vocabulary.
72
+ 11. **Change hygiene.** Temporary scaffolding that escaped into the change: \`wip\`/\`fixup!\`/\`squash!\` commits left in the history, debug logging, commented-out code, leftover \`TODO\` markers for work the PR claims to finish. When you flag a stray commit, name the commit it should fold into so the author can squash it — don't just say "this looks temporary".
71
73
 
72
74
  ## What NOT to find
73
75
 
@@ -80,8 +82,8 @@ Prioritize in this order:
80
82
 
81
83
  ## Severity hints specific to code
82
84
 
83
- - **blocker** — Correctness bug that will misbehave for users. Security vulnerability. Broken backward compatibility without migration. Crashing path on common input. Deleted tests without justification.
84
- - **concern** — Likely-bad outcome that hasn't bitten yet (missing timeout, unbounded retry, edge case ignored). Test gap on the new behavior. Architectural deviation that compounds.
85
+ - **blocker** — Correctness bug that will misbehave for users. Security vulnerability. Broken backward compatibility without migration. Crashing path on common input. Deleted tests without justification. A regression that breaks an existing caller you can name, or a side effect (leaked resource, un-invalidated cache, mutated shared state) that corrupts behavior outside the diff.
86
+ - **concern** — Likely-bad outcome that hasn't bitten yet (missing timeout, unbounded retry, edge case ignored). Test gap on the new behavior. Architectural deviation or intent drift that compounds. A plausible regression or side effect whose reach you suspect but cannot fully trace — say what you'd check to confirm, and let the blast radius decide whether it's really a blocker.
85
87
  - **nit** — Naming, micro-readability, suboptimal-but-correct code. Optional. The author can decline and you should not push back.
86
88
  - **praise** — Non-obvious good design: a tricky invariant carefully preserved, a test that catches a subtle regression, a name that captures the domain precisely. Rare on purpose.
87
89