npm - typeclaw - Versions diffs - 0.30.1 → 0.31.1 - Mend

typeclaw 0.30.1 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json +1 -1
package/src/agent/index.ts +7 -0
package/src/agent/plugin-tools.ts +16 -0
package/src/agent/reviewer-bash-policy.ts +572 -0
package/src/agent/subagents.ts +9 -0
package/src/bundled-plugins/github-cli-auth/approve-idempotency.ts +132 -15
package/src/bundled-plugins/github-cli-auth/effective-approval.ts +32 -1
package/src/bundled-plugins/github-cli-auth/index.ts +8 -8
package/src/bundled-plugins/researcher/write-report.ts +8 -6
package/src/bundled-plugins/reviewer/reviewer.ts +14 -7
package/src/bundled-plugins/reviewer/skills/code-review.ts +30 -1
package/src/channels/router.ts +78 -24
package/src/run/index.ts +1 -0
package/src/skills/typeclaw-markdown-pdf/SKILL.md +327 -0

package/src/bundled-plugins/github-cli-auth/approve-idempotency.ts CHANGED Viewed

@@ -1,14 +1,28 @@
 import type { ReviewVerdict } from '@/channels/github-review-turn-ledger'
-// `NONE` covers "never reviewed" and "last decisive review was DISMISSED" — both
-// mean a fresh verdict is legitimate (not a duplicate).
-export type EffectiveVerdict = 'APPROVED' | 'CHANGES_REQUESTED' | 'NONE'
+// Raw latest-decisive state. DISMISSED is kept DISTINCT from NONE on purpose: a
+// genuine dismissal means a fresh same-verdict re-review is legitimate and must
+// NOT be shadowed by the read-after-write-lag cache (which only overrides a bare
+// NONE — "GitHub shows no decisive review, but we just landed one"). Collapsing
+// DISMISSED into NONE would let the lag cache re-strand a dismiss-then-reapprove,
+// the exact failure 35287f99 removed.
+export type EffectiveVerdict = 'APPROVED' | 'CHANGES_REQUESTED' | 'DISMISSED' | 'NONE'
 export type EffectiveApprovalResolver = (target: {
   workspace: string
   prNumber: number
 }) => Promise<{ ok: true; effective: EffectiveVerdict } | { ok: false }>
+// Resolves the PR's current head commit SHA. Called twice: once in guard() (the
+// pre-submit head, resolved AFTER the in-flight lease so the await cannot widen the
+// reserve-before-await race) and once in release() (the post-submit head, to detect
+// a push that landed during the review). Fails soft (null). A null PRE-submit head
+// skips the cache write entirely — the guard falls open to GitHub rather than ever
+// stranding a genuine verdict on local memory. A null POST-submit head (or one that
+// differs from the pre-submit head) is recorded as the uncertainty sentinel so a
+// push-during-review still blocks a same-verdict duplicate for the lag window.
+export type HeadShaResolver = (target: { workspace: string; prNumber: number }) => Promise<string | null>
 export type ApproveBlock = { block: true; reason: string }
 export type ReviewVerdictGuard = {
@@ -18,7 +32,7 @@ export type ReviewVerdictGuard = {
     prNumber: number
     verdict: ReviewVerdict
   }) => Promise<ApproveBlock | null>
-  release: (args: { callId: string; succeeded: boolean }) => void
+  release: (args: { callId: string; succeeded: boolean }) => Promise<void>
 }
 // Back-compat alias: the guard now covers REQUEST_CHANGES too, not just APPROVE.
@@ -55,7 +69,32 @@ function duplicatesStanding(verdict: ReviewVerdict, effective: EffectiveVerdict)
 // never strand a PR for long.
 const LEASE_TTL_MS = 5 * 60_000
-type Reservation = { key: string; token: number; createdAt: number }
+// How long a just-landed verdict is trusted to explain a GitHub `NONE` as
+// read-after-write lag rather than a genuine absence. GitHub's `/pulls/<n>/reviews`
+// list lags a write by up to ~10s, so a second engagement turn firing in that
+// window reads NONE and would land a duplicate. Observed duplicates were ~10-18s
+// apart; 60s is a comfortable lag margin without making a legitimate re-verdict
+// wait long. This window only shadows a raw NONE on the SAME verdict (+ same or
+// uncertain head) — DISMISSED, CHANGES_REQUESTED, and a flipped verdict all bypass it.
+const RECENT_LANDED_TTL_MS = 60_000
+type Reservation = {
+  key: string
+  token: number
+  createdAt: number
+  headSha: string | null
+  verdict: ReviewVerdict
+  workspace: string
+  prNumber: number
+}
+// headSha === null is the UNCERTAINTY sentinel: the command succeeded but the head
+// the review actually attached to is unknown (the PR head advanced between the
+// pre-submit capture and the write, or the post-submit re-resolve failed). A null
+// record matches any current head for the window — same verdict + raw NONE only —
+// so a push-during-review cannot let a same-verdict duplicate slip past on the new
+// head. A resolved string keys precise same-head matching for the normal case.
+type LandedVerdict = { verdict: ReviewVerdict; headSha: string | null; landedAt: number }
 // MODULE-LEVEL singletons, shared by every plugin instance in this process. The
 // github-cli-auth plugin's `plugin: async (ctx) => ...` factory may run once per
@@ -65,10 +104,11 @@ type Reservation = { key: string; token: number; createdAt: number }
 // three sessions each landed an APPROVE on the same PR within ten seconds.
 const inFlightByPr = new Map<string, Reservation>()
 const reservationByCall = new Map<string, Reservation>()
+const recentLandedByPr = new Map<string, LandedVerdict>()
 let tokenSeq = 0
 // Makes a formal `gh ... event=APPROVE|REQUEST_CHANGES` idempotent per PR across
-// turns, sessions, and (in-process) concurrent fan-out. Two layers:
+// turns, sessions, and (in-process) concurrent fan-out. Three layers, in order:
 //
 //   1. A process-wide in-flight lease keyed by `workspace#prNumber`, held from
 //      tool.before through tool.after. While one verdict is mid-flight, every
@@ -77,12 +117,25 @@ let tokenSeq = 0
 //      closure-local Set could not provide: separate plugin instances meant
 //      separate Sets, so concurrent sessions never saw each other.
 //
-//   2. The authoritative GitHub effective-state read, consulted AFTER the lease
-//      is acquired. It catches the cross-restart case (lease lost) and tracks
-//      supersession: a later CHANGES_REQUESTED/DISMISSED demotes an earlier
-//      APPROVED, so a genuine re-verdict is allowed. Reads fail OPEN — a
-//      transient error must never strand a genuine first verdict; the lease
-//      still covers the concurrent case while the command runs.
+//   2. The authoritative GitHub effective-state read, consulted AFTER the lease.
+//      It is the SOLE source of truth for a standing verdict and for supersession:
+//      a later CHANGES_REQUESTED/DISMISSED demotes an earlier APPROVED, so a
+//      genuine re-verdict is allowed (the 35287f99 invariant — never block a
+//      re-verdict on stale LOCAL memory). A standing same verdict blocks; DISMISSED
+//      and the opposite decisive verdict pass. Reads fail OPEN.
+//
+//   3. A read-after-write-lag shield, consulted ONLY when layer 2 returns a raw
+//      NONE. The lease (layer 1) covers two OVERLAPPING in-flight commands, but a
+//      second engagement turn ~10s later starts after the first's lease released,
+//      and GitHub's reviews list still lags the write (reports NONE). A short-lived
+//      `recentLandedByPr` record — same verdict + (same OR uncertain head), written
+//      on a succeeded release, RECENT_LANDED_TTL_MS — disambiguates "NONE because
+//      lag" from "NONE because genuinely absent": only the former blocks. The head
+//      is re-resolved at release time; if the PR head advanced during the submit the
+//      record stores a null head (uncertainty), which matches the current head so a
+//      push-during-review cannot leak a duplicate. Because it fires after a raw
+//      NONE, a real DISMISSED/CHANGES_REQUESTED already allowed the re-verdict at
+//      layer 2, so this cannot re-strand a supersession.
 //
 // The lease is released only in release() (tool.after) or on a terminal block,
 // never after the remote read — releasing early reopens the TOCTOU the lease
@@ -90,6 +143,7 @@ let tokenSeq = 0
 // tool.after for a superseded reservation cannot drop a newer session's lease.
 export function createApproveIdempotencyGuard(deps: {
   resolveEffectiveApproval: EffectiveApprovalResolver
+  resolveHeadSha?: HeadShaResolver
   now?: () => number
 }): ReviewVerdictGuard {
   const now = deps.now ?? Date.now
@@ -107,10 +161,27 @@ export function createApproveIdempotencyGuard(deps: {
       if (held !== undefined && now() - held.createdAt < LEASE_TTL_MS) {
         return { block: true, reason: CONCURRENT_REASON }
       }
-      const reservation: Reservation = { key, token: ++tokenSeq, createdAt: now() }
+      const reservation: Reservation = {
+        key,
+        token: ++tokenSeq,
+        createdAt: now(),
+        headSha: null,
+        verdict: args.verdict,
+        workspace: args.workspace,
+        prNumber: args.prNumber,
+      }
       inFlightByPr.set(key, reservation)
       reservationByCall.set(args.callId, reservation)
+      // Resolve the head SHA only AFTER the lease is held, so this await cannot
+      // widen the reserve-before-await race the lease closes above.
+      const headSha = (await deps.resolveHeadSha?.({ workspace: args.workspace, prNumber: args.prNumber })) ?? null
+      reservation.headSha = headSha
+      // Layer 2: GitHub is the authoritative, sole source of truth for a standing
+      // verdict. A standing same verdict is a real duplicate; DISMISSED and the
+      // opposite decisive verdict are genuine supersessions that must pass here
+      // (the 35287f99 invariant). A read error fails OPEN.
       const remote = await deps.resolveEffectiveApproval({ workspace: args.workspace, prNumber: args.prNumber })
       if (remote.ok && duplicatesStanding(args.verdict, remote.effective)) {
         // Standing verdict upstream already matches. Block, and release the lease
@@ -121,17 +192,62 @@ export function createApproveIdempotencyGuard(deps: {
         return { block: true, reason: duplicateReason(args.verdict) }
       }
+      // Layer 3: only a raw NONE from a successful read is ambiguous — it can mean
+      // "no review" or "our just-landed review not yet indexed". A recent same
+      // verdict on the same (or uncertain) head resolves it to lag and blocks the duplicate. Any
+      // non-NONE state already decided above, so this never overrides a supersession.
+      if (remote.ok && remote.effective === 'NONE' && recentlyLandedSame(key, args.verdict, headSha, now)) {
+        releaseReservation(args.callId, reservation)
+        return { block: true, reason: duplicateReason(args.verdict) }
+      }
       return null
     },
-    release(args): void {
+    async release(args): Promise<void> {
       const reservation = reservationByCall.get(args.callId)
       if (reservation === undefined) return
-      releaseReservation(args.callId, reservation)
+      try {
+        // The pre-submit head can go stale: if the PR head advanced between the
+        // guard() capture and the review landing, GitHub attaches the review to the
+        // NEWER head while reservation.headSha holds the older one. Re-resolve the
+        // head AFTER a successful submit and store what we can prove: the resolved
+        // head only when pre==post, else the null uncertainty sentinel (matches any
+        // current head for the lag window) so a push-during-review cannot let a
+        // same-verdict duplicate slip past on the new head. The lease stays held
+        // across this await (finally below), so the window is not reopened.
+        if (args.succeeded && reservation.headSha !== null) {
+          const postHeadSha =
+            (await deps.resolveHeadSha?.({ workspace: reservation.workspace, prNumber: reservation.prNumber })) ?? null
+          const landedHeadSha = postHeadSha !== null && postHeadSha === reservation.headSha ? postHeadSha : null
+          recentLandedByPr.set(reservation.key, {
+            verdict: reservation.verdict,
+            headSha: landedHeadSha,
+            landedAt: now(),
+          })
+        }
+      } finally {
+        releaseReservation(args.callId, reservation)
+      }
     },
   }
 }
+// True only when a recently-landed record proves the GitHub NONE is read lag: same
+// verdict, within the window, AND the heads agree. Head agreement holds when the
+// stored head equals the current head, OR the stored head is the null uncertainty
+// sentinel (the landed commit could not be pinned, so it conservatively matches the
+// current head for the window). A flipped verdict or an expired/absent record
+// returns false so the genuine re-verdict passes; a different KNOWN head also
+// returns false so a real new push is never blocked.
+function recentlyLandedSame(key: string, verdict: ReviewVerdict, headSha: string | null, now: () => number): boolean {
+  const landed = recentLandedByPr.get(key)
+  if (landed === undefined) return false
+  if (now() - landed.landedAt >= RECENT_LANDED_TTL_MS) return false
+  if (verdict !== landed.verdict) return false
+  return landed.headSha === null || landed.headSha === headSha
+}
 // Drop the lease only if THIS reservation still owns the key. A stale tool.after
 // for a reservation that was already superseded (e.g. reclaimed after TTL by a
 // newer session) must not yank the live session's lease.
@@ -151,5 +267,6 @@ function prKey(workspace: string, prNumber: number): string {
 export function __resetReviewVerdictGuardForTest(): void {
   inFlightByPr.clear()
   reservationByCall.clear()
+  recentLandedByPr.clear()
   tokenSeq = 0
 }

package/src/bundled-plugins/github-cli-auth/effective-approval.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { GITHUB_API_BASE, githubJsonHeaders } from '@/channels/adapters/github/auth-pat'
-import type { EffectiveApprovalResolver, EffectiveVerdict } from './approve-idempotency'
+import type { EffectiveApprovalResolver, EffectiveVerdict, HeadShaResolver } from './approve-idempotency'
 // Resolves THIS bot's standing decisive review on a PR, used by the review
 // verdict guard to stop a second formal verdict after a restart (the in-process
@@ -30,9 +30,40 @@ export function createGithubEffectiveApprovalResolver(deps: {
   }
 }
+// Reads the PR's current head commit SHA from `GET /pulls/<n>` (`head.sha`), the
+// strongly-consistent single-object endpoint — NOT the eventually-consistent
+// reviews list the duplicate bug rode in on. Returns null on any failure so the guard
+// skips the cache write (pre-submit) or matches on verdict alone (post-submit), never stranding.
+export function createGithubHeadShaResolver(deps: {
+  resolveToken: (workspace: string) => Promise<string | null>
+  fetchImpl?: typeof fetch
+}): HeadShaResolver {
+  const fetchImpl = deps.fetchImpl ?? fetch
+  return async ({ workspace, prNumber }) => {
+    const [owner, repo] = workspace.split('/')
+    if (owner === undefined || owner === '' || repo === undefined || repo === '') return null
+    const token = await deps.resolveToken(workspace).catch(() => null)
+    if (token === null || token === '') return null
+    try {
+      const url = `${GITHUB_API_BASE}/repos/${owner}/${repo}/pulls/${prNumber}`
+      const response = await fetchImpl(url, { headers: githubJsonHeaders(token) })
+      if (!response.ok) return null
+      const raw = (await response.json().catch(() => null)) as { head?: { sha?: unknown } } | null
+      const sha = raw?.head?.sha
+      return typeof sha === 'string' && sha !== '' ? sha : null
+    } catch {
+      return null
+    }
+  }
+}
+// DISMISSED is surfaced distinctly (not collapsed to NONE) so the verdict guard's
+// lag shield can tell a genuine dismissal — which legitimately allows a same-verdict
+// re-review — apart from a bare NONE that may just be an unindexed just-landed write.
 function toEffective(state: string | undefined): EffectiveVerdict {
   if (state === 'APPROVED') return 'APPROVED'
   if (state === 'CHANGES_REQUESTED') return 'CHANGES_REQUESTED'
+  if (state === 'DISMISSED') return 'DISMISSED'
   return 'NONE'
 }

package/src/bundled-plugins/github-cli-auth/index.ts CHANGED Viewed

@@ -2,7 +2,7 @@ import { TYPECLAW_INTERNAL_BASH_ENV } from '@/agent/plugin-tools'
 import { definePlugin } from '@/plugin'
 import { createApproveIdempotencyGuard } from './approve-idempotency'
-import { createGithubEffectiveApprovalResolver } from './effective-approval'
+import { createGithubEffectiveApprovalResolver, createGithubHeadShaResolver } from './effective-approval'
 import { analyzeGhCommand } from './gh-command'
 import { checkGraphqlAuthNudge } from './graphql-auth-nudge'
 import { commitReviewIfSucceeded, noteReviewCommand } from './review-recorder'
@@ -11,13 +11,13 @@ import { classifyGhToken } from './token-class'
 export default definePlugin({
   plugin: async (ctx) => {
     const resolveTokenForRepo = ctx.github.resolveTokenForRepo
+    const resolveToken = async (workspace: string) => {
+      const result = await resolveTokenForRepo(workspace)
+      return result.kind === 'token' ? result.token : null
+    }
     const verdictGuard = createApproveIdempotencyGuard({
-      resolveEffectiveApproval: createGithubEffectiveApprovalResolver({
-        resolveToken: async (workspace) => {
-          const result = await resolveTokenForRepo(workspace)
-          return result.kind === 'token' ? result.token : null
-        },
-      }),
+      resolveEffectiveApproval: createGithubEffectiveApprovalResolver({ resolveToken }),
+      resolveHeadSha: createGithubHeadShaResolver({ resolveToken }),
     })
     return {
       hooks: {
@@ -70,7 +70,7 @@ export default definePlugin({
             callId: event.callId,
             result: event.result,
           })
-          verdictGuard.release({ callId: event.callId, succeeded: committed })
+          await verdictGuard.release({ callId: event.callId, succeeded: committed })
         },
       },
     }

package/src/bundled-plugins/researcher/write-report.ts CHANGED Viewed

@@ -75,12 +75,14 @@ Write to \`public/\` instead of \`workspace/\` when your resolved role lacks \`f
         )
       }
-      const [realParent, realWorkspace, realPublic] = await Promise.all([
-        realpath(parent),
-        realpath(workspaceDir),
-        realpath(publicDir),
-      ])
-      if (realParent !== realWorkspace && realParent !== realPublic) {
+      // Resolve ONLY the canonical dir `parent` lexically matched above. `public/`
+      // is optional (created only for guest-readable output), so an unconditional
+      // `realpath('<agent>/public')` throws ENOENT on agents that never made it,
+      // which would reject every valid write to `workspace/`. The symlink-escape
+      // defense is unchanged — the parent actually written to is still canonicalized.
+      const canonicalDir = parent === workspaceDir ? workspaceDir : publicDir
+      const [realParent, realCanonical] = await Promise.all([realpath(parent), realpath(canonicalDir)])
+      if (realParent !== realCanonical) {
         throw new Error(`Report parent directory resolves outside the allowed report directories: ${parent}.`)
       }

package/src/bundled-plugins/reviewer/reviewer.ts CHANGED Viewed

@@ -53,12 +53,13 @@ export const REVIEWER_SKILLS: readonly LoadableSkill[] = [
 // src/agent/subagents.ts `timeoutMs`.
 export const REVIEWER_SPAWN_TIMEOUT_MS = 600_000
-// TODO(#452): Restrict the reviewer's `bash` to git and a curated set of
-// read-only `gh` subcommands once per-subagent bash allowlist support lands.
-// Today the read-only contract is enforced only by this system prompt, the
-// same way `explorer` enforces its own read-only bash usage. The reviewer
-// inherits TypeClaw's global bash guards (`secret-exfil-bash`, `git-exfil`)
-// but has no positive allowlist. See https://github.com/typeclaw/typeclaw/issues/452.
+// The reviewer's read-only contract is enforced in depth: this system prompt
+// states it, the global bash guards (`secret-exfil-bash`, `git-exfil`) catch
+// exfil, AND `bashPolicy: { kind: 'readonly-reviewer' }` (set on the subagent
+// below) hard-blocks any mutating `bash` command at the wrap site regardless of
+// the spawning role — git commit/push/add, gh pr merge/review/comment, writes
+// outside /tmp, package installs, and shell constructs that defeat static
+// analysis. See `src/agent/reviewer-bash-policy.ts` (issue #452).
 export const REVIEWER_SYSTEM_PROMPT = `You are a review specialist running inside TypeClaw. Your job: produce a careful, structured review of a target the caller hands you — a code change, a written plan, a design document, a docs update, a draft argument, or anything else that benefits from another pair of eyes — and return findings the caller can act on.
 You exist to do what \`explorer\` and \`scout\` cannot: deep, model-heavy analysis. Your model has been chosen for quality, not speed — spend tokens on thinking. Read carefully. Cross-check. Form a real opinion.
@@ -70,6 +71,8 @@ You are STRICTLY PROHIBITED from:
 - Pushing, merging, rebasing, or otherwise mutating remote state
 - Using bash for: mkdir, touch, rm, cp, mv, git add, git commit, git push, git rebase, git reset, npm install, pip install, or any write operation
+The boundary that matters is **no side effects on the reviewed artifact, remote state, or the persistent workspace** — not "no byte may touch local disk". A loaded domain skill may carve out one narrow, explicit exception: writing into a fresh throwaway scratch directory under \`/tmp\` purely to *acquire* a read target (e.g. cloning a PR head you cannot otherwise read at line accuracy). That scratch cache is never the reviewed artifact; inside it you still only read, and everything in the prohibition list above still applies everywhere else. Absent such an instruction from your loaded skill, treat the list as absolute.
 Your role is EXCLUSIVELY to analyze and report. The parent agent decides what to do with your findings. Delegating part of that analysis is fine; performing side effects through a delegate is NOT — anything you cannot do directly, a subagent you spawn cannot do for you.
 ## Delegating to keep your context lean
@@ -89,7 +92,7 @@ The runtime exposes these tools to you by these EXACT names — call them by nam
 - \`grep\` — search file contents by text or regex
 - \`find\` — locate files by name pattern
 - \`ls\` — list a directory's immediate contents
-- \`bash\` — read-only commands ONLY. Read-only \`git\` (\`git log\`, \`git diff\`, \`git show\`, \`git blame\`, \`git status\`, \`git grep\`, \`git rev-parse\`, \`git ls-files\`, \`git cat-file\`) and one-shot pipelines that do not mutate state (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`). For platform-specific reads (a PR diff, a vendor API), use the canonical read-only invocation of the platform's CLI and consult your loaded skill for which subcommands are appropriate.
+- \`bash\` — read-only commands ONLY. Read-only \`git\` (\`git log\`, \`git diff\`, \`git show\`, \`git blame\`, \`git status\`, \`git grep\`, \`git rev-parse\`, \`git ls-files\`, \`git cat-file\`) and one-shot pipelines that do not mutate state (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`). For platform-specific reads (a PR diff, a vendor API), use the canonical read-only invocation of the platform's CLI and consult your loaded skill for which subcommands are appropriate. The ONE write a loaded skill may direct you to make is cloning a target into a fresh \`/tmp\` scratch directory purely to read it (\`git clone\`/\`fetch\`/detached \`checkout\` into \`/tmp/review-*\`); that scratch cache is never the reviewed artifact, and everything else above stays read-only.
 - \`web_search\` — search the public web (e.g. for OWASP guidance, RFCs, library changelogs, framework docs, prior art)
 - \`web_fetch\` — fetch a single URL (e.g. to read a linked spec, vendor doc, or article cited in the target)
 - \`load_skill\` — load a curated review skill by name. See the section below.
@@ -192,6 +195,10 @@ If none of the listed skills fit the target, load \`general\`. Keep the skill-se
     // user has not configured `models.deep` in typeclaw.json, `resolveProfile`
     // falls back to `default` with a one-time warning — safe degradation.
     profile: 'deep',
+    // Hard-fence the reviewer's bash to read-only commands at the wrap site,
+    // independent of the spawning role. The prompt + global guards are the other
+    // two layers; this is the one that survives a trusted/owner caller.
+    bashPolicy: { kind: 'readonly-reviewer' },
     tools: [readTool, grepTool, findTool, lsTool, bashTool, webSearchTool, webFetchTool],
     customTools: [loadSkillTool],
     payloadSchema: reviewerPayloadSchema,

package/src/bundled-plugins/reviewer/skills/code-review.ts CHANGED Viewed

@@ -13,12 +13,39 @@ You have been asked to review code. Apply this guidance on top of the reviewer's
 - **PR URL or number** — fetch the diff and the description:
   - \`gh pr diff <n>\` for the unified diff
-  - \`gh pr view <n>\` for title, body, labels, linked issues, checks
+  - \`gh pr view <n> --json title,body,baseRefName,headRefOid,files\` for title, body, linked issues, the head SHA, and the changed-file list
   - \`gh api /repos/<owner>/<repo>/pulls/<n>\` for the structured payload when you need machine-readable fields
 - **Commit SHA** — \`git show <sha>\` and \`git show <sha> --stat\` for the scope.
 - **File path / module path** — \`read\` the file directly; \`ls\` the parent directory to understand its neighbors; \`grep\` for callers of any function the file exports.
 - **Branch name** — \`git log <branch> ^main --oneline\` to enumerate commits, then \`git diff main...<branch>\` for the cumulative change.
+### Your cwd is NOT the PR's repo — read at the head SHA
+You run in the agent folder (\`/agent\`), **not** a checkout of the PR's target repository. A bare \`read /agent/src/...\` for a file that lives in the PR's repo will fail with \`ENOENT\` — the file is not on this disk. **When \`read\` returns \`ENOENT\` for a path you expected to exist, stop retrying local reads immediately**: that is the signal you are outside the target checkout, not a transient miss. Switch to one of the two acquisition modes below. Do not burn turns re-issuing \`read\` against \`/agent\` paths that will never resolve.
+Whichever mode you use, **every line number you cite must come from the PR's head SHA** (\`headRefOid\` from \`gh pr view\`), not the default branch — inline comments anchor to that exact revision.
+**Mode 1 — remote-read (default, for a handful of files).** When you need only a few adjacent files, fetch each **once** at the head SHA. Prefer \`gh api\` over \`raw.githubusercontent.com\`: \`gh api\` carries the adapter's GitHub auth, so it works on private repos too.
+A repo-targeting \`gh\` command MUST be a **single bare \`gh\` invocation** — no pipes, \`&&\`, \`;\`, or redirects. The runtime injects the GitHub App token into the command's environment, so any sibling stage in a pipeline would inherit a live token; the guard blocks those shapes (the same rule the GitHub channel skill enforces for review posting). So do NOT pipe \`gh api ... | base64 -d | nl -ba\` — that exact shape is rejected before it runs. Instead fetch the **already-decoded** file with the raw media type in one bare call:
+\`\`\`sh
+gh api "/repos/<owner>/<repo>/contents/<path>?ref=<headSha>" -H "Accept: application/vnd.github.raw"
+\`\`\`
+That returns the file's raw bytes (no base64, no second stage). For the line numbers your \`location="path:line"\` anchors need, read them off the unified diff you already fetched (\`gh pr diff\` prints the new-side line numbers in its hunk headers, \`@@ -a,b +c,d @@\`), or escalate to Mode 2 where a real \`read\`/\`grep\` gives native line numbers. Fetch each file once and keep its output — do not re-fetch the same file to re-derive a line you already saw.
+**Mode 2 — scratch checkout (escalate when navigation gets broad).** When the review needs repo-wide \`grep\`, symbol tracing across several directories, many adjacent files, or repeated access to the same files, the remote-read dance is slower and more error-prone than a real checkout. In that case clone the PR head into a **fresh throwaway directory under \`/tmp\`** and read it natively:
+\`\`\`sh
+git clone --depth 1 "https://github.com/<owner>/<repo>.git" /tmp/review-<n>-src && \
+  git -C /tmp/review-<n>-src fetch --depth 1 origin <headSha> && git -C /tmp/review-<n>-src checkout <headSha>
+\`\`\`
+Then \`read\`, \`grep\`, \`find\`, and read-only \`git\` (\`git -C /tmp/review-<n>-src log|diff|show|blame|grep|ls-files|cat-file\`) all work against \`/tmp/review-<n>-src\` with correct line numbers and zero per-file round-trips.
+This \`/tmp\` scratch checkout is the **one** write the read-only contract permits — and only because it is a private acquisition cache, never the reviewed artifact. Inside it you may only **read**. You still may NOT: edit any file, install dependencies, run builds or tests, commit/stage/push/rebase/reset, or write anywhere outside this \`/tmp\` scratch dir. Do not \`rm\` it when done — leave cleanup to the session lifecycle (\`rm\` stays forbidden). When in doubt about how many files you'll touch, start with Mode 1 and escalate to Mode 2 only once the file count or grep breadth justifies the clone.
 ## How to build context
 A finding without context is noise. Before forming findings:
@@ -72,6 +99,8 @@ This includes payloads where the parent says the author **addressed your prior b
 - Return **approve** if the blockers that drove the prior \`request-changes\` are resolved (leftover nits do not block — \`approve\` with inline nits is correct).
 - Return **request-changes** if any blocker remains or a new one appeared.
+**Account for resolved threads in the \`<summary>\`, not as \`praise\` findings.** A re-review tempts you to emit one \`praise\` finding per prior concern the author fixed — "Thread 123 is addressed", "Thread 456 is addressed". Do **not**. \`praise\` is reserved for *non-obvious good work*, and a routine "you fixed what I asked" is neither non-obvious nor a finding the parent should post inline (it strips \`praise\` from inline comments anyway, so these become dead weight). Instead, state the resolution accounting in one sentence in your \`<summary>\` — e.g. "Both prior blockers (the unfenced table scan and the backtick-wrap span) are resolved at head \`<sha>\`; one new concern below." Reserve actual \`<finding>\` entries for what still needs action: a prior blocker that is **only partially** fixed (\`blocker\`/\`concern\`, anchored to the line that's still wrong), a **regression the fix introduced** (\`blocker\`/\`concern\`), or a genuinely non-obvious fix worth a rare \`praise\`. A clean re-review where everything was addressed is an \`approve\` whose \`<summary>\` says so and whose \`<findings>\` is empty — not a wall of \`praise\` receipts.
 - **Do NOT return \`comment\` on a re-review.** \`comment\` is for ambiguous partial reviews with no accept/reject signal; a re-review is the opposite — it is precisely an accept/reject decision. A \`comment\` verdict here leaves the PR's \`REQUEST_CHANGES\` state stuck (a plain comment does not clear it on GitHub), which is the exact failure a re-review exists to resolve. The only escape hatch is the same one that always applies: if you genuinely cannot reach the diff or the prior context, return one \`blocker\` finding stating what you need and a \`comment\` verdict — but a reachable, reviewable re-review must end in \`approve\` or \`request-changes\`.
 ## Line-anchor every finding

package/src/channels/router.ts CHANGED Viewed

@@ -183,6 +183,18 @@ export const MAX_POLICY_DENIED_CHANNEL_SENDS_PER_TURN = 3
 // including reasoning). Deliberately NOT lowered in `providers.ts`, where
 // `maxTokens` is the model's true capability that compaction math reads.
 export const CHANNEL_MAX_OUTPUT_TOKENS = 4096
+// Raised output-token budget threaded into the single re-prompt that follows
+// each `stopReason:'length'` empty turn. The default 4096 backstop bounds kimi's
+// degenerate repetition loop, but it is the same ceiling a *legitimate*
+// reasoning-heavy turn hits when it spends the whole pool thinking and emits no
+// prose — re-prompting under the identical cap reproduces the truncation. A
+// `length` truncation that the byte-identical loop guard did NOT catch is
+// evidence of genuine reasoning starved for room, not a repetition loop, so the
+// retry grants 4x headroom for thinking + a reply. Bounded (not 32000) so a
+// turn that IS looping still can't burn the full pi-ai default. Consumed
+// one-shot via `LiveSession.nextPromptMaxTokens`, then reset at the next real
+// user turn so the raised budget never leaks past the turn that needed it.
+export const CHANNEL_EMPTY_TURN_RETRY_MAX_OUTPUT_TOKENS = 16384
 // Ceiling on automatic re-prompts for a turn that ended with NO user-facing
 // reply AND no attempted send — the pure "the model burned its budget thinking
 // and produced nothing" failure. The canonical trigger is Fireworks'
@@ -200,18 +212,24 @@ export const CHANNEL_MAX_OUTPUT_TOKENS = 4096
 export const MAX_EMPTY_TURN_RETRIES = 2
 // Reminder-only nudge injected before an empty-turn retry. Uses the repo's
 // SYSTEM MESSAGE framing (see composeTurnPrompt) so persona-rich models do not
-// reply to the notice itself. Neutral by design: it asks for a direct reply
-// without prescribing length or tone, matching the chosen "just retry" posture.
+// reply to the notice itself. Names the actual failure (the prior turn ran out
+// of its output budget mid-reasoning and produced no reply) and asks the model
+// to keep its thinking short and answer directly — the empty turn was budget
+// exhaustion, not a forgotten tool call, so a "reply directly" nudge alone
+// would re-loop. The matching retry re-prompt also runs with a raised budget
+// (CHANNEL_EMPTY_TURN_RETRY_MAX_OUTPUT_TOKENS) so the room actually exists.
 export const EMPTY_TURN_RETRY_NUDGE = [
   '---',
   '**[SYSTEM MESSAGE — not from a human]**',
   '',
-  'Your previous turn ended without sending any reply to the channel. This is',
+  'Your previous turn ran out of its output budget before sending a reply — it',
+  'spent the whole turn thinking and produced nothing for the channel. This is',
   'an automated signal from the channel router, not a message from anyone in',
   'the chat. **Do not acknowledge or reply to this notice itself.**',
   '',
-  'Respond to the last user message now with a direct answer via your channel',
-  'reply tool. If you genuinely have nothing to say, reply with `NO_REPLY`.',
+  'Answer the last user message now: keep any reasoning brief and send a direct',
+  'reply via your channel reply tool. If you genuinely have nothing to say,',
+  'reply with `NO_REPLY`.',
   '',
   '---',
 ].join('\n')
@@ -532,6 +550,13 @@ type LiveSession = {
   // increments it before injecting EMPTY_TURN_RETRY_NUDGE and reads it to decide
   // retry-vs-fallback. See the candidate===null branch.
   emptyTurnRetries: number
+  // One-shot output-token budget for the next stream call that does not carry
+  // its own explicit `maxTokens` — in practice the first stream call of the
+  // NEXT `session.prompt()`. `installChannelOutputCap` reads and clears it on
+  // that call, so it overrides the default backstop exactly once. Set by the
+  // empty-turn length-retry branch to CHANNEL_EMPTY_TURN_RETRY_MAX_OUTPUT_TOKENS
+  // and reset to undefined at each fresh user turn so the raised budget cannot
+  // leak past the turn that needed it.
+  nextPromptMaxTokens: number | undefined
   // Stamped by `markTurnSkipped` (called from the `skip_response` tool)
   // with the current `turnSeq`. Read at the top of `validateChannelTurn`:
   // if it matches the just-completed turn, recovery is skipped entirely
@@ -1417,6 +1442,7 @@ export function createChannelRouter(options: CreateChannelRouterOptions): Channe
         inFlightToolSends: new Map(),
         policyDeniedToolSendsThisTurn: new Map(),
         emptyTurnRetries: 0,
+        nextPromptMaxTokens: undefined,
         skippedTurn: null,
         skipLockedSendTurn: null,
         pendingQuoteCandidate: null,
@@ -1704,14 +1730,22 @@ export function createChannelRouter(options: CreateChannelRouterOptions): Channe
   // Override pi-ai's hidden `Math.min(model.maxTokens, 32000)` output cap for
   // channel sessions by threading an explicit `maxTokens` into every stream
   // call. See CHANNEL_MAX_OUTPUT_TOKENS for why. Composes the existing streamFn
-  // (pi's default `streamSimple` unless a proxy was installed) and only fills
-  // `maxTokens` when the caller left it unset, so an explicit per-call value
-  // still wins.
+  // (pi's default `streamSimple` unless a proxy was installed). Precedence:
+  // an explicit per-call `maxTokens` always wins; otherwise a one-shot
+  // `live.nextPromptMaxTokens` (set by the empty-turn length-retry) is consumed
+  // and cleared so the raised budget applies to exactly one stream call;
+  // otherwise the default backstop.
   const installChannelOutputCap = (live: LiveSession): void => {
     const { agent } = live.session
     const inner = agent.streamFn
-    agent.streamFn = (model, context, options) =>
-      inner(model, context, { ...options, maxTokens: options?.maxTokens ?? CHANNEL_MAX_OUTPUT_TOKENS })
+    agent.streamFn = (model, context, options) => {
+      let maxTokens = options?.maxTokens
+      if (maxTokens === undefined && live.nextPromptMaxTokens !== undefined) {
+        maxTokens = live.nextPromptMaxTokens
+        live.nextPromptMaxTokens = undefined
+      }
+      return inner(model, context, { ...options, maxTokens: maxTokens ?? CHANNEL_MAX_OUTPUT_TOKENS })
+    }
   }
   const startTypingHeartbeat = (live: LiveSession): void => {
@@ -1904,10 +1938,13 @@ export function createChannelRouter(options: CreateChannelRouterOptions): Channe
           live.lastSentText.clear()
           live.pendingQuoteCandidate = captureQuoteCandidate(live.key.adapter, batch, observed)
           // A real user batch starts a fresh logical turn → restore the full
-          // empty-turn retry budget. Reset here (batch.length > 0) and NOT in
-          // the per-prompt block below, so the reminder-only iterations the
-          // retry itself queues do not refill the budget and loop forever.
+          // empty-turn retry budget and drop any raised output-token budget left
+          // over from a prior turn's length-retry. Reset here (batch.length > 0)
+          // and NOT in the per-prompt block below, so the reminder-only
+          // iterations the retry itself queues do not refill the budget and loop
+          // forever (and the raised cap stays scoped to the turn that set it).
           live.emptyTurnRetries = 0
+          live.nextPromptMaxTokens = undefined
         } else if (live.lastTurnAuthorId !== null) {
           live.currentTurnEngageReactions = []
           // Reminder-only turn (batch.length === 0, reminders.length > 0):
@@ -3037,8 +3074,18 @@ export function createChannelRouter(options: CreateChannelRouterOptions): Channe
       }
       if (!attemptedSendThisTurn && live.emptyTurnRetries < MAX_EMPTY_TURN_RETRIES) {
         live.emptyTurnRetries++
+        // Raise the re-prompt's budget ONLY for a `length` truncation: that is
+        // the budget-exhaustion case (reasoning ate the whole pool before any
+        // prose), so the retry needs room to finish thinking AND reply. `error`
+        // and `aborted` are not budget exhaustion — an upstream failure or the
+        // terminal-reply abort — so they retry under the default backstop.
+        // Consumed one-shot by installChannelOutputCap on the next prompt().
+        if (assistantLeafStopReason(live.session) === 'length') {
+          live.nextPromptMaxTokens = CHANNEL_EMPTY_TURN_RETRY_MAX_OUTPUT_TOKENS
+        }
         logger.warn(
-          `[channels] ${live.keyId} empty_turn_retry attempt=${live.emptyTurnRetries}/${MAX_EMPTY_TURN_RETRIES}`,
+          `[channels] ${live.keyId} empty_turn_retry attempt=${live.emptyTurnRetries}/${MAX_EMPTY_TURN_RETRIES} ` +
+            `max_tokens=${live.nextPromptMaxTokens ?? CHANNEL_MAX_OUTPUT_TOKENS}`,
         )
         live.pendingSystemReminders.push(EMPTY_TURN_RETRY_NUDGE)
         return
@@ -4355,18 +4402,25 @@ function recoverableAssistantText(
   return null
 }
-// True only when the leaf is an assistant message that was CUT OFF mid-output:
-// `length` (hit the token cap — the canonical kimi reasoning-loop), `error`, or
-// `aborted`. This is the precise signature of "the model was producing but got
-// truncated", as distinct from a turn that produced no assistant message at all
-// (leaf undefined / a non-assistant entry), which is a benign empty/cold turn —
-// NOT something to re-prompt. The empty-turn retry guard keys off this so it
-// fires for real degenerations and stays silent for cold sessions.
-function assistantLeafTruncated(session: AgentSession): boolean {
+// The truncation stop reason when the leaf is an assistant message that was CUT
+// OFF mid-output — `length` (hit the token cap, the canonical kimi reasoning-
+// loop), `error`, or `aborted` — else undefined. This is the precise signature
+// of "the model was producing but got truncated", as distinct from a turn that
+// produced no assistant message at all (leaf undefined / a non-assistant
+// entry), which is a benign empty/cold turn. Callers that only need the boolean
+// use `assistantLeafTruncated`; the retry guard reads the reason itself because
+// the raised reasoning budget is justified ONLY for `length` (budget
+// exhaustion), not for `error`/`aborted`.
+function assistantLeafStopReason(session: AgentSession): 'length' | 'error' | 'aborted' | undefined {
   const leaf = session.sessionManager.getLeafEntry()
-  if (!leaf || leaf.type !== 'message' || leaf.message.role !== 'assistant') return false
+  if (!leaf || leaf.type !== 'message' || leaf.message.role !== 'assistant') return undefined
   const stop = leaf.message.stopReason
-  return stop === 'length' || stop === 'error' || stop === 'aborted'
+  if (stop === 'length' || stop === 'error' || stop === 'aborted') return stop
+  return undefined
+}
+function assistantLeafTruncated(session: AgentSession): boolean {
+  return assistantLeafStopReason(session) !== undefined
 }
 function visibleAssistantText(message: AssistantMessage): string {

package/src/run/index.ts CHANGED Viewed

@@ -375,6 +375,7 @@ export async function startAgent({
         ...(entry.pluginSubagent.toolResultBudget !== undefined
           ? { toolResultBudget: entry.pluginSubagent.toolResultBudget }
           : {}),
+        ...(entry.pluginSubagent.bashPolicy !== undefined ? { bashPolicy: entry.pluginSubagent.bashPolicy } : {}),
         ...runtimeVersionOpt,
       })
       liveSessionRegistry.register({ sessionId, session: created.session })