@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
package/src/pi.ts ADDED
@@ -0,0 +1,1236 @@
1
+ import { spawn } from 'node:child_process'
2
+ import { appendFile, mkdir, writeFile } from 'node:fs/promises'
3
+ import { homedir } from 'node:os'
4
+ import { dirname, join } from 'node:path'
5
+ import { killChildProcess } from './process.js'
6
+ import { pathExists } from './fs-utils.js'
7
+ import { redactSecrets } from './redact.js'
8
+ import { log } from './logger.js'
9
+
10
+ // Drives the Pi coding-agent CLI. Pi is pointed at the Worker's OpenAI-compatible
11
+ // proxy via a custom provider in ~/.pi/agent/models.json, authenticated with the
12
+ // per-job session token (interpolated from $PI_PROXY_TOKEN) — so no provider key
13
+ // ever lives in the image or in Pi's config on disk.
14
+
15
/**
 * Default per-completion output-token ceiling written into Pi's model entry
 * (`maxTokens`) when the caller of `writePiModelsConfig` does not supply one.
 *
 * Generous on purpose: a reasoning model (e.g. GLM-5.2) spends tokens on its
 * `<think>` trace before the answer + tool calls, so a tight cap truncates it
 * mid-reasoning and the agent never commits edits. It is a ceiling, not a target
 * — unused output tokens are not billed and Workers AI clamps the request to the
 * model's real max — so erring high is safe. Raised to 32k after a spec-writer run
 * truncated an intermediate tool call at the old 16k cap; the document itself
 * stopped well under it, so this is headroom for larger specs/diffs, with
 * {@link runDiagnostics} flagging the rare case where even 32k is not enough.
 */
export const PI_MAX_OUTPUT_TOKENS = 32_768
27
+
28
/**
 * Persist Pi's provider configuration (`~/.pi/agent/models.json`) so that every
 * model call is routed through the proxy.
 *
 * The file declares a single `proxy` provider exposing one model (`opts.model`).
 * The API key is stored as the literal placeholder `$PI_PROXY_TOKEN`, which Pi
 * interpolates from the environment at run time — no real credential is written.
 *
 * @param opts.model        Model id, used as both the entry's `id` and `name`.
 * @param opts.proxyBaseUrl Base URL of the OpenAI-compatible proxy.
 * @param opts.maxTokens    Per-completion output ceiling; defaults to
 *                          `PI_MAX_OUTPUT_TOKENS` when omitted.
 * @returns The absolute path of the written `models.json`.
 */
export async function writePiModelsConfig(opts: {
  model: string
  proxyBaseUrl: string
  /** Output-token ceiling Pi may request per completion. Defaults to PI_MAX_OUTPUT_TOKENS. */
  maxTokens?: number
}): Promise<string> {
  const agentDir = join(homedir(), '.pi', 'agent')
  const configPath = join(agentDir, 'models.json')

  // Generous ceiling so a reasoning model isn't cut off mid-think.
  const outputCeiling = opts.maxTokens ?? PI_MAX_OUTPUT_TOKENS

  const proxyProvider = {
    baseUrl: opts.proxyBaseUrl,
    api: 'openai-completions',
    // Placeholder only — Pi resolves it from the environment when it runs.
    apiKey: '$PI_PROXY_TOKEN',
    // Upstreams behind the proxy don't all accept the `developer` role or
    // `reasoning_effort`, so Pi is told to send a plain system message.
    compat: { supportsDeveloperRole: false, supportsReasoningEffort: false },
    models: [{ id: opts.model, name: opts.model, maxTokens: outputCeiling }],
  }

  await mkdir(agentDir, { recursive: true })
  const serialized = JSON.stringify({ providers: { proxy: proxyProvider } }, null, 2)
  await writeFile(configPath, serialized, 'utf8')
  return configPath
}
59
+
60
// AGENTS.md section that tells the model to maintain the `todo` tool the image
// installs (rpiv-todo). Appended to every AGENTS.md: without a nudge a model may
// skip the tool, leaving the run with no subtask progress to report — keeping the
// list current is what makes the board's "N/M done" move.
// Starts with a blank line so it can be concatenated directly after other sections.
const TODO_GUIDANCE = `

## Progress tracking (required)

You have a \`todo\` tool. For any multi-step task, before you start coding, break
the work into concrete subtasks with \`todo\` (action "create"). As you work, mark
each one \`in_progress\` when you begin it and \`completed\` when it's done (action
"update"). Keep the list accurate — it is the only signal the system has for how
far along the run is.`
73
+
74
// Generic AGENTS.md section about the web tools. `writeAgentsContext` uses it only
// when web search is enabled AND the caller passed no per-kind `guidance` override
// (an active web-search provider is set — see `webSearchConfigFromEnv`). Without a
// nudge a model rarely reaches for the tools, so it would keep relying on stale
// training data. Kept deliberately conservative: search is for facts that genuinely
// change or that the agent is unsure of, NOT a substitute for reading the repo.
const WEB_TOOLS_GUIDANCE = `

## Web search & fetch (use sparingly)

You have \`web_search\` (returns titled result snippets for a query) and \`web_fetch\`
(reads a URL as text) tools. Reach for them ONLY when the repository itself can't
answer the question: to confirm a current library/API signature, a breaking change,
an exact error message, or a security advisory. Prefer first-party documentation,
and cite the source URL when a decision rests on what you found. Do NOT browse for
anything already discoverable in the checkout, and don't let searching replace
reading the code.`
90
+
91
// AGENTS.md section pointing the agent at the persisted service blueprint.
// Appended to every AGENTS.md so an agent orients itself before touching code but
// stays shallow by default: read the high-level overview first, and only open a
// module's deep-dive when the task actually touches it. Harmless when no blueprint
// exists yet (e.g. a fresh bootstrap) — the text is conditional on the folder
// existing, and the files simply aren't there to read.
const BLUEPRINT_GUIDANCE = `

## Service blueprint (read first, stay shallow)

If a \`blueprints/\` folder exists, it is the map of this service. **Before you start,
read \`blueprints/overview.md\`** for the high-level structure (the service and its
modules). Do NOT read every module file. Only open \`blueprints/modules/<name>.md\`
for a module that is directly relevant to your task, when you need its summary and
exact code references. \`blueprints/version.json\` is a tiny manifest for quick
staleness checks. Treat the blueprint as orientation, not a task list.`
106
+
107
// AGENTS.md section pointing the agent at the persisted specification.
// Appended to every AGENTS.md so an agent treats the spec as the PRESCRIPTIVE
// source (what must be true) and the acceptance scenarios as what its work must
// satisfy. Harmless when no spec exists yet — the text is conditional on the
// `spec/` folder existing, and the files simply aren't there.
const SPEC_GUIDANCE = `

## Service specification (the prescriptive spec)

If a \`spec/\` folder exists, it is the specification for this service. It is sharded
by a module (domain) → feature (group) taxonomy. **Read \`spec/overview.md\` first** —
it states what MUST be true and indexes the modules and their features (with links).
Open \`spec/modules/<module>/<feature>.md\` (or its \`.json\` for exact detail) for the
feature you are working on — it carries that feature's requirements AND the domain
rules scoped to it. \`spec/features/<module>/<feature>.feature\` are the Gherkin
acceptance scenarios your work must satisfy — treat them as the source of truth for
behaviour and tests. Read only the modules/features relevant to your task.`
122
+
123
/**
 * Compose the agent's instructions and write them to Pi's GLOBAL context file,
 * `~/.pi/agent/AGENTS.md`.
 *
 * Pi reads that file automatically and concatenates it with any
 * `AGENTS.md`/`CLAUDE.md` the repo itself ships (global file first, then the ones
 * walked up from the run cwd). The file lives OUTSIDE the checkout (the same
 * `~/.pi/agent` dir `writePiModelsConfig` uses) so the harness's instructions
 * never enter the git working tree — they can't be committed into a PR and they
 * never clobber a repo's own committed `AGENTS.md`.
 *
 * This relies on Pi's context-file resolution: the global file is loaded before
 * the project-trust decision, so it applies in non-interactive (`-p`) runs without
 * a trust prompt. That contract is pinned by `PI_VERSION` in the Dockerfile —
 * revisit this if that bump changes context-file resolution.
 *
 * Sections are emitted in a fixed order: system prompt, blueprint, spec, todo,
 * monorepo note, web-tools note, linked-context list.
 *
 * @param systemPrompt          The composed system prompt; always comes first.
 * @param opts.webSearch        Include a web-tools section only when true.
 * @param opts.guidance         Per-kind web-tools text; falls back to the generic blurb.
 * @param opts.serviceDirectory Monorepo service subtree; adds the monorepo note when set.
 * @param opts.contextFiles     Linked-context files to enumerate for the agent.
 */
export async function writeAgentsContext(
  systemPrompt: string,
  opts: {
    webSearch?: boolean
    guidance?: string
    serviceDirectory?: string
    contextFiles?: ContextFileInfo[]
  } = {},
): Promise<void> {
  const agentDir = join(homedir(), '.pi', 'agent')
  await mkdir(agentDir, { recursive: true })

  const sections = [
    systemPrompt,
    BLUEPRINT_GUIDANCE,
    SPEC_GUIDANCE,
    TODO_GUIDANCE,
    // Only present when the dispatcher resolved a monorepo service directory; the
    // agent's cwd already points at it, this just tells it to stay in that subtree.
    opts.serviceDirectory ? monorepoGuidance(opts.serviceDirectory) : '',
    // Mention the web tools only when they're actually configured, so an agent is
    // never told about tools that would error the moment it calls them. Jobs that
    // carry no per-kind nudge (e.g. bootstrap, or an older dispatcher) get the
    // generic blurb.
    opts.webSearch ? (opts.guidance ?? WEB_TOOLS_GUIDANCE) : '',
    // Linked context the backend materialised into the checkout, read on demand.
    contextGuidance(opts.contextFiles ?? []),
  ]

  await writeFile(join(agentDir, 'AGENTS.md'), sections.join(''), 'utf8')
}
166
+
167
/**
 * Name of the folder, relative to the run's working directory, where linked-context
 * files are materialised (see CONTEXT_DIR in agents). Also used verbatim as the
 * local git-exclude pattern and in the paths shown to the agent in AGENTS.md.
 */
export const CONTEXT_DIR = '.cat-context'
169
+
170
/** The metadata the AGENTS.md context block needs to point an agent at a materialised file. */
export interface ContextFileInfo {
  /** File path relative to the `CONTEXT_DIR` folder; joined onto it when written and when listed. */
  path: string
  /** Human-readable title shown next to the path in the AGENTS.md list. */
  title: string
  /** Source URL shown in parentheses after the title; an empty string omits it. */
  url: string
  /** Text written to the file (as UTF-8) when the context is materialised. */
  content: string
}
177
+
178
/**
 * Build the AGENTS.md section that lists the materialised linked-context files.
 *
 * Each file becomes one bullet: its path under `CONTEXT_DIR`, its title, and —
 * when non-empty — its source URL in parentheses.
 *
 * @returns The section text (starting with a blank line), or `''` when `files` is empty.
 */
function contextGuidance(files: ContextFileInfo[]): string {
  if (files.length === 0) return ''

  const bullets: string[] = []
  for (const file of files) {
    const source = file.url ? ` (${file.url})` : ''
    bullets.push(`- \`${CONTEXT_DIR}/${file.path}\` — ${file.title}${source}`)
  }

  return `

## Linked context (read on demand)
Requirements / RFCs / PRDs / tracker issues relevant to this task are in the \`${CONTEXT_DIR}/\`
directory of your checkout. Open a file when it is relevant. Do NOT attempt to reach external
systems (Jira / Confluence / GitHub) — everything available has already been placed on disk:
${bullets.join('\n')}`
}
192
+
193
/**
 * Write the backend-prepared linked-context files into {@link CONTEXT_DIR} in the
 * checkout so the agent can read them on demand, and add a LOCAL git exclude entry so
 * even `git add -A` never commits them into the agent's PR.
 *
 * File writes are strict: a file whose `path` would resolve outside the context
 * folder (e.g. via `..`) is rejected, and intermediate folders of a nested `path`
 * are created before writing. The git exclude is best-effort: a scaffold-from-scratch
 * checkout has no `.git` yet, so the files just stay untracked.
 *
 * @param cwd   Working directory of the run (the repo root, or a service subdirectory).
 * @param files Files to materialise; nothing is touched when the list is empty.
 * @throws Error when a file's `path` escapes the context folder; file-system errors
 *         from creating folders or writing files propagate unchanged.
 */
export async function materializeContextFiles(
  cwd: string,
  files: ContextFileInfo[],
): Promise<void> {
  if (!files.length) return
  const dir = join(cwd, CONTEXT_DIR)
  await mkdir(dir, { recursive: true })

  // `join` normalises the path and keeps the trailing separator, giving a prefix
  // that only paths strictly inside `dir` can start with.
  const dirPrefix = join(dir, '/')
  for (const f of files) {
    const target = join(dir, f.path)
    if (!target.startsWith(dirPrefix)) {
      throw new Error(`Refusing to write linked-context file outside ${CONTEXT_DIR}/: ${f.path}`)
    }
    // A nested `path` (e.g. `jira/ABC-1.md`) needs its parent folders to exist first.
    await mkdir(dirname(target), { recursive: true })
    await writeFile(target, f.content, 'utf8')
  }

  // The exclude pattern has no leading slash, so it matches `.cat-context/` at any depth
  // — covering the monorepo case where cwd is a service subdirectory below the repo root.
  // Walk up to find the repo's `.git` (best-effort; a from-scratch scaffold has none).
  const gitRoot = await findGitRoot(cwd)
  if (!gitRoot) return
  try {
    const infoDir = join(gitRoot, '.git', 'info')
    // `.git/info` may be absent; create it so the append below can succeed.
    // This fails (and is swallowed) when `.git` is a file rather than a directory.
    await mkdir(infoDir, { recursive: true })
    await appendFile(join(infoDir, 'exclude'), `\n${CONTEXT_DIR}/\n`, 'utf8')
  } catch {
    // No writable .git/info; the files simply stay untracked (still not auto-added on most flows).
  }
}
218
+
219
/**
 * Search `dir` and its ancestors for an entry named `.git`.
 *
 * The walk is bounded: at most 8 directories are inspected (starting with `dir`
 * itself), and it stops early once the filesystem root is reached.
 *
 * @returns The first directory that contains `.git`, or `null` if none was found.
 */
async function findGitRoot(dir: string): Promise<string | null> {
  let candidate = dir
  let remaining = 8
  while (remaining-- > 0) {
    if (await pathExists(join(candidate, '.git'))) return candidate
    const up = dirname(candidate)
    // `dirname` returns its input at the root — nothing further to climb.
    if (up === candidate) return null
    candidate = up
  }
  return null
}
230
+
231
/**
 * Build the AGENTS.md section used when a run is scoped to one service inside a
 * monorepo. It names the service's subtree and tells the agent to keep its edits
 * and its build/test/lint commands there.
 *
 * @param serviceDirectory Service path relative to the repo root, without a trailing slash.
 * @returns The section text, starting with a blank line.
 */
function monorepoGuidance(serviceDirectory: string): string {
  // The subtree is mentioned three times; format the inline-code reference once.
  const subtree = `\`${serviceDirectory}/\``
  return `

## Monorepo service (work within your subdirectory)

This repository is a **monorepo** hosting more than one service. The service you are
working on lives in ${subtree} (relative to the repo root), and your
working directory is already set there. Confine your changes to that subtree — create
and edit files under ${subtree}, and run that service's own build/test/lint
commands (defined by the manifest in ${subtree}, e.g. its \`package.json\`).
Do not modify other services' directories, and only touch shared/root files (workspace
manifests, root config) when the task genuinely requires it.`
}
245
+
246
/**
 * The active web-search backend for the rpiv-web-tools extension.
 *
 * Only the provider id is modelled here (and persisted to disk by
 * `writeWebToolsConfig`): the per-provider credential (and any base URL —
 * `SEARXNG_URL`, `OLLAMA_HOST`) is read by the extension straight from the
 * environment, so no key is ever written to the container's filesystem.
 */
export interface WebSearchConfig {
  /** rpiv-web-tools provider id, e.g. `brave`, `tavily`, `exa`, `searxng`. */
  provider: string
}
256
+
257
/**
 * The env var whose presence configures each rpiv-web-tools provider, in selection
 * priority order. Used to AUTO-ENABLE web search whenever a deployment has wired up
 * a provider — there's no separate on/off flag, mirroring how Claude Code / Codex
 * turn search on once a backend is configured. `brave` leads (it's what Claude Code
 * uses); the self-hosted backends (searxng/ollama) come last. For the keyless
 * backends it is the base-URL var that signals "configured".
 */
const WEB_SEARCH_PROVIDER_ENV: ReadonlyArray<{ provider: string; envVar: string }> = [
  { provider: 'brave', envVar: 'BRAVE_SEARCH_API_KEY' },
  { provider: 'tavily', envVar: 'TAVILY_API_KEY' },
  { provider: 'exa', envVar: 'EXA_API_KEY' },
  { provider: 'serper', envVar: 'SERPER_API_KEY' },
  { provider: 'perplexity', envVar: 'PERPLEXITY_API_KEY' },
  { provider: 'youcom', envVar: 'YOUCOM_API_KEY' },
  { provider: 'jina', envVar: 'JINA_API_KEY' },
  { provider: 'firecrawl', envVar: 'FIRECRAWL_API_KEY' },
  { provider: 'searxng', envVar: 'SEARXNG_URL' },
  { provider: 'ollama', envVar: 'OLLAMA_HOST' },
]

/**
 * Work out which web-search provider (if any) is active, from the environment.
 *
 * - With no `WEB_SEARCH_PROVIDER` pin: the first provider in
 *   `WEB_SEARCH_PROVIDER_ENV` whose env var holds a non-blank value wins.
 * - With a pin (trimmed, lower-cased): a provider listed in the table is honoured
 *   only when its own env var is non-blank, so a pin without a key never nudges the
 *   agent towards a tool that would error. A provider id NOT in the table is taken
 *   on trust, since its env var is unknown here and cannot be validated.
 *
 * No credential passes through this function — the extension reads each
 * provider's env var itself.
 *
 * @param env Environment to inspect; defaults to `process.env`.
 * @returns The active provider, or `undefined` when web search should stay off
 *          (the harness then writes no rpiv-web-tools config and never mentions
 *          the tools to the agent).
 */
export function webSearchConfigFromEnv(
  env: NodeJS.ProcessEnv = process.env,
): WebSearchConfig | undefined {
  // "Configured" means present and not just whitespace.
  const isSet = (name: string): boolean => Boolean(env[name]?.trim())

  const pinned = env.WEB_SEARCH_PROVIDER?.trim().toLowerCase()
  if (!pinned) {
    const detected = WEB_SEARCH_PROVIDER_ENV.find(({ envVar }) => isSet(envVar))
    return detected ? { provider: detected.provider } : undefined
  }

  const entry = WEB_SEARCH_PROVIDER_ENV.find((p) => p.provider === pinned)
  if (entry !== undefined && !isSet(entry.envVar)) return undefined
  return { provider: pinned }
}
307
+
308
/**
 * Build the environment that points the rpiv-web-tools SearXNG provider at the
 * backend's search proxy.
 *
 * `SEARXNG_URL` is `${proxyBaseUrl}/web-search` with any trailing slashes on the
 * base removed first (the controller is mounted under the LLM proxy's `/v1`), and
 * `SEARXNG_API_KEY` is the per-job session token, which the proxy verifies exactly
 * like the LLM proxy. The result is handed to Pi's child via `runPi`'s `extraEnv`,
 * so the search key never has to enter the sandbox — the search runs server-side
 * under the deployment's own provider key.
 *
 * @param proxyBaseUrl Base URL of the proxy; trailing `/` characters are ignored.
 * @param sessionToken Per-job session token, passed through unchanged.
 */
export function webSearchProxyEnv(
  proxyBaseUrl: string,
  sessionToken: string,
): { SEARXNG_URL: string; SEARXNG_API_KEY: string } {
  const trimmedBase = proxyBaseUrl.replace(/\/+$/, '')
  const searchUrl = `${trimmedBase}/web-search`
  return { SEARXNG_URL: searchUrl, SEARXNG_API_KEY: sessionToken }
}
325
+
326
/**
 * Select the active rpiv-web-tools provider by writing
 * `~/.config/rpiv-web-tools/config.json` — the file the extension reads, falling
 * back to `brave` when `provider` is absent.
 *
 * Only the provider id is serialised: credentials and base URLs come from the
 * environment (env wins over the file in the extension's own resolution order), so
 * no secret is committed to disk. The file is requested with mode 0600 to match
 * the extension's own permissions for that path.
 *
 * @param config The provider to activate.
 * @returns The path of the written config file.
 */
export async function writeWebToolsConfig(config: WebSearchConfig): Promise<string> {
  const configDir = join(homedir(), '.config', 'rpiv-web-tools')
  const configPath = join(configDir, 'config.json')
  // Deliberately copy just the provider id, whatever else `config` may carry.
  const payload = JSON.stringify({ provider: config.provider }, null, 2)

  await mkdir(configDir, { recursive: true })
  await writeFile(configPath, payload, { mode: 0o600 })
  return configPath
}
341
+
342
/** One entry of the agent's todo list — its subject and current status. */
export interface TodoItem {
  /** The task's subject text, as the agent wrote it. */
  label: string
  /** Normalised status; `parseTodoProgress` maps any unrecognised status to `pending`. */
  status: 'pending' | 'in_progress' | 'completed'
}
348
+
349
/** Live subtask progress derived from Pi's `todo` tool — e.g. "3/8 done". */
export interface TodoProgress {
  /** Tasks marked completed. */
  completed: number
  /** Tasks currently being worked (rpiv-todo's `in_progress` status); 0 for the `todos[].done` shape. */
  inProgress: number
  /** Total live tasks (tombstoned/deleted tasks excluded). */
  total: number
  /**
   * The individual live tasks (label + status), in list order — so the board can
   * render the actual task list, not just the count. Absent for the simpler
   * `todos[].done` fallback shape, which carries no per-task subject.
   */
  items?: TodoItem[]
}
364
+
365
/**
 * One tool invocation in Pi's loop, captured for the run's observability trace.
 * Metadata only (name + timing + ok) — never the tool's args or result — so the
 * harness buffer stays tiny. The backend drains these on its existing job poll and
 * emits them as child spans under the run trace.
 */
export interface ToolSpan {
  /** Name of the tool that was invoked. */
  tool: string
  /** Epoch ms the tool call started (approximated as the previous tool's end). */
  startedAt: number
  /** Epoch ms the tool call ended (when its `tool_execution_end` event arrived). */
  endedAt: number
  /** Whether the call succeeded. */
  ok: boolean
}
379
+
380
/**
 * Type guard: true for any non-null value whose `typeof` is `'object'`, letting
 * callers read arbitrary properties off it as `unknown`. Arrays also pass.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object'
}
383
+
384
/**
 * What the agent actually did this run, independent of any file changes. Used to
 * tell a genuine no-op (the agent never reached the model / never acted) apart
 * from a real run, so a bootstrap that produced nothing is failed rather than
 * pushed as an empty repo. `toolCalls === 0 && assistantChars === 0` is the
 * signature of a run where Pi never made a successful model call.
 */
export interface PiRunStats {
  /** Tool calls the assistant emitted across the transcript (0 ⇒ it never acted). */
  toolCalls: number
  /** Total characters of assistant text (0 ⇒ the model produced nothing). */
  assistantChars: number
}
397
+
398
/**
 * Output-quality signals lifted from the agent's transcript, so the harness can fail
 * LOUDLY on a malformed run instead of silently handing a half-baked artifact to the
 * structured-output repair (which would manufacture a doc from garbage — the trap
 * behind the spec-writer ⇄ companion rework loop). Two distinct invalid states, both
 * seen in production from `kimi-k2.7-code`:
 * - a completion that hit the output ceiling (its answer/tool call was cut off), and
 * - a FINAL turn that carried no text at all (an empty `content: []` despite spending
 *   output tokens), so there is no answer to parse.
 */
export interface RunDiagnostics {
  /** Some completion ended at the output-token ceiling — its content was cut off. */
  truncated: boolean
  /** The agent's FINAL completion hit the ceiling: its ANSWER (not a mid-run step) was cut off. */
  finalTruncated: boolean
  /** The agent's final turn carried no text content (e.g. an empty `content: []`). */
  finalAnswerEmpty: boolean
}
416
+
417
/** Pi's assistant summary plus {@link PiRunStats} describing what it did. */
export interface PiRunOutcome {
  /** The assistant's summary text for the run. */
  summary: string
  /** Activity counters used to tell a genuine no-op apart from a real run. */
  stats: PiRunStats
  /**
   * Tail of Pi's stderr (credential-scrubbed), captured even on a clean exit.
   * On a no-op run this is where the real cause shows up — e.g. an unreachable
   * proxy or a model the upstream rejected — so the failure is diagnosable
   * without shelling into the (ephemeral) container.
   */
  stderrTail?: string
  /**
   * Token usage lifted from the agent CLI's own event stream. Reported by the
   * subscription harnesses (Claude Code / Codex), whose traffic bypasses the LLM
   * proxy — so the backend folds it into the leased token's rolling-window counters
   * (usage-aware rotation) and telemetry. Absent for the proxy-metered Pi harness.
   */
  usage?: { inputTokens: number; outputTokens: number }
  /** Output-quality signals (truncation / empty final answer); see {@link RunDiagnostics}. */
  diagnostics?: RunDiagnostics
}
438
+
439
/**
 * Extract the `details` payload of a successful `todo` tool result from a Pi
 * `--mode json` event.
 *
 * Three event shapes are recognised:
 *  - `message_end` whose `message` is a `toolResult` for `todo` — `message.details`
 *  - `tool_execution_end` for `todo` — `result.details`
 *  - a top-level `tool_result` for `todo` — `details` (accepted defensively)
 * The first two are the same tool result surfacing twice on the stream; either one
 * may be read (the counts are identical). Pi has no built-in todo tool, so this
 * only ever matches the installed extension's calls.
 *
 * @returns The `details` object, or `undefined` when the event is some other kind,
 *          is flagged `isError: true`, or carries no object-valued details.
 */
function todoResultDetails(event: Record<string, unknown>): Record<string, unknown> | undefined {
  const { type } = event

  if (type === 'message_end') {
    const message = event.message
    if (!isObject(message)) return undefined
    const fromTodo =
      message.role === 'toolResult' && message.toolName === 'todo' && message.isError !== true
    return fromTodo && isObject(message.details) ? message.details : undefined
  }

  // The remaining shapes carry `toolName` / `isError` on the event itself.
  if (event.toolName !== 'todo' || event.isError === true) return undefined

  if (type === 'tool_execution_end') {
    const result = event.result
    return isObject(result) && isObject(result.details) ? result.details : undefined
  }
  if (type === 'tool_result') {
    return isObject(event.details) ? event.details : undefined
  }
  return undefined
}
473
+
474
// NOTE: this comment documents `parseTodoProgress`, which is defined after
// `taskLabel` below. It is a plain comment so that it is not mistaken for
// `taskLabel`'s own doc comment.
//
// Derive {@link TodoProgress} from a single Pi `--mode json` event, or undefined
// if the event isn't a successful `todo` tool result we can read.
//
// Pi has no built-in todo tool; the image installs the `@juicesharp/rpiv-todo`
// extension, whose every successful call returns `details.tasks[]` with a
// per-task `status` (pending | in_progress | completed | deleted). We also accept
// the simpler `details.todos[].done` shape of Pi's bundled example extension, so
// swapping the extension never silently drops progress.
484
/**
 * Best-effort display label for a todo task.
 *
 * rpiv-todo creates tasks with a `subject` (see the `todo` `create` action); a few
 * common alternates are also tried, in order, so a minor extension change never
 * blanks the label. The first field holding a non-blank string wins, trimmed.
 *
 * @returns The trimmed label, or `"Untitled task"` when `task` is not an object or
 *          none of the candidate fields holds usable text.
 */
function taskLabel(task: unknown): string {
  const fallback = 'Untitled task'
  if (typeof task !== 'object' || task === null) return fallback

  const fields = task as Record<string, unknown>
  for (const key of ['subject', 'title', 'content', 'text', 'name', 'task']) {
    const raw = fields[key]
    if (typeof raw !== 'string') continue
    const trimmed = raw.trim()
    if (trimmed) return trimmed
  }
  return fallback
}
499
+
500
/**
 * Turn one Pi `--mode json` event into subtask progress.
 *
 * Two `details` shapes are understood:
 *  - `tasks[]` with a per-task `status`: tasks whose status is `deleted` are
 *    dropped; every other task is counted and listed, with any status other than
 *    `completed` / `in_progress` reported as `pending`.
 *  - `todos[]` with a boolean `done`: only counts are produced (no `items`), and
 *    `inProgress` is always 0.
 *
 * @returns The progress snapshot, or `undefined` when the event is not a readable,
 *          successful `todo` tool result.
 */
export function parseTodoProgress(event: Record<string, unknown>): TodoProgress | undefined {
  const details = todoResultDetails(event)
  if (details === undefined) return undefined

  const { tasks, todos } = details

  if (Array.isArray(tasks)) {
    const items: TodoItem[] = []
    for (const task of tasks) {
      const rawStatus = (task as { status?: unknown } | null)?.status
      if (rawStatus === 'deleted') continue
      const status: TodoItem['status'] =
        rawStatus === 'completed' || rawStatus === 'in_progress' ? rawStatus : 'pending'
      items.push({ label: taskLabel(task), status })
    }
    const countWith = (wanted: TodoItem['status']): number =>
      items.filter((item) => item.status === wanted).length
    return {
      completed: countWith('completed'),
      inProgress: countWith('in_progress'),
      total: items.length,
      items,
    }
  }

  if (Array.isArray(todos)) {
    let done = 0
    for (const todo of todos) {
      if ((todo as { done?: unknown } | null)?.done === true) done++
    }
    return { completed: done, inProgress: 0, total: todos.length }
  }

  return undefined
}
535
+
536
/**
 * Read the tool-call signal off a streamed Pi event.
 *
 * Only `tool_execution_end` is considered — it is the canonical per-call stream
 * event (statsFromEvents counts the same one), so reading it and nothing else
 * avoids counting a call twice.
 *
 * @returns The tool's name (`''` when `toolName` is not a string) and whether the
 *          call was flagged as an error, or `undefined` for any other event type.
 */
function toolCallSignal(
  event: Record<string, unknown>,
): { name: string; isError: boolean } | undefined {
  if (event.type !== 'tool_execution_end') return undefined
  const { toolName, isError } = event
  return {
    name: typeof toolName === 'string' ? toolName : '',
    isError: isError === true,
  }
}
546
+
547
/** Tunable bounds for the {@link ProgressGuard}; each one is an abort threshold. */
export interface ProgressGuardLimits {
  /**
   * Abort once the agent has made this many NON-exploration tool calls without ever
   * using a file-editing tool (see `FILE_EDIT_TOOLS`). The signature of the credential
   * rabbit-hole that motivated this: probing the environment (`bash`/exec) endlessly
   * without implementing anything. Read-only exploration (`read`/`grep`/… — see
   * `EXPLORATION_TOOLS`) and planning (`todo`) do NOT count, so a large task that
   * legitimately reads/searches many files before its first edit is not killed for it.
   * Disabled when `expectsEdits` is false (e.g. the assess-only merger / Blueprinter,
   * which legitimately edit nothing). Note this bound only guards the run UNTIL its
   * first edit: once the agent has edited a file at all, it has demonstrably started
   * the work, so only `maxConsecutiveErrors` guards a later stall.
   */
  maxToolCallsWithoutEdit: number
  /**
   * Abort after this many consecutive failing tool calls — the agent is stuck
   * retrying an operation that keeps failing rather than making progress.
   */
  maxConsecutiveErrors: number
  /**
   * Abort after this many consecutive web-search/web-fetch calls with no other tool
   * call in between. Web tools are read-only exploration (they don't count toward the
   * no-edit bound), so without this a model could rabbit-hole on searches indefinitely
   * without ever tripping a guard. Any non-web tool call resets the streak. Optional:
   * defaults to {@link DEFAULT_PROGRESS_GUARD_LIMITS} when a caller builds limits
   * without it.
   */
  maxConsecutiveWebCalls?: number
}
577
+
578
// Default thresholds for the progress guard.
// `satisfies` (not a type annotation) so each property keeps its concrete `number`
// type — `maxConsecutiveWebCalls` is optional on the interface (callers may omit it),
// but the defaults always define it, so consumers reading it off here get a `number`.
export const DEFAULT_PROGRESS_GUARD_LIMITS = {
  // Counts only non-exploration, non-planning calls (see EXPLORATION_TOOLS), so the
  // ceiling can be generous without risking a false kill on a read-heavy large task.
  maxToolCallsWithoutEdit: 40,
  // Consecutive failing tool calls tolerated before the run is treated as stuck.
  maxConsecutiveErrors: 12,
  // A genuine research burst is a handful of searches; an uninterrupted run of this
  // many web calls (with no read/edit/bash between) is a search loop, not progress.
  maxConsecutiveWebCalls: 25,
} satisfies ProgressGuardLimits
590
+
591
// Names of tools that modify files; a call to any of them clears the no-edit suspicion.
// The list is deliberately wide because models/extensions label the same capability
// differently (`edit`/`write`, `apply_patch`/`patch`, `str_replace`, `multiedit`,
// `create`), and wrongly concluding "no edits" would kill a run that IS changing files.
// Lookups use the lower-cased tool name. NOTE: a file produced purely through `bash`
// (e.g. a heredoc) is not detected here — widen the list or switch to a working-tree
// signal if that becomes common.
const FILE_EDIT_TOOLS = new Set<string>([
  'edit',
  'write',
  'apply_patch',
  'patch',
  'str_replace',
  'multiedit',
  'create',
])
606
+
607
// Bookkeeping tools — the todo list the agent keeps while it works. They are neither
// file edits nor the environment probing that `maxToolCallsWithoutEdit` is meant to
// catch, so they are left out of that count: otherwise a run that carefully maintains
// a long todo list before its first edit (typical for a large task) would be killed
// for "no edits" on planning calls alone. A planning call still resets the
// consecutive-error streak, since a successful call shows the agent is not wedged.
// Lookups use the lower-cased tool name.
const PLANNING_TOOLS = new Set<string>(['todo'])
614
+
615
// Read-only exploration tools. Reading and searching the repo is legitimate lead-up to
// an edit rather than the environment probing the no-edit bound targets, so these are
// excluded from `maxToolCallsWithoutEdit` (a large task may read/search dozens of files
// first). What remains counted is "action" calls — chiefly `bash`, the vector of the
// credential rabbit-hole — that have not yet led to an edit. The list is wide because
// models/extensions name the same capability differently. Lookups use the lower-cased
// tool name.
const EXPLORATION_TOOLS = new Set<string>([
  // Repository reading / searching.
  'read',
  'grep',
  'search',
  'glob',
  'ls',
  'list',
  'find',
  'tree',
  'cat',
  'view',
  'head',
  'tail',
  'stat',
  // rpiv-web-tools: querying or fetching the web is likewise read-only research ahead
  // of an edit, so it is not counted toward the no-edit bound either.
  'web_search',
  'web_fetch',
])
640
+
641
// The rpiv-web-tools calls, kept as their own set so that an uninterrupted streak of
// them (no other tool call in between) can be detected as a search loop — see
// `maxConsecutiveWebCalls`.
const WEB_TOOLS = new Set<string>(['web_search', 'web_fetch'])
644
+
645
/**
 * Read {@link ProgressGuardLimits} from the environment, falling back to the defaults.
 *
 * Recognised variables: `JOB_MAX_TOOLCALLS_WITHOUT_EDIT`,
 * `JOB_MAX_CONSECUTIVE_TOOL_ERRORS` and `JOB_MAX_CONSECUTIVE_WEB_CALLS`. Each is parsed
 * with `Number`, floored to an integer, and accepted only when the FLOORED value is a
 * finite integer >= 1. Anything else (unset, empty, non-numeric, zero, negative,
 * infinite, or a fraction below 1) yields the default for that knob.
 *
 * @param env Environment to read; defaults to `process.env`.
 * @returns A fully-populated limits object (all three knobs are always defined).
 */
export function progressGuardLimitsFromEnv(
  env: NodeJS.ProcessEnv = process.env,
): ProgressGuardLimits {
  const num = (raw: string | undefined, fallback: number): number => {
    // Floor BEFORE validating: a value such as "0.5" is positive but floors to 0, and a
    // limit of 0 would make every `count >= limit` check true on the very first tool
    // call, aborting every run immediately.
    const n = Math.floor(Number(raw))
    return Number.isFinite(n) && n >= 1 ? n : fallback
  }
  return {
    maxToolCallsWithoutEdit: num(
      env.JOB_MAX_TOOLCALLS_WITHOUT_EDIT,
      DEFAULT_PROGRESS_GUARD_LIMITS.maxToolCallsWithoutEdit,
    ),
    maxConsecutiveErrors: num(
      env.JOB_MAX_CONSECUTIVE_TOOL_ERRORS,
      DEFAULT_PROGRESS_GUARD_LIMITS.maxConsecutiveErrors,
    ),
    maxConsecutiveWebCalls: num(
      env.JOB_MAX_CONSECUTIVE_WEB_CALLS,
      DEFAULT_PROGRESS_GUARD_LIMITS.maxConsecutiveWebCalls,
    ),
  }
}
668
+
669
/**
 * Apply per-knob overrides onto a base set of guard limits, ENFORCING loosen-only: an
 * override can only RAISE a knob (more headroom), never lower it below the base. A
 * larger value is more lenient for every knob (more no-edit tool calls / errors / web
 * calls tolerated), so each result is `max(base, override)`. This is a hard guarantee,
 * not a convention — a tuning entry (built-in or a custom kind's, which reaches this via
 * an untrusted job body) that supplies a value TIGHTER than the base is clamped back up
 * to the base rather than aborting a legitimately-progressing run. An absent/undefined
 * knob keeps the base value untouched.
 *
 * A `NaN` override is treated like an absent one: `Math.max(base, NaN)` is `NaN`, and a
 * `NaN` limit makes every `count >= limit` comparison false, which would silently
 * switch that guard off instead of loosening it.
 *
 * @param base Limits to start from.
 * @param overrides Optional per-knob overrides; when falsy, `base` is returned as-is
 *   (same object reference).
 * @returns The merged limits; every knob is >= the corresponding base value.
 */
export function mergeGuardLimits(
  base: ProgressGuardLimits,
  overrides: Partial<ProgressGuardLimits> | undefined,
): ProgressGuardLimits {
  if (!overrides) return base
  const loosen = (b: number, o: number | undefined): number =>
    typeof o === 'number' && !Number.isNaN(o) ? Math.max(b, o) : b
  return {
    maxToolCallsWithoutEdit: loosen(
      base.maxToolCallsWithoutEdit,
      overrides.maxToolCallsWithoutEdit,
    ),
    maxConsecutiveErrors: loosen(base.maxConsecutiveErrors, overrides.maxConsecutiveErrors),
    // `maxConsecutiveWebCalls` is optional on the interface (callers may omit it), so
    // fall back to the default before loosening — keeps `loosen`'s base a concrete number.
    maxConsecutiveWebCalls: loosen(
      base.maxConsecutiveWebCalls ?? DEFAULT_PROGRESS_GUARD_LIMITS.maxConsecutiveWebCalls,
      overrides.maxConsecutiveWebCalls,
    ),
  }
}
700
+
701
/**
 * Live anti-rabbithole guard. Each streamed Pi event is passed to {@link observe}; as
 * soon as the run has clearly stopped making progress it returns a diagnostic reason, so
 * the harness can kill Pi early rather than let it consume the whole budget (and can
 * report something more useful than a generic "no file changes"). It holds only a few
 * counters and does no I/O, so it can be unit-tested over a fixed event sequence.
 *
 * Three checks run, in this order, on every event that carries a tool-call signal:
 *  1. consecutive failing tool calls (`maxConsecutiveErrors`);
 *  2. consecutive web search/fetch calls (`maxConsecutiveWebCalls`);
 *  3. counted "action" calls with no file edit yet (`maxToolCallsWithoutEdit`).
 */
export class ProgressGuard {
  private toolCalls = 0
  private edits = 0
  private consecutiveErrors = 0
  private consecutiveWebCalls = 0

  constructor(
    private readonly limits: ProgressGuardLimits,
    /** When false (assess-only runs like the merger), the no-edit bound is skipped. */
    private readonly expectsEdits: boolean = true,
  ) {}

  /** Feed one parsed Pi event; returns a diagnostic reason when the run should abort, else null. */
  observe(event: Record<string, unknown>): string | null {
    const signal = toolCallSignal(event)
    if (!signal) return null
    const toolName = signal.name.toLowerCase()

    // Every tool call updates the error streak — even a planning call, since a
    // successful one shows the agent is not stuck in a failing-operation loop. That is
    // why this happens before the planning/exploration skip further down.
    if (signal.isError) this.consecutiveErrors += 1
    else this.consecutiveErrors = 0

    return this.errorStreakVerdict() ?? this.webStreakVerdict(toolName) ?? this.noEditVerdict(toolName)
  }

  /** Abort reason once the failing-call streak has reached its limit, else null. */
  private errorStreakVerdict(): string | null {
    if (this.consecutiveErrors < this.limits.maxConsecutiveErrors) return null
    return (
      `no progress: ${this.consecutiveErrors} consecutive failing tool calls — the agent is stuck ` +
      `retrying a failing operation rather than making progress. Aborting.`
    )
  }

  /**
   * Track the web search/fetch streak. Web tools are read-only and excluded from the
   * no-edit bound, so they get their own cap; any non-web tool call resets the streak.
   */
  private webStreakVerdict(toolName: string): string | null {
    if (!WEB_TOOLS.has(toolName)) {
      this.consecutiveWebCalls = 0
      return null
    }
    this.consecutiveWebCalls += 1
    const cap =
      this.limits.maxConsecutiveWebCalls ?? DEFAULT_PROGRESS_GUARD_LIMITS.maxConsecutiveWebCalls
    if (this.consecutiveWebCalls < cap) return null
    return (
      `no progress: ${this.consecutiveWebCalls} consecutive web search/fetch calls without ` +
      `any other action — the agent is stuck researching instead of doing the work. Aborting.`
    )
  }

  /**
   * Count an "action" call and apply the no-edit bound. Planning and read-only
   * exploration calls are not counted (see PLANNING_TOOLS / EXPLORATION_TOOLS).
   */
  private noEditVerdict(toolName: string): string | null {
    if (PLANNING_TOOLS.has(toolName) || EXPLORATION_TOOLS.has(toolName)) return null
    this.toolCalls += 1
    if (FILE_EDIT_TOOLS.has(toolName)) this.edits += 1

    const stalled =
      this.expectsEdits &&
      this.edits === 0 &&
      this.toolCalls >= this.limits.maxToolCallsWithoutEdit
    if (!stalled) return null
    return (
      `no progress: ${this.toolCalls} tool calls and not one file edit — the agent is exploring or ` +
      `probing the environment without implementing anything. Aborting before it burns the whole run.`
    )
  }
}
771
+
772
/**
 * Run Pi non-interactively against `cwd` and return its assistant summary. Uses
 * print + JSON mode (`-p --mode json`) with `--approve` so it runs unattended.
 *
 * The (untrusted) prompt is fed over stdin, never as an argv positional, so a
 * prompt beginning with `-`/`--` can't be mis-parsed as a Pi CLI flag (Pi has no
 * `--` end-of-options terminator, so a positional `-foo` errors as "Unknown
 * option"). Pi's print mode reads the prompt from piped stdin; we write it and
 * close the pipe so Pi gets an immediate EOF and proceeds (an open, never-closed
 * stdin pipe would make print mode block forever waiting for EOF).
 *
 * The returned promise rejects when: the signal is already aborted before start;
 * spawning fails; the no-progress guard trips; the signal aborts mid-run; Pi exits 0
 * but its transcript ends in a hard error; or Pi exits with a non-zero code. Any
 * stderr/stdout excerpt included in a rejection message is passed through
 * `redactSecrets` first.
 */
export function runPi(opts: {
  cwd: string
  model: string
  userPrompt: string
  sessionToken: string
  /** Aborting this kills Pi (the job's inactivity/max-duration watchdog). */
  signal?: AbortSignal
  /** Called on every chunk of Pi output, so the watchdog sees the agent is alive. */
  onActivity?: () => void
  /** Called with the latest subtask counts each time Pi updates its todo list. */
  onProgress?: (progress: TodoProgress) => void
  /**
   * Called once per completed tool call with a compact {@link ToolSpan}. Feeds the
   * run's observability trace (drained by the backend on its job poll); a no-op when
   * the container payload doesn't pass it, so production behaviour is unchanged.
   */
  onSpan?: (span: ToolSpan) => void
  /**
   * Called with every parsed Pi `--mode json` event, in stream order — the raw
   * observability seam over the run. Used by offline tooling (the smoketest
   * harness) to capture the full prompt/response/tool-call transcript for
   * analysis; the container payload doesn't pass it, so production behaviour is
   * unchanged. Throwing handlers are swallowed so a faulty observer can't break
   * the run.
   */
  onEvent?: (event: Record<string, unknown>) => void
  /** No-progress guard bounds; defaults to the env-configured limits. */
  guardLimits?: ProgressGuardLimits
  /** Whether this run is expected to edit files (false for assess-only runs like the merger). */
  expectsEdits?: boolean
  /**
   * Extra environment for Pi's child process, merged over `process.env` (but under the
   * proxy token). Used to hand the rpiv-web-tools extension its proxy-backed SearXNG
   * config (`SEARXNG_URL` / `SEARXNG_API_KEY`) without mutating the harness's own env.
   */
  extraEnv?: Record<string, string>
}): Promise<PiRunOutcome> {
  return new Promise((resolve, reject) => {
    if (opts.signal?.aborted) {
      reject(new Error('pi aborted before start'))
      return
    }
    const child = spawn(
      'pi',
      ['-p', '--mode', 'json', '--model', `proxy/${opts.model}`, '--approve'],
      {
        cwd: opts.cwd,
        env: { ...process.env, ...opts.extraEnv, PI_PROXY_TOKEN: opts.sessionToken },
        // stdin is piped (not 'ignore') so the prompt is delivered out-of-band
        // rather than on argv — see the function doc for the injection rationale.
        stdio: ['pipe', 'pipe', 'pipe'],
      },
    )
    // Decode at the stream level so a multi-byte UTF-8 character that straddles two
    // chunks is reassembled by the stream's decoder. Converting each Buffer chunk on
    // its own would turn both halves into U+FFFD and corrupt the transcript text.
    child.stdout.setEncoding('utf8')
    child.stderr.setEncoding('utf8')
    // Hand Pi the prompt over stdin, then close it so print mode sees EOF and
    // runs. Ignore stdin errors (e.g. EPIPE if Pi exits before reading): the
    // 'close'/'error' handlers below own the actual failure reporting.
    child.stdin.on('error', () => {})
    child.stdin.end(opts.userPrompt)
    let stdout = ''
    let stderr = ''
    let aborted = false
    // Set when the no-progress guard kills Pi; carries the diagnostic the run
    // fails with (distinct from an external watchdog abort).
    let guardReason: string | undefined
    // Pi's json mode is strict LF-framed JSONL; buffer partial lines across
    // chunks so we only ever parse complete records for progress + the guard.
    let lineBuffer = ''
    // Counters for silent losses, warned ONCE at close (not per-line, to avoid log
    // spam): `{`-leading lines that failed to JSON.parse, and observer-callback throws.
    let malformedLines = 0
    let observerErrors = 0
    const guard = new ProgressGuard(
      opts.guardLimits ?? progressGuardLimitsFromEnv(),
      opts.expectsEdits ?? true,
    )
    // Start boundary for the next tool span: each tool's slice runs from the previous
    // tool's end (or the run start) to its own `tool_execution_end`. Approximate but
    // contiguous — enough for the trace tree, and metadata-only.
    let toolBoundary = Date.now()

    // SIGTERM first, then SIGKILL if Pi ignores it. Shared by the watchdog abort
    // and the no-progress guard; the `close` handler turns it into a rejection.
    const killChild = (): void => killChildProcess(child)

    // Parse each complete JSONL record once, feeding both the todo-progress
    // emitter and the no-progress guard. A tripped guard kills Pi with a
    // diagnostic the run then fails on.
    // `runGuard` is false only for the at-close flush of a final unterminated line: the
    // process has already exited, so feeding that record to the no-progress guard could trip
    // it and turn a clean (code 0) exit into a spurious "no progress" rejection. The flush
    // still recovers the record's progress/span signal; only the kill decision is skipped.
    const processLine = (line: string, runGuard = true): void => {
      if (!line.startsWith('{')) return
      let event: Record<string, unknown>
      try {
        event = JSON.parse(line) as Record<string, unknown>
      } catch {
        // A `{`-leading line that doesn't parse is a corrupted/truncated record we drop
        // (progress/spans for it are lost). Count it; warned once at close.
        malformedLines++
        return
      }
      if (opts.onEvent) {
        try {
          opts.onEvent(event)
        } catch {
          // A faulty observer must never break the run.
          observerErrors++
        }
      }
      if (opts.onProgress) {
        const progress = parseTodoProgress(event)
        if (progress) opts.onProgress(progress)
      }
      if (opts.onSpan) {
        const signal = toolCallSignal(event)
        if (signal && signal.name) {
          const endedAt = Date.now()
          try {
            opts.onSpan({
              tool: signal.name,
              startedAt: toolBoundary,
              endedAt,
              ok: !signal.isError,
            })
          } catch {
            // A faulty observer must never break the run.
            observerErrors++
          }
          toolBoundary = endedAt
        }
      }
      if (runGuard && !guardReason && !aborted) {
        const reason = guard.observe(event)
        if (reason) {
          guardReason = reason
          killChild()
        }
      }
    }

    // Append a decoded stdout chunk and process every complete (LF-terminated) line;
    // whatever follows the last newline stays buffered for the next chunk.
    const consumeStdout = (text: string): void => {
      lineBuffer += text
      let nl = lineBuffer.indexOf('\n')
      while (nl !== -1) {
        const line = lineBuffer.slice(0, nl).trim()
        lineBuffer = lineBuffer.slice(nl + 1)
        nl = lineBuffer.indexOf('\n')
        processLine(line)
      }
    }

    // When the watchdog aborts, terminate Pi: the `close` handler then rejects
    // with the abort reason.
    const onAbort = (): void => {
      aborted = true
      killChild()
    }
    opts.signal?.addEventListener('abort', onAbort, { once: true })

    // `text` is already a decoded string because both streams use setEncoding('utf8').
    const onChunk = (text: string, sink: 'out' | 'err'): void => {
      if (sink === 'out') {
        stdout += text
        consumeStdout(text)
      } else stderr += text
      // Any output means progress: reset the inactivity watchdog.
      opts.onActivity?.()
    }
    child.stdout.on('data', (chunk: string) => onChunk(chunk, 'out'))
    child.stderr.on('data', (chunk: string) => onChunk(chunk, 'err'))
    child.on('error', (error) => {
      opts.signal?.removeEventListener('abort', onAbort)
      reject(error)
    })
    child.on('close', (code) => {
      opts.signal?.removeEventListener('abort', onAbort)
      // Flush a final record that arrived without a trailing newline: Pi usually LF-frames
      // every line, but a clean exit can leave the last event (often `agent_end`) unterminated
      // in the buffer, so without this its progress/span signal would be silently lost. The
      // guard is deliberately NOT run on it (`runGuard = false`): the process has already exited.
      if (lineBuffer.trim()) {
        processLine(lineBuffer.trim(), false)
        lineBuffer = ''
      }
      // Surface any silent stream losses ONCE (counts, not per-line), so a corrupted JSONL
      // stream or a throwing observer is diagnosable rather than invisible.
      if (malformedLines > 0 || observerErrors > 0) {
        log.warn('pi: skipped malformed JSONL lines / observer errors', {
          malformedLines,
          observerErrors,
        })
      }
      if (guardReason) {
        const tail = redactSecrets(stderr.trim()).slice(-700)
        reject(new Error(tail ? `${guardReason} Agent stderr: ${tail}` : guardReason))
      } else if (aborted) {
        reject(
          new Error(
            opts.signal?.reason instanceof Error ? opts.signal.reason.message : 'pi aborted',
          ),
        )
      } else if (code === 0) {
        const tail = redactSecrets(stderr.trim()).slice(-1500)
        // Pi can exit 0 even when the agent run ended in a hard error (e.g. every
        // model call failed and its retries were exhausted): the process completed,
        // but the agent did not. Exit code alone then reads as success, and a run
        // that RESUMED a branch with prior commits would even open a PR off work this
        // pass never produced. Inspect the terminal transcript and fail loudly so the
        // step is marked failed instead of masking a total failure as green.
        const runError = terminalRunError(stdout)
        if (runError) {
          const scrubbed = redactSecrets(runError).slice(0, 1000)
          reject(new Error(tail ? `${scrubbed} Agent stderr: ${tail}` : scrubbed))
        } else {
          resolve({ ...summarizePiRun(stdout), ...(tail ? { stderrTail: tail } : {}) })
        }
      } else {
        // Redact before slicing, like every other branch: this excerpt ends up in the
        // rejection message, and raw stderr/stdout may contain credentials.
        const excerpt = redactSecrets(stderr || stdout).slice(-500)
        reject(new Error(`pi exited with code ${code}: ${excerpt}`))
      }
    })
  })
}
1005
+
1006
/**
 * Turn Pi's LF-framed JSONL stdout into its event records. Each line is trimmed; lines
 * that do not begin with `{`, or that fail to parse as JSON, are treated as noise and
 * dropped without error.
 */
function parsePiEvents(stdout: string): Record<string, unknown>[] {
  return stdout.split('\n').flatMap((rawLine) => {
    const candidate = rawLine.trim()
    if (!candidate.startsWith('{')) return []
    try {
      return [JSON.parse(candidate) as Record<string, unknown>]
    } catch {
      // Looked like a record but is not valid JSON — skip it.
      return []
    }
  })
}
1020
+
1021
/**
 * The terminal-failure message when Pi's run ended in a hard error (the model was
 * unreachable / refused, and Pi exhausted its auto-retries), else undefined. Only
 * the FINAL outcome counts: a mid-run hiccup the agent recovered from leaves a clean
 * terminal `agent_end`, so it returns undefined. Scans from the end and decides on
 * the first terminal signal it meets — the trailing `auto_retry_end` (its `success`
 * flag) or the last `agent_end` (its `stopReason`). Pure so it is unit-testable over
 * a fixed event sequence.
 *
 * A failure is ALWAYS reported with a non-empty message: when the event carries no
 * usable text (`finalError` / `errorMessage` missing, non-string, or blank) a generic
 * message is substituted. Callers test the result for truthiness, so returning
 * `undefined` or `''` for a failed run would make it read as a success.
 *
 * @param stdout Pi's raw `--mode json` stdout.
 * @returns A non-empty failure message, or undefined when the run did not end in error.
 */
export function terminalRunError(stdout: string): string | undefined {
  // Use the event's own text when it is a non-blank string, else the generic fallback.
  const describe = (detail: unknown, fallback: string): string =>
    typeof detail === 'string' && detail.trim() !== '' ? detail : fallback

  const events = parsePiEvents(stdout)
  for (let i = events.length - 1; i >= 0; i--) {
    const e = events[i]!
    if (e.type === 'auto_retry_end') {
      // Only an explicit `success: false` is a failure; anything else is a recovery.
      if (e.success !== false) return undefined
      return describe(e.finalError, 'the agent failed after exhausting its retries')
    }
    if (e.type === 'agent_end') {
      // `stopReason` alone decides the outcome; the message is only descriptive.
      if (e.stopReason !== 'error') return undefined
      return describe(e.errorMessage, 'the agent run ended in an error')
    }
  }
  return undefined
}
1050
+
1051
/**
 * Build the close-of-run outcome from Pi's stdout: the assistant summary, the
 * {@link PiRunStats} and the output diagnostics, all derived from a single parse of
 * the output. The harness uses this both to report the answer and to detect a no-op
 * run (the agent never acted).
 */
export function summarizePiRun(stdout: string): PiRunOutcome {
  const parsed = parsePiEvents(stdout)
  const summary = summaryFromEvents(parsed, stdout)
  const stats = statsFromEvents(parsed)
  const diagnostics = diagnosticsFromEvents(parsed)
  return { summary, stats, diagnostics }
}
1064
+
1065
/**
 * Output-quality signals taken from the last `agent_end` event that carries a
 * `messages` array: whether ANY assistant completion reached the output ceiling (its
 * content was cut off), whether the FINAL one did, and whether that final turn has no
 * text at all. Pure, so it can be unit-tested over a fixed event sequence. When no such
 * transcript exists every flag is false (a no-op run is already caught by
 * {@link agentNeverActed}).
 *
 * `cap` is the per-completion ceiling Pi requested ({@link PI_MAX_OUTPUT_TOKENS}). A
 * completion counts as truncated when its `usage.output` is at or above the cap — this
 * holds even when the model reports a non-`length` stop reason (Workers AI labelled a
 * cut-off tool call `tool_calls`, not `length`).
 */
export function diagnosticsFromEvents(
  events: Record<string, unknown>[],
  cap: number = PI_MAX_OUTPUT_TOKENS,
): RunDiagnostics {
  // Newest-first search for the terminal transcript.
  const terminal = [...events]
    .reverse()
    .find((event) => event.type === 'agent_end' && Array.isArray(event.messages))
  if (!terminal) {
    return { truncated: false, finalTruncated: false, finalAnswerEmpty: false }
  }

  const transcript = terminal.messages as unknown[]
  const assistantTurns = transcript.filter(
    (entry): entry is Record<string, unknown> => isObject(entry) && entry.role === 'assistant',
  )
  const hitCeiling = (turn: Record<string, unknown>): boolean =>
    assistantOutputTokens(turn) >= cap
  const finalTurn = assistantTurns.at(-1)

  return {
    truncated: assistantTurns.some(hitCeiling),
    finalTruncated: finalTurn !== undefined && hitCeiling(finalTurn),
    finalAnswerEmpty: finalTurn !== undefined && messageText(finalTurn) === '',
  }
}
1101
+
1102
/**
 * Completion-token count (`usage.output`) reported on a Pi assistant message. Returns 0
 * when `usage` is not an object or `usage.output` is not a number.
 */
function assistantOutputTokens(message: Record<string, unknown>): number {
  const { usage } = message
  const reported = isObject(usage) ? usage.output : undefined
  return typeof reported === 'number' ? reported : 0
}
1109
+
1110
/**
 * {@link RunDiagnostics} computed straight from Pi's raw `--mode json` stdout: parses
 * the event stream, then delegates to {@link diagnosticsFromEvents}.
 */
export function runDiagnostics(stdout: string, cap: number = PI_MAX_OUTPUT_TOKENS): RunDiagnostics {
  const events = parsePiEvents(stdout)
  return diagnosticsFromEvents(events, cap)
}
1114
+
1115
/**
 * Count what the agent actually did. The canonical source is the last `agent_end`
 * transcript (assistant `toolCall` parts + text). When no terminal transcript was
 * emitted, the streamed `tool_execution_end` / `message_end` events are tallied
 * instead, so a schema tweak never causes a no-op to be mistaken for a real run.
 */
function statsFromEvents(events: Record<string, unknown>[]): PiRunStats {
  const terminal = [...events]
    .reverse()
    .find((event) => event.type === 'agent_end' && Array.isArray(event.messages))
  if (terminal) return statsFromMessages(terminal.messages as unknown[])

  let executions = 0
  let resultMessages = 0
  let assistantChars = 0
  for (const event of events) {
    if (event.type === 'tool_execution_end') {
      executions += 1
      continue
    }
    if (event.type !== 'message_end' || !isObject(event.message)) continue
    const message = event.message
    if (message.role === 'assistant') {
      assistantChars += messageText(message).length
    } else if (message.role === 'toolResult') {
      resultMessages += 1
    }
  }
  // One call may appear as both a `tool_execution_end` and a toolResult `message_end`,
  // so the two are never summed: use the execution count, and the toolResult count
  // only when no executions were seen.
  return { toolCalls: executions || resultMessages, assistantChars }
}
1144
+
1145
/**
 * {@link PiRunStats} from a transcript. Only assistant messages are considered: string
 * content contributes its trimmed length; in a parts array each `toolCall` part counts
 * as one tool call and every other part with a string `text` contributes that text's
 * length.
 */
function statsFromMessages(messages: unknown[]): PiRunStats {
  let calls = 0
  let chars = 0
  for (const entry of messages) {
    if (!isObject(entry) || entry.role !== 'assistant') continue
    const { content } = entry
    if (typeof content === 'string') {
      chars += content.trim().length
      continue
    }
    if (!Array.isArray(content)) continue
    for (const piece of content) {
      if (!isObject(piece)) continue
      if (piece.type === 'toolCall') calls += 1
      else if (typeof piece.text === 'string') chars += piece.text.length
    }
  }
  return { toolCalls: calls, assistantChars: chars }
}
1164
+
1165
/**
 * Extract the assistant's final summary from Pi's JSON-lines output. Pi emits a
 * terminal `agent_end` event whose `messages` holds the full transcript, so the last
 * assistant message there is the canonical answer. If that yields nothing, assistant
 * text from `message_end` events is used, and failing that a raw tail of the output —
 * so a schema tweak never loses output.
 */
export function parsePiOutput(stdout: string): string {
  const events = parsePiEvents(stdout)
  return summaryFromEvents(events, stdout)
}
1174
+
1175
/**
 * Summary extraction over already-parsed events (shared by {@link parsePiOutput}).
 * Tries, in order: the last assistant text of the newest `agent_end` transcript that
 * has one; the assistant text of all `message_end` events joined by newlines; the
 * last 2000 characters of the trimmed raw output.
 */
function summaryFromEvents(events: Record<string, unknown>[], stdout: string): string {
  // 1. Newest-first over the agent_end transcripts.
  for (const event of [...events].reverse()) {
    if (event.type !== 'agent_end' || !Array.isArray(event.messages)) continue
    const answer = lastAssistantText(event.messages as unknown[])
    if (answer) return answer
  }

  // 2. Assistant text gathered from the streamed message_end events.
  const isAssistantMessage = (value: unknown): boolean =>
    typeof value === 'object' &&
    value !== null &&
    (value as { role?: unknown }).role === 'assistant'
  const streamed = events
    .filter((event) => event.type === 'message_end' && isAssistantMessage(event.message))
    .map((event) => messageText(event.message))
    .filter((text) => text !== '')
    .join('\n')
    .trim()
  if (streamed) return streamed

  // 3. No structured match at all — hand back a trimmed tail of the raw output.
  return stdout.trim().slice(-2000)
}
1205
+
1206
/**
 * Walk a transcript from the end and return the text of the first assistant message
 * that has any; '' when there is none.
 */
function lastAssistantText(messages: unknown[]): string {
  let index = messages.length
  while (index-- > 0) {
    const candidate = messages[index]
    if (candidate === null || typeof candidate !== 'object') continue
    if ((candidate as { role?: unknown }).role !== 'assistant') continue
    const text = messageText(candidate)
    if (text) return text
  }
  return ''
}
1217
+
1218
/**
 * Text of a Pi message. String content is returned trimmed; for a parts array the
 * string `text` of every object part is concatenated (parts without one contribute
 * nothing) and the result trimmed. Anything else yields ''.
 */
function messageText(message: unknown): string {
  if (message === null || typeof message !== 'object') return ''
  const { content } = message as { content?: unknown }
  if (typeof content === 'string') return content.trim()
  if (!Array.isArray(content)) return ''

  let combined = ''
  for (const piece of content as unknown[]) {
    if (piece === null || typeof piece !== 'object') continue
    const { text } = piece as { text?: unknown }
    if (typeof text === 'string') combined += text
  }
  return combined.trim()
}