@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
package/src/agent.ts ADDED
@@ -0,0 +1,948 @@
1
+ import { join } from 'node:path'
2
+ import { tmpdir } from 'node:os'
3
+ import { mkdir, mkdtemp, opendir, rm } from 'node:fs/promises'
4
+ import { execFile } from 'node:child_process'
5
+ import { promisify } from 'node:util'
6
+ import type {
7
+ AgentInfraSpec,
8
+ AgentJob,
9
+ AgentResult,
10
+ InfraSetupRecord,
11
+ ServiceInfraSpec,
12
+ } from './job.js'
13
+ import { standUpFrontend, tearDownFrontend } from './frontend-infra.js'
14
+ import { captureRedactedOutput, redactSecrets } from './redact.js'
15
+ import {
16
+ cloneRepo,
17
+ commitAll,
18
+ conflictDiff,
19
+ hasAgentChanges,
20
+ headCommit,
21
+ mergeBranch,
22
+ openPullRequest,
23
+ prepareExistingCheckout,
24
+ pushBranch,
25
+ reinitAndPush,
26
+ unmergedPaths,
27
+ } from './git.js'
28
+ import type { PiRunStats } from './pi.js'
29
+ import { noChangesReason, runCodingAgent } from './coding-agent.js'
30
+ import {
31
+ acquireRepoCheckout,
32
+ agentNeverActed,
33
+ agentOutputTail,
34
+ NEVER_ACTED_CAUSE,
35
+ runAgentInWorkspace,
36
+ unusableFinalAnswerCause,
37
+ withWorkspace,
38
+ } from './pi-workspace.js'
39
+ import {
40
+ type StructuredOutputDiagnostics,
41
+ diagnosticsSuffix,
42
+ resolveStructuredOutput,
43
+ } from './structured-output.js'
44
+ import type { RunOptions } from './runner.js'
45
+ import { log, type Logger } from './logger.js'
46
+
47
+ // The single generic agent handler — the manifest-driven replacement for the bespoke
48
+ // per-kind handlers. It runs an LLM over an optional checkout and returns text/JSON
49
+ // (`explore`) or commits + pushes its edits and optionally opens a PR (`coding`). WHAT
50
+ // the agent does is decided by the backend and passed as job DATA (never an agent-kind
51
+ // string), and all mechanical work that CAN run without a checkout (rendering artifact
52
+ // files from the structured output, board ingest) lives on the backend before/after this
53
+ // run via the RepoFiles port.
54
+ //
55
+ // Two coding flows still carry working-tree Git mechanics that a contents-API-only
56
+ // RepoFiles cannot perform, so they are keyed off job data here (NOT off a kind string):
57
+ // `mergeBase` ⇒ surface real merge conflicts via a working-tree base→branch merge
58
+ // (conflict resolution); `bootstrap` ⇒ reinitialise history and force-push to a separate
59
+ // target repo. These are the deliberate, documented exceptions — do NOT grow this into a
60
+ // general `if (job.someFlag)` dispatch; anything that doesn't need a checkout belongs in
61
+ // backend pre/post-ops. See backend/docs/custom-agents.md.
62
+
// Promise-returning form of `child_process.execFile`; used below to run the
// `docker compose` up / down commands and collect their stdout / stderr.
const exec = promisify(execFile)
64
+
/**
 * Bring the service's docker-compose dependencies up (local infra only). Best-effort:
 * runs `docker compose -f <path> up -d --wait` in the checkout. A missing Docker daemon
 * or a compose failure is logged and surfaced to the agent (as a prompt note) rather
 * than failing the job — the agent can still run unit-level tests and report what it
 * could. A no-op for ephemeral / no-infra / no-compose-path runs.
 *
 * Whether it succeeds or fails, the (redacted, bounded) command output is captured into a
 * {@link InfraSetupRecord} returned alongside the prompt `note`, so the backend can surface
 * the in-container dependency stand-up logs on the Tester step — the failure-class artifact
 * the orchestrator-side provisioning logs can't see.
 *
 * The failure message is redacted ONCE and that redacted text is what gets logged, returned
 * as the prompt `note`, and stored on the record: an `execFile` rejection message embeds the
 * command's stderr, so the raw text must not reach the log or the model prompt either.
 *
 * @param dir     Working directory the compose command runs in (the checkout).
 * @param infra   Service infra spec; `environment`, `noInfraDependencies` and `composePath`
 *                decide whether anything is started at all.
 * @param signal  Optional abort signal forwarded to the child process.
 * @param logger  Logger for the stand-up start / failure events.
 * @returns `started` plus, when a stand-up was attempted, the captured `record`; on failure
 *          also the (redacted) `note` describing what went wrong.
 */
async function standUpInfra(
  dir: string,
  infra: ServiceInfraSpec,
  signal: AbortSignal | undefined,
  logger: Logger,
): Promise<{ started: boolean; note?: string; record?: InfraSetupRecord }> {
  if (infra.environment !== 'local' || infra.noInfraDependencies || !infra.composePath) {
    return { started: false }
  }
  const startedAt = Date.now()
  try {
    logger.info('agent(explore): standing up infra', { composePath: infra.composePath })
    // Raise maxBuffer well above the 1MB default so a chatty compose stand-up can't fail the
    // (best-effort) infra step with ENOBUFS; the captured output is tail-bounded on storage.
    const { stdout, stderr } = await exec(
      'docker',
      ['compose', '-f', infra.composePath, 'up', '-d', '--wait'],
      { cwd: dir, signal, timeout: 5 * 60_000, maxBuffer: 16 * 1024 * 1024 },
    )
    const logs = captureRedactedOutput(stdout, stderr)
    // One timestamp so `at` and `durationMs` describe the same instant.
    const finishedAt = Date.now()
    return {
      started: true,
      record: {
        started: true,
        composePath: infra.composePath,
        at: finishedAt,
        durationMs: finishedAt - startedAt,
        ...(logs ? { logs } : {}),
      },
    }
  } catch (err) {
    // Redact before the message goes anywhere: the log line, the prompt note and the stored
    // record all share this one sanitised string.
    const note = redactSecrets(err instanceof Error ? err.message : String(err))
    logger.warn('agent(explore): infra stand-up failed', { error: note })
    // `execFile` rejections carry the partial stdout/stderr on the error object — capture them
    // so the stored logs explain the failure (a port clash, a pull-auth error, an exited
    // dependency), not just the one-line exit message.
    const partial = err as { stdout?: unknown; stderr?: unknown }
    const logs = captureRedactedOutput(partial.stdout, partial.stderr)
    const finishedAt = Date.now()
    return {
      started: false,
      note,
      record: {
        started: false,
        composePath: infra.composePath,
        at: finishedAt,
        durationMs: finishedAt - startedAt,
        error: note,
        ...(logs ? { logs } : {}),
      },
    }
  }
}
129
+
/**
 * Single entry point for standing a run's infra up, returning one `cleanup` handle that
 * undoes whatever was started. The spec's `kind` selects the flow:
 *
 * - `kind: 'frontend'` — the UI-test flow: {@link standUpFrontend} builds/serves the app and
 *   WireMock as processes; cleanup is {@link tearDownFrontend} on those processes.
 * - anything else — the backend-service flow: {@link standUpInfra} brings the docker-compose
 *   stack up; cleanup is {@link tearDownInfra}.
 *
 * Keeping the branch here means `runExploreMode` only ever sees one handle and always runs the
 * matching teardown in its `finally`.
 *
 * `dir` is the clone ROOT and `workDir` the service subtree (the same path when the run is not
 * monorepo-scoped). Compose runs at the root because its `composePath` is repo-relative; the
 * frontend stand-up runs in `workDir` because a monorepo frontend's `package.json` /
 * `outputDir` / `mocks/` live under the service subtree.
 */
async function manageInfra(
  dir: string,
  workDir: string,
  infra: AgentInfraSpec,
  signal: AbortSignal | undefined,
  onActivity: (() => void) | undefined,
  logger: Logger,
): Promise<{
  note?: string
  serveUrl?: string
  record?: InfraSetupRecord
  cleanup: () => Promise<void>
}> {
  if (infra.kind === 'frontend') {
    // `onActivity` keeps the inactivity watchdog fed during the frontend build/serve stand-up,
    // which (unlike docker-compose's 5-min-capped `up`) can outlast the inactivity window.
    // `workDir` so a monorepo frontend builds/serves from its own package subtree.
    const { note, serveUrl, record, processes } = await standUpFrontend(
      workDir,
      infra,
      signal,
      onActivity,
      logger,
    )
    return {
      ...(note ? { note } : {}),
      ...(serveUrl ? { serveUrl } : {}),
      record,
      cleanup: () => tearDownFrontend(processes, logger),
    }
  }

  const { note, record } = await standUpInfra(dir, infra, signal, logger)
  return {
    ...(note ? { note } : {}),
    ...(record ? { record } : {}),
    cleanup: () => tearDownInfra(dir, infra),
  }
}
175
+
/**
 * Turn a stand-up outcome into the dynamic notes appended to the agent's user prompt.
 *
 * - A `note` (a failed build / compose) becomes a "test around it" concern.
 * - A `serveUrl` tells the UI tester where the freshly built app is served and pre-empts a
 *   live-backend CORS failure being mis-reported as an app defect.
 *
 * Pure (no IO), so wording and ordering can be unit-tested.
 *
 * @param managed Stand-up outcome; both fields are optional and an empty string counts as absent.
 * @returns The notes in order: the problem note first, then the serve-URL note.
 */
export function buildInfraNotes(managed: { note?: string; serveUrl?: string }): string[] {
  const { note, serveUrl } = managed

  const problemNotes = note
    ? [
        `standing the infra up reported a problem (${note}). Test what you can and ` +
          `flag any dependency-related gaps as concerns.`,
      ]
    : []

  const serveNotes = serveUrl
    ? [
        `The frontend under test is built and served at ${serveUrl}, with its other ` +
          `backend upstreams handled by WireMock. Drive your UI tests against ${serveUrl}. ` +
          `If a call to a live backend fails with a CORS / cross-origin error, that is an infra ` +
          `gap (the backend must allow the ${serveUrl} origin), not an app defect — flag ` +
          `it as a concern rather than a failing test.`,
      ]
    : []

  return [...problemNotes, ...serveNotes]
}
202
+
/**
 * Best-effort `docker compose down -v` for the dependencies {@link standUpInfra} may have
 * started. Applies the same gate as the stand-up (local environment, infra dependencies
 * present, a compose path set), so it does nothing when nothing could have been started.
 * Never throws: a failed teardown is ignored.
 */
async function tearDownInfra(dir: string, infra: ServiceInfraSpec): Promise<void> {
  const { environment, noInfraDependencies, composePath } = infra
  if (environment !== 'local') return
  if (noInfraDependencies) return
  if (!composePath) return

  const downArgs = ['compose', '-f', composePath, 'down', '-v']
  try {
    await exec('docker', downArgs, { cwd: dir, timeout: 2 * 60_000 })
  } catch {
    // The container is ephemeral and torn down with the run anyway — ignore.
  }
}
215
+
/**
 * Extract the first JSON object from an agent's final message (tolerating fences/prose).
 *
 * Strategies, tried in order (each later one only runs when the earlier ones failed, so any
 * input that parsed before still yields the same value):
 *  1. Parse the whole message — after unwrapping it when the ENTIRE message is one
 *     ``` / ```json fenced block.
 *  2. Parse the span from the first `{` to the last `}`.
 *  3. Walk each `{` left to right, find its string-aware matching `}`, and return the first
 *     balanced span that parses. This recovers `Result: {"a":1}. Note the {braces}` where
 *     strategy 2 fails because later prose contains braces.
 *
 * @param text The agent's final message.
 * @returns The parsed JSON value (whatever `JSON.parse` produced for the winning span).
 * @throws Error('agent did not return a JSON object') when the text holds no `{ … }` span;
 *         otherwise the `SyntaxError` from strategy 2 when no candidate parses.
 */
function extractJsonObject(text: string): unknown {
  const trimmed = text.trim()
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(trimmed)
  const body = fenced ? (fenced[1] ?? '') : trimmed
  try {
    return JSON.parse(body)
  } catch {
    // Not pure JSON — fall through to the span-based strategies.
  }

  const start = body.indexOf('{')
  const end = body.lastIndexOf('}')
  if (start === -1 || end === -1 || end <= start) {
    throw new Error('agent did not return a JSON object')
  }

  let spanError: unknown
  try {
    return JSON.parse(body.slice(start, end + 1))
  } catch (err) {
    // Remember the original failure: it is what callers saw before the balanced scan existed.
    spanError = err
  }

  for (let open = start; open !== -1; open = body.indexOf('{', open + 1)) {
    const close = findMatchingBrace(body, open)
    if (close === -1) continue
    try {
      return JSON.parse(body.slice(open, close + 1))
    } catch {
      // This balanced span is not valid JSON — try the next opening brace.
    }
  }
  throw spanError
}

/**
 * Index of the `}` that closes the `{` at `open`, or -1 when it is never closed. Braces
 * inside double-quoted strings (with backslash escapes honoured) are not counted.
 */
function findMatchingBrace(text: string, open: number): number {
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = open; i < text.length; i++) {
    const ch = text.charAt(i)
    if (inString) {
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return i
    }
  }
  return -1
}
232
+
/**
 * Resolve the directory the agent works in for a checkout at `dir`.
 *
 * With a (non-empty) `serviceDirectory` the result is that subtree under `dir`, created —
 * parents included — when it does not exist yet. Without one the clone root itself is
 * returned and nothing touches the filesystem. Shared so the explore and preview flows
 * derive `workDir` the same way.
 */
async function deriveWorkDir(dir: string, serviceDirectory: string | undefined): Promise<string> {
  if (!serviceDirectory) return dir

  const serviceRoot = join(dir, serviceDirectory)
  await mkdir(serviceRoot, { recursive: true })
  return serviceRoot
}
243
+
/**
 * Clone `job.branch` into `dir`, then hand back the service work directory derived from
 * `job.repo.serviceDirectory`. The job's branch is passed to {@link cloneRepo} as the repo's
 * `baseBranch`, and `job.full` is forwarded as the `full` flag. Used by the explore and
 * preview flows; the coding and persistent-checkout paths have their own resume / full-clone
 * logic.
 */
async function cloneServiceCheckout(
  dir: string,
  job: AgentJob,
  signal: AbortSignal | undefined,
): Promise<string> {
  const { repo, branch, ghToken, full } = job
  await cloneRepo({
    repo: { ...repo, baseBranch: branch },
    ghToken,
    dir,
    full,
    signal,
  })
  return deriveWorkDir(dir, repo.serviceDirectory)
}
263
+
/**
 * Entry point for one generic agent job: routes on `job.mode`. `'preview'` and `'coding'`
 * have dedicated flows; every other mode value runs the read-only explore flow.
 */
export async function handleAgent(job: AgentJob, opts: RunOptions = {}): Promise<AgentResult> {
  switch (job.mode) {
    case 'preview':
      return runPreviewMode(job, opts)
    case 'coding':
      return runCodingMode(job, opts)
    default:
      return runExploreMode(job, opts)
  }
}
269
+
/**
 * Classify a preview stand-up result as success or hard failure. Pure, so the boundary is
 * unit-testable without spawning a build.
 *
 * A preview has to actually come up: with no `serveUrl` the outcome is a failure whose
 * `error` embeds the stand-up `note` when there is one (a generic "never reachable" reason
 * otherwise). With a `serveUrl` the outcome is a success and a non-empty `note` rides along
 * as a non-fatal warning.
 */
export function buildPreviewOutcome(standUp: {
  serveUrl?: string
  note?: string
}): { ok: true; url: string; note?: string } | { ok: false; error: string } {
  const { serveUrl, note } = standUp

  if (serveUrl) {
    return note ? { ok: true, url: serveUrl, note } : { ok: true, url: serveUrl }
  }

  if (note) {
    return { ok: false, error: `the frontend preview did not come up (${note})` }
  }
  return {
    ok: false,
    error: 'the frontend preview did not come up (the served app was never reachable)',
  }
}
291
+
/**
 * Long-lived browsable preview (local/node only). Clones the frontend branch, then builds and
 * serves the app with the SAME {@link standUpFrontend} the UI tester uses — and leaves it
 * running. No agent runs here.
 *
 * Lifetime rules:
 * - Success: the serve / WireMock children are deliberately NOT torn down and the checkout
 *   is NOT removed, because the kept-alive server still serves those files. That is why the
 *   checkout is a plain `mkdtemp` directory rather than an auto-removed `withWorkspace` one.
 * - Stand-up produced no serve URL: hard failure — the partial stand-up is torn down and the
 *   checkout removed, and the stand-up record travels back with the error.
 * - Anything throws: the checkout is removed and the error rethrown.
 *
 * A job whose `infra.kind` is not `'frontend'` is rejected up front with a structured failure
 * (nothing has been created at that point).
 */
async function runPreviewMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
  const logger = opts.log ?? log
  const frontendSpec = job.infra
  if (frontendSpec?.kind !== 'frontend') {
    // Invalid dispatch: a preview job MUST carry the frontend infra spec. No checkout or
    // processes exist yet, so answer with the same structured hard failure the rest of this
    // flow uses instead of throwing a bare exception at the job registry.
    return {
      error: "invalid preview job: 'infra.kind' must be 'frontend'",
      failureCause: 'no-usable-output',
    }
  }

  opts.onPhase?.('clone')
  logger.info('agent(preview): cloning')
  // Deliberately not a `withWorkspace` temp dir — that one is deleted in a `finally` as soon
  // as the function returns, taking the served files with it. The preview container is
  // single-purpose and torn down on stop, so the checkout is meant to stay.
  const checkoutDir = await mkdtemp(join(tmpdir(), 'agent-preview-'))
  const discardCheckout = (): Promise<void> => rm(checkoutDir, { recursive: true, force: true })

  try {
    const workDir = await cloneServiceCheckout(checkoutDir, job, opts.signal)

    opts.onPhase?.('serve')
    logger.info('agent(preview): building + serving', {
      serviceDirectory: job.repo.serviceDirectory,
    })
    const standUp = await standUpFrontend(
      workDir,
      frontendSpec,
      opts.signal,
      opts.onActivity,
      logger,
    )
    const infraSetupFields: { infraSetup?: InfraSetupRecord } = standUp.record
      ? { infraSetup: standUp.record }
      : {}

    const outcome = buildPreviewOutcome(standUp)
    if (outcome.ok) {
      // Kept alive on purpose: the serve/WireMock children outlive this job and keep the app
      // reachable until the container stops. `outcome.note` (WireMock down) is a soft warning.
      logger.info('agent(preview): serving (kept alive)', { url: outcome.url })
      const summary = outcome.note
        ? `Frontend preview built and served at ${outcome.url} (${outcome.note}).`
        : `Frontend preview built and served at ${outcome.url}.`
      return { summary, preview: { url: outcome.url }, ...infraSetupFields }
    }

    // Never came up: stop whatever partially started and drop the checkout so a failed
    // preview leaks neither processes nor disk. The backend surfaces the record + failure.
    await tearDownFrontend(standUp.processes, logger)
    await discardCheckout()
    return { error: outcome.error, failureCause: 'no-usable-output', ...infraSetupFields }
  } catch (err) {
    // A throw before the stand-up handed off (failed / aborted clone, an mkdir error) would
    // otherwise leak the checkout `withWorkspace` normally reclaims. Drop it and rethrow so
    // the job registry records the failure.
    await discardCheckout()
    throw err
  }
}
358
+
/**
 * Read-only exploration: check out `branch`, run the agent with no edits expected, and return
 * its prose report — or, when `output.kind === 'structured'`, the parsed JSON value as
 * `custom` (the backend renders any artifact files from it in a post-op). An edit-free run is
 * the expected outcome; the failure cases are an empty summary, an unusable final answer
 * (opt-in via `output.failOnUnusableFinal`), or no parseable structured output.
 *
 * When the job carries an `infra` spec the infra is stood up before the agent runs and torn
 * down afterwards. The stand-up record rides back on every result. Teardown is best-effort:
 * a cleanup failure is logged and never replaces the run's result or masks the error that is
 * already propagating.
 *
 * @param job  The agent job (repo, branch, prompts, model/auth settings, output contract).
 * @param opts Run options: logger, abort signal, phase / activity callbacks.
 * @returns The agent result; failures carry `error` and `failureCause: 'no-usable-output'`.
 */
async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
  const logger = opts.log ?? log
  return acquireRepoCheckout(
    { persistent: job.persistentCheckout === true, prefix: 'agent-explore', repo: job.repo },
    async (dir) => {
      opts.onPhase?.('clone')
      // Monorepo: run with cwd set to the service subtree (created if missing), mirroring the
      // coding flow so a service-scoped exploration sees the right subdirectory.
      const serviceDirectory = job.repo.serviceDirectory
      let workDir: string
      if (job.persistentCheckout) {
        logger.info('agent(explore): preparing reused checkout')
        await prepareExistingCheckout({
          dir,
          repo: job.repo,
          ghToken: job.ghToken,
          branch: job.branch,
          baseBranch: job.branch,
          existing: true,
          signal: opts.signal,
        })
        workDir = await deriveWorkDir(dir, serviceDirectory)
      } else {
        logger.info('agent(explore): cloning')
        workDir = await cloneServiceCheckout(dir, job, opts.signal)
      }

      // Optional infra stand-up (the tester): bring the service's dependencies up for the
      // duration of the run, tearing them down in the `finally`. A stand-up failure is
      // non-fatal — it's surfaced to the agent as a prompt note so it can still run what it
      // can and flag dependency gaps as concerns. The run-mode guidance itself lives in the
      // backend-composed system/user prompt; the harness only manages the lifecycle + this
      // dynamic stand-up note.
      const infra = job.infra
      const managed = infra
        ? await manageInfra(dir, workDir, infra, opts.signal, opts.onActivity, logger)
        : undefined
      // Fold the stand-up outcome into the agent prompt: a stand-up problem is flagged as a
      // concern; a frontend serve URL points the UI tester at the app it just built + served.
      const infraNotes = managed ? buildInfraNotes(managed) : []
      const userPrompt = infraNotes.length
        ? `${job.userPrompt}\n\nNote: ${infraNotes.join(' ')}`
        : job.userPrompt
      // The stand-up record (success or failure, with its captured logs) rides back on EVERY
      // result branch — the backend surfaces it on the Tester step regardless of whether the
      // agent then produced a usable report.
      const infraSetupFields: { infraSetup?: InfraSetupRecord } = managed?.record
        ? { infraSetup: managed.record }
        : {}

      try {
        opts.onPhase?.('agent')
        logger.info('agent(explore): running agent', { serviceDirectory })
        const {
          summary,
          stats,
          stderrTail,
          usage,
          diagnostics: runDiag,
        } = await runAgentInWorkspace(
          {
            dir: workDir,
            systemPrompt: job.systemPrompt,
            userPrompt,
            model: job.model,
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            ambientAuth: job.ambientAuth,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
            serviceDirectory,
            // Read-only: it inspects and reports, making no edits — so the no-progress
            // guard's no-edit bound must not fire on its legitimately edit-free run.
            expectsEdits: false,
            webToolsGuidance: job.webToolsGuidance,
            webSearchProxy: job.webSearch,
            contextFiles: job.contextFiles,
            guardLimits: job.guardLimits,
          },
          opts,
        )

        // Every "no usable output" branch returns the same shape; only the reason differs.
        const unusableResult = (error: string): AgentResult => ({
          summary,
          stats,
          error,
          failureCause: 'no-usable-output',
          ...(usage ? { usage } : {}),
          ...infraSetupFields,
        })

        if (!summary.trim()) {
          return unusableResult(noOutputReason(stats, stderrTail))
        }

        // Opt-in (document producers): a final answer cut off at the output ceiling — or
        // empty — must FAIL LOUDLY here, BEFORE the structured repair below could launder a
        // truncated reply into a half-baked doc the backend then shards/commits + hands
        // onward. Mirrors the bespoke `/spec` handler's `unusableFinalAnswerCause` gate.
        if (job.output?.kind === 'structured' && job.output.failOnUnusableFinal) {
          const unusable = unusableFinalAnswerCause(runDiag)
          if (unusable) {
            return unusableResult(
              `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
            )
          }
        }

        // Prose: the summary IS the deliverable.
        if (job.output?.kind !== 'structured') {
          logger.info('agent(explore): done (prose)', { ...stats })
          return { summary, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
        }

        // Structured: parse the agent's JSON. With repair enabled (default) a malformed
        // reply gets ONE structured repair call before giving up; with `repair:false` we
        // parse directly (no repair channel). The backend coerces/validates + renders from
        // the returned object in a post-op.
        let custom: unknown = null
        let diagnostics: StructuredOutputDiagnostics | undefined
        if (job.output.repair === false) {
          try {
            custom = extractJsonObject(summary)
          } catch {
            custom = null
          }
        } else {
          const resolved = await resolveStructuredOutput(
            {
              label: 'agent',
              shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
              parse: (text) => extractJsonObject(text),
            },
            summary,
            {
              harness: job.harness,
              subscriptionToken: job.subscriptionToken,
              subscriptionBaseUrl: job.subscriptionBaseUrl,
              proxyBaseUrl: job.proxyBaseUrl,
              sessionToken: job.sessionToken,
              model: job.model,
              jobId: job.jobId,
              signal: opts.signal,
            },
          )
          custom = resolved.value
          diagnostics = resolved.diagnostics
        }
        if (custom === undefined || custom === null) {
          return unusableResult(noStructuredReason(stats, stderrTail, diagnostics))
        }

        // Stamp the run's actual environment authoritatively onto the structured result when
        // infra was managed (the tester): which env the suite ran in is decided by the job's
        // infra spec, NOT the model, so the backend can echo it back to the UI
        // deterministically even when the model omits it from its JSON (or a structured
        // repair drops it). A frontend run tests the app against its live ephemeral
        // backend(s), so it reports `ephemeral` (the TestReport env vocabulary has no
        // separate frontend value).
        const reportedEnvironment = infra
          ? infra.kind === 'frontend'
            ? 'ephemeral'
            : infra.environment
          : undefined
        if (reportedEnvironment && typeof custom === 'object') {
          ;(custom as Record<string, unknown>).environment = reportedEnvironment
        }
        logger.info('agent(explore): done (structured)', { ...stats })
        return { summary, custom, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
      } finally {
        if (managed) {
          // Best-effort teardown: a throw out of `finally` would replace the value being
          // returned (or the error already propagating), so a cleanup failure is logged
          // instead of being allowed to decide the job's outcome.
          try {
            await managed.cleanup()
          } catch (cleanupErr) {
            logger.warn('agent(explore): infra teardown failed', {
              error: redactSecrets(
                cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr),
              ),
            })
          }
        }
      }
    },
  )
}
550
+
/**
 * Edit-and-push coding run. Two data-keyed variants are peeled off first — `job.bootstrap`
 * (force-push a fresh history to a separate target repo) and `job.mergeBase` (conflict
 * resolution) — each handled by its own flow. Otherwise the coding agent runs via
 * {@link runCodingAgent} against `job.branch` (with `job.newBranch` forwarded when set) and
 * pushes to the first defined of `pushBranch`, `newBranch`, `branch`.
 *
 * Outcomes:
 * - Nothing pushed and `noChangesIsError === false`: a clean non-event (the in-place fixers).
 * - Nothing pushed otherwise: a `'no-changes'` failure (the implementer default).
 * - Pushed with `job.pr` set: a pull request is opened and its URL returned.
 * - Pushed without `job.pr`: the push alone is reported.
 */
async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
  // Repo bootstrap force-pushes a fresh history to a SEPARATE target repo (clone + adapt a
  // reference, or scaffold from scratch). Keyed off job DATA (`bootstrap`), not the agent kind.
  if (job.bootstrap) return runBootstrap(job, opts)
  // Conflict resolution wraps the agent in a different pre/post: clone full, merge the base in
  // to surface the conflicts, then complete the merge commit + push (no PR). Keyed off job
  // DATA (`mergeBase`), not the agent kind.
  if (job.mergeBase) return runConflictResolution(job, opts)

  const targetBranch = job.pushBranch ?? job.newBranch ?? job.branch
  const commitMessage = job.commitMessage ?? job.pr?.title ?? 'Agent changes'

  const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent(
    {
      kind: 'agent',
      jobId: job.jobId,
      repo: job.repo,
      cloneBranch: job.branch,
      ...(job.newBranch ? { newBranch: job.newBranch } : {}),
      pushBranch: targetBranch,
      ghToken: job.ghToken,
      systemPrompt: job.systemPrompt,
      userPrompt: job.userPrompt,
      model: job.model,
      harness: job.harness,
      subscriptionToken: job.subscriptionToken,
      subscriptionBaseUrl: job.subscriptionBaseUrl,
      ambientAuth: job.ambientAuth,
      proxyBaseUrl: job.proxyBaseUrl,
      sessionToken: job.sessionToken,
      commitMessage,
      webToolsGuidance: job.webToolsGuidance,
      webSearchProxy: job.webSearch,
      guardLimits: job.guardLimits,
      ...(job.persistentCheckout ? { persistentCheckout: true } : {}),
      ...(job.streamFollowUps ? { streamFollowUps: true } : {}),
    },
    opts,
  )

  // Fields every result branch reports, in the position they occupy in each result.
  const report = { branch: targetBranch, summary, stats }

  if (pushed) {
    // Changes are on the branch. Open a PR only when the job asked for one.
    if (!job.pr) {
      return { pushed: true, ...report, ...(usage ? { usage } : {}) }
    }
    const prUrl = await openPullRequest({
      owner: job.repo.owner,
      name: job.repo.name,
      ghToken: job.ghToken,
      head: targetBranch,
      base: job.repo.baseBranch,
      pr: job.pr,
      apiBase: job.githubApiBase,
      // The provider (set by the server from the configured backend) selects GitHub-PR vs
      // GitLab-MR authoritatively; the clone URL supplies the GitLab REST base + project path.
      // The harness's git auth is already host-neutral.
      cloneUrl: job.repo.cloneUrl,
      ...(job.repo.provider ? { provider: job.repo.provider } : {}),
      signal: opts.signal,
    })
    return { pushed: true, prUrl, ...report, ...(usage ? { usage } : {}) }
  }

  // A no-op: a clean non-event for the fixers, a failure for the implementer.
  if (job.noChangesIsError === false) {
    return { pushed: false, ...report, ...(usage ? { usage } : {}) }
  }
  return {
    pushed: false,
    ...report,
    error: noChangesReason('the agent produced no file changes', stats, stderrTail),
    failureCause: 'no-changes',
    ...(usage ? { usage } : {}),
  }
}
633
+
634
/**
 * Conflict-resolver coding flow. Works directly on the PR head `job.branch` — no new
 * branch and no new PR are created:
 *
 * 1. clone `job.branch` with full history,
 * 2. merge the base (`job.mergeBase`) into it to surface the Git conflicts,
 * 3. hand the conflicted files and their hunks to the agent,
 * 4. complete the merge commit and push back onto `job.branch`.
 *
 * A clean merge skips the agent entirely: the branch is pushed only when the merge moved
 * HEAD. If unmerged paths remain after the agent ran, nothing is pushed and the result
 * carries an `error` with `failureCause: 'agent'`.
 */
async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
  const logger = opts.log ?? log
  const mergeBase = job.mergeBase!
  const { signal } = opts

  return withWorkspace('conflict', async (dir) => {
    opts.onPhase?.('clone')
    logger.info('agent(conflict): cloning PR branch (full history)')
    // `full: true` — the merge needs the merge base and the base branch to be present.
    await cloneRepo({
      repo: { ...job.repo, baseBranch: job.branch },
      ghToken: job.ghToken,
      dir,
      signal,
      full: true,
    })
    const prTip = await headCommit(dir, signal)

    logger.info('agent(conflict): merging base into PR branch', { base: mergeBase })
    const clean = await mergeBranch(dir, mergeBase, signal)

    if (clean) {
      // Nothing for the agent to do. Compare HEAD before/after to tell "base brought new
      // commits" (push the merge) from "already up to date" (leave the branch alone).
      const tipAfterMerge = await headCommit(dir, signal)
      if (tipAfterMerge !== prTip) {
        opts.onPhase?.('push')
        logger.info('agent(conflict): base merged clean — pushing the merge commit')
        await pushBranch(dir, job.branch, job.ghToken, signal)
        return {
          pushed: true,
          branch: job.branch,
          summary: 'Merged the base in cleanly (no conflicts to resolve).',
          stats: { toolCalls: 0, assistantChars: 0 },
        }
      }
      // Logged so a gate that keeps reporting this branch as "conflicting" can be
      // diagnosed as a base-resolution problem rather than an agent failure.
      logger.info('agent(conflict): base merged clean and branch already up to date', {
        base: mergeBase,
      })
      return {
        pushed: false,
        branch: job.branch,
        summary: 'No conflicts: the branch is already up to date with its base.',
        stats: { toolCalls: 0, assistantChars: 0 },
      }
    }

    // Conflicts are now in the working tree. Put the exact files + hunks at the front of
    // the prompt so the agent works on the conflict instead of drifting onto the original
    // feature task, which is kept only as trailing reference.
    const conflicted = await unmergedPaths(dir, signal)
    opts.onPhase?.('agent')
    logger.info('agent(conflict): resolving conflicts with agent', { conflicted })
    const diff = await conflictDiff(dir, conflicted, signal)
    const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt)

    const agentInput = {
      dir,
      systemPrompt: job.systemPrompt,
      userPrompt,
      model: job.model,
      harness: job.harness,
      subscriptionToken: job.subscriptionToken,
      subscriptionBaseUrl: job.subscriptionBaseUrl,
      ambientAuth: job.ambientAuth,
      proxyBaseUrl: job.proxyBaseUrl,
      sessionToken: job.sessionToken,
      contextFiles: job.contextFiles,
      guardLimits: job.guardLimits,
    }
    const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(agentInput, opts)
    const usageField = usage ? { usage } : {}

    // Guard: a half-resolved tree must never be pushed — the PR would still be broken.
    // Report a failure instead so the engine can retry / notify.
    const unresolved = await unmergedPaths(dir, signal)
    if (unresolved.length > 0) {
      logger.error('agent(conflict): unresolved conflicts remain, refusing to push', {
        unresolved: unresolved.length,
      })
      return {
        pushed: false,
        branch: job.branch,
        summary,
        stats,
        error: unresolvedReason(unresolved, stats, stderrTail),
        failureCause: 'agent',
        ...usageField,
      }
    }

    // Everything resolved: finish the merge commit, then push onto the same branch.
    await commitAll(dir, `Merge ${mergeBase} into ${job.branch}`, signal)
    opts.onPhase?.('push')
    logger.info('agent(conflict): pushing resolved branch', { ...stats })
    await pushBranch(dir, job.branch, job.ghToken, signal)
    return { pushed: true, branch: job.branch, summary, stats, ...usageField }
  })
}
743
+
744
/**
 * Builds the conflict-focused user prompt: it leads with the exact conflicted files and
 * their hunks (so the model acts on the real conflict, not the original feature task) and
 * carries the task only as trailing reference. The role/system prompt frames the run as a
 * merge-conflict resolution; this supplies the concrete material.
 *
 * @param baseBranch Branch that was merged into the PR branch.
 * @param prBranch The pull-request branch holding the conflicts.
 * @param conflicted Paths of the conflicted files, listed one per bullet.
 * @param diff `git diff` output for the conflicted files, embedded in a fenced block.
 * @param taskReference Original task text; omitted when blank after trimming.
 * @returns The prompt as a single newline-joined string.
 */
function buildConflictPrompt(
  baseBranch: string,
  prBranch: string,
  conflicted: string[],
  diff: string,
  taskReference: string,
): string {
  const fileList = conflicted.map((p) => `- ${p}`).join('\n')

  // The diff may itself contain backtick fences (a conflicted README, a doc comment with a
  // code sample). A diff context line is only indented by its marker column(s), so such a
  // line would close a fixed three-backtick fence early. Use a fence strictly longer than
  // any backtick run in the diff; this stays at three backticks for ordinary diffs.
  let longestBacktickRun = 0
  for (const run of diff.match(/`+/g) ?? []) {
    longestBacktickRun = Math.max(longestBacktickRun, run.length)
  }
  const fence = '`'.repeat(Math.max(3, longestBacktickRun + 1))

  const parts = [
    `The base branch \`${baseBranch}\` was merged into this pull-request branch ` +
      `\`${prBranch}\` and left Git merge conflicts in the following ${conflicted.length} ` +
      `file(s):`,
    '',
    fileList,
    '',
    'Resolve EVERY conflict in these files: open each one, understand both sides of each ' +
      '`<<<<<<<` / `=======` / `>>>>>>>` region, and edit it to a correct result that ' +
      "preserves the intent of BOTH the base changes and this PR's changes — never just " +
      'discard one side. Remove every conflict marker and leave the project building. Do ' +
      'not create a new branch or PR; the harness completes the merge commit and pushes once ' +
      'no conflict markers remain.',
    '',
    'Conflict hunks (`git diff` of the conflicted files):',
    '',
    `${fence}diff`,
    diff,
    fence,
  ]

  // The original task is reference material only, so it goes last and only when present.
  const ref = taskReference.trim()
  if (ref) {
    parts.push('', 'For reference, the task this pull request implements:', '', ref)
  }
  return parts.join('\n')
}
784
+
785
/**
 * Builds the error text for a conflict-resolution run that left files unmerged: the count
 * of still-conflicted files, up to the first ten of their paths, the "never acted" cause
 * when it applies, and the tail of the agent's output.
 */
function unresolvedReason(
  unresolved: string[],
  stats: PiRunStats,
  stderrTail: string | undefined,
): string {
  const neverActedNote = agentNeverActed(stats) ? NEVER_ACTED_CAUSE : ''
  // Cap the listed paths so a large conflict set does not flood the message.
  const listed = unresolved.slice(0, 10).join(', ')
  const headline =
    'The agent did not resolve all merge conflicts ' +
    `(${unresolved.length} file(s) still conflicted: ${listed}).`
  return headline + neverActedNote + agentOutputTail(stderrTail)
}
799
+
800
/**
 * Bootstrapper coding flow. The starting point is either a clone of the reference
 * architecture (`job.repo`), which the agent adapts in place, or — when
 * `job.bootstrap.fromScratch` is `true` — an empty workspace the agent scaffolds into.
 * In both cases the result is published through `reinitAndPush` to the separate,
 * pre-created target repo (`job.bootstrap.target`): history is reset to a single commit
 * and force-pushed to its default branch, rather than pushing a work branch + PR on the
 * cloned repo.
 *
 * A run in which the agent produced no content is not pushed; it returns an `error` with
 * `failureCause: 'agent'`.
 */
async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
  const boot = job.bootstrap!
  const hasReference = boot.fromScratch !== true
  const { signal } = opts
  const logger = (opts.log ?? log).child({ target: `${boot.target.owner}/${boot.target.name}` })

  return withWorkspace('boot', async (dir) => {
    if (hasReference) {
      opts.onPhase?.('clone')
      logger.info('agent(bootstrap): cloning reference architecture', {
        reference: `${job.repo.owner}/${job.repo.name}`,
      })
      await cloneRepo({
        repo: { ...job.repo, baseBranch: job.branch },
        ghToken: job.ghToken,
        dir,
        signal,
      })
    } else {
      logger.info('agent(bootstrap): scaffolding from scratch (no reference)')
    }

    opts.onPhase?.('agent')
    logger.info('agent(bootstrap): running agent')
    const agentInput = {
      dir,
      systemPrompt: job.systemPrompt,
      userPrompt: job.userPrompt,
      model: job.model,
      harness: job.harness,
      subscriptionToken: job.subscriptionToken,
      subscriptionBaseUrl: job.subscriptionBaseUrl,
      ambientAuth: job.ambientAuth,
      proxyBaseUrl: job.proxyBaseUrl,
      sessionToken: job.sessionToken,
      guardLimits: job.guardLimits,
    }
    const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(agentInput, opts)
    const usageField = usage ? { usage } : {}

    // No-op guard: Pi can exit cleanly having done nothing (e.g. it never reached the
    // model). Force-pushing then would publish an empty tree and leave the run looking
    // "succeeded" with a bare repo, so fail with a structured error describing what the
    // agent did instead.
    const producedContent = await producedRepoContent(dir, hasReference, signal)
    if (!producedContent) {
      const error = bootstrapNoOpReason(hasReference, stats, summary, stderrTail)
      logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats })
      return { summary, stats, error, failureCause: 'agent', ...usageField }
    }

    opts.onPhase?.('push')
    logger.info('agent(bootstrap): pushing bootstrapped contents', { ...stats })
    // Always a single fresh commit, force-pushed: the new history shares no ancestor with
    // whatever boilerplate the target repo was created with.
    const message = hasReference
      ? `Bootstrap from ${job.repo.owner}/${job.repo.name}`
      : 'Bootstrap new repository'
    await reinitAndPush({
      dir,
      target: boot.target,
      ghToken: job.ghToken,
      message,
    })
    logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch })
    return { defaultBranch: boot.target.defaultBranch, summary, stats, ...usageField }
  })
}
874
+
875
/**
 * Tells whether the bootstrapper left real repository content behind, so that a no-op run
 * (the agent never reached the model / never wrote anything) is failed instead of being
 * force-pushed as an empty repo.
 *
 * - With a reference architecture: the agent must have changed the clone.
 * - From scratch: at least one file must now exist in the working directory.
 *
 * The harness writes its prompt context to Pi's global `~/.pi/agent/AGENTS.md`, never into
 * `dir`, so there is no harness boilerplate to filter out here.
 *
 * @param dir Workspace directory to inspect.
 * @param hasReference `true` when `dir` started as a clone of a reference architecture.
 * @param signal Optional abort signal, forwarded to the change check.
 */
export async function producedRepoContent(
  dir: string,
  hasReference: boolean,
  signal?: AbortSignal,
): Promise<boolean> {
  return hasReference ? hasAgentChanges(dir, signal) : containsAnyFile(dir)
}
891
+
892
/**
 * Whether `dir` contains at least one regular file anywhere in its tree. The walk is
 * depth-first and stops at the FIRST file found, so the cost is bounded by how quickly a
 * file turns up (a scaffold almost always writes a root-level file), not by the size of
 * the produced tree — a full recursive `readdir` would materialise every entry first.
 *
 * Failures are handled per directory: a subdirectory that cannot be opened or read
 * (vanished mid-walk, unreadable) is treated as contributing no files and the scan moves
 * on to its siblings, so one bad subdirectory cannot hide files that live elsewhere.
 *
 * @param dir Directory to scan.
 * @returns `true` as soon as a regular file is found, otherwise `false`.
 * @throws If `dir` itself cannot be opened (only for the directory passed by the caller;
 *   failures in nested directories are swallowed as described above).
 */
async function containsAnyFile(dir: string): Promise<boolean> {
  const handle = await opendir(dir)
  try {
    for await (const entry of handle) {
      if (entry.isFile()) return true
      if (!entry.isDirectory()) continue
      try {
        if (await containsAnyFile(join(dir, entry.name))) return true
      } catch {
        // This subdirectory vanished or is unreadable: skip it but keep scanning the
        // remaining entries of the current directory.
      }
    }
  } catch {
    // Reading THIS directory failed mid-walk (e.g. it vanished); it has nothing more to
    // contribute.
  }
  return false
}
911
+
912
/**
 * Builds the error text for a bootstrap run that produced nothing to push. The message
 * names what was missing (no changes to the reference vs. no scaffolded files), embeds the
 * agent's tool-call and output-size counters so the cause is visible, adds the "never
 * acted" cause when it applies, and ends with the tail of the agent's output.
 */
function bootstrapNoOpReason(
  hasReference: boolean,
  stats: PiRunStats,
  summary: string,
  stderrTail: string | undefined,
): string {
  let outcome = 'scaffolded no files'
  if (hasReference) {
    outcome = 'made no changes to the reference architecture'
  }
  const neverActedNote = agentNeverActed(stats) ? NEVER_ACTED_CAUSE : ''
  const counters = `(tool calls: ${stats.toolCalls}, assistant output: ${stats.assistantChars} chars)`
  const headline = `the bootstrapper agent ${outcome} ${counters}.`
  return headline + neverActedNote + agentOutputTail(stderrTail, summary)
}
929
+
930
/**
 * Builds the error text for a read-only run that produced no usable output, noting in
 * parentheses when the agent never acted and appending the tail of the agent's output.
 */
function noOutputReason(stats: PiRunStats, stderrTail: string | undefined): string {
  let cause = ''
  if (agentNeverActed(stats)) {
    cause = ' (the agent never acted — it most likely could not reach the model)'
  }
  return 'the agent produced no report' + cause + '.' + agentOutputTail(stderrTail)
}
937
+
938
/**
 * Builds the error text for a structured run that produced no parseable JSON. The cause is
 * either the "never acted" explanation or a note that no parseable JSON object came back;
 * structured-output diagnostics (when supplied) and the tail of the agent's output follow.
 */
function noStructuredReason(
  stats: PiRunStats,
  stderrTail: string | undefined,
  diagnostics?: StructuredOutputDiagnostics,
): string {
  let cause = ' The agent did not return a parseable JSON object.'
  if (agentNeverActed(stats)) {
    cause = NEVER_ACTED_CAUSE
  }
  const diagnosticsNote = diagnostics ? diagnosticsSuffix(diagnostics) : ''
  return (
    'the agent produced no structured result.' +
    cause +
    diagnosticsNote +
    agentOutputTail(stderrTail)
  )
}