@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
@@ -0,0 +1,285 @@
1
+ import { mkdir } from 'node:fs/promises';
2
+ import { join } from 'node:path';
3
+ import { branchHasCommitsSince, cloneExistingBranch, cloneRepo, commitTrackedEdits, createBranch, excludeFromGit, headCommit, listUntrackedFiles, prepareExistingCheckout, pushBranch, refreshFromBaseIfClean, remoteBranchExists, } from './git.js';
4
+ import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js';
5
+ import { acquireRepoCheckout, agentNeverActed, agentOutputTail, runAgentInWorkspace, } from './pi-workspace.js';
6
+ import { log } from './logger.js';
7
/**
 * How often the harness checkpoints the agent's work mid-run by pushing the branch.
 * A per-run container can be evicted at any moment; pushing the agent's commits
 * periodically means an evicted run's work survives on the branch, so a retry
 * RESUMES on top of it instead of starting over. Overridable via env for tests.
 *
 * Reads `JOB_CHECKPOINT_INTERVAL_MS`. The value is floored to whole milliseconds
 * BEFORE it is validated, so a fraction below 1 (which would floor to 0 and turn the
 * checkpoint timer into a ~1 ms busy loop) falls back to the default instead. Values
 * above the largest delay a Node timer accepts are clamped to that maximum, because
 * Node silently rewrites an over-large delay to 1 ms.
 *
 * @returns {number} Interval in whole milliseconds, always within [1, 2147483647];
 *   60000 when the variable is unset, non-numeric, zero, negative, or below 1.
 */
function checkpointIntervalMs() {
    const DEFAULT_MS = 60_000;
    // Largest delay setInterval honours; anything bigger is coerced to 1 ms by Node.
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const wholeMs = Math.floor(Number(process.env.JOB_CHECKPOINT_INTERVAL_MS));
    if (!Number.isFinite(wholeMs) || wholeMs < 1) {
        return DEFAULT_MS;
    }
    return Math.min(wholeMs, MAX_TIMER_DELAY_MS);
}
17
/**
 * How often the harness tails the Coder's follow-up sentinel file to surface new items.
 * Short (a few seconds) so the Follow-up companion lights up promptly while the Coder is
 * still running. Overridable via env for tests.
 *
 * Reads `JOB_FOLLOWUP_POLL_INTERVAL_MS`. The value is floored to whole milliseconds
 * BEFORE it is validated, so a fraction below 1 (which would floor to 0 and make the
 * poll timer spin at ~1 ms) falls back to the default instead. Values above the largest
 * delay a Node timer accepts are clamped to that maximum, because Node silently rewrites
 * an over-large delay to 1 ms.
 *
 * @returns {number} Interval in whole milliseconds, always within [1, 2147483647];
 *   3000 when the variable is unset, non-numeric, zero, negative, or below 1.
 */
function followUpPollIntervalMs() {
    const DEFAULT_MS = 3_000;
    // Largest delay setInterval honours; anything bigger is coerced to 1 ms by Node.
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const wholeMs = Math.floor(Number(process.env.JOB_FOLLOWUP_POLL_INTERVAL_MS));
    if (!Number.isFinite(wholeMs) || wholeMs < 1) {
        return DEFAULT_MS;
    }
    return Math.min(wholeMs, MAX_TIMER_DELAY_MS);
}
26
/**
 * Clone (or RESUME an existing branch) → write context → run Pi → push the branch
 * iff it carries work. The agent commits its OWN work (it alone knows which files
 * belong vs scratch/artifacts it created), so the harness never blanket-stages:
 * {@link commitTrackedEdits} is only a safety net for forgotten edits to ALREADY
 * tracked files, and the run is judged a no-op only when the branch never advanced
 * past its pre-run tip ({@link branchHasCommitsSince}). The harness owns push + PR;
 * it checkpoints (pushes) periodically so an evicted run's commits survive and a
 * retry resumes on them.
 *
 * Both timers this function starts (the checkpoint push and the follow-up tail) are
 * created inside one try/finally, so they are cleared on every exit path — including a
 * failure while preparing the working directory, before the agent ever runs.
 *
 * @param {object} spec - The job description. Fields read here: `kind`, `repo`
 *   (`cloneUrl`, optional `serviceDirectory`), `cloneBranch`, optional `newBranch`,
 *   `pushBranch`, `ghToken`, `persistentCheckout`, `commitMessage`, `streamFollowUps`,
 *   plus the model/prompt/auth fields forwarded to `runAgentInWorkspace`.
 * @param {object} [opts] - Optional `signal`, `log`, `onPhase(phase)` and
 *   `onFollowUp`; the whole object is also forwarded to `runAgentInWorkspace`.
 * @returns {Promise<object>} `{ pushed, resumed, summary, stats }`, plus `stderrTail`
 *   and `usage` when the agent run reported them. Callers decide what to do after a
 *   push (open a PR, or nothing).
 */
export async function runCodingAgent(spec, opts = {}) {
    const { signal } = opts;
    // The registry already binds jobId/repo/branch; add the coding kind + the push branch
    // (which differs from the cloned branch the registry bound).
    const logger = (opts.log ?? log).child({ kind: spec.kind, branch: spec.pushBranch });
    return acquireRepoCheckout({ persistent: spec.persistentCheckout === true, prefix: spec.kind, repo: spec.repo }, async (dir) => {
        // Resume an evicted earlier run when its work branch already exists on the
        // remote: clone THAT branch and continue on its commits, rather than branching
        // off base and redoing everything. Only the impl path (which creates a fresh
        // `newBranch`) can resume; the ci-fix/conflict paths already clone the PR branch.
        //
        // Resume safety relies on two invariants the dispatcher (worker) upholds, since
        // the harness can't see run/PR state from inside the container:
        //  - At most ONE active run per block at a time. The work branch is deterministic
        //    per block (`cat-factory/<blockId>`), so two concurrent runs would target the
        //    same branch; their pushes race. A plain (non-forced) push fails safely on a
        //    non-fast-forward rather than clobbering the other run's commits, so the worst
        //    case is one run failing — never lost work — but the dispatcher should not
        //    knowingly run two at once.
        //  - Re-dispatch only NON-terminal runs (failed / evicted / stale-running), whose
        //    branch is by definition unmerged. Resuming a branch whose PR already merged
        //    could re-introduce merged work; that is avoided two ways: the platform deletes
        //    the work branch when its PR merges (GitHubPullRequestMerger), so a re-run finds
        //    no branch and starts fresh, and a `done` block is never re-dispatched anyway.
        const resumed = spec.newBranch != null &&
            (await remoteBranchExists(spec.repo.cloneUrl, spec.newBranch, spec.ghToken, signal));
        opts.onPhase?.('clone');
        if (spec.persistentCheckout) {
            // Reused checkout: clean-sweep + fetch + switch branch in place. A resumed branch
            // (or a run without `newBranch`, working directly on `cloneBranch`) already exists
            // on the remote, so check it out directly; otherwise (re)create `newBranch` off the
            // base tip — the same resume-vs-fresh decision the clone paths below make.
            const targetBranch = spec.newBranch ?? spec.cloneBranch;
            logger.info('coding-agent: preparing reused checkout', { branch: targetBranch, resumed });
            await prepareExistingCheckout({
                dir,
                repo: spec.repo,
                ghToken: spec.ghToken,
                branch: targetBranch,
                baseBranch: spec.cloneBranch,
                existing: resumed || spec.newBranch == null,
                signal,
            });
        }
        else if (resumed) {
            logger.info('coding-agent: resuming existing branch', { branch: spec.newBranch });
            await cloneExistingBranch({
                cloneUrl: spec.repo.cloneUrl,
                branch: spec.newBranch,
                ghToken: spec.ghToken,
                dir,
                signal,
            });
        }
        else {
            logger.info('coding-agent: cloning', { cloneBranch: spec.cloneBranch });
            await cloneRepo({
                repo: { ...spec.repo, baseBranch: spec.cloneBranch },
                ghToken: spec.ghToken,
                dir,
                signal,
            });
            if (spec.newBranch)
                await createBranch(dir, spec.newBranch, signal);
        }
        // The branch tip before the agent runs this time. A FRESH run produced work iff
        // the branch advances past it; a RESUMED run already carries prior work, so it is
        // never a no-op regardless of what this pass adds. Captured BEFORE the resume base
        // refresh below so that refresh's merge commit counts as advancement and is pushed.
        const baseSha = await headCommit(dir, signal);
        // A resumed branch was cut from an OLDER base; merge the latest base in when the
        // two merge cleanly, so the agent works against current base and the PR stays
        // current. On a conflict this is a no-op (the run continues on the stale base — the
        // merge gate handles a conflicting PR downstream, as before), so it never blocks a
        // resume. Best-effort: any error is treated as "continue without refreshing".
        if (resumed) {
            const refreshed = await refreshFromBaseIfClean(dir, spec.cloneBranch, spec.ghToken, signal).catch(() => false);
            if (!refreshed) {
                logger.info('coding-agent: resume base refresh skipped (conflict or error)', {
                    base: spec.cloneBranch,
                });
            }
        }
        // Serialize all pushes to the work branch through a single in-flight promise.
        // A checkpoint tick and the final push (or two slow checkpoint ticks) must never
        // run `git push` to the same branch concurrently: overlapping pushes race on the
        // remote ref and can make a push fail with a ref-lock / non-fast-forward error —
        // which, on the FINAL push, would fail the whole run even though the work is
        // committed. `pushWorkOnce` coalesces concurrent callers onto one push.
        //
        // Only push once the branch has advanced past its pre-run tip: pushing while it
        // still sits at `baseSha` would create the work branch at the base commit (a
        // zero-diff branch), which a later retry would see via `remoteBranchExists` and
        // treat as resumable work — then fail to open a PR ("no commits between base and
        // head"). So a run that never commits leaves NO branch behind, preserving the
        // clean no-op outcome.
        let pushInFlight = null;
        const pushWorkOnce = () => {
            if (pushInFlight)
                return pushInFlight;
            pushInFlight = (async () => {
                if (!(await branchHasCommitsSince(dir, baseSha, signal)))
                    return;
                await pushBranch(dir, spec.pushBranch, spec.ghToken, signal);
            })().finally(() => {
                pushInFlight = null;
            });
            return pushInFlight;
        };
        // Read the in-flight push, if any. Kept as a function because `pushInFlight` is
        // only ever assigned inside closures.
        const inFlightPush = () => pushInFlight;
        let checkpointFailures = 0;
        // Declared outside the try so the finally can release them whatever happens.
        let checkpoint;
        let followUpTick;
        let outcome;
        try {
            // Checkpoint the agent's committed work to the branch periodically so an eviction
            // mid-run doesn't lose it (a retry then resumes from the pushed commits). The
            // agent commits its own work; this only PUSHES already-committed commits, so it
            // never races the agent's staging. Best-effort: a failed checkpoint never fails
            // the run, but it is logged at warn with a running count — a steadily-climbing
            // count means mid-run work is NOT being durably checkpointed.
            //
            // Started INSIDE the try: the directory/git setup below can throw, and a timer
            // created before the protected region would then never be cleared.
            checkpoint = setInterval(() => {
                pushWorkOnce().catch((err) => {
                    checkpointFailures++;
                    logger.warn('coding-agent: checkpoint push failed', {
                        reason: err instanceof Error ? err.message : String(err),
                        checkpointFailures,
                    });
                });
            }, checkpointIntervalMs());
            checkpoint.unref?.();
            // In a monorepo the service lives in a subdirectory: run Pi with its cwd set to
            // that subtree (git stays rooted at `dir` so commits/pushes still cover the whole
            // checkout). Created if missing so a coder scaffolding a brand-new service into an
            // existing monorepo has a cwd to start in.
            const serviceDirectory = spec.repo.serviceDirectory;
            const workDir = serviceDirectory ? join(dir, serviceDirectory) : dir;
            if (serviceDirectory)
                await mkdir(workDir, { recursive: true });
            // Follow-up companion: tail the Coder's sentinel file and stream new items out on the
            // job view. Locally exclude it from git first so the agent's own `git add` can never
            // stage it and it never surfaces as an untracked leftover or in the PR. The sentinel
            // lives in the agent's working directory (its cwd), where the prompt tells it to write.
            const followUpTailer = spec.streamFollowUps && opts.onFollowUp
                ? new FollowUpTailer(join(workDir, FOLLOW_UPS_FILENAME), opts.onFollowUp, logger)
                : undefined;
            if (followUpTailer) {
                await excludeFromGit(dir, FOLLOW_UPS_FILENAME, signal);
                followUpTick = setInterval(() => {
                    // Best-effort, same as the final flush below: a rejected poll must not
                    // surface as an unhandled rejection from a timer callback.
                    followUpTailer.poll().catch(() => { });
                }, followUpPollIntervalMs());
                followUpTick.unref?.();
            }
            opts.onPhase?.('agent');
            logger.info('coding-agent: running agent', { serviceDirectory });
            const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
                dir: workDir,
                systemPrompt: spec.systemPrompt,
                userPrompt: spec.userPrompt,
                model: spec.model,
                harness: spec.harness,
                subscriptionToken: spec.subscriptionToken,
                subscriptionBaseUrl: spec.subscriptionBaseUrl,
                ambientAuth: spec.ambientAuth,
                proxyBaseUrl: spec.proxyBaseUrl,
                sessionToken: spec.sessionToken,
                serviceDirectory,
                webToolsGuidance: spec.webToolsGuidance,
                webSearchProxy: spec.webSearchProxy,
                guardLimits: spec.guardLimits,
            }, opts);
            // Stop tailing the follow-up sentinel and flush any items written after the last
            // tick, so a fast final burst still reaches the job view before the run is recorded.
            if (followUpTick)
                clearInterval(followUpTick);
            if (followUpTailer)
                await followUpTailer.poll().catch(() => { });
            // Safety net for forgotten edits: commit changes to TRACKED files only (never
            // untracked scratch files/artifacts — the agent owns committing new files).
            await commitTrackedEdits(dir, spec.commitMessage, signal);
            // Stop periodic checkpoints and let any in-flight one settle BEFORE the final
            // push, so the two never run a concurrent `git push` to the same branch (the
            // final push below is then a fresh attempt whose failure is the real signal).
            clearInterval(checkpoint);
            const inflight = inFlightPush();
            if (inflight)
                await inflight.catch(() => { });
            // Surface (don't fail on) untracked, non-ignored files the agent left behind:
            // `commitTrackedEdits` only captures edits to ALREADY tracked files, so a NEW
            // file the agent created but forgot to commit is silently dropped. Logging it
            // makes that loss observable when a PR turns out to be missing a file.
            const leftover = await listUntrackedFiles(dir, signal);
            if (leftover.length > 0) {
                logger.warn('coding-agent: uncommitted new files left behind (not pushed)', {
                    count: leftover.length,
                    files: leftover.slice(0, 20),
                });
            }
            const hasWork = resumed || (await branchHasCommitsSince(dir, baseSha, signal));
            if (!hasWork) {
                logger.info('coding-agent: no changes produced', { ...stats });
                outcome = {
                    pushed: false,
                    resumed,
                    summary,
                    stats,
                    ...(stderrTail ? { stderrTail } : {}),
                    ...(usage ? { usage } : {}),
                };
            }
            else {
                opts.onPhase?.('push');
                logger.info('coding-agent: pushing', { resumed, ...stats });
                await pushWorkOnce();
                outcome = {
                    pushed: true,
                    resumed,
                    summary,
                    stats,
                    ...(stderrTail ? { stderrTail } : {}),
                    ...(usage ? { usage } : {}),
                };
            }
        }
        finally {
            // Covers every throw path, including setup failures before the agent started
            // (the happy path already cleared both timers above; clearing twice is harmless).
            clearInterval(checkpoint);
            if (followUpTick)
                clearInterval(followUpTick);
        }
        return outcome;
    });
}
276
/**
 * Build the "no changes" explanation shared by the coding agents.
 *
 * The message is the caller's lead phrase, then — only when `agentNeverActed(stats)`
 * says the agent did nothing — a parenthetical naming the likely cause, then a full
 * stop, then whatever `agentOutputTail(stderrTail)` renders for the captured stderr.
 *
 * @param {string} lead - Caller-supplied opening phrase (no trailing punctuation).
 * @param {object} stats - Run statistics, passed to `agentNeverActed`.
 * @param {string} [stderrTail] - Tail of the agent's stderr, passed to `agentOutputTail`.
 * @returns {string} The assembled reason.
 */
export function noChangesReason(lead, stats, stderrTail) {
    let neverActedNote = '';
    if (agentNeverActed(stats)) {
        neverActedNote = ' (the agent never acted — it most likely could not reach the model)';
    }
    return `${lead}${neverActedNote}.${agentOutputTail(stderrTail)}`;
}
@@ -0,0 +1,138 @@
1
+ import { cloneRepo, commitAll, conflictDiff, headCommit, mergeBranch, pushBranch, unmergedPaths, } from './git.js';
2
+ import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
3
+ import { log } from './logger.js';
4
+ // Async job execution for the merge-conflict resolver. When a PR cannot be merged
5
+ // because it conflicts with its base, the engine dispatches this: clone the PR head
6
+ // branch (full history), merge the base branch into it to surface the conflicts,
7
+ // run Pi to resolve them, complete the merge commit and push back onto the SAME
8
+ // branch (no new branch / PR) so the PR becomes mergeable and CI re-runs.
9
+ //
10
+ // Shares the thin workspace/Pi base (withWorkspace + runAgentInWorkspace) with the
11
+ // other agents; it diverges only in needing a full clone, a base→branch merge to
12
+ // produce the conflicts, and a guard that refuses to push a half-resolved tree.
13
/**
 * Run one conflict-resolver job: clone → merge base → Pi resolves → push (same branch).
 *
 * Outcomes, in the order they are checked:
 *  1. The base merges cleanly and the branch tip did not move → nothing is pushed.
 *  2. The base merges cleanly and the tip moved → the merge commit is pushed.
 *  3. The merge conflicts → the agent runs; if unmerged paths remain afterwards the job
 *     reports `resolved: false` with an `error` and pushes nothing, otherwise the merge
 *     is committed and pushed.
 *
 * @param {object} job - Job description (`jobId`, `repo`, `branch`, `ghToken`, prompts,
 *   model and auth fields).
 * @param {object} [opts] - Optional `signal`; forwarded whole to `runAgentInWorkspace`.
 * @returns {Promise<object>} `{ resolved, summary, stats }`, plus `error` when
 *   unresolved and `usage` when the agent run reported it.
 */
export async function handleConflictResolver(job, opts = {}) {
    const { signal } = opts;
    const trace = { jobId: job.jobId, repo: `${job.repo.owner}/${job.repo.name}`, branch: job.branch };
    // Result shape for the two paths on which the agent never had to run.
    const withoutAgent = (summary) => ({
        resolved: true,
        summary,
        stats: { toolCalls: 0, assistantChars: 0 },
    });
    return withWorkspace('conflict', async (dir) => {
        log.info('conflict: cloning PR branch (full history)', trace);
        // Full clone so the merge base + `origin/<base>` are present for the merge.
        await cloneRepo({
            repo: { ...job.repo, baseBranch: job.branch },
            ghToken: job.ghToken,
            dir,
            signal,
            full: true,
        });
        const tipBeforeMerge = await headCommit(dir, signal);
        log.info('conflict: merging base into PR branch', { ...trace, base: job.repo.baseBranch });
        const mergedCleanly = await mergeBranch(dir, job.repo.baseBranch, signal);
        if (mergedCleanly) {
            // Nothing to resolve. Whether there is anything to push depends on whether the
            // merge actually advanced the branch.
            const tipAfterMerge = await headCommit(dir, signal);
            if (tipAfterMerge !== tipBeforeMerge) {
                log.info('conflict: base merged clean — pushing the merge commit', trace);
                await pushBranch(dir, job.branch, job.ghToken, signal);
                return withoutAgent('Merged the base in cleanly (no conflicts to resolve).');
            }
            // Already up to date: re-dispatching never changes the PR, so a gate that keeps
            // reporting this branch as "conflicting" is a base-resolution problem rather than
            // the agent's — logged so that loop is diagnosable.
            log.info('conflict: base merged clean and branch already up to date — nothing to push', {
                ...trace,
                base: job.repo.baseBranch,
            });
            return withoutAgent('No conflicts: the branch is already up to date with its base.');
        }
        // The merge left conflicts in the working tree. Hand the agent the EXACT files and
        // hunks so it works on the conflict instead of drifting onto the original feature
        // task, which is carried only as trailing reference.
        const conflicted = await unmergedPaths(dir, signal);
        log.info('conflict: resolving conflicts with agent', { ...trace, conflicted });
        const diff = await conflictDiff(dir, conflicted, signal);
        const userPrompt = buildConflictPrompt(job.repo.baseBranch, job.branch, conflicted, diff, job.userPrompt);
        const agentRun = await runAgentInWorkspace({
            dir,
            systemPrompt: job.systemPrompt,
            userPrompt,
            model: job.model,
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
        }, opts);
        const { summary, stats, stderrTail, usage } = agentRun;
        // Never push a half-resolved tree: while unmerged paths remain the PR would still
        // be broken. Report failure so the engine can retry / notify.
        const unresolved = await unmergedPaths(dir, signal);
        if (unresolved.length > 0) {
            log.error('conflict: unresolved conflicts remain — refusing to push', {
                ...trace,
                unresolved: unresolved.length,
            });
            const failure = {
                resolved: false,
                summary,
                stats,
                error: unresolvedReason(unresolved, stats, stderrTail),
            };
            if (usage)
                failure.usage = usage;
            return failure;
        }
        // Complete the merge commit with the agent's resolution staged, then push.
        await commitAll(dir, `Merge ${job.repo.baseBranch} into ${job.branch}`, signal);
        log.info('conflict: pushing resolved branch', { ...trace, ...stats });
        await pushBranch(dir, job.branch, job.ghToken, signal);
        const success = { resolved: true, summary, stats };
        if (usage)
            success.usage = usage;
        return success;
    });
}
97
/**
 * The conflict-focused user prompt: lead with the exact conflicted files and their
 * hunks (so the model acts on the real conflict, not the original feature task), then
 * carry the task only as trailing reference. The role/system prompt frames it as a
 * merge-conflict resolution; this gives it the concrete material.
 *
 * @param {string} baseBranch - Branch that was merged into the PR branch.
 * @param {string} prBranch - The pull-request branch holding the conflicts.
 * @param {string[]} conflicted - Paths of the conflicted files, one bullet each.
 * @param {string} diff - Diff text of the conflicted files, embedded in a fenced block.
 * @param {string} [taskReference] - Original task text. Appended (trimmed) as trailing
 *   reference only when it is a string with non-whitespace content; a missing or
 *   non-string value is treated as "no reference" rather than throwing.
 * @returns {string} The prompt, lines joined with `\n`.
 */
function buildConflictPrompt(baseBranch, prBranch, conflicted, diff, taskReference) {
    const fileList = conflicted.map((p) => `- ${p}`).join('\n');
    const parts = [
        `The base branch \`${baseBranch}\` was merged into this pull-request branch ` +
            `\`${prBranch}\` and left Git merge conflicts in the following ${conflicted.length} ` +
            `file(s):`,
        '',
        fileList,
        '',
        'Resolve EVERY conflict in these files: open each one, understand both sides of each ' +
            '`<<<<<<<` / `=======` / `>>>>>>>` region, and edit it to a correct result that ' +
            "preserves the intent of BOTH the base changes and this PR's changes — never just " +
            'discard one side. Remove every conflict marker and leave the project building. Do ' +
            'not create a new branch or PR; the harness completes the merge commit and pushes once ' +
            'no conflict markers remain.',
        '',
        'Conflict hunks (`git diff` of the conflicted files):',
        '',
        '```diff',
        diff,
        '```',
    ];
    // The task text is optional context: a job without one must still get a usable
    // conflict prompt instead of failing on `.trim()` of undefined/null.
    const ref = typeof taskReference === 'string' ? taskReference.trim() : '';
    if (ref) {
        parts.push('', 'For reference, the task this pull request implements:', '', ref);
    }
    return parts.join('\n');
}
131
/**
 * Human-readable reason the agent failed to fully resolve the conflicts.
 *
 * Names how many files are still conflicted and lists at most the first ten of them,
 * appends `NEVER_ACTED_CAUSE` when `agentNeverActed(stats)` is true, and ends with
 * whatever `agentOutputTail(stderrTail)` renders.
 *
 * @param {string[]} unresolved - Paths that are still unmerged.
 * @param {object} stats - Run statistics, passed to `agentNeverActed`.
 * @param {string} [stderrTail] - Tail of the agent's stderr, passed to `agentOutputTail`.
 * @returns {string} The assembled reason.
 */
function unresolvedReason(unresolved, stats, stderrTail) {
    const MAX_LISTED = 10;
    let neverActedNote = '';
    if (agentNeverActed(stats)) {
        neverActedNote = NEVER_ACTED_CAUSE;
    }
    const listed = unresolved.slice(0, MAX_LISTED).join(', ');
    const headline = `The agent did not resolve all merge conflicts (${unresolved.length} file(s) still conflicted: ${listed}).`;
    return headline + neverActedNote + agentOutputTail(stderrTail);
}
package/dist/embed.js ADDED
@@ -0,0 +1,8 @@
1
+ // Embeddable surface of the executor harness: the Pi-driving and git helpers
2
+ // the container payload uses, re-exported so other packages (e.g. the benchmark
3
+ // harness) can run the *same* coding-agent flow outside the container — clone a
4
+ // repo, write the agent context, point Pi at an OpenAI-compatible endpoint, run
5
+ // it, and inspect what changed. The HTTP server / job lifecycle stays internal;
6
+ // only the reusable primitives are exposed here.
7
+ export { PI_MAX_OUTPUT_TOKENS, DEFAULT_PROGRESS_GUARD_LIMITS, writePiModelsConfig, writeAgentsContext, runPi, summarizePiRun, parsePiOutput, parseTodoProgress, progressGuardLimitsFromEnv, terminalRunError, } from './pi.js';
8
+ export { cloneRepo, createBranch, changedPathsFromPorcelain, hasAgentChanges, redactSecrets, } from './git.js';
@@ -0,0 +1,74 @@
1
+ import { join } from 'node:path';
2
+ import { mkdir } from 'node:fs/promises';
3
+ import { cloneRepo } from './git.js';
4
+ import { agentNeverActed, agentOutputTail, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
5
+ import { log } from './logger.js';
6
+ // The shared read-only container agent: clone a branch, run Pi to EXPLORE the
7
+ // checkout (read-only), and return its prose report/proposal. Both the architect
8
+ // (proposes a design after reading the code) and the tech-debt analysis agent use
9
+ // this one path. Unlike the coding agents (`/run`, `/ci-fix`) it pushes nothing and
10
+ // opens no PR, and — like the merger — it makes no edits, so an edit-free run is the
11
+ // expected, correct outcome rather than a "no changes" failure. The only failure
12
+ // mode is producing no text at all (the agent never reached the model).
13
/**
 * Run one read-only exploration job end to end: clone branch → Pi explores → return prose.
 *
 * Nothing is committed or pushed here. The agent's prose `summary` is the deliverable,
 * so a run without edits is a success; the only failure reported is a blank summary.
 *
 * @param {object} job - Job description (`jobId`, optional `label`, `repo`, `branch`,
 *   `ghToken`, prompts, model and auth fields, `webToolsGuidance`, `webSearch`).
 * @param {object} [opts] - Optional `signal`; forwarded whole to `runAgentInWorkspace`.
 * @returns {Promise<object>} `{ summary, stats }`, plus `error` when the summary is
 *   blank and `usage` when the agent run reported it.
 */
export async function handleExplore(job, opts = {}) {
    // The label doubles as the trace kind and the workspace prefix.
    const kind = job.label ?? 'explore';
    const trace = {
        jobId: job.jobId,
        kind,
        repo: `${job.repo.owner}/${job.repo.name}`,
        branch: job.branch,
    };
    return withWorkspace(kind, async (dir) => {
        log.info('explore: cloning', trace);
        await cloneRepo({
            repo: { ...job.repo, baseBranch: job.branch },
            ghToken: job.ghToken,
            dir,
            signal: opts.signal,
        });
        // Monorepo: the service lives in a subdirectory, so Pi's cwd is pointed there
        // (and the directory is created when absent) to scope the exploration to it.
        const { serviceDirectory } = job.repo;
        let workDir = dir;
        if (serviceDirectory) {
            workDir = join(dir, serviceDirectory);
            await mkdir(workDir, { recursive: true });
        }
        log.info('explore: running agent', { ...trace, serviceDirectory });
        const agentRun = await runAgentInWorkspace({
            dir: workDir,
            systemPrompt: job.systemPrompt,
            userPrompt: job.userPrompt,
            model: job.model,
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
            serviceDirectory,
            // Read-only run: it reports without editing, so the no-progress guard's
            // no-edit bound must not trip on it.
            expectsEdits: false,
            webToolsGuidance: job.webToolsGuidance,
            webSearchProxy: job.webSearch,
        }, opts);
        const { summary, stats, stderrTail, usage } = agentRun;
        const usagePart = usage ? { usage } : {};
        const producedText = summary.trim().length > 0;
        if (producedText) {
            log.info('explore: done', { ...trace, ...stats });
            return { summary, stats, ...usagePart };
        }
        // A blank report is the one failure mode of a read-only run.
        return {
            summary,
            stats,
            error: noOutputReason(stats, stderrTail),
            ...usagePart,
        };
    });
}
68
/**
 * Human-readable reason a read-only run produced no usable output.
 *
 * Starts from a fixed lead phrase, adds a parenthetical naming the likely cause only
 * when `agentNeverActed(stats)` is true, closes with a full stop, and ends with
 * whatever `agentOutputTail(stderrTail)` renders.
 *
 * @param {object} stats - Run statistics, passed to `agentNeverActed`.
 * @param {string} [stderrTail] - Tail of the agent's stderr, passed to `agentOutputTail`.
 * @returns {string} The assembled reason.
 */
function noOutputReason(stats, stderrTail) {
    let neverActedNote = '';
    if (agentNeverActed(stats)) {
        neverActedNote = ' (the agent never acted — it most likely could not reach the model)';
    }
    return `Read-only agent produced no report${neverActedNote}.${agentOutputTail(stderrTail)}`;
}
+ }
@@ -0,0 +1,47 @@
1
+ // Single source of truth for how a job FAILS: the canonical failure-cause vocabulary plus
2
+ // the watchdog abort-message builders.
3
+ //
4
+ // WHY THIS MODULE EXISTS — the backend classifies a failed job by REGEX-matching the
5
+ // harness's free-text `error` string (it has no other signal today):
6
+ // - server `ContainerRepoBootstrapper.classifyBootstrapFailure`:
7
+ // /inactivity|no agent activity|max duration/i → 'timeout', else → 'agent'
8
+ // - orchestration `job.logic.isContainerEvictionError`: /evicted or crashed/i (FACADE-owned,
9
+ // NOT emitted here — the harness must keep NOT emitting that phrase for a non-eviction)
10
+ // Because those phrases are matched downstream, their wording MUST stay stable. Centralizing
11
+ // the builders here keeps the emitted text from drifting away from the regex that reads it.
12
+ // Alongside the strings we now also emit a STRUCTURED {@link FailureCause} on the job view so
13
+ // the backend can prefer it and treat the regex as a backward-compatible fallback.
14
/**
 * An `Error` that also carries a structured failure cause.
 *
 * Throwing this from a helper lets the cause of the failure travel with the error, so
 * it can be read back later (see `failureCauseOf`) instead of being lost when the error
 * is caught further up.
 *
 * The instance's `name` is always `'HarnessFailure'`; `message` is handled by `Error`.
 */
export class HarnessFailure extends Error {
    /** The structured cause supplied to the constructor. */
    failureCause;
    /**
     * @param {*} failureCause - Structured cause to attach, stored as given.
     * @param {string} [message] - Human-readable message, passed to `Error`.
     */
    constructor(failureCause, message) {
        super(message);
        this.failureCause = failureCause;
        this.name = 'HarnessFailure';
    }
}
28
/**
 * Read the structured cause off a thrown value.
 *
 * @param {*} err - Anything that was thrown.
 * @returns {*} `err.failureCause` when `err` is a `HarnessFailure`; `undefined` for
 *   every other value (plain errors, non-errors, null).
 */
export function failureCauseOf(err) {
    if (err instanceof HarnessFailure) {
        return err.failureCause;
    }
    return undefined;
}
32
/**
 * Prefix of the inactivity-watchdog abort message.
 *
 * WORDING IS LOAD-BEARING: the file header notes that the backend classifies failures
 * by regex-matching this text, and `no agent activity` is one of the matched phrases —
 * do not reword it. The string intentionally ends right after the duration so a caller
 * can append its own parenthetical diagnostic.
 *
 * @param {number} inactivityMs - Inactivity threshold in milliseconds.
 * @returns {string} The prefix, with the threshold rounded to whole seconds.
 */
export function inactivityAbortMessage(inactivityMs) {
    const seconds = Math.round(inactivityMs / 1000);
    return `Aborted: no agent activity for ${seconds}s`;
}
41
/**
 * The max-duration-watchdog abort message.
 *
 * WORDING IS LOAD-BEARING: the file header notes that the backend classifies failures
 * by regex-matching this text, and `max duration` is one of the matched phrases — do
 * not reword it.
 *
 * @param {number} maxDurationMs - Duration limit in milliseconds.
 * @returns {string} The message, with the limit rounded to whole seconds.
 */
export function maxDurationAbortMessage(maxDurationMs) {
    const seconds = Math.round(maxDurationMs / 1000);
    return `Aborted: exceeded max duration of ${seconds}s`;
}
package/dist/fixer.js ADDED
@@ -0,0 +1,44 @@
1
+ import { noChangesReason, runCodingAgent } from './coding-agent.js';
2
+ // Async job execution for the test Fixer. When a Tester withholds its greenlight the
3
+ // engine dispatches this: clone the PR HEAD branch, run Pi to fix the concerns in the
4
+ // Tester's report (folded into the user prompt by the backend), then commit + push
5
+ // back onto the SAME branch (no new branch, no new PR) so the Tester can re-run. The
6
+ // engine re-dispatches the Tester after the push and loops up to the attempt budget.
7
+ //
8
+ // The clone/Pi/push mechanics are shared with implementation + the CI-fixer via
9
+ // runCodingAgent; the Fixer only differs in working ON the existing PR branch.
10
/**
 * Run one Fixer job end to end: clone branch → Pi fixes → push (same branch).
 *
 * Delegates the whole clone/agent/push flow to `runCodingAgent`, with the PR head
 * branch used both as the branch to clone and the branch to push (no `newBranch`).
 *
 * @param {object} job - Job description (`jobId`, `repo`, `branch`, `ghToken`, prompts,
 *   model and auth fields, `webToolsGuidance`, `webSearch`).
 * @param {object} [opts] - Passed through unchanged to `runCodingAgent`.
 * @returns {Promise<object>} `{ pushed, summary, stats }`, plus `error` when nothing
 *   was pushed and `usage` when the run reported it.
 */
export async function handleFixer(job, opts = {}) {
    const spec = {
        kind: 'fix-tests',
        jobId: job.jobId,
        repo: job.repo,
        // Same branch in and out: the fix lands on the existing PR head.
        cloneBranch: job.branch,
        pushBranch: job.branch,
        ghToken: job.ghToken,
        systemPrompt: job.systemPrompt,
        userPrompt: job.userPrompt,
        model: job.model,
        harness: job.harness,
        subscriptionToken: job.subscriptionToken,
        subscriptionBaseUrl: job.subscriptionBaseUrl,
        proxyBaseUrl: job.proxyBaseUrl,
        sessionToken: job.sessionToken,
        commitMessage: 'Fix issues found by the tester',
        webToolsGuidance: job.webToolsGuidance,
        webSearchProxy: job.webSearch,
    };
    const run = await runCodingAgent(spec, opts);
    const { summary, stats, stderrTail, pushed, usage } = run;
    const usagePart = usage ? { usage } : {};
    if (pushed) {
        return { pushed: true, summary, stats, ...usagePart };
    }
    // Nothing was pushed. The result still says why, so it stays meaningful to a reader.
    return {
        pushed: false,
        summary,
        stats,
        error: noChangesReason('No test fix produced', stats, stderrTail),
        ...usagePart,
    };
}