@cat-factory/executor-harness 1.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +143 -0
- package/dist/agent-runner.js +389 -0
- package/dist/agent.js +810 -0
- package/dist/blueprint.js +367 -0
- package/dist/bootstrap.js +99 -0
- package/dist/ci-fixer.js +46 -0
- package/dist/coding-agent.js +285 -0
- package/dist/conflict-resolver.js +138 -0
- package/dist/embed.js +8 -0
- package/dist/explore.js +74 -0
- package/dist/failure.js +47 -0
- package/dist/fixer.js +44 -0
- package/dist/follow-ups.js +103 -0
- package/dist/frontend-infra.js +283 -0
- package/dist/fs-utils.js +11 -0
- package/dist/git.js +778 -0
- package/dist/job.js +409 -0
- package/dist/logger.js +27 -0
- package/dist/merger.js +135 -0
- package/dist/on-call.js +126 -0
- package/dist/pi-workspace.js +237 -0
- package/dist/pi.js +971 -0
- package/dist/process.js +25 -0
- package/dist/redact.js +109 -0
- package/dist/runner.js +228 -0
- package/dist/server.js +135 -0
- package/dist/spec.js +754 -0
- package/dist/structured-output.js +431 -0
- package/dist/tester.js +191 -0
- package/package.json +35 -0
- package/src/agent-runner.ts +484 -0
- package/src/agent.ts +948 -0
- package/src/coding-agent.ts +393 -0
- package/src/embed.ts +32 -0
- package/src/failure.ts +73 -0
- package/src/follow-ups.ts +106 -0
- package/src/frontend-infra.ts +340 -0
- package/src/fs-utils.ts +11 -0
- package/src/git.ts +955 -0
- package/src/job.ts +766 -0
- package/src/logger.ts +45 -0
- package/src/pi-workspace.ts +348 -0
- package/src/pi.ts +1236 -0
- package/src/process.ts +33 -0
- package/src/redact.ts +109 -0
- package/src/runner.ts +384 -0
- package/src/server.ts +153 -0
- package/src/structured-output.ts +524 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import { mkdir } from 'node:fs/promises';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { branchHasCommitsSince, cloneExistingBranch, cloneRepo, commitTrackedEdits, createBranch, excludeFromGit, headCommit, listUntrackedFiles, prepareExistingCheckout, pushBranch, refreshFromBaseIfClean, remoteBranchExists, } from './git.js';
|
|
4
|
+
import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js';
|
|
5
|
+
import { acquireRepoCheckout, agentNeverActed, agentOutputTail, runAgentInWorkspace, } from './pi-workspace.js';
|
|
6
|
+
import { log } from './logger.js';
|
|
7
|
+
/**
 * How often the harness checkpoints the agent's work mid-run by pushing the branch.
 * A per-run container can be evicted at any moment; pushing the agent's commits
 * periodically means an evicted run's work survives on the branch, so a retry
 * RESUMES on top of it instead of starting over. Overridable via env for tests.
 *
 * Reads `JOB_CHECKPOINT_INTERVAL_MS`. A missing, non-numeric, non-finite or
 * non-positive value falls back to 60 000 ms. The result is a whole number of
 * milliseconds clamped to [1, 2^31 - 1]: Node timers coerce any delay outside that
 * range to 1 ms, so an unclamped huge override (meant to effectively disable
 * checkpoints) would instead tick continuously.
 *
 * @returns {number} The checkpoint interval in milliseconds.
 */
function checkpointIntervalMs() {
    // Largest delay Node's timers honour (a signed 32-bit integer).
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const n = Number(process.env.JOB_CHECKPOINT_INTERVAL_MS);
    if (!Number.isFinite(n) || n <= 0)
        return 60_000;
    return Math.min(MAX_TIMER_DELAY_MS, Math.max(1, Math.floor(n)));
}
|
|
17
|
+
/**
 * How often the harness tails the Coder's follow-up sentinel file to surface new items.
 * Short (a few seconds) so the Follow-up companion lights up promptly while the Coder is
 * still running. Overridable via env for tests.
 *
 * Reads `JOB_FOLLOWUP_POLL_INTERVAL_MS`. A missing, non-numeric, non-finite or
 * non-positive value falls back to 3 000 ms. The result is a whole number of
 * milliseconds clamped to [1, 2^31 - 1], because Node timers coerce any delay outside
 * that range to 1 ms (which would turn a very large override into a busy poll).
 *
 * @returns {number} The follow-up poll interval in milliseconds.
 */
function followUpPollIntervalMs() {
    // Largest delay Node's timers honour (a signed 32-bit integer).
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const n = Number(process.env.JOB_FOLLOWUP_POLL_INTERVAL_MS);
    if (!Number.isFinite(n) || n <= 0)
        return 3_000;
    return Math.min(MAX_TIMER_DELAY_MS, Math.max(1, Math.floor(n)));
}
|
|
26
|
+
/**
 * Clone (or RESUME an existing branch) → write context → run Pi → push the branch
 * iff it carries work. The agent commits its OWN work (it alone knows which files
 * belong vs scratch/artifacts it created), so the harness never blanket-stages:
 * {@link commitTrackedEdits} is only a safety net for forgotten edits to ALREADY
 * tracked files, and the run is judged a no-op only when the branch never advanced
 * past its pre-run tip ({@link branchHasCommitsSince}). The harness owns push + PR;
 * it checkpoints (pushes) periodically so an evicted run's commits survive and a
 * retry resumes on them. Returns the run's summary/stats, whether it pushed, and
 * whether it resumed; callers decide what to do after a push (open a PR, or nothing).
 *
 * @param spec The job description. Fields read here: `kind`, `repo` (`cloneUrl`,
 *   `serviceDirectory`), `cloneBranch`, `newBranch`, `pushBranch`, `ghToken`,
 *   `persistentCheckout`, `commitMessage`, `streamFollowUps`, plus the prompt/model/auth
 *   fields forwarded unchanged to `runAgentInWorkspace`.
 * @param opts Optional hooks: `signal` (abort), `log` (logger override), `onPhase`
 *   (called with 'clone' / 'agent' / 'push'), `onFollowUp` (follow-up item sink). The
 *   whole object is also forwarded to `runAgentInWorkspace`.
 * @returns `{ pushed, resumed, summary, stats }` plus `stderrTail` / `usage` when the
 *   agent run reported them.
 */
export async function runCodingAgent(spec, opts = {}) {
    const { signal } = opts;
    // The registry already binds jobId/repo/branch; add the coding kind + the push branch
    // (which differs from the cloned branch the registry bound).
    const logger = (opts.log ?? log).child({ kind: spec.kind, branch: spec.pushBranch });
    return acquireRepoCheckout({ persistent: spec.persistentCheckout === true, prefix: spec.kind, repo: spec.repo }, async (dir) => {
        // Resume an evicted earlier run when its work branch already exists on the
        // remote: clone THAT branch and continue on its commits, rather than branching
        // off base and redoing everything. Only the impl path (which creates a fresh
        // `newBranch`) can resume; the ci-fix/conflict paths already clone the PR branch.
        //
        // Resume safety relies on two invariants the dispatcher (worker) upholds, since
        // the harness can't see run/PR state from inside the container:
        //  - At most ONE active run per block at a time. The work branch is deterministic
        //    per block (`cat-factory/<blockId>`), so two concurrent runs would target the
        //    same branch; their pushes race. A plain (non-forced) push fails safely on a
        //    non-fast-forward rather than clobbering the other run's commits, so the worst
        //    case is one run failing — never lost work — but the dispatcher should not
        //    knowingly run two at once.
        //  - Re-dispatch only NON-terminal runs (failed / evicted / stale-running), whose
        //    branch is by definition unmerged. Resuming a branch whose PR already merged
        //    could re-introduce merged work; that is avoided two ways: the platform deletes
        //    the work branch when its PR merges (GitHubPullRequestMerger), so a re-run finds
        //    no branch and starts fresh, and a `done` block is never re-dispatched anyway.
        const resumed = spec.newBranch != null &&
            (await remoteBranchExists(spec.repo.cloneUrl, spec.newBranch, spec.ghToken, signal));
        opts.onPhase?.('clone');
        if (spec.persistentCheckout) {
            // Reused checkout: clean-sweep + fetch + switch branch in place. A resumed branch
            // (or a run without `newBranch`, working directly on `cloneBranch`) already exists
            // on the remote, so check it out directly; otherwise (re)create `newBranch` off the
            // base tip — the same resume-vs-fresh decision the clone paths below make.
            const targetBranch = spec.newBranch ?? spec.cloneBranch;
            logger.info('coding-agent: preparing reused checkout', { branch: targetBranch, resumed });
            await prepareExistingCheckout({
                dir,
                repo: spec.repo,
                ghToken: spec.ghToken,
                branch: targetBranch,
                baseBranch: spec.cloneBranch,
                existing: resumed || spec.newBranch == null,
                signal,
            });
        }
        else if (resumed) {
            logger.info('coding-agent: resuming existing branch', { branch: spec.newBranch });
            await cloneExistingBranch({
                cloneUrl: spec.repo.cloneUrl,
                branch: spec.newBranch,
                ghToken: spec.ghToken,
                dir,
                signal,
            });
        }
        else {
            logger.info('coding-agent: cloning', { cloneBranch: spec.cloneBranch });
            await cloneRepo({
                repo: { ...spec.repo, baseBranch: spec.cloneBranch },
                ghToken: spec.ghToken,
                dir,
                signal,
            });
            if (spec.newBranch)
                await createBranch(dir, spec.newBranch, signal);
        }
        // The branch tip before the agent runs this time. A FRESH run produced work iff
        // the branch advances past it; a RESUMED run already carries prior work, so it is
        // never a no-op regardless of what this pass adds. Captured BEFORE the resume base
        // refresh below so that refresh's merge commit counts as advancement and is pushed.
        const baseSha = await headCommit(dir, signal);
        // A resumed branch was cut from an OLDER base; merge the latest base in when the
        // two merge cleanly, so the agent works against current base and the PR stays
        // current. On a conflict this is a no-op (the run continues on the stale base — the
        // merge gate handles a conflicting PR downstream, as before), so it never blocks a
        // resume. Best-effort: any error is treated as "continue without refreshing".
        if (resumed) {
            const refreshed = await refreshFromBaseIfClean(dir, spec.cloneBranch, spec.ghToken, signal).catch(() => false);
            if (!refreshed) {
                logger.info('coding-agent: resume base refresh skipped (conflict or error)', {
                    base: spec.cloneBranch,
                });
            }
        }
        // Serialize all pushes to the work branch through a single in-flight promise.
        // A checkpoint tick and the final push (or two slow checkpoint ticks) must never
        // run `git push` to the same branch concurrently: overlapping pushes race on the
        // remote ref and can make a push fail with a ref-lock / non-fast-forward error —
        // which, on the FINAL push, would fail the whole run even though the work is
        // committed. `pushWorkOnce` coalesces concurrent callers onto one push and only
        // pushes once the branch has advanced past `baseSha` (see below).
        //
        // Only push once the branch has advanced past its pre-run tip: pushing while it
        // still sits at `baseSha` would create the work branch at the base commit (a
        // zero-diff branch), which a later retry would see via `remoteBranchExists` and
        // treat as resumable work — then fail to open a PR ("no commits between base and
        // head"). So a run that never commits leaves NO branch behind, preserving the
        // clean no-op outcome.
        let pushInFlight = null;
        const pushWorkOnce = () => {
            if (pushInFlight)
                return pushInFlight;
            pushInFlight = (async () => {
                if (!(await branchHasCommitsSince(dir, baseSha, signal)))
                    return;
                await pushBranch(dir, spec.pushBranch, spec.ghToken, signal);
            })().finally(() => {
                pushInFlight = null;
            });
            return pushInFlight;
        };
        // Read the in-flight push, if any. A function (with an explicit return type) so the
        // value isn't subject to the caller's straight-line narrowing — `pushInFlight` is
        // only ever assigned inside closures, which flow analysis can't observe.
        const inFlightPush = () => pushInFlight;
        // In a monorepo the service lives in a subdirectory: run Pi with its cwd set to
        // that subtree (git stays rooted at `dir` so commits/pushes still cover the whole
        // checkout). Created if missing so a coder scaffolding a brand-new service into an
        // existing monorepo has a cwd to start in. The agent is also TOLD it's in a
        // monorepo (and where) via the AGENTS.md context below.
        const serviceDirectory = spec.repo.serviceDirectory;
        const workDir = serviceDirectory ? join(dir, serviceDirectory) : dir;
        if (serviceDirectory)
            await mkdir(workDir, { recursive: true });
        // Follow-up companion: tail the Coder's sentinel file and stream new items out on the
        // job view. Locally exclude it from git first so the agent's own `git add` can never
        // stage it and it never surfaces as an untracked leftover or in the PR. The sentinel
        // lives in the agent's working directory (its cwd), where the prompt tells it to write.
        const followUpTailer = spec.streamFollowUps && opts.onFollowUp
            ? new FollowUpTailer(join(workDir, FOLLOW_UPS_FILENAME), opts.onFollowUp, logger)
            : undefined;
        if (followUpTailer)
            await excludeFromGit(dir, FOLLOW_UPS_FILENAME, signal);
        // Both timers are started only AFTER every fallible setup step above (mkdir,
        // excludeFromGit), immediately before the try/finally that clears them. Starting
        // them earlier would leak a ticking interval if that setup threw, because the
        // `finally` below would never be reached.
        //
        // Checkpoint the agent's committed work to the branch periodically so an eviction
        // mid-run doesn't lose it (a retry then resumes from the pushed commits). The
        // agent commits its own work; this only PUSHES already-committed commits, so it
        // never races the agent's staging. Best-effort: a failed checkpoint is skipped.
        // Surface checkpoint-push failures at warn with a running count: a checkpoint losing
        // a race is harmless once, but a steadily-climbing count means mid-run work is NOT
        // being durably checkpointed, so an eviction would lose it — previously invisible at
        // info level. Still best-effort: a failed checkpoint never fails the run.
        let checkpointFailures = 0;
        const checkpoint = setInterval(() => {
            pushWorkOnce().catch((err) => {
                checkpointFailures++;
                logger.warn('coding-agent: checkpoint push failed', {
                    reason: err instanceof Error ? err.message : String(err),
                    checkpointFailures,
                });
            });
        }, checkpointIntervalMs());
        checkpoint.unref?.();
        let followUpTick;
        if (followUpTailer) {
            followUpTick = setInterval(() => {
                // Best-effort, like the final flush below: a failed poll must not surface
                // as an unhandled rejection; the next tick simply tries again.
                followUpTailer.poll().catch(() => { });
            }, followUpPollIntervalMs());
            followUpTick.unref?.();
        }
        let outcome;
        try {
            opts.onPhase?.('agent');
            logger.info('coding-agent: running agent', { serviceDirectory });
            const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
                dir: workDir,
                systemPrompt: spec.systemPrompt,
                userPrompt: spec.userPrompt,
                model: spec.model,
                harness: spec.harness,
                subscriptionToken: spec.subscriptionToken,
                subscriptionBaseUrl: spec.subscriptionBaseUrl,
                ambientAuth: spec.ambientAuth,
                proxyBaseUrl: spec.proxyBaseUrl,
                sessionToken: spec.sessionToken,
                serviceDirectory,
                webToolsGuidance: spec.webToolsGuidance,
                webSearchProxy: spec.webSearchProxy,
                guardLimits: spec.guardLimits,
            }, opts);
            // Stop tailing the follow-up sentinel and flush any items written after the last
            // tick, so a fast final burst still reaches the job view before the run is recorded.
            if (followUpTick)
                clearInterval(followUpTick);
            if (followUpTailer)
                await followUpTailer.poll().catch(() => { });
            // Safety net for forgotten edits: commit changes to TRACKED files only (never
            // untracked scratch files/artifacts — the agent owns committing new files).
            await commitTrackedEdits(dir, spec.commitMessage, signal);
            // Stop periodic checkpoints and let any in-flight one settle BEFORE the final
            // push, so the two never run a concurrent `git push` to the same branch (the
            // final push below is then a fresh attempt whose failure is the real signal).
            clearInterval(checkpoint);
            const inflight = inFlightPush();
            if (inflight)
                await inflight.catch(() => { });
            // Surface (don't fail on) untracked, non-ignored files the agent left behind:
            // `commitTrackedEdits` only captures edits to ALREADY tracked files, so a NEW
            // file the agent created but forgot to commit is silently dropped. Logging it
            // makes that loss observable when a PR turns out to be missing a file.
            const leftover = await listUntrackedFiles(dir, signal);
            if (leftover.length > 0) {
                logger.warn('coding-agent: uncommitted new files left behind (not pushed)', {
                    count: leftover.length,
                    files: leftover.slice(0, 20),
                });
            }
            const hasWork = resumed || (await branchHasCommitsSince(dir, baseSha, signal));
            let pushed = false;
            if (!hasWork) {
                logger.info('coding-agent: no changes produced', { ...stats });
            }
            else {
                opts.onPhase?.('push');
                logger.info('coding-agent: pushing', { resumed, ...stats });
                await pushWorkOnce();
                pushed = true;
            }
            outcome = {
                pushed,
                resumed,
                summary,
                stats,
                ...(stderrTail ? { stderrTail } : {}),
                ...(usage ? { usage } : {}),
            };
        }
        finally {
            // Safety net for the throw path (the happy path already cleared these above).
            clearInterval(checkpoint);
            if (followUpTick)
                clearInterval(followUpTick);
        }
        return outcome;
    });
}
|
|
276
|
+
/**
 * Builds the "no changes" failure reason shared by both coding agents.
 *
 * The result is the caller's lead phrase, then (only when `agentNeverActed(stats)`
 * holds) a parenthetical naming the likely cause, then a full stop, then whatever
 * `agentOutputTail(stderrTail)` yields for the agent's stderr tail.
 *
 * @param lead Caller-supplied opening phrase of the message.
 * @param stats Run statistics, inspected via `agentNeverActed`.
 * @param stderrTail Tail of the agent's stderr, formatted via `agentOutputTail`.
 * @returns The assembled reason string.
 */
export function noChangesReason(lead, stats, stderrTail) {
    let cause = '';
    if (agentNeverActed(stats)) {
        cause = ' (the agent never acted — it most likely could not reach the model)';
    }
    const tail = agentOutputTail(stderrTail);
    return `${lead}${cause}.${tail}`;
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { cloneRepo, commitAll, conflictDiff, headCommit, mergeBranch, pushBranch, unmergedPaths, } from './git.js';
|
|
2
|
+
import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
|
|
3
|
+
import { log } from './logger.js';
|
|
4
|
+
// Async job execution for the merge-conflict resolver. When a PR cannot be merged
|
|
5
|
+
// because it conflicts with its base, the engine dispatches this: clone the PR head
|
|
6
|
+
// branch (full history), merge the base branch into it to surface the conflicts,
|
|
7
|
+
// run Pi to resolve them, complete the merge commit and push back onto the SAME
|
|
8
|
+
// branch (no new branch / PR) so the PR becomes mergeable and CI re-runs.
|
|
9
|
+
//
|
|
10
|
+
// Shares the thin workspace/Pi base (withWorkspace + runAgentInWorkspace) with the
|
|
11
|
+
// other agents; it diverges only in needing a full clone, a base→branch merge to
|
|
12
|
+
// produce the conflicts, and a guard that refuses to push a half-resolved tree.
|
|
13
|
+
/**
 * Run one conflict-resolver job: clone → merge base → Pi resolves → push (same branch).
 *
 * @param job The job: `jobId`, `repo` (`owner`, `name`, `baseBranch`), `branch` (the PR
 *   head), `ghToken`, and the prompt/model/auth fields forwarded to `runAgentInWorkspace`.
 * @param opts Optional `signal` for aborting; the object is also forwarded to
 *   `runAgentInWorkspace`.
 * @returns `{ resolved, summary, stats }`, plus `usage` when the agent reported it and
 *   `error` when conflicts remain after the agent ran.
 */
export async function handleConflictResolver(job, opts = {}) {
    const { signal } = opts;
    const trace = { jobId: job.jobId, repo: `${job.repo.owner}/${job.repo.name}`, branch: job.branch };
    // Stats reported when the agent was never needed (a fresh object per result).
    const idleStats = () => ({ toolCalls: 0, assistantChars: 0 });
    return withWorkspace('conflict', async (dir) => {
        log.info('conflict: cloning PR branch (full history)', trace);
        // Full clone so the merge base + `origin/<base>` are present for the merge.
        await cloneRepo({
            repo: { ...job.repo, baseBranch: job.branch },
            ghToken: job.ghToken,
            dir,
            signal,
            full: true,
        });
        const tipBeforeMerge = await headCommit(dir, signal);
        log.info('conflict: merging base into PR branch', { ...trace, base: job.repo.baseBranch });
        const mergedCleanly = await mergeBranch(dir, job.repo.baseBranch, signal);
        if (mergedCleanly) {
            // No conflicts to resolve. If base brought new commits the merge advanced the
            // branch, so push it; otherwise the branch is already up to date — a no-op we
            // leave alone (re-dispatching it never changes the PR, so a gate that keeps
            // seeing GitHub report this branch as "conflicting" is a base-resolution problem,
            // not the agent's — logged here so that loop is diagnosable).
            const tipAfterMerge = await headCommit(dir, signal);
            if (tipAfterMerge === tipBeforeMerge) {
                log.info('conflict: base merged clean and branch already up to date — nothing to push', {
                    ...trace,
                    base: job.repo.baseBranch,
                });
                return {
                    resolved: true,
                    summary: 'No conflicts: the branch is already up to date with its base.',
                    stats: idleStats(),
                };
            }
            log.info('conflict: base merged clean — pushing the merge commit', trace);
            await pushBranch(dir, job.branch, job.ghToken, signal);
            return {
                resolved: true,
                summary: 'Merged the base in cleanly (no conflicts to resolve).',
                stats: idleStats(),
            };
        }
        // The merge left conflicts in the working tree. Surface the EXACT files + hunks
        // to the agent: the generic task prompt alone never told it which files conflict
        // (or even that there were conflicts), so it would drift onto the original feature
        // task. Lead with the conflict; keep the task only as trailing reference.
        const conflicted = await unmergedPaths(dir, signal);
        log.info('conflict: resolving conflicts with agent', { ...trace, conflicted });
        const diff = await conflictDiff(dir, conflicted, signal);
        const userPrompt = buildConflictPrompt(job.repo.baseBranch, job.branch, conflicted, diff, job.userPrompt);
        const run = await runAgentInWorkspace({
            dir,
            systemPrompt: job.systemPrompt,
            userPrompt,
            model: job.model,
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
        }, opts);
        const { summary, stats, stderrTail, usage } = run;
        // Never push a half-resolved tree: if any conflict markers / unmerged paths
        // remain, the PR would still be broken. Fail so the engine can retry / notify.
        const unresolved = await unmergedPaths(dir, signal);
        if (unresolved.length > 0) {
            log.error('conflict: unresolved conflicts remain — refusing to push', {
                ...trace,
                unresolved: unresolved.length,
            });
            const failure = {
                resolved: false,
                summary,
                stats,
                error: unresolvedReason(unresolved, stats, stderrTail),
            };
            if (usage)
                failure.usage = usage;
            return failure;
        }
        // Complete the merge commit with the agent's resolution staged, then push.
        await commitAll(dir, `Merge ${job.repo.baseBranch} into ${job.branch}`, signal);
        log.info('conflict: pushing resolved branch', { ...trace, ...stats });
        await pushBranch(dir, job.branch, job.ghToken, signal);
        const success = { resolved: true, summary, stats };
        if (usage)
            success.usage = usage;
        return success;
    });
}
|
|
97
|
+
/**
 * The conflict-focused user prompt: lead with the exact conflicted files and their
 * hunks (so the model acts on the real conflict, not the original feature task), then
 * carry the task only as trailing reference. The role/system prompt frames it as a
 * merge-conflict resolution; this gives it the concrete material.
 *
 * @param {string} baseBranch Branch that was merged into the PR branch.
 * @param {string} prBranch The pull-request branch holding the conflicts.
 * @param {string[]} conflicted Paths left conflicted by the merge.
 * @param {string} diff Diff text of the conflicted files, embedded in a fenced block.
 * @param {string} [taskReference] The original task text. Appended as trailing reference
 *   only when it is a string with non-whitespace content; a missing or non-string value
 *   is treated as "no reference" rather than throwing.
 * @returns {string} The prompt, with sections joined by newlines.
 */
function buildConflictPrompt(baseBranch, prBranch, conflicted, diff, taskReference) {
    const fileList = conflicted.map((p) => `- ${p}`).join('\n');
    const parts = [
        `The base branch \`${baseBranch}\` was merged into this pull-request branch ` +
            `\`${prBranch}\` and left Git merge conflicts in the following ${conflicted.length} ` +
            `file(s):`,
        '',
        fileList,
        '',
        'Resolve EVERY conflict in these files: open each one, understand both sides of each ' +
            '`<<<<<<<` / `=======` / `>>>>>>>` region, and edit it to a correct result that ' +
            "preserves the intent of BOTH the base changes and this PR's changes — never just " +
            'discard one side. Remove every conflict marker and leave the project building. Do ' +
            'not create a new branch or PR; the harness completes the merge commit and pushes once ' +
            'no conflict markers remain.',
        '',
        'Conflict hunks (`git diff` of the conflicted files):',
        '',
        '```diff',
        diff,
        '```',
    ];
    // A job without a task prompt must not crash prompt building after the clone and
    // merge have already run: only a real, non-blank string is carried along.
    const ref = typeof taskReference === 'string' ? taskReference.trim() : '';
    if (ref) {
        parts.push('', 'For reference, the task this pull request implements:', '', ref);
    }
    return parts.join('\n');
}
|
|
131
|
+
/**
 * Human-readable reason the agent failed to fully resolve the conflicts.
 *
 * Reports how many files are still conflicted, lists at most the first ten of them,
 * appends `NEVER_ACTED_CAUSE` when `agentNeverActed(stats)` holds, and finishes with
 * whatever `agentOutputTail(stderrTail)` yields.
 *
 * @param unresolved Paths that are still unmerged after the agent ran.
 * @param stats Run statistics, inspected via `agentNeverActed`.
 * @param stderrTail Tail of the agent's stderr, formatted via `agentOutputTail`.
 * @returns The assembled reason string.
 */
function unresolvedReason(unresolved, stats, stderrTail) {
    const neverActed = agentNeverActed(stats);
    const cause = neverActed ? NEVER_ACTED_CAUSE : '';
    // Cap the listing so a badly conflicted merge doesn't produce an enormous message.
    const sample = unresolved.slice(0, 10).join(', ');
    const headline = `The agent did not resolve all merge conflicts (${unresolved.length} file(s) still conflicted: ${sample}).`;
    const tail = agentOutputTail(stderrTail);
    return `${headline}${cause}${tail}`;
}
|
package/dist/embed.js
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Embeddable surface of the executor harness: the Pi-driving and git helpers
|
|
2
|
+
// the container payload uses, re-exported so other packages (e.g. the benchmark
|
|
3
|
+
// harness) can run the *same* coding-agent flow outside the container — clone a
|
|
4
|
+
// repo, write the agent context, point Pi at an OpenAI-compatible endpoint, run
|
|
5
|
+
// it, and inspect what changed. The HTTP server / job lifecycle stays internal;
|
|
6
|
+
// only the reusable primitives are exposed here.
|
|
7
|
+
export { PI_MAX_OUTPUT_TOKENS, DEFAULT_PROGRESS_GUARD_LIMITS, writePiModelsConfig, writeAgentsContext, runPi, summarizePiRun, parsePiOutput, parseTodoProgress, progressGuardLimitsFromEnv, terminalRunError, } from './pi.js';
|
|
8
|
+
export { cloneRepo, createBranch, changedPathsFromPorcelain, hasAgentChanges, redactSecrets, } from './git.js';
|
package/dist/explore.js
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { join } from 'node:path';
|
|
2
|
+
import { mkdir } from 'node:fs/promises';
|
|
3
|
+
import { cloneRepo } from './git.js';
|
|
4
|
+
import { agentNeverActed, agentOutputTail, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
|
|
5
|
+
import { log } from './logger.js';
|
|
6
|
+
// The shared read-only container agent: clone a branch, run Pi to EXPLORE the
|
|
7
|
+
// checkout (read-only), and return its prose report/proposal. Both the architect
|
|
8
|
+
// (proposes a design after reading the code) and the tech-debt analysis agent use
|
|
9
|
+
// this one path. Unlike the coding agents (`/run`, `/ci-fix`) it pushes nothing and
|
|
10
|
+
// opens no PR, and — like the merger — it makes no edits, so an edit-free run is the
|
|
11
|
+
// expected, correct outcome rather than a "no changes" failure. The only failure
|
|
12
|
+
// mode is producing no text at all (the agent never reached the model).
|
|
13
|
+
/**
 * Run one read-only exploration job end to end: clone branch → Pi explores → return prose.
 *
 * @param job The job: `jobId`, optional `label` (defaults to 'explore' for both the trace
 *   kind and the workspace prefix), `repo` (`owner`, `name`, `serviceDirectory`), `branch`,
 *   `ghToken`, and the prompt/model/auth fields forwarded to `runAgentInWorkspace`.
 * @param opts Optional `signal` for the clone; the object is also forwarded to
 *   `runAgentInWorkspace`.
 * @returns `{ summary, stats }`, plus `usage` when reported and `error` when the agent
 *   produced no text at all.
 */
export async function handleExplore(job, opts = {}) {
    const label = job.label ?? 'explore';
    const trace = {
        jobId: job.jobId,
        kind: label,
        repo: `${job.repo.owner}/${job.repo.name}`,
        branch: job.branch,
    };
    return withWorkspace(label, async (dir) => {
        log.info('explore: cloning', trace);
        await cloneRepo({
            repo: { ...job.repo, baseBranch: job.branch },
            ghToken: job.ghToken,
            dir,
            signal: opts.signal,
        });
        // In a monorepo the service lives in a subdirectory: run Pi with its cwd set
        // there (created if missing, mirroring the coding agent) so a service-scoped
        // exploration sees the right subtree.
        const serviceDirectory = job.repo.serviceDirectory;
        let workDir = dir;
        if (serviceDirectory) {
            workDir = join(dir, serviceDirectory);
            await mkdir(workDir, { recursive: true });
        }
        log.info('explore: running agent', { ...trace, serviceDirectory });
        const run = await runAgentInWorkspace({
            dir: workDir,
            systemPrompt: job.systemPrompt,
            userPrompt: job.userPrompt,
            model: job.model,
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
            serviceDirectory,
            // Read-only: it inspects and reports, making no edits — so the no-progress
            // guard's no-edit bound must not fire on its legitimately edit-free run.
            expectsEdits: false,
            webToolsGuidance: job.webToolsGuidance,
            webSearchProxy: job.webSearch,
        }, opts);
        const { summary, stats, stderrTail, usage } = run;
        // The prose report IS the deliverable; an edit-free run is success. The only
        // failure is producing no text at all (the signature of never reaching the model).
        const producedText = Boolean(summary.trim());
        if (!producedText) {
            const failure = {
                summary,
                stats,
                error: noOutputReason(stats, stderrTail),
            };
            if (usage)
                failure.usage = usage;
            return failure;
        }
        log.info('explore: done', { ...trace, ...stats });
        const report = { summary, stats };
        if (usage)
            report.usage = usage;
        return report;
    });
}
|
|
68
|
+
/**
 * Human-readable reason a read-only run produced no usable output.
 *
 * Adds a parenthetical naming the likely cause only when `agentNeverActed(stats)`
 * holds, and ends with whatever `agentOutputTail(stderrTail)` yields.
 *
 * @param stats Run statistics, inspected via `agentNeverActed`.
 * @param stderrTail Tail of the agent's stderr, formatted via `agentOutputTail`.
 * @returns The assembled reason string.
 */
function noOutputReason(stats, stderrTail) {
    let cause = '';
    if (agentNeverActed(stats)) {
        cause = ' (the agent never acted — it most likely could not reach the model)';
    }
    const tail = agentOutputTail(stderrTail);
    return `Read-only agent produced no report${cause}.${tail}`;
}
|
package/dist/failure.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
// Single source of truth for how a job FAILS: the canonical failure-cause vocabulary plus
|
|
2
|
+
// the watchdog abort-message builders.
|
|
3
|
+
//
|
|
4
|
+
// WHY THIS MODULE EXISTS — the backend classifies a failed job by REGEX-matching the
|
|
5
|
+
// harness's free-text `error` string (it has no other signal today):
|
|
6
|
+
// - server `ContainerRepoBootstrapper.classifyBootstrapFailure`:
|
|
7
|
+
// /inactivity|no agent activity|max duration/i → 'timeout', else → 'agent'
|
|
8
|
+
// - orchestration `job.logic.isContainerEvictionError`: /evicted or crashed/i (FACADE-owned,
|
|
9
|
+
// NOT emitted here — the harness must keep NOT emitting that phrase for a non-eviction)
|
|
10
|
+
// Because those phrases are matched downstream, their wording MUST stay stable. Centralizing
|
|
11
|
+
// the builders here keeps the emitted text from drifting away from the regex that reads it.
|
|
12
|
+
// Alongside the strings we now also emit a STRUCTURED {@link FailureCause} on the job view so
|
|
13
|
+
// the backend can prefer it and treat the regex as a backward-compatible fallback.
|
|
14
|
+
/**
 * A thrown failure that carries a structured {@link FailureCause}, so a `git` / `api`
 * operation that fails deep in a helper surfaces its real cause instead of being flattened
 * to the generic `agent` in the registry's catch. The watchdog kills set their cause from
 * `killReason` and never throw this; anything else thrown without a cause stays `agent`.
 *
 * The optional third constructor argument is forwarded to the built-in `Error`
 * constructor, so a helper that wraps a lower-level error can keep it reachable via the
 * standard `cause` property: `new HarnessFailure('git', 'clone failed', { cause: err })`.
 */
export class HarnessFailure extends Error {
    /** The structured failure cause attached to this error. */
    failureCause;
    /**
     * @param failureCause - Structured cause to attach; read back by `failureCauseOf`.
     * @param message - Human-readable error message.
     * @param [options] - Optional standard `Error` options (e.g. `{ cause }`). Omitting it
     *   behaves exactly like the two-argument form.
     */
    constructor(failureCause, message, options) {
        super(message, options);
        this.name = 'HarnessFailure';
        this.failureCause = failureCause;
    }
}
|
|
28
|
+
/**
 * Read the structured cause off a thrown value.
 *
 * @param err - Any caught value.
 * @returns The `failureCause` of a `HarnessFailure`, or `undefined` for anything else.
 */
export function failureCauseOf(err) {
    if (!(err instanceof HarnessFailure)) {
        return undefined;
    }
    return err.failureCause;
}
|
|
32
|
+
/**
 * Build the PREFIX of the inactivity-watchdog abort message.
 *
 * The `no agent activity` phrase is regex-matched by the backend's
 * `classifyBootstrapFailure` (→ `timeout`); do not reword it. The caller appends a
 * `(likely hung ...)` diagnostic clause (phase + last tool) after this, so the prefix
 * deliberately stops before the parenthetical (see `runner.ts` drive catch).
 *
 * @param inactivityMs - Inactivity window in milliseconds; reported rounded to whole seconds.
 * @returns The abort-message prefix.
 */
export function inactivityAbortMessage(inactivityMs) {
    const idleSeconds = Math.round(inactivityMs / 1000);
    return `Aborted: no agent activity for ${idleSeconds}s`;
}
|
|
41
|
+
/**
 * Build the max-duration-watchdog abort message.
 *
 * The `max duration` phrase is regex-matched by the backend's `classifyBootstrapFailure`
 * (→ `timeout`); do not reword it.
 *
 * @param maxDurationMs - Duration limit in milliseconds; reported rounded to whole seconds.
 * @returns The abort message.
 */
export function maxDurationAbortMessage(maxDurationMs) {
    const limitSeconds = Math.round(maxDurationMs / 1000);
    return `Aborted: exceeded max duration of ${limitSeconds}s`;
}
|
package/dist/fixer.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { noChangesReason, runCodingAgent } from './coding-agent.js';
|
|
2
|
+
// Async job execution for the test Fixer. When a Tester withholds its greenlight the
|
|
3
|
+
// engine dispatches this: clone the PR HEAD branch, run Pi to fix the concerns in the
|
|
4
|
+
// Tester's report (folded into the user prompt by the backend), then commit + push
|
|
5
|
+
// back onto the SAME branch (no new branch, no new PR) so the Tester can re-run. The
|
|
6
|
+
// engine re-dispatches the Tester after the push and loops up to the attempt budget.
|
|
7
|
+
//
|
|
8
|
+
// The clone/Pi/push mechanics are shared with implementation + the CI-fixer via
|
|
9
|
+
// runCodingAgent; the Fixer only differs in working ON the existing PR branch.
|
|
10
|
+
/**
 * Run one Fixer job end to end: clone branch → Pi fixes → push (same branch).
 *
 * @param job - The Fixer job; its fields are forwarded to `runCodingAgent`, with
 *   `job.branch` used as both the clone and the push branch.
 * @param opts - Passed through unchanged as the second argument of `runCodingAgent`.
 * @returns `{ pushed, summary, stats }`, plus `usage` when the run reported one, plus an
 *   `error` string (from `noChangesReason`) when nothing was pushed.
 */
export async function handleFixer(job, opts = {}) {
    const request = {
        kind: 'fix-tests',
        jobId: job.jobId,
        repo: job.repo,
        // The PR head branch is both the clone source and the push target, so the fix
        // lands on the existing PR — no new branch, no new PR.
        cloneBranch: job.branch,
        pushBranch: job.branch,
        ghToken: job.ghToken,
        systemPrompt: job.systemPrompt,
        userPrompt: job.userPrompt,
        model: job.model,
        harness: job.harness,
        subscriptionToken: job.subscriptionToken,
        subscriptionBaseUrl: job.subscriptionBaseUrl,
        proxyBaseUrl: job.proxyBaseUrl,
        sessionToken: job.sessionToken,
        commitMessage: 'Fix issues found by the tester',
        webToolsGuidance: job.webToolsGuidance,
        webSearchProxy: job.webSearch,
    };
    const outcome = await runCodingAgent(request, opts);
    const { summary, stats, usage } = outcome;
    // `usage` is only included in the result when the run actually reported one.
    const usagePart = usage ? { usage } : {};
    if (outcome.pushed) {
        return { pushed: true, summary, stats, ...usagePart };
    }
    // Nothing was pushed. The engine re-runs the Tester regardless; `pushed: false` and the
    // reason string keep the (unused) result meaningful.
    return {
        pushed: false,
        summary,
        stats,
        error: noChangesReason('No test fix produced', stats, outcome.stderrTail),
        ...usagePart,
    };
}
|