@cat-factory/executor-harness 1.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +143 -0
- package/dist/agent-runner.js +389 -0
- package/dist/agent.js +810 -0
- package/dist/blueprint.js +367 -0
- package/dist/bootstrap.js +99 -0
- package/dist/ci-fixer.js +46 -0
- package/dist/coding-agent.js +285 -0
- package/dist/conflict-resolver.js +138 -0
- package/dist/embed.js +8 -0
- package/dist/explore.js +74 -0
- package/dist/failure.js +47 -0
- package/dist/fixer.js +44 -0
- package/dist/follow-ups.js +103 -0
- package/dist/frontend-infra.js +283 -0
- package/dist/fs-utils.js +11 -0
- package/dist/git.js +778 -0
- package/dist/job.js +409 -0
- package/dist/logger.js +27 -0
- package/dist/merger.js +135 -0
- package/dist/on-call.js +126 -0
- package/dist/pi-workspace.js +237 -0
- package/dist/pi.js +971 -0
- package/dist/process.js +25 -0
- package/dist/redact.js +109 -0
- package/dist/runner.js +228 -0
- package/dist/server.js +135 -0
- package/dist/spec.js +754 -0
- package/dist/structured-output.js +431 -0
- package/dist/tester.js +191 -0
- package/package.json +35 -0
- package/src/agent-runner.ts +484 -0
- package/src/agent.ts +948 -0
- package/src/coding-agent.ts +393 -0
- package/src/embed.ts +32 -0
- package/src/failure.ts +73 -0
- package/src/follow-ups.ts +106 -0
- package/src/frontend-infra.ts +340 -0
- package/src/fs-utils.ts +11 -0
- package/src/git.ts +955 -0
- package/src/job.ts +766 -0
- package/src/logger.ts +45 -0
- package/src/pi-workspace.ts +348 -0
- package/src/pi.ts +1236 -0
- package/src/process.ts +33 -0
- package/src/redact.ts +109 -0
- package/src/runner.ts +384 -0
- package/src/server.ts +153 -0
- package/src/structured-output.ts +524 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
import { mkdir } from 'node:fs/promises'
|
|
2
|
+
import { join } from 'node:path'
|
|
3
|
+
import type { HarnessAuthFields, RepoSpec } from './job.js'
|
|
4
|
+
import {
|
|
5
|
+
branchHasCommitsSince,
|
|
6
|
+
cloneExistingBranch,
|
|
7
|
+
cloneRepo,
|
|
8
|
+
commitTrackedEdits,
|
|
9
|
+
createBranch,
|
|
10
|
+
excludeFromGit,
|
|
11
|
+
headCommit,
|
|
12
|
+
listUntrackedFiles,
|
|
13
|
+
prepareExistingCheckout,
|
|
14
|
+
pushBranch,
|
|
15
|
+
refreshFromBaseIfClean,
|
|
16
|
+
remoteBranchExists,
|
|
17
|
+
} from './git.js'
|
|
18
|
+
import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js'
|
|
19
|
+
import type { PiRunStats } from './pi.js'
|
|
20
|
+
import {
|
|
21
|
+
acquireRepoCheckout,
|
|
22
|
+
agentNeverActed,
|
|
23
|
+
agentOutputTail,
|
|
24
|
+
runAgentInWorkspace,
|
|
25
|
+
} from './pi-workspace.js'
|
|
26
|
+
import type { ProgressGuardLimits } from './pi.js'
|
|
27
|
+
import type { RunOptions } from './runner.js'
|
|
28
|
+
import { log } from './logger.js'
|
|
29
|
+
|
|
30
|
+
// The shared skeleton for the container coding agents that clone a repo, run Pi
|
|
31
|
+
// against it and push the result on a branch. The implementation (`/run`) and
|
|
32
|
+
// CI-fixer (`/ci-fix`) agents are conceptually the same job — only what they clone
|
|
33
|
+
// onto and what they do with the outcome differ — so they share this whole flow
|
|
34
|
+
// rather than each re-deriving (and separately bug-fixing) it. Built on the thinner
|
|
35
|
+
// {@link withWorkspace}/{@link runAgentInWorkspace} base shared with the non-pushing
|
|
36
|
+
// agents (bootstrap/blueprint/merger). Mirrors their secret handling: the per-job
|
|
37
|
+
// GitHub + proxy tokens arrive in the spec and live only for the job's duration.
|
|
38
|
+
|
|
39
|
+
/**
 * What a coding agent run needs: where to clone, what to run, where to push.
 *
 * Besides the fields declared here, `runCodingAgent` also forwards `harness`,
 * `subscriptionToken`, `subscriptionBaseUrl`, `ambientAuth`, `proxyBaseUrl` and
 * `sessionToken` from the spec to the agent runner; those are not declared in this
 * interface, so they come in through {@link HarnessAuthFields}.
 */
export interface CodingAgentSpec extends HarnessAuthFields {
  /** Short label for the temp dir + log lines (e.g. 'impl', 'ci-fix'). */
  kind: string
  /** The job id, threaded into every log line for end-to-end tracing. */
  jobId: string
  /**
   * The repository to work on. `runCodingAgent` reads its `cloneUrl` (branch probe and
   * clone) and its optional `serviceDirectory` (the subdirectory used as the agent's cwd).
   */
  repo: RepoSpec
  /** Branch to clone and check out as the starting point. */
  cloneBranch: string
  /** A fresh branch to create off the clone before running; omit to work directly on `cloneBranch`. */
  newBranch?: string
  /** Branch the produced change is pushed to. */
  pushBranch: string
  /** Token handed to every remote git operation of the run (branch probe, clone/fetch, base refresh, push). */
  ghToken: string
  /** Composed role + best-practice fragments; written to Pi's global AGENTS.md context. */
  systemPrompt: string
  /** The concrete task prompt handed to Pi. */
  userPrompt: string
  /** Model identifier, forwarded unchanged to the agent runner. */
  model: string
  /** Commit message for any work the agent left uncommitted. */
  commitMessage: string
  /** Per-kind web-search guidance (backend-composed); surfaced only when web search is on. */
  webToolsGuidance?: string
  /** Enable proxy-backed web search for this run (see {@link AgentRunSpec.webSearchProxy}). */
  webSearchProxy?: boolean
  /** Per-knob progress-guard overrides (loosen-only), set per agent kind by the backend. */
  guardLimits?: Partial<ProgressGuardLimits>
  /**
   * Reuse a stable per-repo checkout (clean-sweep + fetch + switch branch) instead of a
   * fresh clone into a throwaway temp dir. Set only by the local warm-pool transport
   * (its containers are reused across runs); absent everywhere else.
   */
  persistentCheckout?: boolean
  /**
   * Tail the Coder's follow-up sentinel file ({@link FOLLOW_UPS_FILENAME}) and stream the
   * forward-looking items it surfaces out on the job view (the Follow-up companion). Set
   * only for the implementer (`coder`) dispatch; absent ⇒ no tailing (e.g. the CI-fixer).
   * Tailing additionally requires the caller to supply `onFollowUp` in the run options.
   */
  streamFollowUps?: boolean
}
|
|
79
|
+
|
|
80
|
+
/** The outcome of a coding agent run, before each caller maps it to its own result shape. */
export interface CodingAgentOutcome {
  /** Whether the branch carries work and was therefore pushed (new commits, or resumed prior work). */
  pushed: boolean
  /** Whether the run resumed an existing remote branch (prior work already pushed). */
  resumed: boolean
  /** The run summary, passed through unchanged from the agent runner. */
  summary: string
  /** The run statistics, passed through unchanged from the agent runner. */
  stats: PiRunStats
  /** Tail of the agent's stderr as returned by the agent runner; omitted when empty. */
  stderrTail?: string
  /** Token usage from a subscription harness's CLI stream (absent for Pi). */
  usage?: { inputTokens: number; outputTokens: number }
}
|
|
92
|
+
|
|
93
|
+
/**
 * How often the harness checkpoints the agent's work mid-run by pushing the branch.
 * A per-run container can be evicted at any moment; pushing the agent's commits
 * periodically means an evicted run's work survives on the branch, so a retry
 * RESUMES on top of it instead of starting over. Overridable via the
 * `JOB_CHECKPOINT_INTERVAL_MS` env var (for tests).
 *
 * @returns The interval in whole milliseconds: the floored override when it is a finite
 *   value of at least 1ms (capped at the largest delay Node timers accept), otherwise
 *   the 60s default.
 */
function checkpointIntervalMs(): number {
  const DEFAULT_INTERVAL_MS = 60_000
  // Node timers coerce any delay above this (or below 1) to 1ms, which would turn a
  // mistyped override into a push storm — so clamp / reject instead of passing it on.
  const MAX_TIMER_DELAY_MS = 2_147_483_647
  // Floor BEFORE validating so a fractional sub-millisecond value (e.g. "0.5") is
  // rejected rather than becoming a 0ms interval.
  const requested = Math.floor(Number(process.env.JOB_CHECKPOINT_INTERVAL_MS))
  if (!Number.isFinite(requested) || requested < 1) return DEFAULT_INTERVAL_MS
  return Math.min(requested, MAX_TIMER_DELAY_MS)
}
|
|
103
|
+
|
|
104
|
+
/**
 * How often the harness tails the Coder's follow-up sentinel file to surface new items.
 * Short (a few seconds) so the Follow-up companion lights up promptly while the Coder is
 * still running. Overridable via the `JOB_FOLLOWUP_POLL_INTERVAL_MS` env var (for tests).
 *
 * @returns The interval in whole milliseconds: the floored override when it is a finite
 *   value of at least 1ms (capped at the largest delay Node timers accept), otherwise
 *   the 3s default.
 */
function followUpPollIntervalMs(): number {
  const DEFAULT_INTERVAL_MS = 3_000
  // Node timers coerce any delay above this (or below 1) to 1ms, which would turn a
  // mistyped override into a tight file-read loop — so clamp / reject instead.
  const MAX_TIMER_DELAY_MS = 2_147_483_647
  // Floor BEFORE validating so a fractional sub-millisecond value (e.g. "0.5") is
  // rejected rather than becoming a 0ms interval.
  const requested = Math.floor(Number(process.env.JOB_FOLLOWUP_POLL_INTERVAL_MS))
  if (!Number.isFinite(requested) || requested < 1) return DEFAULT_INTERVAL_MS
  return Math.min(requested, MAX_TIMER_DELAY_MS)
}
|
|
113
|
+
|
|
114
|
+
/**
 * Clone (or RESUME an existing branch) → write context → run Pi → push the branch
 * iff it carries work. The agent commits its OWN work (it alone knows which files
 * belong vs scratch/artifacts it created), so the harness never blanket-stages:
 * {@link commitTrackedEdits} is only a safety net for forgotten edits to ALREADY
 * tracked files, and the run is judged a no-op only when the branch never advanced
 * past its pre-run tip ({@link branchHasCommitsSince}). The harness owns push + PR;
 * it checkpoints (pushes) periodically so an evicted run's commits survive and a
 * retry resumes on them.
 *
 * @param spec What to clone, what to run and where to push.
 * @param opts Run options; this function reads `signal` (threaded into every git call),
 *   `log` (base logger, defaults to the module logger), `onPhase` (called with
 *   'clone' / 'agent' / 'push') and `onFollowUp` (receives streamed follow-up items), and
 *   forwards the whole object to the agent runner.
 * @returns The run's summary/stats, whether it pushed, and whether it resumed; callers
 *   decide what to do after a push (open a PR, or nothing).
 *
 * Rejects when the checkout, the agent run, the tracked-edit commit, the untracked-file
 * listing or the FINAL push rejects. Checkpoint pushes, the resume base refresh and
 * follow-up polling are best-effort and never fail the run.
 */
export async function runCodingAgent(
  spec: CodingAgentSpec,
  opts: RunOptions = {},
): Promise<CodingAgentOutcome> {
  const { signal } = opts
  // The registry already binds jobId/repo/branch; add the coding kind + the push branch
  // (which differs from the cloned branch the registry bound).
  const logger = (opts.log ?? log).child({ kind: spec.kind, branch: spec.pushBranch })
  return acquireRepoCheckout(
    { persistent: spec.persistentCheckout === true, prefix: spec.kind, repo: spec.repo },
    async (dir) => {
      // Resume an evicted earlier run when its work branch already exists on the
      // remote: clone THAT branch and continue on its commits, rather than branching
      // off base and redoing everything. Only the impl path (which creates a fresh
      // `newBranch`) can resume; the ci-fix/conflict paths already clone the PR branch.
      //
      // Resume safety relies on two invariants the dispatcher (worker) upholds, since
      // the harness can't see run/PR state from inside the container:
      //  - At most ONE active run per block at a time. The work branch is deterministic
      //    per block (`cat-factory/<blockId>`), so two concurrent runs would target the
      //    same branch; their pushes race. A plain (non-forced) push fails safely on a
      //    non-fast-forward rather than clobbering the other run's commits, so the worst
      //    case is one run failing — never lost work — but the dispatcher should not
      //    knowingly run two at once.
      //  - Re-dispatch only NON-terminal runs (failed / evicted / stale-running), whose
      //    branch is by definition unmerged. Resuming a branch whose PR already merged
      //    could re-introduce merged work; that is avoided two ways: the platform deletes
      //    the work branch when its PR merges (GitHubPullRequestMerger), so a re-run finds
      //    no branch and starts fresh, and a `done` block is never re-dispatched anyway.
      //
      // `resumeBranch` carries the branch name in its type so the clone below needs no
      // non-null assertion; `resumed` is the boolean every other decision keys off.
      const resumeBranch =
        spec.newBranch != null &&
        (await remoteBranchExists(spec.repo.cloneUrl, spec.newBranch, spec.ghToken, signal))
          ? spec.newBranch
          : undefined
      const resumed = resumeBranch !== undefined
      opts.onPhase?.('clone')
      if (spec.persistentCheckout) {
        // Reused checkout: clean-sweep + fetch + switch branch in place. A resumed branch
        // (or a run without `newBranch`, working directly on `cloneBranch`) already exists
        // on the remote, so check it out directly; otherwise (re)create `newBranch` off the
        // base tip — the same resume-vs-fresh decision the clone paths below make.
        const targetBranch = spec.newBranch ?? spec.cloneBranch
        logger.info('coding-agent: preparing reused checkout', { branch: targetBranch, resumed })
        await prepareExistingCheckout({
          dir,
          repo: spec.repo,
          ghToken: spec.ghToken,
          branch: targetBranch,
          baseBranch: spec.cloneBranch,
          existing: resumed || spec.newBranch == null,
          signal,
        })
      } else if (resumeBranch !== undefined) {
        logger.info('coding-agent: resuming existing branch', { branch: resumeBranch })
        await cloneExistingBranch({
          cloneUrl: spec.repo.cloneUrl,
          branch: resumeBranch,
          ghToken: spec.ghToken,
          dir,
          signal,
        })
      } else {
        logger.info('coding-agent: cloning', { cloneBranch: spec.cloneBranch })
        await cloneRepo({
          repo: { ...spec.repo, baseBranch: spec.cloneBranch },
          ghToken: spec.ghToken,
          dir,
          signal,
        })
        if (spec.newBranch) await createBranch(dir, spec.newBranch, signal)
      }
      // The branch tip before the agent runs this time. A FRESH run produced work iff
      // the branch advances past it; a RESUMED run already carries prior work, so it is
      // never a no-op regardless of what this pass adds. Captured BEFORE the resume base
      // refresh below so that refresh's merge commit counts as advancement and is pushed.
      const baseSha = await headCommit(dir, signal)

      // A resumed branch was cut from an OLDER base; merge the latest base in when the
      // two merge cleanly, so the agent works against current base and the PR stays
      // current. On a conflict this is a no-op (the run continues on the stale base — the
      // merge gate handles a conflicting PR downstream, as before), so it never blocks a
      // resume. Best-effort: any error is treated as "continue without refreshing".
      if (resumed) {
        const refreshed = await refreshFromBaseIfClean(
          dir,
          spec.cloneBranch,
          spec.ghToken,
          signal,
        ).catch(() => false)
        if (!refreshed) {
          logger.info('coding-agent: resume base refresh skipped (conflict or error)', {
            base: spec.cloneBranch,
          })
        }
      }

      // Serialize all pushes to the work branch through a single in-flight promise.
      // A checkpoint tick and the final push (or two slow checkpoint ticks) must never
      // run `git push` to the same branch concurrently: overlapping pushes race on the
      // remote ref and can make a push fail with a ref-lock / non-fast-forward error —
      // which, on the FINAL push, would fail the whole run even though the work is
      // committed. `pushWorkOnce` coalesces concurrent callers onto one push and only
      // pushes once the branch has advanced past `baseSha` (see below).
      //
      // Only push once the branch has advanced past its pre-run tip: pushing while it
      // still sits at `baseSha` would create the work branch at the base commit (a
      // zero-diff branch), which a later retry would see via `remoteBranchExists` and
      // treat as resumable work — then fail to open a PR ("no commits between base and
      // head"). So a run that never commits leaves NO branch behind, preserving the
      // clean no-op outcome.
      let pushInFlight: Promise<void> | null = null
      const pushWorkOnce = (): Promise<void> => {
        if (pushInFlight) return pushInFlight
        pushInFlight = (async () => {
          if (!(await branchHasCommitsSince(dir, baseSha, signal))) return
          await pushBranch(dir, spec.pushBranch, spec.ghToken, signal)
        })().finally(() => {
          pushInFlight = null
        })
        return pushInFlight
      }
      // Read the in-flight push, if any. A function (with an explicit return type) so the
      // value isn't subject to the caller's straight-line narrowing — `pushInFlight` is
      // only ever assigned inside closures, which flow analysis can't observe.
      const inFlightPush = (): Promise<void> | null => pushInFlight

      // Checkpoint the agent's committed work to the branch periodically so an eviction
      // mid-run doesn't lose it (a retry then resumes from the pushed commits). The
      // agent commits its own work; this only PUSHES already-committed commits, so it
      // never races the agent's staging. Best-effort: a failed checkpoint is skipped.
      // Surface checkpoint-push failures at warn with a running count: a checkpoint losing
      // a race is harmless once, but a steadily-climbing count means mid-run work is NOT
      // being durably checkpointed, so an eviction would lose it — previously invisible at
      // info level. Still best-effort: a failed checkpoint never fails the run.
      let checkpointFailures = 0
      const checkpoint = setInterval(() => {
        pushWorkOnce().catch((err) => {
          checkpointFailures++
          logger.warn('coding-agent: checkpoint push failed', {
            reason: err instanceof Error ? err.message : String(err),
            checkpointFailures,
          })
        })
      }, checkpointIntervalMs())
      checkpoint.unref?.()

      // In a monorepo the service lives in a subdirectory: run Pi with its cwd set to
      // that subtree (git stays rooted at `dir` so commits/pushes still cover the whole
      // checkout). Created if missing so a coder scaffolding a brand-new service into an
      // existing monorepo has a cwd to start in. The agent is also TOLD it's in a
      // monorepo (and where) via the AGENTS.md context below.
      const serviceDirectory = spec.repo.serviceDirectory
      const workDir = serviceDirectory ? join(dir, serviceDirectory) : dir
      if (serviceDirectory) await mkdir(workDir, { recursive: true })

      // Follow-up companion: tail the Coder's sentinel file and stream new items out on the
      // job view. Locally exclude it from git first so the agent's own `git add` can never
      // stage it and it never surfaces as an untracked leftover or in the PR. The sentinel
      // lives in the agent's working directory (its cwd), where the prompt tells it to write.
      const followUpTailer =
        spec.streamFollowUps && opts.onFollowUp
          ? new FollowUpTailer(join(workDir, FOLLOW_UPS_FILENAME), opts.onFollowUp, logger)
          : undefined
      // Shared rejection handler for every follow-up poll. Streaming follow-ups is a side
      // channel: a failing poll (e.g. a throwing `onFollowUp` callback) must neither fail
      // the run nor surface as an unhandled promise rejection from the timer tick.
      const onFollowUpPollError = (err: unknown): void => {
        logger.warn('coding-agent: follow-up poll failed', {
          reason: err instanceof Error ? err.message : String(err),
        })
      }
      let followUpTick: ReturnType<typeof setInterval> | undefined
      if (followUpTailer) {
        await excludeFromGit(dir, FOLLOW_UPS_FILENAME, signal)
        followUpTick = setInterval(() => {
          followUpTailer.poll().catch(onFollowUpPollError)
        }, followUpPollIntervalMs())
        followUpTick.unref?.()
      }

      try {
        opts.onPhase?.('agent')
        logger.info('coding-agent: running agent', { serviceDirectory })
        const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
          {
            dir: workDir,
            systemPrompt: spec.systemPrompt,
            userPrompt: spec.userPrompt,
            model: spec.model,
            harness: spec.harness,
            subscriptionToken: spec.subscriptionToken,
            subscriptionBaseUrl: spec.subscriptionBaseUrl,
            ambientAuth: spec.ambientAuth,
            proxyBaseUrl: spec.proxyBaseUrl,
            sessionToken: spec.sessionToken,
            serviceDirectory,
            webToolsGuidance: spec.webToolsGuidance,
            webSearchProxy: spec.webSearchProxy,
            guardLimits: spec.guardLimits,
          },
          opts,
        )

        // Stop tailing the follow-up sentinel and flush any items written after the last
        // tick, so a fast final burst still reaches the job view before the run is recorded.
        if (followUpTick) clearInterval(followUpTick)
        if (followUpTailer) await followUpTailer.poll().catch(onFollowUpPollError)

        // Safety net for forgotten edits: commit changes to TRACKED files only (never
        // untracked scratch files/artifacts — the agent owns committing new files).
        await commitTrackedEdits(dir, spec.commitMessage, signal)

        // Stop periodic checkpoints and let any in-flight one settle BEFORE the final
        // push, so the two never run a concurrent `git push` to the same branch (the
        // final push below is then a fresh attempt whose failure is the real signal).
        clearInterval(checkpoint)
        const inflight = inFlightPush()
        if (inflight) await inflight.catch(() => {})

        // Surface (don't fail on) untracked, non-ignored files the agent left behind:
        // `commitTrackedEdits` only captures edits to ALREADY tracked files, so a NEW
        // file the agent created but forgot to commit is silently dropped. Logging it
        // makes that loss observable when a PR turns out to be missing a file.
        const leftover = await listUntrackedFiles(dir, signal)
        if (leftover.length > 0) {
          logger.warn('coding-agent: uncommitted new files left behind (not pushed)', {
            count: leftover.length,
            files: leftover.slice(0, 20),
          })
        }

        const hasWork = resumed || (await branchHasCommitsSince(dir, baseSha, signal))
        if (hasWork) {
          opts.onPhase?.('push')
          logger.info('coding-agent: pushing', { resumed, ...stats })
          await pushWorkOnce()
        } else {
          logger.info('coding-agent: no changes produced', { ...stats })
        }

        // Both outcomes share one shape; only `pushed` differs, and it is exactly `hasWork`.
        const outcome: CodingAgentOutcome = {
          pushed: hasWork,
          resumed,
          summary,
          stats,
          ...(stderrTail ? { stderrTail } : {}),
          ...(usage ? { usage } : {}),
        }
        return outcome
      } finally {
        // Safety net for the throw path (the happy path already cleared these above).
        clearInterval(checkpoint)
        if (followUpTick) clearInterval(followUpTick)
      }
    },
  )
}
|
|
379
|
+
|
|
380
|
+
/**
 * Builds the "no changes" reason reported by both coding agents.
 *
 * The message is the caller's lead phrase, followed — when the run statistics show the
 * agent never acted — by a parenthetical naming the likely cause, then a full stop and
 * whatever tail {@link agentOutputTail} renders for the given stderr.
 *
 * @param lead Caller-supplied opening phrase (no trailing punctuation).
 * @param stats Statistics of the finished run, used to detect a run that never acted.
 * @param stderrTail Tail of the agent's stderr, if any.
 * @returns The assembled, human-readable reason.
 */
export function noChangesReason(
  lead: string,
  stats: PiRunStats,
  stderrTail: string | undefined,
): string {
  let cause = ''
  if (agentNeverActed(stats)) {
    cause = ' (the agent never acted — it most likely could not reach the model)'
  }
  const tail = agentOutputTail(stderrTail)
  return `${lead}${cause}.${tail}`
}
|
package/src/embed.ts
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// Embeddable surface of the executor harness: the Pi-driving and git helpers
|
|
2
|
+
// the container payload uses, re-exported so other packages (e.g. the benchmark
|
|
3
|
+
// harness) can run the *same* coding-agent flow outside the container — clone a
|
|
4
|
+
// repo, write the agent context, point Pi at an OpenAI-compatible endpoint, run
|
|
5
|
+
// it, and inspect what changed. The HTTP server / job lifecycle stays internal;
|
|
6
|
+
// only the reusable primitives are exposed here.
|
|
7
|
+
|
|
8
|
+
export {
|
|
9
|
+
PI_MAX_OUTPUT_TOKENS,
|
|
10
|
+
DEFAULT_PROGRESS_GUARD_LIMITS,
|
|
11
|
+
writePiModelsConfig,
|
|
12
|
+
writeAgentsContext,
|
|
13
|
+
runPi,
|
|
14
|
+
summarizePiRun,
|
|
15
|
+
parsePiOutput,
|
|
16
|
+
parseTodoProgress,
|
|
17
|
+
progressGuardLimitsFromEnv,
|
|
18
|
+
terminalRunError,
|
|
19
|
+
type PiRunOutcome,
|
|
20
|
+
type PiRunStats,
|
|
21
|
+
type ProgressGuardLimits,
|
|
22
|
+
type TodoItem,
|
|
23
|
+
type TodoProgress,
|
|
24
|
+
} from './pi.js'
|
|
25
|
+
export {
|
|
26
|
+
cloneRepo,
|
|
27
|
+
createBranch,
|
|
28
|
+
changedPathsFromPorcelain,
|
|
29
|
+
hasAgentChanges,
|
|
30
|
+
redactSecrets,
|
|
31
|
+
} from './git.js'
|
|
32
|
+
export type { RepoSpec } from './job.js'
|
package/src/failure.ts
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
// Single source of truth for how a job FAILS: the canonical failure-cause vocabulary plus
|
|
2
|
+
// the watchdog abort-message builders.
|
|
3
|
+
//
|
|
4
|
+
// WHY THIS MODULE EXISTS — the backend classifies a failed job by REGEX-matching the
|
|
5
|
+
// harness's free-text `error` string (it has no other signal today):
|
|
6
|
+
// - server `ContainerRepoBootstrapper.classifyBootstrapFailure`:
|
|
7
|
+
// /inactivity|no agent activity|max duration/i → 'timeout', else → 'agent'
|
|
8
|
+
// - orchestration `job.logic.isContainerEvictionError`: /evicted or crashed/i (FACADE-owned,
|
|
9
|
+
// NOT emitted here — the harness must keep NOT emitting that phrase for a non-eviction)
|
|
10
|
+
// Because those phrases are matched downstream, their wording MUST stay stable. Centralizing
|
|
11
|
+
// the builders here keeps the emitted text from drifting away from the regex that reads it.
|
|
12
|
+
// Alongside the strings we now also emit a STRUCTURED {@link FailureCause} on the job view so
|
|
13
|
+
// the backend can prefer it and treat the regex as a backward-compatible fallback.
|
|
14
|
+
|
|
15
|
+
/**
 * The structured reason a harness job failed, surfaced on the job view's `failureCause`.
 * Covers only HARNESS-owned failures — container eviction is detected by the runtime facade
 * (a vanished container → `(container evicted or crashed)`), never set here.
 *
 * These literals are emitted on the job view for the backend to read, so renaming or
 * removing one changes what the backend receives — treat them as stable.
 *
 * - `inactivity-timeout` — the inactivity watchdog fired (no agent output for the window).
 * - `max-duration` — the overall wall-clock cap fired.
 * - `agent` — the agent ran but produced an unusable/failed result, or threw.
 * - `git` — a git operation failed (clone/push/merge/PR).
 * - `api` — an upstream API call failed (e.g. the GitHub/GitLab PR/MR REST call).
 * - `no-usable-output` — the agent finished but returned no usable report / structured output.
 * - `no-changes` — a coding agent finished without producing any change to push.
 */
export type FailureCause =
  | 'inactivity-timeout'
  | 'max-duration'
  | 'agent'
  | 'git'
  | 'api'
  | 'no-usable-output'
  | 'no-changes'
|
|
36
|
+
|
|
37
|
+
/**
 * An error that knows WHY the job failed. Throwing one from deep inside a `git` / `api`
 * helper lets the real {@link FailureCause} reach the job view instead of being flattened
 * to the generic `agent` in the registry's catch. Watchdog kills derive their cause from
 * `killReason` and never throw this; any other error thrown without a cause stays `agent`.
 */
export class HarnessFailure extends Error {
  /**
   * @param failureCause The structured cause, exposed as a read-only property.
   * @param message The human-readable error message.
   */
  constructor(
    readonly failureCause: FailureCause,
    message: string,
  ) {
    super(message)
    this.name = 'HarnessFailure'
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Extracts the structured cause from a caught value.
 *
 * @param err Anything that was thrown.
 * @returns The {@link FailureCause} when `err` is a {@link HarnessFailure}; `undefined`
 *   for every other value (a plain/agent error).
 */
export function failureCauseOf(err: unknown): FailureCause | undefined {
  if (err instanceof HarnessFailure) {
    return err.failureCause
  }
  return undefined
}
|
|
56
|
+
|
|
57
|
+
/**
 * Builds the PREFIX of the inactivity-watchdog abort message.
 *
 * The `no agent activity` phrase is regex-matched by the backend's
 * `classifyBootstrapFailure` (→ `timeout`); do not reword it. The caller appends a
 * `(likely hung ...)` diagnostic clause (phase + last tool) after this, so the prefix
 * deliberately stops before the parenthetical (see `runner.ts` drive catch).
 *
 * @param inactivityMs The inactivity window, in milliseconds.
 * @returns The prefix, with the window rounded to whole seconds.
 */
export function inactivityAbortMessage(inactivityMs: number): string {
  const seconds = Math.round(inactivityMs / 1000)
  return 'Aborted: no agent activity for ' + seconds + 's'
}
|
|
66
|
+
|
|
67
|
+
/**
 * Builds the max-duration-watchdog abort message.
 *
 * The `max duration` phrase is regex-matched by the backend's `classifyBootstrapFailure`
 * (→ `timeout`); do not reword it.
 *
 * @param maxDurationMs The wall-clock cap, in milliseconds.
 * @returns The message, with the cap rounded to whole seconds.
 */
export function maxDurationAbortMessage(maxDurationMs: number): string {
  const seconds = Math.round(maxDurationMs / 1000)
  return 'Aborted: exceeded max duration of ' + seconds + 's'
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises'
|
|
2
|
+
import { log, type Logger } from './logger.js'
|
|
3
|
+
|
|
4
|
+
// The Coder's forward-looking side channel. As the implementer works it appends one
|
|
5
|
+
// JSON line per item to a sentinel file in its working directory; the harness tails
|
|
6
|
+
// that file and streams the new items OUT on the job view (drain-on-read), so the
|
|
7
|
+
// backend lifts them onto the run's step and the "Follow-up companion" lights up
|
|
8
|
+
// while the container is still running. This is the OUT-bound half only — there is no
|
|
9
|
+
// in-bound path back into a running container (an answer reaches the Coder via a
|
|
10
|
+
// backend-driven re-run, not by resuming the live process).
|
|
11
|
+
|
|
12
|
+
/** The sentinel file the Coder appends items to, relative to its working directory. */
|
|
13
|
+
export const FOLLOW_UPS_FILENAME = '.cat-follow-ups.jsonl'
|
|
14
|
+
|
|
15
|
+
/**
 * One streamed item the Coder surfaced. Mirrors the backend's `streamedFollowUpSchema`.
 * The per-field notes below describe the values `coerceLine` produces in this module.
 */
export interface FollowUpLine {
  /** Item category; `coerceLine` yields 'question' only for an exact match, else 'follow_up'. */
  kind: 'follow_up' | 'question'
  /** Non-empty headline; `coerceLine` trims it and caps it at 300 characters. */
  title: string
  /** Free-text body; `coerceLine` uses '' when the input has no string `detail`. */
  detail: string
  /** Optional suggested next step; `coerceLine` trims it and omits it when blank. */
  suggestedAction?: string
}
|
|
22
|
+
|
|
23
|
+
/**
 * Turns one parsed JSON value into a {@link FollowUpLine}.
 *
 * @param value The result of parsing a single sentinel-file line.
 * @returns The normalized item, or `null` when `value` is not an object or has no
 *   non-blank string `title`.
 */
function coerceLine(value: unknown): FollowUpLine | null {
  if (value === null || typeof value !== 'object') return null
  const record = value as Record<string, unknown>

  // A usable item needs a non-blank title; everything else has a fallback.
  const rawTitle = record.title
  const title = typeof rawTitle === 'string' ? rawTitle.trim() : ''
  if (title === '') return null

  const line: FollowUpLine = {
    // Anything other than an exact 'question' is treated as a plain follow-up.
    kind: record.kind === 'question' ? 'question' : 'follow_up',
    title: title.slice(0, 300),
    detail: typeof record.detail === 'string' ? record.detail : '',
  }

  // Only attach the suggestion when it carries visible text.
  const rawAction = record.suggestedAction
  const action = typeof rawAction === 'string' ? rawAction.trim() : ''
  if (action !== '') line.suggestedAction = action
  return line
}
|
|
42
|
+
|
|
43
|
+
/**
 * Tails an append-only JSONL sentinel file, yielding only the NEW complete lines on each
 * {@link poll}. Tracks how many characters have been consumed so a partially-written
 * trailing line (no newline yet) is held back until it completes. Tolerant: a malformed
 * line is skipped, a missing file yields nothing, and a throwing item callback is logged
 * rather than propagated — surfacing follow-ups must never disturb the coding run.
 */
export class FollowUpTailer {
  /** Number of characters of the file already processed (always ends on a line boundary). */
  private consumed = 0
  /** Running count of complete-but-unparsable lines, so silent drops become visible. */
  private skipped = 0

  /**
   * @param filePath Path of the sentinel file to tail.
   * @param onItems Receives each non-empty batch of newly surfaced items.
   * @param logger Logger for surfaced/skipped/failed batches; defaults to the module logger.
   */
  constructor(
    private readonly filePath: string,
    private readonly onItems: (items: FollowUpLine[]) => void,
    private readonly logger: Logger = log,
  ) {}

  /**
   * Read any new complete lines and emit the coerced items. Best-effort: an unreadable
   * file, malformed lines and a throwing `onItems` callback are all absorbed here.
   */
  async poll(): Promise<void> {
    let content: string
    try {
      content = await readFile(this.filePath, 'utf8')
    } catch {
      // Not created yet (or vanished): nothing to surface.
      return
    }
    if (content.length <= this.consumed) return
    const fresh = content.slice(this.consumed)
    // Only consume up to the last newline; hold any trailing partial line for next poll.
    const lastNewline = fresh.lastIndexOf('\n')
    if (lastNewline === -1) return
    this.consumed += lastNewline + 1
    const items: FollowUpLine[] = []
    let skippedThisPoll = 0
    for (const raw of fresh.slice(0, lastNewline).split('\n')) {
      const line = raw.trim()
      if (!line) continue
      try {
        const coerced = coerceLine(JSON.parse(line))
        if (coerced) items.push(coerced)
        else skippedThisPoll++
      } catch {
        // A newline-terminated line that is not valid JSON. `consumed` has already moved
        // past it, so it is skipped for good rather than retried on a later poll.
        skippedThisPoll++
      }
    }
    if (skippedThisPoll > 0) {
      // A complete line that didn't yield an item is dropped for good (consumed past it).
      // Surface it at warn with a running total rather than swallowing it silently — a
      // steadily-growing count points at a malformed-emitter bug, not a transient race.
      this.skipped += skippedThisPoll
      this.logger.warn('follow-ups: skipped malformed lines', {
        skipped: skippedThisPoll,
        skippedTotal: this.skipped,
      })
    }
    if (items.length > 0) {
      this.logger.info('follow-ups: surfaced items', { count: items.length })
      try {
        this.onItems(items)
      } catch (err) {
        // The callback is caller-supplied; a throw here must not reject `poll()` (callers
        // fire it from a timer tick). The batch is already consumed, so it is lost — log
        // it loudly enough that the drop is visible.
        this.logger.warn('follow-ups: item callback failed', {
          count: items.length,
          reason: err instanceof Error ? err.message : String(err),
        })
      }
    }
  }
}
|