@cat-factory/executor-harness 1.31.6 → 1.31.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-runner.js +3 -1
- package/dist/agent.js +18 -0
- package/dist/coding-agent.js +19 -2
- package/dist/git.js +37 -0
- package/dist/pi.js +3 -1
- package/dist/process.js +33 -5
- package/dist/runner.js +33 -1
- package/dist/server.js +34 -2
- package/package.json +3 -3
- package/src/agent-runner.ts +3 -1
- package/src/agent.ts +22 -0
- package/src/coding-agent.ts +19 -1
- package/src/git.ts +44 -1
- package/src/pi.ts +3 -1
- package/src/process.ts +34 -5
- package/src/runner.ts +35 -0
- package/src/server.ts +39 -2
package/dist/agent-runner.js
CHANGED
|
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process';
|
|
|
2
2
|
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
|
|
3
3
|
import { tmpdir } from 'node:os';
|
|
4
4
|
import { join } from 'node:path';
|
|
5
|
-
import { killChildProcess } from './process.js';
|
|
5
|
+
import { killChildProcess, spawnDetached } from './process.js';
|
|
6
6
|
import { redact, secretsToRedact } from './redact.js';
|
|
7
7
|
function isObject(value) {
|
|
8
8
|
return typeof value === 'object' && value !== null;
|
|
@@ -28,6 +28,8 @@ function streamCli(command, args, prompt, opts, env, secrets, onEvent) {
|
|
|
28
28
|
cwd: opts.cwd,
|
|
29
29
|
env: { ...process.env, ...env },
|
|
30
30
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
31
|
+
// Own process group (POSIX) so killChildProcess reaps the CLI's grandchildren too.
|
|
32
|
+
detached: spawnDetached,
|
|
31
33
|
});
|
|
32
34
|
child.stdin.on('error', () => { });
|
|
33
35
|
child.stdin.end(prompt);
|
package/dist/agent.js
CHANGED
|
@@ -533,6 +533,24 @@ async function runCodingMode(job, opts) {
|
|
|
533
533
|
...(job.repo.provider ? { provider: job.repo.provider } : {}),
|
|
534
534
|
signal: opts.signal,
|
|
535
535
|
});
|
|
536
|
+
// `null` ⇒ the branch has nothing ahead of base, so there was no PR to open (a resumed
|
|
537
|
+
// branch whose earlier PR already merged). Record it as a clean no-op rather than a push,
|
|
538
|
+
// mirroring the no-changes outcome — the `runCodingAgent` guard normally catches this, so
|
|
539
|
+
// this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
|
|
540
|
+
if (prUrl === null) {
|
|
541
|
+
if (job.noChangesIsError === false) {
|
|
542
|
+
return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
|
|
543
|
+
}
|
|
544
|
+
return {
|
|
545
|
+
pushed: false,
|
|
546
|
+
branch: pushBranch,
|
|
547
|
+
summary,
|
|
548
|
+
stats,
|
|
549
|
+
error: noChangesReason('the work branch has no commits ahead of its base (nothing to open a PR for)', stats, stderrTail),
|
|
550
|
+
failureCause: 'no-changes',
|
|
551
|
+
...(usage ? { usage } : {}),
|
|
552
|
+
};
|
|
553
|
+
}
|
|
536
554
|
return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
|
|
537
555
|
}
|
|
538
556
|
return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
|
package/dist/coding-agent.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { mkdir } from 'node:fs/promises';
|
|
2
2
|
import { join } from 'node:path';
|
|
3
|
-
import { branchHasCommitsSince, cloneExistingBranch, cloneRepo, commitTrackedEdits, createBranch, excludeFromGit, headCommit, listUntrackedFiles, prepareExistingCheckout, pushBranch, refreshFromBaseIfClean, remoteBranchExists, } from './git.js';
|
|
3
|
+
import { branchAheadOfBase, branchHasCommitsSince, cloneExistingBranch, cloneRepo, commitTrackedEdits, createBranch, excludeFromGit, headCommit, listUntrackedFiles, prepareExistingCheckout, pushBranch, refreshFromBaseIfClean, remoteBranchExists, } from './git.js';
|
|
4
4
|
import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js';
|
|
5
5
|
import { acquireRepoCheckout, agentNeverActed, agentOutputTail, runAgentInWorkspace, } from './pi-workspace.js';
|
|
6
6
|
import { log } from './logger.js';
|
|
@@ -238,7 +238,24 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
238
238
|
files: leftover.slice(0, 20),
|
|
239
239
|
});
|
|
240
240
|
}
|
|
241
|
-
|
|
241
|
+
// A fresh run produced work iff the branch advanced past its pre-run tip. A RESUMED
|
|
242
|
+
// run already carries prior work — UNLESS that branch turns out to have nothing ahead
|
|
243
|
+
// of the PR base (e.g. its earlier PR was merged with a merge commit, leaving the
|
|
244
|
+
// branch reachable from base and its best-effort delete skipped). Opening a PR for such
|
|
245
|
+
// a branch fails with GitHub's opaque 422 "No commits between ...", so a CONFIRMED-empty
|
|
246
|
+
// resumed branch is a no-op, not work. `undefined` (couldn't determine) keeps the prior
|
|
247
|
+
// resume-is-work behaviour; the PR-open path then no-ops on the 422 as a backstop.
|
|
248
|
+
const advancedThisPass = await branchHasCommitsSince(dir, baseSha, signal);
|
|
249
|
+
let hasWork = advancedThisPass || resumed;
|
|
250
|
+
if (resumed && !advancedThisPass) {
|
|
251
|
+
const ahead = await branchAheadOfBase(dir, spec.repo.baseBranch, spec.ghToken, signal);
|
|
252
|
+
if (ahead === false) {
|
|
253
|
+
logger.info('coding-agent: resumed branch has no commits ahead of base — no-op', {
|
|
254
|
+
base: spec.repo.baseBranch,
|
|
255
|
+
});
|
|
256
|
+
hasWork = false;
|
|
257
|
+
}
|
|
258
|
+
}
|
|
242
259
|
if (!hasWork) {
|
|
243
260
|
logger.info('coding-agent: no changes produced', { ...stats });
|
|
244
261
|
outcome = {
|
package/dist/git.js
CHANGED
|
@@ -395,6 +395,36 @@ export async function excludeFromGit(dir, pattern, signal) {
|
|
|
395
395
|
export async function branchHasCommitsSince(dir, baseSha, signal) {
|
|
396
396
|
return (await headCommit(dir, signal)) !== baseSha;
|
|
397
397
|
}
|
|
398
|
+
/**
|
|
399
|
+
* Whether the checked-out branch carries at least one commit the PR base does NOT — i.e.
|
|
400
|
+
* `git rev-list --count <base>..HEAD > 0`. A resume clone is single-branch, so it has no
|
|
401
|
+
* `origin/<base>` tracking ref; this fetches the base into a dedicated local ref first and
|
|
402
|
+
* diffs HEAD against it.
|
|
403
|
+
*
|
|
404
|
+
* Tri-state on purpose:
|
|
405
|
+
* - `true` — confirmed ≥1 commit ahead (there is something to open a PR for).
|
|
406
|
+
* - `false` — confirmed 0 commits ahead (the branch is reachable from base, e.g. its earlier
|
|
407
|
+
* PR was merged with a merge commit and the best-effort branch delete was skipped).
|
|
408
|
+
* - `undefined` — could not determine (fetch / rev-list error); the caller keeps its prior
|
|
409
|
+
* behaviour rather than wrongly dropping a resumed branch that has real work.
|
|
410
|
+
*
|
|
411
|
+
* Used by the resume path to avoid declaring a merged/empty branch as work and then failing
|
|
412
|
+
* the run with GitHub's opaque 422 "No commits between <base> and <branch>".
|
|
413
|
+
*/
|
|
414
|
+
export async function branchAheadOfBase(dir, baseBranch, ghToken, signal) {
|
|
415
|
+
try {
|
|
416
|
+
await git(['fetch', 'origin', `+refs/heads/${baseBranch}:refs/cat-factory/base`], {
|
|
417
|
+
cwd: dir,
|
|
418
|
+
signal,
|
|
419
|
+
env: await authEnv(ghToken),
|
|
420
|
+
});
|
|
421
|
+
const count = (await git(['rev-list', '--count', 'refs/cat-factory/base..HEAD'], { cwd: dir, signal })).trim();
|
|
422
|
+
return Number(count) > 0;
|
|
423
|
+
}
|
|
424
|
+
catch {
|
|
425
|
+
return undefined;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
398
428
|
/**
|
|
399
429
|
* Whether the checked-out branch has a real, examinable diff against
|
|
400
430
|
* `origin/<baseBranch>` — i.e. the base branch's remote-tracking ref exists (so the
|
|
@@ -749,6 +779,13 @@ export async function openPullRequest(opts) {
|
|
|
749
779
|
if (existing)
|
|
750
780
|
return existing;
|
|
751
781
|
}
|
|
782
|
+
// The head branch has nothing ahead of base ("No commits between <base> and <head>").
|
|
783
|
+
// That is not an API failure — there is simply nothing to open a PR for (e.g. a resumed
|
|
784
|
+
// branch whose earlier PR was merged with a merge commit, leaving the branch reachable
|
|
785
|
+
// from base). Signal it with null so the caller records a clean no-op instead of failing
|
|
786
|
+
// the run with GitHub's opaque 422.
|
|
787
|
+
if (res.status === 422 && /no commits between/i.test(detail))
|
|
788
|
+
return null;
|
|
752
789
|
throw new HarnessFailure('api', redactSecrets(`Failed to open PR (HTTP ${res.status}): ${detail.slice(0, 300)}`));
|
|
753
790
|
}
|
|
754
791
|
const body = (await res.json());
|
package/dist/pi.js
CHANGED
|
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process';
|
|
|
2
2
|
import { appendFile, mkdir, writeFile } from 'node:fs/promises';
|
|
3
3
|
import { homedir } from 'node:os';
|
|
4
4
|
import { dirname, join } from 'node:path';
|
|
5
|
-
import { killChildProcess } from './process.js';
|
|
5
|
+
import { killChildProcess, spawnDetached } from './process.js';
|
|
6
6
|
import { pathExists } from './fs-utils.js';
|
|
7
7
|
import { redactSecrets } from './redact.js';
|
|
8
8
|
import { log } from './logger.js';
|
|
@@ -573,6 +573,8 @@ export function runPi(opts) {
|
|
|
573
573
|
// stdin is piped (not 'ignore') so the prompt is delivered out-of-band
|
|
574
574
|
// rather than on argv — see the function doc for the injection rationale.
|
|
575
575
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
576
|
+
// Own process group (POSIX) so killChildProcess reaps Pi's grandchildren too.
|
|
577
|
+
detached: spawnDetached,
|
|
576
578
|
});
|
|
577
579
|
// Hand Pi the prompt over stdin, then close it so print mode sees EOF and
|
|
578
580
|
// runs. Ignore stdin errors (e.g. EPIPE if Pi exits before reading): the
|
package/dist/process.js
CHANGED
|
@@ -6,20 +6,48 @@ import { log } from './logger.js';
|
|
|
6
6
|
// How long to wait after SIGTERM before escalating to SIGKILL.
|
|
7
7
|
const KILL_GRACE_MS = 5_000;
|
|
8
8
|
/**
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
9
|
+
* Signal a child and, when it was spawned detached (a process-group leader on POSIX — see
|
|
10
|
+
* `spawnDetached`), the whole group with it. The agent CLIs (`claude`/`codex`/Pi) spawn their
|
|
11
|
+
* own grandchildren (a shell tool, a build, their own git); a plain `child.kill()` reaps only
|
|
12
|
+
* the direct child and those grandchildren reparent to init and keep running unsupervised.
|
|
13
|
+
* `process.kill(-pid)` targets the group instead. Falls back to a direct kill on Windows (no
|
|
14
|
+
* POSIX process groups) or when the group send fails (already reaped, or the child wasn't
|
|
15
|
+
* spawned detached so no group of its own exists).
|
|
16
|
+
*/
|
|
17
|
+
function signalTree(child, signal) {
|
|
18
|
+
if (child.pid !== undefined && process.platform !== 'win32') {
|
|
19
|
+
try {
|
|
20
|
+
process.kill(-child.pid, signal);
|
|
21
|
+
return;
|
|
22
|
+
}
|
|
23
|
+
catch {
|
|
24
|
+
// Fall through to the direct kill below.
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
child.kill(signal);
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Terminate a child process (and its group — see {@link signalTree}): SIGTERM first, then
|
|
31
|
+
* SIGKILL after a grace period if it hasn't exited (ignored an ordinary terminate). The
|
|
32
|
+
* escalation timer is `unref()`d so it never by itself keeps the event loop alive. Safe to
|
|
33
|
+
* call more than once.
|
|
12
34
|
*
|
|
13
35
|
* An actual escalation to SIGKILL is logged at warn level: a process that ignores
|
|
14
36
|
* SIGTERM and has to be force-killed is a signal worth seeing (a wedged Pi/CLI), and
|
|
15
37
|
* was previously invisible. Pass a child logger to carry the run's `jobId`.
|
|
16
38
|
*/
|
|
17
39
|
export function killChildProcess(child, graceMs = KILL_GRACE_MS, logger = log) {
|
|
18
|
-
child.kill('SIGTERM');
|
|
40
|
+
signalTree(child, 'SIGTERM');
|
|
19
41
|
setTimeout(() => {
|
|
20
42
|
if (child.exitCode === null && child.signalCode === null) {
|
|
21
43
|
logger.warn('killChildProcess: process ignored SIGTERM, escalating to SIGKILL', { graceMs });
|
|
22
|
-
child.kill('SIGKILL');
|
|
44
|
+
signalTree(child, 'SIGKILL');
|
|
23
45
|
}
|
|
24
46
|
}, graceMs).unref();
|
|
25
47
|
}
|
|
48
|
+
/**
|
|
49
|
+
* Whether a spawned agent CLI should be its own process-group leader so {@link killChildProcess}
|
|
50
|
+
* can reap the whole tree (its grandchildren) on abort. POSIX only; Windows has no process
|
|
51
|
+
* groups (and `detached` there spawns a new console we don't want), so it stays false.
|
|
52
|
+
*/
|
|
53
|
+
export const spawnDetached = process.platform !== 'win32';
|
package/dist/runner.js
CHANGED
|
@@ -20,7 +20,7 @@ export function loadRunnerLimits(env = process.env) {
|
|
|
20
20
|
};
|
|
21
21
|
}
|
|
22
22
|
function toView(entry) {
|
|
23
|
-
const { promise: _promise, spanBuffer: _spanBuffer, followUpBuffer: _followUpBuffer, ...view } = entry;
|
|
23
|
+
const { promise: _promise, spanBuffer: _spanBuffer, followUpBuffer: _followUpBuffer, abort: _abort, ...view } = entry;
|
|
24
24
|
return { ...view };
|
|
25
25
|
}
|
|
26
26
|
/**
|
|
@@ -90,6 +90,35 @@ export class JobRegistry {
|
|
|
90
90
|
}
|
|
91
91
|
return view;
|
|
92
92
|
}
|
|
93
|
+
/**
|
|
94
|
+
* Abort every RUNNING job (fires each run's abort signal, which SIGTERM→SIGKILLs its
|
|
95
|
+
* CLI/git children via `killChildProcess`). The graceful-shutdown hook: a harness dying
|
|
96
|
+
* to SIGTERM must not orphan a live agent subprocess — reparented, it would keep working
|
|
97
|
+
* unsupervised (and, in native local mode, on the developer's own login). Returns the
|
|
98
|
+
* number of jobs aborted.
|
|
99
|
+
*/
|
|
100
|
+
abortAll(reason) {
|
|
101
|
+
let aborted = 0;
|
|
102
|
+
for (const entry of this.jobs.values()) {
|
|
103
|
+
if (entry.state === 'running' && entry.abort) {
|
|
104
|
+
entry.abort(reason);
|
|
105
|
+
aborted += 1;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
return aborted;
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* How many jobs are still RUNNING. Graceful shutdown polls this so it can exit the moment the
|
|
112
|
+
* aborted jobs have actually settled (the common case: the CLI honours SIGTERM in ms) instead
|
|
113
|
+
* of waiting out a fixed kill-grace window.
|
|
114
|
+
*/
|
|
115
|
+
runningCount() {
|
|
116
|
+
let running = 0;
|
|
117
|
+
for (const entry of this.jobs.values())
|
|
118
|
+
if (entry.state === 'running')
|
|
119
|
+
running += 1;
|
|
120
|
+
return running;
|
|
121
|
+
}
|
|
93
122
|
async drive(entry, job) {
|
|
94
123
|
const controller = new AbortController();
|
|
95
124
|
let killReason;
|
|
@@ -130,6 +159,8 @@ export class JobRegistry {
|
|
|
130
159
|
resetInactivity();
|
|
131
160
|
};
|
|
132
161
|
resetInactivity();
|
|
162
|
+
// Expose the abort for shutdown (see abortAll); cleared in `finally` once the job settles.
|
|
163
|
+
entry.abort = (reason) => controller.abort(new Error(reason));
|
|
133
164
|
jobLog.info('job started', {});
|
|
134
165
|
try {
|
|
135
166
|
const result = await this.run(job, {
|
|
@@ -182,6 +213,7 @@ export class JobRegistry {
|
|
|
182
213
|
finally {
|
|
183
214
|
clearTimeout(inactivity);
|
|
184
215
|
clearTimeout(cap);
|
|
216
|
+
entry.abort = undefined;
|
|
185
217
|
entry.heartbeatAt = Date.now();
|
|
186
218
|
}
|
|
187
219
|
}
|
package/dist/server.js
CHANGED
|
@@ -13,6 +13,11 @@ import { log } from './logger.js';
|
|
|
13
13
|
// in the request body and live only for the duration of the job in an ephemeral
|
|
14
14
|
// workspace.
|
|
15
15
|
const PORT = Number(process.env.PORT ?? 8080);
|
|
16
|
+
// Optional bind address. Default (unset) binds all interfaces — a container needs that for
|
|
17
|
+
// its published port. The native local transport runs the harness UNSANDBOXED on the
|
|
18
|
+
// developer's host and only ever connects over loopback, so it sets 127.0.0.1 to keep the
|
|
19
|
+
// agent-spawning API off the LAN.
|
|
20
|
+
const BIND_HOST = process.env.HARNESS_BIND_HOST?.trim() || undefined;
|
|
16
21
|
// Optional inbound auth. When HARNESS_SHARED_SECRET is set, every non-health
|
|
17
22
|
// request must present a matching `x-harness-secret` header (constant-time
|
|
18
23
|
// compared). When it is unset the harness behaves as before (open), so local/dev
|
|
@@ -128,8 +133,35 @@ const server = createServer((req, res) => {
|
|
|
128
133
|
});
|
|
129
134
|
// Only auto-listen when run as the entry point (tests import handleRun directly).
|
|
130
135
|
if (process.env.NODE_ENV !== 'test') {
|
|
131
|
-
server.listen(PORT, () => {
|
|
132
|
-
console.log(`executor-harness listening on :${PORT}`);
|
|
136
|
+
server.listen(PORT, BIND_HOST, () => {
|
|
137
|
+
console.log(`executor-harness listening on ${BIND_HOST ?? ''}:${PORT}`);
|
|
133
138
|
});
|
|
139
|
+
// Graceful shutdown: dying to a bare SIGTERM/SIGINT (the default handler) would ORPHAN any
|
|
140
|
+
// in-flight `claude`/`codex`/git child — reparented, it keeps working unsupervised (and in
|
|
141
|
+
// native local mode on the developer's own login). Abort every running job first (the
|
|
142
|
+
// SIGTERM→SIGKILL escalation in killChildProcess), then exit as SOON as the aborted jobs have
|
|
143
|
+
// settled — the CLI usually honours SIGTERM in milliseconds, so don't block every shutdown on
|
|
144
|
+
// a fixed window. The 6s cap covers a job that ignored SIGTERM and had to be force-killed (the
|
|
145
|
+
// 5s escalation) plus a margin. Nothing running ⇒ exit immediately. A second signal takes the
|
|
146
|
+
// default (immediate) exit, since `once` leaves it unhandled.
|
|
147
|
+
const shutdown = (signal) => {
|
|
148
|
+
const aborted = Object.values(KINDS).reduce((count, { registry }) => count + registry.abortAll(`harness shutting down (${signal})`), 0);
|
|
149
|
+
log.info('shutting down', { signal, abortedJobs: aborted });
|
|
150
|
+
server.close();
|
|
151
|
+
if (aborted === 0) {
|
|
152
|
+
process.exit(0);
|
|
153
|
+
return;
|
|
154
|
+
}
|
|
155
|
+
const deadline = Date.now() + 6_000;
|
|
156
|
+
const timer = setInterval(() => {
|
|
157
|
+
const stillRunning = Object.values(KINDS).some(({ registry }) => registry.runningCount() > 0);
|
|
158
|
+
if (!stillRunning || Date.now() >= deadline) {
|
|
159
|
+
clearInterval(timer);
|
|
160
|
+
process.exit(0);
|
|
161
|
+
}
|
|
162
|
+
}, 50);
|
|
163
|
+
};
|
|
164
|
+
process.once('SIGTERM', () => shutdown('SIGTERM'));
|
|
165
|
+
process.once('SIGINT', () => shutdown('SIGINT'));
|
|
134
166
|
}
|
|
135
167
|
export { server };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/executor-harness",
|
|
3
|
-
"version": "1.31.6",
|
|
3
|
+
"version": "1.31.10",
|
|
4
4
|
"description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,8 +26,8 @@
|
|
|
26
26
|
"hono": "^4.12.27",
|
|
27
27
|
"typescript": "^6.0.3",
|
|
28
28
|
"vitest": "^4.1.9",
|
|
29
|
-
"@cat-factory/
|
|
30
|
-
"@cat-factory/
|
|
29
|
+
"@cat-factory/server": "0.66.6",
|
|
30
|
+
"@cat-factory/spend": "0.10.71"
|
|
31
31
|
},
|
|
32
32
|
"scripts": {
|
|
33
33
|
"build": "tsc -p tsconfig.json",
|
package/src/agent-runner.ts
CHANGED
|
@@ -3,7 +3,7 @@ import { mkdtemp, rm, writeFile } from 'node:fs/promises'
|
|
|
3
3
|
import { tmpdir } from 'node:os'
|
|
4
4
|
import { join } from 'node:path'
|
|
5
5
|
import type { PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
|
|
6
|
-
import { killChildProcess } from './process.js'
|
|
6
|
+
import { killChildProcess, spawnDetached } from './process.js'
|
|
7
7
|
import { redact, secretsToRedact } from './redact.js'
|
|
8
8
|
|
|
9
9
|
// The alternate (subscription) harness runners. The Pi harness reaches models
|
|
@@ -93,6 +93,8 @@ function streamCli(
|
|
|
93
93
|
cwd: opts.cwd,
|
|
94
94
|
env: { ...process.env, ...env },
|
|
95
95
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
96
|
+
// Own process group (POSIX) so killChildProcess reaps the CLI's grandchildren too.
|
|
97
|
+
detached: spawnDetached,
|
|
96
98
|
})
|
|
97
99
|
child.stdin.on('error', () => {})
|
|
98
100
|
child.stdin.end(prompt)
|
package/src/agent.ts
CHANGED
|
@@ -626,6 +626,28 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
626
626
|
...(job.repo.provider ? { provider: job.repo.provider } : {}),
|
|
627
627
|
signal: opts.signal,
|
|
628
628
|
})
|
|
629
|
+
// `null` ⇒ the branch has nothing ahead of base, so there was no PR to open (a resumed
|
|
630
|
+
// branch whose earlier PR already merged). Record it as a clean no-op rather than a push,
|
|
631
|
+
// mirroring the no-changes outcome — the `runCodingAgent` guard normally catches this, so
|
|
632
|
+
// this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
|
|
633
|
+
if (prUrl === null) {
|
|
634
|
+
if (job.noChangesIsError === false) {
|
|
635
|
+
return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
|
|
636
|
+
}
|
|
637
|
+
return {
|
|
638
|
+
pushed: false,
|
|
639
|
+
branch: pushBranch,
|
|
640
|
+
summary,
|
|
641
|
+
stats,
|
|
642
|
+
error: noChangesReason(
|
|
643
|
+
'the work branch has no commits ahead of its base (nothing to open a PR for)',
|
|
644
|
+
stats,
|
|
645
|
+
stderrTail,
|
|
646
|
+
),
|
|
647
|
+
failureCause: 'no-changes',
|
|
648
|
+
...(usage ? { usage } : {}),
|
|
649
|
+
}
|
|
650
|
+
}
|
|
629
651
|
return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
|
|
630
652
|
}
|
|
631
653
|
return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
|
package/src/coding-agent.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { mkdir } from 'node:fs/promises'
|
|
|
2
2
|
import { join } from 'node:path'
|
|
3
3
|
import type { HarnessAuthFields, RepoSpec } from './job.js'
|
|
4
4
|
import {
|
|
5
|
+
branchAheadOfBase,
|
|
5
6
|
branchHasCommitsSince,
|
|
6
7
|
cloneExistingBranch,
|
|
7
8
|
cloneRepo,
|
|
@@ -343,7 +344,24 @@ export async function runCodingAgent(
|
|
|
343
344
|
})
|
|
344
345
|
}
|
|
345
346
|
|
|
346
|
-
|
|
347
|
+
// A fresh run produced work iff the branch advanced past its pre-run tip. A RESUMED
|
|
348
|
+
// run already carries prior work — UNLESS that branch turns out to have nothing ahead
|
|
349
|
+
// of the PR base (e.g. its earlier PR was merged with a merge commit, leaving the
|
|
350
|
+
// branch reachable from base and its best-effort delete skipped). Opening a PR for such
|
|
351
|
+
// a branch fails with GitHub's opaque 422 "No commits between ...", so a CONFIRMED-empty
|
|
352
|
+
// resumed branch is a no-op, not work. `undefined` (couldn't determine) keeps the prior
|
|
353
|
+
// resume-is-work behaviour; the PR-open path then no-ops on the 422 as a backstop.
|
|
354
|
+
const advancedThisPass = await branchHasCommitsSince(dir, baseSha, signal)
|
|
355
|
+
let hasWork = advancedThisPass || resumed
|
|
356
|
+
if (resumed && !advancedThisPass) {
|
|
357
|
+
const ahead = await branchAheadOfBase(dir, spec.repo.baseBranch, spec.ghToken, signal)
|
|
358
|
+
if (ahead === false) {
|
|
359
|
+
logger.info('coding-agent: resumed branch has no commits ahead of base — no-op', {
|
|
360
|
+
base: spec.repo.baseBranch,
|
|
361
|
+
})
|
|
362
|
+
hasWork = false
|
|
363
|
+
}
|
|
364
|
+
}
|
|
347
365
|
if (!hasWork) {
|
|
348
366
|
logger.info('coding-agent: no changes produced', { ...stats })
|
|
349
367
|
outcome = {
|
package/src/git.ts
CHANGED
|
@@ -495,6 +495,43 @@ export async function branchHasCommitsSince(
|
|
|
495
495
|
return (await headCommit(dir, signal)) !== baseSha
|
|
496
496
|
}
|
|
497
497
|
|
|
498
|
+
/**
|
|
499
|
+
* Whether the checked-out branch carries at least one commit the PR base does NOT — i.e.
|
|
500
|
+
* `git rev-list --count <base>..HEAD > 0`. A resume clone is single-branch, so it has no
|
|
501
|
+
* `origin/<base>` tracking ref; this fetches the base into a dedicated local ref first and
|
|
502
|
+
* diffs HEAD against it.
|
|
503
|
+
*
|
|
504
|
+
* Tri-state on purpose:
|
|
505
|
+
* - `true` — confirmed ≥1 commit ahead (there is something to open a PR for).
|
|
506
|
+
* - `false` — confirmed 0 commits ahead (the branch is reachable from base, e.g. its earlier
|
|
507
|
+
* PR was merged with a merge commit and the best-effort branch delete was skipped).
|
|
508
|
+
* - `undefined` — could not determine (fetch / rev-list error); the caller keeps its prior
|
|
509
|
+
* behaviour rather than wrongly dropping a resumed branch that has real work.
|
|
510
|
+
*
|
|
511
|
+
* Used by the resume path to avoid declaring a merged/empty branch as work and then failing
|
|
512
|
+
* the run with GitHub's opaque 422 "No commits between <base> and <branch>".
|
|
513
|
+
*/
|
|
514
|
+
export async function branchAheadOfBase(
|
|
515
|
+
dir: string,
|
|
516
|
+
baseBranch: string,
|
|
517
|
+
ghToken: string,
|
|
518
|
+
signal?: AbortSignal,
|
|
519
|
+
): Promise<boolean | undefined> {
|
|
520
|
+
try {
|
|
521
|
+
await git(['fetch', 'origin', `+refs/heads/${baseBranch}:refs/cat-factory/base`], {
|
|
522
|
+
cwd: dir,
|
|
523
|
+
signal,
|
|
524
|
+
env: await authEnv(ghToken),
|
|
525
|
+
})
|
|
526
|
+
const count = (
|
|
527
|
+
await git(['rev-list', '--count', 'refs/cat-factory/base..HEAD'], { cwd: dir, signal })
|
|
528
|
+
).trim()
|
|
529
|
+
return Number(count) > 0
|
|
530
|
+
} catch {
|
|
531
|
+
return undefined
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
498
535
|
/**
|
|
499
536
|
* Whether the checked-out branch has a real, examinable diff against
|
|
500
537
|
* `origin/<baseBranch>` — i.e. the base branch's remote-tracking ref exists (so the
|
|
@@ -876,7 +913,7 @@ function backoffMs(attempt: number): number {
|
|
|
876
913
|
* GitLab whose host isn't named `gitlab.*` still opens an MR instead of being misrouted to
|
|
877
914
|
* GitHub's API. The GitHub path is unchanged.
|
|
878
915
|
*/
|
|
879
|
-
export async function openPullRequest(opts: OpenPullRequestOptions): Promise<string> {
|
|
916
|
+
export async function openPullRequest(opts: OpenPullRequestOptions): Promise<string | null> {
|
|
880
917
|
const provider = opts.provider ?? (opts.cloneUrl ? inferVcsProvider(opts.cloneUrl) : 'github')
|
|
881
918
|
if (provider === 'gitlab') {
|
|
882
919
|
if (!opts.cloneUrl) {
|
|
@@ -917,6 +954,12 @@ export async function openPullRequest(opts: OpenPullRequestOptions): Promise<str
|
|
|
917
954
|
const existing = await findOpenPullRequestUrl(opts)
|
|
918
955
|
if (existing) return existing
|
|
919
956
|
}
|
|
957
|
+
// The head branch has nothing ahead of base ("No commits between <base> and <head>").
|
|
958
|
+
// That is not an API failure — there is simply nothing to open a PR for (e.g. a resumed
|
|
959
|
+
// branch whose earlier PR was merged with a merge commit, leaving the branch reachable
|
|
960
|
+
// from base). Signal it with null so the caller records a clean no-op instead of failing
|
|
961
|
+
// the run with GitHub's opaque 422.
|
|
962
|
+
if (res.status === 422 && /no commits between/i.test(detail)) return null
|
|
920
963
|
throw new HarnessFailure(
|
|
921
964
|
'api',
|
|
922
965
|
redactSecrets(`Failed to open PR (HTTP ${res.status}): ${detail.slice(0, 300)}`),
|
package/src/pi.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process'
|
|
|
2
2
|
import { appendFile, mkdir, writeFile } from 'node:fs/promises'
|
|
3
3
|
import { homedir } from 'node:os'
|
|
4
4
|
import { dirname, join } from 'node:path'
|
|
5
|
-
import { killChildProcess } from './process.js'
|
|
5
|
+
import { killChildProcess, spawnDetached } from './process.js'
|
|
6
6
|
import { pathExists } from './fs-utils.js'
|
|
7
7
|
import { redactSecrets } from './redact.js'
|
|
8
8
|
import { log } from './logger.js'
|
|
@@ -831,6 +831,8 @@ export function runPi(opts: {
|
|
|
831
831
|
// stdin is piped (not 'ignore') so the prompt is delivered out-of-band
|
|
832
832
|
// rather than on argv — see the function doc for the injection rationale.
|
|
833
833
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
834
|
+
// Own process group (POSIX) so killChildProcess reaps Pi's grandchildren too.
|
|
835
|
+
detached: spawnDetached,
|
|
834
836
|
},
|
|
835
837
|
)
|
|
836
838
|
// Hand Pi the prompt over stdin, then close it so print mode sees EOF and
|
package/src/process.ts
CHANGED
|
@@ -10,9 +10,31 @@ import { log, type Logger } from './logger.js'
|
|
|
10
10
|
const KILL_GRACE_MS = 5_000
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
13
|
+
* Signal a child and, when it was spawned detached (a process-group leader on POSIX — see
|
|
14
|
+
* `spawnDetached`), the whole group with it. The agent CLIs (`claude`/`codex`/Pi) spawn their
|
|
15
|
+
* own grandchildren (a shell tool, a build, their own git); a plain `child.kill()` reaps only
|
|
16
|
+
* the direct child and those grandchildren reparent to init and keep running unsupervised.
|
|
17
|
+
* `process.kill(-pid)` targets the group instead. Falls back to a direct kill on Windows (no
|
|
18
|
+
* POSIX process groups) or when the group send fails (already reaped, or the child wasn't
|
|
19
|
+
* spawned detached so no group of its own exists).
|
|
20
|
+
*/
|
|
21
|
+
function signalTree(child: ChildProcess, signal: NodeJS.Signals): void {
|
|
22
|
+
if (child.pid !== undefined && process.platform !== 'win32') {
|
|
23
|
+
try {
|
|
24
|
+
process.kill(-child.pid, signal)
|
|
25
|
+
return
|
|
26
|
+
} catch {
|
|
27
|
+
// Fall through to the direct kill below.
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
child.kill(signal)
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Terminate a child process (and its group — see {@link signalTree}): SIGTERM first, then
|
|
35
|
+
* SIGKILL after a grace period if it hasn't exited (ignored an ordinary terminate). The
|
|
36
|
+
* escalation timer is `unref()`d so it never by itself keeps the event loop alive. Safe to
|
|
37
|
+
* call more than once.
|
|
16
38
|
*
|
|
17
39
|
* An actual escalation to SIGKILL is logged at warn level: a process that ignores
|
|
18
40
|
* SIGTERM and has to be force-killed is a signal worth seeing (a wedged Pi/CLI), and
|
|
@@ -23,11 +45,18 @@ export function killChildProcess(
|
|
|
23
45
|
graceMs: number = KILL_GRACE_MS,
|
|
24
46
|
logger: Logger = log,
|
|
25
47
|
): void {
|
|
26
|
-
child.kill('SIGTERM')
|
|
48
|
+
signalTree(child, 'SIGTERM')
|
|
27
49
|
setTimeout(() => {
|
|
28
50
|
if (child.exitCode === null && child.signalCode === null) {
|
|
29
51
|
logger.warn('killChildProcess: process ignored SIGTERM, escalating to SIGKILL', { graceMs })
|
|
30
|
-
child.kill('SIGKILL')
|
|
52
|
+
signalTree(child, 'SIGKILL')
|
|
31
53
|
}
|
|
32
54
|
}, graceMs).unref()
|
|
33
55
|
}
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Whether a spawned agent CLI should be its own process-group leader so {@link killChildProcess}
|
|
59
|
+
* can reap the whole tree (its grandchildren) on abort. POSIX only; Windows has no process
|
|
60
|
+
* groups (and `detached` there spawns a new console we don't want), so it stays false.
|
|
61
|
+
*/
|
|
62
|
+
export const spawnDetached = process.platform !== 'win32'
|
package/src/runner.ts
CHANGED
|
@@ -123,6 +123,8 @@ interface JobEntry<TResult extends JobResultBase> extends JobView<TResult> {
|
|
|
123
123
|
spanBuffer: ToolSpan[]
|
|
124
124
|
/** Follow-up items buffered since the last drain (see {@link JobView.followUps}). */
|
|
125
125
|
followUpBuffer: FollowUpLine[]
|
|
126
|
+
/** Abort the in-flight run (see {@link JobRegistry.abortAll}); set while running only. */
|
|
127
|
+
abort?: (reason: string) => void
|
|
126
128
|
}
|
|
127
129
|
|
|
128
130
|
/** Watchdog windows that bound every job. Tunable via the container's env. */
|
|
@@ -158,6 +160,7 @@ function toView<TResult extends JobResultBase>(entry: JobEntry<TResult>): JobVie
|
|
|
158
160
|
promise: _promise,
|
|
159
161
|
spanBuffer: _spanBuffer,
|
|
160
162
|
followUpBuffer: _followUpBuffer,
|
|
163
|
+
abort: _abort,
|
|
161
164
|
...view
|
|
162
165
|
} = entry
|
|
163
166
|
return { ...view }
|
|
@@ -228,6 +231,35 @@ export class JobRegistry<TJob = unknown, TResult extends JobResultBase = JobResu
|
|
|
228
231
|
return view
|
|
229
232
|
}
|
|
230
233
|
|
|
234
|
+
/**
|
|
235
|
+
* Abort every RUNNING job (fires each run's abort signal, which SIGTERM→SIGKILLs its
|
|
236
|
+
* CLI/git children via `killChildProcess`). The graceful-shutdown hook: a harness dying
|
|
237
|
+
* to SIGTERM must not orphan a live agent subprocess — reparented, it would keep working
|
|
238
|
+
* unsupervised (and, in native local mode, on the developer's own login). Returns the
|
|
239
|
+
* number of jobs aborted.
|
|
240
|
+
*/
|
|
241
|
+
abortAll(reason: string): number {
|
|
242
|
+
let aborted = 0
|
|
243
|
+
for (const entry of this.jobs.values()) {
|
|
244
|
+
if (entry.state === 'running' && entry.abort) {
|
|
245
|
+
entry.abort(reason)
|
|
246
|
+
aborted += 1
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
return aborted
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
/**
|
|
253
|
+
* How many jobs are still RUNNING. Graceful shutdown polls this so it can exit the moment the
|
|
254
|
+
* aborted jobs have actually settled (the common case: the CLI honours SIGTERM in ms) instead
|
|
255
|
+
* of waiting out a fixed kill-grace window.
|
|
256
|
+
*/
|
|
257
|
+
runningCount(): number {
|
|
258
|
+
let running = 0
|
|
259
|
+
for (const entry of this.jobs.values()) if (entry.state === 'running') running += 1
|
|
260
|
+
return running
|
|
261
|
+
}
|
|
262
|
+
|
|
231
263
|
private async drive(entry: JobEntry<TResult>, job: TJob): Promise<void> {
|
|
232
264
|
const controller = new AbortController()
|
|
233
265
|
let killReason: 'inactivity' | 'max-duration' | undefined
|
|
@@ -271,6 +303,8 @@ export class JobRegistry<TJob = unknown, TResult extends JobResultBase = JobResu
|
|
|
271
303
|
resetInactivity()
|
|
272
304
|
}
|
|
273
305
|
resetInactivity()
|
|
306
|
+
// Expose the abort for shutdown (see abortAll); cleared in `finally` once the job settles.
|
|
307
|
+
entry.abort = (reason) => controller.abort(new Error(reason))
|
|
274
308
|
|
|
275
309
|
jobLog.info('job started', {})
|
|
276
310
|
try {
|
|
@@ -327,6 +361,7 @@ export class JobRegistry<TJob = unknown, TResult extends JobResultBase = JobResu
|
|
|
327
361
|
} finally {
|
|
328
362
|
clearTimeout(inactivity)
|
|
329
363
|
clearTimeout(cap)
|
|
364
|
+
entry.abort = undefined
|
|
330
365
|
entry.heartbeatAt = Date.now()
|
|
331
366
|
}
|
|
332
367
|
}
|
package/src/server.ts
CHANGED
|
@@ -16,6 +16,12 @@ import { log } from './logger.js'
|
|
|
16
16
|
|
|
17
17
|
const PORT = Number(process.env.PORT ?? 8080)
|
|
18
18
|
|
|
19
|
+
// Optional bind address. Default (unset) binds all interfaces — a container needs that for
|
|
20
|
+
// its published port. The native local transport runs the harness UNSANDBOXED on the
|
|
21
|
+
// developer's host and only ever connects over loopback, so it sets 127.0.0.1 to keep the
|
|
22
|
+
// agent-spawning API off the LAN.
|
|
23
|
+
const BIND_HOST = process.env.HARNESS_BIND_HOST?.trim() || undefined
|
|
24
|
+
|
|
19
25
|
// Optional inbound auth. When HARNESS_SHARED_SECRET is set, every non-health
|
|
20
26
|
// request must present a matching `x-harness-secret` header (constant-time
|
|
21
27
|
// compared). When it is unset the harness behaves as before (open), so local/dev
|
|
@@ -145,9 +151,40 @@ const server = createServer((req, res) => {
|
|
|
145
151
|
|
|
146
152
|
// Only auto-listen when run as the entry point (tests import handleRun directly).
|
|
147
153
|
if (process.env.NODE_ENV !== 'test') {
|
|
148
|
-
server.listen(PORT, () => {
|
|
149
|
-
console.log(`executor-harness listening on :${PORT}`)
|
|
154
|
+
server.listen(PORT, BIND_HOST, () => {
|
|
155
|
+
console.log(`executor-harness listening on ${BIND_HOST ?? ''}:${PORT}`)
|
|
150
156
|
})
|
|
157
|
+
|
|
158
|
+
// Graceful shutdown: dying to a bare SIGTERM/SIGINT (the default handler) would ORPHAN any
|
|
159
|
+
// in-flight `claude`/`codex`/git child — reparented, it keeps working unsupervised (and in
|
|
160
|
+
// native local mode on the developer's own login). Abort every running job first (the
|
|
161
|
+
// SIGTERM→SIGKILL escalation in killChildProcess), then exit as SOON as the aborted jobs have
|
|
162
|
+
// settled — the CLI usually honours SIGTERM in milliseconds, so don't block every shutdown on
|
|
163
|
+
// a fixed window. The 6s cap covers a job that ignored SIGTERM and had to be force-killed (the
|
|
164
|
+
// 5s escalation) plus a margin. Nothing running ⇒ exit immediately. A second signal takes the
|
|
165
|
+
// default (immediate) exit, since `once` leaves it unhandled (per signal type: the other of SIGTERM/SIGINT still runs `shutdown` once).
|
|
166
|
+
const shutdown = (signal: string): void => {
|
|
167
|
+
const aborted = Object.values(KINDS).reduce(
|
|
168
|
+
(count, { registry }) => count + registry.abortAll(`harness shutting down (${signal})`),
|
|
169
|
+
0,
|
|
170
|
+
)
|
|
171
|
+
log.info('shutting down', { signal, abortedJobs: aborted })
|
|
172
|
+
server.close()
|
|
173
|
+
if (aborted === 0) {
|
|
174
|
+
process.exit(0)
|
|
175
|
+
return
|
|
176
|
+
}
|
|
177
|
+
const deadline = Date.now() + 6_000
|
|
178
|
+
const timer = setInterval(() => {
|
|
179
|
+
const stillRunning = Object.values(KINDS).some(({ registry }) => registry.runningCount() > 0)
|
|
180
|
+
if (!stillRunning || Date.now() >= deadline) {
|
|
181
|
+
clearInterval(timer)
|
|
182
|
+
process.exit(0)
|
|
183
|
+
}
|
|
184
|
+
}, 50)
|
|
185
|
+
}
|
|
186
|
+
process.once('SIGTERM', () => shutdown('SIGTERM'))
|
|
187
|
+
process.once('SIGINT', () => shutdown('SIGINT'))
|
|
151
188
|
}
|
|
152
189
|
|
|
153
190
|
export { server }
|