@cat-factory/executor-harness 1.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +143 -0
- package/dist/agent-runner.js +389 -0
- package/dist/agent.js +810 -0
- package/dist/blueprint.js +367 -0
- package/dist/bootstrap.js +99 -0
- package/dist/ci-fixer.js +46 -0
- package/dist/coding-agent.js +285 -0
- package/dist/conflict-resolver.js +138 -0
- package/dist/embed.js +8 -0
- package/dist/explore.js +74 -0
- package/dist/failure.js +47 -0
- package/dist/fixer.js +44 -0
- package/dist/follow-ups.js +103 -0
- package/dist/frontend-infra.js +283 -0
- package/dist/fs-utils.js +11 -0
- package/dist/git.js +778 -0
- package/dist/job.js +409 -0
- package/dist/logger.js +27 -0
- package/dist/merger.js +135 -0
- package/dist/on-call.js +126 -0
- package/dist/pi-workspace.js +237 -0
- package/dist/pi.js +971 -0
- package/dist/process.js +25 -0
- package/dist/redact.js +109 -0
- package/dist/runner.js +228 -0
- package/dist/server.js +135 -0
- package/dist/spec.js +754 -0
- package/dist/structured-output.js +431 -0
- package/dist/tester.js +191 -0
- package/package.json +35 -0
- package/src/agent-runner.ts +484 -0
- package/src/agent.ts +948 -0
- package/src/coding-agent.ts +393 -0
- package/src/embed.ts +32 -0
- package/src/failure.ts +73 -0
- package/src/follow-ups.ts +106 -0
- package/src/frontend-infra.ts +340 -0
- package/src/fs-utils.ts +11 -0
- package/src/git.ts +955 -0
- package/src/job.ts +766 -0
- package/src/logger.ts +45 -0
- package/src/pi-workspace.ts +348 -0
- package/src/pi.ts +1236 -0
- package/src/process.ts +33 -0
- package/src/redact.ts +109 -0
- package/src/runner.ts +384 -0
- package/src/server.ts +153 -0
- package/src/structured-output.ts +524 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Igor Savin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# @cat-factory/executor-harness
|
|
2
|
+
|
|
3
|
+
The payload that runs **inside** a per-run Cloudflare Container (or a
|
|
4
|
+
[self-hosted runner](../../docs/runner-pool-integration.md)) to perform real
|
|
5
|
+
repo work with the [Pi coding agent](https://github.com/earendil-works/pi).
|
|
6
|
+
|
|
7
|
+
It is a thin TypeScript wrapper — a `node:http` server on `:8080` — that the
|
|
8
|
+
Worker drives over a small **job protocol**. Jobs run **asynchronously**: a `POST`
|
|
9
|
+
accepts the job and returns immediately with a `jobId`; the driver then polls
|
|
10
|
+
`GET /jobs/{id}` for live progress and the terminal result.
|
|
11
|
+
|
|
12
|
+
## Table of contents
|
|
13
|
+
|
|
14
|
+
- [Job protocol](#job-protocol)
|
|
15
|
+
- [What a job does](#what-a-job-does)
|
|
16
|
+
- [No secrets in the image](#no-secrets-in-the-image)
|
|
17
|
+
- [Layout](#layout)
|
|
18
|
+
- [Runner lifecycle knobs](#runner-lifecycle-knobs)
|
|
19
|
+
- [Build / test](#build--test)
- [Published image (GHCR + Docker Hub)](#published-image-ghcr--docker-hub)
|
|
20
|
+
|
|
21
|
+
## Job protocol
|
|
22
|
+
|
|
23
|
+
| Method & path | Purpose |
|
|
24
|
+
| ----------------- | ---------------------------------------------------------------------------------------------------------------------- |
|
|
25
|
+
| `GET /health` | Liveness — `{ "status": "ok" }`. |
|
|
26
|
+
| `POST /run` | Start (or re-attach to) an **implementation** job (`coder` / `mocker` / `playwright`). Returns `202 { jobId, state }`. |
|
|
27
|
+
| `POST /bootstrap` | Start a **repo-bootstrap** job (adapt a reference architecture → force-push a new repo). |
|
|
28
|
+
| `POST /blueprint` | Start a **blueprint** job (decompose a repo → write the in-repo `blueprints/` map, commit on a branch). |
|
|
29
|
+
| `GET /jobs/{id}` | Poll any job; returns the **job view** (`state`, optional `progress {completed,inProgress,total}`, `result`, `error`). |
|
|
30
|
+
|
|
31
|
+
All jobs run in a generic `JobRegistry` (`src/runner.ts`) keyed by `jobId`, so a
|
|
32
|
+
replayed `POST` **re-attaches** to the running job rather than starting a
|
|
33
|
+
duplicate (the durable driver's retries/replays are safe). Pi's todo-tool counts
|
|
34
|
+
are surfaced as `progress` while a job runs. The exact request/response shapes
|
|
35
|
+
cat-factory sends are documented in
|
|
36
|
+
[`docs/runner-pool-integration.md`](../../docs/runner-pool-integration.md).
|
|
37
|
+
|
|
38
|
+
## What a job does
|
|
39
|
+
|
|
40
|
+
The implementation job (`POST /run`) is the canonical sequence:
|
|
41
|
+
|
|
42
|
+
1. **clone** the target repo (shallow) with a short-lived GitHub installation token,
|
|
43
|
+
2. write the composed system prompt (role + the block's best-practice fragments)
|
|
44
|
+
to Pi's **global** context file `~/.pi/agent/AGENTS.md` (outside the checkout,
|
|
45
|
+
so it never lands in a commit and never clobbers a repo's own `AGENTS.md` —
|
|
46
|
+
Pi reads and concatenates both), and point Pi at the Worker's LLM proxy via
|
|
47
|
+
`~/.pi/agent/models.json` (provider `proxy`, `api: openai-completions`),
|
|
48
|
+
3. **run Pi** non-interactively (`pi -p --mode json --model proxy/<model> --approve`),
|
|
49
|
+
4. **commit, push** a branch and **open a PR**, returning `{ prUrl, branch, summary }`.
|
|
50
|
+
|
|
51
|
+
Bootstrap differs at the ends — it may start from an empty dir, and **resets
|
|
52
|
+
history to one commit and force-pushes** the default branch instead of opening a
|
|
53
|
+
PR. Blueprint **commits onto a branch** (no history reset) and returns the tree.
|
|
54
|
+
|
|
55
|
+
## No secrets in the image
|
|
56
|
+
|
|
57
|
+
The image (built from the `Dockerfile`, base `node:26-trixie-slim`) contains
|
|
58
|
+
only `git` + the Pi CLI + this compiled wrapper — **no API keys, no GitHub
|
|
59
|
+
credentials**. Per job, the Worker passes a short-lived GitHub token and a
|
|
60
|
+
signed, model-locked LLM-proxy **session token** in the request body. Pi reaches
|
|
61
|
+
models only through the Worker proxy, which injects the real provider key (qwen /
|
|
62
|
+
Kimi / DeepSeek) and meters spend. The provider key never enters the container.
|
|
63
|
+
|
|
64
|
+
## Layout
|
|
65
|
+
|
|
66
|
+
| File | Responsibility |
|
|
67
|
+
| ------------------ | ------------------------------------------------------------------------------------------------------- |
|
|
68
|
+
| `src/server.ts` | HTTP entry point; routes `/health`, `/run`, `/bootstrap`, `/blueprint`, `/jobs/{id}`. |
|
|
69
|
+
| `src/runner.ts` | `JobRegistry` — async job lifecycle, idempotent on `jobId`, progress tracking. |
|
|
70
|
+
| `src/job.ts` | Request types + validators for the job specs. |
|
|
71
|
+
| `src/pi.ts` | Pi provider config, non-interactive run, JSON-line event + todo-progress parsing, global `AGENTS.md` guidance. |
|
|
72
|
+
| `src/git.ts` | clone / branch / commit / push + GitHub PR creation; bootstrap history reset + force-push. |
|
|
73
|
+
| `src/bootstrap.ts` | The `/bootstrap` handler (clone-or-empty → adapt → reinit + force-push). |
|
|
74
|
+
| `src/blueprint.ts` | The `/blueprint` handler (decompose → render `blueprints/` → commit on branch). |
|
|
75
|
+
| `src/embed.ts` | Bundled assets/templates written into the workspace. |
|
|
76
|
+
| `src/logger.ts` | Structured logging. |
|
|
77
|
+
|
|
78
|
+
## Runner lifecycle knobs
|
|
79
|
+
|
|
80
|
+
Read from the environment inside the container (also honoured by a self-hosted
|
|
81
|
+
runner):
|
|
82
|
+
|
|
83
|
+
| Env var | Default | Effect |
|
|
84
|
+
| --------------------- | --------------- | ----------------------------------------------------------- |
|
|
85
|
+
| `PORT` | `8080` | HTTP port the harness listens on. |
|
|
86
|
+
| `JOB_MAX_DURATION_MS` | `3600000` (60m) | Hard ceiling on a job's wall-clock time; force-fails after. |
|
|
87
|
+
| `JOB_INACTIVITY_MS` | `600000` (10m) | Kills a hung agent that produces no output for this long. |
|
|
88
|
+
|
|
89
|
+
## Build / test
|
|
90
|
+
|
|
91
|
+
```sh
|
|
92
|
+
pnpm --filter @cat-factory/executor-harness build # tsc → dist/
|
|
93
|
+
pnpm --filter @cat-factory/executor-harness test # unit tests
|
|
94
|
+
docker build -f Dockerfile . # the container image
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The build context is just this package, so its `tsconfig.json` is intentionally
|
|
98
|
+
self-contained.
|
|
99
|
+
|
|
100
|
+
## Published image (GHCR + Docker Hub)
|
|
101
|
+
|
|
102
|
+
This package is not published to npm; instead its **Docker image** is published
|
|
103
|
+
publicly, multi-arch (`linux/amd64` + `linux/arm64`), to **both GHCR and Docker
|
|
104
|
+
Hub** so anyone can pull it without building from source:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
ghcr.io/<owner>/cat-factory-executor:<version>
|
|
108
|
+
docker.io/<org>/cat-factory-executor:<version>
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Each is tagged with the package `version`, the commit `sha-…`, and `latest`.
|
|
112
|
+
|
|
113
|
+
**CI** does this automatically:
|
|
114
|
+
[`.github/workflows/docker-publish.yml`](../../../.github/workflows/docker-publish.yml)
|
|
115
|
+
republishes on every push to `main` that touches image content (`src/**`,
|
|
116
|
+
`Dockerfile`, `tsconfig.json`, `package.json`). Docker Hub is gated on the
|
|
117
|
+
`DOCKERHUB_USERNAME` / `DOCKERHUB_TOKEN` repo secrets; without them it publishes
|
|
118
|
+
to GHCR only.
|
|
119
|
+
|
|
120
|
+
**Manually** (on demand, or to publish from a fork under your own namespaces):
|
|
121
|
+
|
|
122
|
+
```sh
|
|
123
|
+
# Log in first (one per registry you target):
|
|
124
|
+
echo "$GHCR_TOKEN" | docker login ghcr.io -u <github-user> --password-stdin
|
|
125
|
+
echo "$DOCKERHUB_TOKEN" | docker login -u <dockerhub-user> --password-stdin
|
|
126
|
+
|
|
127
|
+
pnpm --filter @cat-factory/executor-harness run image:publish
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The script ([`scripts/publish-image.sh`](./scripts/publish-image.sh)) builds the
|
|
131
|
+
multi-arch image once and pushes it to the selected registries. Override defaults
|
|
132
|
+
via env vars (`REGISTRIES`, `GHCR_OWNER`, `DOCKERHUB_ORG`, `TAG`, `PUSH_LATEST`,
|
|
133
|
+
`PLATFORMS`, `EXTRA_CA`) — see the header of the script. Example: GHCR only —
|
|
134
|
+
`REGISTRIES=ghcr pnpm --filter @cat-factory/executor-harness run image:publish`.
|
|
135
|
+
|
|
136
|
+
A backend deployment references the image from `wrangler.toml`
|
|
137
|
+
(`[[containers]] image = "ghcr.io/<owner>/cat-factory-executor:<version>"` — see
|
|
138
|
+
[`deploy/backend`](../../../deploy/backend)); a self-hosted runner pool pulls the
|
|
139
|
+
same image (see [`docs/runner-pool-integration.md`](../../docs/runner-pool-integration.md)).
|
|
140
|
+
The worker library's own test/dev `wrangler.toml` still references this
|
|
141
|
+
`Dockerfile` by local path so the acceptance suite can build it. Because the
|
|
142
|
+
version is the image tag, **bump this package via a changeset whenever you change
|
|
143
|
+
image content** (see [`CONTRIBUTING.md`](../../../CONTRIBUTING.md)).
|
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
import { spawn } from 'node:child_process';
|
|
2
|
+
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
|
|
3
|
+
import { tmpdir } from 'node:os';
|
|
4
|
+
import { join } from 'node:path';
|
|
5
|
+
import { killChildProcess } from './process.js';
|
|
6
|
+
import { redact, secretsToRedact } from './redact.js';
|
|
7
|
+
/**
 * Report whether `value` is a non-null value whose `typeof` is `'object'`.
 * Arrays count as objects; functions and primitives do not.
 */
function isObject(value) {
    if (value === null) {
        return false;
    }
    return typeof value === 'object';
}
|
|
10
|
+
/**
 * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
 * through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
 * never argv), `onActivity` on every chunk, abort kills the child, and the close
 * handler resolves/rejects. The caller's `onEvent` accumulates the outcome.
 *
 * `prompt` is fed over stdin: for Claude Code that is just the task prompt (the
 * system prompt rides `--append-system-prompt`); for Codex — which has no
 * system-prompt flag — the caller prepends the composed system prompt to it so
 * the role + best-practice context is not lost.
 *
 * @param {string} command - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {string} prompt - Text written to the child's stdin, which is then closed.
 * @param {object} opts - Run options; this function reads `cwd`, `signal` and `onActivity`.
 * @param {object} env - Extra environment variables layered over `process.env`.
 * @param {*} secrets - Passed through to `redact` when building the stderr tail.
 * @param {(event: any) => void} onEvent - Called with each parsed JSON line; exceptions it throws are ignored.
 * @returns {Promise<{ stderrTail: string }>} Resolves on exit code 0 with the last
 *   700 characters of the redacted stderr. Rejects if the signal was already aborted,
 *   if the child emits `error`, if the run was aborted, or on a non-zero exit code.
 */
function streamCli(command, args, prompt, opts, env, secrets, onEvent) {
    return new Promise((resolve, reject) => {
        if (opts.signal?.aborted) {
            reject(new Error(`${command} aborted before start`));
            return;
        }
        const child = spawn(command, args, {
            cwd: opts.cwd,
            env: { ...process.env, ...env },
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        // Ignore stdin errors (e.g. the child exiting before reading the prompt);
        // the close handler below decides the outcome.
        child.stdin.on('error', () => { });
        child.stdin.end(prompt);
        // Decode at the stream level so a multi-byte UTF-8 character that straddles
        // two chunks is reassembled instead of being turned into replacement
        // characters (which could corrupt a JSONL line or the stderr tail).
        child.stdout.setEncoding('utf8');
        child.stderr.setEncoding('utf8');
        let stderr = '';
        let aborted = false;
        let lineBuffer = '';
        const killChild = () => killChildProcess(child);
        // Parse one trimmed stdout line; anything that is not a JSON object line is skipped.
        const processLine = (line) => {
            if (!line.startsWith('{'))
                return;
            let event;
            try {
                event = JSON.parse(line);
            }
            catch {
                return;
            }
            try {
                onEvent(event);
            }
            catch {
                // A faulty observer must never break the run.
            }
        };
        // Append a decoded chunk and dispatch every complete (LF-terminated) line;
        // the unterminated remainder stays buffered for the next chunk.
        const consumeStdout = (text) => {
            lineBuffer += text;
            let nl = lineBuffer.indexOf('\n');
            while (nl !== -1) {
                const line = lineBuffer.slice(0, nl).trim();
                lineBuffer = lineBuffer.slice(nl + 1);
                nl = lineBuffer.indexOf('\n');
                processLine(line);
            }
        };
        const onAbort = () => {
            aborted = true;
            killChild();
        };
        opts.signal?.addEventListener('abort', onAbort, { once: true });
        child.stdout.on('data', (chunk) => {
            opts.onActivity?.();
            consumeStdout(chunk);
        });
        child.stderr.on('data', (chunk) => {
            opts.onActivity?.();
            stderr += chunk;
            // Keep only the most recent 8 000 characters of stderr in memory.
            if (stderr.length > 8_000)
                stderr = stderr.slice(-8_000);
        });
        child.on('error', (err) => {
            opts.signal?.removeEventListener('abort', onAbort);
            reject(err);
        });
        child.on('close', (code) => {
            opts.signal?.removeEventListener('abort', onAbort);
            // Flush a final line that was not LF-terminated.
            if (lineBuffer.trim())
                processLine(lineBuffer.trim());
            const stderrTail = redact(stderr, secrets).slice(-700);
            if (aborted) {
                reject(new Error('agent run aborted by watchdog'));
                return;
            }
            if (code !== 0) {
                reject(new Error(`${command} exited with code ${code}: ${stderrTail}`));
                return;
            }
            resolve({ stderrTail });
        });
    });
}
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
// Claude Code
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
/**
 * Run the Claude Code CLI headlessly against `opts.cwd` and collect its outcome.
 *
 * The CLI is started with `--output-format stream-json`; `TodoWrite` tool calls are
 * forwarded to `opts.onProgress` as subtask progress, and the terminal `result`
 * event supplies the summary and token usage.
 *
 * @param {object} opts - Run options. Reads `ambientAuth`, `subscriptionToken`,
 *   `subscriptionBaseUrl`, `model`, `systemPrompt`, `userPrompt`, `onProgress`, and
 *   whatever `streamCli` reads from the same object.
 * @returns {Promise<object>} `{ summary, stats, stderrTail }` plus `usage` when the
 *   CLI reported any.
 * @throws {Error} When neither `ambientAuth` nor `subscriptionToken` is set, or when
 *   the underlying CLI run fails.
 */
export async function runClaudeCode(opts) {
    if (!opts.ambientAuth && !opts.subscriptionToken) {
        throw new Error('claude-code harness requires a subscription token (or ambientAuth)');
    }
    const stats = { toolCalls: 0, assistantChars: 0 };
    let summary = '';
    let usage;
    // Tally one content block of an assistant message and surface TodoWrite progress.
    const recordAssistantBlock = (block) => {
        if (!isObject(block)) {
            return;
        }
        if (block.type === 'text' && typeof block.text === 'string') {
            stats.assistantChars += block.text.length;
        }
        if (block.type !== 'tool_use') {
            return;
        }
        stats.toolCalls += 1;
        if (block.name !== 'TodoWrite' || !opts.onProgress) {
            return;
        }
        const progress = todosToProgress(block.input?.todos);
        if (progress) {
            opts.onProgress(progress);
        }
    };
    const onEvent = (event) => {
        switch (event.type) {
            case 'assistant': {
                if (!isObject(event.message)) {
                    return;
                }
                const content = event.message.content;
                if (!Array.isArray(content)) {
                    return;
                }
                for (const block of content) {
                    recordAssistantBlock(block);
                }
                return;
            }
            case 'result': {
                if (typeof event.result === 'string') {
                    summary = event.result;
                }
                // Keep the previously seen usage when this event carries none.
                usage = claudeUsage(event.usage) ?? usage;
                return;
            }
            default:
                return;
        }
    };
    // Two modes:
    //  - Native (ambient) mode runs the developer's installed `claude` with its OWN
    //    login: no isolated config home, no injected credential, no onboarding pre-seed.
    //  - Otherwise Claude Code's config dir is pointed at an isolated, per-run temp dir
    //    OUTSIDE the cloned checkout (`opts.cwd`), so agents that finish with
    //    `git add -A` cannot stage a stray `.claude/` directory — and any cached
    //    credential in it — into the pushed branch. The dir is removed in `finally`.
    let configHome;
    if (!opts.ambientAuth) {
        configHome = await mkdtemp(join(tmpdir(), 'cf-claude-'));
        // A brand-new config dir looks like a first launch, which would BLOCK on the
        // interactive onboarding / "trust this folder" / bypass-permissions prompts
        // that nobody can answer headlessly. Pre-seed the flags that mark them as
        // accepted. Best-effort: a failed write is ignored, and unknown keys are
        // harmless if a CLI version ignores them.
        const onboardingSeed = JSON.stringify({
            hasCompletedOnboarding: true,
            bypassPermissionsModeAccepted: true,
            hasTrustDialogAccepted: true,
        });
        await writeFile(join(configHome, '.claude.json'), onboardingSeed, { mode: 0o600 }).catch(() => { });
    }
    // Anthropic itself authenticates with the subscription OAuth token; a
    // non-Anthropic Claude-Code vendor is reached through its Anthropic-compatible
    // endpoint with an auth-token key. Ambient mode injects neither.
    let env = {};
    if (!opts.ambientAuth) {
        const credentialEnv = opts.subscriptionBaseUrl
            ? {
                ANTHROPIC_BASE_URL: opts.subscriptionBaseUrl,
                ANTHROPIC_AUTH_TOKEN: opts.subscriptionToken,
            }
            : { CLAUDE_CODE_OAUTH_TOKEN: opts.subscriptionToken };
        env = { CLAUDE_CONFIG_DIR: configHome, ...credentialEnv };
    }
    const cliArgs = [
        '-p',
        '--output-format',
        'stream-json',
        '--verbose',
        // The per-run container IS the sandbox and the run is fully headless, so
        // permissions are bypassed entirely. `acceptEdits` would still gate Bash,
        // which in `-p` mode is denied, leaving the agent unable to run
        // builds/tests/git to verify its work.
        '--permission-mode',
        'bypassPermissions',
        '--model',
        opts.model,
        '--append-system-prompt',
        opts.systemPrompt,
    ];
    const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
    try {
        const { stderrTail } = await streamCli('claude', cliArgs, opts.userPrompt, opts, env, secrets, onEvent);
        const outcome = { summary, stats, stderrTail };
        if (usage) {
            outcome.usage = usage;
        }
        return outcome;
    }
    finally {
        // Never leave the config dir (and any cached credential) on disk past the run.
        if (configHome) {
            await rm(configHome, { recursive: true, force: true }).catch(() => { });
        }
    }
}
|
|
208
|
+
/**
 * Convert a Claude Code `TodoWrite` todos array into subtask progress.
 *
 * @param {*} todos - The tool call's `todos` input; anything that is not an array yields `undefined`.
 * @returns {{ completed: number, inProgress: number, total: number, items: Array<{ label: string, status: string }> } | undefined}
 *   Counts plus the normalized items. Non-object entries are skipped.
 */
function todosToProgress(todos) {
    if (!Array.isArray(todos)) {
        return undefined;
    }
    const items = [];
    let completed = 0;
    let inProgress = 0;
    for (const todo of todos) {
        if (!isObject(todo)) {
            continue;
        }
        const label = typeof todo.content === 'string' ? todo.content : String(todo.content ?? '');
        const status = normalizeStatus(todo.status);
        if (status === 'completed') {
            completed += 1;
        }
        else if (status === 'in_progress') {
            inProgress += 1;
        }
        items.push({ label, status });
    }
    return { completed, inProgress, total: items.length, items };
}
|
|
220
|
+
/**
 * Collapse an arbitrary status value onto one of `'completed'`, `'in_progress'`
 * or `'pending'`. Anything other than the first two exact strings is `'pending'`.
 */
function normalizeStatus(status) {
    switch (status) {
        case 'completed':
        case 'in_progress':
            return status;
        default:
            return 'pending';
    }
}
|
|
227
|
+
/**
 * Extract token usage from a Claude Code `result` event's `usage` payload.
 *
 * Input is the sum of fresh input, cache reads and cache writes
 * (`cache_creation_input_tokens`), so cached traffic is counted towards the
 * token's load rather than omitted.
 *
 * @param {*} raw - The event's `usage` value.
 * @returns {{ inputTokens: number, outputTokens: number } | undefined}
 *   `undefined` when `raw` is not an object or every counted bucket is zero.
 */
function claudeUsage(raw) {
    if (!isObject(raw)) {
        return undefined;
    }
    const freshInput = numberOf(raw.input_tokens);
    const cacheReads = numberOf(raw.cache_read_input_tokens);
    const cacheWrites = numberOf(raw.cache_creation_input_tokens);
    const inputTokens = freshInput + cacheReads + cacheWrites;
    const outputTokens = numberOf(raw.output_tokens);
    const hasUsage = inputTokens !== 0 || outputTokens !== 0;
    return hasUsage ? { inputTokens, outputTokens } : undefined;
}
|
|
242
|
+
// ---------------------------------------------------------------------------
|
|
243
|
+
// Codex
|
|
244
|
+
// ---------------------------------------------------------------------------
|
|
245
|
+
/**
 * Run the Codex CLI headlessly against `opts.cwd` and collect its outcome.
 *
 * Unless `opts.ambientAuth` is set, the leased `auth.json` bundle
 * (`opts.subscriptionToken`) is written to an isolated, per-run CODEX_HOME.
 * Streams `codex exec --json`, mapping plan/todo updates onto subtask progress and
 * the running cumulative token usage onto the outcome.
 *
 * @param {object} opts - Run options. Reads `ambientAuth`, `subscriptionToken`,
 *   `model`, `systemPrompt`, `userPrompt`, `onProgress`, and whatever `streamCli`
 *   reads from the same object.
 * @returns {Promise<object>} `{ summary, stats, stderrTail }` plus `usage` when the
 *   CLI reported any.
 * @throws {Error} When neither `ambientAuth` nor `subscriptionToken` is set, when the
 *   credential files cannot be written, or when the underlying CLI run fails.
 */
export async function runCodex(opts) {
    const stats = { toolCalls: 0, assistantChars: 0 };
    let summary = '';
    let usage;
    // Codex reads its credentials from $CODEX_HOME/auth.json with file-backed
    // storage. CRITICAL: this home must live OUTSIDE the cloned checkout (`opts.cwd`)
    // — the blueprint/requirements/conflict-resolver handlers finish with
    // `git add -A` + push, which would otherwise stage and publish the decrypted
    // subscription `auth.json` (access + refresh tokens) to the PR branch. An
    // isolated, per-run temp dir keeps the credential out of the working tree and is
    // removed in `finally`.
    //
    // KNOWN LIMITATION: Codex refreshes its OAuth access token in-place by rewriting
    // this `auth.json` mid-run. Because the home is a per-run temp dir wiped in
    // `finally`, that refreshed credential is discarded and never written back to the
    // pool — there is no write-back path. The stored bundle keeps working as long as
    // its refresh token stays valid (ChatGPT refresh tokens are long-lived and reused,
    // not rotated per refresh today), so each run re-refreshes from the same stored
    // copy; if OpenAI ever rotates refresh tokens on use, a pooled Codex token would
    // eventually need to be re-connected by the user. Claude OAuth tokens (from
    // `claude setup-token`) are long-lived and unaffected.
    // Native (ambient) mode: run the developer's installed `codex` with its OWN login —
    // no isolated CODEX_HOME, no injected auth.json. Otherwise write the leased credential
    // to a per-run temp home kept OUTSIDE the checkout (and removed in `finally`).
    if (!opts.ambientAuth && !opts.subscriptionToken) {
        throw new Error('codex harness requires a subscription token (or ambientAuth)');
    }
    const codexHome = opts.ambientAuth ? undefined : await mkdtemp(join(tmpdir(), 'cf-codex-'));
    const onEvent = (event) => {
        const type = typeof event.type === 'string' ? event.type : '';
        if (type.includes('agent_message') || type === 'item.completed') {
            const text = extractText(event);
            if (text) {
                stats.assistantChars += text.length;
                // The most recent message text wins as the summary.
                summary = text;
            }
        }
        if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
            stats.toolCalls += 1;
        }
        const progress = codexPlanProgress(event);
        if (progress && opts.onProgress)
            opts.onProgress(progress);
        // Usage is a running cumulative total, so overwrite rather than sum.
        const turnUsage = codexUsage(event);
        if (turnUsage)
            usage = turnUsage;
    };
    // Codex has no system-prompt flag, so fold the composed role + best-practice
    // context into the prompt itself (Claude Code instead rides --append-system-prompt).
    const prompt = opts.systemPrompt
        ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
        : opts.userPrompt;
    try {
        // The credential files are written INSIDE the try so that a failed write
        // still reaches `finally` and the temp home (with any partially written
        // credential) is removed instead of being leaked on disk.
        if (codexHome) {
            await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
            await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
        }
        const { stderrTail } = await streamCli('codex', [
            'exec',
            '--json',
            '--skip-git-repo-check',
            // The per-run container IS the sandbox; let Codex write files and reach the
            // vendor unrestricted, with no approval prompts (the run is headless).
            '--dangerously-bypass-approvals-and-sandbox',
            '--model',
            opts.model,
            '-',
        ], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
        return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
    }
    finally {
        // Never leave the decrypted credential on disk past the run.
        if (codexHome)
            await rm(codexHome, { recursive: true, force: true }).catch(() => { });
    }
}
|
|
327
|
+
/**
 * Best-effort lookup of a textual message on a Codex event.
 *
 * Checked in order: `event.message`, `event.text`, `event.item.text`,
 * `event.item.message`. The first one that is a string is returned.
 *
 * @param {object} event - A parsed Codex JSONL event.
 * @returns {string | undefined} The message text, or `undefined` if none is a string.
 */
function extractText(event) {
    if (typeof event.message === 'string') {
        return event.message;
    }
    if (typeof event.text === 'string') {
        return event.text;
    }
    const item = event.item;
    if (!isObject(item)) {
        return undefined;
    }
    if (typeof item.text === 'string') {
        return item.text;
    }
    return typeof item.message === 'string' ? item.message : undefined;
}
|
|
342
|
+
/**
 * Best-effort: map a Codex `update_plan`/plan event onto subtask counts.
 *
 * The step list is taken from the first matching shape:
 *   1. `event.plan` when it is itself an array of steps,
 *   2. `event.plan.steps` when `event.plan` is a (non-array) object,
 *   3. `event.item.plan` when it is an array,
 *   4. `event.steps` when it is an array.
 *
 * @param {object} event - A parsed Codex JSONL event.
 * @returns {{ completed: number, inProgress: number, total: number, items: Array<{ label: string, status: string }> } | undefined}
 *   Progress counts and normalized items, or `undefined` when the event carries no
 *   step list or the list has no object entries.
 */
function codexPlanProgress(event) {
    let steps;
    if (Array.isArray(event.plan)) {
        // `isObject` is true for arrays, so without this branch an array-valued
        // `plan` would be read as `plan.steps` (undefined) and silently dropped.
        steps = event.plan;
    }
    else if (isObject(event.plan)) {
        steps = event.plan.steps;
    }
    else if (isObject(event.item) && Array.isArray(event.item.plan)) {
        steps = event.item.plan;
    }
    else if (Array.isArray(event.steps)) {
        steps = event.steps;
    }
    if (!Array.isArray(steps))
        return undefined;
    const items = steps.filter(isObject).map((s) => ({
        // Prefer `step`; fall back to `content`, then to an empty label.
        label: typeof s.step === 'string' ? s.step : String(s.step ?? s.content ?? ''),
        status: normalizeStatus(s.status),
    }));
    if (items.length === 0)
        return undefined;
    const completed = items.filter((i) => i.status === 'completed').length;
    const inProgress = items.filter((i) => i.status === 'in_progress').length;
    return { completed, inProgress, total: items.length, items };
}
|
|
361
|
+
/**
 * Best-effort: pull token usage out of a Codex usage event.
 *
 * Codex `exec --json` reports a running CUMULATIVE total on `token_count` events
 * under `info.total_token_usage` (alongside the per-turn `last_token_usage`); older /
 * other shapes put it on `usage` / `info.usage` directly. The cumulative total is
 * preferred so the caller can overwrite (not sum) — summing cumulative totals across
 * events would multiply-count. Candidates are checked most-likely first.
 *
 * @param {object} event - A parsed Codex JSONL event.
 * @returns {{ inputTokens: number, outputTokens: number } | undefined}
 *   `undefined` when no usage object is found or both totals are zero.
 */
function codexUsage(event) {
    const info = isObject(event.info) ? event.info : undefined;
    // Priority order; the first candidate that is an object wins.
    const candidates = [
        info?.total_token_usage,
        event.total_token_usage,
        event.usage,
        info?.usage,
    ];
    const raw = candidates.find(isObject);
    if (raw === undefined) {
        return undefined;
    }
    // NOTE(review): if `cached_input_tokens` is already included in `input_tokens`
    // for Codex, this sum double-counts cached input — confirm against Codex's
    // token-usage semantics before relying on it.
    const inputTokens = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens);
    const outputTokens = numberOf(raw.output_tokens);
    const hasUsage = inputTokens !== 0 || outputTokens !== 0;
    return hasUsage ? { inputTokens, outputTokens } : undefined;
}
|
|
383
|
+
/**
 * Coerce a possibly-missing counter to a number: finite numbers pass through,
 * everything else (non-numbers, `NaN`, `±Infinity`) becomes `0`.
 */
function numberOf(value) {
    if (typeof value !== 'number') {
        return 0;
    }
    return Number.isFinite(value) ? value : 0;
}
|
|
386
|
+
/**
 * Dispatch to the configured subscription harness runner.
 *
 * @param {string} harness - `'claude-code'` selects Claude Code; any other value selects Codex.
 * @param {object} opts - Options forwarded unchanged to the selected runner.
 * @returns {Promise<object>} The selected runner's outcome.
 */
export function runSubscriptionHarness(harness, opts) {
    if (harness === 'claude-code') {
        return runClaudeCode(opts);
    }
    return runCodex(opts);
}
|