@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Igor Savin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,143 @@
1
+ # @cat-factory/executor-harness
2
+
3
+ The payload that runs **inside** a per-run Cloudflare Container (or a
4
+ [self-hosted runner](../../docs/runner-pool-integration.md)) to perform real
5
+ repo work with the [Pi coding agent](https://github.com/earendil-works/pi).
6
+
7
+ It is a thin TypeScript wrapper — a `node:http` server on `:8080` — that the
8
+ Worker drives over a small **job protocol**. Jobs run **asynchronously**: a `POST`
9
+ accepts the job and returns immediately with a `jobId`; the driver then polls
10
+ `GET /jobs/{id}` for live progress and the terminal result.
11
+
12
+ ## Table of contents
13
+
14
+ - [Job protocol](#job-protocol)
15
+ - [What a job does](#what-a-job-does)
16
+ - [No secrets in the image](#no-secrets-in-the-image)
17
+ - [Layout](#layout)
18
+ - [Runner lifecycle knobs](#runner-lifecycle-knobs)
19
+ - [Build / test](#build--test)
20
+
21
+ ## Job protocol
22
+
23
+ | Method & path | Purpose |
24
+ | ----------------- | ---------------------------------------------------------------------------------------------------------------------- |
25
+ | `GET /health` | Liveness — `{ "status": "ok" }`. |
26
+ | `POST /run` | Start (or re-attach to) an **implementation** job (`coder` / `mocker` / `playwright`). Returns `202 { jobId, state }`. |
27
+ | `POST /bootstrap` | Start a **repo-bootstrap** job (adapt a reference architecture → force-push a new repo). |
28
+ | `POST /blueprint` | Start a **blueprint** job (decompose a repo → write the in-repo `blueprints/` map, commit on a branch). |
29
+ | `GET /jobs/{id}` | Poll any job; returns the **job view** (`state`, optional `progress {completed,inProgress,total}`, `result`, `error`). |
30
+
31
+ All jobs run in a generic `JobRegistry` (`src/runner.ts`) keyed by `jobId`, so a
32
+ replayed `POST` **re-attaches** to the running job rather than starting a
33
+ duplicate (the durable driver's retries/replays are safe). Pi's todo-tool counts
34
+ are surfaced as `progress` while a job runs. The exact request/response shapes
35
+ cat-factory sends are documented in
36
+ [`docs/runner-pool-integration.md`](../../docs/runner-pool-integration.md).
37
+
38
+ ## What a job does
39
+
40
+ The implementation job (`POST /run`) is the canonical sequence:
41
+
42
+ 1. **clone** the target repo (shallow) with a short-lived GitHub installation token,
43
+ 2. write the composed system prompt (role + the block's best-practice fragments)
44
+ to Pi's **global** context file `~/.pi/agent/AGENTS.md` (outside the checkout,
45
+ so it never lands in a commit and never clobbers a repo's own `AGENTS.md` —
46
+ Pi reads and concatenates both), and point Pi at the Worker's LLM proxy via
47
+ `~/.pi/agent/models.json` (provider `proxy`, `api: openai-completions`),
48
+ 3. **run Pi** non-interactively (`pi -p --mode json --model proxy/<model> --approve`),
49
+ 4. **commit, push** a branch and **open a PR**, returning `{ prUrl, branch, summary }`.
50
+
51
+ Bootstrap differs at the ends — it may start from an empty dir, and **resets
52
+ history to one commit and force-pushes** the default branch instead of opening a
53
+ PR. Blueprint **commits onto a branch** (no history reset) and returns the tree.
54
+
55
+ ## No secrets in the image
56
+
57
+ The image (built from the `Dockerfile`, base `node:26-trixie-slim`) contains
58
+ only `git` + the Pi CLI + this compiled wrapper — **no API keys, no GitHub
59
+ credentials**. Per job, the Worker passes a short-lived GitHub token and a
60
+ signed, model-locked LLM-proxy **session token** in the request body. Pi reaches
61
+ models only through the Worker proxy, which injects the real provider key (Qwen /
62
+ Kimi / DeepSeek) and meters spend. The provider key never enters the container.
63
+
64
+ ## Layout
65
+
66
+ | File | Responsibility |
67
+ | ------------------ | ------------------------------------------------------------------------------------------------------- |
68
+ | `src/server.ts` | HTTP entry point; routes `/health`, `/run`, `/bootstrap`, `/blueprint`, `/jobs/{id}`. |
69
+ | `src/runner.ts` | `JobRegistry` — async job lifecycle, idempotent on `jobId`, progress tracking. |
70
+ | `src/job.ts` | Request types + validators for the job specs. |
71
+ | `src/pi.ts` | Pi provider config, non-interactive run, JSON-line event + todo-progress parsing, global `AGENTS.md` guidance. |
72
+ | `src/git.ts` | clone / branch / commit / push + GitHub PR creation; bootstrap history reset + force-push. |
73
+ | `src/bootstrap.ts` | The `/bootstrap` handler (clone-or-empty → adapt → reinit + force-push). |
74
+ | `src/blueprint.ts` | The `/blueprint` handler (decompose → render `blueprints/` → commit on branch). |
75
+ | `src/embed.ts` | Bundled assets/templates written into the workspace. |
76
+ | `src/logger.ts` | Structured logging. |
77
+
78
+ ## Runner lifecycle knobs
79
+
80
+ Read from the environment inside the container (also honoured by a self-hosted
81
+ runner):
82
+
83
+ | Env var | Default | Effect |
84
+ | --------------------- | --------------- | ----------------------------------------------------------- |
85
+ | `PORT` | `8080` | HTTP port the harness listens on. |
86
+ | `JOB_MAX_DURATION_MS` | `3600000` (60m) | Hard ceiling on a job's wall-clock time; force-fails after. |
87
+ | `JOB_INACTIVITY_MS` | `600000` (10m) | Kills a hung agent that produces no output for this long. |
88
+
89
+ ## Build / test
90
+
91
+ ```sh
92
+ pnpm --filter @cat-factory/executor-harness build # tsc → dist/
93
+ pnpm --filter @cat-factory/executor-harness test # unit tests
94
+ docker build -f Dockerfile . # the container image
95
+ ```
96
+
97
+ The build context is just this package, so its `tsconfig.json` is intentionally
98
+ self-contained.
99
+
100
+ ## Published image (GHCR + Docker Hub)
101
+
102
+ This package is not published to npm; instead its **Docker image** is published
103
+ publicly, multi-arch (`linux/amd64` + `linux/arm64`), to **both GHCR and Docker
104
+ Hub** so anyone can pull it without building from source:
105
+
106
+ ```
107
+ ghcr.io/<owner>/cat-factory-executor:<version>
108
+ docker.io/<org>/cat-factory-executor:<version>
109
+ ```
110
+
111
+ Each is tagged with the package `version`, the commit `sha-…`, and `latest`.
112
+
113
+ **CI** does this automatically:
114
+ [`.github/workflows/docker-publish.yml`](../../../.github/workflows/docker-publish.yml)
115
+ republishes on every push to `main` that touches image content (`src/**`,
116
+ `Dockerfile`, `tsconfig.json`, `package.json`). Docker Hub is gated on the
117
+ `DOCKERHUB_USERNAME` / `DOCKERHUB_TOKEN` repo secrets; without them it publishes
118
+ to GHCR only.
119
+
120
+ **Manually** (on demand, or to publish from a fork under your own namespaces):
121
+
122
+ ```sh
123
+ # Log in first (one per registry you target):
124
+ echo "$GHCR_TOKEN" | docker login ghcr.io -u <github-user> --password-stdin
125
+ echo "$DOCKERHUB_TOKEN" | docker login -u <dockerhub-user> --password-stdin
126
+
127
+ pnpm --filter @cat-factory/executor-harness run image:publish
128
+ ```
129
+
130
+ The script ([`scripts/publish-image.sh`](./scripts/publish-image.sh)) builds the
131
+ multi-arch image once and pushes it to the selected registries. Override defaults
132
+ via env vars (`REGISTRIES`, `GHCR_OWNER`, `DOCKERHUB_ORG`, `TAG`, `PUSH_LATEST`,
133
+ `PLATFORMS`, `EXTRA_CA`) — see the header of the script. Example: GHCR only —
134
+ `REGISTRIES=ghcr pnpm --filter @cat-factory/executor-harness run image:publish`.
135
+
136
+ A backend deployment references the image from `wrangler.toml`
137
+ (`[[containers]] image = "ghcr.io/<owner>/cat-factory-executor:<version>"` — see
138
+ [`deploy/backend`](../../../deploy/backend)); a self-hosted runner pool pulls the
139
+ same image (see [`docs/runner-pool-integration.md`](../../docs/runner-pool-integration.md)).
140
+ The worker library's own test/dev `wrangler.toml` still references this
141
+ `Dockerfile` by local path so the acceptance suite can build it. Because the
142
+ version is the image tag, **bump this package via a changeset whenever you change
143
+ image content** (see [`CONTRIBUTING.md`](../../../CONTRIBUTING.md)).
@@ -0,0 +1,389 @@
1
+ import { spawn } from 'node:child_process';
2
+ import { mkdtemp, rm, writeFile } from 'node:fs/promises';
3
+ import { tmpdir } from 'node:os';
4
+ import { join } from 'node:path';
5
+ import { killChildProcess } from './process.js';
6
+ import { redact, secretsToRedact } from './redact.js';
7
/** True for any non-null value whose `typeof` is `'object'` (arrays included). */
function isObject(value) {
    if (value === null) {
        return false;
    }
    return typeof value === 'object';
}
10
/**
 * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
 * through `onEvent`. The prompt is delivered over stdin (out-of-band, never argv),
 * `opts.onActivity` fires on every stdout/stderr chunk, an abort on `opts.signal`
 * kills the child, and the `close` handler settles the promise. The caller's
 * `onEvent` accumulates the outcome.
 *
 * `prompt` is fed over stdin: for Claude Code that is just the task prompt (the
 * system prompt rides `--append-system-prompt`); for Codex — which has no
 * system-prompt flag — the caller prepends the composed system prompt to it so
 * the role + best-practice context is not lost.
 *
 * @param {string} command - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {string} prompt - Text written to the child's stdin, which is then closed.
 * @param {object} opts - Reads `cwd`, and the optional `signal` and `onActivity`.
 * @param {object} env - Extra environment variables layered over `process.env`.
 * @param {string[]} secrets - Values passed to `redact` before stderr is surfaced.
 * @param {(event: object) => void} onEvent - Called once per parsed JSON line.
 * @returns {Promise<{ stderrTail: string }>} Resolves on exit code 0 with the last
 *   700 characters of redacted stderr.
 * @throws {Error} Rejects when the signal is already aborted, when the child cannot
 *   be spawned, when the run is aborted, or when the child exits non-zero.
 */
function streamCli(command, args, prompt, opts, env, secrets, onEvent) {
    return new Promise((resolve, reject) => {
        if (opts.signal?.aborted) {
            reject(new Error(`${command} aborted before start`));
            return;
        }
        const child = spawn(command, args, {
            cwd: opts.cwd,
            env: { ...process.env, ...env },
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        // Decode both pipes with the stream's own UTF-8 decoder. Decoding each chunk
        // independently (`chunk.toString()`) turns a multi-byte character that straddles
        // two reads into U+FFFD replacement characters, corrupting the event text.
        child.stdout.setEncoding('utf8');
        child.stderr.setEncoding('utf8');
        // A child that exits before reading its stdin must not crash the harness.
        child.stdin.on('error', () => { });
        child.stdin.end(prompt);
        let stderr = '';
        let aborted = false;
        let lineBuffer = '';
        const killChild = () => killChildProcess(child);
        const processLine = (line) => {
            // Only JSON objects are events; anything else on stdout is ignored.
            if (!line.startsWith('{'))
                return;
            let event;
            try {
                event = JSON.parse(line);
            }
            catch {
                return;
            }
            try {
                onEvent(event);
            }
            catch {
                // A faulty observer must never break the run.
            }
        };
        const consumeStdout = (text) => {
            lineBuffer += text;
            let nl = lineBuffer.indexOf('\n');
            while (nl !== -1) {
                const line = lineBuffer.slice(0, nl).trim();
                lineBuffer = lineBuffer.slice(nl + 1);
                nl = lineBuffer.indexOf('\n');
                processLine(line);
            }
        };
        const onAbort = () => {
            aborted = true;
            killChild();
        };
        opts.signal?.addEventListener('abort', onAbort, { once: true });
        child.stdout.on('data', (chunk) => {
            opts.onActivity?.();
            consumeStdout(chunk);
        });
        child.stderr.on('data', (chunk) => {
            opts.onActivity?.();
            stderr += chunk;
            // Keep only a bounded tail so a chatty child cannot grow memory unboundedly.
            if (stderr.length > 8_000)
                stderr = stderr.slice(-8_000);
        });
        child.on('error', (err) => {
            opts.signal?.removeEventListener('abort', onAbort);
            reject(err);
        });
        child.on('close', (code) => {
            opts.signal?.removeEventListener('abort', onAbort);
            // Flush a final line that was not newline-terminated.
            if (lineBuffer.trim())
                processLine(lineBuffer.trim());
            const stderrTail = redact(stderr, secrets).slice(-700);
            if (aborted) {
                reject(new Error('agent run aborted by watchdog'));
                return;
            }
            if (code !== 0) {
                reject(new Error(`${command} exited with code ${code}: ${stderrTail}`));
                return;
            }
            resolve({ stderrTail });
        });
    });
}
101
+ // ---------------------------------------------------------------------------
102
+ // Claude Code
103
+ // ---------------------------------------------------------------------------
104
/**
 * Run the Claude Code CLI headlessly against `opts.cwd`, streaming
 * `--output-format stream-json`. `TodoWrite` tool calls are mapped onto subtask
 * progress (`opts.onProgress`) and the terminal `result` event supplies the
 * summary and token usage.
 *
 * Unless `opts.ambientAuth` is set, the run authenticates with
 * `opts.subscriptionToken`: as CLAUDE_CODE_OAUTH_TOKEN by default, or as
 * ANTHROPIC_AUTH_TOKEN together with ANTHROPIC_BASE_URL when
 * `opts.subscriptionBaseUrl` is given.
 *
 * @param {object} opts - Run options (`cwd`, `model`, `systemPrompt`, `userPrompt`,
 *   plus the optional `signal`, `onActivity`, `onProgress`, `ambientAuth`,
 *   `subscriptionToken`, `subscriptionBaseUrl`).
 * @returns {Promise<object>} `{ summary, stats, stderrTail }` plus `usage` when reported.
 * @throws {Error} When neither `ambientAuth` nor `subscriptionToken` is provided, or
 *   when the underlying CLI run fails.
 */
export async function runClaudeCode(opts) {
    const stats = { toolCalls: 0, assistantChars: 0 };
    let summary = '';
    let usage;
    // Account for a single content block of an assistant message.
    const recordBlock = (block) => {
        if (!isObject(block)) {
            return;
        }
        if (block.type === 'text' && typeof block.text === 'string') {
            stats.assistantChars += block.text.length;
        }
        if (block.type !== 'tool_use') {
            return;
        }
        stats.toolCalls += 1;
        if (block.name !== 'TodoWrite' || !opts.onProgress) {
            return;
        }
        const progress = todosToProgress(block.input?.todos);
        if (progress) {
            opts.onProgress(progress);
        }
    };
    const onEvent = (event) => {
        const kind = event.type;
        if (kind === 'assistant' && isObject(event.message)) {
            const blocks = event.message.content;
            if (!Array.isArray(blocks)) {
                return;
            }
            for (const block of blocks) {
                recordBlock(block);
            }
            return;
        }
        if (kind === 'result') {
            if (typeof event.result === 'string') {
                summary = event.result;
            }
            usage = claudeUsage(event.usage) ?? usage;
        }
    };
    // Ambient (native) mode runs the developer's installed `claude` with its OWN login:
    // no isolated config home, no injected credential, no onboarding pre-seed.
    // In every other case Claude Code's config dir is pointed at an isolated, per-run
    // temp dir OUTSIDE the cloned checkout (`opts.cwd`), so the agents that finish with
    // `git add -A` (blueprint/requirements/bootstrap) cannot stage a stray `.claude/`
    // directory — and any cached credential in it — into the pushed branch.
    // Mirrors the Codex CODEX_HOME isolation; the dir is removed in `finally`.
    const isolated = !opts.ambientAuth;
    if (isolated && !opts.subscriptionToken) {
        throw new Error('claude-code harness requires a subscription token (or ambientAuth)');
    }
    let configHome;
    if (isolated) {
        configHome = await mkdtemp(join(tmpdir(), 'cf-claude-'));
    }
    // A brand-new config dir makes Claude Code treat the run as a first launch and block
    // on the interactive onboarding / "trust this folder" / bypass-permissions prompts,
    // which are never answered headlessly and would hang the job until the watchdog
    // kills it. Pre-seed the flags that mark those as already accepted. Best-effort:
    // a failed write is ignored, and unknown keys are harmless to a CLI that ignores them.
    // (Ambient mode skips this — the developer's own config is already onboarded.)
    if (configHome) {
        const onboarding = JSON.stringify({
            hasCompletedOnboarding: true,
            bypassPermissionsModeAccepted: true,
            hasTrustDialogAccepted: true,
        });
        await writeFile(join(configHome, '.claude.json'), onboarding, { mode: 0o600 }).catch(() => { });
    }
    // Anthropic itself authenticates with the subscription OAuth token; a non-Anthropic
    // Claude-Code vendor (GLM via Z.ai, Kimi via Moonshot, DeepSeek) gets its
    // Anthropic-compatible endpoint plus an auth-token key instead.
    // Ambient mode injects neither — the CLI uses the developer's logged-in `~/.claude`.
    let env = {};
    if (isolated) {
        const credentials = opts.subscriptionBaseUrl
            ? {
                ANTHROPIC_BASE_URL: opts.subscriptionBaseUrl,
                ANTHROPIC_AUTH_TOKEN: opts.subscriptionToken,
            }
            : { CLAUDE_CODE_OAUTH_TOKEN: opts.subscriptionToken };
        env = { CLAUDE_CONFIG_DIR: configHome, ...credentials };
    }
    const cliArgs = [
        '-p',
        '--output-format',
        'stream-json',
        '--verbose',
        // The per-run container IS the sandbox, and the run is fully headless (no one
        // to approve a tool call) — so bypass permissions entirely. `acceptEdits`
        // would auto-accept file edits but still gate Bash, which in `-p` mode is then
        // denied, leaving the agent unable to run builds/tests/git to verify its work.
        '--permission-mode',
        'bypassPermissions',
        '--model',
        opts.model,
        '--append-system-prompt',
        opts.systemPrompt,
    ];
    try {
        const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
        const { stderrTail } = await streamCli('claude', cliArgs, opts.userPrompt, opts, env, secrets, onEvent);
        const outcome = { summary, stats, stderrTail };
        if (usage) {
            outcome.usage = usage;
        }
        return outcome;
    }
    finally {
        // Never leave the config dir (and any cached credential) on disk past the run.
        if (configHome) {
            await rm(configHome, { recursive: true, force: true }).catch(() => { });
        }
    }
}
208
/**
 * Map Claude Code's `TodoWrite` todos array onto subtask counts.
 *
 * Non-object entries are skipped. Returns `undefined` when `todos` is not an array;
 * an empty array yields a progress object with all counts at zero.
 */
function todosToProgress(todos) {
    if (!Array.isArray(todos)) {
        return undefined;
    }
    const items = [];
    let completed = 0;
    let inProgress = 0;
    for (const todo of todos) {
        if (!isObject(todo)) {
            continue;
        }
        const label = typeof todo.content === 'string' ? todo.content : String(todo.content ?? '');
        const status = normalizeStatus(todo.status);
        if (status === 'completed') {
            completed += 1;
        }
        else if (status === 'in_progress') {
            inProgress += 1;
        }
        items.push({ label, status });
    }
    return { completed, inProgress, total: items.length, items };
}
220
/** Collapse an arbitrary status value onto `'completed'`, `'in_progress'` or `'pending'`. */
function normalizeStatus(status) {
    switch (status) {
        case 'completed':
        case 'in_progress':
            return status;
        default:
            return 'pending';
    }
}
227
/**
 * Convert a Claude Code `usage` payload into `{ inputTokens, outputTokens }`.
 * Returns `undefined` when `raw` is not an object or both totals are zero.
 */
function claudeUsage(raw) {
    if (!isObject(raw)) {
        return undefined;
    }
    // Count every input bucket Anthropic bills: fresh input plus BOTH cache reads and
    // cache writes (cache_creation_input_tokens). Those are real consumed tokens and the
    // dominant share on a long agent run; leaving them out would under-weight a token's
    // true load in the usage-aware rotation window.
    const inputBuckets = [
        raw.input_tokens,
        raw.cache_read_input_tokens,
        raw.cache_creation_input_tokens,
    ];
    const inputTokens = inputBuckets.reduce((sum, bucket) => sum + numberOf(bucket), 0);
    const outputTokens = numberOf(raw.output_tokens);
    if (inputTokens !== 0 || outputTokens !== 0) {
        return { inputTokens, outputTokens };
    }
    return undefined;
}
242
+ // ---------------------------------------------------------------------------
243
+ // Codex
244
+ // ---------------------------------------------------------------------------
245
/**
 * Run the Codex CLI headlessly against `opts.cwd`, streaming `codex exec --json`.
 * Plan/todo updates are mapped onto subtask progress (`opts.onProgress`) and the
 * running cumulative token usage onto the outcome.
 *
 * Unless `opts.ambientAuth` is set, `opts.subscriptionToken` (the leased ChatGPT
 * `auth.json` bundle) is written to an isolated, per-run CODEX_HOME that is always
 * removed before this function settles.
 *
 * @param {object} opts - Run options (`cwd`, `model`, `systemPrompt`, `userPrompt`,
 *   plus the optional `signal`, `onActivity`, `onProgress`, `ambientAuth`,
 *   `subscriptionToken`).
 * @returns {Promise<object>} `{ summary, stats, stderrTail }` plus `usage` when reported.
 * @throws {Error} When neither `ambientAuth` nor `subscriptionToken` is provided, when
 *   the credential files cannot be written, or when the underlying CLI run fails.
 */
export async function runCodex(opts) {
    const stats = { toolCalls: 0, assistantChars: 0 };
    let summary = '';
    let usage;
    // Codex reads its credentials from $CODEX_HOME/auth.json with file-backed
    // storage. CRITICAL: this home must live OUTSIDE the cloned checkout (`opts.cwd`)
    // — the blueprint/requirements/conflict-resolver handlers finish with
    // `git add -A` + push, which would otherwise stage and publish the decrypted
    // subscription `auth.json` (access + refresh tokens) to the PR branch. An
    // isolated, per-run temp dir keeps the credential out of the working tree and is
    // removed in `finally`.
    //
    // KNOWN LIMITATION: Codex refreshes its OAuth access token in-place by rewriting
    // this `auth.json` mid-run. Because the home is a per-run temp dir wiped in
    // `finally`, that refreshed credential is discarded and never written back to the
    // pool — there is no write-back path. The stored bundle keeps working as long as
    // its refresh token stays valid (ChatGPT refresh tokens are long-lived and reused,
    // not rotated per refresh today), so each run re-refreshes from the same stored
    // copy; if OpenAI ever rotates refresh tokens on use, a pooled Codex token would
    // eventually need to be re-connected by the user. Claude OAuth tokens (from
    // `claude setup-token`) are long-lived and unaffected.
    //
    // Native (ambient) mode: run the developer's installed `codex` with its OWN login —
    // no isolated CODEX_HOME, no injected auth.json.
    if (!opts.ambientAuth && !opts.subscriptionToken) {
        throw new Error('codex harness requires a subscription token (or ambientAuth)');
    }
    const codexHome = opts.ambientAuth ? undefined : await mkdtemp(join(tmpdir(), 'cf-codex-'));
    const onEvent = (event) => {
        const type = typeof event.type === 'string' ? event.type : '';
        if (type.includes('agent_message') || type === 'item.completed') {
            const text = extractText(event);
            if (text) {
                stats.assistantChars += text.length;
                // The most recent message wins as the run summary.
                summary = text;
            }
        }
        if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
            stats.toolCalls += 1;
        }
        const progress = codexPlanProgress(event);
        if (progress && opts.onProgress)
            opts.onProgress(progress);
        // Usage is a running cumulative total, so overwrite rather than sum.
        const turnUsage = codexUsage(event);
        if (turnUsage)
            usage = turnUsage;
    };
    // Codex has no system-prompt flag, so fold the composed role + best-practice
    // context into the prompt itself (Claude Code instead rides --append-system-prompt).
    const prompt = opts.systemPrompt
        ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
        : opts.userPrompt;
    try {
        // The credential is written INSIDE the try so that a failure part-way through
        // (e.g. `auth.json` lands but `config.toml` cannot be written) still reaches the
        // `finally` cleanup instead of stranding the decrypted credential on disk.
        if (codexHome) {
            await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
            await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
        }
        const { stderrTail } = await streamCli('codex', [
            'exec',
            '--json',
            '--skip-git-repo-check',
            // The per-run container IS the sandbox; let Codex write files and reach the
            // vendor unrestricted, with no approval prompts (the run is headless).
            '--dangerously-bypass-approvals-and-sandbox',
            '--model',
            opts.model,
            '-',
        ], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
        return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
    }
    finally {
        // Never leave the decrypted credential on disk past the run.
        if (codexHome)
            await rm(codexHome, { recursive: true, force: true }).catch(() => { });
    }
}
327
/**
 * Best-effort: pull a textual message out of a Codex event.
 *
 * Checks, in order, `event.message`, `event.text`, `event.item.text` and
 * `event.item.message`, returning the first that is a string (else `undefined`).
 */
function extractText(event) {
    const candidates = [event.message, event.text];
    if (isObject(event.item)) {
        candidates.push(event.item.text, event.item.message);
    }
    return candidates.find((candidate) => typeof candidate === 'string');
}
342
/**
 * Best-effort: map a Codex `update_plan`/plan event onto subtask counts.
 *
 * The step list is taken from the first of these that is an array: `event.plan`
 * itself, `event.plan.steps`, `event.item.plan`, `event.steps`. Non-object steps are
 * skipped.
 *
 * @param {object} event - A parsed Codex JSONL event.
 * @returns {{ completed: number, inProgress: number, total: number, items: object[] } | undefined}
 *   Progress counts, or `undefined` when the event carries no usable steps.
 */
function codexPlanProgress(event) {
    // `isObject` is also true for arrays, so an array-valued `event.plan` must be
    // recognised as the step list itself rather than as an object lacking `.steps`.
    const candidates = [
        event.plan,
        isObject(event.plan) ? event.plan.steps : undefined,
        isObject(event.item) ? event.item.plan : undefined,
        event.steps,
    ];
    const steps = candidates.find((candidate) => Array.isArray(candidate));
    if (!steps)
        return undefined;
    const items = steps.filter(isObject).map((s) => ({
        label: typeof s.step === 'string' ? s.step : String(s.step ?? s.content ?? ''),
        status: normalizeStatus(s.status),
    }));
    if (items.length === 0)
        return undefined;
    const completed = items.filter((i) => i.status === 'completed').length;
    const inProgress = items.filter((i) => i.status === 'in_progress').length;
    return { completed, inProgress, total: items.length, items };
}
361
/**
 * Best-effort: pull token usage out of a Codex usage event.
 *
 * Candidate locations are checked most-likely first: the running CUMULATIVE total at
 * `info.total_token_usage` (as reported on `token_count` events), then
 * `total_token_usage`, `usage`, and `info.usage` for older / other shapes. Preferring
 * the cumulative total lets the caller simply overwrite (not sum) — summing cumulative
 * totals across events would multiply-count.
 *
 * NOTE(review): `cached_input_tokens` is added on top of `input_tokens`; confirm
 * against the Codex usage schema that the cached count is not already included in
 * `input_tokens`, otherwise cached tokens are counted twice.
 *
 * Returns `undefined` when no usage object is found or both totals are zero.
 */
function codexUsage(event) {
    const info = isObject(event.info) ? event.info : undefined;
    const candidates = [
        info?.total_token_usage,
        event.total_token_usage,
        event.usage,
        info?.usage,
    ];
    const raw = candidates.find((candidate) => isObject(candidate));
    if (!raw) {
        return undefined;
    }
    const inputTokens = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens);
    const outputTokens = numberOf(raw.output_tokens);
    if (inputTokens !== 0 || outputTokens !== 0) {
        return { inputTokens, outputTokens };
    }
    return undefined;
}
383
/** Return `value` when it is a finite number; otherwise `0`. */
function numberOf(value) {
    if (typeof value !== 'number') {
        return 0;
    }
    return Number.isFinite(value) ? value : 0;
}
386
/**
 * Dispatch to the configured subscription harness runner: `'claude-code'` goes to
 * `runClaudeCode`, every other value goes to `runCodex`.
 */
export function runSubscriptionHarness(harness, opts) {
    if (harness === 'claude-code') {
        return runClaudeCode(opts);
    }
    return runCodex(opts);
}