@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
package/src/job.ts ADDED
@@ -0,0 +1,766 @@
1
+ import type { PiRunStats } from './pi.js'
2
+ import type { HarnessKind } from './pi-workspace.js'
3
+ import type { FailureCause } from './failure.js'
4
+
5
+ // The job the Worker's ContainerAgentExecutor POSTs to /run. Kept as plain
6
+ // types with a hand-rolled validator so the image needs no schema dependency.
7
+ // `ghToken`, `sessionToken` and `subscriptionToken` are secrets: they are
8
+ // consumed (moved into env / git config) and never logged.
9
+
10
/**
 * Per-job auth fields, shared across every job shape. The Pi harness carries the
 * proxy base URL + a model-locked session token; the subscription harnesses
 * (Claude Code / Codex) carry a leased subscription token instead and talk direct
 * to the vendor. `harness` selects which; absent ⇒ Pi.
 *
 * Every field is optional at the type level; which ones are actually required
 * depends on `harness` and is enforced when the job body is parsed.
 */
export interface HarnessAuthFields {
  /** Which harness runs the job. Absent ⇒ the Pi harness. */
  harness?: HarnessKind
  /** Worker LLM proxy base URL, including /v1 (Pi harness only). */
  proxyBaseUrl?: string
  /** Signed, model-locked proxy session token (Pi harness only). */
  sessionToken?: string
  /** Leased subscription credential (Claude Code OAuth token / Codex auth.json). */
  subscriptionToken?: string
  /**
   * Anthropic-compatible base URL for a non-Anthropic Claude-Code vendor (GLM via
   * Z.ai, Kimi via Moonshot). Present ⇒ the claude-code runner points
   * ANTHROPIC_BASE_URL there with ANTHROPIC_AUTH_TOKEN; absent ⇒ Anthropic itself
   * (CLAUDE_CODE_OAUTH_TOKEN against api.anthropic.com).
   */
  subscriptionBaseUrl?: string
  /**
   * Native local execution: the `claude-code` / `codex` CLI runs with the developer's
   * OWN ambient login (`~/.claude` / `~/.codex`) instead of a leased subscription token.
   * Set only by the local native transport; when true `subscriptionToken` is not required.
   */
  ambientAuth?: boolean
}
38
+
39
/**
 * The repository a job operates on. The four required fields must each be a
 * non-empty string in the job body; `provider` and `serviceDirectory` are optional.
 */
export interface RepoSpec {
  /** Repository owner (user / organisation / group) — presumably as the VCS host names it. */
  owner: string
  /** Repository name. */
  name: string
  /** The repo's base branch — NOTE(review): presumably the PR target; confirm against the caller. */
  baseBranch: string
  /** Clone URL. Its host is also the fallback for provider inference when `provider` is absent. */
  cloneUrl: string
  /**
   * The VCS provider the repo lives on, when the dispatcher set it. Selects GitHub-PR vs
   * GitLab-MR for the "open the PR" call AUTHORITATIVELY (rather than guessing from the
   * clone URL host, which can't recognise an arbitrarily-named self-managed GitLab). Absent
   * ⇒ inferred from the clone URL.
   */
  provider?: 'github' | 'gitlab'
  /**
   * For a monorepo service, the subdirectory (relative to the repo root, e.g.
   * `packages/api`) the agent should run within. Sanitised on parse to a safe
   * relative path so it can never escape the checkout. Absent ⇒ run at the repo root.
   */
  serviceDirectory?: string
}
58
+
59
/**
 * The pull request to open once a coding run has pushed changes
 * (see {@link AgentJob.pr}).
 */
export interface PrSpec {
  /** PR title. */
  title: string
  /** PR description body. */
  body: string
}
63
+
64
/**
 * Require a job-body field to be a non-empty string and hand it back unchanged.
 *
 * @param value - The untrusted field value read from the request body.
 * @param path - Dotted field name; used only to build the error message.
 * @returns `value`, narrowed to `string`.
 * @throws Error when `value` is not a string or is the empty string.
 */
function str(value: unknown, path: string): string {
  if (typeof value === 'string' && value !== '') return value
  throw new Error(`Invalid job: '${path}' must be a non-empty string`)
}
70
+
71
/**
 * A positive finite integer, or undefined for any other input (silently ignored).
 *
 * Fractional numbers are floored. The positivity check runs AFTER flooring, so a
 * value in (0, 1) — which floors to 0 — is rejected rather than returned as 0.
 *
 * @param value - The untrusted value read from the job body.
 * @returns The floored value when it is at least 1, otherwise undefined.
 */
function posInt(value: unknown): number | undefined {
  if (typeof value !== 'number' || !Number.isFinite(value)) return undefined
  const floored = Math.floor(value)
  return floored >= 1 ? floored : undefined
}
77
+
78
/**
 * A valid TCP port (1..65535), or undefined for anything else. The backend already validates
 * frontend ports against this range, but the harness re-checks at its untrusted-body boundary:
 * an out-of-range value can never bind, so dropping it falls back to the harness default rather
 * than spawning a server that fails to listen.
 *
 * Fractional numbers are floored before the range check, and BOTH bounds are checked on the
 * floored value, so an input in (0, 1) is rejected instead of yielding port 0.
 *
 * @param value - The untrusted value read from the job body.
 * @returns The floored port when it lies in 1..65535, otherwise undefined.
 */
function port(value: unknown): number | undefined {
  if (typeof value !== 'number' || !Number.isFinite(value)) return undefined
  const candidate = Math.floor(value)
  return candidate >= 1 && candidate <= 65535 ? candidate : undefined
}
88
+
89
/**
 * Read the optional per-job progress-guard overrides out of the job body.
 *
 * Shape validation only: each knob is kept when `posInt` accepts it and dropped
 * otherwise, leaving the run on the env / default for that knob. The loosen-only
 * rule (an override may raise a knob but never tighten it below the base) is NOT
 * enforced here — it is applied later by {@link mergeGuardLimits}, where the
 * override meets the base, so a tighter-than-default value parses fine here and is
 * clamped back up there.
 *
 * @param value - The raw `guardLimits` value from the body.
 * @returns The usable knobs, or undefined when none survived (keeps the job sparse).
 */
function parseGuardLimits(value: unknown): GuardLimitsSpec | undefined {
  if (value === null || typeof value !== 'object') return undefined
  const raw = value as Record<string, unknown>
  const limits: GuardLimitsSpec = {}
  let kept = 0
  for (const knob of [
    'maxToolCallsWithoutEdit',
    'maxConsecutiveErrors',
    'maxConsecutiveWebCalls',
  ] as const) {
    const parsed = posInt(raw[knob])
    if (parsed === undefined) continue
    limits[knob] = parsed
    kept += 1
  }
  return kept > 0 ? limits : undefined
}
110
+
111
/**
 * Extract the shared per-job auth fields, applying the rules of the selected harness.
 *
 * - `claude-code` / `codex`: `subscriptionToken` is mandatory unless `ambientAuth` is
 *   exactly `true` (native run on the developer's own CLI login, so no leased token is
 *   required and none is read). A non-empty string `subscriptionBaseUrl` is carried along.
 * - anything else (`pi`, absent, or an unrecognised value, which is treated as absent):
 *   `proxyBaseUrl` and `sessionToken` are mandatory.
 *
 * @param o - The raw job body.
 * @returns The auth fields relevant to the chosen harness.
 * @throws Error when a field mandatory for the chosen harness is missing or empty.
 */
function parseHarnessAuth(o: Record<string, unknown>): HarnessAuthFields {
  let harness: HarnessKind | undefined
  switch (o.harness) {
    case 'claude-code':
    case 'codex':
    case 'pi':
      harness = o.harness as HarnessKind
      break
    default:
      harness = undefined
  }

  const usesSubscription = harness === 'claude-code' || harness === 'codex'
  if (!usesSubscription) {
    return {
      harness,
      proxyBaseUrl: str(o.proxyBaseUrl, 'proxyBaseUrl'),
      sessionToken: str(o.sessionToken, 'sessionToken'),
    }
  }

  const auth: HarnessAuthFields = { harness }
  if (o.ambientAuth === true) {
    auth.ambientAuth = true
  } else {
    auth.subscriptionToken = str(o.subscriptionToken, 'subscriptionToken')
  }
  if (typeof o.subscriptionBaseUrl === 'string' && o.subscriptionBaseUrl) {
    auth.subscriptionBaseUrl = o.subscriptionBaseUrl
  }
  return auth
}
141
+
142
/**
 * Coerce a body-supplied monorepo service directory into a SAFE relative path, or
 * undefined when absent/empty.
 *
 * Steps: trim, turn backslashes into `/`, strip leading/trailing slashes (so a
 * POSIX-absolute input such as `/packages/api` is re-rooted inside the checkout, not
 * rejected), then drop empty and `.` segments.
 *
 * The agent's cwd is built from the result, so a hostile value must never point outside
 * the cloned repo. The following are rejected outright:
 * - any `..` segment;
 * - a Windows drive-qualified path (`C:\x`, `C:x`), which stripping slashes cannot make
 *   relative;
 * - an embedded NUL, which no filesystem path may contain — rejecting it here gives a
 *   clear "Invalid job" error at the boundary instead of a later fs/spawn failure.
 *
 * @param value - The raw `repo.serviceDirectory` value from the body.
 * @returns The normalised `/`-separated relative path, or undefined when not a string
 *   or when nothing remains after normalisation.
 * @throws Error when the path could escape the checkout or contains a NUL.
 */
function sanitizeServiceDirectory(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined
  const normalized = value
    .trim()
    .replace(/\\/g, '/')
    .replace(/^\/+|\/+$/g, '')
  if (!normalized) return undefined
  const segments = normalized.split('/').filter((s) => s !== '' && s !== '.')
  if (segments.length === 0) return undefined
  const unsafe =
    normalized.includes('\0') ||
    // Only the first segment can carry a drive qualifier.
    /^[A-Za-z]:/.test(segments[0] ?? '') ||
    segments.some((s) => s === '..')
  if (unsafe) {
    throw new Error("Invalid job: 'repo.serviceDirectory' must be a path inside the repo")
  }
  return segments.join('/')
}
163
+
164
/**
 * Build the shared {@link RepoSpec} from the raw `repo` object.
 *
 * `owner`, `name`, `baseBranch` and `cloneUrl` must be non-empty strings. `provider` and
 * `serviceDirectory` are optional and only present on the result when they parsed to a
 * value.
 *
 * @param repo - The raw `repo` object from the job body.
 * @throws Error when a required field is missing/empty, `provider` is unrecognised, or
 *   `serviceDirectory` is rejected by its sanitiser.
 */
function parseRepoSpec(repo: Record<string, unknown>): RepoSpec {
  const owner = str(repo.owner, 'repo.owner')
  const name = str(repo.name, 'repo.name')
  const baseBranch = str(repo.baseBranch, 'repo.baseBranch')
  const cloneUrl = str(repo.cloneUrl, 'repo.cloneUrl')
  const provider = parseVcsProvider(repo.provider)
  const serviceDirectory = sanitizeServiceDirectory(repo.serviceDirectory)
  return {
    owner,
    name,
    baseBranch,
    cloneUrl,
    ...(provider ? { provider } : {}),
    ...(serviceDirectory ? { serviceDirectory } : {}),
  }
}
178
+
179
/**
 * Parse the optional `repo.provider` discriminator.
 *
 * @param value - The raw `repo.provider` value from the body.
 * @returns `'github'` / `'gitlab'` when given exactly that; undefined when the value is
 *   `undefined` or `null` (⇒ the provider is inferred from the clone URL host).
 * @throws Error for any other value, including other strings.
 */
function parseVcsProvider(value: unknown): 'github' | 'gitlab' | undefined {
  if (value == null) return undefined
  if (value !== 'github' && value !== 'gitlab') {
    throw new Error("Invalid job: 'repo.provider' must be 'github' or 'gitlab'")
  }
  return value
}
185
+
186
+ // ---- Host allowlist -------------------------------------------------------
187
+ // The short-lived GitHub installation token is sent (a) to the clone/push remote
188
+ // over HTTPS and (b) to the REST API base. A body-supplied URL pointing at an
189
+ // attacker-named host would exfiltrate that token, so every such URL's host is
190
+ // checked against an allowlist before use. Defaults to github.com /
191
+ // api.github.com; a GitHub Enterprise deployment can add its host via env.
192
+
193
/**
 * Hosts the harness is willing to send the installation token to.
 *
 * Always contains `github.com` and `api.github.com`. On top of those it adds
 * `GITHUB_ENTERPRISE_HOST` (a single host) and every entry of the comma-separated
 * `GITHUB_ALLOWED_HOSTS` (an optional extra allowlist for tests / bespoke deployments).
 * Each configured entry is trimmed and lower-cased; blank entries are skipped.
 *
 * @param env - Environment to read the two variables from. Defaults to `process.env`.
 * @returns A fresh set of lower-cased host names.
 */
export function allowedGithubHosts(env: NodeJS.ProcessEnv = process.env): Set<string> {
  const configured = [
    env.GITHUB_ENTERPRISE_HOST ?? '',
    ...(env.GITHUB_ALLOWED_HOSTS ?? '').split(','),
  ]
  const hosts = new Set(['github.com', 'api.github.com'])
  for (const entry of configured) {
    const host = entry.trim().toLowerCase()
    if (host !== '') hosts.add(host)
  }
  return hosts
}
205
+
206
/**
 * Reject a URL whose host isn't an allowed GitHub host.
 *
 * `file:` URLs are local clone sources (no token leaves the box) and always pass. Any
 * other URL must use `https:` or `http:` and its lower-cased hostname must be in
 * {@link allowedGithubHosts}.
 *
 * NOTE(review): the unsupported-protocol message says "https or file" although `http:`
 * is accepted as well; the text is left as-is because callers may match on it.
 *
 * @param rawUrl - The body-supplied URL to check.
 * @param path - Dotted field name; used only in error messages.
 * @param env - Environment the allowlist is read from. Defaults to `process.env`.
 * @throws Error when the URL is unparseable, uses another protocol, or names a host
 *   outside the allowlist.
 */
function assertAllowedHost(
  rawUrl: string,
  path: string,
  env: NodeJS.ProcessEnv = process.env,
): void {
  const parsed = ((): URL => {
    try {
      return new URL(rawUrl)
    } catch {
      throw new Error(`Invalid job: '${path}' must be a valid URL`)
    }
  })()

  switch (parsed.protocol) {
    case 'file:':
      return
    case 'https:':
    case 'http:':
      break
    default:
      throw new Error(`Invalid job: '${path}' must be an https or file URL`)
  }

  const host = parsed.hostname.toLowerCase()
  if (allowedGithubHosts(env).has(host)) return
  throw new Error(`Invalid job: '${path}' host '${host}' is not an allowed GitHub host`)
}
231
+
232
+ // ---- Shared repo-bootstrap target ---------------------------------------
233
+
234
/**
 * The new repository a repo-bootstrap run force-pushes its fresh history to.
 * All four fields must be non-empty strings in the job body.
 */
export interface BootstrapTargetSpec {
  /** Owner of the target repository. */
  owner: string
  /** Name of the target repository. */
  name: string
  /** Clone URL of the target repository — presumably the push remote; confirm against the pusher. */
  cloneUrl: string
  /** The target repo's default branch, which receives the force-pushed history. */
  defaultBranch: string
}
241
+
242
+ // ---- Generic agent job (POST /jobs, kind=agent) ---------------------------
243
+ //
244
+ // The single, manifest-driven kind that subsumes the bespoke per-kind handlers. The
245
+ // backend decides WHAT the agent does (read-only explore vs edit-and-push coding) and
246
+ // passes it as data; the harness stays a generic LLM-over-a-checkout runner with no
247
+ // per-agent-kind code. Mechanical work (rendering artifact files, opening structured
248
+ // results onto the board) is the backend's job — done before/after this run via the
249
+ // RepoFiles port — never here.
250
+
251
/**
 * How the generic agent job runs:
 * - `explore` — read-only exploration;
 * - `coding` — edit-and-push coding;
 * - `preview` — build + serve a frontend and keep it running; no agent runs
 *   (see {@link AgentJob}).
 */
export type AgentMode = 'explore' | 'coding' | 'preview'
253
+
254
/**
 * Explore mode: how a container agent stands its dependencies up before the run (the
 * tester). Two shapes, discriminated by `kind` (absent ⇒ `service`, the backend tester):
 * - `service` — a backend service under test: `local` brings the service's
 * docker-compose infra up on localhost for the run; `ephemeral` is a no-op stand-up
 * (the env is already deployed and its URL reaches the agent through its prompt).
 * - `frontend` — a frontend app under test (the self-contained UI-test flow): build the
 * frontend, stand WireMock up for its mocked upstreams, serve the built app, and point
 * the (`tester-ui`) agent at it. Everything runs as localhost PROCESSES in the one
 * container (no Docker-in-Docker), so it works on Cloudflare + Apple `container` too.
 * Absent ⇒ the harness manages no infra.
 *
 * The `frontend` shape is also what a preview-mode job carries (see {@link AgentJob.infra}).
 */
export type AgentInfraSpec = ServiceInfraSpec | FrontendInfraSpec
267
+
268
/**
 * Backend-service tester infra (docker-compose local, or a deployed ephemeral env).
 * This is the variant of {@link AgentInfraSpec} selected when `kind` is not `'frontend'`.
 */
export interface ServiceInfraSpec {
  /** Discriminant. Absent ⇒ `service` (the backend tester). */
  kind?: 'service'
  /**
   * `local` stands infra up via docker-compose; `ephemeral` tests a deployed env.
   * Required: a body whose `environment` is neither value yields no infra spec at all.
   */
  environment: 'local' | 'ephemeral'
  /** Local mode: the service declared no infra dependencies (spin nothing up). */
  noInfraDependencies?: boolean
  /** Local mode: repo-relative docker-compose path to stand the dependencies up. */
  composePath?: string
  /** Ephemeral mode: the provisioned environment URL (echoed for context only). */
  environmentUrl?: string
}
281
+
282
/**
 * Frontend UI-test infra (the self-contained `tester-ui` flow). The backend has already
 * resolved every backend upstream to a concrete URL — the bound service's live ephemeral
 * env URL for the service under test, `http://localhost:<wiremockPort>` for every mocked
 * upstream — and handed them here as {@link env}. The harness installs, builds (injecting
 * `env` at build time, or writing a `window.env` shim for runtime injection), stands
 * WireMock up on {@link wiremockPort} seeded from {@link wiremockMappingsPath}, serves the
 * built app on {@link servePort}, health-checks it, and tells the agent the serve URL.
 *
 * Only `kind` is required; every other knob has a documented default.
 */
export interface FrontendInfraSpec {
  /** Discriminant selecting this variant of the infra union. */
  kind: 'frontend'
  /** Package manager for install/build. Default `pnpm`. */
  packageManager?: 'pnpm' | 'npm' | 'yarn'
  /** Explicit install command, overriding the one derived from `packageManager`. */
  install?: string
  /** package.json script that produces the built app. Default `build`. */
  buildScript?: string
  /** The build's output directory, served in `static` mode. Default `dist`. */
  outputDir?: string
  /** How the built app is served: static server of `outputDir`, or run `serveScript`. */
  serveMode?: 'static' | 'command'
  /** package.json script to run when `serveMode: 'command'` (e.g. `preview`). */
  serveScript?: string
  /**
   * The port the served app listens on inside the container. Default 4173.
   * NOTE(review): presumably limited to 1..65535 on parse — confirm in the parser.
   */
  servePort?: number
  /** Build-time env vars vs a runtime `window.env` shim. Default `build`. */
  envInjection?: 'build' | 'runtime'
  /** Resolved backend upstream env vars (name → URL) to inject. Empty names filtered out. */
  env?: Record<string, string>
  /** The WireMock mappings directory in the FE repo. Default `mocks/`. */
  wiremockMappingsPath?: string
  /**
   * The port WireMock listens on inside the container. Default 8089.
   * NOTE(review): presumably limited to 1..65535 on parse — confirm in the parser.
   */
  wiremockPort?: number
}
316
+
317
/**
 * Coding mode (repo bootstrap): the divergent push of a bootstrap run. Instead of pushing
 * a work branch on the cloned repo, the agent's result is force-pushed as a fresh
 * single-commit history to a SEPARATE, pre-created target repository's default branch.
 * Clone-and-adapt: `job.repo` is the reference architecture to clone + adapt, `target` is
 * the new repo. From-scratch (`fromScratch`): start from an empty directory (the agent
 * scaffolds), `job.repo` is unused as a clone source. Absent ⇒ the ordinary coding flow.
 */
export interface AgentBootstrapSpec {
  /**
   * The new repository the bootstrapped contents are pushed to (the push target).
   * Each of its fields must be a non-empty string in the job body.
   */
  target: BootstrapTargetSpec
  /**
   * Scaffold from an empty directory instead of cloning `job.repo` (no reference).
   * Only a literal `true` in the body sets this; anything else leaves it absent.
   */
  fromScratch?: boolean
}
331
+
332
/**
 * A linked-context file the backend prepared (requirements / RFC / PRD / tracker issue)
 * for the harness to materialise under CONTEXT_DIR in the checkout, so the agent can read
 * it on demand. The harness can't reach Jira/GitHub itself, so all such context is fetched
 * and shipped here up front. `path` is sanitised to a safe basename on parse.
 */
export interface ContextFileSpec {
  /**
   * File name within CONTEXT_DIR. After parsing it is a bare basename made only of
   * `[A-Za-z0-9._-]` that does not start with a dot, and is unique within the job.
   */
  path: string
  /** Human-readable title. Falls back to `path` when the body gave no string title. */
  title: string
  /** Source URL of the document. Empty string when the body gave no string url. */
  url: string
  /** The file's text. Entries whose content is not a string are dropped on parse. */
  content: string
}
344
+
345
/**
 * How an explore agent's reply is consumed. When a job carries no output spec at all,
 * the reply is treated as prose (see {@link AgentJob.output}).
 */
export interface AgentOutputSpec {
  /** `prose` keeps the reply text; `structured` parses (and optionally repairs) it to JSON. */
  kind: 'prose' | 'structured'
  /** Compact shape description fed to the one-shot structured-output repair call. */
  shapeHint?: string
  /** Whether to attempt the one-shot repair on a malformed reply (structured only). */
  repair?: boolean
  /**
   * Fail the run LOUDLY when the FINAL answer is unusable (cut off at the output ceiling,
   * or empty) instead of repairing it — opt-in for kinds whose JSON deliverable is handed
   * onward to be parsed/committed (e.g. the spec-writer). Absent ⇒ off.
   */
  failOnUnusableFinal?: boolean
}
360
+
361
/**
 * The generic agent job. `mode` selects the flow; the remaining fields are the union
 * the flows need. Explore: clone `branch`, run read-only, return prose (or a parsed
 * `custom` JSON object when `output.kind==='structured'`). Coding: clone `branch` (or
 * resume `newBranch`), run, commit + push to `pushBranch`, and open `pr` when one is set
 * and the run produced changes. Preview (local/node only): clone `branch`, build + serve
 * the frontend (`infra.kind==='frontend'`) with its other upstreams mocked and KEEP IT
 * RUNNING — no agent runs and the serve is deliberately not torn down when the job returns
 * (see {@link AgentResult.preview}).
 */
export interface AgentJob extends HarnessAuthFields {
  /** Identifier of this job — presumably assigned by the dispatching backend; confirm. */
  jobId: string
  /** Which flow to run: explore, coding or preview. */
  mode: AgentMode
  /** System prompt handed to the agent. */
  systemPrompt: string
  /** User prompt handed to the agent. */
  userPrompt: string
  /**
   * Model identifier for the run. NOTE(review): the Pi session token is described as
   * model-locked, so this presumably has to agree with it — confirm.
   */
  model: string
  /** Short-lived GitHub installation token. A secret: consumed, never logged. */
  ghToken: string
  /** The repository the job operates on. */
  repo: RepoSpec
  /** The branch to clone (the backend resolves base/pr/work to a concrete name). */
  branch: string
  /**
   * REST API base URL override. NOTE(review): presumably one of the body-supplied URLs
   * whose host is checked against the GitHub host allowlist before the token is sent
   * there — confirm in the job parser.
   */
  githubApiBase?: string
  /** NOTE(review): presumably extra prompt guidance about web tools — confirm against the runner. */
  webToolsGuidance?: string
  /** NOTE(review): presumably toggles web search for the agent — confirm against the runner. */
  webSearch?: boolean
  /** Full-history clone (needed to diff against / merge the base). Default shallow. */
  full?: boolean
  /**
   * Coding mode (conflict-resolver): merge `origin/<mergeBase>` into the cloned PR branch
   * to surface the Git conflicts, run the agent to resolve them, then complete the merge
   * commit and push back onto the SAME branch (no new branch / PR). Requires `full` so the
   * merge base + `origin/<mergeBase>` are present. Absent ⇒ the ordinary coding flow.
   */
  mergeBase?: string
  /**
   * Coding mode (repo bootstrap): force-push the agent's output as a fresh single-commit
   * history to a separate, pre-created target repo (clone + adapt `repo`, or scaffold from
   * scratch). Absent ⇒ the ordinary clone-edit-push-on-the-same-repo coding flow.
   */
  bootstrap?: AgentBootstrapSpec
  /** Explore mode: how to consume the reply. Absent ⇒ prose. */
  output?: AgentOutputSpec
  /**
   * Linked-context files to materialise under CONTEXT_DIR before the run (both modes).
   * The agent reads them on demand; they are kept out of any commit. Absent ⇒ none.
   */
  contextFiles?: ContextFileSpec[]
  /**
   * Explore mode: stand the service's dependencies up before the agent runs (the
   * tester). Brings the docker-compose infra up on localhost for the duration of the
   * run and tears it down afterward; a stand-up failure is non-fatal (surfaced to the
   * agent as a note). The agent makes no commits regardless. Absent ⇒ no infra managed.
   *
   * Preview mode: REQUIRED and must be the `frontend` variant — it is the whole job (build
   * + serve + WireMock, kept alive). No agent runs and, unlike the tester, the stand-up is
   * NOT torn down when the job returns.
   */
  infra?: AgentInfraSpec
  /** Coding mode: a fresh branch to create off the clone before running (else work on `branch`). */
  newBranch?: string
  /** Coding mode: branch the produced change is pushed to (defaults to `newBranch ?? branch`). */
  pushBranch?: string
  /** Coding mode: commit message for any work the agent left uncommitted. */
  commitMessage?: string
  /** Coding mode: open this PR when the run pushed changes. Absent ⇒ push only, no PR. */
  pr?: PrSpec
  /**
   * Coding mode: whether a no-op run (nothing changed) is a failure. The implementer
   * fails on a no-op; the in-place fixers (ci-fix / fix-tests) treat it as a non-fatal
   * no-op. Default true.
   */
  noChangesIsError?: boolean
  /**
   * Reuse a STABLE per-repo checkout (clean-sweep + fetch + switch branch) instead of a
   * fresh clone into a throwaway temp dir. Set ONLY by the local warm-pool transport,
   * whose containers are reused across runs; absent everywhere else, so every other
   * runtime keeps the ephemeral fresh-clone behaviour. The explore + ordinary coding
   * flows honour it; bootstrap (resets `.git`) and conflict-resolution (needs full
   * multi-branch state) always run ephemeral regardless.
   */
  persistentCheckout?: boolean
  /**
   * Coding mode (implementer): tail the Coder's follow-up sentinel file and stream the
   * forward-looking items it surfaces (loose ends / side-tasks / questions) out on the job
   * view, so the backend lifts them onto the run's step (the Follow-up companion). Set only
   * for the `coder` dispatch when the companion is enabled. Absent ⇒ no follow-up streaming.
   */
  streamFollowUps?: boolean
  /**
   * Per-job overrides for the anti-rabbithole progress guard, set by the backend per
   * AGENT KIND (a read-heavy kind tolerates more web/exploration before it counts as a
   * stall). Each knob is optional and falls back to the env / built-in default
   * ({@link progressGuardLimitsFromEnv}); only the knobs present here override. These are
   * loosen-only: `mergeGuardLimits` clamps each override up to the base, so a value
   * tighter than the default is ignored and a legitimately-progressing run is never
   * killed for a kind's normal working pattern. Absent ⇒ env/default for all knobs.
   */
  guardLimits?: GuardLimitsSpec
}
458
+
459
/**
 * Per-job, per-knob progress-guard overrides (see {@link AgentJob.guardLimits}).
 * A knob present here is a positive integer after parsing; a malformed knob is dropped.
 */
export interface GuardLimitsSpec {
  /** NOTE(review): presumably the tool calls tolerated without an edit before the guard trips — confirm. */
  maxToolCallsWithoutEdit?: number
  /** NOTE(review): presumably the consecutive errors tolerated before the guard trips — confirm. */
  maxConsecutiveErrors?: number
  /** NOTE(review): presumably the consecutive web calls tolerated before the guard trips — confirm. */
  maxConsecutiveWebCalls?: number
}
465
+
466
/**
 * The record of standing the service's docker-compose dependencies up before a tester
 * run (explore mode, `infra.environment === 'local'`). The compose stand-up happens
 * INSIDE the container, so its output never reaches the orchestrator's provisioning-log
 * store (which records only the backend-side container/env spin-up); this carries the
 * captured (redacted + bounded) command output back structurally so the test window can
 * show WHY the dependencies failed to come up — previously this was trapped in the
 * harness's own logs. Absent for ephemeral / no-infra / no-compose-path runs.
 *
 * `started` and `at` are always present; the remaining fields are optional.
 */
export interface InfraSetupRecord {
  /** Whether `docker compose up --wait` succeeded (the dependencies are up). */
  started: boolean
  /** The repo-relative compose file that was stood up. */
  composePath?: string
  /** Epoch ms the stand-up attempt finished. */
  at: number
  /** Wall-clock of the stand-up attempt, ms. */
  durationMs?: number
  /** Captured (redacted, tail-bounded) stdout+stderr of the stand-up command. */
  logs?: string
  /** The verbatim (redacted) failure message when stand-up failed, else absent. */
  error?: string
}
489
+
490
/**
 * The generic agent response. `custom` carries a structured explore result.
 * Every field is optional; which ones are set depends on the job's mode and outcome.
 */
export interface AgentResult {
  /** NOTE(review): presumably the agent's prose reply / run summary — confirm against the producer. */
  summary?: string
  /** Run statistics reported by the harness run (shape defined by `PiRunStats`). */
  stats?: PiRunStats
  /** Structured explore output (the parsed JSON object) when `output.kind==='structured'`. */
  custom?: unknown
  /**
   * The tester's docker-compose stand-up record (explore mode, local infra). Carried back
   * so the backend can surface the in-container dependency stand-up logs on the Tester step
   * — the failure-class artifact the orchestrator-side provisioning logs can't capture.
   */
  infraSetup?: InfraSetupRecord
  /**
   * Preview mode: the in-container URL the built app is served at (e.g. `http://localhost:4173`).
   * This is NOT host-reachable on its own — the container runtime publishes the serve port to an
   * ephemeral host port and the backend forms the browsable URL from that; this is echoed for
   * logging/context. Present only on a successful preview stand-up.
   */
  preview?: { url: string }
  /** Coding mode: whether a change was pushed. */
  pushed?: boolean
  /** NOTE(review): presumably the URL of the PR opened for the pushed change — confirm. */
  prUrl?: string
  /** NOTE(review): presumably the branch the change was pushed to — confirm. */
  branch?: string
  /** Coding mode (bootstrap): the default branch the bootstrapped contents were pushed to. */
  defaultBranch?: string
  /** Failure message; `failureCause` may accompany it with a structured classification. */
  error?: string
  /**
   * The structured failure cause set alongside `error` on a clean-exit failure (no usable
   * output, no changes to push, unresolved conflicts, …). The job registry copies it onto
   * the job view so the backend classifies the failure without regex. See {@link FailureCause}.
   */
  failureCause?: FailureCause
  /** Token counts for the run — NOTE(review): presumably aggregate LLM usage; confirm. */
  usage?: { inputTokens: number; outputTokens: number }
}
524
+
525
/**
 * Parse the coding-mode bootstrap spec.
 *
 * @param value - The raw `bootstrap` value from the body.
 * @returns undefined when `value` is not an object (the ordinary coding flow); otherwise
 *   the validated target plus `fromScratch` when the body set it to exactly `true`.
 * @throws Error when any `target` field is missing or empty. A missing or non-object
 *   `target` is read as an empty object, so it fails on `owner` first.
 */
function parseAgentBootstrapSpec(value: unknown): AgentBootstrapSpec | undefined {
  if (value === null || typeof value !== 'object') return undefined
  const body = value as Record<string, unknown>
  const rawTarget = body.target
  const fields: Record<string, unknown> =
    typeof rawTarget === 'object' && rawTarget !== null
      ? (rawTarget as Record<string, unknown>)
      : {}

  const spec: AgentBootstrapSpec = {
    target: {
      owner: str(fields.owner, 'bootstrap.target.owner'),
      name: str(fields.name, 'bootstrap.target.name'),
      cloneUrl: str(fields.cloneUrl, 'bootstrap.target.cloneUrl'),
      defaultBranch: str(fields.defaultBranch, 'bootstrap.target.defaultBranch'),
    },
  }
  if (body.fromScratch === true) spec.fromScratch = true
  return spec
}
544
+
545
/**
 * Reduce a body-supplied context filename to a safe basename within CONTEXT_DIR.
 *
 * Keeps only what follows the last `/` or `\`, then removes every character outside
 * `[A-Za-z0-9._-]`. A result that is empty or begins with a dot (which covers `.`, `..`
 * and dotfiles) is refused, so a hostile value can't escape the directory or clobber
 * repo files.
 *
 * @param value - The raw `path` of a context-file entry.
 * @returns The cleaned basename, or undefined when `value` is not a string or nothing
 *   safe remains.
 */
function sanitizeContextFileName(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined
  const unified = value.replace(/\\/g, '/')
  const basename = unified.slice(unified.lastIndexOf('/') + 1)
  const safe = basename.replace(/[^A-Za-z0-9._-]/g, '')
  if (safe === '' || safe.startsWith('.')) return undefined
  return safe
}
557
+
558
/**
 * Parse the linked-context files, silently dropping any malformed or unsafe entry.
 *
 * An entry is kept only when it is an object, its `path` sanitises to a basename not
 * already used by an earlier entry, and its `content` is a string. A non-string `title`
 * falls back to the sanitised path; a non-string `url` becomes the empty string.
 *
 * @param value - The raw `contextFiles` value from the body.
 * @returns The accepted files in input order; an empty array when `value` is not an array.
 */
function parseContextFiles(value: unknown): ContextFileSpec[] {
  if (!Array.isArray(value)) return []
  const seen = new Set<string>()
  const accepted: ContextFileSpec[] = []
  for (const candidate of value as unknown[]) {
    if (candidate === null || typeof candidate !== 'object') continue
    const fields = candidate as Record<string, unknown>
    const path = sanitizeContextFileName(fields.path)
    if (!path || seen.has(path)) continue
    const content = fields.content
    if (typeof content !== 'string') continue
    seen.add(path)
    const title = typeof fields.title === 'string' ? fields.title : path
    const url = typeof fields.url === 'string' ? fields.url : ''
    accepted.push({ path, title, url, content })
  }
  return accepted
}
579
+
580
/**
 * Parse the infra stand-up spec.
 *
 * `kind: 'frontend'` is delegated to the frontend parser. Anything else is read as the
 * service variant, which needs `environment` to be `'local'` or `'ephemeral'`; the
 * optional knobs are copied only when well-formed (`noInfraDependencies` exactly `true`,
 * `composePath` / `environmentUrl` non-empty strings). The returned service spec carries
 * no `kind` key.
 *
 * @param value - The raw `infra` value from the body.
 * @returns The parsed spec, or undefined when absent or unrecognised.
 */
function parseAgentInfraSpec(value: unknown): AgentInfraSpec | undefined {
  if (value === null || typeof value !== 'object') return undefined
  const raw = value as Record<string, unknown>
  if (raw.kind === 'frontend') return parseFrontendInfraSpec(raw)

  const environment = raw.environment
  if (environment !== 'local' && environment !== 'ephemeral') return undefined

  const spec: ServiceInfraSpec = { environment }
  if (raw.noInfraDependencies === true) spec.noInfraDependencies = true
  if (typeof raw.composePath === 'string' && raw.composePath) {
    spec.composePath = raw.composePath
  }
  if (typeof raw.environmentUrl === 'string' && raw.environmentUrl) {
    spec.environmentUrl = raw.environmentUrl
  }
  return spec
}
597
+
598
/**
 * Exact env-var names that are never injected from a frontend binding. The bindings are
 * spread over `process.env` at build time, so any of these would break the toolchain (or
 * enable code execution / cert overrides) instead of naming an upstream URL.
 *
 * Matching is exact and case-sensitive (Linux env is case-sensitive). Whole families such
 * as `npm_config_*` and `GIT_*` are handled separately by {@link RESERVED_ENV_PREFIXES}.
 */
const RESERVED_ENV_NAMES = new Set<string>([
  // Process basics.
  'PATH',
  'HOME',
  // Node.js runtime.
  'NODE_OPTIONS',
  'NODE_PATH',
  'NODE_EXTRA_CA_CERTS',
  // Dynamic linker.
  'LD_PRELOAD',
  'LD_LIBRARY_PATH',
  // Shell.
  'BASH_ENV',
  'ENV',
  'SHELL',
  'IFS',
])
617
+
618
+ /**
619
+ * Env-var name PREFIXES never injected from a frontend binding. `npm_config_*` reconfigures the
620
+ * package manager (registry, scripts, prefix), and `GIT_*` reconfigures git — both run during a
621
+ * frontend install/build, so a binding in either family is toolchain control, not an upstream URL.
622
+ * Compared case-INSENSITIVELY (lower-cased here, matched lower-cased below): npm reads its config
623
+ * env with a case-insensitive `/^npm_config_/i`, so `NPM_CONFIG_REGISTRY` is honoured just like
624
+ * `npm_config_registry` — a case-sensitive prefix match would let the upper-cased form slip through.
625
+ */
626
+ const RESERVED_ENV_PREFIXES = ['npm_config_', 'git_']
627
+
628
+ /**
629
+ * Whether an env-var name is reserved (an exact name, or a reserved family prefix). The exact
630
+ * names are canonical upper-case env vars matched verbatim (Linux env is case-sensitive, so a
631
+ * distinct lower-cased `home` is a different, harmless var); the family PREFIXES are matched
632
+ * case-insensitively because npm interprets `npm_config_*` regardless of case (see above).
633
+ */
634
+ function isReservedEnvName(key: string): boolean {
635
+ if (RESERVED_ENV_NAMES.has(key)) return true
636
+ const lower = key.toLowerCase()
637
+ return RESERVED_ENV_PREFIXES.some((p) => lower.startsWith(p))
638
+ }
639
+
640
/**
 * Narrow an untrusted `kind: 'frontend'` infra object into a {@link FrontendInfraSpec}.
 *
 * Every knob is optional. An enum-like knob survives only when it holds one of its known
 * values, a string knob only when it is a non-empty string, and a port only when `port()`
 * yields a value. Anything else is left off the result instead of being reported as an error.
 *
 * @param o - The raw infra object (already known to be a non-null object).
 * @returns A spec that always carries `kind: 'frontend'` plus whichever knobs were valid.
 */
function parseFrontendInfraSpec(o: Record<string, unknown>): FrontendInfraSpec {
  /** A non-empty string, or undefined for anything else (including `''`). */
  const text = (v: unknown): string | undefined => (typeof v === 'string' && v ? v : undefined)

  const packageManager = (['pnpm', 'npm', 'yarn'] as const).find((pm) => pm === o.packageManager)
  const serveMode = (['static', 'command'] as const).find((m) => m === o.serveMode)
  const envInjection = (['build', 'runtime'] as const).find((m) => m === o.envInjection)

  // Keep only string→string bindings: a non-string value is dropped so a malformed binding
  // can't surface as `[object Object]` (or undefined) in place of an upstream URL. Reserved
  // names (PATH, NODE_OPTIONS, LD_PRELOAD, …) are dropped as well — the bindings are spread
  // over `process.env` at build time, so a binding called `PATH` would overwrite it with a
  // URL and the build could no longer locate its tools.
  const env: Record<string, string> = {}
  const rawEnv = o.env
  if (rawEnv !== null && typeof rawEnv === 'object') {
    for (const [name, binding] of Object.entries(rawEnv as Record<string, unknown>)) {
      if (!name || isReservedEnvName(name) || typeof binding !== 'string') continue
      env[name] = binding
    }
  }

  const servePort = port(o.servePort)
  const wiremockPort = port(o.wiremockPort)

  const install = text(o.install)
  const buildScript = text(o.buildScript)
  const outputDir = text(o.outputDir)
  const serveScript = text(o.serveScript)
  const wiremockMappingsPath = text(o.wiremockMappingsPath)

  // Absent knobs are omitted (not set to undefined), so the result only has keys that
  // were actually supplied.
  return {
    kind: 'frontend',
    ...(packageManager ? { packageManager } : {}),
    ...(install !== undefined ? { install } : {}),
    ...(buildScript !== undefined ? { buildScript } : {}),
    ...(outputDir !== undefined ? { outputDir } : {}),
    ...(serveMode ? { serveMode } : {}),
    ...(serveScript !== undefined ? { serveScript } : {}),
    ...(servePort !== undefined ? { servePort } : {}),
    ...(envInjection ? { envInjection } : {}),
    ...(Object.keys(env).length ? { env } : {}),
    ...(wiremockMappingsPath !== undefined ? { wiremockMappingsPath } : {}),
    ...(wiremockPort !== undefined ? { wiremockPort } : {}),
  }
}
679
+
680
/**
 * Validate an untrusted request body and narrow it into an {@link AgentJob}.
 *
 * Required fields (`jobId`, `ghToken`, `branch`, the repo spec and, outside preview mode,
 * `systemPrompt` / `userPrompt` / `model`) go through the validating helpers. Optional
 * fields are copied only when they have the expected type and are otherwise left off the job.
 * Finally the clone/API hosts the job points at are checked with `assertAllowedHost`.
 *
 * @param input - The raw, untrusted job body.
 * @returns The narrowed job.
 * @throws Error when the body is not an object or `mode` is unrecognised; errors raised by
 *   the validating helpers and host checks propagate unchanged.
 */
export function parseAgentJob(input: unknown): AgentJob {
  if (input === null || typeof input !== 'object') {
    throw new Error('Invalid job: body must be an object')
  }
  const o = input as Record<string, unknown>

  const modes = ['coding', 'explore', 'preview'] as const
  const mode = modes.find((candidate) => candidate === o.mode)
  if (!mode) throw new Error("Invalid job: 'mode' must be 'explore', 'coding' or 'preview'")

  /** The value as a plain record when it is a non-null object, else undefined. */
  const asRecord = (v: unknown): Record<string, unknown> | undefined =>
    typeof v === 'object' && v !== null ? (v as Record<string, unknown>) : undefined
  /** A non-empty string, or undefined for anything else (including `''`). */
  const nonEmpty = (v: unknown): string | undefined => (typeof v === 'string' && v ? v : undefined)

  // Preview runs NO agent (it only builds + serves the frontend), so the agent-only fields
  // (system/user prompt, model) are unused there: a missing one is accepted as '' instead of
  // forcing the preview dispatch to send dummy values. Every other mode still validates
  // them through `str`.
  const agentField = (value: unknown, path: string): string => {
    if (mode !== 'preview') return str(value, path)
    return typeof value === 'string' ? value : ''
  }

  const repo = (o.repo ?? {}) as Record<string, unknown>

  const outputSrc = asRecord(o.output)
  let output: AgentOutputSpec | undefined
  if (outputSrc) {
    output = { kind: outputSrc.kind === 'structured' ? 'structured' : 'prose' }
    if (typeof outputSrc.shapeHint === 'string') output.shapeHint = outputSrc.shapeHint
    // An explicit `repair: false` must survive: the handler defaults to repair-on when the
    // field is absent (it keys off `output.repair === false`), so dropping `false` would
    // silently re-enable the repair call for a kind that opted out.
    if (typeof outputSrc.repair === 'boolean') output.repair = outputSrc.repair
    // Likewise carry the opt-in truncation gate (document producers set it); dropping it
    // would silently re-enable laundering a cut-off reply into a half-baked doc.
    if (outputSrc.failOnUnusableFinal === true) output.failOnUnusableFinal = true
  }

  // When a `pr` object is present its title is mandatory; the body defaults to ''.
  const prSrc = asRecord(o.pr)
  const pr = prSrc
    ? { title: str(prSrc.title, 'pr.title'), body: typeof prSrc.body === 'string' ? prSrc.body : '' }
    : undefined

  const infra = parseAgentInfraSpec(o.infra)
  const bootstrap = parseAgentBootstrapSpec(o.bootstrap)
  const contextFiles = parseContextFiles(o.contextFiles)
  const guardLimits = parseGuardLimits(o.guardLimits)

  const mergeBase = nonEmpty(o.mergeBase)
  const newBranch = nonEmpty(o.newBranch)
  const pushBranch = nonEmpty(o.pushBranch)
  const commitMessage = nonEmpty(o.commitMessage)

  // The required fields are validated in the order written here, so the first invalid one
  // determines the error that is raised.
  const job: AgentJob = {
    jobId: str(o.jobId, 'jobId'),
    mode,
    systemPrompt: agentField(o.systemPrompt, 'systemPrompt'),
    userPrompt: agentField(o.userPrompt, 'userPrompt'),
    model: agentField(o.model, 'model'),
    ...parseHarnessAuth(o),
    ghToken: str(o.ghToken, 'ghToken'),
    repo: parseRepoSpec(repo),
    branch: str(o.branch, 'branch'),
    ...(typeof o.githubApiBase === 'string' ? { githubApiBase: o.githubApiBase } : {}),
    ...(typeof o.webToolsGuidance === 'string' ? { webToolsGuidance: o.webToolsGuidance } : {}),
    ...(o.webSearch === true ? { webSearch: true } : {}),
    ...(o.full === true ? { full: true } : {}),
    ...(mergeBase !== undefined ? { mergeBase } : {}),
    ...(bootstrap ? { bootstrap } : {}),
    ...(output ? { output } : {}),
    ...(contextFiles.length ? { contextFiles } : {}),
    ...(infra ? { infra } : {}),
    ...(newBranch !== undefined ? { newBranch } : {}),
    ...(pushBranch !== undefined ? { pushBranch } : {}),
    ...(commitMessage !== undefined ? { commitMessage } : {}),
    ...(pr ? { pr } : {}),
    // Only an explicit `false` is carried; only an explicit `true` for the flags below.
    ...(o.noChangesIsError === false ? { noChangesIsError: false } : {}),
    ...(o.persistentCheckout === true ? { persistentCheckout: true } : {}),
    ...(o.streamFollowUps === true ? { streamFollowUps: true } : {}),
    ...(guardLimits ? { guardLimits } : {}),
  }

  assertAllowedHost(job.repo.cloneUrl, 'repo.cloneUrl')
  if (job.githubApiBase) assertAllowedHost(job.githubApiBase, 'githubApiBase')
  // Bootstrap pushes the result to a SEPARATE target repo, so its clone URL must be an
  // allowed GitHub host too (the installation token is sent to it on the force-push).
  if (job.bootstrap) assertAllowedHost(job.bootstrap.target.cloneUrl, 'bootstrap.target.cloneUrl')
  return job
}