@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
package/src/process.ts ADDED
@@ -0,0 +1,33 @@
1
+ import type { ChildProcess } from 'node:child_process'
2
+ import { log, type Logger } from './logger.js'
3
+
4
+ // Shared child-process lifecycle helpers. Every CLI the harness spawns (Pi and the
5
+ // subscription harnesses) must die the same way when the watchdog or a no-progress
6
+ // guard aborts, so the SIGTERM→SIGKILL escalation lives here rather than being
7
+ // re-implemented per runner.
8
+
9
// Default pause between the polite SIGTERM and the forced SIGKILL.
const KILL_GRACE_MS = 5_000

/**
 * Stop a child process in two stages: send SIGTERM immediately, then — once `graceMs`
 * has elapsed — send SIGKILL if the child still reports neither an exit code nor a
 * terminating signal.
 *
 * The escalation timer is `unref()`d, so on its own it never keeps the event loop alive.
 * Calling this repeatedly is safe: each call re-sends SIGTERM and arms its own timer,
 * and every timer re-checks the child's state before escalating.
 *
 * An actual escalation is logged at warn level, because a process that ignores SIGTERM
 * is worth noticing.
 *
 * @param child   The process to terminate.
 * @param graceMs Milliseconds to wait before escalating (defaults to `KILL_GRACE_MS`).
 * @param logger  Where the escalation warning goes; pass a child logger to carry the
 *                run's `jobId`.
 */
export function killChildProcess(
  child: ChildProcess,
  graceMs: number = KILL_GRACE_MS,
  logger: Logger = log,
): void {
  const escalateIfStillAlive = (): void => {
    const alreadyGone = child.exitCode !== null || child.signalCode !== null
    if (alreadyGone) return
    logger.warn('killChildProcess: process ignored SIGTERM, escalating to SIGKILL', { graceMs })
    child.kill('SIGKILL')
  }

  child.kill('SIGTERM')
  const escalation = setTimeout(escalateIfStillAlive, graceMs)
  escalation.unref()
}
package/src/redact.ts ADDED
@@ -0,0 +1,109 @@
1
+ // Single source of truth for credential redaction. Two complementary rules run on
2
+ // EVERY redaction so no error path can scrub one class of secret and leak the other:
3
+ //
4
+ // - PATTERN-based: scrubs credential SHAPES (URL userinfo, `x-access-token:<tok>`,
5
+ // bare GitHub token prefixes, and `KEY=value` / `KEY: value` assignments whose key
6
+ // names a credential) even when the exact value isn't known ahead of time — this is
7
+ // what catches a freshly-minted installation token in a git error, or a plaintext
8
+ // `POSTGRES_PASSWORD=…` echoed by a docker-compose dependency stand-up.
9
+ // - VALUE-based: scrubs a list of KNOWN secret strings (the leased subscription
10
+ // token + any token-like JSON leaf harvested from a credential blob).
11
+ //
12
+ // Historically these lived in two modules (git.ts pattern-only, agent-runner.ts
13
+ // value-only) and ran on disjoint paths, so a secret only one rule covered could leak
14
+ // on the other. They are unified here.
15
+
16
// Below this length a "known secret" is too short to scrub without mangling
// legitimate output (it would replace common substrings).
const MIN_REDACT_LEN = 6

// Only harvest token-like JSON leaves: real OAuth access/refresh tokens and ids are
// long, while short values (`auth_mode: "chatgpt"`, `type: "oauth"`, …) are non-secret
// words that would over-redact legitimate error text if scrubbed. 12 chars is a safe
// floor below which a value is not a credential.
const MIN_HARVEST_LEN = 12

// `KEY=value` / `KEY: value` assignments whose key NAMES a credential. Catches plaintext
// secrets the shape rules miss — e.g. a docker-compose dependency echoing
// `POSTGRES_PASSWORD=hunter2` or `DATABASE_PASSWORD: hunter2` on a failed stand-up.
// The key token is matched within a surrounding identifier so `DB_ACCESS_KEY`/`api_key`
// are covered; `auth` is deliberately excluded so it can't clobber a git `Author:` line.
//
// Two shapes beyond the bare form are handled:
//  - a quote closing the key (`"password": …`), so JSON/YAML echoes are covered;
//  - a quoted value (`"…"` / `'…'`), matched through its closing quote on the same line
//    so a value containing spaces is scrubbed whole rather than up to the first space.
// Anything else falls back to the first whitespace-delimited run.
const CREDENTIAL_ASSIGNMENT =
  /\b([A-Za-z0-9_]*(?:password|passwd|pwd|secret|token|key|credential)[A-Za-z0-9_]*["']?\s*[:=]\s*)(?:"[^"\r\n]*"\S*|'[^'\r\n]*'\S*|\S+)/gi

/**
 * Strip credentials out of any string before it is logged or stored.
 *
 * Two passes run on every call:
 *  1. VALUE pass — every supplied known secret of at least `MIN_REDACT_LEN` characters is
 *     replaced with `***`. Secrets are scrubbed longest-first so that a secret which is a
 *     prefix/substring of another cannot leave the longer one's tail behind, and this pass
 *     runs BEFORE the pattern pass so a multi-part secret (e.g. a JSON blob) is still
 *     intact when it is searched for.
 *  2. PATTERN pass — URL userinfo (`https://user:pass@host`), `x-access-token:<token>`,
 *     bare GitHub token shapes (`ghp_`/`gho_`/`ghu_`/`ghs_`/`ghr_`/`github_pat_`), and
 *     credential-named `KEY=value` / `KEY: value` assignments.
 *
 * Idempotent — safe to call on already-redacted text. The caller's `knownSecrets` array
 * is never mutated.
 *
 * @param input        Text that may contain credentials.
 * @param knownSecrets Exact secret strings to scrub in addition to the pattern rules.
 * @returns The text with every matched credential replaced by `***`.
 */
export function redact(input: string, knownSecrets: readonly string[] = []): string {
  // `filter` copies, so the in-place sort never touches the caller's array.
  const scrubbable = knownSecrets
    .filter((secret) => secret.length >= MIN_REDACT_LEN)
    .sort((a, b) => b.length - a.length)

  let out = input
  for (const secret of scrubbable) {
    out = out.split(secret).join('***')
  }

  return out
    .replace(/(https?:\/\/)[^@\s/]*@/gi, '$1***@')
    .replace(/x-access-token:[^@\s]+/gi, 'x-access-token:***')
    .replace(/\b(gh[pousr]_|github_pat_)[A-Za-z0-9_]+/g, '$1***')
    .replace(CREDENTIAL_ASSIGNMENT, '$1***')
}
55
+
56
/**
 * Pattern-only redaction: runs `redact` with an empty known-secret list. Kept for
 * callers that have no secret values to hand.
 */
export function redactSecrets(input: string): string {
  const noKnownSecrets: readonly string[] = []
  return redact(input, noKnownSecrets)
}
60
+
61
/** Maximum number of characters of captured command output kept (the tail is what survives). */
export const MAX_CAPTURED_OUTPUT_CHARS = 16_000

/**
 * Merge captured stdout and stderr into one redacted, length-bounded string.
 *
 * Each stream is stringified (null/undefined become empty), trimmed, and dropped when
 * empty; the survivors are joined with a newline (stdout first) and pattern-redacted.
 * When the redacted text exceeds {@link MAX_CAPTURED_OUTPUT_CHARS}, only its LAST
 * characters are kept, prefixed with a marker stating how many earlier characters were
 * trimmed.
 *
 * @returns The stored string, or `undefined` when both streams are empty.
 */
export function captureRedactedOutput(stdout: unknown, stderr: unknown): string | undefined {
  const pieces: string[] = []
  for (const stream of [stdout, stderr]) {
    const text = String(stream ?? '').trim()
    if (text) pieces.push(text)
  }
  if (pieces.length === 0) return undefined

  const scrubbed = redactSecrets(pieces.join('\n'))
  const overflow = scrubbed.length - MAX_CAPTURED_OUTPUT_CHARS
  if (overflow <= 0) return scrubbed

  const tail = scrubbed.slice(-MAX_CAPTURED_OUTPUT_CHARS)
  return `…(${overflow} earlier chars trimmed)\n${tail}`
}
80
+
81
/**
 * Walk a parsed JSON value depth-first and add every string leaf of at least
 * `MIN_HARVEST_LEN` characters to `out`. Arrays are walked element by element, objects
 * by their values (keys are not harvested); every other value is ignored.
 */
function collectStrings(value: unknown, out: Set<string>): void {
  if (typeof value === 'string') {
    if (value.length >= MIN_HARVEST_LEN) out.add(value)
    return
  }
  // Numbers, booleans, null and undefined carry nothing to harvest.
  if (value === null || typeof value !== 'object') return

  const children: unknown[] = Array.isArray(value) ? value : Object.values(value)
  for (const child of children) {
    collectStrings(child, out)
  }
}
91
+
92
/**
 * Build the list of secret strings to scrub from a run's stderr/output.
 *
 * The token itself (when non-empty) is always the first entry: for Claude and the
 * Anthropic-compatible vendors (GLM/Kimi/DeepSeek) the credential IS that string. For
 * Codex the credential is a whole `auth.json` blob, so the token is additionally parsed
 * as JSON and every token-like string leaf is added — a token echoed on its own, rather
 * than as part of the whole blob, would otherwise slip past a whole-blob match.
 *
 * @param subscriptionToken The leased credential (a bare token or a JSON blob).
 * @returns De-duplicated secrets, whole token first, then harvested leaves.
 */
export function secretsToRedact(subscriptionToken: string): string[] {
  const found = new Set<string>(subscriptionToken ? [subscriptionToken] : [])
  try {
    const parsed: unknown = JSON.parse(subscriptionToken)
    collectStrings(parsed, found)
  } catch {
    // Not JSON (a Claude OAuth token / API key) — the whole-string entry covers it.
  }
  return Array.from(found)
}
package/src/runner.ts ADDED
@@ -0,0 +1,384 @@
1
+ import { redactSecrets } from './redact.js'
2
+ import type { FollowUpLine } from './follow-ups.js'
3
+ import type { TodoProgress, ToolSpan } from './pi.js'
4
+ import { log, type Logger } from './logger.js'
5
+ import {
6
+ type FailureCause,
7
+ failureCauseOf,
8
+ inactivityAbortMessage,
9
+ maxDurationAbortMessage,
10
+ } from './failure.js'
11
+
12
+ /** Non-secret correlation fields a job carries on every log line (jobId, repo, branch, …). */
13
+ type LogFields = Record<string, unknown>
14
+
15
+ // The async job lifecycle for the container. A coding/explore run can take many
16
+ // minutes, so the backend does not hold a single synchronous request open: it POSTs
17
+ // /jobs (which starts a background job and returns immediately) and then polls
18
+ // GET /jobs/{id}. Two watchdogs bound every job so a container can never run forever —
19
+ // an inactivity timer (kills the agent when it stops producing output) and an overall
20
+ // max-duration cap. The work itself is the generic `agent` handler (see agent.ts); this
21
+ // file owns only the registry + watchdogs that drive any job to completion.
22
+
23
/**
 * Options threaded into the long-running git/Pi work so a watchdog can cancel it.
 * Every member is optional; the job registry in this file supplies all of them when it
 * drives a job.
 */
export interface RunOptions {
  /** Cancellation signal; the registry aborts it when a watchdog (inactivity / max-duration) fires. */
  signal?: AbortSignal
  /** Call on any sign of progress; the registry refreshes the job's heartbeat and restarts its inactivity timer. */
  onActivity?: () => void
  /** Receives the latest subtask counts as Pi updates its todo list. */
  onProgress?: (progress: TodoProgress) => void
  /** Receives one compact {@link ToolSpan} per completed tool call (observability). */
  onSpan?: (span: ToolSpan) => void
  /** Receives the forward-looking follow-up / question items the Coder streamed since the last poll. */
  onFollowUp?: (items: FollowUpLine[]) => void
  /**
   * Mark the coarse lifecycle phase the handler has entered (`clone` / `agent` / `push` / …).
   * Drives the stuck-run breadcrumb: an inactivity kill reports WHICH phase was hung, and the
   * per-phase wall-clock is logged on completion. Free-form; unknown phases just show verbatim.
   */
  onPhase?: (phase: string) => void
  /** A per-job child logger carrying the run's correlation fields (jobId, repo, branch, …). */
  log?: Logger
}
42
+
43
/**
 * Lifecycle state of a tracked job: `running` from start, then exactly one terminal
 * state — `done` when the handler returned a result, `failed` when it threw or a
 * watchdog killed it.
 */
export type JobState = 'running' | 'done' | 'failed'
44
+
45
/**
 * The minimum a job result must expose: a structured `error` marks a job-level
 * failure even when the HTTP run itself succeeded. Every agent result (explore /
 * coding / bootstrap / conflict) satisfies this, so {@link JobRegistry} is generic
 * over the result it tracks while reusing one watchdog/lifecycle.
 */
export interface JobResultBase {
  /** Set when the handler finished cleanly but the result itself is a failure. */
  error?: string
  /**
   * The structured reason a clean-exit result failed (set alongside `error` by a handler that
   * finished but produced an unusable/failed result — no-usable-output, no-changes, …). The
   * registry copies it onto the job view's `failureCause` only when `error` is also set.
   * Absent on a watchdog/throw failure (the registry sets that cause itself).
   * See {@link FailureCause}.
   */
  failureCause?: FailureCause
}
61
+
62
/** The job view returned by GET /jobs/{id}, generic over the orchestration's result. */
export interface JobView<TResult extends JobResultBase = JobResultBase> {
  /** The backend-supplied job id the registry is keyed by. */
  id: string
  /** Current lifecycle state; see {@link JobState}. */
  state: JobState
  /** Epoch ms at which the job was started. */
  startedAt: number
  /** Epoch ms of the last sign of progress (job start, or Pi output). */
  heartbeatAt: number
  /**
   * The coarse lifecycle phase the job is CURRENTLY in (`starting` → `clone` → `agent`
   * → `push` → `done`/`failed`), so the backend can surface WHAT the container is doing
   * rather than a blank "working" state — is it still cloning/preparing the checkout, or
   * has the agent begun making calls? The same per-phase marker that drives the stuck-run
   * breadcrumb on a failure, exposed live here while the job runs. Free-form; unknown
   * phases just show verbatim. Typed optional, but the registry seeds it with `starting`
   * when it creates the entry, so views produced by the registry always carry it.
   */
  phase?: string
  /**
   * Latest subtask progress from Pi's `todo` tool while the job runs — the
   * Worker poll surfaces it to the board (e.g. "3/8 done"). Absent until Pi
   * first touches its todo list (or if the model never uses it).
   */
  progress?: TodoProgress
  /** Present when `state === 'done'`: the orchestration's structured result. */
  result?: TResult
  /** Present when `state === 'failed'`: why the job faulted (or was killed). */
  error?: string
  /**
   * The STRUCTURED failure cause, so the backend can classify the failure without
   * regex-matching {@link error}. Always set when `state === 'failed'`; ALSO set on a
   * `done` job whose result carries both `error` and `failureCause` (a clean exit that
   * produced a failed result). Backward compatible — the backend prefers this and falls
   * back to the (still-stable) `error` regex when absent.
   * Container eviction is NOT represented here (the runtime facade detects that from a
   * vanished container); see {@link FailureCause}.
   */
  failureCause?: FailureCause
  /**
   * Present when `state === 'failed'`: an extended, redacted diagnostic (phase-timing
   * breakdown, last-tool breadcrumb, …) distinct from the one-line {@link error}. The
   * backend surfaces it as the failure `detail` on the board card. Best-effort.
   */
  detail?: string
  /**
   * Tool spans accumulated SINCE THE LAST POLL (drain-on-read): the GET /jobs/{id}
   * handler returns the spans buffered since the previous poll and clears the buffer,
   * so the harness only ever holds one poll-interval's worth. Best-effort observability
   * — a dropped poll response loses at most one window. Absent when nothing was buffered.
   */
  spans?: ToolSpan[]
  /**
   * Forward-looking follow-up / question items the Coder streamed SINCE THE LAST POLL
   * (drain-on-read, exactly like {@link spans}): the GET /jobs/{id} handler returns the
   * items buffered since the previous poll and clears the buffer. The backend appends them
   * to the run's step so the Follow-up companion surfaces them live. Absent when nothing
   * was buffered (and only ever present on a follow-ups-enabled coding run).
   */
  followUps?: FollowUpLine[]
}
118
+
119
/**
 * The registry's internal record for a job: the public {@link JobView} fields plus
 * bookkeeping that is stripped before a view is returned to a caller.
 */
interface JobEntry<TResult extends JobResultBase> extends JobView<TResult> {
  /**
   * The in-flight work; retained so the entry isn't GC-surprising (not awaited externally).
   * Seeded with a resolved placeholder and replaced by the driving promise once the job starts.
   */
  promise: Promise<void>
  /** Spans buffered since the last drain (see {@link JobView.spans}). */
  spanBuffer: ToolSpan[]
  /** Follow-up items buffered since the last drain (see {@link JobView.followUps}). */
  followUpBuffer: FollowUpLine[]
}
127
+
128
/**
 * Watchdog windows that bound every job. Tunable via the container's env
 * (`JOB_MAX_DURATION_MS` / `JOB_INACTIVITY_MS`, read by `loadRunnerLimits`).
 * Both values are milliseconds and are passed to `setTimeout` as-is.
 */
export interface RunnerLimits {
  /** Hard ceiling on total job wall-clock before it's force-failed. */
  maxDurationMs: number
  /** Force-fail the job if the agent produces no output for this long (hang guard). */
  inactivityMs: number
}
135
+
136
/**
 * Parse a positive millisecond duration from an environment variable.
 *
 * Returns `fallback` when the variable is unset, empty, not a finite number, or not
 * strictly positive. Otherwise returns the parsed number, clamped to the largest delay
 * Node's timers accept (2^31 - 1 ms, about 24.8 days): the values parsed here are used as
 * `setTimeout` delays, and Node silently resets any larger delay to 1 ms — which would
 * make a watchdog configured with a "very large" window fire immediately.
 *
 * @param value    Raw environment value.
 * @param fallback Value to use when `value` is missing or invalid (returned unmodified).
 */
function intEnv(value: string | undefined, fallback: number): number {
  const MAX_TIMER_DELAY_MS = 2 ** 31 - 1
  const n = value ? Number(value) : NaN
  if (!Number.isFinite(n) || n <= 0) return fallback
  return Math.min(n, MAX_TIMER_DELAY_MS)
}
140
+
141
/**
 * Read the two watchdog windows from the environment, falling back to the built-in
 * defaults when a variable is unset or invalid.
 *
 * @param env Environment to read (defaults to `process.env`).
 */
export function loadRunnerLimits(env: NodeJS.ProcessEnv = process.env): RunnerLimits {
  // 60 minutes: generous headroom for serious multi-file coding tasks while still
  // bounding a runaway container.
  const defaultMaxDurationMs = 60 * 60_000
  // 10 minutes of zero output is treated as hung (a single long LLM/tool call is far
  // shorter; Pi streams events as it works). The per-git command ceiling
  // (`GIT_TIMEOUT_MS` in git.ts) is DERIVED from this value — a fixed margin below it —
  // so a slow clone/push (which emits no activity events) always times out with git's
  // own clear reason rather than this watchdog's "likely hung" message, for any
  // configured window. See the invariant note in git.ts.
  const defaultInactivityMs = 10 * 60_000

  const maxDurationMs = intEnv(env.JOB_MAX_DURATION_MS, defaultMaxDurationMs)
  const inactivityMs = intEnv(env.JOB_INACTIVITY_MS, defaultInactivityMs)
  return { maxDurationMs, inactivityMs }
}
155
+
156
/**
 * Project an internal job entry onto its public view: a fresh shallow copy without the
 * registry-only bookkeeping (`promise`, `spanBuffer`, `followUpBuffer`).
 */
function toView<TResult extends JobResultBase>(entry: JobEntry<TResult>): JobView<TResult> {
  // Peel the internal fields off; whatever remains is the public surface.
  const {
    promise: _inFlight,
    spanBuffer: _pendingSpans,
    followUpBuffer: _pendingFollowUps,
    ...publicFields
  } = entry
  const view: JobView<TResult> = { ...publicFields }
  return view
}
165
+
166
/**
 * Tracks background jobs by id. Keyed by the backend-supplied job id (the per-step
 * job id) so a re-dispatched start re-attaches to the running job rather than starting
 * a duplicate — which keeps the durable driver's retries idempotent and avoids redoing
 * already-running work. Generic over the job/result shape so the same lifecycle +
 * inactivity/max-duration watchdogs drive every agent run.
 *
 * Entries are never removed from the registry; a finished job stays pollable for the
 * lifetime of the process.
 */
export class JobRegistry<TJob = unknown, TResult extends JobResultBase = JobResultBase> {
  private readonly jobs = new Map<string, JobEntry<TResult>>()

  constructor(
    private readonly limits: RunnerLimits,
    // The unit of work (the `agent` handler). Injectable so tests can drive the
    // registry's lifecycle/watchdog logic with a different runner.
    private readonly run: (job: TJob, opts: RunOptions) => Promise<TResult>,
    // Non-secret correlation fields to bind on the per-job logger (repo, branch, agentKind).
    // The registry is generic over the job shape, so the kind supplies this extractor; the
    // job id is always bound. Defaults to no extra fields.
    private readonly describe: (job: TJob) => LogFields = () => ({}),
  ) {}

  /** Start the job for `id`, or return the existing one (idempotent re-attach). */
  start(id: string, job: TJob): JobView<TResult> {
    const existing = this.jobs.get(id)
    if (existing) return toView(existing)

    const now = Date.now()
    const entry: JobEntry<TResult> = {
      id,
      state: 'running',
      startedAt: now,
      // Seed the live phase so a poll BEFORE the handler enters its first phase still
      // shows "starting" (the container is up; the agent hasn't begun cloning yet)
      // rather than an absent/blank phase.
      phase: 'starting',
      heartbeatAt: now,
      promise: Promise.resolve(),
      spanBuffer: [],
      followUpBuffer: [],
    }
    this.jobs.set(id, entry)
    entry.promise = this.drive(entry, job)
    return toView(entry)
  }

  /**
   * Poll the job — and DRAIN its tool-span and follow-up buffers (drain-on-read). The
   * GET /jobs/{id} handler is the sole caller, so each poll returns the items accumulated
   * since the previous poll and clears them, bounding the harness buffers to one poll
   * interval. Returns `undefined` for an unknown id.
   */
  get(id: string): JobView<TResult> | undefined {
    const entry = this.jobs.get(id)
    if (!entry) return undefined
    const view = toView(entry)
    if (entry.spanBuffer.length > 0) {
      view.spans = entry.spanBuffer
      entry.spanBuffer = []
    }
    if (entry.followUpBuffer.length > 0) {
      view.followUps = entry.followUpBuffer
      entry.followUpBuffer = []
    }
    return view
  }

  /**
   * Build the per-job child logger. The caller-supplied `describe` extractor is guarded:
   * `drive()`'s promise is never awaited, so an exception escaping here would surface as
   * an unhandled rejection instead of a failed job. On failure the logger simply carries
   * the job id alone.
   */
  private bindJobLogger(id: string, job: TJob): Logger {
    let fields: LogFields = {}
    try {
      fields = this.describe(job)
    } catch (error) {
      log.warn('job describe failed; logging with jobId only', {
        jobId: id,
        error: redactSecrets(error instanceof Error ? error.message : String(error)),
      })
    }
    return log.child({ jobId: id, ...fields })
  }

  /**
   * Run one job to a terminal state under both watchdogs, recording the outcome on
   * `entry`. Never rejects: a thrown/aborted run is captured as a `failed` entry.
   */
  private async drive(entry: JobEntry<TResult>, job: TJob): Promise<void> {
    const controller = new AbortController()
    let killReason: 'inactivity' | 'max-duration' | undefined
    // Flipped in `finally`. A handler can still report activity after it has settled
    // (e.g. trailing output from a child it spawned); without this guard that late
    // heartbeat would re-arm the inactivity timer that `finally` just cleared.
    let settled = false

    const jobLog = this.bindJobLogger(entry.id, job)

    // Stuck-run breadcrumb: the coarse phase the handler is in, per-phase wall-clock, and
    // the last completed tool — so an inactivity kill can say WHERE it hung instead of a
    // bare "likely hung", and the finish/fail log carries the phase-timing breakdown.
    let phase = 'starting'
    let phaseEnteredAt = Date.now()
    const phaseTimingsMs: Record<string, number> = {}
    const markPhase = (next: string): void => {
      const now = Date.now()
      phaseTimingsMs[phase] = (phaseTimingsMs[phase] ?? 0) + (now - phaseEnteredAt)
      phase = next
      phaseEnteredAt = now
      // Surface the live phase on the view so a poll shows WHAT the container is doing
      // (cloning / running the agent / pushing) — the same marker drives the failure
      // breadcrumb. A terminal `done`/`failed` is set by the caller below.
      entry.phase = next
    }
    let lastTool: { name: string; at: number } | undefined

    let inactivity: ReturnType<typeof setTimeout> | undefined
    const resetInactivity = (): void => {
      clearTimeout(inactivity)
      if (settled) return
      inactivity = setTimeout(() => {
        // First watchdog to fire wins the reason (a later timer firing in the
        // teardown window must not relabel why the job was killed).
        killReason ??= 'inactivity'
        controller.abort(new Error('inactivity timeout'))
      }, this.limits.inactivityMs)
    }
    const cap = setTimeout(() => {
      killReason ??= 'max-duration'
      controller.abort(new Error('max duration exceeded'))
    }, this.limits.maxDurationMs)
    const heartbeat = (): void => {
      entry.heartbeatAt = Date.now()
      resetInactivity()
    }
    resetInactivity()

    jobLog.info('job started', {})
    try {
      const result = await this.run(job, {
        signal: controller.signal,
        onActivity: heartbeat,
        onProgress: (progress) => {
          entry.progress = progress
        },
        onSpan: (span) => {
          entry.spanBuffer.push(span)
          lastTool = { name: span.tool, at: span.endedAt }
        },
        onFollowUp: (items) => {
          entry.followUpBuffer.push(...items)
        },
        onPhase: (next) => markPhase(next),
        log: jobLog,
      })
      markPhase('done')
      entry.state = 'done'
      entry.result = result
      // A clean-exit result can still be a failure (e.g. no usable output): carry its
      // structured cause onto the view so the backend classifies it without regex.
      if (result.error && result.failureCause) entry.failureCause = result.failureCause
      jobLog.info('job finished', {
        durationMs: Date.now() - entry.startedAt,
        jobError: result.error ?? null,
        phaseTimingsMs,
      })
    } catch (error) {
      // Capture the phase the job was IN before recording the 'failed' transition, so the
      // breadcrumb names where it hung (markPhase below would otherwise overwrite it).
      const failedInPhase = phase
      markPhase('failed')
      const { message, cause, detail } = this.describeFailure(
        killReason,
        error,
        failedInPhase,
        lastTool,
        phaseTimingsMs,
      )
      entry.state = 'failed'
      entry.error = message
      entry.failureCause = cause
      entry.detail = detail
      jobLog.error('job failed', {
        durationMs: Date.now() - entry.startedAt,
        reason: killReason ?? 'error',
        failureCause: cause,
        error: message,
        phaseTimingsMs,
      })
    } finally {
      settled = true
      clearTimeout(inactivity)
      clearTimeout(cap)
      entry.heartbeatAt = Date.now()
    }
  }

  /**
   * Build the redacted one-line `error`, the structured {@link FailureCause}, and the extended
   * `detail` for a failed job. Watchdog kills keep their regex-stable phrase (so the backend's
   * `classifyBootstrapFailure` fallback still works) and gain a breadcrumb of where they hung;
   * a thrown error keeps its own message and its structured cause when tagged (a git op → `git`,
   * an upstream API call → `api`), else `agent`. All strings are credential-scrubbed.
   *
   * @param killReason     Which watchdog fired first, or `undefined` for an ordinary throw.
   * @param error          The value the run rejected with.
   * @param phase          The phase the job was in when it failed.
   * @param lastTool       The last tool that completed, if any.
   * @param phaseTimingsMs Accumulated wall-clock per phase, in milliseconds.
   */
  private describeFailure(
    killReason: 'inactivity' | 'max-duration' | undefined,
    error: unknown,
    phase: string,
    lastTool: { name: string; at: number } | undefined,
    phaseTimingsMs: Record<string, number>,
  ): { message: string; cause: FailureCause; detail: string } {
    // `lastTool` is the last tool that COMPLETED (a span is emitted on tool end), so when the
    // hang is inside a still-running tool the breadcrumb points at the prior one — worded
    // "last completed tool" so the reader knows the stuck call may be the next, unfinished one.
    const breadcrumb = lastTool
      ? `last completed tool ${lastTool.name} ${Math.round((Date.now() - lastTool.at) / 1000)}s ago`
      : 'no tool had completed yet'
    const phaseBreakdown = Object.entries(phaseTimingsMs)
      .map(([p, ms]) => `${p}=${Math.round(ms / 1000)}s`)
      .join(', ')
    if (killReason === 'inactivity') {
      return {
        message: redactSecrets(
          `${inactivityAbortMessage(this.limits.inactivityMs)} (likely hung in ${phase} phase; ${breadcrumb})`,
        ),
        cause: 'inactivity-timeout',
        detail: redactSecrets(`Phase timings: ${phaseBreakdown || '(none)'}. ${breadcrumb}.`),
      }
    }
    if (killReason === 'max-duration') {
      return {
        message: redactSecrets(maxDurationAbortMessage(this.limits.maxDurationMs)),
        cause: 'max-duration',
        detail: redactSecrets(`Phase timings: ${phaseBreakdown || '(none)'}. ${breadcrumb}.`),
      }
    }
    const raw = error instanceof Error ? error.message : String(error)
    // A thrown error tagged with a structured cause (a git op / an upstream API call) keeps
    // it; an untagged throw is a generic agent failure.
    return {
      message: redactSecrets(raw),
      cause: failureCauseOf(error) ?? 'agent',
      detail: redactSecrets(
        `${phaseBreakdown ? `Phase timings: ${phaseBreakdown}. ` : ''}Failed in ${phase} phase; ${breadcrumb}.`,
      ),
    }
  }
}
package/src/server.ts ADDED
@@ -0,0 +1,153 @@
1
+ import { timingSafeEqual } from 'node:crypto'
2
+ import { createServer, type IncomingMessage, type ServerResponse } from 'node:http'
3
+ import { parseAgentJob } from './job.js'
4
+ import { handleAgent } from './agent.js'
5
+ import { redactSecrets } from './git.js'
6
+ import { JobRegistry, loadRunnerLimits, type JobResultBase, type RunOptions } from './runner.js'
7
+ import { log } from './logger.js'
8
+
9
+ // The container's HTTP entry point. The Worker addresses one instance per run and
10
+ // POSTs a job to /jobs (the body's `kind` selects which agent runs); the harness
11
+ // starts that job in the background (bounded by an inactivity + max-duration
12
+ // watchdog) and returns a job id, which the Worker then polls via GET /jobs/{id}.
13
+ // Nothing here holds long-lived secrets: the per-job GitHub + proxy tokens arrive
14
+ // in the request body and live only for the duration of the job in an ephemeral
15
+ // workspace.
16
+
17
+ const PORT = Number(process.env.PORT ?? 8080)
18
+
19
// Optional inbound auth. When HARNESS_SHARED_SECRET is set, every non-health
// request must present a matching `x-harness-secret` header (constant-time
// compared). When it is unset the harness behaves as before (open), so local/dev
// and the existing acceptance flow keep working without configuration.
// The direct callers send the matching header when the secret is configured: the
// local Docker transport (LocalContainerRunnerTransport) and the Cloudflare
// transport (CloudflareContainerTransport, which also injects the secret into the
// container env). A self-hosted runner pool reaches the harness through its own
// control plane, so its operator configures the secret pool-side.
const SHARED_SECRET = process.env.HARNESS_SHARED_SECRET

const HEADER = 'x-harness-secret'

/**
 * Decide whether a request may use the non-health endpoints.
 *
 * Returns true unconditionally when no shared secret is configured. Otherwise the
 * request's `x-harness-secret` header (the first value if it arrives as a list, empty
 * if missing) must equal the secret; the comparison itself is constant-time.
 */
function authorized(req: IncomingMessage): boolean {
  if (!SHARED_SECRET) return true

  const raw = req.headers[HEADER]
  const presented = Array.isArray(raw) ? (raw[0] ?? '') : (raw ?? '')
  const got = Buffer.from(presented)
  const want = Buffer.from(SHARED_SECRET)

  // timingSafeEqual throws on unequal lengths, so reject a length mismatch up front.
  if (got.length !== want.length) return false
  return timingSafeEqual(got, want)
}
41
+
42
+ // One registry per kind per container process. A run addresses its own container
43
+ // instance (one Durable Object id per execution / bootstrap job) and dispatches its
44
+ // sequence of step jobs to it; every kind shares the same watchdog/lifecycle but
45
+ // produces a different result, so each gets its own registry keyed by the job id.
46
+ const limits = loadRunnerLimits()
47
+
48
/**
 * A dispatchable kind: how to validate its body and the registry that runs it.
 * The job/result types are erased here (the registry's job parameter is `never`) so
 * entries for different kinds can share one dispatch table.
 */
interface KindEntry {
  /** Validate a raw request body; throws on an invalid body. The result carries the job id. */
  parse: (input: unknown) => { jobId: string }
  /** The registry that starts and tracks this kind's jobs. */
  registry: JobRegistry<never, JobResultBase>
}
53
+
54
/**
 * Build a dispatch-table entry for one job kind: its body validator, plus a fresh
 * registry that runs `handler` under the shared watchdog limits.
 *
 * @param parse    Validates the raw request body into the kind's job shape.
 * @param handler  The unit of work the registry runs for each job.
 * @param describe Optional extractor of non-secret correlation fields bound on the
 *                 per-job logger (see JobRegistry.describe).
 */
function defineKind<TJob extends { jobId: string }, TResult extends JobResultBase>(
  parse: (input: unknown) => TJob,
  handler: (job: TJob, opts: RunOptions) => Promise<TResult>,
  describe?: (job: TJob) => Record<string, unknown>,
): KindEntry {
  const registry = new JobRegistry<TJob, TResult>(limits, handler, describe)
  const typedEntry = { parse, registry }
  // Erase the per-kind generics so every kind fits the one table type.
  return typedEntry as unknown as KindEntry
}
66
+
67
// The dispatch table. The harness now serves a SINGLE, manifest-driven kind: the
// generic `agent` (the job body's `mode` — explore | coding — and its data select the
// flow; WHAT the agent does is decided entirely by the backend). The per-kind bespoke
// handlers (run/blueprint/spec/explore/merge/test/…) were strangled onto this one kind
// and removed. A `POST /jobs` reads the body's `kind` to pick the entry; `GET /jobs/{id}`
// checks every registry (job ids never collide across kinds). `kind` mirrors kernel's
// `RunnerDispatchKind` (now also just `'agent'`); the harness keeps its own copy so the
// image carries no runtime deps.
const KINDS: Record<string, KindEntry> = {
  // The third argument picks the correlation fields bound on every log line of the job:
  // the agent mode, the `owner/name` repo slug, and the branch.
  agent: defineKind(parseAgentJob, handleAgent, (job) => ({
    mode: job.mode,
    repo: `${job.repo.owner}/${job.repo.name}`,
    branch: job.branch,
  })),
}
82
+
83
/**
 * Read a request body to completion and decode it as UTF-8. Rejects if the request
 * stream errors while being read. No size limit is applied.
 */
async function readBody(req: IncomingMessage): Promise<string> {
  const parts: Buffer[] = []
  for await (const piece of req) {
    parts.push(piece as Buffer)
  }
  const whole = Buffer.concat(parts)
  return whole.toString('utf8')
}
88
+
89
/**
 * Write a JSON response: serialise `body`, then send it with the given status and a
 * `content-type: application/json` header. Serialisation happens before any header is
 * written, so a body that cannot be stringified throws without touching the response.
 */
function send(res: ServerResponse, status: number, body: unknown): void {
  const json = JSON.stringify(body)
  const headers = { 'content-type': 'application/json' }
  res.writeHead(status, headers).end(json)
}
94
+
95
/**
 * The HTTP server. Routes:
 *  - `GET /health`     → 200 `{ status: 'ok' }` (never gated by the shared secret)
 *  - `GET /jobs/{id}`  → 200 with the job view, 400 for a malformed id, 404 if unknown
 *  - `POST /jobs`      → 202 `{ jobId, state }`, 404 for an unknown kind, 400 on a bad body
 *  - anything else     → 404
 * Every non-health route answers 401 when the shared-secret check fails.
 *
 * The handler runs as a fire-and-forget async function, so every failure must be turned
 * into a response here: an exception that escaped would become an unhandled promise
 * rejection, which by default terminates the Node process.
 */
const server = createServer((req, res) => {
  const handle = async (): Promise<void> => {
    if (req.method === 'GET' && req.url === '/health') {
      return send(res, 200, { status: 'ok' })
    }
    // All non-health endpoints are gated by the optional shared secret.
    if (!authorized(req)) {
      return send(res, 401, { error: 'unauthorized' })
    }
    // Poll a running/finished job: GET /jobs/{id}. Job ids are unique per kind, so
    // check each registry in turn; the first hit wins.
    if (req.method === 'GET' && req.url?.startsWith('/jobs/')) {
      let id: string
      try {
        id = decodeURIComponent(req.url.slice('/jobs/'.length))
      } catch {
        // decodeURIComponent throws URIError on a malformed percent-escape; that is a
        // client error, not a reason to fault the request handler.
        return send(res, 400, { error: 'malformed job id' })
      }
      for (const { registry } of Object.values(KINDS)) {
        const view = registry.get(id)
        if (view) return send(res, 200, view)
      }
      return send(res, 404, { error: 'job not found' })
    }
    // Start (or re-attach to) a job: POST /jobs with the kind in the body. The body's
    // `kind` selects the validator + registry; the rest is that kind's job spec.
    // Returns immediately with the job id; the caller polls GET /jobs/{id} for live
    // subtask progress and the final result. Idempotent: a re-dispatched POST
    // (a durable-driver replay) re-attaches to the job already running for the id
    // rather than starting a duplicate.
    if (req.method === 'POST' && req.url === '/jobs') {
      let kind: unknown
      try {
        const raw = JSON.parse(await readBody(req)) as Record<string, unknown>
        kind = raw.kind
        const entry = typeof kind === 'string' ? KINDS[kind] : undefined
        if (!entry) {
          return send(res, 404, { error: `unknown job kind '${String(kind)}'` })
        }
        const job = entry.parse(raw)
        const view = entry.registry.start(job.jobId, job as never)
        return send(res, 202, { jobId: view.id, state: view.state })
      } catch (error) {
        // Parse failures (incl. host-allowlist rejection) are client errors → 400.
        const message = redactSecrets(error instanceof Error ? error.message : String(error))
        log.error('failed to start job', {
          kind: typeof kind === 'string' ? kind : undefined,
          error: message,
        })
        return send(res, 400, { error: message })
      }
    }
    return send(res, 404, { error: 'not found' })
  }

  // Last line of defence: anything that still escapes is logged (redacted) and answered
  // with a 500 instead of surfacing as an unhandled rejection.
  handle().catch((error: unknown) => {
    const message = redactSecrets(error instanceof Error ? error.message : String(error))
    log.error('unhandled request error', { method: req.method, error: message })
    if (res.headersSent) {
      res.end()
    } else {
      send(res, 500, { error: 'internal error' })
    }
  })
})
145
+
146
// Only auto-listen when run as the entry point (tests import handleRun directly).
// The check is on NODE_ENV: with NODE_ENV=test the module can be imported without
// binding PORT, and the exported `server` is left for the importer to drive.
if (process.env.NODE_ENV !== 'test') {
  server.listen(PORT, () => {
    console.log(`executor-harness listening on :${PORT}`)
  })
}

export { server }