npm - @cat-factory/executor-harness - Versions diffs - 1.31.0 - Mend

@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/LICENSE +21 -0
package/README.md +143 -0
package/dist/agent-runner.js +389 -0
package/dist/agent.js +810 -0
package/dist/blueprint.js +367 -0
package/dist/bootstrap.js +99 -0
package/dist/ci-fixer.js +46 -0
package/dist/coding-agent.js +285 -0
package/dist/conflict-resolver.js +138 -0
package/dist/embed.js +8 -0
package/dist/explore.js +74 -0
package/dist/failure.js +47 -0
package/dist/fixer.js +44 -0
package/dist/follow-ups.js +103 -0
package/dist/frontend-infra.js +283 -0
package/dist/fs-utils.js +11 -0
package/dist/git.js +778 -0
package/dist/job.js +409 -0
package/dist/logger.js +27 -0
package/dist/merger.js +135 -0
package/dist/on-call.js +126 -0
package/dist/pi-workspace.js +237 -0
package/dist/pi.js +971 -0
package/dist/process.js +25 -0
package/dist/redact.js +109 -0
package/dist/runner.js +228 -0
package/dist/server.js +135 -0
package/dist/spec.js +754 -0
package/dist/structured-output.js +431 -0
package/dist/tester.js +191 -0
package/package.json +35 -0
package/src/agent-runner.ts +484 -0
package/src/agent.ts +948 -0
package/src/coding-agent.ts +393 -0
package/src/embed.ts +32 -0
package/src/failure.ts +73 -0
package/src/follow-ups.ts +106 -0
package/src/frontend-infra.ts +340 -0
package/src/fs-utils.ts +11 -0
package/src/git.ts +955 -0
package/src/job.ts +766 -0
package/src/logger.ts +45 -0
package/src/pi-workspace.ts +348 -0
package/src/pi.ts +1236 -0
package/src/process.ts +33 -0
package/src/redact.ts +109 -0
package/src/runner.ts +384 -0
package/src/server.ts +153 -0
package/src/structured-output.ts +524 -0

package/src/process.ts ADDED Viewed

@@ -0,0 +1,33 @@
+import type { ChildProcess } from 'node:child_process'
+import { log, type Logger } from './logger.js'
+// Shared child-process lifecycle helpers. Every CLI the harness spawns (Pi and the
+// subscription harnesses) must die the same way when the watchdog or a no-progress
+// guard aborts, so the SIGTERM→SIGKILL escalation lives here rather than being
+// re-implemented per runner.
+// Default grace period (ms) between the polite SIGTERM and the forced SIGKILL.
+const KILL_GRACE_MS = 5_000
+/**
+ * Stop a spawned child in two stages: send SIGTERM immediately, then — once `graceMs`
+ * has elapsed — send SIGKILL if the child still reports neither an exit code nor a
+ * terminating signal. The escalation timer is `unref()`d so it cannot keep the event
+ * loop alive on its own. Each call sends its own SIGTERM and arms its own timer, so
+ * calling it again on the same child is harmless.
+ *
+ * A real escalation is logged at warn level (a CLI that ignores SIGTERM is worth
+ * noticing). Pass a child logger so the warning carries the run's correlation fields.
+ *
+ * @param child   The process to terminate.
+ * @param graceMs Milliseconds to wait after SIGTERM before force-killing.
+ * @param logger  Logger that receives the escalation warning.
+ */
+export function killChildProcess(
+  child: ChildProcess,
+  graceMs: number = KILL_GRACE_MS,
+  logger: Logger = log,
+): void {
+  child.kill('SIGTERM')
+  const escalate = (): void => {
+    // Either field turning non-null means the child is already gone: nothing to do.
+    const stillRunning = child.exitCode === null && child.signalCode === null
+    if (!stillRunning) return
+    logger.warn('killChildProcess: process ignored SIGTERM, escalating to SIGKILL', { graceMs })
+    child.kill('SIGKILL')
+  }
+  const escalation = setTimeout(escalate, graceMs)
+  escalation.unref()
+}

package/src/redact.ts ADDED Viewed

@@ -0,0 +1,109 @@
+// Single source of truth for credential redaction. Two complementary rules run on
+// EVERY redaction so no error path can scrub one class of secret and leak the other:
+//
+//  - PATTERN-based: scrubs credential SHAPES (URL userinfo, `x-access-token:<tok>`,
+//    bare GitHub token prefixes, and `KEY=value` / `KEY: value` assignments whose key
+//    names a credential) even when the exact value isn't known ahead of time — this is
+//    what catches a freshly-minted installation token in a git error, or a plaintext
+//    `POSTGRES_PASSWORD=…` echoed by a docker-compose dependency stand-up.
+//  - VALUE-based: scrubs a list of KNOWN secret strings (the leased subscription
+//    token + any token-like JSON leaf harvested from a credential blob).
+//
+// Historically these lived in two modules (git.ts pattern-only, agent-runner.ts
+// value-only) and ran on disjoint paths, so a secret only one rule covered could leak
+// on the other. They are unified here.
+// Below this length a "known secret" is too short to scrub without mangling
+// legitimate output (it would replace common substrings).
+const MIN_REDACT_LEN = 6
+// Only harvest token-like JSON leaves: real OAuth access/refresh tokens and ids are
+// long, while short values (`auth_mode: "chatgpt"`, `type: "oauth"`, …) are non-secret
+// words that would over-redact legitimate error text if scrubbed. 12 chars is a safe
+// floor below which a value is not a credential.
+const MIN_HARVEST_LEN = 12
+// `KEY=value` / `KEY: value` assignments whose key NAMES a credential. Catches plaintext
+// secrets the shape rules in `redact` miss — e.g. a docker-compose dependency echoing
+// `POSTGRES_PASSWORD=hunter2` or `DATABASE_PASSWORD: hunter2` on a failed stand-up, which
+// is not a token shape and is not in the known-value list (the harness never sees the
+// service's own secrets). The key token is matched within a surrounding identifier so
+// `DB_ACCESS_KEY`/`api_key` are covered; `auth` is deliberately excluded so it can't
+// clobber a git `Author:` line. The value is the first whitespace-delimited run.
+const CREDENTIAL_ASSIGNMENT =
+  /\b([A-Za-z0-9_]*(?:password|passwd|pwd|secret|token|key|credential)[A-Za-z0-9_]*\s*[:=]\s*)\S+/gi
+/**
+ * Strip credentials out of any string before it is logged or stored.
+ *
+ * Two passes run on every call:
+ *  1. VALUE pass — every supplied known secret of at least `MIN_REDACT_LEN` characters is
+ *     replaced with `***`, longest first so a whole credential blob is scrubbed before any
+ *     shorter token it contains.
+ *  2. PATTERN pass — URL userinfo (`https://user:pass@host`), `x-access-token:<token>`,
+ *     bare `ghs_`/`ghp_`/`gho_`/`github_pat_` shapes, and credential-named `KEY=value` /
+ *     `KEY: value` assignments.
+ *
+ * The value pass runs FIRST on purpose: a pattern rule can rewrite only part of a known
+ * secret (e.g. a padded base64 token `…key…==` looks like a `key=` assignment), after
+ * which the exact-value match would no longer find it and the untouched part would leak.
+ * Once a value is already `***` the pattern rules can only redact more, never less.
+ *
+ * Idempotent — safe to call on already-redacted text.
+ *
+ * @param input        Text to scrub.
+ * @param knownSecrets Exact secret strings to remove; entries shorter than
+ *                     `MIN_REDACT_LEN` are ignored so common substrings are not mangled.
+ * @returns The scrubbed text.
+ */
+export function redact(input: string, knownSecrets: readonly string[] = []): string {
+  // `filter` copies, so sorting does not mutate the caller's (readonly) list.
+  const scrubbable = knownSecrets
+    .filter((secret) => secret.length >= MIN_REDACT_LEN)
+    .sort((a, b) => b.length - a.length)
+  let out = input
+  for (const secret of scrubbable) out = out.split(secret).join('***')
+  return out
+    .replace(/(https?:\/\/)[^@\s/]*@/gi, '$1***@')
+    .replace(/x-access-token:[^@\s]+/gi, 'x-access-token:***')
+    .replace(/\b(gh[pso]_|github_pat_)[A-Za-z0-9_]+/g, '$1***')
+    .replace(CREDENTIAL_ASSIGNMENT, '$1***')
+}
+/**
+ * Shape-only scrub for callers that hold no list of known secret values: delegates to
+ * {@link redact} with an empty known-secret list, so only the pattern rules apply.
+ */
+export function redactSecrets(input: string): string {
+  const noKnownValues: readonly string[] = []
+  return redact(input, noKnownValues)
+}
+/** Maximum number of characters of captured command output stored on an infra record. */
+export const MAX_CAPTURED_OUTPUT_CHARS = 16_000
+/**
+ * Turn a command's stdout and stderr into one stored, credential-scrubbed string.
+ *
+ * Each stream is stringified (`null`/`undefined` count as empty) and trimmed; the
+ * non-empty ones are joined with a newline, stdout first, and passed through
+ * {@link redactSecrets}. When the scrubbed text exceeds {@link MAX_CAPTURED_OUTPUT_CHARS}
+ * only its tail is kept — that is where a failure's error usually sits — behind a marker
+ * line stating how many earlier characters were dropped.
+ *
+ * @returns The bounded text, or `undefined` when both streams are empty after trimming.
+ */
+export function captureRedactedOutput(stdout: unknown, stderr: unknown): string | undefined {
+  const parts: string[] = []
+  for (const stream of [stdout, stderr]) {
+    const text = String(stream ?? '').trim()
+    if (text.length > 0) parts.push(text)
+  }
+  if (parts.length === 0) return undefined
+  const scrubbed = redactSecrets(parts.join('\n'))
+  const overflow = scrubbed.length - MAX_CAPTURED_OUTPUT_CHARS
+  if (overflow <= 0) return scrubbed
+  const tail = scrubbed.slice(-MAX_CAPTURED_OUTPUT_CHARS)
+  return `…(${overflow} earlier chars trimmed)\n${tail}`
+}
+/**
+ * Walk a parsed JSON value depth-first and add every string leaf of at least
+ * `MIN_HARVEST_LEN` characters to `out`. Arrays contribute their elements and objects
+ * their property values; every other kind of value is ignored.
+ */
+function collectStrings(value: unknown, out: Set<string>): void {
+  if (typeof value === 'string') {
+    if (value.length >= MIN_HARVEST_LEN) out.add(value)
+    return
+  }
+  // `typeof null` is 'object', so null has to be ruled out explicitly.
+  if (value === null || typeof value !== 'object') return
+  const children: unknown[] = Array.isArray(value) ? value : Object.values(value)
+  for (const child of children) collectStrings(child, out)
+}
+/**
+ * Build the list of secret strings to scrub from a run's stderr/output.
+ *
+ * The subscription token itself is always included when non-empty: for Claude (and the
+ * Anthropic-compatible vendors GLM/Kimi/DeepSeek) the credential IS that string. For
+ * Codex the credential is a whole `auth.json` blob, so the token is additionally parsed
+ * as JSON and every token-like string value inside it (access/refresh tokens, ids) is
+ * added too — otherwise a token echoed on its OWN, outside the blob, would slip past a
+ * whole-blob match and leak into an error message.
+ *
+ * @param subscriptionToken The leased credential: a bare token or a JSON blob.
+ * @returns The distinct secrets, whole-token entry first.
+ */
+export function secretsToRedact(subscriptionToken: string): string[] {
+  const found = new Set<string>()
+  if (subscriptionToken) found.add(subscriptionToken)
+  try {
+    const parsed: unknown = JSON.parse(subscriptionToken)
+    collectStrings(parsed, found)
+  } catch {
+    // Not JSON (a Claude OAuth token / API key) — the whole-string entry covers it.
+  }
+  return Array.from(found)
+}

package/src/runner.ts ADDED Viewed

@@ -0,0 +1,384 @@
+import { redactSecrets } from './redact.js'
+import type { FollowUpLine } from './follow-ups.js'
+import type { TodoProgress, ToolSpan } from './pi.js'
+import { log, type Logger } from './logger.js'
+import {
+  type FailureCause,
+  failureCauseOf,
+  inactivityAbortMessage,
+  maxDurationAbortMessage,
+} from './failure.js'
+/** Non-secret correlation fields a job carries on every log line (jobId, repo, branch, …); the registry spreads them into `log.child(...)` when it builds the per-job logger. */
+type LogFields = Record<string, unknown>
+// The async job lifecycle for the container. A coding/explore run can take many
+// minutes, so the backend does not hold a single synchronous request open: it POSTs
+// /jobs (which starts a background job and returns immediately) and then polls
+// GET /jobs/{id}. Two watchdogs bound every job so a container can never run forever —
+// an inactivity timer (kills the agent when it stops producing output) and an overall
+// max-duration cap. The work itself is the generic `agent` handler (see agent.ts); this
+// file owns only the registry + watchdogs that drive any job to completion.
+/** Options threaded into the long-running git/Pi work so a watchdog can cancel it. */
+export interface RunOptions {
+  /** Aborted by the registry when the inactivity or max-duration watchdog fires. */
+  signal?: AbortSignal
+  /** Call on any sign of life: the registry stamps the job's heartbeat and re-arms the inactivity timer. */
+  onActivity?: () => void
+  /** Receives the latest subtask counts as Pi updates its todo list. */
+  onProgress?: (progress: TodoProgress) => void
+  /** Receives one compact {@link ToolSpan} per completed tool call (observability). */
+  onSpan?: (span: ToolSpan) => void
+  /** Receives the forward-looking follow-up / question items the Coder streamed since the last poll. */
+  onFollowUp?: (items: FollowUpLine[]) => void
+  /**
+   * Mark the coarse lifecycle phase the handler has entered (`clone` / `agent` / `push` / …).
+   * Drives the stuck-run breadcrumb: an inactivity kill reports WHICH phase was hung, and the
+   * per-phase wall-clock is logged on completion. Free-form; unknown phases just show verbatim.
+   */
+  onPhase?: (phase: string) => void
+  /** A per-job child logger carrying the run's correlation fields (jobId, repo, branch, …). */
+  log?: Logger
+}
+/** Lifecycle state of a tracked job: `running` from start, then `done` when the runner resolved or `failed` when it threw or was aborted. */
+export type JobState = 'running' | 'done' | 'failed'
+/**
+ * The minimum a job result must expose: a structured `error` marks a job-level
+ * failure even when the HTTP run itself succeeded. Every agent result (explore /
+ * coding / bootstrap / conflict) satisfies this, so {@link JobRegistry} is generic
+ * over the result it tracks while reusing one watchdog/lifecycle.
+ */
+export interface JobResultBase {
+  /** Failure message of a run that exited cleanly but did not succeed; absent on success. */
+  error?: string
+  /**
+   * The structured reason a clean-exit result failed (set alongside `error` by a handler that
+   * finished but produced an unusable/failed result — no-usable-output, no-changes, …). The
+   * registry copies it onto the job view's `failureCause` only when `error` is set as well.
+   * Absent on a watchdog/throw failure (the registry sets that cause itself). See
+   * {@link FailureCause}.
+   */
+  failureCause?: FailureCause
+}
+/** The job view returned by GET /jobs/{id}, generic over the orchestration's result. */
+export interface JobView<TResult extends JobResultBase = JobResultBase> {
+  /** The backend-supplied job id the registry keys the job by. */
+  id: string
+  /** Where the job is in its lifecycle; see {@link JobState}. */
+  state: JobState
+  /** Epoch ms at which the registry started the job. */
+  startedAt: number
+  /**
+   * Epoch ms of the last sign of progress (job start, or Pi output). Also stamped one
+   * final time when the job settles, whether it finished or failed.
+   */
+  heartbeatAt: number
+  /**
+   * The coarse lifecycle phase the job is CURRENTLY in (`starting` → `clone` → `agent`
+   * → `push` → `done`/`failed`), so the backend can surface WHAT the container is doing
+   * rather than a blank "working" state — is it still cloning/preparing the checkout, or
+   * has the agent begun making calls? The same per-phase marker that drives the stuck-run
+   * breadcrumb on a failure, exposed live here while the job runs. Free-form; unknown
+   * phases just show verbatim. Typed optional, but the registry always sets it (seeded
+   * `starting` at job start).
+   */
+  phase?: string
+  /**
+   * Latest subtask progress from Pi's `todo` tool while the job runs — the
+   * Worker poll surfaces it to the board (e.g. "3/8 done"). Absent until Pi
+   * first touches its todo list (or if the model never uses it).
+   */
+  progress?: TodoProgress
+  /** Present when `state === 'done'`: the orchestration's structured result. */
+  result?: TResult
+  /** Present when `state === 'failed'`: why the job faulted (or was killed). */
+  error?: string
+  /**
+   * The STRUCTURED failure cause, so the backend can classify the failure without
+   * regex-matching {@link error}. Present when `state === 'failed'`, and ALSO on a
+   * `done` job whose result carries both an `error` and a `failureCause` (a clean exit
+   * that still produced a failed result). Backward compatible — the backend prefers this
+   * and falls back to the (still-stable) `error` regex when absent.
+   * Container eviction is NOT represented here (the runtime facade detects that from a
+   * vanished container); see {@link FailureCause}.
+   */
+  failureCause?: FailureCause
+  /**
+   * Present when `state === 'failed'`: an extended, redacted diagnostic (phase-timing
+   * breakdown, last-tool breadcrumb, …) distinct from the one-line {@link error}. The
+   * backend surfaces it as the failure `detail` on the board card. Best-effort.
+   */
+  detail?: string
+  /**
+   * Tool spans accumulated SINCE THE LAST POLL (drain-on-read): the GET /jobs/{id}
+   * handler returns the spans buffered since the previous poll and clears the buffer,
+   * so the harness only ever holds one poll-interval's worth. Best-effort observability
+   * — a dropped poll response loses at most one window. Absent until a tool runs.
+   */
+  spans?: ToolSpan[]
+  /**
+   * Forward-looking follow-up / question items the Coder streamed SINCE THE LAST POLL
+   * (drain-on-read, exactly like {@link spans}): the GET /jobs/{id} handler returns the
+   * items buffered since the previous poll and clears the buffer. The backend appends them
+   * to the run's step so the Follow-up companion surfaces them live. Absent until the Coder
+   * surfaces the first one (and only on a follow-ups-enabled coding run).
+   */
+  followUps?: FollowUpLine[]
+}
+/**
+ * The registry's internal record for one job: the public {@link JobView} fields plus the
+ * bookkeeping that is stripped before a view is returned to a caller.
+ */
+interface JobEntry<TResult extends JobResultBase> extends JobView<TResult> {
+  /** The in-flight work; retained so the entry isn't GC-surprising (not awaited externally). */
+  promise: Promise<void>
+  /** Spans buffered since the last drain (see {@link JobView.spans}). */
+  spanBuffer: ToolSpan[]
+  /** Follow-up items buffered since the last drain (see {@link JobView.followUps}). */
+  followUpBuffer: FollowUpLine[]
+}
+/** Watchdog windows that bound every job. Tunable via the container's env. */
+export interface RunnerLimits {
+  /** Hard ceiling on total job wall-clock before it's force-failed (env `JOB_MAX_DURATION_MS`). */
+  maxDurationMs: number
+  /** Force-fail the job if the agent produces no output for this long (hang guard; env `JOB_INACTIVITY_MS`). */
+  inactivityMs: number
+}
+// Largest delay Node's timers accept; a `setTimeout` delay above this is reset to 1 ms.
+const MAX_TIMER_DELAY_MS = 2_147_483_647
+/**
+ * Read a positive numeric setting (a duration in ms) from an environment variable.
+ *
+ * Unset, empty, non-numeric, non-finite, zero and negative values all yield `fallback`.
+ * A valid value is capped at `MAX_TIMER_DELAY_MS`: the results feed `setTimeout`, which
+ * would otherwise turn an over-large delay into 1 ms and fire the watchdog immediately.
+ *
+ * @param value    Raw environment value, if any.
+ * @param fallback Returned as-is when `value` is unusable.
+ * @returns The parsed value (capped), or `fallback`.
+ */
+function intEnv(value: string | undefined, fallback: number): number {
+  const parsed = value ? Number(value) : NaN
+  if (!Number.isFinite(parsed) || parsed <= 0) return fallback
+  return Math.min(parsed, MAX_TIMER_DELAY_MS)
+}
+/**
+ * Resolve the watchdog windows from the environment, falling back to the built-in
+ * defaults when a variable is unset or unusable.
+ *
+ * @param env Environment to read `JOB_MAX_DURATION_MS` / `JOB_INACTIVITY_MS` from.
+ */
+export function loadRunnerLimits(env: NodeJS.ProcessEnv = process.env): RunnerLimits {
+  const ONE_MINUTE_MS = 60_000
+  // 60 minutes: generous headroom for serious multi-file coding tasks while
+  // still bounding a runaway container.
+  const maxDurationMs = intEnv(env.JOB_MAX_DURATION_MS, 60 * ONE_MINUTE_MS)
+  // 10 minutes of zero output is treated as hung (a single long LLM/tool call
+  // is far shorter; Pi streams events as it works). The per-git command ceiling
+  // (`GIT_TIMEOUT_MS` in git.ts) is DERIVED from this value — a fixed margin below
+  // it — so a slow clone/push (which emits no activity events) always times out
+  // with git's own clear reason rather than this watchdog's "likely hung" message,
+  // for any configured window. See the invariant note in git.ts.
+  const inactivityMs = intEnv(env.JOB_INACTIVITY_MS, 10 * ONE_MINUTE_MS)
+  return { maxDurationMs, inactivityMs }
+}
+/**
+ * Project an internal entry onto its public view: a fresh object holding every field
+ * except the registry-only bookkeeping (`promise`, `spanBuffer`, `followUpBuffer`).
+ */
+function toView<TResult extends JobResultBase>(entry: JobEntry<TResult>): JobView<TResult> {
+  // Rest-destructuring already yields a new object, so the entry itself is never exposed.
+  const {
+    promise: _inFlight,
+    spanBuffer: _pendingSpans,
+    followUpBuffer: _pendingFollowUps,
+    ...publicView
+  } = entry
+  return publicView
+}
+/**
+ * Tracks background jobs by id. Keyed by the backend-supplied job id (the per-step
+ * job id) so a re-dispatched start re-attaches to the running job rather than starting
+ * a duplicate — which keeps the durable driver's retries idempotent and avoids redoing
+ * already-running work. Generic over the job/result shape so the same lifecycle +
+ * inactivity/max-duration watchdogs drive every agent run.
+ *
+ * Nothing in this class removes an entry, so a finished or failed job stays pollable
+ * (and its record stays in memory) for as long as the registry lives.
+ */
+export class JobRegistry<TJob = unknown, TResult extends JobResultBase = JobResultBase> {
+  private readonly jobs = new Map<string, JobEntry<TResult>>()
+  constructor(
+    private readonly limits: RunnerLimits,
+    // The unit of work (the `agent` handler). Injectable so tests can drive the
+    // registry's lifecycle/watchdog logic with a different runner.
+    private readonly run: (job: TJob, opts: RunOptions) => Promise<TResult>,
+    // Non-secret correlation fields to bind on the per-job logger (repo, branch, agentKind).
+    // The registry is generic over the job shape, so the kind supplies this extractor; the
+    // job id is always bound. Defaults to no extra fields.
+    private readonly describe: (job: TJob) => LogFields = () => ({}),
+  ) {}
+  /**
+   * Start the job for `id`, or return the existing one (idempotent re-attach). On a
+   * re-attach the supplied `job` is ignored and no buffers are drained. Returns a
+   * snapshot view; the work itself continues in the background.
+   */
+  start(id: string, job: TJob): JobView<TResult> {
+    const existing = this.jobs.get(id)
+    if (existing) return toView(existing)
+    const now = Date.now()
+    const entry: JobEntry<TResult> = {
+      id,
+      state: 'running',
+      startedAt: now,
+      // Seed the live phase so a poll BEFORE the handler enters its first phase still
+      // shows "starting" (the container is up; the agent hasn't begun cloning yet)
+      // rather than an absent/blank phase.
+      phase: 'starting',
+      heartbeatAt: now,
+      promise: Promise.resolve(),
+      spanBuffer: [],
+      followUpBuffer: [],
+    }
+    this.jobs.set(id, entry)
+    entry.promise = this.drive(entry, job)
+    return toView(entry)
+  }
+  /**
+   * Poll the job — and DRAIN its tool-span and follow-up buffers (drain-on-read). The
+   * GET /jobs/{id} handler is the sole caller, so each poll returns the spans and
+   * follow-up items accumulated since the previous poll and clears them, bounding the
+   * harness buffers to one poll interval. `spans` / `followUps` are left off the view
+   * when their buffer is empty. Returns `undefined` for an unknown id.
+   */
+  get(id: string): JobView<TResult> | undefined {
+    const entry = this.jobs.get(id)
+    if (!entry) return undefined
+    const view = toView(entry)
+    if (entry.spanBuffer.length > 0) {
+      view.spans = entry.spanBuffer
+      entry.spanBuffer = []
+    }
+    if (entry.followUpBuffer.length > 0) {
+      view.followUps = entry.followUpBuffer
+      entry.followUpBuffer = []
+    }
+    return view
+  }
+  /**
+   * Run one job to completion under both watchdogs and record the outcome on `entry`.
+   * Arms the inactivity timer (re-armed by every `onActivity` heartbeat) and the
+   * max-duration timer; either one aborts the `AbortController` whose signal is handed
+   * to the runner. A resolved run marks the entry `done`; a rejected run marks it
+   * `failed` with the message/cause/detail built by `describeFailure` — the failure is
+   * captured on the entry rather than rethrown. Both timers are cleared and
+   * `heartbeatAt` is stamped once more when the run settles.
+   *
+   * NOTE(review): the watchdogs only abort the signal; the entry leaves `running` when
+   * the runner's promise settles, so this relies on every runner honouring `signal` —
+   * confirm against the handlers.
+   */
+  private async drive(entry: JobEntry<TResult>, job: TJob): Promise<void> {
+    const controller = new AbortController()
+    let killReason: 'inactivity' | 'max-duration' | undefined
+    const jobLog = log.child({ jobId: entry.id, ...this.describe(job) })
+    // Stuck-run breadcrumb: the coarse phase the handler is in, per-phase wall-clock, and
+    // the last completed tool — so an inactivity kill can say WHERE it hung instead of a
+    // bare "likely hung", and the finish/fail log carries the phase-timing breakdown.
+    let phase = 'starting'
+    let phaseEnteredAt = Date.now()
+    const phaseTimingsMs: Record<string, number> = {}
+    const markPhase = (next: string): void => {
+      const now = Date.now()
+      // Close out the phase being left; re-entering a phase accumulates onto its total.
+      phaseTimingsMs[phase] = (phaseTimingsMs[phase] ?? 0) + (now - phaseEnteredAt)
+      phase = next
+      phaseEnteredAt = now
+      // Surface the live phase on the view so a poll shows WHAT the container is doing
+      // (cloning / running the agent / pushing) — the same marker drives the failure
+      // breadcrumb. A terminal `done`/`failed` is set by the caller below.
+      entry.phase = next
+    }
+    let lastTool: { name: string; at: number } | undefined
+    let inactivity: ReturnType<typeof setTimeout> | undefined
+    const resetInactivity = (): void => {
+      clearTimeout(inactivity)
+      inactivity = setTimeout(() => {
+        // First watchdog to fire wins the reason (a later timer firing in the
+        // teardown window must not relabel why the job was killed).
+        killReason ??= 'inactivity'
+        controller.abort(new Error('inactivity timeout'))
+      }, this.limits.inactivityMs)
+    }
+    const cap = setTimeout(() => {
+      killReason ??= 'max-duration'
+      controller.abort(new Error('max duration exceeded'))
+    }, this.limits.maxDurationMs)
+    const heartbeat = (): void => {
+      entry.heartbeatAt = Date.now()
+      resetInactivity()
+    }
+    resetInactivity()
+    jobLog.info('job started', {})
+    try {
+      const result = await this.run(job, {
+        signal: controller.signal,
+        onActivity: heartbeat,
+        onProgress: (progress) => {
+          entry.progress = progress
+        },
+        onSpan: (span) => {
+          entry.spanBuffer.push(span)
+          lastTool = { name: span.tool, at: span.endedAt }
+        },
+        onFollowUp: (items) => {
+          entry.followUpBuffer.push(...items)
+        },
+        onPhase: (next) => markPhase(next),
+        log: jobLog,
+      })
+      markPhase('done')
+      entry.state = 'done'
+      entry.result = result
+      // A clean-exit result can still be a failure (e.g. no usable output): carry its
+      // structured cause onto the view so the backend classifies it without regex.
+      if (result.error && result.failureCause) entry.failureCause = result.failureCause
+      jobLog.info('job finished', {
+        durationMs: Date.now() - entry.startedAt,
+        jobError: result.error ?? null,
+        phaseTimingsMs,
+      })
+    } catch (error) {
+      // Capture the phase the job was IN before recording the 'failed' transition, so the
+      // breadcrumb names where it hung (markPhase below would otherwise overwrite it).
+      const failedInPhase = phase
+      markPhase('failed')
+      const { message, cause, detail } = this.describeFailure(
+        killReason,
+        error,
+        failedInPhase,
+        lastTool,
+        phaseTimingsMs,
+      )
+      entry.state = 'failed'
+      entry.error = message
+      entry.failureCause = cause
+      entry.detail = detail
+      jobLog.error('job failed', {
+        durationMs: Date.now() - entry.startedAt,
+        reason: killReason ?? 'error',
+        failureCause: cause,
+        error: message,
+        phaseTimingsMs,
+      })
+    } finally {
+      clearTimeout(inactivity)
+      clearTimeout(cap)
+      entry.heartbeatAt = Date.now()
+    }
+  }
+  /**
+   * Build the redacted one-line `error`, the structured {@link FailureCause}, and the extended
+   * `detail` for a failed job. Watchdog kills keep their regex-stable phrase (so the backend's
+   * `classifyBootstrapFailure` fallback still works) and gain a breadcrumb of where they hung;
+   * a thrown error keeps its own message and its structured cause when tagged (a git op → `git`,
+   * an upstream API call → `api`), else `agent`. All strings are credential-scrubbed.
+   *
+   * A recorded `killReason` takes precedence over the thrown `error`, whose message is
+   * then not used.
+   */
+  private describeFailure(
+    killReason: 'inactivity' | 'max-duration' | undefined,
+    error: unknown,
+    phase: string,
+    lastTool: { name: string; at: number } | undefined,
+    phaseTimingsMs: Record<string, number>,
+  ): { message: string; cause: FailureCause; detail: string } {
+    // `lastTool` is the last tool that COMPLETED (a span is emitted on tool end), so when the
+    // hang is inside a still-running tool the breadcrumb points at the prior one — worded
+    // "last completed tool" so the reader knows the stuck call may be the next, unfinished one.
+    const breadcrumb = lastTool
+      ? `last completed tool ${lastTool.name} ${Math.round((Date.now() - lastTool.at) / 1000)}s ago`
+      : 'no tool had completed yet'
+    const phaseBreakdown = Object.entries(phaseTimingsMs)
+      .map(([p, ms]) => `${p}=${Math.round(ms / 1000)}s`)
+      .join(', ')
+    if (killReason === 'inactivity') {
+      return {
+        message: redactSecrets(
+          `${inactivityAbortMessage(this.limits.inactivityMs)} (likely hung in ${phase} phase; ${breadcrumb})`,
+        ),
+        cause: 'inactivity-timeout',
+        detail: redactSecrets(`Phase timings: ${phaseBreakdown || '(none)'}. ${breadcrumb}.`),
+      }
+    }
+    if (killReason === 'max-duration') {
+      return {
+        message: redactSecrets(maxDurationAbortMessage(this.limits.maxDurationMs)),
+        cause: 'max-duration',
+        detail: redactSecrets(`Phase timings: ${phaseBreakdown || '(none)'}. ${breadcrumb}.`),
+      }
+    }
+    const raw = error instanceof Error ? error.message : String(error)
+    // A thrown error tagged with a structured cause (a git op / an upstream API call) keeps
+    // it; an untagged throw is a generic agent failure.
+    return {
+      message: redactSecrets(raw),
+      cause: failureCauseOf(error) ?? 'agent',
+      detail: redactSecrets(
+        `${phaseBreakdown ? `Phase timings: ${phaseBreakdown}. ` : ''}Failed in ${phase} phase; ${breadcrumb}.`,
+      ),
+    }
+  }
+}

package/src/server.ts ADDED Viewed

@@ -0,0 +1,153 @@
+import { timingSafeEqual } from 'node:crypto'
+import { createServer, type IncomingMessage, type ServerResponse } from 'node:http'
+import { parseAgentJob } from './job.js'
+import { handleAgent } from './agent.js'
+import { redactSecrets } from './git.js'
+import { JobRegistry, loadRunnerLimits, type JobResultBase, type RunOptions } from './runner.js'
+import { log } from './logger.js'
+// The container's HTTP entry point. The Worker addresses one instance per run and
+// POSTs a job to /jobs (the body's `kind` selects which agent runs); the harness
+// starts that job in the background (bounded by an inactivity + max-duration
+// watchdog) and returns a job id, which the Worker then polls via GET /jobs/{id}.
+// Nothing here holds long-lived secrets: the per-job GitHub + proxy tokens arrive
+// in the request body and live only for the duration of the job in an ephemeral
+// workspace.
+// TCP port the harness listens on; 8080 when PORT is not set.
+const PORT = Number(process.env.PORT ?? 8080)
+// Optional inbound auth. With HARNESS_SHARED_SECRET set, every non-health request has
+// to carry the same value in its `x-harness-secret` header (compared in constant
+// time). With it unset the harness stays open, exactly as before, so local/dev and the
+// existing acceptance flow need no configuration.
+// Direct callers send the header whenever the secret is configured: the local Docker
+// transport (LocalContainerRunnerTransport) and the Cloudflare transport
+// (CloudflareContainerTransport, which also injects the secret into the container
+// env). A self-hosted runner pool goes through its own control plane, so its operator
+// sets the secret on the pool side.
+const SHARED_SECRET = process.env.HARNESS_SHARED_SECRET
+const HEADER = 'x-harness-secret'
+/**
+ * Decide whether a request may use the non-health endpoints.
+ *
+ * @returns `true` when no shared secret is configured, or when the request's
+ *   `x-harness-secret` header (its first value, if repeated) equals the secret.
+ */
+function authorized(req: IncomingMessage): boolean {
+  if (SHARED_SECRET === undefined || SHARED_SECRET === '') return true
+  const header = req.headers[HEADER]
+  const presented = Array.isArray(header) ? header[0] : header
+  const got = Buffer.from(presented ?? '')
+  const want = Buffer.from(SHARED_SECRET)
+  // timingSafeEqual throws on buffers of different length, so rule that out first.
+  if (got.length !== want.length) return false
+  return timingSafeEqual(got, want)
+}
+// Each kind gets exactly one registry for the life of the container process. A run
+// talks to its own container instance (one Durable Object id per execution /
+// bootstrap job) and sends its step jobs there; all kinds share the watchdog/lifecycle
+// but differ in result type, hence a separate registry per kind, keyed by job id.
+const limits = loadRunnerLimits()
+/** One row of the dispatch table: the body validator plus the registry that runs the job. */
+interface KindEntry {
+  parse: (input: unknown) => { jobId: string }
+  registry: JobRegistry<never, JobResultBase>
+}
+/**
+ * Build a dispatch-table row for one kind.
+ *
+ * @param parse    Validates a request body into the kind's job.
+ * @param handler  The unit of work the registry runs under the shared limits.
+ * @param describe Optional extractor of non-secret correlation fields for the per-job
+ *                 logger (see JobRegistry.describe).
+ * @returns The row, with its job/result type parameters erased to fit the shared table.
+ */
+function defineKind<TJob extends { jobId: string }, TResult extends JobResultBase>(
+  parse: (input: unknown) => TJob,
+  handler: (job: TJob, opts: RunOptions) => Promise<TResult>,
+  describe?: (job: TJob) => Record<string, unknown>,
+): KindEntry {
+  const registry = new JobRegistry<TJob, TResult>(limits, handler, describe)
+  const row = { parse, registry }
+  return row as unknown as KindEntry
+}
+// Dispatch table. Only ONE manifest-driven kind is served now: the generic `agent`.
+// The job body's `mode` (explore | coding) and its data pick the flow, and WHAT the
+// agent does is entirely the backend's decision. The former bespoke per-kind handlers
+// (run/blueprint/spec/explore/merge/test/…) were strangled onto this kind and removed.
+// `POST /jobs` uses the body's `kind` to choose the entry; `GET /jobs/{id}` asks every
+// registry (job ids never collide across kinds). `kind` mirrors kernel's
+// `RunnerDispatchKind` (now also just `'agent'`); the harness keeps its own copy so the
+// image carries no runtime deps.
+const KINDS: Record<string, KindEntry> = {
+  agent: defineKind(parseAgentJob, handleAgent, (job) => {
+    const { mode, repo, branch } = job
+    return { mode, repo: `${repo.owner}/${repo.name}`, branch }
+  }),
+}
+/** Drain the request stream and return the whole body decoded as UTF-8 text. */
+async function readBody(req: IncomingMessage): Promise<string> {
+  const pieces: Buffer[] = []
+  let totalBytes = 0
+  for await (const piece of req) {
+    const buf = piece as Buffer
+    pieces.push(buf)
+    totalBytes += buf.length
+  }
+  return Buffer.concat(pieces, totalBytes).toString('utf8')
+}
+/** Write `body` as a JSON response with the given status and end the response. */
+function send(res: ServerResponse, status: number, body: unknown): void {
+  // Serialise before touching the response so a stringify failure leaves it unwritten.
+  const json = JSON.stringify(body)
+  res.writeHead(status, { 'content-type': 'application/json' }).end(json)
+}
+/**
+ * The HTTP server. Routes:
+ *  - `GET /health`     — liveness probe, never gated by the shared secret.
+ *  - `GET /jobs/{id}`  — poll a job (drains its buffered spans / follow-ups).
+ *  - `POST /jobs`      — start, or re-attach to, the job described by the body.
+ * Anything else is a 404. The handler runs as a detached async function, so every
+ * failure has to be turned into a response here: an escaped rejection would leave the
+ * request hanging and surface as an unhandled rejection on the process.
+ */
+const server = createServer((req, res) => {
+  void (async () => {
+    if (req.method === 'GET' && req.url === '/health') {
+      return send(res, 200, { status: 'ok' })
+    }
+    // All non-health endpoints are gated by the optional shared secret.
+    if (!authorized(req)) {
+      return send(res, 401, { error: 'unauthorized' })
+    }
+    // Poll a running/finished job: GET /jobs/{id}. Job ids are unique per kind, so
+    // check each registry in turn; the first hit wins.
+    if (req.method === 'GET' && req.url?.startsWith('/jobs/')) {
+      let id: string
+      try {
+        id = decodeURIComponent(req.url.slice('/jobs/'.length))
+      } catch {
+        // decodeURIComponent throws URIError on a malformed escape (e.g. `/jobs/%E0%A4%A`):
+        // that is a bad request, not a server fault.
+        return send(res, 400, { error: 'malformed job id' })
+      }
+      for (const { registry } of Object.values(KINDS)) {
+        const view = registry.get(id)
+        if (view) return send(res, 200, view)
+      }
+      return send(res, 404, { error: 'job not found' })
+    }
+    // Start (or re-attach to) a job: POST /jobs with the kind in the body. The body's
+    // `kind` selects the validator + registry; the rest is that kind's job spec.
+    // Returns immediately with the job id; the caller polls GET /jobs/{id} for live
+    // subtask progress and the final result. Idempotent: a re-dispatched POST
+    // (a durable-driver replay) re-attaches to the job already running for the id
+    // rather than starting a duplicate.
+    if (req.method === 'POST' && req.url === '/jobs') {
+      let kind: unknown
+      try {
+        const raw = JSON.parse(await readBody(req)) as Record<string, unknown>
+        kind = raw.kind
+        const entry = typeof kind === 'string' ? KINDS[kind] : undefined
+        if (!entry) {
+          return send(res, 404, { error: `unknown job kind '${String(kind)}'` })
+        }
+        const job = entry.parse(raw)
+        const view = entry.registry.start(job.jobId, job as never)
+        return send(res, 202, { jobId: view.id, state: view.state })
+      } catch (error) {
+        // Parse failures (incl. host-allowlist rejection) are client errors → 400.
+        const message = redactSecrets(error instanceof Error ? error.message : String(error))
+        log.error('failed to start job', {
+          kind: typeof kind === 'string' ? kind : undefined,
+          error: message,
+        })
+        return send(res, 400, { error: message })
+      }
+    }
+    return send(res, 404, { error: 'not found' })
+  })().catch((error: unknown) => {
+    // Last line of defence: answer (or at least close) the request instead of letting
+    // the rejection escape the detached handler.
+    const message = redactSecrets(error instanceof Error ? error.message : String(error))
+    log.error('request handler failed', { method: req.method, error: message })
+    if (!res.headersSent) {
+      send(res, 500, { error: 'internal error' })
+    } else if (!res.writableEnded) {
+      res.end()
+    }
+  })
+})
+// Auto-listen unless NODE_ENV is 'test', so a test run can import `server` without binding the port.
+if (process.env.NODE_ENV !== 'test') {
+  server.listen(PORT, () => {
+    console.log(`executor-harness listening on :${PORT}`)
+  })
+}
+export { server }