npm - @cat-factory/executor-harness - Versions diffs - 1.31.0 - Mend

@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/LICENSE +21 -0
package/README.md +143 -0
package/dist/agent-runner.js +389 -0
package/dist/agent.js +810 -0
package/dist/blueprint.js +367 -0
package/dist/bootstrap.js +99 -0
package/dist/ci-fixer.js +46 -0
package/dist/coding-agent.js +285 -0
package/dist/conflict-resolver.js +138 -0
package/dist/embed.js +8 -0
package/dist/explore.js +74 -0
package/dist/failure.js +47 -0
package/dist/fixer.js +44 -0
package/dist/follow-ups.js +103 -0
package/dist/frontend-infra.js +283 -0
package/dist/fs-utils.js +11 -0
package/dist/git.js +778 -0
package/dist/job.js +409 -0
package/dist/logger.js +27 -0
package/dist/merger.js +135 -0
package/dist/on-call.js +126 -0
package/dist/pi-workspace.js +237 -0
package/dist/pi.js +971 -0
package/dist/process.js +25 -0
package/dist/redact.js +109 -0
package/dist/runner.js +228 -0
package/dist/server.js +135 -0
package/dist/spec.js +754 -0
package/dist/structured-output.js +431 -0
package/dist/tester.js +191 -0
package/package.json +35 -0
package/src/agent-runner.ts +484 -0
package/src/agent.ts +948 -0
package/src/coding-agent.ts +393 -0
package/src/embed.ts +32 -0
package/src/failure.ts +73 -0
package/src/follow-ups.ts +106 -0
package/src/frontend-infra.ts +340 -0
package/src/fs-utils.ts +11 -0
package/src/git.ts +955 -0
package/src/job.ts +766 -0
package/src/logger.ts +45 -0
package/src/pi-workspace.ts +348 -0
package/src/pi.ts +1236 -0
package/src/process.ts +33 -0
package/src/redact.ts +109 -0
package/src/runner.ts +384 -0
package/src/server.ts +153 -0
package/src/structured-output.ts +524 -0

package/dist/structured-output.js ADDED Viewed

@@ -0,0 +1,431 @@
+import { redact, redactSecrets, secretsToRedact } from './redact.js';
+import { log } from './logger.js';
+import { PI_MAX_OUTPUT_TOKENS } from './pi.js';
+// A reusable abstraction for the "agent returns a structured JSON document as its
+// final assistant message" pattern (requirements, blueprint, merger — and any future
+// kind). An agent of this kind emits its result as text, not a tool call, and the
+// harness parses it. A model can produce text that won't parse: truncated JSON,
+// prose/fences around it, trailing commas, or the workers-ai-provider reasoning-model
+// streaming corruption that duplicates every token (`serviceservice…`).
+//
+// Instead of failing the whole container run on the first unparseable reply, a caller
+// describes its output once as a `StructuredOutputSpec<T>` (a label, a shape hint, and
+// a parser) and calls `resolveStructuredOutput`. That:
+//   1. tries to parse the primary (Pi) output;
+//   2. on failure, makes ONE structured repair call — a single-shot, no-tools,
+//      NON-streaming completion through the same proxy with `response_format:
+//      json_object`, asking the model to return only the corrected JSON — and reparses;
+//   3. returns the value (or null) plus structured diagnostics.
+//
+// It is provider-agnostic (external OpenAI-compatible upstreams honour
+// `response_format`; the in-process Workers AI path ignores it but answers buffered,
+// sidestepping the streaming double-emit, and the focused prompt keeps it to JSON) and
+// observable (the repair call lands in `llm_call_metrics` as a NON-streaming row, and
+// every parse failure / repair outcome is logged so "this happened" and "the retry
+// didn't help" are both queryable).
+/** Output-token ceiling for the repair call — mirrors the harness's PI_MAX_OUTPUT_TOKENS. */
+const REPAIR_MAX_OUTPUT_TOKENS = PI_MAX_OUTPUT_TOKENS;
+/** Hard cap on how much malformed text we feed the repair model (keep the call cheap). */
+const MAX_REPAIR_INPUT_CHARS = 40_000;
+const REPAIR_SYSTEM = 'You repair malformed JSON. You are given text that was meant to be a single ' +
+    'JSON object but does not parse. Return ONLY the corrected JSON object: no prose, ' +
+    'no markdown code fences, no commentary, and never repeat or duplicate any tokens. ' +
+    'Preserve the original content faithfully; only fix the JSON structure.';
+/**
+ * Largest immediately-repeated run length we look for. The corruption duplicates
+ * whole model tokens, which carry whitespace/punctuation context and run to ~10-15
+ * chars (`"service"`, `observability`); 24 covers them with headroom while staying
+ * cheap. We don't match single chars (k>=2): a lone doubled `{`/space is normal.
+ */
+const MAX_DOUBLE_RUN = 24;
+/**
+ * Cap on how much of a (possibly huge) failed output the doubling heuristic scans.
+ * The corruption is uniform across the whole reply, so a prefix is representative,
+ * and this bounds the otherwise O(n·{@link MAX_DOUBLE_RUN}²) scan on a large
+ * document. The detector only runs on the parse-failure path, so this is belt-and-
+ * braces rather than a hot-path concern.
+ */
+const MAX_DOUBLE_SCAN_CHARS = 20_000;
+/**
+ * Heuristic detector for the token-doubling corruption ("serviceservice",
+ * "observobservabilityability", `{\n{\n`). Greedy scan over a bounded prefix: at each
+ * position, find the longest 2..{@link MAX_DOUBLE_RUN}-char run that is immediately
+ * repeated and count both copies as "doubled", then measure the doubled fraction of
+ * the scanned text. Token-doubled text (consecutive `t t` pairs) scores near 1.0;
+ * normal JSON/prose scores low (only incidental short repeats). Advisory ONLY — it
+ * labels a failure for telemetry, it never mutates output.
+ */
+export function looksTokenDoubled(text) {
+    // Scan at most MAX_DOUBLE_SCAN_CHARS; `startsWith` stays within this prefix because
+    // `maxK` bounds each match so `i + matched * 2 <= n`.
+    const n = Math.min(text.length, MAX_DOUBLE_SCAN_CHARS);
+    if (n < 40)
+        return { doubled: false, ratio: 0 };
+    let covered = 0;
+    let i = 0;
+    while (i < n) {
+        let matched = 0;
+        const maxK = Math.min(MAX_DOUBLE_RUN, Math.floor((n - i) / 2));
+        for (let k = maxK; k >= 2; k--) {
+            // Is the k-char run at i immediately followed by an identical run?
+            if (text.startsWith(text.slice(i, i + k), i + k)) {
+                matched = k;
+                break;
+            }
+        }
+        if (matched > 0) {
+            covered += matched * 2;
+            i += matched * 2;
+        }
+        else {
+            i += 1;
+        }
+    }
+    const ratio = covered / n;
+    return { doubled: ratio >= 0.5, ratio };
+}
+/**
+ * Resolve a structured output: parse the agent's `primaryText` via `spec.parse`; on
+ * failure, make ONE structured repair call and re-parse. Returns the value (or null
+ * when both attempts fail) plus {@link StructuredOutputDiagnostics}. Logging side
+ * effects only; never throws (a repair transport error is captured in the diagnostics).
+ *
+ * @param spec - Output description. This function reads `spec.label` (attached to log
+ *   entries as `agent`) and `spec.parse` (text to value; a thrown error counts as a
+ *   failed parse). The whole spec is handed to the repair call.
+ * @param primaryText - The agent's final text. Its `.length` is read before any
+ *   parsing, so it must be a string.
+ * @param access - Repair-channel settings. `proxyBaseUrl` together with `sessionToken`
+ *   enables the proxy channel; `harness === 'claude-code'` together with
+ *   `subscriptionToken` enables the subscription channel. With neither channel the
+ *   function returns `value: null` without attempting a repair. `jobId` is attached to
+ *   log entries.
+ * @returns `{ value, diagnostics }`. `diagnostics.parsedOn` is `'primary'`, `'repair'`
+ *   or `'none'`; `repairError` is set only on the `'none'` outcomes. A parse result of
+ *   exactly `null` is treated as a failure.
+ */
+export async function resolveStructuredOutput(spec, primaryText, access) {
+    const trace = { agent: spec.label, jobId: access.jobId };
+    const primaryChars = primaryText.length;
+    const primary = safeParse(primaryText, spec.parse);
+    if (primary !== null) {
+        return {
+            value: primary,
+            diagnostics: {
+                parsedOn: 'primary',
+                primaryChars,
+                looksDoubled: false,
+                repairAttempted: false,
+                repairSucceeded: false,
+            },
+        };
+    }
+    // Pick a repair channel. The Pi harness repairs through the LLM proxy; the
+    // claude-code subscription harness has no proxy but DOES speak a standard
+    // Anthropic Messages API (Anthropic itself, or an Anthropic-compatible endpoint
+    // for GLM/Kimi/DeepSeek), so it repairs straight against the vendor with the
+    // leased token. Codex has no simple JSON API, so it keeps the graceful no-repair
+    // path (the smaller GLM/Kimi/DeepSeek models — most prone to malformed JSON — are
+    // covered by the claude-code channel).
+    const canProxyRepair = !!access.proxyBaseUrl && !!access.sessionToken;
+    const canSubscriptionRepair = access.harness === 'claude-code' && !!access.subscriptionToken;
+    if (!canProxyRepair && !canSubscriptionRepair) {
+        return {
+            value: null,
+            diagnostics: {
+                parsedOn: 'none',
+                primaryChars,
+                looksDoubled: looksTokenDoubled(primaryText).doubled,
+                repairAttempted: false,
+                repairSucceeded: false,
+                repairError: `structured-output repair unavailable for the ${access.harness ?? 'pi'} harness`,
+            },
+        };
+    }
+    // Primary failed: label the corruption (doubling is the known reasoning-model
+    // streaming bug) and record the event before spending a repair call.
+    const doubled = looksTokenDoubled(primaryText);
+    log.warn('structured-output: primary unparseable — attempting structured repair', {
+        ...trace,
+        primaryChars,
+        looksDoubled: doubled.doubled,
+        doubledRatio: Number(doubled.ratio.toFixed(2)),
+    });
+    let repairError;
+    let repaired = null;
+    try {
+        const repairedText = await callRepair(primaryText, spec, access);
+        repaired = safeParse(repairedText, spec.parse);
+        if (repaired === null)
+            repairError = 'repair output still did not parse';
+    }
+    catch (err) {
+        repairError = err instanceof Error ? err.message : String(err);
+    }
+    if (repaired !== null) {
+        log.info('structured-output: repair recovered a usable document', { ...trace, primaryChars });
+        return {
+            value: repaired,
+            diagnostics: {
+                parsedOn: 'repair',
+                primaryChars,
+                looksDoubled: doubled.doubled,
+                repairAttempted: true,
+                repairSucceeded: true,
+            },
+        };
+    }
+    // The retry did not help — the case we explicitly want visible in telemetry.
+    log.error('structured-output: unrecoverable after structured repair', {
+        ...trace,
+        primaryChars,
+        looksDoubled: doubled.doubled,
+        doubledRatio: Number(doubled.ratio.toFixed(2)),
+        repairError,
+    });
+    return {
+        value: null,
+        diagnostics: {
+            parsedOn: 'none',
+            primaryChars,
+            looksDoubled: doubled.doubled,
+            repairAttempted: true,
+            repairSucceeded: false,
+            repairError,
+        },
+    };
+}
+/**
+ * Make the structured repair call and return the model's text (the corrected JSON,
+ * ideally). Throws on a transport/HTTP error so the caller records it as the repair
+ * failure reason. Routes to the LLM proxy (Pi harness) when present, else to the
+ * claude-code subscription harness's own Anthropic-compatible endpoint.
+ */
+async function callRepair(badText, spec, access) {
+    if ((!access.proxyBaseUrl || !access.sessionToken) && access.subscriptionToken) {
+        return callSubscriptionRepair(badText, spec, access);
+    }
+    // Only ever called after the caller verified the proxy is present (Pi harness).
+    if (!access.proxyBaseUrl || !access.sessionToken) {
+        throw new Error('structured-output repair requires the LLM proxy (Pi harness)');
+    }
+    const url = `${access.proxyBaseUrl.replace(/\/+$/, '')}/chat/completions`;
+    const messages = [
+        { role: 'system', content: REPAIR_SYSTEM },
+        {
+            role: 'user',
+            content: `${spec.shapeHint}\n\n` +
+                'The text below was meant to be that JSON object but does not parse. Return ' +
+                'ONLY the corrected JSON object.\n\n' +
+                badText.slice(0, MAX_REPAIR_INPUT_CHARS),
+        },
+    ];
+    const base = {
+        // The proxy locks the model to the session's; sent for completeness.
+        model: access.model,
+        stream: false,
+        max_tokens: REPAIR_MAX_OUTPUT_TOKENS,
+        // No `temperature`: the newest models (Anthropic Opus 4.7+/the Claude 5 family) reject
+        // any sampling parameter with a 400, and a single-shot repair whose system prompt already
+        // forces JSON-only output doesn't need one — so we omit it for every model/provider.
+        messages,
+    };
+    // Capability gate: ask for `json_object` structured output (honoured by external
+    // OpenAI-compatible upstreams; ignored by the in-process Workers AI path). If an
+    // upstream REJECTS the parameter (4xx), fall back to the prompt-only path — the
+    // system prompt already demands JSON — rather than failing the repair outright.
+    const withFormat = { ...base, response_format: { type: 'json_object' } };
+    let res = await post(url, access, withFormat);
+    // A 4xx here means the upstream REJECTED `response_format` → fall back to prompt-only. Exclude
+    // 429: it is a rate-limit (already retried with backoff inside `post`), not a param rejection,
+    // so re-interpreting it as one would waste a second full prompt-only round on a rate-limit.
+    if (!res.ok && res.status !== 429 && res.status >= 400 && res.status < 500) {
+        log.warn('structured-output: repair upstream rejected response_format — retrying prompt-only', {
+            agent: spec.label,
+            jobId: access.jobId,
+            status: res.status,
+        });
+        res = await post(url, access, base);
+    }
+    if (!res.ok) {
+        const detail = redactSecrets((await res.text().catch(() => '')).slice(0, 300));
+        throw new Error(`repair call failed: HTTP ${res.status}${detail ? ` — ${detail}` : ''}`);
+    }
+    const json = (await res.json());
+    const content = json.choices?.[0]?.message?.content;
+    return typeof content === 'string' ? content : '';
+}
+/**
+ * Repair via the claude-code subscription harness's own vendor endpoint (no proxy):
+ * a single non-streaming Anthropic Messages call with the leased token. Anthropic
+ * itself uses the OAuth token (Bearer + the oauth beta header) against
+ * api.anthropic.com; an Anthropic-compatible vendor (GLM/Kimi/DeepSeek) uses its
+ * `subscriptionBaseUrl` with the API-token `x-api-key` header. Best-effort: any
+ * error propagates to the caller's `repairError` and degrades to the null path.
+ */
+async function callSubscriptionRepair(badText, spec, access) {
+    if (!access.subscriptionToken) {
+        throw new Error('structured-output subscription repair requires a subscription token');
+    }
+    const base = access.subscriptionBaseUrl?.replace(/\/+$/, '') ?? 'https://api.anthropic.com';
+    const url = `${base}/v1/messages`;
+    const headers = {
+        'content-type': 'application/json',
+        'anthropic-version': '2023-06-01',
+    };
+    if (access.subscriptionBaseUrl) {
+        // Anthropic-compatible vendor (GLM/Kimi/DeepSeek): API token via x-api-key.
+        headers['x-api-key'] = access.subscriptionToken;
+    }
+    else {
+        // Anthropic on a Claude subscription OAuth token.
+        headers.authorization = `Bearer ${access.subscriptionToken}`;
+        headers['anthropic-beta'] = 'oauth-2025-04-20';
+    }
+    const body = {
+        model: access.model,
+        max_tokens: REPAIR_MAX_OUTPUT_TOKENS,
+        // No `temperature`: Anthropic's newest models (Opus 4.7+/Claude 5 family) reject the
+        // sampling parameters with `400 invalid_request_error: temperature is deprecated for this
+        // model`. The repair prompt fully constrains the output to JSON, so determinism via
+        // temperature=0 isn't needed — omitting it keeps the call valid on every model.
+        system: REPAIR_SYSTEM,
+        messages: [
+            {
+                role: 'user',
+                content: `${spec.shapeHint}\n\n` +
+                    'The text below was meant to be that JSON object but does not parse. Return ' +
+                    'ONLY the corrected JSON object.\n\n' +
+                    badText.slice(0, MAX_REPAIR_INPUT_CHARS),
+            },
+        ],
+    };
+    const res = await fetchRepairWithRetry(() => fetch(url, {
+        method: 'POST',
+        headers,
+        body: JSON.stringify(body),
+        signal: access.signal,
+    }), access.signal, access.jobId);
+    if (!res.ok) {
+        // A vendor 4xx body can echo the API key/token back; `redact` applies both the
+        // GitHub-shaped pattern rules AND scrubs the leased subscription credential (the raw
+        // value, and — for a JSON auth bundle — its nested token leaves) before surfacing.
+        const raw = (await res.text().catch(() => '')).slice(0, 300);
+        const detail = redact(raw, secretsToRedact(access.subscriptionToken ?? ''));
+        throw new Error(`subscription repair call failed: HTTP ${res.status}${detail ? ` — ${detail}` : ''}`);
+    }
+    const json = (await res.json());
+    // Concatenate the text blocks of the Anthropic Messages response.
+    return (json.content ?? [])
+        .filter((b) => b?.type === 'text' && typeof b.text === 'string')
+        .map((b) => b.text)
+        .join('');
+}
+/** POST a chat-completions body to the proxy with the session bearer token. */
+function post(url, access, body) {
+    return fetchRepairWithRetry(() => fetch(url, {
+        method: 'POST',
+        headers: {
+            authorization: `Bearer ${access.sessionToken}`,
+            'content-type': 'application/json',
+        },
+        body: JSON.stringify(body),
+        signal: access.signal,
+    }), access.signal, access.jobId);
+}
+// A single structured-repair call is the LAST line of defence before an unparseable agent
+// reply fails the whole run. A TRANSIENT upstream blip on that one call — most importantly a
+// 429 rate-limit (which once turned a recoverable parse into a hard `no structured result`
+// failure), but also a 5xx or a dropped connection — must not be fatal, so retry it with
+// exponential backoff honoring `Retry-After`.
+const REPAIR_RETRY_ATTEMPTS = 3;
+const REPAIR_RETRY_BASE_MS = 500;
+const REPAIR_RETRY_MAX_MS = 8_000;
+/** `Retry-After` (seconds or HTTP-date) as ms, capped; undefined if absent/invalid. */
+function repairRetryAfterMs(res) {
+    const raw = res.headers.get('retry-after');
+    if (!raw)
+        return undefined;
+    const secs = Number(raw);
+    if (Number.isFinite(secs))
+        return secs > 0 ? Math.min(secs * 1000, REPAIR_RETRY_MAX_MS) : undefined;
+    const at = Date.parse(raw);
+    if (Number.isNaN(at))
+        return undefined;
+    const ms = at - Date.now();
+    return ms > 0 ? Math.min(ms, REPAIR_RETRY_MAX_MS) : undefined;
+}
+/** Exponential backoff (base 500ms, capped 8s) with up to 25% positive jitter. */
+function repairBackoffMs(attempt) {
+    const base = Math.min(REPAIR_RETRY_MAX_MS, REPAIR_RETRY_BASE_MS * 2 ** (attempt - 1));
+    return base + Math.floor(base * 0.25 * Math.random());
+}
+/** Sleep `ms`, rejecting early if the abort signal fires. */
+async function abortableDelay(ms, signal) {
+    if (ms <= 0)
+        return;
+    if (signal?.aborted)
+        throw signal.reason ?? new Error('aborted');
+    await new Promise((resolve, reject) => {
+        const onAbort = () => {
+            clearTimeout(timer);
+            reject(signal?.reason ?? new Error('aborted'));
+        };
+        const timer = setTimeout(() => {
+            signal?.removeEventListener('abort', onAbort);
+            resolve();
+        }, ms);
+        signal?.addEventListener('abort', onAbort, { once: true });
+    });
+}
+/**
+ * Run a repair fetch, retrying TRANSIENT failures (HTTP 429 / >=500 / network error) with
+ * exponential backoff honoring `Retry-After`. A caller abort is terminal. Non-transient
+ * responses (2xx/4xx, e.g. a `response_format` rejection) and the final attempt return
+ * as-is — the caller's existing `!res.ok` handling then produces the repair diagnostic
+ * without this masking a genuine, non-retryable error.
+ *
+ * @param doFetch - Zero-argument function that issues the request and resolves to the
+ *   response; it is invoked once per attempt, up to REPAIR_RETRY_ATTEMPTS times.
+ * @param signal - Optional abort signal. It is checked before each attempt and during
+ *   each backoff wait; when it is aborted the abort reason (or the fetch error raised
+ *   while aborted) is thrown instead of retrying.
+ * @param jobId - Attached to the backoff warning log entry.
+ * @returns The first non-transient response, or the last attempt's response even if it
+ *   is still a 429/5xx.
+ * @throws The last network error when the final attempt also threw (wrapped in a generic
+ *   Error when the thrown value was not an Error instance), or the abort reason.
+ */
+async function fetchRepairWithRetry(doFetch, signal, jobId) {
+    let lastError;
+    for (let attempt = 1; attempt <= REPAIR_RETRY_ATTEMPTS; attempt++) {
+        if (signal?.aborted)
+            throw signal.reason ?? new Error('aborted');
+        let res;
+        try {
+            res = await doFetch();
+        }
+        catch (err) {
+            // A caller/watchdog abort is terminal; a network error is transient → retry.
+            if (signal?.aborted)
+                throw err;
+            lastError = err;
+        }
+        if (res) {
+            const transient = res.status === 429 || res.status >= 500;
+            if (!transient || attempt >= REPAIR_RETRY_ATTEMPTS)
+                return res;
+            const wait = repairRetryAfterMs(res) ?? repairBackoffMs(attempt);
+            // Discard the unread body before retrying so the connection can be reused.
+            await res.body?.cancel().catch(() => { });
+            log.warn('structured-output: repair upstream transient failure — backing off', {
+                jobId,
+                status: res.status,
+                attempt,
+                waitMs: wait,
+            });
+            await abortableDelay(wait, signal);
+            continue;
+        }
+        if (attempt >= REPAIR_RETRY_ATTEMPTS)
+            break;
+        await abortableDelay(repairBackoffMs(attempt), signal);
+    }
+    throw lastError instanceof Error ? lastError : new Error('repair request failed after retries');
+}
+/** Run `parse`, treating a thrown error (e.g. `extractJsonObject`) as "no value". */
+function safeParse(text, parse) {
+    try {
+        return parse(text);
+    }
+    catch {
+        return null;
+    }
+}
+/** Append a compact, human-readable diagnostics suffix to a no-document failure reason. */
+export function diagnosticsSuffix(d) {
+    const parts = [];
+    if (d.looksDoubled)
+        parts.push('output appeared token-doubled (streaming corruption)');
+    if (d.repairAttempted) {
+        parts.push(d.repairSucceeded
+            ? 'structured repair recovered it'
+            : `structured repair did not help${d.repairError ? ` (${d.repairError})` : ''}`);
+    }
+    return parts.length > 0 ? ` [${parts.join('; ')}]` : '';
+}

package/dist/tester.js ADDED Viewed

@@ -0,0 +1,191 @@
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+import { cloneRepo } from './git.js';
+import { extractJsonObject } from './blueprint.js';
+import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
+import { diagnosticsSuffix, resolveStructuredOutput, } from './structured-output.js';
+import { log } from './logger.js';
+const exec = promisify(execFile);
+// Async job execution for the Tester. The engine dispatches this to run the project's
+// tests: clone the PR HEAD branch, stand its dependencies up (local docker-compose
+// infra, or test against an ephemeral env), run Pi to exercise the change + regress
+// related behaviour, and return ONLY a structured JSON report. The Tester makes NO
+// commits — on a withheld greenlight the engine loops the `fixer` and re-tests.
+/** Compact description of the report shape, fed to the JSON repair call. */
+const REPORT_SHAPE_HINT = 'Expected a test report: {"greenlight": boolean, "summary": string, "tested": string[], ' +
+    '"outcomes": [{"name": string, "status": "passed"|"failed"|"skipped", "detail"?: string}], ' +
+    '"concerns": [{"title": string, "detail": string, "severity": "low"|"medium"|"high"|"critical"}]}.';
+const SEVERITIES = new Set(['low', 'medium', 'high', 'critical']);
+const STATUSES = new Set(['passed', 'failed', 'skipped']);
+/** Coerce the agent's JSON into a well-formed report, defaulting conservatively. */
+function coerceReport(raw, summary, env) {
+    const o = (typeof raw === 'object' && raw !== null ? raw : {});
+    const outcomes = Array.isArray(o.outcomes)
+        ? o.outcomes
+            .filter((x) => typeof x === 'object' && x !== null)
+            .map((x) => ({
+            name: typeof x.name === 'string' ? x.name : '(unnamed)',
+            status: (STATUSES.has(x.status)
+                ? x.status
+                : 'skipped'),
+            ...(typeof x.detail === 'string' && x.detail ? { detail: x.detail } : {}),
+        }))
+        : [];
+    const concerns = Array.isArray(o.concerns)
+        ? o.concerns
+            .filter((x) => typeof x === 'object' && x !== null)
+            .map((x) => ({
+            title: typeof x.title === 'string' ? x.title : '(concern)',
+            detail: typeof x.detail === 'string' ? x.detail : '',
+            severity: (SEVERITIES.has(x.severity)
+                ? x.severity
+                : 'medium'),
+        }))
+        : [];
+    // A greenlight is only honoured when no BLOCKING (high/critical) concern was
+    // raised — never auto-pass a run with an open blocker, even if the model set
+    // greenlight:true by mistake. Low/medium concerns are advisory: they're reported
+    // but don't, on their own, withhold the greenlight (which would otherwise burn the
+    // whole fixer budget looping on a trivial nit). The engine re-applies this rule.
+    const blocking = concerns.some((c) => c.severity === 'high' || c.severity === 'critical');
+    const greenlight = o.greenlight === true && !blocking;
+    return {
+        greenlight,
+        summary: typeof o.summary === 'string' && o.summary ? o.summary : summary.slice(0, 2000),
+        tested: Array.isArray(o.tested)
+            ? o.tested.filter((t) => typeof t === 'string')
+            : [],
+        outcomes,
+        concerns,
+        environment: env,
+    };
+}
+/** Build the tester task prompt: how to bring the deps up + what to test. */
+function buildUserPrompt(job) {
+    const lines = [job.userPrompt, ''];
+    if (job.test.environment === 'ephemeral') {
+        lines.push('Run mode: ephemeral environment.', job.test.environmentUrl
+            ? `Test against the deployed environment at ${job.test.environmentUrl}. Do not start the service locally.`
+            : 'Test against the provided ephemeral environment URL from your context. Do not start the service locally.');
+    }
+    else if (job.test.noInfraDependencies) {
+        lines.push('Run mode: local, no infra dependencies — just install, build and run the test suite directly.');
+    }
+    else {
+        lines.push("Run mode: local. The service's infra dependencies from its docker-compose file have been started and are reachable on localhost. Read the README to learn how to configure the service against them, run any migrations, start the service and exercise it.");
+    }
+    lines.push('', 'Respond with ONLY the JSON test report described in your instructions.');
+    return lines.join('\n');
+}
+/**
+ * Bring the service's docker-compose dependencies up (local mode only). Best-effort:
+ * runs `docker compose -f <path> up -d --wait` in the checkout. A missing Docker
+ * daemon or a compose failure is logged and surfaced to the agent rather than failing
+ * the whole job — the agent can still run unit-level tests and report what it could.
+ */
+async function standUpInfra(dir, test, signal, trace) {
+    if (test.environment !== 'local' || test.noInfraDependencies || !test.composePath) {
+        return { started: false };
+    }
+    try {
+        log.info('test: standing up infra', { ...trace, composePath: test.composePath });
+        await exec('docker', ['compose', '-f', test.composePath, 'up', '-d', '--wait'], {
+            cwd: dir,
+            signal,
+            timeout: 5 * 60_000,
+        });
+        return { started: true };
+    }
+    catch (err) {
+        const note = err instanceof Error ? err.message : String(err);
+        log.warn('test: infra stand-up failed', { ...trace, error: note });
+        return { started: false, note };
+    }
+}
+/** Tear the docker-compose dependencies down (best-effort). */
+async function tearDownInfra(dir, test) {
+    if (test.environment !== 'local' || test.noInfraDependencies || !test.composePath)
+        return;
+    try {
+        await exec('docker', ['compose', '-f', test.composePath, 'down', '-v'], {
+            cwd: dir,
+            timeout: 2 * 60_000,
+        });
+    }
+    catch {
+        // The container is ephemeral and torn down with the run anyway — ignore.
+    }
+}
+/**
+ * Run one Tester job end to end: clone branch → stand up infra → Pi tests → report.
+ *
+ * Inside the `withWorkspace('test', …)` callback this clones the repo with `job.branch`
+ * as the branch to check out, tries to stand the compose infra up, runs the agent with
+ * `expectsEdits: false`, and resolves the agent's `summary` text into a report via
+ * `resolveStructuredOutput` (which may make one repair call). If standing the infra up
+ * reported a problem, a note about it is appended to the agent's prompt. The infra
+ * tear-down runs in a `finally`, so it happens whether or not the agent run succeeded.
+ *
+ * @param job - The tester job. Fields read here: `jobId`, `repo`, `branch`, `ghToken`,
+ *   `test`, `systemPrompt`, `userPrompt` (via the prompt builder), `model`, `harness`,
+ *   `subscriptionToken`, `subscriptionBaseUrl`, `proxyBaseUrl`, `sessionToken`.
+ * @param opts - Optional run options; `opts.signal` is forwarded to the clone, the
+ *   infra stand-up and the structured-output resolution, and `opts` as a whole is
+ *   passed to the agent run.
+ * @returns Whatever `withWorkspace` yields for the callback — presumably the callback's
+ *   own result (verify against `withWorkspace`): `{ report, summary, stats }` on
+ *   success, or `{ summary, stats, error }` when no report could be parsed; `usage` is
+ *   included in either case only when the agent run reported it.
+ */
+export async function handleTester(job, opts = {}) {
+    const trace = { jobId: job.jobId, repo: `${job.repo.owner}/${job.repo.name}`, branch: job.branch };
+    return withWorkspace('test', async (dir) => {
+        log.info('test: cloning PR branch', trace);
+        await cloneRepo({
+            repo: { ...job.repo, baseBranch: job.branch },
+            ghToken: job.ghToken,
+            dir,
+            signal: opts.signal,
+        });
+        const infra = await standUpInfra(dir, job.test, opts.signal, trace);
+        try {
+            log.info('test: running agent', { ...trace, environment: job.test.environment });
+            let userPrompt = buildUserPrompt(job);
+            if (infra.note) {
+                userPrompt += `\n\nNote: standing the infra up reported a problem (${infra.note}). Test what you can and flag any dependency-related gaps as concerns.`;
+            }
+            const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
+                dir,
+                systemPrompt: job.systemPrompt,
+                userPrompt,
+                model: job.model,
+                harness: job.harness,
+                subscriptionToken: job.subscriptionToken,
+                subscriptionBaseUrl: job.subscriptionBaseUrl,
+                proxyBaseUrl: job.proxyBaseUrl,
+                sessionToken: job.sessionToken,
+                // The tester only assesses (it commits nothing), so the no-edit guard must
+                // not fire on its legitimately edit-free run.
+                expectsEdits: false,
+            }, opts);
+            const { value: report, diagnostics } = await resolveStructuredOutput({
+                label: 'tester',
+                shapeHint: REPORT_SHAPE_HINT,
+                parse: (text) => coerceReport(extractJsonObject(text), text, job.test.environment),
+            }, summary, {
+                harness: job.harness,
+                subscriptionToken: job.subscriptionToken,
+                subscriptionBaseUrl: job.subscriptionBaseUrl,
+                proxyBaseUrl: job.proxyBaseUrl,
+                sessionToken: job.sessionToken,
+                model: job.model,
+                jobId: job.jobId,
+                signal: opts.signal,
+            });
+            if (!report) {
+                return {
+                    summary,
+                    stats,
+                    error: noReportReason(stats, stderrTail, diagnostics),
+                    ...(usage ? { usage } : {}),
+                };
+            }
+            log.info('test: reported', {
+                ...trace,
+                greenlight: report.greenlight,
+                concerns: report.concerns.length,
+            });
+            return { report, summary, stats, ...(usage ? { usage } : {}) };
+        }
+        finally {
+            await tearDownInfra(dir, job.test);
+        }
+    });
+}
+/** Human-readable reason a tester run produced no usable report. */
+function noReportReason(stats, stderrTail, diagnostics) {
+    const cause = agentNeverActed(stats)
+        ? NEVER_ACTED_CAUSE
+        : ' The agent did not return a parseable JSON test report.';
+    return `Tester produced no report.${cause}${diagnostics ? diagnosticsSuffix(diagnostics) : ''}${agentOutputTail(stderrTail)}`;
+}

package/package.json ADDED Viewed

@@ -0,0 +1,35 @@
+{
+  "name": "@cat-factory/executor-harness",
+  "version": "1.31.0",
+  "description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
+  "type": "module",
+  "main": "./dist/server.js",
+  "exports": {
+    ".": "./dist/server.js",
+    "./embed": "./src/embed.ts"
+  },
+  "files": [
+    "dist",
+    "src"
+  ],
+  "publishConfig": {
+    "access": "public"
+  },
+  "devDependencies": {
+    "@hono/node-server": "^2.0.6",
+    "@types/node": "^26.0.0",
+    "hono": "^4.12.27",
+    "typescript": "^6.0.3",
+    "vitest": "^4.1.9",
+    "@cat-factory/server": "0.65.2",
+    "@cat-factory/spend": "0.10.67"
+  },
+  "scripts": {
+    "build": "tsc -p tsconfig.json",
+    "typecheck": "tsc -p tsconfig.typecheck.json --noEmit",
+    "start": "node dist/server.js",
+    "test": "vitest run",
+    "test:acceptance": "vitest run --config vitest.acceptance.config.ts",
+    "image:publish": "bash scripts/publish-image.sh"
+  }
+}