@cat-factory/executor-harness 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +143 -0
  3. package/dist/agent-runner.js +389 -0
  4. package/dist/agent.js +810 -0
  5. package/dist/blueprint.js +367 -0
  6. package/dist/bootstrap.js +99 -0
  7. package/dist/ci-fixer.js +46 -0
  8. package/dist/coding-agent.js +285 -0
  9. package/dist/conflict-resolver.js +138 -0
  10. package/dist/embed.js +8 -0
  11. package/dist/explore.js +74 -0
  12. package/dist/failure.js +47 -0
  13. package/dist/fixer.js +44 -0
  14. package/dist/follow-ups.js +103 -0
  15. package/dist/frontend-infra.js +283 -0
  16. package/dist/fs-utils.js +11 -0
  17. package/dist/git.js +778 -0
  18. package/dist/job.js +409 -0
  19. package/dist/logger.js +27 -0
  20. package/dist/merger.js +135 -0
  21. package/dist/on-call.js +126 -0
  22. package/dist/pi-workspace.js +237 -0
  23. package/dist/pi.js +971 -0
  24. package/dist/process.js +25 -0
  25. package/dist/redact.js +109 -0
  26. package/dist/runner.js +228 -0
  27. package/dist/server.js +135 -0
  28. package/dist/spec.js +754 -0
  29. package/dist/structured-output.js +431 -0
  30. package/dist/tester.js +191 -0
  31. package/package.json +35 -0
  32. package/src/agent-runner.ts +484 -0
  33. package/src/agent.ts +948 -0
  34. package/src/coding-agent.ts +393 -0
  35. package/src/embed.ts +32 -0
  36. package/src/failure.ts +73 -0
  37. package/src/follow-ups.ts +106 -0
  38. package/src/frontend-infra.ts +340 -0
  39. package/src/fs-utils.ts +11 -0
  40. package/src/git.ts +955 -0
  41. package/src/job.ts +766 -0
  42. package/src/logger.ts +45 -0
  43. package/src/pi-workspace.ts +348 -0
  44. package/src/pi.ts +1236 -0
  45. package/src/process.ts +33 -0
  46. package/src/redact.ts +109 -0
  47. package/src/runner.ts +384 -0
  48. package/src/server.ts +153 -0
  49. package/src/structured-output.ts +524 -0
@@ -0,0 +1,431 @@
1
+ import { redact, redactSecrets, secretsToRedact } from './redact.js';
2
+ import { log } from './logger.js';
3
+ import { PI_MAX_OUTPUT_TOKENS } from './pi.js';
4
+ // A reusable abstraction for the "agent returns a structured JSON document as its
5
+ // final assistant message" pattern (requirements, blueprint, merger — and any future
6
+ // kind). An agent of this kind emits its result as text, not a tool call, and the
7
+ // harness parses it. A model can produce text that won't parse: truncated JSON,
8
+ // prose/fences around it, trailing commas, or the workers-ai-provider reasoning-model
9
+ // streaming corruption that duplicates every token (`serviceservice…`).
10
+ //
11
+ // Instead of failing the whole container run on the first unparseable reply, a caller
12
+ // describes its output once as a `StructuredOutputSpec<T>` (a label, a shape hint, and
13
+ // a parser) and calls `resolveStructuredOutput`. That:
14
+ // 1. tries to parse the primary (Pi) output;
15
+ // 2. on failure, makes ONE structured repair call — a single-shot, no-tools,
16
+ // NON-streaming completion through the same proxy with `response_format:
17
+ // json_object`, asking the model to return only the corrected JSON — and reparses;
18
+ // 3. returns the value (or null) plus structured diagnostics.
19
+ //
20
+ // It is provider-agnostic (external OpenAI-compatible upstreams honour
21
+ // `response_format`; the in-process Workers AI path ignores it but answers buffered,
22
+ // sidestepping the streaming double-emit, and the focused prompt keeps it to JSON) and
23
+ // observable (the repair call lands in `llm_call_metrics` as a NON-streaming row, and
24
+ // every parse failure / repair outcome is logged so "this happened" and "the retry
25
+ // didn't help" are both queryable).
26
/**
 * Output-token limit sent with every repair call. It is simply the
 * `PI_MAX_OUTPUT_TOKENS` value imported from `./pi.js`.
 */
const REPAIR_MAX_OUTPUT_TOKENS = PI_MAX_OUTPUT_TOKENS;
/** Maximum number of characters of malformed text forwarded to the repair model. */
const MAX_REPAIR_INPUT_CHARS = 40_000;
/**
 * System prompt for the repair call. The fragments are joined without a separator,
 * so each fragment carries its own trailing space.
 */
const REPAIR_SYSTEM = [
    'You repair malformed JSON. You are given text that was meant to be a single ',
    'JSON object but does not parse. Return ONLY the corrected JSON object: no prose, ',
    'no markdown code fences, no commentary, and never repeat or duplicate any tokens. ',
    'Preserve the original content faithfully; only fix the JSON structure.',
].join('');
34
/**
 * Longest run (in characters) that is checked for an immediate repetition.
 * Runs shorter than two characters are never considered, so a single doubled
 * character (`{{`, two spaces) does not count as corruption.
 */
const MAX_DOUBLE_RUN = 24;
/**
 * Only this many leading characters of the text are scanned. This bounds the
 * cost of the scan on very large inputs.
 */
const MAX_DOUBLE_SCAN_CHARS = 20_000;
/**
 * Estimate whether `text` shows the "every token emitted twice" corruption
 * (e.g. `serviceservice`).
 *
 * Walks the scanned prefix left to right. At each position it looks for the
 * longest run of 2..MAX_DOUBLE_RUN characters that is immediately followed by
 * an identical copy; when one is found, both copies are counted as doubled and
 * the walk jumps past them, otherwise it advances one character.
 *
 * Advisory only: the input is never modified.
 *
 * @param {string} text - The text to inspect.
 * @returns {{doubled: boolean, ratio: number}} `ratio` is the doubled share of
 *   the scanned prefix; `doubled` is true when that share is at least 0.5.
 *   Inputs whose scanned length is under 40 characters always yield
 *   `{ doubled: false, ratio: 0 }`.
 */
export function looksTokenDoubled(text) {
    const scanLength = Math.min(text.length, MAX_DOUBLE_SCAN_CHARS);
    if (scanLength < 40) {
        return { doubled: false, ratio: 0 };
    }
    let doubledChars = 0;
    let pos = 0;
    while (pos < scanLength) {
        // The run and its copy must both fit inside the scanned prefix.
        let runLength = Math.min(MAX_DOUBLE_RUN, Math.floor((scanLength - pos) / 2));
        // Shrink from the longest candidate until the run repeats (or we drop below 2).
        while (runLength >= 2 && !text.startsWith(text.slice(pos, pos + runLength), pos + runLength)) {
            runLength -= 1;
        }
        if (runLength >= 2) {
            const span = runLength * 2;
            doubledChars += span;
            pos += span;
        }
        else {
            pos += 1;
        }
    }
    const ratio = doubledChars / scanLength;
    return { doubled: ratio >= 0.5, ratio };
}
87
/**
 * Turn an agent's final text into a parsed structured value, with at most one
 * repair round-trip.
 *
 * Flow:
 *   1. Parse `primaryText` with `spec.parse` (a throwing parser counts as "no value").
 *   2. If that yields `null` and a repair channel exists, call `callRepair` once and
 *      parse its output the same way.
 *   3. Report which step (if any) produced the value, plus diagnostics.
 *
 * A repair channel exists when the access has both `proxyBaseUrl` and
 * `sessionToken`, or when the harness is `'claude-code'` and a
 * `subscriptionToken` is present. Without one, no repair is attempted and
 * nothing is logged.
 *
 * A failing repair call does not reject this function: the error message is
 * stored in `diagnostics.repairError`.
 *
 * @param spec - `{ label, shapeHint, parse }` describing the expected document.
 * @param primaryText - The agent's final message text.
 * @param access - Credentials/endpoints plus `jobId`, `model`, `harness`, `signal`.
 * @returns `{ value, diagnostics }`; `value` is `null` when nothing parsed.
 */
export async function resolveStructuredOutput(spec, primaryText, access) {
    const trace = { agent: spec.label, jobId: access.jobId };
    const primaryChars = primaryText.length;
    const firstTry = safeParse(primaryText, spec.parse);
    if (firstTry !== null) {
        return {
            value: firstTry,
            diagnostics: {
                parsedOn: 'primary',
                primaryChars,
                looksDoubled: false,
                repairAttempted: false,
                repairSucceeded: false,
            },
        };
    }
    // The primary text did not parse: classify it once for every remaining exit path.
    const { doubled: looksDoubled, ratio } = looksTokenDoubled(primaryText);
    const hasProxyChannel = Boolean(access.proxyBaseUrl) && Boolean(access.sessionToken);
    const hasSubscriptionChannel = access.harness === 'claude-code' && Boolean(access.subscriptionToken);
    if (!(hasProxyChannel || hasSubscriptionChannel)) {
        return {
            value: null,
            diagnostics: {
                parsedOn: 'none',
                primaryChars,
                looksDoubled,
                repairAttempted: false,
                repairSucceeded: false,
                repairError: `structured-output repair unavailable for the ${access.harness ?? 'pi'} harness`,
            },
        };
    }
    const doubledRatio = Number(ratio.toFixed(2));
    // Record the parse failure before spending a repair call on it.
    log.warn('structured-output: primary unparseable — attempting structured repair', {
        ...trace,
        primaryChars,
        looksDoubled,
        doubledRatio,
    });
    let repairedValue = null;
    let repairError;
    try {
        repairedValue = safeParse(await callRepair(primaryText, spec, access), spec.parse);
        if (repairedValue === null) {
            repairError = 'repair output still did not parse';
        }
    }
    catch (err) {
        repairError = err instanceof Error ? err.message : String(err);
    }
    if (repairedValue === null) {
        // The repair did not help either — make that visible in the logs.
        log.error('structured-output: unrecoverable after structured repair', {
            ...trace,
            primaryChars,
            looksDoubled,
            doubledRatio,
            repairError,
        });
        return {
            value: null,
            diagnostics: {
                parsedOn: 'none',
                primaryChars,
                looksDoubled,
                repairAttempted: true,
                repairSucceeded: false,
                repairError,
            },
        };
    }
    log.info('structured-output: repair recovered a usable document', { ...trace, primaryChars });
    return {
        value: repairedValue,
        diagnostics: {
            parsedOn: 'repair',
            primaryChars,
            looksDoubled,
            repairAttempted: true,
            repairSucceeded: true,
        },
    };
}
184
/**
 * Make the single structured repair call and return the model's text (ideally the
 * corrected JSON).
 *
 * Routing: when the proxy credentials (`proxyBaseUrl` + `sessionToken`) are missing
 * but a `subscriptionToken` is present, the call is delegated to
 * `callSubscriptionRepair`. Otherwise it goes to `<proxyBaseUrl>/chat/completions`.
 *
 * The proxy request first asks for `response_format: json_object`. A 4xx other than
 * 429 is treated as the upstream rejecting that parameter, and the request is sent
 * once more without it.
 *
 * @param badText - The unparseable text; only the first MAX_REPAIR_INPUT_CHARS are sent.
 * @param spec - Provides `shapeHint` (prepended to the prompt) and `label` (for logs).
 * @param access - Endpoint, credentials, `model`, `jobId`.
 * @returns The first choice's message content, or `''` when it is absent or not a string.
 * @throws {Error} When no repair channel is configured, or when the final response
 *   is not OK (`repair call failed: HTTP <status> — <redacted body excerpt>`).
 *   Transport errors from `post` propagate unchanged.
 */
async function callRepair(badText, spec, access) {
    const hasProxy = Boolean(access.proxyBaseUrl) && Boolean(access.sessionToken);
    if (!hasProxy && access.subscriptionToken) {
        return callSubscriptionRepair(badText, spec, access);
    }
    // Callers are expected to have verified a channel exists; fail loudly if not.
    if (!hasProxy) {
        throw new Error('structured-output repair requires the LLM proxy (Pi harness)');
    }
    const url = `${access.proxyBaseUrl.replace(/\/+$/, '')}/chat/completions`;
    const messages = [
        { role: 'system', content: REPAIR_SYSTEM },
        {
            role: 'user',
            content: `${spec.shapeHint}\n\n` +
                'The text below was meant to be that JSON object but does not parse. Return ' +
                'ONLY the corrected JSON object.\n\n' +
                badText.slice(0, MAX_REPAIR_INPUT_CHARS),
        },
    ];
    const base = {
        // The proxy locks the model to the session's; sent for completeness.
        model: access.model,
        stream: false,
        max_tokens: REPAIR_MAX_OUTPUT_TOKENS,
        // No `temperature` is sent: some models reject sampling parameters with a 400,
        // and the system prompt already constrains the reply to JSON.
        messages,
    };
    const withFormat = { ...base, response_format: { type: 'json_object' } };
    let res = await post(url, access, withFormat);
    // 429 is excluded: it is a rate limit (already retried inside `post`), not a
    // parameter rejection, so a second full round would be wasted on it.
    if (!res.ok && res.status !== 429 && res.status >= 400 && res.status < 500) {
        log.warn('structured-output: repair upstream rejected response_format — retrying prompt-only', {
            agent: spec.label,
            jobId: access.jobId,
            status: res.status,
        });
        // Discard the unread body of the rejected response before issuing the
        // fallback request, matching what the retry loop does between attempts.
        await res.body?.cancel().catch(() => { });
        res = await post(url, access, base);
    }
    if (!res.ok) {
        // Redact BEFORE truncating: cutting first could split a secret at the
        // 300-char boundary so the redaction pattern no longer matches and a
        // fragment of it leaks into the error message.
        const rawBody = await res.text().catch(() => '');
        const detail = redactSecrets(rawBody).slice(0, 300);
        throw new Error(`repair call failed: HTTP ${res.status}${detail ? ` — ${detail}` : ''}`);
    }
    const json = (await res.json());
    // `json?.` tolerates a literal `null` body; it is treated as "no content".
    const content = json?.choices?.[0]?.message?.content;
    return typeof content === 'string' ? content : '';
}
244
/**
 * Repair through the subscription harness's own vendor endpoint (no proxy): one
 * non-streaming Anthropic Messages request authenticated with the leased token.
 *
 * Endpoint and auth selection:
 *   - `subscriptionBaseUrl` set (non-empty after stripping trailing slashes):
 *     POST to `<subscriptionBaseUrl>/v1/messages` with the token in `x-api-key`.
 *   - otherwise: POST to `https://api.anthropic.com/v1/messages` with
 *     `Authorization: Bearer <token>` and the `anthropic-beta: oauth-2025-04-20` header.
 *
 * @param badText - The unparseable text; only the first MAX_REPAIR_INPUT_CHARS are sent.
 * @param spec - Provides `shapeHint`, prepended to the prompt.
 * @param access - `subscriptionToken`, optional `subscriptionBaseUrl`, `model`,
 *   `signal`, `jobId`.
 * @returns The concatenation of every `text` block in the response's `content`
 *   (an empty string when there is none).
 * @throws {Error} When no token is present, or when the response is not OK
 *   (`subscription repair call failed: HTTP <status> — <redacted body excerpt>`).
 *   Transport errors propagate unchanged.
 */
async function callSubscriptionRepair(badText, spec, access) {
    if (!access.subscriptionToken) {
        throw new Error('structured-output subscription repair requires a subscription token');
    }
    // Decide "vendor vs. Anthropic" ONCE so the URL and the auth headers can never
    // disagree. An empty (or slash-only) base URL counts as "not configured";
    // previously it produced the relative URL `/v1/messages`.
    const vendorBase = (access.subscriptionBaseUrl ?? '').replace(/\/+$/, '');
    const usesVendor = vendorBase !== '';
    const url = `${usesVendor ? vendorBase : 'https://api.anthropic.com'}/v1/messages`;
    const headers = {
        'content-type': 'application/json',
        'anthropic-version': '2023-06-01',
    };
    if (usesVendor) {
        // Anthropic-compatible vendor: API token via x-api-key.
        headers['x-api-key'] = access.subscriptionToken;
    }
    else {
        // Anthropic itself: OAuth bearer token plus the oauth beta header.
        headers.authorization = `Bearer ${access.subscriptionToken}`;
        headers['anthropic-beta'] = 'oauth-2025-04-20';
    }
    const body = {
        model: access.model,
        max_tokens: REPAIR_MAX_OUTPUT_TOKENS,
        // No `temperature` is sent: some models reject sampling parameters with a 400,
        // and the repair prompt already constrains the reply to JSON.
        system: REPAIR_SYSTEM,
        messages: [
            {
                role: 'user',
                content: `${spec.shapeHint}\n\n` +
                    'The text below was meant to be that JSON object but does not parse. Return ' +
                    'ONLY the corrected JSON object.\n\n' +
                    badText.slice(0, MAX_REPAIR_INPUT_CHARS),
            },
        ],
    };
    const res = await fetchRepairWithRetry(() => fetch(url, {
        method: 'POST',
        headers,
        body: JSON.stringify(body),
        signal: access.signal,
    }), access.signal, access.jobId);
    if (!res.ok) {
        // A vendor error body can echo the credential back. Redact BEFORE truncating:
        // cutting first could split the token at the 300-char boundary so the
        // raw-value scrub no longer matches and a fragment of it leaks.
        const raw = await res.text().catch(() => '');
        const detail = redact(raw, secretsToRedact(access.subscriptionToken ?? '')).slice(0, 300);
        throw new Error(`subscription repair call failed: HTTP ${res.status}${detail ? ` — ${detail}` : ''}`);
    }
    const json = (await res.json());
    // Tolerate a null body or a non-array `content`: both mean "no text returned".
    const blocks = Array.isArray(json?.content) ? json.content : [];
    return blocks
        .filter((b) => b?.type === 'text' && typeof b.text === 'string')
        .map((b) => b.text)
        .join('');
}
310
/**
 * Send `body` as JSON to `url` with the session token as a bearer credential.
 * The request goes through `fetchRepairWithRetry`, so the fetch may be issued
 * more than once; each attempt serializes the body afresh.
 *
 * @param url - Full chat-completions endpoint URL.
 * @param access - Supplies `sessionToken`, `signal` and `jobId`.
 * @param body - Request payload (serialized with `JSON.stringify`).
 * @returns Whatever `fetchRepairWithRetry` resolves with (the fetch response).
 */
function post(url, access, body) {
    const { signal, jobId } = access;
    const sendOnce = () => fetch(url, {
        method: 'POST',
        headers: {
            authorization: `Bearer ${access.sessionToken}`,
            'content-type': 'application/json',
        },
        body: JSON.stringify(body),
        signal,
    });
    return fetchRepairWithRetry(sendOnce, signal, jobId);
}
322
// The structured-repair call is the last chance to recover an unparseable agent
// reply before the run fails, so a transient upstream problem on that call (a 429
// rate limit, a 5xx, a dropped connection) is retried with exponential backoff
// that honours `Retry-After` instead of being treated as fatal.
const REPAIR_RETRY_ATTEMPTS = 3;
const REPAIR_RETRY_BASE_MS = 500;
const REPAIR_RETRY_MAX_MS = 8_000;
/**
 * Read the response's `Retry-After` header as a delay in milliseconds.
 *
 * The header is interpreted as a number of seconds when it is numeric, and as a
 * date otherwise. The result is capped at REPAIR_RETRY_MAX_MS.
 *
 * @param res - A response exposing `headers.get(name)`.
 * @returns The delay in ms, or `undefined` when the header is absent, unparseable,
 *   or does not describe a strictly positive wait.
 */
function repairRetryAfterMs(res) {
    const header = res.headers.get('retry-after');
    if (!header) {
        return undefined;
    }
    const seconds = Number(header);
    // An unparseable date gives NaN here, which fails the `> 0` test below.
    const delayMs = Number.isFinite(seconds) ? seconds * 1000 : Date.parse(header) - Date.now();
    return delayMs > 0 ? Math.min(delayMs, REPAIR_RETRY_MAX_MS) : undefined;
}
/**
 * Backoff delay for a 1-based attempt number: REPAIR_RETRY_BASE_MS doubled per
 * attempt, capped at REPAIR_RETRY_MAX_MS, plus random jitter of up to 25% of
 * that capped value.
 */
function repairBackoffMs(attempt) {
    const exponential = REPAIR_RETRY_BASE_MS * Math.pow(2, attempt - 1);
    const capped = Math.min(REPAIR_RETRY_MAX_MS, exponential);
    const jitter = Math.floor(capped * 0.25 * Math.random());
    return capped + jitter;
}
349
/**
 * Wait for `ms` milliseconds, rejecting as soon as `signal` aborts.
 *
 * A non-positive `ms` returns immediately without consulting the signal. The
 * rejection value is the signal's `reason`, or `new Error('aborted')` when the
 * reason is null/undefined. Whichever of the timer or the abort listener fires
 * first cleans up the other.
 *
 * @param {number} ms - Delay in milliseconds.
 * @param {AbortSignal} [signal] - Optional cancellation signal.
 */
async function abortableDelay(ms, signal) {
    if (ms <= 0) {
        return;
    }
    const abortReason = () => signal?.reason ?? new Error('aborted');
    if (signal?.aborted) {
        throw abortReason();
    }
    await new Promise((resolve, reject) => {
        function onElapsed() {
            signal?.removeEventListener('abort', onAborted);
            resolve();
        }
        function onAborted() {
            clearTimeout(timer);
            reject(abortReason());
        }
        const timer = setTimeout(onElapsed, ms);
        signal?.addEventListener('abort', onAborted, { once: true });
    });
}
367
/**
 * Execute `doFetch` up to REPAIR_RETRY_ATTEMPTS times, retrying only transient
 * outcomes: HTTP 429, HTTP >= 500, or a thrown (network) error.
 *
 * - A response that is not transient is returned immediately, as is ANY response
 *   obtained on the final attempt — the caller inspects `ok`/`status` itself.
 * - Between attempts the wait is the response's `Retry-After` when usable,
 *   otherwise the exponential backoff for that attempt.
 * - If `signal` is aborted, the loop stops: before an attempt it throws the
 *   signal's reason; after a failed fetch it rethrows the fetch error.
 *
 * @param doFetch - Zero-argument function issuing one request.
 * @param signal - Optional abort signal.
 * @param jobId - Included in the back-off log line.
 * @returns The response of the deciding attempt.
 * @throws The last fetch error when every attempt threw (or a generic Error when
 *   that value is not an `Error` instance).
 */
async function fetchRepairWithRetry(doFetch, signal, jobId) {
    let lastError;
    let attempt = 0;
    while (attempt < REPAIR_RETRY_ATTEMPTS) {
        attempt += 1;
        const isFinalAttempt = attempt >= REPAIR_RETRY_ATTEMPTS;
        if (signal?.aborted) {
            throw signal.reason ?? new Error('aborted');
        }
        let response;
        try {
            response = await doFetch();
        }
        catch (err) {
            // An abort is terminal; any other failure is treated as transient.
            if (signal?.aborted) {
                throw err;
            }
            lastError = err;
        }
        if (!response) {
            if (isFinalAttempt) {
                break;
            }
            await abortableDelay(repairBackoffMs(attempt), signal);
            continue;
        }
        const isTransient = response.status === 429 || response.status >= 500;
        if (isFinalAttempt || !isTransient) {
            return response;
        }
        const waitMs = repairRetryAfterMs(response) ?? repairBackoffMs(attempt);
        // Drop the unread body before retrying so the connection can be reused.
        await response.body?.cancel().catch(() => { });
        log.warn('structured-output: repair upstream transient failure — backing off', {
            jobId,
            status: response.status,
            attempt,
            waitMs,
        });
        await abortableDelay(waitMs, signal);
    }
    throw lastError instanceof Error ? lastError : new Error('repair request failed after retries');
}
411
/**
 * Apply `parse` to `text`, converting any exception it throws into `null`.
 * A value returned by `parse` is passed through untouched.
 */
function safeParse(text, parse) {
    let parsed;
    try {
        parsed = parse(text);
    }
    catch {
        // A throwing parser means "no value".
        parsed = null;
    }
    return parsed;
}
420
/**
 * Render structured-output diagnostics as a short bracketed suffix for a
 * failure message, e.g. ` [output appeared token-doubled (streaming corruption)]`.
 *
 * @param d - Diagnostics; reads `looksDoubled`, `repairAttempted`,
 *   `repairSucceeded` and `repairError`.
 * @returns The suffix (with a leading space), or `''` when there is nothing to report.
 */
export function diagnosticsSuffix(d) {
    const notes = [];
    if (d.looksDoubled) {
        notes.push('output appeared token-doubled (streaming corruption)');
    }
    if (d.repairAttempted) {
        if (d.repairSucceeded) {
            notes.push('structured repair recovered it');
        }
        else {
            const reason = d.repairError ? ` (${d.repairError})` : '';
            notes.push(`structured repair did not help${reason}`);
        }
    }
    if (notes.length === 0) {
        return '';
    }
    return ` [${notes.join('; ')}]`;
}
package/dist/tester.js ADDED
@@ -0,0 +1,191 @@
1
+ import { execFile } from 'node:child_process';
2
+ import { promisify } from 'node:util';
3
+ import { cloneRepo } from './git.js';
4
+ import { extractJsonObject } from './blueprint.js';
5
+ import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
6
+ import { diagnosticsSuffix, resolveStructuredOutput, } from './structured-output.js';
7
+ import { log } from './logger.js';
8
+ const exec = promisify(execFile);
9
+ // Async job execution for the Tester. The engine dispatches this to run the project's
10
+ // tests: clone the PR HEAD branch, stand its dependencies up (local docker-compose
11
+ // infra, or test against an ephemeral env), run Pi to exercise the change + regress
12
+ // related behaviour, and return ONLY a structured JSON report. The Tester makes NO
13
+ // commits — on a withheld greenlight the engine loops the `fixer` and re-tests.
14
/**
 * One-line description of the test-report JSON shape. It is passed as the
 * `shapeHint` of the structured-output spec, which places it at the top of the
 * repair prompt.
 */
const REPORT_SHAPE_HINT = [
    'Expected a test report: {"greenlight": boolean, "summary": string, "tested": string[], ',
    '"outcomes": [{"name": string, "status": "passed"|"failed"|"skipped", "detail"?: string}], ',
    '"concerns": [{"title": string, "detail": string, "severity": "low"|"medium"|"high"|"critical"}]}.',
].join('');
18
/** Concern severities accepted as-is; anything else becomes `'medium'`. */
const SEVERITIES = new Set(['low', 'medium', 'high', 'critical']);
/** Outcome statuses accepted as-is; anything else becomes `'skipped'`. */
const STATUSES = new Set(['passed', 'failed', 'skipped']);
/**
 * Normalize whatever JSON the agent produced into a well-formed test report.
 *
 * - `outcomes` / `concerns`: non-array values become `[]`; non-object entries are
 *   dropped; missing or invalid fields get placeholder defaults.
 * - `tested`: only string entries are kept.
 * - `summary`: the report's own non-empty string, else the first 2000 characters
 *   of the `summary` argument.
 * - `greenlight`: true only when the input says exactly `true` AND no concern has
 *   severity `high` or `critical`. Low/medium concerns are reported but do not
 *   withhold the greenlight.
 *
 * @param raw - Parsed JSON from the agent (any type; non-objects are treated as `{}`).
 * @param summary - Fallback summary text.
 * @param env - Stored verbatim as the report's `environment`.
 * @returns The normalized report object.
 */
function coerceReport(raw, summary, env) {
    const isRecord = (value) => typeof value === 'object' && value !== null;
    const recordsOf = (value) => (Array.isArray(value) ? value.filter(isRecord) : []);
    const textOr = (value, fallback) => (typeof value === 'string' ? value : fallback);
    const source = isRecord(raw) ? raw : {};
    const outcomes = recordsOf(source.outcomes).map((entry) => {
        const outcome = {
            name: textOr(entry.name, '(unnamed)'),
            status: STATUSES.has(entry.status) ? entry.status : 'skipped',
        };
        // `detail` is optional: only carried over when it is a non-empty string.
        if (typeof entry.detail === 'string' && entry.detail) {
            outcome.detail = entry.detail;
        }
        return outcome;
    });
    const concerns = recordsOf(source.concerns).map((entry) => ({
        title: textOr(entry.title, '(concern)'),
        detail: textOr(entry.detail, ''),
        severity: SEVERITIES.has(entry.severity) ? entry.severity : 'medium',
    }));
    // Never pass a run that has an open blocker, even if the model claimed greenlight.
    const hasBlocker = concerns.some(({ severity }) => severity === 'high' || severity === 'critical');
    const ownSummary = typeof source.summary === 'string' && source.summary;
    return {
        greenlight: source.greenlight === true && !hasBlocker,
        summary: ownSummary ? source.summary : summary.slice(0, 2000),
        tested: Array.isArray(source.tested)
            ? source.tested.filter((item) => typeof item === 'string')
            : [],
        outcomes,
        concerns,
        environment: env,
    };
}
63
/**
 * Compose the tester's task prompt: the job's own prompt, a blank line, the
 * run-mode instructions, a blank line, then the "JSON only" reminder.
 *
 * Run mode is chosen from `job.test`:
 *   - `environment === 'ephemeral'` — test the deployed environment (by URL when
 *     `environmentUrl` is set);
 *   - `noInfraDependencies` — local run with nothing to stand up;
 *   - otherwise — local run against the docker-compose dependencies.
 *
 * @param job - Reads `userPrompt` and `test`.
 * @returns The prompt text, lines joined with `\n`.
 */
function buildUserPrompt(job) {
    const { test } = job;
    let modeLines;
    if (test.environment === 'ephemeral') {
        const target = test.environmentUrl
            ? `Test against the deployed environment at ${test.environmentUrl}. Do not start the service locally.`
            : 'Test against the provided ephemeral environment URL from your context. Do not start the service locally.';
        modeLines = ['Run mode: ephemeral environment.', target];
    }
    else if (test.noInfraDependencies) {
        modeLines = ['Run mode: local, no infra dependencies — just install, build and run the test suite directly.'];
    }
    else {
        modeLines = ["Run mode: local. The service's infra dependencies from its docker-compose file have been started and are reachable on localhost. Read the README to learn how to configure the service against them, run any migrations, start the service and exercise it."];
    }
    return [
        job.userPrompt,
        '',
        ...modeLines,
        '',
        'Respond with ONLY the JSON test report described in your instructions.',
    ].join('\n');
}
80
/**
 * Start the checkout's docker-compose dependencies (local mode only) by running
 * `docker compose -f <composePath> up -d --wait` in `dir`, with a 5-minute timeout.
 *
 * Nothing is run unless the environment is `'local'`, `noInfraDependencies` is
 * falsy and a `composePath` is configured. A failure is logged and returned as a
 * `note` instead of being thrown, so the caller can carry on and tell the agent.
 *
 * @param dir - Working directory for the command.
 * @param test - Test settings (`environment`, `noInfraDependencies`, `composePath`).
 * @param signal - Optional abort signal handed to the child process.
 * @param trace - Fields merged into the log lines.
 * @returns `{ started: true }` on success, `{ started: false }` when skipped, or
 *   `{ started: false, note }` with the error message on failure.
 */
async function standUpInfra(dir, test, signal, trace) {
    const wantsCompose = test.environment === 'local' && !test.noInfraDependencies && Boolean(test.composePath);
    if (!wantsCompose) {
        return { started: false };
    }
    const composeArgs = ['compose', '-f', test.composePath, 'up', '-d', '--wait'];
    try {
        log.info('test: standing up infra', { ...trace, composePath: test.composePath });
        await exec('docker', composeArgs, { cwd: dir, signal, timeout: 5 * 60_000 });
    }
    catch (err) {
        const note = err instanceof Error ? err.message : String(err);
        log.warn('test: infra stand-up failed', { ...trace, error: note });
        return { started: false, note };
    }
    return { started: true };
}
105
/**
 * Stop the docker-compose dependencies by running
 * `docker compose -f <composePath> down -v` in `dir`, with a 2-minute timeout.
 *
 * Skipped under the same conditions as the stand-up step (non-local environment,
 * `noInfraDependencies`, or no `composePath`). Errors are deliberately ignored:
 * this is best-effort cleanup.
 */
async function tearDownInfra(dir, test) {
    const usedCompose = test.environment === 'local' && !test.noInfraDependencies && Boolean(test.composePath);
    if (!usedCompose) {
        return;
    }
    const composeArgs = ['compose', '-f', test.composePath, 'down', '-v'];
    try {
        await exec('docker', composeArgs, { cwd: dir, timeout: 2 * 60_000 });
    }
    catch {
        // The container is ephemeral and torn down with the run anyway — ignore.
    }
}
119
/**
 * Run one Tester job end to end inside a temporary workspace:
 *   1. clone the repository with `job.branch` as the base branch;
 *   2. stand up the docker-compose infra (when applicable);
 *   3. run the agent with the tester prompt (`expectsEdits: false`);
 *   4. resolve the agent's final text into a structured report, allowing one
 *      repair attempt;
 *   5. tear the infra down again, whatever happened in steps 3-4.
 *
 * @param job - The tester job (repo, branch, prompts, credentials, test settings).
 * @param opts - Optional settings; `opts.signal` is forwarded to the clone, the
 *   infra start-up and the repair call, and `opts` itself to the agent runner.
 * @returns `{ report, summary, stats }` on success, or `{ summary, stats, error }`
 *   when no report could be parsed; `usage` is added to either when the agent
 *   reported it.
 */
export async function handleTester(job, opts = {}) {
    const trace = { jobId: job.jobId, repo: `${job.repo.owner}/${job.repo.name}`, branch: job.branch };
    const runInWorkspace = async (dir) => {
        log.info('test: cloning PR branch', trace);
        await cloneRepo({
            repo: { ...job.repo, baseBranch: job.branch },
            ghToken: job.ghToken,
            dir,
            signal: opts.signal,
        });
        const infra = await standUpInfra(dir, job.test, opts.signal, trace);
        try {
            log.info('test: running agent', { ...trace, environment: job.test.environment });
            const basePrompt = buildUserPrompt(job);
            const userPrompt = infra.note
                ? `${basePrompt}\n\nNote: standing the infra up reported a problem (${infra.note}). Test what you can and flag any dependency-related gaps as concerns.`
                : basePrompt;
            // The same credential fields feed both the agent run and the repair call.
            const credentials = {
                harness: job.harness,
                subscriptionToken: job.subscriptionToken,
                subscriptionBaseUrl: job.subscriptionBaseUrl,
                proxyBaseUrl: job.proxyBaseUrl,
                sessionToken: job.sessionToken,
            };
            const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
                dir,
                systemPrompt: job.systemPrompt,
                userPrompt,
                model: job.model,
                ...credentials,
                // The tester only assesses (it commits nothing), so the no-edit guard
                // must not fire on its legitimately edit-free run.
                expectsEdits: false,
            }, opts);
            const reportSpec = {
                label: 'tester',
                shapeHint: REPORT_SHAPE_HINT,
                parse: (text) => coerceReport(extractJsonObject(text), text, job.test.environment),
            };
            const { value: report, diagnostics } = await resolveStructuredOutput(reportSpec, summary, {
                ...credentials,
                model: job.model,
                jobId: job.jobId,
                signal: opts.signal,
            });
            if (!report) {
                return {
                    summary,
                    stats,
                    error: noReportReason(stats, stderrTail, diagnostics),
                    ...(usage ? { usage } : {}),
                };
            }
            log.info('test: reported', {
                ...trace,
                greenlight: report.greenlight,
                concerns: report.concerns.length,
            });
            return { report, summary, stats, ...(usage ? { usage } : {}) };
        }
        finally {
            await tearDownInfra(dir, job.test);
        }
    };
    return withWorkspace('test', runInWorkspace);
}
185
/**
 * Build the error text for a tester run that produced no usable report.
 *
 * The message is assembled from: a fixed lead sentence; the cause (the
 * "never acted" cause when `agentNeverActed(stats)` is true, otherwise a note
 * that the JSON did not parse); the diagnostics suffix when diagnostics are
 * given; and the agent's output tail.
 */
function noReportReason(stats, stderrTail, diagnostics) {
    const cause = agentNeverActed(stats)
        ? NEVER_ACTED_CAUSE
        : ' The agent did not return a parseable JSON test report.';
    const repairNote = diagnostics ? diagnosticsSuffix(diagnostics) : '';
    const outputTail = agentOutputTail(stderrTail);
    return `Tester produced no report.${cause}${repairNote}${outputTail}`;
}
package/package.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "@cat-factory/executor-harness",
3
+ "version": "1.31.0",
4
+ "description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
5
+ "type": "module",
6
+ "main": "./dist/server.js",
7
+ "exports": {
8
+ ".": "./dist/server.js",
9
+ "./embed": "./src/embed.ts"
10
+ },
11
+ "files": [
12
+ "dist",
13
+ "src"
14
+ ],
15
+ "publishConfig": {
16
+ "access": "public"
17
+ },
18
+ "devDependencies": {
19
+ "@hono/node-server": "^2.0.6",
20
+ "@types/node": "^26.0.0",
21
+ "hono": "^4.12.27",
22
+ "typescript": "^6.0.3",
23
+ "vitest": "^4.1.9",
24
+ "@cat-factory/server": "0.65.2",
25
+ "@cat-factory/spend": "0.10.67"
26
+ },
27
+ "scripts": {
28
+ "build": "tsc -p tsconfig.json",
29
+ "typecheck": "tsc -p tsconfig.typecheck.json --noEmit",
30
+ "start": "node dist/server.js",
31
+ "test": "vitest run",
32
+ "test:acceptance": "vitest run --config vitest.acceptance.config.ts",
33
+ "image:publish": "bash scripts/publish-image.sh"
34
+ }
35
+ }