@cat-factory/executor-harness 1.34.8 → 1.34.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/dist/agent.js +68 -39
  2. package/package.json +3 -3
  3. package/src/agent.ts +76 -40
package/dist/agent.js CHANGED
@@ -152,6 +152,39 @@ async function tearDownInfra(dir, infra) {
152
152
  // The container is ephemeral and torn down with the run anyway — ignore.
153
153
  }
154
154
  }
155
+ /**
156
+ * Parse an agent's final reply into the structured JSON `custom`, shared by the explore and
157
+ * coding structured-output paths. With repair enabled (default) a malformed reply gets ONE
158
+ * structured repair call before giving up; with `output.repair === false` it parses directly.
159
+ * Returns the parsed value (or null when unusable) plus the repair diagnostics. Never throws —
160
+ * a parse failure is a null value, and each caller decides whether that is fatal (explore: yes;
161
+ * coding: no, the pushed commits are the deliverable).
162
+ */
163
+ async function resolveReplyCustom(job, summary, signal) {
164
+ if (job.output?.repair === false) {
165
+ try {
166
+ return { value: extractJsonObject(summary) };
167
+ }
168
+ catch {
169
+ return { value: null };
170
+ }
171
+ }
172
+ const resolved = await resolveStructuredOutput({
173
+ label: 'agent',
174
+ shapeHint: job.output?.shapeHint ?? 'Expected a single JSON object.',
175
+ parse: (text) => extractJsonObject(text),
176
+ }, summary, {
177
+ harness: job.harness,
178
+ subscriptionToken: job.subscriptionToken,
179
+ subscriptionBaseUrl: job.subscriptionBaseUrl,
180
+ proxyBaseUrl: job.proxyBaseUrl,
181
+ sessionToken: job.sessionToken,
182
+ model: job.model,
183
+ jobId: job.jobId,
184
+ signal,
185
+ });
186
+ return { value: resolved.value, diagnostics: resolved.diagnostics };
187
+ }
155
188
  /** Extract the first JSON object from an agent's final message (tolerating fences/prose). */
156
189
  function extractJsonObject(text) {
157
190
  const trimmed = text.trim();
@@ -434,38 +467,12 @@ async function finalizeExploreResult(job, run, ctx) {
434
467
  ...infraSetupFields,
435
468
  };
436
469
  }
437
- // Structured: parse the agent's JSON. With repair enabled (default) a malformed
438
- // reply gets ONE structured repair call before giving up; with `repair:false` we
439
- // parse directly (no repair channel). The backend coerces/validates + renders from
440
- // the returned object in a post-op.
441
- let custom = null;
442
- let diagnostics;
443
- if (job.output.repair === false) {
444
- try {
445
- custom = extractJsonObject(summary);
446
- }
447
- catch {
448
- custom = null;
449
- }
450
- }
451
- else {
452
- const resolved = await resolveStructuredOutput({
453
- label: 'agent',
454
- shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
455
- parse: (text) => extractJsonObject(text),
456
- }, summary, {
457
- harness: job.harness,
458
- subscriptionToken: job.subscriptionToken,
459
- subscriptionBaseUrl: job.subscriptionBaseUrl,
460
- proxyBaseUrl: job.proxyBaseUrl,
461
- sessionToken: job.sessionToken,
462
- model: job.model,
463
- jobId: job.jobId,
464
- signal,
465
- });
466
- custom = resolved.value;
467
- diagnostics = resolved.diagnostics;
468
- }
470
+ // Structured: parse the agent's JSON via the shared resolver. With repair enabled (default)
471
+ // a malformed reply gets ONE structured repair call before giving up; with `repair:false` it
472
+ // parses directly (no repair channel). The backend coerces/validates + renders from the
473
+ // returned object in a post-op. Unlike the coding path, an unparseable explore reply IS a
474
+ // failure — the report/JSON is the whole deliverable.
475
+ const { value: custom, diagnostics } = await resolveReplyCustom(job, summary, signal);
469
476
  if (custom === undefined || custom === null) {
470
477
  return {
471
478
  summary,
@@ -577,15 +584,18 @@ async function runMultiRepoExplore(job, opts) {
577
584
  });
578
585
  }
579
586
  /**
580
- * Edit-and-push coding: clone `branch` (or resume `newBranch`), run the agent, commit +
581
- * push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
582
- * no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal
583
- * no-op for the in-place fixers.
587
+ * Edit-and-push coding, dispatching on job DATA: repo-bootstrap (force-push a fresh history to a
588
+ * separate target repo), conflict-resolution (merge the base in, resolve, push back), multi-repo
589
+ * fan-out (sibling checkouts + one PR per changed repo), else the ordinary single-repo flow.
590
+ * After the flow, a STRUCTURED coding kind (e.g. `repro-test`, whose deliverable is BOTH a pushed
591
+ * commit AND a JSON outcome) parses its final reply into `custom` — best-effort, so an unparseable
592
+ * outcome degrades to no `custom` (the backend resolver then defaults) rather than failing the
593
+ * run, whose real deliverable is the pushed commits.
584
594
  */
585
595
  async function runCodingMode(job, opts) {
586
596
  // Repo bootstrap is a coding run that force-pushes a fresh history to a SEPARATE target
587
597
  // repo (clone + adapt a reference, or scaffold from scratch). Keyed off job DATA
588
- // (`bootstrap`), not the agent kind.
598
+ // (`bootstrap`), not the agent kind. Bootstrap/conflict never carry a structured `output`.
589
599
  if (job.bootstrap)
590
600
  return runBootstrap(job, opts);
591
601
  // Conflict resolution is a coding run with a different pre/post around the agent:
@@ -597,8 +607,27 @@ async function runCodingMode(job, opts) {
597
607
  // sibling, run the agent once across all of them, and open one PR per changed repo. Keyed
598
608
  // off job DATA (`peerRepos`), not the agent kind — the implementer sets it when the task
599
609
  // has involved services in distinct repos.
600
- if (job.peerRepos?.length)
601
- return runMultiRepoCoding(job, opts);
610
+ const result = job.peerRepos?.length
611
+ ? await runMultiRepoCoding(job, opts)
612
+ : await runSingleRepoCoding(job, opts);
613
+ // Structured coding kind (repro-test): fold the final reply's JSON onto `custom` so the
614
+ // backend post-completion resolver records the outcome. Skipped on a failed run (its `error`
615
+ // is the signal) and when there is no reply to parse. Best-effort: a null parse leaves
616
+ // `custom` unset (the run still succeeds on its commits).
617
+ if (job.output?.kind === 'structured' && !result.error && result.summary) {
618
+ const { value } = await resolveReplyCustom(job, result.summary, opts.signal);
619
+ if (value !== null && value !== undefined)
620
+ result.custom = value;
621
+ }
622
+ return result;
623
+ }
624
+ /**
625
+ * The ordinary single-repo coding flow: clone `branch` (or resume `newBranch`), run the agent,
626
+ * commit + push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
627
+ * no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal no-op for
628
+ * the in-place fixers (and for a seed-only kind like `repro-test`).
629
+ */
630
+ async function runSingleRepoCoding(job, opts) {
602
631
  const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
603
632
  const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
604
633
  kind: 'agent',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cat-factory/executor-harness",
3
- "version": "1.34.8",
3
+ "version": "1.34.10",
4
4
  "description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -26,8 +26,8 @@
26
26
  "hono": "^4.12.27",
27
27
  "typescript": "^6.0.3",
28
28
  "vitest": "^4.1.9",
29
- "@cat-factory/server": "0.81.0",
30
- "@cat-factory/spend": "0.10.93"
29
+ "@cat-factory/spend": "0.10.95",
30
+ "@cat-factory/server": "0.82.0"
31
31
  },
32
32
  "scripts": {
33
33
  "build": "tsc -p tsconfig.json",
package/src/agent.ts CHANGED
@@ -219,6 +219,47 @@ async function tearDownInfra(dir: string, infra: ServiceInfraSpec): Promise<void
219
219
  }
220
220
  }
221
221
 
222
+ /**
223
+ * Parse an agent's final reply into the structured JSON `custom`, shared by the explore and
224
+ * coding structured-output paths. With repair enabled (default) a malformed reply gets ONE
225
+ * structured repair call before giving up; with `output.repair === false` it parses directly.
226
+ * Returns the parsed value (or null when unusable) plus the repair diagnostics. Never throws —
227
+ * a parse failure is a null value, and each caller decides whether that is fatal (explore: yes;
228
+ * coding: no, the pushed commits are the deliverable).
229
+ */
230
+ async function resolveReplyCustom(
231
+ job: AgentJob,
232
+ summary: string,
233
+ signal: AbortSignal | undefined,
234
+ ): Promise<{ value: unknown; diagnostics?: StructuredOutputDiagnostics }> {
235
+ if (job.output?.repair === false) {
236
+ try {
237
+ return { value: extractJsonObject(summary) }
238
+ } catch {
239
+ return { value: null }
240
+ }
241
+ }
242
+ const resolved = await resolveStructuredOutput(
243
+ {
244
+ label: 'agent',
245
+ shapeHint: job.output?.shapeHint ?? 'Expected a single JSON object.',
246
+ parse: (text) => extractJsonObject(text),
247
+ },
248
+ summary,
249
+ {
250
+ harness: job.harness,
251
+ subscriptionToken: job.subscriptionToken,
252
+ subscriptionBaseUrl: job.subscriptionBaseUrl,
253
+ proxyBaseUrl: job.proxyBaseUrl,
254
+ sessionToken: job.sessionToken,
255
+ model: job.model,
256
+ jobId: job.jobId,
257
+ signal,
258
+ },
259
+ )
260
+ return { value: resolved.value, diagnostics: resolved.diagnostics }
261
+ }
262
+
222
263
  /** Extract the first JSON object from an agent's final message (tolerating fences/prose). */
223
264
  function extractJsonObject(text: string): unknown {
224
265
  const trimmed = text.trim()
@@ -551,40 +592,12 @@ async function finalizeExploreResult(
551
592
  }
552
593
  }
553
594
 
554
- // Structured: parse the agent's JSON. With repair enabled (default) a malformed
555
- // reply gets ONE structured repair call before giving up; with `repair:false` we
556
- // parse directly (no repair channel). The backend coerces/validates + renders from
557
- // the returned object in a post-op.
558
- let custom: unknown = null
559
- let diagnostics: StructuredOutputDiagnostics | undefined
560
- if (job.output.repair === false) {
561
- try {
562
- custom = extractJsonObject(summary)
563
- } catch {
564
- custom = null
565
- }
566
- } else {
567
- const resolved = await resolveStructuredOutput(
568
- {
569
- label: 'agent',
570
- shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
571
- parse: (text) => extractJsonObject(text),
572
- },
573
- summary,
574
- {
575
- harness: job.harness,
576
- subscriptionToken: job.subscriptionToken,
577
- subscriptionBaseUrl: job.subscriptionBaseUrl,
578
- proxyBaseUrl: job.proxyBaseUrl,
579
- sessionToken: job.sessionToken,
580
- model: job.model,
581
- jobId: job.jobId,
582
- signal,
583
- },
584
- )
585
- custom = resolved.value
586
- diagnostics = resolved.diagnostics
587
- }
595
+ // Structured: parse the agent's JSON via the shared resolver. With repair enabled (default)
596
+ // a malformed reply gets ONE structured repair call before giving up; with `repair:false` it
597
+ // parses directly (no repair channel). The backend coerces/validates + renders from the
598
+ // returned object in a post-op. Unlike the coding path, an unparseable explore reply IS a
599
+ // failure — the report/JSON is the whole deliverable.
600
+ const { value: custom, diagnostics } = await resolveReplyCustom(job, summary, signal)
588
601
  if (custom === undefined || custom === null) {
589
602
  return {
590
603
  summary,
@@ -709,15 +722,18 @@ async function runMultiRepoExplore(job: AgentJob, opts: RunOptions): Promise<Age
709
722
  }
710
723
 
711
724
  /**
712
- * Edit-and-push coding: clone `branch` (or resume `newBranch`), run the agent, commit +
713
- * push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
714
- * no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal
715
- * no-op for the in-place fixers.
725
+ * Edit-and-push coding, dispatching on job DATA: repo-bootstrap (force-push a fresh history to a
726
+ * separate target repo), conflict-resolution (merge the base in, resolve, push back), multi-repo
727
+ * fan-out (sibling checkouts + one PR per changed repo), else the ordinary single-repo flow.
728
+ * After the flow, a STRUCTURED coding kind (e.g. `repro-test`, whose deliverable is BOTH a pushed
729
+ * commit AND a JSON outcome) parses its final reply into `custom` — best-effort, so an unparseable
730
+ * outcome degrades to no `custom` (the backend resolver then defaults) rather than failing the
731
+ * run, whose real deliverable is the pushed commits.
716
732
  */
717
733
  async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
718
734
  // Repo bootstrap is a coding run that force-pushes a fresh history to a SEPARATE target
719
735
  // repo (clone + adapt a reference, or scaffold from scratch). Keyed off job DATA
720
- // (`bootstrap`), not the agent kind.
736
+ // (`bootstrap`), not the agent kind. Bootstrap/conflict never carry a structured `output`.
721
737
  if (job.bootstrap) return runBootstrap(job, opts)
722
738
  // Conflict resolution is a coding run with a different pre/post around the agent:
723
739
  // clone full, merge the base in to surface the conflicts, then complete the merge
@@ -727,8 +743,28 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
727
743
  // sibling, run the agent once across all of them, and open one PR per changed repo. Keyed
728
744
  // off job DATA (`peerRepos`), not the agent kind — the implementer sets it when the task
729
745
  // has involved services in distinct repos.
730
- if (job.peerRepos?.length) return runMultiRepoCoding(job, opts)
746
+ const result = job.peerRepos?.length
747
+ ? await runMultiRepoCoding(job, opts)
748
+ : await runSingleRepoCoding(job, opts)
731
749
 
750
+ // Structured coding kind (repro-test): fold the final reply's JSON onto `custom` so the
751
+ // backend post-completion resolver records the outcome. Skipped on a failed run (its `error`
752
+ // is the signal) and when there is no reply to parse. Best-effort: a null parse leaves
753
+ // `custom` unset (the run still succeeds on its commits).
754
+ if (job.output?.kind === 'structured' && !result.error && result.summary) {
755
+ const { value } = await resolveReplyCustom(job, result.summary, opts.signal)
756
+ if (value !== null && value !== undefined) result.custom = value
757
+ }
758
+ return result
759
+ }
760
+
761
+ /**
762
+ * The ordinary single-repo coding flow: clone `branch` (or resume `newBranch`), run the agent,
763
+ * commit + push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
764
+ * no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal no-op for
765
+ * the in-place fixers (and for a seed-only kind like `repro-test`).
766
+ */
767
+ async function runSingleRepoCoding(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
732
768
  const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
733
769
  const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
734
770
  {