@cat-factory/executor-harness 1.34.8 → 1.34.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.js +68 -39
- package/package.json +3 -3
- package/src/agent.ts +76 -40
package/dist/agent.js
CHANGED
|
@@ -152,6 +152,39 @@ async function tearDownInfra(dir, infra) {
|
|
|
152
152
|
// The container is ephemeral and torn down with the run anyway — ignore.
|
|
153
153
|
}
|
|
154
154
|
}
|
|
155
|
+
/**
|
|
156
|
+
* Parse an agent's final reply into the structured JSON `custom`, shared by the explore and
|
|
157
|
+
* coding structured-output paths. With repair enabled (default) a malformed reply gets ONE
|
|
158
|
+
* structured repair call before giving up; with `output.repair === false` it parses directly.
|
|
159
|
+
* Returns the parsed value (or null when unusable) plus the repair diagnostics. Never throws —
|
|
160
|
+
* a parse failure is a null value, and each caller decides whether that is fatal (explore: yes;
|
|
161
|
+
* coding: no, the pushed commits are the deliverable).
|
|
162
|
+
*/
|
|
163
|
+
async function resolveReplyCustom(job, summary, signal) {
|
|
164
|
+
if (job.output?.repair === false) {
|
|
165
|
+
try {
|
|
166
|
+
return { value: extractJsonObject(summary) };
|
|
167
|
+
}
|
|
168
|
+
catch {
|
|
169
|
+
return { value: null };
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
const resolved = await resolveStructuredOutput({
|
|
173
|
+
label: 'agent',
|
|
174
|
+
shapeHint: job.output?.shapeHint ?? 'Expected a single JSON object.',
|
|
175
|
+
parse: (text) => extractJsonObject(text),
|
|
176
|
+
}, summary, {
|
|
177
|
+
harness: job.harness,
|
|
178
|
+
subscriptionToken: job.subscriptionToken,
|
|
179
|
+
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
180
|
+
proxyBaseUrl: job.proxyBaseUrl,
|
|
181
|
+
sessionToken: job.sessionToken,
|
|
182
|
+
model: job.model,
|
|
183
|
+
jobId: job.jobId,
|
|
184
|
+
signal,
|
|
185
|
+
});
|
|
186
|
+
return { value: resolved.value, diagnostics: resolved.diagnostics };
|
|
187
|
+
}
|
|
155
188
|
/** Extract the first JSON object from an agent's final message (tolerating fences/prose). */
|
|
156
189
|
function extractJsonObject(text) {
|
|
157
190
|
const trimmed = text.trim();
|
|
@@ -434,38 +467,12 @@ async function finalizeExploreResult(job, run, ctx) {
|
|
|
434
467
|
...infraSetupFields,
|
|
435
468
|
};
|
|
436
469
|
}
|
|
437
|
-
// Structured: parse the agent's JSON. With repair enabled (default)
|
|
438
|
-
// reply gets ONE structured repair call before giving up; with `repair:false`
|
|
439
|
-
//
|
|
440
|
-
//
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
if (job.output.repair === false) {
|
|
444
|
-
try {
|
|
445
|
-
custom = extractJsonObject(summary);
|
|
446
|
-
}
|
|
447
|
-
catch {
|
|
448
|
-
custom = null;
|
|
449
|
-
}
|
|
450
|
-
}
|
|
451
|
-
else {
|
|
452
|
-
const resolved = await resolveStructuredOutput({
|
|
453
|
-
label: 'agent',
|
|
454
|
-
shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
|
|
455
|
-
parse: (text) => extractJsonObject(text),
|
|
456
|
-
}, summary, {
|
|
457
|
-
harness: job.harness,
|
|
458
|
-
subscriptionToken: job.subscriptionToken,
|
|
459
|
-
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
460
|
-
proxyBaseUrl: job.proxyBaseUrl,
|
|
461
|
-
sessionToken: job.sessionToken,
|
|
462
|
-
model: job.model,
|
|
463
|
-
jobId: job.jobId,
|
|
464
|
-
signal,
|
|
465
|
-
});
|
|
466
|
-
custom = resolved.value;
|
|
467
|
-
diagnostics = resolved.diagnostics;
|
|
468
|
-
}
|
|
470
|
+
// Structured: parse the agent's JSON via the shared resolver. With repair enabled (default)
|
|
471
|
+
// a malformed reply gets ONE structured repair call before giving up; with `repair:false` it
|
|
472
|
+
// parses directly (no repair channel). The backend coerces/validates + renders from the
|
|
473
|
+
// returned object in a post-op. Unlike the coding path, an unparseable explore reply IS a
|
|
474
|
+
// failure — the report/JSON is the whole deliverable.
|
|
475
|
+
const { value: custom, diagnostics } = await resolveReplyCustom(job, summary, signal);
|
|
469
476
|
if (custom === undefined || custom === null) {
|
|
470
477
|
return {
|
|
471
478
|
summary,
|
|
@@ -577,15 +584,18 @@ async function runMultiRepoExplore(job, opts) {
|
|
|
577
584
|
});
|
|
578
585
|
}
|
|
579
586
|
/**
|
|
580
|
-
* Edit-and-push coding
|
|
581
|
-
*
|
|
582
|
-
*
|
|
583
|
-
*
|
|
587
|
+
* Edit-and-push coding, dispatching on job DATA: repo-bootstrap (force-push a fresh history to a
|
|
588
|
+
* separate target repo), conflict-resolution (merge the base in, resolve, push back), multi-repo
|
|
589
|
+
* fan-out (sibling checkouts + one PR per changed repo), else the ordinary single-repo flow.
|
|
590
|
+
* After the flow, a STRUCTURED coding kind (e.g. `repro-test`, whose deliverable is BOTH a pushed
|
|
591
|
+
* commit AND a JSON outcome) parses its final reply into `custom` — best-effort, so an unparseable
|
|
592
|
+
* outcome degrades to no `custom` (the backend resolver then defaults) rather than failing the
|
|
593
|
+
* run, whose real deliverable is the pushed commits.
|
|
584
594
|
*/
|
|
585
595
|
async function runCodingMode(job, opts) {
|
|
586
596
|
// Repo bootstrap is a coding run that force-pushes a fresh history to a SEPARATE target
|
|
587
597
|
// repo (clone + adapt a reference, or scaffold from scratch). Keyed off job DATA
|
|
588
|
-
// (`bootstrap`), not the agent kind.
|
|
598
|
+
// (`bootstrap`), not the agent kind. Bootstrap/conflict never carry a structured `output`.
|
|
589
599
|
if (job.bootstrap)
|
|
590
600
|
return runBootstrap(job, opts);
|
|
591
601
|
// Conflict resolution is a coding run with a different pre/post around the agent:
|
|
@@ -597,8 +607,27 @@ async function runCodingMode(job, opts) {
|
|
|
597
607
|
// sibling, run the agent once across all of them, and open one PR per changed repo. Keyed
|
|
598
608
|
// off job DATA (`peerRepos`), not the agent kind — the implementer sets it when the task
|
|
599
609
|
// has involved services in distinct repos.
|
|
600
|
-
|
|
601
|
-
|
|
610
|
+
const result = job.peerRepos?.length
|
|
611
|
+
? await runMultiRepoCoding(job, opts)
|
|
612
|
+
: await runSingleRepoCoding(job, opts);
|
|
613
|
+
// Structured coding kind (repro-test): fold the final reply's JSON onto `custom` so the
|
|
614
|
+
// backend post-completion resolver records the outcome. Skipped on a failed run (its `error`
|
|
615
|
+
// is the signal) and when there is no reply to parse. Best-effort: a null parse leaves
|
|
616
|
+
// `custom` unset (the run still succeeds on its commits).
|
|
617
|
+
if (job.output?.kind === 'structured' && !result.error && result.summary) {
|
|
618
|
+
const { value } = await resolveReplyCustom(job, result.summary, opts.signal);
|
|
619
|
+
if (value !== null && value !== undefined)
|
|
620
|
+
result.custom = value;
|
|
621
|
+
}
|
|
622
|
+
return result;
|
|
623
|
+
}
|
|
624
|
+
/**
|
|
625
|
+
* The ordinary single-repo coding flow: clone `branch` (or resume `newBranch`), run the agent,
|
|
626
|
+
* commit + push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
|
|
627
|
+
* no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal no-op for
|
|
628
|
+
* the in-place fixers (and for a seed-only kind like `repro-test`).
|
|
629
|
+
*/
|
|
630
|
+
async function runSingleRepoCoding(job, opts) {
|
|
602
631
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
|
|
603
632
|
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
|
|
604
633
|
kind: 'agent',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/executor-harness",
|
|
3
|
-
"version": "1.34.8",
|
|
3
|
+
"version": "1.34.10",
|
|
4
4
|
"description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,8 +26,8 @@
|
|
|
26
26
|
"hono": "^4.12.27",
|
|
27
27
|
"typescript": "^6.0.3",
|
|
28
28
|
"vitest": "^4.1.9",
|
|
29
|
-
"@cat-factory/
|
|
30
|
-
"@cat-factory/
|
|
29
|
+
"@cat-factory/spend": "0.10.95",
|
|
30
|
+
"@cat-factory/server": "0.82.0"
|
|
31
31
|
},
|
|
32
32
|
"scripts": {
|
|
33
33
|
"build": "tsc -p tsconfig.json",
|
package/src/agent.ts
CHANGED
|
@@ -219,6 +219,47 @@ async function tearDownInfra(dir: string, infra: ServiceInfraSpec): Promise<void
|
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
221
|
|
|
222
|
+
/**
|
|
223
|
+
* Parse an agent's final reply into the structured JSON `custom`, shared by the explore and
|
|
224
|
+
* coding structured-output paths. With repair enabled (default) a malformed reply gets ONE
|
|
225
|
+
* structured repair call before giving up; with `output.repair === false` it parses directly.
|
|
226
|
+
* Returns the parsed value (or null when unusable) plus the repair diagnostics. Never throws —
|
|
227
|
+
* a parse failure is a null value, and each caller decides whether that is fatal (explore: yes;
|
|
228
|
+
* coding: no, the pushed commits are the deliverable).
|
|
229
|
+
*/
|
|
230
|
+
async function resolveReplyCustom(
|
|
231
|
+
job: AgentJob,
|
|
232
|
+
summary: string,
|
|
233
|
+
signal: AbortSignal | undefined,
|
|
234
|
+
): Promise<{ value: unknown; diagnostics?: StructuredOutputDiagnostics }> {
|
|
235
|
+
if (job.output?.repair === false) {
|
|
236
|
+
try {
|
|
237
|
+
return { value: extractJsonObject(summary) }
|
|
238
|
+
} catch {
|
|
239
|
+
return { value: null }
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
const resolved = await resolveStructuredOutput(
|
|
243
|
+
{
|
|
244
|
+
label: 'agent',
|
|
245
|
+
shapeHint: job.output?.shapeHint ?? 'Expected a single JSON object.',
|
|
246
|
+
parse: (text) => extractJsonObject(text),
|
|
247
|
+
},
|
|
248
|
+
summary,
|
|
249
|
+
{
|
|
250
|
+
harness: job.harness,
|
|
251
|
+
subscriptionToken: job.subscriptionToken,
|
|
252
|
+
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
253
|
+
proxyBaseUrl: job.proxyBaseUrl,
|
|
254
|
+
sessionToken: job.sessionToken,
|
|
255
|
+
model: job.model,
|
|
256
|
+
jobId: job.jobId,
|
|
257
|
+
signal,
|
|
258
|
+
},
|
|
259
|
+
)
|
|
260
|
+
return { value: resolved.value, diagnostics: resolved.diagnostics }
|
|
261
|
+
}
|
|
262
|
+
|
|
222
263
|
/** Extract the first JSON object from an agent's final message (tolerating fences/prose). */
|
|
223
264
|
function extractJsonObject(text: string): unknown {
|
|
224
265
|
const trimmed = text.trim()
|
|
@@ -551,40 +592,12 @@ async function finalizeExploreResult(
|
|
|
551
592
|
}
|
|
552
593
|
}
|
|
553
594
|
|
|
554
|
-
// Structured: parse the agent's JSON. With repair enabled (default)
|
|
555
|
-
// reply gets ONE structured repair call before giving up; with `repair:false`
|
|
556
|
-
//
|
|
557
|
-
//
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
if (job.output.repair === false) {
|
|
561
|
-
try {
|
|
562
|
-
custom = extractJsonObject(summary)
|
|
563
|
-
} catch {
|
|
564
|
-
custom = null
|
|
565
|
-
}
|
|
566
|
-
} else {
|
|
567
|
-
const resolved = await resolveStructuredOutput(
|
|
568
|
-
{
|
|
569
|
-
label: 'agent',
|
|
570
|
-
shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
|
|
571
|
-
parse: (text) => extractJsonObject(text),
|
|
572
|
-
},
|
|
573
|
-
summary,
|
|
574
|
-
{
|
|
575
|
-
harness: job.harness,
|
|
576
|
-
subscriptionToken: job.subscriptionToken,
|
|
577
|
-
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
578
|
-
proxyBaseUrl: job.proxyBaseUrl,
|
|
579
|
-
sessionToken: job.sessionToken,
|
|
580
|
-
model: job.model,
|
|
581
|
-
jobId: job.jobId,
|
|
582
|
-
signal,
|
|
583
|
-
},
|
|
584
|
-
)
|
|
585
|
-
custom = resolved.value
|
|
586
|
-
diagnostics = resolved.diagnostics
|
|
587
|
-
}
|
|
595
|
+
// Structured: parse the agent's JSON via the shared resolver. With repair enabled (default)
|
|
596
|
+
// a malformed reply gets ONE structured repair call before giving up; with `repair:false` it
|
|
597
|
+
// parses directly (no repair channel). The backend coerces/validates + renders from the
|
|
598
|
+
// returned object in a post-op. Unlike the coding path, an unparseable explore reply IS a
|
|
599
|
+
// failure — the report/JSON is the whole deliverable.
|
|
600
|
+
const { value: custom, diagnostics } = await resolveReplyCustom(job, summary, signal)
|
|
588
601
|
if (custom === undefined || custom === null) {
|
|
589
602
|
return {
|
|
590
603
|
summary,
|
|
@@ -709,15 +722,18 @@ async function runMultiRepoExplore(job: AgentJob, opts: RunOptions): Promise<Age
|
|
|
709
722
|
}
|
|
710
723
|
|
|
711
724
|
/**
|
|
712
|
-
* Edit-and-push coding
|
|
713
|
-
*
|
|
714
|
-
*
|
|
715
|
-
*
|
|
725
|
+
* Edit-and-push coding, dispatching on job DATA: repo-bootstrap (force-push a fresh history to a
|
|
726
|
+
* separate target repo), conflict-resolution (merge the base in, resolve, push back), multi-repo
|
|
727
|
+
* fan-out (sibling checkouts + one PR per changed repo), else the ordinary single-repo flow.
|
|
728
|
+
* After the flow, a STRUCTURED coding kind (e.g. `repro-test`, whose deliverable is BOTH a pushed
|
|
729
|
+
* commit AND a JSON outcome) parses its final reply into `custom` — best-effort, so an unparseable
|
|
730
|
+
* outcome degrades to no `custom` (the backend resolver then defaults) rather than failing the
|
|
731
|
+
* run, whose real deliverable is the pushed commits.
|
|
716
732
|
*/
|
|
717
733
|
async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
718
734
|
// Repo bootstrap is a coding run that force-pushes a fresh history to a SEPARATE target
|
|
719
735
|
// repo (clone + adapt a reference, or scaffold from scratch). Keyed off job DATA
|
|
720
|
-
// (`bootstrap`), not the agent kind.
|
|
736
|
+
// (`bootstrap`), not the agent kind. Bootstrap/conflict never carry a structured `output`.
|
|
721
737
|
if (job.bootstrap) return runBootstrap(job, opts)
|
|
722
738
|
// Conflict resolution is a coding run with a different pre/post around the agent:
|
|
723
739
|
// clone full, merge the base in to surface the conflicts, then complete the merge
|
|
@@ -727,8 +743,28 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
727
743
|
// sibling, run the agent once across all of them, and open one PR per changed repo. Keyed
|
|
728
744
|
// off job DATA (`peerRepos`), not the agent kind — the implementer sets it when the task
|
|
729
745
|
// has involved services in distinct repos.
|
|
730
|
-
|
|
746
|
+
const result = job.peerRepos?.length
|
|
747
|
+
? await runMultiRepoCoding(job, opts)
|
|
748
|
+
: await runSingleRepoCoding(job, opts)
|
|
731
749
|
|
|
750
|
+
// Structured coding kind (repro-test): fold the final reply's JSON onto `custom` so the
|
|
751
|
+
// backend post-completion resolver records the outcome. Skipped on a failed run (its `error`
|
|
752
|
+
// is the signal) and when there is no reply to parse. Best-effort: a null parse leaves
|
|
753
|
+
// `custom` unset (the run still succeeds on its commits).
|
|
754
|
+
if (job.output?.kind === 'structured' && !result.error && result.summary) {
|
|
755
|
+
const { value } = await resolveReplyCustom(job, result.summary, opts.signal)
|
|
756
|
+
if (value !== null && value !== undefined) result.custom = value
|
|
757
|
+
}
|
|
758
|
+
return result
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
/**
|
|
762
|
+
* The ordinary single-repo coding flow: clone `branch` (or resume `newBranch`), run the agent,
|
|
763
|
+
* commit + push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
|
|
764
|
+
* no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal no-op for
|
|
765
|
+
* the in-place fixers (and for a seed-only kind like `repro-test`).
|
|
766
|
+
*/
|
|
767
|
+
async function runSingleRepoCoding(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
732
768
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
|
|
733
769
|
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
|
|
734
770
|
{
|