@cat-factory/executor-harness 1.34.4 → 1.34.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.js +240 -110
- package/dist/coding-agent.js +242 -2
- package/dist/job.js +44 -0
- package/dist/pi-workspace.js +1 -0
- package/dist/pi.js +27 -3
- package/package.json +3 -3
- package/src/agent.ts +296 -118
- package/src/coding-agent.ts +289 -1
- package/src/job.ts +77 -0
- package/src/pi-workspace.ts +7 -0
- package/src/pi.ts +30 -3
package/dist/job.js
CHANGED
|
@@ -114,6 +114,42 @@ function parseRepoSpec(repo) {
|
|
|
114
114
|
spec.serviceDirectory = dir;
|
|
115
115
|
return spec;
|
|
116
116
|
}
|
|
117
|
+
/**
|
|
118
|
+
* Parse the optional multi-repo peer list (service-connections phase 3). Each entry carries a
|
|
119
|
+
* full {@link RepoSpec} (validated + sanitised like the primary), the work branch to push, and
|
|
120
|
+
* an optional PR + per-repo token. A malformed list throws; an absent one yields `[]`.
|
|
121
|
+
*/
|
|
122
|
+
function parsePeerRepos(value) {
|
|
123
|
+
if (value === undefined || value === null)
|
|
124
|
+
return [];
|
|
125
|
+
if (!Array.isArray(value))
|
|
126
|
+
throw new Error("Invalid job: 'peerRepos' must be an array");
|
|
127
|
+
return value.map((entry, i) => {
|
|
128
|
+
if (typeof entry !== 'object' || entry === null) {
|
|
129
|
+
throw new Error(`Invalid job: 'peerRepos[${i}]' must be an object`);
|
|
130
|
+
}
|
|
131
|
+
const e = entry;
|
|
132
|
+
const spec = {
|
|
133
|
+
repo: parseRepoSpec((e.repo ?? {})),
|
|
134
|
+
};
|
|
135
|
+
// `newBranch` is required for a coding fan-out (it pushes to it) but ABSENT for a
|
|
136
|
+
// read-only explore fan-out (bug-investigator) — validate it only when present.
|
|
137
|
+
if (e.newBranch !== undefined)
|
|
138
|
+
spec.newBranch = str(e.newBranch, `peerRepos[${i}].newBranch`);
|
|
139
|
+
if (typeof e.frameId === 'string' && e.frameId)
|
|
140
|
+
spec.frameId = e.frameId;
|
|
141
|
+
if (typeof e.ghToken === 'string' && e.ghToken)
|
|
142
|
+
spec.ghToken = e.ghToken;
|
|
143
|
+
if (typeof e.pr === 'object' && e.pr !== null) {
|
|
144
|
+
const p = e.pr;
|
|
145
|
+
spec.pr = {
|
|
146
|
+
title: str(p.title, `peerRepos[${i}].pr.title`),
|
|
147
|
+
body: typeof p.body === 'string' ? p.body : '',
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
return spec;
|
|
151
|
+
});
|
|
152
|
+
}
|
|
117
153
|
/** Parse the optional `repo.provider` discriminator (defaults to undefined ⇒ host inference). */
|
|
118
154
|
function parseVcsProvider(value) {
|
|
119
155
|
if (value === undefined || value === null)
|
|
@@ -442,6 +478,7 @@ export function parseAgentJob(input) {
|
|
|
442
478
|
})()
|
|
443
479
|
: undefined;
|
|
444
480
|
const infra = parseAgentInfraSpec(o.infra);
|
|
481
|
+
const peerRepos = parsePeerRepos(o.peerRepos);
|
|
445
482
|
const bootstrap = parseAgentBootstrapSpec(o.bootstrap);
|
|
446
483
|
const contextFiles = parseContextFiles(o.contextFiles);
|
|
447
484
|
const packageRegistries = parsePackageRegistries(o.packageRegistries);
|
|
@@ -472,6 +509,7 @@ export function parseAgentJob(input) {
|
|
|
472
509
|
? { commitMessage: o.commitMessage }
|
|
473
510
|
: {}),
|
|
474
511
|
...(pr ? { pr } : {}),
|
|
512
|
+
...(peerRepos.length ? { peerRepos } : {}),
|
|
475
513
|
...(o.noChangesIsError === false ? { noChangesIsError: false } : {}),
|
|
476
514
|
...(o.persistentCheckout === true ? { persistentCheckout: true } : {}),
|
|
477
515
|
...(o.streamFollowUps === true ? { streamFollowUps: true } : {}),
|
|
@@ -484,5 +522,11 @@ export function parseAgentJob(input) {
|
|
|
484
522
|
// allowed GitHub host too (the installation token is sent to it on the force-push).
|
|
485
523
|
if (job.bootstrap)
|
|
486
524
|
assertAllowedHost(job.bootstrap.target.cloneUrl, 'bootstrap.target.cloneUrl');
|
|
525
|
+
// Each peer repo's clone URL receives the installation token on clone/push, so it must be
|
|
526
|
+
// an allowed GitHub host too — a body-supplied peer pointing at an attacker host would
|
|
527
|
+
// exfiltrate the token exactly like a rogue primary clone URL.
|
|
528
|
+
for (const [i, peer] of (job.peerRepos ?? []).entries()) {
|
|
529
|
+
assertAllowedHost(peer.repo.cloneUrl, `peerRepos[${i}].repo.cloneUrl`);
|
|
530
|
+
}
|
|
487
531
|
return job;
|
|
488
532
|
}
|
package/dist/pi-workspace.js
CHANGED
|
@@ -165,6 +165,7 @@ export async function runAgentInWorkspace(spec, opts = {}) {
|
|
|
165
165
|
guidance: spec.webToolsGuidance,
|
|
166
166
|
serviceDirectory: spec.serviceDirectory,
|
|
167
167
|
contextFiles,
|
|
168
|
+
...(spec.multiRepo ? { multiRepo: true } : {}),
|
|
168
169
|
});
|
|
169
170
|
await writePiModelsConfig({ model: spec.model, proxyBaseUrl });
|
|
170
171
|
const { signal, onActivity, onProgress, onSpan } = opts;
|
package/dist/pi.js
CHANGED
|
@@ -131,13 +131,37 @@ export async function writeAgentsContext(systemPrompt, opts = {}) {
|
|
|
131
131
|
const webTools = opts.webSearch ? (opts.guidance ?? WEB_TOOLS_GUIDANCE) : '';
|
|
132
132
|
// Tell the agent it's in a monorepo and which subtree is its service, so it scopes
|
|
133
133
|
// its work (and its build/test commands) there. Only present when the dispatcher
|
|
134
|
-
// resolved a monorepo service directory; the agent's cwd already points at it.
|
|
135
|
-
|
|
134
|
+
// resolved a monorepo service directory; the agent's cwd already points at it. A
|
|
135
|
+
// MULTI-REPO run runs at the workspace root (cwd spans sibling checkouts), so the
|
|
136
|
+
// monorepo note is suppressed there — the multi-repo mechanics note replaces it.
|
|
137
|
+
const monorepo = opts.serviceDirectory && !opts.multiRepo ? monorepoGuidance(opts.serviceDirectory) : '';
|
|
138
|
+
// Multi-repo mechanics note (service-connections phase 3): the concrete repo→role mapping
|
|
139
|
+
// is in the backend-composed system prompt above; this explains the shared MECHANICS (cwd
|
|
140
|
+
// is the workspace root, repos are sibling checkouts, one PR per dirty repo).
|
|
141
|
+
const multiRepo = opts.multiRepo ? MULTI_REPO_GUIDANCE : '';
|
|
136
142
|
// Point the agent at any linked context the backend materialised into the checkout
|
|
137
143
|
// (requirements / RFCs / PRDs / tracker issues) so it reads them on demand.
|
|
138
144
|
const context = contextGuidance(opts.contextFiles ?? []);
|
|
139
|
-
await writeFile(join(dir, 'AGENTS.md'), `${systemPrompt}${BLUEPRINT_GUIDANCE}${SPEC_GUIDANCE}${TODO_GUIDANCE}${monorepo}${webTools}${context}`, 'utf8');
|
|
145
|
+
await writeFile(join(dir, 'AGENTS.md'), `${systemPrompt}${BLUEPRINT_GUIDANCE}${SPEC_GUIDANCE}${TODO_GUIDANCE}${monorepo}${multiRepo}${webTools}${context}`, 'utf8');
|
|
140
146
|
}
|
|
147
|
+
/** The MULTI-REPO mechanics note appended to AGENTS.md when a run spans sibling checkouts. */
|
|
148
|
+
const MULTI_REPO_GUIDANCE = `
|
|
149
|
+
|
|
150
|
+
## Multi-repo workspace (work across sibling checkouts)
|
|
151
|
+
|
|
152
|
+
This task spans MORE THAN ONE repository. Your working directory is the WORKSPACE ROOT, and
|
|
153
|
+
each involved repository is checked out as a sibling directory directly under it. The workspace
|
|
154
|
+
root itself is NOT a git repository — run git INSIDE each repository's directory. The system
|
|
155
|
+
prompt above lists which repository is which and each one's role. Make the cross-service
|
|
156
|
+
change coherently across every repository the task requires — a provider's API and its
|
|
157
|
+
consumer's call site belong in the SAME piece of work. Run each repository's own build/test
|
|
158
|
+
commands inside that repository's directory.
|
|
159
|
+
|
|
160
|
+
Commit your own work inside each repository you change (\`cd\` into it, stage the files that
|
|
161
|
+
belong — INCLUDING any new files you added — and commit). The platform will NOT add untracked
|
|
162
|
+
files for you, so anything you leave uncommitted and untracked is lost. Each repository you
|
|
163
|
+
change is opened as a SEPARATE pull request; leave a repository untouched if the task does not
|
|
164
|
+
require changing it.`;
|
|
141
165
|
/** Directory in the checkout where linked-context files are materialised (see CONTEXT_DIR in agents). */
|
|
142
166
|
export const CONTEXT_DIR = '.cat-context';
|
|
143
167
|
/** The AGENTS.md block enumerating the materialised linked-context files, or '' when none. */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/executor-harness",
|
|
3
|
-
"version": "1.34.
|
|
3
|
+
"version": "1.34.10",
|
|
4
4
|
"description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,8 +26,8 @@
|
|
|
26
26
|
"hono": "^4.12.27",
|
|
27
27
|
"typescript": "^6.0.3",
|
|
28
28
|
"vitest": "^4.1.9",
|
|
29
|
-
"@cat-factory/
|
|
30
|
-
"@cat-factory/
|
|
29
|
+
"@cat-factory/spend": "0.10.95",
|
|
30
|
+
"@cat-factory/server": "0.82.0"
|
|
31
31
|
},
|
|
32
32
|
"scripts": {
|
|
33
33
|
"build": "tsc -p tsconfig.json",
|
package/src/agent.ts
CHANGED
|
@@ -26,8 +26,13 @@ import {
|
|
|
26
26
|
reinitAndPush,
|
|
27
27
|
unmergedPaths,
|
|
28
28
|
} from './git.js'
|
|
29
|
-
import type { PiRunStats } from './pi.js'
|
|
30
|
-
import {
|
|
29
|
+
import type { PiRunStats, RunDiagnostics } from './pi.js'
|
|
30
|
+
import {
|
|
31
|
+
makeDirClaimer,
|
|
32
|
+
noChangesReason,
|
|
33
|
+
runCodingAgent,
|
|
34
|
+
runMultiRepoCoding,
|
|
35
|
+
} from './coding-agent.js'
|
|
31
36
|
import {
|
|
32
37
|
acquireRepoCheckout,
|
|
33
38
|
agentNeverActed,
|
|
@@ -214,6 +219,47 @@ async function tearDownInfra(dir: string, infra: ServiceInfraSpec): Promise<void
|
|
|
214
219
|
}
|
|
215
220
|
}
|
|
216
221
|
|
|
222
|
+
/**
|
|
223
|
+
* Parse an agent's final reply into the structured JSON `custom`, shared by the explore and
|
|
224
|
+
* coding structured-output paths. With repair enabled (default) a malformed reply gets ONE
|
|
225
|
+
* structured repair call before giving up; with `output.repair === false` it parses directly.
|
|
226
|
+
* Returns the parsed value (or null when unusable) plus the repair diagnostics. Never throws —
|
|
227
|
+
* a parse failure is a null value, and each caller decides whether that is fatal (explore: yes;
|
|
228
|
+
* coding: no, the pushed commits are the deliverable).
|
|
229
|
+
*/
|
|
230
|
+
async function resolveReplyCustom(
|
|
231
|
+
job: AgentJob,
|
|
232
|
+
summary: string,
|
|
233
|
+
signal: AbortSignal | undefined,
|
|
234
|
+
): Promise<{ value: unknown; diagnostics?: StructuredOutputDiagnostics }> {
|
|
235
|
+
if (job.output?.repair === false) {
|
|
236
|
+
try {
|
|
237
|
+
return { value: extractJsonObject(summary) }
|
|
238
|
+
} catch {
|
|
239
|
+
return { value: null }
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
const resolved = await resolveStructuredOutput(
|
|
243
|
+
{
|
|
244
|
+
label: 'agent',
|
|
245
|
+
shapeHint: job.output?.shapeHint ?? 'Expected a single JSON object.',
|
|
246
|
+
parse: (text) => extractJsonObject(text),
|
|
247
|
+
},
|
|
248
|
+
summary,
|
|
249
|
+
{
|
|
250
|
+
harness: job.harness,
|
|
251
|
+
subscriptionToken: job.subscriptionToken,
|
|
252
|
+
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
253
|
+
proxyBaseUrl: job.proxyBaseUrl,
|
|
254
|
+
sessionToken: job.sessionToken,
|
|
255
|
+
model: job.model,
|
|
256
|
+
jobId: job.jobId,
|
|
257
|
+
signal,
|
|
258
|
+
},
|
|
259
|
+
)
|
|
260
|
+
return { value: resolved.value, diagnostics: resolved.diagnostics }
|
|
261
|
+
}
|
|
262
|
+
|
|
217
263
|
/** Extract the first JSON object from an agent's final message (tolerating fences/prose). */
|
|
218
264
|
function extractJsonObject(text: string): unknown {
|
|
219
265
|
const trimmed = text.trim()
|
|
@@ -370,6 +416,14 @@ async function runPreviewMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
370
416
|
*/
|
|
371
417
|
async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
372
418
|
const logger = opts.log ?? log
|
|
419
|
+
// Multi-repo read-only exploration (service-connections phase 3): when the job carries peer
|
|
420
|
+
// repos, clone them all as siblings and run at the workspace root. Keyed off job DATA
|
|
421
|
+
// (`peerRepos`), not the agent kind — the backend sets it for the bug-investigator when the
|
|
422
|
+
// task has involved services in distinct repos. `runMultiRepoExplore` uses its own ephemeral
|
|
423
|
+
// `withWorkspace`, so a `persistentCheckout` flag (which a warm-pool dispatch injects on EVERY
|
|
424
|
+
// job) is harmlessly ignored — it must NOT suppress the fan-out, or a pooled bug-investigator
|
|
425
|
+
// would silently drop its peer repos and only ever see the primary one.
|
|
426
|
+
if (job.peerRepos?.length) return runMultiRepoExplore(job, opts)
|
|
373
427
|
return acquireRepoCheckout(
|
|
374
428
|
{ persistent: job.persistentCheckout === true, prefix: 'agent-explore', repo: job.repo },
|
|
375
429
|
async (dir) => {
|
|
@@ -453,117 +507,11 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
453
507
|
opts,
|
|
454
508
|
)
|
|
455
509
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
failureCause: 'no-usable-output',
|
|
462
|
-
...(usage ? { usage } : {}),
|
|
463
|
-
...(callMetrics ? { callMetrics } : {}),
|
|
464
|
-
...infraSetupFields,
|
|
465
|
-
}
|
|
466
|
-
}
|
|
467
|
-
|
|
468
|
-
// Opt-in (document producers): a final answer cut off at the output ceiling — or empty —
|
|
469
|
-
// must FAIL LOUDLY here, BEFORE the structured repair below could launder a truncated
|
|
470
|
-
// reply into a half-baked doc the backend then shards/commits + hands onward. Mirrors the
|
|
471
|
-
// bespoke `/spec` handler's `unusableFinalAnswerCause` gate (which drove the old loop).
|
|
472
|
-
if (job.output?.kind === 'structured' && job.output.failOnUnusableFinal) {
|
|
473
|
-
const unusable = unusableFinalAnswerCause(runDiag)
|
|
474
|
-
if (unusable) {
|
|
475
|
-
return {
|
|
476
|
-
summary,
|
|
477
|
-
stats,
|
|
478
|
-
error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
|
|
479
|
-
failureCause: 'no-usable-output',
|
|
480
|
-
...(usage ? { usage } : {}),
|
|
481
|
-
...(callMetrics ? { callMetrics } : {}),
|
|
482
|
-
...infraSetupFields,
|
|
483
|
-
}
|
|
484
|
-
}
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
// Prose: the summary IS the deliverable.
|
|
488
|
-
if (job.output?.kind !== 'structured') {
|
|
489
|
-
logger.info('agent(explore): done (prose)', { ...stats })
|
|
490
|
-
return {
|
|
491
|
-
summary,
|
|
492
|
-
stats,
|
|
493
|
-
...(usage ? { usage } : {}),
|
|
494
|
-
...(callMetrics ? { callMetrics } : {}),
|
|
495
|
-
...infraSetupFields,
|
|
496
|
-
}
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
// Structured: parse the agent's JSON. With repair enabled (default) a malformed
|
|
500
|
-
// reply gets ONE structured repair call before giving up; with `repair:false` we
|
|
501
|
-
// parse directly (no repair channel). The backend coerces/validates + renders from
|
|
502
|
-
// the returned object in a post-op.
|
|
503
|
-
let custom: unknown = null
|
|
504
|
-
let diagnostics: StructuredOutputDiagnostics | undefined
|
|
505
|
-
if (job.output.repair === false) {
|
|
506
|
-
try {
|
|
507
|
-
custom = extractJsonObject(summary)
|
|
508
|
-
} catch {
|
|
509
|
-
custom = null
|
|
510
|
-
}
|
|
511
|
-
} else {
|
|
512
|
-
const resolved = await resolveStructuredOutput(
|
|
513
|
-
{
|
|
514
|
-
label: 'agent',
|
|
515
|
-
shapeHint: job.output.shapeHint ?? 'Expected a single JSON object.',
|
|
516
|
-
parse: (text) => extractJsonObject(text),
|
|
517
|
-
},
|
|
518
|
-
summary,
|
|
519
|
-
{
|
|
520
|
-
harness: job.harness,
|
|
521
|
-
subscriptionToken: job.subscriptionToken,
|
|
522
|
-
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
523
|
-
proxyBaseUrl: job.proxyBaseUrl,
|
|
524
|
-
sessionToken: job.sessionToken,
|
|
525
|
-
model: job.model,
|
|
526
|
-
jobId: job.jobId,
|
|
527
|
-
signal: opts.signal,
|
|
528
|
-
},
|
|
529
|
-
)
|
|
530
|
-
custom = resolved.value
|
|
531
|
-
diagnostics = resolved.diagnostics
|
|
532
|
-
}
|
|
533
|
-
if (custom === undefined || custom === null) {
|
|
534
|
-
return {
|
|
535
|
-
summary,
|
|
536
|
-
stats,
|
|
537
|
-
error: noStructuredReason(stats, stderrTail, diagnostics),
|
|
538
|
-
failureCause: 'no-usable-output',
|
|
539
|
-
...(usage ? { usage } : {}),
|
|
540
|
-
...(callMetrics ? { callMetrics } : {}),
|
|
541
|
-
...infraSetupFields,
|
|
542
|
-
}
|
|
543
|
-
}
|
|
544
|
-
// Stamp the run's actual environment authoritatively onto the structured result when
|
|
545
|
-
// infra was managed (the tester): which env the suite ran in is decided by the job's
|
|
546
|
-
// infra spec, NOT the model, so the backend can echo it back to the UI deterministically
|
|
547
|
-
// even when the model omits it from its JSON (or a structured repair drops it). A
|
|
548
|
-
// frontend run tests the app against its live ephemeral backend(s), so it reports
|
|
549
|
-
// `ephemeral` (the TestReport env vocabulary has no separate frontend value).
|
|
550
|
-
const reportedEnvironment = infra
|
|
551
|
-
? infra.kind === 'frontend'
|
|
552
|
-
? 'ephemeral'
|
|
553
|
-
: infra.environment
|
|
554
|
-
: undefined
|
|
555
|
-
if (reportedEnvironment && typeof custom === 'object') {
|
|
556
|
-
;(custom as Record<string, unknown>).environment = reportedEnvironment
|
|
557
|
-
}
|
|
558
|
-
logger.info('agent(explore): done (structured)', { ...stats })
|
|
559
|
-
return {
|
|
560
|
-
summary,
|
|
561
|
-
custom,
|
|
562
|
-
stats,
|
|
563
|
-
...(usage ? { usage } : {}),
|
|
564
|
-
...(callMetrics ? { callMetrics } : {}),
|
|
565
|
-
...infraSetupFields,
|
|
566
|
-
}
|
|
510
|
+
return await finalizeExploreResult(
|
|
511
|
+
job,
|
|
512
|
+
{ summary, stats, stderrTail, usage, callMetrics, runDiag },
|
|
513
|
+
{ infra, infraSetupFields, logger, signal: opts.signal },
|
|
514
|
+
)
|
|
567
515
|
} finally {
|
|
568
516
|
if (managed) await managed.cleanup()
|
|
569
517
|
}
|
|
@@ -571,22 +519,252 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
571
519
|
)
|
|
572
520
|
}
|
|
573
521
|
|
|
522
|
+
/** The agent-run outputs the explore result-parsing reads (shared single-/multi-repo). */
|
|
523
|
+
interface ExploreAgentRun {
|
|
524
|
+
summary: string
|
|
525
|
+
stats: PiRunStats
|
|
526
|
+
stderrTail?: string
|
|
527
|
+
usage?: AgentResult['usage']
|
|
528
|
+
callMetrics?: AgentResult['callMetrics']
|
|
529
|
+
runDiag?: RunDiagnostics
|
|
530
|
+
}
|
|
531
|
+
|
|
574
532
|
/**
|
|
575
|
-
*
|
|
576
|
-
*
|
|
577
|
-
*
|
|
578
|
-
*
|
|
533
|
+
* Turn an explore agent's raw run into an {@link AgentResult}: guard an empty/truncated reply,
|
|
534
|
+
* then either return the prose summary or parse (+ optionally repair) the structured JSON as
|
|
535
|
+
* `custom` — the backend renders any artifact files from it in a post-op. Extracted so the
|
|
536
|
+
* single-repo {@link runExploreMode} and the read-only {@link runMultiRepoExplore} share ONE
|
|
537
|
+
* result contract (the multi-repo path passes no infra, so the tester-only env stamping no-ops).
|
|
538
|
+
*/
|
|
539
|
+
async function finalizeExploreResult(
|
|
540
|
+
job: AgentJob,
|
|
541
|
+
run: ExploreAgentRun,
|
|
542
|
+
ctx: {
|
|
543
|
+
infra?: AgentInfraSpec | ServiceInfraSpec
|
|
544
|
+
infraSetupFields: { infraSetup?: InfraSetupRecord }
|
|
545
|
+
logger: Logger
|
|
546
|
+
signal?: AbortSignal
|
|
547
|
+
},
|
|
548
|
+
): Promise<AgentResult> {
|
|
549
|
+
const { summary, stats, stderrTail, usage, callMetrics, runDiag } = run
|
|
550
|
+
const { infra, infraSetupFields, logger, signal } = ctx
|
|
551
|
+
|
|
552
|
+
if (!summary.trim()) {
|
|
553
|
+
return {
|
|
554
|
+
summary,
|
|
555
|
+
stats,
|
|
556
|
+
error: noOutputReason(stats, stderrTail),
|
|
557
|
+
failureCause: 'no-usable-output',
|
|
558
|
+
...(usage ? { usage } : {}),
|
|
559
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
560
|
+
...infraSetupFields,
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Opt-in (document producers): a final answer cut off at the output ceiling — or empty —
|
|
565
|
+
// must FAIL LOUDLY here, BEFORE the structured repair below could launder a truncated
|
|
566
|
+
// reply into a half-baked doc the backend then shards/commits + hands onward. Mirrors the
|
|
567
|
+
// bespoke `/spec` handler's `unusableFinalAnswerCause` gate (which drove the old loop).
|
|
568
|
+
if (job.output?.kind === 'structured' && job.output.failOnUnusableFinal) {
|
|
569
|
+
const unusable = unusableFinalAnswerCause(runDiag)
|
|
570
|
+
if (unusable) {
|
|
571
|
+
return {
|
|
572
|
+
summary,
|
|
573
|
+
stats,
|
|
574
|
+
error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
|
|
575
|
+
failureCause: 'no-usable-output',
|
|
576
|
+
...(usage ? { usage } : {}),
|
|
577
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
578
|
+
...infraSetupFields,
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
// Prose: the summary IS the deliverable.
|
|
584
|
+
if (job.output?.kind !== 'structured') {
|
|
585
|
+
logger.info('agent(explore): done (prose)', { ...stats })
|
|
586
|
+
return {
|
|
587
|
+
summary,
|
|
588
|
+
stats,
|
|
589
|
+
...(usage ? { usage } : {}),
|
|
590
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
591
|
+
...infraSetupFields,
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
// Structured: parse the agent's JSON via the shared resolver. With repair enabled (default)
|
|
596
|
+
// a malformed reply gets ONE structured repair call before giving up; with `repair:false` it
|
|
597
|
+
// parses directly (no repair channel). The backend coerces/validates + renders from the
|
|
598
|
+
// returned object in a post-op. Unlike the coding path, an unparseable explore reply IS a
|
|
599
|
+
// failure — the report/JSON is the whole deliverable.
|
|
600
|
+
const { value: custom, diagnostics } = await resolveReplyCustom(job, summary, signal)
|
|
601
|
+
if (custom === undefined || custom === null) {
|
|
602
|
+
return {
|
|
603
|
+
summary,
|
|
604
|
+
stats,
|
|
605
|
+
error: noStructuredReason(stats, stderrTail, diagnostics),
|
|
606
|
+
failureCause: 'no-usable-output',
|
|
607
|
+
...(usage ? { usage } : {}),
|
|
608
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
609
|
+
...infraSetupFields,
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
// Stamp the run's actual environment authoritatively onto the structured result when
|
|
613
|
+
// infra was managed (the tester): which env the suite ran in is decided by the job's
|
|
614
|
+
// infra spec, NOT the model, so the backend can echo it back to the UI deterministically
|
|
615
|
+
// even when the model omits it from its JSON (or a structured repair drops it). A
|
|
616
|
+
// frontend run tests the app against its live ephemeral backend(s), so it reports
|
|
617
|
+
// `ephemeral` (the TestReport env vocabulary has no separate frontend value).
|
|
618
|
+
const reportedEnvironment = infra
|
|
619
|
+
? infra.kind === 'frontend'
|
|
620
|
+
? 'ephemeral'
|
|
621
|
+
: infra.environment
|
|
622
|
+
: undefined
|
|
623
|
+
if (reportedEnvironment && typeof custom === 'object') {
|
|
624
|
+
;(custom as Record<string, unknown>).environment = reportedEnvironment
|
|
625
|
+
}
|
|
626
|
+
logger.info('agent(explore): done (structured)', { ...stats })
|
|
627
|
+
return {
|
|
628
|
+
summary,
|
|
629
|
+
custom,
|
|
630
|
+
stats,
|
|
631
|
+
...(usage ? { usage } : {}),
|
|
632
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
633
|
+
...infraSetupFields,
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
/**
|
|
638
|
+
* Read-only MULTI-REPO exploration (service-connections phase 3, read-only): clone the primary
|
|
639
|
+
* repo PLUS every connected peer repo as SIBLING checkouts under one workspace root, run the
|
|
640
|
+
* agent ONCE with its cwd at the root (so it can read across every repo the bug touches), and
|
|
641
|
+
* return its prose/structured result — making NO edits, NO commits and opening NO PR. The
|
|
642
|
+
* counterpart of {@link runMultiRepoCoding} for the `bug-investigator`, but strictly read-only:
|
|
643
|
+
* peers carry no `newBranch`/`pr`, nothing is pushed, and the peers exist only to be read. The
|
|
644
|
+
* multi-repo layout is explained to the agent by the backend-composed system-prompt section
|
|
645
|
+
* (which repo/subdir each service lives in) + the harness's own AGENTS.md multi-repo note.
|
|
646
|
+
*/
|
|
647
|
+
async function runMultiRepoExplore(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
648
|
+
const logger = (opts.log ?? log).child({ kind: 'multi-repo-explore', jobId: job.jobId })
|
|
649
|
+
const peers = job.peerRepos ?? []
|
|
650
|
+
|
|
651
|
+
// Unique sibling directory per repo (owner-prefixed on a name collision), so two repos
|
|
652
|
+
// named the same never clobber each other — shared claim scheme with the coding fan-out.
|
|
653
|
+
const claimDir = makeDirClaimer()
|
|
654
|
+
const legs = [
|
|
655
|
+
{ repo: job.repo, cloneBranch: job.branch, ghToken: job.ghToken },
|
|
656
|
+
...peers.map((peer) => ({
|
|
657
|
+
repo: peer.repo,
|
|
658
|
+
cloneBranch: peer.repo.baseBranch,
|
|
659
|
+
ghToken: peer.ghToken ?? job.ghToken,
|
|
660
|
+
})),
|
|
661
|
+
].map((leg) => ({ ...leg, dirName: claimDir(leg.repo) }))
|
|
662
|
+
|
|
663
|
+
return withWorkspace('explore-multi', async (root) => {
|
|
664
|
+
// Clone phase: every repo (read-only) into its sibling dir under the workspace root. No
|
|
665
|
+
// work branch, no resume — the investigator only reads — so the legs are independent and
|
|
666
|
+
// clone in parallel (wall-clock is the slowest single clone, not the sum).
|
|
667
|
+
opts.onPhase?.('clone')
|
|
668
|
+
await Promise.all(
|
|
669
|
+
legs.map(async (leg) => {
|
|
670
|
+
const dir = join(root, leg.dirName)
|
|
671
|
+
await mkdir(dir, { recursive: true })
|
|
672
|
+
logger.info('multi-repo-explore: cloning', {
|
|
673
|
+
repo: leg.dirName,
|
|
674
|
+
cloneBranch: leg.cloneBranch,
|
|
675
|
+
})
|
|
676
|
+
await cloneRepo({
|
|
677
|
+
repo: { ...leg.repo, baseBranch: leg.cloneBranch },
|
|
678
|
+
ghToken: leg.ghToken,
|
|
679
|
+
dir,
|
|
680
|
+
signal: opts.signal,
|
|
681
|
+
})
|
|
682
|
+
}),
|
|
683
|
+
)
|
|
684
|
+
|
|
685
|
+
opts.onPhase?.('agent')
|
|
686
|
+
logger.info('multi-repo-explore: running agent', { repos: legs.map((l) => l.dirName) })
|
|
687
|
+
const run = await runAgentInWorkspace(
|
|
688
|
+
{
|
|
689
|
+
dir: root,
|
|
690
|
+
systemPrompt: job.systemPrompt,
|
|
691
|
+
userPrompt: job.userPrompt,
|
|
692
|
+
model: job.model,
|
|
693
|
+
harness: job.harness,
|
|
694
|
+
subscriptionToken: job.subscriptionToken,
|
|
695
|
+
subscriptionBaseUrl: job.subscriptionBaseUrl,
|
|
696
|
+
ambientAuth: job.ambientAuth,
|
|
697
|
+
proxyBaseUrl: job.proxyBaseUrl,
|
|
698
|
+
sessionToken: job.sessionToken,
|
|
699
|
+
// Read-only: no edits expected, so the no-progress guard's no-edit bound must not fire.
|
|
700
|
+
expectsEdits: false,
|
|
701
|
+
webToolsGuidance: job.webToolsGuidance,
|
|
702
|
+
webSearchProxy: job.webSearch,
|
|
703
|
+
...(job.contextFiles ? { contextFiles: job.contextFiles } : {}),
|
|
704
|
+
guardLimits: job.guardLimits,
|
|
705
|
+
multiRepo: true,
|
|
706
|
+
},
|
|
707
|
+
opts,
|
|
708
|
+
)
|
|
709
|
+
return finalizeExploreResult(
|
|
710
|
+
job,
|
|
711
|
+
{
|
|
712
|
+
summary: run.summary,
|
|
713
|
+
stats: run.stats,
|
|
714
|
+
stderrTail: run.stderrTail,
|
|
715
|
+
usage: run.usage,
|
|
716
|
+
callMetrics: run.callMetrics,
|
|
717
|
+
runDiag: run.diagnostics,
|
|
718
|
+
},
|
|
719
|
+
{ infraSetupFields: {}, logger, signal: opts.signal },
|
|
720
|
+
)
|
|
721
|
+
})
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
/**
|
|
725
|
+
* Edit-and-push coding, dispatching on job DATA: repo-bootstrap (force-push a fresh history to a
|
|
726
|
+
* separate target repo), conflict-resolution (merge the base in, resolve, push back), multi-repo
|
|
727
|
+
* fan-out (sibling checkouts + one PR per changed repo), else the ordinary single-repo flow.
|
|
728
|
+
* After the flow, a STRUCTURED coding kind (e.g. `repro-test`, whose deliverable is BOTH a pushed
|
|
729
|
+
* commit AND a JSON outcome) parses its final reply into `custom` — best-effort, so an unparseable
|
|
730
|
+
* outcome degrades to no `custom` (the backend resolver then defaults) rather than failing the
|
|
731
|
+
* run, whose real deliverable is the pushed commits.
|
|
579
732
|
*/
|
|
580
733
|
async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
581
734
|
// Repo bootstrap is a coding run that force-pushes a fresh history to a SEPARATE target
|
|
582
735
|
// repo (clone + adapt a reference, or scaffold from scratch). Keyed off job DATA
|
|
583
|
-
// (`bootstrap`), not the agent kind.
|
|
736
|
+
// (`bootstrap`), not the agent kind. Bootstrap/conflict never carry a structured `output`.
|
|
584
737
|
if (job.bootstrap) return runBootstrap(job, opts)
|
|
585
738
|
// Conflict resolution is a coding run with a different pre/post around the agent:
|
|
586
739
|
// clone full, merge the base in to surface the conflicts, then complete the merge
|
|
587
740
|
// commit + push (no PR). Keyed off job DATA (`mergeBase`), not the agent kind.
|
|
588
741
|
if (job.mergeBase) return runConflictResolution(job, opts)
|
|
742
|
+
// Multi-repo coding (service-connections phase 3): clone every connected peer repo as a
|
|
743
|
+
// sibling, run the agent once across all of them, and open one PR per changed repo. Keyed
|
|
744
|
+
// off job DATA (`peerRepos`), not the agent kind — the implementer sets it when the task
|
|
745
|
+
// has involved services in distinct repos.
|
|
746
|
+
const result = job.peerRepos?.length
|
|
747
|
+
? await runMultiRepoCoding(job, opts)
|
|
748
|
+
: await runSingleRepoCoding(job, opts)
|
|
749
|
+
|
|
750
|
+
// Structured coding kind (repro-test): fold the final reply's JSON onto `custom` so the
|
|
751
|
+
// backend post-completion resolver records the outcome. Skipped on a failed run (its `error`
|
|
752
|
+
// is the signal) and when there is no reply to parse. Best-effort: a null parse leaves
|
|
753
|
+
// `custom` unset (the run still succeeds on its commits).
|
|
754
|
+
if (job.output?.kind === 'structured' && !result.error && result.summary) {
|
|
755
|
+
const { value } = await resolveReplyCustom(job, result.summary, opts.signal)
|
|
756
|
+
if (value !== null && value !== undefined) result.custom = value
|
|
757
|
+
}
|
|
758
|
+
return result
|
|
759
|
+
}
|
|
589
760
|
|
|
761
|
+
/**
|
|
762
|
+
* The ordinary single-repo coding flow: clone `branch` (or resume `newBranch`), run the agent,
|
|
763
|
+
* commit + push to `pushBranch`, and open `pr` when one is set and the run produced changes. A
|
|
764
|
+
* no-op is a failure for the implementer (`noChangesIsError` default) and a non-fatal no-op for
|
|
765
|
+
* the in-place fixers (and for a seed-only kind like `repro-test`).
|
|
766
|
+
*/
|
|
767
|
+
async function runSingleRepoCoding(job: AgentJob, opts: RunOptions): Promise<AgentResult> {
|
|
590
768
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
|
|
591
769
|
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
|
|
592
770
|
{
|