@cat-factory/executor-harness 1.31.0 → 1.31.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/dist/git.js +86 -16
- package/dist/pi-workspace.js +15 -1
- package/package.json +12 -7
- package/src/git.ts +100 -14
- package/src/pi-workspace.ts +17 -1
- package/dist/blueprint.js +0 -367
- package/dist/bootstrap.js +0 -99
- package/dist/ci-fixer.js +0 -46
- package/dist/conflict-resolver.js +0 -138
- package/dist/explore.js +0 -74
- package/dist/fixer.js +0 -44
- package/dist/merger.js +0 -135
- package/dist/on-call.js +0 -126
- package/dist/spec.js +0 -754
- package/dist/tester.js +0 -191
package/dist/tester.js
DELETED
|
@@ -1,191 +0,0 @@
|
|
|
1
|
-
import { execFile } from 'node:child_process';
|
|
2
|
-
import { promisify } from 'node:util';
|
|
3
|
-
import { cloneRepo } from './git.js';
|
|
4
|
-
import { extractJsonObject } from './blueprint.js';
|
|
5
|
-
import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
|
|
6
|
-
import { diagnosticsSuffix, resolveStructuredOutput, } from './structured-output.js';
|
|
7
|
-
import { log } from './logger.js';
|
|
8
|
-
// Promise-returning wrapper around child_process.execFile; used for the docker compose calls.
const exec = promisify(execFile);

// Tester job execution (async). The engine dispatches this job to run a project's
// tests: the PR HEAD branch is cloned, its dependencies are brought up (local
// docker-compose infra, or an ephemeral environment is targeted instead), Pi
// exercises the change and regresses related behaviour, and ONLY a structured JSON
// report comes back. The Tester makes NO commits — when the greenlight is withheld
// the engine loops the `fixer` and re-tests.

/** Short description of the report shape, handed to the JSON repair call. */
const REPORT_SHAPE_HINT = [
    'Expected a test report: {"greenlight": boolean, "summary": string, "tested": string[], ',
    '"outcomes": [{"name": string, "status": "passed"|"failed"|"skipped", "detail"?: string}], ',
    '"concerns": [{"title": string, "detail": string, "severity": "low"|"medium"|"high"|"critical"}]}.',
].join('');

/** Severity values a concern may carry. */
const SEVERITIES = new Set([
    'low',
    'medium',
    'high',
    'critical',
]);

/** Status values an outcome may carry. */
const STATUSES = new Set([
    'passed',
    'failed',
    'skipped',
]);
|
|
20
|
-
/**
 * Coerce the agent's JSON into a well-formed report, defaulting conservatively.
 *
 * @param {unknown} raw - Parsed agent output. Anything that is not a non-null
 *   object is treated as an empty object.
 * @param {string} summary - Raw agent text; its first 2000 characters become the
 *   report summary when the JSON carries no non-empty `summary` string.
 * @param {*} env - Copied verbatim into the report's `environment` field.
 * @returns {{greenlight: boolean, summary: string, tested: string[],
 *   outcomes: Array<{name: string, status: string, detail?: string}>,
 *   concerns: Array<{title: string, detail: string, severity: string}>,
 *   environment: *}} The normalised report.
 */
function coerceReport(raw, summary, env) {
    const isRecord = (x) => typeof x === 'object' && x !== null;
    // Enum-like fields are matched case- and whitespace-insensitively. Without this
    // a model that writes "CRITICAL" or "High" would fall through to the 'medium'
    // default, which is non-blocking — and a mistaken greenlight would then be
    // honoured despite an open blocker.
    const normalizeEnum = (allowed, value, fallback) => {
        const key = typeof value === 'string' ? value.trim().toLowerCase() : value;
        return allowed.has(key) ? key : fallback;
    };
    const o = isRecord(raw) ? raw : {};
    const outcomes = Array.isArray(o.outcomes)
        ? o.outcomes.filter(isRecord).map((x) => ({
            name: typeof x.name === 'string' ? x.name : '(unnamed)',
            status: normalizeEnum(STATUSES, x.status, 'skipped'),
            // `detail` is optional: only carried over when it is a non-empty string.
            ...(typeof x.detail === 'string' && x.detail ? { detail: x.detail } : {}),
        }))
        : [];
    const concerns = Array.isArray(o.concerns)
        ? o.concerns.filter(isRecord).map((x) => ({
            title: typeof x.title === 'string' ? x.title : '(concern)',
            detail: typeof x.detail === 'string' ? x.detail : '',
            severity: normalizeEnum(SEVERITIES, x.severity, 'medium'),
        }))
        : [];
    // A greenlight is only honoured when no BLOCKING (high/critical) concern was
    // raised — never auto-pass a run with an open blocker, even if the model set
    // greenlight:true by mistake. Low/medium concerns are advisory: they're reported
    // but don't, on their own, withhold the greenlight (which would otherwise burn the
    // whole fixer budget looping on a trivial nit). The engine re-applies this rule.
    const blocking = concerns.some((c) => c.severity === 'high' || c.severity === 'critical');
    const greenlight = o.greenlight === true && !blocking;
    return {
        greenlight,
        summary: typeof o.summary === 'string' && o.summary ? o.summary : summary.slice(0, 2000),
        tested: Array.isArray(o.tested) ? o.tested.filter((t) => typeof t === 'string') : [],
        outcomes,
        concerns,
        environment: env,
    };
}
|
|
63
|
-
/**
 * Build the tester task prompt: the job's own prompt, a description of the run
 * mode (how the dependencies are available), and the instruction to answer with
 * the JSON report only.
 *
 * @param {{userPrompt: string, test: {environment: string, environmentUrl?: string,
 *   noInfraDependencies?: boolean, composePath?: string}}} job - The tester job.
 * @returns {string} The newline-joined prompt.
 */
function buildUserPrompt(job) {
    const { test } = job;
    const runMode = [];
    if (test.environment === 'ephemeral') {
        runMode.push('Run mode: ephemeral environment.', test.environmentUrl
            ? `Test against the deployed environment at ${test.environmentUrl}. Do not start the service locally.`
            : 'Test against the provided ephemeral environment URL from your context. Do not start the service locally.');
    }
    else if (test.noInfraDependencies) {
        runMode.push('Run mode: local, no infra dependencies — just install, build and run the test suite directly.');
    }
    else if (!test.composePath) {
        // Without a compose path the infra stand-up step starts nothing (and reports
        // no problem), so the prompt must not claim the dependencies are running.
        runMode.push('Run mode: local. No docker-compose file was configured for this run, so no infra dependencies have been started for you. Read the README to learn how to configure and start the service, test what you can, and flag any dependency-related gaps as concerns.');
    }
    else {
        runMode.push("Run mode: local. The service's infra dependencies from its docker-compose file have been started and are reachable on localhost. Read the README to learn how to configure the service against them, run any migrations, start the service and exercise it.");
    }
    return [
        job.userPrompt,
        '',
        ...runMode,
        '',
        'Respond with ONLY the JSON test report described in your instructions.',
    ].join('\n');
}
|
|
80
|
-
/**
 * Start the service's docker-compose dependencies; only done in local mode when
 * infra dependencies exist and a compose path is set. Best-effort: executes
 * `docker compose -f <path> up -d --wait` from the checkout directory with a
 * five-minute timeout. Any failure (e.g. no Docker daemon, compose error) is
 * logged and handed back as a note instead of failing the job, so the agent can
 * still run unit-level tests and report what it could.
 *
 * @param {string} dir - Checkout directory used as the command's working directory.
 * @param {{environment: string, noInfraDependencies?: boolean, composePath?: string}} test
 * @param {AbortSignal} [signal] - Forwarded to the spawned command.
 * @param {object} trace - Fields merged into every log entry.
 * @returns {Promise<{started: boolean, note?: string}>} `note` holds the failure
 *   message when the stand-up was attempted and failed.
 */
async function standUpInfra(dir, test, signal, trace) {
    const wantsLocalInfra = test.environment === 'local' && !test.noInfraDependencies && Boolean(test.composePath);
    if (!wantsLocalInfra) {
        return { started: false };
    }
    const composeArgs = ['compose', '-f', test.composePath, 'up', '-d', '--wait'];
    const execOptions = { cwd: dir, signal, timeout: 5 * 60_000 };
    try {
        log.info('test: standing up infra', { ...trace, composePath: test.composePath });
        await exec('docker', composeArgs, execOptions);
    }
    catch (err) {
        let note;
        if (err instanceof Error) {
            note = err.message;
        }
        else {
            note = String(err);
        }
        log.warn('test: infra stand-up failed', { ...trace, error: note });
        return { started: false, note };
    }
    return { started: true };
}
|
|
105
|
-
/**
 * Tear the docker-compose dependencies down (best-effort): runs
 * `docker compose -f <path> down -v` from the checkout directory with a
 * two-minute timeout. Does nothing outside local mode, when the job has no infra
 * dependencies, or when no compose path is set. Never throws for a failed
 * tear-down; the failure is logged as a warning instead.
 *
 * @param {string} dir - Checkout directory used as the command's working directory.
 * @param {{environment: string, noInfraDependencies?: boolean, composePath?: string}} test
 * @returns {Promise<void>}
 */
async function tearDownInfra(dir, test) {
    if (test.environment !== 'local' || test.noInfraDependencies || !test.composePath) {
        return;
    }
    try {
        await exec('docker', ['compose', '-f', test.composePath, 'down', '-v'], {
            cwd: dir,
            timeout: 2 * 60_000,
        });
    }
    catch (err) {
        // The container is ephemeral and torn down with the run anyway, so this must
        // not fail the job — but leave a trace rather than swallowing it silently.
        log.warn('test: infra tear-down failed', {
            composePath: test.composePath,
            error: err instanceof Error ? err.message : String(err),
        });
    }
}
|
|
119
|
-
/**
 * Execute a single Tester job from start to finish: clone the branch, stand up
 * the infra, let the agent run the tests, and hand back the report. Infra
 * tear-down always runs once the stand-up step has been attempted, whether the
 * agent run succeeds or throws.
 *
 * @param {object} job - The tester job (repo, branch, prompts, model, credentials, `test` settings).
 * @param {{signal?: AbortSignal}} [opts] - Passed through to the agent run; `signal`
 *   is also forwarded to the clone, the infra stand-up and the structured-output step.
 * @returns {Promise<object>} `{ report, summary, stats, usage? }` when a report was
 *   obtained, otherwise `{ summary, stats, error, usage? }`.
 */
export async function handleTester(job, opts = {}) {
    const trace = {
        jobId: job.jobId,
        repo: `${job.repo.owner}/${job.repo.name}`,
        branch: job.branch,
    };

    // Runs the agent in the prepared checkout and turns its output into a result.
    const runAgentAndReport = async (dir, infra) => {
        log.info('test: running agent', { ...trace, environment: job.test.environment });
        const basePrompt = buildUserPrompt(job);
        const userPrompt = infra.note
            ? `${basePrompt}\n\nNote: standing the infra up reported a problem (${infra.note}). Test what you can and flag any dependency-related gaps as concerns.`
            : basePrompt;
        // Connection/credential settings shared by the agent run and the output step.
        const agentAccess = {
            harness: job.harness,
            subscriptionToken: job.subscriptionToken,
            subscriptionBaseUrl: job.subscriptionBaseUrl,
            proxyBaseUrl: job.proxyBaseUrl,
            sessionToken: job.sessionToken,
        };
        const agentRun = await runAgentInWorkspace({
            dir,
            systemPrompt: job.systemPrompt,
            userPrompt,
            model: job.model,
            ...agentAccess,
            // The tester only assesses (it commits nothing), so the no-edit guard
            // must stay quiet on its legitimately edit-free run.
            expectsEdits: false,
        }, opts);
        const { summary, stats, stderrTail, usage } = agentRun;
        const resolved = await resolveStructuredOutput({
            label: 'tester',
            shapeHint: REPORT_SHAPE_HINT,
            parse: (text) => coerceReport(extractJsonObject(text), text, job.test.environment),
        }, summary, {
            ...agentAccess,
            model: job.model,
            jobId: job.jobId,
            signal: opts.signal,
        });
        const report = resolved.value;
        // `usage` is only included in the result when the agent run reported one.
        const usagePart = usage ? { usage } : {};
        if (!report) {
            const error = noReportReason(stats, stderrTail, resolved.diagnostics);
            return { summary, stats, error, ...usagePart };
        }
        log.info('test: reported', {
            ...trace,
            greenlight: report.greenlight,
            concerns: report.concerns.length,
        });
        return { report, summary, stats, ...usagePart };
    };

    return withWorkspace('test', async (dir) => {
        log.info('test: cloning PR branch', trace);
        // The job's branch is passed to cloneRepo as the repo's `baseBranch`.
        const branchRepo = { ...job.repo, baseBranch: job.branch };
        await cloneRepo({ repo: branchRepo, ghToken: job.ghToken, dir, signal: opts.signal });
        const infra = await standUpInfra(dir, job.test, opts.signal, trace);
        try {
            // `await` here so the tear-down below runs only after the run has settled.
            return await runAgentAndReport(dir, infra);
        }
        finally {
            await tearDownInfra(dir, job.test);
        }
    });
}
|
|
185
|
-
/**
 * Compose the human-readable explanation for a tester run that yielded no usable
 * report: a fixed lead sentence, the cause, the diagnostics suffix (only when
 * diagnostics are present) and the agent output tail.
 *
 * @param {*} stats - Agent run stats, checked with `agentNeverActed`.
 * @param {*} stderrTail - Passed to `agentOutputTail`.
 * @param {*} [diagnostics] - Passed to `diagnosticsSuffix` when truthy.
 * @returns {string} The assembled message.
 */
function noReportReason(stats, stderrTail, diagnostics) {
    let cause = ' The agent did not return a parseable JSON test report.';
    if (agentNeverActed(stats)) {
        cause = NEVER_ACTED_CAUSE;
    }
    let diagnosticsPart = '';
    if (diagnostics) {
        diagnosticsPart = diagnosticsSuffix(diagnostics);
    }
    const outputTail = agentOutputTail(stderrTail);
    return `Tester produced no report.${cause}${diagnosticsPart}${outputTail}`;
}
|