@cat-factory/executor-harness 1.31.0 → 1.31.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/tester.js DELETED
@@ -1,191 +0,0 @@
1
- import { execFile } from 'node:child_process';
2
- import { promisify } from 'node:util';
3
- import { cloneRepo } from './git.js';
4
- import { extractJsonObject } from './blueprint.js';
5
- import { agentNeverActed, agentOutputTail, NEVER_ACTED_CAUSE, runAgentInWorkspace, withWorkspace, } from './pi-workspace.js';
6
- import { diagnosticsSuffix, resolveStructuredOutput, } from './structured-output.js';
7
- import { log } from './logger.js';
8
// Promise-returning form of `execFile`, used below for the `docker compose` calls.
const exec = promisify(execFile);

// Tester jobs run asynchronously. When the engine dispatches one, this module
// clones the PR HEAD branch, stands its dependencies up (local docker-compose
// infra, or an ephemeral env to test against), runs Pi to exercise the change and
// regress related behaviour, and returns ONLY a structured JSON report. The Tester
// never commits — when the greenlight is withheld, the engine loops the `fixer`
// and re-tests.

/** Compact description of the report shape, handed to the JSON repair call. */
const REPORT_SHAPE_HINT = [
    'Expected a test report: {"greenlight": boolean, "summary": string, "tested": string[], ',
    '"outcomes": [{"name": string, "status": "passed"|"failed"|"skipped", "detail"?: string}], ',
    '"concerns": [{"title": string, "detail": string, "severity": "low"|"medium"|"high"|"critical"}]}.',
].join('');

/** Concern severities a report may carry. */
const SEVERITIES = new Set(['low', 'medium', 'high', 'critical']);

/** Outcome statuses a report may carry. */
const STATUSES = new Set(['passed', 'failed', 'skipped']);
20
/**
 * Coerce the agent's parsed JSON into a well-formed test report, defaulting
 * conservatively wherever a field is missing or malformed.
 *
 * Enum-like fields (`status`, `severity`) are matched case-insensitively with
 * surrounding whitespace ignored, so an answer such as `"High"` or `" CRITICAL "`
 * still counts as a blocking concern instead of being downgraded to `medium`.
 *
 * @param {unknown} raw - Parsed JSON from the agent; anything that is not a
 *   non-null object is treated as an empty object.
 * @param {string} summary - Raw agent text; its first 2000 characters stand in
 *   when the report carries no non-empty `summary` string.
 * @param {string} env - Copied onto the report as `environment`.
 * @returns {{greenlight: boolean, summary: string, tested: string[],
 *   outcomes: Array<{name: string, status: string, detail?: string}>,
 *   concerns: Array<{title: string, detail: string, severity: string}>,
 *   environment: string}} The normalised report.
 */
function coerceReport(raw, summary, env) {
    const isRecord = (x) => typeof x === 'object' && x !== null;
    // Resolve an enum-like value against `allowed`; unrecognised or non-string
    // values fall back to the given default.
    const pickEnum = (value, allowed, fallback) => {
        if (typeof value !== 'string') {
            return fallback;
        }
        const normalised = value.trim().toLowerCase();
        return allowed.has(normalised) ? normalised : fallback;
    };
    const o = isRecord(raw) ? raw : {};
    const outcomes = Array.isArray(o.outcomes)
        ? o.outcomes.filter(isRecord).map((x) => ({
            name: typeof x.name === 'string' ? x.name : '(unnamed)',
            status: pickEnum(x.status, STATUSES, 'skipped'),
            // `detail` is optional: only carried over when it is a non-empty string.
            ...(typeof x.detail === 'string' && x.detail ? { detail: x.detail } : {}),
        }))
        : [];
    const concerns = Array.isArray(o.concerns)
        ? o.concerns.filter(isRecord).map((x) => ({
            title: typeof x.title === 'string' ? x.title : '(concern)',
            detail: typeof x.detail === 'string' ? x.detail : '',
            severity: pickEnum(x.severity, SEVERITIES, 'medium'),
        }))
        : [];
    // A greenlight is only honoured when no BLOCKING (high/critical) concern was
    // raised — never auto-pass a run with an open blocker, even if the model set
    // greenlight:true by mistake. Low/medium concerns are advisory: they're reported
    // but don't, on their own, withhold the greenlight (which would otherwise burn the
    // whole fixer budget looping on a trivial nit). The engine re-applies this rule.
    const blocking = concerns.some((c) => c.severity === 'high' || c.severity === 'critical');
    const greenlight = o.greenlight === true && !blocking;
    return {
        greenlight,
        summary: typeof o.summary === 'string' && o.summary ? o.summary : summary.slice(0, 2000),
        tested: Array.isArray(o.tested)
            ? o.tested.filter((t) => typeof t === 'string')
            : [],
        outcomes,
        concerns,
        environment: env,
    };
}
63
/**
 * Assemble the tester's task prompt: the job's own prompt, a description of the
 * run mode (how the dependencies are provided), and the closing instruction to
 * answer with the JSON report only.
 *
 * @param {{userPrompt: string, test: {environment: string, environmentUrl?: string, noInfraDependencies?: boolean}}} job
 * @returns {string} The prompt, one section per line, joined with newlines.
 */
function buildUserPrompt(job) {
    const { test } = job;
    let runMode;
    if (test.environment === 'ephemeral') {
        const target = test.environmentUrl
            ? `Test against the deployed environment at ${test.environmentUrl}. Do not start the service locally.`
            : 'Test against the provided ephemeral environment URL from your context. Do not start the service locally.';
        runMode = ['Run mode: ephemeral environment.', target];
    }
    else if (test.noInfraDependencies) {
        runMode = ['Run mode: local, no infra dependencies — just install, build and run the test suite directly.'];
    }
    else {
        runMode = ["Run mode: local. The service's infra dependencies from its docker-compose file have been started and are reachable on localhost. Read the README to learn how to configure the service against them, run any migrations, start the service and exercise it."];
    }
    return [
        job.userPrompt,
        '',
        ...runMode,
        '',
        'Respond with ONLY the JSON test report described in your instructions.',
    ].join('\n');
}
80
/**
 * Start the service's docker-compose dependencies (local mode only), best-effort.
 *
 * Runs `docker compose -f <composePath> up -d --wait` inside the checkout with a
 * five-minute timeout. Any failure is logged as a warning and handed back as a
 * `note` instead of being thrown, so the caller can carry on and tell the agent
 * about it.
 *
 * @param {string} dir - Checkout directory, used as the command's working directory.
 * @param {{environment: string, noInfraDependencies?: boolean, composePath?: string}} test
 * @param {AbortSignal|undefined} signal - Forwarded to the child process.
 * @param {object} trace - Fields merged into every log entry.
 * @returns {Promise<{started: boolean, note?: string}>} `started: true` when the
 *   command succeeded; `started: false` when it was skipped or failed (with the
 *   failure message in `note`).
 */
async function standUpInfra(dir, test, signal, trace) {
    const needsComposeInfra = test.environment === 'local' && !test.noInfraDependencies && Boolean(test.composePath);
    if (!needsComposeInfra) {
        return { started: false };
    }
    const upArgs = ['compose', '-f', test.composePath, 'up', '-d', '--wait'];
    try {
        log.info('test: standing up infra', { ...trace, composePath: test.composePath });
        await exec('docker', upArgs, { cwd: dir, signal, timeout: 5 * 60_000 });
        return { started: true };
    }
    catch (err) {
        let note;
        if (err instanceof Error) {
            note = err.message;
        }
        else {
            note = String(err);
        }
        log.warn('test: infra stand-up failed', { ...trace, error: note });
        return { started: false, note };
    }
}
105
/**
 * Stop the docker-compose dependencies and remove their volumes (best-effort).
 *
 * Runs `docker compose -f <composePath> down -v` inside the checkout with a
 * two-minute timeout, under the same conditions that gate the stand-up. Failures
 * are deliberately ignored.
 *
 * @param {string} dir - Checkout directory, used as the command's working directory.
 * @param {{environment: string, noInfraDependencies?: boolean, composePath?: string}} test
 * @returns {Promise<void>}
 */
async function tearDownInfra(dir, test) {
    const usesComposeInfra = test.environment === 'local' && !test.noInfraDependencies && Boolean(test.composePath);
    if (!usesComposeInfra) {
        return;
    }
    const downArgs = ['compose', '-f', test.composePath, 'down', '-v'];
    try {
        await exec('docker', downArgs, { cwd: dir, timeout: 2 * 60_000 });
    }
    catch {
        // The container is ephemeral and torn down with the run anyway — ignore.
    }
}
119
/**
 * Run one Tester job end to end: clone the branch, stand the infra up, let the
 * agent test, and turn its answer into a structured report.
 *
 * The infra is always torn down again, whether the agent run succeeds or throws.
 *
 * @param {object} job - The tester job (repo, branch, prompts, model, credentials
 *   and `test` settings are read from it).
 * @param {{signal?: AbortSignal}} [opts] - Forwarded to the agent run; `signal`
 *   is also passed to the clone, the infra stand-up and the report resolution.
 * @returns {Promise<object>} `{ report, summary, stats }` on success, or
 *   `{ summary, stats, error }` when no report could be resolved; `usage` is
 *   added to either shape when the agent run reported it.
 */
export async function handleTester(job, opts = {}) {
    const trace = {
        jobId: job.jobId,
        repo: `${job.repo.owner}/${job.repo.name}`,
        branch: job.branch,
    };
    const runInWorkspace = async (dir) => {
        log.info('test: cloning PR branch', trace);
        // The PR branch is cloned by passing it as the repo's base branch.
        const prRepo = { ...job.repo, baseBranch: job.branch };
        await cloneRepo({ repo: prRepo, ghToken: job.ghToken, dir, signal: opts.signal });
        const infra = await standUpInfra(dir, job.test, opts.signal, trace);
        try {
            log.info('test: running agent', { ...trace, environment: job.test.environment });
            const basePrompt = buildUserPrompt(job);
            const userPrompt = infra.note
                ? `${basePrompt}\n\nNote: standing the infra up reported a problem (${infra.note}). Test what you can and flag any dependency-related gaps as concerns.`
                : basePrompt;
            // Harness/credential settings shared by the agent run and the report resolution.
            const agentAccess = {
                harness: job.harness,
                subscriptionToken: job.subscriptionToken,
                subscriptionBaseUrl: job.subscriptionBaseUrl,
                proxyBaseUrl: job.proxyBaseUrl,
                sessionToken: job.sessionToken,
            };
            const agentRun = await runAgentInWorkspace({
                dir,
                systemPrompt: job.systemPrompt,
                userPrompt,
                model: job.model,
                ...agentAccess,
                // The tester only assesses (it commits nothing), so the no-edit guard must
                // not fire on its legitimately edit-free run.
                expectsEdits: false,
            }, opts);
            const { summary, stats, stderrTail, usage } = agentRun;
            const reportSpec = {
                label: 'tester',
                shapeHint: REPORT_SHAPE_HINT,
                parse: (text) => coerceReport(extractJsonObject(text), text, job.test.environment),
            };
            const resolution = await resolveStructuredOutput(reportSpec, summary, {
                ...agentAccess,
                model: job.model,
                jobId: job.jobId,
                signal: opts.signal,
            });
            const { value: report, diagnostics } = resolution;
            if (!report) {
                return {
                    summary,
                    stats,
                    error: noReportReason(stats, stderrTail, diagnostics),
                    ...(usage ? { usage } : {}),
                };
            }
            log.info('test: reported', {
                ...trace,
                greenlight: report.greenlight,
                concerns: report.concerns.length,
            });
            return { report, summary, stats, ...(usage ? { usage } : {}) };
        }
        finally {
            await tearDownInfra(dir, job.test);
        }
    };
    return withWorkspace('test', runInWorkspace);
}
185
/**
 * Build the human-readable explanation for a tester run that yielded no usable
 * report.
 *
 * @param {object} stats - Agent run stats, checked with `agentNeverActed`.
 * @param {string} stderrTail - Agent output tail, formatted by `agentOutputTail`.
 * @param {unknown} diagnostics - When truthy, formatted by `diagnosticsSuffix`
 *   and inserted after the cause.
 * @returns {string} The message: fixed prefix, cause, optional diagnostics, output tail.
 */
function noReportReason(stats, stderrTail, diagnostics) {
    let cause;
    if (agentNeverActed(stats)) {
        cause = NEVER_ACTED_CAUSE;
    }
    else {
        cause = ' The agent did not return a parseable JSON test report.';
    }
    const diagnosticsPart = diagnostics ? diagnosticsSuffix(diagnostics) : '';
    const outputTail = agentOutputTail(stderrTail);
    return `Tester produced no report.${cause}${diagnosticsPart}${outputTail}`;
}