@smithers-orchestrator/cli 0.20.4 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-detection.d.ts +16 -3
- package/dist/argv-utils.d.ts +21 -0
- package/dist/eval-suite.d.ts +201 -0
- package/dist/hijack.d.ts +1 -1
- package/dist/json-args.d.ts +24 -0
- package/dist/token-store.d.ts +8 -0
- package/dist/workflows.d.ts +30 -1
- package/package.json +16 -16
- package/src/AgentAvailability.ts +3 -1
- package/src/AskOptions.ts +1 -1
- package/src/DiscoveredWorkflow.ts +4 -0
- package/src/NativeHijackEngine.ts +1 -0
- package/src/agent-commands/agentAddWizard.js +16 -3
- package/src/agent-commands/regenerateAgentsTsIfPresent.js +15 -2
- package/src/agent-commands/runAgentAdd.js +14 -2
- package/src/agent-detection.js +123 -22
- package/src/argv-utils.js +73 -0
- package/src/ask.js +13 -2
- package/src/eval-suite.js +560 -0
- package/src/hijack.js +9 -0
- package/src/index.js +335 -173
- package/src/json-args.js +59 -0
- package/src/mcp/semantic-tools.js +9 -1
- package/src/token-store.js +39 -0
- package/src/workflow-pack.js +238 -10
- package/src/workflows.js +193 -5
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { basename, dirname, extname, isAbsolute, join, resolve } from "node:path";
|
|
3
|
+
import crypto from "node:crypto";
|
|
4
|
+
import { SmithersError } from "@smithers-orchestrator/errors";
|
|
5
|
+
|
|
6
|
+
/**
 * Status strings that an eval case may declare as `expected.status`.
 * `normalizeExpected` rejects any status that is not listed here.
 */
export const EVAL_CASE_STATUSES = [
    "finished", "continued", "failed", "cancelled",
    "waiting-approval", "waiting-event", "waiting-timer",
];

/** Length above which `evalRunId` shortens a generated run ID. */
const RUN_ID_MAX_LENGTH = 64;

/** Assertion keys accepted inside a case's `expected` object. */
const EVAL_EXPECTED_KEYS = new Set([
    "status",
    "output",
    "outputContains",
    "errorContains",
]);
|
|
18
|
+
|
|
19
|
+
/**
 * Type guard: true for any non-null, non-array value whose `typeof` is "object".
 * No prototype check is made, so class instances also pass.
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
function isPlainObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
26
|
+
|
|
27
|
+
/**
 * Returns the first eight hex characters of the SHA-1 digest of `value`.
 *
 * @param {string} value
 */
function stableHash(value) {
    const hasher = crypto.createHash("sha1");
    hasher.update(value);
    const hex = hasher.digest("hex");
    return hex.substring(0, 8);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Turns free-form text into a lowercase token made of `a-z`, `0-9`, `_` and `-`.
 *
 * Runs of other characters become a single `-`, and leading/trailing dashes are
 * removed. If nothing is left, `fallback` is used as-is. A token longer than
 * `maxLength` is shortened to `<prefix>-<hash>`, where `<hash>` is the 8-character
 * `stableHash` of the full token, so distinct long tokens stay distinguishable.
 *
 * The result never exceeds `maxLength` (minimum one character). When `maxLength`
 * is too small to hold a one-character prefix, the dash and the hash, the
 * result is the hash alone, truncated to fit.
 *
 * @param {string} value
 * @param {string} fallback
 * @param {number} maxLength
 * @returns {string}
 */
export function slugifyEvalToken(value, fallback = "case", maxLength = 32) {
    const slug = value
        .trim()
        .toLowerCase()
        .replace(/[^a-z0-9_-]+/g, "-")
        .replace(/^-+|-+$/g, "")
        .replace(/-{2,}/g, "-");
    const normalized = slug || fallback;
    if (normalized.length <= maxLength) {
        return normalized;
    }
    const hash = stableHash(normalized);
    // Room left for the readable prefix once "-<hash>" is accounted for.
    const prefixLength = maxLength - hash.length - 1;
    if (prefixLength < 1) {
        // Previously a 1-char prefix was forced here, overshooting maxLength.
        return hash.slice(0, Math.max(1, maxLength));
    }
    const prefix = normalized.slice(0, prefixLength).replace(/-+$/g, "");
    return prefix ? `${prefix}-${hash}` : hash;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Returns `value` unchanged when `isPlainObject` accepts it; otherwise throws.
 *
 * @param {unknown} value
 * @param {string} label Name of the field, used in the error message and details.
 * @throws {SmithersError} `INVALID_INPUT` when `value` is not an object.
 */
function assertJsonObject(value, label) {
    if (isPlainObject(value)) {
        return value;
    }
    throw new SmithersError("INVALID_INPUT", `${label} must be a JSON object.`, { label });
}
|
|
63
|
+
|
|
64
|
+
/**
 * Validates a case's `annotations` and returns a shallow copy of them.
 *
 * `undefined` and `null` yield an empty object. Otherwise the value must be an
 * object whose entries are all strings, numbers or booleans.
 *
 * @param {unknown} value
 * @param {string} label Field path used in error messages.
 * @returns {Record<string, string | number | boolean>}
 * @throws {SmithersError} `INVALID_INPUT` when the value is not an object or an
 *   entry has an unsupported type.
 */
function normalizeAnnotations(value, label) {
    if (value === undefined || value === null) {
        return {};
    }
    const object = assertJsonObject(value, label);
    const entries = Object.entries(object);
    for (const [key, entry] of entries) {
        if (typeof entry !== "string" && typeof entry !== "number" && typeof entry !== "boolean") {
            throw new SmithersError("INVALID_INPUT", `${label}.${key} must be a string, number, or boolean.`, { key });
        }
    }
    // Object.fromEntries defines own data properties, so a JSON-supplied
    // "__proto__" key is kept as data; plain assignment would hit the
    // prototype setter and silently drop the entry.
    return Object.fromEntries(entries);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Validates a case's `expected` block and fills in the default status.
 *
 * A missing (`undefined`/`null`) block means "expect status finished". Any key
 * outside `EVAL_EXPECTED_KEYS` and any status outside `EVAL_CASE_STATUSES` is
 * rejected.
 *
 * @param {unknown} value
 * @param {string} label Field path used in error messages.
 * @throws {SmithersError} `INVALID_INPUT` on a non-object value, unsupported
 *   keys, or an unknown status.
 */
function normalizeExpected(value, label) {
    if (value == null) {
        return { status: "finished" };
    }
    const object = assertJsonObject(value, label);
    const unknownKeys = [];
    for (const key of Object.keys(object)) {
        if (!EVAL_EXPECTED_KEYS.has(key)) {
            unknownKeys.push(key);
        }
    }
    if (unknownKeys.length > 0) {
        const details = {
            keys: unknownKeys,
            supportedKeys: [...EVAL_EXPECTED_KEYS],
        };
        throw new SmithersError("INVALID_INPUT", `${label} contains unsupported assertion keys: ${unknownKeys.join(", ")}.`, details);
    }
    const status = object.status ?? "finished";
    const isKnownStatus = typeof status === "string" && EVAL_CASE_STATUSES.includes(status);
    if (!isKnownStatus) {
        throw new SmithersError("INVALID_INPUT", `${label}.status must be one of ${EVAL_CASE_STATUSES.join(", ")}.`, { status });
    }
    return { ...object, status };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Serializes a value to JSON text with object keys sorted, so two values with
 * the same content produce the same string regardless of key order.
 *
 * Values that JSON cannot represent (`undefined`, functions, symbols) are
 * handled the way `JSON.stringify` handles them: they become `null` inside
 * arrays and are omitted from objects. At the top level they yield the text
 * `undefined`, which can never equal valid JSON text, so the function always
 * returns a string as documented.
 *
 * @param {unknown} value
 * @returns {string}
 */
function stableJson(value) {
    const isOmitted = (entry) => entry === undefined || typeof entry === "function" || typeof entry === "symbol";
    if (Array.isArray(value)) {
        // Array.from also visits holes in sparse arrays (as undefined).
        const items = Array.from(value, (entry) => (isOmitted(entry) ? "null" : stableJson(entry)));
        return `[${items.join(",")}]`;
    }
    if (isPlainObject(value)) {
        const members = Object.keys(value)
            .filter((key) => !isOmitted(value[key]))
            .sort()
            .map((key) => `${JSON.stringify(key)}:${stableJson(value[key])}`);
        return `{${members.join(",")}}`;
    }
    return JSON.stringify(value) ?? "undefined";
}
|
|
121
|
+
|
|
122
|
+
/**
 * Deep equality by comparing the `stableJson` text of both values.
 *
 * @param {unknown} actual
 * @param {unknown} expected
 */
function jsonEquals(actual, expected) {
    const actualText = stableJson(actual);
    const expectedText = stableJson(expected);
    return actualText === expectedText;
}
|
|
129
|
+
|
|
130
|
+
/**
 * Partial deep match: reports whether `actual` contains everything `expected`
 * describes.
 *
 * - Object: every key of `expected` must match (recursively) the same key of
 *   `actual`; extra keys in `actual` are ignored.
 * - Array: every entry of `expected` must match a distinct entry of `actual`,
 *   in any order; extra entries in `actual` are ignored.
 * - Anything else: compared with `jsonEquals`.
 *
 * Array entries are paired with an augmenting-path search instead of
 * first-come-first-served. A greedy pairing can hand a specific actual entry to
 * a loose expectation and then fail a stricter one, even though a valid pairing
 * exists (e.g. expected `[{a:1},{a:1,b:2}]` against actual `[{a:1,b:2},{a:1}]`).
 *
 * @param {unknown} actual
 * @param {unknown} expected
 * @returns {boolean}
 */
function jsonContains(actual, expected) {
    if (isPlainObject(expected)) {
        if (!isPlainObject(actual)) {
            return false;
        }
        return Object.entries(expected).every(([key, value]) => jsonContains(actual[key], value));
    }
    if (Array.isArray(expected)) {
        if (!Array.isArray(actual) || actual.length < expected.length) {
            return false;
        }
        // For each expected entry, the indexes of actual entries that satisfy it.
        const candidates = expected.map((entry) => {
            const indexes = [];
            for (let index = 0; index < actual.length; index += 1) {
                if (jsonContains(actual[index], entry)) {
                    indexes.push(index);
                }
            }
            return indexes;
        });
        /** @type {Map<number, number>} actual index -> expected index currently using it */
        const claimedBy = new Map();
        // Tries to give expected entry `expectedIndex` an actual entry, moving
        // earlier assignments to other candidates when that frees one up.
        const assign = (expectedIndex, visited) => {
            for (const actualIndex of candidates[expectedIndex]) {
                if (visited.has(actualIndex)) {
                    continue;
                }
                visited.add(actualIndex);
                const owner = claimedBy.get(actualIndex);
                if (owner === undefined || assign(owner, visited)) {
                    claimedBy.set(actualIndex, expectedIndex);
                    return true;
                }
            }
            return false;
        };
        return candidates.every((_, expectedIndex) => assign(expectedIndex, new Set()));
    }
    return jsonEquals(actual, expected);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Renders an arbitrary error value as text.
 *
 * `undefined`/`null` become the empty string, `Error` instances and objects
 * with a string `message` yield that message, other objects are rendered with
 * `stableJson`, and everything else goes through `String()`.
 *
 * @param {unknown} error
 */
function formatEvalError(error) {
    if (error == null) {
        return "";
    }
    if (error instanceof Error) {
        return error.message;
    }
    if (!isPlainObject(error)) {
        return String(error);
    }
    return typeof error.message === "string" ? error.message : stableJson(error);
}
|
|
183
|
+
|
|
184
|
+
/**
 * Validates one raw eval case and returns it in normalized form.
 *
 * The case name is `id` if it is a string, else `name` if it is a string, else
 * a generated `case-NNN` based on position. The normalized `id` is that name
 * passed through `slugifyEvalToken` (max 40 characters).
 *
 * `input`, `annotations`, `expected` and `metadata` may each be omitted or
 * `null`; they then default to `{}` (or `{ status: "finished" }` for
 * `expected`). `input: null` used to be rejected while `null` was accepted for
 * the other three fields; it is now treated like a missing input.
 *
 * @param {unknown} raw
 * @param {number} index Zero-based position of the case, used in error labels.
 * @throws {SmithersError} `INVALID_INPUT` when the case or one of its fields is
 *   malformed.
 */
export function normalizeEvalCase(raw, index) {
    const label = `cases[${index}]`;
    const object = assertJsonObject(raw, label);
    const rawId = typeof object.id === "string"
        ? object.id
        : typeof object.name === "string"
            ? object.name
            : `case-${String(index + 1).padStart(3, "0")}`;
    const id = slugifyEvalToken(rawId, `case-${index + 1}`, 40);
    const input = object.input === undefined || object.input === null
        ? {}
        : assertJsonObject(object.input, `${label}.input`);
    const annotations = normalizeAnnotations(object.annotations, `${label}.annotations`);
    const expected = normalizeExpected(object.expected, `${label}.expected`);
    const metadata = object.metadata === undefined || object.metadata === null
        ? {}
        : assertJsonObject(object.metadata, `${label}.metadata`);
    return {
        id,
        name: rawId,
        input,
        annotations,
        expected,
        metadata,
    };
}
|
|
211
|
+
|
|
212
|
+
/**
 * Throws if two cases share the same normalized `id`.
 *
 * @param {Array<ReturnType<typeof normalizeEvalCase>>} cases
 * @throws {SmithersError} `INVALID_INPUT` naming the ID and both positions.
 */
function assertUniqueEvalCaseIds(cases) {
    /** @type {Map<string, number>} */
    const firstIndexById = new Map();
    for (const [index, { id }] of cases.entries()) {
        if (firstIndexById.has(id)) {
            throw new SmithersError("INVALID_INPUT", `Duplicate eval case ID after normalization: ${id}`, {
                id,
                firstIndex: firstIndexById.get(id),
                duplicateIndex: index,
            });
        }
        firstIndexById.set(id, index);
    }
}
|
|
231
|
+
|
|
232
|
+
/**
 * Parses the text of an eval case file into an array of raw cases.
 *
 * A `.jsonl` path (case-insensitive) is read as one JSON value per non-blank
 * line. Any other path must hold a JSON array or an object with a `cases` array.
 *
 * Differences from the previous implementation:
 * - JSONL errors report the line number in the file. Blank lines used to be
 *   dropped before numbering, so the reported number was too low whenever a
 *   blank line preceded the bad one.
 * - A leading byte-order mark is ignored in JSON files as well. It was already
 *   ignored in JSONL files because `trim()` removes it.
 *
 * @param {string} text File contents.
 * @param {string} path Path the text came from; selects the format and is
 *   included in error details.
 * @returns {unknown[]}
 * @throws {SmithersError} `INVALID_JSON` on a syntax error, `INVALID_INPUT` when
 *   the JSON has the wrong shape.
 */
function parseCasesText(text, path) {
    if (extname(path).toLowerCase() === ".jsonl") {
        const rawCases = [];
        const lines = text.split(/\r?\n/);
        for (let index = 0; index < lines.length; index += 1) {
            const line = lines[index].trim();
            if (!line) {
                continue;
            }
            const lineNumber = index + 1;
            try {
                rawCases.push(JSON.parse(line));
            }
            catch (err) {
                throw new SmithersError("INVALID_JSON", `Invalid JSONL case at line ${lineNumber}: ${err?.message ?? String(err)}`, { line: lineNumber });
            }
        }
        return rawCases;
    }
    let parsed;
    try {
        parsed = JSON.parse(text.replace(/^\uFEFF/, ""));
    }
    catch (err) {
        throw new SmithersError("INVALID_JSON", `Invalid JSON case file: ${err?.message ?? String(err)}`, { path });
    }
    if (Array.isArray(parsed)) {
        return parsed;
    }
    if (isPlainObject(parsed) && Array.isArray(parsed.cases)) {
        return parsed.cases;
    }
    throw new SmithersError("INVALID_INPUT", "Eval case file must be a JSON array, a { cases: [...] } object, or JSONL.", { path });
}
|
|
268
|
+
|
|
269
|
+
/**
 * Reads an eval case file from disk and normalizes its cases.
 *
 * A relative `path` is resolved against `root`. At most `options.maxCases`
 * cases (from the start of the file) are normalized; `totalCases` still reports
 * how many the file contains.
 *
 * @param {string} root
 * @param {string} path
 * @param {{ maxCases?: number }} [options]
 * @throws {SmithersError} `INVALID_INPUT` when the file is missing or empty, or
 *   when a case is malformed or duplicated; parse errors propagate from
 *   `parseCasesText`.
 */
export function loadEvalCases(root, path, options = {}) {
    let absolutePath = path;
    if (!isAbsolute(path)) {
        absolutePath = resolve(root, path);
    }
    if (!existsSync(absolutePath)) {
        throw new SmithersError("INVALID_INPUT", `Eval case file not found: ${path}`, { path });
    }
    const text = readFileSync(absolutePath, "utf8");
    const rawCases = parseCasesText(text, absolutePath);
    const totalCases = rawCases.length;
    if (totalCases === 0) {
        throw new SmithersError("INVALID_INPUT", "Eval case file must contain at least one case.", { path });
    }
    const limit = options.maxCases ?? totalCases;
    const selected = rawCases.slice(0, limit);
    const cases = selected.map((raw, index) => normalizeEvalCase(raw, index));
    assertUniqueEvalCaseIds(cases);
    return { path: absolutePath, cases, totalCases };
}
|
|
292
|
+
|
|
293
|
+
/**
 * Builds the run ID for one case of a suite: `eval-<suite>-<case>`, with both
 * parts slugified to at most 24 characters. An ID longer than
 * `RUN_ID_MAX_LENGTH` is cut and suffixed with a `stableHash` of the full ID.
 *
 * @param {string} suiteId
 * @param {string} caseId
 */
export function evalRunId(suiteId, caseId) {
    const parts = [
        "eval",
        slugifyEvalToken(suiteId, "suite", 24),
        slugifyEvalToken(caseId, "case", 24),
    ];
    const base = parts.join("-");
    if (base.length > RUN_ID_MAX_LENGTH) {
        const head = base.slice(0, RUN_ID_MAX_LENGTH - 9).replace(/-+$/g, "");
        return `${head}-${stableHash(base)}`;
    }
    return base;
}
|
|
306
|
+
|
|
307
|
+
/**
 * Combines loaded cases with suite settings into an execution plan.
 *
 * The suite ID defaults to the case file's base name without its extension.
 * When a run label is given, it is appended to the suite ID for the purpose of
 * generating run IDs. Each planned case is the loaded case plus its `runId`.
 *
 * @param {{
 *   suiteId?: string;
 *   runLabel?: string;
 *   workflowPath: string;
 *   casesPath: string;
 *   loadedCases: ReturnType<typeof loadEvalCases>;
 * }} input
 * @throws {SmithersError} when two cases end up with the same run ID.
 */
export function buildEvalPlan(input) {
    const { loadedCases, workflowPath, casesPath } = input;
    const fallbackSuite = basename(casesPath, extname(casesPath));
    const suiteId = slugifyEvalToken(input.suiteId ?? fallbackSuite, "suite", 32);
    let runLabel = null;
    if (input.runLabel) {
        runLabel = slugifyEvalToken(input.runLabel, "run", 24);
    }
    const runSuiteId = runLabel ? `${suiteId}-${runLabel}` : suiteId;
    const cases = [];
    for (const testCase of loadedCases.cases) {
        cases.push({ ...testCase, runId: evalRunId(runSuiteId, testCase.id) });
    }
    assertUniqueEvalRunIds(cases);
    return {
        suiteId,
        runLabel,
        workflowPath,
        casesPath: loadedCases.path,
        totalCases: loadedCases.totalCases,
        plannedCases: loadedCases.cases.length,
        cases,
    };
}
|
|
336
|
+
|
|
337
|
+
/**
 * Throws if two planned cases share the same `runId`.
 *
 * @param {Array<{ runId: string; id: string }>} cases
 * @throws {SmithersError} `INVALID_INPUT` naming the run ID and both case IDs.
 */
function assertUniqueEvalRunIds(cases) {
    /** @type {Map<string, string>} */
    const caseIdByRunId = new Map();
    for (const { runId, id } of cases) {
        const firstCaseId = caseIdByRunId.get(runId);
        if (firstCaseId === undefined) {
            caseIdByRunId.set(runId, id);
            continue;
        }
        throw new SmithersError("INVALID_INPUT", `Duplicate eval run ID after normalization: ${runId}`, {
            runId,
            firstCaseId,
            duplicateCaseId: id,
        });
    }
}
|
|
355
|
+
|
|
356
|
+
/**
 * Checks, one case at a time, that no planned run ID is already known to the
 * adapter. All clashing IDs are collected and reported in a single error.
 *
 * @param {{ getRun(runId: string): Promise<unknown> }} adapter
 * @param {Array<{ runId: string }>} cases
 * @throws {SmithersError} `EVAL_RUN_ID_EXISTS` listing every existing run ID.
 */
export async function assertEvalRunIdsAvailable(adapter, cases) {
    const taken = [];
    for (const { runId } of cases) {
        const run = await adapter.getRun(runId);
        if (run) {
            taken.push(runId);
        }
    }
    if (taken.length === 0) {
        return;
    }
    const single = taken.length === 1;
    const plural = single ? "" : "s";
    const verb = single ? "exists" : "exist";
    throw new SmithersError("EVAL_RUN_ID_EXISTS", `Eval run ID${plural} already ${verb}: ${taken.join(", ")}. Use a unique --run-label.`, {
        runIds: taken,
    });
}
|
|
373
|
+
|
|
374
|
+
/**
 * Aggregates per-case results into totals: how many passed and failed, a count
 * per status (a missing status is counted as "error"), and the summed duration.
 *
 * @param {Array<{ passed: boolean; status?: string; durationMs?: number }>} results
 */
export function summarizeEvalResults(results) {
    const byStatus = {};
    let passed = 0;
    let durationMs = 0;
    for (const result of results) {
        const status = result.status ?? "error";
        byStatus[status] = (byStatus[status] ?? 0) + 1;
        if (result.passed) {
            passed += 1;
        }
        durationMs = durationMs + (result.durationMs ?? 0);
    }
    const total = results.length;
    return {
        total,
        passed,
        failed: total - passed,
        byStatus,
        durationMs,
    };
}
|
|
393
|
+
|
|
394
|
+
/**
 * Checks a run result against the expectations of its eval case.
 *
 * The status is always asserted (a missing status counts as "error"). `output`
 * (exact match), `outputContains` (partial match) and `errorContains`
 * (substring of the formatted error) are asserted only when the case declares
 * them. The case passes when every assertion passes.
 *
 * @param {ReturnType<typeof normalizeEvalCase>} testCase
 * @param {{ status?: string; output?: unknown; error?: unknown }} result
 */
export function evaluateEvalCaseResult(testCase, result) {
    const { expected } = testCase;
    const declares = (key) => Object.prototype.hasOwnProperty.call(expected, key);
    const actualStatus = result.status ?? "error";
    const assertions = [
        {
            name: "status",
            passed: actualStatus === expected.status,
            expected: expected.status,
            actual: actualStatus,
        },
    ];
    if (declares("output")) {
        assertions.push({
            name: "output",
            passed: jsonEquals(result.output, expected.output),
            expected: expected.output,
            actual: result.output,
        });
    }
    if (declares("outputContains")) {
        assertions.push({
            name: "outputContains",
            passed: jsonContains(result.output, expected.outputContains),
            expected: expected.outputContains,
            actual: result.output,
        });
    }
    if (declares("errorContains")) {
        const actualError = formatEvalError(result.error);
        const needle = String(expected.errorContains);
        assertions.push({
            name: "errorContains",
            passed: actualError.includes(needle),
            expected: needle,
            actual: actualError,
        });
    }
    const passed = !assertions.some((assertion) => !assertion.passed);
    return { passed, assertions };
}
|
|
437
|
+
|
|
438
|
+
/**
 * Assembles the report object for a finished suite run: plan identity, timing,
 * the summary computed by `summarizeEvalResults`, and the raw results.
 *
 * @param {{
 *   plan: ReturnType<typeof buildEvalPlan>;
 *   results: Array<Record<string, unknown> & { passed: boolean; status?: string; durationMs?: number }>;
 *   startedAtMs: number;
 *   finishedAtMs: number;
 *   reportPath?: string | null;
 * }} input
 */
export function buildEvalReport(input) {
    const { plan, results, startedAtMs, finishedAtMs } = input;
    const durationMs = finishedAtMs - startedAtMs;
    const reportPath = input.reportPath ?? null;
    return {
        suiteId: plan.suiteId,
        runLabel: plan.runLabel,
        workflowPath: plan.workflowPath,
        casesPath: plan.casesPath,
        startedAtMs,
        finishedAtMs,
        durationMs,
        reportPath,
        summary: summarizeEvalResults(results),
        results,
    };
}
|
|
461
|
+
|
|
462
|
+
/**
 * Default report location: `<root>/.smithers/evals/<suiteId>.json`.
 *
 * @param {string} root
 * @param {string} suiteId
 */
export function defaultEvalReportPath(root, suiteId) {
    const segments = [root, ".smithers", "evals", `${suiteId}.json`];
    return join(...segments);
}
|
|
469
|
+
|
|
470
|
+
/**
 * Resolves an optional output path against `root`. Returns `null` when no path
 * (or an empty one) is given; absolute paths are returned unchanged.
 *
 * @param {string} root
 * @param {string | undefined} path
 */
function resolveOutputPath(root, path) {
    if (path) {
        if (isAbsolute(path)) {
            return path;
        }
        return resolve(root, path);
    }
    return null;
}
|
|
480
|
+
|
|
481
|
+
/**
 * Picks where the report is written: the explicitly requested path when one is
 * given, otherwise the suite's default report path.
 *
 * @param {string} root
 * @param {string} suiteId
 * @param {string | undefined} path
 */
export function resolveEvalReportPath(root, suiteId, path) {
    const explicit = resolveOutputPath(root, path);
    if (explicit !== null && explicit !== undefined) {
        return explicit;
    }
    return defaultEvalReportPath(root, suiteId);
}
|
|
489
|
+
|
|
490
|
+
/**
 * Resolves the report path and refuses to continue when something already
 * exists there, unless `options.force` is set.
 *
 * @param {string} root
 * @param {string} suiteId
 * @param {{ path?: string; force?: boolean }} [options]
 * @returns {string} The resolved report path.
 * @throws {SmithersError} `INVALID_INPUT` when the target exists and `force` is
 *   not set.
 */
export function assertEvalReportWritable(root, suiteId, options = {}) {
    const target = resolveEvalReportPath(root, suiteId, options.path);
    const wouldOverwrite = existsSync(target) && !options.force;
    if (!wouldOverwrite) {
        return target;
    }
    throw new SmithersError("INVALID_INPUT", `Eval report already exists: ${target}. Pass --force to overwrite.`, { path: target });
}
|
|
502
|
+
|
|
503
|
+
/**
 * Writes the report as pretty-printed JSON (2-space indent, trailing newline),
 * creating parent directories as needed. The written copy has `reportPath` set
 * to the file's location.
 *
 * @param {string} root
 * @param {Record<string, unknown>} report
 * @param {{ path?: string; force?: boolean }} [options]
 * @returns {string} The path the report was written to.
 * @throws {SmithersError} when the target exists and `options.force` is not set
 *   (raised by `assertEvalReportWritable`).
 */
export function writeEvalReport(root, report, options = {}) {
    let suiteId = "suite";
    if (typeof report.suiteId === "string") {
        suiteId = report.suiteId;
    }
    const target = assertEvalReportWritable(root, suiteId, options);
    mkdirSync(dirname(target), { recursive: true });
    const payload = { ...report, reportPath: target };
    const text = `${JSON.stringify(payload, null, 2)}\n`;
    writeFileSync(target, text, "utf8");
    return target;
}
|
|
515
|
+
|
|
516
|
+
/**
 * Renders a dry-run plan as plain text: suite header, one line per planned run,
 * and a closing hint on how to execute the suite.
 *
 * @param {ReturnType<typeof buildEvalPlan>} plan
 */
export function renderEvalPlan(plan) {
    const lines = [`Eval suite: ${plan.suiteId}`];
    if (plan.runLabel) {
        lines.push(`Run label: ${plan.runLabel}`);
    }
    lines.push(`Workflow: ${plan.workflowPath}`);
    // Only mention the total when the plan covers a subset of the file's cases.
    const ofTotal = plan.totalCases !== plan.plannedCases ? ` of ${plan.totalCases}` : "";
    lines.push(`Cases: ${plan.plannedCases}${ofTotal}`, "", "Planned runs:");
    for (const { id, runId, expected } of plan.cases) {
        lines.push(`- ${id} -> ${runId} (expect ${expected.status})`);
    }
    lines.push("", "Dry run only. Re-run without --dry-run to execute the suite.");
    return lines.join("\n");
}
|
|
535
|
+
|
|
536
|
+
/**
 * Renders a finished report as plain text: suite header with pass count and
 * duration, then one PASS/FAIL line per case, followed by the case's error when
 * it has one.
 *
 * @param {ReturnType<typeof buildEvalReport>} report
 */
export function renderEvalReport(report) {
    const { summary } = report;
    const lines = [`Eval suite: ${report.suiteId}`];
    if (report.runLabel) {
        lines.push(`Run label: ${report.runLabel}`);
    }
    lines.push(
        `Workflow: ${report.workflowPath}`,
        `Result: ${summary.passed}/${summary.total} passed`,
        `Duration: ${report.durationMs}ms`,
    );
    if (report.reportPath) {
        lines.push(`Report: ${report.reportPath}`);
    }
    lines.push("", "Cases:");
    for (const result of report.results) {
        const mark = result.passed ? "PASS" : "FAIL";
        const status = result.status ?? "error";
        const duration = result.durationMs ?? 0;
        lines.push(`- ${mark} ${result.caseId} -> ${result.runId} (${status}, ${duration}ms)`);
        if (result.error) {
            lines.push(` ${result.error}`);
        }
    }
    return lines.join("\n");
}
|
package/src/hijack.js
CHANGED
|
@@ -28,6 +28,7 @@ function parseAttemptMeta(metaJson) {
|
|
|
28
28
|
*/
|
|
29
29
|
function asNativeHijackEngine(value) {
|
|
30
30
|
return value === "claude-code" ||
|
|
31
|
+
value === "antigravity" ||
|
|
31
32
|
value === "codex" ||
|
|
32
33
|
value === "gemini" ||
|
|
33
34
|
value === "pi" ||
|
|
@@ -161,6 +162,14 @@ export function buildHijackLaunchSpec(candidate) {
|
|
|
161
162
|
env,
|
|
162
163
|
};
|
|
163
164
|
}
|
|
165
|
+
if (candidate.engine === "antigravity") {
|
|
166
|
+
return {
|
|
167
|
+
command: "agy",
|
|
168
|
+
args: ["--resume", candidate.resume],
|
|
169
|
+
cwd: candidate.cwd,
|
|
170
|
+
env,
|
|
171
|
+
};
|
|
172
|
+
}
|
|
164
173
|
if (candidate.engine === "pi") {
|
|
165
174
|
return {
|
|
166
175
|
command: "pi",
|