cclaw-cli 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -2
- package/dist/cli.js +388 -18
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +22 -16
- package/dist/eval/agents/workflow.d.ts +31 -0
- package/dist/eval/agents/workflow.js +135 -0
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +52 -19
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +36 -1
- package/dist/eval/runner.d.ts +37 -8
- package/dist/eval/runner.js +351 -42
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +158 -15
- package/dist/eval/types.js +39 -7
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import type { EvalReport } from "./types.js";
|
|
2
|
+
export interface EvalDiffInput {
|
|
3
|
+
projectRoot: string;
|
|
4
|
+
/** Version string, filename, or "latest". */
|
|
5
|
+
old: string;
|
|
6
|
+
/** Version string, filename, or "latest". */
|
|
7
|
+
new: string;
|
|
8
|
+
}
|
|
9
|
+
export interface EvalDiffCaseEntry {
|
|
10
|
+
caseId: string;
|
|
11
|
+
stage: string;
|
|
12
|
+
/** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
|
|
13
|
+
transition: "same" | "regressed" | "recovered" | "added" | "removed";
|
|
14
|
+
previousPassed?: boolean;
|
|
15
|
+
currentPassed?: boolean;
|
|
16
|
+
durationDeltaMs?: number;
|
|
17
|
+
costDeltaUsd?: number;
|
|
18
|
+
verifierDeltas: EvalDiffVerifierEntry[];
|
|
19
|
+
stageDeltas?: EvalDiffStageEntry[];
|
|
20
|
+
}
|
|
21
|
+
export interface EvalDiffVerifierEntry {
|
|
22
|
+
verifierId: string;
|
|
23
|
+
kind: string;
|
|
24
|
+
transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
|
|
25
|
+
previousScore?: number;
|
|
26
|
+
currentScore?: number;
|
|
27
|
+
previousOk?: boolean;
|
|
28
|
+
currentOk?: boolean;
|
|
29
|
+
}
|
|
30
|
+
export interface EvalDiffStageEntry {
|
|
31
|
+
stage: string;
|
|
32
|
+
durationDeltaMs: number;
|
|
33
|
+
costDeltaUsd: number;
|
|
34
|
+
turnsDelta: number;
|
|
35
|
+
callsDelta: number;
|
|
36
|
+
}
|
|
37
|
+
export interface EvalDiffReport {
|
|
38
|
+
old: EvalDiffReportMeta;
|
|
39
|
+
new: EvalDiffReportMeta;
|
|
40
|
+
summaryDelta: {
|
|
41
|
+
totalCasesDelta: number;
|
|
42
|
+
passedDelta: number;
|
|
43
|
+
failedDelta: number;
|
|
44
|
+
skippedDelta: number;
|
|
45
|
+
totalCostUsdDelta: number;
|
|
46
|
+
totalDurationMsDelta: number;
|
|
47
|
+
};
|
|
48
|
+
cases: EvalDiffCaseEntry[];
|
|
49
|
+
/** True when any case regressed or was removed, or any verifier regressed or dropped in score. */
|
|
50
|
+
regressed: boolean;
|
|
51
|
+
}
|
|
52
|
+
export interface EvalDiffReportMeta {
|
|
53
|
+
runId: string;
|
|
54
|
+
cclawVersion: string;
|
|
55
|
+
generatedAt: string;
|
|
56
|
+
mode: string;
|
|
57
|
+
model: string;
|
|
58
|
+
sourcePath: string;
|
|
59
|
+
}
|
|
60
|
+
export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
|
|
61
|
+
export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
|
|
62
|
+
export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
|
|
63
|
+
/** Render the diff as a terse human-readable Markdown block. */
|
|
64
|
+
export declare function formatDiffMarkdown(diff: EvalDiffReport): string;
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `cclaw eval diff <old> <new>` — side-by-side report comparison.
|
|
3
|
+
*
|
|
4
|
+
* Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
|
|
5
|
+
* explicit filename) and emits a compact human-readable + JSON diff:
|
|
6
|
+
*
|
|
7
|
+
* - summary-level deltas (passed/failed/cost/duration)
|
|
8
|
+
* - per-case pass/fail transitions
|
|
9
|
+
* - per-verifier score drops (only the drops — new passes are noted in
|
|
10
|
+
* the summary line, not repeated per verifier)
|
|
11
|
+
* - Workflow-mode stage-level cost & duration deltas when both reports
|
|
12
|
+
* carry a `workflow` summary for the same case id
|
|
13
|
+
*
|
|
14
|
+
* The resolver accepts three shapes for the `<old>` / `<new>` arguments:
|
|
15
|
+
*
|
|
16
|
+
* 1. A bare version string (`0.26.0`) — matched against any report JSON
|
|
17
|
+
* whose `cclawVersion` field equals the string.
|
|
18
|
+
*   2. An absolute path, or a filename relative to the reports directory (`eval-2026-04-17T...-abc123.json`).
|
|
19
|
+
* 3. The literal `latest` — picks the most recent report on disk by
|
|
20
|
+
* mtime.
|
|
21
|
+
*
|
|
22
|
+
* The diff is deterministic: sorted by case id, then verifier id. Missing
|
|
23
|
+
* cases in one report show up as `added` or `removed` so callers can see
|
|
24
|
+
* which corpus changes slipped in between versions.
|
|
25
|
+
*/
|
|
26
|
+
import fs from "node:fs/promises";
|
|
27
|
+
import path from "node:path";
|
|
28
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
29
|
+
import { exists } from "../fs-utils.js";
|
|
30
|
+
/** Smallest absolute difference treated as a real change when comparing verifier scores and per-case cost. */
const SCORE_DROP_EPSILON = 0.0001;
|
|
31
|
+
/**
 * Resolve a report selector to the path of a report JSON file.
 *
 * Resolution order:
 *   1. An explicit path — absolute, or relative to the reports directory.
 *   2. The literal `latest` — the `.json` report with the newest mtime.
 *   3. A version string — the newest `.json` report whose `cclawVersion`
 *      equals the selector.
 *
 * @param projectRoot Project root; reports are read from
 *   `<projectRoot>/<EVALS_ROOT>/reports`.
 * @param selector Path, `latest`, or version string. Surrounding whitespace
 *   is ignored.
 * @returns Path of the selected report file.
 * @throws Error when the reports directory is missing, the selector is empty,
 *   the directory holds no `.json` files, or nothing matches the selector.
 */
export async function resolveReportPath(projectRoot, selector) {
    const dir = path.join(projectRoot, EVALS_ROOT, "reports");
    if (!(await exists(dir))) {
        throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
            `Run \`cclaw eval\` at least once before comparing reports.`);
    }
    const trimmed = selector.trim();
    if (trimmed.length === 0) {
        throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
    }
    // 1. Explicit path (absolute, or relative to the reports directory).
    //    Only regular files qualify: a selector such as "." would otherwise
    //    resolve to a directory and fail later when the report is read.
    const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
    let asPathIsFile = false;
    try {
        asPathIsFile = (await fs.stat(asPath)).isFile();
    }
    catch {
        // Not present (or not readable) as a path — fall through to the
        // `latest` / version forms below.
    }
    if (asPathIsFile)
        return asPath;
    const entries = await fs.readdir(dir, { withFileTypes: true });
    const jsonFiles = entries
        .filter((e) => e.isFile() && e.name.endsWith(".json"))
        .map((e) => path.join(dir, e.name));
    if (jsonFiles.length === 0) {
        throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
    }
    // 2. `latest` — newest report by mtime; the first one listed wins ties.
    if (trimmed === "latest") {
        let latest = jsonFiles[0];
        let latestMtime = (await fs.stat(latest)).mtimeMs;
        for (const f of jsonFiles.slice(1)) {
            const stat = await fs.stat(f);
            if (stat.mtimeMs > latestMtime) {
                latest = f;
                latestMtime = stat.mtimeMs;
            }
        }
        return latest;
    }
    // 3. Version match — pick most recent by mtime among matches.
    const matches = [];
    for (const file of jsonFiles) {
        try {
            const raw = await fs.readFile(file, "utf8");
            const parsed = JSON.parse(raw);
            if (parsed.cclawVersion === trimmed) {
                const stat = await fs.stat(file);
                matches.push({ file, mtimeMs: stat.mtimeMs });
            }
        }
        catch {
            // Unreadable or malformed files are simply not candidates.
            continue;
        }
    }
    if (matches.length === 0) {
        throw new Error(`No report matched selector "${selector}". ` +
            `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
    }
    matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
    return matches[0].file;
}
|
|
88
|
+
async function loadReport(filePath) {
|
|
89
|
+
const raw = await fs.readFile(filePath, "utf8");
|
|
90
|
+
const parsed = JSON.parse(raw);
|
|
91
|
+
if (parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
|
|
92
|
+
throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
|
|
93
|
+
}
|
|
94
|
+
return parsed;
|
|
95
|
+
}
|
|
96
|
+
/**
 * Pick the identifying fields of a report and attach the path it was
 * loaded from.
 */
function meta(report, sourcePath) {
    const { runId, cclawVersion, generatedAt, mode, model } = report;
    return { runId, cclawVersion, generatedAt, mode, model, sourcePath };
}
|
|
106
|
+
/**
 * Index verifier results by their `id`. When an id repeats, the later
 * result replaces the earlier one.
 */
function verifierMap(results) {
    const byId = new Map();
    for (const result of results) {
        byId.set(result.id, result);
    }
    return byId;
}
|
|
112
|
+
/**
 * Compare one case across two reports.
 *
 * A case present on only one side is reported as `added` / `removed` with no
 * verifier detail. Otherwise the entry carries the pass/fail transition, the
 * duration delta, verifier deltas sorted by verifier id, a cost delta (only
 * when it exceeds the epsilon) and per-stage workflow deltas (only for stages
 * present in both reports).
 */
function diffCase(caseId, previous, current) {
    const stage = (current ?? previous).stage;
    if (!previous) {
        return { caseId, stage, transition: "added", currentPassed: current?.passed, verifierDeltas: [] };
    }
    if (!current) {
        return { caseId, stage, transition: "removed", previousPassed: previous.passed, verifierDeltas: [] };
    }

    let transition = "recovered";
    if (previous.passed === current.passed) {
        transition = "same";
    }
    else if (previous.passed && !current.passed) {
        transition = "regressed";
    }

    const before = verifierMap(previous.verifierResults);
    const after = verifierMap(current.verifierResults);
    const verifierIds = [...new Set([...before.keys(), ...after.keys()])]
        .sort((a, b) => a.localeCompare(b));
    const verifierDeltas = [];
    for (const verifierId of verifierIds) {
        const was = before.get(verifierId);
        const now = after.get(verifierId);
        const kind = (now ?? was).kind;
        if (!was && now) {
            verifierDeltas.push({
                verifierId,
                kind,
                transition: "added",
                currentOk: now.ok,
                ...(now.score !== undefined ? { currentScore: now.score } : {})
            });
            continue;
        }
        if (was && !now) {
            verifierDeltas.push({
                verifierId,
                kind,
                transition: "removed",
                previousOk: was.ok,
                ...(was.score !== undefined ? { previousScore: was.score } : {})
            });
            continue;
        }
        if (!was || !now) {
            continue;
        }
        const okChanged = was.ok !== now.ok;
        const bothScored = typeof was.score === "number" && typeof now.score === "number";
        const scoreChanged = bothScored && Math.abs(was.score - now.score) > SCORE_DROP_EPSILON;
        let verdict;
        if (okChanged) {
            verdict = was.ok ? "regressed" : "recovered";
        }
        else if (scoreChanged && now.score < was.score) {
            verdict = "score-drop";
        }
        else {
            // Unchanged, or the score went up without flipping `ok`: not reported.
            continue;
        }
        verifierDeltas.push({
            verifierId,
            kind,
            transition: verdict,
            previousOk: was.ok,
            currentOk: now.ok,
            ...(typeof was.score === "number" ? { previousScore: was.score } : {}),
            ...(typeof now.score === "number" ? { currentScore: now.score } : {})
        });
    }

    const costDelta = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
    const costChanged = Math.abs(costDelta) > SCORE_DROP_EPSILON;

    let stageDeltas = [];
    if (previous.workflow && current.workflow) {
        const priorByStage = new Map();
        for (const prior of previous.workflow.stages) {
            priorByStage.set(prior.stage, prior);
        }
        for (const latest of current.workflow.stages) {
            const prior = priorByStage.get(latest.stage);
            if (prior) {
                stageDeltas.push({
                    stage: latest.stage,
                    durationDeltaMs: latest.durationMs - prior.durationMs,
                    costDeltaUsd: Number((latest.usageUsd - prior.usageUsd).toFixed(6)),
                    turnsDelta: latest.toolUse.turns - prior.toolUse.turns,
                    callsDelta: latest.toolUse.calls - prior.toolUse.calls
                });
            }
        }
    }

    return {
        caseId,
        stage,
        transition,
        previousPassed: previous.passed,
        currentPassed: current.passed,
        durationDeltaMs: current.durationMs - previous.durationMs,
        verifierDeltas,
        ...(costChanged ? { costDeltaUsd: Number(costDelta.toFixed(6)) } : {}),
        ...(stageDeltas.length > 0 ? { stageDeltas } : {})
    };
}
|
|
230
|
+
/**
 * Build the full diff between two loaded reports.
 *
 * Cases are matched by `caseId` and emitted in `localeCompare` order. The
 * `regressed` flag is set when any case regressed or was removed, or when
 * any verifier regressed or dropped in score.
 *
 * @param previous Older report.
 * @param current Newer report.
 * @param prevPath Path the older report was loaded from.
 * @param currPath Path the newer report was loaded from.
 */
export function diffReports(previous, current, prevPath, currPath) {
    const before = new Map();
    for (const entry of previous.cases) {
        before.set(entry.caseId, entry);
    }
    const after = new Map();
    for (const entry of current.cases) {
        after.set(entry.caseId, entry);
    }
    const caseIds = [...new Set([...before.keys(), ...after.keys()])]
        .sort((a, b) => a.localeCompare(b));
    const cases = caseIds.map((id) => diffCase(id, before.get(id), after.get(id)));

    const verifierWentBad = (v) => v.transition === "regressed" || v.transition === "score-drop";
    const caseWentBad = (c) => c.transition === "regressed" ||
        c.transition === "removed" ||
        c.verifierDeltas.some(verifierWentBad);
    const regressed = cases.some(caseWentBad);

    const oldMeta = meta(previous, prevPath);
    const newMeta = meta(current, currPath);
    const was = previous.summary;
    const now = current.summary;
    return {
        old: oldMeta,
        new: newMeta,
        summaryDelta: {
            totalCasesDelta: now.totalCases - was.totalCases,
            passedDelta: now.passed - was.passed,
            failedDelta: now.failed - was.failed,
            skippedDelta: now.skipped - was.skipped,
            totalCostUsdDelta: Number((now.totalCostUsd - was.totalCostUsd).toFixed(6)),
            totalDurationMsDelta: now.totalDurationMs - was.totalDurationMs
        },
        cases,
        regressed
    };
}
|
|
259
|
+
/**
 * Resolve both selectors, load the two reports, and diff them.
 *
 * The two selectors are resolved concurrently, then the two files are
 * loaded concurrently.
 *
 * @param input Project root plus the `old` and `new` selectors.
 */
export async function runEvalDiff(input) {
    const locate = (selector) => resolveReportPath(input.projectRoot, selector);
    const [oldPath, newPath] = await Promise.all([locate(input.old), locate(input.new)]);
    const [oldReport, newReport] = await Promise.all([oldPath, newPath].map((p) => loadReport(p)));
    return diffReports(oldReport, newReport, oldPath, newPath);
}
|
|
270
|
+
/**
 * Render a diff as a compact Markdown document: a header, a summary-delta
 * table and — only when something changed — a case table followed by a
 * verifier table. The result always ends with a newline.
 */
export function formatDiffMarkdown(diff) {
    // Non-negative deltas get an explicit "+"; negative ones already carry "-".
    const withSign = (value, text = `${value}`) => `${value >= 0 ? "+" : ""}${text}`;
    const passLabel = (passed) => (passed === undefined ? "-" : passed ? "pass" : "fail");
    const scoreLabel = (score) => (score !== undefined ? score.toFixed(2) : "-");
    const finish = (rows) => `${rows.join("\n")}\n`;

    const out = [
        `# cclaw eval diff`,
        ``,
        `- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`,
        `- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`,
        `- regressed: ${diff.regressed ? "yes" : "no"}`,
        ``,
        `## Summary delta`,
        ``
    ];
    const sd = diff.summaryDelta;
    out.push(
        `| metric | delta |`,
        `| --- | --- |`,
        `| total cases | ${withSign(sd.totalCasesDelta)} |`,
        `| passed | ${withSign(sd.passedDelta)} |`,
        `| failed | ${withSign(sd.failedDelta)} |`,
        `| skipped | ${withSign(sd.skippedDelta)} |`,
        `| cost (USD) | ${withSign(sd.totalCostUsdDelta, sd.totalCostUsdDelta.toFixed(4))} |`,
        `| duration (ms) | ${withSign(sd.totalDurationMsDelta)} |`,
        ``
    );

    // A case is worth listing when its outcome changed or any verifier moved.
    const changed = diff.cases.filter((c) => c.transition !== "same" || c.verifierDeltas.length > 0);
    if (changed.length === 0) {
        out.push(`No case-level changes.`, ``);
        return finish(out);
    }

    out.push(
        `## Case changes`,
        ``,
        `| case id | stage | transition | prev | curr |`,
        `| --- | --- | --- | --- | --- |`
    );
    const verifierRows = [];
    for (const entry of changed) {
        out.push(`| ${entry.caseId} | ${entry.stage} | ${entry.transition} | ` +
            `${passLabel(entry.previousPassed)} | ${passLabel(entry.currentPassed)} |`);
        for (const v of entry.verifierDeltas) {
            verifierRows.push(`| ${entry.caseId} | ${v.verifierId} | ${v.kind} | ${v.transition} | ` +
                `${scoreLabel(v.previousScore)} | ${scoreLabel(v.currentScore)} |`);
        }
    }
    out.push(``);

    if (verifierRows.length > 0) {
        out.push(
            `## Verifier changes`,
            ``,
            `| case id | verifier | kind | transition | prev score | curr score |`,
            `| --- | --- | --- | --- | --- | --- |`,
            ...verifierRows,
            ``
        );
    }
    return finish(out);
}
|
|
@@ -7,7 +7,7 @@ export interface ChatMessage {
|
|
|
7
7
|
toolCallId?: string;
|
|
8
8
|
/**
|
|
9
9
|
* OpenAI-style tool calls carried on a preceding assistant message.
|
|
10
|
-
* Populated by the
|
|
10
|
+
* Populated by the with-tools loop so the wire transcript stays
|
|
11
11
|
* consistent (assistant message → tool responses).
|
|
12
12
|
*/
|
|
13
13
|
toolCalls?: Array<{
|
|
@@ -35,7 +35,7 @@ export interface ChatRequest {
|
|
|
35
35
|
seed?: number;
|
|
36
36
|
/**
|
|
37
37
|
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
38
|
-
* by
|
|
38
|
+
* by agent/workflow modes. Ignored by the single-shot path.
|
|
39
39
|
*/
|
|
40
40
|
tools?: unknown[];
|
|
41
41
|
toolChoice?: "auto" | "none";
|
|
@@ -111,6 +111,17 @@ export interface CreateEvalClientOptions {
|
|
|
111
111
|
retryPolicy?: RetryPolicy;
|
|
112
112
|
/** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
|
|
113
113
|
sleep?: (ms: number) => Promise<void>;
|
|
114
|
+
/**
|
|
115
|
+
* Observer invoked when a chat() call is about to sleep before the next
|
|
116
|
+
* retry attempt. Use this to surface "we are retrying" status via the
|
|
117
|
+
* progress logger so long, silent backoff windows become visible.
|
|
118
|
+
*/
|
|
119
|
+
onRetry?: (event: {
|
|
120
|
+
attempt: number;
|
|
121
|
+
maxAttempts: number;
|
|
122
|
+
waitMs: number;
|
|
123
|
+
error: EvalLlmError;
|
|
124
|
+
}) => void;
|
|
114
125
|
}
|
|
115
126
|
export interface RetryPolicy {
|
|
116
127
|
/** Max retries *on top of* the initial attempt. 0 = single attempt. */
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -251,7 +251,14 @@ export function createEvalClient(config, options = {}) {
|
|
|
251
251
|
const isLastAttempt = attempt === maxAttempts - 1;
|
|
252
252
|
if (!normalized.retryable || isLastAttempt)
|
|
253
253
|
throw normalized;
|
|
254
|
-
|
|
254
|
+
const waitMs = backoffDelay(attempt, retryPolicy);
|
|
255
|
+
options.onRetry?.({
|
|
256
|
+
attempt: attempt + 1,
|
|
257
|
+
maxAttempts,
|
|
258
|
+
waitMs,
|
|
259
|
+
error: normalized
|
|
260
|
+
});
|
|
261
|
+
await sleep(waitMs);
|
|
255
262
|
}
|
|
256
263
|
}
|
|
257
264
|
throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helpers that translate between the legacy `Tier A/B/C` naming and the
|
|
3
|
+
* current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
|
|
4
|
+
*
|
|
5
|
+
* The names we actually carry in reports, config, CLI flags, and verifier
|
|
6
|
+
* messages are the `EvalMode` ones; legacy tier inputs are accepted with a
|
|
7
|
+
* single deprecation warning per process so existing scripts keep working
|
|
8
|
+
* through the 0.28.x line.
|
|
9
|
+
*/
|
|
10
|
+
import { type EvalMode } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Reset the per-process "already warned about legacy tier" flag. Used by
|
|
13
|
+
* tests so each test file gets a deterministic warning surface.
|
|
14
|
+
*/
|
|
15
|
+
export declare function __resetLegacyWarningForTests(): void;
|
|
16
|
+
export interface LegacyTierInput {
|
|
17
|
+
source: "cli" | "env" | "config";
|
|
18
|
+
raw: string;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Normalize a raw string from the CLI / env / config into an `EvalMode`.
|
|
22
|
+
* Accepts both new (`fixture|agent|workflow`) and legacy (`A|B|C`) names.
|
|
23
|
+
* Emits a deprecation warning to stderr at most once per process when a
|
|
24
|
+
* legacy tier name is seen.
|
|
25
|
+
*/
|
|
26
|
+
export declare function parseModeInput(raw: string, input: LegacyTierInput, writeWarning?: (message: string) => void): EvalMode;
|
|
27
|
+
/** @deprecated kept for callers that still need to serialize as legacy. */
|
|
28
|
+
export declare function modeToLegacyTier(mode: EvalMode): "A" | "B" | "C";
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helpers that translate between the legacy `Tier A/B/C` naming and the
|
|
3
|
+
* current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
|
|
4
|
+
*
|
|
5
|
+
* The names we actually carry in reports, config, CLI flags, and verifier
|
|
6
|
+
* messages are the `EvalMode` ones; legacy tier inputs are accepted with a
|
|
7
|
+
* single deprecation warning per process so existing scripts keep working
|
|
8
|
+
* through the 0.28.x line.
|
|
9
|
+
*/
|
|
10
|
+
import { EVAL_MODES } from "./types.js";
|
|
11
|
+
// Legacy tier letter -> current mode name.
const LEGACY_TIER_TO_MODE = { A: "fixture", B: "agent", C: "workflow" };
// Inverse lookup (mode name -> legacy tier letter), derived so the two tables cannot drift apart.
const MODE_TO_LEGACY_TIER = Object.fromEntries(Object.entries(LEGACY_TIER_TO_MODE).map(([tier, mode]) => [mode, tier]));
// Inputs that still parse but trigger the deprecation warning.
const DEPRECATED_NAMES = new Set(Object.keys(LEGACY_TIER_TO_MODE));
// Set once the deprecation warning has been written, so it is emitted at most once per process.
let legacyWarningEmitted = false;
|
|
23
|
+
/**
 * Test hook: clear the "deprecation warning already written" flag so the
 * next legacy tier input warns again.
 */
export function __resetLegacyWarningForTests() {
    legacyWarningEmitted = false;
}
|
|
30
|
+
/**
 * Turn a raw CLI / env / config value into an `EvalMode`.
 *
 * Current mode names are returned as-is (after trimming). Legacy tier names
 * (`A`, `B`, `C`) are mapped to their replacement, and the first legacy name
 * seen in the process also triggers one deprecation warning via
 * `writeWarning`.
 *
 * @param raw Value to parse; surrounding whitespace is ignored.
 * @param input Where the value came from, quoted in the warning text.
 * @param writeWarning Sink for the deprecation warning; defaults to stderr.
 * @throws Error when the value is empty or is neither a mode nor a legacy tier.
 */
export function parseModeInput(raw, input, writeWarning = defaultWriteWarning) {
    const candidate = raw.trim();
    if (candidate.length === 0) {
        throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C).`);
    }
    if (EVAL_MODES.includes(candidate)) {
        return candidate;
    }
    if (!DEPRECATED_NAMES.has(candidate)) {
        throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C), got: ${raw}`);
    }
    const replacement = LEGACY_TIER_TO_MODE[candidate];
    if (!legacyWarningEmitted) {
        // Flip the flag before writing so the warning is attempted only once.
        legacyWarningEmitted = true;
        writeWarning(`[cclaw] "${input.source}: ${input.raw}" is using the legacy tier name "${candidate}". ` +
            `Please switch to --mode=${replacement} (legacy --tier=A|B|C will be removed in the next major release).`);
    }
    return replacement;
}
|
|
55
|
+
/**
 * Look up the legacy tier letter for a mode.
 *
 * @deprecated kept for callers that still need to serialize as legacy.
 */
export function modeToLegacyTier(mode) {
    const tier = MODE_TO_LEGACY_TIER[mode];
    return tier;
}
|
|
59
|
+
/** Default warning sink: write the message to stderr followed by a newline. */
function defaultWriteWarning(message) {
    const line = `${message}\n`;
    process.stderr.write(line);
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight progress logger for `cclaw eval`.
|
|
3
|
+
*
|
|
4
|
+
* The runner is otherwise silent: a full workflow-mode run can easily take
|
|
5
|
+
* a few minutes and the user would see nothing until the Markdown report
|
|
6
|
+
* hits disk. We emit structured events here so the CLI can print concise
|
|
7
|
+
* one-line status updates to stderr (stdout stays reserved for the final
|
|
8
|
+
* report + `--json` output).
|
|
9
|
+
*
|
|
10
|
+
* The logger is intentionally minimal: no ANSI colors, no spinners, no
|
|
11
|
+
* carriage-return rewrites. Those do not survive `tee`, CI log viewers,
|
|
12
|
+
* or the background `runs/tail` path (which copies the stream to a log
|
|
13
|
+
* file), and users also told us "nothing is clear now, everything is
|
|
14
|
+
* long" — so we optimize for log-friendly line-by-line readability.
|
|
15
|
+
*/
|
|
16
|
+
import type { EvalMode, WorkflowStageName } from "./types.js";
|
|
17
|
+
export type ProgressEvent = {
|
|
18
|
+
kind: "run-start";
|
|
19
|
+
mode: EvalMode;
|
|
20
|
+
totalCases: number;
|
|
21
|
+
} | {
|
|
22
|
+
kind: "case-start";
|
|
23
|
+
caseId: string;
|
|
24
|
+
stage: string;
|
|
25
|
+
index: number;
|
|
26
|
+
total: number;
|
|
27
|
+
} | {
|
|
28
|
+
kind: "case-end";
|
|
29
|
+
caseId: string;
|
|
30
|
+
stage: string;
|
|
31
|
+
index: number;
|
|
32
|
+
total: number;
|
|
33
|
+
passed: boolean;
|
|
34
|
+
durationMs: number;
|
|
35
|
+
costUsd?: number;
|
|
36
|
+
} | {
|
|
37
|
+
kind: "stage-start";
|
|
38
|
+
caseId: string;
|
|
39
|
+
stage: WorkflowStageName;
|
|
40
|
+
index: number;
|
|
41
|
+
total: number;
|
|
42
|
+
} | {
|
|
43
|
+
kind: "stage-end";
|
|
44
|
+
caseId: string;
|
|
45
|
+
stage: WorkflowStageName;
|
|
46
|
+
index: number;
|
|
47
|
+
total: number;
|
|
48
|
+
passed: boolean;
|
|
49
|
+
durationMs: number;
|
|
50
|
+
costUsd?: number;
|
|
51
|
+
} | {
|
|
52
|
+
kind: "retry";
|
|
53
|
+
caseId: string;
|
|
54
|
+
stage?: string;
|
|
55
|
+
attempt: number;
|
|
56
|
+
maxAttempts: number;
|
|
57
|
+
waitMs: number;
|
|
58
|
+
reason: string;
|
|
59
|
+
} | {
|
|
60
|
+
kind: "run-end";
|
|
61
|
+
totalCases: number;
|
|
62
|
+
passed: number;
|
|
63
|
+
failed: number;
|
|
64
|
+
durationMs: number;
|
|
65
|
+
};
|
|
66
|
+
export interface ProgressLogger {
|
|
67
|
+
emit(event: ProgressEvent): void;
|
|
68
|
+
}
|
|
69
|
+
export declare function noopProgressLogger(): ProgressLogger;
|
|
70
|
+
export interface StderrProgressLoggerOptions {
|
|
71
|
+
/** Override the underlying write target; defaults to `process.stderr.write`. */
|
|
72
|
+
writer?: (message: string) => void;
|
|
73
|
+
/** Return wall-clock in ms. Injectable for tests. */
|
|
74
|
+
now?: () => number;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Emit a one-line status update per event to stderr.
|
|
78
|
+
*
|
|
79
|
+
* Format is deliberately boring: `[cclaw eval] <message>` so users can grep
|
|
80
|
+
* for the prefix in combined logs. Costs are rendered with up to 4 decimals
|
|
81
|
+
* so sub-cent runs still show a non-zero value.
|
|
82
|
+
*/
|
|
83
|
+
export declare function createStderrProgressLogger(opts?: StderrProgressLoggerOptions): ProgressLogger;
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
const NOOP_LOGGER = { emit() { } };
|
|
2
|
+
export function noopProgressLogger() {
|
|
3
|
+
return NOOP_LOGGER;
|
|
4
|
+
}
|
|
5
|
+
/**
 * Build a logger that writes one `[cclaw eval] <message>` line per event.
 *
 * @param opts Optional overrides. `opts.writer` replaces the default
 *   `process.stderr.write` sink; no other option is read by this function.
 */
export function createStderrProgressLogger(opts = {}) {
    const sink = opts.writer ?? ((text) => process.stderr.write(text));
    const emit = (event) => {
        sink(`[cclaw eval] ${formatEvent(event)}\n`);
    };
    return { emit };
}
|
|
20
|
+
/**
 * Format a millisecond duration for log output.
 *
 * - under one second: `<ms>ms`
 * - under one minute: seconds with one decimal, e.g. `1.5s`
 * - otherwise: `<m>m<ss>s` with zero-padded seconds
 *
 * Rounding happens before the value is split into minutes and seconds, so a
 * remainder can never display as `60` (119600 ms is `2m00s`, not `1m60s`),
 * and values that would round up to `60.0s` are shown as `1m00s`.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    const seconds = ms / 1000;
    // Anything from 59.95 s upward would print as "60.0s"; hand it to the
    // minutes branch instead.
    if (seconds < 59.95)
        return `${seconds.toFixed(1)}s`;
    const wholeSeconds = Math.round(seconds);
    const minutes = Math.floor(wholeSeconds / 60);
    const rem = wholeSeconds % 60;
    return `${minutes}m${rem.toString().padStart(2, "0")}s`;
}
|
|
30
|
+
/**
 * Format a cost as a ` $x.xxxx` suffix (leading space included), or an empty
 * string when the cost is undefined, zero or negative.
 */
function formatCost(usd) {
    const worthShowing = usd !== undefined && !(usd <= 0);
    return worthShowing ? ` $${usd.toFixed(4)}` : "";
}
|
|
35
|
+
/**
 * Turn a progress event into the message part of a log line (without the
 * `[cclaw eval]` prefix or trailing newline). Stage and retry lines are
 * indented so they read as children of the enclosing case line.
 */
function formatEvent(event) {
    const position = () => `[${event.index}/${event.total}]`;
    switch (event.kind) {
        case "run-start": {
            return `start mode=${event.mode} cases=${event.totalCases}`;
        }
        case "case-start": {
            return `${position()} ${event.caseId} (${event.stage}) ...`;
        }
        case "case-end": {
            const verdict = event.passed ? "PASS" : "FAIL";
            const took = formatDuration(event.durationMs);
            return `${position()} ${event.caseId} (${event.stage}) ${verdict} in ${took}${formatCost(event.costUsd)}`;
        }
        case "stage-start": {
            return `  stage ${event.stage} ...`;
        }
        case "stage-end": {
            const verdict = event.passed ? "ok" : "fail";
            const took = formatDuration(event.durationMs);
            return `  stage ${event.stage} ${verdict} in ${took}${formatCost(event.costUsd)}`;
        }
        case "retry": {
            const stageSuffix = event.stage ? `/${event.stage}` : "";
            const wait = formatDuration(event.waitMs);
            return `  retry ${event.caseId}${stageSuffix} attempt ${event.attempt}/${event.maxAttempts} in ${wait} (${event.reason})`;
        }
        case "run-end": {
            const took = formatDuration(event.durationMs);
            return `done pass=${event.passed} fail=${event.failed} total=${event.totalCases} in ${took}`;
        }
    }
}
|