cclaw-cli 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,323 @@
1
+ /**
2
+ * `cclaw eval diff <old> <new>` — side-by-side report comparison.
3
+ *
4
+ * Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
5
+ * explicit filename) and emits a compact human-readable + JSON diff:
6
+ *
7
+ * - summary-level deltas (passed/failed/cost/duration)
8
+ * - per-case pass/fail transitions
9
+ * - per-verifier score drops (only the drops — new passes are noted in
10
+ * the summary line, not repeated per verifier)
11
+ * - Tier C stage-level cost & duration deltas when both reports carry a
12
+ * `workflow` summary for the same case id
13
+ *
14
+ * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
15
+ *
16
+ * 1. A bare version string (`0.26.0`) — matched against any report JSON
17
+ * whose `cclawVersion` field equals the string.
18
+ * 2. A full or relative filename (`eval-2026-04-17T...-abc123.json`).
19
+ * 3. The literal `latest` — picks the most recent report on disk by
20
+ * mtime.
21
+ *
22
+ * The diff is deterministic: sorted by case id, then verifier id. Missing
23
+ * cases in one report show up as `added` or `removed` so callers can see
24
+ * which corpus changes slipped in between versions.
25
+ */
26
+ import fs from "node:fs/promises";
27
+ import path from "node:path";
28
+ import { EVALS_ROOT } from "../constants.js";
29
+ import { exists } from "../fs-utils.js";
30
/**
 * Tolerance used when comparing floating-point values in this module.
 * Verifier score differences and per-case cost deltas whose absolute value
 * does not exceed this threshold are treated as "no change".
 */
const SCORE_DROP_EPSILON = 0.0001;
31
/**
 * Resolve a report selector to the path of a JSON report on disk.
 *
 * Reports are looked up under `<projectRoot>/<EVALS_ROOT>/reports`. The
 * selector is tried against the following shapes, in this order:
 *
 *   1. An existing file: an absolute path, or a name relative to the reports
 *      directory. Directories are not accepted, so a selector such as `.`
 *      falls through to the next steps instead of failing later while the
 *      report is being read.
 *   2. The literal `latest`: the `.json` report with the greatest mtime.
 *   3. A version string: the most recently modified `.json` report whose
 *      `cclawVersion` field equals the selector.
 *
 * @param projectRoot Root directory of the project.
 * @param selector Filename, `latest`, or a cclawVersion. Surrounding
 *   whitespace is ignored.
 * @returns Path of the selected report file.
 * @throws Error when the reports directory is missing, the selector is empty,
 *   the directory holds no JSON reports, or nothing matches the selector.
 */
export async function resolveReportPath(projectRoot, selector) {
    const dir = path.join(projectRoot, EVALS_ROOT, "reports");
    if (!(await exists(dir))) {
        throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
            `Run \`cclaw eval\` at least once before comparing reports.`);
    }
    const trimmed = selector.trim();
    if (trimmed.length === 0) {
        throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
    }
    // 1. Explicit filename (absolute, or relative to the reports directory).
    const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
    if (await isRegularFile(asPath))
        return asPath;
    const entries = await fs.readdir(dir, { withFileTypes: true });
    const jsonFiles = entries
        .filter((e) => e.isFile() && e.name.endsWith(".json"))
        .map((e) => path.join(dir, e.name));
    if (jsonFiles.length === 0) {
        throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
    }
    // 2. `latest` — most recently modified report; on equal mtimes the first
    //    one in directory-listing order wins.
    if (trimmed === "latest") {
        let latest = jsonFiles[0];
        let latestMtime = (await fs.stat(latest)).mtimeMs;
        for (const f of jsonFiles.slice(1)) {
            const stat = await fs.stat(f);
            if (stat.mtimeMs > latestMtime) {
                latest = f;
                latestMtime = stat.mtimeMs;
            }
        }
        return latest;
    }
    // 3. Version match — pick most recent by mtime among matches.
    const matches = [];
    for (const file of jsonFiles) {
        try {
            const raw = await fs.readFile(file, "utf8");
            const parsed = JSON.parse(raw);
            if (parsed.cclawVersion === trimmed) {
                const stat = await fs.stat(file);
                matches.push({ file, mtimeMs: stat.mtimeMs });
            }
        }
        catch {
            // Best effort: unreadable or malformed files simply cannot match.
            continue;
        }
    }
    if (matches.length === 0) {
        throw new Error(`No report matched selector "${selector}". ` +
            `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
    }
    matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
    return matches[0].file;
}
/** True when `candidate` exists and is a regular file (symlinks are followed). */
async function isRegularFile(candidate) {
    try {
        return (await fs.stat(candidate)).isFile();
    }
    catch {
        return false;
    }
}
88
/**
 * Read and minimally validate an eval report.
 *
 * @param filePath Path of the JSON report to load.
 * @returns The parsed report object.
 * @throws Error when the parsed content is not an object with
 *   `schemaVersion === 1` and an array `cases` field. Read failures and JSON
 *   syntax errors propagate unchanged.
 */
async function loadReport(filePath) {
    const raw = await fs.readFile(filePath, "utf8");
    const parsed = JSON.parse(raw);
    // `JSON.parse("null")` yields null; guard before touching properties so
    // the caller gets the descriptive error below instead of a TypeError.
    const isObject = typeof parsed === "object" && parsed !== null;
    if (!isObject || parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
        throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
    }
    return parsed;
}
96
/**
 * Build the identifying header for one side of a diff: the report's run
 * id, version, timestamp, tier and model, plus the path it was loaded from.
 */
function meta(report, sourcePath) {
    const { runId, cclawVersion, generatedAt, tier, model } = report;
    return { runId, cclawVersion, generatedAt, tier, model, sourcePath };
}
106
/**
 * Index verifier results by their `id`. When two results share an id the
 * later one replaces the earlier one.
 */
function verifierMap(results) {
    return [...results].reduce((byId, result) => byId.set(result.id, result), new Map());
}
112
/**
 * Compare one case across two reports.
 *
 * A case present on only one side is reported as `added` / `removed` with an
 * empty verifier list. Otherwise the entry carries the pass/fail transition,
 * the duration delta, the noteworthy verifier changes (sorted by verifier
 * id), a cost delta when it exceeds `SCORE_DROP_EPSILON`, and per-stage
 * deltas when both sides have a `workflow` summary.
 *
 * @param caseId Id of the case being compared.
 * @param previous Case result from the old report, or undefined.
 * @param current Case result from the new report, or undefined.
 */
function diffCase(caseId, previous, current) {
    const stage = (current ?? previous).stage;
    if (!previous) {
        return { caseId, stage, transition: "added", currentPassed: current?.passed, verifierDeltas: [] };
    }
    if (!current) {
        return { caseId, stage, transition: "removed", previousPassed: previous.passed, verifierDeltas: [] };
    }
    let transition;
    if (previous.passed === current.passed) {
        transition = "same";
    }
    else if (previous.passed && !current.passed) {
        transition = "regressed";
    }
    else {
        transition = "recovered";
    }
    const before = verifierMap(previous.verifierResults);
    const after = verifierMap(current.verifierResults);
    const verifierIds = [...new Set([...before.keys(), ...after.keys()])];
    verifierIds.sort((a, b) => a.localeCompare(b));
    const verifierDeltas = [];
    for (const verifierId of verifierIds) {
        const delta = describeVerifierChange(verifierId, before.get(verifierId), after.get(verifierId));
        if (delta)
            verifierDeltas.push(delta);
    }
    const durationDeltaMs = current.durationMs - previous.durationMs;
    // A missing cost counts as zero on either side.
    const costDelta = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
    const stageDeltas = previous.workflow && current.workflow
        ? collectStageDeltas(previous.workflow, current.workflow)
        : [];
    return {
        caseId,
        stage,
        transition,
        previousPassed: previous.passed,
        currentPassed: current.passed,
        durationDeltaMs,
        verifierDeltas,
        ...(Math.abs(costDelta) > SCORE_DROP_EPSILON
            ? { costDeltaUsd: Number(costDelta.toFixed(6)) }
            : {}),
        ...(stageDeltas.length > 0 ? { stageDeltas } : {})
    };
}
/**
 * Describe how a single verifier changed between two runs of a case, or
 * return undefined when there is nothing to report (unchanged verdict and no
 * score drop beyond `SCORE_DROP_EPSILON`; score increases are not reported).
 */
function describeVerifierChange(verifierId, before, after) {
    const kind = (after ?? before).kind;
    if (!before && after) {
        return {
            verifierId,
            kind,
            transition: "added",
            currentOk: after.ok,
            ...(after.score !== undefined ? { currentScore: after.score } : {})
        };
    }
    if (before && !after) {
        return {
            verifierId,
            kind,
            transition: "removed",
            previousOk: before.ok,
            ...(before.score !== undefined ? { previousScore: before.score } : {})
        };
    }
    if (!before || !after)
        return undefined;
    const hasPreviousScore = typeof before.score === "number";
    const hasCurrentScore = typeof after.score === "number";
    let transition;
    if (before.ok !== after.ok) {
        // A verdict flip wins over any score movement.
        transition = before.ok ? "regressed" : "recovered";
    }
    else if (hasPreviousScore &&
        hasCurrentScore &&
        Math.abs(before.score - after.score) > SCORE_DROP_EPSILON &&
        after.score < before.score) {
        transition = "score-drop";
    }
    else {
        return undefined;
    }
    return {
        verifierId,
        kind,
        transition,
        previousOk: before.ok,
        currentOk: after.ok,
        ...(hasPreviousScore ? { previousScore: before.score } : {}),
        ...(hasCurrentScore ? { currentScore: after.score } : {})
    };
}
/**
 * Per-stage duration, cost and tool-use deltas for the stages that appear in
 * both workflow summaries, in the order of the current workflow's stages.
 */
function collectStageDeltas(previousWorkflow, currentWorkflow) {
    const earlierByName = new Map();
    for (const earlier of previousWorkflow.stages)
        earlierByName.set(earlier.stage, earlier);
    const deltas = [];
    for (const later of currentWorkflow.stages) {
        const earlier = earlierByName.get(later.stage);
        if (earlier) {
            deltas.push({
                stage: later.stage,
                durationDeltaMs: later.durationMs - earlier.durationMs,
                costDeltaUsd: Number((later.usageUsd - earlier.usageUsd).toFixed(6)),
                turnsDelta: later.toolUse.turns - earlier.toolUse.turns,
                callsDelta: later.toolUse.calls - earlier.toolUse.calls
            });
        }
    }
    return deltas;
}
230
/**
 * Compare two loaded reports and produce the full diff structure.
 *
 * Cases are matched by `caseId` and emitted sorted by id. The `regressed`
 * flag is set when any case regressed or was removed, or when any verifier
 * regressed or dropped in score.
 *
 * @param previous The older report.
 * @param current The newer report.
 * @param prevPath Path the older report was loaded from.
 * @param currPath Path the newer report was loaded from.
 */
export function diffReports(previous, current, prevPath, currPath) {
    const indexCases = (report) => {
        const byId = new Map();
        for (const entry of report.cases)
            byId.set(entry.caseId, entry);
        return byId;
    };
    const before = indexCases(previous);
    const after = indexCases(current);
    const caseIds = [...new Set([...before.keys(), ...after.keys()])];
    caseIds.sort((a, b) => a.localeCompare(b));
    const cases = caseIds.map((id) => diffCase(id, before.get(id), after.get(id)));
    const verifierWentBackwards = (delta) => delta.transition === "regressed" || delta.transition === "score-drop";
    const caseWentBackwards = (entry) => entry.transition === "regressed" ||
        entry.transition === "removed" ||
        entry.verifierDeltas.some(verifierWentBackwards);
    const was = previous.summary;
    const now = current.summary;
    return {
        old: meta(previous, prevPath),
        new: meta(current, currPath),
        summaryDelta: {
            totalCasesDelta: now.totalCases - was.totalCases,
            passedDelta: now.passed - was.passed,
            failedDelta: now.failed - was.failed,
            skippedDelta: now.skipped - was.skipped,
            totalCostUsdDelta: Number((now.totalCostUsd - was.totalCostUsd).toFixed(6)),
            totalDurationMsDelta: now.totalDurationMs - was.totalDurationMs
        },
        cases,
        regressed: cases.some(caseWentBackwards)
    };
}
259
/**
 * Entry point for `cclaw eval diff`: resolve both selectors to report files,
 * load them, and return their diff. Both resolutions run concurrently, as do
 * both loads.
 *
 * @param input Object with `projectRoot`, `old` and `new` selector fields.
 */
export async function runEvalDiff(input) {
    const locate = (selector) => resolveReportPath(input.projectRoot, selector);
    const paths = await Promise.all([locate(input.old), locate(input.new)]);
    const reports = await Promise.all(paths.map((reportPath) => loadReport(reportPath)));
    return diffReports(reports[0], reports[1], paths[0], paths[1]);
}
270
/**
 * Render a report diff as a compact Markdown document.
 *
 * The output always has a header and a summary-delta table. A "Case changes"
 * table follows for cases whose transition is not `same` or that carry
 * verifier deltas, and a "Verifier changes" table when any of those cases
 * has verifier deltas. The result ends with a blank line.
 */
export function formatDiffMarkdown(diff) {
    // Non-negative deltas get an explicit "+"; negatives already carry "-".
    const sign = (value) => (value >= 0 ? "+" : "");
    const passLabel = (flag) => (flag === undefined ? "-" : flag ? "pass" : "fail");
    const scoreLabel = (score) => (score !== undefined ? score.toFixed(2) : "-");
    const finish = (rows) => `${rows.join("\n")}\n`;
    const sd = diff.summaryDelta;
    const out = [
        `# cclaw eval diff`,
        ``,
        `- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`,
        `- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`,
        `- regressed: ${diff.regressed ? "yes" : "no"}`,
        ``,
        `## Summary delta`,
        ``,
        `| metric | delta |`,
        `| --- | --- |`,
        `| total cases | ${sign(sd.totalCasesDelta)}${sd.totalCasesDelta} |`,
        `| passed | ${sign(sd.passedDelta)}${sd.passedDelta} |`,
        `| failed | ${sign(sd.failedDelta)}${sd.failedDelta} |`,
        `| skipped | ${sign(sd.skippedDelta)}${sd.skippedDelta} |`,
        `| cost (USD) | ${sign(sd.totalCostUsdDelta)}${sd.totalCostUsdDelta.toFixed(4)} |`,
        `| duration (ms) | ${sign(sd.totalDurationMsDelta)}${sd.totalDurationMsDelta} |`,
        ``
    ];
    const changed = diff.cases.filter((entry) => entry.transition !== "same" || entry.verifierDeltas.length > 0);
    if (changed.length === 0) {
        out.push(`No case-level changes.`, ``);
        return finish(out);
    }
    out.push(`## Case changes`, ``, `| case id | stage | transition | prev | curr |`, `| --- | --- | --- | --- | --- |`);
    for (const entry of changed) {
        out.push(`| ${entry.caseId} | ${entry.stage} | ${entry.transition} | ${passLabel(entry.previousPassed)} | ${passLabel(entry.currentPassed)} |`);
    }
    out.push(``);
    const verifierRows = [];
    for (const entry of changed) {
        for (const delta of entry.verifierDeltas) {
            verifierRows.push(`| ${entry.caseId} | ${delta.verifierId} | ${delta.kind} | ${delta.transition} | ${scoreLabel(delta.previousScore)} | ${scoreLabel(delta.currentScore)} |`);
        }
    }
    if (verifierRows.length > 0) {
        out.push(`## Verifier changes`, ``, `| case id | verifier | kind | transition | prev score | curr score |`, `| --- | --- | --- | --- | --- | --- |`, ...verifierRows, ``);
    }
    return finish(out);
}
@@ -120,6 +120,41 @@ export function formatMarkdownReport(report) {
120
120
  }
121
121
  lines.push(``);
122
122
  }
123
+ const workflowCases = report.cases.filter((item) => !!item.workflow);
124
+ if (workflowCases.length > 0) {
125
+ lines.push(`## Workflow stages`);
126
+ lines.push(``);
127
+ lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
128
+ lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
129
+ for (const item of workflowCases) {
130
+ const wf = item.workflow;
131
+ for (const stage of wf.stages) {
132
+ const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
133
+ const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
134
+ lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
135
+ `${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
136
+ }
137
+ }
138
+ lines.push(``);
139
+ }
140
+ const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
141
+ if (consistencyCases.length > 0) {
142
+ lines.push(`## Consistency checks`);
143
+ lines.push(``);
144
+ lines.push(`| case id | check id | ok | message |`);
145
+ lines.push(`| --- | --- | --- | --- |`);
146
+ for (const item of consistencyCases) {
147
+ for (const verifier of item.verifierResults) {
148
+ if (verifier.kind !== "consistency")
149
+ continue;
150
+ const message = verifier.message
151
+ ? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
152
+ : "-";
153
+ lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
154
+ }
155
+ }
156
+ lines.push(``);
157
+ }
123
158
  lines.push(`## Verifier details`);
124
159
  lines.push(``);
125
160
  for (const item of report.cases) {
@@ -1,6 +1,6 @@
1
1
  import type { FlowStage } from "../types.js";
2
2
  import { type EvalLlmClient } from "./llm-client.js";
3
- import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
3
+ import type { EvalReport, EvalTier, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
4
4
  export interface RunEvalOptions {
5
5
  projectRoot: string;
6
6
  stage?: FlowStage;
@@ -33,12 +33,21 @@ export interface DryRunSummary {
33
33
  stage: FlowStage;
34
34
  }>;
35
35
  };
36
+ /** Tier C-only workflow corpus summary. Empty for Tier A/B planned runs. */
37
+ workflowCorpus: {
38
+ total: number;
39
+ cases: Array<{
40
+ id: string;
41
+ stages: WorkflowStageName[];
42
+ }>;
43
+ };
36
44
  plannedTier: EvalTier;
37
45
  verifiersAvailable: {
38
46
  structural: boolean;
39
47
  rules: boolean;
40
48
  judge: boolean;
41
49
  workflow: boolean;
50
+ consistency: boolean;
42
51
  };
43
52
  notes: string[];
44
53
  }