cclaw-cli 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/cli.d.ts +10 -2
  2. package/dist/cli.js +388 -18
  3. package/dist/content/eval-scaffold.d.ts +2 -2
  4. package/dist/content/eval-scaffold.js +7 -6
  5. package/dist/eval/agents/single-shot.d.ts +1 -1
  6. package/dist/eval/agents/single-shot.js +4 -4
  7. package/dist/eval/agents/with-tools.d.ts +14 -1
  8. package/dist/eval/agents/with-tools.js +22 -16
  9. package/dist/eval/agents/workflow.d.ts +31 -0
  10. package/dist/eval/agents/workflow.js +135 -0
  11. package/dist/eval/baseline.d.ts +24 -0
  12. package/dist/eval/baseline.js +75 -2
  13. package/dist/eval/config-loader.js +52 -19
  14. package/dist/eval/cost-guard.d.ts +22 -0
  15. package/dist/eval/cost-guard.js +38 -1
  16. package/dist/eval/diff.d.ts +64 -0
  17. package/dist/eval/diff.js +323 -0
  18. package/dist/eval/llm-client.d.ts +13 -2
  19. package/dist/eval/llm-client.js +8 -1
  20. package/dist/eval/mode.d.ts +28 -0
  21. package/dist/eval/mode.js +61 -0
  22. package/dist/eval/progress.d.ts +83 -0
  23. package/dist/eval/progress.js +59 -0
  24. package/dist/eval/report.js +36 -1
  25. package/dist/eval/runner.d.ts +37 -8
  26. package/dist/eval/runner.js +351 -42
  27. package/dist/eval/runs.d.ts +41 -0
  28. package/dist/eval/runs.js +114 -0
  29. package/dist/eval/sandbox.js +1 -1
  30. package/dist/eval/tools/index.js +1 -1
  31. package/dist/eval/tools/types.d.ts +1 -1
  32. package/dist/eval/types.d.ts +158 -15
  33. package/dist/eval/types.js +39 -7
  34. package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
  35. package/dist/eval/verifiers/workflow-consistency.js +225 -0
  36. package/dist/eval/workflow-corpus.d.ts +7 -0
  37. package/dist/eval/workflow-corpus.js +207 -0
  38. package/package.json +1 -1
@@ -0,0 +1,64 @@
1
+ import type { EvalReport } from "./types.js";
2
+ export interface EvalDiffInput {
3
+ projectRoot: string;
4
+ /** Version string, filename, or "latest". */
5
+ old: string;
6
+ /** Version string, filename, or "latest". */
7
+ new: string;
8
+ }
9
+ export interface EvalDiffCaseEntry {
10
+ caseId: string;
11
+ stage: string;
12
+ /** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
13
+ transition: "same" | "regressed" | "recovered" | "added" | "removed";
14
+ previousPassed?: boolean;
15
+ currentPassed?: boolean;
16
+ durationDeltaMs?: number;
17
+ costDeltaUsd?: number;
18
+ verifierDeltas: EvalDiffVerifierEntry[];
19
+ stageDeltas?: EvalDiffStageEntry[];
20
+ }
21
+ export interface EvalDiffVerifierEntry {
22
+ verifierId: string;
23
+ kind: string;
24
+ transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
25
+ previousScore?: number;
26
+ currentScore?: number;
27
+ previousOk?: boolean;
28
+ currentOk?: boolean;
29
+ }
30
+ export interface EvalDiffStageEntry {
31
+ stage: string;
32
+ durationDeltaMs: number;
33
+ costDeltaUsd: number;
34
+ turnsDelta: number;
35
+ callsDelta: number;
36
+ }
37
+ export interface EvalDiffReport {
38
+ old: EvalDiffReportMeta;
39
+ new: EvalDiffReportMeta;
40
+ summaryDelta: {
41
+ totalCasesDelta: number;
42
+ passedDelta: number;
43
+ failedDelta: number;
44
+ skippedDelta: number;
45
+ totalCostUsdDelta: number;
46
+ totalDurationMsDelta: number;
47
+ };
48
+ cases: EvalDiffCaseEntry[];
49
+ /** True when any case regressed or any verifier dropped. */
50
+ regressed: boolean;
51
+ }
52
+ export interface EvalDiffReportMeta {
53
+ runId: string;
54
+ cclawVersion: string;
55
+ generatedAt: string;
56
+ mode: string;
57
+ model: string;
58
+ sourcePath: string;
59
+ }
60
+ export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
61
+ export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
62
+ export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
63
+ /** Render the diff as a terse human-readable Markdown block. */
64
+ export declare function formatDiffMarkdown(diff: EvalDiffReport): string;
@@ -0,0 +1,323 @@
1
+ /**
2
+ * `cclaw eval diff <old> <new>` — side-by-side report comparison.
3
+ *
4
+ * Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
5
+ * explicit filename) and emits a compact human-readable + JSON diff:
6
+ *
7
+ * - summary-level deltas (passed/failed/cost/duration)
8
+ * - per-case pass/fail transitions
9
+ * - per-verifier changes: added/removed verifiers, ok regressions and
10
+ * recoveries, and score drops (score gains alone are not listed)
11
+ * - Workflow-mode stage-level cost & duration deltas when both reports
12
+ * carry a `workflow` summary for the same case id
13
+ *
14
+ * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
15
+ *
16
+ * 1. A bare version string (`0.26.0`) — matched against any report JSON
17
+ * whose `cclawVersion` field equals the string.
18
+ * 2. A full or relative filename (`eval-2026-04-17T...-abc123.json`).
19
+ * 3. The literal `latest` — picks the most recent report on disk by
20
+ * mtime.
21
+ *
22
+ * The diff is deterministic: sorted by case id, then verifier id. Missing
23
+ * cases in one report show up as `added` or `removed` so callers can see
24
+ * which corpus changes slipped in between versions.
25
+ */
26
+ import fs from "node:fs/promises";
27
+ import path from "node:path";
28
+ import { EVALS_ROOT } from "../constants.js";
29
+ import { exists } from "../fs-utils.js";
30
+ const SCORE_DROP_EPSILON = 0.0001;
31
/**
 * Resolve a report selector to the path of a JSON report stored under
 * `<projectRoot>/<EVALS_ROOT>/reports`.
 *
 * Resolution order:
 *   1. an existing regular file — an absolute path, or a name relative to
 *      the reports directory;
 *   2. the literal `latest` — the JSON report with the greatest mtime;
 *   3. a version string — the most recently modified JSON report whose
 *      `cclawVersion` field equals the selector.
 *
 * @param projectRoot Project root that contains the evals directory.
 * @param selector Filename, `latest`, or a `cclawVersion` value.
 * @returns Path of the selected report file.
 * @throws Error when the reports directory is missing, the selector is
 *   blank, the directory holds no JSON reports, or nothing matches.
 */
export async function resolveReportPath(projectRoot, selector) {
    // Stat helper that reports "not there / not readable" as `undefined`
    // so a non-matching filename simply falls through to the other shapes.
    const statOrUndefined = async (target) => {
        try {
            return await fs.stat(target);
        }
        catch {
            return undefined;
        }
    };
    const dir = path.join(projectRoot, EVALS_ROOT, "reports");
    if (!(await exists(dir))) {
        throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
            `Run \`cclaw eval\` at least once before comparing reports.`);
    }
    const trimmed = selector.trim();
    if (trimmed.length === 0) {
        throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
    }
    // 1. Explicit filename (absolute or relative to the reports directory).
    //    Only regular files qualify: a directory that happens to match the
    //    selector must not be handed back as if it were a report.
    const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
    const explicit = await statOrUndefined(asPath);
    if (explicit?.isFile())
        return asPath;
    const entries = await fs.readdir(dir, { withFileTypes: true });
    const jsonFiles = entries
        .filter((e) => e.isFile() && e.name.endsWith(".json"))
        .map((e) => path.join(dir, e.name));
    if (jsonFiles.length === 0) {
        throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
    }
    // 2. `latest` — newest report by mtime. The stats are independent, so
    //    they run concurrently; ties keep the earliest directory entry.
    if (trimmed === "latest") {
        const stats = await Promise.all(jsonFiles.map((file) => fs.stat(file)));
        let newest = 0;
        for (let i = 1; i < stats.length; i += 1) {
            if (stats[i].mtimeMs > stats[newest].mtimeMs)
                newest = i;
        }
        return jsonFiles[newest];
    }
    // 3. Version match — pick most recent by mtime among matches.
    const matches = [];
    for (const file of jsonFiles) {
        try {
            const raw = await fs.readFile(file, "utf8");
            const parsed = JSON.parse(raw);
            if (parsed.cclawVersion === trimmed) {
                const stat = await fs.stat(file);
                matches.push({ file, mtimeMs: stat.mtimeMs });
            }
        }
        catch {
            // Best effort: unreadable or malformed files cannot match a
            // version, so they are skipped rather than failing the lookup.
            continue;
        }
    }
    if (matches.length === 0) {
        throw new Error(`No report matched selector "${selector}". ` +
            `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
    }
    matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
    return matches[0].file;
}
88
+ async function loadReport(filePath) {
89
+ const raw = await fs.readFile(filePath, "utf8");
90
+ const parsed = JSON.parse(raw);
91
+ if (parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
92
+ throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
93
+ }
94
+ return parsed;
95
+ }
96
/**
 * Extract the identifying header fields of a report and pair them with the
 * path the report was loaded from.
 *
 * @param report Parsed eval report.
 * @param sourcePath Path recorded as the report's origin.
 * @returns Object with `runId`, `cclawVersion`, `generatedAt`, `mode`,
 *   `model` and `sourcePath`.
 */
function meta(report, sourcePath) {
    const { runId, cclawVersion, generatedAt, mode, model } = report;
    return { runId, cclawVersion, generatedAt, mode, model, sourcePath };
}
106
/**
 * Index verifier results by their `id`.
 *
 * When several results share an id, the last one wins.
 *
 * @param results Iterable of verifier results, each carrying an `id`.
 * @returns Map from verifier id to its result.
 */
function verifierMap(results) {
    return new Map(Array.from(results, (result) => [result.id, result]));
}
112
/**
 * Compare one case between two reports.
 *
 * A case present on only one side yields an `added` / `removed` entry with
 * no verifier deltas. Otherwise the entry carries the pass/fail transition,
 * the duration delta, the verifier deltas, a cost delta when it exceeds
 * `SCORE_DROP_EPSILON`, and stage deltas when both sides have a `workflow`.
 *
 * @param caseId Case identifier shared by both sides.
 * @param previous Case result from the old report, if any.
 * @param current Case result from the new report, if any.
 * @returns The diff entry for this case.
 */
function diffCase(caseId, previous, current) {
    const stage = (current ?? previous).stage;
    if (!previous) {
        return { caseId, stage, transition: "added", currentPassed: current?.passed, verifierDeltas: [] };
    }
    if (!current) {
        return { caseId, stage, transition: "removed", previousPassed: previous.passed, verifierDeltas: [] };
    }
    let transition = "same";
    if (previous.passed !== current.passed) {
        transition = previous.passed && !current.passed ? "regressed" : "recovered";
    }
    const verifierDeltas = collectVerifierDeltas(previous.verifierResults, current.verifierResults);
    const result = {
        caseId,
        stage,
        transition,
        previousPassed: previous.passed,
        currentPassed: current.passed,
        durationDeltaMs: current.durationMs - previous.durationMs,
        verifierDeltas
    };
    // A missing cost counts as zero; negligible differences are left out.
    const costShift = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
    if (Math.abs(costShift) > SCORE_DROP_EPSILON) {
        result.costDeltaUsd = Number(costShift.toFixed(6));
    }
    if (previous.workflow && current.workflow) {
        const stageDeltas = collectStageDeltas(previous.workflow, current.workflow);
        if (stageDeltas.length > 0)
            result.stageDeltas = stageDeltas;
    }
    return result;
}
/** Build the verifier-level deltas for one case, ordered by verifier id. */
function collectVerifierDeltas(previousResults, currentResults) {
    const before = verifierMap(previousResults);
    const after = verifierMap(currentResults);
    const orderedIds = Array.from(new Set([...before.keys(), ...after.keys()]))
        .sort((a, b) => a.localeCompare(b));
    const deltas = [];
    for (const verifierId of orderedIds) {
        const was = before.get(verifierId);
        const now = after.get(verifierId);
        const kind = (now ?? was).kind;
        if (!was || !now) {
            // Present on one side only: report it as added or removed.
            if (now) {
                const added = { verifierId, kind, transition: "added", currentOk: now.ok };
                if (now.score !== undefined)
                    added.currentScore = now.score;
                deltas.push(added);
            }
            else if (was) {
                const removed = { verifierId, kind, transition: "removed", previousOk: was.ok };
                if (was.score !== undefined)
                    removed.previousScore = was.score;
                deltas.push(removed);
            }
            continue;
        }
        const okFlipped = was.ok !== now.ok;
        const bothScored = typeof was.score === "number" && typeof now.score === "number";
        const scoreMoved = bothScored && Math.abs(was.score - now.score) > SCORE_DROP_EPSILON;
        let verdict;
        if (okFlipped) {
            verdict = was.ok ? "regressed" : "recovered";
        }
        else if (scoreMoved && now.score < was.score) {
            verdict = "score-drop";
        }
        else {
            // Unchanged, or a score gain without an ok flip: not reported.
            continue;
        }
        const changed = {
            verifierId,
            kind,
            transition: verdict,
            previousOk: was.ok,
            currentOk: now.ok
        };
        if (typeof was.score === "number")
            changed.previousScore = was.score;
        if (typeof now.score === "number")
            changed.currentScore = now.score;
        deltas.push(changed);
    }
    return deltas;
}
/** Build per-stage deltas for the stages that appear in both workflow summaries. */
function collectStageDeltas(previousWorkflow, currentWorkflow) {
    const earlier = new Map(Array.from(previousWorkflow.stages, (s) => [s.stage, s]));
    const deltas = [];
    for (const later of currentWorkflow.stages) {
        const base = earlier.get(later.stage);
        if (!base)
            continue;
        deltas.push({
            stage: later.stage,
            durationDeltaMs: later.durationMs - base.durationMs,
            costDeltaUsd: Number((later.usageUsd - base.usageUsd).toFixed(6)),
            turnsDelta: later.toolUse.turns - base.toolUse.turns,
            callsDelta: later.toolUse.calls - base.toolUse.calls
        });
    }
    return deltas;
}
230
/**
 * Diff two parsed eval reports.
 *
 * Cases are matched by `caseId` and emitted in `localeCompare` order. The
 * result is flagged `regressed` when any case regressed or was removed, or
 * when any verifier regressed or dropped in score.
 *
 * @param previous Old report.
 * @param current New report.
 * @param prevPath Source path recorded for the old report.
 * @param currPath Source path recorded for the new report.
 * @returns The diff report: metadata for both sides, summary deltas,
 *   per-case entries and the `regressed` flag.
 */
export function diffReports(previous, current, prevPath, currPath) {
    const indexCases = (report) => new Map(Array.from(report.cases, (entry) => [entry.caseId, entry]));
    const before = indexCases(previous);
    const after = indexCases(current);
    const orderedIds = Array.from(new Set([...before.keys(), ...after.keys()]))
        .sort((a, b) => a.localeCompare(b));
    const cases = orderedIds.map((id) => diffCase(id, before.get(id), after.get(id)));
    const verifierWentBad = (v) => v.transition === "regressed" || v.transition === "score-drop";
    const caseWentBad = (c) => c.transition === "regressed" ||
        c.transition === "removed" ||
        c.verifierDeltas.some((v) => verifierWentBad(v));
    const regressed = cases.some((c) => caseWentBad(c));
    const oldMeta = meta(previous, prevPath);
    const newMeta = meta(current, currPath);
    const was = previous.summary;
    const now = current.summary;
    return {
        old: oldMeta,
        new: newMeta,
        summaryDelta: {
            totalCasesDelta: now.totalCases - was.totalCases,
            passedDelta: now.passed - was.passed,
            failedDelta: now.failed - was.failed,
            skippedDelta: now.skipped - was.skipped,
            totalCostUsdDelta: Number((now.totalCostUsd - was.totalCostUsd).toFixed(6)),
            totalDurationMsDelta: now.totalDurationMs - was.totalDurationMs
        },
        cases,
        regressed
    };
}
259
/**
 * Resolve both selectors, load the two reports and diff them.
 *
 * The two path lookups run concurrently, as do the two report loads.
 *
 * @param input Project root plus the `old` and `new` report selectors.
 * @returns The diff between the two resolved reports.
 */
export async function runEvalDiff(input) {
    const selectors = [input.old, input.new];
    const paths = await Promise.all(selectors.map((selector) => resolveReportPath(input.projectRoot, selector)));
    const reports = await Promise.all(paths.map((reportPath) => loadReport(reportPath)));
    return diffReports(reports[0], reports[1], paths[0], paths[1]);
}
270
/**
 * Render a diff report as a compact Markdown document.
 *
 * The output has a header with both versions and the regression flag, a
 * summary-delta table, then a table of cases that changed and, when any
 * verifier deltas exist, a table of those deltas.
 *
 * @param diff Diff report to render.
 * @returns Markdown text ending in a blank line.
 */
export function formatDiffMarkdown(diff) {
    // Non-negative deltas get an explicit "+" sign.
    const signed = (value, text = `${value}`) => `${value >= 0 ? "+" : ""}${text}`;
    const passLabel = (flag) => (flag === undefined ? "-" : flag ? "pass" : "fail");
    const scoreLabel = (score) => (score !== undefined ? score.toFixed(2) : "-");
    const finish = (rows) => `${rows.join("\n")}\n`;
    const sd = diff.summaryDelta;
    const out = [
        `# cclaw eval diff`,
        ``,
        `- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`,
        `- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`,
        `- regressed: ${diff.regressed ? "yes" : "no"}`,
        ``,
        `## Summary delta`,
        ``,
        `| metric | delta |`,
        `| --- | --- |`,
        `| total cases | ${signed(sd.totalCasesDelta)} |`,
        `| passed | ${signed(sd.passedDelta)} |`,
        `| failed | ${signed(sd.failedDelta)} |`,
        `| skipped | ${signed(sd.skippedDelta)} |`,
        `| cost (USD) | ${signed(sd.totalCostUsdDelta, sd.totalCostUsdDelta.toFixed(4))} |`,
        `| duration (ms) | ${signed(sd.totalDurationMsDelta)} |`,
        ``
    ];
    // Only cases whose outcome or verifiers changed are listed.
    const changedCases = diff.cases.filter((c) => c.transition !== "same" || c.verifierDeltas.length > 0);
    if (changedCases.length === 0) {
        out.push(`No case-level changes.`, ``);
        return finish(out);
    }
    out.push(`## Case changes`, ``, `| case id | stage | transition | prev | curr |`, `| --- | --- | --- | --- | --- |`);
    for (const c of changedCases) {
        out.push(`| ${c.caseId} | ${c.stage} | ${c.transition} | ${passLabel(c.previousPassed)} | ${passLabel(c.currentPassed)} |`);
    }
    out.push(``);
    const verifierRows = changedCases.flatMap((c) => c.verifierDeltas.map((v) => `| ${c.caseId} | ${v.verifierId} | ${v.kind} | ${v.transition} | ${scoreLabel(v.previousScore)} | ${scoreLabel(v.currentScore)} |`));
    if (verifierRows.length > 0) {
        out.push(`## Verifier changes`, ``, `| case id | verifier | kind | transition | prev score | curr score |`, `| --- | --- | --- | --- | --- | --- |`, ...verifierRows, ``);
    }
    return finish(out);
}
@@ -7,7 +7,7 @@ export interface ChatMessage {
7
7
  toolCallId?: string;
8
8
  /**
9
9
  * OpenAI-style tool calls carried on a preceding assistant message.
10
- * Populated by the Tier B loop so the wire transcript stays
10
+ * Populated by the with-tools loop so the wire transcript stays
11
11
  * consistent (assistant message → tool responses).
12
12
  */
13
13
  toolCalls?: Array<{
@@ -35,7 +35,7 @@ export interface ChatRequest {
35
35
  seed?: number;
36
36
  /**
37
37
  * Tool/function-calling definitions in OpenAI wire format. Populated only
38
- * by Tier B. Ignored by the Tier A single-shot path.
38
+ * by agent/workflow modes. Ignored by the single-shot path.
39
39
  */
40
40
  tools?: unknown[];
41
41
  toolChoice?: "auto" | "none";
@@ -111,6 +111,17 @@ export interface CreateEvalClientOptions {
111
111
  retryPolicy?: RetryPolicy;
112
112
  /** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
113
113
  sleep?: (ms: number) => Promise<void>;
114
+ /**
115
+ * Observer invoked when a chat() call is about to sleep before the next
116
+ * retry attempt. Use this to surface "we are retrying" status via the
117
+ * progress logger so long, silent backoff windows become visible.
118
+ */
119
+ onRetry?: (event: {
120
+ attempt: number;
121
+ maxAttempts: number;
122
+ waitMs: number;
123
+ error: EvalLlmError;
124
+ }) => void;
114
125
  }
115
126
  export interface RetryPolicy {
116
127
  /** Max retries *on top of* the initial attempt. 0 = single attempt. */
@@ -251,7 +251,14 @@ export function createEvalClient(config, options = {}) {
251
251
  const isLastAttempt = attempt === maxAttempts - 1;
252
252
  if (!normalized.retryable || isLastAttempt)
253
253
  throw normalized;
254
- await sleep(backoffDelay(attempt, retryPolicy));
254
+ const waitMs = backoffDelay(attempt, retryPolicy);
255
+ options.onRetry?.({
256
+ attempt: attempt + 1,
257
+ maxAttempts,
258
+ waitMs,
259
+ error: normalized
260
+ });
261
+ await sleep(waitMs);
255
262
  }
256
263
  }
257
264
  throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Helpers that translate between the legacy `Tier A/B/C` naming and the
3
+ * current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
4
+ *
5
+ * The names we actually carry in reports, config, CLI flags, and verifier
6
+ * messages are the `EvalMode` ones; legacy tier inputs are accepted with a
7
+ * single deprecation warning per process so existing scripts keep working
8
+ * through the 0.28.x line.
9
+ */
10
+ import { type EvalMode } from "./types.js";
11
+ /**
12
+ * Reset the per-process "already warned about legacy tier" flag. Used by
13
+ * tests so each test file gets a deterministic warning surface.
14
+ */
15
+ export declare function __resetLegacyWarningForTests(): void;
16
+ export interface LegacyTierInput {
17
+ source: "cli" | "env" | "config";
18
+ raw: string;
19
+ }
20
+ /**
21
+ * Normalize a raw string from the CLI / env / config into an `EvalMode`.
22
+ * Accepts both new (`fixture|agent|workflow`) and legacy (`A|B|C`) names.
23
+ * Emits a deprecation warning to stderr at most once per process when a
24
+ * legacy tier name is seen.
25
+ */
26
+ export declare function parseModeInput(raw: string, input: LegacyTierInput, writeWarning?: (message: string) => void): EvalMode;
27
+ /** @deprecated kept for callers that still need to serialize as legacy. */
28
+ export declare function modeToLegacyTier(mode: EvalMode): "A" | "B" | "C";
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Helpers that translate between the legacy `Tier A/B/C` naming and the
3
+ * current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
4
+ *
5
+ * The names we actually carry in reports, config, CLI flags, and verifier
6
+ * messages are the `EvalMode` ones; legacy tier inputs are accepted with a
7
+ * single deprecation warning per process so existing scripts keep working
8
+ * through the 0.28.x line.
9
+ */
10
+ import { EVAL_MODES } from "./types.js";
11
/** Legacy tier letter → current mode identifier. */
const LEGACY_TIER_TO_MODE = { A: "fixture", B: "agent", C: "workflow" };
/** Current mode identifier → legacy tier letter (the inverse of the table above). */
const MODE_TO_LEGACY_TIER = Object.fromEntries(Object.entries(LEGACY_TIER_TO_MODE).map(([tier, mode]) => [mode, tier]));
/** Legacy tier letters that are still accepted as input. */
const DEPRECATED_NAMES = new Set(Object.keys(LEGACY_TIER_TO_MODE));
/** Set once the deprecation warning has been written in this process. */
let legacyWarningEmitted = false;
/**
 * Clear the "deprecation warning already emitted" flag so the next legacy
 * tier input warns again. Intended for tests that need a deterministic
 * warning surface.
 */
export function __resetLegacyWarningForTests() {
    legacyWarningEmitted = false;
}
30
/**
 * Turn a raw mode string from the CLI, environment or config into an
 * `EvalMode`.
 *
 * Current names listed in `EVAL_MODES` are returned as-is (after trimming).
 * Legacy tier letters (`A|B|C`) are mapped to their replacement, and the
 * first legacy input seen in the process triggers one deprecation warning
 * through `writeWarning`.
 *
 * @param raw Raw mode string.
 * @param input Where the value came from, used in the warning text.
 * @param writeWarning Sink for the deprecation warning; defaults to stderr.
 * @returns The normalized mode.
 * @throws Error when the trimmed input is empty or is not a known name.
 */
export function parseModeInput(raw, input, writeWarning = defaultWriteWarning) {
    const candidate = raw.trim();
    if (candidate === "") {
        throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C).`);
    }
    if (EVAL_MODES.includes(candidate)) {
        return candidate;
    }
    if (!DEPRECATED_NAMES.has(candidate)) {
        throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C), got: ${raw}`);
    }
    const replacement = LEGACY_TIER_TO_MODE[candidate];
    const alreadyWarned = legacyWarningEmitted;
    if (!alreadyWarned) {
        // Flip the flag before writing so the warning is emitted at most once.
        legacyWarningEmitted = true;
        writeWarning(`[cclaw] "${input.source}: ${input.raw}" is using the legacy tier name "${candidate}". ` +
            `Please switch to --mode=${replacement} (legacy --tier=A|B|C will be removed in the next major release).`);
    }
    return replacement;
}
55
/**
 * Look up the legacy tier letter for a mode.
 *
 * @deprecated kept for callers that still need to serialize as legacy.
 * @param mode Current mode identifier.
 * @returns The matching entry of `MODE_TO_LEGACY_TIER`.
 */
export function modeToLegacyTier(mode) {
    const tier = MODE_TO_LEGACY_TIER[mode];
    return tier;
}
59
/**
 * Default warning sink: writes the message followed by a newline to stderr.
 *
 * @param message Warning text to emit.
 */
function defaultWriteWarning(message) {
    const line = `${message}\n`;
    process.stderr.write(line);
}
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Lightweight progress logger for `cclaw eval`.
3
+ *
4
+ * The runner is otherwise silent: a full workflow-mode run can easily take
5
+ * a few minutes and the user would see nothing until the Markdown report
6
+ * hits disk. We emit structured events here so the CLI can print concise
7
+ * one-line status updates to stderr (stdout stays reserved for the final
8
+ * report + `--json` output).
9
+ *
10
+ * The logger is intentionally minimal: no ANSI colors, no spinners, no
11
+ * carriage-return rewrites. Those do not survive `tee`, CI log viewers,
12
+ * or the background `runs/tail` path (which copies the stream to a log
13
+ * file), and users also told us "nothing is clear now, everything is
14
+ * long" — so we optimize for log-friendly line-by-line readability.
15
+ */
16
+ import type { EvalMode, WorkflowStageName } from "./types.js";
17
+ export type ProgressEvent = {
18
+ kind: "run-start";
19
+ mode: EvalMode;
20
+ totalCases: number;
21
+ } | {
22
+ kind: "case-start";
23
+ caseId: string;
24
+ stage: string;
25
+ index: number;
26
+ total: number;
27
+ } | {
28
+ kind: "case-end";
29
+ caseId: string;
30
+ stage: string;
31
+ index: number;
32
+ total: number;
33
+ passed: boolean;
34
+ durationMs: number;
35
+ costUsd?: number;
36
+ } | {
37
+ kind: "stage-start";
38
+ caseId: string;
39
+ stage: WorkflowStageName;
40
+ index: number;
41
+ total: number;
42
+ } | {
43
+ kind: "stage-end";
44
+ caseId: string;
45
+ stage: WorkflowStageName;
46
+ index: number;
47
+ total: number;
48
+ passed: boolean;
49
+ durationMs: number;
50
+ costUsd?: number;
51
+ } | {
52
+ kind: "retry";
53
+ caseId: string;
54
+ stage?: string;
55
+ attempt: number;
56
+ maxAttempts: number;
57
+ waitMs: number;
58
+ reason: string;
59
+ } | {
60
+ kind: "run-end";
61
+ totalCases: number;
62
+ passed: number;
63
+ failed: number;
64
+ durationMs: number;
65
+ };
66
+ export interface ProgressLogger {
67
+ emit(event: ProgressEvent): void;
68
+ }
69
+ export declare function noopProgressLogger(): ProgressLogger;
70
+ export interface StderrProgressLoggerOptions {
71
+ /** Override the underlying write target; defaults to `process.stderr.write`. */
72
+ writer?: (message: string) => void;
73
+ /** Return wall-clock in ms. Injectable for tests. */
74
+ now?: () => number;
75
+ }
76
+ /**
77
+ * Emit a one-line status update per event to stderr.
78
+ *
79
+ * Format is deliberately boring: `[cclaw eval] <message>` so users can grep
80
+ * for the prefix in combined logs. Costs are rendered with up to 4 decimals
81
+ * so sub-cent runs still show a non-zero value.
82
+ */
83
+ export declare function createStderrProgressLogger(opts?: StderrProgressLoggerOptions): ProgressLogger;
@@ -0,0 +1,59 @@
1
/** Shared logger whose `emit` does nothing. */
const NOOP_LOGGER = {
    emit() {
        // Intentionally empty: events are discarded.
    }
};
/**
 * Return a progress logger that discards every event.
 *
 * @returns The shared no-op logger; every call returns the same object.
 */
export function noopProgressLogger() {
    const logger = NOOP_LOGGER;
    return logger;
}
5
/**
 * Create a logger that writes one `[cclaw eval] <message>` line per event.
 *
 * The line text comes from `formatEvent`. Output goes to `opts.writer` when
 * provided, otherwise to `process.stderr.write`.
 *
 * @param opts Optional settings; only `writer` is read here.
 * @returns Logger whose `emit` writes the formatted line.
 */
export function createStderrProgressLogger(opts = {}) {
    const toStderr = (text) => process.stderr.write(text);
    const sink = opts.writer ?? toStderr;
    const logger = {
        emit(event) {
            const line = `[cclaw eval] ${formatEvent(event)}\n`;
            sink(line);
        }
    };
    return logger;
}
20
/**
 * Format a millisecond duration for log output.
 *
 * - below 1000 ms: `<ms>ms`
 * - below one minute: seconds with one decimal, e.g. `12.3s`
 * - otherwise: whole minutes and zero-padded seconds, e.g. `2m05s`
 *
 * @param ms Duration in milliseconds.
 * @returns Human-readable duration string.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    // From 59950 ms upward the one-decimal seconds form rounds to "60.0s",
    // so those values take the minutes form instead.
    if (ms < 59950)
        return `${(ms / 1000).toFixed(1)}s`;
    // Round to whole seconds before splitting into minutes and seconds, so
    // a remainder that rounds up carries into the minutes (no "1m60s").
    const totalSeconds = Math.round(ms / 1000);
    const minutes = Math.floor(totalSeconds / 60);
    const seconds = totalSeconds % 60;
    return `${minutes}m${seconds.toString().padStart(2, "0")}s`;
}
30
/**
 * Format an optional USD cost as a log suffix.
 *
 * @param usd Cost in USD, or `undefined` when unknown.
 * @returns An empty string when the cost is `undefined` or not positive;
 *   otherwise a leading space, `$`, and the cost with four decimals.
 */
function formatCost(usd) {
    const hidden = usd === undefined || usd <= 0;
    return hidden ? "" : ` $${usd.toFixed(4)}`;
}
35
/**
 * Render a progress event as the message part of a log line.
 *
 * Durations go through `formatDuration` and costs through `formatCost`.
 *
 * @param event Progress event, discriminated by `kind`.
 * @returns The message text, or `undefined` for an unrecognized `kind`.
 */
function formatEvent(event) {
    const { kind } = event;
    if (kind === "run-start") {
        return `start mode=${event.mode} cases=${event.totalCases}`;
    }
    if (kind === "case-start") {
        return `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ...`;
    }
    if (kind === "case-end") {
        const verdict = event.passed ? "PASS" : "FAIL";
        return (`[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ${verdict} ` +
            `in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`);
    }
    if (kind === "stage-start") {
        return ` stage ${event.stage} ...`;
    }
    if (kind === "stage-end") {
        const verdict = event.passed ? "ok" : "fail";
        return ` stage ${event.stage} ${verdict} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
    }
    if (kind === "retry") {
        const scope = event.stage ? `/${event.stage}` : "";
        return (` retry ${event.caseId}${scope} ` +
            `attempt ${event.attempt}/${event.maxAttempts} in ${formatDuration(event.waitMs)} (${event.reason})`);
    }
    if (kind === "run-end") {
        return (`done pass=${event.passed} fail=${event.failed} total=${event.totalCases} ` +
            `in ${formatDuration(event.durationMs)}`);
    }
    return undefined;
}