cclaw-cli 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ const NOOP_LOGGER = { emit() { } };
2
+ export function noopProgressLogger() {
3
+ return NOOP_LOGGER;
4
+ }
/**
 * Build a progress logger that prints one line per event.
 *
 * Every line is the constant prefix `[cclaw eval] `, then the text that
 * `formatEvent` produces for the event, then a newline. The fixed prefix
 * keeps the lines easy to grep out of combined logs.
 *
 * @param opts Optional settings. `opts.writer` is called with each finished
 *   line; when it is missing (null or undefined) lines are written to
 *   `process.stderr`.
 * @returns An object whose `emit(event)` writes the line for `event`.
 */
export function createStderrProgressLogger(opts = {}) {
    const toStderr = (line) => process.stderr.write(line);
    const sink = opts.writer ?? toStderr;
    const logger = {
        emit(event) {
            const text = formatEvent(event);
            sink(`[cclaw eval] ${text}\n`);
        }
    };
    return logger;
}
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * - below one second: whole milliseconds, e.g. `250ms`
 * - below one minute: seconds with one decimal, e.g. `12.3s`
 * - otherwise: minutes plus zero-padded whole seconds, e.g. `2m05s`
 *
 * Each branch boundary is chosen so that rounding can never produce an
 * out-of-range field such as `1000ms`, `60.0s` or `1m60s`.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
    // 999.5 and above would round up to "1000ms"; let the seconds branch take it.
    if (ms < 999.5)
        return `${Math.round(ms)}ms`;
    const seconds = ms / 1000;
    // 59.95 and above would print as "60.0s"; let the minutes branch take it.
    if (seconds < 59.95)
        return `${seconds.toFixed(1)}s`;
    // Round the total first and split afterwards, so a remainder such as
    // 59.6s carries into the minutes instead of being printed as "60s".
    const totalSeconds = Math.round(seconds);
    const minutes = Math.floor(totalSeconds / 60);
    const remainder = totalSeconds % 60;
    return `${minutes}m${String(remainder).padStart(2, "0")}s`;
}
/**
 * Render a USD cost as a suffix for a progress line.
 *
 * @param usd Cost in US dollars; may be undefined when no cost was recorded.
 * @returns An empty string when there is nothing meaningful to show (not a
 *   number, not finite, zero or negative). Otherwise a leading space followed
 *   by the amount with four decimals, e.g. ` $0.0123`. A positive amount too
 *   small to survive four-decimal rounding is shown as ` <$0.0001` so that a
 *   run which did cost something never reads as `$0.0000`.
 */
function formatCost(usd) {
    if (typeof usd !== "number" || !Number.isFinite(usd) || usd <= 0)
        return "";
    const fixed = usd.toFixed(4);
    // Compare the rendered text rather than a numeric threshold so the check
    // agrees exactly with how toFixed rounded the value.
    if (fixed === "0.0000")
        return " <$0.0001";
    return ` $${fixed}`;
}
/**
 * Turn a progress event into the message part of a progress line.
 *
 * The event is dispatched on `event.kind`. Durations are rendered with
 * `formatDuration` and costs with `formatCost`, which contributes an empty
 * string when there is no cost to show.
 *
 * @param event Progress event; the fields read depend on `event.kind`.
 * @returns The message text, without the logger prefix or trailing newline.
 *   An unrecognised `kind` yields a readable placeholder instead of
 *   `undefined`, so the logger never prints the literal text "undefined".
 */
function formatEvent(event) {
    switch (event.kind) {
        case "run-start":
            return `start mode=${event.mode} cases=${event.totalCases}`;
        case "case-start":
            return `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ...`;
        case "case-end": {
            const status = event.passed ? "PASS" : "FAIL";
            return (`[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ${status} ` +
                `in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`);
        }
        case "stage-start":
            return ` stage ${event.stage} ...`;
        case "stage-end": {
            const status = event.passed ? "ok" : "fail";
            return ` stage ${event.stage} ${status} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
        }
        case "retry":
            // The stage is optional on retry events; omit the "/stage" part when absent.
            return (` retry ${event.caseId}${event.stage ? `/${event.stage}` : ""} ` +
                `attempt ${event.attempt}/${event.maxAttempts} in ${formatDuration(event.waitMs)} (${event.reason})`);
        case "run-end":
            return (`done pass=${event.passed} fail=${event.failed} total=${event.totalCases} ` +
                `in ${formatDuration(event.durationMs)}`);
        default:
            // Reached only for an event kind this formatter does not know about.
            return `unknown event kind=${String(event.kind)}`;
    }
}
@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
24
24
  lines.push(`- cclaw version: ${report.cclawVersion}`);
25
25
  lines.push(`- provider: ${report.provider}`);
26
26
  lines.push(`- model: ${report.model}`);
27
- lines.push(`- tier: ${report.tier}`);
27
+ lines.push(`- mode: ${report.mode}`);
28
28
  lines.push(`- stages: ${stages}`);
29
29
  lines.push(``);
30
30
  lines.push(`## Summary`);
@@ -1,10 +1,11 @@
1
1
  import type { FlowStage } from "../types.js";
2
2
  import { type EvalLlmClient } from "./llm-client.js";
3
- import type { EvalReport, EvalTier, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
3
+ import { type ProgressLogger } from "./progress.js";
4
+ import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
4
5
  export interface RunEvalOptions {
5
6
  projectRoot: string;
6
7
  stage?: FlowStage;
7
- tier?: EvalTier;
8
+ mode?: EvalMode;
8
9
  /** When true, run only structural verifiers (Step 1). */
9
10
  schemaOnly?: boolean;
10
11
  /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
21
22
  * without hitting the network.
22
23
  */
23
24
  llmClient?: EvalLlmClient;
25
+ /**
26
+ * Optional progress logger. The CLI wires a stderr-backed logger by
27
+ * default so users see one-line updates during long runs; tests and
28
+ * programmatic callers can inject a silent (noop) logger or capture
29
+ * events for assertions. When omitted, progress is silenced.
30
+ */
31
+ progress?: ProgressLogger;
32
+ /**
33
+ * Per-run USD cap. Enforced in-memory; independent from the daily cap
34
+ * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
35
+ * invocations. Undefined means no cap.
36
+ */
37
+ maxCostUsd?: number;
38
+ /**
39
+ * Override the configured `model` (and `judgeModel`) for this run.
40
+ * Used by `cclaw eval --compare-model` to replay the same corpus
41
+ * against an alternative model without editing `config.yaml`.
42
+ */
43
+ modelOverride?: string;
24
44
  }
25
45
  export interface DryRunSummary {
26
46
  kind: "dry-run";
@@ -33,7 +53,7 @@ export interface DryRunSummary {
33
53
  stage: FlowStage;
34
54
  }>;
35
55
  };
36
- /** Tier C-only workflow corpus summary. Empty for Tier A/B planned runs. */
56
+ /** Only populated in `workflow` mode; empty for fixture / agent modes. */
37
57
  workflowCorpus: {
38
58
  total: number;
39
59
  cases: Array<{
@@ -41,7 +61,7 @@ export interface DryRunSummary {
41
61
  stages: WorkflowStageName[];
42
62
  }>;
43
63
  };
44
- plannedTier: EvalTier;
64
+ plannedMode: EvalMode;
45
65
  verifiersAvailable: {
46
66
  structural: boolean;
47
67
  rules: boolean;
@@ -52,10 +72,10 @@ export interface DryRunSummary {
52
72
  notes: string[];
53
73
  }
54
74
  /**
55
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
56
- * active), runs structural verifiers against fixture-backed cases and loads
57
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
58
- * arrive in later steps; until then cases without `fixture` are marked as
59
- * skipped rather than failing.
75
+ * Main eval runner. Dispatches between fixture-backed verification, the
76
+ * single-stage agent-with-tools loop, and the multi-stage workflow
77
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
78
+ * regression comparison. Cases without a `fixture` path in the yaml are
79
+ * marked skipped (not failed) when no LLM drafting runs.
60
80
  */
61
81
  export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
@@ -8,8 +8,9 @@ import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
8
8
  import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
9
9
  import { loadWorkflowCorpus } from "./workflow-corpus.js";
10
10
  import { loadEvalConfig } from "./config-loader.js";
11
- import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
11
+ import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
12
12
  import { createEvalClient, EvalLlmError } from "./llm-client.js";
13
+ import { noopProgressLogger } from "./progress.js";
13
14
  import { loadAllRubrics } from "./rubric-loader.js";
14
15
  import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
15
16
  import { verifyRules } from "./verifiers/rules.js";
@@ -35,23 +36,24 @@ function skeletonVerifierResult(message, details) {
35
36
  /**
36
37
  * --schema-only narrows to structural. --rules opens up rules + traceability
37
38
  * on top of structural (traceability is a rule-family verifier even though
38
- * it lives in its own module). --judge opens up the LLM judge and, for
39
- * Tier A, the single-shot agent-under-test. --schema-only always wins so
40
- * the LLM-free PR gate never pays for tokens even if stale flags collide.
39
+ * it lives in its own module). --judge opens up the LLM judge and, in
40
+ * `agent` / `workflow` modes, the agent-under-test loop. --schema-only always
41
+ * wins so the LLM-free PR gate never pays for tokens even if stale flags
42
+ * collide.
41
43
  */
42
44
  function resolveRunFlags(options) {
43
45
  const rulesRequested = options.rules === true;
44
46
  const schemaOnly = options.schemaOnly === true;
45
47
  const judgeRequested = options.judge === true;
46
- const tier = options.tier ?? "A";
48
+ const mode = options.mode ?? "fixture";
47
49
  const runJudge = judgeRequested && !schemaOnly;
48
- // Tier C always needs the agent loop (no fixture fallback for workflows),
49
- // so we still require an LLM client but we do NOT require --judge on the
50
- // CLI to produce a workflow run. The judge piece itself stays gated by
51
- // `runJudge` so consistency-only runs are cheap and deterministic.
52
- const runAgent = tier === "C"
50
+ // `workflow` always needs the agent loop (no fixture fallback), so we still
51
+ // require an LLM client but do NOT require --judge on the CLI to produce a
52
+ // workflow run. The judge piece stays gated by `runJudge` so consistency-
53
+ // only runs remain cheap and deterministic.
54
+ const runAgent = mode === "workflow"
53
55
  ? !schemaOnly
54
- : runJudge && (tier === "A" || tier === "B");
56
+ : runJudge && (mode === "fixture" || mode === "agent");
55
57
  return {
56
58
  runStructural: true,
57
59
  runRules: rulesRequested && !schemaOnly,
@@ -103,7 +105,7 @@ function stageJudgeHint(step) {
103
105
  return hint;
104
106
  }
105
107
  async function runWorkflowCase(ctx) {
106
- const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
108
+ const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
107
109
  const started = Date.now();
108
110
  const verifierResults = [];
109
111
  let caseCostUsd = 0;
@@ -115,14 +117,14 @@ async function runWorkflowCase(ctx) {
115
117
  id: "workflow:agent:disabled",
116
118
  ok: false,
117
119
  score: 0,
118
- message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
120
+ message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
119
121
  "Re-run with credentials to execute the workflow.",
120
122
  details: { stages: workflow.stages.map((s) => s.name) }
121
123
  });
122
124
  return {
123
125
  caseId: workflow.id,
124
126
  stage: lastStage,
125
- tier: plannedTier,
127
+ mode: plannedMode,
126
128
  passed: false,
127
129
  durationMs: Date.now() - started,
128
130
  verifierResults
@@ -134,11 +136,28 @@ async function runWorkflowCase(ctx) {
134
136
  workflow,
135
137
  config,
136
138
  projectRoot,
137
- client
139
+ client,
140
+ onStageStart: (stage) => progress.emit({
141
+ kind: "stage-start",
142
+ caseId: workflow.id,
143
+ stage,
144
+ index: caseIndex,
145
+ total: totalCases
146
+ }),
147
+ onStageEnd: (stage, stageResult) => progress.emit({
148
+ kind: "stage-end",
149
+ caseId: workflow.id,
150
+ stage,
151
+ index: caseIndex,
152
+ total: totalCases,
153
+ passed: true,
154
+ durationMs: stageResult.durationMs,
155
+ ...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
156
+ })
138
157
  });
139
158
  }
140
159
  catch (err) {
141
- if (err instanceof DailyCostCapExceededError)
160
+ if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
142
161
  throw err;
143
162
  const retryable = err instanceof EvalLlmError ? err.retryable : false;
144
163
  const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
@@ -156,7 +175,7 @@ async function runWorkflowCase(ctx) {
156
175
  return {
157
176
  caseId: workflow.id,
158
177
  stage: lastStage,
159
- tier: plannedTier,
178
+ mode: plannedMode,
160
179
  passed: false,
161
180
  durationMs: Date.now() - started,
162
181
  verifierResults
@@ -230,7 +249,7 @@ async function runWorkflowCase(ctx) {
230
249
  }
231
250
  }
232
251
  catch (err) {
233
- if (err instanceof DailyCostCapExceededError)
252
+ if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
234
253
  throw err;
235
254
  const retryable = err instanceof EvalLlmError ? err.retryable : false;
236
255
  verifierResults.push({
@@ -262,7 +281,7 @@ async function runWorkflowCase(ctx) {
262
281
  return {
263
282
  caseId: workflow.id,
264
283
  stage: lastStage,
265
- tier: plannedTier,
284
+ mode: plannedMode,
266
285
  passed: allOk,
267
286
  durationMs: Date.now() - started,
268
287
  costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
@@ -271,7 +290,7 @@ async function runWorkflowCase(ctx) {
271
290
  };
272
291
  }
273
292
  async function runCase(ctx) {
274
- const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
293
+ const { projectRoot, caseEntry, plannedMode, flags, config, client, costGuard, rubrics } = ctx;
275
294
  const started = Date.now();
276
295
  const verifierResults = [];
277
296
  const expected = caseEntry.expected;
@@ -283,7 +302,7 @@ async function runCase(ctx) {
283
302
  const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
284
303
  let artifact;
285
304
  if (needsArtifact) {
286
- if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
305
+ if (flags.runAgent && judgeRequested && client && plannedMode === "fixture") {
287
306
  try {
288
307
  const produced = await runSingleShot({
289
308
  caseEntry,
@@ -309,7 +328,7 @@ async function runCase(ctx) {
309
328
  });
310
329
  }
311
330
  catch (err) {
312
- if (err instanceof DailyCostCapExceededError)
331
+ if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
313
332
  throw err;
314
333
  const retryable = err instanceof EvalLlmError ? err.retryable : false;
315
334
  verifierResults.push({
@@ -322,7 +341,7 @@ async function runCase(ctx) {
322
341
  });
323
342
  }
324
343
  }
325
- else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
344
+ else if (flags.runAgent && judgeRequested && client && plannedMode === "agent") {
326
345
  try {
327
346
  const produced = await runWithTools({
328
347
  caseEntry,
@@ -351,7 +370,7 @@ async function runCase(ctx) {
351
370
  });
352
371
  }
353
372
  catch (err) {
354
- if (err instanceof DailyCostCapExceededError)
373
+ if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
355
374
  throw err;
356
375
  const retryable = err instanceof EvalLlmError ? err.retryable : false;
357
376
  const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
@@ -443,7 +462,7 @@ async function runCase(ctx) {
443
462
  verifierResults.push(...judgeVerifiers);
444
463
  }
445
464
  catch (err) {
446
- if (err instanceof DailyCostCapExceededError)
465
+ if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
447
466
  throw err;
448
467
  const retryable = err instanceof EvalLlmError ? err.retryable : false;
449
468
  verifierResults.push({
@@ -464,7 +483,7 @@ async function runCase(ctx) {
464
483
  return {
465
484
  caseId: caseEntry.id,
466
485
  stage: caseEntry.stage,
467
- tier: plannedTier,
486
+ mode: plannedMode,
468
487
  passed: allOk,
469
488
  durationMs: Date.now() - started,
470
489
  costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
@@ -506,30 +525,37 @@ function stagesInResults(caseResults) {
506
525
  return FLOW_STAGES.filter((s) => set.has(s));
507
526
  }
508
527
  /**
509
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
510
- * active), runs structural verifiers against fixture-backed cases and loads
511
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
512
- * arrive in later steps; until then cases without `fixture` are marked as
513
- * skipped rather than failing.
528
+ * Main eval runner. Dispatches between fixture-backed verification, the
529
+ * single-stage agent-with-tools loop, and the multi-stage workflow
530
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
531
+ * regression comparison. Cases without a `fixture` path in the yaml are
532
+ * marked skipped (not failed) when no LLM drafting runs.
514
533
  */
515
534
  export async function runEval(options) {
516
- const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
517
- const plannedTier = options.tier ?? config.defaultTier;
518
- const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
519
- const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
535
+ const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
536
+ const config = options.modelOverride
537
+ ? {
538
+ ...baseConfig,
539
+ model: options.modelOverride,
540
+ judgeModel: options.modelOverride
541
+ }
542
+ : baseConfig;
543
+ const plannedMode = options.mode ?? config.defaultMode;
544
+ const corpus = plannedMode === "workflow" ? [] : await loadCorpus(options.projectRoot, options.stage);
545
+ const workflowCorpus = plannedMode === "workflow" ? await loadWorkflowCorpus(options.projectRoot) : [];
520
546
  const notes = [];
521
- if (plannedTier !== "C" && corpus.length === 0) {
547
+ if (plannedMode !== "workflow" && corpus.length === 0) {
522
548
  notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
523
549
  }
524
- if (plannedTier === "C" && workflowCorpus.length === 0) {
525
- notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
550
+ if (plannedMode === "workflow" && workflowCorpus.length === 0) {
551
+ notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
526
552
  }
527
553
  const flags = resolveRunFlags(options);
528
554
  if (flags.runJudge && !config.apiKey && !options.llmClient) {
529
555
  notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
530
556
  }
531
- if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
532
- notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
557
+ if (plannedMode === "workflow" && !config.apiKey && !options.llmClient) {
558
+ notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
533
559
  }
534
560
  if (options.dryRun === true) {
535
561
  const summary = {
@@ -547,23 +573,34 @@ export async function runEval(options) {
547
573
  stages: item.stages.map((s) => s.name)
548
574
  }))
549
575
  },
550
- plannedTier,
576
+ plannedMode,
551
577
  verifiersAvailable: {
552
578
  structural: flags.runStructural,
553
579
  rules: flags.runRules,
554
580
  judge: flags.runJudge,
555
581
  workflow: flags.runAgent,
556
- consistency: plannedTier === "C"
582
+ consistency: plannedMode === "workflow"
557
583
  },
558
584
  notes
559
585
  };
560
586
  return summary;
561
587
  }
562
- const costGuard = createCostGuard(options.projectRoot, config);
588
+ const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
589
+ const progress = options.progress ?? noopProgressLogger();
563
590
  let wrappedClient;
564
- const clientNeeded = flags.runJudge || plannedTier === "C";
591
+ const clientNeeded = flags.runJudge || plannedMode === "workflow";
565
592
  if (clientNeeded) {
566
- const base = options.llmClient ?? createEvalClient(config);
593
+ const base = options.llmClient ??
594
+ createEvalClient(config, {
595
+ onRetry: (event) => progress.emit({
596
+ kind: "retry",
597
+ caseId: "llm",
598
+ attempt: event.attempt,
599
+ maxAttempts: event.maxAttempts,
600
+ waitMs: event.waitMs,
601
+ reason: event.error.message
602
+ })
603
+ });
567
604
  wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
568
605
  }
569
606
  const rubricsNeeded = flags.runJudge;
@@ -572,32 +609,80 @@ export async function runEval(options) {
572
609
  : new Map();
573
610
  const now = new Date().toISOString();
574
611
  const caseResults = [];
575
- if (plannedTier === "C") {
576
- for (const wf of workflowCorpus) {
577
- caseResults.push(await runWorkflowCase({
612
+ const totalPlannedCases = plannedMode === "workflow" ? workflowCorpus.length : corpus.length;
613
+ const runStarted = Date.now();
614
+ progress.emit({
615
+ kind: "run-start",
616
+ mode: plannedMode,
617
+ totalCases: totalPlannedCases
618
+ });
619
+ if (plannedMode === "workflow") {
620
+ for (let i = 0; i < workflowCorpus.length; i += 1) {
621
+ const wf = workflowCorpus[i];
622
+ progress.emit({
623
+ kind: "case-start",
624
+ caseId: wf.id,
625
+ stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
626
+ index: i + 1,
627
+ total: workflowCorpus.length
628
+ });
629
+ const result = await runWorkflowCase({
578
630
  projectRoot: options.projectRoot,
579
631
  workflow: wf,
580
- plannedTier,
632
+ plannedMode,
581
633
  flags,
582
634
  config,
583
635
  client: wrappedClient,
584
636
  costGuard,
585
- rubrics
586
- }));
637
+ rubrics,
638
+ progress,
639
+ caseIndex: i + 1,
640
+ totalCases: workflowCorpus.length
641
+ });
642
+ progress.emit({
643
+ kind: "case-end",
644
+ caseId: wf.id,
645
+ stage: result.stage,
646
+ index: i + 1,
647
+ total: workflowCorpus.length,
648
+ passed: result.passed,
649
+ durationMs: result.durationMs,
650
+ ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
651
+ });
652
+ caseResults.push(result);
587
653
  }
588
654
  }
589
655
  else {
590
- for (const item of corpus) {
591
- caseResults.push(await runCase({
656
+ for (let i = 0; i < corpus.length; i += 1) {
657
+ const item = corpus[i];
658
+ progress.emit({
659
+ kind: "case-start",
660
+ caseId: item.id,
661
+ stage: item.stage,
662
+ index: i + 1,
663
+ total: corpus.length
664
+ });
665
+ const result = await runCase({
592
666
  projectRoot: options.projectRoot,
593
667
  caseEntry: item,
594
- plannedTier,
668
+ plannedMode,
595
669
  flags,
596
670
  config,
597
671
  client: wrappedClient,
598
672
  costGuard,
599
673
  rubrics
600
- }));
674
+ });
675
+ progress.emit({
676
+ kind: "case-end",
677
+ caseId: item.id,
678
+ stage: item.stage,
679
+ index: i + 1,
680
+ total: corpus.length,
681
+ passed: result.passed,
682
+ durationMs: result.durationMs,
683
+ ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
684
+ });
685
+ caseResults.push(result);
601
686
  }
602
687
  }
603
688
  const stages = stagesInResults(caseResults);
@@ -610,7 +695,7 @@ export async function runEval(options) {
610
695
  cclawVersion: CCLAW_VERSION,
611
696
  provider: config.provider,
612
697
  model: config.model,
613
- tier: plannedTier,
698
+ mode: plannedMode,
614
699
  stages,
615
700
  cases: caseResults,
616
701
  summary
@@ -618,5 +703,12 @@ export async function runEval(options) {
618
703
  const baselineDelta = compareAgainstBaselines(report, baselines);
619
704
  if (baselineDelta)
620
705
  report.baselineDelta = baselineDelta;
706
+ progress.emit({
707
+ kind: "run-end",
708
+ totalCases: summary.totalCases,
709
+ passed: summary.passed,
710
+ failed: summary.failed,
711
+ durationMs: Date.now() - runStarted
712
+ });
621
713
  return report;
622
714
  }
@@ -0,0 +1,41 @@
1
+ export declare const RUNS_DIR = "runs";
2
+ export interface EvalRunStatus { // Status record for one eval run; the listRuns / isRunAlive comments in this file tie it to a per-run `run.json` — NOTE(review): confirm
3
+ id: string; // run id, the same kind of id that runDir / runLogPath / runStatusPath accept
4
+ startedAt: string; // start time as a string — presumably an ISO-8601 timestamp; TODO confirm
5
+ endedAt?: string; // optional — presumably absent while the run is still in progress; TODO confirm
6
+ pid: number; // process id; the isRunAlive comment says it is probed with kill(pid, 0)
7
+ argv: string[]; // presumably the command-line arguments of the run — TODO confirm
8
+ cwd: string; // presumably the working directory the run was started from — TODO confirm
9
+ exitCode?: number; // optional — presumably filled in once the process has exited; TODO confirm
10
+ state: "running" | "succeeded" | "failed"; // "running" can be stale after a crash, per the isRunAlive comment
11
+ }
12
+ export declare function runsRoot(projectRoot: string): string;
13
+ export declare function runDir(projectRoot: string, id: string): string;
14
+ export declare function runLogPath(projectRoot: string, id: string): string;
15
+ export declare function runStatusPath(projectRoot: string, id: string): string;
16
+ /**
17
+ * Generate a short, lexicographically-sortable run id. The timestamp
18
+ * prefix means `ls -1` already returns the runs in chronological order
19
+ * which keeps the `runs list` subcommand trivial.
20
+ */
21
+ export declare function generateRunId(now?: Date): string;
22
+ export declare function ensureRunDir(projectRoot: string, id: string): Promise<string>;
23
+ export declare function writeRunStatus(projectRoot: string, status: EvalRunStatus): Promise<void>;
24
+ export declare function readRunStatus(projectRoot: string, id: string): Promise<EvalRunStatus | null>;
25
+ /**
26
+ * List run ids under `.cclaw/evals/runs/`, most recent first. Directory
27
+ * entries that don't contain a `run.json` are skipped (half-initialized
28
+ * or manually mkdir'd folders).
29
+ */
30
+ export declare function listRuns(projectRoot: string): Promise<EvalRunStatus[]>;
31
+ /**
32
+ * Resolve `"latest"` (or undefined) to the most recent run id.
33
+ * Returns `null` when there are no runs.
34
+ */
35
+ export declare function resolveRunId(projectRoot: string, hint: string | undefined): Promise<string | null>;
36
+ /**
37
+ * Cheap liveness probe for an EvalRunStatus. A `run.json` can be stale
38
+ * (process crashed mid-commit), so we double-check with `kill(pid, 0)`
39
+ * before trusting the `state: "running"` field.
40
+ */
41
+ export declare function isRunAlive(status: EvalRunStatus): boolean;