npm - cclaw-cli - Versions diffs - 0.26.0 → 0.28.0 - Mend

cclaw-cli 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/dist/cli.d.ts +10 -2
package/dist/cli.js +388 -18
package/dist/content/eval-scaffold.d.ts +2 -2
package/dist/content/eval-scaffold.js +7 -6
package/dist/eval/agents/single-shot.d.ts +1 -1
package/dist/eval/agents/single-shot.js +4 -4
package/dist/eval/agents/with-tools.d.ts +14 -1
package/dist/eval/agents/with-tools.js +22 -16
package/dist/eval/agents/workflow.d.ts +31 -0
package/dist/eval/agents/workflow.js +135 -0
package/dist/eval/baseline.d.ts +24 -0
package/dist/eval/baseline.js +75 -2
package/dist/eval/config-loader.js +52 -19
package/dist/eval/cost-guard.d.ts +22 -0
package/dist/eval/cost-guard.js +38 -1
package/dist/eval/diff.d.ts +64 -0
package/dist/eval/diff.js +323 -0
package/dist/eval/llm-client.d.ts +13 -2
package/dist/eval/llm-client.js +8 -1
package/dist/eval/mode.d.ts +28 -0
package/dist/eval/mode.js +61 -0
package/dist/eval/progress.d.ts +83 -0
package/dist/eval/progress.js +59 -0
package/dist/eval/report.js +36 -1
package/dist/eval/runner.d.ts +37 -8
package/dist/eval/runner.js +351 -42
package/dist/eval/runs.d.ts +41 -0
package/dist/eval/runs.js +114 -0
package/dist/eval/sandbox.js +1 -1
package/dist/eval/tools/index.js +1 -1
package/dist/eval/tools/types.d.ts +1 -1
package/dist/eval/types.d.ts +158 -15
package/dist/eval/types.js +39 -7
package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
package/dist/eval/verifiers/workflow-consistency.js +225 -0
package/dist/eval/workflow-corpus.d.ts +7 -0
package/dist/eval/workflow-corpus.js +207 -0
package/package.json +1 -1

package/dist/eval/report.js CHANGED

@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
     lines.push(`- cclaw version: ${report.cclawVersion}`);
     lines.push(`- provider: ${report.provider}`);
     lines.push(`- model: ${report.model}`);
-    lines.push(`- tier: ${report.tier}`);
+    lines.push(`- mode: ${report.mode}`);
     lines.push(`- stages: ${stages}`);
     lines.push(``);
     lines.push(`## Summary`);
@@ -120,6 +120,41 @@ export function formatMarkdownReport(report) {
         }
         lines.push(``);
     }
+    const workflowCases = report.cases.filter((item) => !!item.workflow);
+    if (workflowCases.length > 0) {
+        lines.push(`## Workflow stages`);
+        lines.push(``);
+        lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
+        lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
+        for (const item of workflowCases) {
+            const wf = item.workflow;
+            for (const stage of wf.stages) {
+                const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
+                const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
+                lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
+                    `${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
+            }
+        }
+        lines.push(``);
+    }
+    const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
+    if (consistencyCases.length > 0) {
+        lines.push(`## Consistency checks`);
+        lines.push(``);
+        lines.push(`| case id | check id | ok | message |`);
+        lines.push(`| --- | --- | --- | --- |`);
+        for (const item of consistencyCases) {
+            for (const verifier of item.verifierResults) {
+                if (verifier.kind !== "consistency")
+                    continue;
+                const message = verifier.message
+                    ? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
+                    : "-";
+                lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
+            }
+        }
+        lines.push(``);
+    }
     lines.push(`## Verifier details`);
     lines.push(``);
     for (const item of report.cases) {

package/dist/eval/runner.d.ts CHANGED

@@ -1,10 +1,11 @@
 import type { FlowStage } from "../types.js";
 import { type EvalLlmClient } from "./llm-client.js";
-import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
+import { type ProgressLogger } from "./progress.js";
+import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
 export interface RunEvalOptions {
     projectRoot: string;
     stage?: FlowStage;
-    tier?: EvalTier;
+    mode?: EvalMode;
     /** When true, run only structural verifiers (Step 1). */
     schemaOnly?: boolean;
     /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
      * without hitting the network.
      */
     llmClient?: EvalLlmClient;
+    /**
+     * Optional progress logger. The CLI wires a stderr-backed logger by
+     * default so users see one-line updates during long runs; tests and
+     * programmatic callers can inject a silent (noop) logger or capture
+     * events for assertions. When omitted, progress is silenced.
+     */
+    progress?: ProgressLogger;
+    /**
+     * Per-run USD cap. Enforced in-memory; independent from the daily cap
+     * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
+     * invocations. Undefined means no cap.
+     */
+    maxCostUsd?: number;
+    /**
+     * Override the configured `model` (and `judgeModel`) for this run.
+     * Used by `cclaw eval --compare-model` to replay the same corpus
+     * against an alternative model without editing `config.yaml`.
+     */
+    modelOverride?: string;
 }
 export interface DryRunSummary {
     kind: "dry-run";
@@ -33,20 +53,29 @@ export interface DryRunSummary {
             stage: FlowStage;
         }>;
     };
-    plannedTier: EvalTier;
+    /** Only populated in `workflow` mode; empty for fixture / agent modes. */
+    workflowCorpus: {
+        total: number;
+        cases: Array<{
+            id: string;
+            stages: WorkflowStageName[];
+        }>;
+    };
+    plannedMode: EvalMode;
     verifiersAvailable: {
         structural: boolean;
         rules: boolean;
         judge: boolean;
         workflow: boolean;
+        consistency: boolean;
     };
     notes: string[];
 }
 /**
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
- * active), runs structural verifiers against fixture-backed cases and loads
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
- * arrive in later steps; until then cases without `fixture` are marked as
- * skipped rather than failing.
+ * Main eval runner. Dispatches between fixture-backed verification, the
+ * single-stage agent-with-tools loop, and the multi-stage workflow
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
+ * regression comparison. Cases without a `fixture` path in the yaml are
+ * marked skipped (not failed) when no LLM drafting runs.
  */
 export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;

package/dist/eval/runner.js CHANGED

@@ -3,16 +3,20 @@ import { CCLAW_VERSION } from "../constants.js";
 import { FLOW_STAGES } from "../types.js";
 import { runSingleShot } from "./agents/single-shot.js";
 import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
+import { runWorkflow } from "./agents/workflow.js";
 import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
+import { loadWorkflowCorpus } from "./workflow-corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
-import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
+import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
 import { createEvalClient, EvalLlmError } from "./llm-client.js";
+import { noopProgressLogger } from "./progress.js";
 import { loadAllRubrics } from "./rubric-loader.js";
 import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
 import { verifyRules } from "./verifiers/rules.js";
 import { verifyStructural } from "./verifiers/structural.js";
 import { verifyTraceability } from "./verifiers/traceability.js";
+import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
 function groupByStage(cases) {
     return cases.reduce((acc, item) => {
         acc[item.stage] = (acc[item.stage] ?? 0) + 1;
@@ -32,17 +36,24 @@ function skeletonVerifierResult(message, details) {
 /**
  * --schema-only narrows to structural. --rules opens up rules + traceability
  * on top of structural (traceability is a rule-family verifier even though
- * it lives in its own module). --judge opens up the LLM judge and, for
- * Tier A, the single-shot agent-under-test. --schema-only always wins so
- * the LLM-free PR gate never pays for tokens even if stale flags collide.
+ * it lives in its own module). --judge opens up the LLM judge and, in
+ * `agent` / `workflow` modes, the agent-under-test loop. --schema-only always
+ * wins so the LLM-free PR gate never pays for tokens even if stale flags
+ * collide.
  */
 function resolveRunFlags(options) {
     const rulesRequested = options.rules === true;
     const schemaOnly = options.schemaOnly === true;
     const judgeRequested = options.judge === true;
-    const tier = options.tier ?? "A";
+    const mode = options.mode ?? "fixture";
     const runJudge = judgeRequested && !schemaOnly;
-    const runAgent = runJudge && (tier === "A" || tier === "B");
+    // `workflow` always needs the agent loop (no fixture fallback), so we still
+    // require an LLM client but do NOT require --judge on the CLI to produce a
+    // workflow run. The judge piece stays gated by `runJudge` so consistency-
+    // only runs remain cheap and deterministic.
+    const runAgent = mode === "workflow"
+        ? !schemaOnly
+        : runJudge && (mode === "fixture" || mode === "agent");
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
@@ -83,8 +94,203 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
         return undefined;
     }
 }
+function stageJudgeHint(step) {
+    const hint = {};
+    if (step.rubric)
+        hint.rubric = step.rubric;
+    if (step.requiredChecks)
+        hint.requiredChecks = step.requiredChecks;
+    if (step.minimumScores)
+        hint.minimumScores = step.minimumScores;
+    return hint;
+}
+async function runWorkflowCase(ctx) {
+    const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
+    const started = Date.now();
+    const verifierResults = [];
+    let caseCostUsd = 0;
+    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
+        "plan";
+    if (!flags.runAgent || !client) {
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:disabled",
+            ok: false,
+            score: 0,
+            message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
+                "Re-run with credentials to execute the workflow.",
+            details: { stages: workflow.stages.map((s) => s.name) }
+        });
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            mode: plannedMode,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    let workflowResult;
+    try {
+        workflowResult = await runWorkflow({
+            workflow,
+            config,
+            projectRoot,
+            client,
+            onStageStart: (stage) => progress.emit({
+                kind: "stage-start",
+                caseId: workflow.id,
+                stage,
+                index: caseIndex,
+                total: totalCases
+            }),
+            onStageEnd: (stage, stageResult) => progress.emit({
+                kind: "stage-end",
+                caseId: workflow.id,
+                stage,
+                index: caseIndex,
+                total: totalCases,
+                passed: true,
+                durationMs: stageResult.durationMs,
+                ...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
+            })
+        });
+    }
+    catch (err) {
+        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
+            throw err;
+        const retryable = err instanceof EvalLlmError ? err.retryable : false;
+        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:error",
+            ok: false,
+            score: 0,
+            message: err instanceof Error ? err.message : String(err),
+            details: {
+                retryable,
+                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
+            }
+        });
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            mode: plannedMode,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    caseCostUsd += workflowResult.totalUsageUsd;
+    const stageResults = [...workflowResult.stages];
+    verifierResults.push({
+        kind: "workflow",
+        id: "workflow:agent",
+        ok: true,
+        score: 1,
+        message: `workflow ran ${stageResults.length} stage(s) in ` +
+            `${workflowResult.totalDurationMs}ms ` +
+            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
+        details: {
+            stages: stageResults.map((s) => ({
+                name: s.stage,
+                durationMs: s.durationMs,
+                usageUsd: s.usageUsd,
+                turns: s.toolUse.turns,
+                calls: s.toolUse.calls
+            }))
+        }
+    });
+    let allJudgeOk = true;
+    if (flags.runJudge) {
+        for (let i = 0; i < workflow.stages.length; i += 1) {
+            const step = workflow.stages[i];
+            const stageResult = stageResults[i];
+            const rubric = rubrics.get(step.name);
+            if (!rubric) {
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:rubric:missing:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
+                    details: { stage: step.name }
+                });
+                allJudgeOk = false;
+                stageResult.judgeOk = false;
+                continue;
+            }
+            const hint = stageJudgeHint(step);
+            try {
+                const invocation = await runJudge({
+                    artifact: stageResult.artifact,
+                    rubric,
+                    config,
+                    client,
+                    caseHint: hint
+                });
+                caseCostUsd += invocation.usageUsd;
+                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
+                const medians = {};
+                for (const agg of invocation.aggregates) {
+                    medians[agg.checkId] = agg.median;
+                }
+                stageResult.judgeMedians = medians;
+                const stageOk = judgeVerifiers.every((v) => v.ok);
+                stageResult.judgeOk = stageOk;
+                if (!stageOk)
+                    allJudgeOk = false;
+                for (const v of judgeVerifiers) {
+                    verifierResults.push({
+                        ...v,
+                        id: `${v.id}:${step.name}`,
+                        details: { ...(v.details ?? {}), stage: step.name }
+                    });
+                }
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:invocation:error:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: { retryable, rubricId: rubric.id, stage: step.name }
+                });
+                stageResult.judgeOk = false;
+                allJudgeOk = false;
+            }
+        }
+    }
+    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
+    verifierResults.push(...consistencyResults);
+    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
+    const allOk = nonSkipped.length === 0
+        ? verifierResults.every((r) => r.ok)
+        : nonSkipped.every((r) => r.ok);
+    const workflowSummary = {
+        caseId: workflow.id,
+        stages: stageResults,
+        totalUsageUsd: workflowResult.totalUsageUsd,
+        totalDurationMs: workflowResult.totalDurationMs,
+        allJudgeOk: flags.runJudge ? allJudgeOk : true
+    };
+    return {
+        caseId: workflow.id,
+        stage: lastStage,
+        mode: plannedMode,
+        passed: allOk,
+        durationMs: Date.now() - started,
+        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
+        verifierResults,
+        workflow: workflowSummary
+    };
+}
 async function runCase(ctx) {
-    const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
+    const { projectRoot, caseEntry, plannedMode, flags, config, client, costGuard, rubrics } = ctx;
     const started = Date.now();
     const verifierResults = [];
     const expected = caseEntry.expected;
@@ -96,7 +302,7 @@ async function runCase(ctx) {
     const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
     let artifact;
     if (needsArtifact) {
-        if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
+        if (flags.runAgent && judgeRequested && client && plannedMode === "fixture") {
             try {
                 const produced = await runSingleShot({
                     caseEntry,
@@ -122,7 +328,7 @@ async function runCase(ctx) {
                 });
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 verifierResults.push({
@@ -135,7 +341,7 @@ async function runCase(ctx) {
                 });
             }
         }
-        else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
+        else if (flags.runAgent && judgeRequested && client && plannedMode === "agent") {
             try {
                 const produced = await runWithTools({
                     caseEntry,
@@ -164,7 +370,7 @@ async function runCase(ctx) {
                 });
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
@@ -256,7 +462,7 @@ async function runCase(ctx) {
                 verifierResults.push(...judgeVerifiers);
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 verifierResults.push({
@@ -277,7 +483,7 @@ async function runCase(ctx) {
     return {
         caseId: caseEntry.id,
         stage: caseEntry.stage,
-        tier: plannedTier,
+        mode: plannedMode,
         passed: allOk,
         durationMs: Date.now() - started,
         costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
@@ -319,26 +525,37 @@ function stagesInResults(caseResults) {
     return FLOW_STAGES.filter((s) => set.has(s));
 }
 /**
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
- * active), runs structural verifiers against fixture-backed cases and loads
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
- * arrive in later steps; until then cases without `fixture` are marked as
- * skipped rather than failing.
+ * Main eval runner. Dispatches between fixture-backed verification, the
+ * single-stage agent-with-tools loop, and the multi-stage workflow
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
+ * regression comparison. Cases without a `fixture` path in the yaml are
+ * marked skipped (not failed) when no LLM drafting runs.
  */
 export async function runEval(options) {
-    const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
-    const corpus = await loadCorpus(options.projectRoot, options.stage);
-    const plannedTier = options.tier ?? config.defaultTier;
+    const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
+    const config = options.modelOverride
+        ? {
+            ...baseConfig,
+            model: options.modelOverride,
+            judgeModel: options.modelOverride
+        }
+        : baseConfig;
+    const plannedMode = options.mode ?? config.defaultMode;
+    const corpus = plannedMode === "workflow" ? [] : await loadCorpus(options.projectRoot, options.stage);
+    const workflowCorpus = plannedMode === "workflow" ? await loadWorkflowCorpus(options.projectRoot) : [];
     const notes = [];
-    if (corpus.length === 0) {
+    if (plannedMode !== "workflow" && corpus.length === 0) {
         notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
+    if (plannedMode === "workflow" && workflowCorpus.length === 0) {
+        notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
+    }
     const flags = resolveRunFlags(options);
     if (flags.runJudge && !config.apiKey && !options.llmClient) {
         notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
     }
-    if ((options.tier ?? "A") !== "A" && flags.runJudge) {
-        notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
+    if (plannedMode === "workflow" && !config.apiKey && !options.llmClient) {
+        notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
     }
     if (options.dryRun === true) {
         const summary = {
@@ -349,39 +566,124 @@ export async function runEval(options) {
                 byStage: groupByStage(corpus),
                 cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
             },
-            plannedTier,
+            workflowCorpus: {
+                total: workflowCorpus.length,
+                cases: workflowCorpus.map((item) => ({
+                    id: item.id,
+                    stages: item.stages.map((s) => s.name)
+                }))
+            },
+            plannedMode,
             verifiersAvailable: {
                 structural: flags.runStructural,
                 rules: flags.runRules,
                 judge: flags.runJudge,
-                workflow: flags.runAgent
+                workflow: flags.runAgent,
+                consistency: plannedMode === "workflow"
             },
             notes
         };
         return summary;
     }
-    const costGuard = createCostGuard(options.projectRoot, config);
+    const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
+    const progress = options.progress ?? noopProgressLogger();
     let wrappedClient;
-    if (flags.runJudge) {
-        const base = options.llmClient ?? createEvalClient(config);
+    const clientNeeded = flags.runJudge || plannedMode === "workflow";
+    if (clientNeeded) {
+        const base = options.llmClient ??
+            createEvalClient(config, {
+                onRetry: (event) => progress.emit({
+                    kind: "retry",
+                    caseId: "llm",
+                    attempt: event.attempt,
+                    maxAttempts: event.maxAttempts,
+                    waitMs: event.waitMs,
+                    reason: event.error.message
+                })
+            });
         wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
     }
-    const rubrics = flags.runJudge
+    const rubricsNeeded = flags.runJudge;
+    const rubrics = rubricsNeeded
         ? await loadAllRubrics(options.projectRoot)
         : new Map();
     const now = new Date().toISOString();
     const caseResults = [];
-    for (const item of corpus) {
-        caseResults.push(await runCase({
-            projectRoot: options.projectRoot,
-            caseEntry: item,
-            plannedTier,
-            flags,
-            config,
-            client: wrappedClient,
-            costGuard,
-            rubrics
-        }));
+    const totalPlannedCases = plannedMode === "workflow" ? workflowCorpus.length : corpus.length;
+    const runStarted = Date.now();
+    progress.emit({
+        kind: "run-start",
+        mode: plannedMode,
+        totalCases: totalPlannedCases
+    });
+    if (plannedMode === "workflow") {
+        for (let i = 0; i < workflowCorpus.length; i += 1) {
+            const wf = workflowCorpus[i];
+            progress.emit({
+                kind: "case-start",
+                caseId: wf.id,
+                stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
+                index: i + 1,
+                total: workflowCorpus.length
+            });
+            const result = await runWorkflowCase({
+                projectRoot: options.projectRoot,
+                workflow: wf,
+                plannedMode,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics,
+                progress,
+                caseIndex: i + 1,
+                totalCases: workflowCorpus.length
+            });
+            progress.emit({
+                kind: "case-end",
+                caseId: wf.id,
+                stage: result.stage,
+                index: i + 1,
+                total: workflowCorpus.length,
+                passed: result.passed,
+                durationMs: result.durationMs,
+                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
+            });
+            caseResults.push(result);
+        }
+    }
+    else {
+        for (let i = 0; i < corpus.length; i += 1) {
+            const item = corpus[i];
+            progress.emit({
+                kind: "case-start",
+                caseId: item.id,
+                stage: item.stage,
+                index: i + 1,
+                total: corpus.length
+            });
+            const result = await runCase({
+                projectRoot: options.projectRoot,
+                caseEntry: item,
+                plannedMode,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics
+            });
+            progress.emit({
+                kind: "case-end",
+                caseId: item.id,
+                stage: item.stage,
+                index: i + 1,
+                total: corpus.length,
+                passed: result.passed,
+                durationMs: result.durationMs,
+                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
+            });
+            caseResults.push(result);
+        }
     }
     const stages = stagesInResults(caseResults);
     const baselines = await loadBaselinesByStage(options.projectRoot, stages);
@@ -393,7 +695,7 @@ export async function runEval(options) {
         cclawVersion: CCLAW_VERSION,
         provider: config.provider,
         model: config.model,
-        tier: plannedTier,
+        mode: plannedMode,
         stages,
         cases: caseResults,
         summary
@@ -401,5 +703,12 @@ export async function runEval(options) {
     const baselineDelta = compareAgainstBaselines(report, baselines);
     if (baselineDelta)
         report.baselineDelta = baselineDelta;
+    progress.emit({
+        kind: "run-end",
+        totalCases: summary.totalCases,
+        passed: summary.passed,
+        failed: summary.failed,
+        durationMs: Date.now() - runStarted
+    });
     return report;
 }