npm - cclaw-cli - Versions diffs - 0.27.0 → 0.28.0 - Mend

cclaw-cli 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/dist/cli.d.ts +8 -4
package/dist/cli.js +316 -20
package/dist/content/eval-scaffold.d.ts +2 -2
package/dist/content/eval-scaffold.js +7 -6
package/dist/eval/agents/single-shot.d.ts +1 -1
package/dist/eval/agents/single-shot.js +4 -4
package/dist/eval/agents/with-tools.d.ts +6 -6
package/dist/eval/agents/with-tools.js +5 -5
package/dist/eval/agents/workflow.d.ts +7 -0
package/dist/eval/agents/workflow.js +5 -3
package/dist/eval/baseline.d.ts +24 -0
package/dist/eval/baseline.js +75 -2
package/dist/eval/config-loader.js +46 -17
package/dist/eval/cost-guard.d.ts +22 -0
package/dist/eval/cost-guard.js +38 -1
package/dist/eval/diff.d.ts +1 -1
package/dist/eval/diff.js +3 -3
package/dist/eval/llm-client.d.ts +13 -2
package/dist/eval/llm-client.js +8 -1
package/dist/eval/mode.d.ts +28 -0
package/dist/eval/mode.js +61 -0
package/dist/eval/progress.d.ts +83 -0
package/dist/eval/progress.js +59 -0
package/dist/eval/report.js +1 -1
package/dist/eval/runner.d.ts +29 -9
package/dist/eval/runner.js +148 -56
package/dist/eval/runs.d.ts +41 -0
package/dist/eval/runs.js +114 -0
package/dist/eval/sandbox.js +1 -1
package/dist/eval/tools/index.js +1 -1
package/dist/eval/tools/types.d.ts +1 -1
package/dist/eval/types.d.ts +54 -27
package/dist/eval/types.js +21 -9
package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
package/dist/eval/workflow-corpus.d.ts +2 -2
package/dist/eval/workflow-corpus.js +4 -4
package/package.json +1 -1

package/dist/eval/progress.js ADDED

@@ -0,0 +1,59 @@
+const NOOP_LOGGER = { emit() { } };
+export function noopProgressLogger() {
+    return NOOP_LOGGER;
+}
+/**
+ * Emit a one-line status update per event to stderr.
+ *
+ * Format is deliberately boring: `[cclaw eval] <message>` so users can grep
+ * for the prefix in combined logs. Costs are rendered with up to 4 decimals
+ * so sub-cent runs still show a non-zero value.
+ */
+export function createStderrProgressLogger(opts = {}) {
+    const writer = opts.writer ?? ((s) => process.stderr.write(s));
+    return {
+        emit(event) {
+            writer(`[cclaw eval] ${formatEvent(event)}\n`);
+        }
+    };
+}
+function formatDuration(ms) {
+    if (ms < 1000)
+        return `${ms}ms`;
+    const s = ms / 1000;
+    if (s < 60)
+        return `${s.toFixed(1)}s`;
+    const m = Math.floor(s / 60);
+    const rem = Math.round(s - m * 60);
+    return `${m}m${rem.toString().padStart(2, "0")}s`;
+}
+function formatCost(usd) {
+    if (usd === undefined || usd <= 0)
+        return "";
+    return ` $${usd.toFixed(4)}`;
+}
+function formatEvent(event) {
+    switch (event.kind) {
+        case "run-start":
+            return `start mode=${event.mode} cases=${event.totalCases}`;
+        case "case-start":
+            return `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ...`;
+        case "case-end": {
+            const status = event.passed ? "PASS" : "FAIL";
+            return (`[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ${status} ` +
+                `in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`);
+        }
+        case "stage-start":
+            return `  stage ${event.stage} ...`;
+        case "stage-end": {
+            const status = event.passed ? "ok" : "fail";
+            return `  stage ${event.stage} ${status} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
+        }
+        case "retry":
+            return (`  retry ${event.caseId}${event.stage ? `/${event.stage}` : ""} ` +
+                `attempt ${event.attempt}/${event.maxAttempts} in ${formatDuration(event.waitMs)} (${event.reason})`);
+        case "run-end":
+            return (`done pass=${event.passed} fail=${event.failed} total=${event.totalCases} ` +
+                `in ${formatDuration(event.durationMs)}`);
+    }
+}

package/dist/eval/report.js CHANGED

@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
     lines.push(`- cclaw version: ${report.cclawVersion}`);
     lines.push(`- provider: ${report.provider}`);
     lines.push(`- model: ${report.model}`);
-    lines.push(`- tier: ${report.tier}`);
+    lines.push(`- mode: ${report.mode}`);
     lines.push(`- stages: ${stages}`);
     lines.push(``);
     lines.push(`## Summary`);

package/dist/eval/runner.d.ts CHANGED

@@ -1,10 +1,11 @@
 import type { FlowStage } from "../types.js";
 import { type EvalLlmClient } from "./llm-client.js";
-import type { EvalReport, EvalTier, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
+import { type ProgressLogger } from "./progress.js";
+import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
 export interface RunEvalOptions {
     projectRoot: string;
     stage?: FlowStage;
-    tier?: EvalTier;
+    mode?: EvalMode;
     /** When true, run only structural verifiers (Step 1). */
     schemaOnly?: boolean;
     /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
      * without hitting the network.
      */
     llmClient?: EvalLlmClient;
+    /**
+     * Optional progress logger. The CLI wires a stderr-backed logger by
+     * default so users see one-line updates during long runs; tests and
+     * programmatic callers can inject a silent (noop) logger or capture
+     * events for assertions. When omitted, progress is silenced.
+     */
+    progress?: ProgressLogger;
+    /**
+     * Per-run USD cap. Enforced in-memory; independent from the daily cap
+     * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
+     * invocations. Undefined means no cap.
+     */
+    maxCostUsd?: number;
+    /**
+     * Override the configured `model` (and `judgeModel`) for this run.
+     * Used by `cclaw eval --compare-model` to replay the same corpus
+     * against an alternative model without editing `config.yaml`.
+     */
+    modelOverride?: string;
 }
 export interface DryRunSummary {
     kind: "dry-run";
@@ -33,7 +53,7 @@ export interface DryRunSummary {
             stage: FlowStage;
         }>;
     };
-    /** Tier C-only workflow corpus summary. Empty for Tier A/B planned runs. */
+    /** Only populated in `workflow` mode; empty for fixture / agent modes. */
     workflowCorpus: {
         total: number;
         cases: Array<{
@@ -41,7 +61,7 @@ export interface DryRunSummary {
             stages: WorkflowStageName[];
         }>;
     };
-    plannedTier: EvalTier;
+    plannedMode: EvalMode;
     verifiersAvailable: {
         structural: boolean;
         rules: boolean;
@@ -52,10 +72,10 @@ export interface DryRunSummary {
     notes: string[];
 }
 /**
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
- * active), runs structural verifiers against fixture-backed cases and loads
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
- * arrive in later steps; until then cases without `fixture` are marked as
- * skipped rather than failing.
+ * Main eval runner. Dispatches between fixture-backed verification, the
+ * single-stage agent-with-tools loop, and the multi-stage workflow
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
+ * regression comparison. Cases without a `fixture` path in the yaml are
+ * marked skipped (not failed) when no LLM drafting runs.
  */
 export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;

package/dist/eval/runner.js CHANGED

@@ -8,8 +8,9 @@ import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
 import { loadWorkflowCorpus } from "./workflow-corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
-import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
+import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
 import { createEvalClient, EvalLlmError } from "./llm-client.js";
+import { noopProgressLogger } from "./progress.js";
 import { loadAllRubrics } from "./rubric-loader.js";
 import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
 import { verifyRules } from "./verifiers/rules.js";
@@ -35,23 +36,24 @@ function skeletonVerifierResult(message, details) {
 /**
  * --schema-only narrows to structural. --rules opens up rules + traceability
  * on top of structural (traceability is a rule-family verifier even though
- * it lives in its own module). --judge opens up the LLM judge and, for
- * Tier A, the single-shot agent-under-test. --schema-only always wins so
- * the LLM-free PR gate never pays for tokens even if stale flags collide.
+ * it lives in its own module). --judge opens up the LLM judge and, in
+ * `agent` / `workflow` modes, the agent-under-test loop. --schema-only always
+ * wins so the LLM-free PR gate never pays for tokens even if stale flags
+ * collide.
  */
 function resolveRunFlags(options) {
     const rulesRequested = options.rules === true;
     const schemaOnly = options.schemaOnly === true;
     const judgeRequested = options.judge === true;
-    const tier = options.tier ?? "A";
+    const mode = options.mode ?? "fixture";
     const runJudge = judgeRequested && !schemaOnly;
-    // Tier C always needs the agent loop (no fixture fallback for workflows),
-    // so we still require an LLM client but we do NOT require --judge on the
-    // CLI to produce a workflow run. The judge piece itself stays gated by
-    // `runJudge` so consistency-only runs are cheap and deterministic.
-    const runAgent = tier === "C"
+    // `workflow` always needs the agent loop (no fixture fallback), so we still
+    // require an LLM client but do NOT require --judge on the CLI to produce a
+    // workflow run. The judge piece stays gated by `runJudge` so consistency-
+    // only runs remain cheap and deterministic.
+    const runAgent = mode === "workflow"
         ? !schemaOnly
-        : runJudge && (tier === "A" || tier === "B");
+        : runJudge && (mode === "fixture" || mode === "agent");
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
@@ -103,7 +105,7 @@ function stageJudgeHint(step) {
     return hint;
 }
 async function runWorkflowCase(ctx) {
-    const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
+    const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
     const started = Date.now();
     const verifierResults = [];
     let caseCostUsd = 0;
@@ -115,14 +117,14 @@ async function runWorkflowCase(ctx) {
             id: "workflow:agent:disabled",
             ok: false,
             score: 0,
-            message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
+            message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                 "Re-run with credentials to execute the workflow.",
             details: { stages: workflow.stages.map((s) => s.name) }
         });
         return {
             caseId: workflow.id,
             stage: lastStage,
-            tier: plannedTier,
+            mode: plannedMode,
             passed: false,
             durationMs: Date.now() - started,
             verifierResults
@@ -134,11 +136,28 @@ async function runWorkflowCase(ctx) {
             workflow,
             config,
             projectRoot,
-            client
+            client,
+            onStageStart: (stage) => progress.emit({
+                kind: "stage-start",
+                caseId: workflow.id,
+                stage,
+                index: caseIndex,
+                total: totalCases
+            }),
+            onStageEnd: (stage, stageResult) => progress.emit({
+                kind: "stage-end",
+                caseId: workflow.id,
+                stage,
+                index: caseIndex,
+                total: totalCases,
+                passed: true,
+                durationMs: stageResult.durationMs,
+                ...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
+            })
         });
     }
     catch (err) {
-        if (err instanceof DailyCostCapExceededError)
+        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
             throw err;
         const retryable = err instanceof EvalLlmError ? err.retryable : false;
         const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
@@ -156,7 +175,7 @@ async function runWorkflowCase(ctx) {
         return {
             caseId: workflow.id,
             stage: lastStage,
-            tier: plannedTier,
+            mode: plannedMode,
             passed: false,
             durationMs: Date.now() - started,
             verifierResults
@@ -230,7 +249,7 @@ async function runWorkflowCase(ctx) {
                 }
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 verifierResults.push({
@@ -262,7 +281,7 @@ async function runWorkflowCase(ctx) {
     return {
         caseId: workflow.id,
         stage: lastStage,
-        tier: plannedTier,
+        mode: plannedMode,
         passed: allOk,
         durationMs: Date.now() - started,
         costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
@@ -271,7 +290,7 @@ async function runWorkflowCase(ctx) {
     };
 }
 async function runCase(ctx) {
-    const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
+    const { projectRoot, caseEntry, plannedMode, flags, config, client, costGuard, rubrics } = ctx;
     const started = Date.now();
     const verifierResults = [];
     const expected = caseEntry.expected;
@@ -283,7 +302,7 @@ async function runCase(ctx) {
     const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
     let artifact;
     if (needsArtifact) {
-        if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
+        if (flags.runAgent && judgeRequested && client && plannedMode === "fixture") {
             try {
                 const produced = await runSingleShot({
                     caseEntry,
@@ -309,7 +328,7 @@ async function runCase(ctx) {
                 });
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 verifierResults.push({
@@ -322,7 +341,7 @@ async function runCase(ctx) {
                 });
             }
         }
-        else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
+        else if (flags.runAgent && judgeRequested && client && plannedMode === "agent") {
             try {
                 const produced = await runWithTools({
                     caseEntry,
@@ -351,7 +370,7 @@ async function runCase(ctx) {
                 });
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
@@ -443,7 +462,7 @@ async function runCase(ctx) {
                 verifierResults.push(...judgeVerifiers);
             }
             catch (err) {
-                if (err instanceof DailyCostCapExceededError)
+                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                     throw err;
                 const retryable = err instanceof EvalLlmError ? err.retryable : false;
                 verifierResults.push({
@@ -464,7 +483,7 @@ async function runCase(ctx) {
     return {
         caseId: caseEntry.id,
         stage: caseEntry.stage,
-        tier: plannedTier,
+        mode: plannedMode,
         passed: allOk,
         durationMs: Date.now() - started,
         costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
@@ -506,30 +525,37 @@ function stagesInResults(caseResults) {
     return FLOW_STAGES.filter((s) => set.has(s));
 }
 /**
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
- * active), runs structural verifiers against fixture-backed cases and loads
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
- * arrive in later steps; until then cases without `fixture` are marked as
- * skipped rather than failing.
+ * Main eval runner. Dispatches between fixture-backed verification, the
+ * single-stage agent-with-tools loop, and the multi-stage workflow
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
+ * regression comparison. Cases without a `fixture` path in the yaml are
+ * marked skipped (not failed) when no LLM drafting runs.
  */
 export async function runEval(options) {
-    const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
-    const plannedTier = options.tier ?? config.defaultTier;
-    const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
-    const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
+    const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
+    const config = options.modelOverride
+        ? {
+            ...baseConfig,
+            model: options.modelOverride,
+            judgeModel: options.modelOverride
+        }
+        : baseConfig;
+    const plannedMode = options.mode ?? config.defaultMode;
+    const corpus = plannedMode === "workflow" ? [] : await loadCorpus(options.projectRoot, options.stage);
+    const workflowCorpus = plannedMode === "workflow" ? await loadWorkflowCorpus(options.projectRoot) : [];
     const notes = [];
-    if (plannedTier !== "C" && corpus.length === 0) {
+    if (plannedMode !== "workflow" && corpus.length === 0) {
         notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
-    if (plannedTier === "C" && workflowCorpus.length === 0) {
-        notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
+    if (plannedMode === "workflow" && workflowCorpus.length === 0) {
+        notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
     }
     const flags = resolveRunFlags(options);
     if (flags.runJudge && !config.apiKey && !options.llmClient) {
         notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
     }
-    if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
-        notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
+    if (plannedMode === "workflow" && !config.apiKey && !options.llmClient) {
+        notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
     }
     if (options.dryRun === true) {
         const summary = {
@@ -547,23 +573,34 @@ export async function runEval(options) {
                     stages: item.stages.map((s) => s.name)
                 }))
             },
-            plannedTier,
+            plannedMode,
             verifiersAvailable: {
                 structural: flags.runStructural,
                 rules: flags.runRules,
                 judge: flags.runJudge,
                 workflow: flags.runAgent,
-                consistency: plannedTier === "C"
+                consistency: plannedMode === "workflow"
             },
             notes
         };
         return summary;
     }
-    const costGuard = createCostGuard(options.projectRoot, config);
+    const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
+    const progress = options.progress ?? noopProgressLogger();
     let wrappedClient;
-    const clientNeeded = flags.runJudge || plannedTier === "C";
+    const clientNeeded = flags.runJudge || plannedMode === "workflow";
     if (clientNeeded) {
-        const base = options.llmClient ?? createEvalClient(config);
+        const base = options.llmClient ??
+            createEvalClient(config, {
+                onRetry: (event) => progress.emit({
+                    kind: "retry",
+                    caseId: "llm",
+                    attempt: event.attempt,
+                    maxAttempts: event.maxAttempts,
+                    waitMs: event.waitMs,
+                    reason: event.error.message
+                })
+            });
         wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
     }
     const rubricsNeeded = flags.runJudge;
@@ -572,32 +609,80 @@ export async function runEval(options) {
         : new Map();
     const now = new Date().toISOString();
     const caseResults = [];
-    if (plannedTier === "C") {
-        for (const wf of workflowCorpus) {
-            caseResults.push(await runWorkflowCase({
+    const totalPlannedCases = plannedMode === "workflow" ? workflowCorpus.length : corpus.length;
+    const runStarted = Date.now();
+    progress.emit({
+        kind: "run-start",
+        mode: plannedMode,
+        totalCases: totalPlannedCases
+    });
+    if (plannedMode === "workflow") {
+        for (let i = 0; i < workflowCorpus.length; i += 1) {
+            const wf = workflowCorpus[i];
+            progress.emit({
+                kind: "case-start",
+                caseId: wf.id,
+                stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
+                index: i + 1,
+                total: workflowCorpus.length
+            });
+            const result = await runWorkflowCase({
                 projectRoot: options.projectRoot,
                 workflow: wf,
-                plannedTier,
+                plannedMode,
                 flags,
                 config,
                 client: wrappedClient,
                 costGuard,
-                rubrics
-            }));
+                rubrics,
+                progress,
+                caseIndex: i + 1,
+                totalCases: workflowCorpus.length
+            });
+            progress.emit({
+                kind: "case-end",
+                caseId: wf.id,
+                stage: result.stage,
+                index: i + 1,
+                total: workflowCorpus.length,
+                passed: result.passed,
+                durationMs: result.durationMs,
+                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
+            });
+            caseResults.push(result);
         }
     }
     else {
-        for (const item of corpus) {
-            caseResults.push(await runCase({
+        for (let i = 0; i < corpus.length; i += 1) {
+            const item = corpus[i];
+            progress.emit({
+                kind: "case-start",
+                caseId: item.id,
+                stage: item.stage,
+                index: i + 1,
+                total: corpus.length
+            });
+            const result = await runCase({
                 projectRoot: options.projectRoot,
                 caseEntry: item,
-                plannedTier,
+                plannedMode,
                 flags,
                 config,
                 client: wrappedClient,
                 costGuard,
                 rubrics
-            }));
+            });
+            progress.emit({
+                kind: "case-end",
+                caseId: item.id,
+                stage: item.stage,
+                index: i + 1,
+                total: corpus.length,
+                passed: result.passed,
+                durationMs: result.durationMs,
+                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
+            });
+            caseResults.push(result);
         }
     }
     const stages = stagesInResults(caseResults);
@@ -610,7 +695,7 @@ export async function runEval(options) {
         cclawVersion: CCLAW_VERSION,
         provider: config.provider,
         model: config.model,
-        tier: plannedTier,
+        mode: plannedMode,
         stages,
         cases: caseResults,
         summary
@@ -618,5 +703,12 @@ export async function runEval(options) {
     const baselineDelta = compareAgainstBaselines(report, baselines);
     if (baselineDelta)
         report.baselineDelta = baselineDelta;
+    progress.emit({
+        kind: "run-end",
+        totalCases: summary.totalCases,
+        passed: summary.passed,
+        failed: summary.failed,
+        durationMs: Date.now() - runStarted
+    });
     return report;
 }

package/dist/eval/runs.d.ts ADDED

@@ -0,0 +1,41 @@
+export declare const RUNS_DIR = "runs";
+export interface EvalRunStatus {
+    id: string;
+    startedAt: string;
+    endedAt?: string;
+    pid: number;
+    argv: string[];
+    cwd: string;
+    exitCode?: number;
+    state: "running" | "succeeded" | "failed";
+}
+export declare function runsRoot(projectRoot: string): string;
+export declare function runDir(projectRoot: string, id: string): string;
+export declare function runLogPath(projectRoot: string, id: string): string;
+export declare function runStatusPath(projectRoot: string, id: string): string;
+/**
+ * Generate a short, lexicographically-sortable run id. The timestamp
+ * prefix means `ls -1` already returns the runs in chronological order
+ * which keeps the `runs list` subcommand trivial.
+ */
+export declare function generateRunId(now?: Date): string;
+export declare function ensureRunDir(projectRoot: string, id: string): Promise<string>;
+export declare function writeRunStatus(projectRoot: string, status: EvalRunStatus): Promise<void>;
+export declare function readRunStatus(projectRoot: string, id: string): Promise<EvalRunStatus | null>;
+/**
+ * List run ids under `.cclaw/evals/runs/`, most recent first. Directory
+ * entries that don't contain a `run.json` are skipped (half-initialized
+ * or manually mkdir'd folders).
+ */
+export declare function listRuns(projectRoot: string): Promise<EvalRunStatus[]>;
+/**
+ * Resolve `"latest"` (or undefined) to the most recent run id.
+ * Returns `null` when there are no runs.
+ */
+export declare function resolveRunId(projectRoot: string, hint: string | undefined): Promise<string | null>;
+/**
+ * Cheap liveness probe for an EvalRunStatus. A `run.json` can be stale
+ * (process crashed mid-commit), so we double-check with `kill(pid, 0)`
+ * before trusting the `state: "running"` field.
+ */
+export declare function isRunAlive(status: EvalRunStatus): boolean;