npm - cclaw-cli - Versions diffs - 0.22.0 → 0.23.0 - Mend

cclaw-cli 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/cli.d.ts +2 -0
package/dist/cli.js +42 -11
package/dist/eval/baseline.d.ts +14 -0
package/dist/eval/baseline.js +209 -0
package/dist/eval/corpus.d.ts +13 -2
package/dist/eval/corpus.js +97 -13
package/dist/eval/report.js +16 -3
package/dist/eval/runner.d.ts +6 -14
package/dist/eval/runner.js +122 -40
package/dist/eval/types.d.ts +88 -8
package/dist/eval/verifiers/structural.d.ts +14 -0
package/dist/eval/verifiers/structural.js +171 -0
package/package.json +1 -1

package/dist/cli.d.ts CHANGED Viewed

@@ -24,6 +24,8 @@ interface ParsedArgs {
     evalJudge?: boolean;
     evalJson?: boolean;
     evalNoWrite?: boolean;
+    evalUpdateBaseline?: boolean;
+    evalConfirm?: boolean;
     showHelp?: boolean;
     showVersion?: boolean;
 }

package/dist/cli.js CHANGED Viewed

@@ -14,6 +14,7 @@ import { createDefaultConfig, createProfileConfig } from "./config.js";
 import { detectHarnesses } from "./init-detect.js";
 import { HARNESS_ADAPTERS } from "./harness-adapters.js";
 import { runEval } from "./eval/runner.js";
+import { writeBaselinesFromReport } from "./eval/baseline.js";
 import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
 import { EVAL_TIERS } from "./eval/types.js";
 import { FLOW_STAGES } from "./types.js";
@@ -53,15 +54,17 @@ Commands:
              Flags: --name=<feature>    Feature slug (default: inferred from 00-idea.md).
                     --skip-retro       Bypass mandatory retro gate (requires --retro-reason).
                     --retro-reason=<t> Reason for bypassing retro gate.
-  eval       Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
-             Flags: --stage=<id>        Limit to one flow stage (${FLOW_STAGES.join("|")}).
-                    --tier=<A|B|C>      Fidelity tier (A=single-shot, B=tools, C=workflow).
-                    --schema-only       Run only structural verifiers (Wave 7.1).
-                    --rules             Run structural + rule verifiers (Wave 7.2).
-                    --judge             Include LLM judging (Wave 7.3; requires API key).
-                    --dry-run           Validate config + corpus, print summary, do not execute.
-                    --json              Emit machine-readable JSON on stdout.
-                    --no-write          Skip writing the report to .cclaw/evals/reports/.
+  eval       Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.1: structural verifier).
+             Flags: --stage=<id>         Limit to one flow stage (${FLOW_STAGES.join("|")}).
+                    --tier=<A|B|C>       Fidelity tier (A=single-shot, B=tools, C=workflow).
+                    --schema-only        Run only structural verifiers (Wave 7.1, default).
+                    --rules              Run structural + rule verifiers (Wave 7.2).
+                    --judge              Include LLM judging (Wave 7.3; requires API key).
+                    --dry-run            Validate config + corpus, print summary, do not execute.
+                    --json               Emit machine-readable JSON on stdout.
+                    --no-write           Skip writing the report to .cclaw/evals/reports/.
+                    --update-baseline    Overwrite baselines from the current run (requires --confirm).
+                    --confirm            Acknowledge --update-baseline (prevents accidental resets).
   upgrade    Refresh generated files in .cclaw without modifying user artifacts.
   uninstall  Remove .cclaw runtime and the generated harness shim files.
@@ -453,6 +456,14 @@ function parseArgs(argv) {
             parsed.evalNoWrite = true;
             continue;
         }
+        if (flag === "--update-baseline") {
+            parsed.evalUpdateBaseline = true;
+            continue;
+        }
+        if (flag === "--confirm") {
+            parsed.evalConfirm = true;
+            continue;
+        }
     }
     // `--json` is shared between doctor and eval. Disambiguate by command.
     if (parsed.command === "eval" && parsed.doctorJson === true) {
@@ -592,22 +603,42 @@ async function runCommand(parsed, ctx) {
             }
             return 0;
         }
+        if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
+            error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
+            return 1;
+        }
+        if (parsed.evalUpdateBaseline === true) {
+            if (result.summary.failed > 0) {
+                error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
+                return 1;
+            }
+            const written = await writeBaselinesFromReport(ctx.cwd, result);
+            for (const file of written) {
+                info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
+            }
+        }
         if (parsed.evalNoWrite !== true) {
             const jsonPath = await writeJsonReport(ctx.cwd, result);
             const mdPath = await writeMarkdownReport(ctx.cwd, result);
             info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
             info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
         }
+        const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
         if (parsed.evalJson === true) {
             ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
         }
         else {
+            const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
             ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
                 `${result.summary.passed} passed, ` +
                 `${result.summary.failed} failed, ` +
-                `${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
+                `${result.summary.skipped} skipped${regressionNote}\n`);
         }
-        return result.summary.failed > 0 ? 1 : 0;
+        if (result.summary.failed > 0)
+            return 1;
+        if (regressionCount > 0)
+            return 1;
+        return 0;
     }
     if (command === "archive") {
         const archived = await archiveRun(ctx.cwd, parsed.archiveName, {

package/dist/eval/baseline.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import type { FlowStage } from "../types.js";
+import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
+/** Schema version written into every baseline file and required when one is loaded. */
+export declare const BASELINE_SCHEMA_VERSION = 1;
+/** Load the baseline for one stage. Resolves to `null` when no baseline file exists; rejects on invalid JSON or shape. */
+export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
+/** Load baselines for several stages; stages without a baseline file are omitted from the map. */
+export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
+/** Build an in-memory snapshot from the report's cases that belong to `stage`. */
+export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
+/** Write one baseline file per stage present in the report; resolves to the sorted list of written paths. */
+export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
+/**
+ * Compare a freshly computed report against loaded baselines. If no baseline
+ * exists for a stage covered by the report, that stage contributes zero
+ * regressions (first run of that stage). Current is the source of truth.
+ * Returns `undefined` when the baselines map is empty.
+ */
+export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
+/** List the stages that have a `<stage>.json` baseline file; resolves to `[]` if the directory cannot be read. */
+export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;

package/dist/eval/baseline.js ADDED Viewed

@@ -0,0 +1,209 @@
+/**
+ * Baseline I/O + regression comparison (Wave 7.1).
+ *
+ * Layout on disk (committed):
+ *
+ *   .cclaw/evals/baselines/<stage>.json
+ *
+ * Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
+ * regressions by comparing per-verifier `ok` flags across runs: any verifier
+ * that was `ok:true` in the baseline and is `ok:false` now counts as a
+ * critical failure. A case whose aggregate `passed` flipped from true to
+ * false is flagged as `case-now-failing` regardless of per-verifier churn.
+ *
+ * Writes are gated behind an explicit `--update-baseline --confirm` pair at
+ * the CLI layer so accidental resets do not slip into PRs.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
+import { exists } from "../fs-utils.js";
+import { FLOW_STAGES } from "../types.js";
+export const BASELINE_SCHEMA_VERSION = 1;
+/** Path of the baseline snapshot file for `stage` inside `projectRoot`. */
+function baselinePath(projectRoot, stage) {
+    const fileName = `${stage}.json`;
+    return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
+}
+/**
+ * Read and validate the baseline snapshot for one stage.
+ *
+ * Resolves to `null` when the baseline file does not exist. Throws an
+ * `Error` naming the file when its content is not valid JSON or does not
+ * pass the `isBaseline` shape check for this stage.
+ */
+export async function loadBaseline(projectRoot, stage) {
+    const filePath = baselinePath(projectRoot, stage);
+    const present = await exists(filePath);
+    if (!present) {
+        return null;
+    }
+    const text = await fs.readFile(filePath, "utf8");
+    let snapshot;
+    try {
+        snapshot = JSON.parse(text);
+    }
+    catch (err) {
+        const detail = err instanceof Error ? err.message : String(err);
+        throw new Error(`Invalid baseline at ${filePath}: ${detail}`);
+    }
+    if (isBaseline(snapshot, stage)) {
+        return snapshot;
+    }
+    throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
+}
+/**
+ * Shallow shape check for a parsed baseline file.
+ *
+ * Returns `true` only when `value` is an object whose `schemaVersion`
+ * equals `BASELINE_SCHEMA_VERSION`, whose `stage` equals the requested
+ * stage, whose `generatedAt` and `cclawVersion` are strings, and whose
+ * `cases` is a non-array object. Individual case entries are not inspected.
+ */
+function isBaseline(value, stage) {
+    if (!value || typeof value !== "object") {
+        return false;
+    }
+    const candidate = value;
+    if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION) {
+        return false;
+    }
+    if (candidate.stage !== stage) {
+        return false;
+    }
+    if (typeof candidate.generatedAt !== "string" || typeof candidate.cclawVersion !== "string") {
+        return false;
+    }
+    // `typeof [] === "object"`, so arrays must be rejected explicitly: `cases`
+    // is looked up by case id, and an array would silently match nothing.
+    const { cases } = candidate;
+    return Boolean(cases) && typeof cases === "object" && !Array.isArray(cases);
+}
+/**
+ * Load the baseline of every stage in `stages`, one after another.
+ * Stages for which `loadBaseline` resolves to `null` are left out of the map.
+ */
+export async function loadBaselinesByStage(projectRoot, stages) {
+    const byStage = new Map();
+    for (const stage of stages) {
+        const snapshot = await loadBaseline(projectRoot, stage);
+        if (!snapshot) {
+            continue;
+        }
+        byStage.set(stage, snapshot);
+    }
+    return byStage;
+}
+/**
+ * Reduce a case result to what a baseline stores: the aggregate `passed`
+ * flag plus `id`, `kind`, `ok` and (only when defined) `score` per verifier.
+ */
+function entryFromResult(result) {
+    const verifierResults = result.verifierResults.map(({ id, kind, ok, score }) => {
+        const entry = { id, kind, ok };
+        if (score !== undefined) {
+            entry.score = score;
+        }
+        return entry;
+    });
+    return { passed: result.passed, verifierResults };
+}
+/**
+ * Build a baseline snapshot for `stage` from the report's cases of that
+ * stage, keyed by case id and stamped with the current time and cclaw version.
+ */
+export function buildBaselineForStage(stage, report) {
+    const cases = {};
+    report.cases.forEach((caseResult) => {
+        if (caseResult.stage === stage) {
+            cases[caseResult.caseId] = entryFromResult(caseResult);
+        }
+    });
+    const generatedAt = new Date().toISOString();
+    return {
+        schemaVersion: BASELINE_SCHEMA_VERSION,
+        stage,
+        generatedAt,
+        cclawVersion: CCLAW_VERSION,
+        cases
+    };
+}
+/**
+ * Write one pretty-printed baseline file per distinct stage found in the
+ * report, creating the baselines directory as needed. Resolves to the
+ * written file paths in sorted order.
+ */
+export async function writeBaselinesFromReport(projectRoot, report) {
+    const distinctStages = new Set();
+    for (const caseResult of report.cases) {
+        distinctStages.add(caseResult.stage);
+    }
+    const paths = [];
+    for (const stage of distinctStages) {
+        const snapshot = buildBaselineForStage(stage, report);
+        const target = baselinePath(projectRoot, stage);
+        await fs.mkdir(path.dirname(target), { recursive: true });
+        await fs.writeFile(target, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
+        paths.push(target);
+    }
+    paths.sort();
+    return paths;
+}
+/** Index verifier entries by `id`; when ids repeat, the last entry wins. */
+function verifierMap(entries) {
+    return new Map(Array.from(entries, (entry) => [entry.id, entry]));
+}
+/** Fraction of case results with a truthy `passed`; an empty list counts as 1. */
+function computePassRate(cases) {
+    const total = cases.length;
+    if (total === 0) {
+        return 1;
+    }
+    const passedCount = cases.reduce((count, c) => (c.passed ? count + 1 : count), 0);
+    return passedCount / total;
+}
+/** Fraction of baseline case entries with a truthy `passed`; a snapshot without cases counts as 1. */
+function baselinePassRate(snapshot) {
+    const entries = Object.values(snapshot.cases);
+    const total = entries.length;
+    if (total === 0) {
+        return 1;
+    }
+    const passedCount = entries.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);
+    return passedCount / total;
+}
+/**
+ * Compare a freshly computed report against loaded baselines. If no baseline
+ * exists for a stage covered by the report, that stage contributes zero
+ * regressions (first run of that stage). Current is the source of truth.
+ *
+ * Returns `undefined` when `baselines` is empty. Otherwise returns a delta
+ * holding every regression found, the count of critical ones
+ * ("newly-failing" and "case-now-failing"; "score-drop" is listed but not
+ * counted), and `scoreDelta`: the pass rate over all report cases minus the
+ * unweighted mean of the per-stage baseline pass rates, rounded to 4 places.
+ *
+ * Note: a case that flipped to failing can produce one "case-now-failing"
+ * entry plus one "newly-failing" entry per verifier that flipped; each of
+ * them is counted in `criticalFailures`.
+ */
+export function compareAgainstBaselines(report, baselines) {
+    if (baselines.size === 0)
+        return undefined;
+    const regressions = [];
+    // Bucket the current case results by stage for per-stage lookup below.
+    const caseResultsByStage = new Map();
+    for (const c of report.cases) {
+        const bucket = caseResultsByStage.get(c.stage) ?? [];
+        bucket.push(c);
+        caseResultsByStage.set(c.stage, bucket);
+    }
+    let baselineTotalPassRate = 0;
+    let baselineStagesCounted = 0;
+    for (const [stage, snapshot] of baselines) {
+        const current = caseResultsByStage.get(stage) ?? [];
+        // Every loaded baseline contributes to the average, even when the
+        // report has no cases for its stage.
+        baselineTotalPassRate += baselinePassRate(snapshot);
+        baselineStagesCounted += 1;
+        for (const caseResult of current) {
+            const baselineEntry = snapshot.cases[caseResult.caseId];
+            // Cases absent from the baseline are new; nothing to compare.
+            if (!baselineEntry)
+                continue;
+            if (baselineEntry.passed && !caseResult.passed) {
+                // Case-level regression, recorded under the synthetic verifier id "<case>".
+                regressions.push({
+                    caseId: caseResult.caseId,
+                    stage,
+                    verifierId: "<case>",
+                    reason: "case-now-failing",
+                    previousScore: 1,
+                    currentScore: 0
+                });
+            }
+            const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
+            for (const currentVerifier of caseResult.verifierResults) {
+                const prev = baselineVerifiers.get(currentVerifier.id);
+                // Verifiers absent from the baseline are new; skip them.
+                if (!prev)
+                    continue;
+                if (prev.ok && !currentVerifier.ok) {
+                    // Missing scores fall back to 1 (previous) and 0 (current).
+                    regressions.push({
+                        caseId: caseResult.caseId,
+                        stage,
+                        verifierId: currentVerifier.id,
+                        reason: "newly-failing",
+                        previousScore: prev.score ?? 1,
+                        currentScore: currentVerifier.score ?? 0
+                    });
+                }
+                else if (prev.score !== undefined &&
+                    currentVerifier.score !== undefined &&
+                    currentVerifier.score < prev.score) {
+                    // The `ok` flag did not flip from true to false, but the numeric score decreased.
+                    regressions.push({
+                        caseId: caseResult.caseId,
+                        stage,
+                        verifierId: currentVerifier.id,
+                        reason: "score-drop",
+                        previousScore: prev.score,
+                        currentScore: currentVerifier.score
+                    });
+                }
+            }
+        }
+    }
+    const currentPassRate = computePassRate(report.cases);
+    // `baselines` is non-empty here, so the loop above ran at least once and
+    // the `=== 0` branch is only a defensive guard against division by zero.
+    const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
+    const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
+    const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
+    // The baseline id is the sorted, comma-joined list of baseline stage names.
+    const baselineStages = [...baselines.keys()].sort().join(",");
+    return {
+        baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
+        scoreDelta,
+        criticalFailures,
+        regressions
+    };
+}
+/**
+ * List the flow stages that have a `<stage>.json` file in the baselines
+ * directory. File names that are not known flow stages are ignored.
+ * Resolves to an empty array when the directory cannot be read.
+ */
+export function listBaselineStages(projectRoot) {
+    const baselineDir = path.join(projectRoot, EVALS_ROOT, "baselines");
+    const toStages = (entries) => {
+        const stages = [];
+        for (const entry of entries) {
+            if (!entry.isFile() || !entry.name.endsWith(".json")) {
+                continue;
+            }
+            const name = entry.name.replace(/\.json$/, "");
+            if (FLOW_STAGES.includes(name)) {
+                stages.push(name);
+            }
+        }
+        return stages;
+    };
+    return fs
+        .readdir(baselineDir, { withFileTypes: true })
+        .then(toStages)
+        .catch(() => []);
+}

package/dist/eval/corpus.d.ts CHANGED Viewed

@@ -2,7 +2,18 @@ import type { FlowStage } from "../types.js";
 import type { EvalCase } from "./types.js";
 /**
  * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
- * single stage. Returns an empty array for a fresh install (Wave 7.0 ships
- * without seed cases; corpus is authored in Wave 7.1+).
+ * single stage. Returns an empty array for a fresh install.
  */
 export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
+/**
+ * Resolve a case's `fixture` path to an absolute filesystem path. The fixture
+ * field is interpreted relative to the case's stage directory (i.e., a
+ * sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
+ */
+export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
+/**
+ * Read the fixture artifact text for a case. Returns `undefined` if the case
+ * has no fixture reference. Throws a descriptive error if the path exists in
+ * the case but not on disk — Wave 7.1 fixtures ship alongside cases.
+ */
+export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;

package/dist/eval/corpus.js CHANGED Viewed

@@ -12,6 +12,76 @@ function corpusError(filePath, reason) {
 /** True for non-null, non-array values whose `typeof` is "object". */
 function isRecord(value) {
     if (value === null || Array.isArray(value)) {
         return false;
     }
     return typeof value === "object";
 }
+/**
+ * Validate an optional array-of-strings field. Returns `undefined` when the
+ * value is absent, the array itself when valid, and throws a corpus error
+ * naming `context` otherwise.
+ */
+function readStringArray(filePath, context, value) {
+    if (value === undefined) {
+        return undefined;
+    }
+    const isStringArray = Array.isArray(value) && value.every((item) => typeof item === "string");
+    if (!isStringArray) {
+        throw corpusError(filePath, `"${context}" must be an array of strings`);
+    }
+    return value;
+}
+/**
+ * Validate an optional non-negative integer field. Returns `undefined` when
+ * the value is absent, the number itself when valid, and throws a corpus
+ * error naming `context` otherwise.
+ */
+function readNonNegativeInteger(filePath, context, value) {
+    if (value === undefined) {
+        return undefined;
+    }
+    // `Number.isInteger` is already false for non-numbers, NaN and ±Infinity.
+    const valid = Number.isInteger(value) && value >= 0;
+    if (!valid) {
+        throw corpusError(filePath, `"${context}" must be a non-negative integer`);
+    }
+    return value;
+}
+/**
+ * Parse the `expected.structural` mapping of a case file. Each field may be
+ * spelled in snake_case or camelCase (snake_case is consulted first); only
+ * fields that are present end up on the returned object. Returns
+ * `undefined` when the mapping is absent and throws a corpus error when the
+ * mapping or any of its fields has the wrong type.
+ */
+function parseStructural(filePath, raw) {
+    if (raw === undefined) {
+        return undefined;
+    }
+    if (!isRecord(raw)) {
+        throw corpusError(filePath, `"expected.structural" must be a mapping`);
+    }
+    // [output key, raw value, error context, reader]; order fixes both
+    // validation order and key order of the result.
+    const fields = [
+        ["requiredSections", raw.required_sections ?? raw.requiredSections, "expected.structural.required_sections", readStringArray],
+        ["forbiddenPatterns", raw.forbidden_patterns ?? raw.forbiddenPatterns, "expected.structural.forbidden_patterns", readStringArray],
+        ["requiredFrontmatterKeys", raw.required_frontmatter_keys ?? raw.requiredFrontmatterKeys, "expected.structural.required_frontmatter_keys", readStringArray],
+        ["minLines", raw.min_lines ?? raw.minLines, "expected.structural.min_lines", readNonNegativeInteger],
+        ["maxLines", raw.max_lines ?? raw.maxLines, "expected.structural.max_lines", readNonNegativeInteger],
+        ["minChars", raw.min_chars ?? raw.minChars, "expected.structural.min_chars", readNonNegativeInteger],
+        ["maxChars", raw.max_chars ?? raw.maxChars, "expected.structural.max_chars", readNonNegativeInteger]
+    ];
+    const structural = {};
+    for (const [key, value, context, read] of fields) {
+        const parsed = read(filePath, context, value);
+        if (parsed !== undefined) {
+            structural[key] = parsed;
+        }
+    }
+    return structural;
+}
+/**
+ * Parse the `expected` mapping of a case file into its `structural`,
+ * `rules` and `judge` parts. `structural` is validated field by field;
+ * `rules` and `judge` only have to be mappings and are passed through.
+ * Returns `undefined` when `expected` is absent or yields no part at all.
+ */
+function parseExpected(filePath, raw) {
+    if (raw === undefined) {
+        return undefined;
+    }
+    if (!isRecord(raw)) {
+        throw corpusError(filePath, `"expected" must be a mapping`);
+    }
+    const shape = {};
+    const structural = parseStructural(filePath, raw.structural);
+    if (structural) {
+        shape.structural = structural;
+    }
+    const passThrough = [
+        ["rules", `"expected.rules" must be a mapping`],
+        ["judge", `"expected.judge" must be a mapping`]
+    ];
+    for (const [key, complaint] of passThrough) {
+        const section = raw[key];
+        if (section === undefined) {
+            continue;
+        }
+        if (!isRecord(section)) {
+            throw corpusError(filePath, complaint);
+        }
+        shape[key] = section;
+    }
+    return Object.keys(shape).length > 0 ? shape : undefined;
+}
 function validateCase(filePath, raw) {
     if (!isRecord(raw)) {
         throw corpusError(filePath, "top-level value must be a mapping");
@@ -28,17 +98,8 @@ function validateCase(filePath, raw) {
     if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
         throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
     }
-    const contextFilesRaw = raw.context_files ?? raw.contextFiles;
-    let contextFiles;
-    if (contextFilesRaw !== undefined) {
-        if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
-            throw corpusError(filePath, `"context_files" must be an array of strings`);
-        }
-        contextFiles = contextFilesRaw;
-    }
-    const expected = raw.expected !== undefined && isRecord(raw.expected)
-        ? raw.expected
-        : undefined;
+    const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
+    const expected = parseExpected(filePath, raw.expected);
     const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
     return {
         id: id.trim(),
@@ -51,8 +112,7 @@ function validateCase(filePath, raw) {
 }
 /**
  * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
- * single stage. Returns an empty array for a fresh install (Wave 7.0 ships
- * without seed cases; corpus is authored in Wave 7.1+).
+ * single stage. Returns an empty array for a fresh install.
  */
 export async function loadCorpus(projectRoot, stage) {
     const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
@@ -89,3 +149,27 @@ export async function loadCorpus(projectRoot, stage) {
     cases.sort((a, b) => a.stage.localeCompare(b.stage) || a.id.localeCompare(b.id));
     return cases;
 }
+/**
+ * Turn a case's `fixture` reference into an absolute path, resolved against
+ * the case's stage directory `.cclaw/evals/corpus/<stage>/`. Returns
+ * `undefined` when the case has no fixture reference.
+ */
+export function fixturePathFor(projectRoot, caseEntry) {
+    const { fixture, stage } = caseEntry;
+    return fixture
+        ? path.resolve(projectRoot, EVALS_ROOT, "corpus", stage, fixture)
+        : undefined;
+}
+/**
+ * Load the fixture artifact of a case as UTF-8 text. Resolves to
+ * `undefined` when the case has no fixture reference. Throws an error
+ * naming the case and the resolved path when the case references a fixture
+ * that is not on disk.
+ */
+export async function readFixtureArtifact(projectRoot, caseEntry) {
+    const location = fixturePathFor(projectRoot, caseEntry);
+    if (!location) {
+        return undefined;
+    }
+    const onDisk = await exists(location);
+    if (onDisk) {
+        return fs.readFile(location, "utf8");
+    }
+    throw new Error(`Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${location}`);
+}

package/dist/eval/report.js CHANGED Viewed

@@ -39,12 +39,25 @@ export function formatMarkdownReport(report) {
     lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
     lines.push(``);
     if (report.baselineDelta) {
+        const delta = report.baselineDelta;
         lines.push(`## Baseline delta`);
         lines.push(``);
-        lines.push(`- baseline: ${report.baselineDelta.baselineId}`);
-        lines.push(`- score delta: ${report.baselineDelta.scoreDelta.toFixed(4)}`);
-        lines.push(`- critical failures: ${report.baselineDelta.criticalFailures}`);
+        lines.push(`- baseline: ${delta.baselineId}`);
+        lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
+        lines.push(`- critical failures: ${delta.criticalFailures}`);
         lines.push(``);
+        if (delta.regressions.length > 0) {
+            lines.push(`### Regressions`);
+            lines.push(``);
+            lines.push(`| stage | case id | verifier | reason | prev | curr |`);
+            lines.push(`| --- | --- | --- | --- | --- | --- |`);
+            for (const reg of delta.regressions) {
+                const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
+                const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
+                lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
+            }
+            lines.push(``);
+        }
     }
     if (report.cases.length === 0) {
         lines.push(`## Cases`);

package/dist/eval/runner.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@ export interface RunEvalOptions {
     projectRoot: string;
     stage?: FlowStage;
     tier?: EvalTier;
-    /** When true, run only structural verifiers. Wave 7.1 wires actual verifiers. */
+    /** When true, run only structural verifiers (Wave 7.1). */
     schemaOnly?: boolean;
     /** When true, run structural + rule-based verifiers. Wave 7.2 wires rules. */
     rules?: boolean;
@@ -27,10 +27,6 @@ export interface DryRunSummary {
         }>;
     };
     plannedTier: EvalTier;
-    /**
-     * Waves 7.1–7.3 progressively flip these to `true`. Wave 7.0 is `false`
-     * across the board because no verifier is implemented yet.
-     */
     verifiersAvailable: {
         structural: boolean;
         rules: boolean;
@@ -40,14 +36,10 @@ export interface DryRunSummary {
     notes: string[];
 }
 /**
- * Wave 7.0 runner. Responsibilities:
- * - Load resolved config (defaults + file + env).
- * - Load corpus (empty on a fresh install).
- * - Validate that no verifier flag asks for a capability that does not exist yet.
- * - Return either a dry-run summary or an empty report.
- *
- * Waves 7.1+ will replace the "no verifiers available" branch with the real
- * verifier dispatch pipeline. The signature stays stable so CLI wiring does
- * not churn.
+ * Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
+ * active), runs structural verifiers against fixture-backed cases and loads
+ * per-stage baselines for regression comparison. Tier A/B/C agent loops
+ * still arrive in Waves 7.3+. Cases that declare no structural expectations
+ * are counted as skipped; cases that declare them but have no `fixture` (or
+ * whose fixture is missing on disk) are reported as failed.
  */
 export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;

package/dist/eval/runner.js CHANGED Viewed

@@ -1,23 +1,121 @@
 import { randomUUID } from "node:crypto";
 import { CCLAW_VERSION } from "../constants.js";
-import { loadCorpus } from "./corpus.js";
+import { FLOW_STAGES } from "../types.js";
+import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
+import { loadCorpus, readFixtureArtifact } from "./corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
+import { verifyStructural } from "./verifiers/structural.js";
 /** Count cases per stage; returns a plain object mapping stage to count. */
 function groupByStage(cases) {
     const counts = {};
     cases.forEach(({ stage }) => {
         counts[stage] = (counts[stage] ?? 0) + 1;
     });
     return counts;
 }
+/**
+ * Build the placeholder "pass" verifier result used when a case has nothing
+ * for the structural verifier to check. `details` is attached only when
+ * it is not `undefined`.
+ */
+function skeletonVerifierResult(message, details) {
+    const placeholder = {
+        kind: "structural",
+        id: "wave-7-1-no-structural-expected",
+        ok: true,
+        score: 1,
+        message
+    };
+    if (details !== undefined) {
+        placeholder.details = details;
+    }
+    return placeholder;
+}
+/**
+ * Run the structural verifier for a single case and return its case result.
+ *
+ * Outcomes, in the order they are decided:
+ * - no structural expectations declared: one placeholder pass flagged `skipped`;
+ * - reading the fixture throws: one failing `structural:fixture:missing` result;
+ * - fixture text available: the results of `verifyStructural`, or a
+ *   placeholder pass flagged `skipped` when it returns no checks;
+ * - expectations declared but no fixture reference: one failing
+ *   `structural:fixture:absent` result.
+ *
+ * The case passes when every collected verifier result has `ok` set.
+ */
+async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
+    const started = Date.now();
+    const structuralExpected = caseEntry.expected?.structural;
+    const verifierResults = [];
+    if (!structuralExpected || Object.keys(structuralExpected).length === 0) {
+        // No structural expectations declared — case is treated as "N/A" for this
+        // verifier kind; a placeholder pass keeps downstream math simple while
+        // making the situation visible in the report.
+        verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
+    }
+    else {
+        let artifact;
+        try {
+            artifact = await readFixtureArtifact(projectRoot, caseEntry);
+        }
+        catch (err) {
+            // A read failure is recorded as a failing verifier result instead of
+            // aborting the run; `artifact` stays undefined.
+            verifierResults.push({
+                kind: "structural",
+                id: "structural:fixture:missing",
+                ok: false,
+                score: 0,
+                message: err instanceof Error ? err.message : String(err),
+                details: { fixture: caseEntry.fixture }
+            });
+        }
+        if (artifact !== undefined) {
+            const results = verifyStructural(artifact, structuralExpected);
+            if (results.length === 0) {
+                verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
+            }
+            else {
+                verifierResults.push(...results);
+            }
+        }
+        // Reached only when the read resolved to `undefined` without throwing
+        // (the catch above would have pushed a result otherwise).
+        else if (verifierResults.length === 0) {
+            verifierResults.push({
+                kind: "structural",
+                id: "structural:fixture:absent",
+                ok: false,
+                score: 0,
+                message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
+                details: { fixtureProvided: false }
+            });
+        }
+    }
+    const allOk = verifierResults.every((r) => r.ok);
+    return {
+        caseId: caseEntry.id,
+        stage: caseEntry.stage,
+        tier: plannedTier,
+        passed: allOk,
+        durationMs: Date.now() - started,
+        verifierResults
+    };
+}
+/**
+ * Fold case results into the report summary. A case counts as skipped when
+ * its only verifier result carries `details.skipped === true`; otherwise it
+ * counts as passed or failed by its `passed` flag. Cost is summed over the
+ * cases that define `costUsd` and rounded to 6 decimal places.
+ */
+function reduceSummary(caseResults) {
+    const tally = { passed: 0, failed: 0, skipped: 0 };
+    let costUsd = 0;
+    let durationMs = 0;
+    const isSkipped = ({ verifierResults }) => verifierResults.length === 1 && verifierResults[0]?.details?.skipped === true;
+    for (const result of caseResults) {
+        durationMs += result.durationMs;
+        if (result.costUsd !== undefined) {
+            costUsd += result.costUsd;
+        }
+        if (isSkipped(result)) {
+            tally.skipped += 1;
+        }
+        else if (result.passed) {
+            tally.passed += 1;
+        }
+        else {
+            tally.failed += 1;
+        }
+    }
+    return {
+        totalCases: caseResults.length,
+        passed: tally.passed,
+        failed: tally.failed,
+        skipped: tally.skipped,
+        totalCostUsd: Number(costUsd.toFixed(6)),
+        totalDurationMs: durationMs
+    };
+}
+/** Stages that occur in the case results, listed in `FLOW_STAGES` order. */
+function stagesInResults(caseResults) {
+    const seen = new Set(caseResults.map((c) => c.stage));
+    return FLOW_STAGES.filter((stage) => seen.has(stage));
+}
 /**
- * Wave 7.0 runner. Responsibilities:
- * - Load resolved config (defaults + file + env).
- * - Load corpus (empty on a fresh install).
- * - Validate that no verifier flag asks for a capability that does not exist yet.
- * - Return either a dry-run summary or an empty report.
- *
- * Waves 7.1+ will replace the "no verifiers available" branch with the real
- * verifier dispatch pipeline. The signature stays stable so CLI wiring does
- * not churn.
+ * Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
+ * active), runs structural verifiers against fixture-backed cases and loads
+ * per-stage baselines for regression comparison. Tier A/B/C agent loops
+ * still arrive in Waves 7.3+. Cases that declare no structural expectations
+ * are counted as skipped; cases that declare them but have no `fixture` (or
+ * whose fixture is missing on disk) are reported as failed.
  */
 export async function runEval(options) {
     const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
@@ -25,10 +123,7 @@ export async function runEval(options) {
     const plannedTier = options.tier ?? config.defaultTier;
     const notes = [];
     if (corpus.length === 0) {
-        notes.push("Corpus is empty. Seed cases land in Wave 7.1 (`.cclaw/evals/corpus/<stage>/*.yaml`).");
-    }
-    if (options.schemaOnly) {
-        notes.push("--schema-only is accepted; structural verifiers wire up in Wave 7.1.");
+        notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
     if (options.rules) {
         notes.push("--rules is accepted; rule verifiers wire up in Wave 7.2.");
@@ -47,7 +142,7 @@ export async function runEval(options) {
             },
             plannedTier,
             verifiersAvailable: {
-                structural: false,
+                structural: true,
                 rules: false,
                 judge: false,
                 workflow: false
@@ -57,22 +152,13 @@ export async function runEval(options) {
         return summary;
     }
     const now = new Date().toISOString();
-    const caseResults = corpus.map((item) => ({
-        caseId: item.id,
-        stage: item.stage,
-        tier: plannedTier,
-        passed: false,
-        durationMs: 0,
-        verifierResults: [
-            {
-                kind: "structural",
-                id: "wave-7-0-skeleton",
-                ok: false,
-                message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
-                details: { skipped: true }
-            }
-        ]
-    }));
+    const caseResults = [];
+    for (const item of corpus) {
+        caseResults.push(await runCaseStructural(options.projectRoot, item, plannedTier));
+    }
+    const stages = stagesInResults(caseResults);
+    const baselines = await loadBaselinesByStage(options.projectRoot, stages);
+    const summary = reduceSummary(caseResults);
     const report = {
         schemaVersion: 1,
         generatedAt: now,
@@ -81,16 +167,12 @@ export async function runEval(options) {
         provider: config.provider,
         model: config.model,
         tier: plannedTier,
-        stages: options.stage ? [options.stage] : [],
+        stages,
         cases: caseResults,
-        summary: {
-            totalCases: caseResults.length,
-            passed: 0,
-            failed: 0,
-            skipped: caseResults.length,
-            totalCostUsd: 0,
-            totalDurationMs: 0
-        }
+        summary
     };
+    const baselineDelta = compareAgainstBaselines(report, baselines);
+    if (baselineDelta)
+        report.baselineDelta = baselineDelta;
     return report;
 }

package/dist/eval/types.d.ts CHANGED Viewed

@@ -27,6 +27,45 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
  */
 export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
 export type VerifierKind = (typeof VERIFIER_KINDS)[number];
+/**
+ * Structural expectations — deterministic, LLM-free checks against a single
+ * text artifact. Wave 7.1 implements all fields below; Wave 7.2 adds the
+ * sibling `rules` shape, Wave 7.3 adds `judge`.
+ *
+ * Every field is optional.
+ */
+export interface StructuralExpected {
+    /**
+     * Case-insensitive substrings that must each appear on at least one markdown
+     * heading line (line starting with `#`). Useful for "required sections".
+     */
+    requiredSections?: string[];
+    /**
+     * Case-insensitive substrings that must NOT appear anywhere in the body
+     * (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
+     */
+    forbiddenPatterns?: string[];
+    /** Inclusive minimum line count of the artifact body (frontmatter excluded). */
+    minLines?: number;
+    /** Inclusive maximum line count of the artifact body (frontmatter excluded). */
+    maxLines?: number;
+    /** Inclusive minimum character count of the artifact body. */
+    minChars?: number;
+    /** Inclusive maximum character count of the artifact body. */
+    maxChars?: number;
+    /**
+     * Keys that must appear in the leading YAML frontmatter (between a pair of
+     * `---` delimiters at the very top of the file). An artifact without
+     * frontmatter will fail every entry.
+     */
+    requiredFrontmatterKeys?: string[];
+}
+/** Superset of per-verifier expectation shapes. Only `structural` is wired in Wave 7.1. */
+export interface ExpectedShape {
+    /** Deterministic structural checks — Wave 7.1. */
+    structural?: StructuralExpected;
+    /** Rule-based (keyword/regex/traceability) checks — Wave 7.2. */
+    rules?: Record<string, unknown>;
+    /** LLM-judge rubrics — Wave 7.3. */
+    judge?: Record<string, unknown>;
+}
 /**
  * A single eval case describes one input scenario for one stage. Cases live in
  * `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
@@ -40,10 +79,10 @@ export interface EvalCase {
     /** Project files copied into the Tier B/C sandbox before the agent runs. */
     contextFiles?: string[];
     /**
-     * Optional expected-shape hints consumed by structural/rule verifiers.
-     * Left intentionally loose; verifiers in Waves 7.1–7.2 will narrow this.
+     * Typed expectation hints consumed by the structural/rules/judge verifiers.
+     * Each sub-shape is optional; missing sub-shapes skip that verifier tier.
      */
-    expected?: Record<string, unknown>;
+    expected?: ExpectedShape;
     /**
      * Path (relative to the corpus case file) of a pre-generated artifact used
      * when verifiers are exercised without a live agent loop. Primarily a Wave
@@ -91,11 +130,7 @@ export interface EvalReport {
         totalDurationMs: number;
     };
     /** Present when comparing against a saved baseline (Wave 7.1+). */
-    baselineDelta?: {
-        baselineId: string;
-        scoreDelta: number;
-        criticalFailures: number;
-    };
+    baselineDelta?: BaselineDelta;
 }
 /**
  * Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
@@ -134,3 +169,48 @@ export interface ResolvedEvalConfig extends EvalConfig {
     apiKey?: string;
     source: "default" | "file" | "env" | "file+env";
 }
+/**
+ * Frozen per-stage baseline used by regression gating (Wave 7.1). Baselines
+ * are committed to git; `cclaw eval --update-baseline --confirm` rewrites
+ * them. The shape is intentionally flat so a quick `git diff` reveals what
+ * changed between runs.
+ */
+export interface BaselineSnapshot {
+    /** Literal schema version; `1` is the only value this type admits. */
+    schemaVersion: 1;
+    /** Stage this snapshot covers (baselines are per-stage). */
+    stage: FlowStage;
+    /** NOTE(review): presumably a timestamp string written at snapshot time — confirm format against the writer. */
+    generatedAt: string;
+    /** NOTE(review): presumably the cclaw version that produced the snapshot — confirm against the writer. */
+    cclawVersion: string;
+    /** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
+    cases: Record<string, BaselineCaseEntry>;
+}
+/** Per-case record stored as a value of `BaselineSnapshot.cases`. */
+export interface BaselineCaseEntry {
+    /** Overall pass/fail flag recorded for the case. */
+    passed: boolean;
+    /** Individual verifier outcomes recorded for the case. */
+    verifierResults: BaselineVerifierEntry[];
+}
+/** One verifier outcome as stored inside a `BaselineCaseEntry`. */
+export interface BaselineVerifierEntry {
+    /** Identifier of the individual check. */
+    id: string;
+    /** Which verifier family produced the result. */
+    kind: VerifierKind;
+    /** Whether the check passed. */
+    ok: boolean;
+    /** Optional numeric score; may be absent. */
+    score?: number;
+}
+/**
+ * Delta between a fresh report and the saved baseline. Populated when
+ * baselines exist on disk and the run covers matching cases.
+ */
+export interface BaselineDelta {
+    /** NOTE(review): identifies the baseline compared against; its format is not visible here — confirm with the comparison code. */
+    baselineId: string;
+    /** Fresh-score − baseline-score, bounded to [-1, 1]. */
+    scoreDelta: number;
+    /** Count of checks that flipped from `ok:true` to `ok:false`. */
+    criticalFailures: number;
+    /** Per-case regression details for the Markdown report. */
+    regressions: BaselineRegression[];
+}
+/** One entry of `BaselineDelta.regressions`. */
+export interface BaselineRegression {
+    /** Id of the case the regression belongs to. */
+    caseId: string;
+    /** Stage of that case. */
+    stage: FlowStage;
+    /** Id of the verifier check involved. */
+    verifierId: string;
+    /** Category of regression. NOTE(review): exact trigger for each value is defined by the comparison code, not visible here. */
+    reason: "newly-failing" | "case-now-failing" | "score-drop";
+    /** Optional score from the baseline side — presumably set for score-based reasons; confirm. */
+    previousScore?: number;
+    /** Optional score from the fresh run — presumably set for score-based reasons; confirm. */
+    currentScore?: number;
+}

package/dist/eval/verifiers/structural.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import type { StructuralExpected, VerifierResult } from "../types.js";
+/** Result of separating an artifact's leading frontmatter from its body. */
+export interface ArtifactSplit {
+    /** True when both an opening and a closing `---` delimiter were found. */
+    hasFrontmatter: boolean;
+    /** Text between the delimiters; empty string when no frontmatter was found. */
+    frontmatterRaw: string;
+    /** Parsed frontmatter; optional, so it may be absent even when `hasFrontmatter` is true. */
+    frontmatterParsed?: Record<string, unknown>;
+    /** Text after the closing delimiter, or the whole artifact when no frontmatter was found. */
+    body: string;
+}
+/**
+ * Split an artifact into its leading frontmatter (if any) and its body.
+ *
+ * @param artifact Full artifact text.
+ * @returns The split result; see `ArtifactSplit`.
+ */
+export declare function splitFrontmatter(artifact: string): ArtifactSplit;
+/**
+ * Run every configured structural check against the artifact text.
+ * Returns [] when `expected` is undefined/empty so the runner can treat
+ * "no structural expectations" as "no verifier results" rather than "pass".
+ *
+ * @param artifact Full artifact text, frontmatter included.
+ * @param expected Structural expectations, or `undefined` for none.
+ * @returns One `VerifierResult` per individual check that was configured.
+ */
+export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];

package/dist/eval/verifiers/structural.js ADDED Viewed

@@ -0,0 +1,171 @@
+/**
+ * Structural verifier (Wave 7.1): deterministic, zero-LLM checks against a
+ * single markdown artifact. Each structural expectation produces one
+ * `VerifierResult` so baselines diff cleanly at the check level rather than
+ * lumping everything into a single boolean.
+ *
+ * Design notes:
+ *
+ * - Section and forbidden-pattern matching is case-insensitive. Authoring a
+ *   check as `"Directions"` matches `## Directions` and
+ *   `### directions-suggested`. Frontmatter keys are compared exactly.
+ * - Frontmatter detection is deliberately simple: the artifact must start at
+ *   byte 0 with `---` followed by a newline (LF or CRLF) and close on a
+ *   subsequent `---` line. Anything else is treated as "no frontmatter",
+ *   which fails every `requiredFrontmatterKeys` entry deterministically.
+ * - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
+ *   adds metadata does not accidentally drop the body below the floor.
+ * - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
+ *   all individual `ok` flags. This keeps Wave 7.1 deterministic; the 0..1
+ *   rubric scale shows up in Wave 7.3 (judge).
+ */
+import { parse as parseYaml } from "yaml";
+// Opening delimiter: `---` at the very start of the text, followed by LF or CRLF.
+const FRONTMATTER_OPEN = /^---\r?\n/;
+// Closing delimiter: a line break, `---`, an optional CR, then a line feed or end of input.
+const FRONTMATTER_CLOSE = /\r?\n---\r?(?:\n|$)/;
+/**
+ * Turn arbitrary text into a lowercase, hyphen-separated slug of at most
+ * 64 characters, for embedding in verifier result ids.
+ *
+ * Runs of characters outside `a-z0-9` become a single `-`, and one leading
+ * and one trailing `-` are removed before truncation.
+ *
+ * NOTE(review): truncation happens last, so a slug cut at 64 characters can
+ * still end in `-`. Distinct inputs can also map to the same slug (e.g.
+ * inputs differing only in case or punctuation), and input with no ASCII
+ * letters or digits yields an empty string.
+ *
+ * @param input Text to slugify.
+ * @returns The slug.
+ */
+function slugify(input) {
+    return input
+        .toLowerCase()
+        .replace(/[^a-z0-9]+/g, "-")
+        .replace(/(^-|-$)/g, "")
+        .slice(0, 64);
+}
+/**
+ * Split an artifact into leading YAML frontmatter and body.
+ *
+ * When the text does not begin with the opening delimiter, or no closing
+ * delimiter follows, the whole artifact is returned as `body` with
+ * `hasFrontmatter: false`.
+ *
+ * NOTE(review): the closing pattern needs a line break before `---`, so an
+ * empty block (`---` immediately followed by `---`) is not recognised and is
+ * treated as "no frontmatter".
+ *
+ * @param artifact Full artifact text.
+ * @returns `{ hasFrontmatter, frontmatterRaw, frontmatterParsed?, body }`.
+ */
+export function splitFrontmatter(artifact) {
+    if (!FRONTMATTER_OPEN.test(artifact)) {
+        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
+    }
+    // Drop the opening delimiter, then look for the first closing delimiter.
+    const afterOpen = artifact.replace(FRONTMATTER_OPEN, "");
+    const closeMatch = afterOpen.match(FRONTMATTER_CLOSE);
+    if (!closeMatch || closeMatch.index === undefined) {
+        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
+    }
+    const frontmatterRaw = afterOpen.slice(0, closeMatch.index);
+    // Body starts right after the full closing match (including its line break).
+    const body = afterOpen.slice(closeMatch.index + closeMatch[0].length);
+    let frontmatterParsed;
+    try {
+        const parsed = parseYaml(frontmatterRaw);
+        // Only a plain (non-array) object is kept; scalars, arrays and null are discarded.
+        if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
+            frontmatterParsed = parsed;
+        }
+    }
+    catch {
+        // A YAML parse error is not propagated: the frontmatter still counts
+        // as present, but without a parsed form.
+        frontmatterParsed = undefined;
+    }
+    return {
+        hasFrontmatter: true,
+        frontmatterRaw,
+        frontmatterParsed,
+        body
+    };
+}
+/**
+ * Collect the markdown heading lines of `body`.
+ *
+ * A line qualifies when, after leading whitespace is removed, it starts with
+ * one to six `#` characters followed by whitespace and at least one
+ * non-whitespace character. Returned lines keep their `#` markers.
+ *
+ * @param body Artifact body text.
+ * @returns Heading lines with leading whitespace stripped.
+ */
+function extractHeadingLines(body) {
+    return body
+        .split(/\r?\n/)
+        .map((line) => line.trimStart())
+        .filter((line) => /^#{1,6}\s+\S/.test(line));
+}
+/**
+ * Build a verifier result object of kind `"structural"`.
+ *
+ * @param id Identifier of the individual check.
+ * @param ok Whether the check passed; also determines `score` (1 or 0).
+ * @param message Human-readable outcome text.
+ * @param details Optional extra data; the `details` key is omitted entirely
+ *   when this argument is `undefined`.
+ * @returns The result object.
+ */
+function result(id, ok, message, details) {
+    return {
+        kind: "structural",
+        id,
+        ok,
+        score: ok ? 1 : 0,
+        message,
+        ...(details !== undefined ? { details } : {})
+    };
+}
+/**
+ * Produce one result per required section, passing when some heading line
+ * contains the section text (case-insensitive, section text trimmed).
+ *
+ * The comparison runs against the full heading line, `#` markers included.
+ *
+ * NOTE(review): a section that is empty after trimming matches any heading,
+ * because every string includes the empty string.
+ *
+ * @param sections Required section substrings.
+ * @param body Artifact body text.
+ * @returns Array of results, in the order of `sections`.
+ */
+function checkRequiredSections(sections, body) {
+    const headings = extractHeadingLines(body).map((line) => line.toLowerCase());
+    return sections.map((section) => {
+        const needle = section.toLowerCase().trim();
+        const found = headings.some((heading) => heading.includes(needle));
+        return result(`structural:section:${slugify(section)}`, found, found
+            ? `Section matching "${section}" present.`
+            : `No heading contains "${section}".`, { pattern: section, searchedHeadings: headings.length });
+    });
+}
+/**
+ * Produce one result per forbidden pattern, passing when the pattern does
+ * not occur in `body` (case-insensitive substring match).
+ *
+ * Unlike required sections, the pattern is not trimmed before matching.
+ *
+ * @param patterns Forbidden substrings.
+ * @param body Artifact body text.
+ * @returns Array of results, in the order of `patterns`; each carries the
+ *   occurrence count in `details.occurrences`.
+ */
+function checkForbiddenPatterns(patterns, body) {
+    // Lower-case the body once rather than once per pattern.
+    const bodyLower = body.toLowerCase();
+    return patterns.map((pattern) => {
+        const needle = pattern.toLowerCase();
+        const hits = countOccurrences(bodyLower, needle);
+        const ok = hits === 0;
+        return result(`structural:forbidden:${slugify(pattern)}`, ok, ok
+            ? `Pattern "${pattern}" absent (as required).`
+            : `Pattern "${pattern}" appears ${hits} time(s); remove.`, { pattern, occurrences: hits });
+    });
+}
+/**
+ * Count non-overlapping occurrences of `needle` in `haystack`.
+ *
+ * @param haystack Text to search.
+ * @param needle Substring to look for; an empty needle yields 0.
+ * @returns Number of occurrences.
+ */
+function countOccurrences(haystack, needle) {
+    // Guard: an empty needle would match at every index and never advance.
+    if (needle.length === 0)
+        return 0;
+    let index = 0;
+    let count = 0;
+    while (true) {
+        const at = haystack.indexOf(needle, index);
+        if (at < 0)
+            return count;
+        count += 1;
+        // Resume after the whole match so occurrences do not overlap.
+        index = at + needle.length;
+    }
+}
+/**
+ * Check the body's line and character counts against the configured bounds.
+ *
+ * Emits at most two results: `structural:length:lines` when `minLines` or
+ * `maxLines` is set, and `structural:length:chars` when `minChars` or
+ * `maxChars` is set. All bounds are inclusive; a missing bound is unbounded.
+ *
+ * @param expected Expectations object; only the four length fields are read.
+ * @param body Artifact body text.
+ * @returns Array of zero, one, or two results.
+ */
+function checkLengthBounds(expected, body) {
+    const results = [];
+    // An empty body is 0 lines; otherwise lines are the pieces between line
+    // breaks. NOTE(review): a trailing newline yields a final empty piece,
+    // so "a\n" counts as 2 lines.
+    const lineCount = body.length === 0 ? 0 : body.split(/\r?\n/).length;
+    // Character count is the string's `length` (UTF-16 code units).
+    const charCount = body.length;
+    if (expected.minLines !== undefined || expected.maxLines !== undefined) {
+        const min = expected.minLines;
+        const max = expected.maxLines;
+        const withinMin = min === undefined || lineCount >= min;
+        const withinMax = max === undefined || lineCount <= max;
+        const ok = withinMin && withinMax;
+        results.push(result("structural:length:lines", ok, ok
+            ? `Body has ${lineCount} line(s), within bounds.`
+            : buildOutOfRangeMessage("line", lineCount, min, max), { lineCount, minLines: min, maxLines: max }));
+    }
+    if (expected.minChars !== undefined || expected.maxChars !== undefined) {
+        const min = expected.minChars;
+        const max = expected.maxChars;
+        const withinMin = min === undefined || charCount >= min;
+        const withinMax = max === undefined || charCount <= max;
+        const ok = withinMin && withinMax;
+        results.push(result("structural:length:chars", ok, ok
+            ? `Body has ${charCount} char(s), within bounds.`
+            : buildOutOfRangeMessage("char", charCount, min, max), { charCount, minChars: min, maxChars: max }));
+    }
+    return results;
+}
+/**
+ * Format the failure message for a length check.
+ *
+ * @param unit Unit name in singular form (called here with "line" and "char").
+ * @param actual Measured count.
+ * @param min Lower bound, or `undefined` (rendered as "0").
+ * @param max Upper bound, or `undefined` (rendered as "∞").
+ * @returns The message text.
+ */
+function buildOutOfRangeMessage(unit, actual, min, max) {
+    const lo = min === undefined ? "0" : String(min);
+    const hi = max === undefined ? "∞" : String(max);
+    return `Body has ${actual} ${unit}(s); expected ${lo}..${hi}.`;
+}
+/**
+ * Produce one result per required frontmatter key.
+ *
+ * When the artifact has no frontmatter, or the frontmatter has no parsed
+ * form, every key fails. Otherwise a key passes when it is among the
+ * top-level keys of the parsed frontmatter (exact, case-sensitive match).
+ *
+ * @param keys Required key names.
+ * @param split Object read for `hasFrontmatter` and `frontmatterParsed`.
+ * @returns Array of results, in the order of `keys`.
+ */
+function checkFrontmatterKeys(keys, split) {
+    if (!split.hasFrontmatter || !split.frontmatterParsed) {
+        return keys.map((key) => result(`structural:frontmatter:${slugify(key)}`, false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: split.hasFrontmatter }));
+    }
+    const present = new Set(Object.keys(split.frontmatterParsed));
+    return keys.map((key) => {
+        const ok = present.has(key);
+        return result(`structural:frontmatter:${slugify(key)}`, ok, ok ? `Frontmatter key "${key}" present.` : `Frontmatter key "${key}" missing.`, { key });
+    });
+}
+/**
+ * Run every configured structural check against the artifact text.
+ * Returns [] when `expected` is undefined/empty so the runner can treat
+ * "no structural expectations" as "no verifier results" rather than "pass".
+ *
+ * Results are appended in a fixed order: required sections, forbidden
+ * patterns, length bounds, then frontmatter keys. Section, pattern and
+ * length checks see only the body (frontmatter removed).
+ *
+ * @param artifact Full artifact text, frontmatter included.
+ * @param expected Structural expectations, or a falsy value for none.
+ * @returns Array of verifier results, one per individual check.
+ */
+export function verifyStructural(artifact, expected) {
+    if (!expected)
+        return [];
+    const split = splitFrontmatter(artifact);
+    const results = [];
+    // Empty arrays are treated the same as missing fields: no results emitted.
+    if (expected.requiredSections?.length) {
+        results.push(...checkRequiredSections(expected.requiredSections, split.body));
+    }
+    if (expected.forbiddenPatterns?.length) {
+        results.push(...checkForbiddenPatterns(expected.forbiddenPatterns, split.body));
+    }
+    // Called unconditionally; it emits nothing when no length bound is set.
+    results.push(...checkLengthBounds(expected, split.body));
+    if (expected.requiredFrontmatterKeys?.length) {
+        results.push(...checkFrontmatterKeys(expected.requiredFrontmatterKeys, split));
+    }
+    return results;
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "cclaw-cli",
-  "version": "0.22.0",
+  "version": "0.23.0",
   "description": "Installer-first flow toolkit for coding agents",
   "type": "module",
   "bin": {