cclaw-cli 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts CHANGED
@@ -24,6 +24,8 @@ interface ParsedArgs {
24
24
  evalJudge?: boolean;
25
25
  evalJson?: boolean;
26
26
  evalNoWrite?: boolean;
27
+ evalUpdateBaseline?: boolean;
28
+ evalConfirm?: boolean;
27
29
  showHelp?: boolean;
28
30
  showVersion?: boolean;
29
31
  }
package/dist/cli.js CHANGED
@@ -14,6 +14,7 @@ import { createDefaultConfig, createProfileConfig } from "./config.js";
14
14
  import { detectHarnesses } from "./init-detect.js";
15
15
  import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
16
  import { runEval } from "./eval/runner.js";
17
+ import { writeBaselinesFromReport } from "./eval/baseline.js";
17
18
  import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
18
19
  import { EVAL_TIERS } from "./eval/types.js";
19
20
  import { FLOW_STAGES } from "./types.js";
@@ -53,15 +54,17 @@ Commands:
53
54
  Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
54
55
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
55
56
  --retro-reason=<t> Reason for bypassing retro gate.
56
- eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
57
- Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
58
- --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
59
- --schema-only Run only structural verifiers (Wave 7.1).
60
- --rules Run structural + rule verifiers (Wave 7.2).
61
- --judge Include LLM judging (Wave 7.3; requires API key).
62
- --dry-run Validate config + corpus, print summary, do not execute.
63
- --json Emit machine-readable JSON on stdout.
64
- --no-write Skip writing the report to .cclaw/evals/reports/.
57
+ eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.1: structural verifier).
58
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
59
+ --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
60
+ --schema-only Run only structural verifiers (Wave 7.1, default).
61
+ --rules Run structural + rule verifiers (Wave 7.2).
62
+ --judge Include LLM judging (Wave 7.3; requires API key).
63
+ --dry-run Validate config + corpus, print summary, do not execute.
64
+ --json Emit machine-readable JSON on stdout.
65
+ --no-write Skip writing the report to .cclaw/evals/reports/.
66
+ --update-baseline Overwrite baselines from the current run (requires --confirm).
67
+ --confirm Acknowledge --update-baseline (prevents accidental resets).
65
68
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
66
69
  uninstall Remove .cclaw runtime and the generated harness shim files.
67
70
 
@@ -453,6 +456,14 @@ function parseArgs(argv) {
453
456
  parsed.evalNoWrite = true;
454
457
  continue;
455
458
  }
459
+ if (flag === "--update-baseline") {
460
+ parsed.evalUpdateBaseline = true;
461
+ continue;
462
+ }
463
+ if (flag === "--confirm") {
464
+ parsed.evalConfirm = true;
465
+ continue;
466
+ }
456
467
  }
457
468
  // `--json` is shared between doctor and eval. Disambiguate by command.
458
469
  if (parsed.command === "eval" && parsed.doctorJson === true) {
@@ -592,22 +603,42 @@ async function runCommand(parsed, ctx) {
592
603
  }
593
604
  return 0;
594
605
  }
606
+ if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
607
+ error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
608
+ return 1;
609
+ }
610
+ if (parsed.evalUpdateBaseline === true) {
611
+ if (result.summary.failed > 0) {
612
+ error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
613
+ return 1;
614
+ }
615
+ const written = await writeBaselinesFromReport(ctx.cwd, result);
616
+ for (const file of written) {
617
+ info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
618
+ }
619
+ }
595
620
  if (parsed.evalNoWrite !== true) {
596
621
  const jsonPath = await writeJsonReport(ctx.cwd, result);
597
622
  const mdPath = await writeMarkdownReport(ctx.cwd, result);
598
623
  info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
599
624
  info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
600
625
  }
626
+ const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
601
627
  if (parsed.evalJson === true) {
602
628
  ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
603
629
  }
604
630
  else {
631
+ const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
605
632
  ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
606
633
  `${result.summary.passed} passed, ` +
607
634
  `${result.summary.failed} failed, ` +
608
- `${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
635
+ `${result.summary.skipped} skipped${regressionNote}\n`);
609
636
  }
610
- return result.summary.failed > 0 ? 1 : 0;
637
+ if (result.summary.failed > 0)
638
+ return 1;
639
+ if (regressionCount > 0)
640
+ return 1;
641
+ return 0;
611
642
  }
612
643
  if (command === "archive") {
613
644
  const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
@@ -0,0 +1,14 @@
1
+ import type { FlowStage } from "../types.js";
2
+ import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
3
+ export declare const BASELINE_SCHEMA_VERSION = 1;
4
+ export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
5
+ export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
6
+ export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
7
+ export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
8
+ /**
9
+ * Compare a freshly computed report against loaded baselines. If no baseline
10
+ * exists for a stage covered by the report, that stage contributes zero
11
+ * regressions (first run of that stage). The current report drives the comparison: cases and verifiers without a baseline counterpart are ignored.
12
+ */
13
+ export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
14
+ export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;
@@ -0,0 +1,209 @@
1
+ /**
2
+ * Baseline I/O + regression comparison (Wave 7.1).
3
+ *
4
+ * Layout on disk (committed):
5
+ *
6
+ * .cclaw/evals/baselines/<stage>.json
7
+ *
8
+ * Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
9
+ * regressions by comparing per-verifier `ok` flags across runs: any verifier
10
+ * that was `ok:true` in the baseline and is `ok:false` now counts as a
11
+ * critical failure. A case whose aggregate `passed` flipped from true to
12
+ * false is flagged as `case-now-failing` regardless of per-verifier churn. Other score decreases are recorded as `score-drop` and are not counted as critical.
13
+ *
14
+ * Writes are gated behind an explicit `--update-baseline --confirm` pair at
15
+ * the CLI layer so accidental resets do not slip into PRs.
16
+ */
17
+ import fs from "node:fs/promises";
18
+ import path from "node:path";
19
+ import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
20
+ import { exists } from "../fs-utils.js";
21
+ import { FLOW_STAGES } from "../types.js";
22
+ export const BASELINE_SCHEMA_VERSION = 1;
23
+ function baselinePath(projectRoot, stage) {
24
+ return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
25
+ }
26
+ export async function loadBaseline(projectRoot, stage) {
27
+ const filePath = baselinePath(projectRoot, stage);
28
+ if (!(await exists(filePath)))
29
+ return null;
30
+ const raw = await fs.readFile(filePath, "utf8");
31
+ let parsed;
32
+ try {
33
+ parsed = JSON.parse(raw);
34
+ }
35
+ catch (err) {
36
+ throw new Error(`Invalid baseline at ${filePath}: ${err instanceof Error ? err.message : String(err)}`);
37
+ }
38
+ if (!isBaseline(parsed, stage)) {
39
+ throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
40
+ }
41
+ return parsed;
42
+ }
43
+ function isBaseline(value, stage) {
44
+ if (!value || typeof value !== "object")
45
+ return false;
46
+ const candidate = value;
47
+ if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION)
48
+ return false;
49
+ if (candidate.stage !== stage)
50
+ return false;
51
+ if (typeof candidate.generatedAt !== "string")
52
+ return false;
53
+ if (typeof candidate.cclawVersion !== "string")
54
+ return false;
55
+ if (!candidate.cases || typeof candidate.cases !== "object")
56
+ return false;
57
+ return true;
58
+ }
59
+ export async function loadBaselinesByStage(projectRoot, stages) {
60
+ const out = new Map();
61
+ for (const stage of stages) {
62
+ const snapshot = await loadBaseline(projectRoot, stage);
63
+ if (snapshot)
64
+ out.set(stage, snapshot);
65
+ }
66
+ return out;
67
+ }
68
+ function entryFromResult(result) {
69
+ const verifierResults = result.verifierResults.map((v) => ({
70
+ id: v.id,
71
+ kind: v.kind,
72
+ ok: v.ok,
73
+ ...(v.score !== undefined ? { score: v.score } : {})
74
+ }));
75
+ return { passed: result.passed, verifierResults };
76
+ }
77
+ export function buildBaselineForStage(stage, report) {
78
+ const stageCases = report.cases.filter((c) => c.stage === stage);
79
+ const cases = {};
80
+ for (const c of stageCases) {
81
+ cases[c.caseId] = entryFromResult(c);
82
+ }
83
+ return {
84
+ schemaVersion: BASELINE_SCHEMA_VERSION,
85
+ stage,
86
+ generatedAt: new Date().toISOString(),
87
+ cclawVersion: CCLAW_VERSION,
88
+ cases
89
+ };
90
+ }
91
+ export async function writeBaselinesFromReport(projectRoot, report) {
92
+ const written = [];
93
+ const stages = new Set(report.cases.map((c) => c.stage));
94
+ for (const stage of stages) {
95
+ const snapshot = buildBaselineForStage(stage, report);
96
+ const file = baselinePath(projectRoot, stage);
97
+ await fs.mkdir(path.dirname(file), { recursive: true });
98
+ await fs.writeFile(file, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
99
+ written.push(file);
100
+ }
101
+ return written.sort();
102
+ }
103
+ function verifierMap(entries) {
104
+ const out = new Map();
105
+ for (const entry of entries) {
106
+ out.set(entry.id, entry);
107
+ }
108
+ return out;
109
+ }
110
+ function computePassRate(cases) {
111
+ if (cases.length === 0)
112
+ return 1;
113
+ const passed = cases.filter((c) => c.passed).length;
114
+ return passed / cases.length;
115
+ }
116
+ function baselinePassRate(snapshot) {
117
+ const entries = Object.values(snapshot.cases);
118
+ if (entries.length === 0)
119
+ return 1;
120
+ const passed = entries.filter((e) => e.passed).length;
121
+ return passed / entries.length;
122
+ }
123
+ /**
124
+ * Compare a freshly computed report against loaded baselines. If no baseline
125
+ * exists for a stage covered by the report, that stage contributes zero
126
+ * regressions (first run of that stage). The current report drives the comparison: cases and verifiers without a baseline counterpart are ignored.
127
+ */
128
+ export function compareAgainstBaselines(report, baselines) {
129
+ if (baselines.size === 0)
130
+ return undefined;
131
+ const regressions = [];
132
+ const caseResultsByStage = new Map();
133
+ for (const c of report.cases) {
134
+ const bucket = caseResultsByStage.get(c.stage) ?? [];
135
+ bucket.push(c);
136
+ caseResultsByStage.set(c.stage, bucket);
137
+ }
138
+ let baselineTotalPassRate = 0;
139
+ let baselineStagesCounted = 0;
140
+ for (const [stage, snapshot] of baselines) {
141
+ const current = caseResultsByStage.get(stage) ?? [];
142
+ baselineTotalPassRate += baselinePassRate(snapshot);
143
+ baselineStagesCounted += 1;
144
+ for (const caseResult of current) {
145
+ const baselineEntry = snapshot.cases[caseResult.caseId];
146
+ if (!baselineEntry)
147
+ continue;
148
+ if (baselineEntry.passed && !caseResult.passed) {
149
+ regressions.push({
150
+ caseId: caseResult.caseId,
151
+ stage,
152
+ verifierId: "<case>",
153
+ reason: "case-now-failing",
154
+ previousScore: 1,
155
+ currentScore: 0
156
+ });
157
+ }
158
+ const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
159
+ for (const currentVerifier of caseResult.verifierResults) {
160
+ const prev = baselineVerifiers.get(currentVerifier.id);
161
+ if (!prev)
162
+ continue;
163
+ if (prev.ok && !currentVerifier.ok) {
164
+ regressions.push({
165
+ caseId: caseResult.caseId,
166
+ stage,
167
+ verifierId: currentVerifier.id,
168
+ reason: "newly-failing",
169
+ previousScore: prev.score ?? 1,
170
+ currentScore: currentVerifier.score ?? 0
171
+ });
172
+ }
173
+ else if (prev.score !== undefined &&
174
+ currentVerifier.score !== undefined &&
175
+ currentVerifier.score < prev.score) {
176
+ regressions.push({
177
+ caseId: caseResult.caseId,
178
+ stage,
179
+ verifierId: currentVerifier.id,
180
+ reason: "score-drop",
181
+ previousScore: prev.score,
182
+ currentScore: currentVerifier.score
183
+ });
184
+ }
185
+ }
186
+ }
187
+ }
188
+ const currentPassRate = computePassRate(report.cases);
189
+ const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
190
+ const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
191
+ const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
192
+ const baselineStages = [...baselines.keys()].sort().join(",");
193
+ return {
194
+ baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
195
+ scoreDelta,
196
+ criticalFailures,
197
+ regressions
198
+ };
199
+ }
200
+ export function listBaselineStages(projectRoot) {
201
+ const root = path.join(projectRoot, EVALS_ROOT, "baselines");
202
+ return fs
203
+ .readdir(root, { withFileTypes: true })
204
+ .then((entries) => entries
205
+ .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
206
+ .map((entry) => entry.name.replace(/\.json$/, ""))
207
+ .filter((name) => FLOW_STAGES.includes(name)))
208
+ .catch(() => []);
209
+ }
@@ -2,7 +2,18 @@ import type { FlowStage } from "../types.js";
2
2
  import type { EvalCase } from "./types.js";
3
3
  /**
4
4
  * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
5
- * single stage. Returns an empty array for a fresh install (Wave 7.0 ships
6
- * without seed cases; corpus is authored in Wave 7.1+).
5
+ * single stage. Returns an empty array for a fresh install.
7
6
  */
8
7
  export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
8
+ /**
9
+ * Resolve a case's `fixture` path to an absolute filesystem path. The fixture
10
+ * field is interpreted relative to the case's stage directory (i.e., a
11
+ * sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
12
+ */
13
+ export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
14
+ /**
15
+ * Read the fixture artifact text for a case. Returns `undefined` if the case
16
+ * has no fixture reference. Throws a descriptive error if the case declares
17
+ * a fixture path that is missing on disk — Wave 7.1 fixtures ship alongside cases.
18
+ */
19
+ export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
@@ -12,6 +12,76 @@ function corpusError(filePath, reason) {
12
12
  function isRecord(value) {
13
13
  return typeof value === "object" && value !== null && !Array.isArray(value);
14
14
  }
15
+ function readStringArray(filePath, context, value) {
16
+ if (value === undefined)
17
+ return undefined;
18
+ if (!Array.isArray(value) || value.some((item) => typeof item !== "string")) {
19
+ throw corpusError(filePath, `"${context}" must be an array of strings`);
20
+ }
21
+ return value;
22
+ }
23
+ function readNonNegativeInteger(filePath, context, value) {
24
+ if (value === undefined)
25
+ return undefined;
26
+ if (typeof value !== "number" || !Number.isFinite(value) || value < 0 || !Number.isInteger(value)) {
27
+ throw corpusError(filePath, `"${context}" must be a non-negative integer`);
28
+ }
29
+ return value;
30
+ }
31
+ function parseStructural(filePath, raw) {
32
+ if (raw === undefined)
33
+ return undefined;
34
+ if (!isRecord(raw)) {
35
+ throw corpusError(filePath, `"expected.structural" must be a mapping`);
36
+ }
37
+ const requiredSections = readStringArray(filePath, "expected.structural.required_sections", raw.required_sections ?? raw.requiredSections);
38
+ const forbiddenPatterns = readStringArray(filePath, "expected.structural.forbidden_patterns", raw.forbidden_patterns ?? raw.forbiddenPatterns);
39
+ const requiredFrontmatterKeys = readStringArray(filePath, "expected.structural.required_frontmatter_keys", raw.required_frontmatter_keys ?? raw.requiredFrontmatterKeys);
40
+ const minLines = readNonNegativeInteger(filePath, "expected.structural.min_lines", raw.min_lines ?? raw.minLines);
41
+ const maxLines = readNonNegativeInteger(filePath, "expected.structural.max_lines", raw.max_lines ?? raw.maxLines);
42
+ const minChars = readNonNegativeInteger(filePath, "expected.structural.min_chars", raw.min_chars ?? raw.minChars);
43
+ const maxChars = readNonNegativeInteger(filePath, "expected.structural.max_chars", raw.max_chars ?? raw.maxChars);
44
+ const structural = {};
45
+ if (requiredSections)
46
+ structural.requiredSections = requiredSections;
47
+ if (forbiddenPatterns)
48
+ structural.forbiddenPatterns = forbiddenPatterns;
49
+ if (requiredFrontmatterKeys)
50
+ structural.requiredFrontmatterKeys = requiredFrontmatterKeys;
51
+ if (minLines !== undefined)
52
+ structural.minLines = minLines;
53
+ if (maxLines !== undefined)
54
+ structural.maxLines = maxLines;
55
+ if (minChars !== undefined)
56
+ structural.minChars = minChars;
57
+ if (maxChars !== undefined)
58
+ structural.maxChars = maxChars;
59
+ return structural;
60
+ }
61
+ function parseExpected(filePath, raw) {
62
+ if (raw === undefined)
63
+ return undefined;
64
+ if (!isRecord(raw)) {
65
+ throw corpusError(filePath, `"expected" must be a mapping`);
66
+ }
67
+ const shape = {};
68
+ const structural = parseStructural(filePath, raw.structural);
69
+ if (structural)
70
+ shape.structural = structural;
71
+ if (raw.rules !== undefined) {
72
+ if (!isRecord(raw.rules)) {
73
+ throw corpusError(filePath, `"expected.rules" must be a mapping`);
74
+ }
75
+ shape.rules = raw.rules;
76
+ }
77
+ if (raw.judge !== undefined) {
78
+ if (!isRecord(raw.judge)) {
79
+ throw corpusError(filePath, `"expected.judge" must be a mapping`);
80
+ }
81
+ shape.judge = raw.judge;
82
+ }
83
+ return Object.keys(shape).length === 0 ? undefined : shape;
84
+ }
15
85
  function validateCase(filePath, raw) {
16
86
  if (!isRecord(raw)) {
17
87
  throw corpusError(filePath, "top-level value must be a mapping");
@@ -28,17 +98,8 @@ function validateCase(filePath, raw) {
28
98
  if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
29
99
  throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
30
100
  }
31
- const contextFilesRaw = raw.context_files ?? raw.contextFiles;
32
- let contextFiles;
33
- if (contextFilesRaw !== undefined) {
34
- if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
35
- throw corpusError(filePath, `"context_files" must be an array of strings`);
36
- }
37
- contextFiles = contextFilesRaw;
38
- }
39
- const expected = raw.expected !== undefined && isRecord(raw.expected)
40
- ? raw.expected
41
- : undefined;
101
+ const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
102
+ const expected = parseExpected(filePath, raw.expected);
42
103
  const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
43
104
  return {
44
105
  id: id.trim(),
@@ -51,8 +112,7 @@ function validateCase(filePath, raw) {
51
112
  }
52
113
  /**
53
114
  * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
54
- * single stage. Returns an empty array for a fresh install (Wave 7.0 ships
55
- * without seed cases; corpus is authored in Wave 7.1+).
115
+ * single stage. Returns an empty array for a fresh install.
56
116
  */
57
117
  export async function loadCorpus(projectRoot, stage) {
58
118
  const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
@@ -89,3 +149,27 @@ export async function loadCorpus(projectRoot, stage) {
89
149
  cases.sort((a, b) => a.stage.localeCompare(b.stage) || a.id.localeCompare(b.id));
90
150
  return cases;
91
151
  }
152
+ /**
153
+ * Resolve a case's `fixture` path to an absolute filesystem path. The fixture
154
+ * field is interpreted relative to the case's stage directory (i.e., a
155
+ * sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
156
+ */
157
+ export function fixturePathFor(projectRoot, caseEntry) {
158
+ if (!caseEntry.fixture)
159
+ return undefined;
160
+ return path.resolve(projectRoot, EVALS_ROOT, "corpus", caseEntry.stage, caseEntry.fixture);
161
+ }
162
+ /**
163
+ * Read the fixture artifact text for a case. Returns `undefined` if the case
164
+ * has no fixture reference. Throws a descriptive error if the case declares
165
+ * a fixture path that is missing on disk — Wave 7.1 fixtures ship alongside cases.
166
+ */
167
+ export async function readFixtureArtifact(projectRoot, caseEntry) {
168
+ const fixturePath = fixturePathFor(projectRoot, caseEntry);
169
+ if (!fixturePath)
170
+ return undefined;
171
+ if (!(await exists(fixturePath))) {
172
+ throw new Error(`Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${fixturePath}`);
173
+ }
174
+ return fs.readFile(fixturePath, "utf8");
175
+ }
@@ -39,12 +39,25 @@ export function formatMarkdownReport(report) {
39
39
  lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
40
40
  lines.push(``);
41
41
  if (report.baselineDelta) {
42
+ const delta = report.baselineDelta;
42
43
  lines.push(`## Baseline delta`);
43
44
  lines.push(``);
44
- lines.push(`- baseline: ${report.baselineDelta.baselineId}`);
45
- lines.push(`- score delta: ${report.baselineDelta.scoreDelta.toFixed(4)}`);
46
- lines.push(`- critical failures: ${report.baselineDelta.criticalFailures}`);
45
+ lines.push(`- baseline: ${delta.baselineId}`);
46
+ lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
47
+ lines.push(`- critical failures: ${delta.criticalFailures}`);
47
48
  lines.push(``);
49
+ if (delta.regressions.length > 0) {
50
+ lines.push(`### Regressions`);
51
+ lines.push(``);
52
+ lines.push(`| stage | case id | verifier | reason | prev | curr |`);
53
+ lines.push(`| --- | --- | --- | --- | --- | --- |`);
54
+ for (const reg of delta.regressions) {
55
+ const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
56
+ const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
57
+ lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
58
+ }
59
+ lines.push(``);
60
+ }
48
61
  }
49
62
  if (report.cases.length === 0) {
50
63
  lines.push(`## Cases`);
@@ -4,7 +4,7 @@ export interface RunEvalOptions {
4
4
  projectRoot: string;
5
5
  stage?: FlowStage;
6
6
  tier?: EvalTier;
7
- /** When true, run only structural verifiers. Wave 7.1 wires actual verifiers. */
7
+ /** When true, run only structural verifiers (Wave 7.1). */
8
8
  schemaOnly?: boolean;
9
9
  /** When true, run structural + rule-based verifiers. Wave 7.2 wires rules. */
10
10
  rules?: boolean;
@@ -27,10 +27,6 @@ export interface DryRunSummary {
27
27
  }>;
28
28
  };
29
29
  plannedTier: EvalTier;
30
- /**
31
- * Waves 7.1–7.3 progressively flip these to `true`. Wave 7.0 is `false`
32
- * across the board because no verifier is implemented yet.
33
- */
34
30
  verifiersAvailable: {
35
31
  structural: boolean;
36
32
  rules: boolean;
@@ -40,14 +36,10 @@ export interface DryRunSummary {
40
36
  notes: string[];
41
37
  }
42
38
  /**
43
- * Wave 7.0 runner. Responsibilities:
44
- * - Load resolved config (defaults + file + env).
45
- * - Load corpus (empty on a fresh install).
46
- * - Validate that no verifier flag asks for a capability that does not exist yet.
47
- * - Return either a dry-run summary or an empty report.
48
- *
49
- * Waves 7.1+ will replace the "no verifiers available" branch with the real
50
- * verifier dispatch pipeline. The signature stays stable so CLI wiring does
51
- * not churn.
39
+ * Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
40
+ * active), runs structural verifiers against fixture-backed cases and loads
41
+ * per-stage baselines for regression comparison. Tier A/B/C agent loops
42
+ * still arrive in Waves 7.3+; until then cases without structural expectations
43
+ * are marked as skipped, while cases that declare them but lack a `fixture` fail.
52
44
  */
53
45
  export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
@@ -1,23 +1,121 @@
1
1
  import { randomUUID } from "node:crypto";
2
2
  import { CCLAW_VERSION } from "../constants.js";
3
- import { loadCorpus } from "./corpus.js";
3
+ import { FLOW_STAGES } from "../types.js";
4
+ import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
5
+ import { loadCorpus, readFixtureArtifact } from "./corpus.js";
4
6
  import { loadEvalConfig } from "./config-loader.js";
7
+ import { verifyStructural } from "./verifiers/structural.js";
5
8
  function groupByStage(cases) {
6
9
  return cases.reduce((acc, item) => {
7
10
  acc[item.stage] = (acc[item.stage] ?? 0) + 1;
8
11
  return acc;
9
12
  }, {});
10
13
  }
14
+ function skeletonVerifierResult(message, details) {
15
+ return {
16
+ kind: "structural",
17
+ id: "wave-7-1-no-structural-expected",
18
+ ok: true,
19
+ score: 1,
20
+ message,
21
+ ...(details !== undefined ? { details } : {})
22
+ };
23
+ }
24
+ async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
25
+ const started = Date.now();
26
+ const structuralExpected = caseEntry.expected?.structural;
27
+ const verifierResults = [];
28
+ if (!structuralExpected || Object.keys(structuralExpected).length === 0) {
29
+ // No structural expectations declared — case is treated as "N/A" for this
30
+ // verifier kind; a placeholder pass keeps downstream math simple while
31
+ // making the situation visible in the report.
32
+ verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
33
+ }
34
+ else {
35
+ let artifact;
36
+ try {
37
+ artifact = await readFixtureArtifact(projectRoot, caseEntry);
38
+ }
39
+ catch (err) {
40
+ verifierResults.push({
41
+ kind: "structural",
42
+ id: "structural:fixture:missing",
43
+ ok: false,
44
+ score: 0,
45
+ message: err instanceof Error ? err.message : String(err),
46
+ details: { fixture: caseEntry.fixture }
47
+ });
48
+ }
49
+ if (artifact !== undefined) {
50
+ const results = verifyStructural(artifact, structuralExpected);
51
+ if (results.length === 0) {
52
+ verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
53
+ }
54
+ else {
55
+ verifierResults.push(...results);
56
+ }
57
+ }
58
+ else if (verifierResults.length === 0) {
59
+ verifierResults.push({
60
+ kind: "structural",
61
+ id: "structural:fixture:absent",
62
+ ok: false,
63
+ score: 0,
64
+ message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
65
+ details: { fixtureProvided: false }
66
+ });
67
+ }
68
+ }
69
+ const allOk = verifierResults.every((r) => r.ok);
70
+ return {
71
+ caseId: caseEntry.id,
72
+ stage: caseEntry.stage,
73
+ tier: plannedTier,
74
+ passed: allOk,
75
+ durationMs: Date.now() - started,
76
+ verifierResults
77
+ };
78
+ }
79
+ function reduceSummary(caseResults) {
80
+ let passed = 0;
81
+ let failed = 0;
82
+ let skipped = 0;
83
+ let totalCostUsd = 0;
84
+ let totalDurationMs = 0;
85
+ for (const c of caseResults) {
86
+ totalDurationMs += c.durationMs;
87
+ if (c.costUsd !== undefined)
88
+ totalCostUsd += c.costUsd;
89
+ if (c.verifierResults.length === 1 && c.verifierResults[0]?.details?.skipped === true) {
90
+ skipped += 1;
91
+ continue;
92
+ }
93
+ if (c.passed)
94
+ passed += 1;
95
+ else
96
+ failed += 1;
97
+ }
98
+ return {
99
+ totalCases: caseResults.length,
100
+ passed,
101
+ failed,
102
+ skipped,
103
+ totalCostUsd: Number(totalCostUsd.toFixed(6)),
104
+ totalDurationMs
105
+ };
106
+ }
107
+ function stagesInResults(caseResults) {
108
+ const set = new Set();
109
+ for (const c of caseResults)
110
+ set.add(c.stage);
111
+ return FLOW_STAGES.filter((s) => set.has(s));
112
+ }
11
113
  /**
12
- * Wave 7.0 runner. Responsibilities:
13
- * - Load resolved config (defaults + file + env).
14
- * - Load corpus (empty on a fresh install).
15
- * - Validate that no verifier flag asks for a capability that does not exist yet.
16
- * - Return either a dry-run summary or an empty report.
17
- *
18
- * Waves 7.1+ will replace the "no verifiers available" branch with the real
19
- * verifier dispatch pipeline. The signature stays stable so CLI wiring does
20
- * not churn.
114
+ * Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
115
+ * active), runs structural verifiers against fixture-backed cases and loads
116
+ * per-stage baselines for regression comparison. Tier A/B/C agent loops
117
+ * still arrive in Waves 7.3+; until then cases without structural expectations
118
+ * are marked as skipped, while cases that declare them but lack a `fixture` fail.
21
119
  */
22
120
  export async function runEval(options) {
23
121
  const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
@@ -25,10 +123,7 @@ export async function runEval(options) {
25
123
  const plannedTier = options.tier ?? config.defaultTier;
26
124
  const notes = [];
27
125
  if (corpus.length === 0) {
28
- notes.push("Corpus is empty. Seed cases land in Wave 7.1 (`.cclaw/evals/corpus/<stage>/*.yaml`).");
29
- }
30
- if (options.schemaOnly) {
31
- notes.push("--schema-only is accepted; structural verifiers wire up in Wave 7.1.");
126
+ notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
32
127
  }
33
128
  if (options.rules) {
34
129
  notes.push("--rules is accepted; rule verifiers wire up in Wave 7.2.");
@@ -47,7 +142,7 @@ export async function runEval(options) {
47
142
  },
48
143
  plannedTier,
49
144
  verifiersAvailable: {
50
- structural: false,
145
+ structural: true,
51
146
  rules: false,
52
147
  judge: false,
53
148
  workflow: false
@@ -57,22 +152,13 @@ export async function runEval(options) {
57
152
  return summary;
58
153
  }
59
154
  const now = new Date().toISOString();
60
- const caseResults = corpus.map((item) => ({
61
- caseId: item.id,
62
- stage: item.stage,
63
- tier: plannedTier,
64
- passed: false,
65
- durationMs: 0,
66
- verifierResults: [
67
- {
68
- kind: "structural",
69
- id: "wave-7-0-skeleton",
70
- ok: false,
71
- message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
72
- details: { skipped: true }
73
- }
74
- ]
75
- }));
155
+ const caseResults = [];
156
+ for (const item of corpus) {
157
+ caseResults.push(await runCaseStructural(options.projectRoot, item, plannedTier));
158
+ }
159
+ const stages = stagesInResults(caseResults);
160
+ const baselines = await loadBaselinesByStage(options.projectRoot, stages);
161
+ const summary = reduceSummary(caseResults);
76
162
  const report = {
77
163
  schemaVersion: 1,
78
164
  generatedAt: now,
@@ -81,16 +167,12 @@ export async function runEval(options) {
81
167
  provider: config.provider,
82
168
  model: config.model,
83
169
  tier: plannedTier,
84
- stages: options.stage ? [options.stage] : [],
170
+ stages,
85
171
  cases: caseResults,
86
- summary: {
87
- totalCases: caseResults.length,
88
- passed: 0,
89
- failed: 0,
90
- skipped: caseResults.length,
91
- totalCostUsd: 0,
92
- totalDurationMs: 0
93
- }
172
+ summary
94
173
  };
174
+ const baselineDelta = compareAgainstBaselines(report, baselines);
175
+ if (baselineDelta)
176
+ report.baselineDelta = baselineDelta;
95
177
  return report;
96
178
  }
@@ -27,6 +27,45 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
27
27
  */
28
28
  export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
29
29
  export type VerifierKind = (typeof VERIFIER_KINDS)[number];
30
/**
 * Expectations evaluated by the structural verifier: deterministic checks on
 * a single text artifact that never call an LLM. Wave 7.1 implements every
 * field listed here; the sibling `rules` shape arrives in Wave 7.2 and
 * `judge` in Wave 7.3.
 */
export interface StructuralExpected {
    /**
     * Substrings, compared case-insensitively, each of which has to show up
     * on at least one markdown heading line (a line beginning with `#`).
     * Intended for declaring "required sections".
     */
    requiredSections?: string[];
    /**
     * Substrings, compared case-insensitively, that are not allowed anywhere
     * in the body, whether in headings or prose. Common entries are "TBD",
     * "TODO" and "placeholder".
     */
    forbiddenPatterns?: string[];
    /** Smallest accepted body line count (inclusive); frontmatter is not counted. */
    minLines?: number;
    /** Largest accepted body line count (inclusive); frontmatter is not counted. */
    maxLines?: number;
    /** Smallest accepted body character count (inclusive). */
    minChars?: number;
    /** Largest accepted body character count (inclusive). */
    maxChars?: number;
    /**
     * Keys that have to be present in the leading YAML frontmatter, i.e. the
     * section enclosed by a pair of `---` delimiters at the very top of the
     * file. When the artifact has no frontmatter, every entry fails.
     */
    requiredFrontmatterKeys?: string[];
}
61
/**
 * Union of the expectation shapes understood by the individual verifiers.
 * Wave 7.1 wires up `structural` only.
 */
export interface ExpectedShape {
    /** Deterministic structural checks (Wave 7.1). */
    structural?: StructuralExpected;
    /** Wave 7.2: rule-based checks (keyword/regex/traceability). */
    rules?: Record<string, unknown>;
    /** Wave 7.3: rubrics for the LLM judge. */
    judge?: Record<string, unknown>;
}
30
69
  /**
31
70
  * A single eval case describes one input scenario for one stage. Cases live in
32
71
  * `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
@@ -40,10 +79,10 @@ export interface EvalCase {
40
79
  /** Project files copied into the Tier B/C sandbox before the agent runs. */
41
80
  contextFiles?: string[];
42
81
  /**
43
- * Optional expected-shape hints consumed by structural/rule verifiers.
44
- * Left intentionally loose; verifiers in Waves 7.1–7.2 will narrow this.
82
+ * Typed expectation hints consumed by the structural/rules/judge verifiers.
83
+ * Each sub-shape is optional; missing sub-shapes skip that verifier tier.
45
84
  */
46
- expected?: Record<string, unknown>;
85
+ expected?: ExpectedShape;
47
86
  /**
48
87
  * Path (relative to the corpus case file) of a pre-generated artifact used
49
88
  * when verifiers are exercised without a live agent loop. Primarily a Wave
@@ -91,11 +130,7 @@ export interface EvalReport {
91
130
  totalDurationMs: number;
92
131
  };
93
132
  /** Present when comparing against a saved baseline (Wave 7.1+). */
94
- baselineDelta?: {
95
- baselineId: string;
96
- scoreDelta: number;
97
- criticalFailures: number;
98
- };
133
+ baselineDelta?: BaselineDelta;
99
134
  }
100
135
  /**
101
136
  * Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
@@ -134,3 +169,48 @@ export interface ResolvedEvalConfig extends EvalConfig {
134
169
  apiKey?: string;
135
170
  source: "default" | "file" | "env" | "file+env";
136
171
  }
172
/**
 * Per-stage frozen baseline consumed by regression gating (Wave 7.1).
 * Baselines live in git and are rewritten by
 * `cclaw eval --update-baseline --confirm`. The structure is deliberately
 * flat so that a plain `git diff` shows what changed from one run to the next.
 */
export interface BaselineSnapshot {
    schemaVersion: 1;
    stage: FlowStage;
    generatedAt: string;
    cclawVersion: string;
    /** Indexed by `EvalCase.id`, so a case that did not change yields no diff. */
    cases: Record<string, BaselineCaseEntry>;
}
186
/** What a baseline stores for one case: its pass flag and per-verifier entries. */
export interface BaselineCaseEntry {
    passed: boolean;
    verifierResults: BaselineVerifierEntry[];
}
190
/** What a baseline stores for one verifier check: id, kind, ok flag and an optional score. */
export interface BaselineVerifierEntry {
    id: string;
    kind: VerifierKind;
    ok: boolean;
    score?: number;
}
196
/**
 * Difference between a freshly produced report and the stored baseline.
 * Filled in when baselines are present on disk and the run includes cases
 * that match them.
 */
export interface BaselineDelta {
    baselineId: string;
    /** Fresh score minus baseline score, kept within [-1, 1]. */
    scoreDelta: number;
    /** Number of checks that went from `ok:true` to `ok:false`. */
    criticalFailures: number;
    /** Regression details per case, used by the Markdown report. */
    regressions: BaselineRegression[];
}
209
/** One regression entry: the case, stage and verifier concerned, a reason tag, and optional scores. */
export interface BaselineRegression {
    caseId: string;
    stage: FlowStage;
    verifierId: string;
    reason: "newly-failing" | "case-now-failing" | "score-drop";
    previousScore?: number;
    currentScore?: number;
}
@@ -0,0 +1,14 @@
1
+ import type { StructuralExpected, VerifierResult } from "../types.js";
2
/** Shape returned by `splitFrontmatter`: an artifact divided into frontmatter and body. */
export interface ArtifactSplit {
    hasFrontmatter: boolean;
    frontmatterRaw: string;
    frontmatterParsed?: Record<string, unknown>;
    body: string;
}
8
/** Divide an artifact's text into its frontmatter and body parts. */
export declare function splitFrontmatter(artifact: string): ArtifactSplit;
9
/**
 * Apply all configured structural checks to the artifact text.
 * When `expected` is undefined/empty the result is [], which lets the runner
 * read "no structural expectations" as "no verifier results" instead of
 * "pass".
 */
export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];
@@ -0,0 +1,171 @@
1
+ /**
2
+ * Structural verifier (Wave 7.1): deterministic, zero-LLM checks against a
3
+ * single markdown artifact. Each structural expectation produces one
4
+ * `VerifierResult` so baselines diff cleanly at the check level rather than
5
+ * lumping everything into a single boolean.
6
+ *
7
+ * Design notes:
8
+ *
9
+ * - All pattern matching is case-insensitive. Authoring a check as
10
+ * `"Directions"` matches `## Directions` and `### directions-suggested`.
11
+ * - Frontmatter detection is permissive: it must start at byte 0 with `---\n`
12
+ * and close on a subsequent `---` line. Anything else is treated as "no
13
+ * frontmatter", which fails every `requiredFrontmatterKeys` entry
14
+ * deterministically.
15
+ * - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
16
+ * adds metadata does not accidentally drop the body below the floor.
17
+ * - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
18
+ * all individual `ok` flags. This keeps Wave 7.1 deterministic; the 0..1
19
+ * rubric scale shows up in Wave 7.3 (judge).
20
+ */
21
+ import { parse as parseYaml } from "yaml";
22
// Opening delimiter: `---` at the very start of the text, followed by LF or CRLF.
const FRONTMATTER_OPEN = /^---\r?\n/;
// Closing delimiter: a line break, then `---`, then an optional CR, then
// either a line feed or the end of the text.
const FRONTMATTER_CLOSE = /\r?\n---\r?(?:\n|$)/;
24
/**
 * Convert free-form text into a lowercase, dash-separated fragment suitable
 * for embedding in a verifier result id.
 *
 * Every run of characters outside `a-z0-9` collapses into a single `-`, and
 * the result is capped at 64 characters with no leading or trailing dash.
 *
 * @param input Text to convert (for example a section name or pattern).
 * @returns The slug; empty when `input` has no ASCII letters or digits.
 */
function slugify(input) {
    return input
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-/, "")
        .slice(0, 64)
        // Trim the tail only after truncating: cutting at 64 characters can
        // land exactly on a separator and would otherwise leave a dangling "-".
        .replace(/-$/, "");
}
31
/**
 * Separate a leading YAML frontmatter section from the rest of an artifact.
 *
 * Frontmatter is recognised only when the text begins with the opening
 * delimiter and a closing delimiter follows later. Otherwise the whole input
 * is returned as `body` with `hasFrontmatter: false`.
 *
 * @param artifact Full artifact text.
 * @returns The split; `frontmatterParsed` is set only when the YAML parses
 *   to a non-array object, and is left undefined when parsing throws.
 */
export function splitFrontmatter(artifact) {
    const opener = artifact.match(FRONTMATTER_OPEN);
    if (opener === null) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    // The opening pattern is anchored at the start, so dropping the matched
    // prefix yields everything after the opening delimiter.
    const remainder = artifact.slice(opener[0].length);
    const closer = FRONTMATTER_CLOSE.exec(remainder);
    if (closer === null || closer.index === undefined) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    const closeStart = closer.index;
    const closeEnd = closeStart + closer[0].length;
    const frontmatterRaw = remainder.slice(0, closeStart);
    let frontmatterParsed;
    try {
        const candidate = parseYaml(frontmatterRaw);
        const isMapping = Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
        if (isMapping) {
            frontmatterParsed = candidate;
        }
    }
    catch {
        // Unparseable YAML still counts as "has frontmatter", just without keys.
        frontmatterParsed = undefined;
    }
    return {
        hasFrontmatter: true,
        frontmatterRaw,
        frontmatterParsed,
        body: remainder.slice(closeEnd)
    };
}
59
/**
 * Collect the markdown heading lines of a body.
 *
 * A heading is a line that, once leading whitespace is removed, starts with
 * one to six `#` characters followed by whitespace and at least one
 * non-whitespace character.
 *
 * @param body Artifact body text.
 * @returns Heading lines in document order, without leading whitespace.
 */
function extractHeadingLines(body) {
    const headings = [];
    for (const rawLine of body.split(/\r?\n/)) {
        const candidate = rawLine.trimStart();
        if (/^#{1,6}\s+\S/.test(candidate)) {
            headings.push(candidate);
        }
    }
    return headings;
}
65
/**
 * Build one structural verifier result.
 *
 * @param id Check identifier.
 * @param ok Whether the check passed; also sets `score` (1 on pass, 0 on fail).
 * @param message Human-readable outcome.
 * @param details Optional extra data; the `details` key is left out entirely
 *   when this is `undefined`.
 * @returns The result object with `kind` fixed to "structural".
 */
function result(id, ok, message, details) {
    const score = ok ? 1 : 0;
    const base = { kind: "structural", id, ok, score, message };
    if (details === undefined) {
        return base;
    }
    return { ...base, details };
}
75
/**
 * Produce one result per required section, passing when some heading of the
 * body contains the section text (compared in lowercase, with the section
 * text trimmed).
 *
 * @param sections Required section substrings.
 * @param body Artifact body text.
 * @returns One result per entry of `sections`, in the same order.
 */
function checkRequiredSections(sections, body) {
    const loweredHeadings = extractHeadingLines(body).map((heading) => heading.toLowerCase());
    const outcomes = [];
    for (const section of sections) {
        const wanted = section.toLowerCase().trim();
        const present = loweredHeadings.some((heading) => heading.includes(wanted));
        const message = present
            ? `Section matching "${section}" present.`
            : `No heading contains "${section}".`;
        outcomes.push(result(`structural:section:${slugify(section)}`, present, message, {
            pattern: section,
            searchedHeadings: loweredHeadings.length
        }));
    }
    return outcomes;
}
85
/**
 * Produce one result per forbidden pattern, passing when the pattern does not
 * occur in the body (both compared in lowercase).
 *
 * @param patterns Substrings that must be absent.
 * @param body Artifact body text.
 * @returns One result per entry of `patterns`, in the same order; `details`
 *   carries the pattern and its occurrence count.
 */
function checkForbiddenPatterns(patterns, body) {
    const haystack = body.toLowerCase();
    const outcomes = [];
    for (const pattern of patterns) {
        const occurrences = countOccurrences(haystack, pattern.toLowerCase());
        const absent = occurrences === 0;
        const message = absent
            ? `Pattern "${pattern}" absent (as required).`
            : `Pattern "${pattern}" appears ${occurrences} time(s); remove.`;
        outcomes.push(result(`structural:forbidden:${slugify(pattern)}`, absent, message, { pattern, occurrences }));
    }
    return outcomes;
}
96
/**
 * Count non-overlapping occurrences of `needle` in `haystack`, scanning left
 * to right.
 *
 * @param haystack Text to search.
 * @param needle Substring to look for; an empty needle counts as 0.
 * @returns Number of matches.
 */
function countOccurrences(haystack, needle) {
    const step = needle.length;
    if (step === 0) {
        return 0;
    }
    let total = 0;
    // Resume just past each hit so matches never overlap.
    for (let pos = haystack.indexOf(needle); pos >= 0; pos = haystack.indexOf(needle, pos + step)) {
        total += 1;
    }
    return total;
}
109
/**
 * Check the body's line and character counts against the optional bounds.
 *
 * A "lines" result is emitted only when `minLines` or `maxLines` is defined,
 * and a "chars" result only when `minChars` or `maxChars` is defined. An
 * empty body counts as 0 lines; otherwise lines are the pieces obtained by
 * splitting on LF/CRLF.
 *
 * @param expected Structural expectations holding the optional bounds.
 * @param body Artifact body text.
 * @returns Zero, one or two results (lines first, then chars).
 */
function checkLengthBounds(expected, body) {
    const { minLines, maxLines, minChars, maxChars } = expected;
    const charCount = body.length;
    const lineCount = charCount === 0 ? 0 : body.split(/\r?\n/).length;
    // An undefined bound places no limit on that side.
    const inRange = (value, lo, hi) => (lo === undefined || value >= lo) && (hi === undefined || value <= hi);
    const outcomes = [];
    if (!(minLines === undefined && maxLines === undefined)) {
        const ok = inRange(lineCount, minLines, maxLines);
        const message = ok
            ? `Body has ${lineCount} line(s), within bounds.`
            : buildOutOfRangeMessage("line", lineCount, minLines, maxLines);
        outcomes.push(result("structural:length:lines", ok, message, { lineCount, minLines, maxLines }));
    }
    if (!(minChars === undefined && maxChars === undefined)) {
        const ok = inRange(charCount, minChars, maxChars);
        const message = ok
            ? `Body has ${charCount} char(s), within bounds.`
            : buildOutOfRangeMessage("char", charCount, minChars, maxChars);
        outcomes.push(result("structural:length:chars", ok, message, { charCount, minChars, maxChars }));
    }
    return outcomes;
}
135
/**
 * Format the failure message for a length check.
 *
 * @param unit Unit label, e.g. "line" or "char".
 * @param actual Measured count.
 * @param min Lower bound; shown as "0" when undefined.
 * @param max Upper bound; shown as "∞" when undefined.
 * @returns The message text.
 */
function buildOutOfRangeMessage(unit, actual, min, max) {
    const lowerBound = min !== undefined ? String(min) : "0";
    const upperBound = max !== undefined ? String(max) : "∞";
    return "Body has " + actual + " " + unit + "(s); expected " + lowerBound + ".." + upperBound + ".";
}
140
/**
 * Produce one result per required frontmatter key.
 *
 * When the split has no frontmatter, or its frontmatter was not parsed into
 * an object, every key fails and `details.frontmatterPresent` records
 * whether frontmatter delimiters were found. Otherwise a key passes when it
 * is among the parsed object's keys.
 *
 * @param keys Required frontmatter keys.
 * @param split Result of splitting the artifact.
 * @returns One result per entry of `keys`, in the same order.
 */
function checkFrontmatterKeys(keys, split) {
    const idFor = (key) => `structural:frontmatter:${slugify(key)}`;
    const parsed = split.hasFrontmatter ? split.frontmatterParsed : undefined;
    if (!parsed) {
        return keys.map((key) => result(idFor(key), false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: split.hasFrontmatter }));
    }
    const declared = Object.keys(parsed);
    return keys.map((key) => {
        const ok = declared.includes(key);
        const message = ok ? `Frontmatter key "${key}" present.` : `Frontmatter key "${key}" missing.`;
        return result(idFor(key), ok, message, { key });
    });
}
150
/**
 * Apply all configured structural checks to the artifact text.
 *
 * A falsy `expected` yields [], which lets the runner read "no structural
 * expectations" as "no verifier results" instead of "pass". Results come in
 * a fixed order: required sections, forbidden patterns, length bounds,
 * frontmatter keys. Section, pattern and length checks see only the body.
 *
 * @param artifact Full artifact text, frontmatter included.
 * @param expected Structural expectations, or undefined.
 * @returns The collected verifier results.
 */
export function verifyStructural(artifact, expected) {
    if (!expected) {
        return [];
    }
    const split = splitFrontmatter(artifact);
    const sections = expected.requiredSections ?? [];
    const forbidden = expected.forbiddenPatterns ?? [];
    const frontmatterKeys = expected.requiredFrontmatterKeys ?? [];
    return [
        ...(sections.length ? checkRequiredSections(sections, split.body) : []),
        ...(forbidden.length ? checkForbiddenPatterns(forbidden, split.body) : []),
        ...checkLengthBounds(expected, split.body),
        ...(frontmatterKeys.length ? checkFrontmatterKeys(frontmatterKeys, split) : [])
    ];
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cclaw-cli",
3
- "version": "0.22.0",
3
+ "version": "0.23.0",
4
4
  "description": "Installer-first flow toolkit for coding agents",
5
5
  "type": "module",
6
6
  "bin": {