cclaw-cli 0.21.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,178 @@
1
+ import { randomUUID } from "node:crypto";
2
+ import { CCLAW_VERSION } from "../constants.js";
3
+ import { FLOW_STAGES } from "../types.js";
4
+ import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
5
+ import { loadCorpus, readFixtureArtifact } from "./corpus.js";
6
+ import { loadEvalConfig } from "./config-loader.js";
7
+ import { verifyStructural } from "./verifiers/structural.js";
8
/**
 * Tally how many corpus cases belong to each stage.
 *
 * @param {Array<{ stage: string }>} cases Corpus entries; only `stage` is read.
 * @returns {Record<string, number>} Case count per stage, in first-seen order.
 */
function groupByStage(cases) {
    const counts = {};
    for (const item of cases) {
        const seenSoFar = counts[item.stage] ?? 0;
        counts[item.stage] = seenSoFar + 1;
    }
    return counts;
}
14
/**
 * Build the always-passing placeholder result the runner records when no real
 * structural check was executed for a case.
 *
 * @param {string} message Human-readable explanation shown in the report.
 * @param {object} [details] Extra payload; the `details` key is omitted
 *   entirely when this argument is `undefined`.
 * @returns {object} A structural verifier result with `ok: true`, `score: 1`.
 */
function skeletonVerifierResult(message, details) {
    const placeholder = {
        kind: "structural",
        id: "wave-7-1-no-structural-expected",
        ok: true,
        score: 1,
        message
    };
    if (details !== undefined) {
        placeholder.details = details;
    }
    return placeholder;
}
24
/**
 * Run the structural verifier for a single corpus case.
 *
 * Outcomes:
 *  - no `expected.structural` (or an empty one): one placeholder pass flagged
 *    `details.skipped`, so the report shows the case as not applicable;
 *  - reading the fixture throws: one failing `structural:fixture:missing`
 *    result carrying the error message;
 *  - the fixture read resolves to `undefined`: one failing
 *    `structural:fixture:absent` result;
 *  - otherwise: whatever `verifyStructural` returns, or a skipped placeholder
 *    when it returns no checks.
 *
 * @param {string} projectRoot Forwarded to `readFixtureArtifact`.
 * @param {object} caseEntry Corpus case (`id`, `stage`, `expected`, `fixture` are read).
 * @param {string} plannedTier Tier label copied into the result.
 * @returns {Promise<object>} Case result; `passed` is true only if every verifier result is `ok`.
 */
async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
    const started = Date.now();
    const structuralExpected = caseEntry.expected?.structural;
    const hasExpectations = Boolean(structuralExpected) && Object.keys(structuralExpected).length > 0;
    const verifierResults = [];
    if (hasExpectations) {
        let artifact;
        let readFailed = false;
        try {
            artifact = await readFixtureArtifact(projectRoot, caseEntry);
        }
        catch (err) {
            readFailed = true;
            verifierResults.push({
                kind: "structural",
                id: "structural:fixture:missing",
                ok: false,
                score: 0,
                message: err instanceof Error ? err.message : String(err),
                details: { fixture: caseEntry.fixture }
            });
        }
        if (artifact === undefined) {
            // Report "absent" only when the read did not already record a failure.
            if (!readFailed) {
                verifierResults.push({
                    kind: "structural",
                    id: "structural:fixture:absent",
                    ok: false,
                    score: 0,
                    message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
                    details: { fixtureProvided: false }
                });
            }
        }
        else {
            const checks = verifyStructural(artifact, structuralExpected);
            if (checks.length > 0) {
                verifierResults.push(...checks);
            }
            else {
                verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
            }
        }
    }
    else {
        // Nothing to verify for this verifier kind: record a visible placeholder
        // pass so downstream aggregation stays simple.
        verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
    }
    return {
        caseId: caseEntry.id,
        stage: caseEntry.stage,
        tier: plannedTier,
        passed: verifierResults.every((entry) => entry.ok),
        durationMs: Date.now() - started,
        verifierResults
    };
}
79
/**
 * Collapse per-case results into the report's summary counters.
 *
 * A case counts as skipped when its only verifier result carries
 * `details.skipped === true`; otherwise it lands in `passed` or `failed`
 * according to its `passed` flag. Durations and costs are summed over every
 * case, skipped ones included; the cost total is rounded to six decimals.
 *
 * @param {Array<object>} caseResults Results produced by the case runner.
 * @returns {{ totalCases: number, passed: number, failed: number, skipped: number, totalCostUsd: number, totalDurationMs: number }}
 */
function reduceSummary(caseResults) {
    const tally = { passed: 0, failed: 0, skipped: 0 };
    let costSum = 0;
    let durationSum = 0;
    for (const entry of caseResults) {
        durationSum += entry.durationMs;
        if (entry.costUsd !== undefined) {
            costSum += entry.costUsd;
        }
        const soleSkip = entry.verifierResults.length === 1
            && entry.verifierResults[0]?.details?.skipped === true;
        if (soleSkip) {
            tally.skipped += 1;
        }
        else if (entry.passed) {
            tally.passed += 1;
        }
        else {
            tally.failed += 1;
        }
    }
    return {
        totalCases: caseResults.length,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        totalCostUsd: Number(costSum.toFixed(6)),
        totalDurationMs: durationSum
    };
}
107
/**
 * List the stages that occur in the results, in `FLOW_STAGES` order rather
 * than encounter order.
 *
 * @param {Array<{ stage: string }>} caseResults Case results; only `stage` is read.
 * @returns {string[]} The members of `FLOW_STAGES` seen in at least one result.
 */
function stagesInResults(caseResults) {
    const observed = new Set(caseResults.map((entry) => entry.stage));
    return FLOW_STAGES.filter((stage) => observed.has(stage));
}
113
/**
 * Wave 7.1 eval entry point.
 *
 * Loads the eval config and the corpus (optionally narrowed by
 * `options.stage`), then either:
 *
 *  - returns a `kind: "dry-run"` summary when `options.dryRun === true`, or
 *  - runs the structural verifier over every corpus case in sequence, builds
 *    the report, and attaches `baselineDelta` when the comparison against the
 *    loaded per-stage baselines yields one.
 *
 * `options.rules` and `options.judge` are accepted but only add a note; those
 * verifiers are not wired in this wave. Cases that declare no structural
 * expectations are reported as skipped by the summary.
 *
 * @param {object} options Reads `projectRoot`, `env`, `stage`, `tier`,
 *   `rules`, `judge` and `dryRun`.
 * @returns {Promise<object>} Dry-run summary or full eval report.
 */
export async function runEval(options) {
    const { projectRoot } = options;
    const config = await loadEvalConfig(projectRoot, options.env ?? process.env);
    const corpus = await loadCorpus(projectRoot, options.stage);
    const plannedTier = options.tier ?? config.defaultTier;
    const notes = [];
    const noteWhen = (condition, text) => {
        if (condition) {
            notes.push(text);
        }
    };
    noteWhen(corpus.length === 0, "Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
    noteWhen(options.rules, "--rules is accepted; rule verifiers wire up in Wave 7.2.");
    noteWhen(options.judge, "--judge is accepted; LLM judging wires up in Wave 7.3.");
    if (options.dryRun === true) {
        const caseIndex = corpus.map((item) => ({ id: item.id, stage: item.stage }));
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: caseIndex
            },
            plannedTier,
            verifiersAvailable: {
                structural: true,
                rules: false,
                judge: false,
                workflow: false
            },
            notes
        };
    }
    // Timestamp is taken before the cases run, so it marks the start of the run.
    const generatedAt = new Date().toISOString();
    const caseResults = [];
    for (const entry of corpus) {
        const outcome = await runCaseStructural(projectRoot, entry, plannedTier);
        caseResults.push(outcome);
    }
    const stages = stagesInResults(caseResults);
    const baselines = await loadBaselinesByStage(projectRoot, stages);
    const report = {
        schemaVersion: 1,
        generatedAt,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        tier: plannedTier,
        stages,
        cases: caseResults,
        summary: reduceSummary(caseResults)
    };
    const baselineDelta = compareAgainstBaselines(report, baselines);
    if (baselineDelta) {
        report.baselineDelta = baselineDelta;
    }
    return report;
}
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Core types for the cclaw eval subsystem (Phase 7).
3
+ *
4
+ * The eval subsystem lets us measure whether a change to a prompt, skill, or
5
+ * stage contract improves or regresses the quality of agent output. It is
6
+ * deliberately decoupled from the main cclaw runtime so that:
7
+ *
8
+ * - Users who never run `cclaw eval` pay zero runtime cost.
9
+ * - The verifier / rubric / LLM stack evolves on its own release cadence (Waves 7.0-7.6).
10
+ * - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
11
+ */
12
+ import type { FlowStage } from "../types.js";
13
/**
 * Fidelity tier for the agent-under-test.
 *
 * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
 * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
 * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
 *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
 *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
 */
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
/** Union of the tier letters listed in `EVAL_TIERS`. */
export type EvalTier = (typeof EVAL_TIERS)[number];
24
/**
 * Verifier kinds, in increasing cost and decreasing determinism:
 * structural and rules run without LLM; judge and workflow use the configured model.
 */
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
/** Union of the verifier kind names listed in `VERIFIER_KINDS`. */
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
30
/**
 * Structural expectations — deterministic, LLM-free checks against a single
 * text artifact. Wave 7.1 implements all fields below; Wave 7.2 adds the
 * sibling `rules` shape, Wave 7.3 adds `judge`.
 *
 * All checks except `requiredFrontmatterKeys` run against the artifact body,
 * i.e. the text that remains after the leading frontmatter is split off.
 */
export interface StructuralExpected {
    /**
     * Case-insensitive substrings that must each appear on at least one markdown
     * heading line (line starting with `#`). Useful for "required sections".
     * A heading line is one to six `#` characters followed by whitespace and
     * text; leading indentation is ignored.
     */
    requiredSections?: string[];
    /**
     * Case-insensitive substrings that must NOT appear anywhere in the body
     * (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
     */
    forbiddenPatterns?: string[];
    /** Inclusive minimum line count of the artifact body (frontmatter excluded). */
    minLines?: number;
    /** Inclusive maximum line count of the artifact body (frontmatter excluded). */
    maxLines?: number;
    /** Inclusive minimum character count of the artifact body (frontmatter excluded). */
    minChars?: number;
    /** Inclusive maximum character count of the artifact body (frontmatter excluded). */
    maxChars?: number;
    /**
     * Keys that must appear in the leading YAML frontmatter (between a pair of
     * `---` delimiters at the very top of the file). An artifact without
     * frontmatter will fail every entry.
     */
    requiredFrontmatterKeys?: string[];
}
61
/** Superset of per-verifier expectation shapes. Only `structural` is wired in Wave 7.1. */
export interface ExpectedShape {
    /** Deterministic structural checks — Wave 7.1. */
    structural?: StructuralExpected;
    /** Rule-based (keyword/regex/traceability) checks — Wave 7.2. */
    rules?: Record<string, unknown>;
    /** LLM-judge rubrics — Wave 7.3. */
    judge?: Record<string, unknown>;
}
69
/**
 * A single eval case describes one input scenario for one stage. Cases live in
 * `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
 * fixture artifact for verifier development (Wave 7.1) before the agent loop
 * exists (Wave 7.3+).
 */
export interface EvalCase {
    /** Case identifier; baseline snapshots key their entries by this value. */
    id: string;
    /** Flow stage this case belongs to. */
    stage: FlowStage;
    /** Presumably the prompt handed to the agent-under-test — TODO confirm once the agent loop lands. */
    inputPrompt: string;
    /** Project files copied into the Tier B/C sandbox before the agent runs. */
    contextFiles?: string[];
    /**
     * Typed expectation hints consumed by the structural/rules/judge verifiers.
     * Each sub-shape is optional; missing sub-shapes skip that verifier tier.
     */
    expected?: ExpectedShape;
    /**
     * Path (relative to the corpus case file) of a pre-generated artifact used
     * when verifiers are exercised without a live agent loop. Primarily a Wave
     * 7.1 development aid.
     */
    fixture?: string;
}
93
/** Result of one verifier applied to one case. */
export interface VerifierResult {
    /** Which verifier family produced this result. */
    kind: VerifierKind;
    /** Check identifier, e.g. `structural:section:<slug>` or `structural:length:lines`. */
    id: string;
    /** Whether the check passed. */
    ok: boolean;
    /** Normalized 0..1 score when the verifier produces a numeric signal. */
    score?: number;
    /** Human-readable explanation of the outcome. */
    message?: string;
    /** Check-specific payload; a `skipped: true` entry marks a placeholder result. */
    details?: Record<string, unknown>;
}
103
/** Aggregate result for one case after all verifiers run. */
export interface EvalCaseResult {
    /** The `id` of the case that was run. */
    caseId: string;
    /** Stage of the case that was run. */
    stage: FlowStage;
    /** Tier the run was planned for. */
    tier: EvalTier;
    /** True only when every entry in `verifierResults` has `ok: true`. */
    passed: boolean;
    /** Elapsed time for the case, in milliseconds. */
    durationMs: number;
    /** Cost of the case in USD, when known; summed into the report summary. */
    costUsd?: number;
    /** Individual verifier outcomes, one per check. */
    verifierResults: VerifierResult[];
}
113
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
export interface EvalReport {
    /** Report format version. */
    schemaVersion: 1;
    /** ISO-8601 timestamp of the run. */
    generatedAt: string;
    /** Unique identifier (UUID) for this run. */
    runId: string;
    /** Version of cclaw that produced the report. */
    cclawVersion: string;
    /** Provider name copied from the eval config. */
    provider: string;
    /** Model identifier copied from the eval config. */
    model: string;
    /** Tier the run was planned for. */
    tier: EvalTier;
    /** Stages that have at least one case in `cases`. */
    stages: FlowStage[];
    /** Per-case results. */
    cases: EvalCaseResult[];
    summary: {
        /** Number of entries in `cases`. */
        totalCases: number;
        /** Non-skipped cases with `passed: true`. */
        passed: number;
        /** Non-skipped cases with `passed: false`. */
        failed: number;
        /** Cases whose only verifier result is flagged `details.skipped`. */
        skipped: number;
        /** Sum of the defined `costUsd` values, rounded to six decimals. */
        totalCostUsd: number;
        /** Sum of every case's `durationMs`. */
        totalDurationMs: number;
    };
    /** Present when comparing against a saved baseline (Wave 7.1+). */
    baselineDelta?: BaselineDelta;
}
135
/**
 * Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
 * with `CCLAW_EVAL_*` environment variables at runtime.
 */
export interface EvalConfig {
    /**
     * Free-form provider name used in reports. The actual HTTP protocol is
     * determined by `baseUrl`, which is expected to be OpenAI-compatible.
     */
    provider: string;
    /** OpenAI-compatible base URL, e.g. `https://api.z.ai/api/coding/paas/v4`. */
    baseUrl: string;
    /** Model identifier for both agent-under-test and judge unless `judgeModel` overrides. */
    model: string;
    /** Optional separate model for the judge role. Defaults to `model`. */
    judgeModel?: string;
    /** Default tier when `--tier` is not supplied. */
    defaultTier: EvalTier;
    /** Optional hard stop on estimated USD spend per day. Unset = no cap. */
    dailyUsdCap?: number;
    /** Regression thresholds for CI gates. */
    regression: {
        /**
         * Fail when overall score drops by more than this fraction (e.g. 0.15 = 15%).
         * NOTE(review): the field name reads as a lower bound on the delta while
         * this description gives a drop magnitude — confirm the sign convention
         * against the baseline comparison code.
         */
        failIfDeltaBelow: number;
        /** Fail when any single critical rubric drops below this absolute score. */
        failIfCriticalBelow: number;
    };
    /** Per-agent-run timeout in milliseconds. */
    timeoutMs: number;
    /** Max retries per API call on transient failures. */
    maxRetries: number;
}
167
/** Resolved config with env overrides applied. */
export interface ResolvedEvalConfig extends EvalConfig {
    /** Credential for the configured endpoint, when one was supplied — presumably via environment; TODO confirm in the config loader. */
    apiKey?: string;
    /** Which layers contributed values: built-in defaults, the config file, environment variables, or file plus environment. */
    source: "default" | "file" | "env" | "file+env";
}
172
/**
 * Frozen per-stage baseline used by regression gating (Wave 7.1). Baselines
 * are committed to git; `cclaw eval --update-baseline --confirm` rewrites
 * them. The shape is intentionally flat so a quick `git diff` reveals what
 * changed between runs.
 */
export interface BaselineSnapshot {
    /** Snapshot format version. */
    schemaVersion: 1;
    /** Stage this snapshot covers (one snapshot per stage). */
    stage: FlowStage;
    /** Timestamp recorded when the snapshot was written. */
    generatedAt: string;
    /** Version of cclaw that wrote the snapshot. */
    cclawVersion: string;
    /** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
    cases: Record<string, BaselineCaseEntry>;
}
186
/** Recorded outcome of one case inside a `BaselineSnapshot`. */
export interface BaselineCaseEntry {
    /** Whether the case passed when the baseline was captured. */
    passed: boolean;
    /** Trimmed per-check outcomes captured with the baseline. */
    verifierResults: BaselineVerifierEntry[];
}
190
/**
 * Baseline record of a single check: the `id`, `kind`, `ok` and `score`
 * fields of a `VerifierResult`, without `message` or `details`.
 */
export interface BaselineVerifierEntry {
    /** Check identifier, matching `VerifierResult.id`. */
    id: string;
    /** Verifier family that produced the check. */
    kind: VerifierKind;
    /** Whether the check passed in the baseline run. */
    ok: boolean;
    /** Score recorded in the baseline run, when the check produced one. */
    score?: number;
}
196
/**
 * Delta between a fresh report and the saved baseline. Populated when
 * baselines exist on disk and the run covers matching cases.
 */
export interface BaselineDelta {
    /** Identifier of the baseline compared against — exact format is set by the comparison code; TODO confirm. */
    baselineId: string;
    /** Fresh-score − baseline-score, bounded to [-1, 1]. */
    scoreDelta: number;
    /** Count of checks that flipped from `ok:true` to `ok:false`. */
    criticalFailures: number;
    /** Per-case regression details for the Markdown report. */
    regressions: BaselineRegression[];
}
209
/** One regression entry listed in `BaselineDelta.regressions`. */
export interface BaselineRegression {
    /** Case in which the regression was observed. */
    caseId: string;
    /** Stage of that case. */
    stage: FlowStage;
    /** Identifier of the check that regressed. */
    verifierId: string;
    /**
     * Category of regression. Presumably: a check newly failing, the case as a
     * whole now failing, or a score decrease — TODO confirm against the
     * baseline comparison code.
     */
    reason: "newly-failing" | "case-now-failing" | "score-drop";
    /** Score recorded in the baseline, when available. */
    previousScore?: number;
    /** Score from the fresh run, when available. */
    currentScore?: number;
}
@@ -0,0 +1,15 @@
1
/**
 * Fidelity tier for the agent-under-test.
 *
 * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
 * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
 * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
 *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
 *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
 *
 * Frozen so the runtime value honors the `readonly` tuple type it is declared
 * with: a stray `push`/`sort` throws instead of silently changing the set of
 * valid tiers for every importer.
 */
export const EVAL_TIERS = Object.freeze(["A", "B", "C"]);
11
/**
 * Verifier kinds, in increasing cost and decreasing determinism:
 * structural and rules run without LLM; judge and workflow use the configured model.
 *
 * Frozen so the runtime value honors the `readonly` tuple type it is declared
 * with; accidental in-place mutation throws instead of affecting every importer.
 */
export const VERIFIER_KINDS = Object.freeze(["structural", "rules", "judge", "workflow"]);
@@ -0,0 +1,14 @@
1
+ import type { StructuralExpected, VerifierResult } from "../types.js";
2
/** Result of separating an artifact's leading YAML frontmatter from its body. */
export interface ArtifactSplit {
    /** True when both an opening and a closing `---` delimiter were found. */
    hasFrontmatter: boolean;
    /** Text between the delimiters; empty string when no frontmatter was detected. */
    frontmatterRaw: string;
    /** Parsed frontmatter; set only when the YAML parses to a non-array object. */
    frontmatterParsed?: Record<string, unknown>;
    /** Text after the closing delimiter, or the whole artifact when no frontmatter was detected. */
    body: string;
}
8
/**
 * Split an artifact into YAML frontmatter and body. Frontmatter is recognized
 * only when the text starts with a `---` line and a later `---` line closes
 * it; otherwise the whole artifact is returned as the body. YAML parse errors
 * are not thrown — `frontmatterParsed` is simply left unset.
 */
export declare function splitFrontmatter(artifact: string): ArtifactSplit;
9
/**
 * Run every configured structural check against the artifact text.
 * Returns [] when `expected` is undefined/empty so the runner can treat
 * "no structural expectations" as "no verifier results" rather than "pass".
 *
 * Results come back in a fixed order: required sections, forbidden patterns,
 * length bounds (lines, then chars), required frontmatter keys.
 */
export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];
@@ -0,0 +1,171 @@
1
+ /**
2
+ * Structural verifier (Wave 7.1): deterministic, zero-LLM checks against a
3
+ * single markdown artifact. Each structural expectation produces one
4
+ * `VerifierResult` so baselines diff cleanly at the check level rather than
5
+ * lumping everything into a single boolean.
6
+ *
7
+ * Design notes:
8
+ *
9
+ * - All pattern matching is case-insensitive. Authoring a check as
10
+ * `"Directions"` matches `## Directions` and `### directions-suggested`.
11
+ * - Frontmatter detection is strict: it must start at byte 0 with `---\n`
12
+ * and close on a subsequent `---` line. Anything else is treated as "no
13
+ * frontmatter", which fails every `requiredFrontmatterKeys` entry
14
+ * deterministically.
15
+ * - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
16
+ * adds metadata does not accidentally drop the body below the floor.
17
+ * - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
18
+ * all individual `ok` flags. This keeps Wave 7.1 deterministic; the 0..1
19
+ * rubric scale shows up in Wave 7.3 (judge).
20
+ */
21
+ import { parse as parseYaml } from "yaml";
22
// Opening delimiter: a `---` line (LF or CRLF) at the very start of the artifact.
const FRONTMATTER_OPEN = /^---\r?\n/;
// Closing delimiter: a line consisting of exactly `---`, ended by a newline or
// the end of input. It is applied to the text that follows the opening
// delimiter, so the `^` alternative lets a `---` on the very first remaining
// line close an empty frontmatter block (`---\n---\n`). Without it the pattern
// needs a preceding newline, skips that line, and can instead match a later
// `---` horizontal rule, swallowing body text into the frontmatter.
const FRONTMATTER_CLOSE = /(?:^|\r?\n)---\r?(?:\n|$)/;
24
/**
 * Turn arbitrary text into a lowercase, hyphen-separated id fragment.
 *
 * Runs of characters outside `a-z0-9` become a single hyphen, leading and
 * trailing hyphens are dropped, and the result is cut to at most 64
 * characters (the cut itself may leave a trailing hyphen).
 *
 * @param {string} input Text to convert.
 * @returns {string} Slug; empty when the input has no ASCII letters or digits.
 */
function slugify(input) {
    const words = input
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter((word) => word.length > 0);
    return words.join("-").slice(0, 64);
}
31
/**
 * Separate leading YAML frontmatter from the body of an artifact.
 *
 * When the opening delimiter is missing at the start, or no closing delimiter
 * follows it, the artifact is reported as having no frontmatter and is
 * returned unchanged as `body`. When both are found, the text between them is
 * returned raw and parsed as YAML; only a non-array object is kept as
 * `frontmatterParsed`, and a parse error leaves it `undefined`.
 *
 * @param {string} artifact Full artifact text.
 * @returns {{ hasFrontmatter: boolean, frontmatterRaw: string, frontmatterParsed?: object, body: string }}
 */
export function splitFrontmatter(artifact) {
    const withoutFrontmatter = () => ({ hasFrontmatter: false, frontmatterRaw: "", body: artifact });
    if (!FRONTMATTER_OPEN.test(artifact)) {
        return withoutFrontmatter();
    }
    const remainder = artifact.replace(FRONTMATTER_OPEN, "");
    const closer = FRONTMATTER_CLOSE.exec(remainder);
    if (closer === null) {
        return withoutFrontmatter();
    }
    const closeAt = closer.index;
    const bodyStart = closeAt + closer[0].length;
    const frontmatterRaw = remainder.slice(0, closeAt);
    let frontmatterParsed;
    try {
        const candidate = parseYaml(frontmatterRaw);
        const isMapping = candidate !== null
            && typeof candidate === "object"
            && !Array.isArray(candidate);
        if (isMapping) {
            frontmatterParsed = candidate;
        }
    }
    catch {
        // Unparseable YAML is reported as "present but not parsed".
        frontmatterParsed = undefined;
    }
    return {
        hasFrontmatter: true,
        frontmatterRaw,
        frontmatterParsed,
        body: remainder.slice(bodyStart)
    };
}
59
/**
 * Collect the markdown heading lines of a body.
 *
 * A line qualifies when, after leading whitespace is removed, it starts with
 * one to six `#` characters followed by whitespace and at least one
 * non-whitespace character. Lines are returned with leading whitespace
 * stripped, in document order.
 *
 * @param {string} body Artifact body text.
 * @returns {string[]} Heading lines.
 */
function extractHeadingLines(body) {
    const headings = [];
    for (const rawLine of body.split(/\r?\n/)) {
        const candidate = rawLine.trimStart();
        if (/^#{1,6}\s+\S/.test(candidate)) {
            headings.push(candidate);
        }
    }
    return headings;
}
65
/**
 * Build one structural verifier result.
 *
 * @param {string} id Check identifier.
 * @param {boolean} ok Whether the check passed; also drives `score` (1 or 0).
 * @param {string} message Human-readable outcome.
 * @param {object} [details] Extra payload; the key is omitted when `undefined`.
 * @returns {object} Verifier result with `kind: "structural"`.
 */
function result(id, ok, message, details) {
    const entry = {
        kind: "structural",
        id,
        ok,
        score: ok ? 1 : 0,
        message
    };
    if (details !== undefined) {
        entry.details = details;
    }
    return entry;
}
75
/**
 * Check that each required section name appears in at least one heading.
 *
 * Matching is a case-insensitive substring test of the trimmed section name
 * against each heading line.
 *
 * @param {string[]} sections Required section names.
 * @param {string} body Artifact body text.
 * @returns {object[]} One result per section, in input order.
 */
function checkRequiredSections(sections, body) {
    const loweredHeadings = extractHeadingLines(body).map((heading) => heading.toLowerCase());
    const outcomes = [];
    for (const section of sections) {
        const needle = section.toLowerCase().trim();
        const found = loweredHeadings.some((heading) => heading.includes(needle));
        const message = found
            ? `Section matching "${section}" present.`
            : `No heading contains "${section}".`;
        const details = { pattern: section, searchedHeadings: loweredHeadings.length };
        outcomes.push(result(`structural:section:${slugify(section)}`, found, message, details));
    }
    return outcomes;
}
85
/**
 * Check that none of the forbidden substrings occur in the body.
 *
 * Matching is case-insensitive; each result reports how many non-overlapping
 * occurrences were found.
 *
 * @param {string[]} patterns Forbidden substrings.
 * @param {string} body Artifact body text.
 * @returns {object[]} One result per pattern, in input order.
 */
function checkForbiddenPatterns(patterns, body) {
    const loweredBody = body.toLowerCase();
    const outcomes = [];
    for (const pattern of patterns) {
        const occurrences = countOccurrences(loweredBody, pattern.toLowerCase());
        const absent = occurrences === 0;
        const message = absent
            ? `Pattern "${pattern}" absent (as required).`
            : `Pattern "${pattern}" appears ${occurrences} time(s); remove.`;
        outcomes.push(result(`structural:forbidden:${slugify(pattern)}`, absent, message, { pattern, occurrences }));
    }
    return outcomes;
}
96
/**
 * Count non-overlapping occurrences of `needle` in `haystack`, scanning left
 * to right.
 *
 * @param {string} haystack Text to search.
 * @param {string} needle Substring to count; an empty needle counts as zero.
 * @returns {number} Number of occurrences.
 */
function countOccurrences(haystack, needle) {
    const step = needle.length;
    if (step === 0) {
        return 0;
    }
    let total = 0;
    // Each search resumes just past the previous hit so matches never overlap.
    for (let at = haystack.indexOf(needle, 0); at >= 0; at = haystack.indexOf(needle, at + step)) {
        total += 1;
    }
    return total;
}
109
/**
 * Check the body's line and character counts against the configured bounds.
 *
 * A result is emitted for lines only when `minLines` or `maxLines` is set,
 * and for characters only when `minChars` or `maxChars` is set. Bounds are
 * inclusive. An empty body has zero lines; otherwise lines are counted by
 * splitting on LF/CRLF.
 *
 * @param {object} expected Structural expectations (only the four bounds are read).
 * @param {string} body Artifact body text.
 * @returns {object[]} Zero, one or two results (lines first, then chars).
 */
function checkLengthBounds(expected, body) {
    const { minLines, maxLines, minChars, maxChars } = expected;
    const inRange = (value, min, max) => (min === undefined || value >= min) && (max === undefined || value <= max);
    const charCount = body.length;
    const lineCount = charCount === 0 ? 0 : body.split(/\r?\n/).length;
    const outcomes = [];
    if (minLines !== undefined || maxLines !== undefined) {
        const ok = inRange(lineCount, minLines, maxLines);
        const message = ok
            ? `Body has ${lineCount} line(s), within bounds.`
            : buildOutOfRangeMessage("line", lineCount, minLines, maxLines);
        outcomes.push(result("structural:length:lines", ok, message, { lineCount, minLines, maxLines }));
    }
    if (minChars !== undefined || maxChars !== undefined) {
        const ok = inRange(charCount, minChars, maxChars);
        const message = ok
            ? `Body has ${charCount} char(s), within bounds.`
            : buildOutOfRangeMessage("char", charCount, minChars, maxChars);
        outcomes.push(result("structural:length:chars", ok, message, { charCount, minChars, maxChars }));
    }
    return outcomes;
}
135
/**
 * Format the failure message for a length check.
 *
 * @param {string} unit Unit label, e.g. "line" or "char".
 * @param {number} actual Measured count.
 * @param {number} [min] Lower bound; shown as `0` when `undefined`.
 * @param {number} [max] Upper bound; shown as `∞` when `undefined`.
 * @returns {string} Message of the form "Body has N unit(s); expected lo..hi."
 */
function buildOutOfRangeMessage(unit, actual, min, max) {
    const bounds = [
        min === undefined ? "0" : String(min),
        max === undefined ? "∞" : String(max)
    ];
    return `Body has ${actual} ${unit}(s); expected ${bounds.join("..")}.`;
}
140
/**
 * Check that each required key is a top-level key of the parsed frontmatter.
 *
 * When the artifact has no frontmatter, or it did not parse to an object,
 * every key fails and the result records whether frontmatter was present.
 *
 * @param {string[]} keys Required frontmatter keys.
 * @param {object} split Output of `splitFrontmatter`.
 * @returns {object[]} One result per key, in input order.
 */
function checkFrontmatterKeys(keys, split) {
    const { hasFrontmatter, frontmatterParsed } = split;
    const usable = Boolean(hasFrontmatter) && Boolean(frontmatterParsed);
    const present = usable ? new Set(Object.keys(frontmatterParsed)) : null;
    return keys.map((key) => {
        const id = `structural:frontmatter:${slugify(key)}`;
        if (present === null) {
            return result(id, false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: hasFrontmatter });
        }
        const ok = present.has(key);
        const message = ok ? `Frontmatter key "${key}" present.` : `Frontmatter key "${key}" missing.`;
        return result(id, ok, message, { key });
    });
}
150
/**
 * Apply every configured structural check to an artifact.
 *
 * Returns an empty array when `expected` is missing, and also when it sets
 * none of the supported fields, so the caller can tell "nothing to verify"
 * apart from "pass". Results are ordered: required sections, forbidden
 * patterns, length bounds, required frontmatter keys.
 *
 * @param {string} artifact Full artifact text, frontmatter included.
 * @param {object} [expected] Structural expectations.
 * @returns {object[]} One verifier result per executed check.
 */
export function verifyStructural(artifact, expected) {
    if (!expected) {
        return [];
    }
    const split = splitFrontmatter(artifact);
    const { body } = split;
    const { requiredSections, forbiddenPatterns, requiredFrontmatterKeys } = expected;
    return [
        ...(requiredSections?.length ? checkRequiredSections(requiredSections, body) : []),
        ...(forbiddenPatterns?.length ? checkForbiddenPatterns(forbiddenPatterns, body) : []),
        ...checkLengthBounds(expected, body),
        ...(requiredFrontmatterKeys?.length ? checkFrontmatterKeys(requiredFrontmatterKeys, split) : [])
    ];
}
package/dist/install.js CHANGED
@@ -28,6 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
28
28
  import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
29
29
  import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
30
30
  import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
31
+ import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
31
32
  import { TDD_WAVE_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
32
33
  import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
33
34
  import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
@@ -184,6 +185,26 @@ async function writeArtifactTemplates(projectRoot) {
184
185
  await writeFileSafe(runtimePath(projectRoot, "templates", fileName), content);
185
186
  }
186
187
  }
188
/**
 * Seed the `.cclaw/evals/` scaffold with its default config and README files.
 *
 * A file is written only when nothing exists at its path yet, so config,
 * corpus, rubrics and baselines authored by the user survive `cclaw sync`.
 * Files are processed one at a time, in the order listed.
 *
 * @param {string} projectRoot Project root passed to `runtimePath`.
 * @returns {Promise<void>}
 */
async function writeEvalScaffold(projectRoot) {
    const seeds = [
        [["evals", "config.yaml"], EVAL_CONFIG_YAML],
        [["evals", "corpus", "README.md"], EVAL_CORPUS_README],
        [["evals", "rubrics", "README.md"], EVAL_RUBRICS_README],
        [["evals", "baselines", "README.md"], EVAL_BASELINES_README],
        [["evals", "reports", "README.md"], EVAL_REPORTS_README]
    ];
    for (const [segments, content] of seeds) {
        const destination = runtimePath(projectRoot, ...segments);
        const alreadyPresent = await exists(destination);
        if (!alreadyPresent) {
            await writeFileSafe(destination, content);
        }
    }
}
187
208
  async function writeSkills(projectRoot, config) {
188
209
  for (const stage of COMMAND_FILE_ORDER) {
189
210
  const folder = stageSkillFolder(stage);
@@ -1044,6 +1065,7 @@ async function materializeRuntime(projectRoot, config, forceStateReset) {
1044
1065
  await writeSkills(projectRoot, config);
1045
1066
  await writeContextModes(projectRoot);
1046
1067
  await writeArtifactTemplates(projectRoot);
1068
+ await writeEvalScaffold(projectRoot);
1047
1069
  await writeRulebook(projectRoot);
1048
1070
  await writeState(projectRoot, config, forceStateReset);
1049
1071
  await ensureRunSystem(projectRoot, { createIfMissing: false });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cclaw-cli",
3
- "version": "0.21.2",
3
+ "version": "0.23.0",
4
4
  "description": "Installer-first flow toolkit for coding agents",
5
5
  "type": "module",
6
6
  "bin": {