cclaw-cli 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +42 -11
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/corpus.d.ts +13 -2
- package/dist/eval/corpus.js +97 -13
- package/dist/eval/report.js +16 -3
- package/dist/eval/runner.d.ts +6 -14
- package/dist/eval/runner.js +122 -40
- package/dist/eval/types.d.ts +88 -8
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
package/dist/cli.js
CHANGED
|
@@ -14,6 +14,7 @@ import { createDefaultConfig, createProfileConfig } from "./config.js";
|
|
|
14
14
|
import { detectHarnesses } from "./init-detect.js";
|
|
15
15
|
import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
16
16
|
import { runEval } from "./eval/runner.js";
|
|
17
|
+
import { writeBaselinesFromReport } from "./eval/baseline.js";
|
|
17
18
|
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
18
19
|
import { EVAL_TIERS } from "./eval/types.js";
|
|
19
20
|
import { FLOW_STAGES } from "./types.js";
|
|
@@ -53,15 +54,17 @@ Commands:
|
|
|
53
54
|
Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
|
|
54
55
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
55
56
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
56
|
-
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.
|
|
57
|
-
Flags: --stage=<id>
|
|
58
|
-
--tier=<A|B|C>
|
|
59
|
-
--schema-only
|
|
60
|
-
--rules
|
|
61
|
-
--judge
|
|
62
|
-
--dry-run
|
|
63
|
-
--json
|
|
64
|
-
--no-write
|
|
57
|
+
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.1: structural verifier).
|
|
58
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
59
|
+
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
60
|
+
--schema-only Run only structural verifiers (Wave 7.1, default).
|
|
61
|
+
--rules Run structural + rule verifiers (Wave 7.2).
|
|
62
|
+
--judge Include LLM judging (Wave 7.3; requires API key).
|
|
63
|
+
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
|
+
--json Emit machine-readable JSON on stdout.
|
|
65
|
+
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
66
|
+
--update-baseline Overwrite baselines from the current run (requires --confirm).
|
|
67
|
+
--confirm Acknowledge --update-baseline (prevents accidental resets).
|
|
65
68
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
66
69
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
67
70
|
|
|
@@ -453,6 +456,14 @@ function parseArgs(argv) {
|
|
|
453
456
|
parsed.evalNoWrite = true;
|
|
454
457
|
continue;
|
|
455
458
|
}
|
|
459
|
+
if (flag === "--update-baseline") {
|
|
460
|
+
parsed.evalUpdateBaseline = true;
|
|
461
|
+
continue;
|
|
462
|
+
}
|
|
463
|
+
if (flag === "--confirm") {
|
|
464
|
+
parsed.evalConfirm = true;
|
|
465
|
+
continue;
|
|
466
|
+
}
|
|
456
467
|
}
|
|
457
468
|
// `--json` is shared between doctor and eval. Disambiguate by command.
|
|
458
469
|
if (parsed.command === "eval" && parsed.doctorJson === true) {
|
|
@@ -592,22 +603,42 @@ async function runCommand(parsed, ctx) {
|
|
|
592
603
|
}
|
|
593
604
|
return 0;
|
|
594
605
|
}
|
|
606
|
+
if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
|
|
607
|
+
error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
|
|
608
|
+
return 1;
|
|
609
|
+
}
|
|
610
|
+
if (parsed.evalUpdateBaseline === true) {
|
|
611
|
+
if (result.summary.failed > 0) {
|
|
612
|
+
error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
|
|
613
|
+
return 1;
|
|
614
|
+
}
|
|
615
|
+
const written = await writeBaselinesFromReport(ctx.cwd, result);
|
|
616
|
+
for (const file of written) {
|
|
617
|
+
info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
|
|
618
|
+
}
|
|
619
|
+
}
|
|
595
620
|
if (parsed.evalNoWrite !== true) {
|
|
596
621
|
const jsonPath = await writeJsonReport(ctx.cwd, result);
|
|
597
622
|
const mdPath = await writeMarkdownReport(ctx.cwd, result);
|
|
598
623
|
info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
|
|
599
624
|
info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
|
|
600
625
|
}
|
|
626
|
+
const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
|
|
601
627
|
if (parsed.evalJson === true) {
|
|
602
628
|
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
603
629
|
}
|
|
604
630
|
else {
|
|
631
|
+
const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
|
|
605
632
|
ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
|
|
606
633
|
`${result.summary.passed} passed, ` +
|
|
607
634
|
`${result.summary.failed} failed, ` +
|
|
608
|
-
`${result.summary.skipped} skipped
|
|
635
|
+
`${result.summary.skipped} skipped${regressionNote}\n`);
|
|
609
636
|
}
|
|
610
|
-
|
|
637
|
+
if (result.summary.failed > 0)
|
|
638
|
+
return 1;
|
|
639
|
+
if (regressionCount > 0)
|
|
640
|
+
return 1;
|
|
641
|
+
return 0;
|
|
611
642
|
}
|
|
612
643
|
if (command === "archive") {
|
|
613
644
|
const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
|
+
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
+
export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
|
|
5
|
+
export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
|
|
6
|
+
export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
|
|
7
|
+
export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Compare a freshly computed report against loaded baselines. If no baseline
|
|
10
|
+
* exists for a stage covered by the report, that stage contributes zero
|
|
11
|
+
* regressions (first run of that stage). Current is the source of truth.
|
|
12
|
+
*/
|
|
13
|
+
export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
|
|
14
|
+
export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline I/O + regression comparison (Wave 7.1).
|
|
3
|
+
*
|
|
4
|
+
* Layout on disk (committed):
|
|
5
|
+
*
|
|
6
|
+
* .cclaw/evals/baselines/<stage>.json
|
|
7
|
+
*
|
|
8
|
+
* Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
|
|
9
|
+
* regressions by comparing per-verifier `ok` flags across runs: any verifier
|
|
10
|
+
* that was `ok:true` in the baseline and is `ok:false` now counts as a
|
|
11
|
+
* critical failure. A case whose aggregate `passed` flipped from true to
|
|
12
|
+
* false is flagged as `case-now-failing` regardless of per-verifier churn.
|
|
13
|
+
*
|
|
14
|
+
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
|
+
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
|
+
*/
|
|
17
|
+
import fs from "node:fs/promises";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
20
|
+
import { exists } from "../fs-utils.js";
|
|
21
|
+
import { FLOW_STAGES } from "../types.js";
|
|
22
|
+
export const BASELINE_SCHEMA_VERSION = 1;
|
|
23
|
+
function baselinePath(projectRoot, stage) {
|
|
24
|
+
return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
|
|
25
|
+
}
|
|
26
|
+
export async function loadBaseline(projectRoot, stage) {
|
|
27
|
+
const filePath = baselinePath(projectRoot, stage);
|
|
28
|
+
if (!(await exists(filePath)))
|
|
29
|
+
return null;
|
|
30
|
+
const raw = await fs.readFile(filePath, "utf8");
|
|
31
|
+
let parsed;
|
|
32
|
+
try {
|
|
33
|
+
parsed = JSON.parse(raw);
|
|
34
|
+
}
|
|
35
|
+
catch (err) {
|
|
36
|
+
throw new Error(`Invalid baseline at ${filePath}: ${err instanceof Error ? err.message : String(err)}`);
|
|
37
|
+
}
|
|
38
|
+
if (!isBaseline(parsed, stage)) {
|
|
39
|
+
throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
|
|
40
|
+
}
|
|
41
|
+
return parsed;
|
|
42
|
+
}
|
|
43
|
+
function isBaseline(value, stage) {
|
|
44
|
+
if (!value || typeof value !== "object")
|
|
45
|
+
return false;
|
|
46
|
+
const candidate = value;
|
|
47
|
+
if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION)
|
|
48
|
+
return false;
|
|
49
|
+
if (candidate.stage !== stage)
|
|
50
|
+
return false;
|
|
51
|
+
if (typeof candidate.generatedAt !== "string")
|
|
52
|
+
return false;
|
|
53
|
+
if (typeof candidate.cclawVersion !== "string")
|
|
54
|
+
return false;
|
|
55
|
+
if (!candidate.cases || typeof candidate.cases !== "object")
|
|
56
|
+
return false;
|
|
57
|
+
return true;
|
|
58
|
+
}
|
|
59
|
+
export async function loadBaselinesByStage(projectRoot, stages) {
|
|
60
|
+
const out = new Map();
|
|
61
|
+
for (const stage of stages) {
|
|
62
|
+
const snapshot = await loadBaseline(projectRoot, stage);
|
|
63
|
+
if (snapshot)
|
|
64
|
+
out.set(stage, snapshot);
|
|
65
|
+
}
|
|
66
|
+
return out;
|
|
67
|
+
}
|
|
68
|
+
function entryFromResult(result) {
|
|
69
|
+
const verifierResults = result.verifierResults.map((v) => ({
|
|
70
|
+
id: v.id,
|
|
71
|
+
kind: v.kind,
|
|
72
|
+
ok: v.ok,
|
|
73
|
+
...(v.score !== undefined ? { score: v.score } : {})
|
|
74
|
+
}));
|
|
75
|
+
return { passed: result.passed, verifierResults };
|
|
76
|
+
}
|
|
77
|
+
export function buildBaselineForStage(stage, report) {
|
|
78
|
+
const stageCases = report.cases.filter((c) => c.stage === stage);
|
|
79
|
+
const cases = {};
|
|
80
|
+
for (const c of stageCases) {
|
|
81
|
+
cases[c.caseId] = entryFromResult(c);
|
|
82
|
+
}
|
|
83
|
+
return {
|
|
84
|
+
schemaVersion: BASELINE_SCHEMA_VERSION,
|
|
85
|
+
stage,
|
|
86
|
+
generatedAt: new Date().toISOString(),
|
|
87
|
+
cclawVersion: CCLAW_VERSION,
|
|
88
|
+
cases
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
export async function writeBaselinesFromReport(projectRoot, report) {
|
|
92
|
+
const written = [];
|
|
93
|
+
const stages = new Set(report.cases.map((c) => c.stage));
|
|
94
|
+
for (const stage of stages) {
|
|
95
|
+
const snapshot = buildBaselineForStage(stage, report);
|
|
96
|
+
const file = baselinePath(projectRoot, stage);
|
|
97
|
+
await fs.mkdir(path.dirname(file), { recursive: true });
|
|
98
|
+
await fs.writeFile(file, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
|
|
99
|
+
written.push(file);
|
|
100
|
+
}
|
|
101
|
+
return written.sort();
|
|
102
|
+
}
|
|
103
|
+
function verifierMap(entries) {
|
|
104
|
+
const out = new Map();
|
|
105
|
+
for (const entry of entries) {
|
|
106
|
+
out.set(entry.id, entry);
|
|
107
|
+
}
|
|
108
|
+
return out;
|
|
109
|
+
}
|
|
110
|
+
function computePassRate(cases) {
|
|
111
|
+
if (cases.length === 0)
|
|
112
|
+
return 1;
|
|
113
|
+
const passed = cases.filter((c) => c.passed).length;
|
|
114
|
+
return passed / cases.length;
|
|
115
|
+
}
|
|
116
|
+
function baselinePassRate(snapshot) {
|
|
117
|
+
const entries = Object.values(snapshot.cases);
|
|
118
|
+
if (entries.length === 0)
|
|
119
|
+
return 1;
|
|
120
|
+
const passed = entries.filter((e) => e.passed).length;
|
|
121
|
+
return passed / entries.length;
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Compare a freshly computed report against loaded baselines. If no baseline
|
|
125
|
+
* exists for a stage covered by the report, that stage contributes zero
|
|
126
|
+
* regressions (first run of that stage). Current is the source of truth.
|
|
127
|
+
*/
|
|
128
|
+
export function compareAgainstBaselines(report, baselines) {
|
|
129
|
+
if (baselines.size === 0)
|
|
130
|
+
return undefined;
|
|
131
|
+
const regressions = [];
|
|
132
|
+
const caseResultsByStage = new Map();
|
|
133
|
+
for (const c of report.cases) {
|
|
134
|
+
const bucket = caseResultsByStage.get(c.stage) ?? [];
|
|
135
|
+
bucket.push(c);
|
|
136
|
+
caseResultsByStage.set(c.stage, bucket);
|
|
137
|
+
}
|
|
138
|
+
let baselineTotalPassRate = 0;
|
|
139
|
+
let baselineStagesCounted = 0;
|
|
140
|
+
for (const [stage, snapshot] of baselines) {
|
|
141
|
+
const current = caseResultsByStage.get(stage) ?? [];
|
|
142
|
+
baselineTotalPassRate += baselinePassRate(snapshot);
|
|
143
|
+
baselineStagesCounted += 1;
|
|
144
|
+
for (const caseResult of current) {
|
|
145
|
+
const baselineEntry = snapshot.cases[caseResult.caseId];
|
|
146
|
+
if (!baselineEntry)
|
|
147
|
+
continue;
|
|
148
|
+
if (baselineEntry.passed && !caseResult.passed) {
|
|
149
|
+
regressions.push({
|
|
150
|
+
caseId: caseResult.caseId,
|
|
151
|
+
stage,
|
|
152
|
+
verifierId: "<case>",
|
|
153
|
+
reason: "case-now-failing",
|
|
154
|
+
previousScore: 1,
|
|
155
|
+
currentScore: 0
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
|
|
159
|
+
for (const currentVerifier of caseResult.verifierResults) {
|
|
160
|
+
const prev = baselineVerifiers.get(currentVerifier.id);
|
|
161
|
+
if (!prev)
|
|
162
|
+
continue;
|
|
163
|
+
if (prev.ok && !currentVerifier.ok) {
|
|
164
|
+
regressions.push({
|
|
165
|
+
caseId: caseResult.caseId,
|
|
166
|
+
stage,
|
|
167
|
+
verifierId: currentVerifier.id,
|
|
168
|
+
reason: "newly-failing",
|
|
169
|
+
previousScore: prev.score ?? 1,
|
|
170
|
+
currentScore: currentVerifier.score ?? 0
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
else if (prev.score !== undefined &&
|
|
174
|
+
currentVerifier.score !== undefined &&
|
|
175
|
+
currentVerifier.score < prev.score) {
|
|
176
|
+
regressions.push({
|
|
177
|
+
caseId: caseResult.caseId,
|
|
178
|
+
stage,
|
|
179
|
+
verifierId: currentVerifier.id,
|
|
180
|
+
reason: "score-drop",
|
|
181
|
+
previousScore: prev.score,
|
|
182
|
+
currentScore: currentVerifier.score
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
const currentPassRate = computePassRate(report.cases);
|
|
189
|
+
const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
|
|
190
|
+
const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
|
|
191
|
+
const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
|
|
192
|
+
const baselineStages = [...baselines.keys()].sort().join(",");
|
|
193
|
+
return {
|
|
194
|
+
baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
|
|
195
|
+
scoreDelta,
|
|
196
|
+
criticalFailures,
|
|
197
|
+
regressions
|
|
198
|
+
};
|
|
199
|
+
}
|
|
200
|
+
export function listBaselineStages(projectRoot) {
|
|
201
|
+
const root = path.join(projectRoot, EVALS_ROOT, "baselines");
|
|
202
|
+
return fs
|
|
203
|
+
.readdir(root, { withFileTypes: true })
|
|
204
|
+
.then((entries) => entries
|
|
205
|
+
.filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
|
|
206
|
+
.map((entry) => entry.name.replace(/\.json$/, ""))
|
|
207
|
+
.filter((name) => FLOW_STAGES.includes(name)))
|
|
208
|
+
.catch(() => []);
|
|
209
|
+
}
|
package/dist/eval/corpus.d.ts
CHANGED
|
@@ -2,7 +2,18 @@ import type { FlowStage } from "../types.js";
|
|
|
2
2
|
import type { EvalCase } from "./types.js";
|
|
3
3
|
/**
|
|
4
4
|
* Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
|
|
5
|
-
* single stage. Returns an empty array for a fresh install
|
|
6
|
-
* without seed cases; corpus is authored in Wave 7.1+).
|
|
5
|
+
* single stage. Returns an empty array for a fresh install.
|
|
7
6
|
*/
|
|
8
7
|
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Resolve a case's `fixture` path to an absolute filesystem path. The fixture
|
|
10
|
+
* field is interpreted relative to the case's stage directory (i.e., a
|
|
11
|
+
* sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
|
|
12
|
+
*/
|
|
13
|
+
export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
|
|
14
|
+
/**
|
|
15
|
+
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
16
|
+
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
17
|
+
* the case but not on disk — Wave 7.1 fixtures ship alongside cases.
|
|
18
|
+
*/
|
|
19
|
+
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
|
package/dist/eval/corpus.js
CHANGED
|
@@ -12,6 +12,76 @@ function corpusError(filePath, reason) {
|
|
|
12
12
|
function isRecord(value) {
|
|
13
13
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
14
14
|
}
|
|
15
|
+
function readStringArray(filePath, context, value) {
|
|
16
|
+
if (value === undefined)
|
|
17
|
+
return undefined;
|
|
18
|
+
if (!Array.isArray(value) || value.some((item) => typeof item !== "string")) {
|
|
19
|
+
throw corpusError(filePath, `"${context}" must be an array of strings`);
|
|
20
|
+
}
|
|
21
|
+
return value;
|
|
22
|
+
}
|
|
23
|
+
function readNonNegativeInteger(filePath, context, value) {
|
|
24
|
+
if (value === undefined)
|
|
25
|
+
return undefined;
|
|
26
|
+
if (typeof value !== "number" || !Number.isFinite(value) || value < 0 || !Number.isInteger(value)) {
|
|
27
|
+
throw corpusError(filePath, `"${context}" must be a non-negative integer`);
|
|
28
|
+
}
|
|
29
|
+
return value;
|
|
30
|
+
}
|
|
31
|
+
function parseStructural(filePath, raw) {
|
|
32
|
+
if (raw === undefined)
|
|
33
|
+
return undefined;
|
|
34
|
+
if (!isRecord(raw)) {
|
|
35
|
+
throw corpusError(filePath, `"expected.structural" must be a mapping`);
|
|
36
|
+
}
|
|
37
|
+
const requiredSections = readStringArray(filePath, "expected.structural.required_sections", raw.required_sections ?? raw.requiredSections);
|
|
38
|
+
const forbiddenPatterns = readStringArray(filePath, "expected.structural.forbidden_patterns", raw.forbidden_patterns ?? raw.forbiddenPatterns);
|
|
39
|
+
const requiredFrontmatterKeys = readStringArray(filePath, "expected.structural.required_frontmatter_keys", raw.required_frontmatter_keys ?? raw.requiredFrontmatterKeys);
|
|
40
|
+
const minLines = readNonNegativeInteger(filePath, "expected.structural.min_lines", raw.min_lines ?? raw.minLines);
|
|
41
|
+
const maxLines = readNonNegativeInteger(filePath, "expected.structural.max_lines", raw.max_lines ?? raw.maxLines);
|
|
42
|
+
const minChars = readNonNegativeInteger(filePath, "expected.structural.min_chars", raw.min_chars ?? raw.minChars);
|
|
43
|
+
const maxChars = readNonNegativeInteger(filePath, "expected.structural.max_chars", raw.max_chars ?? raw.maxChars);
|
|
44
|
+
const structural = {};
|
|
45
|
+
if (requiredSections)
|
|
46
|
+
structural.requiredSections = requiredSections;
|
|
47
|
+
if (forbiddenPatterns)
|
|
48
|
+
structural.forbiddenPatterns = forbiddenPatterns;
|
|
49
|
+
if (requiredFrontmatterKeys)
|
|
50
|
+
structural.requiredFrontmatterKeys = requiredFrontmatterKeys;
|
|
51
|
+
if (minLines !== undefined)
|
|
52
|
+
structural.minLines = minLines;
|
|
53
|
+
if (maxLines !== undefined)
|
|
54
|
+
structural.maxLines = maxLines;
|
|
55
|
+
if (minChars !== undefined)
|
|
56
|
+
structural.minChars = minChars;
|
|
57
|
+
if (maxChars !== undefined)
|
|
58
|
+
structural.maxChars = maxChars;
|
|
59
|
+
return structural;
|
|
60
|
+
}
|
|
61
|
+
function parseExpected(filePath, raw) {
|
|
62
|
+
if (raw === undefined)
|
|
63
|
+
return undefined;
|
|
64
|
+
if (!isRecord(raw)) {
|
|
65
|
+
throw corpusError(filePath, `"expected" must be a mapping`);
|
|
66
|
+
}
|
|
67
|
+
const shape = {};
|
|
68
|
+
const structural = parseStructural(filePath, raw.structural);
|
|
69
|
+
if (structural)
|
|
70
|
+
shape.structural = structural;
|
|
71
|
+
if (raw.rules !== undefined) {
|
|
72
|
+
if (!isRecord(raw.rules)) {
|
|
73
|
+
throw corpusError(filePath, `"expected.rules" must be a mapping`);
|
|
74
|
+
}
|
|
75
|
+
shape.rules = raw.rules;
|
|
76
|
+
}
|
|
77
|
+
if (raw.judge !== undefined) {
|
|
78
|
+
if (!isRecord(raw.judge)) {
|
|
79
|
+
throw corpusError(filePath, `"expected.judge" must be a mapping`);
|
|
80
|
+
}
|
|
81
|
+
shape.judge = raw.judge;
|
|
82
|
+
}
|
|
83
|
+
return Object.keys(shape).length === 0 ? undefined : shape;
|
|
84
|
+
}
|
|
15
85
|
function validateCase(filePath, raw) {
|
|
16
86
|
if (!isRecord(raw)) {
|
|
17
87
|
throw corpusError(filePath, "top-level value must be a mapping");
|
|
@@ -28,17 +98,8 @@ function validateCase(filePath, raw) {
|
|
|
28
98
|
if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
|
|
29
99
|
throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
|
|
30
100
|
}
|
|
31
|
-
const
|
|
32
|
-
|
|
33
|
-
if (contextFilesRaw !== undefined) {
|
|
34
|
-
if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
|
|
35
|
-
throw corpusError(filePath, `"context_files" must be an array of strings`);
|
|
36
|
-
}
|
|
37
|
-
contextFiles = contextFilesRaw;
|
|
38
|
-
}
|
|
39
|
-
const expected = raw.expected !== undefined && isRecord(raw.expected)
|
|
40
|
-
? raw.expected
|
|
41
|
-
: undefined;
|
|
101
|
+
const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
|
|
102
|
+
const expected = parseExpected(filePath, raw.expected);
|
|
42
103
|
const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
|
|
43
104
|
return {
|
|
44
105
|
id: id.trim(),
|
|
@@ -51,8 +112,7 @@ function validateCase(filePath, raw) {
|
|
|
51
112
|
}
|
|
52
113
|
/**
|
|
53
114
|
* Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
|
|
54
|
-
* single stage. Returns an empty array for a fresh install
|
|
55
|
-
* without seed cases; corpus is authored in Wave 7.1+).
|
|
115
|
+
* single stage. Returns an empty array for a fresh install.
|
|
56
116
|
*/
|
|
57
117
|
export async function loadCorpus(projectRoot, stage) {
|
|
58
118
|
const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
|
|
@@ -89,3 +149,27 @@ export async function loadCorpus(projectRoot, stage) {
|
|
|
89
149
|
cases.sort((a, b) => a.stage.localeCompare(b.stage) || a.id.localeCompare(b.id));
|
|
90
150
|
return cases;
|
|
91
151
|
}
|
|
152
|
+
/**
|
|
153
|
+
* Resolve a case's `fixture` path to an absolute filesystem path. The fixture
|
|
154
|
+
* field is interpreted relative to the case's stage directory (i.e., a
|
|
155
|
+
* sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
|
|
156
|
+
*/
|
|
157
|
+
export function fixturePathFor(projectRoot, caseEntry) {
|
|
158
|
+
if (!caseEntry.fixture)
|
|
159
|
+
return undefined;
|
|
160
|
+
return path.resolve(projectRoot, EVALS_ROOT, "corpus", caseEntry.stage, caseEntry.fixture);
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
164
|
+
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
165
|
+
* the case but not on disk — Wave 7.1 fixtures ship alongside cases.
|
|
166
|
+
*/
|
|
167
|
+
export async function readFixtureArtifact(projectRoot, caseEntry) {
|
|
168
|
+
const fixturePath = fixturePathFor(projectRoot, caseEntry);
|
|
169
|
+
if (!fixturePath)
|
|
170
|
+
return undefined;
|
|
171
|
+
if (!(await exists(fixturePath))) {
|
|
172
|
+
throw new Error(`Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${fixturePath}`);
|
|
173
|
+
}
|
|
174
|
+
return fs.readFile(fixturePath, "utf8");
|
|
175
|
+
}
|
package/dist/eval/report.js
CHANGED
|
@@ -39,12 +39,25 @@ export function formatMarkdownReport(report) {
|
|
|
39
39
|
lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
|
|
40
40
|
lines.push(``);
|
|
41
41
|
if (report.baselineDelta) {
|
|
42
|
+
const delta = report.baselineDelta;
|
|
42
43
|
lines.push(`## Baseline delta`);
|
|
43
44
|
lines.push(``);
|
|
44
|
-
lines.push(`- baseline: ${
|
|
45
|
-
lines.push(`- score delta: ${
|
|
46
|
-
lines.push(`- critical failures: ${
|
|
45
|
+
lines.push(`- baseline: ${delta.baselineId}`);
|
|
46
|
+
lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
|
|
47
|
+
lines.push(`- critical failures: ${delta.criticalFailures}`);
|
|
47
48
|
lines.push(``);
|
|
49
|
+
if (delta.regressions.length > 0) {
|
|
50
|
+
lines.push(`### Regressions`);
|
|
51
|
+
lines.push(``);
|
|
52
|
+
lines.push(`| stage | case id | verifier | reason | prev | curr |`);
|
|
53
|
+
lines.push(`| --- | --- | --- | --- | --- | --- |`);
|
|
54
|
+
for (const reg of delta.regressions) {
|
|
55
|
+
const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
|
|
56
|
+
const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
|
|
57
|
+
lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
|
|
58
|
+
}
|
|
59
|
+
lines.push(``);
|
|
60
|
+
}
|
|
48
61
|
}
|
|
49
62
|
if (report.cases.length === 0) {
|
|
50
63
|
lines.push(`## Cases`);
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ export interface RunEvalOptions {
|
|
|
4
4
|
projectRoot: string;
|
|
5
5
|
stage?: FlowStage;
|
|
6
6
|
tier?: EvalTier;
|
|
7
|
-
/** When true, run only structural verifiers
|
|
7
|
+
/** When true, run only structural verifiers (Wave 7.1). */
|
|
8
8
|
schemaOnly?: boolean;
|
|
9
9
|
/** When true, run structural + rule-based verifiers. Wave 7.2 wires rules. */
|
|
10
10
|
rules?: boolean;
|
|
@@ -27,10 +27,6 @@ export interface DryRunSummary {
|
|
|
27
27
|
}>;
|
|
28
28
|
};
|
|
29
29
|
plannedTier: EvalTier;
|
|
30
|
-
/**
|
|
31
|
-
* Waves 7.1–7.3 progressively flip these to `true`. Wave 7.0 is `false`
|
|
32
|
-
* across the board because no verifier is implemented yet.
|
|
33
|
-
*/
|
|
34
30
|
verifiersAvailable: {
|
|
35
31
|
structural: boolean;
|
|
36
32
|
rules: boolean;
|
|
@@ -40,14 +36,10 @@ export interface DryRunSummary {
|
|
|
40
36
|
notes: string[];
|
|
41
37
|
}
|
|
42
38
|
/**
|
|
43
|
-
* Wave 7.
|
|
44
|
-
*
|
|
45
|
-
* -
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
* Waves 7.1+ will replace the "no verifiers available" branch with the real
|
|
50
|
-
* verifier dispatch pipeline. The signature stays stable so CLI wiring does
|
|
51
|
-
* not churn.
|
|
39
|
+
* Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
|
|
40
|
+
* active), runs structural verifiers against fixture-backed cases and loads
|
|
41
|
+
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
42
|
+
* still arrive in Waves 7.3+; until then cases without `fixture` are marked
|
|
43
|
+
* as skipped rather than failing.
|
|
52
44
|
*/
|
|
53
45
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -1,23 +1,121 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
-
import {
|
|
3
|
+
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
|
+
import { loadCorpus, readFixtureArtifact } from "./corpus.js";
|
|
4
6
|
import { loadEvalConfig } from "./config-loader.js";
|
|
7
|
+
import { verifyStructural } from "./verifiers/structural.js";
|
|
5
8
|
function groupByStage(cases) {
|
|
6
9
|
return cases.reduce((acc, item) => {
|
|
7
10
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
8
11
|
return acc;
|
|
9
12
|
}, {});
|
|
10
13
|
}
|
|
14
|
+
function skeletonVerifierResult(message, details) {
|
|
15
|
+
return {
|
|
16
|
+
kind: "structural",
|
|
17
|
+
id: "wave-7-1-no-structural-expected",
|
|
18
|
+
ok: true,
|
|
19
|
+
score: 1,
|
|
20
|
+
message,
|
|
21
|
+
...(details !== undefined ? { details } : {})
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
|
|
25
|
+
const started = Date.now();
|
|
26
|
+
const structuralExpected = caseEntry.expected?.structural;
|
|
27
|
+
const verifierResults = [];
|
|
28
|
+
if (!structuralExpected || Object.keys(structuralExpected).length === 0) {
|
|
29
|
+
// No structural expectations declared — case is treated as "N/A" for this
|
|
30
|
+
// verifier kind; a placeholder pass keeps downstream math simple while
|
|
31
|
+
// making the situation visible in the report.
|
|
32
|
+
verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
|
|
33
|
+
}
|
|
34
|
+
else {
|
|
35
|
+
let artifact;
|
|
36
|
+
try {
|
|
37
|
+
artifact = await readFixtureArtifact(projectRoot, caseEntry);
|
|
38
|
+
}
|
|
39
|
+
catch (err) {
|
|
40
|
+
verifierResults.push({
|
|
41
|
+
kind: "structural",
|
|
42
|
+
id: "structural:fixture:missing",
|
|
43
|
+
ok: false,
|
|
44
|
+
score: 0,
|
|
45
|
+
message: err instanceof Error ? err.message : String(err),
|
|
46
|
+
details: { fixture: caseEntry.fixture }
|
|
47
|
+
});
|
|
48
|
+
}
|
|
49
|
+
if (artifact !== undefined) {
|
|
50
|
+
const results = verifyStructural(artifact, structuralExpected);
|
|
51
|
+
if (results.length === 0) {
|
|
52
|
+
verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
|
|
53
|
+
}
|
|
54
|
+
else {
|
|
55
|
+
verifierResults.push(...results);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
else if (verifierResults.length === 0) {
|
|
59
|
+
verifierResults.push({
|
|
60
|
+
kind: "structural",
|
|
61
|
+
id: "structural:fixture:absent",
|
|
62
|
+
ok: false,
|
|
63
|
+
score: 0,
|
|
64
|
+
message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
|
|
65
|
+
details: { fixtureProvided: false }
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
const allOk = verifierResults.every((r) => r.ok);
|
|
70
|
+
return {
|
|
71
|
+
caseId: caseEntry.id,
|
|
72
|
+
stage: caseEntry.stage,
|
|
73
|
+
tier: plannedTier,
|
|
74
|
+
passed: allOk,
|
|
75
|
+
durationMs: Date.now() - started,
|
|
76
|
+
verifierResults
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
function reduceSummary(caseResults) {
|
|
80
|
+
let passed = 0;
|
|
81
|
+
let failed = 0;
|
|
82
|
+
let skipped = 0;
|
|
83
|
+
let totalCostUsd = 0;
|
|
84
|
+
let totalDurationMs = 0;
|
|
85
|
+
for (const c of caseResults) {
|
|
86
|
+
totalDurationMs += c.durationMs;
|
|
87
|
+
if (c.costUsd !== undefined)
|
|
88
|
+
totalCostUsd += c.costUsd;
|
|
89
|
+
if (c.verifierResults.length === 1 && c.verifierResults[0]?.details?.skipped === true) {
|
|
90
|
+
skipped += 1;
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
if (c.passed)
|
|
94
|
+
passed += 1;
|
|
95
|
+
else
|
|
96
|
+
failed += 1;
|
|
97
|
+
}
|
|
98
|
+
return {
|
|
99
|
+
totalCases: caseResults.length,
|
|
100
|
+
passed,
|
|
101
|
+
failed,
|
|
102
|
+
skipped,
|
|
103
|
+
totalCostUsd: Number(totalCostUsd.toFixed(6)),
|
|
104
|
+
totalDurationMs
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
/**
 * List the stages that occur in the given case results, in the order they are
 * declared in `FLOW_STAGES` (not in the order the cases were run).
 *
 * @param {Array<{stage: string}>} caseResults Case results to inspect.
 * @returns {string[]} Members of `FLOW_STAGES` referenced by at least one case.
 */
function stagesInResults(caseResults) {
    const seenStages = new Set(caseResults.map((entry) => entry.stage));
    return FLOW_STAGES.filter((stage) => seenStages.has(stage));
}
|
|
11
113
|
/**
|
|
12
|
-
* Wave 7.
|
|
13
|
-
*
|
|
14
|
-
* -
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
* Waves 7.1+ will replace the "no verifiers available" branch with the real
|
|
19
|
-
* verifier dispatch pipeline. The signature stays stable so CLI wiring does
|
|
20
|
-
* not churn.
|
|
114
|
+
* Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
|
|
115
|
+
* active), runs structural verifiers against fixture-backed cases and loads
|
|
116
|
+
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
117
|
+
* still arrive in Waves 7.3+; until then cases without `fixture` are marked
|
|
118
|
+
* as skipped rather than failing.
|
|
21
119
|
*/
|
|
22
120
|
export async function runEval(options) {
|
|
23
121
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
@@ -25,10 +123,7 @@ export async function runEval(options) {
|
|
|
25
123
|
const plannedTier = options.tier ?? config.defaultTier;
|
|
26
124
|
const notes = [];
|
|
27
125
|
if (corpus.length === 0) {
|
|
28
|
-
notes.push("Corpus is empty. Seed cases
|
|
29
|
-
}
|
|
30
|
-
if (options.schemaOnly) {
|
|
31
|
-
notes.push("--schema-only is accepted; structural verifiers wire up in Wave 7.1.");
|
|
126
|
+
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
32
127
|
}
|
|
33
128
|
if (options.rules) {
|
|
34
129
|
notes.push("--rules is accepted; rule verifiers wire up in Wave 7.2.");
|
|
@@ -47,7 +142,7 @@ export async function runEval(options) {
|
|
|
47
142
|
},
|
|
48
143
|
plannedTier,
|
|
49
144
|
verifiersAvailable: {
|
|
50
|
-
structural:
|
|
145
|
+
structural: true,
|
|
51
146
|
rules: false,
|
|
52
147
|
judge: false,
|
|
53
148
|
workflow: false
|
|
@@ -57,22 +152,13 @@ export async function runEval(options) {
|
|
|
57
152
|
return summary;
|
|
58
153
|
}
|
|
59
154
|
const now = new Date().toISOString();
|
|
60
|
-
const caseResults =
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
{
|
|
68
|
-
kind: "structural",
|
|
69
|
-
id: "wave-7-0-skeleton",
|
|
70
|
-
ok: false,
|
|
71
|
-
message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
|
|
72
|
-
details: { skipped: true }
|
|
73
|
-
}
|
|
74
|
-
]
|
|
75
|
-
}));
|
|
155
|
+
const caseResults = [];
|
|
156
|
+
for (const item of corpus) {
|
|
157
|
+
caseResults.push(await runCaseStructural(options.projectRoot, item, plannedTier));
|
|
158
|
+
}
|
|
159
|
+
const stages = stagesInResults(caseResults);
|
|
160
|
+
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
|
161
|
+
const summary = reduceSummary(caseResults);
|
|
76
162
|
const report = {
|
|
77
163
|
schemaVersion: 1,
|
|
78
164
|
generatedAt: now,
|
|
@@ -81,16 +167,12 @@ export async function runEval(options) {
|
|
|
81
167
|
provider: config.provider,
|
|
82
168
|
model: config.model,
|
|
83
169
|
tier: plannedTier,
|
|
84
|
-
stages
|
|
170
|
+
stages,
|
|
85
171
|
cases: caseResults,
|
|
86
|
-
summary
|
|
87
|
-
totalCases: caseResults.length,
|
|
88
|
-
passed: 0,
|
|
89
|
-
failed: 0,
|
|
90
|
-
skipped: caseResults.length,
|
|
91
|
-
totalCostUsd: 0,
|
|
92
|
-
totalDurationMs: 0
|
|
93
|
-
}
|
|
172
|
+
summary
|
|
94
173
|
};
|
|
174
|
+
const baselineDelta = compareAgainstBaselines(report, baselines);
|
|
175
|
+
if (baselineDelta)
|
|
176
|
+
report.baselineDelta = baselineDelta;
|
|
95
177
|
return report;
|
|
96
178
|
}
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -27,6 +27,45 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
|
27
27
|
*/
|
|
28
28
|
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
29
29
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
|
+
/**
|
|
31
|
+
* Structural expectations — deterministic, LLM-free checks against a single
|
|
32
|
+
* text artifact. Wave 7.1 implements all fields below; Wave 7.2 adds the
|
|
33
|
+
* sibling `rules` shape, Wave 7.3 adds `judge`.
|
|
34
|
+
*/
|
|
35
|
+
export interface StructuralExpected {
    /**
     * Case-insensitive substrings that must each appear on at least one markdown
     * heading line (line starting with `#`). Useful for "required sections".
     * Each entry is trimmed before matching and yields its own verifier result.
     * A heading line is one that, after leading whitespace is removed, starts
     * with one to six `#` followed by whitespace and a non-space character.
     */
    requiredSections?: string[];
    /**
     * Case-insensitive substrings that must NOT appear anywhere in the body
     * (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
     * Matching runs against the body with frontmatter removed; an empty string
     * entry counts zero occurrences and therefore always passes.
     */
    forbiddenPatterns?: string[];
    /**
     * Inclusive minimum line count of the artifact body (frontmatter excluded).
     * Lines are counted by splitting on LF/CRLF, so a trailing newline adds one
     * empty final line; an empty body counts as 0 lines.
     */
    minLines?: number;
    /** Inclusive maximum line count of the artifact body (frontmatter excluded). */
    maxLines?: number;
    /**
     * Inclusive minimum character count of the artifact body. Measured as the
     * JavaScript string length of the body with frontmatter excluded.
     */
    minChars?: number;
    /** Inclusive maximum character count of the artifact body. */
    maxChars?: number;
    /**
     * Keys that must appear in the leading YAML frontmatter (between a pair of
     * `---` delimiters at the very top of the file). An artifact without
     * frontmatter will fail every entry. Only top-level keys of the parsed
     * frontmatter are considered, and key names are compared exactly.
     */
    requiredFrontmatterKeys?: string[];
}
|
|
61
|
+
/** Superset of per-verifier expectation shapes. Only `structural` is wired in Wave 7.1. */
|
|
62
|
+
export interface ExpectedShape {
    /** Deterministic structural checks (sections, forbidden text, length, frontmatter keys). */
    structural?: StructuralExpected;
    /** Rule-based (keyword/regex/traceability) checks — Wave 7.2. Shape is not yet typed. */
    rules?: Record<string, unknown>;
    /** LLM-judge rubrics — Wave 7.3. Shape is not yet typed. */
    judge?: Record<string, unknown>;
}
|
|
30
69
|
/**
|
|
31
70
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
32
71
|
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
@@ -40,10 +79,10 @@ export interface EvalCase {
|
|
|
40
79
|
/** Project files copied into the Tier B/C sandbox before the agent runs. */
|
|
41
80
|
contextFiles?: string[];
|
|
42
81
|
/**
|
|
43
|
-
*
|
|
44
|
-
*
|
|
82
|
+
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
83
|
+
* Each sub-shape is optional; missing sub-shapes skip that verifier tier.
|
|
45
84
|
*/
|
|
46
|
-
expected?:
|
|
85
|
+
expected?: ExpectedShape;
|
|
47
86
|
/**
|
|
48
87
|
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
49
88
|
* when verifiers are exercised without a live agent loop. Primarily a Wave
|
|
@@ -91,11 +130,7 @@ export interface EvalReport {
|
|
|
91
130
|
totalDurationMs: number;
|
|
92
131
|
};
|
|
93
132
|
/** Present when comparing against a saved baseline (Wave 7.1+). */
|
|
94
|
-
baselineDelta?:
|
|
95
|
-
baselineId: string;
|
|
96
|
-
scoreDelta: number;
|
|
97
|
-
criticalFailures: number;
|
|
98
|
-
};
|
|
133
|
+
baselineDelta?: BaselineDelta;
|
|
99
134
|
}
|
|
100
135
|
/**
|
|
101
136
|
* Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
|
|
@@ -134,3 +169,48 @@ export interface ResolvedEvalConfig extends EvalConfig {
|
|
|
134
169
|
apiKey?: string;
|
|
135
170
|
source: "default" | "file" | "env" | "file+env";
|
|
136
171
|
}
|
|
172
|
+
/**
|
|
173
|
+
* Frozen per-stage baseline used by regression gating (Wave 7.1). Baselines
|
|
174
|
+
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
175
|
+
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
176
|
+
* changed between runs.
|
|
177
|
+
*/
|
|
178
|
+
export interface BaselineSnapshot {
    /** Snapshot format version; the literal type pins it to 1. */
    schemaVersion: 1;
    /** Flow stage this snapshot belongs to (one snapshot per stage). */
    stage: FlowStage;
    /** When the snapshot was written. NOTE(review): presumably an ISO-8601 string — confirm in baseline.js. */
    generatedAt: string;
    /** Version of cclaw associated with the snapshot. NOTE(review): presumably the writing CLI's version — confirm in baseline.js. */
    cclawVersion: string;
    /** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
    cases: Record<string, BaselineCaseEntry>;
}
|
|
186
|
+
/** Recorded outcome of a single eval case inside a `BaselineSnapshot`. */
export interface BaselineCaseEntry {
    /** Whether the case passed when the baseline was recorded. */
    passed: boolean;
    /** Reduced per-check outcomes recorded for the case. */
    verifierResults: BaselineVerifierEntry[];
}
|
|
190
|
+
/** Reduced form of one verifier check as stored in a baseline. */
export interface BaselineVerifierEntry {
    /** Check identifier, e.g. `structural:section:<slug>` or `structural:length:lines` for structural checks. */
    id: string;
    /** Verifier family that produced the check. */
    kind: VerifierKind;
    /** Whether the check passed when the baseline was recorded. */
    ok: boolean;
    /** Recorded score, when the check reported one (structural checks report 0 or 1). */
    score?: number;
}
|
|
196
|
+
/**
|
|
197
|
+
* Delta between a fresh report and the saved baseline. Populated when
|
|
198
|
+
* baselines exist on disk and the run covers matching cases.
|
|
199
|
+
*/
|
|
200
|
+
export interface BaselineDelta {
    /** Identifier of the baseline compared against. NOTE(review): format is decided in baseline.js — confirm there. */
    baselineId: string;
    /** Fresh-score − baseline-score, bounded to [-1, 1]. */
    scoreDelta: number;
    /** Count of checks that flipped from `ok:true` to `ok:false`. */
    criticalFailures: number;
    /** Per-case regression details for the Markdown report. */
    regressions: BaselineRegression[];
}
|
|
209
|
+
/** One regression found when comparing a fresh report against a baseline. */
export interface BaselineRegression {
    /** Id of the eval case that regressed. */
    caseId: string;
    /** Flow stage the case belongs to. */
    stage: FlowStage;
    /** Id of the verifier check involved in the regression. */
    verifierId: string;
    /**
     * Regression category.
     * NOTE(review): the exact rule behind each value lives in baseline.js — confirm there.
     */
    reason: "newly-failing" | "case-now-failing" | "score-drop";
    /** Score recorded in the baseline, when one was available. */
    previousScore?: number;
    /** Score from the fresh run, when one was available. */
    currentScore?: number;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { StructuralExpected, VerifierResult } from "../types.js";
|
|
2
|
+
/** Result of separating an artifact's leading YAML frontmatter from its body. */
export interface ArtifactSplit {
    /** True when both an opening and a closing `---` fence were found, even if the YAML between them did not parse. */
    hasFrontmatter: boolean;
    /** Raw text between the fences; empty string when no frontmatter was detected. */
    frontmatterRaw: string;
    /** Parsed frontmatter; set only when the YAML parses to a non-array object. */
    frontmatterParsed?: Record<string, unknown>;
    /** Text after the closing fence, or the whole artifact when no frontmatter was detected. */
    body: string;
}
|
|
8
|
+
/**
 * Separate the leading YAML frontmatter (delimited by `---` lines at the very
 * top of the text) from the rest of the artifact. When no frontmatter is
 * detected the whole artifact is returned as `body`.
 */
export declare function splitFrontmatter(artifact: string): ArtifactSplit;
/**
 * Run every configured structural check against the artifact text.
 * Returns [] when `expected` is undefined/empty so the runner can treat
 * "no structural expectations" as "no verifier results" rather than "pass".
 * Results are emitted in a fixed order: required sections, forbidden patterns,
 * length bounds, then frontmatter keys.
 */
export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural verifier (Wave 7.1): deterministic, zero-LLM checks against a
|
|
3
|
+
* single markdown artifact. Each structural expectation produces one
|
|
4
|
+
* `VerifierResult` so baselines diff cleanly at the check level rather than
|
|
5
|
+
* lumping everything into a single boolean.
|
|
6
|
+
*
|
|
7
|
+
* Design notes:
|
|
8
|
+
*
|
|
9
|
+
* - All pattern matching is case-insensitive. Authoring a check as
|
|
10
|
+
* `"Directions"` matches `## Directions` and `### directions-suggested`.
|
|
11
|
+
* - Frontmatter detection is permissive: it must start at byte 0 with `---\n`
|
|
12
|
+
* and close on a subsequent `---` line. Anything else is treated as "no
|
|
13
|
+
* frontmatter", which fails every `requiredFrontmatterKeys` entry
|
|
14
|
+
* deterministically.
|
|
15
|
+
* - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
|
|
16
|
+
* adds metadata does not accidentally drop the body below the floor.
|
|
17
|
+
* - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
|
|
18
|
+
* all individual `ok` flags. This keeps Wave 7.1 deterministic; the 0..1
|
|
19
|
+
* rubric scale shows up in Wave 7.3 (judge).
|
|
20
|
+
*/
|
|
21
|
+
import { parse as parseYaml } from "yaml";
|
|
22
|
+
// Opening fence: the text must begin with `---` followed directly by LF or CRLF.
const FRONTMATTER_OPEN = /^---\r?\n/;
// Closing fence: a newline, `---`, an optional `\r`, then a newline or end of input.
// The leading newline is mandatory, so this pattern never matches at offset 0.
const FRONTMATTER_CLOSE = /\r?\n---\r?(?:\n|$)/;
|
|
24
|
+
/**
 * Turn an arbitrary pattern/key into a short id fragment: lowercase, every run
 * of characters outside `[a-z0-9]` collapsed to a single `-`, no leading or
 * trailing `-`, at most 64 characters.
 *
 * @param {string} input Text to convert.
 * @returns {string} Slug of at most 64 characters (may be empty).
 */
function slugify(input) {
    return input
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-/, "")
        .slice(0, 64)
        // Trim the trailing dash only after truncating: cutting at 64 characters
        // can land on a separator and would otherwise leave ids ending in "-".
        .replace(/-$/, "");
}
|
|
31
|
+
/**
 * Split a markdown artifact into its leading YAML frontmatter and its body.
 *
 * Frontmatter is recognised only when the text starts with a `---` line and a
 * later line consisting of `---` closes it. An empty block (`---` directly
 * followed by `---`) is recognised as frontmatter with empty content.
 *
 * @param {string} artifact Full artifact text.
 * @returns {{hasFrontmatter: boolean, frontmatterRaw: string, frontmatterParsed?: object, body: string}}
 *   `hasFrontmatter` is true whenever both fences were found, even if the YAML
 *   is unparseable; `frontmatterParsed` is set only when the YAML parses to a
 *   non-array object. Without frontmatter, `body` is the whole artifact.
 */
export function splitFrontmatter(artifact) {
    if (!FRONTMATTER_OPEN.test(artifact)) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    const afterOpen = artifact.replace(FRONTMATTER_OPEN, "");
    // FRONTMATTER_CLOSE needs a newline before the closing fence, which an empty
    // block does not have once the opening fence is stripped. Check for a fence
    // at offset 0 first so a later `---` rule in the body is not mistaken for
    // the closing delimiter.
    const closeMatch = /^---\r?(?:\n|$)/.exec(afterOpen) ?? afterOpen.match(FRONTMATTER_CLOSE);
    if (!closeMatch || closeMatch.index === undefined) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    const frontmatterRaw = afterOpen.slice(0, closeMatch.index);
    const body = afterOpen.slice(closeMatch.index + closeMatch[0].length);
    let frontmatterParsed;
    try {
        const parsed = parseYaml(frontmatterRaw);
        // Only mappings are useful for key checks; scalars, arrays and null are ignored.
        if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
            frontmatterParsed = parsed;
        }
    }
    catch {
        // Deliberate best-effort: malformed YAML is reported as "present but unparseable".
        frontmatterParsed = undefined;
    }
    return {
        hasFrontmatter: true,
        frontmatterRaw,
        frontmatterParsed,
        body
    };
}
|
|
59
|
+
/**
 * Collect the heading lines of a markdown body.
 *
 * A line qualifies when, after leading whitespace is removed, it starts with
 * one to six `#` followed by whitespace and at least one non-space character.
 * Returned lines have leading whitespace removed; trailing text is untouched.
 *
 * @param {string} body Markdown text (LF or CRLF line endings).
 * @returns {string[]} Heading lines in document order.
 */
function extractHeadingLines(body) {
    const headingPattern = /^#{1,6}\s+\S/;
    const headings = [];
    for (const rawLine of body.split(/\r?\n/)) {
        const candidate = rawLine.trimStart();
        if (headingPattern.test(candidate)) {
            headings.push(candidate);
        }
    }
    return headings;
}
|
|
65
|
+
/**
 * Build a structural verifier result.
 *
 * @param {string} id Check identifier.
 * @param {boolean} ok Whether the check passed.
 * @param {string} message Human-readable outcome.
 * @param {*} [details] Extra data; the `details` key is omitted entirely when this is `undefined`.
 * @returns {object} Result with `kind: "structural"` and a score of 1 (pass) or 0 (fail).
 */
function result(id, ok, message, details) {
    const entry = {
        kind: "structural",
        id,
        ok,
        score: ok ? 1 : 0,
        message
    };
    if (details !== undefined) {
        entry.details = details;
    }
    return entry;
}
|
|
75
|
+
/**
 * Produce one structural result per required section.
 *
 * A section passes when its lowercased, trimmed text is a substring of at
 * least one lowercased heading line of the body.
 *
 * @param {string[]} sections Required section substrings.
 * @param {string} body Artifact body (frontmatter already removed by the caller).
 * @returns {object[]} One result per entry of `sections`, in the same order.
 */
function checkRequiredSections(sections, body) {
    const loweredHeadings = [];
    for (const headingLine of extractHeadingLines(body)) {
        loweredHeadings.push(headingLine.toLowerCase());
    }
    const anyHeadingHas = (fragment) => {
        for (const candidate of loweredHeadings) {
            if (candidate.includes(fragment)) {
                return true;
            }
        }
        return false;
    };
    const outcomes = [];
    for (const section of sections) {
        const isPresent = anyHeadingHas(section.toLowerCase().trim());
        const message = isPresent
            ? `Section matching "${section}" present.`
            : `No heading contains "${section}".`;
        outcomes.push(result(`structural:section:${slugify(section)}`, isPresent, message, {
            pattern: section,
            searchedHeadings: loweredHeadings.length
        }));
    }
    return outcomes;
}
|
|
85
|
+
/**
 * Produce one structural result per forbidden pattern.
 *
 * Matching is a case-insensitive substring search over the whole body; a
 * pattern passes only when it occurs zero times.
 *
 * @param {string[]} patterns Substrings that must not occur.
 * @param {string} body Artifact body (frontmatter already removed by the caller).
 * @returns {object[]} One result per entry of `patterns`, in the same order.
 */
function checkForbiddenPatterns(patterns, body) {
    const loweredBody = body.toLowerCase();
    const outcomes = [];
    for (const pattern of patterns) {
        const occurrences = countOccurrences(loweredBody, pattern.toLowerCase());
        const isAbsent = occurrences === 0;
        let message;
        if (isAbsent) {
            message = `Pattern "${pattern}" absent (as required).`;
        }
        else {
            message = `Pattern "${pattern}" appears ${occurrences} time(s); remove.`;
        }
        outcomes.push(result(`structural:forbidden:${slugify(pattern)}`, isAbsent, message, { pattern, occurrences }));
    }
    return outcomes;
}
|
|
96
|
+
/**
 * Count non-overlapping occurrences of `needle` in `haystack`, scanning left
 * to right. An empty needle counts as zero occurrences.
 *
 * @param {string} haystack Text to search.
 * @param {string} needle Substring to look for.
 * @returns {number} Number of non-overlapping matches.
 */
function countOccurrences(haystack, needle) {
    if (needle.length === 0) {
        return 0;
    }
    let total = 0;
    // Resume after the end of each match so overlapping hits are not counted twice.
    for (let pos = haystack.indexOf(needle, 0); pos >= 0; pos = haystack.indexOf(needle, pos + needle.length)) {
        total += 1;
    }
    return total;
}
|
|
109
|
+
/**
 * Check the body against the configured line and character bounds.
 *
 * Emits at most two results: `structural:length:lines` when `minLines` or
 * `maxLines` is set, and `structural:length:chars` when `minChars` or
 * `maxChars` is set. All bounds are inclusive; an unset bound never fails.
 *
 * @param {object} expected Structural expectations (only the four bound fields are read).
 * @param {string} body Artifact body (frontmatter already removed by the caller).
 * @returns {object[]} Zero, one or two results (lines first, then chars).
 */
function checkLengthBounds(expected, body) {
    const { minLines, maxLines, minChars, maxChars } = expected;
    // An empty body has zero lines; otherwise every LF/CRLF separates two lines.
    const lineCount = body.length === 0 ? 0 : body.split(/\r?\n/).length;
    const charCount = body.length;
    const inRange = (value, lower, upper) => (lower === undefined || value >= lower)
        && (upper === undefined || value <= upper);
    const outcomes = [];
    if (!(minLines === undefined && maxLines === undefined)) {
        const linesOk = inRange(lineCount, minLines, maxLines);
        const message = linesOk
            ? `Body has ${lineCount} line(s), within bounds.`
            : buildOutOfRangeMessage("line", lineCount, minLines, maxLines);
        outcomes.push(result("structural:length:lines", linesOk, message, { lineCount, minLines, maxLines }));
    }
    if (!(minChars === undefined && maxChars === undefined)) {
        const charsOk = inRange(charCount, minChars, maxChars);
        const message = charsOk
            ? `Body has ${charCount} char(s), within bounds.`
            : buildOutOfRangeMessage("char", charCount, minChars, maxChars);
        outcomes.push(result("structural:length:chars", charsOk, message, { charCount, minChars, maxChars }));
    }
    return outcomes;
}
|
|
135
|
+
/**
 * Format the failure message for a length check that is out of bounds.
 *
 * @param {string} unit Singular unit name, e.g. "line" or "char".
 * @param {number} actual Measured value.
 * @param {number|undefined} min Lower bound; shown as "0" when unset.
 * @param {number|undefined} max Upper bound; shown as "∞" when unset.
 * @returns {string} Message of the form `Body has N unit(s); expected lo..hi.`
 */
function buildOutOfRangeMessage(unit, actual, min, max) {
    let lowerLabel = "0";
    if (min !== undefined) {
        lowerLabel = String(min);
    }
    let upperLabel = "∞";
    if (max !== undefined) {
        upperLabel = String(max);
    }
    return `Body has ${actual} ${unit}(s); expected ${lowerLabel}..${upperLabel}.`;
}
|
|
140
|
+
/**
 * Produce one structural result per required frontmatter key.
 *
 * When the artifact has no frontmatter, or its frontmatter did not parse to an
 * object, every key fails and `details.frontmatterPresent` records whether the
 * fences were at least found. Otherwise a key passes when it is one of the
 * top-level keys of the parsed frontmatter (exact match).
 *
 * @param {string[]} keys Required top-level frontmatter keys.
 * @param {{hasFrontmatter: boolean, frontmatterParsed?: object}} split Output of `splitFrontmatter`.
 * @returns {object[]} One result per entry of `keys`, in the same order.
 */
function checkFrontmatterKeys(keys, split) {
    const parsed = split.hasFrontmatter ? split.frontmatterParsed : undefined;
    const outcomes = [];
    if (!parsed) {
        for (const key of keys) {
            outcomes.push(result(`structural:frontmatter:${slugify(key)}`, false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: split.hasFrontmatter }));
        }
        return outcomes;
    }
    const declaredKeys = new Set(Object.keys(parsed));
    for (const key of keys) {
        const isDeclared = declaredKeys.has(key);
        const message = isDeclared
            ? `Frontmatter key "${key}" present.`
            : `Frontmatter key "${key}" missing.`;
        outcomes.push(result(`structural:frontmatter:${slugify(key)}`, isDeclared, message, { key }));
    }
    return outcomes;
}
|
|
150
|
+
/**
 * Evaluate all configured structural expectations against an artifact.
 *
 * Frontmatter is split off first; section, forbidden-pattern and length
 * checks then run against the body only, while frontmatter-key checks run
 * against the split result. An undefined or empty `expected` yields `[]`, so
 * the caller can tell "nothing to verify" apart from "everything passed".
 *
 * @param {string} artifact Full artifact text, including any frontmatter.
 * @param {object|undefined} expected Structural expectations for the case.
 * @returns {object[]} Results ordered: sections, forbidden patterns, length bounds, frontmatter keys.
 */
export function verifyStructural(artifact, expected) {
    if (!expected) {
        return [];
    }
    const split = splitFrontmatter(artifact);
    const { requiredSections, forbiddenPatterns, requiredFrontmatterKeys } = expected;
    return [
        ...(requiredSections?.length ? checkRequiredSections(requiredSections, split.body) : []),
        ...(forbiddenPatterns?.length ? checkForbiddenPatterns(forbiddenPatterns, split.body) : []),
        ...checkLengthBounds(expected, split.body),
        ...(requiredFrontmatterKeys?.length ? checkFrontmatterKeys(requiredFrontmatterKeys, split) : [])
    ];
}
|