cclaw-cli 0.21.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { parse } from "yaml";
4
+ import { EVALS_CONFIG_PATH } from "../constants.js";
5
+ import { exists } from "../fs-utils.js";
6
+ import { EVAL_TIERS } from "./types.js";
7
/**
 * Baseline eval settings used when neither the config file nor the
 * environment supplies a value. Tuned for the z.ai OpenAI-compatible coding
 * endpoint with GLM 5.1, per the roadmap locked decisions (D-EVAL-01..05).
 *
 * Precedence, lowest to highest: these defaults, then
 * `.cclaw/evals/config.yaml`, then `CCLAW_EVAL_*` environment variables.
 */
export const DEFAULT_EVAL_CONFIG = {
    provider: "zai",
    baseUrl: "https://api.z.ai/api/coding/paas/v4",
    model: "glm-5.1",
    defaultTier: "A",
    regression: { failIfDeltaBelow: -0.15, failIfCriticalBelow: 3 },
    timeoutMs: 120000,
    maxRetries: 2,
};
25
// Set view of EVAL_TIERS for constant-time membership checks on tier names.
const EVAL_TIER_SET = new Set(EVAL_TIERS);
// Names of the CCLAW_EVAL_* environment variables that carry numeric values.
const NUMERIC_ENVS = new Set([
    "CCLAW_EVAL_DAILY_USD_CAP",
    "CCLAW_EVAL_TIMEOUT_MS",
    "CCLAW_EVAL_MAX_RETRIES"
]);
31
/**
 * Build (but do not throw) the Error used for every config-file problem.
 * The message names the offending file and reason, lists the supported
 * tiers, and points at the schema docs and the dry-run command.
 *
 * @param configFilePath Path of the config file being validated.
 * @param reason Short description of what is wrong.
 * @returns An Error for the caller to throw.
 */
function evalConfigError(configFilePath, reason) {
    const message = [
        `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
        `Supported tiers: ${EVAL_TIERS.join(", ")}`,
        "See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run",
    ].join("\n");
    return new Error(message);
}
36
/**
 * Report whether `value` is a non-null, non-array object, i.e. something
 * that can be treated as a key/value mapping.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
39
/**
 * Parse the text of a numeric environment variable.
 *
 * @param name Variable name, used only in the error message.
 * @param raw Raw value of the variable.
 * @returns The parsed finite number.
 * @throws Error when the value is blank or does not parse to a finite number.
 */
function parseNumericEnv(name, raw) {
    const text = String(raw).trim();
    // `Number("")` is 0, so a blank value must be rejected explicitly instead
    // of being silently read as zero.
    const value = text.length > 0 ? Number(text) : Number.NaN;
    if (!Number.isFinite(value)) {
        throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
    }
    return value;
}
46
/**
 * Normalise the CCLAW_EVAL_TIER value (trim, then upper-case) and check it
 * against the supported tiers.
 *
 * @param raw Raw value of the variable.
 * @returns The normalised tier name.
 * @throws Error when the normalised value is not a supported tier.
 */
function parseTierEnv(raw) {
    const candidate = raw.trim().toUpperCase();
    if (EVAL_TIER_SET.has(candidate)) {
        return candidate;
    }
    throw new Error(`Environment variable CCLAW_EVAL_TIER must be one of ${EVAL_TIERS.join("/")}, got: ${raw}`);
}
53
/**
 * Validate the parsed contents of the eval config file and return the
 * recognised settings as a partial config.
 *
 * A missing document (`undefined` or `null`) yields an empty patch. String
 * fields are trimmed. When a `regression` mapping is present, any threshold
 * it omits is filled from DEFAULT_EVAL_CONFIG.
 *
 * @param raw Value produced by parsing the config file.
 * @param configFilePath Path of the file, used in error messages.
 * @returns An object holding only the keys that were present and valid.
 * @throws Error (built by evalConfigError) on a non-mapping document, a
 *   wrongly typed or out-of-range value, or an unknown top-level key.
 */
function validateFileConfig(raw, configFilePath) {
    if (raw === undefined || raw === null)
        return {};
    if (!isRecord(raw)) {
        throw evalConfigError(configFilePath, "top-level value must be a mapping");
    }
    // YAML can spell NaN (`.nan`), and `typeof NaN === "number"`, so a bare
    // typeof check would accept it; range comparisons against NaN are always
    // false, so the bad value would also slip past the bounds checks.
    const isRealNumber = (value) => typeof value === "number" && !Number.isNaN(value);
    const out = {};
    for (const key of ["provider", "baseUrl", "model", "judgeModel"]) {
        const value = raw[key];
        if (value === undefined)
            continue;
        if (typeof value !== "string" || value.trim().length === 0) {
            throw evalConfigError(configFilePath, `"${key}" must be a non-empty string`);
        }
        out[key] = value.trim();
    }
    if (raw.defaultTier !== undefined) {
        if (typeof raw.defaultTier !== "string" || !EVAL_TIER_SET.has(raw.defaultTier)) {
            throw evalConfigError(configFilePath, `"defaultTier" must be one of: ${EVAL_TIERS.join(", ")}`);
        }
        out.defaultTier = raw.defaultTier;
    }
    if (raw.dailyUsdCap !== undefined) {
        if (!isRealNumber(raw.dailyUsdCap) || raw.dailyUsdCap < 0) {
            throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
        }
        out.dailyUsdCap = raw.dailyUsdCap;
    }
    if (raw.timeoutMs !== undefined) {
        // A timeout has to be a concrete duration, so `.inf` is rejected too.
        if (!Number.isFinite(raw.timeoutMs) || raw.timeoutMs <= 0) {
            throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
        }
        out.timeoutMs = raw.timeoutMs;
    }
    if (raw.maxRetries !== undefined) {
        if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
            throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
        }
        out.maxRetries = raw.maxRetries;
    }
    if (raw.regression !== undefined) {
        if (!isRecord(raw.regression)) {
            throw evalConfigError(configFilePath, `"regression" must be a mapping`);
        }
        const { failIfDeltaBelow, failIfCriticalBelow } = raw.regression;
        if (failIfDeltaBelow !== undefined && !isRealNumber(failIfDeltaBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
        }
        if (failIfCriticalBelow !== undefined && !isRealNumber(failIfCriticalBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
        }
        out.regression = {
            failIfDeltaBelow: failIfDeltaBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
            failIfCriticalBelow: failIfCriticalBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
        };
    }
    // Unknown keys are rejected rather than ignored, so a typo in the file
    // does not silently fall back to a default.
    const knownKeys = new Set([
        "provider",
        "baseUrl",
        "model",
        "judgeModel",
        "defaultTier",
        "dailyUsdCap",
        "timeoutMs",
        "maxRetries",
        "regression"
    ]);
    const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
    if (unknown.length > 0) {
        throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
    }
    return out;
}
134
/**
 * Read and validate the eval config file under `projectRoot`.
 *
 * @param projectRoot Project directory that EVALS_CONFIG_PATH is joined onto.
 * @returns `{ patch, source }`: an empty patch with source "default" when the
 *   file does not exist, otherwise the validated patch with source "file".
 * @throws Error (built by evalConfigError) when the file cannot be read or
 *   parsed, or when validation fails.
 */
async function readFileConfig(projectRoot) {
    const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
    const present = await exists(configFilePath);
    if (!present) {
        return { patch: {}, source: "default" };
    }
    let document;
    try {
        const text = await fs.readFile(configFilePath, "utf8");
        document = parse(text);
    }
    catch (err) {
        // Read and parse failures are both reported against the config file.
        const reason = err instanceof Error ? err.message : String(err);
        throw evalConfigError(configFilePath, reason);
    }
    return { patch: validateFileConfig(document, configFilePath), source: "file" };
}
149
/**
 * Layer `CCLAW_EVAL_*` environment variables on top of an already merged
 * config. Blank or whitespace-only variables are treated as unset.
 *
 * Numeric overrides follow the same range rules as the config file:
 * the daily cap must be non-negative, the timeout positive, and the retry
 * count a non-negative integer.
 *
 * @param base Config produced by merging defaults and the file patch; it is
 *   copied, not mutated.
 * @param env Environment map to read from.
 * @returns `{ patched, overridden, apiKey }`: the resulting config, whether
 *   any config field was overridden, and the trimmed CCLAW_EVAL_API_KEY (or
 *   undefined). The API key does not count as an override.
 * @throws Error when a tier or numeric variable is set to an invalid value.
 */
function applyEnvOverrides(base, env) {
    const patched = {
        ...base,
        regression: { ...base.regression }
    };
    let overridden = false;
    const read = (name) => {
        const value = env[name];
        return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
    };
    const override = (key, value) => {
        patched[key] = value;
        overridden = true;
    };
    // Parse a numeric variable and enforce its range; returns undefined when
    // the variable is unset.
    const readNumber = (name, isValid, requirement) => {
        const text = read(name);
        if (text === undefined)
            return undefined;
        const value = parseNumericEnv(name, text);
        if (!isValid(value)) {
            throw new Error(`Environment variable ${name} must be ${requirement}, got: ${text}`);
        }
        return value;
    };
    const stringOverrides = [
        ["CCLAW_EVAL_BASE_URL", "baseUrl"],
        ["CCLAW_EVAL_MODEL", "model"],
        ["CCLAW_EVAL_JUDGE_MODEL", "judgeModel"],
        ["CCLAW_EVAL_PROVIDER", "provider"]
    ];
    for (const [name, key] of stringOverrides) {
        const value = read(name);
        if (value !== undefined)
            override(key, value);
    }
    const tier = read("CCLAW_EVAL_TIER");
    if (tier !== undefined)
        override("defaultTier", parseTierEnv(tier));
    const cap = readNumber("CCLAW_EVAL_DAILY_USD_CAP", (value) => value >= 0, "a non-negative number");
    if (cap !== undefined)
        override("dailyUsdCap", cap);
    const timeout = readNumber("CCLAW_EVAL_TIMEOUT_MS", (value) => value > 0, "a positive number");
    if (timeout !== undefined)
        override("timeoutMs", timeout);
    const retries = readNumber("CCLAW_EVAL_MAX_RETRIES", (value) => Number.isInteger(value) && value >= 0, "a non-negative integer");
    if (retries !== undefined)
        override("maxRetries", retries);
    const apiKey = read("CCLAW_EVAL_API_KEY");
    return { patched, overridden, apiKey };
}
209
/**
 * Resolve the eval config by layering, in order: built-in defaults, the
 * config file under `projectRoot`, then `CCLAW_EVAL_*` environment variables.
 *
 * @param projectRoot Project directory containing the optional config file.
 * @param env Environment map to read overrides from; defaults to process.env.
 * @returns The fully populated config plus `apiKey` and a `source` marker
 *   ("default", "file", "env" or "file+env") so `--dry-run` can surface
 *   where the settings came from.
 */
export async function loadEvalConfig(projectRoot, env = process.env) {
    const fileResult = await readFileConfig(projectRoot);
    const filePatch = fileResult.patch;
    // The nested thresholds are merged separately so a partial `regression`
    // block in the file does not discard the remaining default.
    const regression = Object.assign({}, DEFAULT_EVAL_CONFIG.regression, filePatch.regression ?? {});
    const layered = Object.assign({}, DEFAULT_EVAL_CONFIG, filePatch, { regression });
    const envResult = applyEnvOverrides(layered, env);
    const fromFile = fileResult.source === "file";
    let source;
    if (fromFile) {
        source = envResult.overridden ? "file+env" : "file";
    }
    else {
        source = envResult.overridden ? "env" : "default";
    }
    return Object.assign({}, envResult.patched, { apiKey: envResult.apiKey, source });
}
@@ -0,0 +1,19 @@
1
+ import type { FlowStage } from "../types.js";
2
+ import type { EvalCase } from "./types.js";
3
/**
 * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
 * single stage. Returns an empty array for a fresh install.
 *
 * @param projectRoot Project directory that contains the evals root.
 * @param stage When given, only that stage's directory is read.
 * @returns The validated cases, sorted by stage and then by id.
 */
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
/**
 * Resolve a case's `fixture` path to an absolute filesystem path. The fixture
 * field is interpreted relative to the case's stage directory (i.e., a
 * sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
 *
 * @returns The resolved path, or `undefined` when the case has no fixture.
 */
export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
/**
 * Read the fixture artifact text for a case. Returns `undefined` if the case
 * has no fixture reference. Throws a descriptive error if the path exists in
 * the case but not on disk — Wave 7.1 fixtures ship alongside cases.
 *
 * @returns The fixture file's contents decoded as UTF-8.
 */
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
@@ -0,0 +1,175 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { parse } from "yaml";
4
+ import { EVALS_ROOT } from "../constants.js";
5
+ import { exists } from "../fs-utils.js";
6
+ import { FLOW_STAGES } from "../types.js";
7
// Set view of FLOW_STAGES for constant-time membership checks on stage names.
const FLOW_STAGE_SET = new Set(FLOW_STAGES);
8
/**
 * Build (but do not throw) the Error used for every invalid eval case. The
 * message names the offending file and reason and lists the supported stages.
 *
 * @param filePath Path of the case file being validated.
 * @param reason Short description of what is wrong.
 * @returns An Error for the caller to throw.
 */
function corpusError(filePath, reason) {
    const message = [
        `Invalid eval case at ${filePath}: ${reason}`,
        `Supported stages: ${FLOW_STAGES.join(", ")}`,
    ].join("\n");
    return new Error(message);
}
12
/**
 * Report whether `value` is a non-null, non-array object, i.e. something
 * that can be treated as a key/value mapping.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
15
/**
 * Accept an optional array-of-strings field.
 *
 * @param filePath Case file path, used in the error message.
 * @param context Field name to quote in the error message.
 * @param value Raw field value.
 * @returns `undefined` when the field is absent, otherwise the same array.
 * @throws Error (built by corpusError) when the value is not an array whose
 *   items are all strings.
 */
function readStringArray(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    const allStrings = Array.isArray(value) && value.every((item) => typeof item === "string");
    if (!allStrings) {
        throw corpusError(filePath, `"${context}" must be an array of strings`);
    }
    return value;
}
23
/**
 * Accept an optional non-negative integer field.
 *
 * @param filePath Case file path, used in the error message.
 * @param context Field name to quote in the error message.
 * @param value Raw field value.
 * @returns `undefined` when the field is absent, otherwise the number.
 * @throws Error (built by corpusError) for anything that is not an integer
 *   greater than or equal to zero.
 */
function readNonNegativeInteger(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    // Number.isInteger is false for non-numbers, NaN and the infinities, so
    // it covers the type and finiteness checks in one call.
    const acceptable = Number.isInteger(value) && value >= 0;
    if (!acceptable) {
        throw corpusError(filePath, `"${context}" must be a non-negative integer`);
    }
    return value;
}
31
/**
 * Validate the `expected.structural` mapping of a case and convert it to
 * camelCase. Each field may be written in snake_case or camelCase; the
 * snake_case spelling wins when it is neither null nor undefined.
 *
 * @param filePath Case file path, used in error messages.
 * @param raw Raw `expected.structural` value.
 * @returns `undefined` when the section is absent, otherwise an object
 *   holding only the fields that were supplied.
 * @throws Error (built by corpusError) when the section is not a mapping or
 *   a field has the wrong type.
 */
function parseStructural(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.structural" must be a mapping`);
    }
    // [snake_case key, camelCase key] pairs, in validation order.
    const listFields = [
        ["required_sections", "requiredSections"],
        ["forbidden_patterns", "forbiddenPatterns"],
        ["required_frontmatter_keys", "requiredFrontmatterKeys"],
    ];
    const countFields = [
        ["min_lines", "minLines"],
        ["max_lines", "maxLines"],
        ["min_chars", "minChars"],
        ["max_chars", "maxChars"],
    ];
    const structural = {};
    for (const [snake, camel] of listFields) {
        const list = readStringArray(filePath, `expected.structural.${snake}`, raw[snake] ?? raw[camel]);
        if (list) {
            structural[camel] = list;
        }
    }
    for (const [snake, camel] of countFields) {
        const count = readNonNegativeInteger(filePath, `expected.structural.${snake}`, raw[snake] ?? raw[camel]);
        if (count !== undefined) {
            structural[camel] = count;
        }
    }
    return structural;
}
61
/**
 * Validate the `expected` mapping of a case.
 *
 * `structural` is validated field by field; `rules` and `judge` only have
 * to be mappings and are passed through as they are.
 *
 * @param filePath Case file path, used in error messages.
 * @param raw Raw `expected` value.
 * @returns `undefined` when the section is absent or yields no sub-section,
 *   otherwise an object with the sub-sections that were supplied.
 * @throws Error (built by corpusError) when a section is not a mapping.
 */
function parseExpected(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected" must be a mapping`);
    }
    const shape = {};
    const structural = parseStructural(filePath, raw.structural);
    if (structural) {
        shape.structural = structural;
    }
    for (const key of ["rules", "judge"]) {
        const section = raw[key];
        if (section === undefined) {
            continue;
        }
        if (!isRecord(section)) {
            throw corpusError(filePath, `"expected.${key}" must be a mapping`);
        }
        shape[key] = section;
    }
    const hasAnySection = Object.keys(shape).length > 0;
    return hasAnySection ? shape : undefined;
}
85
/**
 * Validate one parsed case file and convert it to the in-memory case shape.
 *
 * `input_prompt` and `context_files` may also be written in camelCase. The
 * `id` and prompt are trimmed. A missing or null `fixture` means the case
 * has no fixture.
 *
 * @param filePath Case file path, used in error messages.
 * @param raw Value produced by parsing the case file.
 * @returns `{ id, stage, inputPrompt, contextFiles, expected, fixture }`.
 * @throws Error (built by corpusError) when the document is not a mapping or
 *   a field is missing or has the wrong type.
 */
function validateCase(filePath, raw) {
    if (!isRecord(raw)) {
        throw corpusError(filePath, "top-level value must be a mapping");
    }
    const id = raw.id;
    if (typeof id !== "string" || id.trim().length === 0) {
        throw corpusError(filePath, `"id" must be a non-empty string`);
    }
    const stageRaw = raw.stage;
    if (typeof stageRaw !== "string" || !FLOW_STAGE_SET.has(stageRaw)) {
        throw corpusError(filePath, `"stage" must be one of: ${FLOW_STAGES.join(", ")}`);
    }
    const inputPrompt = raw.input_prompt ?? raw.inputPrompt;
    if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
        throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
    }
    const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
    const expected = parseExpected(filePath, raw.expected);
    const fixtureRaw = raw.fixture;
    // A wrongly typed fixture used to be dropped silently, which made the
    // case look like it simply had no fixture. Report it like every other
    // malformed field instead.
    if (fixtureRaw !== undefined && fixtureRaw !== null && typeof fixtureRaw !== "string") {
        throw corpusError(filePath, `"fixture" must be a string`);
    }
    const fixture = typeof fixtureRaw === "string" ? fixtureRaw : undefined;
    return {
        id: id.trim(),
        stage: stageRaw,
        inputPrompt: inputPrompt.trim(),
        contextFiles,
        expected,
        fixture
    };
}
113
/**
 * Collect every eval case stored under the `corpus` directory of the evals
 * root.
 *
 * Without `stage`, each sub-directory whose name is a known flow stage is
 * scanned; with `stage`, only that one directory is. Inside a stage
 * directory only regular files ending in `.yaml` or `.yml` are read.
 *
 * @param projectRoot Project directory containing the evals root.
 * @param stage Optional stage to restrict the scan to.
 * @returns The validated cases sorted by stage, then id; an empty array
 *   when the corpus directory does not exist.
 * @throws Error (built by corpusError) when a case file cannot be read or
 *   parsed, or fails validation.
 */
export async function loadCorpus(projectRoot, stage) {
    const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
    const corpusPresent = await exists(corpusRoot);
    if (!corpusPresent) {
        return [];
    }
    let stageDirs;
    if (stage) {
        stageDirs = [path.join(corpusRoot, stage)];
    }
    else {
        stageDirs = [];
        for (const dirent of await fs.readdir(corpusRoot, { withFileTypes: true })) {
            if (dirent.isDirectory() && FLOW_STAGE_SET.has(dirent.name)) {
                stageDirs.push(path.join(corpusRoot, dirent.name));
            }
        }
    }
    const isYamlName = (name) => name.endsWith(".yaml") || name.endsWith(".yml");
    const cases = [];
    for (const stageDir of stageDirs) {
        // A requested stage may simply have no cases yet.
        if (!(await exists(stageDir))) {
            continue;
        }
        const dirents = await fs.readdir(stageDir, { withFileTypes: true });
        const caseFiles = dirents.filter((dirent) => dirent.isFile() && isYamlName(dirent.name));
        for (const caseFile of caseFiles) {
            const filePath = path.join(stageDir, caseFile.name);
            let document;
            try {
                const text = await fs.readFile(filePath, "utf8");
                document = parse(text);
            }
            catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                throw corpusError(filePath, reason);
            }
            cases.push(validateCase(filePath, document));
        }
    }
    cases.sort((left, right) => {
        const byStage = left.stage.localeCompare(right.stage);
        return byStage !== 0 ? byStage : left.id.localeCompare(right.id);
    });
    return cases;
}
152
/**
 * Turn a case's `fixture` reference into an absolute path by resolving it
 * against the case's stage directory inside the corpus.
 *
 * @param projectRoot Project directory containing the evals root.
 * @param caseEntry Case whose `fixture` and `stage` are read.
 * @returns The absolute path, or `undefined` when the case has no fixture.
 */
export function fixturePathFor(projectRoot, caseEntry) {
    const fixture = caseEntry.fixture;
    return fixture
        ? path.resolve(projectRoot, EVALS_ROOT, "corpus", caseEntry.stage, fixture)
        : undefined;
}
162
/**
 * Load the text of the fixture a case refers to.
 *
 * @param projectRoot Project directory containing the evals root.
 * @param caseEntry Case whose fixture should be read.
 * @returns The fixture contents decoded as UTF-8, or `undefined` when the
 *   case has no fixture reference.
 * @throws Error naming the case and the resolved path when the case refers
 *   to a fixture that is not on disk.
 */
export async function readFixtureArtifact(projectRoot, caseEntry) {
    const fixturePath = fixturePathFor(projectRoot, caseEntry);
    if (fixturePath === undefined) {
        return undefined;
    }
    const onDisk = await exists(fixturePath);
    if (onDisk) {
        return fs.readFile(fixturePath, "utf8");
    }
    throw new Error(`Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${fixturePath}`);
}
@@ -0,0 +1,62 @@
1
+ /**
2
+ * LLM client skeleton for the cclaw eval subsystem.
3
+ *
4
+ * Wave 7.0 declares the shape of the client without pulling in the `openai`
5
+ * runtime dependency. The real implementation is wired in Wave 7.3 when
6
+ * single-shot (Tier A) evals and LLM judging come online. Keeping this stub
7
+ * separate means users of Waves 7.0–7.2 (structural + rule-based verifiers)
8
+ * never install an extra dependency or receive network egress warnings.
9
+ */
10
+ import type { ResolvedEvalConfig } from "./types.js";
11
/**
 * Minimal chat interface the rest of the eval code will depend on. It is
 * intentionally a subset of OpenAI's Chat Completions surface so that the
 * Wave 7.3 implementation is a thin adapter around `OpenAI.chat.completions.create`.
 */
export interface ChatMessage {
    role: "system" | "user" | "assistant" | "tool";
    content: string;
    name?: string;
    // NOTE(review): presumably links a "tool" message to the tool call it answers — confirm.
    toolCallId?: string;
}
/** Parameters of a single chat call. */
export interface ChatRequest {
    model: string;
    messages: ChatMessage[];
    maxTokens?: number;
    temperature?: number;
    timeoutMs?: number;
    /**
     * Tool/function-calling definitions in OpenAI wire format. Populated only by
     * Wave 7.4 (Tier B). Ignored by the Wave 7.3 single-shot path.
     */
    tools?: unknown[];
    toolChoice?: "auto" | "none";
}
/** Token counts reported for one chat call. */
export interface ChatUsage {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
}
/** Result of a single chat call. */
export interface ChatResponse {
    content: string;
    // NOTE(review): `arguments` is typed as a string — presumably JSON text to be parsed by the caller; confirm.
    toolCalls?: Array<{
        id: string;
        name: string;
        arguments: string;
    }>;
    usage: ChatUsage;
    finishReason: "stop" | "length" | "tool_calls" | "content_filter";
}
/** Lightweight client abstraction shared across eval runners. */
export interface EvalLlmClient {
    chat(request: ChatRequest): Promise<ChatResponse>;
}
/**
 * Error raised by the stub client. Its message states that the LLM client is
 * not wired in Wave 7.0 and names the wave (`wave`) in which it arrives.
 */
export declare class EvalLlmNotWiredError extends Error {
    constructor(wave: string);
}
/**
 * Factory stub. The factory itself returns a client without throwing; the
 * client's `chat()` rejects with `EvalLlmNotWiredError`, so accidental
 * Wave 7.0 usage is easy to diagnose. The Wave 7.3 implementation will
 * replace this body with `new OpenAI({ apiKey, baseURL }) ... adapter`.
 */
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
@@ -0,0 +1,19 @@
1
/**
 * Raised when eval code tries to talk to an LLM before the client exists.
 * The message names the wave in which the client arrives and points at the
 * offline eval commands.
 */
export class EvalLlmNotWiredError extends Error {
    /**
     * @param wave Wave identifier inserted into the message.
     */
    constructor(wave) {
        const headline = `LLM client is not wired in Wave 7.0. It arrives in Wave ${wave}.`;
        const hint = "Run `cclaw eval --dry-run` or `cclaw eval --schema-only` for offline evals.";
        super(`${headline}\n${hint}`);
        this.name = "EvalLlmNotWiredError";
    }
}
8
/**
 * Placeholder factory for the eval LLM client. Creating the client always
 * succeeds; every `chat()` call rejects with EvalLlmNotWiredError("7.3") so
 * that accidental use before the real client exists is easy to diagnose.
 * The Wave 7.3 implementation will replace this body with
 * `new OpenAI({ apiKey, baseURL }) ... adapter`.
 *
 * @param _config Resolved eval config; unused by the stub.
 * @returns A client whose `chat()` always rejects.
 */
export function createEvalClient(_config) {
    const chat = async () => {
        throw new EvalLlmNotWiredError("7.3");
    };
    return { chat };
}
@@ -0,0 +1,11 @@
1
+ import type { EvalReport } from "./types.js";
2
/** Directory, under the project's evals root, where reports are written. */
export declare function reportsDir(projectRoot: string): string;
/**
 * Default file name (without extension) for a report: `eval-`, the
 * `generatedAt` stamp with `:` and `.` replaced by `-`, then the first eight
 * characters of the run id.
 */
export declare function defaultReportBasename(report: EvalReport): string;
/**
 * Format a report as a human-readable Markdown document. Keeping the layout
 * stable matters: CI posts diffs against earlier reports, and unit tests use
 * the output as a regression guard.
 */
export declare function formatMarkdownReport(report: EvalReport): string;
/**
 * Write the report as pretty-printed JSON to `<reportsDir>/<basename>.json`.
 *
 * @returns The path that was written.
 */
export declare function writeJsonReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
/**
 * Write the Markdown rendering of the report to `<reportsDir>/<basename>.md`.
 *
 * @returns The path that was written.
 */
export declare function writeMarkdownReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
@@ -0,0 +1,101 @@
1
+ import path from "node:path";
2
+ import { EVALS_ROOT } from "../constants.js";
3
+ import { writeFileSafe } from "../fs-utils.js";
4
/**
 * Locate the `reports` directory under the project's evals root.
 *
 * @param projectRoot Project directory containing the evals root.
 * @returns The joined path; nothing is created on disk.
 */
export function reportsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "reports"];
    return path.join(...segments);
}
7
/**
 * Derive the default report file name (without extension) from a report.
 *
 * @param report Report whose `generatedAt` and `runId` are read.
 * @returns `eval-<stamp>-<id>`, where the stamp is `generatedAt` with every
 *   `:` and `.` replaced by `-` and the id is the first eight characters of
 *   `runId`.
 */
export function defaultReportBasename(report) {
    const stamp = report.generatedAt.split(/[:.]/).join("-");
    const shortRunId = report.runId.substring(0, 8);
    return ["eval", stamp, shortRunId].join("-");
}
11
/**
 * Format a report as a human-readable Markdown document. Keeping the layout
 * stable matters: CI posts diffs against earlier reports, and unit tests use
 * the output as a regression guard.
 *
 * Sections, in order: header bullets, summary table, optional baseline delta
 * (with a regressions table when there are any), the cases table, and
 * per-case verifier details. When no cases ran, the cases section holds a
 * short notice and the verifier details are omitted.
 *
 * Free-text values placed in table cells are escaped so that a `|` or a line
 * break inside them cannot split or terminate the row.
 *
 * @param report Report to render.
 * @returns The Markdown text, ending with a newline.
 */
export function formatMarkdownReport(report) {
    // `|` separates cells and a line break ends the row, so both have to be
    // neutralised inside a cell. Values without either are left unchanged.
    const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
    const { summary } = report;
    const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
    const lines = [];
    lines.push(`# cclaw eval report`);
    lines.push(``);
    lines.push(`- generated: ${report.generatedAt}`);
    lines.push(`- runId: ${report.runId}`);
    lines.push(`- cclaw version: ${report.cclawVersion}`);
    lines.push(`- provider: ${report.provider}`);
    lines.push(`- model: ${report.model}`);
    lines.push(`- tier: ${report.tier}`);
    lines.push(`- stages: ${stages}`);
    lines.push(``);
    lines.push(`## Summary`);
    lines.push(``);
    lines.push(`| metric | value |`);
    lines.push(`| --- | --- |`);
    lines.push(`| total cases | ${summary.totalCases} |`);
    lines.push(`| passed | ${summary.passed} |`);
    lines.push(`| failed | ${summary.failed} |`);
    lines.push(`| skipped | ${summary.skipped} |`);
    lines.push(`| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`);
    lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
    lines.push(``);
    if (report.baselineDelta) {
        const delta = report.baselineDelta;
        lines.push(`## Baseline delta`);
        lines.push(``);
        lines.push(`- baseline: ${delta.baselineId}`);
        lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
        lines.push(`- critical failures: ${delta.criticalFailures}`);
        lines.push(``);
        if (delta.regressions.length > 0) {
            lines.push(`### Regressions`);
            lines.push(``);
            lines.push(`| stage | case id | verifier | reason | prev | curr |`);
            lines.push(`| --- | --- | --- | --- | --- | --- |`);
            for (const reg of delta.regressions) {
                const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
                const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
                lines.push(`| ${cell(reg.stage)} | ${cell(reg.caseId)} | ${cell(reg.verifierId)} | ${cell(reg.reason)} | ${prev} | ${curr} |`);
            }
            lines.push(``);
        }
    }
    if (report.cases.length === 0) {
        lines.push(`## Cases`);
        lines.push(``);
        lines.push(`No cases were executed. See \`docs/evals.md\` for the Wave rollout plan.`);
        lines.push(``);
        return `${lines.join("\n")}\n`;
    }
    lines.push(`## Cases`);
    lines.push(``);
    lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`);
    lines.push(`| --- | --- | --- | --- | --- |`);
    for (const item of report.cases) {
        const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
        lines.push(`| ${cell(item.stage)} | ${cell(item.caseId)} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
    }
    lines.push(``);
    lines.push(`## Verifier details`);
    lines.push(``);
    for (const item of report.cases) {
        lines.push(`### ${item.stage} / ${item.caseId}`);
        lines.push(``);
        for (const verifier of item.verifierResults) {
            const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
            lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}` +
                (verifier.message ? ` — ${verifier.message}` : ""));
        }
        lines.push(``);
    }
    return `${lines.join("\n")}\n`;
}
92
/**
 * Persist a report as pretty-printed JSON (two-space indent, trailing
 * newline) in the reports directory.
 *
 * @param projectRoot Project directory containing the evals root.
 * @param report Report to serialise.
 * @param basename File name without extension; defaults to
 *   defaultReportBasename(report).
 * @returns The path of the written `.json` file.
 */
export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.json`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const serialized = `${JSON.stringify(report, null, 2)}\n`;
    await writeFileSafe(outPath, serialized);
    return outPath;
}
97
/**
 * Persist the Markdown rendering of a report in the reports directory.
 *
 * @param projectRoot Project directory containing the evals root.
 * @param report Report to render with formatMarkdownReport.
 * @param basename File name without extension; defaults to
 *   defaultReportBasename(report).
 * @returns The path of the written `.md` file.
 */
export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.md`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const markdown = formatMarkdownReport(report);
    await writeFileSafe(outPath, markdown);
    return outPath;
}
@@ -0,0 +1,45 @@
1
+ import type { FlowStage } from "../types.js";
2
+ import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
3
/** Options accepted by `runEval`. */
export interface RunEvalOptions {
    projectRoot: string;
    stage?: FlowStage;
    tier?: EvalTier;
    /** When true, run only structural verifiers (Wave 7.1). */
    schemaOnly?: boolean;
    /** When true, run structural + rule-based verifiers. Wave 7.2 wires rules. */
    rules?: boolean;
    /** When true, also run LLM judge verifiers. Wave 7.3 wires judging. */
    judge?: boolean;
    /** When true, load config + corpus and return a summary without running any verifier. */
    dryRun?: boolean;
    /** Override process.env during tests. */
    env?: NodeJS.ProcessEnv;
}
/**
 * One of the two result shapes of `runEval`, tagged with `kind: "dry-run"`.
 * NOTE(review): presumably returned exactly when `dryRun` is set — confirm
 * against the runner implementation.
 */
export interface DryRunSummary {
    kind: "dry-run";
    config: ResolvedEvalConfig;
    corpus: {
        total: number;
        byStage: Record<string, number>;
        cases: Array<{
            id: string;
            stage: FlowStage;
        }>;
    };
    plannedTier: EvalTier;
    verifiersAvailable: {
        structural: boolean;
        rules: boolean;
        judge: boolean;
        workflow: boolean;
    };
    notes: string[];
}
/**
 * Wave 7.1 runner. When `schemaOnly` is set (or no other verifier flags are
 * active), runs structural verifiers against fixture-backed cases and loads
 * per-stage baselines for regression comparison. Tier A/B/C agent loops
 * still arrive in Waves 7.3+; until then cases without `fixture` are marked
 * as skipped rather than failing.
 */
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;