cclaw-cli 0.21.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +11 -1
- package/dist/cli.js +154 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +19 -0
- package/dist/eval/corpus.js +175 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +101 -0
- package/dist/eval/runner.d.ts +45 -0
- package/dist/eval/runner.js +178 -0
- package/dist/eval/types.d.ts +216 -0
- package/dist/eval/types.js +15 -0
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/dist/install.js +22 -0
- package/package.json +1 -1
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { EVAL_TIERS } from "./types.js";
|
|
7
|
+
/**
 * Baseline eval settings used when nothing else supplies a value. Tuned for
 * the z.ai OpenAI-compatible coding endpoint with GLM 5.1, per the roadmap
 * locked decisions (D-EVAL-01..05). Every field may be overridden first by
 * `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` environment variables
 * (the environment wins last).
 */
export const DEFAULT_EVAL_CONFIG = {
    provider: "zai",
    baseUrl: "https://api.z.ai/api/coding/paas/v4",
    model: "glm-5.1",
    defaultTier: "A",
    regression: { failIfDeltaBelow: -0.15, failIfCriticalBelow: 3 },
    // Two minutes, expressed in milliseconds.
    timeoutMs: 2 * 60 * 1000,
    maxRetries: 2
};
|
|
25
|
+
/** Set form of the supported tiers, for constant-time membership checks. */
const EVAL_TIER_SET = new Set(EVAL_TIERS);
/** Names of the `CCLAW_EVAL_*` variables whose values are parsed as numbers. */
const NUMERIC_ENVS = new Set(["CCLAW_EVAL_DAILY_USD_CAP", "CCLAW_EVAL_TIMEOUT_MS", "CCLAW_EVAL_MAX_RETRIES"]);
|
|
31
|
+
/**
 * Build (without throwing) the Error reported for an invalid eval config file.
 * The message names the file, the reason, the supported tiers and the command
 * to run after fixing the file.
 */
function evalConfigError(configFilePath, reason) {
    const messageLines = [
        `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
        `Supported tiers: ${EVAL_TIERS.join(", ")}`,
        `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`
    ];
    return new Error(messageLines.join("\n"));
}
|
|
36
|
+
/** True for non-null objects that are not arrays (e.g. a parsed YAML mapping). */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
39
|
+
/**
 * Parse the raw value of a numeric environment variable.
 *
 * @param name - Variable name; used only in the error message.
 * @param raw - Raw value (normally a string read from the environment).
 * @returns The parsed finite number.
 * @throws Error when `raw` is blank, not a string/number, or not a finite number.
 */
function parseNumericEnv(name, raw) {
    // `Number("")`, `Number("   ")` and `Number(null)` all coerce to 0, which
    // would silently turn a blank variable into a real setting. Only non-blank
    // strings (and actual numbers) are converted; everything else is rejected.
    let value = Number.NaN;
    if (typeof raw === "number") {
        value = raw;
    }
    else if (typeof raw === "string" && raw.trim().length > 0) {
        value = Number(raw.trim());
    }
    if (!Number.isFinite(value)) {
        throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
    }
    return value;
}
|
|
46
|
+
/**
 * Normalise a `CCLAW_EVAL_TIER` value (trimmed, upper-cased) and check it
 * against the supported tiers.
 *
 * @returns The normalised tier.
 * @throws Error when the normalised value is not a supported tier.
 */
function parseTierEnv(raw) {
    const candidate = raw.trim().toUpperCase();
    if (EVAL_TIER_SET.has(candidate)) {
        return candidate;
    }
    throw new Error(`Environment variable CCLAW_EVAL_TIER must be one of ${EVAL_TIERS.join("/")}, got: ${raw}`);
}
|
|
53
|
+
/**
 * Validate the parsed contents of the eval config file and return only the
 * settings the file actually specifies.
 *
 * @param raw - Value produced by the YAML parser. `undefined`/`null` (an empty
 *   document) yields an empty patch.
 * @param configFilePath - Path used in error messages.
 * @returns A partial config with the validated keys. When `regression` is
 *   present both thresholds are filled in, missing ones from the defaults.
 * @throws Error (built by `evalConfigError`) for a non-mapping document, a
 *   value of the wrong type or range, or an unknown top-level key.
 */
function validateFileConfig(raw, configFilePath) {
    if (raw === undefined || raw === null)
        return {};
    if (!isRecord(raw)) {
        throw evalConfigError(configFilePath, "top-level value must be a mapping");
    }
    // YAML can spell `.nan` and `.inf`; both are `typeof "number"` and NaN fails
    // every range comparison, so a bare typeof check would let them through.
    // Requiring finite values matches what the environment path already does.
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const out = {};
    const assignString = (key, value) => {
        if (value === undefined)
            return;
        if (typeof value !== "string" || value.trim().length === 0) {
            throw evalConfigError(configFilePath, `"${String(key)}" must be a non-empty string`);
        }
        out[key] = value.trim();
    };
    assignString("provider", raw.provider);
    assignString("baseUrl", raw.baseUrl);
    assignString("model", raw.model);
    assignString("judgeModel", raw.judgeModel);
    if (raw.defaultTier !== undefined) {
        if (typeof raw.defaultTier !== "string" || !EVAL_TIER_SET.has(raw.defaultTier)) {
            throw evalConfigError(configFilePath, `"defaultTier" must be one of: ${EVAL_TIERS.join(", ")}`);
        }
        out.defaultTier = raw.defaultTier;
    }
    if (raw.dailyUsdCap !== undefined) {
        if (!isFiniteNumber(raw.dailyUsdCap) || raw.dailyUsdCap < 0) {
            throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
        }
        out.dailyUsdCap = raw.dailyUsdCap;
    }
    if (raw.timeoutMs !== undefined) {
        if (!isFiniteNumber(raw.timeoutMs) || raw.timeoutMs <= 0) {
            throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
        }
        out.timeoutMs = raw.timeoutMs;
    }
    if (raw.maxRetries !== undefined) {
        // Number.isInteger already rejects NaN, ±Infinity and non-numbers.
        if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
            throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
        }
        out.maxRetries = raw.maxRetries;
    }
    if (raw.regression !== undefined) {
        if (!isRecord(raw.regression)) {
            throw evalConfigError(configFilePath, `"regression" must be a mapping`);
        }
        const failIfDeltaBelow = raw.regression.failIfDeltaBelow;
        const failIfCriticalBelow = raw.regression.failIfCriticalBelow;
        if (failIfDeltaBelow !== undefined && !isFiniteNumber(failIfDeltaBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
        }
        if (failIfCriticalBelow !== undefined && !isFiniteNumber(failIfCriticalBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
        }
        // After the checks above each threshold is either a number or undefined.
        out.regression = {
            failIfDeltaBelow: failIfDeltaBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
            failIfCriticalBelow: failIfCriticalBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
        };
    }
    // Unknown keys are rejected so a typo cannot be silently ignored.
    const knownKeys = new Set([
        "provider",
        "baseUrl",
        "model",
        "judgeModel",
        "defaultTier",
        "dailyUsdCap",
        "timeoutMs",
        "maxRetries",
        "regression"
    ]);
    const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
    if (unknown.length > 0) {
        throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
    }
    return out;
}
|
|
134
|
+
/**
 * Read and validate the eval config file under `projectRoot`.
 *
 * Resolves to an empty patch tagged `"default"` when the file does not exist,
 * otherwise to the validated patch tagged `"file"`. Read and parse failures
 * are rethrown as eval-config errors that carry the file path.
 */
async function readFileConfig(projectRoot) {
    const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
    const present = await exists(configFilePath);
    if (!present) {
        return { patch: {}, source: "default" };
    }
    let document;
    try {
        const text = await fs.readFile(configFilePath, "utf8");
        document = parse(text);
    }
    catch (cause) {
        const reason = cause instanceof Error ? cause.message : String(cause);
        throw evalConfigError(configFilePath, reason);
    }
    return { patch: validateFileConfig(document, configFilePath), source: "file" };
}
|
|
149
|
+
/**
 * Layer `CCLAW_EVAL_*` environment variables on top of an already-merged
 * config.
 *
 * @param base - Config to override. It is copied (including `regression`),
 *   not mutated.
 * @param env - Environment map to read from. Blank or whitespace-only values
 *   are treated as unset.
 * @returns `patched` (the copy with overrides applied), `overridden` (true
 *   when at least one config field was overridden) and `apiKey` (the trimmed
 *   `CCLAW_EVAL_API_KEY`, which does not count as an override).
 * @throws Error when the tier is unsupported or a numeric variable is not
 *   numeric or is out of range.
 */
function applyEnvOverrides(base, env) {
    const patched = {
        ...base,
        regression: { ...base.regression }
    };
    let overridden = false;
    const read = (name) => {
        const value = env[name];
        return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
    };
    const override = (key, value) => {
        patched[key] = value;
        overridden = true;
    };
    // Numeric variables follow the same range rules as the config file, so an
    // out-of-range value cannot slip in through the environment.
    const readNumber = (name, isValid, requirement) => {
        const raw = read(name);
        if (raw === undefined)
            return undefined;
        const value = parseNumericEnv(name, raw);
        if (!isValid(value)) {
            throw new Error(`Environment variable ${name} must be ${requirement}, got: ${raw}`);
        }
        return value;
    };
    const baseUrl = read("CCLAW_EVAL_BASE_URL");
    if (baseUrl !== undefined)
        override("baseUrl", baseUrl);
    const model = read("CCLAW_EVAL_MODEL");
    if (model !== undefined)
        override("model", model);
    const judgeModel = read("CCLAW_EVAL_JUDGE_MODEL");
    if (judgeModel !== undefined)
        override("judgeModel", judgeModel);
    const provider = read("CCLAW_EVAL_PROVIDER");
    if (provider !== undefined)
        override("provider", provider);
    const tier = read("CCLAW_EVAL_TIER");
    if (tier !== undefined)
        override("defaultTier", parseTierEnv(tier));
    const cap = readNumber("CCLAW_EVAL_DAILY_USD_CAP", (value) => value >= 0, "a non-negative number");
    if (cap !== undefined)
        override("dailyUsdCap", cap);
    const timeout = readNumber("CCLAW_EVAL_TIMEOUT_MS", (value) => value > 0, "a positive number");
    if (timeout !== undefined)
        override("timeoutMs", timeout);
    const retries = readNumber("CCLAW_EVAL_MAX_RETRIES", (value) => Number.isInteger(value) && value >= 0, "a non-negative integer");
    if (retries !== undefined)
        override("maxRetries", retries);
    const apiKey = read("CCLAW_EVAL_API_KEY");
    return { patched, overridden, apiKey };
}
|
|
209
|
+
/**
 * Resolve the effective eval config by layering, in order: built-in defaults,
 * the project's config.yaml, then environment variables.
 *
 * The result is fully populated and additionally carries `apiKey` and a
 * `source` provenance marker ("default", "file", "env" or "file+env") that
 * `--dry-run` can surface.
 */
export async function loadEvalConfig(projectRoot, env = process.env) {
    const fileLayer = await readFileConfig(projectRoot);
    const merged = {
        ...DEFAULT_EVAL_CONFIG,
        ...fileLayer.patch,
        // Merge thresholds one level deep so a partial `regression` keeps defaults.
        regression: {
            ...DEFAULT_EVAL_CONFIG.regression,
            ...(fileLayer.patch.regression ?? {})
        }
    };
    const envLayer = applyEnvOverrides(merged, env);
    const fromFile = fileLayer.source === "file";
    const source = fromFile
        ? (envLayer.overridden ? "file+env" : "file")
        : (envLayer.overridden ? "env" : "default");
    return { ...envLayer.patched, apiKey: envLayer.apiKey, source };
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalCase } from "./types.js";
|
|
3
|
+
/**
 * Load every eval case found under `.cclaw/evals/corpus/**`, optionally
 * limited to a single stage. A fresh install yields an empty array.
 */
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
|
|
8
|
+
/**
 * Turn a case's `fixture` reference into an absolute filesystem path. The
 * reference is interpreted relative to the case's stage directory, i.e. a
 * sibling file or subdirectory inside `.cclaw/evals/corpus/<stage>/`.
 * Returns `undefined` when the case has no fixture.
 */
export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
|
|
14
|
+
/**
 * Read the text of a case's fixture artifact. Resolves to `undefined` when the
 * case has no fixture reference, and throws a descriptive error when the case
 * names a fixture that is not on disk — Wave 7.1 fixtures ship alongside cases.
 */
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { FLOW_STAGES } from "../types.js";
|
|
7
|
+
const FLOW_STAGE_SET = new Set(FLOW_STAGES);
|
|
8
|
+
/**
 * Build (without throwing) the Error reported for an invalid eval case file.
 * The message names the file, the reason and the supported stages.
 */
function corpusError(filePath, reason) {
    const messageLines = [
        `Invalid eval case at ${filePath}: ${reason}`,
        `Supported stages: ${FLOW_STAGES.join(", ")}`
    ];
    return new Error(messageLines.join("\n"));
}
|
|
12
|
+
/** True when `value` is a non-null object other than an array. */
function isRecord(value) {
    const isObject = value !== null && typeof value === "object";
    return isObject && !Array.isArray(value);
}
|
|
15
|
+
/**
 * Validate an optional list-of-strings field of an eval case.
 *
 * @param filePath - Case file, for the error message.
 * @param context - Field name, for the error message.
 * @param value - Raw field value.
 * @returns `undefined` when the field is absent, otherwise the array itself.
 * @throws Error (built by `corpusError`) when the value is not an array of strings.
 */
function readStringArray(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    const allStrings = Array.isArray(value) && value.every((item) => typeof item === "string");
    if (allStrings) {
        return value;
    }
    throw corpusError(filePath, `"${context}" must be an array of strings`);
}
|
|
23
|
+
/**
 * Validate an optional non-negative integer field of an eval case.
 *
 * @returns `undefined` when the field is absent, otherwise the number itself.
 * @throws Error (built by `corpusError`) for anything that is not an integer >= 0.
 */
function readNonNegativeInteger(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    // Number.isInteger is false for non-numbers, NaN and ±Infinity, so it covers
    // the type and finiteness checks in one call.
    if (Number.isInteger(value) && value >= 0) {
        return value;
    }
    throw corpusError(filePath, `"${context}" must be a non-negative integer`);
}
|
|
31
|
+
/**
 * Parse the `expected.structural` section of an eval case.
 *
 * Each field may be written in snake_case or camelCase (snake_case wins when
 * both are present). Only fields that are present appear in the result.
 *
 * @returns `undefined` when the section is absent, otherwise the normalised
 *   (camelCase) structural expectations.
 * @throws Error (built by `corpusError`) when the section or a field is malformed.
 */
function parseStructural(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.structural" must be a mapping`);
    }
    // [camelCase key, snake_case key, reader]. The order fixes both which
    // validation error surfaces first and the key order of the result.
    const fields = [
        ["requiredSections", "required_sections", readStringArray],
        ["forbiddenPatterns", "forbidden_patterns", readStringArray],
        ["requiredFrontmatterKeys", "required_frontmatter_keys", readStringArray],
        ["minLines", "min_lines", readNonNegativeInteger],
        ["maxLines", "max_lines", readNonNegativeInteger],
        ["minChars", "min_chars", readNonNegativeInteger],
        ["maxChars", "max_chars", readNonNegativeInteger]
    ];
    const structural = {};
    for (const [camelKey, snakeKey, reader] of fields) {
        const parsedValue = reader(filePath, `expected.structural.${snakeKey}`, raw[snakeKey] ?? raw[camelKey]);
        if (parsedValue !== undefined) {
            structural[camelKey] = parsedValue;
        }
    }
    return structural;
}
|
|
61
|
+
/**
 * Parse the `expected` section of an eval case.
 *
 * `structural` is validated field by field; `rules` and `judge` only have to
 * be mappings and are passed through as-is.
 *
 * @returns `undefined` when the section is absent or contributes nothing,
 *   otherwise an object with the sections that were present.
 * @throws Error (built by `corpusError`) when a section is not a mapping.
 */
function parseExpected(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected" must be a mapping`);
    }
    const shape = {};
    const structural = parseStructural(filePath, raw.structural);
    if (structural) {
        shape.structural = structural;
    }
    for (const sectionName of ["rules", "judge"]) {
        const section = raw[sectionName];
        if (section === undefined) {
            continue;
        }
        if (!isRecord(section)) {
            throw corpusError(filePath, `"expected.${sectionName}" must be a mapping`);
        }
        shape[sectionName] = section;
    }
    return Object.keys(shape).length > 0 ? shape : undefined;
}
|
|
85
|
+
/**
 * Validate one parsed eval-case document and normalise it.
 *
 * `input_prompt` and `context_files` may also be written in camelCase.
 *
 * @param filePath - Case file, for error messages.
 * @param raw - Value produced by the YAML parser.
 * @returns `{ id, stage, inputPrompt, contextFiles, expected, fixture }` with
 *   `id` and `inputPrompt` trimmed; optional parts are `undefined` when absent.
 * @throws Error (built by `corpusError`) when the document is not a mapping or
 *   a field has the wrong type or value.
 */
function validateCase(filePath, raw) {
    if (!isRecord(raw)) {
        throw corpusError(filePath, "top-level value must be a mapping");
    }
    const id = raw.id;
    if (typeof id !== "string" || id.trim().length === 0) {
        throw corpusError(filePath, `"id" must be a non-empty string`);
    }
    const stageRaw = raw.stage;
    if (typeof stageRaw !== "string" || !FLOW_STAGE_SET.has(stageRaw)) {
        throw corpusError(filePath, `"stage" must be one of: ${FLOW_STAGES.join(", ")}`);
    }
    const inputPrompt = raw.input_prompt ?? raw.inputPrompt;
    if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
        throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
    }
    const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
    const expected = parseExpected(filePath, raw.expected);
    // A fixture of the wrong type used to be dropped silently, which made the
    // case look fixture-less. Report it like every other malformed field; an
    // absent or empty (`null`) key still means "no fixture".
    const fixtureRaw = raw.fixture;
    if (fixtureRaw !== undefined && fixtureRaw !== null && typeof fixtureRaw !== "string") {
        throw corpusError(filePath, `"fixture" must be a string`);
    }
    const fixture = typeof fixtureRaw === "string" ? fixtureRaw : undefined;
    return {
        id: id.trim(),
        stage: stageRaw,
        inputPrompt: inputPrompt.trim(),
        contextFiles,
        expected,
        fixture
    };
}
|
|
113
|
+
/**
 * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
 * single stage. Returns an empty array for a fresh install.
 *
 * Only `.yaml` / `.yml` files directly inside a stage directory are read, and
 * only directories named after a known stage are scanned. The result is sorted
 * by stage and then by id.
 *
 * @param projectRoot - Project directory that contains the evals root.
 * @param stage - When given, only that stage's directory is scanned.
 * @throws Error (built by `corpusError`) when a case file cannot be read or
 *   parsed, fails validation, or declares a stage other than its directory.
 */
export async function loadCorpus(projectRoot, stage) {
    const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
    if (!(await exists(corpusRoot))) {
        return [];
    }
    const stageNames = stage
        ? [stage]
        : (await fs.readdir(corpusRoot, { withFileTypes: true }))
            .filter((entry) => entry.isDirectory() && FLOW_STAGE_SET.has(entry.name))
            .map((entry) => entry.name);
    const cases = [];
    for (const stageName of stageNames) {
        const stageDir = path.join(corpusRoot, stageName);
        // An explicitly requested stage may simply have no cases yet.
        if (!(await exists(stageDir)))
            continue;
        const entries = await fs.readdir(stageDir, { withFileTypes: true });
        for (const entry of entries) {
            if (!entry.isFile())
                continue;
            if (!entry.name.endsWith(".yaml") && !entry.name.endsWith(".yml"))
                continue;
            const filePath = path.join(stageDir, entry.name);
            let parsed;
            try {
                parsed = parse(await fs.readFile(filePath, "utf8"));
            }
            catch (err) {
                throw corpusError(filePath, err instanceof Error ? err.message : String(err));
            }
            const loaded = validateCase(filePath, parsed);
            // Fixtures are resolved from the declared stage and the stage filter
            // works by directory, so a misfiled case would leak into the wrong
            // stage and look for its fixture in the wrong place.
            if (loaded.stage !== stageName) {
                throw corpusError(filePath, `"stage" is "${loaded.stage}" but the file lives in the "${stageName}" directory`);
            }
            cases.push(loaded);
        }
    }
    cases.sort((a, b) => a.stage.localeCompare(b.stage) || a.id.localeCompare(b.id));
    return cases;
}
|
|
152
|
+
/**
 * Turn a case's `fixture` reference into an absolute filesystem path, resolved
 * against the case's stage directory (`.cclaw/evals/corpus/<stage>/`).
 * Returns `undefined` when the case has no (or an empty) fixture reference.
 */
export function fixturePathFor(projectRoot, caseEntry) {
    const { fixture, stage } = caseEntry;
    return fixture
        ? path.resolve(projectRoot, EVALS_ROOT, "corpus", stage, fixture)
        : undefined;
}
|
|
162
|
+
/**
 * Read the text of a case's fixture artifact as UTF-8.
 *
 * Resolves to `undefined` when the case has no fixture reference. Throws a
 * descriptive error when the case names a fixture that is not on disk —
 * Wave 7.1 fixtures ship alongside cases.
 */
export async function readFixtureArtifact(projectRoot, caseEntry) {
    const fixturePath = fixturePathFor(projectRoot, caseEntry);
    if (fixturePath) {
        if (await exists(fixturePath)) {
            return fs.readFile(fixturePath, "utf8");
        }
        throw new Error(`Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${fixturePath}`);
    }
    return undefined;
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Wave 7.0 declares the shape of the client without pulling in the `openai`
|
|
5
|
+
* runtime dependency. The real implementation is wired in Wave 7.3 when
|
|
6
|
+
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
+
* separate means users of Waves 7.0–7.2 (structural + rule-based verifiers)
|
|
8
|
+
* never install an extra dependency or receive network egress warnings.
|
|
9
|
+
*/
|
|
10
|
+
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
+
/**
 * One message of the minimal chat interface the rest of the eval code depends
 * on. The interface is deliberately a subset of OpenAI's Chat Completions
 * surface, so the Wave 7.3 implementation can be a thin adapter around
 * `OpenAI.chat.completions.create`.
 */
export interface ChatMessage {
    role: "system" | "user" | "assistant" | "tool";
    content: string;
    name?: string;
    toolCallId?: string;
}
|
|
22
|
+
/** Parameters of a single chat call made through `EvalLlmClient.chat`. */
export interface ChatRequest {
    model: string;
    messages: ChatMessage[];
    maxTokens?: number;
    temperature?: number;
    timeoutMs?: number;
    /**
     * Tool/function-calling definitions in OpenAI wire format. Only Wave 7.4
     * (Tier B) populates this; the Wave 7.3 single-shot path ignores it.
     */
    tools?: unknown[];
    toolChoice?: "auto" | "none";
}
|
|
35
|
+
/** Token counts reported with a chat response. */
export interface ChatUsage {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
}
|
|
40
|
+
/** Result of a single chat call made through `EvalLlmClient.chat`. */
export interface ChatResponse {
    content: string;
    toolCalls?: {
        id: string;
        name: string;
        arguments: string;
    }[];
    usage: ChatUsage;
    finishReason: "stop" | "length" | "tool_calls" | "content_filter";
}
|
|
50
|
+
/** Lightweight client abstraction shared by the eval runners. */
export interface EvalLlmClient {
    chat(request: ChatRequest): Promise<ChatResponse>;
}
|
|
54
|
+
/**
 * Error raised when the LLM client is used before it is wired in.
 * `wave` names the Wave in which the client arrives.
 */
export declare class EvalLlmNotWiredError extends Error {
    constructor(wave: string);
}
|
|
57
|
+
/**
 * Factory stub. The client it returns throws with a clear message, so
 * accidental Wave 7.0 usage is easy to diagnose. The Wave 7.3 implementation
 * will replace the body with `new OpenAI({ apiKey, baseURL }) ... adapter`.
 */
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Error raised when the LLM client is used before it is wired in. The message
 * names the Wave in which the client arrives and points at the offline modes.
 */
export class EvalLlmNotWiredError extends Error {
    constructor(wave) {
        const guidance = [
            `LLM client is not wired in Wave 7.0. It arrives in Wave ${wave}.`,
            "Run `cclaw eval --dry-run` or `cclaw eval --schema-only` for offline evals."
        ];
        super(guidance.join("\n"));
        this.name = "EvalLlmNotWiredError";
    }
}
|
|
8
|
+
/**
 * Factory stub. The returned client's `chat` always rejects with
 * `EvalLlmNotWiredError("7.3")`, so accidental Wave 7.0 usage is easy to
 * diagnose. The Wave 7.3 implementation will replace this body with
 * `new OpenAI({ apiKey, baseURL }) ... adapter`.
 */
export function createEvalClient(_config) {
    const chat = async () => {
        throw new EvalLlmNotWiredError("7.3");
    };
    return { chat };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { EvalReport } from "./types.js";
|
|
2
|
+
/** Path of the `reports` directory under the project's evals root. */
export declare function reportsDir(projectRoot: string): string;
|
|
3
|
+
/**
 * Default report file basename (no extension): `eval-<generatedAt>-<runId>`,
 * with `:` and `.` in the timestamp replaced by `-` and the run id cut to its
 * first 8 characters.
 */
export declare function defaultReportBasename(report: EvalReport): string;
|
|
4
|
+
/**
 * Render a report as a human-readable Markdown document. The layout is kept
 * stable on purpose: CI posts diffs against earlier reports, and unit tests
 * use the output as a regression guard.
 */
export declare function formatMarkdownReport(report: EvalReport): string;
|
|
10
|
+
/**
 * Write the report as pretty-printed JSON to `<reportsDir>/<basename>.json`
 * and resolve to the written path. `basename` defaults to
 * `defaultReportBasename(report)`.
 */
export declare function writeJsonReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
|
|
11
|
+
/**
 * Write the Markdown rendering of the report to `<reportsDir>/<basename>.md`
 * and resolve to the written path. `basename` defaults to
 * `defaultReportBasename(report)`.
 */
export declare function writeMarkdownReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
3
|
+
import { writeFileSafe } from "../fs-utils.js";
|
|
4
|
+
/** Path of the `reports` directory under the project's evals root. */
export function reportsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "reports"];
    return path.join(...segments);
}
|
|
7
|
+
/**
 * Default report file basename (no extension): `eval-<generatedAt>-<runId>`,
 * with every `:` and `.` of the timestamp replaced by `-` and the run id cut
 * to its first 8 characters.
 */
export function defaultReportBasename(report) {
    const stamp = report.generatedAt.split(/[:.]/).join("-");
    const shortRunId = report.runId.slice(0, 8);
    return "eval-" + stamp + "-" + shortRunId;
}
|
|
11
|
+
/**
 * Format a report as a human-readable Markdown document. Keeping the layout
 * stable matters: CI posts diffs against earlier reports, and unit tests use
 * the output as a regression guard.
 *
 * Sections, in order: header list, summary table, optional baseline delta
 * (with a regressions table when there are any), cases table and per-case
 * verifier details. With no cases, the cases section is a one-line notice.
 *
 * @param report - Report to render.
 * @returns The Markdown text, ending with a newline.
 */
export function formatMarkdownReport(report) {
    // A literal "|" or a line break inside a cell would split the table row, so
    // interpolated cell text is neutralised. Text without those characters is
    // emitted unchanged, which keeps existing output stable.
    const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
    const { summary } = report;
    const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
    const lines = [
        `# cclaw eval report`,
        ``,
        `- generated: ${report.generatedAt}`,
        `- runId: ${report.runId}`,
        `- cclaw version: ${report.cclawVersion}`,
        `- provider: ${report.provider}`,
        `- model: ${report.model}`,
        `- tier: ${report.tier}`,
        `- stages: ${stages}`,
        ``,
        `## Summary`,
        ``,
        `| metric | value |`,
        `| --- | --- |`,
        `| total cases | ${summary.totalCases} |`,
        `| passed | ${summary.passed} |`,
        `| failed | ${summary.failed} |`,
        `| skipped | ${summary.skipped} |`,
        `| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`,
        `| total duration (ms) | ${summary.totalDurationMs} |`,
        ``
    ];
    if (report.baselineDelta) {
        const delta = report.baselineDelta;
        lines.push(`## Baseline delta`, ``, `- baseline: ${delta.baselineId}`, `- score delta: ${delta.scoreDelta.toFixed(4)}`, `- critical failures: ${delta.criticalFailures}`, ``);
        if (delta.regressions.length > 0) {
            lines.push(`### Regressions`, ``, `| stage | case id | verifier | reason | prev | curr |`, `| --- | --- | --- | --- | --- | --- |`);
            for (const reg of delta.regressions) {
                const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
                const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
                lines.push(`| ${cell(reg.stage)} | ${cell(reg.caseId)} | ${cell(reg.verifierId)} | ${cell(reg.reason)} | ${prev} | ${curr} |`);
            }
            lines.push(``);
        }
    }
    lines.push(`## Cases`, ``);
    if (report.cases.length === 0) {
        lines.push(`No cases were executed. See \`docs/evals.md\` for the Wave rollout plan.`, ``);
        return `${lines.join("\n")}\n`;
    }
    lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`, `| --- | --- | --- | --- | --- |`);
    for (const item of report.cases) {
        const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
        lines.push(`| ${cell(item.stage)} | ${cell(item.caseId)} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
    }
    lines.push(``, `## Verifier details`, ``);
    for (const item of report.cases) {
        lines.push(`### ${item.stage} / ${item.caseId}`, ``);
        for (const verifier of item.verifierResults) {
            const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
            const message = verifier.message ? ` — ${verifier.message}` : "";
            lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}${message}`);
        }
        lines.push(``);
    }
    return `${lines.join("\n")}\n`;
}
|
|
92
|
+
/**
 * Write the report as pretty-printed (2-space) JSON, newline-terminated, to
 * `<reportsDir>/<basename>.json` and resolve to the written path.
 * `basename` defaults to `defaultReportBasename(report)`.
 */
export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.json`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const serialized = JSON.stringify(report, null, 2) + "\n";
    await writeFileSafe(outPath, serialized);
    return outPath;
}
|
|
97
|
+
/**
 * Write the Markdown rendering of the report to `<reportsDir>/<basename>.md`
 * and resolve to the written path. `basename` defaults to
 * `defaultReportBasename(report)`.
 */
export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.md`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const markdown = formatMarkdownReport(report);
    await writeFileSafe(outPath, markdown);
    return outPath;
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
|
+
/** Options accepted by `runEval`. */
export interface RunEvalOptions {
    projectRoot: string;
    stage?: FlowStage;
    tier?: EvalTier;
    /** Run only the structural verifiers (Wave 7.1) when true. */
    schemaOnly?: boolean;
    /** Run structural + rule-based verifiers when true. Wave 7.2 wires rules. */
    rules?: boolean;
    /** Also run LLM judge verifiers when true. Wave 7.3 wires judging. */
    judge?: boolean;
    /** Load config + corpus and return a summary without running any verifier when true. */
    dryRun?: boolean;
    /** Replacement for `process.env`, used by tests. */
    env?: NodeJS.ProcessEnv;
}
|
|
18
|
+
/**
 * Summary shape that `runEval` may return instead of a full report; the
 * `kind: "dry-run"` tag identifies it.
 */
export interface DryRunSummary {
    kind: "dry-run";
    config: ResolvedEvalConfig;
    corpus: {
        total: number;
        byStage: Record<string, number>;
        cases: {
            id: string;
            stage: FlowStage;
        }[];
    };
    plannedTier: EvalTier;
    verifiersAvailable: {
        structural: boolean;
        rules: boolean;
        judge: boolean;
        workflow: boolean;
    };
    notes: string[];
}
|
|
38
|
+
/**
 * Wave 7.1 runner. With `schemaOnly` set (or with no other verifier flag
 * active) it runs the structural verifiers against fixture-backed cases and
 * loads per-stage baselines for regression comparison. The Tier A/B/C agent
 * loops only arrive in Waves 7.3+; until then a case without a `fixture` is
 * marked as skipped rather than failing.
 */
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|