cclaw-cli 0.21.1 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +9 -1
- package/dist/cli.js +123 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/content/skills.js +1 -1
- package/dist/content/stages/brainstorm.js +3 -7
- package/dist/content/stages/design.js +2 -5
- package/dist/content/stages/plan.js +2 -4
- package/dist/content/stages/review.js +2 -4
- package/dist/content/stages/schema-types.d.ts +8 -2
- package/dist/content/stages/scope.js +2 -6
- package/dist/content/stages/ship.js +2 -4
- package/dist/content/stages/spec.js +2 -5
- package/dist/content/stages/tdd.js +2 -4
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +8 -0
- package/dist/eval/corpus.js +91 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +88 -0
- package/dist/eval/runner.d.ts +53 -0
- package/dist/eval/runner.js +96 -0
- package/dist/eval/types.d.ts +136 -0
- package/dist/eval/types.js +15 -0
- package/dist/install.js +22 -0
- package/dist/runs.d.ts +0 -18
- package/dist/runs.js +1 -188
- package/package.json +1 -1
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { EVAL_TIERS } from "./types.js";
|
|
7
|
+
/**
 * Built-in eval settings: the bottom layer of the configuration stack.
 *
 * The values target the z.ai OpenAI-compatible coding endpoint with GLM 5.1,
 * per the roadmap locked decisions (D-EVAL-01..05). Every field may be
 * replaced by `.cclaw/evals/config.yaml`, and after that by `CCLAW_EVAL_*`
 * environment variables, which are applied last and therefore win.
 */
export const DEFAULT_EVAL_CONFIG = {
    provider: "zai",
    baseUrl: "https://api.z.ai/api/coding/paas/v4",
    model: "glm-5.1",
    defaultTier: "A",
    // Regression thresholds; a config file may override either one individually.
    regression: {
        failIfDeltaBelow: -0.15,
        failIfCriticalBelow: 3
    },
    // Two minutes, expressed in milliseconds.
    timeoutMs: 2 * 60 * 1000,
    maxRetries: 2
};
|
|
25
|
+
// Constant-time membership lookup over the supported eval tiers.
const EVAL_TIER_SET = new Set(EVAL_TIERS);
// Names of the CCLAW_EVAL_* variables whose values are parsed as numbers.
const NUMERIC_ENVS = new Set()
    .add("CCLAW_EVAL_DAILY_USD_CAP")
    .add("CCLAW_EVAL_TIMEOUT_MS")
    .add("CCLAW_EVAL_MAX_RETRIES");
|
|
31
|
+
/**
 * Build (but do not throw) the error reported for an invalid eval config file.
 *
 * @param {string} configFilePath - Path of the offending config file.
 * @param {string} reason - Short description of what is wrong.
 * @returns {Error} Error whose message names the file, the reason, the
 *   supported tiers, and the command to re-check the config.
 */
function evalConfigError(configFilePath, reason) {
    const messageLines = [
        `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
        `Supported tiers: ${EVAL_TIERS.join(", ")}`,
        "See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run"
    ];
    return new Error(messageLines.join("\n"));
}
|
|
36
|
+
/**
 * Report whether `value` is a non-null, non-array object (i.e. usable as a
 * key/value mapping).
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only for objects that are neither `null` nor arrays.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
39
|
+
/**
 * Parse the value of a numeric environment variable.
 *
 * `Number("")` and `Number("   ")` evaluate to `0`, so blank input is rejected
 * explicitly rather than being accepted as zero. `null`/`undefined` are
 * rejected as well.
 *
 * @param {string} name - Variable name, used only in the error message.
 * @param {string} raw - Raw value read from the environment.
 * @returns {number} The parsed finite number.
 * @throws {Error} If `raw` is blank, missing, or not a finite number.
 */
function parseNumericEnv(name, raw) {
    const isBlank = raw === null ||
        raw === undefined ||
        (typeof raw === "string" && raw.trim().length === 0);
    const value = isBlank ? Number.NaN : Number(raw);
    if (!Number.isFinite(value)) {
        throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
    }
    return value;
}
|
|
46
|
+
/**
 * Normalize and validate the value of `CCLAW_EVAL_TIER`.
 *
 * @param {string} raw - Raw environment value; surrounding whitespace and
 *   letter case are ignored.
 * @returns {string} The trimmed, upper-cased tier name.
 * @throws {Error} If the normalized value is not a supported tier.
 */
function parseTierEnv(raw) {
    const candidate = raw.trim().toUpperCase();
    if (EVAL_TIER_SET.has(candidate)) {
        return candidate;
    }
    const allowed = EVAL_TIERS.join("/");
    throw new Error(`Environment variable CCLAW_EVAL_TIER must be one of ${allowed}, got: ${raw}`);
}
|
|
53
|
+
/**
 * Validate the parsed contents of the eval config file and return only the
 * recognized, well-typed settings.
 *
 * Numeric settings must be finite: YAML can produce `NaN`/`Infinity` (e.g.
 * `.nan`, `.inf`), and those pass a plain `typeof === "number"` test as well
 * as comparisons such as `NaN < 0`, so they are rejected explicitly.
 *
 * @param {unknown} raw - Value produced by the YAML parser.
 * @param {string} configFilePath - Config path, used in error messages.
 * @returns {object} Partial config holding just the keys present in `raw`.
 *   When `regression` is present, both thresholds are filled in (missing ones
 *   take the value from `DEFAULT_EVAL_CONFIG`). Empty object when `raw` is
 *   `null`/`undefined`.
 * @throws {Error} If the top level is not a mapping, any known key has an
 *   invalid value, or an unknown top-level key is present.
 */
function validateFileConfig(raw, configFilePath) {
    if (raw === undefined || raw === null) {
        return {};
    }
    if (!isRecord(raw)) {
        throw evalConfigError(configFilePath, "top-level value must be a mapping");
    }
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const out = {};
    // String settings: must be non-blank; stored trimmed.
    for (const key of ["provider", "baseUrl", "model", "judgeModel"]) {
        const value = raw[key];
        if (value === undefined) {
            continue;
        }
        if (typeof value !== "string" || value.trim().length === 0) {
            throw evalConfigError(configFilePath, `"${String(key)}" must be a non-empty string`);
        }
        out[key] = value.trim();
    }
    if (raw.defaultTier !== undefined) {
        if (typeof raw.defaultTier !== "string" || !EVAL_TIER_SET.has(raw.defaultTier)) {
            throw evalConfigError(configFilePath, `"defaultTier" must be one of: ${EVAL_TIERS.join(", ")}`);
        }
        out.defaultTier = raw.defaultTier;
    }
    if (raw.dailyUsdCap !== undefined) {
        if (!isFiniteNumber(raw.dailyUsdCap) || raw.dailyUsdCap < 0) {
            throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
        }
        out.dailyUsdCap = raw.dailyUsdCap;
    }
    if (raw.timeoutMs !== undefined) {
        if (!isFiniteNumber(raw.timeoutMs) || raw.timeoutMs <= 0) {
            throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
        }
        out.timeoutMs = raw.timeoutMs;
    }
    if (raw.maxRetries !== undefined) {
        // Number.isInteger already excludes NaN, Infinity and non-numbers.
        if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
            throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
        }
        out.maxRetries = raw.maxRetries;
    }
    if (raw.regression !== undefined) {
        if (!isRecord(raw.regression)) {
            throw evalConfigError(configFilePath, `"regression" must be a mapping`);
        }
        const { failIfDeltaBelow, failIfCriticalBelow } = raw.regression;
        if (failIfDeltaBelow !== undefined && !isFiniteNumber(failIfDeltaBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
        }
        if (failIfCriticalBelow !== undefined && !isFiniteNumber(failIfCriticalBelow)) {
            throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
        }
        out.regression = {
            failIfDeltaBelow: failIfDeltaBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
            failIfCriticalBelow: failIfCriticalBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
        };
    }
    // Unknown keys are reported last, after every known key has been validated.
    const knownKeys = new Set([
        "provider",
        "baseUrl",
        "model",
        "judgeModel",
        "defaultTier",
        "dailyUsdCap",
        "timeoutMs",
        "maxRetries",
        "regression"
    ]);
    const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
    if (unknown.length > 0) {
        throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
    }
    return out;
}
|
|
134
|
+
/**
 * Read and validate the project's eval config file, if there is one.
 *
 * @param {string} projectRoot - Project root the config path is resolved against.
 * @returns {Promise<{patch: object, source: "default" | "file"}>} The validated
 *   file settings with `source: "file"`, or an empty patch with
 *   `source: "default"` when the file does not exist.
 * @throws {Error} If the file cannot be read or parsed, or fails validation.
 */
async function readFileConfig(projectRoot) {
    const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
    const present = await exists(configFilePath);
    if (!present) {
        return { patch: {}, source: "default" };
    }
    let document;
    try {
        const text = await fs.readFile(configFilePath, "utf8");
        document = parse(text);
    }
    catch (err) {
        // Read and parse failures are both reported as a config error.
        const reason = err instanceof Error ? err.message : String(err);
        throw evalConfigError(configFilePath, reason);
    }
    return { patch: validateFileConfig(document, configFilePath), source: "file" };
}
|
|
149
|
+
/**
 * Layer `CCLAW_EVAL_*` environment variables on top of an already-merged
 * config. `base` is not mutated; a copy (with its own `regression` object) is
 * returned.
 *
 * Blank or missing variables are ignored. Numeric overrides are held to the
 * same ranges the config file is validated against, so the environment cannot
 * introduce a negative cap, a non-positive timeout, or a fractional/negative
 * retry count.
 *
 * @param {object} base - Config produced from defaults plus the file patch.
 * @param {Record<string, string | undefined>} env - Environment to read from.
 * @returns {{patched: object, overridden: boolean, apiKey: string | undefined}}
 *   The patched config, whether any config field was overridden, and the
 *   trimmed value of `CCLAW_EVAL_API_KEY` (which does not count as an override).
 * @throws {Error} If a tier or numeric variable holds an invalid value.
 */
function applyEnvOverrides(base, env) {
    const patched = {
        ...base,
        regression: { ...base.regression }
    };
    let overridden = false;
    // Trimmed value of a variable, or undefined when unset or blank.
    const read = (name) => {
        const value = env[name];
        return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
    };
    const stringOverrides = [
        ["CCLAW_EVAL_BASE_URL", "baseUrl"],
        ["CCLAW_EVAL_MODEL", "model"],
        ["CCLAW_EVAL_JUDGE_MODEL", "judgeModel"],
        ["CCLAW_EVAL_PROVIDER", "provider"]
    ];
    for (const [name, key] of stringOverrides) {
        const value = read(name);
        if (value !== undefined) {
            patched[key] = value;
            overridden = true;
        }
    }
    const tier = read("CCLAW_EVAL_TIER");
    if (tier !== undefined) {
        patched.defaultTier = parseTierEnv(tier);
        overridden = true;
    }
    // Range rules mirror the ones applied to the same keys in the config file.
    const numericOverrides = [
        {
            name: "CCLAW_EVAL_DAILY_USD_CAP",
            key: "dailyUsdCap",
            isValid: (value) => value >= 0,
            requirement: "a non-negative number"
        },
        {
            name: "CCLAW_EVAL_TIMEOUT_MS",
            key: "timeoutMs",
            isValid: (value) => value > 0,
            requirement: "a positive number"
        },
        {
            name: "CCLAW_EVAL_MAX_RETRIES",
            key: "maxRetries",
            isValid: (value) => Number.isInteger(value) && value >= 0,
            requirement: "a non-negative integer"
        }
    ];
    for (const { name, key, isValid, requirement } of numericOverrides) {
        const raw = read(name);
        if (raw === undefined) {
            continue;
        }
        const value = parseNumericEnv(name, raw);
        if (!isValid(value)) {
            throw new Error(`Environment variable ${name} must be ${requirement}, got: ${raw}`);
        }
        patched[key] = value;
        overridden = true;
    }
    const apiKey = read("CCLAW_EVAL_API_KEY");
    return { patched, overridden, apiKey };
}
|
|
209
|
+
/**
 * Resolve the effective eval config by stacking three layers, lowest
 * precedence first: built-in defaults, the project's config.yaml, then
 * environment variables.
 *
 * @param {string} projectRoot - Project root used to locate the config file.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment
 *   to read overrides from.
 * @returns {Promise<object>} The fully-populated config plus `apiKey` and a
 *   `source` provenance marker (`"default"`, `"file"`, `"env"` or
 *   `"file+env"`) that `--dry-run` can surface.
 */
export async function loadEvalConfig(projectRoot, env = process.env) {
    const fileLayer = await readFileConfig(projectRoot);
    const withFile = {
        ...DEFAULT_EVAL_CONFIG,
        ...fileLayer.patch,
        // Merge thresholds individually so a partial file entry keeps defaults.
        regression: {
            ...DEFAULT_EVAL_CONFIG.regression,
            ...(fileLayer.patch.regression ?? {})
        }
    };
    const envLayer = applyEnvOverrides(withFile, env);
    const fromFile = fileLayer.source === "file";
    let source;
    if (fromFile) {
        source = envLayer.overridden ? "file+env" : "file";
    }
    else {
        source = envLayer.overridden ? "env" : "default";
    }
    return {
        ...envLayer.patched,
        apiKey: envLayer.apiKey,
        source
    };
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalCase } from "./types.js";
|
|
3
|
+
/**
 * Read every eval case found beneath `.cclaw/evals/corpus/**`.
 *
 * A fresh install yields an empty array: Wave 7.0 ships without seed cases and
 * the corpus is authored in Wave 7.1+.
 *
 * @param projectRoot Project root that contains the `.cclaw` directory.
 * @param stage When given, only cases for this stage are loaded.
 * @returns The validated eval cases.
 */
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { FLOW_STAGES } from "../types.js";
|
|
7
|
+
// Constant-time membership lookup over the supported flow stages.
const FLOW_STAGE_SET = new Set(FLOW_STAGES);
|
|
8
|
+
/**
 * Build (but do not throw) the error reported for an invalid eval case file.
 *
 * @param {string} filePath - Path of the offending case file.
 * @param {string} reason - Short description of what is wrong.
 * @returns {Error} Error whose message names the file, the reason, and the
 *   supported stages.
 */
function corpusError(filePath, reason) {
    const headline = `Invalid eval case at ${filePath}: ${reason}`;
    const stageHint = `Supported stages: ${FLOW_STAGES.join(", ")}`;
    return new Error([headline, stageHint].join("\n"));
}
|
|
12
|
+
/**
 * Report whether `value` can be treated as a key/value mapping.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` for objects that are neither `null` nor arrays.
 */
function isRecord(value) {
    const isObject = typeof value === "object";
    const isNullOrArray = value === null || Array.isArray(value);
    return isObject && !isNullOrArray;
}
|
|
15
|
+
/**
 * Validate one parsed eval case and convert it to its normalized in-memory
 * form. Both snake_case (`input_prompt`, `context_files`) and camelCase
 * (`inputPrompt`, `contextFiles`) keys are accepted; snake_case wins when both
 * are present.
 *
 * A wrongly-typed `expected` or `fixture` is an error rather than being
 * silently dropped: an eval case that quietly loses its expectations would be
 * misleading. An explicit `null` for either key is treated as "not provided".
 *
 * @param {string} filePath - Case file path, used in error messages.
 * @param {unknown} raw - Value produced by the YAML parser.
 * @returns {{id: string, stage: string, inputPrompt: string,
 *   contextFiles: string[] | undefined, expected: object | undefined,
 *   fixture: string | undefined}} The normalized case; `id` and `inputPrompt`
 *   are trimmed.
 * @throws {Error} If a required field is missing or any field has the wrong type.
 */
function validateCase(filePath, raw) {
    if (!isRecord(raw)) {
        throw corpusError(filePath, "top-level value must be a mapping");
    }
    const id = raw.id;
    if (typeof id !== "string" || id.trim().length === 0) {
        throw corpusError(filePath, `"id" must be a non-empty string`);
    }
    const stageRaw = raw.stage;
    if (typeof stageRaw !== "string" || !FLOW_STAGE_SET.has(stageRaw)) {
        throw corpusError(filePath, `"stage" must be one of: ${FLOW_STAGES.join(", ")}`);
    }
    const inputPrompt = raw.input_prompt ?? raw.inputPrompt;
    if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
        throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
    }
    const contextFilesRaw = raw.context_files ?? raw.contextFiles;
    let contextFiles;
    if (contextFilesRaw !== undefined) {
        if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
            throw corpusError(filePath, `"context_files" must be an array of strings`);
        }
        contextFiles = contextFilesRaw;
    }
    // `?? undefined` folds an explicit YAML null into "absent".
    const expected = raw.expected ?? undefined;
    if (expected !== undefined && !isRecord(expected)) {
        throw corpusError(filePath, `"expected" must be a mapping`);
    }
    const fixture = raw.fixture ?? undefined;
    if (fixture !== undefined && typeof fixture !== "string") {
        throw corpusError(filePath, `"fixture" must be a string`);
    }
    return {
        id: id.trim(),
        stage: stageRaw,
        inputPrompt: inputPrompt.trim(),
        contextFiles,
        expected,
        fixture
    };
}
|
|
52
|
+
/**
 * Read every eval case found beneath `.cclaw/evals/corpus/**`.
 *
 * Only `*.yaml` / `*.yml` files that sit directly inside a stage directory are
 * considered. When no `stage` is given, directories whose name is not a known
 * flow stage are skipped. A fresh install yields an empty array (Wave 7.0
 * ships without seed cases; the corpus is authored in Wave 7.1+).
 *
 * @param {string} projectRoot - Project root that contains the evals directory.
 * @param {string} [stage] - When given, only this stage's directory is read.
 * @returns {Promise<object[]>} Validated cases ordered by stage, then by id.
 * @throws {Error} If a case file cannot be read or parsed, or fails validation.
 */
export async function loadCorpus(projectRoot, stage) {
    const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
    const hasCorpus = await exists(corpusRoot);
    if (!hasCorpus) {
        return [];
    }
    let stageDirs;
    if (stage) {
        stageDirs = [path.join(corpusRoot, stage)];
    }
    else {
        const rootEntries = await fs.readdir(corpusRoot, { withFileTypes: true });
        stageDirs = rootEntries
            .filter((entry) => entry.isDirectory() && FLOW_STAGE_SET.has(entry.name))
            .map((entry) => path.join(corpusRoot, entry.name));
    }
    const isYamlName = (name) => [".yaml", ".yml"].some((suffix) => name.endsWith(suffix));
    const cases = [];
    for (const stageDir of stageDirs) {
        const stageDirPresent = await exists(stageDir);
        if (!stageDirPresent) {
            continue;
        }
        const entries = await fs.readdir(stageDir, { withFileTypes: true });
        const caseFiles = entries.filter((entry) => entry.isFile() && isYamlName(entry.name));
        for (const { name } of caseFiles) {
            const filePath = path.join(stageDir, name);
            let parsed;
            try {
                const text = await fs.readFile(filePath, "utf8");
                parsed = parse(text);
            }
            catch (err) {
                // Read and parse failures are both reported as a corpus error.
                const reason = err instanceof Error ? err.message : String(err);
                throw corpusError(filePath, reason);
            }
            cases.push(validateCase(filePath, parsed));
        }
    }
    // Sort so the result does not depend on directory listing order.
    cases.sort((left, right) => {
        const byStage = left.stage.localeCompare(right.stage);
        return byStage || left.id.localeCompare(right.id);
    });
    return cases;
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Wave 7.0 declares the shape of the client without pulling in the `openai`
|
|
5
|
+
* runtime dependency. The real implementation is wired in Wave 7.3 when
|
|
6
|
+
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
+
* separate means users of Waves 7.0–7.2 (structural + rule-based verifiers)
|
|
8
|
+
* never install an extra dependency or receive network egress warnings.
|
|
9
|
+
*/
|
|
10
|
+
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
+
/**
 * One message in a chat exchange; part of the minimal chat surface the rest of
 * the eval code depends on.
 *
 * The shape is deliberately a subset of OpenAI's Chat Completions API so that
 * the Wave 7.3 implementation can be a thin adapter around
 * `OpenAI.chat.completions.create`.
 */
export interface ChatMessage {
    /** Author of the message. */
    role: "system" | "user" | "assistant" | "tool";
    /** Message text. */
    content: string;
    /** Optional name attached to the message. */
    name?: string;
    /** Optional tool-call identifier (presumably links a tool message to its call; confirm when Wave 7.4 lands). */
    toolCallId?: string;
}
|
|
22
|
+
/** Arguments for a single `EvalLlmClient.chat` call. */
export interface ChatRequest {
    /** Model identifier to send the request to. */
    model: string;
    /** Messages that make up the request. */
    messages: ChatMessage[];
    /** Optional completion token limit. */
    maxTokens?: number;
    /** Optional sampling temperature. */
    temperature?: number;
    /** Optional per-request timeout, in milliseconds. */
    timeoutMs?: number;
    /**
     * Tool/function-calling definitions in OpenAI wire format. Only Wave 7.4
     * (Tier B) populates this; the Wave 7.3 single-shot path ignores it.
     */
    tools?: Array<unknown>;
    /** Optional tool-choice mode. */
    toolChoice?: "auto" | "none";
}
|
|
35
|
+
/** Token accounting attached to a chat response. */
export interface ChatUsage {
    /** Tokens counted for the prompt. */
    promptTokens: number;
    /** Tokens counted for the completion. */
    completionTokens: number;
    /** Total token count (presumably prompt + completion; confirm against the provider). */
    totalTokens: number;
}
|
|
40
|
+
/** Result of a single `EvalLlmClient.chat` call. */
export interface ChatResponse {
    /** Response text. */
    content: string;
    /** Tool calls included in the response, if any. */
    toolCalls?: {
        id: string;
        name: string;
        /** Call arguments as a string (presumably JSON-encoded; confirm in Wave 7.4). */
        arguments: string;
    }[];
    /** Token accounting for the call. */
    usage: ChatUsage;
    /** Reason the completion ended. */
    finishReason: "stop" | "length" | "tool_calls" | "content_filter";
}
|
|
50
|
+
/**
 * Small client abstraction shared by the eval runners: a single `chat` method
 * that maps a request to a response.
 */
export interface EvalLlmClient {
    chat(request: ChatRequest): Promise<ChatResponse>;
}
|
|
54
|
+
/**
 * Error raised when the LLM client is used before it has been implemented.
 */
export declare class EvalLlmNotWiredError extends Error {
    /**
     * @param wave Wave in which the LLM client arrives; included in the message.
     */
    constructor(wave: string);
}
|
|
57
|
+
/**
 * Placeholder factory for the eval LLM client.
 *
 * The returned client's `chat` always fails with a clear message, which makes
 * accidental Wave 7.0 usage easy to diagnose. Wave 7.3 will swap the body for
 * a `new OpenAI({ apiKey, baseURL })` adapter.
 *
 * @param _config Resolved eval config; not used by the placeholder.
 */
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Error raised when the LLM client is used before it has been implemented.
 * The message names the wave in which the client arrives and points at the
 * offline eval modes.
 */
export class EvalLlmNotWiredError extends Error {
    /**
     * @param {string} wave - Wave in which the LLM client arrives.
     */
    constructor(wave) {
        const messageLines = [
            `LLM client is not wired in Wave 7.0. It arrives in Wave ${wave}.`,
            "Run `cclaw eval --dry-run` or `cclaw eval --schema-only` for offline evals."
        ];
        super(messageLines.join("\n"));
        this.name = "EvalLlmNotWiredError";
    }
}
|
|
8
|
+
/**
 * Placeholder factory for the eval LLM client.
 *
 * The returned client's `chat` always rejects with `EvalLlmNotWiredError`,
 * which makes accidental Wave 7.0 usage easy to diagnose. Wave 7.3 will swap
 * this body for a `new OpenAI({ apiKey, baseURL })` adapter.
 *
 * @param {object} _config - Resolved eval config; not used by the placeholder.
 * @returns {{chat: Function}} Client whose `chat` returns a rejected promise.
 */
export function createEvalClient(_config) {
    const chat = async () => {
        throw new EvalLlmNotWiredError("7.3");
    };
    return { chat };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { EvalReport } from "./types.js";
|
|
2
|
+
/**
 * Directory that eval reports are written to for the given project.
 *
 * @param projectRoot Project root.
 */
export declare function reportsDir(projectRoot: string): string;
/**
 * Default file basename (no extension) for a report, derived from the report's
 * `generatedAt` timestamp and the leading characters of its `runId`.
 */
export declare function defaultReportBasename(report: EvalReport): string;
/**
 * Render a report as a human-readable Markdown document.
 *
 * The layout must stay stable: CI posts diffs against earlier reports, and
 * unit tests use the output as a regression guard.
 */
export declare function formatMarkdownReport(report: EvalReport): string;
/**
 * Write the report as JSON into the reports directory.
 *
 * @param basename File name without extension; defaults to `defaultReportBasename(report)`.
 * @returns The path of the written file.
 */
export declare function writeJsonReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
/**
 * Write the Markdown rendering of the report into the reports directory.
 *
 * @param basename File name without extension; defaults to `defaultReportBasename(report)`.
 * @returns The path of the written file.
 */
export declare function writeMarkdownReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
3
|
+
import { writeFileSafe } from "../fs-utils.js";
|
|
4
|
+
/**
 * Directory that eval reports are written to for the given project.
 *
 * @param {string} projectRoot - Project root.
 * @returns {string} `<projectRoot>/<EVALS_ROOT>/reports`.
 */
export function reportsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "reports"];
    return path.join(...segments);
}
|
|
7
|
+
/**
 * Derive the default file basename (no extension) for a report.
 *
 * @param {{generatedAt: string, runId: string}} report - Report to name.
 * @returns {string} `eval-<timestamp>-<id>`, where the timestamp is
 *   `generatedAt` with every `:` and `.` replaced by `-`, and the id is the
 *   first 8 characters of `runId`.
 */
export function defaultReportBasename(report) {
    const { generatedAt, runId } = report;
    // ':' and '.' are swapped out so the timestamp is safe in a file name.
    const stamp = generatedAt.split(/[:.]/).join("-");
    const shortId = runId.slice(0, 8);
    return ["eval", stamp, shortId].join("-");
}
|
|
11
|
+
/**
 * Render a report as a human-readable Markdown document.
 *
 * The layout must stay stable: CI posts diffs against earlier reports, and
 * unit tests use the output as a regression guard.
 *
 * Sections, in order: header bullet list, summary table, optional baseline
 * delta, then either a "no cases" notice or a cases table followed by
 * per-case verifier details.
 *
 * @param {object} report - Eval report to render.
 * @returns {string} Markdown text terminated by a newline.
 */
export function formatMarkdownReport(report) {
    const { summary, cases } = report;
    const stageLabel = report.stages.length > 0 ? report.stages.join(", ") : "all";
    const finish = (rows) => `${rows.join("\n")}\n`;
    const out = [
        "# cclaw eval report",
        "",
        `- generated: ${report.generatedAt}`,
        `- runId: ${report.runId}`,
        `- cclaw version: ${report.cclawVersion}`,
        `- provider: ${report.provider}`,
        `- model: ${report.model}`,
        `- tier: ${report.tier}`,
        `- stages: ${stageLabel}`,
        "",
        "## Summary",
        "",
        "| metric | value |",
        "| --- | --- |",
        `| total cases | ${summary.totalCases} |`,
        `| passed | ${summary.passed} |`,
        `| failed | ${summary.failed} |`,
        `| skipped | ${summary.skipped} |`,
        `| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`,
        `| total duration (ms) | ${summary.totalDurationMs} |`,
        ""
    ];
    const delta = report.baselineDelta;
    if (delta) {
        out.push("## Baseline delta", "", `- baseline: ${delta.baselineId}`, `- score delta: ${delta.scoreDelta.toFixed(4)}`, `- critical failures: ${delta.criticalFailures}`, "");
    }
    out.push("## Cases", "");
    if (cases.length === 0) {
        out.push("No cases were executed. See `docs/evals.md` for the Wave rollout plan.", "");
        return finish(out);
    }
    out.push("| stage | case id | passed | duration (ms) | cost (USD) |", "| --- | --- | --- | --- | --- |");
    for (const item of cases) {
        // A case with no recorded cost shows a dash instead of a number.
        const cost = item.costUsd === undefined ? "-" : item.costUsd.toFixed(4);
        const passed = item.passed ? "yes" : "no";
        out.push(`| ${item.stage} | ${item.caseId} | ${passed} | ${item.durationMs} | ${cost} |`);
    }
    out.push("", "## Verifier details", "");
    for (const item of cases) {
        out.push(`### ${item.stage} / ${item.caseId}`, "");
        for (const verifier of item.verifierResults) {
            const status = verifier.ok ? "ok" : "fail";
            const score = verifier.score === undefined ? "" : ` (score=${verifier.score.toFixed(2)})`;
            const detail = verifier.message ? ` — ${verifier.message}` : "";
            out.push(`- ${verifier.kind} / ${verifier.id}: ${status}${score}${detail}`);
        }
        out.push("");
    }
    return finish(out);
}
|
|
79
|
+
/**
 * Write the report as pretty-printed JSON (2-space indent, trailing newline)
 * into the project's reports directory.
 *
 * @param {string} projectRoot - Project root.
 * @param {object} report - Report to serialize.
 * @param {string} [basename] - File name without extension; defaults to
 *   `defaultReportBasename(report)`.
 * @returns {Promise<string>} Path of the written `.json` file.
 */
export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.json`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const payload = JSON.stringify(report, null, 2);
    await writeFileSafe(outPath, `${payload}\n`);
    return outPath;
}
|
|
84
|
+
/**
 * Write the Markdown rendering of the report into the project's reports
 * directory.
 *
 * @param {string} projectRoot - Project root.
 * @param {object} report - Report to render.
 * @param {string} [basename] - File name without extension; defaults to
 *   `defaultReportBasename(report)`.
 * @returns {Promise<string>} Path of the written `.md` file.
 */
export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.md`;
    const outPath = path.join(reportsDir(projectRoot), fileName);
    const markdown = formatMarkdownReport(report);
    await writeFileSafe(outPath, markdown);
    return outPath;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
|
+
/** Options accepted by `runEval`. */
export interface RunEvalOptions {
    /** Project root used to locate the eval config and corpus. */
    projectRoot: string;
    /** Restrict the run to a single stage; all stages when omitted. */
    stage?: FlowStage;
    /** Tier to plan for; the resolved config's default tier when omitted. */
    tier?: EvalTier;
    /** Run only structural verifiers when true. Actual verifiers are wired in Wave 7.1. */
    schemaOnly?: boolean;
    /** Run structural + rule-based verifiers when true. Rules are wired in Wave 7.2. */
    rules?: boolean;
    /** Additionally run LLM judge verifiers when true. Judging is wired in Wave 7.3. */
    judge?: boolean;
    /** Load config + corpus and return a summary without running any verifier when true. */
    dryRun?: boolean;
    /** Replacement for `process.env`, intended for tests. */
    env?: NodeJS.ProcessEnv;
}
|
|
18
|
+
/** Result of `runEval` when `dryRun` is set: what would run, without running it. */
export interface DryRunSummary {
    /** Discriminator that tells a dry-run summary apart from an `EvalReport`. */
    kind: "dry-run";
    /** Fully resolved eval config. */
    config: ResolvedEvalConfig;
    /** Overview of the loaded corpus. */
    corpus: {
        /** Number of loaded cases. */
        total: number;
        /** Case count keyed by stage name. */
        byStage: Record<string, number>;
        /** Identity of every loaded case. */
        cases: {
            id: string;
            stage: FlowStage;
        }[];
    };
    /** Tier the run would use. */
    plannedTier: EvalTier;
    /**
     * Verifier families that are implemented. Waves 7.1–7.3 progressively flip
     * these to `true`; in Wave 7.0 every entry is `false` because no verifier
     * exists yet.
     */
    verifiersAvailable: {
        structural: boolean;
        rules: boolean;
        judge: boolean;
        workflow: boolean;
    };
    /** Human-readable remarks about the planned run. */
    notes: string[];
}
|
|
42
|
+
/**
 * Wave 7.0 eval runner. It:
 *   - loads the resolved config (defaults + file + env);
 *   - loads the corpus, which is empty on a fresh install;
 *   - checks that no verifier flag requests a capability that does not exist yet;
 *   - returns either a dry-run summary or an empty report.
 *
 * From Wave 7.1 on, the "no verifiers available" branch gives way to the real
 * verifier dispatch pipeline. The signature is kept stable so the CLI wiring
 * does not churn.
 */
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
+
import { loadCorpus } from "./corpus.js";
|
|
4
|
+
import { loadEvalConfig } from "./config-loader.js";
|
|
5
|
+
/**
 * Count eval cases per stage.
 *
 * @param {Array<{stage: string}>} cases - Cases to tally.
 * @returns {Record<string, number>} Case count keyed by stage, in order of
 *   first appearance.
 */
function groupByStage(cases) {
    const counts = {};
    for (const { stage } of cases) {
        counts[stage] = (counts[stage] ?? 0) + 1;
    }
    return counts;
}
|
|
11
|
+
/**
 * Wave 7.0 eval runner. It:
 *   - loads the resolved config (defaults + file + env);
 *   - loads the corpus, which is empty on a fresh install;
 *   - records a note for every verifier flag whose capability does not exist yet;
 *   - returns either a dry-run summary or a report in which every case is
 *     marked as skipped.
 *
 * From Wave 7.1 on, the "no verifiers available" branch gives way to the real
 * verifier dispatch pipeline. The signature is kept stable so the CLI wiring
 * does not churn.
 *
 * @param {object} options - See `RunEvalOptions`.
 * @returns {Promise<object>} A dry-run summary when `options.dryRun === true`,
 *   otherwise an eval report.
 */
export async function runEval(options) {
    const { projectRoot, stage } = options;
    const config = await loadEvalConfig(projectRoot, options.env ?? process.env);
    const corpus = await loadCorpus(projectRoot, stage);
    const plannedTier = options.tier ?? config.defaultTier;
    // Each pair is [condition, note]; notes are kept for the truthy conditions.
    const candidateNotes = [
        [corpus.length === 0, "Corpus is empty. Seed cases land in Wave 7.1 (`.cclaw/evals/corpus/<stage>/*.yaml`)."],
        [options.schemaOnly, "--schema-only is accepted; structural verifiers wire up in Wave 7.1."],
        [options.rules, "--rules is accepted; rule verifiers wire up in Wave 7.2."],
        [options.judge, "--judge is accepted; LLM judging wires up in Wave 7.3."]
    ];
    const notes = candidateNotes
        .filter(([applies]) => applies)
        .map(([, text]) => text);
    if (options.dryRun === true) {
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map(({ id, stage: caseStage }) => ({ id, stage: caseStage }))
            },
            plannedTier,
            verifiersAvailable: {
                structural: false,
                rules: false,
                judge: false,
                workflow: false
            },
            notes
        };
    }
    const generatedAt = new Date().toISOString();
    // No verifier exists yet, so every case carries one placeholder result
    // flagged as skipped.
    const placeholderResult = () => ({
        kind: "structural",
        id: "wave-7-0-skeleton",
        ok: false,
        message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
        details: { skipped: true }
    });
    const caseResults = corpus.map((item) => ({
        caseId: item.id,
        stage: item.stage,
        tier: plannedTier,
        passed: false,
        durationMs: 0,
        verifierResults: [placeholderResult()]
    }));
    const caseCount = caseResults.length;
    return {
        schemaVersion: 1,
        generatedAt,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        tier: plannedTier,
        stages: stage ? [stage] : [],
        cases: caseResults,
        summary: {
            totalCases: caseCount,
            passed: 0,
            failed: 0,
            skipped: caseCount,
            totalCostUsd: 0,
            totalDurationMs: 0
        }
    };
}
|