cclaw-cli 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -2
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/config-loader.js +96 -3
- package/dist/eval/corpus.d.ts +11 -0
- package/dist/eval/corpus.js +162 -7
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +113 -20
- package/dist/eval/llm-client.js +242 -10
- package/dist/eval/report.js +26 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +213 -34
- package/dist/eval/types.d.ts +171 -4
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/eval/verifiers/rules.d.ts +24 -0
- package/dist/eval/verifiers/rules.js +218 -0
- package/dist/eval/verifiers/traceability.d.ts +23 -0
- package/dist/eval/verifiers/traceability.js +84 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,251 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* LLM client for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Thin adapter over the `openai` SDK pointed at any OpenAI-compatible
|
|
5
|
+
* `baseURL` (z.ai, OpenAI, vLLM, Ollama+openai-proxy, ...). The surface is
|
|
6
|
+
* deliberately narrow:
|
|
7
|
+
*
|
|
8
|
+
* - `chat()` — one request/response round-trip with timeout, bounded
|
|
9
|
+
* retries on transient errors, and a structured error hierarchy so
|
|
10
|
+
* callers can react policy-style (cost-guard, judge, agent-under-test).
|
|
11
|
+
* - `ChatRequest` / `ChatResponse` — wire format decoupled from the
|
|
12
|
+
* OpenAI types so swapping vendors stays a one-file change.
|
|
13
|
+
*
|
|
14
|
+
* Factories stay side-effect-free: no network calls are made until `chat()`
|
|
15
|
+
* is invoked, so CLI help and dry-run paths never need an API key.
|
|
16
|
+
*/
|
|
17
|
+
import OpenAI from "openai";
|
|
18
|
+
/**
 * Root of the eval LLM error hierarchy, so callers can write
 * `catch (err) { if (err instanceof EvalLlmError) ... }`.
 *
 * Instance fields:
 * - `retryable` — consulted by the retry loop in `chat()`; falsy means "give up now".
 * - `status`    — HTTP status; only assigned when one was supplied.
 * - `cause`     — underlying error; only assigned when one was supplied.
 *
 * @param message Human-readable description.
 * @param opts `{ retryable, status?, cause? }`. When `opts` is omitted the error
 *   is treated as non-retryable instead of throwing a TypeError from inside
 *   the constructor (which would mask the failure being reported).
 */
export class EvalLlmError extends Error {
    retryable;
    status;
    constructor(message, opts) {
        super(message);
        // `opts ?? {}` keeps `new EvalLlmError("msg")` usable.
        const { retryable, status, cause } = opts ?? {};
        this.name = "EvalLlmError";
        this.retryable = retryable ?? false;
        if (status !== undefined)
            this.status = status;
        // Assigned (not passed to `super`) so `cause` stays an ordinary own property.
        if (cause !== undefined)
            this.cause = cause;
    }
}
|
|
32
|
+
/**
 * Non-retryable error for requests the provider rejected for auth reasons.
 *
 * @param cause The provider error. If it carries `status === 403` that status
 *   is preserved; in every other case the status is reported as 401 (the
 *   previous hard-coded value), so a "forbidden" response is no longer
 *   mislabelled as "unauthorized".
 */
export class EvalLlmAuthError extends EvalLlmError {
    constructor(cause) {
        const reported = cause !== null && typeof cause === "object" ? cause.status : undefined;
        super("LLM request rejected (auth). Check CCLAW_EVAL_API_KEY and provider permissions.", {
            retryable: false,
            status: reported === 403 ? 403 : 401,
            cause
        });
        this.name = "EvalLlmAuthError";
    }
}
|
|
42
|
+
/**
 * Non-retryable error carrying a caller-supplied message and an optional cause.
 *
 * @param message Description of the problem.
 * @param cause Underlying error, if any.
 */
export class EvalLlmConfigError extends EvalLlmError {
    constructor(message, cause) {
        const settings = { retryable: false, cause };
        super(message, settings);
        this.name = "EvalLlmConfigError";
    }
}
|
|
48
|
+
/**
 * Retryable error produced when a request is aborted by the timeout.
 *
 * @param timeoutMs The timeout that elapsed, in milliseconds (used in the message).
 */
export class EvalLlmTimeoutError extends EvalLlmError {
    constructor(timeoutMs) {
        const text = `LLM request timed out after ${timeoutMs}ms.`;
        const settings = { retryable: true };
        super(text, settings);
        this.name = "EvalLlmTimeoutError";
    }
}
|
|
54
|
+
/**
 * Retryable error for HTTP 429 responses.
 *
 * The message text is fixed; it is the same whether or not another attempt
 * actually follows.
 *
 * @param cause The provider error that signalled the rate limit.
 */
export class EvalLlmRateLimitedError extends EvalLlmError {
    constructor(cause) {
        const settings = { retryable: true, status: 429, cause };
        super("LLM rate limit hit. Retrying with backoff.", settings);
        this.name = "EvalLlmRateLimitedError";
    }
}
|
|
64
|
+
/**
 * Retryable catch-all for failures that are not auth, rate-limit, timeout or
 * another 4xx rejection.
 *
 * @param cause Underlying error.
 * @param status HTTP status when one is known; otherwise `undefined`.
 */
export class EvalLlmTransportError extends EvalLlmError {
    constructor(cause, status) {
        const settings = { retryable: true, status, cause };
        super("LLM transport error.", settings);
        this.name = "EvalLlmTransportError";
    }
}
|
|
70
|
+
/**
 * Non-retryable error for a response that arrived but cannot be used.
 *
 * @param message Description of what was wrong with the response.
 * @param details Optional extra context; stored on `this.details` only when truthy.
 */
export class EvalLlmInvalidResponseError extends EvalLlmError {
    constructor(message, details) {
        const settings = { retryable: false };
        super(message, settings);
        this.name = "EvalLlmInvalidResponseError";
        if (!details)
            return;
        this.details = details;
    }
}
|
|
78
|
+
/**
 * Non-retryable error thrown when no API key is available at call time.
 * The message tells the user which environment variables to set and which
 * offline flags avoid the need for a key.
 */
export class EvalLlmNotConfiguredError extends EvalLlmError {
    constructor() {
        const text = [
            "LLM client not configured. Set CCLAW_EVAL_API_KEY (and optionally",
            "CCLAW_EVAL_BASE_URL / CCLAW_EVAL_MODEL) or run with --schema-only / --rules."
        ].join(" ");
        super(text, { retryable: false });
        this.name = "EvalLlmNotConfiguredError";
    }
}
|
|
8
85
|
/**
 * @deprecated Shim kept so older wiring keeps compiling. Use
 * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API key"
 * case.
 *
 * Adds nothing to its parent: the inherited constructor runs unchanged, so
 * instances still report `name === "EvalLlmNotConfiguredError"`.
 */
export class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
}
|
|
92
|
+
/**
 * Retry settings used when the caller supplies none:
 * up to 2 retries, backoff starting at 500 ms and capped at 8000 ms.
 */
export const DEFAULT_RETRY_POLICY = {
    maxRetries: 2,
    initialBackoffMs: 500,
    maxBackoffMs: 8000
};
|
|
97
|
+
/**
 * Reports whether `err` represents an aborted request.
 *
 * Recognised shapes:
 * - `name === "AbortError"` or `code === "ABORT_ERR"` / `"ERR_CANCELED"`;
 * - an instance of a class named `APIUserAbortError`. The `openai` SDK is
 *   understood to throw that class (without setting `name`) when the signal
 *   passed to a request is aborted — NOTE(review): confirm against the pinned
 *   SDK version. Without this check a timeout would be reported as a generic
 *   transport error.
 *
 * @param err Any thrown value.
 * @returns `true` when `err` looks like an abort, `false` otherwise (including non-objects).
 */
function isAbortError(err) {
    if (err === null || typeof err !== "object")
        return false;
    const { name, code } = err;
    if (name === "AbortError" || code === "ABORT_ERR" || code === "ERR_CANCELED")
        return true;
    // `constructor` can be missing on prototype-less objects, hence `?.`.
    return err.constructor?.name === "APIUserAbortError";
}
|
|
104
|
+
/**
 * Pulls a numeric `status` off a thrown value.
 *
 * @param err Any thrown value.
 * @returns `err.status` when `err` is a non-null object and the property is a
 *   number; `undefined` in every other case.
 */
function statusFromError(err) {
    const isObject = typeof err === "object" && err !== null;
    if (!isObject)
        return undefined;
    const { status } = err;
    if (typeof status !== "number")
        return undefined;
    return status;
}
|
|
110
|
+
/**
 * Maps any thrown value onto the `EvalLlmError` hierarchy.
 *
 * Resolution order:
 * 1. an existing `EvalLlmError` is returned untouched;
 * 2. abort-shaped errors become `EvalLlmTimeoutError`;
 * 3. 401 / 403 become `EvalLlmAuthError`;
 * 4. 429 becomes `EvalLlmRateLimitedError`;
 * 5. 408 (Request Timeout) is transient, so it becomes a retryable
 *    `EvalLlmTransportError` instead of a permanent 4xx rejection;
 * 6. any other 4xx becomes a non-retryable `EvalLlmError`;
 * 7. everything else (5xx, no status) becomes `EvalLlmTransportError`.
 *
 * @param err The thrown value.
 * @param timeoutMs Timeout in effect for the request; only used for the timeout message.
 * @returns An `EvalLlmError` (or subclass) instance; never throws.
 */
function normalizeError(err, timeoutMs) {
    if (err instanceof EvalLlmError)
        return err;
    if (isAbortError(err))
        return new EvalLlmTimeoutError(timeoutMs);
    const status = statusFromError(err);
    if (status === 401 || status === 403)
        return new EvalLlmAuthError(err);
    if (status === 429)
        return new EvalLlmRateLimitedError(err);
    if (status === 408)
        return new EvalLlmTransportError(err, status);
    if (status !== undefined && status >= 400 && status < 500) {
        return new EvalLlmError(`LLM request rejected (HTTP ${status}).`, {
            retryable: false,
            status,
            cause: err
        });
    }
    return new EvalLlmTransportError(err, status);
}
|
|
129
|
+
/**
 * Collapses the provider's finish reason into one of
 * `"length"`, `"tool_calls"`, `"content_filter"` or `"stop"`.
 *
 * `"function_call"` is folded into `"tool_calls"`; `null`, `undefined` and any
 * unrecognised value fall back to `"stop"`.
 *
 * @param raw Finish reason as reported by the provider.
 */
function normalizeFinishReason(raw) {
    if (raw === "length" || raw === "content_filter")
        return raw;
    if (raw === "tool_calls" || raw === "function_call")
        return "tool_calls";
    return "stop";
}
|
|
145
|
+
/**
 * Translates a `ChatRequest` into the snake_case request body sent to the
 * chat-completions endpoint.
 *
 * `model` and `messages` are always present. Each optional field
 * (`max_tokens`, `temperature`, `seed`, `tools`, `tool_choice`) is emitted
 * only when the corresponding request property is not `undefined`, and
 * `response_format` only when `responseFormatJson` is exactly `true`.
 *
 * @param request The vendor-neutral chat request.
 * @returns A fresh body object; `request` is not modified.
 */
function buildBody(request) {
    // Per-message mapping: `name` / `tool_call_id` appear only when set.
    const toWireMessage = ({ role, content, name, toolCallId }) => Object.assign({ role, content }, name === undefined ? null : { name }, toolCallId === undefined ? null : { tool_call_id: toolCallId });
    return {
        model: request.model,
        messages: request.messages.map((m) => toWireMessage(m)),
        ...(request.maxTokens === undefined ? {} : { max_tokens: request.maxTokens }),
        ...(request.temperature === undefined ? {} : { temperature: request.temperature }),
        ...(request.seed === undefined ? {} : { seed: request.seed }),
        ...(request.tools === undefined ? {} : { tools: request.tools }),
        ...(request.toolChoice === undefined ? {} : { tool_choice: request.toolChoice }),
        ...(request.responseFormatJson === true ? { response_format: { type: "json_object" } } : {})
    };
}
|
|
170
|
+
/**
 * Timer-backed sleep used between retries when no `sleep` override is given.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
function defaultSleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
173
|
+
/**
 * Exponential backoff: `initialBackoffMs` doubled once per attempt, capped at
 * `maxBackoffMs`.
 *
 * @param attempt Zero-based index of the attempt that just failed.
 * @param policy Object providing `initialBackoffMs` and `maxBackoffMs`.
 * @returns Delay in milliseconds before the next attempt.
 */
function backoffDelay(attempt, policy) {
    const { initialBackoffMs, maxBackoffMs } = policy;
    const exponential = initialBackoffMs * Math.pow(2, attempt);
    return Math.min(exponential, maxBackoffMs);
}
|
|
177
|
+
/**
 * Issues one `chat.completions.create` call that is aborted after `timeoutMs`.
 * The timer is cleared in `finally`, so it never outlives the request —
 * whether the call resolves or rejects.
 */
async function requestWithTimeout(client, body, timeoutMs) {
    const controller = new AbortController();
    const handle = setTimeout(() => controller.abort(), timeoutMs);
    try {
        return await client.chat.completions.create(body, { signal: controller.signal });
    }
    finally {
        clearTimeout(handle);
    }
}
/**
 * Converts a raw completion payload into the `ChatResponse` shape.
 * Throws `EvalLlmInvalidResponseError` when there is no first choice. Tool
 * calls lacking a `function` member are skipped rather than dereferenced.
 */
function toChatResponse(raw, request, attempts) {
    const choice = raw?.choices?.[0];
    if (!choice) {
        throw new EvalLlmInvalidResponseError("LLM response contained no choices.", { model: raw?.model });
    }
    const rawCalls = choice.message?.tool_calls;
    const toolCalls = (Array.isArray(rawCalls) ? rawCalls : [])
        .filter((call) => call?.function != null)
        .map((call) => ({
        id: call.id,
        name: call.function.name,
        arguments: call.function.arguments
    }));
    return {
        content: choice.message?.content ?? "",
        ...(toolCalls.length > 0 ? { toolCalls } : {}),
        usage: {
            promptTokens: raw.usage?.prompt_tokens ?? 0,
            completionTokens: raw.usage?.completion_tokens ?? 0,
            totalTokens: raw.usage?.total_tokens ?? 0
        },
        finishReason: normalizeFinishReason(choice.finish_reason),
        model: raw.model ?? request.model,
        attempts
    };
}
/**
 * Build a real client pointed at the configured endpoint.
 *
 * Construction is side-effect-free: the underlying SDK client is created
 * lazily on the first `chat()` call, and `EvalLlmNotConfiguredError` is thrown
 * at call time (not construction time) when no API key is available, so CLI
 * help and dry-run paths stay offline-safe.
 *
 * `chat(request)` performs up to `maxRetries + 1` attempts. Only errors whose
 * normalized form is `retryable` are retried, with exponential backoff between
 * attempts. Response parsing happens outside the retry loop, so a malformed
 * payload is never mistaken for a transient transport failure and re-sent.
 *
 * NOTE(review): the `openai` SDK client has its own retry setting; unless the
 * injected/default factory disables it, SDK-level retries may stack with the
 * loop here — confirm the intended total attempt count.
 *
 * @param config Resolved eval config; reads `apiKey`, `baseUrl`, `timeoutMs`, `maxRetries`.
 * @param options Optional overrides: `retryPolicy`, `sleep`, `openaiFactory`.
 * @returns An object exposing `chat(request)`.
 * @throws EvalLlmNotConfiguredError from `chat()` when `config.apiKey` is falsy.
 * @throws EvalLlmError (or a subclass) from `chat()` for every request failure.
 */
export function createEvalClient(config, options = {}) {
    const MIN_TIMEOUT_MS = 1_000;
    // Largest delay the timer accepts; larger values overflow and fire almost immediately.
    const MAX_TIMEOUT_MS = 2_147_483_647;
    // Used only when neither the request nor the config carries a finite timeout.
    const FALLBACK_TIMEOUT_MS = 60_000;
    const basePolicy = options.retryPolicy ?? {
        ...DEFAULT_RETRY_POLICY,
        maxRetries: config.maxRetries ?? DEFAULT_RETRY_POLICY.maxRetries
    };
    // A negative or NaN retry count would otherwise produce zero attempts and
    // surface as a meaningless "unknown" transport error.
    const requestedRetries = Number(basePolicy.maxRetries);
    const retryPolicy = {
        ...basePolicy,
        maxRetries: Number.isNaN(requestedRetries)
            ? DEFAULT_RETRY_POLICY.maxRetries
            : Math.max(0, Math.floor(requestedRetries))
    };
    const sleep = options.sleep ?? defaultSleep;
    let cached;
    const getClient = () => {
        if (cached)
            return cached;
        if (!config.apiKey)
            throw new EvalLlmNotConfiguredError();
        const factory = options.openaiFactory ??
            ((opts) => new OpenAI(opts));
        cached = factory({ apiKey: config.apiKey, baseURL: config.baseUrl });
        return cached;
    };
    return {
        async chat(request) {
            // `Math.max(1000, undefined)` is NaN, which the timer treats as
            // "fire immediately" — guard against a missing/non-finite timeout.
            const requestedTimeout = request.timeoutMs ?? config.timeoutMs;
            const timeoutMs = Number.isFinite(requestedTimeout)
                ? Math.min(MAX_TIMEOUT_MS, Math.max(MIN_TIMEOUT_MS, requestedTimeout))
                : FALLBACK_TIMEOUT_MS;
            const body = buildBody(request);
            const client = getClient();
            const maxAttempts = retryPolicy.maxRetries + 1;
            let lastError;
            for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
                let raw;
                try {
                    raw = await requestWithTimeout(client, body, timeoutMs);
                }
                catch (err) {
                    const normalized = normalizeError(err, timeoutMs);
                    lastError = normalized;
                    const isLastAttempt = attempt === maxAttempts - 1;
                    if (!normalized.retryable || isLastAttempt)
                        throw normalized;
                    await sleep(backoffDelay(attempt, retryPolicy));
                    continue;
                }
                // Parsed outside the try: a bad payload is final, not retryable.
                return toChatResponse(raw, request, attempt + 1);
            }
            throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
        }
    };
}
|
package/dist/eval/report.js
CHANGED
|
@@ -75,6 +75,32 @@ export function formatMarkdownReport(report) {
|
|
|
75
75
|
lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
|
|
76
76
|
}
|
|
77
77
|
lines.push(``);
|
|
78
|
+
const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
|
|
79
|
+
if (judgeCases.length > 0) {
|
|
80
|
+
lines.push(`## Judge scores`);
|
|
81
|
+
lines.push(``);
|
|
82
|
+
lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
|
|
83
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
84
|
+
for (const item of judgeCases) {
|
|
85
|
+
for (const verifier of item.verifierResults) {
|
|
86
|
+
if (verifier.kind !== "judge")
|
|
87
|
+
continue;
|
|
88
|
+
if (verifier.id === "judge:required-checks")
|
|
89
|
+
continue;
|
|
90
|
+
if (verifier.id === "judge:rubric:missing")
|
|
91
|
+
continue;
|
|
92
|
+
if (verifier.id === "judge:invocation:error")
|
|
93
|
+
continue;
|
|
94
|
+
const details = verifier.details ?? {};
|
|
95
|
+
const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
|
|
96
|
+
const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
|
|
97
|
+
const coverage = details.coverage === true ? "yes" : "no";
|
|
98
|
+
const checkId = verifier.id.replace(/^judge:/, "");
|
|
99
|
+
lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
lines.push(``);
|
|
103
|
+
}
|
|
78
104
|
lines.push(`## Verifier details`);
|
|
79
105
|
lines.push(``);
|
|
80
106
|
for (const item of report.cases) {
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { RubricCheck, RubricDoc } from "./types.js";
|
|
3
|
+
export declare function rubricsDir(projectRoot: string): string;
|
|
4
|
+
export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
|
|
5
|
+
declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
|
|
6
|
+
declare function validateRubric(raw: unknown, file: string): RubricDoc;
|
|
7
|
+
/**
|
|
8
|
+
* Load the rubric for `stage`. Returns `undefined` when the file is
|
|
9
|
+
* missing so callers can emit a "no rubric" verifier result rather than
|
|
10
|
+
* crashing — authors are expected to grow rubrics incrementally.
|
|
11
|
+
*/
|
|
12
|
+
export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
|
|
13
|
+
/** Load every rubric present in the given rubrics directory. */
|
|
14
|
+
export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
|
|
15
|
+
/** Exposed for tests. */
|
|
16
|
+
export declare const __internal: {
|
|
17
|
+
validateRubric: typeof validateRubric;
|
|
18
|
+
validateCheck: typeof validateCheck;
|
|
19
|
+
};
|
|
20
|
+
export {};
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
3
|
+
*
|
|
4
|
+
* Each file maps to exactly one `RubricDoc` that drives the LLM judge.
|
|
5
|
+
* Validation is strict: unknown top-level keys, missing required fields,
|
|
6
|
+
* duplicate check ids, and malformed weights all surface as actionable
|
|
7
|
+
* errors rather than turning into silent "judge had nothing to score"
|
|
8
|
+
* passes.
|
|
9
|
+
*/
|
|
10
|
+
import fs from "node:fs/promises";
|
|
11
|
+
import path from "node:path";
|
|
12
|
+
import { parse } from "yaml";
|
|
13
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
14
|
+
import { exists } from "../fs-utils.js";
|
|
15
|
+
import { FLOW_STAGES } from "../types.js";
|
|
16
|
+
/**
 * Directory holding the rubric files for a project.
 *
 * @param projectRoot Project root path.
 * @returns `<projectRoot>/<EVALS_ROOT>/rubrics`.
 */
export function rubricsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "rubrics"];
    return path.join(...segments);
}
|
|
19
|
+
/**
 * Path of the rubric file for one stage.
 *
 * @param projectRoot Project root path.
 * @param stage Flow stage; becomes the file's base name.
 * @returns `<rubricsDir>/<stage>.yaml`.
 */
export function rubricPath(projectRoot, stage) {
    const fileName = `${stage}.yaml`;
    const directory = rubricsDir(projectRoot);
    return path.join(directory, fileName);
}
|
|
22
|
+
/**
 * Builds (does not throw) the error reported for a bad rubric file.
 * The message names the file and the reason, followed by a pointer to the
 * schema documentation.
 *
 * @param file Path of the offending rubric file.
 * @param reason What is wrong with it.
 * @returns A plain `Error`.
 */
function rubricError(file, reason) {
    const headline = `Invalid rubric at ${file}: ${reason}`;
    const hint = `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`;
    return new Error([headline, hint].join("\n"));
}
|
|
26
|
+
/**
 * True for non-null objects that are not arrays.
 *
 * @param value Any value.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value))
        return false;
    return typeof value === "object";
}
|
|
29
|
+
/**
 * Validates one entry of a rubric's `checks` array.
 *
 * Checks run in a fixed order and the first failure is thrown: mapping shape,
 * `id` (non-empty, kebab-case), `prompt` (non-empty), then the optional
 * `scale`, `weight`, `critical`, and finally unknown keys.
 *
 * @param raw The parsed entry.
 * @param index Position in `checks`, used in error messages.
 * @param file Rubric file path, used in error messages.
 * @returns `{ id, prompt }` plus whichever of `scale`, `weight`, `critical`
 *   were provided; `prompt` and `scale` are trimmed.
 * @throws Error (built by `rubricError`) on the first violation.
 */
function validateCheck(raw, index, file) {
    const where = `checks[${index}]`;
    const isBlank = (value) => typeof value !== "string" || value.trim().length === 0;
    if (!isRecord(raw)) {
        throw rubricError(file, `${where} must be a mapping`);
    }
    const { id, prompt, scale, weight, critical } = raw;
    if (isBlank(id)) {
        throw rubricError(file, `${where}.id must be a non-empty string`);
    }
    const kebabCase = /^[a-z][a-z0-9-]*$/;
    if (!kebabCase.test(id)) {
        throw rubricError(file, `${where}.id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
    }
    if (isBlank(prompt)) {
        throw rubricError(file, `${where}.prompt must be a non-empty string`);
    }
    const check = { id, prompt: prompt.trim() };
    if (scale !== undefined) {
        if (isBlank(scale)) {
            throw rubricError(file, `${where}.scale must be a non-empty string when provided`);
        }
        check.scale = scale.trim();
    }
    if (weight !== undefined) {
        const validWeight = typeof weight === "number" && Number.isFinite(weight) && weight >= 0;
        if (!validWeight) {
            throw rubricError(file, `${where}.weight must be a non-negative number when provided`);
        }
        check.weight = weight;
    }
    if (critical !== undefined) {
        if (typeof critical !== "boolean") {
            throw rubricError(file, `${where}.critical must be a boolean when provided`);
        }
        check.critical = critical;
    }
    // Reject anything outside the schema so typos do not pass silently.
    const allowed = ["id", "prompt", "scale", "weight", "critical"];
    const extras = Object.keys(raw).filter((key) => !allowed.includes(key));
    if (extras.length > 0) {
        throw rubricError(file, `${where} has unknown key(s): ${extras.join(", ")}`);
    }
    return check;
}
|
|
73
|
+
/**
 * Validates a parsed rubric document.
 *
 * Checks run in a fixed order and the first failure is thrown: mapping shape,
 * `stage` (must be one of `FLOW_STAGES`), optional `id`, `checks` (non-empty
 * array, each entry validated, ids unique), and finally unknown top-level keys.
 *
 * @param raw Parsed YAML value.
 * @param file Rubric file path, used in error messages.
 * @returns `{ stage, id, checks }`, where `id` defaults to `stage` when absent
 *   and is trimmed when present.
 * @throws Error (built by `rubricError`) on the first violation.
 */
function validateRubric(raw, file) {
    if (!isRecord(raw)) {
        throw rubricError(file, "top-level value must be a mapping");
    }
    const { stage, id, checks } = raw;
    const stageKnown = typeof stage === "string" && FLOW_STAGES.includes(stage);
    if (!stageKnown) {
        throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
    }
    let rubricId;
    if (id === undefined) {
        rubricId = stage;
    }
    else if (typeof id === "string" && id.trim().length > 0) {
        rubricId = id.trim();
    }
    else {
        throw rubricError(file, `"id" must be a non-empty string when provided`);
    }
    if (!Array.isArray(checks) || checks.length === 0) {
        throw rubricError(file, `"checks" must be a non-empty array`);
    }
    const accepted = [];
    checks.forEach((entry, position) => {
        const check = validateCheck(entry, position, file);
        if (accepted.some((earlier) => earlier.id === check.id)) {
            throw rubricError(file, `duplicate check id: "${check.id}"`);
        }
        accepted.push(check);
    });
    const allowed = ["stage", "id", "checks"];
    const extras = Object.keys(raw).filter((key) => !allowed.includes(key));
    if (extras.length > 0) {
        throw rubricError(file, `unknown top-level key(s): ${extras.join(", ")}`);
    }
    return { stage, id: rubricId, checks: accepted };
}
|
|
114
|
+
/**
 * Load the rubric for `stage`. Returns `undefined` when the file is
 * missing so callers can emit a "no rubric" verifier result rather than
 * crashing — authors are expected to grow rubrics incrementally.
 *
 * The file is read directly instead of being probed first: a file that
 * disappears between a probe and the read would otherwise be reported as an
 * "Invalid rubric" error rather than as missing. `ENOENT` and `ENOTDIR` from
 * the read both count as "missing".
 *
 * NOTE(review): the document's own `stage` field is not compared with the
 * `stage` argument here — confirm whether a mismatch should be rejected.
 *
 * @param projectRoot Project root path.
 * @param stage Flow stage whose rubric should be loaded.
 * @returns The validated rubric, or `undefined` when no file exists.
 * @throws Error (built by `rubricError`) when the file cannot be read for any
 *   other reason, is not valid YAML, or fails validation.
 */
export async function loadRubric(projectRoot, stage) {
    const file = rubricPath(projectRoot, stage);
    let text;
    try {
        text = await fs.readFile(file, "utf8");
    }
    catch (err) {
        const code = err !== null && typeof err === "object" ? err.code : undefined;
        if (code === "ENOENT" || code === "ENOTDIR")
            return undefined;
        throw rubricError(file, err instanceof Error ? err.message : String(err));
    }
    let parsed;
    try {
        parsed = parse(text);
    }
    catch (err) {
        throw rubricError(file, err instanceof Error ? err.message : String(err));
    }
    return validateRubric(parsed, file);
}
|
|
132
|
+
/**
 * Loads the rubric of every stage in `FLOW_STAGES`, one after another.
 *
 * @param projectRoot Project root path.
 * @returns A `Map` from stage to rubric, in `FLOW_STAGES` order; stages
 *   without a rubric are left out.
 */
export async function loadAllRubrics(projectRoot) {
    const rubrics = new Map();
    for (let i = 0; i < FLOW_STAGES.length; i += 1) {
        const stage = FLOW_STAGES[i];
        const rubric = await loadRubric(projectRoot, stage);
        if (!rubric)
            continue;
        rubrics.set(stage, rubric);
    }
    return rubrics;
}
|
|
142
|
+
/** Validators exported solely so tests can call them directly. */
export const __internal = {
    validateRubric,
    validateCheck
};
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
|
+
import { type EvalLlmClient } from "./llm-client.js";
|
|
2
3
|
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
4
|
export interface RunEvalOptions {
|
|
4
5
|
projectRoot: string;
|
|
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
|
|
|
14
15
|
dryRun?: boolean;
|
|
15
16
|
/** Override process.env during tests. */
|
|
16
17
|
env?: NodeJS.ProcessEnv;
|
|
18
|
+
/**
|
|
19
|
+
* Optional LLM client injection. Primary use case: unit and
|
|
20
|
+
* integration tests that want deterministic judge + agent behavior
|
|
21
|
+
* without hitting the network.
|
|
22
|
+
*/
|
|
23
|
+
llmClient?: EvalLlmClient;
|
|
17
24
|
}
|
|
18
25
|
export interface DryRunSummary {
|
|
19
26
|
kind: "dry-run";
|