cclaw-cli 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +2 -1
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/config-loader.js +96 -3
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +113 -20
- package/dist/eval/llm-client.js +242 -10
- package/dist/eval/report.js +26 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +145 -12
- package/dist/eval/types.d.ts +103 -1
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
3
|
+
*
|
|
4
|
+
* Each file maps to exactly one `RubricDoc` that drives the LLM judge.
|
|
5
|
+
* Validation is strict: unknown top-level keys, missing required fields,
|
|
6
|
+
* duplicate check ids, and malformed weights all surface as actionable
|
|
7
|
+
* errors rather than turning into silent "judge had nothing to score"
|
|
8
|
+
* passes.
|
|
9
|
+
*/
|
|
10
|
+
import fs from "node:fs/promises";
|
|
11
|
+
import path from "node:path";
|
|
12
|
+
import { parse } from "yaml";
|
|
13
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
14
|
+
import { exists } from "../fs-utils.js";
|
|
15
|
+
import { FLOW_STAGES } from "../types.js";
|
|
16
|
+
/**
 * Build the path of the directory that holds the rubric YAML files.
 *
 * @param projectRoot Project root directory.
 * @returns `projectRoot`, `EVALS_ROOT` and `"rubrics"` joined with `path.join`.
 */
export function rubricsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "rubrics"];
    return path.join(...segments);
}
|
|
19
|
+
/**
 * Build the path of the rubric file for one stage.
 *
 * @param projectRoot Project root directory.
 * @param stage Stage name; used verbatim as the file's base name.
 * @returns `<rubricsDir(projectRoot)>/<stage>.yaml`.
 */
export function rubricPath(projectRoot, stage) {
    const fileName = `${stage}.yaml`;
    return path.join(rubricsDir(projectRoot), fileName);
}
|
|
22
|
+
/**
 * Create (but do not throw) the Error used for every rubric validation failure.
 *
 * @param file Path of the offending rubric file, echoed in the message.
 * @param reason Specific problem description.
 * @returns An Error whose message is the failure line followed by a schema hint.
 */
function rubricError(file, reason) {
    const headline = `Invalid rubric at ${file}: ${reason}`;
    const schemaHint = "See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.";
    return new Error([headline, schemaHint].join("\n"));
}
|
|
26
|
+
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param value Any value.
 * @returns `true` for objects other than `null` and arrays, otherwise `false`.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
29
|
+
/**
 * Validate one entry of a rubric's `checks` array.
 *
 * Fields are checked in a fixed order (id, prompt, scale, weight, critical,
 * then unknown keys) and the first problem found is thrown.
 *
 * @param raw The raw entry to validate.
 * @param index Position of the entry in `checks`, used in error messages.
 * @param file Rubric file path, used in error messages.
 * @returns An object with `id`, the trimmed `prompt`, and — only when present
 *   in `raw` — the trimmed `scale`, `weight`, and `critical`.
 * @throws Error (built by `rubricError`) on the first invalid field or on any
 *   unknown key.
 */
function validateCheck(raw, index, file) {
    if (!isRecord(raw)) {
        throw rubricError(file, `checks[${index}] must be a mapping`);
    }
    const isBlank = (text) => text.trim().length === 0;
    const { id, prompt, scale, weight, critical } = raw;

    if (typeof id !== "string" || isBlank(id)) {
        throw rubricError(file, `checks[${index}].id must be a non-empty string`);
    }
    // The pattern is applied to the untrimmed id, so surrounding whitespace is rejected here.
    const kebabCase = /^[a-z][a-z0-9-]*$/;
    if (!kebabCase.test(id)) {
        throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
    }

    if (typeof prompt !== "string" || isBlank(prompt)) {
        throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
    }

    const result = { id, prompt: prompt.trim() };

    if (scale !== undefined) {
        if (typeof scale !== "string" || isBlank(scale)) {
            throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
        }
        result.scale = scale.trim();
    }

    if (weight !== undefined) {
        const weightIsValid = typeof weight === "number" && Number.isFinite(weight) && weight >= 0;
        if (!weightIsValid) {
            throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
        }
        result.weight = weight;
    }

    if (critical !== undefined) {
        if (typeof critical !== "boolean") {
            throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
        }
        result.critical = critical;
    }

    // Unknown keys are reported last, after every known field has passed.
    const allowedKeys = ["id", "prompt", "scale", "weight", "critical"];
    const extras = Object.keys(raw).filter((key) => !allowedKeys.includes(key));
    if (extras.length > 0) {
        throw rubricError(file, `checks[${index}] has unknown key(s): ${extras.join(", ")}`);
    }
    return result;
}
|
|
73
|
+
/**
 * Validate the parsed top-level value of a rubric file.
 *
 * @param raw Parsed YAML value.
 * @param file Rubric file path, used in error messages.
 * @returns `{ stage, id, checks }`, where `id` is the trimmed `raw.id` or,
 *   when `raw.id` is absent, the stage; `checks` holds the validated entries
 *   in their original order.
 * @throws Error (built by `rubricError`) when `raw` is not a mapping, `stage`
 *   is not in `FLOW_STAGES`, `id` is present but blank or not a string,
 *   `checks` is not a non-empty array, a check is invalid or has a duplicate
 *   id, or an unknown top-level key is present.
 */
function validateRubric(raw, file) {
    if (!isRecord(raw)) {
        throw rubricError(file, "top-level value must be a mapping");
    }
    const { stage, id, checks } = raw;

    const stageIsKnown = typeof stage === "string" && FLOW_STAGES.includes(stage);
    if (!stageIsKnown) {
        throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
    }

    if (id !== undefined && (typeof id !== "string" || id.trim().length === 0)) {
        throw rubricError(file, `"id" must be a non-empty string when provided`);
    }
    const rubricId = id === undefined ? stage : id.trim();

    if (!Array.isArray(checks) || checks.length === 0) {
        throw rubricError(file, `"checks" must be a non-empty array`);
    }

    const seenIds = new Set();
    const validated = [];
    for (const [position, entry] of checks.entries()) {
        const check = validateCheck(entry, position, file);
        if (seenIds.has(check.id)) {
            throw rubricError(file, `duplicate check id: "${check.id}"`);
        }
        seenIds.add(check.id);
        validated.push(check);
    }

    // Unknown top-level keys are reported only after stage, id and checks pass.
    const allowedKeys = ["stage", "id", "checks"];
    const extras = Object.keys(raw).filter((key) => !allowedKeys.includes(key));
    if (extras.length > 0) {
        throw rubricError(file, `unknown top-level key(s): ${extras.join(", ")}`);
    }

    return { stage, id: rubricId, checks: validated };
}
|
|
114
|
+
/**
 * Load and validate the rubric for `stage`.
 *
 * Returns `undefined` when the rubric file does not exist, so callers can
 * emit a "no rubric" verifier result rather than crashing — authors are
 * expected to grow rubrics incrementally.
 *
 * @param projectRoot Project root directory.
 * @param stage Stage whose rubric file (`<stage>.yaml`) should be loaded.
 * @returns The validated rubric document, or `undefined` when the file is missing.
 * @throws Error (built by `rubricError`) when the file cannot be read or
 *   parsed, when validation fails, or when the `stage` declared inside the
 *   file differs from the requested `stage`.
 */
export async function loadRubric(projectRoot, stage) {
    const file = rubricPath(projectRoot, stage);
    if (!(await exists(file)))
        return undefined;
    let parsed;
    try {
        parsed = parse(await fs.readFile(file, "utf8"));
    }
    catch (err) {
        throw rubricError(file, err instanceof Error ? err.message : String(err));
    }
    const doc = validateRubric(parsed, file);
    // The file is looked up by name, so a different stage declared inside it
    // would be returned under the wrong stage. Fail loudly instead.
    if (doc.stage !== stage) {
        throw rubricError(file, `"stage" is ${JSON.stringify(doc.stage)} but the file name expects ${JSON.stringify(stage)}`);
    }
    return doc;
}
|
|
132
|
+
/**
 * Load the rubric of every stage in `FLOW_STAGES` that has one.
 *
 * Stages are loaded one at a time, in `FLOW_STAGES` order; stages for which
 * `loadRubric` returns nothing are left out of the result.
 *
 * @param projectRoot Project root directory.
 * @returns A Map from stage to its loaded rubric document.
 */
export async function loadAllRubrics(projectRoot) {
    const byStage = new Map();
    for (const stage of FLOW_STAGES) {
        const rubric = await loadRubric(projectRoot, stage);
        if (!rubric) {
            continue;
        }
        byStage.set(stage, rubric);
    }
    return byStage;
}
|
|
142
|
+
/** Validators exposed for tests. */
export const __internal = {
    validateRubric: validateRubric,
    validateCheck: validateCheck
};
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
|
+
import { type EvalLlmClient } from "./llm-client.js";
|
|
2
3
|
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
4
|
export interface RunEvalOptions {
|
|
4
5
|
projectRoot: string;
|
|
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
|
|
|
14
15
|
dryRun?: boolean;
|
|
15
16
|
/** Override process.env during tests. */
|
|
16
17
|
env?: NodeJS.ProcessEnv;
|
|
18
|
+
/**
|
|
19
|
+
* Optional LLM client injection. Primary use case: unit and
|
|
20
|
+
* integration tests that want deterministic judge + agent behavior
|
|
21
|
+
* without hitting the network.
|
|
22
|
+
*/
|
|
23
|
+
llmClient?: EvalLlmClient;
|
|
17
24
|
}
|
|
18
25
|
export interface DryRunSummary {
|
|
19
26
|
kind: "dry-run";
|
package/dist/eval/runner.js
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { runSingleShot } from "./agents/single-shot.js";
|
|
4
5
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
6
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
6
7
|
import { loadEvalConfig } from "./config-loader.js";
|
|
8
|
+
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
9
|
+
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
10
|
+
import { loadAllRubrics } from "./rubric-loader.js";
|
|
11
|
+
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
7
12
|
import { verifyRules } from "./verifiers/rules.js";
|
|
8
13
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
9
14
|
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
@@ -26,16 +31,38 @@ function skeletonVerifierResult(message, details) {
|
|
|
26
31
|
/**
|
|
27
32
|
* --schema-only narrows to structural. --rules opens up rules + traceability
|
|
28
33
|
* on top of structural (traceability is a rule-family verifier even though
|
|
29
|
-
* it lives in its own module).
|
|
30
|
-
*
|
|
34
|
+
* it lives in its own module). --judge opens up the LLM judge and, for
|
|
35
|
+
* Tier A, the single-shot agent-under-test. --schema-only always wins so
|
|
36
|
+
* the LLM-free PR gate never pays for tokens even if stale flags collide.
|
|
31
37
|
*/
|
|
32
38
|
function resolveRunFlags(options) {
    // --schema-only switches off everything except the structural verifier.
    const schemaOnly = options.schemaOnly === true;
    const ruleFamilyEnabled = options.rules === true && !schemaOnly;
    const judgeEnabled = options.judge === true && !schemaOnly;
    // The agent-under-test runs only with the judge, and only for tier "A" (the default tier).
    const tier = options.tier ?? "A";
    const agentEnabled = judgeEnabled && tier === "A";
    return {
        runStructural: true,
        runRules: ruleFamilyEnabled,
        runTraceability: ruleFamilyEnabled,
        runJudge: judgeEnabled,
        runAgent: agentEnabled
    };
}
|
|
52
|
+
/**
|
|
53
|
+
* Wrap a client so every chat() result is accounted against the cost
|
|
54
|
+
* guard before being returned. The guard throws
|
|
55
|
+
* DailyCostCapExceededError if committing the call would cross the
|
|
56
|
+
* configured cap — the runner surfaces that as a hard failure so
|
|
57
|
+
* nightly CI fails loud instead of silently overspending.
|
|
58
|
+
*/
|
|
59
|
+
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
|
|
60
|
+
return {
|
|
61
|
+
async chat(request) {
|
|
62
|
+
const response = await client.chat(request);
|
|
63
|
+
await costGuard.commit(response.model || fallbackModel, response.usage);
|
|
64
|
+
return response;
|
|
65
|
+
}
|
|
39
66
|
};
|
|
40
67
|
}
|
|
41
68
|
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
@@ -54,17 +81,61 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
|
54
81
|
return undefined;
|
|
55
82
|
}
|
|
56
83
|
}
|
|
57
|
-
async function runCase(
|
|
84
|
+
async function runCase(ctx) {
|
|
85
|
+
const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
|
|
58
86
|
const started = Date.now();
|
|
59
87
|
const verifierResults = [];
|
|
60
88
|
const expected = caseEntry.expected;
|
|
89
|
+
let caseCostUsd = 0;
|
|
61
90
|
const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
|
|
62
91
|
const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
|
|
63
92
|
const hasTraceability = flags.runTraceability && !!expected?.traceability;
|
|
64
|
-
const
|
|
93
|
+
const judgeRequested = flags.runJudge && !!expected?.judge;
|
|
94
|
+
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
65
95
|
let artifact;
|
|
66
96
|
if (needsArtifact) {
|
|
67
|
-
|
|
97
|
+
if (flags.runAgent && judgeRequested && client) {
|
|
98
|
+
try {
|
|
99
|
+
const produced = await runSingleShot({
|
|
100
|
+
caseEntry,
|
|
101
|
+
config,
|
|
102
|
+
projectRoot,
|
|
103
|
+
client
|
|
104
|
+
});
|
|
105
|
+
artifact = produced.artifact;
|
|
106
|
+
caseCostUsd += produced.usageUsd;
|
|
107
|
+
verifierResults.push({
|
|
108
|
+
kind: "workflow",
|
|
109
|
+
id: "agent:single-shot",
|
|
110
|
+
ok: true,
|
|
111
|
+
score: 1,
|
|
112
|
+
message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
|
|
113
|
+
details: {
|
|
114
|
+
model: produced.model,
|
|
115
|
+
tokensIn: produced.usage.promptTokens,
|
|
116
|
+
tokensOut: produced.usage.completionTokens,
|
|
117
|
+
usageUsd: produced.usageUsd,
|
|
118
|
+
attempts: produced.attempts
|
|
119
|
+
}
|
|
120
|
+
});
|
|
121
|
+
}
|
|
122
|
+
catch (err) {
|
|
123
|
+
if (err instanceof DailyCostCapExceededError)
|
|
124
|
+
throw err;
|
|
125
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
126
|
+
verifierResults.push({
|
|
127
|
+
kind: "workflow",
|
|
128
|
+
id: "agent:single-shot",
|
|
129
|
+
ok: false,
|
|
130
|
+
score: 0,
|
|
131
|
+
message: err instanceof Error ? err.message : String(err),
|
|
132
|
+
details: { retryable }
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
else {
|
|
137
|
+
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
138
|
+
}
|
|
68
139
|
if (artifact === undefined && verifierResults.length === 0) {
|
|
69
140
|
verifierResults.push({
|
|
70
141
|
kind: "structural",
|
|
@@ -111,6 +182,46 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
|
|
|
111
182
|
});
|
|
112
183
|
}
|
|
113
184
|
}
|
|
185
|
+
if (judgeRequested && artifact !== undefined && client) {
|
|
186
|
+
const rubric = rubrics.get(caseEntry.stage);
|
|
187
|
+
if (!rubric) {
|
|
188
|
+
verifierResults.push({
|
|
189
|
+
kind: "judge",
|
|
190
|
+
id: "judge:rubric:missing",
|
|
191
|
+
ok: false,
|
|
192
|
+
score: 0,
|
|
193
|
+
message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
|
|
194
|
+
details: { stage: caseEntry.stage }
|
|
195
|
+
});
|
|
196
|
+
}
|
|
197
|
+
else {
|
|
198
|
+
try {
|
|
199
|
+
const invocation = await runJudge({
|
|
200
|
+
artifact,
|
|
201
|
+
rubric,
|
|
202
|
+
config,
|
|
203
|
+
client,
|
|
204
|
+
caseHint: expected.judge
|
|
205
|
+
});
|
|
206
|
+
caseCostUsd += invocation.usageUsd;
|
|
207
|
+
const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
|
|
208
|
+
verifierResults.push(...judgeVerifiers);
|
|
209
|
+
}
|
|
210
|
+
catch (err) {
|
|
211
|
+
if (err instanceof DailyCostCapExceededError)
|
|
212
|
+
throw err;
|
|
213
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
214
|
+
verifierResults.push({
|
|
215
|
+
kind: "judge",
|
|
216
|
+
id: "judge:invocation:error",
|
|
217
|
+
ok: false,
|
|
218
|
+
score: 0,
|
|
219
|
+
message: err instanceof Error ? err.message : String(err),
|
|
220
|
+
details: { retryable, rubricId: rubric.id }
|
|
221
|
+
});
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
}
|
|
114
225
|
const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
|
|
115
226
|
const allOk = nonSkippedResults.length === 0
|
|
116
227
|
? verifierResults.every((r) => r.ok)
|
|
@@ -121,6 +232,7 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
|
|
|
121
232
|
tier: plannedTier,
|
|
122
233
|
passed: allOk,
|
|
123
234
|
durationMs: Date.now() - started,
|
|
235
|
+
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
124
236
|
verifierResults
|
|
125
237
|
};
|
|
126
238
|
}
|
|
@@ -173,10 +285,13 @@ export async function runEval(options) {
|
|
|
173
285
|
if (corpus.length === 0) {
|
|
174
286
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
175
287
|
}
|
|
176
|
-
if (options.judge) {
|
|
177
|
-
notes.push("--judge is accepted; LLM judging is not wired yet.");
|
|
178
|
-
}
|
|
179
288
|
const flags = resolveRunFlags(options);
|
|
289
|
+
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
290
|
+
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
291
|
+
}
|
|
292
|
+
if ((options.tier ?? "A") !== "A" && flags.runJudge) {
|
|
293
|
+
notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
|
|
294
|
+
}
|
|
180
295
|
if (options.dryRun === true) {
|
|
181
296
|
const summary = {
|
|
182
297
|
kind: "dry-run",
|
|
@@ -190,17 +305,35 @@ export async function runEval(options) {
|
|
|
190
305
|
verifiersAvailable: {
|
|
191
306
|
structural: flags.runStructural,
|
|
192
307
|
rules: flags.runRules,
|
|
193
|
-
judge:
|
|
194
|
-
workflow:
|
|
308
|
+
judge: flags.runJudge,
|
|
309
|
+
workflow: flags.runAgent
|
|
195
310
|
},
|
|
196
311
|
notes
|
|
197
312
|
};
|
|
198
313
|
return summary;
|
|
199
314
|
}
|
|
315
|
+
const costGuard = createCostGuard(options.projectRoot, config);
|
|
316
|
+
let wrappedClient;
|
|
317
|
+
if (flags.runJudge) {
|
|
318
|
+
const base = options.llmClient ?? createEvalClient(config);
|
|
319
|
+
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
320
|
+
}
|
|
321
|
+
const rubrics = flags.runJudge
|
|
322
|
+
? await loadAllRubrics(options.projectRoot)
|
|
323
|
+
: new Map();
|
|
200
324
|
const now = new Date().toISOString();
|
|
201
325
|
const caseResults = [];
|
|
202
326
|
for (const item of corpus) {
|
|
203
|
-
caseResults.push(await runCase(
|
|
327
|
+
caseResults.push(await runCase({
|
|
328
|
+
projectRoot: options.projectRoot,
|
|
329
|
+
caseEntry: item,
|
|
330
|
+
plannedTier,
|
|
331
|
+
flags,
|
|
332
|
+
config,
|
|
333
|
+
client: wrappedClient,
|
|
334
|
+
costGuard,
|
|
335
|
+
rubrics
|
|
336
|
+
}));
|
|
204
337
|
}
|
|
205
338
|
const stages = stagesInResults(caseResults);
|
|
206
339
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -114,6 +114,31 @@ export interface TraceabilityExpected {
|
|
|
114
114
|
*/
|
|
115
115
|
requireIn: string[];
|
|
116
116
|
}
|
|
117
|
+
/**
|
|
118
|
+
* LLM-judge expectations — Step 3.
|
|
119
|
+
*
|
|
120
|
+
* When present, the judge runs against the resolved artifact (live-agent
|
|
121
|
+
* output in Tier A, or the committed fixture as a stand-in while the Tier
|
|
122
|
+
* B/C agents are not wired; `--schema-only` disables the judge). Every field below is
|
|
123
|
+
* optional; the case-level hint overlays the stage-level rubric loaded
|
|
124
|
+
* from `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
125
|
+
*/
|
|
126
|
+
export interface JudgeExpected {
|
|
127
|
+
/**
|
|
128
|
+
* Per-case check ids that MUST be present in the stage rubric. Used when
|
|
129
|
+
* a case wants to assert the rubric covers scenario-specific properties.
|
|
130
|
+
*/
|
|
131
|
+
requiredChecks?: string[];
|
|
132
|
+
/**
|
|
133
|
+
* Stage rubric identifier when a stage ships multiple rubrics (e.g.
|
|
134
|
+
* "strict" vs. "lenient"). Defaults to the stage name.
|
|
135
|
+
*/
|
|
136
|
+
rubric?: string;
|
|
137
|
+
/** Optional override of `config.judgeSamples` for the case. */
|
|
138
|
+
samples?: number;
|
|
139
|
+
/** Per-check minimum score (1..5 scale). Fail when any score drops below. */
|
|
140
|
+
minimumScores?: Record<string, number>;
|
|
141
|
+
}
|
|
117
142
|
/** Superset of per-verifier expectation shapes. */
|
|
118
143
|
export interface ExpectedShape {
|
|
119
144
|
structural?: StructuralExpected;
|
|
@@ -122,7 +147,7 @@ export interface ExpectedShape {
|
|
|
122
147
|
/** Cross-stage ID propagation checks — Step 2. */
|
|
123
148
|
traceability?: TraceabilityExpected;
|
|
124
149
|
/** LLM-judge rubrics — Step 3. */
|
|
125
|
-
judge?:
|
|
150
|
+
judge?: JudgeExpected;
|
|
126
151
|
}
|
|
127
152
|
/**
|
|
128
153
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
@@ -228,6 +253,26 @@ export interface EvalConfig {
|
|
|
228
253
|
timeoutMs: number;
|
|
229
254
|
/** Max retries per API call on transient failures. */
|
|
230
255
|
maxRetries: number;
|
|
256
|
+
/**
|
|
257
|
+
* Number of judge samples per case (median-of-N). Defaults to 3 when unset.
|
|
258
|
+
* Must be odd so a true median exists.
|
|
259
|
+
*/
|
|
260
|
+
judgeSamples?: number;
|
|
261
|
+
/** Sampling temperature for judge calls. Defaults to 0.0. */
|
|
262
|
+
judgeTemperature?: number;
|
|
263
|
+
/** Sampling temperature for the agent-under-test. Defaults to 0.2. */
|
|
264
|
+
agentTemperature?: number;
|
|
265
|
+
/**
|
|
266
|
+
* Optional per-model USD pricing used by the cost guard. Keys match
|
|
267
|
+
* `model` / `judgeModel`. Values in USD per 1K tokens, so
|
|
268
|
+
* `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
|
|
269
|
+
*/
|
|
270
|
+
tokenPricing?: Record<string, TokenPricing>;
|
|
271
|
+
}
|
|
272
|
+
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
273
|
+
export interface TokenPricing {
|
|
274
|
+
input: number;
|
|
275
|
+
output: number;
|
|
231
276
|
}
|
|
232
277
|
/** Resolved config with env overrides applied. */
|
|
233
278
|
export interface ResolvedEvalConfig extends EvalConfig {
|
|
@@ -279,3 +324,60 @@ export interface BaselineRegression {
|
|
|
279
324
|
previousScore?: number;
|
|
280
325
|
currentScore?: number;
|
|
281
326
|
}
|
|
327
|
+
/**
|
|
328
|
+
* One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
|
|
329
|
+
* 5 means "the artifact fully meets the bar described by `prompt`".
|
|
330
|
+
*/
|
|
331
|
+
export interface RubricCheck {
|
|
332
|
+
/** Kebab-case slug, unique per rubric. Stable across runs. */
|
|
333
|
+
id: string;
|
|
334
|
+
/** Natural-language question posed to the judge. */
|
|
335
|
+
prompt: string;
|
|
336
|
+
/** Human-readable scale description rendered in judge prompts. */
|
|
337
|
+
scale?: string;
|
|
338
|
+
/** Relative weight for the stage's aggregate score. Defaults to 1.0. */
|
|
339
|
+
weight?: number;
|
|
340
|
+
/**
|
|
341
|
+
* When true, any sample below `config.regression.failIfCriticalBelow`
|
|
342
|
+
* flips the verifier to `ok:false` (not just a score drop).
|
|
343
|
+
*/
|
|
344
|
+
critical?: boolean;
|
|
345
|
+
}
|
|
346
|
+
/** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
|
|
347
|
+
export interface RubricDoc {
|
|
348
|
+
stage: FlowStage;
|
|
349
|
+
/** Rubric variant label; optional in the YAML file, where it defaults to the stage name. */
|
|
350
|
+
id: string;
|
|
351
|
+
checks: RubricCheck[];
|
|
352
|
+
}
|
|
353
|
+
/**
|
|
354
|
+
* Judge response for a single sample (one API call). The judge is asked to
|
|
355
|
+
* return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
|
|
356
|
+
* `rationales[id]` is a short plain-text explanation, useful in reports but
|
|
357
|
+
* never used for gating.
|
|
358
|
+
*/
|
|
359
|
+
export interface JudgeSample {
|
|
360
|
+
scores: Record<string, number>;
|
|
361
|
+
rationales: Record<string, string>;
|
|
362
|
+
}
|
|
363
|
+
/** Aggregated judge output across N samples, per rubric check. */
|
|
364
|
+
export interface JudgeAggregate {
|
|
365
|
+
checkId: string;
|
|
366
|
+
samples: number[];
|
|
367
|
+
median: number;
|
|
368
|
+
mean: number;
|
|
369
|
+
/** True iff every sample returned a score for this check. */
|
|
370
|
+
coverage: boolean;
|
|
371
|
+
}
|
|
372
|
+
/**
|
|
373
|
+
* Judge invocation result. Produced by `runJudge` and consumed by the
|
|
374
|
+
* runner: the runner converts each aggregate into a `VerifierResult` and
|
|
375
|
+
* records `usageUsd` toward the per-case cost.
|
|
376
|
+
*/
|
|
377
|
+
export interface JudgeInvocation {
|
|
378
|
+
rubricId: string;
|
|
379
|
+
samples: JudgeSample[];
|
|
380
|
+
aggregates: JudgeAggregate[];
|
|
381
|
+
usageUsd: number;
|
|
382
|
+
durationMs: number;
|
|
383
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge verifier — Step 3.
|
|
3
|
+
*
|
|
4
|
+
* Given an artifact and the stage's rubric, runs N judge samples (default
|
|
5
|
+
* median-of-3) against the configured LLM, aggregates the per-check
|
|
6
|
+
* scores, and returns one VerifierResult per rubric check plus one
|
|
7
|
+
* aggregate result covering the whole stage.
|
|
8
|
+
*
|
|
9
|
+
* Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
|
|
10
|
+
* so unit tests inject a stub EvalLlmClient and assert on the aggregate
|
|
11
|
+
* math without touching the network.
|
|
12
|
+
*/
|
|
13
|
+
import { type EvalLlmClient } from "../llm-client.js";
|
|
14
|
+
import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
|
|
15
|
+
export interface RunJudgeOptions {
|
|
16
|
+
artifact: string;
|
|
17
|
+
rubric: RubricDoc;
|
|
18
|
+
config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
|
|
19
|
+
client: EvalLlmClient;
|
|
20
|
+
/** Per-case hint that overlays the rubric (sample count, minimums). */
|
|
21
|
+
caseHint?: JudgeExpected;
|
|
22
|
+
/** Optional base seed; incremented per sample for reproducibility. */
|
|
23
|
+
baseSeed?: number;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Parse one judge response into a JudgeSample. The parser is intentionally
|
|
27
|
+
* forgiving with rationales (missing -> empty string) but strict with
|
|
28
|
+
* scores: missing or non-numeric entries are dropped and the coverage
|
|
29
|
+
* flag on the aggregate flips to false.
|
|
30
|
+
*/
|
|
31
|
+
export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
|
|
32
|
+
/** Run the judge against an artifact and return per-sample + aggregate data. */
|
|
33
|
+
export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
|
|
34
|
+
/**
|
|
35
|
+
* Convert a JudgeInvocation into VerifierResult[] for the runner. One
|
|
36
|
+
* result per rubric check (score 0..1 normalized from the 1..5 median) +
|
|
37
|
+
* one "coverage" result that flips to `ok:false` when any sample failed
|
|
38
|
+
* to emit a score for a check.
|
|
39
|
+
*/
|
|
40
|
+
export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];
|