cclaw-cli 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -2
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/config-loader.js +96 -3
- package/dist/eval/corpus.d.ts +11 -0
- package/dist/eval/corpus.js +162 -7
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +113 -20
- package/dist/eval/llm-client.js +242 -10
- package/dist/eval/report.js +26 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +213 -34
- package/dist/eval/types.d.ts +171 -4
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/eval/verifiers/rules.d.ts +24 -0
- package/dist/eval/verifiers/rules.js +218 -0
- package/dist/eval/verifiers/traceability.d.ts +23 -0
- package/dist/eval/verifiers/traceability.js +84 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
package/dist/eval/runner.js
CHANGED
|
@@ -1,10 +1,17 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { runSingleShot } from "./agents/single-shot.js";
|
|
4
5
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
|
-
import { loadCorpus, readFixtureArtifact } from "./corpus.js";
|
|
6
|
+
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
6
7
|
import { loadEvalConfig } from "./config-loader.js";
|
|
8
|
+
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
9
|
+
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
10
|
+
import { loadAllRubrics } from "./rubric-loader.js";
|
|
11
|
+
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
12
|
+
import { verifyRules } from "./verifiers/rules.js";
|
|
7
13
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
14
|
+
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
8
15
|
function groupByStage(cases) {
|
|
9
16
|
return cases.reduce((acc, item) => {
|
|
10
17
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
@@ -21,33 +28,131 @@ function skeletonVerifierResult(message, details) {
|
|
|
21
28
|
...(details !== undefined ? { details } : {})
|
|
22
29
|
};
|
|
23
30
|
}
|
|
24
|
-
|
|
31
|
+
/**
|
|
32
|
+
* --schema-only narrows to structural. --rules opens up rules + traceability
|
|
33
|
+
* on top of structural (traceability is a rule-family verifier even though
|
|
34
|
+
* it lives in its own module). --judge opens up the LLM judge and, for
|
|
35
|
+
* Tier A, the single-shot agent-under-test. --schema-only always wins so
|
|
36
|
+
* the LLM-free PR gate never pays for tokens even if stale flags collide.
|
|
37
|
+
*/
|
|
38
|
+
function resolveRunFlags(options) {
|
|
39
|
+
const rulesRequested = options.rules === true;
|
|
40
|
+
const schemaOnly = options.schemaOnly === true;
|
|
41
|
+
const judgeRequested = options.judge === true;
|
|
42
|
+
const runJudge = judgeRequested && !schemaOnly;
|
|
43
|
+
const runAgent = runJudge && (options.tier ?? "A") === "A";
|
|
44
|
+
return {
|
|
45
|
+
runStructural: true,
|
|
46
|
+
runRules: rulesRequested && !schemaOnly,
|
|
47
|
+
runTraceability: rulesRequested && !schemaOnly,
|
|
48
|
+
runJudge,
|
|
49
|
+
runAgent
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Wrap a client so every chat() result is accounted against the cost
|
|
54
|
+
* guard before being returned. The guard throws
|
|
55
|
+
* DailyCostCapExceededError if committing the call would cross the
|
|
56
|
+
* configured cap — the runner surfaces that as a hard failure so
|
|
57
|
+
* nightly CI fails loud instead of silently overspending.
|
|
58
|
+
*/
|
|
59
|
+
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
|
|
60
|
+
return {
|
|
61
|
+
async chat(request) {
|
|
62
|
+
const response = await client.chat(request);
|
|
63
|
+
await costGuard.commit(response.model || fallbackModel, response.usage);
|
|
64
|
+
return response;
|
|
65
|
+
}
|
|
66
|
+
};
|
|
67
|
+
}
|
|
68
|
+
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
69
|
+
try {
|
|
70
|
+
return await readFixtureArtifact(projectRoot, caseEntry);
|
|
71
|
+
}
|
|
72
|
+
catch (err) {
|
|
73
|
+
verifierResults.push({
|
|
74
|
+
kind: "structural",
|
|
75
|
+
id: "structural:fixture:missing",
|
|
76
|
+
ok: false,
|
|
77
|
+
score: 0,
|
|
78
|
+
message: err instanceof Error ? err.message : String(err),
|
|
79
|
+
details: { fixture: caseEntry.fixture }
|
|
80
|
+
});
|
|
81
|
+
return undefined;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
async function runCase(ctx) {
|
|
85
|
+
const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
|
|
25
86
|
const started = Date.now();
|
|
26
|
-
const structuralExpected = caseEntry.expected?.structural;
|
|
27
87
|
const verifierResults = [];
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
88
|
+
const expected = caseEntry.expected;
|
|
89
|
+
let caseCostUsd = 0;
|
|
90
|
+
const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
|
|
91
|
+
const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
|
|
92
|
+
const hasTraceability = flags.runTraceability && !!expected?.traceability;
|
|
93
|
+
const judgeRequested = flags.runJudge && !!expected?.judge;
|
|
94
|
+
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
95
|
+
let artifact;
|
|
96
|
+
if (needsArtifact) {
|
|
97
|
+
if (flags.runAgent && judgeRequested && client) {
|
|
98
|
+
try {
|
|
99
|
+
const produced = await runSingleShot({
|
|
100
|
+
caseEntry,
|
|
101
|
+
config,
|
|
102
|
+
projectRoot,
|
|
103
|
+
client
|
|
104
|
+
});
|
|
105
|
+
artifact = produced.artifact;
|
|
106
|
+
caseCostUsd += produced.usageUsd;
|
|
107
|
+
verifierResults.push({
|
|
108
|
+
kind: "workflow",
|
|
109
|
+
id: "agent:single-shot",
|
|
110
|
+
ok: true,
|
|
111
|
+
score: 1,
|
|
112
|
+
message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
|
|
113
|
+
details: {
|
|
114
|
+
model: produced.model,
|
|
115
|
+
tokensIn: produced.usage.promptTokens,
|
|
116
|
+
tokensOut: produced.usage.completionTokens,
|
|
117
|
+
usageUsd: produced.usageUsd,
|
|
118
|
+
attempts: produced.attempts
|
|
119
|
+
}
|
|
120
|
+
});
|
|
121
|
+
}
|
|
122
|
+
catch (err) {
|
|
123
|
+
if (err instanceof DailyCostCapExceededError)
|
|
124
|
+
throw err;
|
|
125
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
126
|
+
verifierResults.push({
|
|
127
|
+
kind: "workflow",
|
|
128
|
+
id: "agent:single-shot",
|
|
129
|
+
ok: false,
|
|
130
|
+
score: 0,
|
|
131
|
+
message: err instanceof Error ? err.message : String(err),
|
|
132
|
+
details: { retryable }
|
|
133
|
+
});
|
|
134
|
+
}
|
|
38
135
|
}
|
|
39
|
-
|
|
136
|
+
else {
|
|
137
|
+
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
138
|
+
}
|
|
139
|
+
if (artifact === undefined && verifierResults.length === 0) {
|
|
40
140
|
verifierResults.push({
|
|
41
141
|
kind: "structural",
|
|
42
|
-
id: "structural:fixture:
|
|
142
|
+
id: "structural:fixture:absent",
|
|
43
143
|
ok: false,
|
|
44
144
|
score: 0,
|
|
45
|
-
message:
|
|
46
|
-
details: {
|
|
145
|
+
message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
|
|
146
|
+
details: { fixtureProvided: false }
|
|
47
147
|
});
|
|
48
148
|
}
|
|
49
|
-
|
|
50
|
-
|
|
149
|
+
}
|
|
150
|
+
if (flags.runStructural) {
|
|
151
|
+
if (!hasStructural) {
|
|
152
|
+
verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
|
|
153
|
+
}
|
|
154
|
+
else if (artifact !== undefined) {
|
|
155
|
+
const results = verifyStructural(artifact, expected.structural);
|
|
51
156
|
if (results.length === 0) {
|
|
52
157
|
verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
|
|
53
158
|
}
|
|
@@ -55,24 +160,79 @@ async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
|
|
|
55
160
|
verifierResults.push(...results);
|
|
56
161
|
}
|
|
57
162
|
}
|
|
58
|
-
|
|
163
|
+
}
|
|
164
|
+
if (hasRules && artifact !== undefined) {
|
|
165
|
+
const results = verifyRules(artifact, expected.rules);
|
|
166
|
+
verifierResults.push(...results);
|
|
167
|
+
}
|
|
168
|
+
if (hasTraceability && artifact !== undefined) {
|
|
169
|
+
try {
|
|
170
|
+
const extras = await readExtraFixtures(projectRoot, caseEntry);
|
|
171
|
+
const results = verifyTraceability(artifact, extras, expected.traceability);
|
|
172
|
+
verifierResults.push(...results);
|
|
173
|
+
}
|
|
174
|
+
catch (err) {
|
|
59
175
|
verifierResults.push({
|
|
60
|
-
kind: "
|
|
61
|
-
id: "
|
|
176
|
+
kind: "rules",
|
|
177
|
+
id: "traceability:fixture:missing",
|
|
62
178
|
ok: false,
|
|
63
179
|
score: 0,
|
|
64
|
-
message:
|
|
65
|
-
details: {
|
|
180
|
+
message: err instanceof Error ? err.message : String(err),
|
|
181
|
+
details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
|
|
66
182
|
});
|
|
67
183
|
}
|
|
68
184
|
}
|
|
69
|
-
|
|
185
|
+
if (judgeRequested && artifact !== undefined && client) {
|
|
186
|
+
const rubric = rubrics.get(caseEntry.stage);
|
|
187
|
+
if (!rubric) {
|
|
188
|
+
verifierResults.push({
|
|
189
|
+
kind: "judge",
|
|
190
|
+
id: "judge:rubric:missing",
|
|
191
|
+
ok: false,
|
|
192
|
+
score: 0,
|
|
193
|
+
message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
|
|
194
|
+
details: { stage: caseEntry.stage }
|
|
195
|
+
});
|
|
196
|
+
}
|
|
197
|
+
else {
|
|
198
|
+
try {
|
|
199
|
+
const invocation = await runJudge({
|
|
200
|
+
artifact,
|
|
201
|
+
rubric,
|
|
202
|
+
config,
|
|
203
|
+
client,
|
|
204
|
+
caseHint: expected.judge
|
|
205
|
+
});
|
|
206
|
+
caseCostUsd += invocation.usageUsd;
|
|
207
|
+
const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
|
|
208
|
+
verifierResults.push(...judgeVerifiers);
|
|
209
|
+
}
|
|
210
|
+
catch (err) {
|
|
211
|
+
if (err instanceof DailyCostCapExceededError)
|
|
212
|
+
throw err;
|
|
213
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
214
|
+
verifierResults.push({
|
|
215
|
+
kind: "judge",
|
|
216
|
+
id: "judge:invocation:error",
|
|
217
|
+
ok: false,
|
|
218
|
+
score: 0,
|
|
219
|
+
message: err instanceof Error ? err.message : String(err),
|
|
220
|
+
details: { retryable, rubricId: rubric.id }
|
|
221
|
+
});
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
|
|
226
|
+
const allOk = nonSkippedResults.length === 0
|
|
227
|
+
? verifierResults.every((r) => r.ok)
|
|
228
|
+
: nonSkippedResults.every((r) => r.ok);
|
|
70
229
|
return {
|
|
71
230
|
caseId: caseEntry.id,
|
|
72
231
|
stage: caseEntry.stage,
|
|
73
232
|
tier: plannedTier,
|
|
74
233
|
passed: allOk,
|
|
75
234
|
durationMs: Date.now() - started,
|
|
235
|
+
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
76
236
|
verifierResults
|
|
77
237
|
};
|
|
78
238
|
}
|
|
@@ -125,11 +285,12 @@ export async function runEval(options) {
|
|
|
125
285
|
if (corpus.length === 0) {
|
|
126
286
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
127
287
|
}
|
|
128
|
-
|
|
129
|
-
|
|
288
|
+
const flags = resolveRunFlags(options);
|
|
289
|
+
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
290
|
+
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
130
291
|
}
|
|
131
|
-
if (options.
|
|
132
|
-
notes.push("
|
|
292
|
+
if ((options.tier ?? "A") !== "A" && flags.runJudge) {
|
|
293
|
+
notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
|
|
133
294
|
}
|
|
134
295
|
if (options.dryRun === true) {
|
|
135
296
|
const summary = {
|
|
@@ -142,19 +303,37 @@ export async function runEval(options) {
|
|
|
142
303
|
},
|
|
143
304
|
plannedTier,
|
|
144
305
|
verifiersAvailable: {
|
|
145
|
-
structural:
|
|
146
|
-
rules:
|
|
147
|
-
judge:
|
|
148
|
-
workflow:
|
|
306
|
+
structural: flags.runStructural,
|
|
307
|
+
rules: flags.runRules,
|
|
308
|
+
judge: flags.runJudge,
|
|
309
|
+
workflow: flags.runAgent
|
|
149
310
|
},
|
|
150
311
|
notes
|
|
151
312
|
};
|
|
152
313
|
return summary;
|
|
153
314
|
}
|
|
315
|
+
const costGuard = createCostGuard(options.projectRoot, config);
|
|
316
|
+
let wrappedClient;
|
|
317
|
+
if (flags.runJudge) {
|
|
318
|
+
const base = options.llmClient ?? createEvalClient(config);
|
|
319
|
+
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
320
|
+
}
|
|
321
|
+
const rubrics = flags.runJudge
|
|
322
|
+
? await loadAllRubrics(options.projectRoot)
|
|
323
|
+
: new Map();
|
|
154
324
|
const now = new Date().toISOString();
|
|
155
325
|
const caseResults = [];
|
|
156
326
|
for (const item of corpus) {
|
|
157
|
-
caseResults.push(await
|
|
327
|
+
caseResults.push(await runCase({
|
|
328
|
+
projectRoot: options.projectRoot,
|
|
329
|
+
caseEntry: item,
|
|
330
|
+
plannedTier,
|
|
331
|
+
flags,
|
|
332
|
+
config,
|
|
333
|
+
client: wrappedClient,
|
|
334
|
+
costGuard,
|
|
335
|
+
rubrics
|
|
336
|
+
}));
|
|
158
337
|
}
|
|
159
338
|
const stages = stagesInResults(caseResults);
|
|
160
339
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -58,13 +58,96 @@ export interface StructuralExpected {
|
|
|
58
58
|
*/
|
|
59
59
|
requiredFrontmatterKeys?: string[];
|
|
60
60
|
}
|
|
61
|
-
/**
|
|
61
|
+
/**
|
|
62
|
+
* Rule-based expectations — zero-LLM content checks that are richer than
|
|
63
|
+
* structural (regex, numeric bounds, uniqueness). Introduced in Step 2.
|
|
64
|
+
*
|
|
65
|
+
* Every array field is optional; an empty `RulesExpected` produces zero
|
|
66
|
+
* verifier results so authors can enable rules incrementally.
|
|
67
|
+
*/
|
|
68
|
+
export interface RulesExpected {
|
|
69
|
+
/** Case-insensitive substrings the body must include at least once. */
|
|
70
|
+
mustContain?: string[];
|
|
71
|
+
/** Case-insensitive substrings the body must NOT include. */
|
|
72
|
+
mustNotContain?: string[];
|
|
73
|
+
/** Regex patterns that must match the body at least once. */
|
|
74
|
+
regexRequired?: RuleRegex[];
|
|
75
|
+
/** Regex patterns that must NOT match the body. */
|
|
76
|
+
regexForbidden?: RuleRegex[];
|
|
77
|
+
/** For each substring key, the body must contain at least N occurrences. */
|
|
78
|
+
minOccurrences?: Record<string, number>;
|
|
79
|
+
/** For each substring key, the body must contain at most N occurrences. */
|
|
80
|
+
maxOccurrences?: Record<string, number>;
|
|
81
|
+
/**
|
|
82
|
+
* For each named section (case-insensitive heading substring), every bullet
|
|
83
|
+
* (`- ...`) directly under the section must be unique. Catches duplicated
|
|
84
|
+
* decisions or repeated risks.
|
|
85
|
+
*/
|
|
86
|
+
uniqueBulletsInSection?: string[];
|
|
87
|
+
}
|
|
88
|
+
export interface RuleRegex {
|
|
89
|
+
/** Source of the regex. Parsed with `new RegExp(pattern, flags)`. */
|
|
90
|
+
pattern: string;
|
|
91
|
+
/** Optional regex flags; defaults to `"i"` for case-insensitive matching. */
|
|
92
|
+
flags?: string;
|
|
93
|
+
/** Human-readable label rendered in verifier messages and slugged into the id. */
|
|
94
|
+
description?: string;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Cross-stage traceability expectations — assert every ID extracted from
|
|
98
|
+
* `source` also appears in `self` and/or named `extraFixtures`. Introduced
|
|
99
|
+
* in Step 2.
|
|
100
|
+
*/
|
|
101
|
+
export interface TraceabilityExpected {
|
|
102
|
+
/** Regex applied to the `source` fixture to collect the authoritative ID set. */
|
|
103
|
+
idPattern: string;
|
|
104
|
+
/** Optional regex flags (defaults to `"g"`). */
|
|
105
|
+
idFlags?: string;
|
|
106
|
+
/**
|
|
107
|
+
* Where to read the authoritative ID set from. Either `"self"` (the case's
|
|
108
|
+
* primary `fixture`) or a label present in the case's `extraFixtures` map.
|
|
109
|
+
*/
|
|
110
|
+
source: string;
|
|
111
|
+
/**
|
|
112
|
+
* Where every source ID must also appear. Each entry is `"self"` or an
|
|
113
|
+
* `extraFixtures` label. Order is preserved for deterministic result ids.
|
|
114
|
+
*/
|
|
115
|
+
requireIn: string[];
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* LLM-judge expectations — Step 3.
|
|
119
|
+
*
|
|
120
|
+
* When present, the judge runs against the resolved artifact (single-shot
|
|
121
|
+
* agent output in Tier A, or the committed fixture in Tier B/C). `--judge`
|
|
122
|
+
* is ignored when `--schema-only` is set. Every field below is
|
|
123
|
+
* optional; the case-level hint overlays the stage-level rubric loaded
|
|
124
|
+
* from `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
125
|
+
*/
|
|
126
|
+
export interface JudgeExpected {
|
|
127
|
+
/**
|
|
128
|
+
* Per-case check ids that MUST be present in the stage rubric. Used when
|
|
129
|
+
* a case wants to assert the rubric covers scenario-specific properties.
|
|
130
|
+
*/
|
|
131
|
+
requiredChecks?: string[];
|
|
132
|
+
/**
|
|
133
|
+
* Stage rubric identifier when a stage ships multiple rubrics (e.g.
|
|
134
|
+
* "strict" vs. "lenient"). Defaults to the stage name.
|
|
135
|
+
*/
|
|
136
|
+
rubric?: string;
|
|
137
|
+
/** Optional override of `config.judgeSamples` for the case. */
|
|
138
|
+
samples?: number;
|
|
139
|
+
/** Per-check minimum score (1..5 scale). Fail when any score drops below. */
|
|
140
|
+
minimumScores?: Record<string, number>;
|
|
141
|
+
}
|
|
142
|
+
/** Superset of per-verifier expectation shapes. */
|
|
62
143
|
export interface ExpectedShape {
|
|
63
144
|
structural?: StructuralExpected;
|
|
64
|
-
/** Rule-based (keyword/regex/
|
|
65
|
-
rules?:
|
|
145
|
+
/** Rule-based (keyword/regex/count/uniqueness) checks — Step 2. */
|
|
146
|
+
rules?: RulesExpected;
|
|
147
|
+
/** Cross-stage ID propagation checks — Step 2. */
|
|
148
|
+
traceability?: TraceabilityExpected;
|
|
66
149
|
/** LLM-judge rubrics — Step 3. */
|
|
67
|
-
judge?:
|
|
150
|
+
judge?: JudgeExpected;
|
|
68
151
|
}
|
|
69
152
|
/**
|
|
70
153
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
@@ -89,6 +172,13 @@ export interface EvalCase {
|
|
|
89
172
|
* Step 1 development aid.
|
|
90
173
|
*/
|
|
91
174
|
fixture?: string;
|
|
175
|
+
/**
|
|
176
|
+
* Additional fixture paths loaded alongside the primary `fixture`, keyed
|
|
177
|
+
* by a free-form label. Consumed by cross-artifact verifiers (e.g.,
|
|
178
|
+
* traceability) introduced in Step 2. Paths are resolved relative to the
|
|
179
|
+
* case's stage directory, just like `fixture`.
|
|
180
|
+
*/
|
|
181
|
+
extraFixtures?: Record<string, string>;
|
|
92
182
|
}
|
|
93
183
|
/** Result of one verifier applied to one case. */
|
|
94
184
|
export interface VerifierResult {
|
|
@@ -163,6 +253,26 @@ export interface EvalConfig {
|
|
|
163
253
|
timeoutMs: number;
|
|
164
254
|
/** Max retries per API call on transient failures. */
|
|
165
255
|
maxRetries: number;
|
|
256
|
+
/**
|
|
257
|
+
* Number of judge samples per case (median-of-N). Defaults to 3 when unset.
|
|
258
|
+
* Must be odd so a true median exists.
|
|
259
|
+
*/
|
|
260
|
+
judgeSamples?: number;
|
|
261
|
+
/** Sampling temperature for judge calls. Defaults to 0.0. */
|
|
262
|
+
judgeTemperature?: number;
|
|
263
|
+
/** Sampling temperature for the agent-under-test. Defaults to 0.2. */
|
|
264
|
+
agentTemperature?: number;
|
|
265
|
+
/**
|
|
266
|
+
* Optional per-model USD pricing used by the cost guard. Keys match
|
|
267
|
+
* `model` / `judgeModel`. Values in USD per 1K tokens, so
|
|
268
|
+
* `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
|
|
269
|
+
*/
|
|
270
|
+
tokenPricing?: Record<string, TokenPricing>;
|
|
271
|
+
}
|
|
272
|
+
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
273
|
+
export interface TokenPricing {
|
|
274
|
+
input: number;
|
|
275
|
+
output: number;
|
|
166
276
|
}
|
|
167
277
|
/** Resolved config with env overrides applied. */
|
|
168
278
|
export interface ResolvedEvalConfig extends EvalConfig {
|
|
@@ -214,3 +324,60 @@ export interface BaselineRegression {
|
|
|
214
324
|
previousScore?: number;
|
|
215
325
|
currentScore?: number;
|
|
216
326
|
}
|
|
327
|
+
/**
|
|
328
|
+
* One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
|
|
329
|
+
* 5 means "the artifact fully meets the bar described by `prompt`".
|
|
330
|
+
*/
|
|
331
|
+
export interface RubricCheck {
|
|
332
|
+
/** Kebab-case slug, unique per rubric. Stable across runs. */
|
|
333
|
+
id: string;
|
|
334
|
+
/** Natural-language question posed to the judge. */
|
|
335
|
+
prompt: string;
|
|
336
|
+
/** Human-readable scale description rendered in judge prompts. */
|
|
337
|
+
scale?: string;
|
|
338
|
+
/** Relative weight for the stage's aggregate score. Defaults to 1.0. */
|
|
339
|
+
weight?: number;
|
|
340
|
+
/**
|
|
341
|
+
* When true, any sample below `config.regression.failIfCriticalBelow`
|
|
342
|
+
* flips the verifier to `ok:false` (not just a score drop).
|
|
343
|
+
*/
|
|
344
|
+
critical?: boolean;
|
|
345
|
+
}
|
|
346
|
+
/** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
|
|
347
|
+
export interface RubricDoc {
|
|
348
|
+
stage: FlowStage;
|
|
349
|
+
/** Optional rubric variant label; defaults to the stage name. */
|
|
350
|
+
id: string;
|
|
351
|
+
checks: RubricCheck[];
|
|
352
|
+
}
|
|
353
|
+
/**
|
|
354
|
+
* Judge response for a single sample (one API call). The judge is asked to
|
|
355
|
+
* return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
|
|
356
|
+
* `rationales[id]` is a short plain-text explanation, useful in reports but
|
|
357
|
+
* never used for gating.
|
|
358
|
+
*/
|
|
359
|
+
export interface JudgeSample {
|
|
360
|
+
scores: Record<string, number>;
|
|
361
|
+
rationales: Record<string, string>;
|
|
362
|
+
}
|
|
363
|
+
/** Aggregated judge output across N samples, per rubric check. */
|
|
364
|
+
export interface JudgeAggregate {
|
|
365
|
+
checkId: string;
|
|
366
|
+
samples: number[];
|
|
367
|
+
median: number;
|
|
368
|
+
mean: number;
|
|
369
|
+
/** True iff every sample returned a score for this check. */
|
|
370
|
+
coverage: boolean;
|
|
371
|
+
}
|
|
372
|
+
/**
|
|
373
|
+
* Judge invocation result. Produced by `runJudge` and consumed by the
|
|
374
|
+
* runner: the runner converts each aggregate into a `VerifierResult` and
|
|
375
|
+
* records `usageUsd` toward the per-case cost.
|
|
376
|
+
*/
|
|
377
|
+
export interface JudgeInvocation {
|
|
378
|
+
rubricId: string;
|
|
379
|
+
samples: JudgeSample[];
|
|
380
|
+
aggregates: JudgeAggregate[];
|
|
381
|
+
usageUsd: number;
|
|
382
|
+
durationMs: number;
|
|
383
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge verifier — Step 3.
|
|
3
|
+
*
|
|
4
|
+
* Given an artifact and the stage's rubric, runs N judge samples (default
|
|
5
|
+
* median-of-3) against the configured LLM, aggregates the per-check
|
|
6
|
+
* scores, and returns one VerifierResult per rubric check plus one
|
|
7
|
+
* aggregate result covering the whole stage.
|
|
8
|
+
*
|
|
9
|
+
* Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
|
|
10
|
+
* so unit tests inject a stub EvalLlmClient and assert on the aggregate
|
|
11
|
+
* math without touching the network.
|
|
12
|
+
*/
|
|
13
|
+
import { type EvalLlmClient } from "../llm-client.js";
|
|
14
|
+
import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
|
|
15
|
+
export interface RunJudgeOptions {
|
|
16
|
+
artifact: string;
|
|
17
|
+
rubric: RubricDoc;
|
|
18
|
+
config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
|
|
19
|
+
client: EvalLlmClient;
|
|
20
|
+
/** Per-case hint that overlays the rubric (sample count, minimums). */
|
|
21
|
+
caseHint?: JudgeExpected;
|
|
22
|
+
/** Optional base seed; incremented per sample for reproducibility. */
|
|
23
|
+
baseSeed?: number;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Parse one judge response into a JudgeSample. The parser is intentionally
|
|
27
|
+
* forgiving with rationales (missing -> empty string) but strict with
|
|
28
|
+
* scores: missing or non-numeric entries are dropped and the coverage
|
|
29
|
+
* flag on the aggregate flips to false.
|
|
30
|
+
*/
|
|
31
|
+
export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
|
|
32
|
+
/** Run the judge against an artifact and return per-sample + aggregate data. */
|
|
33
|
+
export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
|
|
34
|
+
/**
|
|
35
|
+
* Convert a JudgeInvocation into VerifierResult[] for the runner. One
|
|
36
|
+
* result per rubric check (score 0..1 normalized from the 1..5 median) +
|
|
37
|
+
* one "coverage" result that flips to `ok:false` when any sample failed
|
|
38
|
+
* to emit a score for a check.
|
|
39
|
+
*/
|
|
40
|
+
export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];
|