cclaw-cli 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -2
- package/dist/cli.js +388 -18
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +22 -16
- package/dist/eval/agents/workflow.d.ts +31 -0
- package/dist/eval/agents/workflow.js +135 -0
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +52 -19
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +36 -1
- package/dist/eval/runner.d.ts +37 -8
- package/dist/eval/runner.js +351 -42
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +158 -15
- package/dist/eval/types.js +39 -7
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/eval/report.js
CHANGED
|
@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
|
|
|
24
24
|
lines.push(`- cclaw version: ${report.cclawVersion}`);
|
|
25
25
|
lines.push(`- provider: ${report.provider}`);
|
|
26
26
|
lines.push(`- model: ${report.model}`);
|
|
27
|
-
lines.push(`-
|
|
27
|
+
lines.push(`- mode: ${report.mode}`);
|
|
28
28
|
lines.push(`- stages: ${stages}`);
|
|
29
29
|
lines.push(``);
|
|
30
30
|
lines.push(`## Summary`);
|
|
@@ -120,6 +120,41 @@ export function formatMarkdownReport(report) {
|
|
|
120
120
|
}
|
|
121
121
|
lines.push(``);
|
|
122
122
|
}
|
|
123
|
+
const workflowCases = report.cases.filter((item) => !!item.workflow);
|
|
124
|
+
if (workflowCases.length > 0) {
|
|
125
|
+
lines.push(`## Workflow stages`);
|
|
126
|
+
lines.push(``);
|
|
127
|
+
lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
|
|
128
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
129
|
+
for (const item of workflowCases) {
|
|
130
|
+
const wf = item.workflow;
|
|
131
|
+
for (const stage of wf.stages) {
|
|
132
|
+
const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
|
|
133
|
+
const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
|
|
134
|
+
lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
|
|
135
|
+
`${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
lines.push(``);
|
|
139
|
+
}
|
|
140
|
+
const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
|
|
141
|
+
if (consistencyCases.length > 0) {
|
|
142
|
+
lines.push(`## Consistency checks`);
|
|
143
|
+
lines.push(``);
|
|
144
|
+
lines.push(`| case id | check id | ok | message |`);
|
|
145
|
+
lines.push(`| --- | --- | --- | --- |`);
|
|
146
|
+
for (const item of consistencyCases) {
|
|
147
|
+
for (const verifier of item.verifierResults) {
|
|
148
|
+
if (verifier.kind !== "consistency")
|
|
149
|
+
continue;
|
|
150
|
+
const message = verifier.message
|
|
151
|
+
? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
|
|
152
|
+
: "-";
|
|
153
|
+
lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
lines.push(``);
|
|
157
|
+
}
|
|
123
158
|
lines.push(`## Verifier details`);
|
|
124
159
|
lines.push(``);
|
|
125
160
|
for (const item of report.cases) {
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
import { type EvalLlmClient } from "./llm-client.js";
|
|
3
|
-
import
|
|
3
|
+
import { type ProgressLogger } from "./progress.js";
|
|
4
|
+
import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
|
|
4
5
|
export interface RunEvalOptions {
|
|
5
6
|
projectRoot: string;
|
|
6
7
|
stage?: FlowStage;
|
|
7
|
-
|
|
8
|
+
mode?: EvalMode;
|
|
8
9
|
/** When true, run only structural verifiers (Step 1). */
|
|
9
10
|
schemaOnly?: boolean;
|
|
10
11
|
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
|
|
|
21
22
|
* without hitting the network.
|
|
22
23
|
*/
|
|
23
24
|
llmClient?: EvalLlmClient;
|
|
25
|
+
/**
|
|
26
|
+
* Optional progress logger. The CLI wires a stderr-backed logger by
|
|
27
|
+
* default so users see one-line updates during long runs; tests and
|
|
28
|
+
* programmatic callers can inject a silent (noop) logger or capture
|
|
29
|
+
* events for assertions. When omitted, progress is silenced.
|
|
30
|
+
*/
|
|
31
|
+
progress?: ProgressLogger;
|
|
32
|
+
/**
|
|
33
|
+
* Per-run USD cap. Enforced in-memory; independent from the daily cap
|
|
34
|
+
* (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
|
|
35
|
+
* invocations. Undefined means no cap.
|
|
36
|
+
*/
|
|
37
|
+
maxCostUsd?: number;
|
|
38
|
+
/**
|
|
39
|
+
* Override the configured `model` (and `judgeModel`) for this run.
|
|
40
|
+
* Used by `cclaw eval --compare-model` to replay the same corpus
|
|
41
|
+
* against an alternative model without editing `config.yaml`.
|
|
42
|
+
*/
|
|
43
|
+
modelOverride?: string;
|
|
24
44
|
}
|
|
25
45
|
export interface DryRunSummary {
|
|
26
46
|
kind: "dry-run";
|
|
@@ -33,20 +53,29 @@ export interface DryRunSummary {
|
|
|
33
53
|
stage: FlowStage;
|
|
34
54
|
}>;
|
|
35
55
|
};
|
|
36
|
-
|
|
56
|
+
/** Only populated in `workflow` mode; empty for fixture / agent modes. */
|
|
57
|
+
workflowCorpus: {
|
|
58
|
+
total: number;
|
|
59
|
+
cases: Array<{
|
|
60
|
+
id: string;
|
|
61
|
+
stages: WorkflowStageName[];
|
|
62
|
+
}>;
|
|
63
|
+
};
|
|
64
|
+
plannedMode: EvalMode;
|
|
37
65
|
verifiersAvailable: {
|
|
38
66
|
structural: boolean;
|
|
39
67
|
rules: boolean;
|
|
40
68
|
judge: boolean;
|
|
41
69
|
workflow: boolean;
|
|
70
|
+
consistency: boolean;
|
|
42
71
|
};
|
|
43
72
|
notes: string[];
|
|
44
73
|
}
|
|
45
74
|
/**
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
* skipped
|
|
75
|
+
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
76
|
+
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
77
|
+
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
78
|
+
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
79
|
+
* marked skipped (not failed) when no LLM drafting runs.
|
|
51
80
|
*/
|
|
52
81
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -3,16 +3,20 @@ import { CCLAW_VERSION } from "../constants.js";
|
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
4
|
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
5
|
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
6
|
+
import { runWorkflow } from "./agents/workflow.js";
|
|
6
7
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
7
8
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
9
|
+
import { loadWorkflowCorpus } from "./workflow-corpus.js";
|
|
8
10
|
import { loadEvalConfig } from "./config-loader.js";
|
|
9
|
-
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
11
|
+
import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
|
|
10
12
|
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
13
|
+
import { noopProgressLogger } from "./progress.js";
|
|
11
14
|
import { loadAllRubrics } from "./rubric-loader.js";
|
|
12
15
|
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
13
16
|
import { verifyRules } from "./verifiers/rules.js";
|
|
14
17
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
15
18
|
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
19
|
+
import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
|
|
16
20
|
function groupByStage(cases) {
|
|
17
21
|
return cases.reduce((acc, item) => {
|
|
18
22
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
@@ -32,17 +36,24 @@ function skeletonVerifierResult(message, details) {
|
|
|
32
36
|
/**
|
|
33
37
|
* --schema-only narrows to structural. --rules opens up rules + traceability
|
|
34
38
|
* on top of structural (traceability is a rule-family verifier even though
|
|
35
|
-
* it lives in its own module). --judge opens up the LLM judge and,
|
|
36
|
-
*
|
|
37
|
-
* the LLM-free PR gate never pays for tokens even if stale flags
|
|
39
|
+
* it lives in its own module). --judge opens up the LLM judge and, in
|
|
40
|
+
* `agent` / `workflow` modes, the agent-under-test loop. --schema-only always
|
|
41
|
+
* wins so the LLM-free PR gate never pays for tokens even if stale flags
|
|
42
|
+
* collide.
|
|
38
43
|
*/
|
|
39
44
|
function resolveRunFlags(options) {
|
|
40
45
|
const rulesRequested = options.rules === true;
|
|
41
46
|
const schemaOnly = options.schemaOnly === true;
|
|
42
47
|
const judgeRequested = options.judge === true;
|
|
43
|
-
const
|
|
48
|
+
const mode = options.mode ?? "fixture";
|
|
44
49
|
const runJudge = judgeRequested && !schemaOnly;
|
|
45
|
-
|
|
50
|
+
// `workflow` always needs the agent loop (no fixture fallback), so we still
|
|
51
|
+
// require an LLM client but do NOT require --judge on the CLI to produce a
|
|
52
|
+
// workflow run. The judge piece stays gated by `runJudge` so consistency-
|
|
53
|
+
// only runs remain cheap and deterministic.
|
|
54
|
+
const runAgent = mode === "workflow"
|
|
55
|
+
? !schemaOnly
|
|
56
|
+
: runJudge && (mode === "fixture" || mode === "agent");
|
|
46
57
|
return {
|
|
47
58
|
runStructural: true,
|
|
48
59
|
runRules: rulesRequested && !schemaOnly,
|
|
@@ -83,8 +94,203 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
|
83
94
|
return undefined;
|
|
84
95
|
}
|
|
85
96
|
}
|
|
97
|
+
function stageJudgeHint(step) {
|
|
98
|
+
const hint = {};
|
|
99
|
+
if (step.rubric)
|
|
100
|
+
hint.rubric = step.rubric;
|
|
101
|
+
if (step.requiredChecks)
|
|
102
|
+
hint.requiredChecks = step.requiredChecks;
|
|
103
|
+
if (step.minimumScores)
|
|
104
|
+
hint.minimumScores = step.minimumScores;
|
|
105
|
+
return hint;
|
|
106
|
+
}
|
|
107
|
+
async function runWorkflowCase(ctx) {
|
|
108
|
+
const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
|
|
109
|
+
const started = Date.now();
|
|
110
|
+
const verifierResults = [];
|
|
111
|
+
let caseCostUsd = 0;
|
|
112
|
+
const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
|
|
113
|
+
"plan";
|
|
114
|
+
if (!flags.runAgent || !client) {
|
|
115
|
+
verifierResults.push({
|
|
116
|
+
kind: "workflow",
|
|
117
|
+
id: "workflow:agent:disabled",
|
|
118
|
+
ok: false,
|
|
119
|
+
score: 0,
|
|
120
|
+
message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
|
|
121
|
+
"Re-run with credentials to execute the workflow.",
|
|
122
|
+
details: { stages: workflow.stages.map((s) => s.name) }
|
|
123
|
+
});
|
|
124
|
+
return {
|
|
125
|
+
caseId: workflow.id,
|
|
126
|
+
stage: lastStage,
|
|
127
|
+
mode: plannedMode,
|
|
128
|
+
passed: false,
|
|
129
|
+
durationMs: Date.now() - started,
|
|
130
|
+
verifierResults
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
let workflowResult;
|
|
134
|
+
try {
|
|
135
|
+
workflowResult = await runWorkflow({
|
|
136
|
+
workflow,
|
|
137
|
+
config,
|
|
138
|
+
projectRoot,
|
|
139
|
+
client,
|
|
140
|
+
onStageStart: (stage) => progress.emit({
|
|
141
|
+
kind: "stage-start",
|
|
142
|
+
caseId: workflow.id,
|
|
143
|
+
stage,
|
|
144
|
+
index: caseIndex,
|
|
145
|
+
total: totalCases
|
|
146
|
+
}),
|
|
147
|
+
onStageEnd: (stage, stageResult) => progress.emit({
|
|
148
|
+
kind: "stage-end",
|
|
149
|
+
caseId: workflow.id,
|
|
150
|
+
stage,
|
|
151
|
+
index: caseIndex,
|
|
152
|
+
total: totalCases,
|
|
153
|
+
passed: true,
|
|
154
|
+
durationMs: stageResult.durationMs,
|
|
155
|
+
...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
|
|
156
|
+
})
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
catch (err) {
|
|
160
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
161
|
+
throw err;
|
|
162
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
163
|
+
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
164
|
+
verifierResults.push({
|
|
165
|
+
kind: "workflow",
|
|
166
|
+
id: "workflow:agent:error",
|
|
167
|
+
ok: false,
|
|
168
|
+
score: 0,
|
|
169
|
+
message: err instanceof Error ? err.message : String(err),
|
|
170
|
+
details: {
|
|
171
|
+
retryable,
|
|
172
|
+
...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
|
|
173
|
+
}
|
|
174
|
+
});
|
|
175
|
+
return {
|
|
176
|
+
caseId: workflow.id,
|
|
177
|
+
stage: lastStage,
|
|
178
|
+
mode: plannedMode,
|
|
179
|
+
passed: false,
|
|
180
|
+
durationMs: Date.now() - started,
|
|
181
|
+
verifierResults
|
|
182
|
+
};
|
|
183
|
+
}
|
|
184
|
+
caseCostUsd += workflowResult.totalUsageUsd;
|
|
185
|
+
const stageResults = [...workflowResult.stages];
|
|
186
|
+
verifierResults.push({
|
|
187
|
+
kind: "workflow",
|
|
188
|
+
id: "workflow:agent",
|
|
189
|
+
ok: true,
|
|
190
|
+
score: 1,
|
|
191
|
+
message: `workflow ran ${stageResults.length} stage(s) in ` +
|
|
192
|
+
`${workflowResult.totalDurationMs}ms ` +
|
|
193
|
+
`(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
|
|
194
|
+
details: {
|
|
195
|
+
stages: stageResults.map((s) => ({
|
|
196
|
+
name: s.stage,
|
|
197
|
+
durationMs: s.durationMs,
|
|
198
|
+
usageUsd: s.usageUsd,
|
|
199
|
+
turns: s.toolUse.turns,
|
|
200
|
+
calls: s.toolUse.calls
|
|
201
|
+
}))
|
|
202
|
+
}
|
|
203
|
+
});
|
|
204
|
+
let allJudgeOk = true;
|
|
205
|
+
if (flags.runJudge) {
|
|
206
|
+
for (let i = 0; i < workflow.stages.length; i += 1) {
|
|
207
|
+
const step = workflow.stages[i];
|
|
208
|
+
const stageResult = stageResults[i];
|
|
209
|
+
const rubric = rubrics.get(step.name);
|
|
210
|
+
if (!rubric) {
|
|
211
|
+
verifierResults.push({
|
|
212
|
+
kind: "judge",
|
|
213
|
+
id: `judge:rubric:missing:${step.name}`,
|
|
214
|
+
ok: false,
|
|
215
|
+
score: 0,
|
|
216
|
+
message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
|
|
217
|
+
details: { stage: step.name }
|
|
218
|
+
});
|
|
219
|
+
allJudgeOk = false;
|
|
220
|
+
stageResult.judgeOk = false;
|
|
221
|
+
continue;
|
|
222
|
+
}
|
|
223
|
+
const hint = stageJudgeHint(step);
|
|
224
|
+
try {
|
|
225
|
+
const invocation = await runJudge({
|
|
226
|
+
artifact: stageResult.artifact,
|
|
227
|
+
rubric,
|
|
228
|
+
config,
|
|
229
|
+
client,
|
|
230
|
+
caseHint: hint
|
|
231
|
+
});
|
|
232
|
+
caseCostUsd += invocation.usageUsd;
|
|
233
|
+
const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
|
|
234
|
+
const medians = {};
|
|
235
|
+
for (const agg of invocation.aggregates) {
|
|
236
|
+
medians[agg.checkId] = agg.median;
|
|
237
|
+
}
|
|
238
|
+
stageResult.judgeMedians = medians;
|
|
239
|
+
const stageOk = judgeVerifiers.every((v) => v.ok);
|
|
240
|
+
stageResult.judgeOk = stageOk;
|
|
241
|
+
if (!stageOk)
|
|
242
|
+
allJudgeOk = false;
|
|
243
|
+
for (const v of judgeVerifiers) {
|
|
244
|
+
verifierResults.push({
|
|
245
|
+
...v,
|
|
246
|
+
id: `${v.id}:${step.name}`,
|
|
247
|
+
details: { ...(v.details ?? {}), stage: step.name }
|
|
248
|
+
});
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
catch (err) {
|
|
252
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
253
|
+
throw err;
|
|
254
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
255
|
+
verifierResults.push({
|
|
256
|
+
kind: "judge",
|
|
257
|
+
id: `judge:invocation:error:${step.name}`,
|
|
258
|
+
ok: false,
|
|
259
|
+
score: 0,
|
|
260
|
+
message: err instanceof Error ? err.message : String(err),
|
|
261
|
+
details: { retryable, rubricId: rubric.id, stage: step.name }
|
|
262
|
+
});
|
|
263
|
+
stageResult.judgeOk = false;
|
|
264
|
+
allJudgeOk = false;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
|
|
269
|
+
verifierResults.push(...consistencyResults);
|
|
270
|
+
const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
|
|
271
|
+
const allOk = nonSkipped.length === 0
|
|
272
|
+
? verifierResults.every((r) => r.ok)
|
|
273
|
+
: nonSkipped.every((r) => r.ok);
|
|
274
|
+
const workflowSummary = {
|
|
275
|
+
caseId: workflow.id,
|
|
276
|
+
stages: stageResults,
|
|
277
|
+
totalUsageUsd: workflowResult.totalUsageUsd,
|
|
278
|
+
totalDurationMs: workflowResult.totalDurationMs,
|
|
279
|
+
allJudgeOk: flags.runJudge ? allJudgeOk : true
|
|
280
|
+
};
|
|
281
|
+
return {
|
|
282
|
+
caseId: workflow.id,
|
|
283
|
+
stage: lastStage,
|
|
284
|
+
mode: plannedMode,
|
|
285
|
+
passed: allOk,
|
|
286
|
+
durationMs: Date.now() - started,
|
|
287
|
+
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
288
|
+
verifierResults,
|
|
289
|
+
workflow: workflowSummary
|
|
290
|
+
};
|
|
291
|
+
}
|
|
86
292
|
async function runCase(ctx) {
|
|
87
|
-
const { projectRoot, caseEntry,
|
|
293
|
+
const { projectRoot, caseEntry, plannedMode, flags, config, client, costGuard, rubrics } = ctx;
|
|
88
294
|
const started = Date.now();
|
|
89
295
|
const verifierResults = [];
|
|
90
296
|
const expected = caseEntry.expected;
|
|
@@ -96,7 +302,7 @@ async function runCase(ctx) {
|
|
|
96
302
|
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
97
303
|
let artifact;
|
|
98
304
|
if (needsArtifact) {
|
|
99
|
-
if (flags.runAgent && judgeRequested && client &&
|
|
305
|
+
if (flags.runAgent && judgeRequested && client && plannedMode === "fixture") {
|
|
100
306
|
try {
|
|
101
307
|
const produced = await runSingleShot({
|
|
102
308
|
caseEntry,
|
|
@@ -122,7 +328,7 @@ async function runCase(ctx) {
|
|
|
122
328
|
});
|
|
123
329
|
}
|
|
124
330
|
catch (err) {
|
|
125
|
-
if (err instanceof DailyCostCapExceededError)
|
|
331
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
126
332
|
throw err;
|
|
127
333
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
128
334
|
verifierResults.push({
|
|
@@ -135,7 +341,7 @@ async function runCase(ctx) {
|
|
|
135
341
|
});
|
|
136
342
|
}
|
|
137
343
|
}
|
|
138
|
-
else if (flags.runAgent && judgeRequested && client &&
|
|
344
|
+
else if (flags.runAgent && judgeRequested && client && plannedMode === "agent") {
|
|
139
345
|
try {
|
|
140
346
|
const produced = await runWithTools({
|
|
141
347
|
caseEntry,
|
|
@@ -164,7 +370,7 @@ async function runCase(ctx) {
|
|
|
164
370
|
});
|
|
165
371
|
}
|
|
166
372
|
catch (err) {
|
|
167
|
-
if (err instanceof DailyCostCapExceededError)
|
|
373
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
168
374
|
throw err;
|
|
169
375
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
170
376
|
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
@@ -256,7 +462,7 @@ async function runCase(ctx) {
|
|
|
256
462
|
verifierResults.push(...judgeVerifiers);
|
|
257
463
|
}
|
|
258
464
|
catch (err) {
|
|
259
|
-
if (err instanceof DailyCostCapExceededError)
|
|
465
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
260
466
|
throw err;
|
|
261
467
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
262
468
|
verifierResults.push({
|
|
@@ -277,7 +483,7 @@ async function runCase(ctx) {
|
|
|
277
483
|
return {
|
|
278
484
|
caseId: caseEntry.id,
|
|
279
485
|
stage: caseEntry.stage,
|
|
280
|
-
|
|
486
|
+
mode: plannedMode,
|
|
281
487
|
passed: allOk,
|
|
282
488
|
durationMs: Date.now() - started,
|
|
283
489
|
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
@@ -319,26 +525,37 @@ function stagesInResults(caseResults) {
|
|
|
319
525
|
return FLOW_STAGES.filter((s) => set.has(s));
|
|
320
526
|
}
|
|
321
527
|
/**
|
|
322
|
-
*
|
|
323
|
-
*
|
|
324
|
-
*
|
|
325
|
-
*
|
|
326
|
-
* skipped
|
|
528
|
+
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
529
|
+
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
530
|
+
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
531
|
+
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
532
|
+
* marked skipped (not failed) when no LLM drafting runs.
|
|
327
533
|
*/
|
|
328
534
|
export async function runEval(options) {
|
|
329
|
-
const
|
|
330
|
-
const
|
|
331
|
-
|
|
535
|
+
const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
536
|
+
const config = options.modelOverride
|
|
537
|
+
? {
|
|
538
|
+
...baseConfig,
|
|
539
|
+
model: options.modelOverride,
|
|
540
|
+
judgeModel: options.modelOverride
|
|
541
|
+
}
|
|
542
|
+
: baseConfig;
|
|
543
|
+
const plannedMode = options.mode ?? config.defaultMode;
|
|
544
|
+
const corpus = plannedMode === "workflow" ? [] : await loadCorpus(options.projectRoot, options.stage);
|
|
545
|
+
const workflowCorpus = plannedMode === "workflow" ? await loadWorkflowCorpus(options.projectRoot) : [];
|
|
332
546
|
const notes = [];
|
|
333
|
-
if (corpus.length === 0) {
|
|
547
|
+
if (plannedMode !== "workflow" && corpus.length === 0) {
|
|
334
548
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
335
549
|
}
|
|
550
|
+
if (plannedMode === "workflow" && workflowCorpus.length === 0) {
|
|
551
|
+
notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
|
|
552
|
+
}
|
|
336
553
|
const flags = resolveRunFlags(options);
|
|
337
554
|
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
338
555
|
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
339
556
|
}
|
|
340
|
-
if (
|
|
341
|
-
notes.push("
|
|
557
|
+
if (plannedMode === "workflow" && !config.apiKey && !options.llmClient) {
|
|
558
|
+
notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
|
|
342
559
|
}
|
|
343
560
|
if (options.dryRun === true) {
|
|
344
561
|
const summary = {
|
|
@@ -349,39 +566,124 @@ export async function runEval(options) {
|
|
|
349
566
|
byStage: groupByStage(corpus),
|
|
350
567
|
cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
|
|
351
568
|
},
|
|
352
|
-
|
|
569
|
+
workflowCorpus: {
|
|
570
|
+
total: workflowCorpus.length,
|
|
571
|
+
cases: workflowCorpus.map((item) => ({
|
|
572
|
+
id: item.id,
|
|
573
|
+
stages: item.stages.map((s) => s.name)
|
|
574
|
+
}))
|
|
575
|
+
},
|
|
576
|
+
plannedMode,
|
|
353
577
|
verifiersAvailable: {
|
|
354
578
|
structural: flags.runStructural,
|
|
355
579
|
rules: flags.runRules,
|
|
356
580
|
judge: flags.runJudge,
|
|
357
|
-
workflow: flags.runAgent
|
|
581
|
+
workflow: flags.runAgent,
|
|
582
|
+
consistency: plannedMode === "workflow"
|
|
358
583
|
},
|
|
359
584
|
notes
|
|
360
585
|
};
|
|
361
586
|
return summary;
|
|
362
587
|
}
|
|
363
|
-
const costGuard = createCostGuard(options.projectRoot, config);
|
|
588
|
+
const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
|
|
589
|
+
const progress = options.progress ?? noopProgressLogger();
|
|
364
590
|
let wrappedClient;
|
|
365
|
-
|
|
366
|
-
|
|
591
|
+
const clientNeeded = flags.runJudge || plannedMode === "workflow";
|
|
592
|
+
if (clientNeeded) {
|
|
593
|
+
const base = options.llmClient ??
|
|
594
|
+
createEvalClient(config, {
|
|
595
|
+
onRetry: (event) => progress.emit({
|
|
596
|
+
kind: "retry",
|
|
597
|
+
caseId: "llm",
|
|
598
|
+
attempt: event.attempt,
|
|
599
|
+
maxAttempts: event.maxAttempts,
|
|
600
|
+
waitMs: event.waitMs,
|
|
601
|
+
reason: event.error.message
|
|
602
|
+
})
|
|
603
|
+
});
|
|
367
604
|
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
368
605
|
}
|
|
369
|
-
const
|
|
606
|
+
const rubricsNeeded = flags.runJudge;
|
|
607
|
+
const rubrics = rubricsNeeded
|
|
370
608
|
? await loadAllRubrics(options.projectRoot)
|
|
371
609
|
: new Map();
|
|
372
610
|
const now = new Date().toISOString();
|
|
373
611
|
const caseResults = [];
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
612
|
+
const totalPlannedCases = plannedMode === "workflow" ? workflowCorpus.length : corpus.length;
|
|
613
|
+
const runStarted = Date.now();
|
|
614
|
+
progress.emit({
|
|
615
|
+
kind: "run-start",
|
|
616
|
+
mode: plannedMode,
|
|
617
|
+
totalCases: totalPlannedCases
|
|
618
|
+
});
|
|
619
|
+
if (plannedMode === "workflow") {
|
|
620
|
+
for (let i = 0; i < workflowCorpus.length; i += 1) {
|
|
621
|
+
const wf = workflowCorpus[i];
|
|
622
|
+
progress.emit({
|
|
623
|
+
kind: "case-start",
|
|
624
|
+
caseId: wf.id,
|
|
625
|
+
stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
|
|
626
|
+
index: i + 1,
|
|
627
|
+
total: workflowCorpus.length
|
|
628
|
+
});
|
|
629
|
+
const result = await runWorkflowCase({
|
|
630
|
+
projectRoot: options.projectRoot,
|
|
631
|
+
workflow: wf,
|
|
632
|
+
plannedMode,
|
|
633
|
+
flags,
|
|
634
|
+
config,
|
|
635
|
+
client: wrappedClient,
|
|
636
|
+
costGuard,
|
|
637
|
+
rubrics,
|
|
638
|
+
progress,
|
|
639
|
+
caseIndex: i + 1,
|
|
640
|
+
totalCases: workflowCorpus.length
|
|
641
|
+
});
|
|
642
|
+
progress.emit({
|
|
643
|
+
kind: "case-end",
|
|
644
|
+
caseId: wf.id,
|
|
645
|
+
stage: result.stage,
|
|
646
|
+
index: i + 1,
|
|
647
|
+
total: workflowCorpus.length,
|
|
648
|
+
passed: result.passed,
|
|
649
|
+
durationMs: result.durationMs,
|
|
650
|
+
...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
|
|
651
|
+
});
|
|
652
|
+
caseResults.push(result);
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
else {
|
|
656
|
+
for (let i = 0; i < corpus.length; i += 1) {
|
|
657
|
+
const item = corpus[i];
|
|
658
|
+
progress.emit({
|
|
659
|
+
kind: "case-start",
|
|
660
|
+
caseId: item.id,
|
|
661
|
+
stage: item.stage,
|
|
662
|
+
index: i + 1,
|
|
663
|
+
total: corpus.length
|
|
664
|
+
});
|
|
665
|
+
const result = await runCase({
|
|
666
|
+
projectRoot: options.projectRoot,
|
|
667
|
+
caseEntry: item,
|
|
668
|
+
plannedMode,
|
|
669
|
+
flags,
|
|
670
|
+
config,
|
|
671
|
+
client: wrappedClient,
|
|
672
|
+
costGuard,
|
|
673
|
+
rubrics
|
|
674
|
+
});
|
|
675
|
+
progress.emit({
|
|
676
|
+
kind: "case-end",
|
|
677
|
+
caseId: item.id,
|
|
678
|
+
stage: item.stage,
|
|
679
|
+
index: i + 1,
|
|
680
|
+
total: corpus.length,
|
|
681
|
+
passed: result.passed,
|
|
682
|
+
durationMs: result.durationMs,
|
|
683
|
+
...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
|
|
684
|
+
});
|
|
685
|
+
caseResults.push(result);
|
|
686
|
+
}
|
|
385
687
|
}
|
|
386
688
|
const stages = stagesInResults(caseResults);
|
|
387
689
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
|
@@ -393,7 +695,7 @@ export async function runEval(options) {
|
|
|
393
695
|
cclawVersion: CCLAW_VERSION,
|
|
394
696
|
provider: config.provider,
|
|
395
697
|
model: config.model,
|
|
396
|
-
|
|
698
|
+
mode: plannedMode,
|
|
397
699
|
stages,
|
|
398
700
|
cases: caseResults,
|
|
399
701
|
summary
|
|
@@ -401,5 +703,12 @@ export async function runEval(options) {
|
|
|
401
703
|
const baselineDelta = compareAgainstBaselines(report, baselines);
|
|
402
704
|
if (baselineDelta)
|
|
403
705
|
report.baselineDelta = baselineDelta;
|
|
706
|
+
progress.emit({
|
|
707
|
+
kind: "run-end",
|
|
708
|
+
totalCases: summary.totalCases,
|
|
709
|
+
passed: summary.passed,
|
|
710
|
+
failed: summary.failed,
|
|
711
|
+
durationMs: Date.now() - runStarted
|
|
712
|
+
});
|
|
404
713
|
return report;
|
|
405
714
|
}
|