cclaw-cli 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +8 -4
- package/dist/cli.js +316 -20
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +6 -6
- package/dist/eval/agents/with-tools.js +5 -5
- package/dist/eval/agents/workflow.d.ts +7 -0
- package/dist/eval/agents/workflow.js +5 -3
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +46 -17
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +1 -1
- package/dist/eval/diff.js +3 -3
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +29 -9
- package/dist/eval/runner.js +148 -56
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +54 -27
- package/dist/eval/types.js +21 -9
- package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
- package/dist/eval/workflow-corpus.d.ts +2 -2
- package/dist/eval/workflow-corpus.js +4 -4
- package/package.json +1 -1
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
 * Shared logger whose `emit` discards every event. One instance is reused
 * so that staying silent never requires allocating a new object.
 */
const NOOP_LOGGER = {
    emit() {
        // Deliberately empty: progress reporting is switched off.
    }
};
/**
 * Return the shared silent progress logger.
 *
 * @returns The same no-op logger object on every call.
 */
export function noopProgressLogger() {
    const silent = NOOP_LOGGER;
    return silent;
}
|
|
5
|
+
/**
 * Build a progress logger that prints one status line per event.
 *
 * Every line has the shape `[cclaw eval] <message>` followed by a newline,
 * so the fixed prefix can be grepped out of combined logs. Costs inside the
 * message are rendered with 4 decimals so sub-cent amounts remain visible.
 *
 * @param opts Optional settings. `opts.writer` is a function that receives
 *   each finished line; when it is absent the line is written to
 *   `process.stderr`.
 * @returns An object exposing `emit(event)`.
 */
export function createStderrProgressLogger(opts = {}) {
    // Fall back to stderr only when the caller did not supply a sink.
    const sink = opts.writer ?? ((line) => process.stderr.write(line));
    const emit = (event) => {
        const message = formatEvent(event);
        sink(`[cclaw eval] ${message}\n`);
    };
    return { emit };
}
|
|
20
|
+
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * - below one second: the raw value, e.g. `250ms`
 * - below one minute: seconds with one decimal, e.g. `1.5s`
 * - otherwise: whole minutes and zero-padded seconds, e.g. `1m30s`
 *
 * Rounding is done BEFORE the value is split into units, so a remainder can
 * never be displayed as `60` (previously 119600ms printed `1m60s` and
 * 59960ms printed `60.0s`).
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    // Work in whole tenths of a second so the displayed value and the
    // "is this still under a minute?" decision use the same rounded number.
    const tenths = Math.round(ms / 100);
    if (tenths < 600)
        return `${(tenths / 10).toFixed(1)}s`;
    // Round to whole seconds first, then carry into minutes.
    const totalSeconds = Math.round(ms / 1000);
    const minutes = Math.floor(totalSeconds / 60);
    const seconds = totalSeconds % 60;
    return `${minutes}m${String(seconds).padStart(2, "0")}s`;
}
|
|
30
|
+
/**
 * Render a USD cost as a suffix for a progress line.
 *
 * @param usd Cost in US dollars; may be undefined when no cost was recorded.
 * @returns An empty string when there is nothing meaningful to show
 *   (missing, non-numeric, non-finite, zero or negative cost); otherwise a
 *   leading space plus the amount with 4 decimals, e.g. ` $0.0123`. A
 *   positive cost too small to survive 4-decimal rounding is shown as
 *   ` <$0.0001` so it never appears as a misleading ` $0.0000`.
 */
function formatCost(usd) {
    // NaN / Infinity would otherwise be printed verbatim as "$NaN".
    if (typeof usd !== "number" || !Number.isFinite(usd) || usd <= 0)
        return "";
    const rendered = usd.toFixed(4);
    // Decide on the rounded text itself so the threshold matches toFixed exactly.
    if (Number(rendered) === 0)
        return " <$0.0001";
    return ` $${rendered}`;
}
|
|
35
|
+
/**
 * Turn a progress event into the message text that follows the log prefix.
 *
 * The event's `kind` selects the layout:
 * - `run-start`   : mode and number of cases
 * - `case-start`  : position, case id and stage, followed by `...`
 * - `case-end`    : same header plus PASS/FAIL, duration and optional cost
 * - `stage-start` : stage name followed by `...`
 * - `stage-end`   : stage name plus ok/fail, duration and optional cost
 * - `retry`       : case (and stage when present), attempt counter, wait, reason
 * - `run-end`     : pass / fail / total counters and total duration
 *
 * @param event Progress event carrying a `kind` discriminator.
 * @returns The formatted message, or `undefined` when `kind` is not one of
 *   the values listed above.
 */
function formatEvent(event) {
    const kind = event.kind;
    if (kind === "run-start") {
        return `start mode=${event.mode} cases=${event.totalCases}`;
    }
    if (kind === "case-start") {
        return `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ...`;
    }
    if (kind === "case-end") {
        const verdict = event.passed ? "PASS" : "FAIL";
        const head = `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ${verdict} `;
        const tail = `in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
        return head + tail;
    }
    if (kind === "stage-start") {
        return ` stage ${event.stage} ...`;
    }
    if (kind === "stage-end") {
        const verdict = event.passed ? "ok" : "fail";
        const elapsed = formatDuration(event.durationMs);
        const cost = formatCost(event.costUsd);
        return ` stage ${event.stage} ${verdict} in ${elapsed}${cost}`;
    }
    if (kind === "retry") {
        // The stage segment is only appended when the event carries one.
        const target = event.stage ? `${event.caseId}/${event.stage}` : event.caseId;
        const counter = `attempt ${event.attempt}/${event.maxAttempts}`;
        return ` retry ${target} ${counter} in ${formatDuration(event.waitMs)} (${event.reason})`;
    }
    if (kind === "run-end") {
        const counters = `done pass=${event.passed} fail=${event.failed} total=${event.totalCases} `;
        return counters + `in ${formatDuration(event.durationMs)}`;
    }
    // Unrecognized kinds fall through without a message.
    return undefined;
}
|
package/dist/eval/report.js
CHANGED
|
@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
|
|
|
24
24
|
lines.push(`- cclaw version: ${report.cclawVersion}`);
|
|
25
25
|
lines.push(`- provider: ${report.provider}`);
|
|
26
26
|
lines.push(`- model: ${report.model}`);
|
|
27
|
-
lines.push(`-
|
|
27
|
+
lines.push(`- mode: ${report.mode}`);
|
|
28
28
|
lines.push(`- stages: ${stages}`);
|
|
29
29
|
lines.push(``);
|
|
30
30
|
lines.push(`## Summary`);
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
import { type EvalLlmClient } from "./llm-client.js";
|
|
3
|
-
import
|
|
3
|
+
import { type ProgressLogger } from "./progress.js";
|
|
4
|
+
import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
|
|
4
5
|
export interface RunEvalOptions {
|
|
5
6
|
projectRoot: string;
|
|
6
7
|
stage?: FlowStage;
|
|
7
|
-
|
|
8
|
+
mode?: EvalMode;
|
|
8
9
|
/** When true, run only structural verifiers (Step 1). */
|
|
9
10
|
schemaOnly?: boolean;
|
|
10
11
|
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
|
|
|
21
22
|
* without hitting the network.
|
|
22
23
|
*/
|
|
23
24
|
llmClient?: EvalLlmClient;
|
|
25
|
+
/**
|
|
26
|
+
* Optional progress logger. The CLI wires a stderr-backed logger by
|
|
27
|
+
* default so users see one-line updates during long runs; tests and
|
|
28
|
+
* programmatic callers can inject a silent (noop) logger or capture
|
|
29
|
+
* events for assertions. When omitted, progress is silenced.
|
|
30
|
+
*/
|
|
31
|
+
progress?: ProgressLogger;
|
|
32
|
+
/**
|
|
33
|
+
* Per-run USD cap. Enforced in-memory; independent from the daily cap
|
|
34
|
+
* (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
|
|
35
|
+
* invocations. Undefined means no cap.
|
|
36
|
+
*/
|
|
37
|
+
maxCostUsd?: number;
|
|
38
|
+
/**
|
|
39
|
+
* Override the configured `model` (and `judgeModel`) for this run.
|
|
40
|
+
* Used by `cclaw eval --compare-model` to replay the same corpus
|
|
41
|
+
* against an alternative model without editing `config.yaml`.
|
|
42
|
+
*/
|
|
43
|
+
modelOverride?: string;
|
|
24
44
|
}
|
|
25
45
|
export interface DryRunSummary {
|
|
26
46
|
kind: "dry-run";
|
|
@@ -33,7 +53,7 @@ export interface DryRunSummary {
|
|
|
33
53
|
stage: FlowStage;
|
|
34
54
|
}>;
|
|
35
55
|
};
|
|
36
|
-
/**
|
|
56
|
+
/** Only populated in `workflow` mode; empty for fixture / agent modes. */
|
|
37
57
|
workflowCorpus: {
|
|
38
58
|
total: number;
|
|
39
59
|
cases: Array<{
|
|
@@ -41,7 +61,7 @@ export interface DryRunSummary {
|
|
|
41
61
|
stages: WorkflowStageName[];
|
|
42
62
|
}>;
|
|
43
63
|
};
|
|
44
|
-
|
|
64
|
+
plannedMode: EvalMode;
|
|
45
65
|
verifiersAvailable: {
|
|
46
66
|
structural: boolean;
|
|
47
67
|
rules: boolean;
|
|
@@ -52,10 +72,10 @@ export interface DryRunSummary {
|
|
|
52
72
|
notes: string[];
|
|
53
73
|
}
|
|
54
74
|
/**
|
|
55
|
-
*
|
|
56
|
-
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
* skipped
|
|
75
|
+
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
76
|
+
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
77
|
+
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
78
|
+
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
79
|
+
* marked skipped (not failed) when no LLM drafting runs.
|
|
60
80
|
*/
|
|
61
81
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -8,8 +8,9 @@ import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
|
8
8
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
9
9
|
import { loadWorkflowCorpus } from "./workflow-corpus.js";
|
|
10
10
|
import { loadEvalConfig } from "./config-loader.js";
|
|
11
|
-
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
11
|
+
import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
|
|
12
12
|
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
13
|
+
import { noopProgressLogger } from "./progress.js";
|
|
13
14
|
import { loadAllRubrics } from "./rubric-loader.js";
|
|
14
15
|
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
15
16
|
import { verifyRules } from "./verifiers/rules.js";
|
|
@@ -35,23 +36,24 @@ function skeletonVerifierResult(message, details) {
|
|
|
35
36
|
/**
|
|
36
37
|
* --schema-only narrows to structural. --rules opens up rules + traceability
|
|
37
38
|
* on top of structural (traceability is a rule-family verifier even though
|
|
38
|
-
* it lives in its own module). --judge opens up the LLM judge and,
|
|
39
|
-
*
|
|
40
|
-
* the LLM-free PR gate never pays for tokens even if stale flags
|
|
39
|
+
* it lives in its own module). --judge opens up the LLM judge and, in
|
|
40
|
+
* `agent` / `workflow` modes, the agent-under-test loop. --schema-only always
|
|
41
|
+
* wins so the LLM-free PR gate never pays for tokens even if stale flags
|
|
42
|
+
* collide.
|
|
41
43
|
*/
|
|
42
44
|
function resolveRunFlags(options) {
|
|
43
45
|
const rulesRequested = options.rules === true;
|
|
44
46
|
const schemaOnly = options.schemaOnly === true;
|
|
45
47
|
const judgeRequested = options.judge === true;
|
|
46
|
-
const
|
|
48
|
+
const mode = options.mode ?? "fixture";
|
|
47
49
|
const runJudge = judgeRequested && !schemaOnly;
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
const runAgent =
|
|
50
|
+
// `workflow` always needs the agent loop (no fixture fallback), so we still
|
|
51
|
+
// require an LLM client but do NOT require --judge on the CLI to produce a
|
|
52
|
+
// workflow run. The judge piece stays gated by `runJudge` so consistency-
|
|
53
|
+
// only runs remain cheap and deterministic.
|
|
54
|
+
const runAgent = mode === "workflow"
|
|
53
55
|
? !schemaOnly
|
|
54
|
-
: runJudge && (
|
|
56
|
+
: runJudge && (mode === "fixture" || mode === "agent");
|
|
55
57
|
return {
|
|
56
58
|
runStructural: true,
|
|
57
59
|
runRules: rulesRequested && !schemaOnly,
|
|
@@ -103,7 +105,7 @@ function stageJudgeHint(step) {
|
|
|
103
105
|
return hint;
|
|
104
106
|
}
|
|
105
107
|
async function runWorkflowCase(ctx) {
|
|
106
|
-
const { projectRoot, workflow,
|
|
108
|
+
const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
|
|
107
109
|
const started = Date.now();
|
|
108
110
|
const verifierResults = [];
|
|
109
111
|
let caseCostUsd = 0;
|
|
@@ -115,14 +117,14 @@ async function runWorkflowCase(ctx) {
|
|
|
115
117
|
id: "workflow:agent:disabled",
|
|
116
118
|
ok: false,
|
|
117
119
|
score: 0,
|
|
118
|
-
message: "
|
|
120
|
+
message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
|
|
119
121
|
"Re-run with credentials to execute the workflow.",
|
|
120
122
|
details: { stages: workflow.stages.map((s) => s.name) }
|
|
121
123
|
});
|
|
122
124
|
return {
|
|
123
125
|
caseId: workflow.id,
|
|
124
126
|
stage: lastStage,
|
|
125
|
-
|
|
127
|
+
mode: plannedMode,
|
|
126
128
|
passed: false,
|
|
127
129
|
durationMs: Date.now() - started,
|
|
128
130
|
verifierResults
|
|
@@ -134,11 +136,28 @@ async function runWorkflowCase(ctx) {
|
|
|
134
136
|
workflow,
|
|
135
137
|
config,
|
|
136
138
|
projectRoot,
|
|
137
|
-
client
|
|
139
|
+
client,
|
|
140
|
+
onStageStart: (stage) => progress.emit({
|
|
141
|
+
kind: "stage-start",
|
|
142
|
+
caseId: workflow.id,
|
|
143
|
+
stage,
|
|
144
|
+
index: caseIndex,
|
|
145
|
+
total: totalCases
|
|
146
|
+
}),
|
|
147
|
+
onStageEnd: (stage, stageResult) => progress.emit({
|
|
148
|
+
kind: "stage-end",
|
|
149
|
+
caseId: workflow.id,
|
|
150
|
+
stage,
|
|
151
|
+
index: caseIndex,
|
|
152
|
+
total: totalCases,
|
|
153
|
+
passed: true,
|
|
154
|
+
durationMs: stageResult.durationMs,
|
|
155
|
+
...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
|
|
156
|
+
})
|
|
138
157
|
});
|
|
139
158
|
}
|
|
140
159
|
catch (err) {
|
|
141
|
-
if (err instanceof DailyCostCapExceededError)
|
|
160
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
142
161
|
throw err;
|
|
143
162
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
144
163
|
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
@@ -156,7 +175,7 @@ async function runWorkflowCase(ctx) {
|
|
|
156
175
|
return {
|
|
157
176
|
caseId: workflow.id,
|
|
158
177
|
stage: lastStage,
|
|
159
|
-
|
|
178
|
+
mode: plannedMode,
|
|
160
179
|
passed: false,
|
|
161
180
|
durationMs: Date.now() - started,
|
|
162
181
|
verifierResults
|
|
@@ -230,7 +249,7 @@ async function runWorkflowCase(ctx) {
|
|
|
230
249
|
}
|
|
231
250
|
}
|
|
232
251
|
catch (err) {
|
|
233
|
-
if (err instanceof DailyCostCapExceededError)
|
|
252
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
234
253
|
throw err;
|
|
235
254
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
236
255
|
verifierResults.push({
|
|
@@ -262,7 +281,7 @@ async function runWorkflowCase(ctx) {
|
|
|
262
281
|
return {
|
|
263
282
|
caseId: workflow.id,
|
|
264
283
|
stage: lastStage,
|
|
265
|
-
|
|
284
|
+
mode: plannedMode,
|
|
266
285
|
passed: allOk,
|
|
267
286
|
durationMs: Date.now() - started,
|
|
268
287
|
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
@@ -271,7 +290,7 @@ async function runWorkflowCase(ctx) {
|
|
|
271
290
|
};
|
|
272
291
|
}
|
|
273
292
|
async function runCase(ctx) {
|
|
274
|
-
const { projectRoot, caseEntry,
|
|
293
|
+
const { projectRoot, caseEntry, plannedMode, flags, config, client, costGuard, rubrics } = ctx;
|
|
275
294
|
const started = Date.now();
|
|
276
295
|
const verifierResults = [];
|
|
277
296
|
const expected = caseEntry.expected;
|
|
@@ -283,7 +302,7 @@ async function runCase(ctx) {
|
|
|
283
302
|
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
284
303
|
let artifact;
|
|
285
304
|
if (needsArtifact) {
|
|
286
|
-
if (flags.runAgent && judgeRequested && client &&
|
|
305
|
+
if (flags.runAgent && judgeRequested && client && plannedMode === "fixture") {
|
|
287
306
|
try {
|
|
288
307
|
const produced = await runSingleShot({
|
|
289
308
|
caseEntry,
|
|
@@ -309,7 +328,7 @@ async function runCase(ctx) {
|
|
|
309
328
|
});
|
|
310
329
|
}
|
|
311
330
|
catch (err) {
|
|
312
|
-
if (err instanceof DailyCostCapExceededError)
|
|
331
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
313
332
|
throw err;
|
|
314
333
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
315
334
|
verifierResults.push({
|
|
@@ -322,7 +341,7 @@ async function runCase(ctx) {
|
|
|
322
341
|
});
|
|
323
342
|
}
|
|
324
343
|
}
|
|
325
|
-
else if (flags.runAgent && judgeRequested && client &&
|
|
344
|
+
else if (flags.runAgent && judgeRequested && client && plannedMode === "agent") {
|
|
326
345
|
try {
|
|
327
346
|
const produced = await runWithTools({
|
|
328
347
|
caseEntry,
|
|
@@ -351,7 +370,7 @@ async function runCase(ctx) {
|
|
|
351
370
|
});
|
|
352
371
|
}
|
|
353
372
|
catch (err) {
|
|
354
|
-
if (err instanceof DailyCostCapExceededError)
|
|
373
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
355
374
|
throw err;
|
|
356
375
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
357
376
|
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
@@ -443,7 +462,7 @@ async function runCase(ctx) {
|
|
|
443
462
|
verifierResults.push(...judgeVerifiers);
|
|
444
463
|
}
|
|
445
464
|
catch (err) {
|
|
446
|
-
if (err instanceof DailyCostCapExceededError)
|
|
465
|
+
if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
|
|
447
466
|
throw err;
|
|
448
467
|
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
449
468
|
verifierResults.push({
|
|
@@ -464,7 +483,7 @@ async function runCase(ctx) {
|
|
|
464
483
|
return {
|
|
465
484
|
caseId: caseEntry.id,
|
|
466
485
|
stage: caseEntry.stage,
|
|
467
|
-
|
|
486
|
+
mode: plannedMode,
|
|
468
487
|
passed: allOk,
|
|
469
488
|
durationMs: Date.now() - started,
|
|
470
489
|
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
@@ -506,30 +525,37 @@ function stagesInResults(caseResults) {
|
|
|
506
525
|
return FLOW_STAGES.filter((s) => set.has(s));
|
|
507
526
|
}
|
|
508
527
|
/**
|
|
509
|
-
*
|
|
510
|
-
*
|
|
511
|
-
*
|
|
512
|
-
*
|
|
513
|
-
* skipped
|
|
528
|
+
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
529
|
+
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
530
|
+
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
531
|
+
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
532
|
+
* marked skipped (not failed) when no LLM drafting runs.
|
|
514
533
|
*/
|
|
515
534
|
export async function runEval(options) {
|
|
516
|
-
const
|
|
517
|
-
const
|
|
518
|
-
|
|
519
|
-
|
|
535
|
+
const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
536
|
+
const config = options.modelOverride
|
|
537
|
+
? {
|
|
538
|
+
...baseConfig,
|
|
539
|
+
model: options.modelOverride,
|
|
540
|
+
judgeModel: options.modelOverride
|
|
541
|
+
}
|
|
542
|
+
: baseConfig;
|
|
543
|
+
const plannedMode = options.mode ?? config.defaultMode;
|
|
544
|
+
const corpus = plannedMode === "workflow" ? [] : await loadCorpus(options.projectRoot, options.stage);
|
|
545
|
+
const workflowCorpus = plannedMode === "workflow" ? await loadWorkflowCorpus(options.projectRoot) : [];
|
|
520
546
|
const notes = [];
|
|
521
|
-
if (
|
|
547
|
+
if (plannedMode !== "workflow" && corpus.length === 0) {
|
|
522
548
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
523
549
|
}
|
|
524
|
-
if (
|
|
525
|
-
notes.push("Workflow corpus is empty.
|
|
550
|
+
if (plannedMode === "workflow" && workflowCorpus.length === 0) {
|
|
551
|
+
notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
|
|
526
552
|
}
|
|
527
553
|
const flags = resolveRunFlags(options);
|
|
528
554
|
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
529
555
|
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
530
556
|
}
|
|
531
|
-
if (
|
|
532
|
-
notes.push("
|
|
557
|
+
if (plannedMode === "workflow" && !config.apiKey && !options.llmClient) {
|
|
558
|
+
notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
|
|
533
559
|
}
|
|
534
560
|
if (options.dryRun === true) {
|
|
535
561
|
const summary = {
|
|
@@ -547,23 +573,34 @@ export async function runEval(options) {
|
|
|
547
573
|
stages: item.stages.map((s) => s.name)
|
|
548
574
|
}))
|
|
549
575
|
},
|
|
550
|
-
|
|
576
|
+
plannedMode,
|
|
551
577
|
verifiersAvailable: {
|
|
552
578
|
structural: flags.runStructural,
|
|
553
579
|
rules: flags.runRules,
|
|
554
580
|
judge: flags.runJudge,
|
|
555
581
|
workflow: flags.runAgent,
|
|
556
|
-
consistency:
|
|
582
|
+
consistency: plannedMode === "workflow"
|
|
557
583
|
},
|
|
558
584
|
notes
|
|
559
585
|
};
|
|
560
586
|
return summary;
|
|
561
587
|
}
|
|
562
|
-
const costGuard = createCostGuard(options.projectRoot, config);
|
|
588
|
+
const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
|
|
589
|
+
const progress = options.progress ?? noopProgressLogger();
|
|
563
590
|
let wrappedClient;
|
|
564
|
-
const clientNeeded = flags.runJudge ||
|
|
591
|
+
const clientNeeded = flags.runJudge || plannedMode === "workflow";
|
|
565
592
|
if (clientNeeded) {
|
|
566
|
-
const base = options.llmClient ??
|
|
593
|
+
const base = options.llmClient ??
|
|
594
|
+
createEvalClient(config, {
|
|
595
|
+
onRetry: (event) => progress.emit({
|
|
596
|
+
kind: "retry",
|
|
597
|
+
caseId: "llm",
|
|
598
|
+
attempt: event.attempt,
|
|
599
|
+
maxAttempts: event.maxAttempts,
|
|
600
|
+
waitMs: event.waitMs,
|
|
601
|
+
reason: event.error.message
|
|
602
|
+
})
|
|
603
|
+
});
|
|
567
604
|
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
568
605
|
}
|
|
569
606
|
const rubricsNeeded = flags.runJudge;
|
|
@@ -572,32 +609,80 @@ export async function runEval(options) {
|
|
|
572
609
|
: new Map();
|
|
573
610
|
const now = new Date().toISOString();
|
|
574
611
|
const caseResults = [];
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
612
|
+
const totalPlannedCases = plannedMode === "workflow" ? workflowCorpus.length : corpus.length;
|
|
613
|
+
const runStarted = Date.now();
|
|
614
|
+
progress.emit({
|
|
615
|
+
kind: "run-start",
|
|
616
|
+
mode: plannedMode,
|
|
617
|
+
totalCases: totalPlannedCases
|
|
618
|
+
});
|
|
619
|
+
if (plannedMode === "workflow") {
|
|
620
|
+
for (let i = 0; i < workflowCorpus.length; i += 1) {
|
|
621
|
+
const wf = workflowCorpus[i];
|
|
622
|
+
progress.emit({
|
|
623
|
+
kind: "case-start",
|
|
624
|
+
caseId: wf.id,
|
|
625
|
+
stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
|
|
626
|
+
index: i + 1,
|
|
627
|
+
total: workflowCorpus.length
|
|
628
|
+
});
|
|
629
|
+
const result = await runWorkflowCase({
|
|
578
630
|
projectRoot: options.projectRoot,
|
|
579
631
|
workflow: wf,
|
|
580
|
-
|
|
632
|
+
plannedMode,
|
|
581
633
|
flags,
|
|
582
634
|
config,
|
|
583
635
|
client: wrappedClient,
|
|
584
636
|
costGuard,
|
|
585
|
-
rubrics
|
|
586
|
-
|
|
637
|
+
rubrics,
|
|
638
|
+
progress,
|
|
639
|
+
caseIndex: i + 1,
|
|
640
|
+
totalCases: workflowCorpus.length
|
|
641
|
+
});
|
|
642
|
+
progress.emit({
|
|
643
|
+
kind: "case-end",
|
|
644
|
+
caseId: wf.id,
|
|
645
|
+
stage: result.stage,
|
|
646
|
+
index: i + 1,
|
|
647
|
+
total: workflowCorpus.length,
|
|
648
|
+
passed: result.passed,
|
|
649
|
+
durationMs: result.durationMs,
|
|
650
|
+
...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
|
|
651
|
+
});
|
|
652
|
+
caseResults.push(result);
|
|
587
653
|
}
|
|
588
654
|
}
|
|
589
655
|
else {
|
|
590
|
-
for (
|
|
591
|
-
|
|
656
|
+
for (let i = 0; i < corpus.length; i += 1) {
|
|
657
|
+
const item = corpus[i];
|
|
658
|
+
progress.emit({
|
|
659
|
+
kind: "case-start",
|
|
660
|
+
caseId: item.id,
|
|
661
|
+
stage: item.stage,
|
|
662
|
+
index: i + 1,
|
|
663
|
+
total: corpus.length
|
|
664
|
+
});
|
|
665
|
+
const result = await runCase({
|
|
592
666
|
projectRoot: options.projectRoot,
|
|
593
667
|
caseEntry: item,
|
|
594
|
-
|
|
668
|
+
plannedMode,
|
|
595
669
|
flags,
|
|
596
670
|
config,
|
|
597
671
|
client: wrappedClient,
|
|
598
672
|
costGuard,
|
|
599
673
|
rubrics
|
|
600
|
-
})
|
|
674
|
+
});
|
|
675
|
+
progress.emit({
|
|
676
|
+
kind: "case-end",
|
|
677
|
+
caseId: item.id,
|
|
678
|
+
stage: item.stage,
|
|
679
|
+
index: i + 1,
|
|
680
|
+
total: corpus.length,
|
|
681
|
+
passed: result.passed,
|
|
682
|
+
durationMs: result.durationMs,
|
|
683
|
+
...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
|
|
684
|
+
});
|
|
685
|
+
caseResults.push(result);
|
|
601
686
|
}
|
|
602
687
|
}
|
|
603
688
|
const stages = stagesInResults(caseResults);
|
|
@@ -610,7 +695,7 @@ export async function runEval(options) {
|
|
|
610
695
|
cclawVersion: CCLAW_VERSION,
|
|
611
696
|
provider: config.provider,
|
|
612
697
|
model: config.model,
|
|
613
|
-
|
|
698
|
+
mode: plannedMode,
|
|
614
699
|
stages,
|
|
615
700
|
cases: caseResults,
|
|
616
701
|
summary
|
|
@@ -618,5 +703,12 @@ export async function runEval(options) {
|
|
|
618
703
|
const baselineDelta = compareAgainstBaselines(report, baselines);
|
|
619
704
|
if (baselineDelta)
|
|
620
705
|
report.baselineDelta = baselineDelta;
|
|
706
|
+
progress.emit({
|
|
707
|
+
kind: "run-end",
|
|
708
|
+
totalCases: summary.totalCases,
|
|
709
|
+
passed: summary.passed,
|
|
710
|
+
failed: summary.failed,
|
|
711
|
+
durationMs: Date.now() - runStarted
|
|
712
|
+
});
|
|
621
713
|
return report;
|
|
622
714
|
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
export declare const RUNS_DIR = "runs";
|
|
2
|
+
export interface EvalRunStatus {
|
|
3
|
+
id: string;
|
|
4
|
+
startedAt: string;
|
|
5
|
+
endedAt?: string;
|
|
6
|
+
pid: number;
|
|
7
|
+
argv: string[];
|
|
8
|
+
cwd: string;
|
|
9
|
+
exitCode?: number;
|
|
10
|
+
state: "running" | "succeeded" | "failed";
|
|
11
|
+
}
|
|
12
|
+
export declare function runsRoot(projectRoot: string): string;
|
|
13
|
+
export declare function runDir(projectRoot: string, id: string): string;
|
|
14
|
+
export declare function runLogPath(projectRoot: string, id: string): string;
|
|
15
|
+
export declare function runStatusPath(projectRoot: string, id: string): string;
|
|
16
|
+
/**
|
|
17
|
+
* Generate a short, lexicographically-sortable run id. The timestamp
|
|
18
|
+
* prefix means `ls -1` already returns the runs in chronological order
|
|
19
|
+
* which keeps the `runs list` subcommand trivial.
|
|
20
|
+
*/
|
|
21
|
+
export declare function generateRunId(now?: Date): string;
|
|
22
|
+
export declare function ensureRunDir(projectRoot: string, id: string): Promise<string>;
|
|
23
|
+
export declare function writeRunStatus(projectRoot: string, status: EvalRunStatus): Promise<void>;
|
|
24
|
+
export declare function readRunStatus(projectRoot: string, id: string): Promise<EvalRunStatus | null>;
|
|
25
|
+
/**
|
|
26
|
+
* List run ids under `.cclaw/evals/runs/`, most recent first. Directory
|
|
27
|
+
* entries that don't contain a `run.json` are skipped (half-initialized
|
|
28
|
+
* or manually mkdir'd folders).
|
|
29
|
+
*/
|
|
30
|
+
export declare function listRuns(projectRoot: string): Promise<EvalRunStatus[]>;
|
|
31
|
+
/**
|
|
32
|
+
* Resolve `"latest"` (or undefined) to the most recent run id.
|
|
33
|
+
* Returns `null` when there are no runs.
|
|
34
|
+
*/
|
|
35
|
+
export declare function resolveRunId(projectRoot: string, hint: string | undefined): Promise<string | null>;
|
|
36
|
+
/**
|
|
37
|
+
* Cheap liveness probe for an EvalRunStatus. A `run.json` can be stale
|
|
38
|
+
* (process crashed mid-commit), so we double-check with `kill(pid, 0)`
|
|
39
|
+
* before trusting the `state: "running"` field.
|
|
40
|
+
*/
|
|
41
|
+
export declare function isRunAlive(status: EvalRunStatus): boolean;
|