cclaw-cli 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -2
- package/dist/cli.js +388 -18
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +22 -16
- package/dist/eval/agents/workflow.d.ts +31 -0
- package/dist/eval/agents/workflow.js +135 -0
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +52 -19
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +36 -1
- package/dist/eval/runner.d.ts +37 -8
- package/dist/eval/runner.js +351 -42
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +158 -15
- package/dist/eval/types.js +39 -7
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
|
|
3
|
-
import type {
|
|
3
|
+
import type { EvalMode } from "./eval/types.js";
|
|
4
4
|
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
|
|
5
5
|
interface ParsedArgs {
|
|
6
6
|
command?: CommandName;
|
|
@@ -18,7 +18,7 @@ interface ParsedArgs {
|
|
|
18
18
|
archiveSkipRetro?: boolean;
|
|
19
19
|
archiveSkipRetroReason?: string;
|
|
20
20
|
evalStage?: string;
|
|
21
|
-
|
|
21
|
+
evalMode?: EvalMode;
|
|
22
22
|
evalSchemaOnly?: boolean;
|
|
23
23
|
evalRules?: boolean;
|
|
24
24
|
evalJudge?: boolean;
|
|
@@ -26,6 +26,14 @@ interface ParsedArgs {
|
|
|
26
26
|
evalNoWrite?: boolean;
|
|
27
27
|
evalUpdateBaseline?: boolean;
|
|
28
28
|
evalConfirm?: boolean;
|
|
29
|
+
evalQuiet?: boolean;
|
|
30
|
+
evalMaxCostUsd?: number;
|
|
31
|
+
/** Optional subcommand after `eval`. */
|
|
32
|
+
evalSubcommand?: "diff" | "runs";
|
|
33
|
+
/** Positional arguments for eval subcommands (e.g. `diff <old> <new>`). */
|
|
34
|
+
evalArgs?: string[];
|
|
35
|
+
evalBackground?: boolean;
|
|
36
|
+
evalCompareModel?: string;
|
|
29
37
|
showHelp?: boolean;
|
|
30
38
|
showVersion?: boolean;
|
|
31
39
|
}
|
package/dist/cli.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { readFileSync, realpathSync } from "node:fs";
|
|
2
|
+
import { createReadStream, readFileSync, realpathSync } from "node:fs";
|
|
3
|
+
import { spawn } from "node:child_process";
|
|
4
|
+
import fs from "node:fs/promises";
|
|
3
5
|
import process from "node:process";
|
|
4
6
|
import path from "node:path";
|
|
5
7
|
import { createInterface } from "node:readline/promises";
|
|
@@ -14,9 +16,13 @@ import { createDefaultConfig, createProfileConfig } from "./config.js";
|
|
|
14
16
|
import { detectHarnesses } from "./init-detect.js";
|
|
15
17
|
import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
16
18
|
import { runEval } from "./eval/runner.js";
|
|
19
|
+
import { createStderrProgressLogger } from "./eval/progress.js";
|
|
17
20
|
import { writeBaselinesFromReport } from "./eval/baseline.js";
|
|
18
21
|
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
19
|
-
import {
|
|
22
|
+
import { formatDiffMarkdown, runEvalDiff } from "./eval/diff.js";
|
|
23
|
+
import { ensureRunDir, generateRunId, isRunAlive, listRuns, readRunStatus, resolveRunId, runLogPath, writeRunStatus } from "./eval/runs.js";
|
|
24
|
+
import { EVAL_MODES } from "./eval/types.js";
|
|
25
|
+
import { parseModeInput } from "./eval/mode.js";
|
|
20
26
|
import { FLOW_STAGES } from "./types.js";
|
|
21
27
|
const INSTALLER_COMMANDS = [
|
|
22
28
|
"init",
|
|
@@ -55,16 +61,41 @@ Commands:
|
|
|
55
61
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
56
62
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
57
63
|
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
|
|
58
|
-
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
59
|
-
--
|
|
64
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}) for fixture/agent modes.
|
|
65
|
+
--mode=<${EVAL_MODES.join("|")}>
|
|
66
|
+
Evaluation mode:
|
|
67
|
+
fixture = verify existing artifacts with structural/rule/judge verifiers.
|
|
68
|
+
agent = LLM drafts one stage's artifact in a sandbox with tools.
|
|
69
|
+
workflow = LLM runs the full multi-stage flow (brainstorm→plan).
|
|
70
|
+
Legacy --tier=A|B|C still works (deprecated).
|
|
60
71
|
--schema-only Run only structural verifiers (default).
|
|
61
72
|
--rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
|
|
62
|
-
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY;
|
|
73
|
+
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; fixture mode judges an existing artifact, agent/workflow modes draft first and then judge.
|
|
63
74
|
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
75
|
--json Emit machine-readable JSON on stdout.
|
|
65
76
|
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
66
77
|
--update-baseline Overwrite baselines from the current run (requires --confirm).
|
|
67
78
|
--confirm Acknowledge --update-baseline (prevents accidental resets).
|
|
79
|
+
--quiet Silence the stderr progress logger (default: emit one
|
|
80
|
+
line per case / stage to stderr so long runs are visible).
|
|
81
|
+
--max-cost-usd=<n> Abort the run if committed USD spend crosses <n>
|
|
82
|
+
(independent from the daily cap). Also readable from
|
|
83
|
+
CCLAW_EVAL_MAX_COST_USD.
|
|
84
|
+
--compare-model=<id> Run the same corpus twice — once with the configured model
|
|
85
|
+
and once with <id> — then diff the summaries. Exit code 1
|
|
86
|
+
when the override model regressed.
|
|
87
|
+
--background Spawn the run as a detached child process, write the
|
|
88
|
+
combined output to .cclaw/evals/runs/<id>/run.log, and
|
|
89
|
+
return immediately. Attach later with
|
|
90
|
+
\`cclaw eval runs tail <id|latest>\`.
|
|
91
|
+
|
|
92
|
+
Subcommands:
|
|
93
|
+
diff <old> <new> Compare two reports under .cclaw/evals/reports/.
|
|
94
|
+
Each argument is a cclawVersion (e.g. 0.26.0), a filename,
|
|
95
|
+
or the literal "latest". Exit code 1 when the diff shows a
|
|
96
|
+
regression. Accepts --json to emit machine-readable output.
|
|
97
|
+
runs [action] [id] Inspect background runs under .cclaw/evals/runs/.
|
|
98
|
+
Actions: list (default) | status <id|latest> | tail <id|latest>.
|
|
68
99
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
69
100
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
70
101
|
|
|
@@ -78,8 +109,10 @@ Examples:
|
|
|
78
109
|
cclaw archive --name=payments-revamp
|
|
79
110
|
cclaw eval --dry-run
|
|
80
111
|
cclaw eval --stage=brainstorm --schema-only
|
|
81
|
-
cclaw eval --judge --
|
|
82
|
-
cclaw eval --judge --
|
|
112
|
+
cclaw eval --judge --mode=fixture --stage=brainstorm
|
|
113
|
+
cclaw eval --judge --mode=agent --stage=spec
|
|
114
|
+
cclaw eval --mode=workflow --judge
|
|
115
|
+
cclaw eval diff 0.26.0 latest
|
|
83
116
|
|
|
84
117
|
Docs: https://github.com/zuevrs/cclaw
|
|
85
118
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -135,12 +168,17 @@ function parseProfile(raw) {
|
|
|
135
168
|
}
|
|
136
169
|
return trimmed;
|
|
137
170
|
}
|
|
138
|
-
function
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
}
|
|
143
|
-
|
|
171
|
+
function parseLegacyTier(raw) {
|
|
172
|
+
return parseModeInput(raw.toUpperCase(), {
|
|
173
|
+
source: "cli",
|
|
174
|
+
raw: `--tier=${raw}`
|
|
175
|
+
});
|
|
176
|
+
}
|
|
177
|
+
function parseEvalMode(raw) {
|
|
178
|
+
return parseModeInput(raw, {
|
|
179
|
+
source: "cli",
|
|
180
|
+
raw: `--mode=${raw}`
|
|
181
|
+
});
|
|
144
182
|
}
|
|
145
183
|
function parseEvalStage(raw) {
|
|
146
184
|
const trimmed = raw.trim();
|
|
@@ -363,6 +401,18 @@ function printDoctorText(ctx, checks, options) {
|
|
|
363
401
|
ctx.stdout.write("Doctor status: HEALTHY (no failing error checks)\n");
|
|
364
402
|
}
|
|
365
403
|
}
|
|
404
|
+
function resolveMaxCostOption(fromCli, env) {
|
|
405
|
+
if (fromCli !== undefined)
|
|
406
|
+
return { maxCostUsd: fromCli };
|
|
407
|
+
const raw = env.CCLAW_EVAL_MAX_COST_USD;
|
|
408
|
+
if (raw === undefined || raw.trim() === "")
|
|
409
|
+
return {};
|
|
410
|
+
const value = Number(raw);
|
|
411
|
+
if (!Number.isFinite(value) || value <= 0) {
|
|
412
|
+
throw new Error(`CCLAW_EVAL_MAX_COST_USD must be a positive number, got: ${raw}`);
|
|
413
|
+
}
|
|
414
|
+
return { maxCostUsd: value };
|
|
415
|
+
}
|
|
366
416
|
function parseArgs(argv) {
|
|
367
417
|
const parsed = {};
|
|
368
418
|
const helpFlag = argv.find((arg) => arg === "--help" || arg === "-h");
|
|
@@ -373,10 +423,45 @@ function parseArgs(argv) {
|
|
|
373
423
|
if (versionFlag) {
|
|
374
424
|
parsed.showVersion = true;
|
|
375
425
|
}
|
|
376
|
-
const
|
|
426
|
+
const filteredArgv = argv.filter((arg) => arg !== "--help" && arg !== "-h" && arg !== "--version" && arg !== "-v");
|
|
427
|
+
const [commandRaw, ...rest] = filteredArgv;
|
|
377
428
|
parsed.command = INSTALLER_COMMANDS.includes(commandRaw)
|
|
378
429
|
? commandRaw
|
|
379
430
|
: undefined;
|
|
431
|
+
// For `eval`, the next non-flag argument is an optional subcommand. Any
|
|
432
|
+
// subsequent non-flag tokens are captured as evalArgs (consumed by the
|
|
433
|
+
// subcommand handler). This preserves backwards compat: callers that run
|
|
434
|
+
// `cclaw eval --dry-run` see no subcommand and no positional args.
|
|
435
|
+
let flags = rest;
|
|
436
|
+
if (parsed.command === "eval") {
|
|
437
|
+
const evalArgs = [];
|
|
438
|
+
const remainder = [];
|
|
439
|
+
let sawSubcommand = false;
|
|
440
|
+
for (const token of rest) {
|
|
441
|
+
if (token.startsWith("--")) {
|
|
442
|
+
remainder.push(token);
|
|
443
|
+
continue;
|
|
444
|
+
}
|
|
445
|
+
if (!sawSubcommand) {
|
|
446
|
+
if (token === "diff") {
|
|
447
|
+
parsed.evalSubcommand = "diff";
|
|
448
|
+
sawSubcommand = true;
|
|
449
|
+
}
|
|
450
|
+
else if (token === "runs") {
|
|
451
|
+
parsed.evalSubcommand = "runs";
|
|
452
|
+
sawSubcommand = true;
|
|
453
|
+
}
|
|
454
|
+
else {
|
|
455
|
+
evalArgs.push(token);
|
|
456
|
+
}
|
|
457
|
+
continue;
|
|
458
|
+
}
|
|
459
|
+
evalArgs.push(token);
|
|
460
|
+
}
|
|
461
|
+
if (evalArgs.length > 0)
|
|
462
|
+
parsed.evalArgs = evalArgs;
|
|
463
|
+
flags = remainder;
|
|
464
|
+
}
|
|
380
465
|
for (const flag of flags) {
|
|
381
466
|
if (flag.startsWith("--harnesses=")) {
|
|
382
467
|
parsed.harnesses = parseHarnesses(flag.replace("--harnesses=", ""));
|
|
@@ -438,8 +523,12 @@ function parseArgs(argv) {
|
|
|
438
523
|
parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
|
|
439
524
|
continue;
|
|
440
525
|
}
|
|
526
|
+
if (flag.startsWith("--mode=")) {
|
|
527
|
+
parsed.evalMode = parseEvalMode(flag.replace("--mode=", ""));
|
|
528
|
+
continue;
|
|
529
|
+
}
|
|
441
530
|
if (flag.startsWith("--tier=")) {
|
|
442
|
-
parsed.
|
|
531
|
+
parsed.evalMode = parseLegacyTier(flag.replace("--tier=", ""));
|
|
443
532
|
continue;
|
|
444
533
|
}
|
|
445
534
|
if (flag === "--schema-only") {
|
|
@@ -466,14 +555,245 @@ function parseArgs(argv) {
|
|
|
466
555
|
parsed.evalConfirm = true;
|
|
467
556
|
continue;
|
|
468
557
|
}
|
|
558
|
+
if (flag === "--background") {
|
|
559
|
+
parsed.evalBackground = true;
|
|
560
|
+
continue;
|
|
561
|
+
}
|
|
562
|
+
if (flag.startsWith("--compare-model=")) {
|
|
563
|
+
const value = flag.replace("--compare-model=", "").trim();
|
|
564
|
+
if (value.length === 0) {
|
|
565
|
+
throw new Error(`--compare-model requires a non-empty model id (e.g. --compare-model=gpt-4o-mini).`);
|
|
566
|
+
}
|
|
567
|
+
parsed.evalCompareModel = value;
|
|
568
|
+
continue;
|
|
569
|
+
}
|
|
570
|
+
if (flag.startsWith("--max-cost-usd=")) {
|
|
571
|
+
const raw = flag.replace("--max-cost-usd=", "").trim();
|
|
572
|
+
const value = Number(raw);
|
|
573
|
+
if (!Number.isFinite(value) || value <= 0) {
|
|
574
|
+
throw new Error(`--max-cost-usd requires a positive number, got: ${raw}`);
|
|
575
|
+
}
|
|
576
|
+
parsed.evalMaxCostUsd = value;
|
|
577
|
+
continue;
|
|
578
|
+
}
|
|
469
579
|
}
|
|
470
580
|
// `--json` is shared between doctor and eval. Disambiguate by command.
|
|
471
581
|
if (parsed.command === "eval" && parsed.doctorJson === true) {
|
|
472
582
|
parsed.evalJson = true;
|
|
473
583
|
parsed.doctorJson = undefined;
|
|
474
584
|
}
|
|
585
|
+
// `--quiet` on `eval` silences the stderr progress logger. On doctor it
|
|
586
|
+
// continues to mean "print only failing checks" — the flag slot is the
|
|
587
|
+
// same, the semantics depend on which command owns the invocation.
|
|
588
|
+
if (parsed.command === "eval" && parsed.doctorQuiet === true) {
|
|
589
|
+
parsed.evalQuiet = true;
|
|
590
|
+
parsed.doctorQuiet = undefined;
|
|
591
|
+
}
|
|
475
592
|
return parsed;
|
|
476
593
|
}
|
|
594
|
+
/**
|
|
595
|
+
* Spawn `cclaw eval` (without `--background`) in a detached child process
|
|
596
|
+
* and return immediately. The child's stdout+stderr are piped to
|
|
597
|
+
* `.cclaw/evals/runs/<id>/run.log` so the user can attach later with
|
|
598
|
+
* `cclaw eval runs tail`. We do NOT wait for the child — the whole point
|
|
599
|
+
* is to free the terminal while a multi-minute workflow-mode run
|
|
600
|
+
* proceeds in the background.
|
|
601
|
+
*/
|
|
602
|
+
async function spawnBackgroundEval(parsed, ctx) {
|
|
603
|
+
const id = generateRunId();
|
|
604
|
+
await ensureRunDir(ctx.cwd, id);
|
|
605
|
+
const logPath = runLogPath(ctx.cwd, id);
|
|
606
|
+
const childArgv = process.argv.slice(2).filter((a) => a !== "--background");
|
|
607
|
+
const cliEntry = process.argv[1];
|
|
608
|
+
if (!cliEntry) {
|
|
609
|
+
error(ctx, "Could not resolve cclaw entrypoint for --background.");
|
|
610
|
+
return 1;
|
|
611
|
+
}
|
|
612
|
+
const logHandle = await fs.open(logPath, "a");
|
|
613
|
+
try {
|
|
614
|
+
const child = spawn(process.execPath, [cliEntry, ...childArgv], {
|
|
615
|
+
cwd: ctx.cwd,
|
|
616
|
+
detached: true,
|
|
617
|
+
stdio: ["ignore", logHandle.fd, logHandle.fd],
|
|
618
|
+
env: process.env
|
|
619
|
+
});
|
|
620
|
+
const pid = child.pid ?? -1;
|
|
621
|
+
await writeRunStatus(ctx.cwd, {
|
|
622
|
+
id,
|
|
623
|
+
startedAt: new Date().toISOString(),
|
|
624
|
+
pid,
|
|
625
|
+
argv: childArgv,
|
|
626
|
+
cwd: ctx.cwd,
|
|
627
|
+
state: "running"
|
|
628
|
+
});
|
|
629
|
+
child.unref();
|
|
630
|
+
const finalize = async (code) => {
|
|
631
|
+
const current = await readRunStatus(ctx.cwd, id);
|
|
632
|
+
if (!current)
|
|
633
|
+
return;
|
|
634
|
+
const exitCode = typeof code === "number" ? code : -1;
|
|
635
|
+
await writeRunStatus(ctx.cwd, {
|
|
636
|
+
...current,
|
|
637
|
+
endedAt: new Date().toISOString(),
|
|
638
|
+
exitCode,
|
|
639
|
+
state: exitCode === 0 ? "succeeded" : "failed"
|
|
640
|
+
});
|
|
641
|
+
};
|
|
642
|
+
child.on("exit", (code) => {
|
|
643
|
+
void finalize(code);
|
|
644
|
+
});
|
|
645
|
+
child.on("error", (err) => {
|
|
646
|
+
void writeRunStatus(ctx.cwd, {
|
|
647
|
+
id,
|
|
648
|
+
startedAt: new Date().toISOString(),
|
|
649
|
+
pid,
|
|
650
|
+
argv: childArgv,
|
|
651
|
+
cwd: ctx.cwd,
|
|
652
|
+
endedAt: new Date().toISOString(),
|
|
653
|
+
exitCode: -1,
|
|
654
|
+
state: "failed"
|
|
655
|
+
});
|
|
656
|
+
error(ctx, `Background eval failed to start: ${err.message}`);
|
|
657
|
+
});
|
|
658
|
+
ctx.stdout.write(`cclaw eval: background run id=${id} pid=${pid}\n` +
|
|
659
|
+
` log: ${logPath}\n` +
|
|
660
|
+
` tail: cclaw eval runs tail ${id}\n` +
|
|
661
|
+
` status: cclaw eval runs status ${id}\n`);
|
|
662
|
+
return 0;
|
|
663
|
+
}
|
|
664
|
+
finally {
|
|
665
|
+
await logHandle.close();
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
function formatRunRow(status) {
|
|
669
|
+
const ended = status.endedAt ? ` ended=${status.endedAt}` : "";
|
|
670
|
+
const exitCode = status.exitCode !== undefined ? ` exit=${status.exitCode}` : "";
|
|
671
|
+
const alive = status.state === "running" ? (isRunAlive(status) ? "" : " (stale)") : "";
|
|
672
|
+
return `${status.id} state=${status.state}${alive} pid=${status.pid} started=${status.startedAt}${ended}${exitCode}`;
|
|
673
|
+
}
|
|
674
|
+
async function runEvalRunsSubcommand(parsed, ctx) {
|
|
675
|
+
const args = parsed.evalArgs ?? [];
|
|
676
|
+
const action = args[0] ?? "list";
|
|
677
|
+
if (action === "list") {
|
|
678
|
+
const runs = await listRuns(ctx.cwd);
|
|
679
|
+
if (runs.length === 0) {
|
|
680
|
+
ctx.stdout.write("No eval runs recorded under .cclaw/evals/runs/.\n");
|
|
681
|
+
return 0;
|
|
682
|
+
}
|
|
683
|
+
if (parsed.evalJson === true) {
|
|
684
|
+
ctx.stdout.write(`${JSON.stringify(runs, null, 2)}\n`);
|
|
685
|
+
return 0;
|
|
686
|
+
}
|
|
687
|
+
for (const run of runs)
|
|
688
|
+
ctx.stdout.write(`${formatRunRow(run)}\n`);
|
|
689
|
+
return 0;
|
|
690
|
+
}
|
|
691
|
+
if (action === "status") {
|
|
692
|
+
const id = await resolveRunId(ctx.cwd, args[1]);
|
|
693
|
+
if (!id) {
|
|
694
|
+
error(ctx, `No such run: ${args[1] ?? "(none recorded)"}`);
|
|
695
|
+
return 1;
|
|
696
|
+
}
|
|
697
|
+
const status = await readRunStatus(ctx.cwd, id);
|
|
698
|
+
if (!status) {
|
|
699
|
+
error(ctx, `Run ${id} has no status file.`);
|
|
700
|
+
return 1;
|
|
701
|
+
}
|
|
702
|
+
if (parsed.evalJson === true) {
|
|
703
|
+
ctx.stdout.write(`${JSON.stringify(status, null, 2)}\n`);
|
|
704
|
+
}
|
|
705
|
+
else {
|
|
706
|
+
ctx.stdout.write(`${formatRunRow(status)}\n`);
|
|
707
|
+
ctx.stdout.write(`log: ${runLogPath(ctx.cwd, id)}\n`);
|
|
708
|
+
}
|
|
709
|
+
return status.state === "failed" ? 1 : 0;
|
|
710
|
+
}
|
|
711
|
+
if (action === "tail") {
|
|
712
|
+
const id = await resolveRunId(ctx.cwd, args[1]);
|
|
713
|
+
if (!id) {
|
|
714
|
+
error(ctx, `No such run: ${args[1] ?? "(none recorded)"}`);
|
|
715
|
+
return 1;
|
|
716
|
+
}
|
|
717
|
+
const logFile = runLogPath(ctx.cwd, id);
|
|
718
|
+
const stream = createReadStream(logFile, { encoding: "utf8" });
|
|
719
|
+
await new Promise((resolve, reject) => {
|
|
720
|
+
stream.on("data", (chunk) => ctx.stdout.write(chunk));
|
|
721
|
+
stream.on("end", () => resolve());
|
|
722
|
+
stream.on("error", reject);
|
|
723
|
+
});
|
|
724
|
+
return 0;
|
|
725
|
+
}
|
|
726
|
+
error(ctx, `Unknown \`cclaw eval runs\` action: ${action}. Use list | status | tail.`);
|
|
727
|
+
return 1;
|
|
728
|
+
}
|
|
729
|
+
/**
|
|
730
|
+
* Run the same corpus twice — once against the configured model, once
|
|
731
|
+
* against `--compare-model=<id>` — and print a summary comparing the
|
|
732
|
+
* two. Both reports are written to `.cclaw/evals/reports/` (unless
|
|
733
|
+
* `--no-write` is set) and a unified diff is emitted to stdout. Exit
|
|
734
|
+
* code is 1 when the override model regressed against the baseline
|
|
735
|
+
* model, 0 otherwise.
|
|
736
|
+
*/
|
|
737
|
+
async function runCompareModel(parsed, ctx, progress) {
|
|
738
|
+
const baselineOpts = {
|
|
739
|
+
projectRoot: ctx.cwd,
|
|
740
|
+
stage: parsed.evalStage,
|
|
741
|
+
mode: parsed.evalMode,
|
|
742
|
+
schemaOnly: parsed.evalSchemaOnly === true,
|
|
743
|
+
rules: parsed.evalRules === true,
|
|
744
|
+
judge: parsed.evalJudge === true,
|
|
745
|
+
...(progress ? { progress } : {}),
|
|
746
|
+
...resolveMaxCostOption(parsed.evalMaxCostUsd, process.env)
|
|
747
|
+
};
|
|
748
|
+
ctx.stderr.write(`[cclaw eval] compare: running baseline model...\n`);
|
|
749
|
+
const baseline = await runEval(baselineOpts);
|
|
750
|
+
if ("kind" in baseline) {
|
|
751
|
+
error(ctx, "--compare-model is incompatible with --dry-run.");
|
|
752
|
+
return 1;
|
|
753
|
+
}
|
|
754
|
+
ctx.stderr.write(`[cclaw eval] compare: running ${parsed.evalCompareModel} ...\n`);
|
|
755
|
+
const candidate = await runEval({
|
|
756
|
+
...baselineOpts,
|
|
757
|
+
modelOverride: parsed.evalCompareModel
|
|
758
|
+
});
|
|
759
|
+
if ("kind" in candidate) {
|
|
760
|
+
error(ctx, "--compare-model received an unexpected dry-run response.");
|
|
761
|
+
return 1;
|
|
762
|
+
}
|
|
763
|
+
if (parsed.evalNoWrite !== true) {
|
|
764
|
+
await writeJsonReport(ctx.cwd, baseline);
|
|
765
|
+
await writeMarkdownReport(ctx.cwd, baseline);
|
|
766
|
+
await writeJsonReport(ctx.cwd, candidate);
|
|
767
|
+
await writeMarkdownReport(ctx.cwd, candidate);
|
|
768
|
+
}
|
|
769
|
+
const passDelta = candidate.summary.passed - baseline.summary.passed;
|
|
770
|
+
const failDelta = candidate.summary.failed - baseline.summary.failed;
|
|
771
|
+
const costDelta = candidate.summary.totalCostUsd - baseline.summary.totalCostUsd;
|
|
772
|
+
if (parsed.evalJson === true) {
|
|
773
|
+
ctx.stdout.write(`${JSON.stringify({
|
|
774
|
+
baseline: {
|
|
775
|
+
model: baseline.model,
|
|
776
|
+
summary: baseline.summary
|
|
777
|
+
},
|
|
778
|
+
candidate: {
|
|
779
|
+
model: candidate.model,
|
|
780
|
+
summary: candidate.summary
|
|
781
|
+
},
|
|
782
|
+
delta: { passed: passDelta, failed: failDelta, costUsd: costDelta }
|
|
783
|
+
}, null, 2)}\n`);
|
|
784
|
+
}
|
|
785
|
+
else {
|
|
786
|
+
ctx.stdout.write(`cclaw eval compare-model:\n` +
|
|
787
|
+
` baseline ${baseline.model}: pass=${baseline.summary.passed}/${baseline.summary.totalCases} ` +
|
|
788
|
+
`fail=${baseline.summary.failed} cost=$${baseline.summary.totalCostUsd.toFixed(4)}\n` +
|
|
789
|
+
` candidate ${candidate.model}: pass=${candidate.summary.passed}/${candidate.summary.totalCases} ` +
|
|
790
|
+
`fail=${candidate.summary.failed} cost=$${candidate.summary.totalCostUsd.toFixed(4)}\n` +
|
|
791
|
+
` delta: passed=${passDelta >= 0 ? "+" : ""}${passDelta} ` +
|
|
792
|
+
`failed=${failDelta >= 0 ? "+" : ""}${failDelta} ` +
|
|
793
|
+
`cost=${costDelta >= 0 ? "+" : ""}$${costDelta.toFixed(4)}\n`);
|
|
794
|
+
}
|
|
795
|
+
return failDelta > 0 ? 1 : 0;
|
|
796
|
+
}
|
|
477
797
|
async function runCommand(parsed, ctx) {
|
|
478
798
|
if (parsed.showHelp) {
|
|
479
799
|
ctx.stdout.write(usage());
|
|
@@ -567,15 +887,59 @@ async function runCommand(parsed, ctx) {
|
|
|
567
887
|
info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
|
|
568
888
|
return 0;
|
|
569
889
|
}
|
|
890
|
+
if (command === "eval" && parsed.evalSubcommand === "runs") {
|
|
891
|
+
return runEvalRunsSubcommand(parsed, ctx);
|
|
892
|
+
}
|
|
893
|
+
if (command === "eval" && parsed.evalBackground === true) {
|
|
894
|
+
return spawnBackgroundEval(parsed, ctx);
|
|
895
|
+
}
|
|
896
|
+
if (command === "eval" && parsed.evalSubcommand === "diff") {
|
|
897
|
+
const args = parsed.evalArgs ?? [];
|
|
898
|
+
if (args.length !== 2) {
|
|
899
|
+
error(ctx, `\`cclaw eval diff\` requires two arguments: <old> <new>. ` +
|
|
900
|
+
`Example: cclaw eval diff 0.26.0 latest`);
|
|
901
|
+
return 1;
|
|
902
|
+
}
|
|
903
|
+
const [oldSel, newSel] = args;
|
|
904
|
+
try {
|
|
905
|
+
const diff = await runEvalDiff({
|
|
906
|
+
projectRoot: ctx.cwd,
|
|
907
|
+
old: oldSel,
|
|
908
|
+
new: newSel
|
|
909
|
+
});
|
|
910
|
+
if (parsed.evalJson === true) {
|
|
911
|
+
ctx.stdout.write(`${JSON.stringify(diff, null, 2)}\n`);
|
|
912
|
+
}
|
|
913
|
+
else {
|
|
914
|
+
ctx.stdout.write(formatDiffMarkdown(diff));
|
|
915
|
+
}
|
|
916
|
+
return diff.regressed ? 1 : 0;
|
|
917
|
+
}
|
|
918
|
+
catch (err) {
|
|
919
|
+
error(ctx, err instanceof Error ? err.message : String(err));
|
|
920
|
+
return 1;
|
|
921
|
+
}
|
|
922
|
+
}
|
|
570
923
|
if (command === "eval") {
|
|
924
|
+
const wantProgress = parsed.evalQuiet !== true &&
|
|
925
|
+
parsed.dryRun !== true &&
|
|
926
|
+
parsed.evalJson !== true;
|
|
927
|
+
const progress = wantProgress
|
|
928
|
+
? createStderrProgressLogger({ writer: (s) => ctx.stderr.write(s) })
|
|
929
|
+
: undefined;
|
|
930
|
+
if (parsed.evalCompareModel !== undefined) {
|
|
931
|
+
return runCompareModel(parsed, ctx, progress);
|
|
932
|
+
}
|
|
571
933
|
const result = await runEval({
|
|
572
934
|
projectRoot: ctx.cwd,
|
|
573
935
|
stage: parsed.evalStage,
|
|
574
|
-
|
|
936
|
+
mode: parsed.evalMode,
|
|
575
937
|
schemaOnly: parsed.evalSchemaOnly === true,
|
|
576
938
|
rules: parsed.evalRules === true,
|
|
577
939
|
judge: parsed.evalJudge === true,
|
|
578
|
-
dryRun: parsed.dryRun === true
|
|
940
|
+
dryRun: parsed.dryRun === true,
|
|
941
|
+
...(progress ? { progress } : {}),
|
|
942
|
+
...resolveMaxCostOption(parsed.evalMaxCostUsd, process.env)
|
|
579
943
|
});
|
|
580
944
|
if ("kind" in result) {
|
|
581
945
|
if (parsed.evalJson === true) {
|
|
@@ -588,11 +952,17 @@ async function runCommand(parsed, ctx) {
|
|
|
588
952
|
ctx.stdout.write(` model: ${result.config.model}\n`);
|
|
589
953
|
ctx.stdout.write(` source: ${result.config.source}\n`);
|
|
590
954
|
ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
|
|
591
|
-
ctx.stdout.write(`
|
|
955
|
+
ctx.stdout.write(` mode: ${result.plannedMode}\n`);
|
|
592
956
|
ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
|
|
593
957
|
for (const [stage, count] of Object.entries(result.corpus.byStage)) {
|
|
594
958
|
ctx.stdout.write(` - ${stage}: ${count}\n`);
|
|
595
959
|
}
|
|
960
|
+
if (result.workflowCorpus.total > 0 || result.plannedMode === "workflow") {
|
|
961
|
+
ctx.stdout.write(` workflow corpus: ${result.workflowCorpus.total} case(s)\n`);
|
|
962
|
+
for (const wf of result.workflowCorpus.cases) {
|
|
963
|
+
ctx.stdout.write(` - ${wf.id}: ${wf.stages.join(" → ")}\n`);
|
|
964
|
+
}
|
|
965
|
+
}
|
|
596
966
|
ctx.stdout.write(` verifiers available:\n`);
|
|
597
967
|
for (const [key, value] of Object.entries(result.verifiersAvailable)) {
|
|
598
968
|
ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
5
|
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
6
|
*/
|
|
7
|
-
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default
|
|
8
|
-
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land.
|
|
7
|
+
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default evaluation mode when --mode is not supplied.\n# fixture = verify existing artifacts (cheap, LLM-free unless --judge is set)\n# agent = LLM drafts one stage's artifact in a sandbox with tools\n# workflow = LLM runs the full multi-stage flow (brainstorm \u2192 plan)\n# (Legacy alias --tier=A|B|C still works; A\u2192fixture, B\u2192agent, C\u2192workflow.)\ndefaultMode: fixture\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
|
|
8
|
+
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Agent/workflow runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
|
|
9
9
|
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale. The runner picks `<stage>.yaml` when\n`cclaw eval --judge` is invoked; every stage ships a starter rubric\nbelow \u2014 edit the checks to match what your team cares about, and add\n`critical: true` to the checks that should hard-fail nightly CI on\nregression.\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n critical: false\n```\n\nSee `docs/evals.md` for the full schema.\n";
|
|
10
10
|
export declare const EVAL_RUBRIC_FILES: ReadonlyArray<{
|
|
11
11
|
stage: string;
|
|
@@ -13,11 +13,12 @@ provider: zai
|
|
|
13
13
|
baseUrl: https://api.z.ai/api/coding/paas/v4
|
|
14
14
|
model: glm-5.1
|
|
15
15
|
|
|
16
|
-
# Default
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
|
|
16
|
+
# Default evaluation mode when --mode is not supplied.
|
|
17
|
+
# fixture = verify existing artifacts (cheap, LLM-free unless --judge is set)
|
|
18
|
+
# agent = LLM drafts one stage's artifact in a sandbox with tools
|
|
19
|
+
# workflow = LLM runs the full multi-stage flow (brainstorm → plan)
|
|
20
|
+
# (Legacy alias --tier=A|B|C still works; A→fixture, B→agent, C→workflow.)
|
|
21
|
+
defaultMode: fixture
|
|
21
22
|
|
|
22
23
|
# Per-call timeout and retry budget.
|
|
23
24
|
timeoutMs: 120000
|
|
@@ -51,7 +52,7 @@ expected:
|
|
|
51
52
|
\`\`\`
|
|
52
53
|
|
|
53
54
|
Start with 3 structural cases per stage (24 total), then expand to 5 per
|
|
54
|
-
stage (40 total) once rule verifiers land.
|
|
55
|
+
stage (40 total) once rule verifiers land. Agent/workflow runs may add
|
|
55
56
|
\`context_files\` pulled from real projects to exercise the sandbox.
|
|
56
57
|
`;
|
|
57
58
|
export const EVAL_RUBRICS_README = `# Eval Rubrics
|
|
@@ -23,5 +23,5 @@ export interface SingleShotOutput {
|
|
|
23
23
|
userPrompt: string;
|
|
24
24
|
}
|
|
25
25
|
export declare function loadStageSkill(projectRoot: string, stage: FlowStage): Promise<string>;
|
|
26
|
-
/** Run the
|
|
26
|
+
/** Run the single-shot AUT (fixture mode + --judge) and return the produced artifact. */
|
|
27
27
|
export declare function runSingleShot(input: SingleShotInput): Promise<SingleShotOutput>;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Single-shot agent used by fixture mode when `--judge` is set.
|
|
3
3
|
*
|
|
4
4
|
* Simplest realistic AUT: one LLM call with the stage's SKILL.md as the
|
|
5
5
|
* system prompt and the case's `inputPrompt` as the user message. Output
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* Design notes:
|
|
10
10
|
*
|
|
11
11
|
* - No tools. No multi-turn. No reads of the project beyond the one
|
|
12
|
-
* SKILL.md.
|
|
12
|
+
* SKILL.md. agent/workflow modes layer complexity on top.
|
|
13
13
|
* - Errors are propagated as-is (`EvalLlmError` subclasses) so the
|
|
14
14
|
* runner can surface them as verifier failures without swallowing the
|
|
15
15
|
* cause.
|
|
@@ -27,7 +27,7 @@ export async function loadStageSkill(projectRoot, stage) {
|
|
|
27
27
|
const file = path.join(projectRoot, RUNTIME_ROOT, "skills", folder, "SKILL.md");
|
|
28
28
|
if (!(await exists(file))) {
|
|
29
29
|
throw new Error(`Stage skill not found: ${path.relative(projectRoot, file)}. ` +
|
|
30
|
-
`Run \`cclaw init\` (or \`cclaw sync\`) before \`cclaw eval --
|
|
30
|
+
`Run \`cclaw init\` (or \`cclaw sync\`) before \`cclaw eval --mode=fixture --judge\`.`);
|
|
31
31
|
}
|
|
32
32
|
return fs.readFile(file, "utf8");
|
|
33
33
|
}
|
|
@@ -50,7 +50,7 @@ function buildUserPrompt(caseEntry) {
|
|
|
50
50
|
`Do not wrap in code fences, do not add commentary before or after.`);
|
|
51
51
|
return lines.join("\n");
|
|
52
52
|
}
|
|
53
|
-
/** Run the
|
|
53
|
+
/** Run the single-shot AUT (fixture mode + --judge) and return the produced artifact. */
|
|
54
54
|
export async function runSingleShot(input) {
|
|
55
55
|
const { caseEntry, config, projectRoot, client } = input;
|
|
56
56
|
const started = Date.now();
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
|
|
2
|
-
import { createSandbox } from "../sandbox.js";
|
|
2
|
+
import { createSandbox, type Sandbox } from "../sandbox.js";
|
|
3
3
|
import type { SandboxTool } from "../tools/index.js";
|
|
4
4
|
import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
|
|
5
5
|
export declare class MaxTurnsExceededError extends Error {
|
|
@@ -16,6 +16,19 @@ export interface WithToolsInput {
|
|
|
16
16
|
loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
|
|
17
17
|
/** Override for the sandbox factory (test hook). */
|
|
18
18
|
createSandboxFn?: typeof createSandbox;
|
|
19
|
+
/**
|
|
20
|
+
* Reuse an externally-managed sandbox instead of creating + disposing a
|
|
21
|
+
* per-call one. Workflow mode uses this so every stage shares the same
|
|
22
|
+
* sandbox and earlier artifacts remain visible. When set, the caller is
|
|
23
|
+
* responsible for `dispose()`.
|
|
24
|
+
*/
|
|
25
|
+
externalSandbox?: Sandbox;
|
|
26
|
+
/**
|
|
27
|
+
* Optional override of the default user prompt prefix. Workflow mode uses
|
|
28
|
+
* this to tell the model which stage it is on and where the prior
|
|
29
|
+
* artifacts are located.
|
|
30
|
+
*/
|
|
31
|
+
promptPreamble?: string;
|
|
19
32
|
}
|
|
20
33
|
export interface WithToolsOutput {
|
|
21
34
|
artifact: string;
|