cclaw-cli 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +421 -64
  2. package/dist/cli.d.ts +8 -4
  3. package/dist/cli.js +318 -47
  4. package/dist/constants.d.ts +1 -1
  5. package/dist/constants.js +34 -1
  6. package/dist/content/eval-scaffold.d.ts +2 -2
  7. package/dist/content/eval-scaffold.js +7 -6
  8. package/dist/content/start-command.d.ts +3 -2
  9. package/dist/content/start-command.js +5 -4
  10. package/dist/eval/agents/single-shot.d.ts +1 -1
  11. package/dist/eval/agents/single-shot.js +4 -4
  12. package/dist/eval/agents/with-tools.d.ts +6 -6
  13. package/dist/eval/agents/with-tools.js +5 -5
  14. package/dist/eval/agents/workflow.d.ts +7 -0
  15. package/dist/eval/agents/workflow.js +5 -3
  16. package/dist/eval/baseline.d.ts +24 -0
  17. package/dist/eval/baseline.js +75 -2
  18. package/dist/eval/config-loader.js +46 -17
  19. package/dist/eval/cost-guard.d.ts +22 -0
  20. package/dist/eval/cost-guard.js +38 -1
  21. package/dist/eval/diff.d.ts +1 -1
  22. package/dist/eval/diff.js +3 -3
  23. package/dist/eval/llm-client.d.ts +13 -2
  24. package/dist/eval/llm-client.js +8 -1
  25. package/dist/eval/mode.d.ts +28 -0
  26. package/dist/eval/mode.js +61 -0
  27. package/dist/eval/progress.d.ts +83 -0
  28. package/dist/eval/progress.js +59 -0
  29. package/dist/eval/report.js +1 -1
  30. package/dist/eval/runner.d.ts +29 -9
  31. package/dist/eval/runner.js +148 -56
  32. package/dist/eval/runs.d.ts +41 -0
  33. package/dist/eval/runs.js +114 -0
  34. package/dist/eval/sandbox.js +1 -1
  35. package/dist/eval/tools/index.js +1 -1
  36. package/dist/eval/tools/types.d.ts +1 -1
  37. package/dist/eval/types.d.ts +54 -27
  38. package/dist/eval/types.js +21 -9
  39. package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
  40. package/dist/eval/workflow-corpus.d.ts +2 -2
  41. package/dist/eval/workflow-corpus.js +4 -4
  42. package/dist/install.d.ts +10 -0
  43. package/dist/install.js +19 -5
  44. package/package.json +1 -1
package/dist/cli.js CHANGED
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env node
2
- import { readFileSync, realpathSync } from "node:fs";
2
+ import { createReadStream, realpathSync } from "node:fs";
3
+ import { spawn } from "node:child_process";
4
+ import fs from "node:fs/promises";
3
5
  import process from "node:process";
4
6
  import path from "node:path";
5
7
  import { createInterface } from "node:readline/promises";
@@ -9,15 +11,18 @@ import { doctorChecks, doctorSucceeded } from "./doctor.js";
9
11
  import { initCclaw, syncCclaw, uninstallCclaw, upgradeCclaw } from "./install.js";
10
12
  import { error, info } from "./logger.js";
11
13
  import { archiveRun } from "./runs.js";
12
- import { RUNTIME_ROOT } from "./constants.js";
14
+ import { CCLAW_VERSION, RUNTIME_ROOT } from "./constants.js";
13
15
  import { createDefaultConfig, createProfileConfig } from "./config.js";
14
16
  import { detectHarnesses } from "./init-detect.js";
15
17
  import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
18
  import { runEval } from "./eval/runner.js";
19
+ import { createStderrProgressLogger } from "./eval/progress.js";
17
20
  import { writeBaselinesFromReport } from "./eval/baseline.js";
18
21
  import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
19
22
  import { formatDiffMarkdown, runEvalDiff } from "./eval/diff.js";
20
- import { EVAL_TIERS } from "./eval/types.js";
23
+ import { ensureRunDir, generateRunId, isRunAlive, listRuns, readRunStatus, resolveRunId, runLogPath, writeRunStatus } from "./eval/runs.js";
24
+ import { EVAL_MODES } from "./eval/types.js";
25
+ import { parseModeInput } from "./eval/mode.js";
21
26
  import { FLOW_STAGES } from "./types.js";
22
27
  const INSTALLER_COMMANDS = [
23
28
  "init",
@@ -56,22 +61,41 @@ Commands:
56
61
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
57
62
  --retro-reason=<t> Reason for bypassing retro gate.
58
63
  eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
59
- Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}) for Tier A/B.
60
- --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=multi-stage workflow).
64
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}) for fixture/agent modes.
65
+ --mode=<${EVAL_MODES.join("|")}>
66
+ Evaluation mode:
67
+ fixture = verify existing artifacts with structural/rule/judge verifiers.
68
+ agent = LLM drafts one stage's artifact in a sandbox with tools.
69
+ workflow = LLM runs the full multi-stage flow (brainstorm→plan).
70
+ Legacy --tier=A|B|C still works (deprecated).
61
71
  --schema-only Run only structural verifiers (default).
62
72
  --rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
63
- --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B/C the sandbox tool-using agent (read_file/write_file/glob/grep).
73
+ --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; fixture mode judges an existing artifact, agent/workflow modes draft first and then judge.
64
74
  --dry-run Validate config + corpus, print summary, do not execute.
65
75
  --json Emit machine-readable JSON on stdout.
66
76
  --no-write Skip writing the report to .cclaw/evals/reports/.
67
77
  --update-baseline Overwrite baselines from the current run (requires --confirm).
68
78
  --confirm Acknowledge --update-baseline (prevents accidental resets).
79
+ --quiet Silence the stderr progress logger (default: emit one
80
+ line per case / stage to stderr so long runs are visible).
81
+ --max-cost-usd=<n> Abort the run if committed USD spend crosses <n>
82
+ (independent from the daily cap). Also readable from
83
+ CCLAW_EVAL_MAX_COST_USD.
84
+ --compare-model=<id> Run the same corpus twice — once with the configured model
85
+ and once with <id> — then diff the summaries. Exit code 1
86
+ when the override model regressed.
87
+ --background Spawn the run as a detached child process, write the
88
+ combined output to .cclaw/evals/runs/<id>/run.log, and
89
+ return immediately. Attach later with
90
+ \`cclaw eval runs tail <id|latest>\`.
69
91
 
70
92
  Subcommands:
71
93
  diff <old> <new> Compare two reports under .cclaw/evals/reports/.
72
94
  Each argument is a cclawVersion (e.g. 0.26.0), a filename,
73
95
  or the literal "latest". Exit code 1 when the diff shows a
74
96
  regression. Accepts --json to emit machine-readable output.
97
+ runs [action] [id] Inspect background runs under .cclaw/evals/runs/.
98
+ Actions: list (default) | status <id|latest> | tail <id|latest>.
75
99
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
76
100
  uninstall Remove .cclaw runtime and the generated harness shim files.
77
101
 
@@ -85,40 +109,15 @@ Examples:
85
109
  cclaw archive --name=payments-revamp
86
110
  cclaw eval --dry-run
87
111
  cclaw eval --stage=brainstorm --schema-only
88
- cclaw eval --judge --tier=A --stage=brainstorm
89
- cclaw eval --judge --tier=B --stage=spec
90
- cclaw eval --tier=C --judge
112
+ cclaw eval --judge --mode=fixture --stage=brainstorm
113
+ cclaw eval --judge --mode=agent --stage=spec
114
+ cclaw eval --mode=workflow --judge
91
115
  cclaw eval diff 0.26.0 latest
92
116
 
93
117
  Docs: https://github.com/zuevrs/cclaw
94
118
  Issues: https://github.com/zuevrs/cclaw/issues
95
119
  `;
96
120
  }
97
- function cliPackageVersion() {
98
- try {
99
- const here = path.dirname(fileURLToPath(import.meta.url));
100
- const candidates = [
101
- path.resolve(here, "../package.json"),
102
- path.resolve(here, "../../package.json")
103
- ];
104
- for (const candidate of candidates) {
105
- try {
106
- const raw = readFileSync(candidate, "utf8");
107
- const parsed = JSON.parse(raw);
108
- if (parsed.name === "cclaw-cli" && typeof parsed.version === "string") {
109
- return parsed.version;
110
- }
111
- }
112
- catch {
113
- continue;
114
- }
115
- }
116
- }
117
- catch {
118
- // fall through
119
- }
120
- return "unknown";
121
- }
122
121
  function parseHarnesses(raw) {
123
122
  const requested = raw
124
123
  .split(",")
@@ -144,12 +143,17 @@ function parseProfile(raw) {
144
143
  }
145
144
  return trimmed;
146
145
  }
147
- function parseEvalTier(raw) {
148
- const trimmed = raw.trim().toUpperCase();
149
- if (!EVAL_TIERS.includes(trimmed)) {
150
- throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
151
- }
152
- return trimmed;
146
+ function parseLegacyTier(raw) {
147
+ return parseModeInput(raw.toUpperCase(), {
148
+ source: "cli",
149
+ raw: `--tier=${raw}`
150
+ });
151
+ }
152
+ function parseEvalMode(raw) {
153
+ return parseModeInput(raw, {
154
+ source: "cli",
155
+ raw: `--mode=${raw}`
156
+ });
153
157
  }
154
158
  function parseEvalStage(raw) {
155
159
  const trimmed = raw.trim();
@@ -372,6 +376,18 @@ function printDoctorText(ctx, checks, options) {
372
376
  ctx.stdout.write("Doctor status: HEALTHY (no failing error checks)\n");
373
377
  }
374
378
  }
379
+ function resolveMaxCostOption(fromCli, env) {
380
+ if (fromCli !== undefined)
381
+ return { maxCostUsd: fromCli };
382
+ const raw = env.CCLAW_EVAL_MAX_COST_USD;
383
+ if (raw === undefined || raw.trim() === "")
384
+ return {};
385
+ const value = Number(raw);
386
+ if (!Number.isFinite(value) || value <= 0) {
387
+ throw new Error(`CCLAW_EVAL_MAX_COST_USD must be a positive number, got: ${raw}`);
388
+ }
389
+ return { maxCostUsd: value };
390
+ }
375
391
  function parseArgs(argv) {
376
392
  const parsed = {};
377
393
  const helpFlag = argv.find((arg) => arg === "--help" || arg === "-h");
@@ -406,8 +422,11 @@ function parseArgs(argv) {
406
422
  parsed.evalSubcommand = "diff";
407
423
  sawSubcommand = true;
408
424
  }
425
+ else if (token === "runs") {
426
+ parsed.evalSubcommand = "runs";
427
+ sawSubcommand = true;
428
+ }
409
429
  else {
410
- // Treat unknown positional as an eval arg for forward compat.
411
430
  evalArgs.push(token);
412
431
  }
413
432
  continue;
@@ -479,8 +498,12 @@ function parseArgs(argv) {
479
498
  parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
480
499
  continue;
481
500
  }
501
+ if (flag.startsWith("--mode=")) {
502
+ parsed.evalMode = parseEvalMode(flag.replace("--mode=", ""));
503
+ continue;
504
+ }
482
505
  if (flag.startsWith("--tier=")) {
483
- parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
506
+ parsed.evalMode = parseLegacyTier(flag.replace("--tier=", ""));
484
507
  continue;
485
508
  }
486
509
  if (flag === "--schema-only") {
@@ -507,21 +530,252 @@ function parseArgs(argv) {
507
530
  parsed.evalConfirm = true;
508
531
  continue;
509
532
  }
533
+ if (flag === "--background") {
534
+ parsed.evalBackground = true;
535
+ continue;
536
+ }
537
+ if (flag.startsWith("--compare-model=")) {
538
+ const value = flag.replace("--compare-model=", "").trim();
539
+ if (value.length === 0) {
540
+ throw new Error(`--compare-model requires a non-empty model id (e.g. --compare-model=gpt-4o-mini).`);
541
+ }
542
+ parsed.evalCompareModel = value;
543
+ continue;
544
+ }
545
+ if (flag.startsWith("--max-cost-usd=")) {
546
+ const raw = flag.replace("--max-cost-usd=", "").trim();
547
+ const value = Number(raw);
548
+ if (!Number.isFinite(value) || value <= 0) {
549
+ throw new Error(`--max-cost-usd requires a positive number, got: ${raw}`);
550
+ }
551
+ parsed.evalMaxCostUsd = value;
552
+ continue;
553
+ }
510
554
  }
511
555
  // `--json` is shared between doctor and eval. Disambiguate by command.
512
556
  if (parsed.command === "eval" && parsed.doctorJson === true) {
513
557
  parsed.evalJson = true;
514
558
  parsed.doctorJson = undefined;
515
559
  }
560
+ // `--quiet` on `eval` silences the stderr progress logger. On doctor it
561
+ // continues to mean "print only failing checks" — the flag slot is the
562
+ // same, the semantics depend on which command owns the invocation.
563
+ if (parsed.command === "eval" && parsed.doctorQuiet === true) {
564
+ parsed.evalQuiet = true;
565
+ parsed.doctorQuiet = undefined;
566
+ }
516
567
  return parsed;
517
568
  }
569
+ /**
570
+ * Spawn `cclaw eval` (without `--background`) in a detached child process
571
+ * and return immediately. The child's stdout+stderr are piped to
572
+ * `.cclaw/evals/runs/<id>/run.log` so the user can attach later with
573
+ * `cclaw eval runs tail`. We do NOT wait for the child — the whole point
574
+ * is to free the terminal while a multi-minute workflow-mode run
575
+ * proceeds in the background.
576
+ */
577
+ async function spawnBackgroundEval(parsed, ctx) {
578
+ const id = generateRunId();
579
+ await ensureRunDir(ctx.cwd, id);
580
+ const logPath = runLogPath(ctx.cwd, id);
581
+ const childArgv = process.argv.slice(2).filter((a) => a !== "--background");
582
+ const cliEntry = process.argv[1];
583
+ if (!cliEntry) {
584
+ error(ctx, "Could not resolve cclaw entrypoint for --background.");
585
+ return 1;
586
+ }
587
+ const logHandle = await fs.open(logPath, "a");
588
+ try {
589
+ const child = spawn(process.execPath, [cliEntry, ...childArgv], {
590
+ cwd: ctx.cwd,
591
+ detached: true,
592
+ stdio: ["ignore", logHandle.fd, logHandle.fd],
593
+ env: process.env
594
+ });
595
+ const pid = child.pid ?? -1;
596
+ await writeRunStatus(ctx.cwd, {
597
+ id,
598
+ startedAt: new Date().toISOString(),
599
+ pid,
600
+ argv: childArgv,
601
+ cwd: ctx.cwd,
602
+ state: "running"
603
+ });
604
+ child.unref();
605
+ const finalize = async (code) => {
606
+ const current = await readRunStatus(ctx.cwd, id);
607
+ if (!current)
608
+ return;
609
+ const exitCode = typeof code === "number" ? code : -1;
610
+ await writeRunStatus(ctx.cwd, {
611
+ ...current,
612
+ endedAt: new Date().toISOString(),
613
+ exitCode,
614
+ state: exitCode === 0 ? "succeeded" : "failed"
615
+ });
616
+ };
617
+ child.on("exit", (code) => {
618
+ void finalize(code);
619
+ });
620
+ child.on("error", (err) => {
621
+ void writeRunStatus(ctx.cwd, {
622
+ id,
623
+ startedAt: new Date().toISOString(),
624
+ pid,
625
+ argv: childArgv,
626
+ cwd: ctx.cwd,
627
+ endedAt: new Date().toISOString(),
628
+ exitCode: -1,
629
+ state: "failed"
630
+ });
631
+ error(ctx, `Background eval failed to start: ${err.message}`);
632
+ });
633
+ ctx.stdout.write(`cclaw eval: background run id=${id} pid=${pid}\n` +
634
+ ` log: ${logPath}\n` +
635
+ ` tail: cclaw eval runs tail ${id}\n` +
636
+ ` status: cclaw eval runs status ${id}\n`);
637
+ return 0;
638
+ }
639
+ finally {
640
+ await logHandle.close();
641
+ }
642
+ }
643
+ function formatRunRow(status) {
644
+ const ended = status.endedAt ? ` ended=${status.endedAt}` : "";
645
+ const exitCode = status.exitCode !== undefined ? ` exit=${status.exitCode}` : "";
646
+ const alive = status.state === "running" ? (isRunAlive(status) ? "" : " (stale)") : "";
647
+ return `${status.id} state=${status.state}${alive} pid=${status.pid} started=${status.startedAt}${ended}${exitCode}`;
648
+ }
649
+ async function runEvalRunsSubcommand(parsed, ctx) {
650
+ const args = parsed.evalArgs ?? [];
651
+ const action = args[0] ?? "list";
652
+ if (action === "list") {
653
+ const runs = await listRuns(ctx.cwd);
654
+ if (runs.length === 0) {
655
+ ctx.stdout.write("No eval runs recorded under .cclaw/evals/runs/.\n");
656
+ return 0;
657
+ }
658
+ if (parsed.evalJson === true) {
659
+ ctx.stdout.write(`${JSON.stringify(runs, null, 2)}\n`);
660
+ return 0;
661
+ }
662
+ for (const run of runs)
663
+ ctx.stdout.write(`${formatRunRow(run)}\n`);
664
+ return 0;
665
+ }
666
+ if (action === "status") {
667
+ const id = await resolveRunId(ctx.cwd, args[1]);
668
+ if (!id) {
669
+ error(ctx, `No such run: ${args[1] ?? "(none recorded)"}`);
670
+ return 1;
671
+ }
672
+ const status = await readRunStatus(ctx.cwd, id);
673
+ if (!status) {
674
+ error(ctx, `Run ${id} has no status file.`);
675
+ return 1;
676
+ }
677
+ if (parsed.evalJson === true) {
678
+ ctx.stdout.write(`${JSON.stringify(status, null, 2)}\n`);
679
+ }
680
+ else {
681
+ ctx.stdout.write(`${formatRunRow(status)}\n`);
682
+ ctx.stdout.write(`log: ${runLogPath(ctx.cwd, id)}\n`);
683
+ }
684
+ return status.state === "failed" ? 1 : 0;
685
+ }
686
+ if (action === "tail") {
687
+ const id = await resolveRunId(ctx.cwd, args[1]);
688
+ if (!id) {
689
+ error(ctx, `No such run: ${args[1] ?? "(none recorded)"}`);
690
+ return 1;
691
+ }
692
+ const logFile = runLogPath(ctx.cwd, id);
693
+ const stream = createReadStream(logFile, { encoding: "utf8" });
694
+ await new Promise((resolve, reject) => {
695
+ stream.on("data", (chunk) => ctx.stdout.write(chunk));
696
+ stream.on("end", () => resolve());
697
+ stream.on("error", reject);
698
+ });
699
+ return 0;
700
+ }
701
+ error(ctx, `Unknown \`cclaw eval runs\` action: ${action}. Use list | status | tail.`);
702
+ return 1;
703
+ }
704
+ /**
705
+ * Run the same corpus twice — once against the configured model, once
706
+ * against `--compare-model=<id>` — and print a summary comparing the
707
+ * two. Both reports are written to `.cclaw/evals/reports/` (unless
708
+ * `--no-write` is set) and a unified diff is emitted to stdout. Exit
709
+ * code is 1 when the override model regressed against the baseline
710
+ * model, 0 otherwise.
711
+ */
712
+ async function runCompareModel(parsed, ctx, progress) {
713
+ const baselineOpts = {
714
+ projectRoot: ctx.cwd,
715
+ stage: parsed.evalStage,
716
+ mode: parsed.evalMode,
717
+ schemaOnly: parsed.evalSchemaOnly === true,
718
+ rules: parsed.evalRules === true,
719
+ judge: parsed.evalJudge === true,
720
+ ...(progress ? { progress } : {}),
721
+ ...resolveMaxCostOption(parsed.evalMaxCostUsd, process.env)
722
+ };
723
+ ctx.stderr.write(`[cclaw eval] compare: running baseline model...\n`);
724
+ const baseline = await runEval(baselineOpts);
725
+ if ("kind" in baseline) {
726
+ error(ctx, "--compare-model is incompatible with --dry-run.");
727
+ return 1;
728
+ }
729
+ ctx.stderr.write(`[cclaw eval] compare: running ${parsed.evalCompareModel} ...\n`);
730
+ const candidate = await runEval({
731
+ ...baselineOpts,
732
+ modelOverride: parsed.evalCompareModel
733
+ });
734
+ if ("kind" in candidate) {
735
+ error(ctx, "--compare-model received an unexpected dry-run response.");
736
+ return 1;
737
+ }
738
+ if (parsed.evalNoWrite !== true) {
739
+ await writeJsonReport(ctx.cwd, baseline);
740
+ await writeMarkdownReport(ctx.cwd, baseline);
741
+ await writeJsonReport(ctx.cwd, candidate);
742
+ await writeMarkdownReport(ctx.cwd, candidate);
743
+ }
744
+ const passDelta = candidate.summary.passed - baseline.summary.passed;
745
+ const failDelta = candidate.summary.failed - baseline.summary.failed;
746
+ const costDelta = candidate.summary.totalCostUsd - baseline.summary.totalCostUsd;
747
+ if (parsed.evalJson === true) {
748
+ ctx.stdout.write(`${JSON.stringify({
749
+ baseline: {
750
+ model: baseline.model,
751
+ summary: baseline.summary
752
+ },
753
+ candidate: {
754
+ model: candidate.model,
755
+ summary: candidate.summary
756
+ },
757
+ delta: { passed: passDelta, failed: failDelta, costUsd: costDelta }
758
+ }, null, 2)}\n`);
759
+ }
760
+ else {
761
+ ctx.stdout.write(`cclaw eval compare-model:\n` +
762
+ ` baseline ${baseline.model}: pass=${baseline.summary.passed}/${baseline.summary.totalCases} ` +
763
+ `fail=${baseline.summary.failed} cost=$${baseline.summary.totalCostUsd.toFixed(4)}\n` +
764
+ ` candidate ${candidate.model}: pass=${candidate.summary.passed}/${candidate.summary.totalCases} ` +
765
+ `fail=${candidate.summary.failed} cost=$${candidate.summary.totalCostUsd.toFixed(4)}\n` +
766
+ ` delta: passed=${passDelta >= 0 ? "+" : ""}${passDelta} ` +
767
+ `failed=${failDelta >= 0 ? "+" : ""}${failDelta} ` +
768
+ `cost=${costDelta >= 0 ? "+" : ""}$${costDelta.toFixed(4)}\n`);
769
+ }
770
+ return failDelta > 0 ? 1 : 0;
771
+ }
518
772
  async function runCommand(parsed, ctx) {
519
773
  if (parsed.showHelp) {
520
774
  ctx.stdout.write(usage());
521
775
  return 0;
522
776
  }
523
777
  if (parsed.showVersion) {
524
- ctx.stdout.write(`cclaw ${cliPackageVersion()}\n`);
778
+ ctx.stdout.write(`cclaw ${CCLAW_VERSION}\n`);
525
779
  return 0;
526
780
  }
527
781
  const command = parsed.command;
@@ -608,6 +862,12 @@ async function runCommand(parsed, ctx) {
608
862
  info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
609
863
  return 0;
610
864
  }
865
+ if (command === "eval" && parsed.evalSubcommand === "runs") {
866
+ return runEvalRunsSubcommand(parsed, ctx);
867
+ }
868
+ if (command === "eval" && parsed.evalBackground === true) {
869
+ return spawnBackgroundEval(parsed, ctx);
870
+ }
611
871
  if (command === "eval" && parsed.evalSubcommand === "diff") {
612
872
  const args = parsed.evalArgs ?? [];
613
873
  if (args.length !== 2) {
@@ -636,14 +896,25 @@ async function runCommand(parsed, ctx) {
636
896
  }
637
897
  }
638
898
  if (command === "eval") {
899
+ const wantProgress = parsed.evalQuiet !== true &&
900
+ parsed.dryRun !== true &&
901
+ parsed.evalJson !== true;
902
+ const progress = wantProgress
903
+ ? createStderrProgressLogger({ writer: (s) => ctx.stderr.write(s) })
904
+ : undefined;
905
+ if (parsed.evalCompareModel !== undefined) {
906
+ return runCompareModel(parsed, ctx, progress);
907
+ }
639
908
  const result = await runEval({
640
909
  projectRoot: ctx.cwd,
641
910
  stage: parsed.evalStage,
642
- tier: parsed.evalTier,
911
+ mode: parsed.evalMode,
643
912
  schemaOnly: parsed.evalSchemaOnly === true,
644
913
  rules: parsed.evalRules === true,
645
914
  judge: parsed.evalJudge === true,
646
- dryRun: parsed.dryRun === true
915
+ dryRun: parsed.dryRun === true,
916
+ ...(progress ? { progress } : {}),
917
+ ...resolveMaxCostOption(parsed.evalMaxCostUsd, process.env)
647
918
  });
648
919
  if ("kind" in result) {
649
920
  if (parsed.evalJson === true) {
@@ -656,12 +927,12 @@ async function runCommand(parsed, ctx) {
656
927
  ctx.stdout.write(` model: ${result.config.model}\n`);
657
928
  ctx.stdout.write(` source: ${result.config.source}\n`);
658
929
  ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
659
- ctx.stdout.write(` tier: ${result.plannedTier}\n`);
930
+ ctx.stdout.write(` mode: ${result.plannedMode}\n`);
660
931
  ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
661
932
  for (const [stage, count] of Object.entries(result.corpus.byStage)) {
662
933
  ctx.stdout.write(` - ${stage}: ${count}\n`);
663
934
  }
664
- if (result.workflowCorpus.total > 0 || result.plannedTier === "C") {
935
+ if (result.workflowCorpus.total > 0 || result.plannedMode === "workflow") {
665
936
  ctx.stdout.write(` workflow corpus: ${result.workflowCorpus.total} case(s)\n`);
666
937
  for (const wf of result.workflowCorpus.cases) {
667
938
  ctx.stdout.write(` - ${wf.id}: ${wf.stages.join(" → ")}\n`);
@@ -1,7 +1,7 @@
1
1
  import type { FlowStage, HarnessId } from "./types.js";
2
2
  /** Hidden runtime directory at project root (dot-prefixed). */
3
3
  export declare const RUNTIME_ROOT = ".cclaw";
4
- export declare const CCLAW_VERSION = "0.1.1";
4
+ export declare const CCLAW_VERSION: string;
5
5
  export declare const FLOW_VERSION = "1.0.0";
6
6
  export declare const DEFAULT_HARNESSES: HarnessId[];
7
7
  /**
package/dist/constants.js CHANGED
@@ -1,6 +1,39 @@
1
+ import { readFileSync } from "node:fs";
2
+ import path from "node:path";
3
+ import { fileURLToPath } from "node:url";
1
4
  /** Hidden runtime directory at project root (dot-prefixed). */
2
5
  export const RUNTIME_ROOT = ".cclaw";
3
- export const CCLAW_VERSION = "0.1.1";
6
+ /**
7
+ * Resolved once at module load from the cclaw-cli package.json. Walking a
8
+ * short list of candidates keeps the helper working in both the compiled
9
+ * `dist/` layout and the in-repo `src/` layout (tests, ts-node).
10
+ */
11
+ function readPackageVersion() {
12
+ try {
13
+ const here = path.dirname(fileURLToPath(import.meta.url));
14
+ const candidates = [
15
+ path.resolve(here, "../package.json"),
16
+ path.resolve(here, "../../package.json")
17
+ ];
18
+ for (const candidate of candidates) {
19
+ try {
20
+ const raw = readFileSync(candidate, "utf8");
21
+ const parsed = JSON.parse(raw);
22
+ if (parsed.name === "cclaw-cli" && typeof parsed.version === "string") {
23
+ return parsed.version;
24
+ }
25
+ }
26
+ catch {
27
+ continue;
28
+ }
29
+ }
30
+ }
31
+ catch {
32
+ // Fall through to dev fallback.
33
+ }
34
+ return "0.0.0-dev";
35
+ }
36
+ export const CCLAW_VERSION = readPackageVersion();
4
37
  export const FLOW_VERSION = "1.0.0";
5
38
  export const DEFAULT_HARNESSES = [
6
39
  "claude",
@@ -4,8 +4,8 @@
4
4
  * scaffold is intentionally minimal: a usable default config plus short
5
5
  * READMEs that point at `docs/evals.md` for authoring guidance.
6
6
  */
7
- export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap)\n# B = SDK with tool use (realistic)\n# C = multi-stage workflow (end-to-end)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
- export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Tier B/C runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
7
+ export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default evaluation mode when --mode is not supplied.\n# fixture = verify existing artifacts (cheap, LLM-free unless --judge is set)\n# agent = LLM drafts one stage's artifact in a sandbox with tools\n# workflow = LLM runs the full multi-stage flow (brainstorm \u2192 plan)\n# (Legacy alias --tier=A|B|C still works; A\u2192fixture, B\u2192agent, C\u2192workflow.)\ndefaultMode: fixture\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
+ export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Agent/workflow runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
9
9
  export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale. The runner picks `<stage>.yaml` when\n`cclaw eval --judge` is invoked; every stage ships a starter rubric\nbelow \u2014 edit the checks to match what your team cares about, and add\n`critical: true` to the checks that should hard-fail nightly CI on\nregression.\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n critical: false\n```\n\nSee `docs/evals.md` for the full schema.\n";
10
10
  export declare const EVAL_RUBRIC_FILES: ReadonlyArray<{
11
11
  stage: string;
@@ -13,11 +13,12 @@ provider: zai
13
13
  baseUrl: https://api.z.ai/api/coding/paas/v4
14
14
  model: glm-5.1
15
15
 
16
- # Default fidelity tier when --tier is not supplied.
17
- # A = single-shot API call (cheap)
18
- # B = SDK with tool use (realistic)
19
- # C = multi-stage workflow (end-to-end)
20
- defaultTier: A
16
+ # Default evaluation mode when --mode is not supplied.
17
+ # fixture = verify existing artifacts (cheap, LLM-free unless --judge is set)
18
+ # agent = LLM drafts one stage's artifact in a sandbox with tools
19
+ # workflow = LLM runs the full multi-stage flow (brainstorm → plan)
20
+ # (Legacy alias --tier=A|B|C still works; A→fixture, B→agent, C→workflow.)
21
+ defaultMode: fixture
21
22
 
22
23
  # Per-call timeout and retry budget.
23
24
  timeoutMs: 120000
@@ -51,7 +52,7 @@ expected:
51
52
  \`\`\`
52
53
 
53
54
  Start with 3 structural cases per stage (24 total), then expand to 5 per
54
- stage (40 total) once rule verifiers land. Tier B/C runs may add
55
+ stage (40 total) once rule verifiers land. Agent/workflow runs may add
55
56
  \`context_files\` pulled from real projects to exercise the sandbox.
56
57
  `;
57
58
  export const EVAL_RUBRICS_README = `# Eval Rubrics
@@ -1,7 +1,8 @@
1
1
  /**
2
2
  * Command contract for /cc — the unified entry point.
3
- * No args → behaves like /cc-next (resume or start brainstorm).
4
- * With prompt → starts brainstorm with the given idea.
3
+ * No args → behaves like /cc-next (resume or start the flow at its first stage).
4
+ * With prompt → classifies the idea, selects a track, and starts the first
5
+ * stage of that track (brainstorm for medium/standard, spec for quick).
5
6
  */
6
7
  export declare function startCommandContract(): string;
7
8
  /**
@@ -6,8 +6,9 @@ function flowStatePath() {
6
6
  }
7
7
  /**
8
8
  * Command contract for /cc — the unified entry point.
9
- * No args → behaves like /cc-next (resume or start brainstorm).
10
- * With prompt → starts brainstorm with the given idea.
9
+ * No args → behaves like /cc-next (resume or start the flow at its first stage).
10
+ * With prompt → classifies the idea, selects a track, and starts the first
11
+ * stage of that track (brainstorm for medium/standard, spec for quick).
11
12
  */
12
13
  export function startCommandContract() {
13
14
  const flowPath = flowStatePath();
@@ -111,7 +112,7 @@ export function startCommandSkillMarkdown() {
111
112
  const flowPath = flowStatePath();
112
113
  return `---
113
114
  name: ${START_SKILL_NAME}
114
- description: "Unified entry point for the cclaw flow. No args = resume/next. With prompt = start brainstorm with idea."
115
+ description: "Unified entry point for the cclaw flow. No args = resume/next. With prompt = classify, pick track, start its first stage."
115
116
  ---
116
117
 
117
118
  # /cc — Flow Entry Point
@@ -121,7 +122,7 @@ description: "Unified entry point for the cclaw flow. No args = resume/next. Wit
121
122
  \`/cc\` is the **starting command** for cclaw. It intelligently routes:
122
123
 
123
124
  - **No arguments** → acts as \`/cc-next\` (resume current stage or advance to next)
124
- - **With a prompt** → captures the idea and starts brainstorm
125
+ - **With a prompt** → classifies the task, picks a track (quick/medium/standard), and starts the **first stage of that track** (not always brainstorm — e.g. the \`quick\` track starts at \`spec\`)
125
126
 
126
127
  ## HARD-GATE
127
128
 
@@ -23,5 +23,5 @@ export interface SingleShotOutput {
23
23
  userPrompt: string;
24
24
  }
25
25
  export declare function loadStageSkill(projectRoot: string, stage: FlowStage): Promise<string>;
26
- /** Run the Tier A single-shot AUT and return the produced artifact. */
26
+ /** Run the single-shot AUT (fixture mode + --judge) and return the produced artifact. */
27
27
  export declare function runSingleShot(input: SingleShotInput): Promise<SingleShotOutput>;