cclaw-cli 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +421 -64
  2. package/dist/cli.d.ts +8 -4
  3. package/dist/cli.js +318 -47
  4. package/dist/constants.d.ts +1 -1
  5. package/dist/constants.js +34 -1
  6. package/dist/content/eval-scaffold.d.ts +2 -2
  7. package/dist/content/eval-scaffold.js +7 -6
  8. package/dist/content/start-command.d.ts +3 -2
  9. package/dist/content/start-command.js +5 -4
  10. package/dist/eval/agents/single-shot.d.ts +1 -1
  11. package/dist/eval/agents/single-shot.js +4 -4
  12. package/dist/eval/agents/with-tools.d.ts +6 -6
  13. package/dist/eval/agents/with-tools.js +5 -5
  14. package/dist/eval/agents/workflow.d.ts +7 -0
  15. package/dist/eval/agents/workflow.js +5 -3
  16. package/dist/eval/baseline.d.ts +24 -0
  17. package/dist/eval/baseline.js +75 -2
  18. package/dist/eval/config-loader.js +46 -17
  19. package/dist/eval/cost-guard.d.ts +22 -0
  20. package/dist/eval/cost-guard.js +38 -1
  21. package/dist/eval/diff.d.ts +1 -1
  22. package/dist/eval/diff.js +3 -3
  23. package/dist/eval/llm-client.d.ts +13 -2
  24. package/dist/eval/llm-client.js +8 -1
  25. package/dist/eval/mode.d.ts +28 -0
  26. package/dist/eval/mode.js +61 -0
  27. package/dist/eval/progress.d.ts +83 -0
  28. package/dist/eval/progress.js +59 -0
  29. package/dist/eval/report.js +1 -1
  30. package/dist/eval/runner.d.ts +29 -9
  31. package/dist/eval/runner.js +148 -56
  32. package/dist/eval/runs.d.ts +41 -0
  33. package/dist/eval/runs.js +114 -0
  34. package/dist/eval/sandbox.js +1 -1
  35. package/dist/eval/tools/index.js +1 -1
  36. package/dist/eval/tools/types.d.ts +1 -1
  37. package/dist/eval/types.d.ts +54 -27
  38. package/dist/eval/types.js +21 -9
  39. package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
  40. package/dist/eval/workflow-corpus.d.ts +2 -2
  41. package/dist/eval/workflow-corpus.js +4 -4
  42. package/dist/install.d.ts +10 -0
  43. package/dist/install.js +19 -5
  44. package/package.json +1 -1
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Helpers that translate between the legacy `Tier A/B/C` naming and the
3
+ * current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
4
+ *
5
+ * The names we actually carry in reports, config, CLI flags, and verifier
6
+ * messages are the `EvalMode` ones; legacy tier inputs are accepted with a
7
+ * single deprecation warning per process so existing scripts keep working
8
+ * through the 0.28.x line.
9
+ */
10
+ import { EVAL_MODES } from "./types.js";
11
/** Forward lookup: legacy tier letter -> current eval mode identifier. */
const LEGACY_TIER_TO_MODE = { A: "fixture", B: "agent", C: "workflow" };
/**
 * Reverse lookup: eval mode identifier -> legacy tier letter. Derived from
 * the forward table so the two can never drift apart.
 */
const MODE_TO_LEGACY_TIER = Object.fromEntries(
  Object.entries(LEGACY_TIER_TO_MODE).map(([tier, mode]) => [mode, tier])
);
/** Every input spelling that counts as a legacy tier name. */
const DEPRECATED_NAMES = new Set(Object.keys(LEGACY_TIER_TO_MODE));
/** Flips to true once the deprecation warning has been written. */
let legacyWarningEmitted = false;
23
/**
 * Test-only hook: clears the module-level `legacyWarningEmitted` flag so the
 * next legacy tier name passed to `parseModeInput` emits the deprecation
 * warning again.
 */
export function __resetLegacyWarningForTests() {
  legacyWarningEmitted = false;
}
30
/**
 * Turn a raw mode string into an `EvalMode` identifier.
 *
 * Surrounding whitespace is ignored. Matching is exact (case-sensitive):
 * a value listed in `EVAL_MODES` is returned as-is, and a legacy tier
 * letter (`A`, `B`, `C`) is mapped to its replacement mode.
 *
 * @param raw - The user-supplied value.
 * @param input - Origin of the value; `input.source` and `input.raw` are
 *   only read to build the deprecation warning text.
 * @param writeWarning - Sink for the deprecation warning; defaults to a
 *   stderr writer. Called at most once until the module-level flag is reset.
 * @returns The resolved mode identifier.
 * @throws Error when the trimmed value is empty or is neither a current
 *   mode nor a legacy tier name.
 */
export function parseModeInput(raw, input, writeWarning = defaultWriteWarning) {
  const candidate = raw.trim();
  if (candidate === "") {
    throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C).`);
  }
  if (EVAL_MODES.includes(candidate)) {
    return candidate;
  }
  if (!DEPRECATED_NAMES.has(candidate)) {
    throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C), got: ${raw}`);
  }
  const mapped = LEGACY_TIER_TO_MODE[candidate];
  if (legacyWarningEmitted) {
    return mapped;
  }
  // Set the flag before invoking the sink so a throwing sink still counts
  // as "already warned".
  legacyWarningEmitted = true;
  writeWarning(`[cclaw] "${input.source}: ${input.raw}" is using the legacy tier name "${candidate}". ` +
    `Please switch to --mode=${mapped} (legacy --tier=A|B|C will be removed in the next major release).`);
  return mapped;
}
55
/**
 * Look up the legacy tier letter for an eval mode.
 *
 * @deprecated kept for callers that still need to serialize as legacy.
 * @param mode - A current mode identifier.
 * @returns The matching entry of `MODE_TO_LEGACY_TIER`, or `undefined` when
 *   `mode` is not a key of that table.
 */
export function modeToLegacyTier(mode) {
  const legacyTier = MODE_TO_LEGACY_TIER[mode];
  return legacyTier;
}
59
/** Default warning sink: writes `message` followed by a newline to stderr. */
function defaultWriteWarning(message) {
  const line = `${message}\n`;
  process.stderr.write(line);
}
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Lightweight progress logger for `cclaw eval`.
3
+ *
4
+ * The runner is otherwise silent: a full workflow-mode run can easily take
5
+ * a few minutes and the user would see nothing until the Markdown report
6
+ * hits disk. We emit structured events here so the CLI can print concise
7
+ * one-line status updates to stderr (stdout stays reserved for the final
8
+ * report + `--json` output).
9
+ *
10
+ * The logger is intentionally minimal: no ANSI colors, no spinners, no
11
+ * carriage-return rewrites. Those do not survive `tee`, CI log viewers,
12
+ * or the background `runs/tail` path (which copies the stream to a log
13
+ * file), and users also told us "nothing is clear now, everything is
14
+ * long" — so we optimize for log-friendly line-by-line readability.
15
+ */
16
+ import type { EvalMode, WorkflowStageName } from "./types.js";
17
/**
 * Structured progress notification, discriminated by `kind`.
 *
 * Each variant carries the fields the stderr formatter needs to print one
 * status line for it.
 */
export type ProgressEvent =
  /** A run begins. */
  | {
      kind: "run-start";
      mode: EvalMode;
      totalCases: number;
    }
  /** A case begins; `index` / `total` are printed as `[index/total]`. */
  | {
      kind: "case-start";
      caseId: string;
      stage: string;
      index: number;
      total: number;
    }
  /** A case finished with a pass/fail outcome, a duration, and optionally a cost. */
  | {
      kind: "case-end";
      caseId: string;
      stage: string;
      index: number;
      total: number;
      passed: boolean;
      durationMs: number;
      costUsd?: number;
    }
  /** A workflow stage of a case begins. */
  | {
      kind: "stage-start";
      caseId: string;
      stage: WorkflowStageName;
      index: number;
      total: number;
    }
  /** A workflow stage finished with a pass/fail outcome, a duration, and optionally a cost. */
  | {
      kind: "stage-end";
      caseId: string;
      stage: WorkflowStageName;
      index: number;
      total: number;
      passed: boolean;
      durationMs: number;
      costUsd?: number;
    }
  /** A retry is scheduled after `waitMs`; `stage` is optional. */
  | {
      kind: "retry";
      caseId: string;
      stage?: string;
      attempt: number;
      maxAttempts: number;
      waitMs: number;
      reason: string;
    }
  /** The run finished; carries pass/fail totals and the overall duration. */
  | {
      kind: "run-end";
      totalCases: number;
      passed: number;
      failed: number;
      durationMs: number;
    };
66
/** Sink for progress events. */
export interface ProgressLogger {
  /** Receive a single progress event. */
  emit(event: ProgressEvent): void;
}
69
/** Returns a logger whose `emit` does nothing. */
export declare function noopProgressLogger(): ProgressLogger;
70
/** Optional overrides accepted by `createStderrProgressLogger`. */
export interface StderrProgressLoggerOptions {
  /**
   * Replacement write target. Receives each fully formatted line, including
   * its trailing newline. Defaults to `process.stderr.write`.
   */
  writer?: (message: string) => void;
  /**
   * Wall-clock source in milliseconds, injectable for tests.
   * NOTE(review): the `createStderrProgressLogger` implementation in
   * `progress.js` only reads `writer`; confirm whether `now` is still needed.
   */
  now?: () => number;
}
76
/**
 * Build a logger that writes one line per event.
 *
 * Every line has the plain shape `[cclaw eval] <message>`, so the prefix can
 * be grepped out of combined logs. Positive costs are printed with four
 * decimal places so that sub-cent amounts remain visible.
 *
 * @param opts - Optional overrides; the default write target is stderr.
 */
export declare function createStderrProgressLogger(opts?: StderrProgressLoggerOptions): ProgressLogger;
@@ -0,0 +1,59 @@
1
+ const NOOP_LOGGER = { emit() { } };
2
+ export function noopProgressLogger() {
3
+ return NOOP_LOGGER;
4
+ }
5
/**
 * Build a logger that writes one line per event.
 *
 * Each line is `[cclaw eval] <message>` plus a trailing newline, where
 * `<message>` comes from `formatEvent`. The plain prefix makes the lines
 * easy to grep out of combined logs.
 *
 * @param opts - Optional overrides. Only `opts.writer` is read here; when it
 *   is absent, lines go to `process.stderr.write`.
 * @returns An object exposing `emit(event)`.
 */
export function createStderrProgressLogger(opts = {}) {
  const sink = opts.writer ?? ((text) => process.stderr.write(text));
  const emit = (event) => {
    const line = `[cclaw eval] ${formatEvent(event)}\n`;
    sink(line);
  };
  return { emit };
}
20
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * - below one second: `<ms>ms`
 * - below one minute: seconds with one decimal, e.g. `1.5s`
 * - otherwise: whole minutes plus zero-padded whole seconds, e.g. `1m01s`
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
  if (ms < 1000)
    return `${ms}ms`;
  const seconds = ms / 1000;
  if (seconds < 60)
    return `${seconds.toFixed(1)}s`;
  // Round to whole seconds BEFORE splitting into minutes and seconds.
  // Rounding only the remainder lets e.g. 119.6s print as "1m60s" because
  // the rounded-up 60 is never carried into the minutes.
  const wholeSeconds = Math.round(seconds);
  const minutes = Math.floor(wholeSeconds / 60);
  const remainder = wholeSeconds % 60;
  return `${minutes}m${String(remainder).padStart(2, "0")}s`;
}
30
/**
 * Render a USD cost as a suffix for a status line.
 *
 * @param usd - Cost in US dollars; may be `undefined`.
 * @returns An empty string when `usd` is `undefined`, zero, or negative;
 *   otherwise a space followed by `$` and the amount with four decimals.
 */
function formatCost(usd) {
  const hasCost = usd !== undefined && !(usd <= 0);
  return hasCost ? ` $${usd.toFixed(4)}` : "";
}
35
/**
 * Turn a progress event into the message part of a status line.
 *
 * Stage and retry messages begin with a leading space; run and case
 * messages do not. An event whose `kind` matches none of the known values
 * produces `undefined`.
 *
 * @param event - The progress event to describe.
 * @returns The message text, without prefix or trailing newline.
 */
function formatEvent(event) {
  const { kind } = event;
  if (kind === "run-start") {
    return `start mode=${event.mode} cases=${event.totalCases}`;
  }
  if (kind === "case-start" || kind === "case-end") {
    // Both case events share the same "[i/n] id (stage)" head.
    const head = `[${event.index}/${event.total}] ${event.caseId} (${event.stage})`;
    if (kind === "case-start") {
      return `${head} ...`;
    }
    const verdict = event.passed ? "PASS" : "FAIL";
    return `${head} ${verdict} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
  }
  if (kind === "stage-start") {
    return ` stage ${event.stage} ...`;
  }
  if (kind === "stage-end") {
    const verdict = event.passed ? "ok" : "fail";
    return ` stage ${event.stage} ${verdict} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
  }
  if (kind === "retry") {
    const target = event.stage ? `${event.caseId}/${event.stage}` : `${event.caseId}`;
    return ` retry ${target} attempt ${event.attempt}/${event.maxAttempts} in ${formatDuration(event.waitMs)} (${event.reason})`;
  }
  if (kind === "run-end") {
    const totals = `done pass=${event.passed} fail=${event.failed} total=${event.totalCases}`;
    return `${totals} in ${formatDuration(event.durationMs)}`;
  }
  return undefined;
}
@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
24
24
  lines.push(`- cclaw version: ${report.cclawVersion}`);
25
25
  lines.push(`- provider: ${report.provider}`);
26
26
  lines.push(`- model: ${report.model}`);
27
- lines.push(`- tier: ${report.tier}`);
27
+ lines.push(`- mode: ${report.mode}`);
28
28
  lines.push(`- stages: ${stages}`);
29
29
  lines.push(``);
30
30
  lines.push(`## Summary`);
@@ -1,10 +1,11 @@
1
1
  import type { FlowStage } from "../types.js";
2
2
  import { type EvalLlmClient } from "./llm-client.js";
3
- import type { EvalReport, EvalTier, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
3
+ import { type ProgressLogger } from "./progress.js";
4
+ import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
4
5
  export interface RunEvalOptions {
5
6
  projectRoot: string;
6
7
  stage?: FlowStage;
7
- tier?: EvalTier;
8
+ mode?: EvalMode;
8
9
  /** When true, run only structural verifiers (Step 1). */
9
10
  schemaOnly?: boolean;
10
11
  /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
21
22
  * without hitting the network.
22
23
  */
23
24
  llmClient?: EvalLlmClient;
25
+ /**
26
+ * Optional progress logger. The CLI wires a stderr-backed logger by
27
+ * default so users see one-line updates during long runs; tests and
28
+ * programmatic callers can inject a silent (noop) logger or capture
29
+ * events for assertions. When omitted, progress is silenced.
30
+ */
31
+ progress?: ProgressLogger;
32
+ /**
33
+ * Per-run USD cap. Enforced in-memory; independent from the daily cap
34
+ * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
35
+ * invocations. Undefined means no cap.
36
+ */
37
+ maxCostUsd?: number;
38
+ /**
39
+ * Override the configured `model` (and `judgeModel`) for this run.
40
+ * Used by `cclaw eval --compare-model` to replay the same corpus
41
+ * against an alternative model without editing `config.yaml`.
42
+ */
43
+ modelOverride?: string;
24
44
  }
25
45
  export interface DryRunSummary {
26
46
  kind: "dry-run";
@@ -33,7 +53,7 @@ export interface DryRunSummary {
33
53
  stage: FlowStage;
34
54
  }>;
35
55
  };
36
- /** Tier C-only workflow corpus summary. Empty for Tier A/B planned runs. */
56
+ /** Only populated in `workflow` mode; empty for fixture / agent modes. */
37
57
  workflowCorpus: {
38
58
  total: number;
39
59
  cases: Array<{
@@ -41,7 +61,7 @@ export interface DryRunSummary {
41
61
  stages: WorkflowStageName[];
42
62
  }>;
43
63
  };
44
- plannedTier: EvalTier;
64
+ plannedMode: EvalMode;
45
65
  verifiersAvailable: {
46
66
  structural: boolean;
47
67
  rules: boolean;
@@ -52,10 +72,10 @@ export interface DryRunSummary {
52
72
  notes: string[];
53
73
  }
54
74
  /**
55
- * Structural runner. When `schemaOnly` is set (or no other verifier flags are
56
- * active), runs structural verifiers against fixture-backed cases and loads
57
- * per-stage baselines for regression comparison. Tier A/B/C agent loops
58
- * arrive in later steps; until then cases without `fixture` are marked as
59
- * skipped rather than failing.
75
+ * Main eval runner. Dispatches between fixture-backed verification, the
76
+ * single-stage agent-with-tools loop, and the multi-stage workflow
77
+ * orchestrator based on `options.mode`. Per-stage baselines are loaded for
78
+ * regression comparison. Cases without a `fixture` path in the yaml are
79
+ * marked skipped (not failed) when no LLM drafting runs.
60
80
  */
61
81
  export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;