cclaw-cli 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +421 -64
- package/dist/cli.d.ts +8 -4
- package/dist/cli.js +318 -47
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +34 -1
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/content/start-command.d.ts +3 -2
- package/dist/content/start-command.js +5 -4
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +6 -6
- package/dist/eval/agents/with-tools.js +5 -5
- package/dist/eval/agents/workflow.d.ts +7 -0
- package/dist/eval/agents/workflow.js +5 -3
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +46 -17
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +1 -1
- package/dist/eval/diff.js +3 -3
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +29 -9
- package/dist/eval/runner.js +148 -56
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +54 -27
- package/dist/eval/types.js +21 -9
- package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
- package/dist/eval/workflow-corpus.d.ts +2 -2
- package/dist/eval/workflow-corpus.js +4 -4
- package/dist/install.d.ts +10 -0
- package/dist/install.js +19 -5
- package/package.json +1 -1
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helpers that translate between the legacy `Tier A/B/C` naming and the
|
|
3
|
+
* current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
|
|
4
|
+
*
|
|
5
|
+
* The names we actually carry in reports, config, CLI flags, and verifier
|
|
6
|
+
* messages are the `EvalMode` ones; legacy tier inputs are accepted with a
|
|
7
|
+
* single deprecation warning per process so existing scripts keep working
|
|
8
|
+
* through the 0.28.x line.
|
|
9
|
+
*/
|
|
10
|
+
import { EVAL_MODES } from "./types.js";
|
|
11
|
+
/** Legacy tier letter -> current `EvalMode` identifier. */
const LEGACY_TIER_TO_MODE = { A: "fixture", B: "agent", C: "workflow" };
/** Inverse of `LEGACY_TIER_TO_MODE`, derived so the two tables cannot drift apart. */
const MODE_TO_LEGACY_TIER = Object.fromEntries(Object.entries(LEGACY_TIER_TO_MODE).map(([tier, mode]) => [mode, tier]));
/** The legacy tier letters (`A`, `B`, `C`) that trigger the deprecation warning. */
const DEPRECATED_NAMES = new Set(Object.values(MODE_TO_LEGACY_TIER));
/** Module-level latch: flips to true once the deprecation warning has been written. */
let legacyWarningEmitted = false;
|
|
23
|
+
/**
 * Test-only hook: clears the module-level `legacyWarningEmitted` latch so
 * the next legacy tier name passed to `parseModeInput` writes the
 * deprecation warning again.
 */
export function __resetLegacyWarningForTests() {
    legacyWarningEmitted = false;
}
|
|
30
|
+
/**
 * Turn a raw CLI / env / config string into an `EvalMode`.
 *
 * Both the current names (the members of `EVAL_MODES`) and the legacy tier
 * letters (`A|B|C`) are accepted. Matching is done on the trimmed value and
 * is case-sensitive. The first legacy name seen in a process triggers one
 * deprecation warning; later legacy names are translated silently.
 *
 * @param raw - The user-supplied value; surrounding whitespace is ignored.
 * @param input - Origin of the value. Only `input.source` and `input.raw`
 *   are read, and only to build the deprecation warning text.
 * @param writeWarning - Sink for the deprecation warning; defaults to a
 *   stderr writer.
 * @returns The matching mode name.
 * @throws Error when the value is empty or is neither a current mode nor a
 *   legacy tier letter.
 */
export function parseModeInput(raw, input, writeWarning = defaultWriteWarning) {
    const candidate = raw.trim();
    // Built lazily: the list of accepted names is only needed on the error paths.
    const acceptedNames = () => `${EVAL_MODES.join("|")} (or legacy A|B|C)`;
    if (candidate === "") {
        throw new Error(`Evaluation mode must be one of: ${acceptedNames()}.`);
    }
    if (EVAL_MODES.includes(candidate)) {
        return candidate;
    }
    if (!DEPRECATED_NAMES.has(candidate)) {
        throw new Error(`Evaluation mode must be one of: ${acceptedNames()}, got: ${raw}`);
    }
    const replacement = LEGACY_TIER_TO_MODE[candidate];
    if (legacyWarningEmitted) {
        return replacement;
    }
    // Latch before writing so the warning is attempted at most once per process.
    legacyWarningEmitted = true;
    const notice = `[cclaw] "${input.source}: ${input.raw}" is using the legacy tier name "${candidate}". ` +
        `Please switch to --mode=${replacement} (legacy --tier=A|B|C will be removed in the next major release).`;
    writeWarning(notice);
    return replacement;
}
|
|
55
|
+
/**
 * Look up the legacy tier letter for an `EvalMode` name in
 * `MODE_TO_LEGACY_TIER`. A name that is not a key of that table yields
 * `undefined`.
 *
 * @deprecated kept for callers that still need to serialize as legacy.
 */
export function modeToLegacyTier(mode) {
    const { [mode]: legacyTier } = MODE_TO_LEGACY_TIER;
    return legacyTier;
}
|
|
59
|
+
/** Default warning sink: writes the message plus a trailing newline to stderr. */
function defaultWriteWarning(message) {
    const line = `${message}\n`;
    process.stderr.write(line);
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight progress logger for `cclaw eval`.
|
|
3
|
+
*
|
|
4
|
+
* The runner is otherwise silent: a full workflow-mode run can easily take
|
|
5
|
+
* a few minutes and the user would see nothing until the Markdown report
|
|
6
|
+
* hits disk. We emit structured events here so the CLI can print concise
|
|
7
|
+
* one-line status updates to stderr (stdout stays reserved for the final
|
|
8
|
+
* report + `--json` output).
|
|
9
|
+
*
|
|
10
|
+
* The logger is intentionally minimal: no ANSI colors, no spinners, no
|
|
11
|
+
* carriage-return rewrites. Those do not survive `tee`, CI log viewers,
|
|
12
|
+
* or the background `runs/tail` path (which copies the stream to a log
|
|
13
|
+
* file), and users also told us "nothing is clear now, everything is
|
|
14
|
+
* long" — so we optimize for log-friendly line-by-line readability.
|
|
15
|
+
*/
|
|
16
|
+
import type { EvalMode, WorkflowStageName } from "./types.js";
|
|
17
|
+
/**
 * Structured progress events passed to a `ProgressLogger`. The `kind`
 * field is the discriminant; each variant carries only the fields listed
 * for it.
 */
export type ProgressEvent =
    | {
        /** Start of a run: the mode in use and the number of cases. */
        kind: "run-start";
        mode: EvalMode;
        totalCases: number;
    }
    | {
        /** A case is starting; `index` / `total` give its position in the run. */
        kind: "case-start";
        caseId: string;
        stage: string;
        index: number;
        total: number;
    }
    | {
        /** A case finished, with its outcome, duration and optional cost. */
        kind: "case-end";
        caseId: string;
        stage: string;
        index: number;
        total: number;
        passed: boolean;
        durationMs: number;
        costUsd?: number;
    }
    | {
        /** A workflow stage of a case is starting. */
        kind: "stage-start";
        caseId: string;
        stage: WorkflowStageName;
        index: number;
        total: number;
    }
    | {
        /** A workflow stage finished, with its outcome, duration and optional cost. */
        kind: "stage-end";
        caseId: string;
        stage: WorkflowStageName;
        index: number;
        total: number;
        passed: boolean;
        durationMs: number;
        costUsd?: number;
    }
    | {
        /** A retry is scheduled: attempt counters, wait time and reason. */
        kind: "retry";
        caseId: string;
        stage?: string;
        attempt: number;
        maxAttempts: number;
        waitMs: number;
        reason: string;
    }
    | {
        /** End of a run: pass / fail counts and total duration. */
        kind: "run-end";
        totalCases: number;
        passed: number;
        failed: number;
        durationMs: number;
    };
|
|
66
|
+
/** Sink for progress events; implementations decide how (or whether) to report them. */
export interface ProgressLogger {
    /** Handle a single progress event. */
    emit(event: ProgressEvent): void;
}
|
|
69
|
+
/** Returns a logger whose `emit` does nothing; the compiled implementation hands back one shared instance. */
export declare function noopProgressLogger(): ProgressLogger;
|
|
70
|
+
/** Options accepted by `createStderrProgressLogger`. */
export interface StderrProgressLoggerOptions {
    /** Override the underlying write target; defaults to `process.stderr.write`. */
    writer?: (message: string) => void;
    /**
     * Return wall-clock in ms. Injectable for tests.
     *
     * NOTE(review): the compiled `createStderrProgressLogger` shipped alongside
     * this declaration only reads `writer`; confirm whether `now` is consumed
     * anywhere.
     */
    now?: () => number;
}
|
|
76
|
+
/**
 * Build a logger that writes one line per progress event, by default to
 * stderr.
 *
 * Every line has the plain form `[cclaw eval] <message>` so the prefix can
 * be grepped in combined logs. Costs are printed with 4 decimal places so
 * that sub-cent amounts remain visible.
 *
 * @param opts - Optional overrides for the write target.
 */
export declare function createStderrProgressLogger(opts?: StderrProgressLoggerOptions): ProgressLogger;
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/** Shared logger that discards every event. */
const NOOP_LOGGER = { emit: () => { } };
/** Returns the shared silent logger; every call yields the same object. */
export function noopProgressLogger() {
    return NOOP_LOGGER;
}
|
|
5
|
+
/**
 * Build a progress logger that prints one line per event.
 *
 * Each event is rendered by `formatEvent`, prefixed with `[cclaw eval] ` and
 * terminated with a newline. Lines go to `opts.writer` when provided and to
 * `process.stderr.write` otherwise. `writer` is the only option read here.
 *
 * @param opts - Optional settings; see `writer` above.
 * @returns An object exposing `emit(event)`.
 */
export function createStderrProgressLogger(opts = {}) {
    const sink = opts.writer ?? ((line) => process.stderr.write(line));
    function emit(event) {
        const body = formatEvent(event);
        sink(`[cclaw eval] ${body}\n`);
    }
    return { emit };
}
|
|
20
|
+
/**
 * Render a duration in milliseconds as a short human-readable string.
 *
 * - under 1s: `<ms>ms`
 * - under a minute: seconds with one decimal, e.g. `12.3s`
 * - otherwise: `<m>m<ss>s` with zero-padded seconds, e.g. `2m05s`
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    const s = ms / 1000;
    // Values from 59.95s upward would print as "60.0s" with one decimal;
    // send them to the minute form instead.
    if (s < 59.95)
        return `${s.toFixed(1)}s`;
    // Round to whole seconds BEFORE splitting into minutes and seconds, so the
    // seconds part can never round up to 60 (previously 119.6s gave "1m60s").
    const totalSeconds = Math.round(s);
    const m = Math.floor(totalSeconds / 60);
    const rem = totalSeconds % 60;
    return `${m}m${rem.toString().padStart(2, "0")}s`;
}
|
|
30
|
+
/**
 * Render an optional USD cost as a suffix for a status line.
 *
 * @param usd - Cost in US dollars, or `undefined` when unknown.
 * @returns ` $<amount>` with 4 decimal places (leading space included), or
 *   an empty string when the cost is missing, not a finite number, zero or
 *   negative.
 */
function formatCost(usd) {
    // Number.isFinite rejects undefined, NaN and ±Infinity in one check, so a
    // broken cost never shows up in the log as "$NaN" or "$Infinity".
    if (!Number.isFinite(usd) || usd <= 0)
        return "";
    return ` $${usd.toFixed(4)}`;
}
|
|
35
|
+
/**
 * Turn a progress event into the message part of a status line (no prefix,
 * no trailing newline).
 *
 * Run and case events start at column 0; stage and retry events are
 * indented so they read as children of the preceding case line.
 *
 * @param event - The progress event to render; `event.kind` selects the layout.
 * @returns The formatted message. An unrecognized `kind` yields a generic
 *   `unknown event kind=<kind>` message rather than `undefined`.
 */
function formatEvent(event) {
    switch (event.kind) {
        case "run-start":
            return `start mode=${event.mode} cases=${event.totalCases}`;
        case "case-start":
            return `[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ...`;
        case "case-end": {
            const status = event.passed ? "PASS" : "FAIL";
            return (`[${event.index}/${event.total}] ${event.caseId} (${event.stage}) ${status} ` +
                `in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`);
        }
        case "stage-start":
            return `  stage ${event.stage} ...`;
        case "stage-end": {
            const status = event.passed ? "ok" : "fail";
            return `  stage ${event.stage} ${status} in ${formatDuration(event.durationMs)}${formatCost(event.costUsd)}`;
        }
        case "retry":
            return (`  retry ${event.caseId}${event.stage ? `/${event.stage}` : ""} ` +
                `attempt ${event.attempt}/${event.maxAttempts} in ${formatDuration(event.waitMs)} (${event.reason})`);
        case "run-end":
            return (`done pass=${event.passed} fail=${event.failed} total=${event.totalCases} ` +
                `in ${formatDuration(event.durationMs)}`);
        default:
            // Without this branch an unexpected kind returned undefined and the
            // log line read "[cclaw eval] undefined".
            return `unknown event kind=${String(event.kind)}`;
    }
}
|
package/dist/eval/report.js
CHANGED
|
@@ -24,7 +24,7 @@ export function formatMarkdownReport(report) {
|
|
|
24
24
|
lines.push(`- cclaw version: ${report.cclawVersion}`);
|
|
25
25
|
lines.push(`- provider: ${report.provider}`);
|
|
26
26
|
lines.push(`- model: ${report.model}`);
|
|
27
|
-
lines.push(`-
|
|
27
|
+
lines.push(`- mode: ${report.mode}`);
|
|
28
28
|
lines.push(`- stages: ${stages}`);
|
|
29
29
|
lines.push(``);
|
|
30
30
|
lines.push(`## Summary`);
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
import { type EvalLlmClient } from "./llm-client.js";
|
|
3
|
-
import
|
|
3
|
+
import { type ProgressLogger } from "./progress.js";
|
|
4
|
+
import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
|
|
4
5
|
export interface RunEvalOptions {
|
|
5
6
|
projectRoot: string;
|
|
6
7
|
stage?: FlowStage;
|
|
7
|
-
|
|
8
|
+
mode?: EvalMode;
|
|
8
9
|
/** When true, run only structural verifiers (Step 1). */
|
|
9
10
|
schemaOnly?: boolean;
|
|
10
11
|
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
@@ -21,6 +22,25 @@ export interface RunEvalOptions {
|
|
|
21
22
|
* without hitting the network.
|
|
22
23
|
*/
|
|
23
24
|
llmClient?: EvalLlmClient;
|
|
25
|
+
/**
|
|
26
|
+
* Optional progress logger. The CLI wires a stderr-backed logger by
|
|
27
|
+
* default so users see one-line updates during long runs; tests and
|
|
28
|
+
* programmatic callers can inject a silent (noop) logger or capture
|
|
29
|
+
* events for assertions. When omitted, progress is silenced.
|
|
30
|
+
*/
|
|
31
|
+
progress?: ProgressLogger;
|
|
32
|
+
/**
|
|
33
|
+
* Per-run USD cap. Enforced in-memory; independent from the daily cap
|
|
34
|
+
* (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
|
|
35
|
+
* invocations. Undefined means no cap.
|
|
36
|
+
*/
|
|
37
|
+
maxCostUsd?: number;
|
|
38
|
+
/**
|
|
39
|
+
* Override the configured `model` (and `judgeModel`) for this run.
|
|
40
|
+
* Used by `cclaw eval --compare-model` to replay the same corpus
|
|
41
|
+
* against an alternative model without editing `config.yaml`.
|
|
42
|
+
*/
|
|
43
|
+
modelOverride?: string;
|
|
24
44
|
}
|
|
25
45
|
export interface DryRunSummary {
|
|
26
46
|
kind: "dry-run";
|
|
@@ -33,7 +53,7 @@ export interface DryRunSummary {
|
|
|
33
53
|
stage: FlowStage;
|
|
34
54
|
}>;
|
|
35
55
|
};
|
|
36
|
-
/**
|
|
56
|
+
/** Only populated in `workflow` mode; empty for fixture / agent modes. */
|
|
37
57
|
workflowCorpus: {
|
|
38
58
|
total: number;
|
|
39
59
|
cases: Array<{
|
|
@@ -41,7 +61,7 @@ export interface DryRunSummary {
|
|
|
41
61
|
stages: WorkflowStageName[];
|
|
42
62
|
}>;
|
|
43
63
|
};
|
|
44
|
-
|
|
64
|
+
plannedMode: EvalMode;
|
|
45
65
|
verifiersAvailable: {
|
|
46
66
|
structural: boolean;
|
|
47
67
|
rules: boolean;
|
|
@@ -52,10 +72,10 @@ export interface DryRunSummary {
|
|
|
52
72
|
notes: string[];
|
|
53
73
|
}
|
|
54
74
|
/**
|
|
55
|
-
*
|
|
56
|
-
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
* skipped
|
|
75
|
+
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
76
|
+
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
77
|
+
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
78
|
+
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
79
|
+
* marked skipped (not failed) when no LLM drafting runs.
|
|
60
80
|
*/
|
|
61
81
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|