cclaw-cli 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +421 -64
  2. package/dist/cli.d.ts +8 -4
  3. package/dist/cli.js +318 -47
  4. package/dist/constants.d.ts +1 -1
  5. package/dist/constants.js +34 -1
  6. package/dist/content/eval-scaffold.d.ts +2 -2
  7. package/dist/content/eval-scaffold.js +7 -6
  8. package/dist/content/start-command.d.ts +3 -2
  9. package/dist/content/start-command.js +5 -4
  10. package/dist/eval/agents/single-shot.d.ts +1 -1
  11. package/dist/eval/agents/single-shot.js +4 -4
  12. package/dist/eval/agents/with-tools.d.ts +6 -6
  13. package/dist/eval/agents/with-tools.js +5 -5
  14. package/dist/eval/agents/workflow.d.ts +7 -0
  15. package/dist/eval/agents/workflow.js +5 -3
  16. package/dist/eval/baseline.d.ts +24 -0
  17. package/dist/eval/baseline.js +75 -2
  18. package/dist/eval/config-loader.js +46 -17
  19. package/dist/eval/cost-guard.d.ts +22 -0
  20. package/dist/eval/cost-guard.js +38 -1
  21. package/dist/eval/diff.d.ts +1 -1
  22. package/dist/eval/diff.js +3 -3
  23. package/dist/eval/llm-client.d.ts +13 -2
  24. package/dist/eval/llm-client.js +8 -1
  25. package/dist/eval/mode.d.ts +28 -0
  26. package/dist/eval/mode.js +61 -0
  27. package/dist/eval/progress.d.ts +83 -0
  28. package/dist/eval/progress.js +59 -0
  29. package/dist/eval/report.js +1 -1
  30. package/dist/eval/runner.d.ts +29 -9
  31. package/dist/eval/runner.js +148 -56
  32. package/dist/eval/runs.d.ts +41 -0
  33. package/dist/eval/runs.js +114 -0
  34. package/dist/eval/sandbox.js +1 -1
  35. package/dist/eval/tools/index.js +1 -1
  36. package/dist/eval/tools/types.d.ts +1 -1
  37. package/dist/eval/types.d.ts +54 -27
  38. package/dist/eval/types.js +21 -9
  39. package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
  40. package/dist/eval/workflow-corpus.d.ts +2 -2
  41. package/dist/eval/workflow-corpus.js +4 -4
  42. package/dist/install.d.ts +10 -0
  43. package/dist/install.js +19 -5
  44. package/package.json +1 -1
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Tier A single-shot agent.
2
+ * Single-shot agent used by fixture mode when `--judge` is set.
3
3
  *
4
4
  * Simplest realistic AUT: one LLM call with the stage's SKILL.md as the
5
5
  * system prompt and the case's `inputPrompt` as the user message. Output
@@ -9,7 +9,7 @@
9
9
  * Design notes:
10
10
  *
11
11
  * - No tools. No multi-turn. No reads of the project beyond the one
12
- * SKILL.md. Tier B/C layer complexity on top in later steps.
12
+ * SKILL.md. agent/workflow modes layer complexity on top.
13
13
  * - Errors are propagated as-is (`EvalLlmError` subclasses) so the
14
14
  * runner can surface them as verifier failures without swallowing the
15
15
  * cause.
@@ -27,7 +27,7 @@ export async function loadStageSkill(projectRoot, stage) {
27
27
  const file = path.join(projectRoot, RUNTIME_ROOT, "skills", folder, "SKILL.md");
28
28
  if (!(await exists(file))) {
29
29
  throw new Error(`Stage skill not found: ${path.relative(projectRoot, file)}. ` +
30
- `Run \`cclaw init\` (or \`cclaw sync\`) before \`cclaw eval --tier=A --judge\`.`);
30
+ `Run \`cclaw init\` (or \`cclaw sync\`) before \`cclaw eval --mode=fixture --judge\`.`);
31
31
  }
32
32
  return fs.readFile(file, "utf8");
33
33
  }
@@ -50,7 +50,7 @@ function buildUserPrompt(caseEntry) {
50
50
  `Do not wrap in code fences, do not add commentary before or after.`);
51
51
  return lines.join("\n");
52
52
  }
53
- /** Run the Tier A single-shot AUT and return the produced artifact. */
53
+ /** Run the single-shot AUT (fixture mode + --judge) and return the produced artifact. */
54
54
  export async function runSingleShot(input) {
55
55
  const { caseEntry, config, projectRoot, client } = input;
56
56
  const started = Date.now();
@@ -18,15 +18,15 @@ export interface WithToolsInput {
18
18
  createSandboxFn?: typeof createSandbox;
19
19
  /**
20
20
  * Reuse an externally-managed sandbox instead of creating + disposing a
21
- * per-call one. Tier C workflow orchestration uses this so every stage
22
- * shares the same sandbox and earlier artifacts remain visible. When
23
- * set, the caller is responsible for `dispose()`.
21
+ * per-call one. Workflow mode uses this so every stage shares the same
22
+ * sandbox and earlier artifacts remain visible. When set, the caller is
23
+ * responsible for `dispose()`.
24
24
  */
25
25
  externalSandbox?: Sandbox;
26
26
  /**
27
- * Optional override of the default user prompt prefix. Tier C uses this
28
- * to tell the model which stage it is on and where the prior artifacts
29
- * are located.
27
+ * Optional override of the default user prompt prefix. Workflow mode uses
28
+ * this to tell the model which stage it is on and where the prior
29
+ * artifacts are located.
30
30
  */
31
31
  promptPreamble?: string;
32
32
  }
@@ -1,11 +1,11 @@
1
1
  /**
2
- * Tier B with-tools agent.
2
+ * Multi-turn with-tools agent (agent mode, reused by workflow mode).
3
3
  *
4
4
  * Multi-turn loop with OpenAI-style function-calling over a set of
5
5
  * sandbox-confined tools. The AUT is given:
6
6
  *
7
- * - System prompt = stage SKILL.md (same contract as Tier A so the
8
- * single-shot baseline is comparable).
7
+ * - System prompt = stage SKILL.md (same contract as the single-shot path
8
+ * so the baseline is comparable).
9
9
  * - User prompt = task description + a short "tools available" hint
10
10
  * that names the sandbox root and the four built-in tools.
11
11
  * - Tools = `read_file`, `write_file`, `glob`, `grep` (see
@@ -29,7 +29,7 @@
29
29
  * Artifact resolution: the final assistant content is the artifact. If
30
30
  * the model used `write_file` to stage the artifact at
31
31
  * `artifact.md` (or `artifact/<stage>.md`), we prefer that file — it
32
- * mirrors the Tier C workflow where writes are the deliverable. The
32
+ * mirrors workflow mode where writes are the deliverable. The
33
33
  * fallback is the terminal assistant message so prompts that don't
34
34
  * call write_file still produce something judgable.
35
35
  */
@@ -42,7 +42,7 @@ import { loadStageSkill } from "./single-shot.js";
42
42
  export class MaxTurnsExceededError extends Error {
43
43
  turns;
44
44
  constructor(turns) {
45
- super(`Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`);
45
+ super(`Agent loop exceeded the ${turns}-turn budget without a terminal stop.`);
46
46
  this.name = "MaxTurnsExceededError";
47
47
  this.turns = turns;
48
48
  }
@@ -12,6 +12,13 @@ export interface WorkflowInput {
12
12
  loadSkill?: (stage: WorkflowStageName) => Promise<string>;
13
13
  /** Override for the sandbox factory (test hook). */
14
14
  createSandboxFn?: typeof createSandbox;
15
+ /**
16
+ * Optional per-stage lifecycle hooks. The runner uses these to emit
17
+ * progress events to stderr so workflow-mode runs surface real-time
18
+ * status rather than going silent for minutes.
19
+ */
20
+ onStageStart?: (stage: WorkflowStageName) => void;
21
+ onStageEnd?: (stage: WorkflowStageName, result: WorkflowStageResult) => void;
15
22
  }
16
23
  export interface WorkflowOutput {
17
24
  caseId: string;
@@ -1,7 +1,7 @@
1
1
  /**
2
- * Tier C workflow agent.
2
+ * Workflow-mode agent.
3
3
  *
4
- * Runs the Tier B with-tools loop once per stage in a workflow case,
4
+ * Runs the with-tools loop once per stage in a workflow case,
5
5
  * sharing a single sandbox across stages so every new stage can read
6
6
  * the earlier artifacts the model produced. The shape of the run is:
7
7
  *
@@ -46,6 +46,7 @@ export async function runWorkflow(input) {
46
46
  try {
47
47
  await fs.mkdir(await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true }), { recursive: true });
48
48
  for (const step of workflow.stages) {
49
+ input.onStageStart?.(step.name);
49
50
  await clearArtifactFile(sandbox);
50
51
  const priorStages = stageResults.map((r) => r.stage);
51
52
  const preamble = buildStagePreamble(workflow, step.name, priorStages);
@@ -83,6 +84,7 @@ export async function runWorkflow(input) {
83
84
  completionTokens: result.usage.completionTokens
84
85
  };
85
86
  stageResults.push(stageResult);
87
+ input.onStageEnd?.(step.name, stageResult);
86
88
  totalUsageUsd += result.usageUsd;
87
89
  totalDurationMs += result.durationMs;
88
90
  }
@@ -118,7 +120,7 @@ async function persistStageArtifact(sandbox, stage, artifact) {
118
120
  }
119
121
  function buildStagePreamble(workflow, current, priorStages) {
120
122
  const lines = [];
121
- lines.push(`You are running stage "${current}" of the Tier C workflow "${workflow.id}".`);
123
+ lines.push(`You are running stage "${current}" of the workflow "${workflow.id}".`);
122
124
  if (workflow.description) {
123
125
  lines.push(`Case description: ${workflow.description}`);
124
126
  }
@@ -1,6 +1,30 @@
1
1
  import type { FlowStage } from "../types.js";
2
2
  import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
3
3
  export declare const BASELINE_SCHEMA_VERSION = 1;
4
+ /**
5
+ * Thrown when a signed baseline's on-disk digest does not match the
6
+ * canonical encoding of its `{ schemaVersion, stage, cases }` block.
7
+ * Callers should treat this as a hard failure: the baseline was either
8
+ * hand-edited or corrupted and cannot be trusted for regression gating.
9
+ */
10
+ export declare class BaselineSignatureError extends Error {
11
+ readonly file: string;
12
+ readonly expected: string;
13
+ readonly actual: string;
14
+ constructor(opts: {
15
+ file: string;
16
+ expected: string;
17
+ actual: string;
18
+ });
19
+ }
20
+ /**
21
+ * Produce a deterministic sha256 digest over the signable portion of a
22
+ * baseline. We intentionally exclude `generatedAt` and `cclawVersion`
23
+ * from the digest so that rebuilding the same baseline from identical
24
+ * case results on a new CLI version doesn't invalidate the signature —
25
+ * only changes to the observed pass/ok/score payloads do.
26
+ */
27
+ export declare function computeBaselineDigest(snapshot: Pick<BaselineSnapshot, "schemaVersion" | "stage" | "cases">): string;
4
28
  export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
5
29
  export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
6
30
  export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
@@ -14,15 +14,67 @@
14
14
  * Writes are gated behind an explicit `--update-baseline --confirm` pair at
15
15
  * the CLI layer so accidental resets do not slip into PRs.
16
16
  */
17
+ import { createHash } from "node:crypto";
17
18
  import fs from "node:fs/promises";
18
19
  import path from "node:path";
19
20
  import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
20
21
  import { exists } from "../fs-utils.js";
21
22
  import { FLOW_STAGES } from "../types.js";
22
23
  export const BASELINE_SCHEMA_VERSION = 1;
24
+ /**
25
+ * Thrown when a signed baseline's on-disk digest does not match the
26
+ * canonical encoding of its `{ schemaVersion, stage, cases }` block.
27
+ * Callers should treat this as a hard failure: the baseline was either
28
+ * hand-edited or corrupted and cannot be trusted for regression gating.
29
+ */
30
+ export class BaselineSignatureError extends Error {
31
+ file;
32
+ expected;
33
+ actual;
34
+ constructor(opts) {
35
+ super(`Baseline signature mismatch at ${opts.file}: expected ${opts.expected}, got ${opts.actual}. ` +
36
+ `The file was modified outside of \`cclaw eval --update-baseline\`. ` +
37
+ `Re-run with --update-baseline --confirm to re-sign a known-good snapshot.`);
38
+ this.name = "BaselineSignatureError";
39
+ this.file = opts.file;
40
+ this.expected = opts.expected;
41
+ this.actual = opts.actual;
42
+ }
43
+ }
23
44
  function baselinePath(projectRoot, stage) {
24
45
  return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
25
46
  }
47
+ /**
48
+ * Produce a deterministic sha256 digest over the signable portion of a
49
+ * baseline. We intentionally exclude `generatedAt` and `cclawVersion`
50
+ * from the digest so that rebuilding the same baseline from identical
51
+ * case results on a new CLI version doesn't invalidate the signature —
52
+ * only changes to the observed pass/ok/score payloads do.
53
+ */
54
+ export function computeBaselineDigest(snapshot) {
55
+ const canonical = canonicalJson({
56
+ schemaVersion: snapshot.schemaVersion,
57
+ stage: snapshot.stage,
58
+ cases: snapshot.cases
59
+ });
60
+ return createHash("sha256").update(canonical).digest("hex");
61
+ }
62
+ /**
63
+ * JSON.stringify with object keys sorted recursively so the digest is
64
+ * stable across filesystem / serializer variations.
65
+ */
66
+ function canonicalJson(value) {
67
+ if (value === null || typeof value !== "object") {
68
+ return JSON.stringify(value);
69
+ }
70
+ if (Array.isArray(value)) {
71
+ return `[${value.map((v) => canonicalJson(v)).join(",")}]`;
72
+ }
73
+ const record = value;
74
+ const keys = Object.keys(record).sort();
75
+ const parts = keys.map((k) => `${JSON.stringify(k)}:${canonicalJson(record[k])}`);
76
+ return `{${parts.join(",")}}`;
77
+ }
26
78
  export async function loadBaseline(projectRoot, stage) {
27
79
  const filePath = baselinePath(projectRoot, stage);
28
80
  if (!(await exists(filePath)))
@@ -38,6 +90,20 @@ export async function loadBaseline(projectRoot, stage) {
38
90
  if (!isBaseline(parsed, stage)) {
39
91
  throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
40
92
  }
93
+ const signature = parsed.signature;
94
+ if (signature) {
95
+ if (signature.algorithm !== "sha256") {
96
+ throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
97
+ }
98
+ const actual = computeBaselineDigest(parsed);
99
+ if (actual !== signature.digest) {
100
+ throw new BaselineSignatureError({
101
+ file: filePath,
102
+ expected: signature.digest,
103
+ actual
104
+ });
105
+ }
106
+ }
41
107
  return parsed;
42
108
  }
43
109
  function isBaseline(value, stage) {
@@ -80,13 +146,20 @@ export function buildBaselineForStage(stage, report) {
80
146
  for (const c of stageCases) {
81
147
  cases[c.caseId] = entryFromResult(c);
82
148
  }
83
- return {
149
+ const now = new Date().toISOString();
150
+ const unsigned = {
84
151
  schemaVersion: BASELINE_SCHEMA_VERSION,
85
152
  stage,
86
- generatedAt: new Date().toISOString(),
153
+ generatedAt: now,
87
154
  cclawVersion: CCLAW_VERSION,
88
155
  cases
89
156
  };
157
+ unsigned.signature = {
158
+ algorithm: "sha256",
159
+ digest: computeBaselineDigest(unsigned),
160
+ signedAt: now
161
+ };
162
+ return unsigned;
90
163
  }
91
164
  export async function writeBaselinesFromReport(projectRoot, report) {
92
165
  const written = [];
@@ -3,7 +3,8 @@ import path from "node:path";
3
3
  import { parse } from "yaml";
4
4
  import { EVALS_CONFIG_PATH } from "../constants.js";
5
5
  import { exists } from "../fs-utils.js";
6
- import { EVAL_TIERS } from "./types.js";
6
+ import { EVAL_MODES } from "./types.js";
7
+ import { parseModeInput } from "./mode.js";
7
8
  /**
8
9
  * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
9
10
  * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
@@ -14,7 +15,7 @@ export const DEFAULT_EVAL_CONFIG = {
14
15
  provider: "zai",
15
16
  baseUrl: "https://api.z.ai/api/coding/paas/v4",
16
17
  model: "glm-5.1",
17
- defaultTier: "A",
18
+ defaultMode: "fixture",
18
19
  regression: {
19
20
  failIfDeltaBelow: -0.15,
20
21
  failIfCriticalBelow: 3.0
@@ -25,7 +26,6 @@ export const DEFAULT_EVAL_CONFIG = {
25
26
  judgeTemperature: 0,
26
27
  agentTemperature: 0.2
27
28
  };
28
- const EVAL_TIER_SET = new Set(EVAL_TIERS);
29
29
  const NUMERIC_ENVS = new Set([
30
30
  "CCLAW_EVAL_DAILY_USD_CAP",
31
31
  "CCLAW_EVAL_TIMEOUT_MS",
@@ -40,7 +40,7 @@ const NUMERIC_ENVS = new Set([
40
40
  ]);
41
41
  function evalConfigError(configFilePath, reason) {
42
42
  return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
43
- `Supported tiers: ${EVAL_TIERS.join(", ")}\n` +
43
+ `Supported modes: ${EVAL_MODES.join(", ")} (legacy tier values A|B|C also accepted).\n` +
44
44
  `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`);
45
45
  }
46
46
  function isRecord(value) {
@@ -53,12 +53,11 @@ function parseNumericEnv(name, raw) {
53
53
  }
54
54
  return value;
55
55
  }
56
- function parseTierEnv(raw) {
57
- const trimmed = raw.trim().toUpperCase();
58
- if (!EVAL_TIER_SET.has(trimmed)) {
59
- throw new Error(`Environment variable CCLAW_EVAL_TIER must be one of ${EVAL_TIERS.join("/")}, got: ${raw}`);
60
- }
61
- return trimmed;
56
+ function parseModeEnv(raw, envName) {
57
+ return parseModeInput(envName === "CCLAW_EVAL_TIER" ? raw.toUpperCase() : raw, {
58
+ source: "env",
59
+ raw: `${envName}=${raw}`
60
+ });
62
61
  }
63
62
  function validateFileConfig(raw, configFilePath) {
64
63
  if (raw === undefined || raw === null)
@@ -79,11 +78,33 @@ function validateFileConfig(raw, configFilePath) {
79
78
  assignString("baseUrl", raw.baseUrl);
80
79
  assignString("model", raw.model);
81
80
  assignString("judgeModel", raw.judgeModel);
82
- if (raw.defaultTier !== undefined) {
83
- if (typeof raw.defaultTier !== "string" || !EVAL_TIER_SET.has(raw.defaultTier)) {
84
- throw evalConfigError(configFilePath, `"defaultTier" must be one of: ${EVAL_TIERS.join(", ")}`);
81
+ if (raw.defaultMode !== undefined) {
82
+ if (typeof raw.defaultMode !== "string") {
83
+ throw evalConfigError(configFilePath, `"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
84
+ }
85
+ try {
86
+ out.defaultMode = parseModeInput(raw.defaultMode, {
87
+ source: "config",
88
+ raw: `defaultMode: ${raw.defaultMode}`
89
+ });
90
+ }
91
+ catch (err) {
92
+ throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
93
+ }
94
+ }
95
+ else if (raw.defaultTier !== undefined) {
96
+ if (typeof raw.defaultTier !== "string") {
97
+ throw evalConfigError(configFilePath, `"defaultTier" must be a string (legacy; prefer "defaultMode")`);
98
+ }
99
+ try {
100
+ out.defaultMode = parseModeInput(raw.defaultTier, {
101
+ source: "config",
102
+ raw: `defaultTier: ${raw.defaultTier}`
103
+ });
104
+ }
105
+ catch (err) {
106
+ throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
85
107
  }
86
- out.defaultTier = raw.defaultTier;
87
108
  }
88
109
  if (raw.dailyUsdCap !== undefined) {
89
110
  if (typeof raw.dailyUsdCap !== "number" || raw.dailyUsdCap < 0) {
@@ -194,6 +215,7 @@ function validateFileConfig(raw, configFilePath) {
194
215
  "baseUrl",
195
216
  "model",
196
217
  "judgeModel",
218
+ "defaultMode",
197
219
  "defaultTier",
198
220
  "dailyUsdCap",
199
221
  "timeoutMs",
@@ -266,11 +288,18 @@ function applyEnvOverrides(base, env) {
266
288
  patched.provider = provider;
267
289
  overridden = true;
268
290
  }
269
- const tier = read("CCLAW_EVAL_TIER");
270
- if (tier) {
271
- patched.defaultTier = parseTierEnv(tier);
291
+ const modeEnv = read("CCLAW_EVAL_MODE");
292
+ if (modeEnv) {
293
+ patched.defaultMode = parseModeEnv(modeEnv, "CCLAW_EVAL_MODE");
272
294
  overridden = true;
273
295
  }
296
+ else {
297
+ const legacyTier = read("CCLAW_EVAL_TIER");
298
+ if (legacyTier) {
299
+ patched.defaultMode = parseModeEnv(legacyTier, "CCLAW_EVAL_TIER");
300
+ overridden = true;
301
+ }
302
+ }
274
303
  const cap = read("CCLAW_EVAL_DAILY_USD_CAP");
275
304
  if (cap) {
276
305
  patched.dailyUsdCap = parseNumericEnv("CCLAW_EVAL_DAILY_USD_CAP", cap);
@@ -35,6 +35,22 @@ export declare class DailyCostCapExceededError extends Error {
35
35
  currentUsd: number;
36
36
  });
37
37
  }
38
+ /**
39
+ * Per-run cost cap — enforced in-memory, no ledger file. Complements the
40
+ * daily cap so a single long workflow run can't blow the whole day's
41
+ * budget even if the daily cap is generous. Opt-in via
42
+ * `--max-cost-usd=<n>` on the CLI or `CCLAW_EVAL_MAX_COST_USD`.
43
+ */
44
+ export declare class RunCostCapExceededError extends Error {
45
+ readonly capUsd: number;
46
+ readonly projectedUsd: number;
47
+ readonly currentUsd: number;
48
+ constructor(opts: {
49
+ capUsd: number;
50
+ projectedUsd: number;
51
+ currentUsd: number;
52
+ });
53
+ }
38
54
  declare function utcDate(now?: Date): string;
39
55
  declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
40
56
  /**
@@ -67,6 +83,12 @@ export interface CreateCostGuardOptions {
67
83
  now?: () => Date;
68
84
  /** Override the default filesystem root for the ledger. */
69
85
  ledgerPath?: string;
86
+ /**
87
+ * Per-run (in-memory) USD cap. Independent from the persisted daily
88
+ * cap so a single `cclaw eval` invocation can be budgeted without
89
+ * touching the shared nightly ledger. Undefined = unlimited.
90
+ */
91
+ runCapUsd?: number;
70
92
  }
71
93
  export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
72
94
  /** Exposed for tests. */
@@ -52,6 +52,28 @@ export class DailyCostCapExceededError extends Error {
52
52
  this.currentUsd = opts.currentUsd;
53
53
  }
54
54
  }
55
+ /**
56
+ * Per-run cost cap — enforced in-memory, no ledger file. Complements the
57
+ * daily cap so a single long workflow run can't blow the whole day's
58
+ * budget even if the daily cap is generous. Opt-in via
59
+ * `--max-cost-usd=<n>` on the CLI or `CCLAW_EVAL_MAX_COST_USD`.
60
+ */
61
+ export class RunCostCapExceededError extends Error {
62
+ capUsd;
63
+ projectedUsd;
64
+ currentUsd;
65
+ constructor(opts) {
66
+ super(`Run cost cap would be exceeded: ` +
67
+ `current=$${opts.currentUsd.toFixed(4)}, ` +
68
+ `projected=$${opts.projectedUsd.toFixed(4)}, ` +
69
+ `cap=$${opts.capUsd.toFixed(4)}. ` +
70
+ `Raise --max-cost-usd or drop it to run uncapped.`);
71
+ this.name = "RunCostCapExceededError";
72
+ this.capUsd = opts.capUsd;
73
+ this.projectedUsd = opts.projectedUsd;
74
+ this.currentUsd = opts.currentUsd;
75
+ }
76
+ }
55
77
  function utcDate(now = new Date()) {
56
78
  return now.toISOString().slice(0, 10);
57
79
  }
@@ -109,11 +131,25 @@ export function createCostGuard(projectRoot, config, options = {}) {
109
131
  const now = options.now ?? (() => new Date());
110
132
  const currentDate = () => utcDate(now());
111
133
  const file = () => options.ledgerPath ?? ledgerPath(projectRoot, currentDate());
134
+ const runCap = options.runCapUsd;
135
+ let runTotalUsd = 0;
112
136
  return {
113
137
  async commit(model, usage) {
114
138
  const usd = computeUsageUsd(model, usage, config);
115
- if (config.dailyUsdCap === undefined)
139
+ if (runCap !== undefined) {
140
+ const projected = Number((runTotalUsd + usd).toFixed(6));
141
+ if (projected > runCap) {
142
+ throw new RunCostCapExceededError({
143
+ capUsd: runCap,
144
+ projectedUsd: projected,
145
+ currentUsd: runTotalUsd
146
+ });
147
+ }
148
+ }
149
+ if (config.dailyUsdCap === undefined) {
150
+ runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
116
151
  return usd;
152
+ }
117
153
  const date = currentDate();
118
154
  const target = file();
119
155
  const ledger = await readLedger(target, date);
@@ -133,6 +169,7 @@ export function createCostGuard(projectRoot, config, options = {}) {
133
169
  byModel.usd = Number((byModel.usd + usd).toFixed(6));
134
170
  ledger.byModel[model] = byModel;
135
171
  await writeLedger(target, ledger);
172
+ runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
136
173
  return usd;
137
174
  },
138
175
  async snapshot() {
@@ -53,7 +53,7 @@ export interface EvalDiffReportMeta {
53
53
  runId: string;
54
54
  cclawVersion: string;
55
55
  generatedAt: string;
56
- tier: string;
56
+ mode: string;
57
57
  model: string;
58
58
  sourcePath: string;
59
59
  }
package/dist/eval/diff.js CHANGED
@@ -8,8 +8,8 @@
8
8
  * - per-case pass/fail transitions
9
9
  * - per-verifier score drops (only the drops — new passes are noted in
10
10
  * the summary line, not repeated per verifier)
11
- * - Tier C stage-level cost & duration deltas when both reports carry a
12
- * `workflow` summary for the same case id
11
+ * - Workflow-mode stage-level cost & duration deltas when both reports
12
+ * carry a `workflow` summary for the same case id
13
13
  *
14
14
  * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
15
15
  *
@@ -98,7 +98,7 @@ function meta(report, sourcePath) {
98
98
  runId: report.runId,
99
99
  cclawVersion: report.cclawVersion,
100
100
  generatedAt: report.generatedAt,
101
- tier: report.tier,
101
+ mode: report.mode,
102
102
  model: report.model,
103
103
  sourcePath
104
104
  };
@@ -7,7 +7,7 @@ export interface ChatMessage {
7
7
  toolCallId?: string;
8
8
  /**
9
9
  * OpenAI-style tool calls carried on a preceding assistant message.
10
- * Populated by the Tier B loop so the wire transcript stays
10
+ * Populated by the with-tools loop so the wire transcript stays
11
11
  * consistent (assistant message → tool responses).
12
12
  */
13
13
  toolCalls?: Array<{
@@ -35,7 +35,7 @@ export interface ChatRequest {
35
35
  seed?: number;
36
36
  /**
37
37
  * Tool/function-calling definitions in OpenAI wire format. Populated only
38
- * by Tier B. Ignored by the Tier A single-shot path.
38
+ * by agent/workflow modes. Ignored by the single-shot path.
39
39
  */
40
40
  tools?: unknown[];
41
41
  toolChoice?: "auto" | "none";
@@ -111,6 +111,17 @@ export interface CreateEvalClientOptions {
111
111
  retryPolicy?: RetryPolicy;
112
112
  /** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
113
113
  sleep?: (ms: number) => Promise<void>;
114
+ /**
115
+ * Observer invoked when a chat() call is about to sleep before the next
116
+ * retry attempt. Use this to surface "we are retrying" status via the
117
+ * progress logger so long, silent backoff windows become visible.
118
+ */
119
+ onRetry?: (event: {
120
+ attempt: number;
121
+ maxAttempts: number;
122
+ waitMs: number;
123
+ error: EvalLlmError;
124
+ }) => void;
114
125
  }
115
126
  export interface RetryPolicy {
116
127
  /** Max retries *on top of* the initial attempt. 0 = single attempt. */
@@ -251,7 +251,14 @@ export function createEvalClient(config, options = {}) {
251
251
  const isLastAttempt = attempt === maxAttempts - 1;
252
252
  if (!normalized.retryable || isLastAttempt)
253
253
  throw normalized;
254
- await sleep(backoffDelay(attempt, retryPolicy));
254
+ const waitMs = backoffDelay(attempt, retryPolicy);
255
+ options.onRetry?.({
256
+ attempt: attempt + 1,
257
+ maxAttempts,
258
+ waitMs,
259
+ error: normalized
260
+ });
261
+ await sleep(waitMs);
255
262
  }
256
263
  }
257
264
  throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Helpers that translate between the legacy `Tier A/B/C` naming and the
3
+ * current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
4
+ *
5
+ * The names we actually carry in reports, config, CLI flags, and verifier
6
+ * messages are the `EvalMode` ones; legacy tier inputs are accepted with a
7
+ * single deprecation warning per process so existing scripts keep working
8
+ * through the 0.28.x line.
9
+ */
10
+ import { type EvalMode } from "./types.js";
11
+ /**
12
+ * Reset the per-process "already warned about legacy tier" flag. Used by
13
+ * tests so each test file gets a deterministic warning surface.
14
+ */
15
+ export declare function __resetLegacyWarningForTests(): void;
16
+ export interface LegacyTierInput {
17
+ source: "cli" | "env" | "config";
18
+ raw: string;
19
+ }
20
+ /**
21
+ * Normalize a raw string from the CLI / env / config into an `EvalMode`.
22
+ * Accepts both new (`fixture|agent|workflow`) and legacy (`A|B|C`) names.
23
+ * Emits a deprecation warning to stderr at most once per process when a
24
+ * legacy tier name is seen.
25
+ */
26
+ export declare function parseModeInput(raw: string, input: LegacyTierInput, writeWarning?: (message: string) => void): EvalMode;
27
+ /** @deprecated kept for callers that still need to serialize as legacy. */
28
+ export declare function modeToLegacyTier(mode: EvalMode): "A" | "B" | "C";