cclaw-cli 0.24.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Tier B with-tools agent.
3
+ *
4
+ * Multi-turn loop with OpenAI-style function-calling over a set of
5
+ * sandbox-confined tools. The AUT is given:
6
+ *
7
+ * - System prompt = stage SKILL.md (same contract as Tier A so the
8
+ * single-shot baseline is comparable).
9
+ * - User prompt = task description + a short "tools available" hint
10
+ * that names the sandbox root and the four built-in tools.
11
+ * - Tools = `read_file`, `write_file`, `glob`, `grep` (see
12
+ * `src/eval/tools/`).
13
+ *
14
+ * The loop runs up to `config.toolMaxTurns` turns (default 8). Each
15
+ * turn:
16
+ *
17
+ * 1. Send the current transcript to the model with tools enabled.
18
+ * 2. Commit token usage against the wrapped client (cost guard sees
19
+ * every call).
20
+ * 3. If the model returned tool_calls, execute each sandbox tool and
21
+ * append a `role: "tool"` message with the JSON-serialized result.
22
+ * 4. If the model produced assistant content with `finish_reason: stop`,
23
+ * treat that as the artifact and exit.
24
+ *
25
+ * When the turn budget is exhausted without a terminal stop, the agent
26
+ * throws `MaxTurnsExceededError`. The runner surfaces the error as a
27
+ * failed workflow verifier so the case counts as a regression.
28
+ *
29
+ * Artifact resolution: the final assistant content is the artifact. If
30
+ * the model used `write_file` to stage the artifact at
31
+ * `artifact.md` (or `artifact/<stage>.md`), we prefer that file — it
32
+ * mirrors the Tier C workflow where writes are the deliverable. The
33
+ * fallback is the terminal assistant message so prompts that don't
34
+ * call write_file still produce something judgable.
35
+ */
36
+ import fs from "node:fs/promises";
37
+ import path from "node:path";
38
+ import { computeUsageUsd } from "../cost-guard.js";
39
+ import { createSandbox } from "../sandbox.js";
40
+ import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
41
+ import { loadStageSkill } from "./single-shot.js";
42
/**
 * Raised when the Tier B loop uses every allotted turn without reaching a
 * terminal `finish_reason: stop`. The runner reports it as a failed
 * workflow verifier (see module header).
 */
export class MaxTurnsExceededError extends Error {
  /** The turn budget that was exhausted. */
  turns;

  constructor(turns) {
    super(
      `Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`
    );
    this.turns = turns;
    this.name = "MaxTurnsExceededError";
  }
}
50
// Default turn budget for the tool loop when `config.toolMaxTurns` is unset.
const DEFAULT_MAX_TURNS = 8;
// Default ceiling on a single tool_call's JSON arguments payload (64 KiB).
const DEFAULT_MAX_ARG_BYTES = 64 * 1024;
// Default ceiling passed to `tool.invoke` for result payloads (32 KiB).
const DEFAULT_MAX_RESULT_BYTES = 32 * 1024;
// File names probed, in order, when resolving a staged artifact file.
const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
54
/**
 * Run the Tier B with-tools agent loop for a single eval case.
 *
 * See the module header for the full contract: multi-turn function-calling
 * over sandbox-confined tools, usage committed every turn, artifact resolved
 * from the sandbox or the terminal assistant message.
 *
 * @param input - `{ caseEntry, config, projectRoot, client }` plus optional
 *   test-injection points `loadSkill`, `tools`, `createSandboxFn`.
 * @returns the finalized run result (artifact, usage, toolUse, prompts).
 * @throws MaxTurnsExceededError when the turn budget runs out.
 */
export async function runWithTools(input) {
  const { caseEntry, config, projectRoot, client } = input;
  // Clamp every tool budget to a positive integer, falling back to defaults.
  const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
  const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
  const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
  // Injection points (loadSkill / tools / createSandboxFn) exist for tests.
  const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
  const systemPrompt = await loader(caseEntry.stage);
  const tools = input.tools ?? BUILTIN_TOOLS;
  const toolMap = toolsByName(tools);
  const toolsBody = toolsForRequest(tools);
  const sandboxFactory = input.createSandboxFn ?? createSandbox;
  const sandbox = await sandboxFactory({
    projectRoot,
    ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
  });
  // Telemetry accumulated across the whole loop; returned via finalize().
  const toolUse = {
    turns: 0,
    calls: 0,
    errors: 0,
    deniedPaths: [],
    byTool: {}
  };
  const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
  let lastModel = config.model;
  let totalAttempts = 0;
  const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
  const messages = [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt }
  ];
  const started = Date.now();
  try {
    for (let turn = 0; turn < maxTurns; turn += 1) {
      toolUse.turns = turn + 1;
      const response = await client.chat({
        model: config.model,
        messages,
        temperature: config.agentTemperature ?? 0.2,
        timeoutMs: config.timeoutMs,
        tools: toolsBody,
        toolChoice: "auto"
      });
      // Accumulate usage every turn so the wrapped client's cost guard
      // accounts for each call, not just the last one.
      usage.promptTokens += response.usage.promptTokens;
      usage.completionTokens += response.usage.completionTokens;
      usage.totalTokens += response.usage.totalTokens;
      lastModel = response.model;
      totalAttempts += response.attempts;
      const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
      messages.push(rememberAssistant(response.content, response.toolCalls));
      if (!hasToolCalls) {
        // Terminal assistant turn: a staged artifact file wins over the
        // chat reply (see resolveArtifact).
        const artifact = await resolveArtifact(sandbox, response.content);
        return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
      }
      for (const call of response.toolCalls) {
        const tool = toolMap.get(call.name);
        const argBytes = Buffer.byteLength(call.arguments ?? "", "utf8");
        if (argBytes > maxArgBytes) {
          // Oversized arguments are rejected without invoking the tool;
          // the model gets a structured error it can react to.
          toolUse.errors += 1;
          bumpToolCount(toolUse, call.name);
          messages.push(toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `arguments payload exceeds ${maxArgBytes} bytes`
          }));
          continue;
        }
        if (!tool) {
          // Unknown tool name — count it but keep the loop alive.
          toolUse.errors += 1;
          bumpToolCount(toolUse, call.name);
          messages.push(toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `unknown tool "${call.name}"`
          }));
          continue;
        }
        bumpToolCount(toolUse, call.name);
        const result = await tool.invoke(call.arguments ?? "", {
          sandbox,
          maxResultBytes
        });
        if (!result.ok) {
          toolUse.errors += 1;
          // Record sandbox-denied paths (deduplicated) for the report.
          const denied = result.details && typeof result.details.deniedPath === "string"
            ? result.details.deniedPath
            : undefined;
          if (denied && !toolUse.deniedPaths.includes(denied)) {
            toolUse.deniedPaths.push(denied);
          }
        }
        else {
          // Only successful invocations count toward `calls`.
          toolUse.calls += 1;
        }
        messages.push(toolResponseMessage(call.id, result));
      }
    }
    // Budget exhausted without a terminal stop; the runner surfaces this
    // as a failed workflow verifier.
    throw new MaxTurnsExceededError(maxTurns);
  }
  finally {
    // Always tear the sandbox down, even on error paths.
    await sandbox.dispose();
  }
}
156
/**
 * Assemble the Tier B run result from the loop's accumulated state.
 * Prices the total usage via the cost guard's schedule (honoring any
 * `config.tokenPricing` overrides) and trims the artifact text.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
  const usageUsd = computeUsageUsd(model, usage, {
    tokenPricing: config.tokenPricing
  });
  const durationMs = Date.now() - started;
  return {
    artifact: artifact.trim(),
    usage,
    usageUsd,
    model,
    attempts,
    durationMs,
    toolUse,
    systemPrompt,
    userPrompt
  };
}
172
/**
 * Wrap an assistant turn for the transcript, attaching `toolCalls` only
 * when the model actually made some (an empty array is omitted).
 */
function rememberAssistant(content, toolCalls) {
  const message = { role: "assistant", content };
  if (toolCalls?.length > 0) {
    message.toolCalls = toolCalls;
  }
  return message;
}
178
/**
 * Build the `role: "tool"` transcript message for one tool invocation.
 *
 * Serializes a uniform `{ ok, content|error, details }` payload and
 * truncates it so a verbose tool cannot blow up the context window.
 *
 * Fix: the truncation ceiling was hard-coded to 32 KiB, silently ignoring
 * the configured `toolMaxResultBytes` the loop already enforces at
 * `tool.invoke`. It is now a parameter; the default preserves the old
 * behavior for existing call sites.
 *
 * @param callId - the model's tool_call id this message answers.
 * @param result - tool result (`ok`/`content`/`error`/`details`/`name`).
 * @param maxBytes - truncation ceiling for the serialized payload.
 */
function toolResponseMessage(callId, result, maxBytes = 32 * 1024) {
  const payload = result.ok
    ? { ok: true, content: result.content, details: result.details ?? {} }
    : { ok: false, error: result.error, details: result.details ?? {} };
  return {
    role: "tool",
    content: truncatePayload(JSON.stringify(payload), maxBytes),
    toolCallId: callId,
    name: result.name
  };
}
189
/** Increment the per-tool invocation counter on a toolUse summary. */
function bumpToolCount(summary, name) {
  const prior = summary.byTool[name] ?? 0;
  summary.byTool[name] = prior + 1;
}
192
/**
 * Coerce a config value to a positive integer: floor a finite positive
 * number, otherwise (undefined, NaN, Infinity, zero, negative) return
 * the fallback.
 */
function clampPositive(value, fallback) {
  const usable = value !== undefined && Number.isFinite(value) && value > 0;
  return usable ? Math.floor(value) : fallback;
}
199
/**
 * Compose the Tier B user prompt: stage/case identity, the sandbox root,
 * the available tools, the seeded context files, the task text, and the
 * output-format instructions (including the optional artifact.md path).
 */
function buildUserPrompt(caseEntry, sandbox, tools) {
  const toolLines = tools.map(
    (tool) => `- ${tool.descriptor.name}: ${tool.descriptor.description}`
  );
  const seeded = caseEntry.contextFiles ?? [];
  const seededBlock = seeded.length === 0
    ? "(no files seeded)"
    : seeded.map((file) => `- ${file}`).join("\n");
  const prompt = [
    `Stage: ${caseEntry.stage}`,
    `Case id: ${caseEntry.id}`,
    ``,
    `Sandbox root: ${sandbox.root}`,
    `You may call the following tools to read or modify files inside the sandbox.`,
    `All paths are relative to the sandbox root.`,
    ``,
    `Tools:`,
    ...toolLines,
    ``,
    `Seeded context files (available under the sandbox root):`,
    seededBlock,
    ``,
    `Task:`,
    caseEntry.inputPrompt.trim(),
    ``,
    `When you are done, reply with the artifact as the final assistant message.`,
    `Output the artifact directly (markdown with optional YAML frontmatter).`,
    `Do not wrap in code fences, do not add commentary before or after.`,
    `You may optionally write the artifact to \`artifact.md\` in the sandbox; if you do, the last written \`artifact.md\` is preferred over the chat reply.`
  ];
  return prompt.join("\n");
}
230
/**
 * Resolve the final artifact for a case.
 *
 * Preference order:
 *   1. The first ARTIFACT_CANDIDATES entry that resolves (via the sandbox)
 *      to a regular file.
 *   2. Any regular file at the sandbox root matching /^artifact\./i.
 *   3. The terminal assistant message (`fallback`).
 *
 * Fix: `fallback` is the model's terminal `content`, which can legally be
 * null; propagating it made `finalize()`'s `artifact.trim()` throw. A
 * nullish fallback is now coalesced to "" so downstream trimming is safe.
 */
async function resolveArtifact(sandbox, fallback) {
  for (const candidate of ARTIFACT_CANDIDATES) {
    try {
      const abs = await sandbox.resolve(candidate);
      const stat = await fs.stat(abs);
      if (stat.isFile()) {
        return await fs.readFile(abs, "utf8");
      }
    }
    catch {
      // Candidate missing or denied by the sandbox — try the next one.
      continue;
    }
  }
  try {
    // Looser sweep: accept any artifact.* file staged at the sandbox root.
    const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
    const match = entries.find((entry) => entry.isFile() && /^artifact\./i.test(entry.name));
    if (match) {
      return await fs.readFile(path.join(sandbox.root, match.name), "utf8");
    }
  }
  catch {
    // Sandbox root unreadable — fall through to the chat reply.
  }
  return fallback ?? "";
}
@@ -20,13 +20,22 @@ export const DEFAULT_EVAL_CONFIG = {
20
20
  failIfCriticalBelow: 3.0
21
21
  },
22
22
  timeoutMs: 120_000,
23
- maxRetries: 2
23
+ maxRetries: 2,
24
+ judgeSamples: 3,
25
+ judgeTemperature: 0,
26
+ agentTemperature: 0.2
24
27
  };
25
28
  const EVAL_TIER_SET = new Set(EVAL_TIERS);
26
29
  const NUMERIC_ENVS = new Set([
27
30
  "CCLAW_EVAL_DAILY_USD_CAP",
28
31
  "CCLAW_EVAL_TIMEOUT_MS",
29
- "CCLAW_EVAL_MAX_RETRIES"
32
+ "CCLAW_EVAL_MAX_RETRIES",
33
+ "CCLAW_EVAL_JUDGE_SAMPLES",
34
+ "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
+ "CCLAW_EVAL_AGENT_TEMPERATURE",
36
+ "CCLAW_EVAL_TOOL_MAX_TURNS",
37
+ "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
+ "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
30
39
  ]);
31
40
  function evalConfigError(configFilePath, reason) {
32
41
  return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -93,6 +102,70 @@ function validateFileConfig(raw, configFilePath) {
93
102
  }
94
103
  out.maxRetries = raw.maxRetries;
95
104
  }
105
+ if (raw.judgeSamples !== undefined) {
106
+ const value = raw.judgeSamples;
107
+ if (!Number.isInteger(value) || value < 1) {
108
+ throw evalConfigError(configFilePath, `"judgeSamples" must be a positive integer`);
109
+ }
110
+ if (value % 2 === 0) {
111
+ throw evalConfigError(configFilePath, `"judgeSamples" must be odd (so median-of-N is a true integer)`);
112
+ }
113
+ out.judgeSamples = value;
114
+ }
115
+ if (raw.judgeTemperature !== undefined) {
116
+ if (typeof raw.judgeTemperature !== "number" || !Number.isFinite(raw.judgeTemperature)) {
117
+ throw evalConfigError(configFilePath, `"judgeTemperature" must be a finite number`);
118
+ }
119
+ if (raw.judgeTemperature < 0 || raw.judgeTemperature > 2) {
120
+ throw evalConfigError(configFilePath, `"judgeTemperature" must be within [0, 2]`);
121
+ }
122
+ out.judgeTemperature = raw.judgeTemperature;
123
+ }
124
+ if (raw.agentTemperature !== undefined) {
125
+ if (typeof raw.agentTemperature !== "number" || !Number.isFinite(raw.agentTemperature)) {
126
+ throw evalConfigError(configFilePath, `"agentTemperature" must be a finite number`);
127
+ }
128
+ if (raw.agentTemperature < 0 || raw.agentTemperature > 2) {
129
+ throw evalConfigError(configFilePath, `"agentTemperature" must be within [0, 2]`);
130
+ }
131
+ out.agentTemperature = raw.agentTemperature;
132
+ }
133
+ if (raw.tokenPricing !== undefined) {
134
+ if (!isRecord(raw.tokenPricing)) {
135
+ throw evalConfigError(configFilePath, `"tokenPricing" must be a mapping`);
136
+ }
137
+ const pricing = {};
138
+ for (const [model, value] of Object.entries(raw.tokenPricing)) {
139
+ if (!isRecord(value)) {
140
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
141
+ }
142
+ const input = value.input;
143
+ const output = value.output;
144
+ if (typeof input !== "number" || input < 0) {
145
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}.input" must be a non-negative number`);
146
+ }
147
+ if (typeof output !== "number" || output < 0) {
148
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}.output" must be a non-negative number`);
149
+ }
150
+ const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
151
+ if (extraneous.length > 0) {
152
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
153
+ }
154
+ pricing[model] = { input, output };
155
+ }
156
+ out.tokenPricing = pricing;
157
+ }
158
+ const assignPositiveInt = (key, value, label) => {
159
+ if (value === undefined)
160
+ return;
161
+ if (!Number.isInteger(value) || value < 1) {
162
+ throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
163
+ }
164
+ out[key] = value;
165
+ };
166
+ assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
167
+ assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
168
+ assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
96
169
  if (raw.regression !== undefined) {
97
170
  if (!isRecord(raw.regression)) {
98
171
  throw evalConfigError(configFilePath, `"regression" must be a mapping`);
@@ -123,7 +196,14 @@ function validateFileConfig(raw, configFilePath) {
123
196
  "dailyUsdCap",
124
197
  "timeoutMs",
125
198
  "maxRetries",
126
- "regression"
199
+ "regression",
200
+ "judgeSamples",
201
+ "judgeTemperature",
202
+ "agentTemperature",
203
+ "tokenPricing",
204
+ "toolMaxTurns",
205
+ "toolMaxArgumentsBytes",
206
+ "toolMaxResultBytes"
127
207
  ]);
128
208
  const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
129
209
  if (unknown.length > 0) {
@@ -203,6 +283,51 @@ function applyEnvOverrides(base, env) {
203
283
  patched.maxRetries = parseNumericEnv("CCLAW_EVAL_MAX_RETRIES", retries);
204
284
  overridden = true;
205
285
  }
286
+ const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
287
+ if (judgeSamples) {
288
+ const value = parseNumericEnv("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
289
+ if (!Number.isInteger(value) || value < 1) {
290
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be a positive integer, got: ${judgeSamples}`);
291
+ }
292
+ if (value % 2 === 0) {
293
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
294
+ }
295
+ patched.judgeSamples = value;
296
+ overridden = true;
297
+ }
298
+ const judgeTemp = read("CCLAW_EVAL_JUDGE_TEMPERATURE");
299
+ if (judgeTemp) {
300
+ const value = parseNumericEnv("CCLAW_EVAL_JUDGE_TEMPERATURE", judgeTemp);
301
+ if (value < 0 || value > 2) {
302
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_TEMPERATURE must be within [0, 2], got: ${judgeTemp}`);
303
+ }
304
+ patched.judgeTemperature = value;
305
+ overridden = true;
306
+ }
307
+ const agentTemp = read("CCLAW_EVAL_AGENT_TEMPERATURE");
308
+ if (agentTemp) {
309
+ const value = parseNumericEnv("CCLAW_EVAL_AGENT_TEMPERATURE", agentTemp);
310
+ if (value < 0 || value > 2) {
311
+ throw new Error(`Environment variable CCLAW_EVAL_AGENT_TEMPERATURE must be within [0, 2], got: ${agentTemp}`);
312
+ }
313
+ patched.agentTemperature = value;
314
+ overridden = true;
315
+ }
316
+ const readPositiveInt = (name, key, label) => {
317
+ const raw = read(name);
318
+ if (!raw)
319
+ return;
320
+ const value = parseNumericEnv(name, raw);
321
+ if (!Number.isInteger(value) || value < 1) {
322
+ throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
323
+ }
324
+ patched[key] = value;
325
+ overridden = true;
326
+ void label;
327
+ };
328
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
329
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
330
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
206
331
  const apiKey = read("CCLAW_EVAL_API_KEY");
207
332
  return { patched, overridden, apiKey };
208
333
  }
@@ -0,0 +1,80 @@
1
+ import type { ChatUsage } from "./llm-client.js";
2
+ import type { ResolvedEvalConfig, TokenPricing } from "./types.js";
3
+ /**
4
+ * Builtin pricing fallback. Intentionally conservative: when the user
5
+ * hasn't configured pricing and we don't know the model, we default to a
6
+ * "small model" USD schedule so the cap can still do something useful.
7
+ *
8
+ * Values are USD per 1K tokens. Sources are public pricing pages as of
9
+ * 2026-04; update by editing this constant, not the guard logic.
10
+ */
11
+ export declare const DEFAULT_TOKEN_PRICING: Readonly<Record<string, TokenPricing>>;
12
+ /** Hard default when neither config nor builtins know the model. */
13
+ export declare const UNKNOWN_MODEL_PRICING: TokenPricing;
14
+ export interface SpendLedger {
15
+ /** ISO date (`YYYY-MM-DD` in UTC) — also embedded in the file name. */
16
+ date: string;
17
+ /** USD spent so far today across every call that hit the guard. */
18
+ totalUsd: number;
19
+ /** Number of `chat()` calls accounted for. */
20
+ calls: number;
21
+ /** Per-model breakdown for the report. */
22
+ byModel: Record<string, {
23
+ tokensIn: number;
24
+ tokensOut: number;
25
+ usd: number;
26
+ }>;
27
+ }
28
/**
 * Thrown by the cost guard when committing a call's cost would push the
 * daily ledger total past the configured `dailyUsdCap`.
 */
export declare class DailyCostCapExceededError extends Error {
    /** The configured daily USD cap that would be crossed. */
    readonly capUsd: number;
    /** Ledger total as it would stand after committing this call. */
    readonly projectedUsd: number;
    /** Ledger total before this call was considered. */
    readonly currentUsd: number;
    constructor(opts: {
        capUsd: number;
        projectedUsd: number;
        currentUsd: number;
    });
}
38
+ declare function utcDate(now?: Date): string;
39
+ declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
40
+ /**
41
+ * Compute USD cost of a single `ChatUsage` using the given `model` pricing
42
+ * schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
43
+ * before first token).
44
+ */
45
+ export declare function computeUsageUsd(model: string, usage: ChatUsage, config: Pick<ResolvedEvalConfig, "tokenPricing">): number;
46
+ declare function ledgerPath(projectRoot: string, date: string): string;
47
+ declare function readLedger(file: string, date: string): Promise<SpendLedger>;
48
+ declare function writeLedger(file: string, ledger: SpendLedger): Promise<void>;
49
+ /**
50
+ * Guard a single LLM call against the daily USD cap. Returns the updated
51
+ * ledger on success; throws `DailyCostCapExceededError` when the projected
52
+ * total would cross the cap. When `config.dailyUsdCap` is unset, the guard
53
+ * is a no-op — no file writes, no ledger — so non-judge runs never touch
54
+ * the filesystem.
55
+ */
56
+ export interface CostGuard {
57
+ /**
58
+ * Commit the USD cost of a finished call to the ledger. When `dailyUsdCap`
59
+ * is set, refuses the commit if the projected total would exceed the cap.
60
+ */
61
+ commit(model: string, usage: ChatUsage): Promise<number>;
62
+ /** Snapshot the current ledger (or undefined when no cap is set). */
63
+ snapshot(): Promise<SpendLedger | undefined>;
64
+ }
65
+ export interface CreateCostGuardOptions {
66
+ /** Clock injection for tests. */
67
+ now?: () => Date;
68
+ /** Override the default filesystem root for the ledger. */
69
+ ledgerPath?: string;
70
+ }
71
+ export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
72
+ /** Exposed for tests. */
73
+ export declare const __internal: {
74
+ utcDate: typeof utcDate;
75
+ pricingFor: typeof pricingFor;
76
+ ledgerPath: typeof ledgerPath;
77
+ readLedger: typeof readLedger;
78
+ writeLedger: typeof writeLedger;
79
+ };
80
+ export {};
@@ -0,0 +1,153 @@
1
+ /**
2
+ * Cost guard for the cclaw eval subsystem.
3
+ *
4
+ * Two responsibilities:
5
+ *
6
+ * 1. Convert `ChatUsage` (prompt/completion token counts) into USD using
7
+ * a per-model `TokenPricing` schedule. Pricing comes from
8
+ * `config.tokenPricing[model]` first, then from the builtin fallback
9
+ * schedule for well-known models (z.ai GLM 5.1 at publish time).
10
+ * 2. Maintain a per-day running total persisted to
11
+ * `.cclaw/evals/.spend-YYYY-MM-DD.json` so that a long eval session
12
+ * (or a cron-run nightly) can't blow through the configured
13
+ * `dailyUsdCap`. The counter is opt-in: no cap, no writes.
14
+ *
15
+ * The guard is deliberately pessimistic — it rounds USD up to 6 decimals
16
+ * and never subtracts, so a CI run that errors mid-flight still shows the
17
+ * partial spend in the next report.
18
+ */
19
+ import fs from "node:fs/promises";
20
+ import path from "node:path";
21
+ import { EVALS_ROOT } from "../constants.js";
22
+ import { exists } from "../fs-utils.js";
23
/**
 * Builtin pricing fallback. Intentionally conservative: when the user
 * hasn't configured pricing and we don't know the model, we default to a
 * "small model" USD schedule so the cap can still do something useful.
 *
 * Values are USD per 1K tokens. Sources are public pricing pages as of
 * 2026-04; update by editing this constant, not the guard logic.
 */
export const DEFAULT_TOKEN_PRICING = {
  "glm-5.1": { input: 0.0005, output: 0.0015 },
  "glm-4.6": { input: 0.0005, output: 0.0015 },
  "gpt-4o-mini": { input: 0.00015, output: 0.0006 },
  "gpt-4o": { input: 0.005, output: 0.015 }
};

/** Hard default when neither config nor builtins know the model. */
export const UNKNOWN_MODEL_PRICING = { input: 0.001, output: 0.003 };

/**
 * Raised when committing a call's cost would push the daily ledger past
 * the configured cap. Carries the cap, the projected total, and the
 * pre-call total for the error report.
 */
export class DailyCostCapExceededError extends Error {
  capUsd;
  projectedUsd;
  currentUsd;

  constructor(opts) {
    const { capUsd, projectedUsd, currentUsd } = opts;
    super(
      `Daily cost cap would be exceeded: current=$${currentUsd.toFixed(4)}, ` +
      `projected=$${projectedUsd.toFixed(4)}, cap=$${capUsd.toFixed(4)}. ` +
      `Unset CCLAW_EVAL_DAILY_USD_CAP or increase the cap to continue.`
    );
    this.name = "DailyCostCapExceededError";
    this.capUsd = capUsd;
    this.projectedUsd = projectedUsd;
    this.currentUsd = currentUsd;
  }
}

/** UTC calendar date (`YYYY-MM-DD`) for `now`, defaulting to the current time. */
function utcDate(now = new Date()) {
  const [isoDate] = now.toISOString().split("T");
  return isoDate;
}

/**
 * Look up the pricing schedule for `model`: user config first, then the
 * builtin table, then the unknown-model default.
 */
function pricingFor(model, config) {
  const fromConfig = config.tokenPricing?.[model];
  if (fromConfig) {
    return fromConfig;
  }
  return DEFAULT_TOKEN_PRICING[model] || UNKNOWN_MODEL_PRICING;
}

/**
 * Compute USD cost of a single `ChatUsage` using the given `model` pricing
 * schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
 * before first token).
 */
export function computeUsageUsd(model, usage, config) {
  if (!usage || usage.totalTokens <= 0) {
    return 0;
  }
  const schedule = pricingFor(model, config);
  // Schedules are per-1K-token; round up-front to 6 decimals.
  const cost = (usage.promptTokens * schedule.input) / 1_000 +
    (usage.completionTokens * schedule.output) / 1_000;
  return Math.max(0, Number(cost.toFixed(6)));
}
80
/** A fresh zeroed ledger for the given UTC date. */
function emptyLedger(date) {
  return {
    date,
    totalUsd: 0,
    calls: 0,
    byModel: {}
  };
}
83
/** Filesystem path of the per-day spend ledger under the evals root. */
function ledgerPath(projectRoot, date) {
  const fileName = `.spend-${date}.json`;
  return path.join(projectRoot, EVALS_ROOT, fileName);
}
86
/**
 * Read the spend ledger at `file` for `date`. Any failure mode — missing
 * file, unparseable JSON, or a ledger recorded for a different date —
 * yields a fresh empty ledger rather than an error.
 */
async function readLedger(file, date) {
  if (!(await exists(file))) {
    return emptyLedger(date);
  }
  try {
    const parsed = JSON.parse(await fs.readFile(file, "utf8"));
    if (parsed?.date !== date) {
      return emptyLedger(date);
    }
    // Coerce each field defensively so a hand-edited file can't poison us.
    const totalUsd = typeof parsed.totalUsd === "number" ? parsed.totalUsd : 0;
    const calls = typeof parsed.calls === "number" ? parsed.calls : 0;
    const byModel = parsed.byModel && typeof parsed.byModel === "object" ? parsed.byModel : {};
    return { date, totalUsd, calls, byModel };
  }
  catch {
    return emptyLedger(date);
  }
}
104
/** Persist the ledger as pretty-printed JSON, creating parent dirs as needed. */
async function writeLedger(file, ledger) {
  const dir = path.dirname(file);
  await fs.mkdir(dir, { recursive: true });
  const body = `${JSON.stringify(ledger, null, 2)}\n`;
  await fs.writeFile(file, body, "utf8");
}
108
/**
 * Create a cost guard bound to `projectRoot` and `config`.
 *
 * When `config.dailyUsdCap` is unset, `commit()` only prices the call and
 * `snapshot()` returns undefined — no filesystem access at all. With a cap,
 * each commit does read-ledger → check cap → update → write-ledger.
 *
 * NOTE(review): the commit path is an unlocked read-modify-write on the
 * ledger file; two concurrent eval processes could race and under-count —
 * confirm whether single-process use is an invariant here.
 *
 * @param projectRoot - root used to locate `.cclaw/evals/.spend-*.json`.
 * @param config - needs `dailyUsdCap` and optional `tokenPricing`.
 * @param options - `now` (clock injection) and `ledgerPath` override, for tests.
 */
export function createCostGuard(projectRoot, config, options = {}) {
  const now = options.now ?? (() => new Date());
  const currentDate = () => utcDate(now());
  // Re-derived per call so a run that crosses UTC midnight rolls over to a
  // new day's ledger file (unless a fixed ledgerPath override is given).
  const file = () => options.ledgerPath ?? ledgerPath(projectRoot, currentDate());
  return {
    async commit(model, usage) {
      const usd = computeUsageUsd(model, usage, config);
      // No cap configured: pure computation, never touches the filesystem.
      if (config.dailyUsdCap === undefined)
        return usd;
      const date = currentDate();
      const target = file();
      const ledger = await readLedger(target, date);
      // Round once here so repeated commits can't accumulate float drift.
      const projected = Number((ledger.totalUsd + usd).toFixed(6));
      if (projected > config.dailyUsdCap) {
        // Refuse the commit; the ledger file is left unchanged.
        throw new DailyCostCapExceededError({
          capUsd: config.dailyUsdCap,
          projectedUsd: projected,
          currentUsd: ledger.totalUsd
        });
      }
      ledger.totalUsd = projected;
      ledger.calls += 1;
      const byModel = ledger.byModel[model] ?? { tokensIn: 0, tokensOut: 0, usd: 0 };
      byModel.tokensIn += usage.promptTokens;
      byModel.tokensOut += usage.completionTokens;
      byModel.usd = Number((byModel.usd + usd).toFixed(6));
      ledger.byModel[model] = byModel;
      await writeLedger(target, ledger);
      return usd;
    },
    async snapshot() {
      // Mirrors commit(): without a cap there is no ledger to snapshot.
      if (config.dailyUsdCap === undefined)
        return undefined;
      const date = currentDate();
      return readLedger(file(), date);
    }
  };
}
146
/** Exposed for tests. Bundles the module-private helpers of the cost guard. */
export const __internal = {
  utcDate,
  pricingFor,
  ledgerPath,
  readLedger,
  writeLedger
};