cclaw-cli 0.24.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Tier B with-tools agent.
3
+ *
4
+ * Multi-turn loop with OpenAI-style function-calling over a set of
5
+ * sandbox-confined tools. The AUT is given:
6
+ *
7
+ * - System prompt = stage SKILL.md (same contract as Tier A so the
8
+ * single-shot baseline is comparable).
9
+ * - User prompt = task description + a short "tools available" hint
10
+ * that names the sandbox root and the four built-in tools.
11
+ * - Tools = `read_file`, `write_file`, `glob`, `grep` (see
12
+ * `src/eval/tools/`).
13
+ *
14
+ * The loop runs up to `config.toolMaxTurns` turns (default 8). Each
15
+ * turn:
16
+ *
17
+ * 1. Send the current transcript to the model with tools enabled.
18
+ * 2. Commit token usage against the wrapped client (cost guard sees
19
+ * every call).
20
+ * 3. If the model returned tool_calls, execute each sandbox tool and
21
+ * append a `role: "tool"` message with the JSON-serialized result.
22
+ * 4. If the model produced assistant content with `finish_reason: stop`,
23
+ * treat that as the artifact and exit.
24
+ *
25
+ * When the turn budget is exhausted without a terminal stop, the agent
26
+ * throws `MaxTurnsExceededError`. The runner surfaces the error as a
27
+ * failed workflow verifier so the case counts as a regression.
28
+ *
29
+ * Artifact resolution: the final assistant content is the artifact. If
30
+ * the model used `write_file` to stage the artifact at
31
+ * `artifact.md` (or `artifact/<stage>.md`), we prefer that file — it
32
+ * mirrors the Tier C workflow where writes are the deliverable. The
33
+ * fallback is the terminal assistant message so prompts that don't
34
+ * call write_file still produce something judgable.
35
+ */
36
+ import fs from "node:fs/promises";
37
+ import path from "node:path";
38
+ import { computeUsageUsd } from "../cost-guard.js";
39
+ import { createSandbox } from "../sandbox.js";
40
+ import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
41
+ import { loadStageSkill } from "./single-shot.js";
42
/**
 * Raised when the Tier B loop uses every allotted turn without reaching a
 * terminal `finish_reason: stop`. The runner reports it as a failed
 * workflow verifier (see module header).
 */
export class MaxTurnsExceededError extends Error {
  /** The turn budget that was exhausted. */
  turns;

  constructor(turns) {
    super(
      `Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`
    );
    this.turns = turns;
    this.name = "MaxTurnsExceededError";
  }
}
50
// Default turn budget for the tool loop when `config.toolMaxTurns` is unset.
const DEFAULT_MAX_TURNS = 8;
// Default ceiling on a single tool_call's JSON arguments payload (64 KiB).
const DEFAULT_MAX_ARG_BYTES = 64 * 1024;
// Default ceiling passed to `tool.invoke` for result payloads (32 KiB).
const DEFAULT_MAX_RESULT_BYTES = 32 * 1024;
// File names probed, in order, when resolving a staged artifact file.
const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
54
/**
 * Run the Tier B with-tools agent loop for a single eval case.
 *
 * See the module header for the full contract: multi-turn function-calling
 * over sandbox-confined tools, usage committed every turn, artifact resolved
 * from the sandbox or the terminal assistant message.
 *
 * @param input - `{ caseEntry, config, projectRoot, client }` plus optional
 *   test-injection points `loadSkill`, `tools`, `createSandboxFn`.
 * @returns the finalized run result (artifact, usage, toolUse, prompts).
 * @throws MaxTurnsExceededError when the turn budget runs out.
 */
export async function runWithTools(input) {
  const { caseEntry, config, projectRoot, client } = input;
  // Clamp every tool budget to a positive integer, falling back to defaults.
  const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
  const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
  const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
  // Injection points (loadSkill / tools / createSandboxFn) exist for tests.
  const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
  const systemPrompt = await loader(caseEntry.stage);
  const tools = input.tools ?? BUILTIN_TOOLS;
  const toolMap = toolsByName(tools);
  const toolsBody = toolsForRequest(tools);
  const sandboxFactory = input.createSandboxFn ?? createSandbox;
  const sandbox = await sandboxFactory({
    projectRoot,
    ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
  });
  // Telemetry accumulated across the whole loop; returned via finalize().
  const toolUse = {
    turns: 0,
    calls: 0,
    errors: 0,
    deniedPaths: [],
    byTool: {}
  };
  const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
  let lastModel = config.model;
  let totalAttempts = 0;
  const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
  const messages = [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt }
  ];
  const started = Date.now();
  try {
    for (let turn = 0; turn < maxTurns; turn += 1) {
      toolUse.turns = turn + 1;
      const response = await client.chat({
        model: config.model,
        messages,
        temperature: config.agentTemperature ?? 0.2,
        timeoutMs: config.timeoutMs,
        tools: toolsBody,
        toolChoice: "auto"
      });
      // Accumulate usage every turn so the wrapped client's cost guard
      // accounts for each call, not just the last one.
      usage.promptTokens += response.usage.promptTokens;
      usage.completionTokens += response.usage.completionTokens;
      usage.totalTokens += response.usage.totalTokens;
      lastModel = response.model;
      totalAttempts += response.attempts;
      const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
      messages.push(rememberAssistant(response.content, response.toolCalls));
      if (!hasToolCalls) {
        // Terminal assistant turn: a staged artifact file wins over the
        // chat reply (see resolveArtifact).
        const artifact = await resolveArtifact(sandbox, response.content);
        return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
      }
      for (const call of response.toolCalls) {
        const tool = toolMap.get(call.name);
        const argBytes = Buffer.byteLength(call.arguments ?? "", "utf8");
        if (argBytes > maxArgBytes) {
          // Oversized arguments are rejected without invoking the tool;
          // the model gets a structured error it can react to.
          toolUse.errors += 1;
          bumpToolCount(toolUse, call.name);
          messages.push(toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `arguments payload exceeds ${maxArgBytes} bytes`
          }));
          continue;
        }
        if (!tool) {
          // Unknown tool name — count it but keep the loop alive.
          toolUse.errors += 1;
          bumpToolCount(toolUse, call.name);
          messages.push(toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `unknown tool "${call.name}"`
          }));
          continue;
        }
        bumpToolCount(toolUse, call.name);
        const result = await tool.invoke(call.arguments ?? "", {
          sandbox,
          maxResultBytes
        });
        if (!result.ok) {
          toolUse.errors += 1;
          // Record sandbox-denied paths (deduplicated) for the report.
          const denied = result.details && typeof result.details.deniedPath === "string"
            ? result.details.deniedPath
            : undefined;
          if (denied && !toolUse.deniedPaths.includes(denied)) {
            toolUse.deniedPaths.push(denied);
          }
        }
        else {
          // Only successful invocations count toward `calls`.
          toolUse.calls += 1;
        }
        messages.push(toolResponseMessage(call.id, result));
      }
    }
    // Budget exhausted without a terminal stop; the runner surfaces this
    // as a failed workflow verifier.
    throw new MaxTurnsExceededError(maxTurns);
  }
  finally {
    // Always tear the sandbox down, even on error paths.
    await sandbox.dispose();
  }
}
156
/**
 * Assemble the Tier B run result from the loop's accumulated state.
 * Prices the total usage via the cost guard's schedule (honoring any
 * `config.tokenPricing` overrides) and trims the artifact text.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
  const usageUsd = computeUsageUsd(model, usage, {
    tokenPricing: config.tokenPricing
  });
  const durationMs = Date.now() - started;
  return {
    artifact: artifact.trim(),
    usage,
    usageUsd,
    model,
    attempts,
    durationMs,
    toolUse,
    systemPrompt,
    userPrompt
  };
}
172
/**
 * Wrap an assistant turn for the transcript, attaching `toolCalls` only
 * when the model actually made some (an empty array is omitted).
 */
function rememberAssistant(content, toolCalls) {
  const message = { role: "assistant", content };
  if (toolCalls?.length > 0) {
    message.toolCalls = toolCalls;
  }
  return message;
}
178
/**
 * Build the `role: "tool"` transcript message for one tool invocation.
 *
 * Serializes a uniform `{ ok, content|error, details }` payload and
 * truncates it so a verbose tool cannot blow up the context window.
 *
 * Fix: the truncation ceiling was hard-coded to 32 KiB, silently ignoring
 * the configured `toolMaxResultBytes` the loop already enforces at
 * `tool.invoke`. It is now a parameter; the default preserves the old
 * behavior for existing call sites.
 *
 * @param callId - the model's tool_call id this message answers.
 * @param result - tool result (`ok`/`content`/`error`/`details`/`name`).
 * @param maxBytes - truncation ceiling for the serialized payload.
 */
function toolResponseMessage(callId, result, maxBytes = 32 * 1024) {
  const payload = result.ok
    ? { ok: true, content: result.content, details: result.details ?? {} }
    : { ok: false, error: result.error, details: result.details ?? {} };
  return {
    role: "tool",
    content: truncatePayload(JSON.stringify(payload), maxBytes),
    toolCallId: callId,
    name: result.name
  };
}
189
/** Increment the per-tool invocation counter on a toolUse summary. */
function bumpToolCount(summary, name) {
  const prior = summary.byTool[name] ?? 0;
  summary.byTool[name] = prior + 1;
}
192
/**
 * Coerce a config value to a positive integer: floor a finite positive
 * number, otherwise (undefined, NaN, Infinity, zero, negative) return
 * the fallback.
 */
function clampPositive(value, fallback) {
  const usable = value !== undefined && Number.isFinite(value) && value > 0;
  return usable ? Math.floor(value) : fallback;
}
199
/**
 * Compose the Tier B user prompt: stage/case identity, the sandbox root,
 * the available tools, the seeded context files, the task text, and the
 * output-format instructions (including the optional artifact.md path).
 */
function buildUserPrompt(caseEntry, sandbox, tools) {
  const toolLines = tools.map(
    (tool) => `- ${tool.descriptor.name}: ${tool.descriptor.description}`
  );
  const seeded = caseEntry.contextFiles ?? [];
  const seededBlock = seeded.length === 0
    ? "(no files seeded)"
    : seeded.map((file) => `- ${file}`).join("\n");
  const prompt = [
    `Stage: ${caseEntry.stage}`,
    `Case id: ${caseEntry.id}`,
    ``,
    `Sandbox root: ${sandbox.root}`,
    `You may call the following tools to read or modify files inside the sandbox.`,
    `All paths are relative to the sandbox root.`,
    ``,
    `Tools:`,
    ...toolLines,
    ``,
    `Seeded context files (available under the sandbox root):`,
    seededBlock,
    ``,
    `Task:`,
    caseEntry.inputPrompt.trim(),
    ``,
    `When you are done, reply with the artifact as the final assistant message.`,
    `Output the artifact directly (markdown with optional YAML frontmatter).`,
    `Do not wrap in code fences, do not add commentary before or after.`,
    `You may optionally write the artifact to \`artifact.md\` in the sandbox; if you do, the last written \`artifact.md\` is preferred over the chat reply.`
  ];
  return prompt.join("\n");
}
230
/**
 * Resolve the final artifact for a case.
 *
 * Preference order:
 *   1. The first ARTIFACT_CANDIDATES entry that resolves (via the sandbox)
 *      to a regular file.
 *   2. Any regular file at the sandbox root matching /^artifact\./i.
 *   3. The terminal assistant message (`fallback`).
 *
 * Fix: `fallback` is the model's terminal `content`, which can legally be
 * null; propagating it made `finalize()`'s `artifact.trim()` throw. A
 * nullish fallback is now coalesced to "" so downstream trimming is safe.
 */
async function resolveArtifact(sandbox, fallback) {
  for (const candidate of ARTIFACT_CANDIDATES) {
    try {
      const abs = await sandbox.resolve(candidate);
      const stat = await fs.stat(abs);
      if (stat.isFile()) {
        return await fs.readFile(abs, "utf8");
      }
    }
    catch {
      // Candidate missing or denied by the sandbox — try the next one.
      continue;
    }
  }
  try {
    // Looser sweep: accept any artifact.* file staged at the sandbox root.
    const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
    const match = entries.find((entry) => entry.isFile() && /^artifact\./i.test(entry.name));
    if (match) {
      return await fs.readFile(path.join(sandbox.root, match.name), "utf8");
    }
  }
  catch {
    // Sandbox root unreadable — fall through to the chat reply.
  }
  return fallback ?? "";
}
@@ -20,13 +20,22 @@ export const DEFAULT_EVAL_CONFIG = {
20
20
  failIfCriticalBelow: 3.0
21
21
  },
22
22
  timeoutMs: 120_000,
23
- maxRetries: 2
23
+ maxRetries: 2,
24
+ judgeSamples: 3,
25
+ judgeTemperature: 0,
26
+ agentTemperature: 0.2
24
27
  };
25
28
  const EVAL_TIER_SET = new Set(EVAL_TIERS);
26
29
  const NUMERIC_ENVS = new Set([
27
30
  "CCLAW_EVAL_DAILY_USD_CAP",
28
31
  "CCLAW_EVAL_TIMEOUT_MS",
29
- "CCLAW_EVAL_MAX_RETRIES"
32
+ "CCLAW_EVAL_MAX_RETRIES",
33
+ "CCLAW_EVAL_JUDGE_SAMPLES",
34
+ "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
+ "CCLAW_EVAL_AGENT_TEMPERATURE",
36
+ "CCLAW_EVAL_TOOL_MAX_TURNS",
37
+ "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
+ "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
30
39
  ]);
31
40
  function evalConfigError(configFilePath, reason) {
32
41
  return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -93,6 +102,70 @@ function validateFileConfig(raw, configFilePath) {
93
102
  }
94
103
  out.maxRetries = raw.maxRetries;
95
104
  }
105
+ if (raw.judgeSamples !== undefined) {
106
+ const value = raw.judgeSamples;
107
+ if (!Number.isInteger(value) || value < 1) {
108
+ throw evalConfigError(configFilePath, `"judgeSamples" must be a positive integer`);
109
+ }
110
+ if (value % 2 === 0) {
111
+ throw evalConfigError(configFilePath, `"judgeSamples" must be odd (so median-of-N is a true integer)`);
112
+ }
113
+ out.judgeSamples = value;
114
+ }
115
+ if (raw.judgeTemperature !== undefined) {
116
+ if (typeof raw.judgeTemperature !== "number" || !Number.isFinite(raw.judgeTemperature)) {
117
+ throw evalConfigError(configFilePath, `"judgeTemperature" must be a finite number`);
118
+ }
119
+ if (raw.judgeTemperature < 0 || raw.judgeTemperature > 2) {
120
+ throw evalConfigError(configFilePath, `"judgeTemperature" must be within [0, 2]`);
121
+ }
122
+ out.judgeTemperature = raw.judgeTemperature;
123
+ }
124
+ if (raw.agentTemperature !== undefined) {
125
+ if (typeof raw.agentTemperature !== "number" || !Number.isFinite(raw.agentTemperature)) {
126
+ throw evalConfigError(configFilePath, `"agentTemperature" must be a finite number`);
127
+ }
128
+ if (raw.agentTemperature < 0 || raw.agentTemperature > 2) {
129
+ throw evalConfigError(configFilePath, `"agentTemperature" must be within [0, 2]`);
130
+ }
131
+ out.agentTemperature = raw.agentTemperature;
132
+ }
133
+ if (raw.tokenPricing !== undefined) {
134
+ if (!isRecord(raw.tokenPricing)) {
135
+ throw evalConfigError(configFilePath, `"tokenPricing" must be a mapping`);
136
+ }
137
+ const pricing = {};
138
+ for (const [model, value] of Object.entries(raw.tokenPricing)) {
139
+ if (!isRecord(value)) {
140
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
141
+ }
142
+ const input = value.input;
143
+ const output = value.output;
144
+ if (typeof input !== "number" || input < 0) {
145
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}.input" must be a non-negative number`);
146
+ }
147
+ if (typeof output !== "number" || output < 0) {
148
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}.output" must be a non-negative number`);
149
+ }
150
+ const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
151
+ if (extraneous.length > 0) {
152
+ throw evalConfigError(configFilePath, `"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
153
+ }
154
+ pricing[model] = { input, output };
155
+ }
156
+ out.tokenPricing = pricing;
157
+ }
158
+ const assignPositiveInt = (key, value, label) => {
159
+ if (value === undefined)
160
+ return;
161
+ if (!Number.isInteger(value) || value < 1) {
162
+ throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
163
+ }
164
+ out[key] = value;
165
+ };
166
+ assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
167
+ assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
168
+ assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
96
169
  if (raw.regression !== undefined) {
97
170
  if (!isRecord(raw.regression)) {
98
171
  throw evalConfigError(configFilePath, `"regression" must be a mapping`);
@@ -123,7 +196,14 @@ function validateFileConfig(raw, configFilePath) {
123
196
  "dailyUsdCap",
124
197
  "timeoutMs",
125
198
  "maxRetries",
126
- "regression"
199
+ "regression",
200
+ "judgeSamples",
201
+ "judgeTemperature",
202
+ "agentTemperature",
203
+ "tokenPricing",
204
+ "toolMaxTurns",
205
+ "toolMaxArgumentsBytes",
206
+ "toolMaxResultBytes"
127
207
  ]);
128
208
  const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
129
209
  if (unknown.length > 0) {
@@ -203,6 +283,51 @@ function applyEnvOverrides(base, env) {
203
283
  patched.maxRetries = parseNumericEnv("CCLAW_EVAL_MAX_RETRIES", retries);
204
284
  overridden = true;
205
285
  }
286
+ const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
287
+ if (judgeSamples) {
288
+ const value = parseNumericEnv("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
289
+ if (!Number.isInteger(value) || value < 1) {
290
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be a positive integer, got: ${judgeSamples}`);
291
+ }
292
+ if (value % 2 === 0) {
293
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
294
+ }
295
+ patched.judgeSamples = value;
296
+ overridden = true;
297
+ }
298
+ const judgeTemp = read("CCLAW_EVAL_JUDGE_TEMPERATURE");
299
+ if (judgeTemp) {
300
+ const value = parseNumericEnv("CCLAW_EVAL_JUDGE_TEMPERATURE", judgeTemp);
301
+ if (value < 0 || value > 2) {
302
+ throw new Error(`Environment variable CCLAW_EVAL_JUDGE_TEMPERATURE must be within [0, 2], got: ${judgeTemp}`);
303
+ }
304
+ patched.judgeTemperature = value;
305
+ overridden = true;
306
+ }
307
+ const agentTemp = read("CCLAW_EVAL_AGENT_TEMPERATURE");
308
+ if (agentTemp) {
309
+ const value = parseNumericEnv("CCLAW_EVAL_AGENT_TEMPERATURE", agentTemp);
310
+ if (value < 0 || value > 2) {
311
+ throw new Error(`Environment variable CCLAW_EVAL_AGENT_TEMPERATURE must be within [0, 2], got: ${agentTemp}`);
312
+ }
313
+ patched.agentTemperature = value;
314
+ overridden = true;
315
+ }
316
+ const readPositiveInt = (name, key, label) => {
317
+ const raw = read(name);
318
+ if (!raw)
319
+ return;
320
+ const value = parseNumericEnv(name, raw);
321
+ if (!Number.isInteger(value) || value < 1) {
322
+ throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
323
+ }
324
+ patched[key] = value;
325
+ overridden = true;
326
+ void label;
327
+ };
328
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
329
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
330
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
206
331
  const apiKey = read("CCLAW_EVAL_API_KEY");
207
332
  return { patched, overridden, apiKey };
208
333
  }
@@ -0,0 +1,80 @@
1
+ import type { ChatUsage } from "./llm-client.js";
2
+ import type { ResolvedEvalConfig, TokenPricing } from "./types.js";
3
+ /**
4
+ * Builtin pricing fallback. Intentionally conservative: when the user
5
+ * hasn't configured pricing and we don't know the model, we default to a
6
+ * "small model" USD schedule so the cap can still do something useful.
7
+ *
8
+ * Values are USD per 1K tokens. Sources are public pricing pages as of
9
+ * 2026-04; update by editing this constant, not the guard logic.
10
+ */
11
+ export declare const DEFAULT_TOKEN_PRICING: Readonly<Record<string, TokenPricing>>;
12
+ /** Hard default when neither config nor builtins know the model. */
13
+ export declare const UNKNOWN_MODEL_PRICING: TokenPricing;
14
+ export interface SpendLedger {
15
+ /** ISO date (`YYYY-MM-DD` in UTC) — also embedded in the file name. */
16
+ date: string;
17
+ /** USD spent so far today across every call that hit the guard. */
18
+ totalUsd: number;
19
+ /** Number of `chat()` calls accounted for. */
20
+ calls: number;
21
+ /** Per-model breakdown for the report. */
22
+ byModel: Record<string, {
23
+ tokensIn: number;
24
+ tokensOut: number;
25
+ usd: number;
26
+ }>;
27
+ }
28
/**
 * Thrown by the cost guard when committing a call's cost would push the
 * daily ledger total past the configured `dailyUsdCap`.
 */
export declare class DailyCostCapExceededError extends Error {
    /** The configured daily USD cap that would be crossed. */
    readonly capUsd: number;
    /** Ledger total as it would stand after committing this call. */
    readonly projectedUsd: number;
    /** Ledger total before this call was considered. */
    readonly currentUsd: number;
    constructor(opts: {
        capUsd: number;
        projectedUsd: number;
        currentUsd: number;
    });
}
38
+ declare function utcDate(now?: Date): string;
39
+ declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
40
+ /**
41
+ * Compute USD cost of a single `ChatUsage` using the given `model` pricing
42
+ * schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
43
+ * before first token).
44
+ */
45
+ export declare function computeUsageUsd(model: string, usage: ChatUsage, config: Pick<ResolvedEvalConfig, "tokenPricing">): number;
46
+ declare function ledgerPath(projectRoot: string, date: string): string;
47
+ declare function readLedger(file: string, date: string): Promise<SpendLedger>;
48
+ declare function writeLedger(file: string, ledger: SpendLedger): Promise<void>;
49
+ /**
50
+ * Guard a single LLM call against the daily USD cap. Returns the updated
51
+ * ledger on success; throws `DailyCostCapExceededError` when the projected
52
+ * total would cross the cap. When `config.dailyUsdCap` is unset, the guard
53
+ * is a no-op — no file writes, no ledger — so non-judge runs never touch
54
+ * the filesystem.
55
+ */
56
+ export interface CostGuard {
57
+ /**
58
+ * Commit the USD cost of a finished call to the ledger. When `dailyUsdCap`
59
+ * is set, refuses the commit if the projected total would exceed the cap.
60
+ */
61
+ commit(model: string, usage: ChatUsage): Promise<number>;
62
+ /** Snapshot the current ledger (or undefined when no cap is set). */
63
+ snapshot(): Promise<SpendLedger | undefined>;
64
+ }
65
+ export interface CreateCostGuardOptions {
66
+ /** Clock injection for tests. */
67
+ now?: () => Date;
68
+ /** Override the default filesystem root for the ledger. */
69
+ ledgerPath?: string;
70
+ }
71
+ export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
72
+ /** Exposed for tests. */
73
+ export declare const __internal: {
74
+ utcDate: typeof utcDate;
75
+ pricingFor: typeof pricingFor;
76
+ ledgerPath: typeof ledgerPath;
77
+ readLedger: typeof readLedger;
78
+ writeLedger: typeof writeLedger;
79
+ };
80
+ export {};
@@ -0,0 +1,153 @@
1
+ /**
2
+ * Cost guard for the cclaw eval subsystem.
3
+ *
4
+ * Two responsibilities:
5
+ *
6
+ * 1. Convert `ChatUsage` (prompt/completion token counts) into USD using
7
+ * a per-model `TokenPricing` schedule. Pricing comes from
8
+ * `config.tokenPricing[model]` first, then from the builtin fallback
9
+ * schedule for well-known models (z.ai GLM 5.1 at publish time).
10
+ * 2. Maintain a per-day running total persisted to
11
+ * `.cclaw/evals/.spend-YYYY-MM-DD.json` so that a long eval session
12
+ * (or a cron-run nightly) can't blow through the configured
13
+ * `dailyUsdCap`. The counter is opt-in: no cap, no writes.
14
+ *
15
+ * The guard is deliberately pessimistic — it rounds USD up to 6 decimals
16
+ * and never subtracts, so a CI run that errors mid-flight still shows the
17
+ * partial spend in the next report.
18
+ */
19
+ import fs from "node:fs/promises";
20
+ import path from "node:path";
21
+ import { EVALS_ROOT } from "../constants.js";
22
+ import { exists } from "../fs-utils.js";
23
/**
 * Builtin pricing fallback. Intentionally conservative: when the user
 * hasn't configured pricing and we don't know the model, we default to a
 * "small model" USD schedule so the cap can still do something useful.
 *
 * Values are USD per 1K tokens. Sources are public pricing pages as of
 * 2026-04; update by editing this constant, not the guard logic.
 */
export const DEFAULT_TOKEN_PRICING = {
  "glm-5.1": { input: 0.0005, output: 0.0015 },
  "glm-4.6": { input: 0.0005, output: 0.0015 },
  "gpt-4o-mini": { input: 0.00015, output: 0.0006 },
  "gpt-4o": { input: 0.005, output: 0.015 }
};

/** Hard default when neither config nor builtins know the model. */
export const UNKNOWN_MODEL_PRICING = { input: 0.001, output: 0.003 };

/**
 * Raised when committing a call's cost would push the daily ledger past
 * the configured cap. Carries the cap, the projected total, and the
 * pre-call total for the error report.
 */
export class DailyCostCapExceededError extends Error {
  capUsd;
  projectedUsd;
  currentUsd;

  constructor(opts) {
    const { capUsd, projectedUsd, currentUsd } = opts;
    super(
      `Daily cost cap would be exceeded: current=$${currentUsd.toFixed(4)}, ` +
      `projected=$${projectedUsd.toFixed(4)}, cap=$${capUsd.toFixed(4)}. ` +
      `Unset CCLAW_EVAL_DAILY_USD_CAP or increase the cap to continue.`
    );
    this.name = "DailyCostCapExceededError";
    this.capUsd = capUsd;
    this.projectedUsd = projectedUsd;
    this.currentUsd = currentUsd;
  }
}

/** UTC calendar date (`YYYY-MM-DD`) for `now`, defaulting to the current time. */
function utcDate(now = new Date()) {
  const [isoDate] = now.toISOString().split("T");
  return isoDate;
}

/**
 * Look up the pricing schedule for `model`: user config first, then the
 * builtin table, then the unknown-model default.
 */
function pricingFor(model, config) {
  const fromConfig = config.tokenPricing?.[model];
  if (fromConfig) {
    return fromConfig;
  }
  return DEFAULT_TOKEN_PRICING[model] || UNKNOWN_MODEL_PRICING;
}

/**
 * Compute USD cost of a single `ChatUsage` using the given `model` pricing
 * schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
 * before first token).
 */
export function computeUsageUsd(model, usage, config) {
  if (!usage || usage.totalTokens <= 0) {
    return 0;
  }
  const schedule = pricingFor(model, config);
  // Schedules are per-1K-token; round up-front to 6 decimals.
  const cost = (usage.promptTokens * schedule.input) / 1_000 +
    (usage.completionTokens * schedule.output) / 1_000;
  return Math.max(0, Number(cost.toFixed(6)));
}
80
/** A fresh zeroed ledger for the given UTC date. */
function emptyLedger(date) {
  return {
    date,
    totalUsd: 0,
    calls: 0,
    byModel: {}
  };
}
83
/** Filesystem path of the per-day spend ledger under the evals root. */
function ledgerPath(projectRoot, date) {
  const fileName = `.spend-${date}.json`;
  return path.join(projectRoot, EVALS_ROOT, fileName);
}
86
/**
 * Read the spend ledger at `file` for `date`. Any failure mode — missing
 * file, unparseable JSON, or a ledger recorded for a different date —
 * yields a fresh empty ledger rather than an error.
 */
async function readLedger(file, date) {
  if (!(await exists(file))) {
    return emptyLedger(date);
  }
  try {
    const parsed = JSON.parse(await fs.readFile(file, "utf8"));
    if (parsed?.date !== date) {
      return emptyLedger(date);
    }
    // Coerce each field defensively so a hand-edited file can't poison us.
    const totalUsd = typeof parsed.totalUsd === "number" ? parsed.totalUsd : 0;
    const calls = typeof parsed.calls === "number" ? parsed.calls : 0;
    const byModel = parsed.byModel && typeof parsed.byModel === "object" ? parsed.byModel : {};
    return { date, totalUsd, calls, byModel };
  }
  catch {
    return emptyLedger(date);
  }
}
104
/** Persist the ledger as pretty-printed JSON, creating parent dirs as needed. */
async function writeLedger(file, ledger) {
  const dir = path.dirname(file);
  await fs.mkdir(dir, { recursive: true });
  const body = `${JSON.stringify(ledger, null, 2)}\n`;
  await fs.writeFile(file, body, "utf8");
}
108
/**
 * Create a cost guard bound to `projectRoot` and `config`.
 *
 * When `config.dailyUsdCap` is unset, `commit()` only prices the call and
 * `snapshot()` returns undefined — no filesystem access at all. With a cap,
 * each commit does read-ledger → check cap → update → write-ledger.
 *
 * NOTE(review): the commit path is an unlocked read-modify-write on the
 * ledger file; two concurrent eval processes could race and under-count —
 * confirm whether single-process use is an invariant here.
 *
 * @param projectRoot - root used to locate `.cclaw/evals/.spend-*.json`.
 * @param config - needs `dailyUsdCap` and optional `tokenPricing`.
 * @param options - `now` (clock injection) and `ledgerPath` override, for tests.
 */
export function createCostGuard(projectRoot, config, options = {}) {
  const now = options.now ?? (() => new Date());
  const currentDate = () => utcDate(now());
  // Re-derived per call so a run that crosses UTC midnight rolls over to a
  // new day's ledger file (unless a fixed ledgerPath override is given).
  const file = () => options.ledgerPath ?? ledgerPath(projectRoot, currentDate());
  return {
    async commit(model, usage) {
      const usd = computeUsageUsd(model, usage, config);
      // No cap configured: pure computation, never touches the filesystem.
      if (config.dailyUsdCap === undefined)
        return usd;
      const date = currentDate();
      const target = file();
      const ledger = await readLedger(target, date);
      // Round once here so repeated commits can't accumulate float drift.
      const projected = Number((ledger.totalUsd + usd).toFixed(6));
      if (projected > config.dailyUsdCap) {
        // Refuse the commit; the ledger file is left unchanged.
        throw new DailyCostCapExceededError({
          capUsd: config.dailyUsdCap,
          projectedUsd: projected,
          currentUsd: ledger.totalUsd
        });
      }
      ledger.totalUsd = projected;
      ledger.calls += 1;
      const byModel = ledger.byModel[model] ?? { tokensIn: 0, tokensOut: 0, usd: 0 };
      byModel.tokensIn += usage.promptTokens;
      byModel.tokensOut += usage.completionTokens;
      byModel.usd = Number((byModel.usd + usd).toFixed(6));
      ledger.byModel[model] = byModel;
      await writeLedger(target, ledger);
      return usd;
    },
    async snapshot() {
      // Mirrors commit(): without a cap there is no ledger to snapshot.
      if (config.dailyUsdCap === undefined)
        return undefined;
      const date = currentDate();
      return readLedger(file(), date);
    }
  };
}
146
/** Exposed for tests. Bundles the module-private helpers of the cost guard. */
export const __internal = {
  utcDate,
  pricingFor,
  ledgerPath,
  readLedger,
  writeLedger
};