npm - cclaw-cli - Versions diffs - 0.25.0 → 0.26.0 - Mend

cclaw-cli 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/dist/cli.js +2 -1
package/dist/eval/agents/with-tools.d.ts +31 -0
package/dist/eval/agents/with-tools.js +255 -0
package/dist/eval/config-loader.js +34 -2
package/dist/eval/llm-client.d.ts +10 -0
package/dist/eval/llm-client.js +10 -1
package/dist/eval/report.js +19 -0
package/dist/eval/runner.js +50 -2
package/dist/eval/sandbox.d.ts +38 -0
package/dist/eval/sandbox.js +137 -0
package/dist/eval/tools/glob.d.ts +2 -0
package/dist/eval/tools/glob.js +163 -0
package/dist/eval/tools/grep.d.ts +2 -0
package/dist/eval/tools/grep.js +152 -0
package/dist/eval/tools/index.d.ts +7 -0
package/dist/eval/tools/index.js +35 -0
package/dist/eval/tools/read.d.ts +2 -0
package/dist/eval/tools/read.js +122 -0
package/dist/eval/tools/types.d.ts +49 -0
package/dist/eval/tools/types.js +41 -0
package/dist/eval/tools/write.d.ts +2 -0
package/dist/eval/tools/write.js +92 -0
package/dist/eval/types.d.ts +35 -0
package/package.json +1 -1

package/dist/cli.js CHANGED Viewed

@@ -59,7 +59,7 @@ Commands:
                     --tier=<A|B|C>       Fidelity tier (A=single-shot, B=tools, C=workflow).
                     --schema-only        Run only structural verifiers (default).
                     --rules              Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
-                    --judge              Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A also runs the single-shot agent-under-test.
+                    --judge              Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B runs the sandbox tool-using agent (read_file/write_file/glob/grep).
                     --dry-run            Validate config + corpus, print summary, do not execute.
                     --json               Emit machine-readable JSON on stdout.
                     --no-write           Skip writing the report to .cclaw/evals/reports/.
@@ -79,6 +79,7 @@ Examples:
   cclaw eval --dry-run
   cclaw eval --stage=brainstorm --schema-only
   cclaw eval --judge --tier=A --stage=brainstorm
+  cclaw eval --judge --tier=B --stage=spec
 Docs:   https://github.com/zuevrs/cclaw
 Issues: https://github.com/zuevrs/cclaw/issues

package/dist/eval/agents/with-tools.d.ts ADDED Viewed

@@ -0,0 +1,31 @@
+import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
+import { createSandbox } from "../sandbox.js";
+import type { SandboxTool } from "../tools/index.js";
+import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
+/**
+ * Error thrown by `runWithTools` when every turn in the budget ended with
+ * tool calls, i.e. the model never produced a reply free of tool calls.
+ */
+export declare class MaxTurnsExceededError extends Error {
+    /** The turn budget that was exhausted. */
+    readonly turns: number;
+    constructor(turns: number);
+}
+/** Arguments accepted by `runWithTools`. */
+export interface WithToolsInput {
+    /** Eval case to run; its stage selects the skill prompt and its context files seed the sandbox. */
+    caseEntry: EvalCase;
+    /** Resolved-config subset: model and sampling settings, token pricing, and the tool-loop limits. */
+    config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes">;
+    /** Project root handed to the default skill loader and to the sandbox factory. */
+    projectRoot: string;
+    /** LLM client whose `chat` method is called once per turn. */
+    client: EvalLlmClient;
+    /** Tools exposed to the model; the built-in tool set is used when omitted. */
+    tools?: SandboxTool[];
+    /** Override for the SKILL.md loader (test hook). */
+    loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
+    /** Override for the sandbox factory (test hook). */
+    createSandboxFn?: typeof createSandbox;
+}
+/** Result of a completed `runWithTools` call. */
+export interface WithToolsOutput {
+    /** Final artifact text, trimmed of surrounding whitespace. */
+    artifact: string;
+    /** Token usage summed over every model call in the loop. */
+    usage: ChatUsage;
+    /** Cost of `usage`, computed from the config's token pricing. */
+    usageUsd: number;
+    /** Model name reported by the last response. */
+    model: string;
+    /** Sum of the per-response `attempts` counters across all turns. */
+    attempts: number;
+    /** Wall-clock milliseconds from just before the first model call until the result was assembled. */
+    durationMs: number;
+    /** Turn, call, error, denied-path and per-tool counters collected during the loop. */
+    toolUse: ToolUseSummary;
+    /** System prompt sent to the model (the loaded stage skill). */
+    systemPrompt: string;
+    /** User prompt sent to the model (task plus sandbox and tool hints). */
+    userPrompt: string;
+}
+/**
+ * Run one eval case through the Tier B tool-using agent loop.
+ *
+ * Rejects with `MaxTurnsExceededError` when the turn budget is used up
+ * before the model replies without tool calls.
+ */
+export declare function runWithTools(input: WithToolsInput): Promise<WithToolsOutput>;

package/dist/eval/agents/with-tools.js ADDED Viewed

@@ -0,0 +1,255 @@
+/**
+ * Tier B with-tools agent.
+ *
+ * Multi-turn loop with OpenAI-style function-calling over a set of
+ * sandbox-confined tools. The AUT is given:
+ *
+ *  - System prompt = stage SKILL.md (same contract as Tier A so the
+ *    single-shot baseline is comparable).
+ *  - User prompt = task description + a short "tools available" hint
+ *    that names the sandbox root and the four built-in tools.
+ *  - Tools = `read_file`, `write_file`, `glob`, `grep` (see
+ *    `src/eval/tools/`).
+ *
+ * The loop runs up to `config.toolMaxTurns` turns (default 8). Each
+ * turn:
+ *
+ *  1. Send the current transcript to the model with tools enabled.
+ *  2. Commit token usage against the wrapped client (cost guard sees
+ *     every call).
+ *  3. If the model returned tool_calls, execute each sandbox tool and
+ *     append a `role: "tool"` message with the JSON-serialized result.
+ *  4. If the model returned no tool_calls, treat the turn as terminal:
+ *     resolve the artifact (see below) and exit.
+ *
+ * When the turn budget is exhausted without a terminal stop, the agent
+ * throws `MaxTurnsExceededError`. The runner surfaces the error as a
+ * failed workflow verifier so the case counts as a regression.
+ *
+ * Artifact resolution: if the model used `write_file` to stage the
+ * artifact at `artifact.md`, `artifact.txt`, or `ARTIFACT.md` in the
+ * sandbox root (or, failing those, at any other root-level file named
+ * `artifact.*`), we prefer that file — it mirrors the Tier C workflow
+ * where writes are the deliverable. Otherwise the fallback is the
+ * terminal assistant message, so prompts that don't call write_file
+ * still produce something judgable.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import { computeUsageUsd } from "../cost-guard.js";
+import { createSandbox } from "../sandbox.js";
+import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
+import { loadStageSkill } from "./single-shot.js";
+/**
+ * Raised when the Tier B loop spends its whole turn budget without the
+ * model producing a terminal (tool-call-free) reply.
+ */
+export class MaxTurnsExceededError extends Error {
+    /** The turn budget that was exhausted. */
+    turns;
+    constructor(turns) {
+        const message = `Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`;
+        super(message);
+        this.turns = turns;
+        this.name = "MaxTurnsExceededError";
+    }
+}
+/** Turn budget used when `config.toolMaxTurns` is unset or not a positive finite number. */
+const DEFAULT_MAX_TURNS = 8;
+/** Fallback cap (64 KiB) on the UTF-8 byte size of one tool call's arguments string. */
+const DEFAULT_MAX_ARG_BYTES = 64 * 1024;
+/** Fallback `maxResultBytes` (32 KiB) handed to each tool invocation. */
+const DEFAULT_MAX_RESULT_BYTES = 32 * 1024;
+/** Sandbox-relative file names tried, in this order, when looking for a written artifact. */
+const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
+/**
+ * Run a single eval case through the Tier B (tool-using) agent loop.
+ *
+ * Loads the stage skill as the system prompt, creates a sandbox seeded with
+ * the case's context files, then alternates model calls and tool executions
+ * until the model replies without tool calls or the turn budget runs out.
+ * The sandbox is disposed on every exit path.
+ *
+ * @param input Case, config subset, project root, LLM client, and optional
+ *   overrides (`tools`, `loadSkill`, `createSandboxFn`).
+ * @returns The resolved artifact plus accumulated usage, cost, timing and
+ *   tool-use counters.
+ * @throws MaxTurnsExceededError when every turn in the budget ended with
+ *   tool calls. Errors raised by the skill loader, the client, or a tool's
+ *   `invoke` are not caught here and propagate to the caller.
+ */
+export async function runWithTools(input) {
+    const { caseEntry, config, projectRoot, client } = input;
+    const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
+    const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
+    const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
+    // Cap for the serialized tool message. It never drops below the previous
+    // fixed 32 KiB ceiling (so small tool limits don't additionally clip the
+    // JSON envelope), but it now grows with a larger configured result limit
+    // instead of silently cutting every result back to 32 KiB.
+    const maxMessageBytes = Math.max(maxResultBytes, DEFAULT_MAX_RESULT_BYTES);
+    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
+    const systemPrompt = await loader(caseEntry.stage);
+    const tools = input.tools ?? BUILTIN_TOOLS;
+    const toolMap = toolsByName(tools);
+    const toolsBody = toolsForRequest(tools);
+    const sandboxFactory = input.createSandboxFn ?? createSandbox;
+    const sandbox = await sandboxFactory({
+        projectRoot,
+        ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
+    });
+    // `calls` counts successful tool invocations only; failures of any kind
+    // go to `errors`. `byTool` counts every requested call, failed or not.
+    const toolUse = {
+        turns: 0,
+        calls: 0,
+        errors: 0,
+        deniedPaths: [],
+        byTool: {}
+    };
+    const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
+    let lastModel = config.model;
+    let totalAttempts = 0;
+    const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
+    const messages = [
+        { role: "system", content: systemPrompt },
+        { role: "user", content: userPrompt }
+    ];
+    const started = Date.now();
+    try {
+        for (let turn = 0; turn < maxTurns; turn += 1) {
+            toolUse.turns = turn + 1;
+            const response = await client.chat({
+                model: config.model,
+                messages,
+                temperature: config.agentTemperature ?? 0.2,
+                timeoutMs: config.timeoutMs,
+                tools: toolsBody,
+                toolChoice: "auto"
+            });
+            usage.promptTokens += response.usage.promptTokens;
+            usage.completionTokens += response.usage.completionTokens;
+            usage.totalTokens += response.usage.totalTokens;
+            lastModel = response.model;
+            totalAttempts += response.attempts;
+            const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
+            // The assistant turn goes into the transcript before any tool
+            // responses so the wire order stays assistant -> tool messages.
+            messages.push(rememberAssistant(response.content, response.toolCalls));
+            if (!hasToolCalls) {
+                // A reply without tool calls is terminal.
+                const artifact = await resolveArtifact(sandbox, response.content);
+                return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
+            }
+            for (const call of response.toolCalls) {
+                // Every requested call is tallied per tool, including rejected ones.
+                bumpToolCount(toolUse, call.name);
+                const rawArgs = call.arguments ?? "";
+                const tool = toolMap.get(call.name);
+                // The size check runs before the lookup, so an oversized call to
+                // an unknown tool reports the size problem.
+                let rejection;
+                if (Buffer.byteLength(rawArgs, "utf8") > maxArgBytes) {
+                    rejection = `arguments payload exceeds ${maxArgBytes} bytes`;
+                }
+                else if (!tool) {
+                    rejection = `unknown tool "${call.name}"`;
+                }
+                if (rejection !== undefined) {
+                    toolUse.errors += 1;
+                    messages.push(toolResponseMessage(call.id, {
+                        ok: false,
+                        name: call.name,
+                        error: rejection
+                    }, maxMessageBytes));
+                    continue;
+                }
+                const result = await tool.invoke(rawArgs, {
+                    sandbox,
+                    maxResultBytes
+                });
+                if (!result.ok) {
+                    toolUse.errors += 1;
+                    const denied = result.details && typeof result.details.deniedPath === "string"
+                        ? result.details.deniedPath
+                        : undefined;
+                    // Each denied path is recorded once.
+                    if (denied && !toolUse.deniedPaths.includes(denied)) {
+                        toolUse.deniedPaths.push(denied);
+                    }
+                }
+                else {
+                    toolUse.calls += 1;
+                }
+                messages.push(toolResponseMessage(call.id, result, maxMessageBytes));
+            }
+        }
+        throw new MaxTurnsExceededError(maxTurns);
+    }
+    finally {
+        await sandbox.dispose();
+    }
+}
+/**
+ * Assemble the result object: price the accumulated usage, trim the
+ * artifact, and stamp the elapsed time since `started`.
+ */
+function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
+    const usageUsd = computeUsageUsd(model, usage, {
+        tokenPricing: config.tokenPricing
+    });
+    return {
+        artifact: artifact.trim(),
+        usage,
+        usageUsd,
+        model,
+        attempts,
+        durationMs: Date.now() - started,
+        toolUse,
+        systemPrompt,
+        userPrompt
+    };
+}
+/**
+ * Build the assistant transcript entry for a response. `toolCalls` is
+ * attached only when the response actually carried at least one call.
+ */
+function rememberAssistant(content, toolCalls) {
+    const base = { role: "assistant", content };
+    if (toolCalls && toolCalls.length > 0)
+        base.toolCalls = toolCalls;
+    return base;
+}
+/**
+ * Wrap a tool result in a `role: "tool"` transcript message.
+ *
+ * The result is serialized as `{ ok, content | error, details }` and passed
+ * through `truncatePayload` with `maxBytes` as the limit.
+ *
+ * @param callId Id of the tool call being answered.
+ * @param result Tool result (`ok`, `name`, and `content` or `error`, plus
+ *   optional `details`).
+ * @param maxBytes Limit handed to `truncatePayload`; defaults to the
+ *   32 KiB that was previously hard-coded here.
+ */
+function toolResponseMessage(callId, result, maxBytes = DEFAULT_MAX_RESULT_BYTES) {
+    const payload = result.ok
+        ? { ok: true, content: result.content, details: result.details ?? {} }
+        : { ok: false, error: result.error, details: result.details ?? {} };
+    return {
+        role: "tool",
+        content: truncatePayload(JSON.stringify(payload), maxBytes),
+        toolCallId: callId,
+        name: result.name
+    };
+}
+/** Increment the per-tool tally for `name`, starting from zero on first use. */
+function bumpToolCount(summary, name) {
+    const previous = summary.byTool[name] ?? 0;
+    summary.byTool[name] = previous + 1;
+}
+/**
+ * Normalize a numeric limit to a positive integer.
+ *
+ * @param value Candidate limit; may be undefined.
+ * @param fallback Value returned when `value` is unusable.
+ * @returns `Math.floor(value)` when that is at least 1, otherwise `fallback`.
+ */
+function clampPositive(value, fallback) {
+    // Number.isFinite is false for undefined, null, NaN, +/-Infinity and non-numbers.
+    if (!Number.isFinite(value))
+        return fallback;
+    // Floor before the range check: a fraction such as 0.5 is "positive" but
+    // floors to 0, which would yield a zero turn budget or byte cap.
+    const whole = Math.floor(value);
+    return whole >= 1 ? whole : fallback;
+}
+/**
+ * Compose the user prompt for the tool-using agent: case identity, sandbox
+ * root, one bullet per tool, the seeded context files, the trimmed task
+ * text, and the closing instructions on how to deliver the artifact.
+ */
+function buildUserPrompt(caseEntry, sandbox, tools) {
+    const toolBullets = tools.map(({ descriptor }) => `- ${descriptor.name}: ${descriptor.description}`);
+    const seeded = caseEntry.contextFiles ?? [];
+    const seededSection = seeded.length === 0
+        ? "(no files seeded)"
+        : seeded.map((file) => `- ${file}`).join("\n");
+    const intro = [
+        `Stage: ${caseEntry.stage}`,
+        `Case id: ${caseEntry.id}`,
+        ``,
+        `Sandbox root: ${sandbox.root}`,
+        `You may call the following tools to read or modify files inside the sandbox.`,
+        `All paths are relative to the sandbox root.`,
+        ``,
+        `Tools:`
+    ];
+    const context = [
+        ``,
+        `Seeded context files (available under the sandbox root):`,
+        seededSection,
+        ``
+    ];
+    const task = [
+        `Task:`,
+        caseEntry.inputPrompt.trim(),
+        ``
+    ];
+    const closing = [
+        `When you are done, reply with the artifact as the final assistant message.`,
+        `Output the artifact directly (markdown with optional YAML frontmatter).`,
+        `Do not wrap in code fences, do not add commentary before or after.`,
+        "You may optionally write the artifact to `artifact.md` in the sandbox; " +
+            "if you do, the last written `artifact.md` is preferred over the chat reply."
+    ];
+    return [...intro, ...toolBullets, ...context, ...task, ...closing].join("\n");
+}
+/**
+ * Pick the artifact text for a finished run.
+ *
+ * Resolution order:
+ *  1. The first name in `ARTIFACT_CANDIDATES` that resolves inside the
+ *     sandbox to a regular file.
+ *  2. Otherwise, a regular file directly under the sandbox root whose name
+ *     starts with `artifact.` (case-insensitive). When several match, the
+ *     name that sorts first is used, so the choice does not depend on
+ *     directory listing order.
+ *  3. Otherwise `fallback` (the terminal assistant message).
+ *
+ * Filesystem and path-resolution errors are treated as "not found" at each
+ * step; this function does not throw for them.
+ */
+async function resolveArtifact(sandbox, fallback) {
+    for (const candidate of ARTIFACT_CANDIDATES) {
+        try {
+            const abs = await sandbox.resolve(candidate);
+            const stat = await fs.stat(abs);
+            if (stat.isFile()) {
+                return await fs.readFile(abs, "utf8");
+            }
+        }
+        catch {
+            // Missing file or rejected path: move on to the next candidate.
+            continue;
+        }
+    }
+    try {
+        const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
+        // Sort the matching names: readdir order varies by filesystem, and
+        // the pick must be reproducible across runs.
+        const names = entries
+            .filter((entry) => entry.isFile() && /^artifact\./i.test(entry.name))
+            .map((entry) => entry.name)
+            .sort();
+        if (names.length > 0) {
+            return await fs.readFile(path.join(sandbox.root, names[0]), "utf8");
+        }
+    }
+    catch {
+        // fall through to fallback
+    }
+    return fallback;
+}

package/dist/eval/config-loader.js CHANGED Viewed

@@ -32,7 +32,10 @@ const NUMERIC_ENVS = new Set([
     "CCLAW_EVAL_MAX_RETRIES",
     "CCLAW_EVAL_JUDGE_SAMPLES",
     "CCLAW_EVAL_JUDGE_TEMPERATURE",
-    "CCLAW_EVAL_AGENT_TEMPERATURE"
+    "CCLAW_EVAL_AGENT_TEMPERATURE",
+    "CCLAW_EVAL_TOOL_MAX_TURNS",
+    "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
+    "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
 ]);
 function evalConfigError(configFilePath, reason) {
     return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -152,6 +155,17 @@ function validateFileConfig(raw, configFilePath) {
         }
         out.tokenPricing = pricing;
     }
+    const assignPositiveInt = (key, value, label) => {
+        if (value === undefined)
+            return;
+        if (!Number.isInteger(value) || value < 1) {
+            throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
+        }
+        out[key] = value;
+    };
+    assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
+    assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
+    assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
     if (raw.regression !== undefined) {
         if (!isRecord(raw.regression)) {
             throw evalConfigError(configFilePath, `"regression" must be a mapping`);
@@ -186,7 +200,10 @@ function validateFileConfig(raw, configFilePath) {
         "judgeSamples",
         "judgeTemperature",
         "agentTemperature",
-        "tokenPricing"
+        "tokenPricing",
+        "toolMaxTurns",
+        "toolMaxArgumentsBytes",
+        "toolMaxResultBytes"
     ]);
     const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
     if (unknown.length > 0) {
@@ -296,6 +313,21 @@ function applyEnvOverrides(base, env) {
         patched.agentTemperature = value;
         overridden = true;
     }
+    const readPositiveInt = (name, key, label) => {
+        const raw = read(name);
+        if (!raw)
+            return;
+        const value = parseNumericEnv(name, raw);
+        if (!Number.isInteger(value) || value < 1) {
+            throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
+        }
+        patched[key] = value;
+        overridden = true;
+        void label;
+    };
+    readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
+    readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
+    readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
     const apiKey = read("CCLAW_EVAL_API_KEY");
     return { patched, overridden, apiKey };
 }

package/dist/eval/llm-client.d.ts CHANGED Viewed

@@ -5,6 +5,16 @@ export interface ChatMessage {
     content: string;
     name?: string;
     toolCallId?: string;
+    /**
+     * OpenAI-style tool calls carried on a preceding assistant message.
+     * Populated by the Tier B loop so the wire transcript stays
+     * consistent (assistant message → tool responses).
+     */
+    toolCalls?: Array<{
+        id: string;
+        name: string;
+        arguments: string;
+    }>;
 }
 export interface ChatRequest {
     model: string;

package/dist/eval/llm-client.js CHANGED Viewed

@@ -149,7 +149,16 @@ function buildBody(request) {
             role: m.role,
             content: m.content,
             ...(m.name !== undefined ? { name: m.name } : {}),
-            ...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {})
+            ...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {}),
+            ...(m.toolCalls && m.toolCalls.length > 0
+                ? {
+                    tool_calls: m.toolCalls.map((call) => ({
+                        id: call.id,
+                        type: "function",
+                        function: { name: call.name, arguments: call.arguments }
+                    }))
+                }
+                : {})
         }))
     };
     if (request.maxTokens !== undefined)

package/dist/eval/report.js CHANGED Viewed

@@ -75,6 +75,25 @@ export function formatMarkdownReport(report) {
         lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
     }
     lines.push(``);
+    const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
+    if (toolCases.length > 0) {
+        lines.push(`## Tool use`);
+        lines.push(``);
+        lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
+        lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
+        for (const item of toolCases) {
+            const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
+            const toolUse = verifier?.details?.toolUse;
+            if (!toolUse)
+                continue;
+            const byTool = Object.entries(toolUse.byTool)
+                .map(([name, count]) => `${name}=${count}`)
+                .join(", ");
+            const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
+            lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
+        }
+        lines.push(``);
+    }
     const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
     if (judgeCases.length > 0) {
         lines.push(`## Judge scores`);

package/dist/eval/runner.js CHANGED Viewed

@@ -2,6 +2,7 @@ import { randomUUID } from "node:crypto";
 import { CCLAW_VERSION } from "../constants.js";
 import { FLOW_STAGES } from "../types.js";
 import { runSingleShot } from "./agents/single-shot.js";
+import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
 import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
@@ -39,8 +40,9 @@ function resolveRunFlags(options) {
     const rulesRequested = options.rules === true;
     const schemaOnly = options.schemaOnly === true;
     const judgeRequested = options.judge === true;
+    const tier = options.tier ?? "A";
     const runJudge = judgeRequested && !schemaOnly;
-    const runAgent = runJudge && (options.tier ?? "A") === "A";
+    const runAgent = runJudge && (tier === "A" || tier === "B");
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
@@ -94,7 +96,7 @@ async function runCase(ctx) {
     const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
     let artifact;
     if (needsArtifact) {
-        if (flags.runAgent && judgeRequested && client) {
+        if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
             try {
                 const produced = await runSingleShot({
                     caseEntry,
@@ -133,6 +135,52 @@ async function runCase(ctx) {
                 });
             }
         }
+        else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
+            try {
+                const produced = await runWithTools({
+                    caseEntry,
+                    config,
+                    projectRoot,
+                    client
+                });
+                artifact = produced.artifact;
+                caseCostUsd += produced.usageUsd;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:with-tools",
+                    ok: true,
+                    score: 1,
+                    message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
+                        `${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
+                        `(${produced.toolUse.calls} tool call(s))`,
+                    details: {
+                        model: produced.model,
+                        tokensIn: produced.usage.promptTokens,
+                        tokensOut: produced.usage.completionTokens,
+                        usageUsd: produced.usageUsd,
+                        attempts: produced.attempts,
+                        toolUse: produced.toolUse
+                    }
+                });
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:with-tools",
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: {
+                        retryable,
+                        ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
+                    }
+                });
+            }
+        }
         else {
             artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
         }

package/dist/eval/sandbox.d.ts ADDED Viewed

@@ -0,0 +1,38 @@
+/**
+ * Error that `Sandbox.resolve` is documented to throw when a requested path
+ * resolves outside the sandbox boundary.
+ */
+export declare class SandboxEscapeError extends Error {
+    /** Presumably the path exactly as the caller requested it — confirm against sandbox.js. */
+    readonly requestedPath: string;
+    constructor(requestedPath: string, reason: string);
+}
+/** Options accepted by `createSandbox`. */
+export interface SandboxOptions {
+    /** Project root that `contextFiles` are resolved against. */
+    projectRoot: string;
+    /** Case-relative paths to copy into the sandbox before the agent starts. */
+    contextFiles?: string[];
+    /**
+     * Base directory that will host the per-case tmpdir. Defaults to
+     * `os.tmpdir()`. Tests inject a repo-local path so CI leaves no
+     * traces in `/tmp` when assertions fail.
+     */
+    baseDir?: string;
+    /** Override the per-case suffix. Primarily for deterministic tests. */
+    idOverride?: string;
+}
+/** Handle to a per-case sandbox directory, as returned by `createSandbox`. */
+export interface Sandbox {
+    /** Absolute path to the sandbox root directory. */
+    root: string;
+    /**
+     * Resolve `requested` relative to the sandbox root and return the
+     * absolute, realpath'd filesystem path. Throws
+     * `SandboxEscapeError` when the resolution crosses the boundary.
+     *
+     * `allowMissing: true` lets callers pre-resolve a destination for a
+     * write where the final component doesn't exist yet — the parent
+     * directory is realpath'd to still catch symlink escapes.
+     */
+    resolve(requested: string, options?: {
+        allowMissing?: boolean;
+    }): Promise<string>;
+    /** Remove the sandbox directory. Idempotent. */
+    dispose(): Promise<void>;
+}
+/**
+ * Create and prep a fresh sandbox. Callers own cleanup via `dispose()`.
+ *
+ * @param options Project root, optional context files to copy in, and
+ *   optional overrides for the base directory and per-case suffix.
+ * @returns A promise for the ready-to-use sandbox handle.
+ */
+export declare function createSandbox(options: SandboxOptions): Promise<Sandbox>;