cclaw-cli 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -59,7 +59,7 @@ Commands:
59
59
  --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
60
60
  --schema-only Run only structural verifiers (default).
61
61
  --rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
62
- --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A also runs the single-shot agent-under-test.
62
+ --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B runs the sandbox tool-using agent (read_file/write_file/glob/grep).
63
63
  --dry-run Validate config + corpus, print summary, do not execute.
64
64
  --json Emit machine-readable JSON on stdout.
65
65
  --no-write Skip writing the report to .cclaw/evals/reports/.
@@ -79,6 +79,7 @@ Examples:
79
79
  cclaw eval --dry-run
80
80
  cclaw eval --stage=brainstorm --schema-only
81
81
  cclaw eval --judge --tier=A --stage=brainstorm
82
+ cclaw eval --judge --tier=B --stage=spec
82
83
 
83
84
  Docs: https://github.com/zuevrs/cclaw
84
85
  Issues: https://github.com/zuevrs/cclaw/issues
@@ -0,0 +1,31 @@
1
+ import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
2
+ import { createSandbox } from "../sandbox.js";
3
+ import type { SandboxTool } from "../tools/index.js";
4
+ import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
5
+ export declare class MaxTurnsExceededError extends Error {
6
+ readonly turns: number;
7
+ constructor(turns: number);
8
+ }
9
+ export interface WithToolsInput {
10
+ caseEntry: EvalCase;
11
+ config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes">;
12
+ projectRoot: string;
13
+ client: EvalLlmClient;
14
+ tools?: SandboxTool[];
15
+ /** Override for the SKILL.md loader (test hook). */
16
+ loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
17
+ /** Override for the sandbox factory (test hook). */
18
+ createSandboxFn?: typeof createSandbox;
19
+ }
20
+ export interface WithToolsOutput {
21
+ artifact: string;
22
+ usage: ChatUsage;
23
+ usageUsd: number;
24
+ model: string;
25
+ attempts: number;
26
+ durationMs: number;
27
+ toolUse: ToolUseSummary;
28
+ systemPrompt: string;
29
+ userPrompt: string;
30
+ }
31
+ export declare function runWithTools(input: WithToolsInput): Promise<WithToolsOutput>;
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Tier B with-tools agent.
3
+ *
4
+ * Multi-turn loop with OpenAI-style function-calling over a set of
5
+ * sandbox-confined tools. The AUT is given:
6
+ *
7
+ * - System prompt = stage SKILL.md (same contract as Tier A so the
8
+ * single-shot baseline is comparable).
9
+ * - User prompt = task description + a short "tools available" hint
10
+ * that names the sandbox root and the four built-in tools.
11
+ * - Tools = `read_file`, `write_file`, `glob`, `grep` (see
12
+ * `src/eval/tools/`).
13
+ *
14
+ * The loop runs up to `config.toolMaxTurns` turns (default 8). Each
15
+ * turn:
16
+ *
17
+ * 1. Send the current transcript to the model with tools enabled.
18
+ * 2. Commit token usage against the wrapped client (cost guard sees
19
+ * every call).
20
+ * 3. If the model returned tool_calls, execute each sandbox tool and
21
+ * append a `role: "tool"` message with the JSON-serialized result.
22
+ * 4. If the model returned no tool_calls, treat its assistant content (or a
23
+ * staged artifact file, see below) as the artifact and exit.
24
+ *
25
+ * When the turn budget is exhausted without a terminal stop, the agent
26
+ * throws `MaxTurnsExceededError`. The runner surfaces the error as a
27
+ * failed workflow verifier so the case counts as a regression.
28
+ *
29
+ * Artifact resolution: the final assistant content is the artifact. If
30
+ * the model used `write_file` to stage the artifact at
31
+ * `artifact.md` (or `artifact.txt` / `ARTIFACT.md` / any root-level `artifact.*` file), we prefer that file — it
32
+ * mirrors the Tier C workflow where writes are the deliverable. The
33
+ * fallback is the terminal assistant message so prompts that don't
34
+ * call write_file still produce something judgable.
35
+ */
36
+ import fs from "node:fs/promises";
37
+ import path from "node:path";
38
+ import { computeUsageUsd } from "../cost-guard.js";
39
+ import { createSandbox } from "../sandbox.js";
40
+ import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
41
+ import { loadStageSkill } from "./single-shot.js";
42
/**
 * Error thrown by the Tier B loop when every allowed turn has been used
 * and the model still has not replied without tool calls.
 */
export class MaxTurnsExceededError extends Error {
    /** The turn budget that was exhausted. */
    turns;
    /**
     * @param turns The exhausted turn budget; it is embedded in the message.
     */
    constructor(turns) {
        const budgetMessage = `Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`;
        super(budgetMessage);
        this.turns = turns;
        this.name = "MaxTurnsExceededError";
    }
}
50
/** Turn budget used when `config.toolMaxTurns` is missing or unusable. */
const DEFAULT_MAX_TURNS = 8;
/** Largest accepted tool-call arguments payload, in bytes (64 KiB). */
const DEFAULT_MAX_ARG_BYTES = 1024 * 64;
/** Result-size limit handed to each tool invocation, in bytes (32 KiB). */
const DEFAULT_MAX_RESULT_BYTES = 1024 * 32;
/** Sandbox-relative file names probed, in this order, for a staged artifact. */
const ARTIFACT_CANDIDATES = [
    "artifact.md",
    "artifact.txt",
    "ARTIFACT.md"
];
54
/**
 * Run one eval case through the Tier B tool-using agent loop.
 *
 * The stage skill is loaded as the system prompt, a sandbox is created
 * (seeded with the case's context files when it has any), and the model
 * is then called for up to `maxTurns` turns with the tool descriptors
 * attached. A response without tool calls ends the run: the artifact is
 * resolved from the sandbox or from the reply text and returned with
 * the accumulated usage. Responses with tool calls are executed one call
 * at a time and each outcome is appended as a tool message.
 *
 * Bookkeeping in `toolUse`:
 *  - `byTool` counts every requested call, including rejected ones;
 *  - `calls` counts only invocations that returned `ok`;
 *  - `errors` counts rejected calls and invocations that returned not-ok;
 *  - `deniedPaths` collects distinct `details.deniedPath` strings.
 *
 * @param input Case, config subset, client, and optional test hooks.
 * @returns The artifact plus usage, cost, timing, and tool statistics.
 * @throws MaxTurnsExceededError when the turn budget is used up before
 *   the model replies without tool calls. Errors from the client, the
 *   tools, or the helpers propagate unchanged.
 */
export async function runWithTools(input) {
    const { caseEntry, config, projectRoot, client } = input;
    const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
    const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
    const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
    const systemPrompt = await loader(caseEntry.stage);
    const tools = input.tools ?? BUILTIN_TOOLS;
    const toolMap = toolsByName(tools);
    const toolsBody = toolsForRequest(tools);
    const sandboxFactory = input.createSandboxFn ?? createSandbox;
    const sandbox = await sandboxFactory({
        projectRoot,
        ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
    });
    // From here on the sandbox exists, so all remaining work (including
    // prompt construction, which can throw on a malformed case) runs
    // inside try/finally to guarantee the sandbox is disposed.
    try {
        const toolUse = {
            turns: 0,
            calls: 0,
            errors: 0,
            deniedPaths: [],
            byTool: {}
        };
        const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
        let lastModel = config.model;
        let totalAttempts = 0;
        const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
        const messages = [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt }
        ];
        // Record a call that is refused before any tool runs and tell the
        // model why, so the transcript keeps one tool message per call id.
        const rejectCall = (call, error) => {
            toolUse.errors += 1;
            bumpToolCount(toolUse, call.name);
            messages.push(toolResponseMessage(call.id, {
                ok: false,
                name: call.name,
                error
            }));
        };
        const started = Date.now();
        for (let turn = 0; turn < maxTurns; turn += 1) {
            toolUse.turns = turn + 1;
            const response = await client.chat({
                model: config.model,
                messages,
                temperature: config.agentTemperature ?? 0.2,
                timeoutMs: config.timeoutMs,
                tools: toolsBody,
                toolChoice: "auto"
            });
            usage.promptTokens += response.usage.promptTokens;
            usage.completionTokens += response.usage.completionTokens;
            usage.totalTokens += response.usage.totalTokens;
            lastModel = response.model;
            totalAttempts += response.attempts;
            const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
            // The assistant turn is stored before its tool responses so the
            // transcript order stays assistant -> tool messages.
            messages.push(rememberAssistant(response.content, response.toolCalls));
            if (!hasToolCalls) {
                const artifact = await resolveArtifact(sandbox, response.content);
                return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
            }
            for (const call of response.toolCalls) {
                const tool = toolMap.get(call.name);
                const argBytes = Buffer.byteLength(call.arguments ?? "", "utf8");
                // The size check comes first, so an oversized payload is
                // reported as such even when the tool name is unknown.
                if (argBytes > maxArgBytes) {
                    rejectCall(call, `arguments payload exceeds ${maxArgBytes} bytes`);
                    continue;
                }
                if (!tool) {
                    rejectCall(call, `unknown tool "${call.name}"`);
                    continue;
                }
                bumpToolCount(toolUse, call.name);
                const result = await tool.invoke(call.arguments ?? "", {
                    sandbox,
                    maxResultBytes
                });
                if (!result.ok) {
                    toolUse.errors += 1;
                    const denied = result.details && typeof result.details.deniedPath === "string"
                        ? result.details.deniedPath
                        : undefined;
                    if (denied && !toolUse.deniedPaths.includes(denied)) {
                        toolUse.deniedPaths.push(denied);
                    }
                }
                else {
                    toolUse.calls += 1;
                }
                messages.push(toolResponseMessage(call.id, result));
            }
        }
        throw new MaxTurnsExceededError(maxTurns);
    }
    finally {
        await sandbox.dispose();
    }
}
156
/**
 * Assemble the result object for a finished run.
 *
 * Prices the accumulated usage with the configured token pricing, trims
 * the artifact, and stamps the elapsed time measured from `started`.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
    const pricing = { tokenPricing: config.tokenPricing };
    const usageUsd = computeUsageUsd(model, usage, pricing);
    const trimmedArtifact = artifact.trim();
    const durationMs = Date.now() - started;
    return {
        artifact: trimmedArtifact,
        usage,
        usageUsd,
        model,
        attempts,
        durationMs,
        toolUse,
        systemPrompt,
        userPrompt
    };
}
172
/**
 * Build the assistant transcript entry for a model response.
 *
 * `toolCalls` is attached only when it is a non-empty list; otherwise
 * the entry carries just the role and the content.
 */
function rememberAssistant(content, toolCalls) {
    const carriesCalls = Boolean(toolCalls) && toolCalls.length > 0;
    return carriesCalls
        ? { role: "assistant", content, toolCalls }
        : { role: "assistant", content };
}
178
/**
 * Wrap a tool outcome as a `role: "tool"` transcript message.
 *
 * The outcome is reduced to `{ ok, content | error, details }`,
 * serialized to JSON, and passed through `truncatePayload`.
 *
 * NOTE(review): the truncation limit here is a fixed 32 KiB; this
 * function has no access to the configured `toolMaxResultBytes`, so
 * raising that setting does not raise this limit. Confirm that is intended.
 */
function toolResponseMessage(callId, result) {
    let payload;
    if (result.ok) {
        payload = { ok: true, content: result.content, details: result.details ?? {} };
    }
    else {
        payload = { ok: false, error: result.error, details: result.details ?? {} };
    }
    const serialized = JSON.stringify(payload);
    const content = truncatePayload(serialized, 32 * 1024);
    return {
        role: "tool",
        content,
        toolCallId: callId,
        name: result.name
    };
}
189
/** Increment the per-tool counter for `name`, starting from zero on first use. */
function bumpToolCount(summary, name) {
    const counts = summary.byTool;
    const previous = counts[name] ?? 0;
    counts[name] = previous + 1;
}
192
/**
 * Normalize a numeric limit to a positive whole number.
 *
 * @param value Candidate limit; may be undefined or otherwise unusable.
 * @param fallback Value returned when `value` cannot be used.
 * @returns `value` rounded down, or `fallback` when `value` is not a
 *   finite number or rounds down to less than 1.
 */
function clampPositive(value, fallback) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
        return fallback;
    }
    // Round down before the range check: a fraction such as 0.5 is
    // greater than zero but floors to 0, which would otherwise yield a
    // zero turn or byte budget.
    const whole = Math.floor(value);
    return whole >= 1 ? whole : fallback;
}
199
/**
 * Compose the user prompt for a Tier B run.
 *
 * The prompt names the stage and case id, states the sandbox root,
 * lists each tool as `- name: description`, lists the seeded context
 * files (or a placeholder when there are none), includes the trimmed
 * task text, and ends with the output instructions.
 */
function buildUserPrompt(caseEntry, sandbox, tools) {
    const toolBullets = tools.map((tool) => {
        const { name, description } = tool.descriptor;
        return `- ${name}: ${description}`;
    });
    const seeded = caseEntry.contextFiles ?? [];
    let seededBlock = "(no files seeded)";
    if (seeded.length > 0) {
        seededBlock = seeded.map((file) => `- ${file}`).join("\n");
    }
    const header = [
        `Stage: ${caseEntry.stage}`,
        `Case id: ${caseEntry.id}`,
        ``,
        `Sandbox root: ${sandbox.root}`,
        `You may call the following tools to read or modify files inside the sandbox.`,
        `All paths are relative to the sandbox root.`,
        ``,
        `Tools:`
    ];
    const context = [
        ``,
        `Seeded context files (available under the sandbox root):`,
        seededBlock,
        ``
    ];
    const task = [
        `Task:`,
        caseEntry.inputPrompt.trim(),
        ``
    ];
    const closing = [
        `When you are done, reply with the artifact as the final assistant message.`,
        `Output the artifact directly (markdown with optional YAML frontmatter).`,
        `Do not wrap in code fences, do not add commentary before or after.`,
        `You may optionally write the artifact to \`artifact.md\` in the sandbox; ` +
            `if you do, the last written \`artifact.md\` is preferred over the chat reply.`
    ];
    return header.concat(toolBullets, context, task, closing).join("\n");
}
230
/**
 * Pick the artifact text for a finished run.
 *
 * Resolution order:
 *  1. the first name in `ARTIFACT_CANDIDATES` that resolves inside the
 *     sandbox to a regular file;
 *  2. otherwise the alphabetically first regular file directly under
 *     the sandbox root whose name starts with `artifact.` (any case);
 *  3. otherwise `fallback` (the final assistant reply).
 *
 * Filesystem and sandbox errors are treated as "not found" on purpose,
 * so a missing or unreadable file never fails the run.
 *
 * @param sandbox Sandbox providing `root` and `resolve`.
 * @param fallback Text returned when no artifact file is found.
 * @returns The artifact file's contents, or `fallback`.
 */
async function resolveArtifact(sandbox, fallback) {
    for (const candidate of ARTIFACT_CANDIDATES) {
        try {
            const abs = await sandbox.resolve(candidate);
            const stat = await fs.stat(abs);
            if (stat.isFile()) {
                return await fs.readFile(abs, "utf8");
            }
        }
        catch {
            continue;
        }
    }
    try {
        const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
        // Directory listing order varies by platform, so sort the matches
        // by name to make the choice reproducible across runs.
        const matches = entries
            .filter((entry) => entry.isFile() && /^artifact\./i.test(entry.name))
            .map((entry) => entry.name)
            .sort((a, b) => (a < b ? -1 : a > b ? 1 : 0));
        if (matches.length > 0) {
            return await fs.readFile(path.join(sandbox.root, matches[0]), "utf8");
        }
    }
    catch {
        // fall through to fallback
    }
    return fallback;
}
@@ -32,7 +32,10 @@ const NUMERIC_ENVS = new Set([
32
32
  "CCLAW_EVAL_MAX_RETRIES",
33
33
  "CCLAW_EVAL_JUDGE_SAMPLES",
34
34
  "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
- "CCLAW_EVAL_AGENT_TEMPERATURE"
35
+ "CCLAW_EVAL_AGENT_TEMPERATURE",
36
+ "CCLAW_EVAL_TOOL_MAX_TURNS",
37
+ "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
+ "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
36
39
  ]);
37
40
  function evalConfigError(configFilePath, reason) {
38
41
  return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -152,6 +155,17 @@ function validateFileConfig(raw, configFilePath) {
152
155
  }
153
156
  out.tokenPricing = pricing;
154
157
  }
158
+ const assignPositiveInt = (key, value, label) => {
159
+ if (value === undefined)
160
+ return;
161
+ if (!Number.isInteger(value) || value < 1) {
162
+ throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
163
+ }
164
+ out[key] = value;
165
+ };
166
+ assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
167
+ assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
168
+ assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
155
169
  if (raw.regression !== undefined) {
156
170
  if (!isRecord(raw.regression)) {
157
171
  throw evalConfigError(configFilePath, `"regression" must be a mapping`);
@@ -186,7 +200,10 @@ function validateFileConfig(raw, configFilePath) {
186
200
  "judgeSamples",
187
201
  "judgeTemperature",
188
202
  "agentTemperature",
189
- "tokenPricing"
203
+ "tokenPricing",
204
+ "toolMaxTurns",
205
+ "toolMaxArgumentsBytes",
206
+ "toolMaxResultBytes"
190
207
  ]);
191
208
  const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
192
209
  if (unknown.length > 0) {
@@ -296,6 +313,21 @@ function applyEnvOverrides(base, env) {
296
313
  patched.agentTemperature = value;
297
314
  overridden = true;
298
315
  }
316
+ const readPositiveInt = (name, key, label) => {
317
+ const raw = read(name);
318
+ if (!raw)
319
+ return;
320
+ const value = parseNumericEnv(name, raw);
321
+ if (!Number.isInteger(value) || value < 1) {
322
+ throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
323
+ }
324
+ patched[key] = value;
325
+ overridden = true;
326
+ void label;
327
+ };
328
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
329
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
330
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
299
331
  const apiKey = read("CCLAW_EVAL_API_KEY");
300
332
  return { patched, overridden, apiKey };
301
333
  }
@@ -5,6 +5,16 @@ export interface ChatMessage {
5
5
  content: string;
6
6
  name?: string;
7
7
  toolCallId?: string;
8
+ /**
9
+ * OpenAI-style tool calls carried on a preceding assistant message.
10
+ * Populated by the Tier B loop so the wire transcript stays
11
+ * consistent (assistant message → tool responses).
12
+ */
13
+ toolCalls?: Array<{
14
+ id: string;
15
+ name: string;
16
+ arguments: string;
17
+ }>;
8
18
  }
9
19
  export interface ChatRequest {
10
20
  model: string;
@@ -149,7 +149,16 @@ function buildBody(request) {
149
149
  role: m.role,
150
150
  content: m.content,
151
151
  ...(m.name !== undefined ? { name: m.name } : {}),
152
- ...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {})
152
+ ...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {}),
153
+ ...(m.toolCalls && m.toolCalls.length > 0
154
+ ? {
155
+ tool_calls: m.toolCalls.map((call) => ({
156
+ id: call.id,
157
+ type: "function",
158
+ function: { name: call.name, arguments: call.arguments }
159
+ }))
160
+ }
161
+ : {})
153
162
  }))
154
163
  };
155
164
  if (request.maxTokens !== undefined)
@@ -75,6 +75,25 @@ export function formatMarkdownReport(report) {
75
75
  lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
76
76
  }
77
77
  lines.push(``);
78
+ const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
79
+ if (toolCases.length > 0) {
80
+ lines.push(`## Tool use`);
81
+ lines.push(``);
82
+ lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
83
+ lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
84
+ for (const item of toolCases) {
85
+ const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
86
+ const toolUse = verifier?.details?.toolUse;
87
+ if (!toolUse)
88
+ continue;
89
+ const byTool = Object.entries(toolUse.byTool)
90
+ .map(([name, count]) => `${name}=${count}`)
91
+ .join(", ");
92
+ const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
93
+ lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
94
+ }
95
+ lines.push(``);
96
+ }
78
97
  const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
79
98
  if (judgeCases.length > 0) {
80
99
  lines.push(`## Judge scores`);
@@ -2,6 +2,7 @@ import { randomUUID } from "node:crypto";
2
2
  import { CCLAW_VERSION } from "../constants.js";
3
3
  import { FLOW_STAGES } from "../types.js";
4
4
  import { runSingleShot } from "./agents/single-shot.js";
5
+ import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
5
6
  import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
6
7
  import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
7
8
  import { loadEvalConfig } from "./config-loader.js";
@@ -39,8 +40,9 @@ function resolveRunFlags(options) {
39
40
  const rulesRequested = options.rules === true;
40
41
  const schemaOnly = options.schemaOnly === true;
41
42
  const judgeRequested = options.judge === true;
43
+ const tier = options.tier ?? "A";
42
44
  const runJudge = judgeRequested && !schemaOnly;
43
- const runAgent = runJudge && (options.tier ?? "A") === "A";
45
+ const runAgent = runJudge && (tier === "A" || tier === "B");
44
46
  return {
45
47
  runStructural: true,
46
48
  runRules: rulesRequested && !schemaOnly,
@@ -94,7 +96,7 @@ async function runCase(ctx) {
94
96
  const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
95
97
  let artifact;
96
98
  if (needsArtifact) {
97
- if (flags.runAgent && judgeRequested && client) {
99
+ if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
98
100
  try {
99
101
  const produced = await runSingleShot({
100
102
  caseEntry,
@@ -133,6 +135,52 @@ async function runCase(ctx) {
133
135
  });
134
136
  }
135
137
  }
138
+ else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
139
+ try {
140
+ const produced = await runWithTools({
141
+ caseEntry,
142
+ config,
143
+ projectRoot,
144
+ client
145
+ });
146
+ artifact = produced.artifact;
147
+ caseCostUsd += produced.usageUsd;
148
+ verifierResults.push({
149
+ kind: "workflow",
150
+ id: "agent:with-tools",
151
+ ok: true,
152
+ score: 1,
153
+ message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
154
+ `${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
155
+ `(${produced.toolUse.calls} tool call(s))`,
156
+ details: {
157
+ model: produced.model,
158
+ tokensIn: produced.usage.promptTokens,
159
+ tokensOut: produced.usage.completionTokens,
160
+ usageUsd: produced.usageUsd,
161
+ attempts: produced.attempts,
162
+ toolUse: produced.toolUse
163
+ }
164
+ });
165
+ }
166
+ catch (err) {
167
+ if (err instanceof DailyCostCapExceededError)
168
+ throw err;
169
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
170
+ const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
171
+ verifierResults.push({
172
+ kind: "workflow",
173
+ id: "agent:with-tools",
174
+ ok: false,
175
+ score: 0,
176
+ message: err instanceof Error ? err.message : String(err),
177
+ details: {
178
+ retryable,
179
+ ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
180
+ }
181
+ });
182
+ }
183
+ }
136
184
  else {
137
185
  artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
138
186
  }
@@ -0,0 +1,38 @@
1
/**
 * Error type that `Sandbox.resolve` is documented to throw when a
 * requested path resolves outside the sandbox boundary.
 */
export declare class SandboxEscapeError extends Error {
    /** The path that was requested. */
    readonly requestedPath: string;

    /**
     * @param requestedPath The path that was requested.
     * @param reason Explanation of the refusal (presumably used in the message — confirm in the implementation).
     */
    constructor(requestedPath: string, reason: string);
}
5
/** Options accepted by `createSandbox`. */
export interface SandboxOptions {
    /** Project root that `contextFiles` are resolved against. */
    projectRoot: string;

    /** Case-relative paths copied into the sandbox before the agent starts. */
    contextFiles?: Array<string>;

    /**
     * Directory that hosts the per-case tmpdir; `os.tmpdir()` when
     * omitted. Tests pass a repo-local path so that failed assertions
     * in CI leave nothing behind in `/tmp`.
     */
    baseDir?: string;

    /** Fixed per-case suffix, mainly for deterministic tests. */
    idOverride?: string;
}
19
/** Handle to a per-case sandbox directory. */
export interface Sandbox {
    /** Absolute path of the sandbox root directory. */
    root: string;

    /**
     * Resolve `requested` against the sandbox root and return the
     * absolute, realpath'd filesystem path.
     *
     * With `allowMissing: true` the final path component may not exist
     * yet (for example a write destination); the parent directory is
     * still realpath'd so that symlink escapes are caught.
     *
     * @throws SandboxEscapeError when the resolution crosses the sandbox boundary.
     */
    resolve(
        requested: string,
        options?: { allowMissing?: boolean }
    ): Promise<string>;

    /** Remove the sandbox directory. Idempotent. */
    dispose(): Promise<void>;
}
37
/**
 * Create and prepare a fresh sandbox.
 *
 * The caller owns cleanup and must call `dispose()` on the returned
 * sandbox when finished.
 *
 * @param options Project root, optional context files, and test overrides.
 * @returns The ready-to-use sandbox handle.
 */
export declare function createSandbox(
    options: SandboxOptions
): Promise<Sandbox>;