cclaw-cli 0.25.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +79 -4
- package/dist/eval/agents/with-tools.d.ts +44 -0
- package/dist/eval/agents/with-tools.js +261 -0
- package/dist/eval/agents/workflow.d.ts +24 -0
- package/dist/eval/agents/workflow.js +133 -0
- package/dist/eval/config-loader.js +38 -2
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +10 -0
- package/dist/eval/llm-client.js +10 -1
- package/dist/eval/report.js +54 -0
- package/dist/eval/runner.d.ts +10 -1
- package/dist/eval/runner.js +285 -20
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +152 -1
- package/dist/eval/types.js +21 -1
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/eval/runner.js
CHANGED
|
@@ -2,8 +2,11 @@ import { randomUUID } from "node:crypto";
|
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
4
|
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
|
+
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
6
|
+
import { runWorkflow } from "./agents/workflow.js";
|
|
5
7
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
6
8
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
9
|
+
import { loadWorkflowCorpus } from "./workflow-corpus.js";
|
|
7
10
|
import { loadEvalConfig } from "./config-loader.js";
|
|
8
11
|
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
9
12
|
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
@@ -12,6 +15,7 @@ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
|
12
15
|
import { verifyRules } from "./verifiers/rules.js";
|
|
13
16
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
14
17
|
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
18
|
+
import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
|
|
15
19
|
function groupByStage(cases) {
|
|
16
20
|
return cases.reduce((acc, item) => {
|
|
17
21
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
@@ -39,8 +43,15 @@ function resolveRunFlags(options) {
|
|
|
39
43
|
const rulesRequested = options.rules === true;
|
|
40
44
|
const schemaOnly = options.schemaOnly === true;
|
|
41
45
|
const judgeRequested = options.judge === true;
|
|
46
|
+
const tier = options.tier ?? "A";
|
|
42
47
|
const runJudge = judgeRequested && !schemaOnly;
|
|
43
|
-
|
|
48
|
+
// Tier C always needs the agent loop (no fixture fallback for workflows),
|
|
49
|
+
// so we still require an LLM client but we do NOT require --judge on the
|
|
50
|
+
// CLI to produce a workflow run. The judge piece itself stays gated by
|
|
51
|
+
// `runJudge` so consistency-only runs are cheap and deterministic.
|
|
52
|
+
const runAgent = tier === "C"
|
|
53
|
+
? !schemaOnly
|
|
54
|
+
: runJudge && (tier === "A" || tier === "B");
|
|
44
55
|
return {
|
|
45
56
|
runStructural: true,
|
|
46
57
|
runRules: rulesRequested && !schemaOnly,
|
|
@@ -81,6 +92,184 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
|
81
92
|
return undefined;
|
|
82
93
|
}
|
|
83
94
|
}
|
|
95
|
+
/**
 * Build the per-stage judge hint from a workflow step.
 *
 * Only the judge-related overrides that are actually set (truthy) on the
 * step are carried over, so an unset field never shadows rubric defaults.
 *
 * @param {object} step Workflow stage definition.
 * @returns {object} Hint holding any of `rubric`, `requiredChecks`,
 *   `minimumScores` that the step provides.
 */
function stageJudgeHint(step) {
    const overrideKeys = ["rubric", "requiredChecks", "minimumScores"];
    const hint = {};
    for (const key of overrideKeys) {
        if (step[key]) {
            hint[key] = step[key];
        }
    }
    return hint;
}
|
|
105
|
+
/**
 * Run one Tier C workflow case and fold every verifier outcome into a single
 * case result.
 *
 * Steps, in order:
 *  1. Fail fast with a `workflow:agent:disabled` verifier when the agent is
 *     switched off or no LLM client was supplied.
 *  2. Execute the workflow through `runWorkflow`. Any failure other than a
 *     daily cost-cap breach is recorded as `workflow:agent:error`.
 *  3. When `flags.runJudge` is set, judge each stage artifact against the
 *     rubric registered under the stage name.
 *  4. Run the cross-stage consistency verifier over the produced artifacts.
 *
 * @param {object} ctx
 * @param {string} ctx.projectRoot Project root handed to the workflow agent.
 * @param {object} ctx.workflow Workflow case (`id`, `stages`, `consistency`).
 * @param {string} ctx.plannedTier Tier label echoed into the result.
 * @param {object} ctx.flags Resolved run flags (`runAgent`, `runJudge`).
 * @param {object} ctx.config Eval configuration forwarded to agent and judge.
 * @param {object} [ctx.client] LLM client; the case fails when it is absent.
 * @param {Map<string, object>} ctx.rubrics Rubrics keyed by stage name.
 * @returns {Promise<object>} Case result: `caseId`, `stage` (name of the last
 *   workflow stage, `"plan"` when there are none), `tier`, `passed`,
 *   `durationMs`, `verifierResults`, plus `costUsd` and `workflow` once the
 *   agent has actually run.
 * @throws {DailyCostCapExceededError} Re-thrown untouched so the caller can
 *   abort the whole run.
 */
async function runWorkflowCase(ctx) {
    const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
    const started = Date.now();
    const verifierResults = [];
    let caseCostUsd = 0;
    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ?? "plan";
    // Shared result shape for the paths that end before the agent produced output.
    const failedCase = () => ({
        caseId: workflow.id,
        stage: lastStage,
        tier: plannedTier,
        passed: false,
        durationMs: Date.now() - started,
        verifierResults
    });
    if (!flags.runAgent || !client) {
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:disabled",
            ok: false,
            score: 0,
            message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                "Re-run with credentials to execute the workflow.",
            details: { stages: workflow.stages.map((s) => s.name) }
        });
        return failedCase();
    }
    let workflowResult;
    try {
        workflowResult = await runWorkflow({
            workflow,
            config,
            projectRoot,
            client
        });
    }
    catch (err) {
        // A cost-cap breach must stop the entire run, not just this case.
        if (err instanceof DailyCostCapExceededError) {
            throw err;
        }
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:error",
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
        return failedCase();
    }
    caseCostUsd += workflowResult.totalUsageUsd;
    const stageResults = [...workflowResult.stages];
    verifierResults.push({
        kind: "workflow",
        id: "workflow:agent",
        ok: true,
        score: 1,
        message: `workflow ran ${stageResults.length} stage(s) in ` +
            `${workflowResult.totalDurationMs}ms ` +
            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
        details: {
            stages: stageResults.map((s) => ({
                name: s.stage,
                durationMs: s.durationMs,
                usageUsd: s.usageUsd,
                turns: s.toolUse.turns,
                calls: s.toolUse.calls
            }))
        }
    });
    let allJudgeOk = true;
    if (flags.runJudge) {
        for (const [i, step] of workflow.stages.entries()) {
            // Stage results are matched to workflow stages by position.
            const stageResult = stageResults[i];
            if (!stageResult) {
                // The agent returned fewer stage results than the workflow
                // declares. Record the gap instead of dereferencing undefined,
                // which would abort the whole eval run with a TypeError.
                verifierResults.push({
                    kind: "judge",
                    id: `judge:stage:missing:${step.name}`,
                    ok: false,
                    score: 0,
                    message: `Workflow produced no result for stage "${step.name}"; nothing to judge.`,
                    details: { stage: step.name }
                });
                allJudgeOk = false;
                continue;
            }
            const rubric = rubrics.get(step.name);
            if (!rubric) {
                verifierResults.push({
                    kind: "judge",
                    id: `judge:rubric:missing:${step.name}`,
                    ok: false,
                    score: 0,
                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
                    details: { stage: step.name }
                });
                allJudgeOk = false;
                stageResult.judgeOk = false;
                continue;
            }
            const hint = stageJudgeHint(step);
            try {
                const invocation = await runJudge({
                    artifact: stageResult.artifact,
                    rubric,
                    config,
                    client,
                    caseHint: hint
                });
                caseCostUsd += invocation.usageUsd;
                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
                const medians = {};
                for (const agg of invocation.aggregates) {
                    medians[agg.checkId] = agg.median;
                }
                stageResult.judgeMedians = medians;
                const stageOk = judgeVerifiers.every((v) => v.ok);
                stageResult.judgeOk = stageOk;
                if (!stageOk) {
                    allJudgeOk = false;
                }
                // Suffix ids with the stage name so verifiers from different
                // stages stay distinguishable in the flat result list.
                for (const v of judgeVerifiers) {
                    verifierResults.push({
                        ...v,
                        id: `${v.id}:${step.name}`,
                        details: { ...(v.details ?? {}), stage: step.name }
                    });
                }
            }
            catch (err) {
                if (err instanceof DailyCostCapExceededError) {
                    throw err;
                }
                const retryable = err instanceof EvalLlmError ? err.retryable : false;
                verifierResults.push({
                    kind: "judge",
                    id: `judge:invocation:error:${step.name}`,
                    ok: false,
                    score: 0,
                    message: err instanceof Error ? err.message : String(err),
                    details: { retryable, rubricId: rubric.id, stage: step.name }
                });
                stageResult.judgeOk = false;
                allJudgeOk = false;
            }
        }
    }
    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
    verifierResults.push(...consistencyResults);
    // Verifiers flagged `details.skipped` do not count against the case unless
    // every verifier was skipped.
    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
    const allOk = nonSkipped.length === 0
        ? verifierResults.every((r) => r.ok)
        : nonSkipped.every((r) => r.ok);
    const workflowSummary = {
        caseId: workflow.id,
        stages: stageResults,
        totalUsageUsd: workflowResult.totalUsageUsd,
        totalDurationMs: workflowResult.totalDurationMs,
        allJudgeOk: flags.runJudge ? allJudgeOk : true
    };
    return {
        caseId: workflow.id,
        stage: lastStage,
        tier: plannedTier,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults,
        workflow: workflowSummary
    };
}
|
|
84
273
|
async function runCase(ctx) {
|
|
85
274
|
const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
|
|
86
275
|
const started = Date.now();
|
|
@@ -94,7 +283,7 @@ async function runCase(ctx) {
|
|
|
94
283
|
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
95
284
|
let artifact;
|
|
96
285
|
if (needsArtifact) {
|
|
97
|
-
if (flags.runAgent && judgeRequested && client) {
|
|
286
|
+
if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
|
|
98
287
|
try {
|
|
99
288
|
const produced = await runSingleShot({
|
|
100
289
|
caseEntry,
|
|
@@ -133,6 +322,52 @@ async function runCase(ctx) {
|
|
|
133
322
|
});
|
|
134
323
|
}
|
|
135
324
|
}
|
|
325
|
+
else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
|
|
326
|
+
try {
|
|
327
|
+
const produced = await runWithTools({
|
|
328
|
+
caseEntry,
|
|
329
|
+
config,
|
|
330
|
+
projectRoot,
|
|
331
|
+
client
|
|
332
|
+
});
|
|
333
|
+
artifact = produced.artifact;
|
|
334
|
+
caseCostUsd += produced.usageUsd;
|
|
335
|
+
verifierResults.push({
|
|
336
|
+
kind: "workflow",
|
|
337
|
+
id: "agent:with-tools",
|
|
338
|
+
ok: true,
|
|
339
|
+
score: 1,
|
|
340
|
+
message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
|
|
341
|
+
`${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
|
|
342
|
+
`(${produced.toolUse.calls} tool call(s))`,
|
|
343
|
+
details: {
|
|
344
|
+
model: produced.model,
|
|
345
|
+
tokensIn: produced.usage.promptTokens,
|
|
346
|
+
tokensOut: produced.usage.completionTokens,
|
|
347
|
+
usageUsd: produced.usageUsd,
|
|
348
|
+
attempts: produced.attempts,
|
|
349
|
+
toolUse: produced.toolUse
|
|
350
|
+
}
|
|
351
|
+
});
|
|
352
|
+
}
|
|
353
|
+
catch (err) {
|
|
354
|
+
if (err instanceof DailyCostCapExceededError)
|
|
355
|
+
throw err;
|
|
356
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
357
|
+
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
358
|
+
verifierResults.push({
|
|
359
|
+
kind: "workflow",
|
|
360
|
+
id: "agent:with-tools",
|
|
361
|
+
ok: false,
|
|
362
|
+
score: 0,
|
|
363
|
+
message: err instanceof Error ? err.message : String(err),
|
|
364
|
+
details: {
|
|
365
|
+
retryable,
|
|
366
|
+
...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
|
|
367
|
+
}
|
|
368
|
+
});
|
|
369
|
+
}
|
|
370
|
+
}
|
|
136
371
|
else {
|
|
137
372
|
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
138
373
|
}
|
|
@@ -279,18 +514,22 @@ function stagesInResults(caseResults) {
|
|
|
279
514
|
*/
|
|
280
515
|
export async function runEval(options) {
|
|
281
516
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
282
|
-
const corpus = await loadCorpus(options.projectRoot, options.stage);
|
|
283
517
|
const plannedTier = options.tier ?? config.defaultTier;
|
|
518
|
+
const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
|
|
519
|
+
const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
|
|
284
520
|
const notes = [];
|
|
285
|
-
if (corpus.length === 0) {
|
|
521
|
+
if (plannedTier !== "C" && corpus.length === 0) {
|
|
286
522
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
287
523
|
}
|
|
524
|
+
if (plannedTier === "C" && workflowCorpus.length === 0) {
|
|
525
|
+
notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
|
|
526
|
+
}
|
|
288
527
|
const flags = resolveRunFlags(options);
|
|
289
528
|
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
290
529
|
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
291
530
|
}
|
|
292
|
-
if (
|
|
293
|
-
notes.push("Tier
|
|
531
|
+
if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
|
|
532
|
+
notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
|
|
294
533
|
}
|
|
295
534
|
if (options.dryRun === true) {
|
|
296
535
|
const summary = {
|
|
@@ -301,12 +540,20 @@ export async function runEval(options) {
|
|
|
301
540
|
byStage: groupByStage(corpus),
|
|
302
541
|
cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
|
|
303
542
|
},
|
|
543
|
+
workflowCorpus: {
|
|
544
|
+
total: workflowCorpus.length,
|
|
545
|
+
cases: workflowCorpus.map((item) => ({
|
|
546
|
+
id: item.id,
|
|
547
|
+
stages: item.stages.map((s) => s.name)
|
|
548
|
+
}))
|
|
549
|
+
},
|
|
304
550
|
plannedTier,
|
|
305
551
|
verifiersAvailable: {
|
|
306
552
|
structural: flags.runStructural,
|
|
307
553
|
rules: flags.runRules,
|
|
308
554
|
judge: flags.runJudge,
|
|
309
|
-
workflow: flags.runAgent
|
|
555
|
+
workflow: flags.runAgent,
|
|
556
|
+
consistency: plannedTier === "C"
|
|
310
557
|
},
|
|
311
558
|
notes
|
|
312
559
|
};
|
|
@@ -314,26 +561,44 @@ export async function runEval(options) {
|
|
|
314
561
|
}
|
|
315
562
|
const costGuard = createCostGuard(options.projectRoot, config);
|
|
316
563
|
let wrappedClient;
|
|
317
|
-
|
|
564
|
+
const clientNeeded = flags.runJudge || plannedTier === "C";
|
|
565
|
+
if (clientNeeded) {
|
|
318
566
|
const base = options.llmClient ?? createEvalClient(config);
|
|
319
567
|
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
320
568
|
}
|
|
321
|
-
const
|
|
569
|
+
const rubricsNeeded = flags.runJudge;
|
|
570
|
+
const rubrics = rubricsNeeded
|
|
322
571
|
? await loadAllRubrics(options.projectRoot)
|
|
323
572
|
: new Map();
|
|
324
573
|
const now = new Date().toISOString();
|
|
325
574
|
const caseResults = [];
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
575
|
+
if (plannedTier === "C") {
|
|
576
|
+
for (const wf of workflowCorpus) {
|
|
577
|
+
caseResults.push(await runWorkflowCase({
|
|
578
|
+
projectRoot: options.projectRoot,
|
|
579
|
+
workflow: wf,
|
|
580
|
+
plannedTier,
|
|
581
|
+
flags,
|
|
582
|
+
config,
|
|
583
|
+
client: wrappedClient,
|
|
584
|
+
costGuard,
|
|
585
|
+
rubrics
|
|
586
|
+
}));
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
else {
|
|
590
|
+
for (const item of corpus) {
|
|
591
|
+
caseResults.push(await runCase({
|
|
592
|
+
projectRoot: options.projectRoot,
|
|
593
|
+
caseEntry: item,
|
|
594
|
+
plannedTier,
|
|
595
|
+
flags,
|
|
596
|
+
config,
|
|
597
|
+
client: wrappedClient,
|
|
598
|
+
costGuard,
|
|
599
|
+
rubrics
|
|
600
|
+
}));
|
|
601
|
+
}
|
|
337
602
|
}
|
|
338
603
|
const stages = stagesInResults(caseResults);
|
|
339
604
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Error raised when the sandbox refuses a requested path.
 */
export declare class SandboxEscapeError extends Error {
    /** The path that was refused, as passed to the constructor. */
    readonly requestedPath: string;
    /**
     * @param requestedPath Path that is being refused.
     * @param reason Short explanation of why the path was refused.
     */
    constructor(requestedPath: string, reason: string);
}
|
|
5
|
+
/** Options accepted by `createSandbox`. */
export interface SandboxOptions {
    /** Project root that `contextFiles` are resolved against. */
    projectRoot: string;
    /** Case-relative paths to copy into the sandbox before the agent starts. */
    contextFiles?: string[];
    /**
     * Base directory that will host the per-case tmpdir. Defaults to
     * `os.tmpdir()`. Tests inject a repo-local path so CI leaves no
     * traces in `/tmp` when assertions fail.
     */
    baseDir?: string;
    /** Override the per-case suffix. Primarily for deterministic tests. */
    idOverride?: string;
}
|
|
19
|
+
/** Handle to a sandbox directory, as returned by `createSandbox`. */
export interface Sandbox {
    /** Absolute path to the sandbox root directory. */
    root: string;
    /**
     * Resolve `requested` relative to the sandbox root and return the
     * absolute, realpath'd filesystem path. Throws
     * `SandboxEscapeError` when the resolution crosses the boundary.
     *
     * `allowMissing: true` lets callers pre-resolve a destination for a
     * write where the final component doesn't exist yet — the parent
     * directory is realpath'd to still catch symlink escapes.
     */
    resolve(requested: string, options?: {
        allowMissing?: boolean;
    }): Promise<string>;
    /** Remove the sandbox directory. Idempotent. */
    dispose(): Promise<void>;
}
|
|
37
|
+
/**
 * Create and prep a fresh sandbox. Callers own cleanup via `dispose()`.
 *
 * @param options Sandbox settings; see `SandboxOptions`.
 * @returns A promise for the `Sandbox` handle.
 */
export declare function createSandbox(options: SandboxOptions): Promise<Sandbox>;
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-case sandbox for the Tier B with-tools agent.
|
|
3
|
+
*
|
|
4
|
+
* Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
|
|
5
|
+
* `contextFiles` the case declares are copied in relative to the project
|
|
6
|
+
* root, and every tool invocation resolves paths against the sandbox
|
|
7
|
+
* root with a defensive check that refuses symlinks and `..` escapes.
|
|
8
|
+
*
|
|
9
|
+
* Design notes:
|
|
10
|
+
*
|
|
11
|
+
* - The sandbox is intentionally tiny (one directory, no symlink
|
|
12
|
+
* creation, no executable bits). We rely on `fs.realpath` on every
|
|
13
|
+
* resolved path so hostile tool output that creates a symlink to
|
|
14
|
+
* `/etc/passwd` and then tries to read it still trips the boundary
|
|
15
|
+
* check.
|
|
16
|
+
* - Cleanup is handled by `dispose()`; callers (runner, tests) must
|
|
17
|
+
* invoke it in a `try/finally` so leftover temp directories never
|
|
18
|
+
* accumulate.
|
|
19
|
+
* - The sandbox does not preserve the project's directory structure
|
|
20
|
+
* verbatim. Each entry in `contextFiles` is copied flat into
|
|
21
|
+
* `sandboxRoot/<basename>` unless it contains path separators, in
|
|
22
|
+
* which case the full relative layout is recreated. That keeps demo
|
|
23
|
+
* cases portable while still letting richer cases place files under
|
|
24
|
+
* subdirectories (e.g. `.cclaw/skills/brainstorming/SKILL.md`).
|
|
25
|
+
*/
|
|
26
|
+
import { randomUUID } from "node:crypto";
|
|
27
|
+
import fs from "node:fs/promises";
|
|
28
|
+
import os from "node:os";
|
|
29
|
+
import path from "node:path";
|
|
30
|
+
/**
 * Raised whenever the sandbox refuses a path. The offending path is kept on
 * `requestedPath` so callers can report it without parsing the message.
 */
export class SandboxEscapeError extends Error {
    requestedPath;
    /**
     * @param {string} requestedPath Path that was refused.
     * @param {string} reason Short explanation embedded in the message.
     */
    constructor(requestedPath, reason) {
        const description = `Sandbox refused path "${requestedPath}": ${reason}.`;
        super(description);
        Object.assign(this, { name: "SandboxEscapeError", requestedPath });
    }
}
|
|
38
|
+
/**
 * Create and prep a fresh sandbox. Callers own cleanup via `dispose()`.
 *
 * A directory named `cclaw-eval-<id>` is created under `options.baseDir`
 * (default `os.tmpdir()`), and every entry of `options.contextFiles` is copied
 * into it. If seeding fails, the half-built directory is removed before the
 * error propagates, because the caller never receives a handle to dispose.
 *
 * @param {object} options
 * @param {string} options.projectRoot Root that `contextFiles` resolve against.
 * @param {string[]} [options.contextFiles] Project-relative paths to copy in.
 * @param {string} [options.baseDir] Directory hosting the sandbox.
 * @param {string} [options.idOverride] Replaces the random UUID suffix.
 * @returns {Promise<{root: string, resolve: Function, dispose: Function}>}
 *   `root` is the realpath'd sandbox directory. `resolve(requested, opts)`
 *   maps a sandbox-relative path to an absolute realpath or throws
 *   `SandboxEscapeError`. `dispose()` removes the directory.
 */
export async function createSandbox(options) {
    const baseDir = options.baseDir ?? os.tmpdir();
    const id = options.idOverride ?? randomUUID();
    const root = path.join(baseDir, `cclaw-eval-${id}`);
    await fs.mkdir(root, { recursive: true });
    const realRoot = await fs.realpath(root);
    try {
        for (const rel of options.contextFiles ?? []) {
            await copyContextFile(options.projectRoot, realRoot, rel);
        }
    }
    catch (err) {
        // Best-effort cleanup: a failure to remove the directory must not
        // mask the original seeding error.
        await fs.rm(realRoot, { recursive: true, force: true }).catch(() => undefined);
        throw err;
    }
    // True when `candidate` lies outside the sandbox root. Compares whole path
    // segments so that names merely starting with ".." (e.g. "..notes.md")
    // are not mistaken for a parent-directory escape.
    const escapesRoot = (candidate) => {
        const rel = path.relative(realRoot, candidate);
        return rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    };
    async function resolveInside(requested, opts = {}) {
        if (typeof requested !== "string" || requested.length === 0) {
            throw new SandboxEscapeError(String(requested), "path must be a non-empty string");
        }
        if (path.isAbsolute(requested)) {
            throw new SandboxEscapeError(requested, "absolute paths are not allowed");
        }
        if (requested.includes("\0")) {
            throw new SandboxEscapeError(requested, "NUL byte in path");
        }
        // Lexical check first: rejects `..` traversal before touching the disk.
        const joined = path.resolve(realRoot, requested);
        if (escapesRoot(joined)) {
            throw new SandboxEscapeError(requested, "resolves outside the sandbox");
        }
        let finalPath;
        try {
            finalPath = await fs.realpath(joined);
        }
        catch (err) {
            if (!opts.allowMissing) {
                const detail = err instanceof Error ? err.message : String(err);
                throw new SandboxEscapeError(requested, `realpath failed: ${detail}`);
            }
            // The target does not exist yet: realpath the nearest existing
            // ancestor instead so a symlinked parent is still caught.
            const existingAncestor = await findExistingAncestor(joined, realRoot);
            if (!existingAncestor) {
                throw new SandboxEscapeError(requested, "no existing ancestor inside the sandbox");
            }
            if (escapesRoot(existingAncestor.real)) {
                throw new SandboxEscapeError(requested, "parent resolves outside the sandbox");
            }
            finalPath = path.join(existingAncestor.real, existingAncestor.trailing);
        }
        // Final check on the symlink-resolved path.
        if (escapesRoot(finalPath)) {
            throw new SandboxEscapeError(requested, "realpath escapes the sandbox");
        }
        return finalPath;
    }
    return {
        root: realRoot,
        resolve: resolveInside,
        async dispose() {
            await fs.rm(realRoot, { recursive: true, force: true });
        }
    };
}
|
|
97
|
+
/**
 * Walk upwards from `target` until a path that exists on disk is found.
 *
 * @param {string} target Absolute path whose tail may not exist yet.
 * @param {string} stopAt Directory the walk must not climb above.
 * @returns {Promise<{real: string, trailing: string} | undefined>} `real` is
 *   the realpath of the nearest existing ancestor and `trailing` the missing
 *   remainder below it (`"."` when `target` itself exists). `undefined` when
 *   the walk would leave `stopAt` or reaches the filesystem root.
 */
async function findExistingAncestor(target, stopAt) {
    const segments = [];
    let current = target;
    while (true) {
        try {
            const real = await fs.realpath(current);
            // Segments were collected leaf-first; reverse to get top-down order.
            return { real, trailing: path.join(...segments.reverse()) };
        }
        catch {
            // Any realpath failure is treated as "does not exist here".
            const parent = path.dirname(current);
            if (parent === current) {
                return undefined;
            }
            segments.push(path.basename(current));
            // Compare whole segments: a directory named e.g. "..cache" sits
            // inside `stopAt` and must not be mistaken for a `..` escape.
            const rel = path.relative(stopAt, parent);
            if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
                return undefined;
            }
            current = parent;
        }
    }
}
|
|
116
|
+
/**
 * Copy one `context_files` entry (file or directory) from the project into
 * the sandbox, recreating its relative layout under `sandboxRoot`.
 *
 * @param {string} projectRoot Project directory the entry is relative to.
 * @param {string} sandboxRoot Sandbox directory receiving the copy.
 * @param {string} relPath Project-relative path of the entry.
 * @returns {Promise<void>}
 * @throws {Error} When `relPath` is absolute, when its realpath lies outside
 *   the project, or when the destination would land outside the sandbox.
 *   Filesystem errors (e.g. a missing source) propagate unchanged.
 */
async function copyContextFile(projectRoot, sandboxRoot, relPath) {
    if (path.isAbsolute(relPath)) {
        throw new Error(`context_files must be project-relative: ${relPath}`);
    }
    // Segment-aware containment test: a name that merely starts with ".."
    // (e.g. "..notes.md") is still inside `root`.
    const isOutside = (root, candidate) => {
        const rel = path.relative(root, candidate);
        return rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    };
    const src = path.resolve(projectRoot, relPath);
    const srcReal = await fs.realpath(src);
    const projectReal = await fs.realpath(projectRoot);
    if (isOutside(projectReal, srcReal)) {
        throw new Error(`context_files entry resolves outside the project: ${relPath}`);
    }
    // A path such as "../<project>/file" stays inside the project yet would be
    // written next to the sandbox rather than inside it; refuse it.
    const dest = path.resolve(sandboxRoot, relPath);
    if (isOutside(sandboxRoot, dest)) {
        throw new Error(`context_files entry escapes the sandbox: ${relPath}`);
    }
    const stat = await fs.stat(srcReal);
    if (stat.isDirectory()) {
        await fs.mkdir(dest, { recursive: true });
        // NOTE(review): fs.cp does not dereference symlinks by default, so
        // links inside the directory are copied as links — confirm intended.
        await fs.cp(srcReal, dest, { recursive: true });
        return;
    }
    await fs.mkdir(path.dirname(dest), { recursive: true });
    await fs.copyFile(srcReal, dest);
}
|