cclaw-cli 0.24.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -1
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/agents/with-tools.d.ts +31 -0
- package/dist/eval/agents/with-tools.js +255 -0
- package/dist/eval/config-loader.js +128 -3
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +123 -20
- package/dist/eval/llm-client.js +251 -10
- package/dist/eval/report.js +45 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +193 -12
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +138 -1
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
package/dist/eval/runner.js
CHANGED
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
|
+
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
4
6
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
7
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
6
8
|
import { loadEvalConfig } from "./config-loader.js";
|
|
9
|
+
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
10
|
+
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
11
|
+
import { loadAllRubrics } from "./rubric-loader.js";
|
|
12
|
+
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
7
13
|
import { verifyRules } from "./verifiers/rules.js";
|
|
8
14
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
9
15
|
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
@@ -26,16 +32,39 @@ function skeletonVerifierResult(message, details) {
|
|
|
26
32
|
/**
|
|
27
33
|
* --schema-only narrows to structural. --rules opens up rules + traceability
|
|
28
34
|
* on top of structural (traceability is a rule-family verifier even though
|
|
29
|
-
* it lives in its own module).
|
|
30
|
-
*
|
|
35
|
+
* it lives in its own module). --judge opens up the LLM judge and, for
|
|
36
|
+
* Tier A/B, the agent-under-test (single-shot for A, with-tools for B). --schema-only always wins so
|
|
37
|
+
* the LLM-free PR gate never pays for tokens even if stale flags collide.
|
|
31
38
|
*/
|
|
32
39
|
function resolveRunFlags(options) {
    // --schema-only overrides every opt-in switch, so the other flags are
    // only honoured when it is absent.
    const optInAllowed = options.schemaOnly !== true;
    const rulesEnabled = optInAllowed && options.rules === true;
    const judgeEnabled = optInAllowed && options.judge === true;
    // Tier defaults to "A"; only tiers A and B drive an agent-under-test,
    // and only when the judge itself is enabled.
    const tier = options.tier ?? "A";
    const agentEnabled = judgeEnabled && ["A", "B"].includes(tier);
    return {
        // Structural verification is unconditional.
        runStructural: true,
        // Rules and traceability are switched on and off together.
        runRules: rulesEnabled,
        runTraceability: rulesEnabled,
        runJudge: judgeEnabled,
        runAgent: agentEnabled
    };
}
|
|
54
|
+
/**
|
|
55
|
+
* Wrap a client so every chat() result is accounted against the cost
|
|
56
|
+
* guard before being returned. The guard throws
|
|
57
|
+
* DailyCostCapExceededError if committing the call would cross the
|
|
58
|
+
* configured cap — the runner surfaces that as a hard failure so
|
|
59
|
+
* nightly CI fails loud instead of silently overspending.
|
|
60
|
+
*/
|
|
61
|
+
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
    // Forward the request untouched, then bill the response before handing
    // it back. A rejection from either the client or the guard propagates
    // to the caller unchanged.
    const chat = async (request) => {
        const reply = await client.chat(request);
        // Responses that do not name a model are billed to the fallback.
        const billedModel = reply.model || fallbackModel;
        await costGuard.commit(billedModel, reply.usage);
        return reply;
    };
    return { chat };
}
|
|
41
70
|
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
@@ -54,17 +83,107 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
|
54
83
|
return undefined;
|
|
55
84
|
}
|
|
56
85
|
}
|
|
57
|
-
async function runCase(
|
|
86
|
+
async function runCase(ctx) {
|
|
87
|
+
const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
|
|
58
88
|
const started = Date.now();
|
|
59
89
|
const verifierResults = [];
|
|
60
90
|
const expected = caseEntry.expected;
|
|
91
|
+
let caseCostUsd = 0;
|
|
61
92
|
const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
|
|
62
93
|
const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
|
|
63
94
|
const hasTraceability = flags.runTraceability && !!expected?.traceability;
|
|
64
|
-
const
|
|
95
|
+
const judgeRequested = flags.runJudge && !!expected?.judge;
|
|
96
|
+
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
65
97
|
let artifact;
|
|
66
98
|
if (needsArtifact) {
|
|
67
|
-
|
|
99
|
+
if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
|
|
100
|
+
try {
|
|
101
|
+
const produced = await runSingleShot({
|
|
102
|
+
caseEntry,
|
|
103
|
+
config,
|
|
104
|
+
projectRoot,
|
|
105
|
+
client
|
|
106
|
+
});
|
|
107
|
+
artifact = produced.artifact;
|
|
108
|
+
caseCostUsd += produced.usageUsd;
|
|
109
|
+
verifierResults.push({
|
|
110
|
+
kind: "workflow",
|
|
111
|
+
id: "agent:single-shot",
|
|
112
|
+
ok: true,
|
|
113
|
+
score: 1,
|
|
114
|
+
message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
|
|
115
|
+
details: {
|
|
116
|
+
model: produced.model,
|
|
117
|
+
tokensIn: produced.usage.promptTokens,
|
|
118
|
+
tokensOut: produced.usage.completionTokens,
|
|
119
|
+
usageUsd: produced.usageUsd,
|
|
120
|
+
attempts: produced.attempts
|
|
121
|
+
}
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
catch (err) {
|
|
125
|
+
if (err instanceof DailyCostCapExceededError)
|
|
126
|
+
throw err;
|
|
127
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
128
|
+
verifierResults.push({
|
|
129
|
+
kind: "workflow",
|
|
130
|
+
id: "agent:single-shot",
|
|
131
|
+
ok: false,
|
|
132
|
+
score: 0,
|
|
133
|
+
message: err instanceof Error ? err.message : String(err),
|
|
134
|
+
details: { retryable }
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
|
|
139
|
+
try {
|
|
140
|
+
const produced = await runWithTools({
|
|
141
|
+
caseEntry,
|
|
142
|
+
config,
|
|
143
|
+
projectRoot,
|
|
144
|
+
client
|
|
145
|
+
});
|
|
146
|
+
artifact = produced.artifact;
|
|
147
|
+
caseCostUsd += produced.usageUsd;
|
|
148
|
+
verifierResults.push({
|
|
149
|
+
kind: "workflow",
|
|
150
|
+
id: "agent:with-tools",
|
|
151
|
+
ok: true,
|
|
152
|
+
score: 1,
|
|
153
|
+
message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
|
|
154
|
+
`${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
|
|
155
|
+
`(${produced.toolUse.calls} tool call(s))`,
|
|
156
|
+
details: {
|
|
157
|
+
model: produced.model,
|
|
158
|
+
tokensIn: produced.usage.promptTokens,
|
|
159
|
+
tokensOut: produced.usage.completionTokens,
|
|
160
|
+
usageUsd: produced.usageUsd,
|
|
161
|
+
attempts: produced.attempts,
|
|
162
|
+
toolUse: produced.toolUse
|
|
163
|
+
}
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
catch (err) {
|
|
167
|
+
if (err instanceof DailyCostCapExceededError)
|
|
168
|
+
throw err;
|
|
169
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
170
|
+
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
171
|
+
verifierResults.push({
|
|
172
|
+
kind: "workflow",
|
|
173
|
+
id: "agent:with-tools",
|
|
174
|
+
ok: false,
|
|
175
|
+
score: 0,
|
|
176
|
+
message: err instanceof Error ? err.message : String(err),
|
|
177
|
+
details: {
|
|
178
|
+
retryable,
|
|
179
|
+
...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
|
|
180
|
+
}
|
|
181
|
+
});
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
else {
|
|
185
|
+
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
186
|
+
}
|
|
68
187
|
if (artifact === undefined && verifierResults.length === 0) {
|
|
69
188
|
verifierResults.push({
|
|
70
189
|
kind: "structural",
|
|
@@ -111,6 +230,46 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
|
|
|
111
230
|
});
|
|
112
231
|
}
|
|
113
232
|
}
|
|
233
|
+
if (judgeRequested && artifact !== undefined && client) {
|
|
234
|
+
const rubric = rubrics.get(caseEntry.stage);
|
|
235
|
+
if (!rubric) {
|
|
236
|
+
verifierResults.push({
|
|
237
|
+
kind: "judge",
|
|
238
|
+
id: "judge:rubric:missing",
|
|
239
|
+
ok: false,
|
|
240
|
+
score: 0,
|
|
241
|
+
message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
|
|
242
|
+
details: { stage: caseEntry.stage }
|
|
243
|
+
});
|
|
244
|
+
}
|
|
245
|
+
else {
|
|
246
|
+
try {
|
|
247
|
+
const invocation = await runJudge({
|
|
248
|
+
artifact,
|
|
249
|
+
rubric,
|
|
250
|
+
config,
|
|
251
|
+
client,
|
|
252
|
+
caseHint: expected.judge
|
|
253
|
+
});
|
|
254
|
+
caseCostUsd += invocation.usageUsd;
|
|
255
|
+
const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
|
|
256
|
+
verifierResults.push(...judgeVerifiers);
|
|
257
|
+
}
|
|
258
|
+
catch (err) {
|
|
259
|
+
if (err instanceof DailyCostCapExceededError)
|
|
260
|
+
throw err;
|
|
261
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
262
|
+
verifierResults.push({
|
|
263
|
+
kind: "judge",
|
|
264
|
+
id: "judge:invocation:error",
|
|
265
|
+
ok: false,
|
|
266
|
+
score: 0,
|
|
267
|
+
message: err instanceof Error ? err.message : String(err),
|
|
268
|
+
details: { retryable, rubricId: rubric.id }
|
|
269
|
+
});
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
114
273
|
const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
|
|
115
274
|
const allOk = nonSkippedResults.length === 0
|
|
116
275
|
? verifierResults.every((r) => r.ok)
|
|
@@ -121,6 +280,7 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
|
|
|
121
280
|
tier: plannedTier,
|
|
122
281
|
passed: allOk,
|
|
123
282
|
durationMs: Date.now() - started,
|
|
283
|
+
costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
|
|
124
284
|
verifierResults
|
|
125
285
|
};
|
|
126
286
|
}
|
|
@@ -173,10 +333,13 @@ export async function runEval(options) {
|
|
|
173
333
|
if (corpus.length === 0) {
|
|
174
334
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
175
335
|
}
|
|
176
|
-
if (options.judge) {
|
|
177
|
-
notes.push("--judge is accepted; LLM judging is not wired yet.");
|
|
178
|
-
}
|
|
179
336
|
const flags = resolveRunFlags(options);
|
|
337
|
+
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
338
|
+
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
339
|
+
}
|
|
340
|
+
if ((options.tier ?? "A") !== "A" && flags.runJudge) {
|
|
341
|
+
notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
|
|
342
|
+
}
|
|
180
343
|
if (options.dryRun === true) {
|
|
181
344
|
const summary = {
|
|
182
345
|
kind: "dry-run",
|
|
@@ -190,17 +353,35 @@ export async function runEval(options) {
|
|
|
190
353
|
verifiersAvailable: {
|
|
191
354
|
structural: flags.runStructural,
|
|
192
355
|
rules: flags.runRules,
|
|
193
|
-
judge:
|
|
194
|
-
workflow:
|
|
356
|
+
judge: flags.runJudge,
|
|
357
|
+
workflow: flags.runAgent
|
|
195
358
|
},
|
|
196
359
|
notes
|
|
197
360
|
};
|
|
198
361
|
return summary;
|
|
199
362
|
}
|
|
363
|
+
const costGuard = createCostGuard(options.projectRoot, config);
|
|
364
|
+
let wrappedClient;
|
|
365
|
+
if (flags.runJudge) {
|
|
366
|
+
const base = options.llmClient ?? createEvalClient(config);
|
|
367
|
+
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
368
|
+
}
|
|
369
|
+
const rubrics = flags.runJudge
|
|
370
|
+
? await loadAllRubrics(options.projectRoot)
|
|
371
|
+
: new Map();
|
|
200
372
|
const now = new Date().toISOString();
|
|
201
373
|
const caseResults = [];
|
|
202
374
|
for (const item of corpus) {
|
|
203
|
-
caseResults.push(await runCase(
|
|
375
|
+
caseResults.push(await runCase({
|
|
376
|
+
projectRoot: options.projectRoot,
|
|
377
|
+
caseEntry: item,
|
|
378
|
+
plannedTier,
|
|
379
|
+
flags,
|
|
380
|
+
config,
|
|
381
|
+
client: wrappedClient,
|
|
382
|
+
costGuard,
|
|
383
|
+
rubrics
|
|
384
|
+
}));
|
|
204
385
|
}
|
|
205
386
|
const stages = stagesInResults(caseResults);
|
|
206
387
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
export declare class SandboxEscapeError extends Error {
|
|
2
|
+
readonly requestedPath: string;
|
|
3
|
+
constructor(requestedPath: string, reason: string);
|
|
4
|
+
}
|
|
5
|
+
export interface SandboxOptions {
|
|
6
|
+
/** Project root that `contextFiles` are resolved against. */
|
|
7
|
+
projectRoot: string;
|
|
8
|
+
/** Case-relative paths to copy into the sandbox before the agent starts. */
|
|
9
|
+
contextFiles?: string[];
|
|
10
|
+
/**
|
|
11
|
+
* Base directory that will host the per-case tmpdir. Defaults to
|
|
12
|
+
* `os.tmpdir()`. Tests inject a repo-local path so CI leaves no
|
|
13
|
+
* traces in `/tmp` when assertions fail.
|
|
14
|
+
*/
|
|
15
|
+
baseDir?: string;
|
|
16
|
+
/** Override the per-case suffix. Primarily for deterministic tests. */
|
|
17
|
+
idOverride?: string;
|
|
18
|
+
}
|
|
19
|
+
export interface Sandbox {
|
|
20
|
+
/** Absolute path to the sandbox root directory. */
|
|
21
|
+
root: string;
|
|
22
|
+
/**
|
|
23
|
+
* Resolve `requested` relative to the sandbox root and return the
|
|
24
|
+
* absolute, realpath'd filesystem path. Throws
|
|
25
|
+
* `SandboxEscapeError` when the resolution crosses the boundary.
|
|
26
|
+
*
|
|
27
|
+
* `allowMissing: true` lets callers pre-resolve a destination for a
|
|
28
|
+
* write where the final component doesn't exist yet — the parent
|
|
29
|
+
* directory is realpath'd to still catch symlink escapes.
|
|
30
|
+
*/
|
|
31
|
+
resolve(requested: string, options?: {
|
|
32
|
+
allowMissing?: boolean;
|
|
33
|
+
}): Promise<string>;
|
|
34
|
+
/** Remove the sandbox directory. Idempotent. */
|
|
35
|
+
dispose(): Promise<void>;
|
|
36
|
+
}
|
|
37
|
+
/** Create and prep a fresh sandbox. Callers own cleanup via `dispose()`. */
|
|
38
|
+
export declare function createSandbox(options: SandboxOptions): Promise<Sandbox>;
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-case sandbox for the Tier B with-tools agent.
|
|
3
|
+
*
|
|
4
|
+
* Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
|
|
5
|
+
* `contextFiles` the case declares are copied in relative to the project
|
|
6
|
+
* root, and every tool invocation resolves paths against the sandbox
|
|
7
|
+
* root with a defensive check that refuses symlinks and `..` escapes.
|
|
8
|
+
*
|
|
9
|
+
* Design notes:
|
|
10
|
+
*
|
|
11
|
+
* - The sandbox is intentionally tiny (one directory, no symlink
|
|
12
|
+
* creation, no executable bits). We rely on `fs.realpath` on every
|
|
13
|
+
* resolved path so hostile tool output that creates a symlink to
|
|
14
|
+
* `/etc/passwd` and then tries to read it still trips the boundary
|
|
15
|
+
* check.
|
|
16
|
+
* - Cleanup is handled by `dispose()`; callers (runner, tests) must
|
|
17
|
+
* invoke it in a `try/finally` so leftover temp directories never
|
|
18
|
+
* accumulate.
|
|
19
|
+
* - The sandbox does not preserve the project's directory structure
|
|
20
|
+
* verbatim. Each entry in `contextFiles` is copied flat into
|
|
21
|
+
* `sandboxRoot/<basename>` unless it contains path separators, in
|
|
22
|
+
* which case the full relative layout is recreated. That keeps demo
|
|
23
|
+
* cases portable while still letting richer cases place files under
|
|
24
|
+
* subdirectories (e.g. `.cclaw/skills/brainstorming/SKILL.md`).
|
|
25
|
+
*/
|
|
26
|
+
import { randomUUID } from "node:crypto";
|
|
27
|
+
import fs from "node:fs/promises";
|
|
28
|
+
import os from "node:os";
|
|
29
|
+
import path from "node:path";
|
|
30
|
+
/**
 * Error raised when a requested path is refused by the sandbox.
 * Keeps the offending input on `requestedPath` for callers to report.
 */
export class SandboxEscapeError extends Error {
    /** The path exactly as the caller supplied it. */
    requestedPath;
    /**
     * @param {string} requestedPath - The path that was refused.
     * @param {string} reason - Short explanation, embedded in the message.
     */
    constructor(requestedPath, reason) {
        const message = `Sandbox refused path "${requestedPath}": ${reason}.`;
        super(message);
        this.requestedPath = requestedPath;
        this.name = "SandboxEscapeError";
    }
}
|
|
38
|
+
/**
 * Create and prep a fresh sandbox. Callers own cleanup via `dispose()`.
 *
 * The sandbox lives at `<baseDir>/cclaw-eval-<id>` (`baseDir` defaults to
 * `os.tmpdir()`, `id` to a random UUID). Every entry of
 * `options.contextFiles` is copied in before the handle is returned.
 *
 * If copying a context file fails, the directory is removed again before
 * the error is rethrown: the caller never receives a handle in that case,
 * so it could not call `dispose()` itself.
 *
 * @param {object} options - See `SandboxOptions` (`projectRoot`,
 *   `contextFiles`, `baseDir`, `idOverride`).
 * @returns {Promise<{root: string, resolve: Function, dispose: Function}>}
 *   `root` is the realpath'd sandbox directory; `resolve(requested, opts)`
 *   maps a sandbox-relative path to an absolute one or throws
 *   `SandboxEscapeError`; `dispose()` removes the directory.
 */
export async function createSandbox(options) {
    const baseDir = options.baseDir ?? os.tmpdir();
    const id = options.idOverride ?? randomUUID();
    const root = path.join(baseDir, `cclaw-eval-${id}`);
    await fs.mkdir(root, { recursive: true });
    // All later comparisons use the realpath'd root so a symlinked base
    // directory cannot make in-sandbox paths look like escapes.
    const realRoot = await fs.realpath(root);
    try {
        for (const rel of options.contextFiles ?? []) {
            await copyContextFile(options.projectRoot, realRoot, rel);
        }
    }
    catch (err) {
        // Best-effort cleanup; the copy failure is the error worth surfacing.
        await fs.rm(realRoot, { recursive: true, force: true }).catch(() => { });
        throw err;
    }
    // True when `candidate` is not located at or below the sandbox root.
    // Compares whole path segments so names such as "..notes.md" are not
    // mistaken for a parent-directory hop.
    const escapesRoot = (candidate) => {
        const rel = path.relative(realRoot, candidate);
        return rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    };
    async function resolveInside(requested, opts = {}) {
        if (typeof requested !== "string" || requested.length === 0) {
            throw new SandboxEscapeError(String(requested), "path must be a non-empty string");
        }
        if (path.isAbsolute(requested)) {
            throw new SandboxEscapeError(requested, "absolute paths are not allowed");
        }
        if (requested.includes("\0")) {
            throw new SandboxEscapeError(requested, "NUL byte in path");
        }
        // Lexical check first: reject `..` hops before touching the disk.
        const joined = path.resolve(realRoot, requested);
        if (escapesRoot(joined)) {
            throw new SandboxEscapeError(requested, "resolves outside the sandbox");
        }
        let finalPath;
        try {
            finalPath = await fs.realpath(joined);
        }
        catch (err) {
            if (!opts.allowMissing) {
                throw new SandboxEscapeError(requested, `realpath failed: ${err.message}`);
            }
            // The target does not exist yet: realpath the nearest existing
            // ancestor instead so a symlinked parent is still caught.
            const existingAncestor = await findExistingAncestor(joined, realRoot);
            if (!existingAncestor) {
                throw new SandboxEscapeError(requested, "no existing ancestor inside the sandbox");
            }
            if (escapesRoot(existingAncestor.real)) {
                throw new SandboxEscapeError(requested, "parent resolves outside the sandbox");
            }
            finalPath = path.join(existingAncestor.real, existingAncestor.trailing);
        }
        // Final check on the fully resolved path (symlinks followed).
        if (escapesRoot(finalPath)) {
            throw new SandboxEscapeError(requested, "realpath escapes the sandbox");
        }
        return finalPath;
    }
    return {
        root: realRoot,
        resolve: resolveInside,
        async dispose() {
            // `force: true` makes a repeated dispose() a no-op.
            await fs.rm(realRoot, { recursive: true, force: true });
        }
    };
}
|
|
97
|
+
/**
 * Walk upward from `target` until a path that `fs.realpath` can resolve is
 * found, remembering the components that had to be skipped.
 *
 * @param {string} target - Absolute path that may not exist yet.
 * @param {string} stopAt - Absolute directory the walk must stay inside.
 * @returns {Promise<{real: string, trailing: string} | undefined>}
 *   `real` is the realpath of the nearest resolvable ancestor and
 *   `trailing` the skipped components joined back together. `undefined`
 *   is returned when the walk would leave `stopAt`, reaches the filesystem
 *   root, or hits a symlink that cannot be resolved.
 */
async function findExistingAncestor(target, stopAt) {
    const segments = [];
    let current = target;
    while (true) {
        try {
            const real = await fs.realpath(current);
            return { real, trailing: path.join(...segments.reverse()) };
        }
        catch {
            // A symlink that realpath cannot resolve (dangling or looping)
            // is not a "missing" component: creating a file through it would
            // land wherever the link points. Refuse instead of approving
            // its parent.
            const unresolvedLink = await fs.lstat(current).then((info) => info.isSymbolicLink(), () => false);
            if (unresolvedLink) {
                return undefined;
            }
            const parent = path.dirname(current);
            if (parent === current) {
                return undefined;
            }
            segments.push(path.basename(current));
            // Compare whole segments so a directory named e.g. "..cache"
            // is not mistaken for a parent hop; an absolute result means
            // the two paths share no common root (different drive).
            const rel = path.relative(stopAt, parent);
            if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
                return undefined;
            }
            current = parent;
        }
    }
}
|
|
116
|
+
/**
 * Copy one `context_files` entry from the project into the sandbox,
 * recreating its project-relative layout under `sandboxRoot`.
 *
 * The destination is derived from the normalized project-relative path,
 * not the raw input, so an entry such as `../proj/a.txt` (which resolves
 * inside the project) cannot place its copy outside the sandbox root.
 *
 * @param {string} projectRoot - Directory `relPath` is resolved against.
 * @param {string} sandboxRoot - Sandbox directory that receives the copy.
 * @param {string} relPath - Project-relative file or directory.
 * @returns {Promise<void>}
 * @throws {Error} When `relPath` is absolute or resolves (lexically or via
 *   symlinks) outside the project; filesystem errors propagate unchanged.
 */
async function copyContextFile(projectRoot, sandboxRoot, relPath) {
    if (path.isAbsolute(relPath)) {
        throw new Error(`context_files must be project-relative: ${relPath}`);
    }
    // Whole-segment test: "..hidden.md" is a file name, not a parent hop.
    const leavesBase = (rel) => rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    const src = path.resolve(projectRoot, relPath);
    // Normalized layout of the entry inside the project; also used as the
    // layout inside the sandbox.
    const layout = path.relative(path.resolve(projectRoot), src);
    if (leavesBase(layout)) {
        throw new Error(`context_files entry resolves outside the project: ${relPath}`);
    }
    // Follow symlinks on both sides before comparing, so a link pointing
    // out of the project is refused.
    const srcReal = await fs.realpath(src);
    const projectReal = await fs.realpath(projectRoot);
    if (leavesBase(path.relative(projectReal, srcReal))) {
        throw new Error(`context_files entry resolves outside the project: ${relPath}`);
    }
    const dest = path.join(sandboxRoot, layout);
    const stat = await fs.stat(srcReal);
    if (stat.isDirectory()) {
        await fs.mkdir(dest, { recursive: true });
        await fs.cp(srcReal, dest, { recursive: true });
        return;
    }
    await fs.mkdir(path.dirname(dest), { recursive: true });
    await fs.copyFile(srcReal, dest);
}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { SandboxEscapeError } from "../sandbox.js";
|
|
4
|
+
import { parseArgs, requireString, truncatePayload } from "./types.js";
|
|
5
|
+
const DESCRIPTION = "List files inside the sandbox whose relative path matches a glob-style " +
|
|
6
|
+
"pattern. Supports `*` (any chars within a path segment) and `**` " +
|
|
7
|
+
"(any number of path segments). Returns matching paths, one per line.";
|
|
8
|
+
const MAX_MATCHES = 500;
|
|
9
|
+
/**
 * Tool definition for `glob`: lists sandbox files whose relative path
 * matches a glob-style pattern. `invoke` never throws for bad input; it
 * reports failures as `{ ok: false, name, error }` results instead.
 */
export const globTool = {
    descriptor: {
        name: "glob",
        description: DESCRIPTION,
        parameters: {
            type: "object",
            additionalProperties: false,
            required: ["pattern"],
            properties: {
                pattern: {
                    type: "string",
                    description: "Glob pattern, relative to the sandbox root."
                }
            }
        }
    },
    /**
     * @param rawArgs - Raw tool arguments, decoded by `parseArgs`.
     * @param ctx - Invocation context; reads `ctx.sandbox.root` and
     *   `ctx.maxResultBytes`.
     * @returns A result object: on success `content` holds the sorted
     *   matches (one per line, capped at MAX_MATCHES) and `details` the
     *   match counts.
     */
    async invoke(rawArgs, ctx) {
        const toolName = this.descriptor.name;
        // Builds the failure shape; `details` is only attached when given.
        const failure = (error, details) => (details === undefined
            ? { ok: false, name: toolName, error }
            : { ok: false, name: toolName, error, details });
        let pattern;
        try {
            pattern = requireString(parseArgs(rawArgs), "pattern");
        }
        catch (err) {
            return failure(err.message);
        }
        if (pattern.includes("\0")) {
            return failure('"pattern" must not contain NUL bytes');
        }
        let regex;
        try {
            regex = globToRegExp(pattern);
        }
        catch (err) {
            return failure(err.message);
        }
        const matches = [];
        try {
            await walk(ctx.sandbox.root, "", matches, regex);
        }
        catch (err) {
            return err instanceof SandboxEscapeError
                ? failure(err.message, { deniedPath: pattern })
                : failure(`walk failed: ${err.message}`);
        }
        // Sort before capping so the reported subset is deterministic.
        matches.sort();
        const capped = matches.slice(0, MAX_MATCHES);
        const truncated = matches.length > capped.length;
        let body = "(no matches)";
        if (capped.length > 0) {
            body = capped.join("\n");
            if (truncated) {
                body += `\n…[truncated at ${MAX_MATCHES} matches]`;
            }
        }
        return {
            ok: true,
            name: toolName,
            content: truncatePayload(body, ctx.maxResultBytes),
            details: {
                pattern,
                matches: capped.length,
                totalMatches: matches.length,
                truncated
            }
        };
    }
};
|
|
98
|
+
/**
 * Recursively collect files below `root/rel` whose root-relative path
 * matches `regex`, appending them to `acc`.
 *
 * Symbolic links are skipped entirely, and a directory that cannot be
 * listed is silently ignored. Paths are tested with forward slashes but
 * pushed with the platform separator produced by `path.join`.
 *
 * @param {string} root - Directory all reported paths are relative to.
 * @param {string} rel - Sub-path of `root` to list ("" for the root itself).
 * @param {string[]} acc - Output array, mutated in place.
 * @param {RegExp} regex - Matcher applied to each file's relative path.
 * @returns {Promise<void>}
 */
async function walk(root, rel, acc, regex) {
    let listing;
    try {
        listing = await fs.readdir(path.join(root, rel), { withFileTypes: true });
    }
    catch {
        // Unreadable or vanished directory: treat it as empty.
        return;
    }
    for (const dirent of listing) {
        if (dirent.isSymbolicLink()) {
            continue;
        }
        const relChild = rel ? path.join(rel, dirent.name) : dirent.name;
        if (dirent.isDirectory()) {
            await walk(root, relChild, acc, regex);
        }
        else if (dirent.isFile() && regex.test(relChild.replace(/\\/g, "/"))) {
            acc.push(relChild);
        }
    }
}
|
|
120
|
+
/**
 * Translate a minimal glob into an anchored regular expression.
 *
 * Backslashes in the pattern are first normalized to `/`. Then, scanning
 * left to right:
 *   - `**` followed by `/` becomes an optional run of leading path segments,
 *   - any other `**` matches anything, slashes included,
 *   - `*` matches any run of non-slash characters,
 *   - `?` matches exactly one non-slash character,
 *   - regex metacharacters are escaped, everything else is literal.
 *
 * @param {string} pattern - Glob pattern.
 * @returns {RegExp} Expression anchored at both ends.
 */
function globToRegExp(pattern) {
    // Alternatives are ordered longest-first so "**/" wins over "**",
    // which in turn wins over a single "*".
    const tokenMatcher = /\*\*\/|\*\*|\*|\?|[+()|^$.{}[\]\\]/g;
    const translated = pattern
        .replace(/\\/g, "/")
        .replace(tokenMatcher, (token) => {
        switch (token) {
            case "**/":
                return "(?:.*/)?";
            case "**":
                return ".*";
            case "*":
                return "[^/]*";
            case "?":
                return "[^/]";
            default:
                // Regex metacharacter: keep it literal.
                return `\\${token}`;
        }
    });
    return new RegExp(`^${translated}$`);
}
|