cclaw-cli 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +2 -1
- package/dist/eval/agents/with-tools.d.ts +31 -0
- package/dist/eval/agents/with-tools.js +255 -0
- package/dist/eval/config-loader.js +34 -2
- package/dist/eval/llm-client.d.ts +10 -0
- package/dist/eval/llm-client.js +10 -1
- package/dist/eval/report.js +19 -0
- package/dist/eval/runner.js +50 -2
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +35 -0
- package/package.json +1 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Truncate a result payload to `maxBytes` with a visible cutoff marker.
 *
 * A payload whose UTF-8 encoding already fits in `maxBytes` is returned
 * unchanged. Otherwise a prefix is kept so that prefix + marker fits in
 * `maxBytes`, and the marker is appended.
 *
 * The cut is moved back to a UTF-8 character boundary: slicing in the middle
 * of a multi-byte sequence would decode to U+FFFD, which both corrupts the
 * text and (being three bytes long) can push the result over `maxBytes`.
 *
 * When `maxBytes` is smaller than the marker itself, the result is the marker
 * alone, so the cutoff stays visible even though the limit is exceeded.
 *
 * @param {string} payload - Text to bound.
 * @param {number} maxBytes - Maximum size of the result, in UTF-8 bytes.
 * @returns {string} The payload itself, or a truncated prefix plus the marker.
 */
export function truncatePayload(payload, maxBytes) {
    if (Buffer.byteLength(payload, "utf8") <= maxBytes)
        return payload;
    const marker = "\n…[truncated by cclaw sandbox]";
    const budget = Math.max(0, maxBytes - Buffer.byteLength(marker, "utf8"));
    const bytes = Buffer.from(payload, "utf8");
    // `bytes[end]` is the first byte that would be dropped. If it is a UTF-8
    // continuation byte (0b10xxxxxx) the cut lands inside a character, so
    // step back until the cut sits on the start of a character.
    let end = Math.trunc(budget);
    while (end > 0 && (bytes[end] & 0xc0) === 0x80) {
        end -= 1;
    }
    return `${bytes.subarray(0, end).toString("utf8")}${marker}`;
}
|
|
10
|
+
/**
 * Parse the raw JSON argument string of a tool call into a plain object.
 *
 * @param {unknown} raw - Raw arguments as received; must be a non-blank string.
 * @returns {object} The parsed JSON object (never an array or `null`).
 * @throws {Error} "tool arguments missing" when `raw` is not a string or is blank.
 * @throws {Error} "tool arguments are not valid JSON: …" when parsing fails;
 *   the underlying parse error is attached as `cause`.
 * @throws {Error} "tool arguments must be a JSON object" when the parsed value
 *   is a primitive, `null`, or an array.
 */
export function parseArgs(raw) {
    if (typeof raw !== "string" || raw.trim() === "") {
        throw new Error("tool arguments missing");
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (err) {
        // Keep the original SyntaxError reachable for debugging instead of
        // flattening it to its message only.
        throw new Error(`tool arguments are not valid JSON: ${err.message}`, { cause: err });
    }
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
        throw new Error("tool arguments must be a JSON object");
    }
    return parsed;
}
|
|
26
|
+
/**
 * Read a mandatory string argument.
 *
 * @param {object} args - Parsed tool arguments.
 * @param {string} key - Name of the argument to read.
 * @returns {string} The value stored under `key`.
 * @throws {Error} When the value is not a string or is the empty string.
 */
export function requireString(args, key) {
    const candidate = args[key];
    const isNonEmptyString = typeof candidate === "string" && candidate.length > 0;
    if (isNonEmptyString) {
        return candidate;
    }
    throw new Error(`"${key}" must be a non-empty string`);
}
|
|
33
|
+
/**
 * Read an optional numeric argument.
 *
 * @param {object} args - Parsed tool arguments.
 * @param {string} key - Name of the argument to read.
 * @returns {number|undefined} The number stored under `key`, or `undefined`
 *   when the argument is absent (`undefined` or `null`).
 * @throws {Error} When the value is present but is not a finite number.
 */
export function optionalNumber(args, key) {
    const candidate = args[key];
    if (candidate == null) {
        return undefined;
    }
    // Number.isFinite is false for every non-number, so it covers the type
    // check as well as NaN and ±Infinity.
    if (Number.isFinite(candidate)) {
        return candidate;
    }
    throw new Error(`"${key}" must be a finite number`);
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { SandboxEscapeError } from "../sandbox.js";
|
|
4
|
+
import { parseArgs, requireString, truncatePayload } from "./types.js";
|
|
5
|
+
// Human-readable tool description exposed through the descriptor.
const DESCRIPTION = [
    "Write a UTF-8 text file inside the sandbox. Creates parent directories",
    "as needed. Overwrites existing files. Only paths inside the sandbox",
    "are accepted."
].join(" ");

/**
 * `write_file` tool: writes a UTF-8 text file at a sandbox-resolved path.
 *
 * `descriptor` carries the tool name, description and JSON-schema style
 * parameter definition; `invoke` performs the write.
 */
export const writeTool = {
    descriptor: {
        name: "write_file",
        description: DESCRIPTION,
        parameters: {
            type: "object",
            additionalProperties: false,
            required: ["path", "content"],
            properties: {
                path: {
                    type: "string",
                    description: "Path relative to the sandbox root."
                },
                content: {
                    type: "string",
                    description: "UTF-8 contents to write."
                }
            }
        }
    },
    /**
     * Validate the arguments, resolve the target through the sandbox and
     * write the file. Validation, resolution and write failures are reported
     * as `{ ok: false, ... }` results rather than thrown.
     *
     * @param {unknown} rawArgs - Raw JSON argument string; must decode to an
     *   object with a non-empty string `path` and a string `content`.
     * @param {object} ctx - Invocation context; this method reads
     *   `ctx.maxResultBytes` and calls `ctx.sandbox.resolve(path, options)`.
     * @returns {Promise<object>} On success `{ ok: true, name, content, details }`
     *   where `content` is a short summary; otherwise `{ ok: false, name, error }`
     *   with `details` on some failures.
     */
    async invoke(rawArgs, ctx) {
        // Both argument checks fail the same way, so they share one handler.
        let args;
        let relPath;
        try {
            args = parseArgs(rawArgs);
            relPath = requireString(args, "path");
        }
        catch (err) {
            return { ok: false, name: this.descriptor.name, error: err.message };
        }

        const { content } = args;
        if (typeof content !== "string") {
            return {
                ok: false,
                name: this.descriptor.name,
                error: '"content" must be a string'
            };
        }

        // Reject content larger than four times the result-size limit.
        const payloadBytes = Buffer.byteLength(content, "utf8");
        const ceiling = ctx.maxResultBytes * 4;
        if (payloadBytes > ceiling) {
            return {
                ok: false,
                name: this.descriptor.name,
                error: `"content" exceeds per-invocation ceiling (${payloadBytes} bytes).`
            };
        }

        // The target may not exist yet, hence `allowMissing`.
        let abs;
        try {
            abs = await ctx.sandbox.resolve(relPath, { allowMissing: true });
        }
        catch (err) {
            // Only sandbox-escape failures are reported as a denied path.
            const escaped = err instanceof SandboxEscapeError;
            return {
                ok: false,
                name: this.descriptor.name,
                error: err.message,
                details: escaped ? { deniedPath: relPath } : undefined
            };
        }

        try {
            const parentDir = path.dirname(abs);
            await fs.mkdir(parentDir, { recursive: true });
            await fs.writeFile(abs, content, "utf8");
        }
        catch (err) {
            return {
                ok: false,
                name: this.descriptor.name,
                error: `write failed: ${err.message}`,
                details: { path: relPath }
            };
        }

        return {
            ok: true,
            name: this.descriptor.name,
            content: truncatePayload(`wrote ${payloadBytes} byte(s) to ${relPath}`, ctx.maxResultBytes),
            details: { path: relPath, bytes: payloadBytes }
        };
    }
};
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -268,6 +268,24 @@ export interface EvalConfig {
|
|
|
268
268
|
* `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
|
|
269
269
|
*/
|
|
270
270
|
tokenPricing?: Record<string, TokenPricing>;
|
|
271
|
+
/**
|
|
272
|
+
* Maximum assistant turns (tool_calls → tool result cycles) allowed by
|
|
273
|
+
* the Tier B with-tools agent. Defaults to 8 when unset. Runs that
|
|
274
|
+
* exceed the cap fail with a `MaxTurnsExceededError` and surface as a
|
|
275
|
+
* workflow verifier result.
|
|
276
|
+
*/
|
|
277
|
+
toolMaxTurns?: number;
|
|
278
|
+
/**
|
|
279
|
+
* Per-invocation ceiling on the byte size of tool call arguments. Defends against
|
|
280
|
+
* runaway writes. Defaults to 64 KiB.
|
|
281
|
+
*/
|
|
282
|
+
toolMaxArgumentsBytes?: number;
|
|
283
|
+
/**
|
|
284
|
+
* Per-invocation ceiling on tool call result bytes returned to the
|
|
285
|
+
* model. Defaults to 32 KiB; longer results are truncated with a
|
|
286
|
+
* marker so the model sees the cutoff.
|
|
287
|
+
*/
|
|
288
|
+
toolMaxResultBytes?: number;
|
|
271
289
|
}
|
|
272
290
|
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
273
291
|
export interface TokenPricing {
|
|
@@ -381,3 +399,20 @@ export interface JudgeInvocation {
|
|
|
381
399
|
usageUsd: number;
|
|
382
400
|
durationMs: number;
|
|
383
401
|
}
|
|
402
|
+
/**
|
|
403
|
+
* Tool-use summary produced by the Tier B with-tools agent. Captured so
|
|
404
|
+
* the runner can surface per-case tool metrics in the markdown report
|
|
405
|
+
* (number of calls, depth, error rate, denied paths).
|
|
406
|
+
*/
|
|
407
|
+
export interface ToolUseSummary {
|
|
408
|
+
/** Turns consumed before the agent produced a terminal assistant message. */
|
|
409
|
+
turns: number;
|
|
410
|
+
/** Total successful tool invocations across all turns. NOTE(review): confirm whether failed invocations are excluded here or counted in both `calls` and `errors`. */
|
|
411
|
+
calls: number;
|
|
412
|
+
/** Tool invocations that returned an error (bad args, denied path, etc.). */
|
|
413
|
+
errors: number;
|
|
414
|
+
/** Paths the sandbox refused to resolve (escape attempts, missing files). NOTE(review): the write_file tool reports a `deniedPath` only for sandbox-escape errors; confirm how missing files end up in this list. */
|
|
415
|
+
deniedPaths: string[];
|
|
416
|
+
/** Per-tool call counts, keyed by tool name (e.g. "write_file"). */
|
|
417
|
+
byTool: Record<string, number>;
|
|
418
|
+
}
|