cclaw-cli 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +2 -1
- package/dist/eval/agents/with-tools.d.ts +31 -0
- package/dist/eval/agents/with-tools.js +255 -0
- package/dist/eval/config-loader.js +34 -2
- package/dist/eval/llm-client.d.ts +10 -0
- package/dist/eval/llm-client.js +10 -1
- package/dist/eval/report.js +19 -0
- package/dist/eval/runner.js +50 -2
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +35 -0
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -59,7 +59,7 @@ Commands:
|
|
|
59
59
|
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
60
60
|
--schema-only Run only structural verifiers (default).
|
|
61
61
|
--rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
|
|
62
|
-
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A
|
|
62
|
+
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B runs the sandbox tool-using agent (read_file/write_file/glob/grep).
|
|
63
63
|
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
64
|
--json Emit machine-readable JSON on stdout.
|
|
65
65
|
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
@@ -79,6 +79,7 @@ Examples:
|
|
|
79
79
|
cclaw eval --dry-run
|
|
80
80
|
cclaw eval --stage=brainstorm --schema-only
|
|
81
81
|
cclaw eval --judge --tier=A --stage=brainstorm
|
|
82
|
+
cclaw eval --judge --tier=B --stage=spec
|
|
82
83
|
|
|
83
84
|
Docs: https://github.com/zuevrs/cclaw
|
|
84
85
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
|
|
2
|
+
import { createSandbox } from "../sandbox.js";
|
|
3
|
+
import type { SandboxTool } from "../tools/index.js";
|
|
4
|
+
import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
|
|
5
|
+
/**
 * Error thrown by the Tier B agent when its turn budget runs out before the
 * model produces a reply without tool calls.
 */
export declare class MaxTurnsExceededError extends Error {
|
|
6
|
+
/** The turn budget that was exhausted. */
readonly turns: number;
|
|
7
|
+
constructor(turns: number);
|
|
8
|
+
}
|
|
9
|
+
/** Arguments accepted by `runWithTools`. */
export interface WithToolsInput {
|
|
10
|
+
/** The eval case to run; its stage, prompt and context files drive the run. */
caseEntry: EvalCase;
|
|
11
|
+
/** Subset of the resolved eval config read by the agent (model, temperature, timeout, pricing, tool-loop limits). */
config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes">;
|
|
12
|
+
/** Project root handed to the skill loader and the sandbox factory. */
projectRoot: string;
|
|
13
|
+
/** LLM client whose `chat` method is called once per turn. */
client: EvalLlmClient;
|
|
14
|
+
/** Tools exposed to the model; the built-in tool set is used when omitted. */
tools?: SandboxTool[];
|
|
15
|
+
/** Override for the SKILL.md loader (test hook). */
|
|
16
|
+
loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
|
|
17
|
+
/** Override for the sandbox factory (test hook). */
|
|
18
|
+
createSandboxFn?: typeof createSandbox;
|
|
19
|
+
}
|
|
20
|
+
/** Result returned by `runWithTools` for one eval case. */
export interface WithToolsOutput {
|
|
21
|
+
/** Trimmed artifact text: a staged `artifact.*` file when one exists in the sandbox, otherwise the final assistant message. */
artifact: string;
|
|
22
|
+
/** Token usage summed over every chat call made during the loop. */
usage: ChatUsage;
|
|
23
|
+
/** Cost derived from `usage` and the configured token pricing. */
usageUsd: number;
|
|
24
|
+
/** Model name reported by the last chat response. */
model: string;
|
|
25
|
+
/** Sum of the `attempts` reported by each chat response. */
attempts: number;
|
|
26
|
+
/** Milliseconds elapsed from just before the first chat call until the result was assembled. */
durationMs: number;
|
|
27
|
+
/** Tool-use counters collected while the loop ran. */
toolUse: ToolUseSummary;
|
|
28
|
+
/** System prompt sent to the model (the loaded stage skill). */
systemPrompt: string;
|
|
29
|
+
/** User prompt sent to the model (task plus tool and sandbox hints). */
userPrompt: string;
|
|
30
|
+
}
|
|
31
|
+
/**
 * Run the Tier B tool-using agent loop for one eval case and return the
 * produced artifact together with usage, cost and tool-use accounting.
 *
 * @throws MaxTurnsExceededError when the turn budget is exhausted before the
 *   model replies without tool calls.
 */
export declare function runWithTools(input: WithToolsInput): Promise<WithToolsOutput>;
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tier B with-tools agent.
|
|
3
|
+
*
|
|
4
|
+
* Multi-turn loop with OpenAI-style function-calling over a set of
|
|
5
|
+
* sandbox-confined tools. The AUT is given:
|
|
6
|
+
*
|
|
7
|
+
* - System prompt = stage SKILL.md (same contract as Tier A so the
|
|
8
|
+
* single-shot baseline is comparable).
|
|
9
|
+
* - User prompt = task description + a short "tools available" hint
|
|
10
|
+
* that names the sandbox root and the four built-in tools.
|
|
11
|
+
* - Tools = `read_file`, `write_file`, `glob`, `grep` (see
|
|
12
|
+
* `src/eval/tools/`).
|
|
13
|
+
*
|
|
14
|
+
* The loop runs up to `config.toolMaxTurns` turns (default 8). Each
|
|
15
|
+
* turn:
|
|
16
|
+
*
|
|
17
|
+
* 1. Send the current transcript to the model with tools enabled.
|
|
18
|
+
* 2. Commit token usage against the wrapped client (cost guard sees
|
|
19
|
+
* every call).
|
|
20
|
+
* 3. If the model returned tool_calls, execute each sandbox tool and
|
|
21
|
+
* append a `role: "tool"` message with the JSON-serialized result.
|
|
22
|
+
* 4. If the model returned no tool_calls (a terminal assistant reply),
|
|
23
|
+
* treat that as the artifact and exit.
|
|
24
|
+
*
|
|
25
|
+
* When the turn budget is exhausted without a terminal stop, the agent
|
|
26
|
+
* throws `MaxTurnsExceededError`. The runner surfaces the error as a
|
|
27
|
+
* failed workflow verifier so the case counts as a regression.
|
|
28
|
+
*
|
|
29
|
+
* Artifact resolution: the final assistant content is the artifact. If
|
|
30
|
+
* the model used `write_file` to stage the artifact at
|
|
31
|
+
* `artifact.md` (or another root-level `artifact.*` file), we prefer that file — it
|
|
32
|
+
* mirrors the Tier C workflow where writes are the deliverable. The
|
|
33
|
+
* fallback is the terminal assistant message so prompts that don't
|
|
34
|
+
* call write_file still produce something judgable.
|
|
35
|
+
*/
|
|
36
|
+
import fs from "node:fs/promises";
|
|
37
|
+
import path from "node:path";
|
|
38
|
+
import { computeUsageUsd } from "../cost-guard.js";
|
|
39
|
+
import { createSandbox } from "../sandbox.js";
|
|
40
|
+
import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
|
|
41
|
+
import { loadStageSkill } from "./single-shot.js";
|
|
42
|
+
/**
 * Error thrown by the Tier B agent when its turn budget runs out before the
 * model produces a reply without tool calls.
 */
export class MaxTurnsExceededError extends Error {
|
|
43
|
+
/** The turn budget that was exhausted. */
turns;
|
|
44
|
+
constructor(turns) {
|
|
45
|
+
super(`Tier B agent exceeded the ${turns}-turn budget without a terminal stop.`);
|
|
46
|
+
// Set an explicit name so the error is identifiable without `instanceof`.
this.name = "MaxTurnsExceededError";
|
|
47
|
+
this.turns = turns;
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/** Turn budget used when `config.toolMaxTurns` is unset or invalid. */
const DEFAULT_MAX_TURNS = 8;
|
|
51
|
+
/** Per-call argument size cap (64 KiB) used when `config.toolMaxArgumentsBytes` is unset or invalid. */
const DEFAULT_MAX_ARG_BYTES = 64 * 1024;
|
|
52
|
+
/** Tool result size cap (32 KiB) used when `config.toolMaxResultBytes` is unset or invalid. */
const DEFAULT_MAX_RESULT_BYTES = 32 * 1024;
|
|
53
|
+
/** Root-level file names checked, in this order, for an artifact staged in the sandbox. */
const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
|
|
54
|
+
/**
 * Run the Tier B tool-using agent for a single eval case.
 *
 * Loads the stage skill as the system prompt, creates a per-case sandbox and
 * then loops up to the configured turn budget. Each turn sends the transcript
 * with tools enabled, accumulates token usage, and either executes the
 * requested tool calls or, when the reply carries no tool calls, resolves
 * the artifact and returns.
 *
 * @param input - Case, config subset, project root, LLM client and optional
 *   test hooks (`tools`, `loadSkill`, `createSandboxFn`).
 * @returns The artifact plus usage, cost, timing, prompts and tool-use summary.
 * @throws MaxTurnsExceededError when the turn budget is exhausted before the
 *   model replies without tool calls. Errors raised by the client, the skill
 *   loader or a tool propagate unchanged.
 */
export async function runWithTools(input) {
    const { caseEntry, config, projectRoot, client } = input;
    const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
    const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
    const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
    const systemPrompt = await loader(caseEntry.stage);
    const tools = input.tools ?? BUILTIN_TOOLS;
    const toolMap = toolsByName(tools);
    const toolsBody = toolsForRequest(tools);
    const sandboxFactory = input.createSandboxFn ?? createSandbox;
    const sandbox = await sandboxFactory({
        projectRoot,
        ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
    });
    // Everything after sandbox creation runs inside the try so that a failure
    // while building the prompt or transcript still disposes the sandbox.
    try {
        const toolUse = {
            turns: 0,
            calls: 0,
            errors: 0,
            deniedPaths: [],
            byTool: {}
        };
        const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
        let lastModel = config.model;
        let totalAttempts = 0;
        const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
        const messages = [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt }
        ];
        // Record a call that is refused before reaching a tool: it counts as
        // an error and towards the per-tool tally, and the model is told why.
        const rejectCall = (call, error) => {
            toolUse.errors += 1;
            bumpToolCount(toolUse, call.name);
            messages.push(toolResponseMessage(call.id, { ok: false, name: call.name, error }, maxResultBytes));
        };
        const started = Date.now();
        for (let turn = 0; turn < maxTurns; turn += 1) {
            toolUse.turns = turn + 1;
            const response = await client.chat({
                model: config.model,
                messages,
                temperature: config.agentTemperature ?? 0.2,
                timeoutMs: config.timeoutMs,
                tools: toolsBody,
                toolChoice: "auto"
            });
            usage.promptTokens += response.usage.promptTokens;
            usage.completionTokens += response.usage.completionTokens;
            usage.totalTokens += response.usage.totalTokens;
            lastModel = response.model;
            totalAttempts += response.attempts;
            const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
            // The assistant turn is recorded before its tool responses so the
            // transcript keeps the assistant -> tool ordering.
            messages.push(rememberAssistant(response.content, response.toolCalls));
            if (!hasToolCalls) {
                // A reply without tool calls ends the loop.
                const artifact = await resolveArtifact(sandbox, response.content);
                return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
            }
            for (const call of response.toolCalls) {
                const tool = toolMap.get(call.name);
                const argBytes = Buffer.byteLength(call.arguments ?? "", "utf8");
                if (argBytes > maxArgBytes) {
                    rejectCall(call, `arguments payload exceeds ${maxArgBytes} bytes`);
                    continue;
                }
                if (!tool) {
                    rejectCall(call, `unknown tool "${call.name}"`);
                    continue;
                }
                bumpToolCount(toolUse, call.name);
                const result = await tool.invoke(call.arguments ?? "", {
                    sandbox,
                    maxResultBytes
                });
                if (!result.ok) {
                    toolUse.errors += 1;
                    const denied = result.details && typeof result.details.deniedPath === "string"
                        ? result.details.deniedPath
                        : undefined;
                    // Each denied path is listed once, however often it is retried.
                    if (denied && !toolUse.deniedPaths.includes(denied)) {
                        toolUse.deniedPaths.push(denied);
                    }
                }
                else {
                    // `calls` counts successful invocations only.
                    toolUse.calls += 1;
                }
                messages.push(toolResponseMessage(call.id, result, maxResultBytes));
            }
        }
        throw new MaxTurnsExceededError(maxTurns);
    }
    finally {
        await sandbox.dispose();
    }
}

/**
 * Assemble the agent result: price the accumulated usage, trim the artifact
 * and stamp the elapsed time since `started`.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
    const usageUsd = computeUsageUsd(model, usage, {
        tokenPricing: config.tokenPricing
    });
    return {
        artifact: artifact.trim(),
        usage,
        usageUsd,
        model,
        attempts,
        durationMs: Date.now() - started,
        toolUse,
        systemPrompt,
        userPrompt
    };
}

/**
 * Build the assistant transcript entry for a model reply. `toolCalls` is only
 * attached when the reply actually requested tools.
 */
function rememberAssistant(content, toolCalls) {
    const base = { role: "assistant", content };
    if (toolCalls && toolCalls.length > 0)
        base.toolCalls = toolCalls;
    return base;
}

/**
 * Build the `role: "tool"` transcript entry answering one tool call.
 *
 * @param callId - Id of the tool call being answered.
 * @param result - Tool result; `ok` selects the success or error payload shape.
 * @param maxBytes - Size limit passed to `truncatePayload` for the serialized
 *   JSON payload. Defaults to `DEFAULT_MAX_RESULT_BYTES`, matching the
 *   previously hard-coded 32 KiB.
 */
function toolResponseMessage(callId, result, maxBytes = DEFAULT_MAX_RESULT_BYTES) {
    const details = result.details ?? {};
    const payload = result.ok
        ? { ok: true, content: result.content, details }
        : { ok: false, error: result.error, details };
    return {
        role: "tool",
        content: truncatePayload(JSON.stringify(payload), maxBytes),
        toolCallId: callId,
        name: result.name
    };
}
|
|
189
|
+
function bumpToolCount(summary, name) {
|
|
190
|
+
summary.byTool[name] = (summary.byTool[name] ?? 0) + 1;
|
|
191
|
+
}
|
|
192
|
+
/**
 * Normalize a numeric limit to a positive integer.
 *
 * @param value - Configured limit; may be `undefined` or invalid.
 * @param fallback - Value returned when `value` is unusable.
 * @returns `Math.floor(value)` when that is at least 1, otherwise `fallback`.
 *   Fractions in (0, 1) also fall back, because flooring them would give a
 *   zero budget.
 */
function clampPositive(value, fallback) {
    if (value === undefined || !Number.isFinite(value))
        return fallback;
    const floored = Math.floor(value);
    return floored >= 1 ? floored : fallback;
}
|
|
199
|
+
/**
 * Build the user prompt for the tool-using agent: case header, sandbox root,
 * tool list, seeded context files, the task text and the output instructions.
 *
 * @param caseEntry - Eval case; reads `stage`, `id`, `inputPrompt` and the
 *   optional `contextFiles`.
 * @param sandbox - Sandbox whose `root` is shown to the model.
 * @param tools - Tools to advertise; each contributes one
 *   `- name: description` line.
 * @returns The prompt as a single newline-joined string.
 */
function buildUserPrompt(caseEntry, sandbox, tools) {
    // An empty tool list gets a placeholder, mirroring the "(no files seeded)"
    // handling below, so the "Tools:" header is never left dangling.
    const toolList = tools.length > 0
        ? tools.map((t) => `- ${t.descriptor.name}: ${t.descriptor.description}`)
        : ["(no tools available)"];
    const files = caseEntry.contextFiles ?? [];
    const contextLines = files.length > 0
        ? files.map((f) => `- ${f}`).join("\n")
        : "(no files seeded)";
    const lines = [
        `Stage: ${caseEntry.stage}`,
        `Case id: ${caseEntry.id}`,
        ``,
        `Sandbox root: ${sandbox.root}`,
        `You may call the following tools to read or modify files inside the sandbox.`,
        `All paths are relative to the sandbox root.`,
        ``,
        `Tools:`,
        ...toolList,
        ``,
        `Seeded context files (available under the sandbox root):`,
        contextLines,
        ``,
        `Task:`,
        caseEntry.inputPrompt.trim(),
        ``,
        `When you are done, reply with the artifact as the final assistant message.`,
        `Output the artifact directly (markdown with optional YAML frontmatter).`,
        `Do not wrap in code fences, do not add commentary before or after.`,
        `You may optionally write the artifact to \`artifact.md\` in the sandbox; ` +
            `if you do, the last written \`artifact.md\` is preferred over the chat reply.`
    ];
    return lines.join("\n");
}
|
|
230
|
+
/**
 * Pick the artifact text for a finished run.
 *
 * Resolution order:
 *   1. The first entry of `ARTIFACT_CANDIDATES` that resolves inside the
 *      sandbox and is a regular file.
 *   2. Otherwise, a regular file in the sandbox root whose name starts with
 *      `artifact.` (case-insensitive). When several match, the
 *      lexicographically first name is used so the choice does not depend on
 *      directory listing order.
 *   3. Otherwise `fallback`.
 *
 * Filesystem and sandbox errors are treated as "not found" at every step;
 * this lookup is best-effort and never throws.
 *
 * @param sandbox - Sandbox providing `root` and `resolve`.
 * @param fallback - Text returned when no staged artifact file is found
 *   (the caller passes the final assistant message).
 * @returns The untrimmed artifact text.
 */
async function resolveArtifact(sandbox, fallback) {
    for (const candidate of ARTIFACT_CANDIDATES) {
        try {
            const abs = await sandbox.resolve(candidate);
            const stat = await fs.stat(abs);
            if (stat.isFile()) {
                return await fs.readFile(abs, "utf8");
            }
        }
        catch {
            // Missing file or rejected path: try the next candidate.
            continue;
        }
    }
    try {
        const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
        const names = entries
            .filter((entry) => entry.isFile() && /^artifact\./i.test(entry.name))
            .map((entry) => entry.name)
            .sort();
        if (names.length > 0) {
            return await fs.readFile(path.join(sandbox.root, names[0]), "utf8");
        }
    }
    catch {
        // fall through to fallback
    }
    return fallback;
}
|
|
@@ -32,7 +32,10 @@ const NUMERIC_ENVS = new Set([
|
|
|
32
32
|
"CCLAW_EVAL_MAX_RETRIES",
|
|
33
33
|
"CCLAW_EVAL_JUDGE_SAMPLES",
|
|
34
34
|
"CCLAW_EVAL_JUDGE_TEMPERATURE",
|
|
35
|
-
"CCLAW_EVAL_AGENT_TEMPERATURE"
|
|
35
|
+
"CCLAW_EVAL_AGENT_TEMPERATURE",
|
|
36
|
+
"CCLAW_EVAL_TOOL_MAX_TURNS",
|
|
37
|
+
"CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
|
|
38
|
+
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
|
|
36
39
|
]);
|
|
37
40
|
function evalConfigError(configFilePath, reason) {
|
|
38
41
|
return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
|
|
@@ -152,6 +155,17 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
152
155
|
}
|
|
153
156
|
out.tokenPricing = pricing;
|
|
154
157
|
}
|
|
158
|
+
const assignPositiveInt = (key, value, label) => {
|
|
159
|
+
if (value === undefined)
|
|
160
|
+
return;
|
|
161
|
+
if (!Number.isInteger(value) || value < 1) {
|
|
162
|
+
throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
|
|
163
|
+
}
|
|
164
|
+
out[key] = value;
|
|
165
|
+
};
|
|
166
|
+
assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
|
|
167
|
+
assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
|
|
168
|
+
assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
|
|
155
169
|
if (raw.regression !== undefined) {
|
|
156
170
|
if (!isRecord(raw.regression)) {
|
|
157
171
|
throw evalConfigError(configFilePath, `"regression" must be a mapping`);
|
|
@@ -186,7 +200,10 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
186
200
|
"judgeSamples",
|
|
187
201
|
"judgeTemperature",
|
|
188
202
|
"agentTemperature",
|
|
189
|
-
"tokenPricing"
|
|
203
|
+
"tokenPricing",
|
|
204
|
+
"toolMaxTurns",
|
|
205
|
+
"toolMaxArgumentsBytes",
|
|
206
|
+
"toolMaxResultBytes"
|
|
190
207
|
]);
|
|
191
208
|
const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
|
|
192
209
|
if (unknown.length > 0) {
|
|
@@ -296,6 +313,21 @@ function applyEnvOverrides(base, env) {
|
|
|
296
313
|
patched.agentTemperature = value;
|
|
297
314
|
overridden = true;
|
|
298
315
|
}
|
|
316
|
+
const readPositiveInt = (name, key, label) => {
|
|
317
|
+
const raw = read(name);
|
|
318
|
+
if (!raw)
|
|
319
|
+
return;
|
|
320
|
+
const value = parseNumericEnv(name, raw);
|
|
321
|
+
if (!Number.isInteger(value) || value < 1) {
|
|
322
|
+
throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
|
|
323
|
+
}
|
|
324
|
+
patched[key] = value;
|
|
325
|
+
overridden = true;
|
|
326
|
+
void label;
|
|
327
|
+
};
|
|
328
|
+
readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
|
|
329
|
+
readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
|
|
330
|
+
readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
|
|
299
331
|
const apiKey = read("CCLAW_EVAL_API_KEY");
|
|
300
332
|
return { patched, overridden, apiKey };
|
|
301
333
|
}
|
|
@@ -5,6 +5,16 @@ export interface ChatMessage {
|
|
|
5
5
|
content: string;
|
|
6
6
|
name?: string;
|
|
7
7
|
toolCallId?: string;
|
|
8
|
+
/**
|
|
9
|
+
* OpenAI-style tool calls carried on a preceding assistant message.
|
|
10
|
+
* Populated by the Tier B loop so the wire transcript stays
|
|
11
|
+
* consistent (assistant message → tool responses).
|
|
12
|
+
*/
|
|
13
|
+
toolCalls?: Array<{
|
|
14
|
+
id: string;
|
|
15
|
+
name: string;
|
|
16
|
+
arguments: string;
|
|
17
|
+
}>;
|
|
8
18
|
}
|
|
9
19
|
export interface ChatRequest {
|
|
10
20
|
model: string;
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -149,7 +149,16 @@ function buildBody(request) {
|
|
|
149
149
|
role: m.role,
|
|
150
150
|
content: m.content,
|
|
151
151
|
...(m.name !== undefined ? { name: m.name } : {}),
|
|
152
|
-
...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {})
|
|
152
|
+
...(m.toolCallId !== undefined ? { tool_call_id: m.toolCallId } : {}),
|
|
153
|
+
...(m.toolCalls && m.toolCalls.length > 0
|
|
154
|
+
? {
|
|
155
|
+
tool_calls: m.toolCalls.map((call) => ({
|
|
156
|
+
id: call.id,
|
|
157
|
+
type: "function",
|
|
158
|
+
function: { name: call.name, arguments: call.arguments }
|
|
159
|
+
}))
|
|
160
|
+
}
|
|
161
|
+
: {})
|
|
153
162
|
}))
|
|
154
163
|
};
|
|
155
164
|
if (request.maxTokens !== undefined)
|
package/dist/eval/report.js
CHANGED
|
@@ -75,6 +75,25 @@ export function formatMarkdownReport(report) {
|
|
|
75
75
|
lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
|
|
76
76
|
}
|
|
77
77
|
lines.push(``);
|
|
78
|
+
const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
|
|
79
|
+
if (toolCases.length > 0) {
|
|
80
|
+
lines.push(`## Tool use`);
|
|
81
|
+
lines.push(``);
|
|
82
|
+
lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
|
|
83
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
84
|
+
for (const item of toolCases) {
|
|
85
|
+
const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
|
|
86
|
+
const toolUse = verifier?.details?.toolUse;
|
|
87
|
+
if (!toolUse)
|
|
88
|
+
continue;
|
|
89
|
+
const byTool = Object.entries(toolUse.byTool)
|
|
90
|
+
.map(([name, count]) => `${name}=${count}`)
|
|
91
|
+
.join(", ");
|
|
92
|
+
const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
|
|
93
|
+
lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
|
|
94
|
+
}
|
|
95
|
+
lines.push(``);
|
|
96
|
+
}
|
|
78
97
|
const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
|
|
79
98
|
if (judgeCases.length > 0) {
|
|
80
99
|
lines.push(`## Judge scores`);
|
package/dist/eval/runner.js
CHANGED
|
@@ -2,6 +2,7 @@ import { randomUUID } from "node:crypto";
|
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
4
|
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
|
+
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
5
6
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
6
7
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
7
8
|
import { loadEvalConfig } from "./config-loader.js";
|
|
@@ -39,8 +40,9 @@ function resolveRunFlags(options) {
|
|
|
39
40
|
const rulesRequested = options.rules === true;
|
|
40
41
|
const schemaOnly = options.schemaOnly === true;
|
|
41
42
|
const judgeRequested = options.judge === true;
|
|
43
|
+
const tier = options.tier ?? "A";
|
|
42
44
|
const runJudge = judgeRequested && !schemaOnly;
|
|
43
|
-
const runAgent = runJudge && (
|
|
45
|
+
const runAgent = runJudge && (tier === "A" || tier === "B");
|
|
44
46
|
return {
|
|
45
47
|
runStructural: true,
|
|
46
48
|
runRules: rulesRequested && !schemaOnly,
|
|
@@ -94,7 +96,7 @@ async function runCase(ctx) {
|
|
|
94
96
|
const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
|
|
95
97
|
let artifact;
|
|
96
98
|
if (needsArtifact) {
|
|
97
|
-
if (flags.runAgent && judgeRequested && client) {
|
|
99
|
+
if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
|
|
98
100
|
try {
|
|
99
101
|
const produced = await runSingleShot({
|
|
100
102
|
caseEntry,
|
|
@@ -133,6 +135,52 @@ async function runCase(ctx) {
|
|
|
133
135
|
});
|
|
134
136
|
}
|
|
135
137
|
}
|
|
138
|
+
else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
|
|
139
|
+
try {
|
|
140
|
+
const produced = await runWithTools({
|
|
141
|
+
caseEntry,
|
|
142
|
+
config,
|
|
143
|
+
projectRoot,
|
|
144
|
+
client
|
|
145
|
+
});
|
|
146
|
+
artifact = produced.artifact;
|
|
147
|
+
caseCostUsd += produced.usageUsd;
|
|
148
|
+
verifierResults.push({
|
|
149
|
+
kind: "workflow",
|
|
150
|
+
id: "agent:with-tools",
|
|
151
|
+
ok: true,
|
|
152
|
+
score: 1,
|
|
153
|
+
message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
|
|
154
|
+
`${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
|
|
155
|
+
`(${produced.toolUse.calls} tool call(s))`,
|
|
156
|
+
details: {
|
|
157
|
+
model: produced.model,
|
|
158
|
+
tokensIn: produced.usage.promptTokens,
|
|
159
|
+
tokensOut: produced.usage.completionTokens,
|
|
160
|
+
usageUsd: produced.usageUsd,
|
|
161
|
+
attempts: produced.attempts,
|
|
162
|
+
toolUse: produced.toolUse
|
|
163
|
+
}
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
catch (err) {
|
|
167
|
+
if (err instanceof DailyCostCapExceededError)
|
|
168
|
+
throw err;
|
|
169
|
+
const retryable = err instanceof EvalLlmError ? err.retryable : false;
|
|
170
|
+
const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
|
|
171
|
+
verifierResults.push({
|
|
172
|
+
kind: "workflow",
|
|
173
|
+
id: "agent:with-tools",
|
|
174
|
+
ok: false,
|
|
175
|
+
score: 0,
|
|
176
|
+
message: err instanceof Error ? err.message : String(err),
|
|
177
|
+
details: {
|
|
178
|
+
retryable,
|
|
179
|
+
...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
|
|
180
|
+
}
|
|
181
|
+
});
|
|
182
|
+
}
|
|
183
|
+
}
|
|
136
184
|
else {
|
|
137
185
|
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
138
186
|
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Error thrown by `Sandbox.resolve` when a requested path resolves outside
 * the sandbox boundary.
 */
export declare class SandboxEscapeError extends Error {
|
|
2
|
+
/** NOTE(review): presumably the path exactly as the caller requested it — confirm against the implementation. */
readonly requestedPath: string;
|
|
3
|
+
constructor(requestedPath: string, reason: string);
|
|
4
|
+
}
|
|
5
|
+
/** Options accepted by `createSandbox`. */
export interface SandboxOptions {
|
|
6
|
+
/** Project root that `contextFiles` are resolved against. */
|
|
7
|
+
projectRoot: string;
|
|
8
|
+
/** Case-relative paths to copy into the sandbox before the agent starts. */
|
|
9
|
+
contextFiles?: string[];
|
|
10
|
+
/**
|
|
11
|
+
* Base directory that will host the per-case tmpdir. Defaults to
|
|
12
|
+
* `os.tmpdir()`. Tests inject a repo-local path so CI leaves no
|
|
13
|
+
* traces in `/tmp` when assertions fail.
|
|
14
|
+
*/
|
|
15
|
+
baseDir?: string;
|
|
16
|
+
/** Override the per-case suffix. Primarily for deterministic tests. */
|
|
17
|
+
idOverride?: string;
|
|
18
|
+
}
|
|
19
|
+
/** Handle to a per-case sandbox directory, as returned by `createSandbox`. */
export interface Sandbox {
|
|
20
|
+
/** Absolute path to the sandbox root directory. */
|
|
21
|
+
root: string;
|
|
22
|
+
/**
|
|
23
|
+
* Resolve `requested` relative to the sandbox root and return the
|
|
24
|
+
* absolute, realpath'd filesystem path. Throws
|
|
25
|
+
* `SandboxEscapeError` when the resolution crosses the boundary.
|
|
26
|
+
*
|
|
27
|
+
* `allowMissing: true` lets callers pre-resolve a destination for a
|
|
28
|
+
* write where the final component doesn't exist yet — the parent
|
|
29
|
+
* directory is realpath'd to still catch symlink escapes.
|
|
30
|
+
*/
|
|
31
|
+
resolve(requested: string, options?: {
|
|
32
|
+
allowMissing?: boolean;
|
|
33
|
+
}): Promise<string>;
|
|
34
|
+
/** Remove the sandbox directory. Idempotent. */
|
|
35
|
+
dispose(): Promise<void>;
|
|
36
|
+
}
|
|
37
|
+
/**
 * Create and prep a fresh sandbox. Callers own cleanup via `dispose()`.
 *
 * @param options - Project root, optional context files to seed, and optional
 *   overrides for the base directory and per-case suffix.
 * @returns A promise for the sandbox handle.
 */
|
|
38
|
+
export declare function createSandbox(options: SandboxOptions): Promise<Sandbox>;
|