cclaw-cli 0.25.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts CHANGED
@@ -26,6 +26,10 @@ interface ParsedArgs {
26
26
  evalNoWrite?: boolean;
27
27
  evalUpdateBaseline?: boolean;
28
28
  evalConfirm?: boolean;
29
+ /** Optional subcommand after `eval`. Currently only `diff` is supported. */
30
+ evalSubcommand?: "diff";
31
+ /** Positional arguments for eval subcommands (e.g. `diff <old> <new>`). */
32
+ evalArgs?: string[];
29
33
  showHelp?: boolean;
30
34
  showVersion?: boolean;
31
35
  }
package/dist/cli.js CHANGED
@@ -16,6 +16,7 @@ import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
16
  import { runEval } from "./eval/runner.js";
17
17
  import { writeBaselinesFromReport } from "./eval/baseline.js";
18
18
  import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
19
+ import { formatDiffMarkdown, runEvalDiff } from "./eval/diff.js";
19
20
  import { EVAL_TIERS } from "./eval/types.js";
20
21
  import { FLOW_STAGES } from "./types.js";
21
22
  const INSTALLER_COMMANDS = [
@@ -55,16 +56,22 @@ Commands:
55
56
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
56
57
  --retro-reason=<t> Reason for bypassing retro gate.
57
58
  eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
58
- Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
59
- --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
59
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}) for Tier A/B.
60
+ --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=multi-stage workflow).
60
61
  --schema-only Run only structural verifiers (default).
61
62
  --rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
62
- --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A also runs the single-shot agent-under-test.
63
+ --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B/C the sandbox tool-using agent (read_file/write_file/glob/grep).
63
64
  --dry-run Validate config + corpus, print summary, do not execute.
64
65
  --json Emit machine-readable JSON on stdout.
65
66
  --no-write Skip writing the report to .cclaw/evals/reports/.
66
67
  --update-baseline Overwrite baselines from the current run (requires --confirm).
67
68
  --confirm Acknowledge --update-baseline (prevents accidental resets).
69
+
70
+ Subcommands:
71
+ diff <old> <new> Compare two reports under .cclaw/evals/reports/.
72
+ Each argument is a cclawVersion (e.g. 0.26.0), a filename,
73
+ or the literal "latest". Exit code 1 when the diff shows a
74
+ regression. Accepts --json to emit machine-readable output.
68
75
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
69
76
  uninstall Remove .cclaw runtime and the generated harness shim files.
70
77
 
@@ -79,6 +86,9 @@ Examples:
79
86
  cclaw eval --dry-run
80
87
  cclaw eval --stage=brainstorm --schema-only
81
88
  cclaw eval --judge --tier=A --stage=brainstorm
89
+ cclaw eval --judge --tier=B --stage=spec
90
+ cclaw eval --tier=C --judge
91
+ cclaw eval diff 0.26.0 latest
82
92
 
83
93
  Docs: https://github.com/zuevrs/cclaw
84
94
  Issues: https://github.com/zuevrs/cclaw/issues
@@ -372,10 +382,42 @@ function parseArgs(argv) {
372
382
  if (versionFlag) {
373
383
  parsed.showVersion = true;
374
384
  }
375
- const [commandRaw, ...flags] = argv.filter((arg) => arg !== "--help" && arg !== "-h" && arg !== "--version" && arg !== "-v");
385
+ const filteredArgv = argv.filter((arg) => arg !== "--help" && arg !== "-h" && arg !== "--version" && arg !== "-v");
386
+ const [commandRaw, ...rest] = filteredArgv;
376
387
  parsed.command = INSTALLER_COMMANDS.includes(commandRaw)
377
388
  ? commandRaw
378
389
  : undefined;
390
+ // For `eval`, the next non-flag argument is an optional subcommand. Any
391
+ // subsequent non-flag tokens are captured as evalArgs (consumed by the
392
+ // subcommand handler). This preserves backwards compat: callers that run
393
+ // `cclaw eval --dry-run` see no subcommand and no positional args.
394
+ let flags = rest;
395
+ if (parsed.command === "eval") {
396
+ const evalArgs = [];
397
+ const remainder = [];
398
+ let sawSubcommand = false;
399
+ for (const token of rest) {
400
+ if (token.startsWith("--")) {
401
+ remainder.push(token);
402
+ continue;
403
+ }
404
+ if (!sawSubcommand) {
405
+ if (token === "diff") {
406
+ parsed.evalSubcommand = "diff";
407
+ sawSubcommand = true;
408
+ }
409
+ else {
410
+ // Treat unknown positional as an eval arg for forward compat.
411
+ evalArgs.push(token);
412
+ }
413
+ continue;
414
+ }
415
+ evalArgs.push(token);
416
+ }
417
+ if (evalArgs.length > 0)
418
+ parsed.evalArgs = evalArgs;
419
+ flags = remainder;
420
+ }
379
421
  for (const flag of flags) {
380
422
  if (flag.startsWith("--harnesses=")) {
381
423
  parsed.harnesses = parseHarnesses(flag.replace("--harnesses=", ""));
@@ -566,6 +608,33 @@ async function runCommand(parsed, ctx) {
566
608
  info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
567
609
  return 0;
568
610
  }
611
+ if (command === "eval" && parsed.evalSubcommand === "diff") {
612
+ const args = parsed.evalArgs ?? [];
613
+ if (args.length !== 2) {
614
+ error(ctx, `\`cclaw eval diff\` requires two arguments: <old> <new>. ` +
615
+ `Example: cclaw eval diff 0.26.0 latest`);
616
+ return 1;
617
+ }
618
+ const [oldSel, newSel] = args;
619
+ try {
620
+ const diff = await runEvalDiff({
621
+ projectRoot: ctx.cwd,
622
+ old: oldSel,
623
+ new: newSel
624
+ });
625
+ if (parsed.evalJson === true) {
626
+ ctx.stdout.write(`${JSON.stringify(diff, null, 2)}\n`);
627
+ }
628
+ else {
629
+ ctx.stdout.write(formatDiffMarkdown(diff));
630
+ }
631
+ return diff.regressed ? 1 : 0;
632
+ }
633
+ catch (err) {
634
+ error(ctx, err instanceof Error ? err.message : String(err));
635
+ return 1;
636
+ }
637
+ }
569
638
  if (command === "eval") {
570
639
  const result = await runEval({
571
640
  projectRoot: ctx.cwd,
@@ -592,6 +661,12 @@ async function runCommand(parsed, ctx) {
592
661
  for (const [stage, count] of Object.entries(result.corpus.byStage)) {
593
662
  ctx.stdout.write(` - ${stage}: ${count}\n`);
594
663
  }
664
+ if (result.workflowCorpus.total > 0 || result.plannedTier === "C") {
665
+ ctx.stdout.write(` workflow corpus: ${result.workflowCorpus.total} case(s)\n`);
666
+ for (const wf of result.workflowCorpus.cases) {
667
+ ctx.stdout.write(` - ${wf.id}: ${wf.stages.join(" → ")}\n`);
668
+ }
669
+ }
595
670
  ctx.stdout.write(` verifiers available:\n`);
596
671
  for (const [key, value] of Object.entries(result.verifiersAvailable)) {
597
672
  ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
@@ -0,0 +1,44 @@
1
+ import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
2
+ import { createSandbox, type Sandbox } from "../sandbox.js";
3
+ import type { SandboxTool } from "../tools/index.js";
4
+ import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
5
+ export declare class MaxTurnsExceededError extends Error {
6
+ readonly turns: number;
7
+ constructor(turns: number);
8
+ }
9
+ export interface WithToolsInput {
10
+ caseEntry: EvalCase;
11
+ config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes">;
12
+ projectRoot: string;
13
+ client: EvalLlmClient;
14
+ tools?: SandboxTool[];
15
+ /** Override for the SKILL.md loader (test hook). */
16
+ loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
17
+ /** Override for the sandbox factory (test hook). */
18
+ createSandboxFn?: typeof createSandbox;
19
+ /**
20
+ * Reuse an externally-managed sandbox instead of creating + disposing a
21
+ * per-call one. Tier C workflow orchestration uses this so every stage
22
+ * shares the same sandbox and earlier artifacts remain visible. When
23
+ * set, the caller is responsible for `dispose()`.
24
+ */
25
+ externalSandbox?: Sandbox;
26
+ /**
27
+ * Optional override of the default user prompt prefix. Tier C uses this
28
+ * to tell the model which stage it is on and where the prior artifacts
29
+ * are located.
30
+ */
31
+ promptPreamble?: string;
32
+ }
33
+ export interface WithToolsOutput {
34
+ artifact: string;
35
+ usage: ChatUsage;
36
+ usageUsd: number;
37
+ model: string;
38
+ attempts: number;
39
+ durationMs: number;
40
+ toolUse: ToolUseSummary;
41
+ systemPrompt: string;
42
+ userPrompt: string;
43
+ }
44
+ export declare function runWithTools(input: WithToolsInput): Promise<WithToolsOutput>;
@@ -0,0 +1,261 @@
1
+ /**
2
+ * Tier B with-tools agent.
3
+ *
4
+ * Multi-turn loop with OpenAI-style function-calling over a set of
5
+ * sandbox-confined tools. The AUT is given:
6
+ *
7
+ * - System prompt = stage SKILL.md (same contract as Tier A so the
8
+ * single-shot baseline is comparable).
9
+ * - User prompt = task description + a short "tools available" hint
10
+ * that names the sandbox root and the four built-in tools.
11
+ * - Tools = `read_file`, `write_file`, `glob`, `grep` (see
12
+ * `src/eval/tools/`).
13
+ *
14
+ * The loop runs up to `config.toolMaxTurns` turns (default 8). Each
15
+ * turn:
16
+ *
17
+ * 1. Send the current transcript to the model with tools enabled.
18
+ * 2. Commit token usage against the wrapped client (cost guard sees
19
+ * every call).
20
+ * 3. If the model returned tool_calls, execute each sandbox tool and
21
+ * append a `role: "tool"` message with the JSON-serialized result.
22
+ * 4. If the model returned no tool_calls, treat the assistant content as
23
+ * the terminal reply, resolve the artifact from it, and exit.
24
+ *
25
+ * When the turn budget is exhausted without a terminal stop, the agent
26
+ * throws `MaxTurnsExceededError`. The runner surfaces the error as a
27
+ * failed workflow verifier so the case counts as a regression.
28
+ *
29
+ * Artifact resolution: if the model used `write_file` to stage the
30
+ * artifact at `artifact.md`, `artifact.txt`, `ARTIFACT.md`, or any other
31
+ * `artifact.*` file in the sandbox root, we prefer that file — it
32
+ * mirrors the Tier C workflow where writes are the deliverable. The
33
+ * fallback is the terminal assistant message so prompts that don't
34
+ * call write_file still produce something judgable.
35
+ */
36
+ import fs from "node:fs/promises";
37
+ import path from "node:path";
38
+ import { computeUsageUsd } from "../cost-guard.js";
39
+ import { createSandbox } from "../sandbox.js";
40
+ import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
41
+ import { loadStageSkill } from "./single-shot.js";
42
/**
 * Raised by the with-tools agent loop when every allotted turn ended in
 * tool calls and the model never produced a terminal reply.
 */
export class MaxTurnsExceededError extends Error {
    /** Turn budget that was exhausted. */
    turns;
    name = "MaxTurnsExceededError";
    /**
     * @param {number} turns - The turn budget that ran out.
     */
    constructor(turns) {
        const budget = turns;
        super(`Tier B agent exceeded the ${budget}-turn budget without a terminal stop.`);
        this.turns = budget;
    }
}
50
+ const DEFAULT_MAX_TURNS = 8;
51
+ const DEFAULT_MAX_ARG_BYTES = 64 * 1024;
52
+ const DEFAULT_MAX_RESULT_BYTES = 32 * 1024;
53
+ const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
54
/**
 * Run the multi-turn, tool-using agent for a single eval case.
 *
 * Each turn sends the transcript to `client.chat` with the tool
 * descriptors enabled. A response without tool calls ends the loop and
 * its content (or a staged `artifact.*` file) becomes the artifact. A
 * response with tool calls has every call executed against the sandbox
 * and the results appended as `role: "tool"` messages.
 *
 * @param {object} input - See `WithToolsInput`. `externalSandbox`, when
 *   set, is used as-is and is NOT disposed here.
 * @returns {Promise<object>} The `WithToolsOutput` built by `finalize`.
 * @throws {MaxTurnsExceededError} When `toolMaxTurns` turns all ended in
 *   tool calls.
 */
export async function runWithTools(input) {
    const { caseEntry, config, projectRoot, client } = input;
    const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
    const maxArgBytes = clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES);
    const maxResultBytes = clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES);
    // The transcript cap wraps the tool result in a JSON envelope, so it is
    // only ever raised above the default (to honour a larger configured
    // result budget), never lowered below it.
    const messageCapBytes = Math.max(maxResultBytes, DEFAULT_MAX_RESULT_BYTES);
    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
    const systemPrompt = await loader(caseEntry.stage);
    const tools = input.tools ?? BUILTIN_TOOLS;
    const toolMap = toolsByName(tools);
    const toolsBody = toolsForRequest(tools);
    const sandboxFactory = input.createSandboxFn ?? createSandbox;
    const externalSandbox = input.externalSandbox;
    const sandbox = externalSandbox ??
        (await sandboxFactory({
            projectRoot,
            ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
        }));
    const toolUse = {
        turns: 0,
        calls: 0,
        errors: 0,
        deniedPaths: [],
        byTool: {}
    };
    const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
    let lastModel = config.model;
    let totalAttempts = 0;
    try {
        // Built inside the try so that a malformed case (which makes the
        // prompt builder throw) still disposes a sandbox created above.
        const userPrompt = buildUserPrompt(caseEntry, sandbox, tools, input.promptPreamble);
        const messages = [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt }
        ];
        const started = Date.now();
        for (let turn = 0; turn < maxTurns; turn += 1) {
            toolUse.turns = turn + 1;
            const response = await client.chat({
                model: config.model,
                messages,
                temperature: config.agentTemperature ?? 0.2,
                timeoutMs: config.timeoutMs,
                tools: toolsBody,
                toolChoice: "auto"
            });
            usage.promptTokens += response.usage.promptTokens;
            usage.completionTokens += response.usage.completionTokens;
            usage.totalTokens += response.usage.totalTokens;
            lastModel = response.model;
            totalAttempts += response.attempts;
            const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
            messages.push(rememberAssistant(response.content, response.toolCalls));
            if (!hasToolCalls) {
                // No tool calls: this is the terminal reply.
                const artifact = await resolveArtifact(sandbox, response.content);
                return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
            }
            for (const call of response.toolCalls) {
                // Every requested call is counted per tool name, including
                // rejected ones.
                bumpToolCount(toolUse, call.name);
                const rawArgs = call.arguments ?? "";
                const tool = toolMap.get(call.name);
                let result;
                if (Buffer.byteLength(rawArgs, "utf8") > maxArgBytes) {
                    // Size check comes first so oversized payloads are never
                    // handed to a tool.
                    result = {
                        ok: false,
                        name: call.name,
                        error: `arguments payload exceeds ${maxArgBytes} bytes`
                    };
                }
                else if (!tool) {
                    result = {
                        ok: false,
                        name: call.name,
                        error: `unknown tool "${call.name}"`
                    };
                }
                else {
                    result = await tool.invoke(rawArgs, {
                        sandbox,
                        maxResultBytes
                    });
                }
                if (result.ok) {
                    // `calls` counts successful invocations only.
                    toolUse.calls += 1;
                }
                else {
                    toolUse.errors += 1;
                    const denied = result.details && typeof result.details.deniedPath === "string"
                        ? result.details.deniedPath
                        : undefined;
                    if (denied && !toolUse.deniedPaths.includes(denied)) {
                        toolUse.deniedPaths.push(denied);
                    }
                }
                messages.push(toolResponseMessage(call.id, result, messageCapBytes));
            }
        }
        throw new MaxTurnsExceededError(maxTurns);
    }
    finally {
        // Only dispose sandboxes this call created; an external sandbox
        // belongs to the caller.
        if (!externalSandbox) {
            await sandbox.dispose();
        }
    }
}
159
/**
 * Assemble the result object for a finished agent run.
 *
 * Prices the accumulated token usage with `computeUsageUsd`, trims the
 * artifact text, and measures elapsed wall time from `started` (a
 * `Date.now()` timestamp) to now.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
    const { tokenPricing } = config;
    const usageUsd = computeUsageUsd(model, usage, { tokenPricing });
    const trimmedArtifact = artifact.trim();
    const durationMs = Date.now() - started;
    return {
        artifact: trimmedArtifact,
        usage,
        usageUsd,
        model,
        attempts,
        durationMs,
        toolUse,
        systemPrompt,
        userPrompt
    };
}
175
/**
 * Build the transcript entry for an assistant turn. `toolCalls` is
 * attached only when the list is non-empty.
 */
function rememberAssistant(content, toolCalls) {
    const requestedTools = Boolean(toolCalls) && toolCalls.length > 0;
    return requestedTools
        ? { role: "assistant", content, toolCalls }
        : { role: "assistant", content };
}
181
/**
 * Build the `role: "tool"` transcript message for one tool call.
 *
 * The result is reduced to a JSON envelope (`ok` plus `content` or
 * `error`, and `details`, defaulting to `{}`) and passed through
 * `truncatePayload` so a single result cannot grow the transcript
 * without bound.
 *
 * @param {string} callId - Id of the tool call being answered.
 * @param {object} result - Tool result; `ok`, `name`, and either
 *   `content` or `error`, plus optional `details`.
 * @param {number} [maxBytes] - Cap handed to `truncatePayload`. Defaults
 *   to `DEFAULT_MAX_RESULT_BYTES`, the value that used to be hard-coded
 *   here, so existing two-argument callers behave as before.
 * @returns {object} The tool message to append to the transcript.
 */
function toolResponseMessage(callId, result, maxBytes = DEFAULT_MAX_RESULT_BYTES) {
    const details = result.details ?? {};
    const payload = result.ok
        ? { ok: true, content: result.content, details }
        : { ok: false, error: result.error, details };
    return {
        role: "tool",
        content: truncatePayload(JSON.stringify(payload), maxBytes),
        toolCallId: callId,
        name: result.name
    };
}
192
/**
 * Increment the per-tool counter for `name` in `summary.byTool`,
 * starting from zero the first time a name is seen.
 */
function bumpToolCount(summary, name) {
    const counts = summary.byTool;
    const previous = counts[name] ?? 0;
    counts[name] = previous + 1;
}
195
/**
 * Normalize a numeric limit to a positive integer.
 *
 * @param {number | undefined} value - Candidate limit.
 * @param {number} fallback - Returned when `value` is missing, not a
 *   finite number, or does not floor to at least 1.
 * @returns {number} `Math.floor(value)` when that is >= 1, else `fallback`.
 */
function clampPositive(value, fallback) {
    if (!Number.isFinite(value)) {
        return fallback;
    }
    // Floor before the range check: a fraction such as 0.5 is "> 0" but
    // floors to 0, which would yield a zero-sized budget.
    const whole = Math.floor(value);
    return whole >= 1 ? whole : fallback;
}
202
/**
 * Compose the user message for the tool-using agent.
 *
 * Layout: optional trimmed preamble, stage and case id, sandbox root and
 * tool list, seeded context files, the trimmed task prompt, and the
 * output instructions. Sections are joined with newlines.
 */
function buildUserPrompt(caseEntry, sandbox, tools, preamble) {
    const toolBullets = tools.map((tool) => `- ${tool.descriptor.name}: ${tool.descriptor.description}`);
    const seeded = caseEntry.contextFiles ?? [];
    const seededBlock = seeded.length === 0
        ? "(no files seeded)"
        : seeded.map((file) => `- ${file}`).join("\n");
    const sections = [];
    // A blank or missing preamble contributes nothing, not even a spacer.
    if (preamble && preamble.trim().length > 0) {
        sections.push(preamble.trim());
        sections.push("");
    }
    sections.push(`Stage: ${caseEntry.stage}`);
    sections.push(`Case id: ${caseEntry.id}`);
    sections.push("");
    sections.push(`Sandbox root: ${sandbox.root}`);
    sections.push("You may call the following tools to read or modify files inside the sandbox.");
    sections.push("All paths are relative to the sandbox root.");
    sections.push("");
    sections.push("Tools:");
    for (const bullet of toolBullets) {
        sections.push(bullet);
    }
    sections.push("");
    sections.push("Seeded context files (available under the sandbox root):");
    sections.push(seededBlock);
    sections.push("");
    sections.push("Task:");
    sections.push(caseEntry.inputPrompt.trim());
    sections.push("");
    sections.push("When you are done, reply with the artifact as the final assistant message.");
    sections.push("Output the artifact directly (markdown with optional YAML frontmatter).");
    sections.push("Do not wrap in code fences, do not add commentary before or after.");
    sections.push("You may optionally write the artifact to `artifact.md` in the sandbox; " +
        "if you do, the last written `artifact.md` is preferred over the chat reply.");
    return sections.join("\n");
}
236
/**
 * Decide what text counts as the run's artifact.
 *
 * Resolution order:
 *   1. The first entry of `ARTIFACT_CANDIDATES` that resolves inside the
 *      sandbox to a regular file.
 *   2. Otherwise, a regular file directly under `sandbox.root` whose name
 *      starts with `artifact.` (case-insensitive). When several match,
 *      the lexicographically smallest name wins so the choice does not
 *      depend on directory listing order.
 *   3. Otherwise `fallback` (the terminal assistant message).
 *
 * @param {object} sandbox - Provides `resolve(relPath)` and `root`.
 * @param {string | null | undefined} fallback - Terminal assistant content.
 * @returns {Promise<string>} Artifact text; `""` when nothing was staged
 *   and `fallback` is null/undefined, so callers can always `.trim()` it.
 */
async function resolveArtifact(sandbox, fallback) {
    for (const candidate of ARTIFACT_CANDIDATES) {
        try {
            const abs = await sandbox.resolve(candidate);
            const stat = await fs.stat(abs);
            if (stat.isFile()) {
                return await fs.readFile(abs, "utf8");
            }
        }
        catch {
            // Candidate is missing or unreadable; try the next one.
            continue;
        }
    }
    try {
        const root = sandbox.root;
        const entries = await fs.readdir(root, { withFileTypes: true });
        const names = entries
            .filter((entry) => entry.isFile() && /^artifact\./i.test(entry.name))
            .map((entry) => entry.name)
            .sort();
        if (names.length > 0) {
            return await fs.readFile(path.join(root, names[0]), "utf8");
        }
    }
    catch {
        // Best-effort scan: fall through to the chat reply.
    }
    return fallback ?? "";
}
@@ -0,0 +1,24 @@
1
+ import type { EvalLlmClient } from "../llm-client.js";
2
+ import { createSandbox } from "../sandbox.js";
3
+ import type { SandboxTool } from "../tools/index.js";
4
+ import type { ResolvedEvalConfig, WorkflowCase, WorkflowStageName, WorkflowStageResult } from "../types.js";
5
+ export interface WorkflowInput {
6
+ workflow: WorkflowCase;
7
+ config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes" | "workflowMaxTotalTurns">;
8
+ projectRoot: string;
9
+ client: EvalLlmClient;
10
+ tools?: SandboxTool[];
11
+ /** Override for the SKILL.md loader (test hook). */
12
+ loadSkill?: (stage: WorkflowStageName) => Promise<string>;
13
+ /** Override for the sandbox factory (test hook). */
14
+ createSandboxFn?: typeof createSandbox;
15
+ }
16
+ export interface WorkflowOutput {
17
+ caseId: string;
18
+ stages: WorkflowStageResult[];
19
+ /** Map from stage name to produced artifact (also persisted in sandbox). */
20
+ artifacts: Map<WorkflowStageName, string>;
21
+ totalUsageUsd: number;
22
+ totalDurationMs: number;
23
+ }
24
+ export declare function runWorkflow(input: WorkflowInput): Promise<WorkflowOutput>;
@@ -0,0 +1,133 @@
1
+ /**
2
+ * Tier C workflow agent.
3
+ *
4
+ * Runs the Tier B with-tools loop once per stage in a workflow case,
5
+ * sharing a single sandbox across stages so every new stage can read
6
+ * the earlier artifacts the model produced. The shape of the run is:
7
+ *
8
+ * 1. Create one sandbox seeded with `contextFiles`.
9
+ * 2. For each stage in `workflow.stages`:
10
+ * a. Delete any leftover `artifact.md` so the resolver doesn't
11
+ * accidentally pick the previous stage's output.
12
+ * b. Invoke `runWithTools({ externalSandbox: sandbox, promptPreamble })`.
13
+ * The preamble tells the model which stage it is on and lists the
14
+ * `stages/*.md` files available for reading.
15
+ * c. Persist the returned artifact to `stages/<stage>.md` inside the
16
+ * sandbox (deterministic, regardless of whether the model wrote
17
+ * `artifact.md` itself).
18
+ * d. Record `WorkflowStageResult` with usage, duration, and tool use.
19
+ * 3. Dispose the sandbox in a `finally` so temp directories never leak.
20
+ *
21
+ * Errors bubble up from `runWithTools`:
22
+ * - `MaxTurnsExceededError` stops the workflow at the current stage.
23
+ * - `DailyCostCapExceededError` (surfaced by the cost-guard wrapper in
24
+ * the runner) aborts immediately.
25
+ * - Generic `EvalLlmError` subclasses propagate as-is so the runner can
26
+ * record a workflow-level verifier failure.
27
+ */
28
+ import fs from "node:fs/promises";
29
+ import path from "node:path";
30
+ import { createSandbox } from "../sandbox.js";
31
+ import { loadStageSkill } from "./single-shot.js";
32
+ import { runWithTools } from "./with-tools.js";
33
+ const STAGES_SUBDIR = "stages";
34
+ const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
35
/**
 * Run every stage of a workflow case through `runWithTools`, sharing one
 * sandbox so later stages can read earlier artifacts.
 *
 * Per stage: clear leftover artifact files, build the stage preamble from
 * the stages completed so far, run the agent against the shared sandbox,
 * persist the artifact under the stages subdirectory, and record the
 * stage result. The sandbox is disposed in `finally`, including when a
 * stage throws (in which case the error propagates to the caller).
 *
 * NOTE(review): `config.workflowMaxTotalTurns` is not read anywhere in
 * this function; confirm whether the cross-stage turn budget is enforced
 * by the caller.
 *
 * @param {object} input - See `WorkflowInput`.
 * @returns {Promise<object>} See `WorkflowOutput`.
 */
export async function runWorkflow(input) {
    const { workflow, config, projectRoot, client } = input;
    const makeSandbox = input.createSandboxFn ?? createSandbox;
    const sandboxOptions = { projectRoot };
    if (workflow.contextFiles) {
        sandboxOptions.contextFiles = workflow.contextFiles;
    }
    const sandbox = await makeSandbox(sandboxOptions);
    const completed = [];
    const artifacts = new Map();
    let usdSum = 0;
    let elapsedMs = 0;
    try {
        const stagesDir = await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true });
        await fs.mkdir(stagesDir, { recursive: true });
        for (const step of workflow.stages) {
            // Drop the previous stage's staged artifact so the resolver
            // cannot mistake it for this stage's output.
            await clearArtifactFile(sandbox);
            const finishedNames = completed.map((done) => done.stage);
            const promptPreamble = buildStagePreamble(workflow, step.name, finishedNames);
            const caseEntry = {
                id: `${workflow.id}/${step.name}`,
                stage: step.name,
                inputPrompt: step.inputPrompt
            };
            if (workflow.contextFiles) {
                caseEntry.contextFiles = workflow.contextFiles;
            }
            const request = {
                caseEntry,
                config,
                projectRoot,
                client,
                loadSkill: input.loadSkill || ((stage) => loadStageSkill(projectRoot, stage)),
                externalSandbox: sandbox,
                promptPreamble
            };
            if (input.tools) {
                request.tools = input.tools;
            }
            const outcome = await runWithTools(request);
            await persistStageArtifact(sandbox, step.name, outcome.artifact);
            artifacts.set(step.name, outcome.artifact);
            completed.push({
                stage: step.name,
                artifact: outcome.artifact,
                durationMs: outcome.durationMs,
                usageUsd: outcome.usageUsd,
                toolUse: outcome.toolUse,
                attempts: outcome.attempts,
                model: outcome.model,
                promptTokens: outcome.usage.promptTokens,
                completionTokens: outcome.usage.completionTokens
            });
            usdSum += outcome.usageUsd;
            elapsedMs += outcome.durationMs;
        }
        return {
            caseId: workflow.id,
            stages: completed,
            artifacts,
            // Round to 6 decimals to hide float accumulation noise.
            totalUsageUsd: Number(usdSum.toFixed(6)),
            totalDurationMs: elapsedMs
        };
    }
    finally {
        await sandbox.dispose();
    }
}
101
/**
 * Remove staged artifact files from the sandbox root before a stage runs.
 *
 * Two passes, both best-effort:
 *   1. The fixed `ARTIFACT_CANDIDATES` names.
 *   2. Any other root entry whose name starts with `artifact.`
 *      (case-insensitive). The with-tools artifact resolver accepts such
 *      files as a fallback, so leaving one behind would let a previous
 *      stage's output be picked up as the next stage's artifact.
 *
 * @param {object} sandbox - Provides `resolve(relPath)` and `root`.
 * @returns {Promise<void>}
 */
async function clearArtifactFile(sandbox) {
    const removeIfPresent = async (name) => {
        try {
            const abs = await sandbox.resolve(name);
            await fs.rm(abs, { force: true });
        }
        catch {
            // Missing entry, failed resolve, or a non-removable entry
            // (e.g. a directory): nothing to clear, safe to ignore.
        }
    };
    for (const candidate of ARTIFACT_CANDIDATES) {
        await removeIfPresent(candidate);
    }
    let rootNames = [];
    try {
        rootNames = await fs.readdir(sandbox.root);
    }
    catch {
        // Root could not be listed; the fixed candidates were already handled.
        return;
    }
    for (const name of rootNames) {
        if (/^artifact\./i.test(name)) {
            await removeIfPresent(name);
        }
    }
}
113
/**
 * Write a stage's artifact to `<STAGES_SUBDIR>/<stage>.md` inside the
 * sandbox, creating the parent directory if needed and ensuring the file
 * ends with a newline.
 */
async function persistStageArtifact(sandbox, stage, artifact) {
    const target = await sandbox.resolve(`${STAGES_SUBDIR}/${stage}.md`, { allowMissing: true });
    await fs.mkdir(path.dirname(target), { recursive: true });
    const body = artifact.endsWith("\n") ? artifact : `${artifact}\n`;
    await fs.writeFile(target, body, "utf8");
}
119
/**
 * Build the text prepended to a stage's user prompt: which stage of which
 * workflow is running, the optional case description, and either a
 * first-stage note or the list of earlier stage artifact paths to read.
 */
function buildStagePreamble(workflow, current, priorStages) {
    const intro = `You are running stage "${current}" of the Tier C workflow "${workflow.id}".`;
    const description = workflow.description
        ? [`Case description: ${workflow.description}`]
        : [];
    let history;
    if (priorStages.length === 0) {
        history = ["This is the first stage. Any context_files have been seeded into the sandbox root."];
    }
    else {
        const artifactPaths = priorStages.map((name) => ` - ${STAGES_SUBDIR}/${name}.md`);
        history = [
            "Earlier stage artifacts are available via read_file:",
            ...artifactPaths,
            "Read the prior artifacts before drafting your output so decisions and ids carry through."
        ];
    }
    return [intro, ...description, ...history].join("\n");
}
@@ -32,7 +32,11 @@ const NUMERIC_ENVS = new Set([
32
32
  "CCLAW_EVAL_MAX_RETRIES",
33
33
  "CCLAW_EVAL_JUDGE_SAMPLES",
34
34
  "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
- "CCLAW_EVAL_AGENT_TEMPERATURE"
35
+ "CCLAW_EVAL_AGENT_TEMPERATURE",
36
+ "CCLAW_EVAL_TOOL_MAX_TURNS",
37
+ "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
+ "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
39
+ "CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
36
40
  ]);
37
41
  function evalConfigError(configFilePath, reason) {
38
42
  return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -152,6 +156,18 @@ function validateFileConfig(raw, configFilePath) {
152
156
  }
153
157
  out.tokenPricing = pricing;
154
158
  }
159
+ const assignPositiveInt = (key, value, label) => {
160
+ if (value === undefined)
161
+ return;
162
+ if (!Number.isInteger(value) || value < 1) {
163
+ throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
164
+ }
165
+ out[key] = value;
166
+ };
167
+ assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
168
+ assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
169
+ assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
170
+ assignPositiveInt("workflowMaxTotalTurns", raw.workflowMaxTotalTurns, "workflowMaxTotalTurns");
155
171
  if (raw.regression !== undefined) {
156
172
  if (!isRecord(raw.regression)) {
157
173
  throw evalConfigError(configFilePath, `"regression" must be a mapping`);
@@ -186,7 +202,11 @@ function validateFileConfig(raw, configFilePath) {
186
202
  "judgeSamples",
187
203
  "judgeTemperature",
188
204
  "agentTemperature",
189
- "tokenPricing"
205
+ "tokenPricing",
206
+ "toolMaxTurns",
207
+ "toolMaxArgumentsBytes",
208
+ "toolMaxResultBytes",
209
+ "workflowMaxTotalTurns"
190
210
  ]);
191
211
  const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
192
212
  if (unknown.length > 0) {
@@ -296,6 +316,22 @@ function applyEnvOverrides(base, env) {
296
316
  patched.agentTemperature = value;
297
317
  overridden = true;
298
318
  }
319
+ const readPositiveInt = (name, key, label) => {
320
+ const raw = read(name);
321
+ if (!raw)
322
+ return;
323
+ const value = parseNumericEnv(name, raw);
324
+ if (!Number.isInteger(value) || value < 1) {
325
+ throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
326
+ }
327
+ patched[key] = value;
328
+ overridden = true;
329
+ void label;
330
+ };
331
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
332
+ readPositiveInt("CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns", "workflowMaxTotalTurns");
333
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
334
+ readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
299
335
  const apiKey = read("CCLAW_EVAL_API_KEY");
300
336
  return { patched, overridden, apiKey };
301
337
  }