cclaw-cli 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +78 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +17 -11
- package/dist/eval/agents/workflow.d.ts +24 -0
- package/dist/eval/agents/workflow.js +133 -0
- package/dist/eval/config-loader.js +6 -2
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/report.js +35 -0
- package/dist/eval/runner.d.ts +10 -1
- package/dist/eval/runner.js +236 -19
- package/dist/eval/types.d.ts +117 -1
- package/dist/eval/types.js +21 -1
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -26,6 +26,10 @@ interface ParsedArgs {
|
|
|
26
26
|
evalNoWrite?: boolean;
|
|
27
27
|
evalUpdateBaseline?: boolean;
|
|
28
28
|
evalConfirm?: boolean;
|
|
29
|
+
/** Optional subcommand after `eval`. Currently only `diff` is supported. */
|
|
30
|
+
evalSubcommand?: "diff";
|
|
31
|
+
/** Positional arguments for eval subcommands (e.g. `diff <old> <new>`). */
|
|
32
|
+
evalArgs?: string[];
|
|
29
33
|
showHelp?: boolean;
|
|
30
34
|
showVersion?: boolean;
|
|
31
35
|
}
|
package/dist/cli.js
CHANGED
|
@@ -16,6 +16,7 @@ import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
|
16
16
|
import { runEval } from "./eval/runner.js";
|
|
17
17
|
import { writeBaselinesFromReport } from "./eval/baseline.js";
|
|
18
18
|
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
19
|
+
import { formatDiffMarkdown, runEvalDiff } from "./eval/diff.js";
|
|
19
20
|
import { EVAL_TIERS } from "./eval/types.js";
|
|
20
21
|
import { FLOW_STAGES } from "./types.js";
|
|
21
22
|
const INSTALLER_COMMANDS = [
|
|
@@ -55,16 +56,22 @@ Commands:
|
|
|
55
56
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
56
57
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
57
58
|
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
|
|
58
|
-
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
59
|
-
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
59
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}) for Tier A/B.
|
|
60
|
+
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=multi-stage workflow).
|
|
60
61
|
--schema-only Run only structural verifiers (default).
|
|
61
62
|
--rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
|
|
62
|
-
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B
|
|
63
|
+
--judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A runs the single-shot agent, Tier B/C the sandbox tool-using agent (read_file/write_file/glob/grep).
|
|
63
64
|
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
65
|
--json Emit machine-readable JSON on stdout.
|
|
65
66
|
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
66
67
|
--update-baseline Overwrite baselines from the current run (requires --confirm).
|
|
67
68
|
--confirm Acknowledge --update-baseline (prevents accidental resets).
|
|
69
|
+
|
|
70
|
+
Subcommands:
|
|
71
|
+
diff <old> <new> Compare two reports under .cclaw/evals/reports/.
|
|
72
|
+
Each argument is a cclawVersion (e.g. 0.26.0), a filename,
|
|
73
|
+
or the literal "latest". Exit code 1 when the diff shows a
|
|
74
|
+
regression. Accepts --json to emit machine-readable output.
|
|
68
75
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
69
76
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
70
77
|
|
|
@@ -80,6 +87,8 @@ Examples:
|
|
|
80
87
|
cclaw eval --stage=brainstorm --schema-only
|
|
81
88
|
cclaw eval --judge --tier=A --stage=brainstorm
|
|
82
89
|
cclaw eval --judge --tier=B --stage=spec
|
|
90
|
+
cclaw eval --tier=C --judge
|
|
91
|
+
cclaw eval diff 0.26.0 latest
|
|
83
92
|
|
|
84
93
|
Docs: https://github.com/zuevrs/cclaw
|
|
85
94
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -373,10 +382,42 @@ function parseArgs(argv) {
|
|
|
373
382
|
if (versionFlag) {
|
|
374
383
|
parsed.showVersion = true;
|
|
375
384
|
}
|
|
376
|
-
const
|
|
385
|
+
const filteredArgv = argv.filter((arg) => arg !== "--help" && arg !== "-h" && arg !== "--version" && arg !== "-v");
|
|
386
|
+
const [commandRaw, ...rest] = filteredArgv;
|
|
377
387
|
parsed.command = INSTALLER_COMMANDS.includes(commandRaw)
|
|
378
388
|
? commandRaw
|
|
379
389
|
: undefined;
|
|
390
|
+
// For `eval`, the next non-flag argument is an optional subcommand. Any
|
|
391
|
+
// subsequent non-flag tokens are captured as evalArgs (consumed by the
|
|
392
|
+
// subcommand handler). This preserves backwards compat: callers that run
|
|
393
|
+
// `cclaw eval --dry-run` see no subcommand and no positional args.
|
|
394
|
+
let flags = rest;
|
|
395
|
+
if (parsed.command === "eval") {
|
|
396
|
+
const evalArgs = [];
|
|
397
|
+
const remainder = [];
|
|
398
|
+
let sawSubcommand = false;
|
|
399
|
+
for (const token of rest) {
|
|
400
|
+
if (token.startsWith("--")) {
|
|
401
|
+
remainder.push(token);
|
|
402
|
+
continue;
|
|
403
|
+
}
|
|
404
|
+
if (!sawSubcommand) {
|
|
405
|
+
if (token === "diff") {
|
|
406
|
+
parsed.evalSubcommand = "diff";
|
|
407
|
+
sawSubcommand = true;
|
|
408
|
+
}
|
|
409
|
+
else {
|
|
410
|
+
// Treat unknown positional as an eval arg for forward compat.
|
|
411
|
+
evalArgs.push(token);
|
|
412
|
+
}
|
|
413
|
+
continue;
|
|
414
|
+
}
|
|
415
|
+
evalArgs.push(token);
|
|
416
|
+
}
|
|
417
|
+
if (evalArgs.length > 0)
|
|
418
|
+
parsed.evalArgs = evalArgs;
|
|
419
|
+
flags = remainder;
|
|
420
|
+
}
|
|
380
421
|
for (const flag of flags) {
|
|
381
422
|
if (flag.startsWith("--harnesses=")) {
|
|
382
423
|
parsed.harnesses = parseHarnesses(flag.replace("--harnesses=", ""));
|
|
@@ -567,6 +608,33 @@ async function runCommand(parsed, ctx) {
|
|
|
567
608
|
info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
|
|
568
609
|
return 0;
|
|
569
610
|
}
|
|
611
|
+
if (command === "eval" && parsed.evalSubcommand === "diff") {
|
|
612
|
+
const args = parsed.evalArgs ?? [];
|
|
613
|
+
if (args.length !== 2) {
|
|
614
|
+
error(ctx, `\`cclaw eval diff\` requires two arguments: <old> <new>. ` +
|
|
615
|
+
`Example: cclaw eval diff 0.26.0 latest`);
|
|
616
|
+
return 1;
|
|
617
|
+
}
|
|
618
|
+
const [oldSel, newSel] = args;
|
|
619
|
+
try {
|
|
620
|
+
const diff = await runEvalDiff({
|
|
621
|
+
projectRoot: ctx.cwd,
|
|
622
|
+
old: oldSel,
|
|
623
|
+
new: newSel
|
|
624
|
+
});
|
|
625
|
+
if (parsed.evalJson === true) {
|
|
626
|
+
ctx.stdout.write(`${JSON.stringify(diff, null, 2)}\n`);
|
|
627
|
+
}
|
|
628
|
+
else {
|
|
629
|
+
ctx.stdout.write(formatDiffMarkdown(diff));
|
|
630
|
+
}
|
|
631
|
+
return diff.regressed ? 1 : 0;
|
|
632
|
+
}
|
|
633
|
+
catch (err) {
|
|
634
|
+
error(ctx, err instanceof Error ? err.message : String(err));
|
|
635
|
+
return 1;
|
|
636
|
+
}
|
|
637
|
+
}
|
|
570
638
|
if (command === "eval") {
|
|
571
639
|
const result = await runEval({
|
|
572
640
|
projectRoot: ctx.cwd,
|
|
@@ -593,6 +661,12 @@ async function runCommand(parsed, ctx) {
|
|
|
593
661
|
for (const [stage, count] of Object.entries(result.corpus.byStage)) {
|
|
594
662
|
ctx.stdout.write(` - ${stage}: ${count}\n`);
|
|
595
663
|
}
|
|
664
|
+
if (result.workflowCorpus.total > 0 || result.plannedTier === "C") {
|
|
665
|
+
ctx.stdout.write(` workflow corpus: ${result.workflowCorpus.total} case(s)\n`);
|
|
666
|
+
for (const wf of result.workflowCorpus.cases) {
|
|
667
|
+
ctx.stdout.write(` - ${wf.id}: ${wf.stages.join(" → ")}\n`);
|
|
668
|
+
}
|
|
669
|
+
}
|
|
596
670
|
ctx.stdout.write(` verifiers available:\n`);
|
|
597
671
|
for (const [key, value] of Object.entries(result.verifiersAvailable)) {
|
|
598
672
|
ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
|
|
2
|
-
import { createSandbox } from "../sandbox.js";
|
|
2
|
+
import { createSandbox, type Sandbox } from "../sandbox.js";
|
|
3
3
|
import type { SandboxTool } from "../tools/index.js";
|
|
4
4
|
import type { EvalCase, ResolvedEvalConfig, ToolUseSummary } from "../types.js";
|
|
5
5
|
export declare class MaxTurnsExceededError extends Error {
|
|
@@ -16,6 +16,19 @@ export interface WithToolsInput {
|
|
|
16
16
|
loadSkill?: (stage: EvalCase["stage"]) => Promise<string>;
|
|
17
17
|
/** Override for the sandbox factory (test hook). */
|
|
18
18
|
createSandboxFn?: typeof createSandbox;
|
|
19
|
+
/**
|
|
20
|
+
* Reuse an externally-managed sandbox instead of creating + disposing a
|
|
21
|
+
* per-call one. Tier C workflow orchestration uses this so every stage
|
|
22
|
+
* shares the same sandbox and earlier artifacts remain visible. When
|
|
23
|
+
* set, the caller is responsible for `dispose()`.
|
|
24
|
+
*/
|
|
25
|
+
externalSandbox?: Sandbox;
|
|
26
|
+
/**
|
|
27
|
+
* Optional override of the default user prompt prefix. Tier C uses this
|
|
28
|
+
* to tell the model which stage it is on and where the prior artifacts
|
|
29
|
+
* are located.
|
|
30
|
+
*/
|
|
31
|
+
promptPreamble?: string;
|
|
19
32
|
}
|
|
20
33
|
export interface WithToolsOutput {
|
|
21
34
|
artifact: string;
|
|
@@ -62,10 +62,12 @@ export async function runWithTools(input) {
|
|
|
62
62
|
const toolMap = toolsByName(tools);
|
|
63
63
|
const toolsBody = toolsForRequest(tools);
|
|
64
64
|
const sandboxFactory = input.createSandboxFn ?? createSandbox;
|
|
65
|
-
const
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
65
|
+
const externalSandbox = input.externalSandbox;
|
|
66
|
+
const sandbox = externalSandbox ??
|
|
67
|
+
(await sandboxFactory({
|
|
68
|
+
projectRoot,
|
|
69
|
+
...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
|
|
70
|
+
}));
|
|
69
71
|
const toolUse = {
|
|
70
72
|
turns: 0,
|
|
71
73
|
calls: 0,
|
|
@@ -76,7 +78,7 @@ export async function runWithTools(input) {
|
|
|
76
78
|
const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
|
|
77
79
|
let lastModel = config.model;
|
|
78
80
|
let totalAttempts = 0;
|
|
79
|
-
const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
|
|
81
|
+
const userPrompt = buildUserPrompt(caseEntry, sandbox, tools, input.promptPreamble);
|
|
80
82
|
const messages = [
|
|
81
83
|
{ role: "system", content: systemPrompt },
|
|
82
84
|
{ role: "user", content: userPrompt }
|
|
@@ -150,7 +152,8 @@ export async function runWithTools(input) {
|
|
|
150
152
|
throw new MaxTurnsExceededError(maxTurns);
|
|
151
153
|
}
|
|
152
154
|
finally {
|
|
153
|
-
|
|
155
|
+
if (!externalSandbox)
|
|
156
|
+
await sandbox.dispose();
|
|
154
157
|
}
|
|
155
158
|
}
|
|
156
159
|
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
|
|
@@ -196,16 +199,18 @@ function clampPositive(value, fallback) {
|
|
|
196
199
|
return fallback;
|
|
197
200
|
return Math.floor(value);
|
|
198
201
|
}
|
|
199
|
-
function buildUserPrompt(caseEntry, sandbox, tools) {
|
|
202
|
+
function buildUserPrompt(caseEntry, sandbox, tools, preamble) {
|
|
200
203
|
const toolList = tools.map((t) => `- ${t.descriptor.name}: ${t.descriptor.description}`);
|
|
201
204
|
const files = caseEntry.contextFiles ?? [];
|
|
202
205
|
const contextLines = files.length > 0
|
|
203
206
|
? files.map((f) => `- ${f}`).join("\n")
|
|
204
207
|
: "(no files seeded)";
|
|
205
|
-
const lines = [
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
208
|
+
const lines = [];
|
|
209
|
+
if (preamble && preamble.trim().length > 0) {
|
|
210
|
+
lines.push(preamble.trim(), ``);
|
|
211
|
+
}
|
|
212
|
+
lines.push(`Stage: ${caseEntry.stage}`, `Case id: ${caseEntry.id}`, ``);
|
|
213
|
+
const rest = [
|
|
209
214
|
`Sandbox root: ${sandbox.root}`,
|
|
210
215
|
`You may call the following tools to read or modify files inside the sandbox.`,
|
|
211
216
|
`All paths are relative to the sandbox root.`,
|
|
@@ -225,6 +230,7 @@ function buildUserPrompt(caseEntry, sandbox, tools) {
|
|
|
225
230
|
`You may optionally write the artifact to \`artifact.md\` in the sandbox; ` +
|
|
226
231
|
`if you do, the last written \`artifact.md\` is preferred over the chat reply.`
|
|
227
232
|
];
|
|
233
|
+
lines.push(...rest);
|
|
228
234
|
return lines.join("\n");
|
|
229
235
|
}
|
|
230
236
|
async function resolveArtifact(sandbox, fallback) {
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { EvalLlmClient } from "../llm-client.js";
|
|
2
|
+
import { createSandbox } from "../sandbox.js";
|
|
3
|
+
import type { SandboxTool } from "../tools/index.js";
|
|
4
|
+
import type { ResolvedEvalConfig, WorkflowCase, WorkflowStageName, WorkflowStageResult } from "../types.js";
|
|
5
|
+
export interface WorkflowInput {
|
|
6
|
+
workflow: WorkflowCase;
|
|
7
|
+
config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes" | "workflowMaxTotalTurns">;
|
|
8
|
+
projectRoot: string;
|
|
9
|
+
client: EvalLlmClient;
|
|
10
|
+
tools?: SandboxTool[];
|
|
11
|
+
/** Override for the SKILL.md loader (test hook). */
|
|
12
|
+
loadSkill?: (stage: WorkflowStageName) => Promise<string>;
|
|
13
|
+
/** Override for the sandbox factory (test hook). */
|
|
14
|
+
createSandboxFn?: typeof createSandbox;
|
|
15
|
+
}
|
|
16
|
+
export interface WorkflowOutput {
|
|
17
|
+
caseId: string;
|
|
18
|
+
stages: WorkflowStageResult[];
|
|
19
|
+
/** Map from stage name to produced artifact (also persisted in sandbox). */
|
|
20
|
+
artifacts: Map<WorkflowStageName, string>;
|
|
21
|
+
totalUsageUsd: number;
|
|
22
|
+
totalDurationMs: number;
|
|
23
|
+
}
|
|
24
|
+
export declare function runWorkflow(input: WorkflowInput): Promise<WorkflowOutput>;
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tier C workflow agent.
|
|
3
|
+
*
|
|
4
|
+
* Runs the Tier B with-tools loop once per stage in a workflow case,
|
|
5
|
+
* sharing a single sandbox across stages so every new stage can read
|
|
6
|
+
* the earlier artifacts the model produced. The shape of the run is:
|
|
7
|
+
*
|
|
8
|
+
* 1. Create one sandbox seeded with `contextFiles`.
|
|
9
|
+
* 2. For each stage in `workflow.stages`:
|
|
10
|
+
* a. Delete any leftover `artifact.md` so the resolver doesn't
|
|
11
|
+
* accidentally pick the previous stage's output.
|
|
12
|
+
* b. Invoke `runWithTools({ externalSandbox: sandbox, promptPreamble })`.
|
|
13
|
+
* The preamble tells the model which stage it is on and lists the
|
|
14
|
+
* `stages/*.md` files available for reading.
|
|
15
|
+
* c. Persist the returned artifact to `stages/<stage>.md` inside the
|
|
16
|
+
* sandbox (deterministic, regardless of whether the model wrote
|
|
17
|
+
* `artifact.md` itself).
|
|
18
|
+
* d. Record `WorkflowStageResult` with usage, duration, and tool use.
|
|
19
|
+
* 3. Dispose the sandbox in a `finally` so temp directories never leak.
|
|
20
|
+
*
|
|
21
|
+
* Errors bubble up from `runWithTools`:
|
|
22
|
+
* - `MaxTurnsExceededError` stops the workflow at the current stage.
|
|
23
|
+
* - `DailyCostCapExceededError` (surfaced by the cost-guard wrapper in
|
|
24
|
+
* the runner) aborts immediately.
|
|
25
|
+
* - Generic `EvalLlmError` subclasses propagate as-is so the runner can
|
|
26
|
+
* record a workflow-level verifier failure.
|
|
27
|
+
*/
|
|
28
|
+
import fs from "node:fs/promises";
|
|
29
|
+
import path from "node:path";
|
|
30
|
+
import { createSandbox } from "../sandbox.js";
|
|
31
|
+
import { loadStageSkill } from "./single-shot.js";
|
|
32
|
+
import { runWithTools } from "./with-tools.js";
|
|
33
|
+
const STAGES_SUBDIR = "stages";
|
|
34
|
+
const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
|
|
35
|
+
export async function runWorkflow(input) {
|
|
36
|
+
const { workflow, config, projectRoot, client } = input;
|
|
37
|
+
const sandboxFactory = input.createSandboxFn ?? createSandbox;
|
|
38
|
+
const sandbox = await sandboxFactory({
|
|
39
|
+
projectRoot,
|
|
40
|
+
...(workflow.contextFiles ? { contextFiles: workflow.contextFiles } : {})
|
|
41
|
+
});
|
|
42
|
+
const stageResults = [];
|
|
43
|
+
const artifacts = new Map();
|
|
44
|
+
let totalUsageUsd = 0;
|
|
45
|
+
let totalDurationMs = 0;
|
|
46
|
+
try {
|
|
47
|
+
await fs.mkdir(await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true }), { recursive: true });
|
|
48
|
+
for (const step of workflow.stages) {
|
|
49
|
+
await clearArtifactFile(sandbox);
|
|
50
|
+
const priorStages = stageResults.map((r) => r.stage);
|
|
51
|
+
const preamble = buildStagePreamble(workflow, step.name, priorStages);
|
|
52
|
+
const caseEntry = {
|
|
53
|
+
id: `${workflow.id}/${step.name}`,
|
|
54
|
+
stage: step.name,
|
|
55
|
+
inputPrompt: step.inputPrompt,
|
|
56
|
+
...(workflow.contextFiles ? { contextFiles: workflow.contextFiles } : {})
|
|
57
|
+
};
|
|
58
|
+
const result = await runWithTools({
|
|
59
|
+
caseEntry,
|
|
60
|
+
config,
|
|
61
|
+
projectRoot,
|
|
62
|
+
client,
|
|
63
|
+
...(input.tools ? { tools: input.tools } : {}),
|
|
64
|
+
...(input.loadSkill
|
|
65
|
+
? { loadSkill: input.loadSkill }
|
|
66
|
+
: {
|
|
67
|
+
loadSkill: (stage) => loadStageSkill(projectRoot, stage)
|
|
68
|
+
}),
|
|
69
|
+
externalSandbox: sandbox,
|
|
70
|
+
promptPreamble: preamble
|
|
71
|
+
});
|
|
72
|
+
await persistStageArtifact(sandbox, step.name, result.artifact);
|
|
73
|
+
artifacts.set(step.name, result.artifact);
|
|
74
|
+
const stageResult = {
|
|
75
|
+
stage: step.name,
|
|
76
|
+
artifact: result.artifact,
|
|
77
|
+
durationMs: result.durationMs,
|
|
78
|
+
usageUsd: result.usageUsd,
|
|
79
|
+
toolUse: result.toolUse,
|
|
80
|
+
attempts: result.attempts,
|
|
81
|
+
model: result.model,
|
|
82
|
+
promptTokens: result.usage.promptTokens,
|
|
83
|
+
completionTokens: result.usage.completionTokens
|
|
84
|
+
};
|
|
85
|
+
stageResults.push(stageResult);
|
|
86
|
+
totalUsageUsd += result.usageUsd;
|
|
87
|
+
totalDurationMs += result.durationMs;
|
|
88
|
+
}
|
|
89
|
+
return {
|
|
90
|
+
caseId: workflow.id,
|
|
91
|
+
stages: stageResults,
|
|
92
|
+
artifacts,
|
|
93
|
+
totalUsageUsd: Number(totalUsageUsd.toFixed(6)),
|
|
94
|
+
totalDurationMs
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
finally {
|
|
98
|
+
await sandbox.dispose();
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
async function clearArtifactFile(sandbox) {
|
|
102
|
+
for (const candidate of ARTIFACT_CANDIDATES) {
|
|
103
|
+
try {
|
|
104
|
+
const abs = await sandbox.resolve(candidate);
|
|
105
|
+
await fs.rm(abs, { force: true });
|
|
106
|
+
}
|
|
107
|
+
catch {
|
|
108
|
+
// candidate did not exist — resolve threw SandboxEscapeError for
|
|
109
|
+
// missing realpath; safe to ignore.
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
async function persistStageArtifact(sandbox, stage, artifact) {
|
|
114
|
+
const rel = `${STAGES_SUBDIR}/${stage}.md`;
|
|
115
|
+
const abs = await sandbox.resolve(rel, { allowMissing: true });
|
|
116
|
+
await fs.mkdir(path.dirname(abs), { recursive: true });
|
|
117
|
+
await fs.writeFile(abs, artifact.endsWith("\n") ? artifact : `${artifact}\n`, "utf8");
|
|
118
|
+
}
|
|
119
|
+
function buildStagePreamble(workflow, current, priorStages) {
|
|
120
|
+
const lines = [];
|
|
121
|
+
lines.push(`You are running stage "${current}" of the Tier C workflow "${workflow.id}".`);
|
|
122
|
+
if (workflow.description) {
|
|
123
|
+
lines.push(`Case description: ${workflow.description}`);
|
|
124
|
+
}
|
|
125
|
+
if (priorStages.length === 0) {
|
|
126
|
+
lines.push(`This is the first stage. Any context_files have been seeded into the sandbox root.`);
|
|
127
|
+
}
|
|
128
|
+
else {
|
|
129
|
+
lines.push(`Earlier stage artifacts are available via read_file:`, ...priorStages.map((name) => ` - ${STAGES_SUBDIR}/${name}.md`), `Read the prior artifacts before drafting your output so decisions and ` +
|
|
130
|
+
`ids carry through.`);
|
|
131
|
+
}
|
|
132
|
+
return lines.join("\n");
|
|
133
|
+
}
|
|
@@ -35,7 +35,8 @@ const NUMERIC_ENVS = new Set([
|
|
|
35
35
|
"CCLAW_EVAL_AGENT_TEMPERATURE",
|
|
36
36
|
"CCLAW_EVAL_TOOL_MAX_TURNS",
|
|
37
37
|
"CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
|
|
38
|
-
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
|
|
38
|
+
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
|
|
39
|
+
"CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
|
|
39
40
|
]);
|
|
40
41
|
function evalConfigError(configFilePath, reason) {
|
|
41
42
|
return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
|
|
@@ -166,6 +167,7 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
166
167
|
assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
|
|
167
168
|
assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
|
|
168
169
|
assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
|
|
170
|
+
assignPositiveInt("workflowMaxTotalTurns", raw.workflowMaxTotalTurns, "workflowMaxTotalTurns");
|
|
169
171
|
if (raw.regression !== undefined) {
|
|
170
172
|
if (!isRecord(raw.regression)) {
|
|
171
173
|
throw evalConfigError(configFilePath, `"regression" must be a mapping`);
|
|
@@ -203,7 +205,8 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
203
205
|
"tokenPricing",
|
|
204
206
|
"toolMaxTurns",
|
|
205
207
|
"toolMaxArgumentsBytes",
|
|
206
|
-
"toolMaxResultBytes"
|
|
208
|
+
"toolMaxResultBytes",
|
|
209
|
+
"workflowMaxTotalTurns"
|
|
207
210
|
]);
|
|
208
211
|
const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
|
|
209
212
|
if (unknown.length > 0) {
|
|
@@ -326,6 +329,7 @@ function applyEnvOverrides(base, env) {
|
|
|
326
329
|
void label;
|
|
327
330
|
};
|
|
328
331
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
|
|
332
|
+
readPositiveInt("CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns", "workflowMaxTotalTurns");
|
|
329
333
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
|
|
330
334
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
|
|
331
335
|
const apiKey = read("CCLAW_EVAL_API_KEY");
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import type { EvalReport } from "./types.js";
|
|
2
|
+
export interface EvalDiffInput {
|
|
3
|
+
projectRoot: string;
|
|
4
|
+
/** Version string, filename, or "latest". */
|
|
5
|
+
old: string;
|
|
6
|
+
/** Version string, filename, or "latest". */
|
|
7
|
+
new: string;
|
|
8
|
+
}
|
|
9
|
+
export interface EvalDiffCaseEntry {
|
|
10
|
+
caseId: string;
|
|
11
|
+
stage: string;
|
|
12
|
+
/** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
|
|
13
|
+
transition: "same" | "regressed" | "recovered" | "added" | "removed";
|
|
14
|
+
previousPassed?: boolean;
|
|
15
|
+
currentPassed?: boolean;
|
|
16
|
+
durationDeltaMs?: number;
|
|
17
|
+
costDeltaUsd?: number;
|
|
18
|
+
verifierDeltas: EvalDiffVerifierEntry[];
|
|
19
|
+
stageDeltas?: EvalDiffStageEntry[];
|
|
20
|
+
}
|
|
21
|
+
export interface EvalDiffVerifierEntry {
|
|
22
|
+
verifierId: string;
|
|
23
|
+
kind: string;
|
|
24
|
+
transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
|
|
25
|
+
previousScore?: number;
|
|
26
|
+
currentScore?: number;
|
|
27
|
+
previousOk?: boolean;
|
|
28
|
+
currentOk?: boolean;
|
|
29
|
+
}
|
|
30
|
+
export interface EvalDiffStageEntry {
|
|
31
|
+
stage: string;
|
|
32
|
+
durationDeltaMs: number;
|
|
33
|
+
costDeltaUsd: number;
|
|
34
|
+
turnsDelta: number;
|
|
35
|
+
callsDelta: number;
|
|
36
|
+
}
|
|
37
|
+
export interface EvalDiffReport {
|
|
38
|
+
old: EvalDiffReportMeta;
|
|
39
|
+
new: EvalDiffReportMeta;
|
|
40
|
+
summaryDelta: {
|
|
41
|
+
totalCasesDelta: number;
|
|
42
|
+
passedDelta: number;
|
|
43
|
+
failedDelta: number;
|
|
44
|
+
skippedDelta: number;
|
|
45
|
+
totalCostUsdDelta: number;
|
|
46
|
+
totalDurationMsDelta: number;
|
|
47
|
+
};
|
|
48
|
+
cases: EvalDiffCaseEntry[];
|
|
49
|
+
/** True when any case regressed or any verifier dropped. */
|
|
50
|
+
regressed: boolean;
|
|
51
|
+
}
|
|
52
|
+
export interface EvalDiffReportMeta {
|
|
53
|
+
runId: string;
|
|
54
|
+
cclawVersion: string;
|
|
55
|
+
generatedAt: string;
|
|
56
|
+
tier: string;
|
|
57
|
+
model: string;
|
|
58
|
+
sourcePath: string;
|
|
59
|
+
}
|
|
60
|
+
export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
|
|
61
|
+
export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
|
|
62
|
+
export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
|
|
63
|
+
/** Render the diff as a terse human-readable Markdown block. */
|
|
64
|
+
export declare function formatDiffMarkdown(diff: EvalDiffReport): string;
|