cclaw-cli 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -2
- package/dist/cli.js +388 -18
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +22 -16
- package/dist/eval/agents/workflow.d.ts +31 -0
- package/dist/eval/agents/workflow.js +135 -0
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +52 -19
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +36 -1
- package/dist/eval/runner.d.ts +37 -8
- package/dist/eval/runner.js +351 -42
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +158 -15
- package/dist/eval/types.js +39 -7
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Multi-turn with-tools agent (agent mode, reused by workflow mode).
|
|
3
3
|
*
|
|
4
4
|
* Multi-turn loop with OpenAI-style function-calling over a set of
|
|
5
5
|
* sandbox-confined tools. The AUT is given:
|
|
6
6
|
*
|
|
7
|
-
* - System prompt = stage SKILL.md (same contract as
|
|
8
|
-
*
|
|
7
|
+
* - System prompt = stage SKILL.md (same contract as the single-shot path
|
|
8
|
+
* so the baseline is comparable).
|
|
9
9
|
* - User prompt = task description + a short "tools available" hint
|
|
10
10
|
* that names the sandbox root and the four built-in tools.
|
|
11
11
|
* - Tools = `read_file`, `write_file`, `glob`, `grep` (see
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
* Artifact resolution: the final assistant content is the artifact. If
|
|
30
30
|
* the model used `write_file` to stage the artifact at
|
|
31
31
|
* `artifact.md` (or `artifact/<stage>.md`), we prefer that file — it
|
|
32
|
-
* mirrors
|
|
32
|
+
* mirrors workflow mode where writes are the deliverable. The
|
|
33
33
|
* fallback is the terminal assistant message so prompts that don't
|
|
34
34
|
* call write_file still produce something judgable.
|
|
35
35
|
*/
|
|
@@ -42,7 +42,7 @@ import { loadStageSkill } from "./single-shot.js";
|
|
|
42
42
|
export class MaxTurnsExceededError extends Error {
|
|
43
43
|
turns;
|
|
44
44
|
constructor(turns) {
|
|
45
|
-
super(`
|
|
45
|
+
super(`Agent loop exceeded the ${turns}-turn budget without a terminal stop.`);
|
|
46
46
|
this.name = "MaxTurnsExceededError";
|
|
47
47
|
this.turns = turns;
|
|
48
48
|
}
|
|
@@ -62,10 +62,12 @@ export async function runWithTools(input) {
|
|
|
62
62
|
const toolMap = toolsByName(tools);
|
|
63
63
|
const toolsBody = toolsForRequest(tools);
|
|
64
64
|
const sandboxFactory = input.createSandboxFn ?? createSandbox;
|
|
65
|
-
const
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
65
|
+
const externalSandbox = input.externalSandbox;
|
|
66
|
+
const sandbox = externalSandbox ??
|
|
67
|
+
(await sandboxFactory({
|
|
68
|
+
projectRoot,
|
|
69
|
+
...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
|
|
70
|
+
}));
|
|
69
71
|
const toolUse = {
|
|
70
72
|
turns: 0,
|
|
71
73
|
calls: 0,
|
|
@@ -76,7 +78,7 @@ export async function runWithTools(input) {
|
|
|
76
78
|
const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
|
|
77
79
|
let lastModel = config.model;
|
|
78
80
|
let totalAttempts = 0;
|
|
79
|
-
const userPrompt = buildUserPrompt(caseEntry, sandbox, tools);
|
|
81
|
+
const userPrompt = buildUserPrompt(caseEntry, sandbox, tools, input.promptPreamble);
|
|
80
82
|
const messages = [
|
|
81
83
|
{ role: "system", content: systemPrompt },
|
|
82
84
|
{ role: "user", content: userPrompt }
|
|
@@ -150,7 +152,8 @@ export async function runWithTools(input) {
|
|
|
150
152
|
throw new MaxTurnsExceededError(maxTurns);
|
|
151
153
|
}
|
|
152
154
|
finally {
|
|
153
|
-
|
|
155
|
+
if (!externalSandbox)
|
|
156
|
+
await sandbox.dispose();
|
|
154
157
|
}
|
|
155
158
|
}
|
|
156
159
|
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
|
|
@@ -196,16 +199,18 @@ function clampPositive(value, fallback) {
|
|
|
196
199
|
return fallback;
|
|
197
200
|
return Math.floor(value);
|
|
198
201
|
}
|
|
199
|
-
function buildUserPrompt(caseEntry, sandbox, tools) {
|
|
202
|
+
function buildUserPrompt(caseEntry, sandbox, tools, preamble) {
|
|
200
203
|
const toolList = tools.map((t) => `- ${t.descriptor.name}: ${t.descriptor.description}`);
|
|
201
204
|
const files = caseEntry.contextFiles ?? [];
|
|
202
205
|
const contextLines = files.length > 0
|
|
203
206
|
? files.map((f) => `- ${f}`).join("\n")
|
|
204
207
|
: "(no files seeded)";
|
|
205
|
-
const lines = [
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
208
|
+
const lines = [];
|
|
209
|
+
if (preamble && preamble.trim().length > 0) {
|
|
210
|
+
lines.push(preamble.trim(), ``);
|
|
211
|
+
}
|
|
212
|
+
lines.push(`Stage: ${caseEntry.stage}`, `Case id: ${caseEntry.id}`, ``);
|
|
213
|
+
const rest = [
|
|
209
214
|
`Sandbox root: ${sandbox.root}`,
|
|
210
215
|
`You may call the following tools to read or modify files inside the sandbox.`,
|
|
211
216
|
`All paths are relative to the sandbox root.`,
|
|
@@ -225,6 +230,7 @@ function buildUserPrompt(caseEntry, sandbox, tools) {
|
|
|
225
230
|
`You may optionally write the artifact to \`artifact.md\` in the sandbox; ` +
|
|
226
231
|
`if you do, the last written \`artifact.md\` is preferred over the chat reply.`
|
|
227
232
|
];
|
|
233
|
+
lines.push(...rest);
|
|
228
234
|
return lines.join("\n");
|
|
229
235
|
}
|
|
230
236
|
async function resolveArtifact(sandbox, fallback) {
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { EvalLlmClient } from "../llm-client.js";
|
|
2
|
+
import { createSandbox } from "../sandbox.js";
|
|
3
|
+
import type { SandboxTool } from "../tools/index.js";
|
|
4
|
+
import type { ResolvedEvalConfig, WorkflowCase, WorkflowStageName, WorkflowStageResult } from "../types.js";
|
|
5
|
+
export interface WorkflowInput {
|
|
6
|
+
workflow: WorkflowCase;
|
|
7
|
+
config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing" | "toolMaxTurns" | "toolMaxArgumentsBytes" | "toolMaxResultBytes" | "workflowMaxTotalTurns">;
|
|
8
|
+
projectRoot: string;
|
|
9
|
+
client: EvalLlmClient;
|
|
10
|
+
tools?: SandboxTool[];
|
|
11
|
+
/** Override for the SKILL.md loader (test hook). */
|
|
12
|
+
loadSkill?: (stage: WorkflowStageName) => Promise<string>;
|
|
13
|
+
/** Override for the sandbox factory (test hook). */
|
|
14
|
+
createSandboxFn?: typeof createSandbox;
|
|
15
|
+
/**
|
|
16
|
+
* Optional per-stage lifecycle hooks. The runner uses these to emit
|
|
17
|
+
* progress events to stderr so workflow-mode runs surface real-time
|
|
18
|
+
* status rather than going silent for minutes.
|
|
19
|
+
*/
|
|
20
|
+
onStageStart?: (stage: WorkflowStageName) => void;
|
|
21
|
+
onStageEnd?: (stage: WorkflowStageName, result: WorkflowStageResult) => void;
|
|
22
|
+
}
|
|
23
|
+
export interface WorkflowOutput {
|
|
24
|
+
caseId: string;
|
|
25
|
+
stages: WorkflowStageResult[];
|
|
26
|
+
/** Map from stage name to produced artifact (also persisted in sandbox). */
|
|
27
|
+
artifacts: Map<WorkflowStageName, string>;
|
|
28
|
+
totalUsageUsd: number;
|
|
29
|
+
totalDurationMs: number;
|
|
30
|
+
}
|
|
31
|
+
export declare function runWorkflow(input: WorkflowInput): Promise<WorkflowOutput>;
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Workflow-mode agent.
|
|
3
|
+
*
|
|
4
|
+
* Runs the with-tools loop once per stage in a workflow case,
|
|
5
|
+
* sharing a single sandbox across stages so every new stage can read
|
|
6
|
+
* the earlier artifacts the model produced. The shape of the run is:
|
|
7
|
+
*
|
|
8
|
+
* 1. Create one sandbox seeded with `contextFiles`.
|
|
9
|
+
* 2. For each stage in `workflow.stages`:
|
|
10
|
+
* a. Delete any leftover `artifact.md` so the resolver doesn't
|
|
11
|
+
* accidentally pick the previous stage's output.
|
|
12
|
+
* b. Invoke `runWithTools({ externalSandbox: sandbox, promptPreamble })`.
|
|
13
|
+
* The preamble tells the model which stage it is on and lists the
|
|
14
|
+
* `stages/*.md` files available for reading.
|
|
15
|
+
* c. Persist the returned artifact to `stages/<stage>.md` inside the
|
|
16
|
+
* sandbox (deterministic, regardless of whether the model wrote
|
|
17
|
+
* `artifact.md` itself).
|
|
18
|
+
* d. Record `WorkflowStageResult` with usage, duration, and tool use.
|
|
19
|
+
* 3. Dispose the sandbox in a `finally` so temp directories never leak.
|
|
20
|
+
*
|
|
21
|
+
* Errors bubble up from `runWithTools`:
|
|
22
|
+
* - `MaxTurnsExceededError` stops the workflow at the current stage.
|
|
23
|
+
* - `DailyCostCapExceededError` (surfaced by the cost-guard wrapper in
|
|
24
|
+
* the runner) aborts immediately.
|
|
25
|
+
* - Generic `EvalLlmError` subclasses propagate as-is so the runner can
|
|
26
|
+
* record a workflow-level verifier failure.
|
|
27
|
+
*/
|
|
28
|
+
import fs from "node:fs/promises";
|
|
29
|
+
import path from "node:path";
|
|
30
|
+
import { createSandbox } from "../sandbox.js";
|
|
31
|
+
import { loadStageSkill } from "./single-shot.js";
|
|
32
|
+
import { runWithTools } from "./with-tools.js";
|
|
33
|
+
const STAGES_SUBDIR = "stages";
|
|
34
|
+
const ARTIFACT_CANDIDATES = ["artifact.md", "artifact.txt", "ARTIFACT.md"];
|
|
35
|
+
/**
 * Run every stage of a workflow case through the with-tools agent loop,
 * reusing a single sandbox so later stages can read earlier artifacts.
 *
 * @param input Workflow case, eval config, project root and LLM client, plus
 *   the optional `tools`, `loadSkill`, `createSandboxFn`, `onStageStart` and
 *   `onStageEnd` overrides/hooks.
 * @returns `{ caseId, stages, artifacts, totalUsageUsd, totalDurationMs }`;
 *   `artifacts` maps each stage name to the artifact text of that stage.
 * @throws Whatever the sandbox factory, the file helpers, or `runWithTools`
 *   throw. Once the sandbox exists it is disposed before the error leaves.
 */
export async function runWorkflow(input) {
    const { workflow, config, projectRoot, client } = input;
    const makeSandbox = input.createSandboxFn ?? createSandbox;
    // Only forward `contextFiles` when the workflow actually declares them.
    const contextFilesPatch = () => (workflow.contextFiles ? { contextFiles: workflow.contextFiles } : {});
    const sandbox = await makeSandbox({ projectRoot, ...contextFilesPatch() });
    const completed = [];
    const artifacts = new Map();
    let usdSum = 0;
    let msSum = 0;
    try {
        const stagesDir = await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true });
        await fs.mkdir(stagesDir, { recursive: true });
        for (const step of workflow.stages) {
            const stageName = step.name;
            input.onStageStart?.(stageName);
            // Drop any artifact file left by the previous stage before the model runs.
            await clearArtifactFile(sandbox);
            const preamble = buildStagePreamble(workflow, stageName, completed.map((done) => done.stage));
            const run = await runWithTools({
                caseEntry: {
                    id: `${workflow.id}/${stageName}`,
                    stage: stageName,
                    inputPrompt: step.inputPrompt,
                    ...contextFilesPatch()
                },
                config,
                projectRoot,
                client,
                ...(input.tools ? { tools: input.tools } : {}),
                loadSkill: input.loadSkill || ((stage) => loadStageSkill(projectRoot, stage)),
                // The sandbox is owned here, so the inner loop must not dispose it.
                externalSandbox: sandbox,
                promptPreamble: preamble
            });
            const { artifact, durationMs, usageUsd } = run;
            await persistStageArtifact(sandbox, stageName, artifact);
            artifacts.set(stageName, artifact);
            const stageResult = {
                stage: stageName,
                artifact,
                durationMs,
                usageUsd,
                toolUse: run.toolUse,
                attempts: run.attempts,
                model: run.model,
                promptTokens: run.usage.promptTokens,
                completionTokens: run.usage.completionTokens
            };
            completed.push(stageResult);
            input.onStageEnd?.(stageName, stageResult);
            usdSum += usageUsd;
            msSum += durationMs;
        }
        return {
            caseId: workflow.id,
            stages: completed,
            artifacts,
            // Round to 6 decimals to trim floating-point accumulation noise.
            totalUsageUsd: Number(usdSum.toFixed(6)),
            totalDurationMs: msSum
        };
    }
    finally {
        await sandbox.dispose();
    }
}
|
|
103
|
+
/**
 * Best-effort removal of every file named in `ARTIFACT_CANDIDATES` from the
 * sandbox, one after another.
 *
 * @param sandbox Sandbox whose `resolve(name)` yields the path to delete.
 */
async function clearArtifactFile(sandbox) {
    const removeQuietly = async (name) => {
        try {
            await fs.rm(await sandbox.resolve(name), { force: true });
        }
        catch {
            // Deliberately ignored: the candidate most likely does not exist
            // (per the original note, `resolve` throws for a missing path).
        }
    };
    for (const name of ARTIFACT_CANDIDATES) {
        await removeQuietly(name);
    }
}
|
|
115
|
+
/**
 * Write a stage's artifact to `<STAGES_SUBDIR>/<stage>.md` inside the sandbox.
 *
 * @param sandbox Sandbox used to resolve the relative target path.
 * @param stage Stage name; becomes the file's base name.
 * @param artifact Artifact text; a trailing newline is appended when absent.
 */
async function persistStageArtifact(sandbox, stage, artifact) {
    const target = await sandbox.resolve(`${STAGES_SUBDIR}/${stage}.md`, { allowMissing: true });
    await fs.mkdir(path.dirname(target), { recursive: true });
    const body = artifact.endsWith("\n") ? artifact : `${artifact}\n`;
    await fs.writeFile(target, body, "utf8");
}
|
|
121
|
+
/**
 * Compose the text placed ahead of a stage's user prompt.
 *
 * @param workflow Workflow case; `id` is always used, `description` when truthy.
 * @param current Name of the stage about to run.
 * @param priorStages Names of the stages that already completed, in order.
 * @returns Newline-joined preamble naming the stage and either noting that
 *   this is the first stage or listing the earlier `stages/*.md` artifacts.
 */
function buildStagePreamble(workflow, current, priorStages) {
    const header = `You are running stage "${current}" of the workflow "${workflow.id}".`;
    const description = workflow.description
        ? [`Case description: ${workflow.description}`]
        : [];
    const history = priorStages.length === 0
        ? [`This is the first stage. Any context_files have been seeded into the sandbox root.`]
        : [
            `Earlier stage artifacts are available via read_file:`,
            ...priorStages.map((name) => ` - ${STAGES_SUBDIR}/${name}.md`),
            `Read the prior artifacts before drafting your output so decisions and ids carry through.`
        ];
    return [header, ...description, ...history].join("\n");
}
|
package/dist/eval/baseline.d.ts
CHANGED
|
@@ -1,6 +1,30 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
3
|
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
+
/**
|
|
5
|
+
* Thrown when a signed baseline's on-disk digest does not match the
|
|
6
|
+
* canonical encoding of its `{ schemaVersion, stage, cases }` block.
|
|
7
|
+
* Callers should treat this as a hard failure: the baseline was either
|
|
8
|
+
* hand-edited or corrupted and cannot be trusted for regression gating.
|
|
9
|
+
*/
|
|
10
|
+
export declare class BaselineSignatureError extends Error {
|
|
11
|
+
readonly file: string;
|
|
12
|
+
readonly expected: string;
|
|
13
|
+
readonly actual: string;
|
|
14
|
+
constructor(opts: {
|
|
15
|
+
file: string;
|
|
16
|
+
expected: string;
|
|
17
|
+
actual: string;
|
|
18
|
+
});
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Produce a deterministic sha256 digest over the signable portion of a
|
|
22
|
+
* baseline. We intentionally exclude `generatedAt` and `cclawVersion`
|
|
23
|
+
* from the digest so that rebuilding the same baseline from identical
|
|
24
|
+
* case results on a new CLI version doesn't invalidate the signature —
|
|
25
|
+
* only changes to the observed pass/ok/score payloads do.
|
|
26
|
+
*/
|
|
27
|
+
export declare function computeBaselineDigest(snapshot: Pick<BaselineSnapshot, "schemaVersion" | "stage" | "cases">): string;
|
|
4
28
|
export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
|
|
5
29
|
export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
|
|
6
30
|
export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
|
package/dist/eval/baseline.js
CHANGED
|
@@ -14,15 +14,67 @@
|
|
|
14
14
|
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
15
|
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
16
|
*/
|
|
17
|
+
import { createHash } from "node:crypto";
|
|
17
18
|
import fs from "node:fs/promises";
|
|
18
19
|
import path from "node:path";
|
|
19
20
|
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
20
21
|
import { exists } from "../fs-utils.js";
|
|
21
22
|
import { FLOW_STAGES } from "../types.js";
|
|
22
23
|
export const BASELINE_SCHEMA_VERSION = 1;
|
|
24
|
+
/**
 * Raised when the digest stored in a signed baseline differs from the digest
 * recomputed over its `{ schemaVersion, stage, cases }` block. Treat it as a
 * hard failure: the file was hand-edited or corrupted and must not be used
 * for regression gating.
 */
export class BaselineSignatureError extends Error {
    file;
    expected;
    actual;
    /**
     * @param opts `file` is the baseline path, `expected` the digest stored in
     *   the file, `actual` the freshly computed digest.
     */
    constructor(opts) {
        const { file, expected, actual } = opts;
        const message = [
            `Baseline signature mismatch at ${file}: expected ${expected}, got ${actual}.`,
            "The file was modified outside of `cclaw eval --update-baseline`.",
            "Re-run with --update-baseline --confirm to re-sign a known-good snapshot."
        ].join(" ");
        super(message);
        this.name = "BaselineSignatureError";
        this.file = file;
        this.expected = expected;
        this.actual = actual;
    }
}
|
|
23
44
|
/**
 * Build the path of a stage's baseline file:
 * `<projectRoot>/<EVALS_ROOT>/baselines/<stage>.json`.
 */
function baselinePath(projectRoot, stage) {
    const fileName = `${stage}.json`;
    return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
}
|
|
47
|
+
/**
 * Produce a deterministic sha256 digest over the signable portion of a
 * baseline. `generatedAt` and `cclawVersion` are intentionally excluded so
 * that rebuilding the same baseline from identical case results on a new CLI
 * version doesn't invalidate the signature — only changes to the observed
 * case payloads do.
 *
 * @param snapshot Object carrying `schemaVersion`, `stage` and `cases`.
 * @returns Lower-case hex sha256 of the canonical JSON encoding.
 */
export function computeBaselineDigest(snapshot) {
    const canonical = canonicalJson({
        schemaVersion: snapshot.schemaVersion,
        stage: snapshot.stage,
        cases: snapshot.cases
    });
    return createHash("sha256").update(canonical).digest("hex");
}
/**
 * JSON.stringify with object keys sorted recursively so the digest is
 * stable across filesystem / serializer variations.
 *
 * The encoding follows JSON.stringify for values JSON cannot represent:
 * object keys whose value is `undefined`, a function or a symbol are
 * omitted, the same values inside arrays (and array holes) become `null`,
 * and objects exposing `toJSON()` are encoded through it. Without this an
 * in-memory snapshot holding an optional field set to `undefined` would hash
 * differently from the very same snapshot after being written to disk and
 * parsed back, making a freshly signed baseline fail verification.
 * The output for plain JSON data is unchanged.
 *
 * @param value Any JSON-compatible value.
 * @returns Canonical JSON text, or `undefined` when `value` itself has no
 *   JSON representation (same as JSON.stringify).
 */
function canonicalJson(value) {
    const normalized = value !== null && typeof value === "object" && typeof value.toJSON === "function"
        ? value.toJSON()
        : value;
    if (normalized === null || typeof normalized !== "object") {
        return JSON.stringify(normalized);
    }
    if (Array.isArray(normalized)) {
        // Array.from visits holes too, so sparse slots are encoded as null.
        const items = Array.from(normalized, (item) => canonicalJson(item) ?? "null");
        return `[${items.join(",")}]`;
    }
    const parts = [];
    for (const key of Object.keys(normalized).sort()) {
        const encoded = canonicalJson(normalized[key]);
        // Skip keys JSON.stringify would drop, so a disk round trip is a no-op.
        if (encoded !== undefined) {
            parts.push(`${JSON.stringify(key)}:${encoded}`);
        }
    }
    return `{${parts.join(",")}}`;
}
|
|
26
78
|
export async function loadBaseline(projectRoot, stage) {
|
|
27
79
|
const filePath = baselinePath(projectRoot, stage);
|
|
28
80
|
if (!(await exists(filePath)))
|
|
@@ -38,6 +90,20 @@ export async function loadBaseline(projectRoot, stage) {
|
|
|
38
90
|
if (!isBaseline(parsed, stage)) {
|
|
39
91
|
throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
|
|
40
92
|
}
|
|
93
|
+
const signature = parsed.signature;
|
|
94
|
+
if (signature) {
|
|
95
|
+
if (signature.algorithm !== "sha256") {
|
|
96
|
+
throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
|
|
97
|
+
}
|
|
98
|
+
const actual = computeBaselineDigest(parsed);
|
|
99
|
+
if (actual !== signature.digest) {
|
|
100
|
+
throw new BaselineSignatureError({
|
|
101
|
+
file: filePath,
|
|
102
|
+
expected: signature.digest,
|
|
103
|
+
actual
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
}
|
|
41
107
|
return parsed;
|
|
42
108
|
}
|
|
43
109
|
function isBaseline(value, stage) {
|
|
@@ -80,13 +146,20 @@ export function buildBaselineForStage(stage, report) {
|
|
|
80
146
|
for (const c of stageCases) {
|
|
81
147
|
cases[c.caseId] = entryFromResult(c);
|
|
82
148
|
}
|
|
83
|
-
|
|
149
|
+
const now = new Date().toISOString();
|
|
150
|
+
const unsigned = {
|
|
84
151
|
schemaVersion: BASELINE_SCHEMA_VERSION,
|
|
85
152
|
stage,
|
|
86
|
-
generatedAt:
|
|
153
|
+
generatedAt: now,
|
|
87
154
|
cclawVersion: CCLAW_VERSION,
|
|
88
155
|
cases
|
|
89
156
|
};
|
|
157
|
+
unsigned.signature = {
|
|
158
|
+
algorithm: "sha256",
|
|
159
|
+
digest: computeBaselineDigest(unsigned),
|
|
160
|
+
signedAt: now
|
|
161
|
+
};
|
|
162
|
+
return unsigned;
|
|
90
163
|
}
|
|
91
164
|
export async function writeBaselinesFromReport(projectRoot, report) {
|
|
92
165
|
const written = [];
|
|
@@ -3,7 +3,8 @@ import path from "node:path";
|
|
|
3
3
|
import { parse } from "yaml";
|
|
4
4
|
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
5
|
import { exists } from "../fs-utils.js";
|
|
6
|
-
import {
|
|
6
|
+
import { EVAL_MODES } from "./types.js";
|
|
7
|
+
import { parseModeInput } from "./mode.js";
|
|
7
8
|
/**
|
|
8
9
|
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
9
10
|
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
@@ -14,7 +15,7 @@ export const DEFAULT_EVAL_CONFIG = {
|
|
|
14
15
|
provider: "zai",
|
|
15
16
|
baseUrl: "https://api.z.ai/api/coding/paas/v4",
|
|
16
17
|
model: "glm-5.1",
|
|
17
|
-
|
|
18
|
+
defaultMode: "fixture",
|
|
18
19
|
regression: {
|
|
19
20
|
failIfDeltaBelow: -0.15,
|
|
20
21
|
failIfCriticalBelow: 3.0
|
|
@@ -25,7 +26,6 @@ export const DEFAULT_EVAL_CONFIG = {
|
|
|
25
26
|
judgeTemperature: 0,
|
|
26
27
|
agentTemperature: 0.2
|
|
27
28
|
};
|
|
28
|
-
const EVAL_TIER_SET = new Set(EVAL_TIERS);
|
|
29
29
|
const NUMERIC_ENVS = new Set([
|
|
30
30
|
"CCLAW_EVAL_DAILY_USD_CAP",
|
|
31
31
|
"CCLAW_EVAL_TIMEOUT_MS",
|
|
@@ -35,11 +35,12 @@ const NUMERIC_ENVS = new Set([
|
|
|
35
35
|
"CCLAW_EVAL_AGENT_TEMPERATURE",
|
|
36
36
|
"CCLAW_EVAL_TOOL_MAX_TURNS",
|
|
37
37
|
"CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
|
|
38
|
-
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES"
|
|
38
|
+
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
|
|
39
|
+
"CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
|
|
39
40
|
]);
|
|
40
41
|
/**
 * Build the Error reported for an invalid eval config file.
 *
 * @param configFilePath Path of the offending config file.
 * @param reason Description of what is wrong.
 * @returns An Error whose message also lists the supported modes and points
 *   at docs/evals.md.
 */
function evalConfigError(configFilePath, reason) {
    const supported = EVAL_MODES.join(", ");
    const message = [
        `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
        `Supported modes: ${supported} (legacy tier values A|B|C also accepted).`,
        `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`
    ].join("\n");
    return new Error(message);
}
|
|
45
46
|
function isRecord(value) {
|
|
@@ -52,12 +53,11 @@ function parseNumericEnv(name, raw) {
|
|
|
52
53
|
}
|
|
53
54
|
return value;
|
|
54
55
|
}
|
|
55
|
-
function
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
}
|
|
60
|
-
return trimmed;
|
|
56
|
+
/**
 * Parse an eval mode supplied through an environment variable.
 *
 * @param raw Value read from the environment.
 * @param envName Variable name; values from the legacy `CCLAW_EVAL_TIER`
 *   variable are upper-cased before parsing.
 * @returns Whatever `parseModeInput` returns for the value.
 */
function parseModeEnv(raw, envName) {
    const isLegacyTier = envName === "CCLAW_EVAL_TIER";
    const candidate = isLegacyTier ? raw.toUpperCase() : raw;
    // `raw` in the context keeps the value exactly as the user typed it.
    return parseModeInput(candidate, { source: "env", raw: `${envName}=${raw}` });
}
|
|
62
62
|
function validateFileConfig(raw, configFilePath) {
|
|
63
63
|
if (raw === undefined || raw === null)
|
|
@@ -78,11 +78,33 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
78
78
|
assignString("baseUrl", raw.baseUrl);
|
|
79
79
|
assignString("model", raw.model);
|
|
80
80
|
assignString("judgeModel", raw.judgeModel);
|
|
81
|
-
if (raw.
|
|
82
|
-
if (typeof raw.
|
|
83
|
-
throw evalConfigError(configFilePath, `"
|
|
81
|
+
if (raw.defaultMode !== undefined) {
|
|
82
|
+
if (typeof raw.defaultMode !== "string") {
|
|
83
|
+
throw evalConfigError(configFilePath, `"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
|
|
84
|
+
}
|
|
85
|
+
try {
|
|
86
|
+
out.defaultMode = parseModeInput(raw.defaultMode, {
|
|
87
|
+
source: "config",
|
|
88
|
+
raw: `defaultMode: ${raw.defaultMode}`
|
|
89
|
+
});
|
|
90
|
+
}
|
|
91
|
+
catch (err) {
|
|
92
|
+
throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
else if (raw.defaultTier !== undefined) {
|
|
96
|
+
if (typeof raw.defaultTier !== "string") {
|
|
97
|
+
throw evalConfigError(configFilePath, `"defaultTier" must be a string (legacy; prefer "defaultMode")`);
|
|
98
|
+
}
|
|
99
|
+
try {
|
|
100
|
+
out.defaultMode = parseModeInput(raw.defaultTier, {
|
|
101
|
+
source: "config",
|
|
102
|
+
raw: `defaultTier: ${raw.defaultTier}`
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
catch (err) {
|
|
106
|
+
throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
|
|
84
107
|
}
|
|
85
|
-
out.defaultTier = raw.defaultTier;
|
|
86
108
|
}
|
|
87
109
|
if (raw.dailyUsdCap !== undefined) {
|
|
88
110
|
if (typeof raw.dailyUsdCap !== "number" || raw.dailyUsdCap < 0) {
|
|
@@ -166,6 +188,7 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
166
188
|
assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
|
|
167
189
|
assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
|
|
168
190
|
assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
|
|
191
|
+
assignPositiveInt("workflowMaxTotalTurns", raw.workflowMaxTotalTurns, "workflowMaxTotalTurns");
|
|
169
192
|
if (raw.regression !== undefined) {
|
|
170
193
|
if (!isRecord(raw.regression)) {
|
|
171
194
|
throw evalConfigError(configFilePath, `"regression" must be a mapping`);
|
|
@@ -192,6 +215,7 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
192
215
|
"baseUrl",
|
|
193
216
|
"model",
|
|
194
217
|
"judgeModel",
|
|
218
|
+
"defaultMode",
|
|
195
219
|
"defaultTier",
|
|
196
220
|
"dailyUsdCap",
|
|
197
221
|
"timeoutMs",
|
|
@@ -203,7 +227,8 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
203
227
|
"tokenPricing",
|
|
204
228
|
"toolMaxTurns",
|
|
205
229
|
"toolMaxArgumentsBytes",
|
|
206
|
-
"toolMaxResultBytes"
|
|
230
|
+
"toolMaxResultBytes",
|
|
231
|
+
"workflowMaxTotalTurns"
|
|
207
232
|
]);
|
|
208
233
|
const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
|
|
209
234
|
if (unknown.length > 0) {
|
|
@@ -263,11 +288,18 @@ function applyEnvOverrides(base, env) {
|
|
|
263
288
|
patched.provider = provider;
|
|
264
289
|
overridden = true;
|
|
265
290
|
}
|
|
266
|
-
const
|
|
267
|
-
if (
|
|
268
|
-
patched.
|
|
291
|
+
const modeEnv = read("CCLAW_EVAL_MODE");
|
|
292
|
+
if (modeEnv) {
|
|
293
|
+
patched.defaultMode = parseModeEnv(modeEnv, "CCLAW_EVAL_MODE");
|
|
269
294
|
overridden = true;
|
|
270
295
|
}
|
|
296
|
+
else {
|
|
297
|
+
const legacyTier = read("CCLAW_EVAL_TIER");
|
|
298
|
+
if (legacyTier) {
|
|
299
|
+
patched.defaultMode = parseModeEnv(legacyTier, "CCLAW_EVAL_TIER");
|
|
300
|
+
overridden = true;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
271
303
|
const cap = read("CCLAW_EVAL_DAILY_USD_CAP");
|
|
272
304
|
if (cap) {
|
|
273
305
|
patched.dailyUsdCap = parseNumericEnv("CCLAW_EVAL_DAILY_USD_CAP", cap);
|
|
@@ -326,6 +358,7 @@ function applyEnvOverrides(base, env) {
|
|
|
326
358
|
void label;
|
|
327
359
|
};
|
|
328
360
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
|
|
361
|
+
readPositiveInt("CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns", "workflowMaxTotalTurns");
|
|
329
362
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
|
|
330
363
|
readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
|
|
331
364
|
const apiKey = read("CCLAW_EVAL_API_KEY");
|
|
@@ -35,6 +35,22 @@ export declare class DailyCostCapExceededError extends Error {
|
|
|
35
35
|
currentUsd: number;
|
|
36
36
|
});
|
|
37
37
|
}
|
|
38
|
+
/**
|
|
39
|
+
* Per-run cost cap — enforced in-memory, no ledger file. Complements the
|
|
40
|
+
* daily cap so a single long workflow run can't blow the whole day's
|
|
41
|
+
* budget even if the daily cap is generous. Opt-in via
|
|
42
|
+
* `--max-cost-usd=<n>` on the CLI or `CCLAW_EVAL_MAX_COST_USD`.
|
|
43
|
+
*/
|
|
44
|
+
export declare class RunCostCapExceededError extends Error {
|
|
45
|
+
readonly capUsd: number;
|
|
46
|
+
readonly projectedUsd: number;
|
|
47
|
+
readonly currentUsd: number;
|
|
48
|
+
constructor(opts: {
|
|
49
|
+
capUsd: number;
|
|
50
|
+
projectedUsd: number;
|
|
51
|
+
currentUsd: number;
|
|
52
|
+
});
|
|
53
|
+
}
|
|
38
54
|
declare function utcDate(now?: Date): string;
|
|
39
55
|
declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
|
|
40
56
|
/**
|
|
@@ -67,6 +83,12 @@ export interface CreateCostGuardOptions {
|
|
|
67
83
|
now?: () => Date;
|
|
68
84
|
/** Override the default filesystem root for the ledger. */
|
|
69
85
|
ledgerPath?: string;
|
|
86
|
+
/**
|
|
87
|
+
* Per-run (in-memory) USD cap. Independent from the persisted daily
|
|
88
|
+
* cap so a single `cclaw eval` invocation can be budgeted without
|
|
89
|
+
* touching the shared nightly ledger. Undefined = unlimited.
|
|
90
|
+
*/
|
|
91
|
+
runCapUsd?: number;
|
|
70
92
|
}
|
|
71
93
|
export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
|
|
72
94
|
/** Exposed for tests. */
|
package/dist/eval/cost-guard.js
CHANGED
|
@@ -52,6 +52,28 @@ export class DailyCostCapExceededError extends Error {
|
|
|
52
52
|
this.currentUsd = opts.currentUsd;
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
|
+
/**
 * Per-run cost cap error. The cap is tracked in memory only (no ledger file)
 * and complements the daily cap, so one long workflow run can't consume the
 * whole day's budget. Opt-in via `--max-cost-usd=<n>` on the CLI or
 * `CCLAW_EVAL_MAX_COST_USD`.
 */
export class RunCostCapExceededError extends Error {
    capUsd;
    projectedUsd;
    currentUsd;
    /**
     * @param opts `capUsd` is the configured cap, `currentUsd` the spend so
     *   far in this run, `projectedUsd` the spend if the call were committed.
     */
    constructor(opts) {
        const dollars = (amount) => `$${amount.toFixed(4)}`;
        const summary = `current=${dollars(opts.currentUsd)}, ` +
            `projected=${dollars(opts.projectedUsd)}, ` +
            `cap=${dollars(opts.capUsd)}.`;
        super(`Run cost cap would be exceeded: ${summary} Raise --max-cost-usd or drop it to run uncapped.`);
        this.name = "RunCostCapExceededError";
        this.capUsd = opts.capUsd;
        this.projectedUsd = opts.projectedUsd;
        this.currentUsd = opts.currentUsd;
    }
}
|
|
55
77
|
/**
 * Return the first ten characters of `now`'s ISO-8601 UTC timestamp, i.e.
 * `YYYY-MM-DD` for four-digit years.
 *
 * @param now Date to format; defaults to the current time.
 */
function utcDate(now = new Date()) {
    const iso = now.toISOString();
    return iso.substring(0, 10);
}
|
|
@@ -109,11 +131,25 @@ export function createCostGuard(projectRoot, config, options = {}) {
|
|
|
109
131
|
const now = options.now ?? (() => new Date());
|
|
110
132
|
const currentDate = () => utcDate(now());
|
|
111
133
|
const file = () => options.ledgerPath ?? ledgerPath(projectRoot, currentDate());
|
|
134
|
+
const runCap = options.runCapUsd;
|
|
135
|
+
let runTotalUsd = 0;
|
|
112
136
|
return {
|
|
113
137
|
async commit(model, usage) {
|
|
114
138
|
const usd = computeUsageUsd(model, usage, config);
|
|
115
|
-
if (
|
|
139
|
+
if (runCap !== undefined) {
|
|
140
|
+
const projected = Number((runTotalUsd + usd).toFixed(6));
|
|
141
|
+
if (projected > runCap) {
|
|
142
|
+
throw new RunCostCapExceededError({
|
|
143
|
+
capUsd: runCap,
|
|
144
|
+
projectedUsd: projected,
|
|
145
|
+
currentUsd: runTotalUsd
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
if (config.dailyUsdCap === undefined) {
|
|
150
|
+
runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
|
|
116
151
|
return usd;
|
|
152
|
+
}
|
|
117
153
|
const date = currentDate();
|
|
118
154
|
const target = file();
|
|
119
155
|
const ledger = await readLedger(target, date);
|
|
@@ -133,6 +169,7 @@ export function createCostGuard(projectRoot, config, options = {}) {
|
|
|
133
169
|
byModel.usd = Number((byModel.usd + usd).toFixed(6));
|
|
134
170
|
ledger.byModel[model] = byModel;
|
|
135
171
|
await writeLedger(target, ledger);
|
|
172
|
+
runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
|
|
136
173
|
return usd;
|
|
137
174
|
},
|
|
138
175
|
async snapshot() {
|