cclaw-cli 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +8 -4
- package/dist/cli.js +316 -20
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +6 -6
- package/dist/eval/agents/with-tools.js +5 -5
- package/dist/eval/agents/workflow.d.ts +7 -0
- package/dist/eval/agents/workflow.js +5 -3
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +46 -17
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +1 -1
- package/dist/eval/diff.js +3 -3
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +29 -9
- package/dist/eval/runner.js +148 -56
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +54 -27
- package/dist/eval/types.js +21 -9
- package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
- package/dist/eval/workflow-corpus.d.ts +2 -2
- package/dist/eval/workflow-corpus.js +4 -4
- package/package.json +1 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Workflow-mode agent.
|
|
3
3
|
*
|
|
4
|
-
* Runs the
|
|
4
|
+
* Runs the with-tools loop once per stage in a workflow case,
|
|
5
5
|
* sharing a single sandbox across stages so every new stage can read
|
|
6
6
|
* the earlier artifacts the model produced. The shape of the run is:
|
|
7
7
|
*
|
|
@@ -46,6 +46,7 @@ export async function runWorkflow(input) {
|
|
|
46
46
|
try {
|
|
47
47
|
await fs.mkdir(await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true }), { recursive: true });
|
|
48
48
|
for (const step of workflow.stages) {
|
|
49
|
+
input.onStageStart?.(step.name);
|
|
49
50
|
await clearArtifactFile(sandbox);
|
|
50
51
|
const priorStages = stageResults.map((r) => r.stage);
|
|
51
52
|
const preamble = buildStagePreamble(workflow, step.name, priorStages);
|
|
@@ -83,6 +84,7 @@ export async function runWorkflow(input) {
|
|
|
83
84
|
completionTokens: result.usage.completionTokens
|
|
84
85
|
};
|
|
85
86
|
stageResults.push(stageResult);
|
|
87
|
+
input.onStageEnd?.(step.name, stageResult);
|
|
86
88
|
totalUsageUsd += result.usageUsd;
|
|
87
89
|
totalDurationMs += result.durationMs;
|
|
88
90
|
}
|
|
@@ -118,7 +120,7 @@ async function persistStageArtifact(sandbox, stage, artifact) {
|
|
|
118
120
|
}
|
|
119
121
|
function buildStagePreamble(workflow, current, priorStages) {
|
|
120
122
|
const lines = [];
|
|
121
|
-
lines.push(`You are running stage "${current}" of the
|
|
123
|
+
lines.push(`You are running stage "${current}" of the workflow "${workflow.id}".`);
|
|
122
124
|
if (workflow.description) {
|
|
123
125
|
lines.push(`Case description: ${workflow.description}`);
|
|
124
126
|
}
|
package/dist/eval/baseline.d.ts
CHANGED
|
@@ -1,6 +1,30 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
3
|
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
+
/**
|
|
5
|
+
* Thrown when a signed baseline's on-disk digest does not match the
|
|
6
|
+
* canonical encoding of its `{ schemaVersion, stage, cases }` block.
|
|
7
|
+
* Callers should treat this as a hard failure: the baseline was either
|
|
8
|
+
* hand-edited or corrupted and cannot be trusted for regression gating.
|
|
9
|
+
*/
|
|
10
|
+
export declare class BaselineSignatureError extends Error {
|
|
11
|
+
readonly file: string;
|
|
12
|
+
readonly expected: string;
|
|
13
|
+
readonly actual: string;
|
|
14
|
+
constructor(opts: {
|
|
15
|
+
file: string;
|
|
16
|
+
expected: string;
|
|
17
|
+
actual: string;
|
|
18
|
+
});
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Produce a deterministic sha256 digest over the signable portion of a
|
|
22
|
+
* baseline. We intentionally exclude `generatedAt` and `cclawVersion`
|
|
23
|
+
* from the digest so that rebuilding the same baseline from identical
|
|
24
|
+
* case results on a new CLI version doesn't invalidate the signature —
|
|
25
|
+
* only changes to the observed pass/ok/score payloads do.
|
|
26
|
+
*/
|
|
27
|
+
export declare function computeBaselineDigest(snapshot: Pick<BaselineSnapshot, "schemaVersion" | "stage" | "cases">): string;
|
|
4
28
|
export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
|
|
5
29
|
export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
|
|
6
30
|
export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
|
package/dist/eval/baseline.js
CHANGED
|
@@ -14,15 +14,67 @@
|
|
|
14
14
|
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
15
|
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
16
|
*/
|
|
17
|
+
import { createHash } from "node:crypto";
|
|
17
18
|
import fs from "node:fs/promises";
|
|
18
19
|
import path from "node:path";
|
|
19
20
|
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
20
21
|
import { exists } from "../fs-utils.js";
|
|
21
22
|
import { FLOW_STAGES } from "../types.js";
|
|
22
23
|
export const BASELINE_SCHEMA_VERSION = 1;
|
|
24
|
+
/**
|
|
25
|
+
* Thrown when a signed baseline's on-disk digest does not match the
|
|
26
|
+
* canonical encoding of its `{ schemaVersion, stage, cases }` block.
|
|
27
|
+
* Callers should treat this as a hard failure: the baseline was either
|
|
28
|
+
* hand-edited or corrupted and cannot be trusted for regression gating.
|
|
29
|
+
*/
|
|
30
|
+
export class BaselineSignatureError extends Error {
|
|
31
|
+
file;
|
|
32
|
+
expected;
|
|
33
|
+
actual;
|
|
34
|
+
constructor(opts) {
|
|
35
|
+
super(`Baseline signature mismatch at ${opts.file}: expected ${opts.expected}, got ${opts.actual}. ` +
|
|
36
|
+
`The file was modified outside of \`cclaw eval --update-baseline\`. ` +
|
|
37
|
+
`Re-run with --update-baseline --confirm to re-sign a known-good snapshot.`);
|
|
38
|
+
this.name = "BaselineSignatureError";
|
|
39
|
+
this.file = opts.file;
|
|
40
|
+
this.expected = opts.expected;
|
|
41
|
+
this.actual = opts.actual;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
23
44
|
function baselinePath(projectRoot, stage) {
|
|
24
45
|
return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
|
|
25
46
|
}
|
|
47
|
+
/**
|
|
48
|
+
* Produce a deterministic sha256 digest over the signable portion of a
|
|
49
|
+
* baseline. We intentionally exclude `generatedAt` and `cclawVersion`
|
|
50
|
+
* from the digest so that rebuilding the same baseline from identical
|
|
51
|
+
* case results on a new CLI version doesn't invalidate the signature —
|
|
52
|
+
* only changes to the observed pass/ok/score payloads do.
|
|
53
|
+
*/
|
|
54
|
+
export function computeBaselineDigest(snapshot) {
|
|
55
|
+
const canonical = canonicalJson({
|
|
56
|
+
schemaVersion: snapshot.schemaVersion,
|
|
57
|
+
stage: snapshot.stage,
|
|
58
|
+
cases: snapshot.cases
|
|
59
|
+
});
|
|
60
|
+
return createHash("sha256").update(canonical).digest("hex");
|
|
61
|
+
}
|
|
62
|
+
/**
|
|
63
|
+
* JSON.stringify with object keys sorted recursively so the digest is
|
|
64
|
+
* stable across filesystem / serializer variations.
|
|
65
|
+
*/
|
|
66
|
+
function canonicalJson(value) {
|
|
67
|
+
if (value === null || typeof value !== "object") {
|
|
68
|
+
return JSON.stringify(value);
|
|
69
|
+
}
|
|
70
|
+
if (Array.isArray(value)) {
|
|
71
|
+
return `[${value.map((v) => canonicalJson(v)).join(",")}]`;
|
|
72
|
+
}
|
|
73
|
+
const record = value;
|
|
74
|
+
const keys = Object.keys(record).sort();
|
|
75
|
+
const parts = keys.map((k) => `${JSON.stringify(k)}:${canonicalJson(record[k])}`);
|
|
76
|
+
return `{${parts.join(",")}}`;
|
|
77
|
+
}
|
|
26
78
|
export async function loadBaseline(projectRoot, stage) {
|
|
27
79
|
const filePath = baselinePath(projectRoot, stage);
|
|
28
80
|
if (!(await exists(filePath)))
|
|
@@ -38,6 +90,20 @@ export async function loadBaseline(projectRoot, stage) {
|
|
|
38
90
|
if (!isBaseline(parsed, stage)) {
|
|
39
91
|
throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
|
|
40
92
|
}
|
|
93
|
+
const signature = parsed.signature;
|
|
94
|
+
if (signature) {
|
|
95
|
+
if (signature.algorithm !== "sha256") {
|
|
96
|
+
throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
|
|
97
|
+
}
|
|
98
|
+
const actual = computeBaselineDigest(parsed);
|
|
99
|
+
if (actual !== signature.digest) {
|
|
100
|
+
throw new BaselineSignatureError({
|
|
101
|
+
file: filePath,
|
|
102
|
+
expected: signature.digest,
|
|
103
|
+
actual
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
}
|
|
41
107
|
return parsed;
|
|
42
108
|
}
|
|
43
109
|
function isBaseline(value, stage) {
|
|
@@ -80,13 +146,20 @@ export function buildBaselineForStage(stage, report) {
|
|
|
80
146
|
for (const c of stageCases) {
|
|
81
147
|
cases[c.caseId] = entryFromResult(c);
|
|
82
148
|
}
|
|
83
|
-
|
|
149
|
+
const now = new Date().toISOString();
|
|
150
|
+
const unsigned = {
|
|
84
151
|
schemaVersion: BASELINE_SCHEMA_VERSION,
|
|
85
152
|
stage,
|
|
86
|
-
generatedAt:
|
|
153
|
+
generatedAt: now,
|
|
87
154
|
cclawVersion: CCLAW_VERSION,
|
|
88
155
|
cases
|
|
89
156
|
};
|
|
157
|
+
unsigned.signature = {
|
|
158
|
+
algorithm: "sha256",
|
|
159
|
+
digest: computeBaselineDigest(unsigned),
|
|
160
|
+
signedAt: now
|
|
161
|
+
};
|
|
162
|
+
return unsigned;
|
|
90
163
|
}
|
|
91
164
|
export async function writeBaselinesFromReport(projectRoot, report) {
|
|
92
165
|
const written = [];
|
|
@@ -3,7 +3,8 @@ import path from "node:path";
|
|
|
3
3
|
import { parse } from "yaml";
|
|
4
4
|
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
5
|
import { exists } from "../fs-utils.js";
|
|
6
|
-
import {
|
|
6
|
+
import { EVAL_MODES } from "./types.js";
|
|
7
|
+
import { parseModeInput } from "./mode.js";
|
|
7
8
|
/**
|
|
8
9
|
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
9
10
|
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
@@ -14,7 +15,7 @@ export const DEFAULT_EVAL_CONFIG = {
|
|
|
14
15
|
provider: "zai",
|
|
15
16
|
baseUrl: "https://api.z.ai/api/coding/paas/v4",
|
|
16
17
|
model: "glm-5.1",
|
|
17
|
-
|
|
18
|
+
defaultMode: "fixture",
|
|
18
19
|
regression: {
|
|
19
20
|
failIfDeltaBelow: -0.15,
|
|
20
21
|
failIfCriticalBelow: 3.0
|
|
@@ -25,7 +26,6 @@ export const DEFAULT_EVAL_CONFIG = {
|
|
|
25
26
|
judgeTemperature: 0,
|
|
26
27
|
agentTemperature: 0.2
|
|
27
28
|
};
|
|
28
|
-
const EVAL_TIER_SET = new Set(EVAL_TIERS);
|
|
29
29
|
const NUMERIC_ENVS = new Set([
|
|
30
30
|
"CCLAW_EVAL_DAILY_USD_CAP",
|
|
31
31
|
"CCLAW_EVAL_TIMEOUT_MS",
|
|
@@ -40,7 +40,7 @@ const NUMERIC_ENVS = new Set([
|
|
|
40
40
|
]);
|
|
41
41
|
function evalConfigError(configFilePath, reason) {
|
|
42
42
|
return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
|
|
43
|
-
`Supported
|
|
43
|
+
`Supported modes: ${EVAL_MODES.join(", ")} (legacy tier values A|B|C also accepted).\n` +
|
|
44
44
|
`See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`);
|
|
45
45
|
}
|
|
46
46
|
function isRecord(value) {
|
|
@@ -53,12 +53,11 @@ function parseNumericEnv(name, raw) {
|
|
|
53
53
|
}
|
|
54
54
|
return value;
|
|
55
55
|
}
|
|
56
|
-
function
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
}
|
|
61
|
-
return trimmed;
|
|
56
|
+
function parseModeEnv(raw, envName) {
|
|
57
|
+
return parseModeInput(envName === "CCLAW_EVAL_TIER" ? raw.toUpperCase() : raw, {
|
|
58
|
+
source: "env",
|
|
59
|
+
raw: `${envName}=${raw}`
|
|
60
|
+
});
|
|
62
61
|
}
|
|
63
62
|
function validateFileConfig(raw, configFilePath) {
|
|
64
63
|
if (raw === undefined || raw === null)
|
|
@@ -79,11 +78,33 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
79
78
|
assignString("baseUrl", raw.baseUrl);
|
|
80
79
|
assignString("model", raw.model);
|
|
81
80
|
assignString("judgeModel", raw.judgeModel);
|
|
82
|
-
if (raw.
|
|
83
|
-
if (typeof raw.
|
|
84
|
-
throw evalConfigError(configFilePath, `"
|
|
81
|
+
if (raw.defaultMode !== undefined) {
|
|
82
|
+
if (typeof raw.defaultMode !== "string") {
|
|
83
|
+
throw evalConfigError(configFilePath, `"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
|
|
84
|
+
}
|
|
85
|
+
try {
|
|
86
|
+
out.defaultMode = parseModeInput(raw.defaultMode, {
|
|
87
|
+
source: "config",
|
|
88
|
+
raw: `defaultMode: ${raw.defaultMode}`
|
|
89
|
+
});
|
|
90
|
+
}
|
|
91
|
+
catch (err) {
|
|
92
|
+
throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
else if (raw.defaultTier !== undefined) {
|
|
96
|
+
if (typeof raw.defaultTier !== "string") {
|
|
97
|
+
throw evalConfigError(configFilePath, `"defaultTier" must be a string (legacy; prefer "defaultMode")`);
|
|
98
|
+
}
|
|
99
|
+
try {
|
|
100
|
+
out.defaultMode = parseModeInput(raw.defaultTier, {
|
|
101
|
+
source: "config",
|
|
102
|
+
raw: `defaultTier: ${raw.defaultTier}`
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
catch (err) {
|
|
106
|
+
throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
|
|
85
107
|
}
|
|
86
|
-
out.defaultTier = raw.defaultTier;
|
|
87
108
|
}
|
|
88
109
|
if (raw.dailyUsdCap !== undefined) {
|
|
89
110
|
if (typeof raw.dailyUsdCap !== "number" || raw.dailyUsdCap < 0) {
|
|
@@ -194,6 +215,7 @@ function validateFileConfig(raw, configFilePath) {
|
|
|
194
215
|
"baseUrl",
|
|
195
216
|
"model",
|
|
196
217
|
"judgeModel",
|
|
218
|
+
"defaultMode",
|
|
197
219
|
"defaultTier",
|
|
198
220
|
"dailyUsdCap",
|
|
199
221
|
"timeoutMs",
|
|
@@ -266,11 +288,18 @@ function applyEnvOverrides(base, env) {
|
|
|
266
288
|
patched.provider = provider;
|
|
267
289
|
overridden = true;
|
|
268
290
|
}
|
|
269
|
-
const
|
|
270
|
-
if (
|
|
271
|
-
patched.
|
|
291
|
+
const modeEnv = read("CCLAW_EVAL_MODE");
|
|
292
|
+
if (modeEnv) {
|
|
293
|
+
patched.defaultMode = parseModeEnv(modeEnv, "CCLAW_EVAL_MODE");
|
|
272
294
|
overridden = true;
|
|
273
295
|
}
|
|
296
|
+
else {
|
|
297
|
+
const legacyTier = read("CCLAW_EVAL_TIER");
|
|
298
|
+
if (legacyTier) {
|
|
299
|
+
patched.defaultMode = parseModeEnv(legacyTier, "CCLAW_EVAL_TIER");
|
|
300
|
+
overridden = true;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
274
303
|
const cap = read("CCLAW_EVAL_DAILY_USD_CAP");
|
|
275
304
|
if (cap) {
|
|
276
305
|
patched.dailyUsdCap = parseNumericEnv("CCLAW_EVAL_DAILY_USD_CAP", cap);
|
|
@@ -35,6 +35,22 @@ export declare class DailyCostCapExceededError extends Error {
|
|
|
35
35
|
currentUsd: number;
|
|
36
36
|
});
|
|
37
37
|
}
|
|
38
|
+
/**
|
|
39
|
+
* Per-run cost cap — enforced in-memory, no ledger file. Complements the
|
|
40
|
+
* daily cap so a single long workflow run can't blow the whole day's
|
|
41
|
+
* budget even if the daily cap is generous. Opt-in via
|
|
42
|
+
* `--max-cost-usd=<n>` on the CLI or `CCLAW_EVAL_MAX_COST_USD`.
|
|
43
|
+
*/
|
|
44
|
+
export declare class RunCostCapExceededError extends Error {
|
|
45
|
+
readonly capUsd: number;
|
|
46
|
+
readonly projectedUsd: number;
|
|
47
|
+
readonly currentUsd: number;
|
|
48
|
+
constructor(opts: {
|
|
49
|
+
capUsd: number;
|
|
50
|
+
projectedUsd: number;
|
|
51
|
+
currentUsd: number;
|
|
52
|
+
});
|
|
53
|
+
}
|
|
38
54
|
declare function utcDate(now?: Date): string;
|
|
39
55
|
declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
|
|
40
56
|
/**
|
|
@@ -67,6 +83,12 @@ export interface CreateCostGuardOptions {
|
|
|
67
83
|
now?: () => Date;
|
|
68
84
|
/** Override the default filesystem root for the ledger. */
|
|
69
85
|
ledgerPath?: string;
|
|
86
|
+
/**
|
|
87
|
+
* Per-run (in-memory) USD cap. Independent from the persisted daily
|
|
88
|
+
* cap so a single `cclaw eval` invocation can be budgeted without
|
|
89
|
+
* touching the shared nightly ledger. Undefined = unlimited.
|
|
90
|
+
*/
|
|
91
|
+
runCapUsd?: number;
|
|
70
92
|
}
|
|
71
93
|
export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
|
|
72
94
|
/** Exposed for tests. */
|
package/dist/eval/cost-guard.js
CHANGED
|
@@ -52,6 +52,28 @@ export class DailyCostCapExceededError extends Error {
|
|
|
52
52
|
this.currentUsd = opts.currentUsd;
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* Per-run cost cap — enforced in-memory, no ledger file. Complements the
|
|
57
|
+
* daily cap so a single long workflow run can't blow the whole day's
|
|
58
|
+
* budget even if the daily cap is generous. Opt-in via
|
|
59
|
+
* `--max-cost-usd=<n>` on the CLI or `CCLAW_EVAL_MAX_COST_USD`.
|
|
60
|
+
*/
|
|
61
|
+
export class RunCostCapExceededError extends Error {
|
|
62
|
+
capUsd;
|
|
63
|
+
projectedUsd;
|
|
64
|
+
currentUsd;
|
|
65
|
+
constructor(opts) {
|
|
66
|
+
super(`Run cost cap would be exceeded: ` +
|
|
67
|
+
`current=$${opts.currentUsd.toFixed(4)}, ` +
|
|
68
|
+
`projected=$${opts.projectedUsd.toFixed(4)}, ` +
|
|
69
|
+
`cap=$${opts.capUsd.toFixed(4)}. ` +
|
|
70
|
+
`Raise --max-cost-usd or drop it to run uncapped.`);
|
|
71
|
+
this.name = "RunCostCapExceededError";
|
|
72
|
+
this.capUsd = opts.capUsd;
|
|
73
|
+
this.projectedUsd = opts.projectedUsd;
|
|
74
|
+
this.currentUsd = opts.currentUsd;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
55
77
|
function utcDate(now = new Date()) {
|
|
56
78
|
return now.toISOString().slice(0, 10);
|
|
57
79
|
}
|
|
@@ -109,11 +131,25 @@ export function createCostGuard(projectRoot, config, options = {}) {
|
|
|
109
131
|
const now = options.now ?? (() => new Date());
|
|
110
132
|
const currentDate = () => utcDate(now());
|
|
111
133
|
const file = () => options.ledgerPath ?? ledgerPath(projectRoot, currentDate());
|
|
134
|
+
const runCap = options.runCapUsd;
|
|
135
|
+
let runTotalUsd = 0;
|
|
112
136
|
return {
|
|
113
137
|
async commit(model, usage) {
|
|
114
138
|
const usd = computeUsageUsd(model, usage, config);
|
|
115
|
-
if (
|
|
139
|
+
if (runCap !== undefined) {
|
|
140
|
+
const projected = Number((runTotalUsd + usd).toFixed(6));
|
|
141
|
+
if (projected > runCap) {
|
|
142
|
+
throw new RunCostCapExceededError({
|
|
143
|
+
capUsd: runCap,
|
|
144
|
+
projectedUsd: projected,
|
|
145
|
+
currentUsd: runTotalUsd
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
if (config.dailyUsdCap === undefined) {
|
|
150
|
+
runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
|
|
116
151
|
return usd;
|
|
152
|
+
}
|
|
117
153
|
const date = currentDate();
|
|
118
154
|
const target = file();
|
|
119
155
|
const ledger = await readLedger(target, date);
|
|
@@ -133,6 +169,7 @@ export function createCostGuard(projectRoot, config, options = {}) {
|
|
|
133
169
|
byModel.usd = Number((byModel.usd + usd).toFixed(6));
|
|
134
170
|
ledger.byModel[model] = byModel;
|
|
135
171
|
await writeLedger(target, ledger);
|
|
172
|
+
runTotalUsd = Number((runTotalUsd + usd).toFixed(6));
|
|
136
173
|
return usd;
|
|
137
174
|
},
|
|
138
175
|
async snapshot() {
|
package/dist/eval/diff.d.ts
CHANGED
package/dist/eval/diff.js
CHANGED
|
@@ -8,8 +8,8 @@
|
|
|
8
8
|
* - per-case pass/fail transitions
|
|
9
9
|
* - per-verifier score drops (only the drops — new passes are noted in
|
|
10
10
|
* the summary line, not repeated per verifier)
|
|
11
|
-
* -
|
|
12
|
-
* `workflow` summary for the same case id
|
|
11
|
+
* - Workflow-mode stage-level cost & duration deltas when both reports
|
|
12
|
+
* carry a `workflow` summary for the same case id
|
|
13
13
|
*
|
|
14
14
|
* The resolver accepts three shapes for the `<old>` / `<new>` arguments:
|
|
15
15
|
*
|
|
@@ -98,7 +98,7 @@ function meta(report, sourcePath) {
|
|
|
98
98
|
runId: report.runId,
|
|
99
99
|
cclawVersion: report.cclawVersion,
|
|
100
100
|
generatedAt: report.generatedAt,
|
|
101
|
-
|
|
101
|
+
mode: report.mode,
|
|
102
102
|
model: report.model,
|
|
103
103
|
sourcePath
|
|
104
104
|
};
|
|
@@ -7,7 +7,7 @@ export interface ChatMessage {
|
|
|
7
7
|
toolCallId?: string;
|
|
8
8
|
/**
|
|
9
9
|
* OpenAI-style tool calls carried on a preceding assistant message.
|
|
10
|
-
* Populated by the
|
|
10
|
+
* Populated by the with-tools loop so the wire transcript stays
|
|
11
11
|
* consistent (assistant message → tool responses).
|
|
12
12
|
*/
|
|
13
13
|
toolCalls?: Array<{
|
|
@@ -35,7 +35,7 @@ export interface ChatRequest {
|
|
|
35
35
|
seed?: number;
|
|
36
36
|
/**
|
|
37
37
|
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
38
|
-
* by
|
|
38
|
+
* by agent/workflow modes. Ignored by the single-shot path.
|
|
39
39
|
*/
|
|
40
40
|
tools?: unknown[];
|
|
41
41
|
toolChoice?: "auto" | "none";
|
|
@@ -111,6 +111,17 @@ export interface CreateEvalClientOptions {
|
|
|
111
111
|
retryPolicy?: RetryPolicy;
|
|
112
112
|
/** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
|
|
113
113
|
sleep?: (ms: number) => Promise<void>;
|
|
114
|
+
/**
|
|
115
|
+
* Observer invoked when a chat() call is about to sleep before the next
|
|
116
|
+
* retry attempt. Use this to surface "we are retrying" status via the
|
|
117
|
+
* progress logger so long, silent backoff windows become visible.
|
|
118
|
+
*/
|
|
119
|
+
onRetry?: (event: {
|
|
120
|
+
attempt: number;
|
|
121
|
+
maxAttempts: number;
|
|
122
|
+
waitMs: number;
|
|
123
|
+
error: EvalLlmError;
|
|
124
|
+
}) => void;
|
|
114
125
|
}
|
|
115
126
|
export interface RetryPolicy {
|
|
116
127
|
/** Max retries *on top of* the initial attempt. 0 = single attempt. */
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -251,7 +251,14 @@ export function createEvalClient(config, options = {}) {
|
|
|
251
251
|
const isLastAttempt = attempt === maxAttempts - 1;
|
|
252
252
|
if (!normalized.retryable || isLastAttempt)
|
|
253
253
|
throw normalized;
|
|
254
|
-
|
|
254
|
+
const waitMs = backoffDelay(attempt, retryPolicy);
|
|
255
|
+
options.onRetry?.({
|
|
256
|
+
attempt: attempt + 1,
|
|
257
|
+
maxAttempts,
|
|
258
|
+
waitMs,
|
|
259
|
+
error: normalized
|
|
260
|
+
});
|
|
261
|
+
await sleep(waitMs);
|
|
255
262
|
}
|
|
256
263
|
}
|
|
257
264
|
throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helpers that translate between the legacy `Tier A/B/C` naming and the
|
|
3
|
+
* current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
|
|
4
|
+
*
|
|
5
|
+
* The names we actually carry in reports, config, CLI flags, and verifier
|
|
6
|
+
* messages are the `EvalMode` ones; legacy tier inputs are accepted with a
|
|
7
|
+
* single deprecation warning per process so existing scripts keep working
|
|
8
|
+
* through the 0.28.x line.
|
|
9
|
+
*/
|
|
10
|
+
import { type EvalMode } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Reset the per-process "already warned about legacy tier" flag. Used by
|
|
13
|
+
* tests so each test file gets a deterministic warning surface.
|
|
14
|
+
*/
|
|
15
|
+
export declare function __resetLegacyWarningForTests(): void;
|
|
16
|
+
export interface LegacyTierInput {
|
|
17
|
+
source: "cli" | "env" | "config";
|
|
18
|
+
raw: string;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Normalize a raw string from the CLI / env / config into an `EvalMode`.
|
|
22
|
+
* Accepts both new (`fixture|agent|workflow`) and legacy (`A|B|C`) names.
|
|
23
|
+
* Emits a deprecation warning to stderr at most once per process when a
|
|
24
|
+
* legacy tier name is seen.
|
|
25
|
+
*/
|
|
26
|
+
export declare function parseModeInput(raw: string, input: LegacyTierInput, writeWarning?: (message: string) => void): EvalMode;
|
|
27
|
+
/** @deprecated kept for callers that still need to serialize as legacy. */
|
|
28
|
+
export declare function modeToLegacyTier(mode: EvalMode): "A" | "B" | "C";
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helpers that translate between the legacy `Tier A/B/C` naming and the
|
|
3
|
+
* current `EvalMode` identifiers (`fixture` / `agent` / `workflow`).
|
|
4
|
+
*
|
|
5
|
+
* The names we actually carry in reports, config, CLI flags, and verifier
|
|
6
|
+
* messages are the `EvalMode` ones; legacy tier inputs are accepted with a
|
|
7
|
+
* single deprecation warning per process so existing scripts keep working
|
|
8
|
+
* through the 0.28.x line.
|
|
9
|
+
*/
|
|
10
|
+
import { EVAL_MODES } from "./types.js";
|
|
11
|
+
const LEGACY_TIER_TO_MODE = {
|
|
12
|
+
A: "fixture",
|
|
13
|
+
B: "agent",
|
|
14
|
+
C: "workflow"
|
|
15
|
+
};
|
|
16
|
+
const MODE_TO_LEGACY_TIER = {
|
|
17
|
+
fixture: "A",
|
|
18
|
+
agent: "B",
|
|
19
|
+
workflow: "C"
|
|
20
|
+
};
|
|
21
|
+
const DEPRECATED_NAMES = new Set(Object.keys(LEGACY_TIER_TO_MODE));
|
|
22
|
+
let legacyWarningEmitted = false;
|
|
23
|
+
/**
|
|
24
|
+
* Reset the per-process "already warned about legacy tier" flag. Used by
|
|
25
|
+
* tests so each test file gets a deterministic warning surface.
|
|
26
|
+
*/
|
|
27
|
+
export function __resetLegacyWarningForTests() {
|
|
28
|
+
legacyWarningEmitted = false;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Normalize a raw string from the CLI / env / config into an `EvalMode`.
|
|
32
|
+
* Accepts both new (`fixture|agent|workflow`) and legacy (`A|B|C`) names.
|
|
33
|
+
* Emits a deprecation warning to stderr at most once per process when a
|
|
34
|
+
* legacy tier name is seen.
|
|
35
|
+
*/
|
|
36
|
+
export function parseModeInput(raw, input, writeWarning = defaultWriteWarning) {
|
|
37
|
+
const trimmed = raw.trim();
|
|
38
|
+
if (trimmed.length === 0) {
|
|
39
|
+
throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C).`);
|
|
40
|
+
}
|
|
41
|
+
if (EVAL_MODES.includes(trimmed)) {
|
|
42
|
+
return trimmed;
|
|
43
|
+
}
|
|
44
|
+
if (DEPRECATED_NAMES.has(trimmed)) {
|
|
45
|
+
const replacement = LEGACY_TIER_TO_MODE[trimmed];
|
|
46
|
+
if (!legacyWarningEmitted) {
|
|
47
|
+
legacyWarningEmitted = true;
|
|
48
|
+
writeWarning(`[cclaw] "${input.source}: ${input.raw}" is using the legacy tier name "${trimmed}". ` +
|
|
49
|
+
`Please switch to --mode=${replacement} (legacy --tier=A|B|C will be removed in the next major release).`);
|
|
50
|
+
}
|
|
51
|
+
return replacement;
|
|
52
|
+
}
|
|
53
|
+
throw new Error(`Evaluation mode must be one of: ${EVAL_MODES.join("|")} (or legacy A|B|C), got: ${raw}`);
|
|
54
|
+
}
|
|
55
|
+
/** @deprecated kept for callers that still need to serialize as legacy. */
|
|
56
|
+
export function modeToLegacyTier(mode) {
|
|
57
|
+
return MODE_TO_LEGACY_TIER[mode];
|
|
58
|
+
}
|
|
59
|
+
function defaultWriteWarning(message) {
|
|
60
|
+
process.stderr.write(`${message}\n`);
|
|
61
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight progress logger for `cclaw eval`.
|
|
3
|
+
*
|
|
4
|
+
* The runner is otherwise silent: a full workflow-mode run can easily take
|
|
5
|
+
* a few minutes and the user would see nothing until the Markdown report
|
|
6
|
+
* hits disk. We emit structured events here so the CLI can print concise
|
|
7
|
+
* one-line status updates to stderr (stdout stays reserved for the final
|
|
8
|
+
* report + `--json` output).
|
|
9
|
+
*
|
|
10
|
+
* The logger is intentionally minimal: no ANSI colors, no spinners, no
|
|
11
|
+
* carriage-return rewrites. Those do not survive `tee`, CI log viewers,
|
|
12
|
+
* or the background `runs/tail` path (which copies the stream to a log
|
|
13
|
+
* file), and users also told us "nothing is clear now, everything is
|
|
14
|
+
* long" — so we optimize for log-friendly line-by-line readability.
|
|
15
|
+
*/
|
|
16
|
+
import type { EvalMode, WorkflowStageName } from "./types.js";
|
|
17
|
+
export type ProgressEvent = {
|
|
18
|
+
kind: "run-start";
|
|
19
|
+
mode: EvalMode;
|
|
20
|
+
totalCases: number;
|
|
21
|
+
} | {
|
|
22
|
+
kind: "case-start";
|
|
23
|
+
caseId: string;
|
|
24
|
+
stage: string;
|
|
25
|
+
index: number;
|
|
26
|
+
total: number;
|
|
27
|
+
} | {
|
|
28
|
+
kind: "case-end";
|
|
29
|
+
caseId: string;
|
|
30
|
+
stage: string;
|
|
31
|
+
index: number;
|
|
32
|
+
total: number;
|
|
33
|
+
passed: boolean;
|
|
34
|
+
durationMs: number;
|
|
35
|
+
costUsd?: number;
|
|
36
|
+
} | {
|
|
37
|
+
kind: "stage-start";
|
|
38
|
+
caseId: string;
|
|
39
|
+
stage: WorkflowStageName;
|
|
40
|
+
index: number;
|
|
41
|
+
total: number;
|
|
42
|
+
} | {
|
|
43
|
+
kind: "stage-end";
|
|
44
|
+
caseId: string;
|
|
45
|
+
stage: WorkflowStageName;
|
|
46
|
+
index: number;
|
|
47
|
+
total: number;
|
|
48
|
+
passed: boolean;
|
|
49
|
+
durationMs: number;
|
|
50
|
+
costUsd?: number;
|
|
51
|
+
} | {
|
|
52
|
+
kind: "retry";
|
|
53
|
+
caseId: string;
|
|
54
|
+
stage?: string;
|
|
55
|
+
attempt: number;
|
|
56
|
+
maxAttempts: number;
|
|
57
|
+
waitMs: number;
|
|
58
|
+
reason: string;
|
|
59
|
+
} | {
|
|
60
|
+
kind: "run-end";
|
|
61
|
+
totalCases: number;
|
|
62
|
+
passed: number;
|
|
63
|
+
failed: number;
|
|
64
|
+
durationMs: number;
|
|
65
|
+
};
|
|
66
|
+
export interface ProgressLogger {
|
|
67
|
+
emit(event: ProgressEvent): void;
|
|
68
|
+
}
|
|
69
|
+
export declare function noopProgressLogger(): ProgressLogger;
|
|
70
|
+
export interface StderrProgressLoggerOptions {
|
|
71
|
+
/** Override the underlying write target; defaults to `process.stderr.write`. */
|
|
72
|
+
writer?: (message: string) => void;
|
|
73
|
+
/** Return wall-clock in ms. Injectable for tests. */
|
|
74
|
+
now?: () => number;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Emit a one-line status update per event to stderr.
|
|
78
|
+
*
|
|
79
|
+
* Format is deliberately boring: `[cclaw eval] <message>` so users can grep
|
|
80
|
+
* for the prefix in combined logs. Costs are rendered with up to 4 decimals
|
|
81
|
+
* so sub-cent runs still show a non-zero value.
|
|
82
|
+
*/
|
|
83
|
+
export declare function createStderrProgressLogger(opts?: StderrProgressLoggerOptions): ProgressLogger;
|