cclaw-cli 0.24.0 → 0.25.0

package/dist/cli.js CHANGED
@@ -59,7 +59,7 @@ Commands:
   --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
   --schema-only Run only structural verifiers (default).
   --rules Also run rule-based verifiers (keywords, regex, counts, uniqueness, traceability).
-  --judge Include LLM judging (not wired yet; requires API key).
+  --judge Run the LLM judge (median-of-N) against each case's rubric. Requires CCLAW_EVAL_API_KEY; Tier A also runs the single-shot agent-under-test.
   --dry-run Validate config + corpus, print summary, do not execute.
   --json Emit machine-readable JSON on stdout.
   --no-write Skip writing the report to .cclaw/evals/reports/.
@@ -78,6 +78,7 @@ Examples:
   cclaw archive --name=payments-revamp
   cclaw eval --dry-run
   cclaw eval --stage=brainstorm --schema-only
+  cclaw eval --judge --tier=A --stage=brainstorm
 
   Docs: https://github.com/zuevrs/cclaw
   Issues: https://github.com/zuevrs/cclaw/issues
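The new help text describes the judge as "median-of-N": N independent judge samples are collected per rubric check and the median is kept. A minimal sketch of that aggregation, for orientation only (`medianOfN` is a hypothetical helper, not an export of cclaw-cli):

```ts
// Median-of-N aggregation as named in the --judge help text.
// With the default judgeSamples: 3 (see the config hunk below), N is odd,
// so the median is always one of the actual sampled scores.
function medianOfN(scores: number[]): number {
  if (scores.length === 0) throw new Error("no judge samples");
  const sorted = [...scores].sort((a, b) => a - b);
  return sorted[Math.floor(sorted.length / 2)]; // exact middle element when N is odd
}

medianOfN([4, 2, 5]); // => 4
```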
@@ -6,6 +6,10 @@
  */
 export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap)\n# B = SDK with tool use (realistic)\n# C = multi-stage workflow (end-to-end)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
 export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Tier B/C runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
-export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
+export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale. The runner picks `<stage>.yaml` when\n`cclaw eval --judge` is invoked; every stage ships a starter rubric\nbelow \u2014 edit the checks to match what your team cares about, and add\n`critical: true` to the checks that should hard-fail nightly CI on\nregression.\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n critical: false\n```\n\nSee `docs/evals.md` for the full schema.\n";
+export declare const EVAL_RUBRIC_FILES: ReadonlyArray<{
+    stage: string;
+    contents: string;
+}>;
 export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`.\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
 export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
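`EVAL_RUBRIC_FILES` pairs each stage with rendered YAML contents, which suggests a straightforward scaffolding loop; a sketch of one plausible consumer (the import path and the `.cclaw/evals/rubrics/` target directory are assumptions, not shown in this diff):

```ts
import fs from "node:fs/promises";
import path from "node:path";
import { EVAL_RUBRIC_FILES } from "cclaw-cli/dist/content/evals.js"; // path is a guess

// Write one starter rubric per stage, e.g. brainstorm.yaml, scope.yaml, ...
// The runner is documented to pick <stage>.yaml when --judge is invoked.
async function scaffoldRubrics(projectRoot: string): Promise<void> {
  const dir = path.join(projectRoot, ".cclaw", "evals", "rubrics"); // assumed location
  await fs.mkdir(dir, { recursive: true });
  for (const { stage, contents } of EVAL_RUBRIC_FILES) {
    await fs.writeFile(path.join(dir, `${stage}.yaml`), contents, "utf8");
  }
}
```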
@@ -57,7 +57,11 @@ stage (40 total) once rule verifiers land. Tier B/C runs may add
 export const EVAL_RUBRICS_README = `# Eval Rubrics
 
 LLM-judge rubrics. Each rubric is a short list of checks scored on a
-\`1–5\` scale with a rationale:
+\`1–5\` scale with a rationale. The runner picks \`<stage>.yaml\` when
+\`cclaw eval --judge\` is invoked; every stage ships a starter rubric
+below — edit the checks to match what your team cares about, and add
+\`critical: true\` to the checks that should hard-fail nightly CI on
+regression.
 
 \`\`\`yaml
 stage: brainstorm
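A check marked `critical: true` ties into the config's `regression.failIfCriticalBelow` threshold (3.0 by default, per EVAL_CONFIG_YAML). The actual gate lives in the package's regression code, which this diff does not show; a sketch of the semantics the wording implies:

```ts
// Hypothetical CI gate: any critical check scoring below the configured
// absolute floor (regression.failIfCriticalBelow, default 3.0) fails the run.
interface ScoredCheck {
  id: string;
  score: number;      // median-of-N judge score on the 1-5 scale
  critical?: boolean;
}

function criticalFailures(checks: ScoredCheck[], floor = 3.0): ScoredCheck[] {
  return checks.filter((c) => c.critical === true && c.score < floor);
}
```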
@@ -66,12 +70,289 @@ checks:
     prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
     scale: "1-5 where 5=fully distinct approaches"
     weight: 1.0
+    critical: false
 \`\`\`
 
-Rubric authoring happens when Tier A runs start producing artifacts, so we
-score the *right* properties rather than retrofitting generic quality checks.
 See \`docs/evals.md\` for the full schema.
 `;
+const STARTER_RUBRICS = [
+    {
+        stage: "brainstorm",
+        checks: [
+            {
+                id: "distinctness",
+                prompt: "Are the proposed directions genuinely distinct (different approaches, not rephrasings of one idea)?",
+                scale: "1-5 where 5 = every direction uses a materially different approach",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "coverage",
+                prompt: "Do the directions cover the problem space (at least one tackling cost, one velocity, one risk)?",
+                scale: "1-5 where 5 = each major trade-off dimension has a direction",
+                weight: 1.0
+            },
+            {
+                id: "actionability",
+                prompt: "Could a reader pick one direction and start a scope doc tomorrow without asking clarifying questions?",
+                scale: "1-5 where 5 = every direction is concrete enough to scope immediately",
+                weight: 1.0
+            },
+            {
+                id: "recommendation-clarity",
+                prompt: "Is the Recommendation section explicit, single-voiced, and consistent with the highest-ranked direction?",
+                scale: "1-5 where 5 = recommendation names the chosen direction and the decisive trade-off",
+                weight: 1.0,
+                critical: true
+            }
+        ]
+    },
+    {
+        stage: "scope",
+        checks: [
+            {
+                id: "problem-statement",
+                prompt: "Is the problem statement anchored on user/system behavior (not on a proposed solution)?",
+                scale: "1-5 where 5 = problem is described independently of any implementation choice",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "non-goals",
+                prompt: "Are non-goals explicit and mutually-exclusive with the goals (no overlap, no vague 'we might' entries)?",
+                scale: "1-5 where 5 = every non-goal is a crisp decision a future reader can defend",
+                weight: 1.0
+            },
+            {
+                id: "decision-ids",
+                prompt: "Does the Decisions section use stable D-NN ids and name who (or what) owns each decision?",
+                scale: "1-5 where 5 = every decision has a D-NN id and an explicit owner",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "risks",
+                prompt: "Are risks concrete (named system, threshold, or scenario) rather than generic hedges?",
+                scale: "1-5 where 5 = each risk is testable by observing a specific signal",
+                weight: 0.8
+            }
+        ]
+    },
+    {
+        stage: "design",
+        checks: [
+            {
+                id: "decision-trace",
+                prompt: "Does the design doc restate every scope D-NN that drives the architecture, and call out the ones it rejects?",
+                scale: "1-5 where 5 = full D-NN trace with explicit kept/rejected markers",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "diagram-or-flow",
+                prompt: "Is there at least one diagram or clearly labeled flow section that shows data and control moving across the system?",
+                scale: "1-5 where 5 = diagram covers read path, write path, and failure path",
+                weight: 1.0
+            },
+            {
+                id: "alternatives-considered",
+                prompt: "Are concrete alternatives considered with explicit trade-offs (cost, complexity, latency)?",
+                scale: "1-5 where 5 = at least two alternatives are rejected with reasons tied to measurable properties",
+                weight: 0.8
+            },
+            {
+                id: "interface-stability",
+                prompt: "Are public interfaces (APIs, queues, tables) named, typed, and marked as SEMVER-stable or experimental?",
+                scale: "1-5 where 5 = every interface has a name, a type/shape, and a stability tag",
+                weight: 1.0
+            }
+        ]
+    },
+    {
+        stage: "spec",
+        checks: [
+            {
+                id: "acceptance-criteria",
+                prompt: "Does the spec have explicit Acceptance Criteria bullets that are unambiguously verifiable?",
+                scale: "1-5 where 5 = each AC states an observable condition with clear pass/fail",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "edge-cases",
+                prompt: "Are failure modes and edge cases enumerated (empty input, concurrent writers, partial outage)?",
+                scale: "1-5 where 5 = at least three distinct edge cases with expected behavior",
+                weight: 1.0
+            },
+            {
+                id: "test-plan-hooks",
+                prompt: "Does the spec name the test surfaces (unit, integration, e2e, synthetic probe) that will validate each AC?",
+                scale: "1-5 where 5 = every AC maps to at least one test surface",
+                weight: 1.0
+            },
+            {
+                id: "traceability",
+                prompt: "Does the spec cite the originating scope decisions (D-NN) and design sections so future engineers can trace back?",
+                scale: "1-5 where 5 = every material choice links to a D-NN or design heading",
+                weight: 0.8,
+                critical: true
+            }
+        ]
+    },
+    {
+        stage: "plan",
+        checks: [
+            {
+                id: "task-granularity",
+                prompt: "Are tasks sized so one engineer can land each in a single PR (<1 day of work)?",
+                scale: "1-5 where 5 = every T-NN fits in a single reviewable PR",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "tdd-loop",
+                prompt: "Does each task have explicit RED/GREEN/REFACTOR expectations or an equivalent TDD-compatible exit condition?",
+                scale: "1-5 where 5 = every task says what test fails first and what code makes it pass",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "dependency-graph",
+                prompt: "Is the dependency order between tasks explicit (and minimal), so parallelizable work is called out?",
+                scale: "1-5 where 5 = every task lists its blockers and independent tasks are marked parallelizable",
+                weight: 0.8
+            },
+            {
+                id: "scope-traceability",
+                prompt: "Does the plan reference the scope D-NN ids that drive each task, and does coverage leave no decision orphaned?",
+                scale: "1-5 where 5 = every D-NN appears in at least one task and every task names its D-NN",
+                weight: 1.0
+            }
+        ]
+    },
+    {
+        stage: "tdd",
+        checks: [
+            {
+                id: "red-first",
+                prompt: "Does the artifact show a failing test (RED) before the implementation change (GREEN)?",
+                scale: "1-5 where 5 = RED command output is quoted and the fix lands after",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "refactor-evidence",
+                prompt: "Is there a REFACTOR step with a diff or named improvement (not just passing tests)?",
+                scale: "1-5 where 5 = REFACTOR names a specific code-quality win and cites the affected file(s)",
+                weight: 0.8
+            },
+            {
+                id: "gate-evidence",
+                prompt: "Does the artifact quote the output of the required gates (lint, typecheck, tests) after the change?",
+                scale: "1-5 where 5 = every gate command is reproduced with its exit status",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "learnings",
+                prompt: "Does the artifact capture at least one durable learning (pattern, pitfall, follow-up) for future runs?",
+                scale: "1-5 where 5 = learning is specific, filed under knowledge.jsonl or an equivalent store",
+                weight: 0.6
+            }
+        ]
+    },
+    {
+        stage: "review",
+        checks: [
+            {
+                id: "two-layer-structure",
+                prompt: "Does the review show both layers (automated gates + human judgment) with distinct evidence?",
+                scale: "1-5 where 5 = Layer 1 cites tool outputs, Layer 2 cites reviewer reasoning",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "blocker-severity",
+                prompt: "Are issues classified by severity (blocker / major / minor) with one-line rationales?",
+                scale: "1-5 where 5 = every finding names severity + consequence if not fixed",
+                weight: 1.0
+            },
+            {
+                id: "security-posture",
+                prompt: "Does the review cover security-relevant areas explicitly (secrets, authz, PII, deps)?",
+                scale: "1-5 where 5 = each security dimension is addressed (with 'n/a' counted as a deliberate pass)",
+                weight: 0.8,
+                critical: true
+            },
+            {
+                id: "follow-ups",
+                prompt: "Are non-blocking follow-ups filed as explicit tickets or knowledge-log entries (not left as prose)?",
+                scale: "1-5 where 5 = every follow-up has a home and an owner",
+                weight: 0.8
+            }
+        ]
+    },
+    {
+        stage: "ship",
+        checks: [
+            {
+                id: "release-readiness",
+                prompt: "Does the artifact prove release readiness (gates green, changelog, version bump)?",
+                scale: "1-5 where 5 = each readiness item is linked to concrete evidence",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "rollback",
+                prompt: "Is there an explicit rollback path (command, feature-flag, migration reversal)?",
+                scale: "1-5 where 5 = rollback is reproducible from the doc with no context rehydration",
+                weight: 1.0,
+                critical: true
+            },
+            {
+                id: "monitoring",
+                prompt: "Are monitoring and alerting hooks named (dashboards, logs, SLO tripwires)?",
+                scale: "1-5 where 5 = each hook has a canonical URL or query",
+                weight: 0.8
+            },
+            {
+                id: "retro-seed",
+                prompt: "Does the artifact leave a retro seed (what went well, what to change for the next run)?",
+                scale: "1-5 where 5 = at least one distinct 'keep' and one 'change' statement",
+                weight: 0.6
+            }
+        ]
+    }
+];
+function renderRubric(rubric) {
+    const lines = [];
+    lines.push(`# Starter rubric for the \`${rubric.stage}\` stage.`);
+    lines.push(`# Edit the checks to reflect your team's bar before running`);
+    lines.push(`# \`cclaw eval --judge\`. Every check id is used verbatim in`);
+    lines.push(`# report output and baseline files, so keep slugs stable once`);
+    lines.push(`# they start appearing in CI.`);
+    lines.push(`stage: ${rubric.stage}`);
+    lines.push(`checks:`);
+    for (const check of rubric.checks) {
+        lines.push(`  - id: ${check.id}`);
+        lines.push(`    prompt: >-`);
+        lines.push(`      ${check.prompt}`);
+        if (check.scale !== undefined) {
+            lines.push(`    scale: ${JSON.stringify(check.scale)}`);
+        }
+        if (check.weight !== undefined) {
+            lines.push(`    weight: ${check.weight}`);
+        }
+        if (check.critical === true) {
+            lines.push(`    critical: true`);
+        }
+    }
+    return `${lines.join("\n")}\n`;
+}
+export const EVAL_RUBRIC_FILES = STARTER_RUBRICS.map((rubric) => ({
+    stage: rubric.stage,
+    contents: renderRubric(rubric)
+}));
 export const EVAL_BASELINES_README = `# Eval Baselines
 
 Frozen score snapshots used by regression gates. Baselines are committed to
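For reference, `renderRubric` emits plain YAML per stage. The first `brainstorm` check would render roughly as below (reconstructed by hand from the code, not captured from a run; note that `${check.weight}` stringifies `1.0` as `1`, which is still valid YAML, and `JSON.stringify` supplies the quotes around `scale`):

```ts
import { EVAL_RUBRIC_FILES } from "cclaw-cli/dist/content/evals.js"; // path is a guess

const brainstorm = EVAL_RUBRIC_FILES.find((f) => f.stage === "brainstorm")!;
// brainstorm.contents begins (approximately):
//   # Starter rubric for the `brainstorm` stage.
//   ...
//   stage: brainstorm
//   checks:
//     - id: distinctness
//       prompt: >-
//         Are the proposed directions genuinely distinct (different approaches, not rephrasings of one idea)?
//       scale: "1-5 where 5 = every direction uses a materially different approach"
//       weight: 1
//       critical: true
```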
@@ -0,0 +1,27 @@
+import type { FlowStage } from "../../types.js";
+import type { ChatUsage, EvalLlmClient } from "../llm-client.js";
+import type { EvalCase, ResolvedEvalConfig } from "../types.js";
+export interface SingleShotInput {
+    caseEntry: EvalCase;
+    config: Pick<ResolvedEvalConfig, "model" | "agentTemperature" | "timeoutMs" | "tokenPricing">;
+    projectRoot: string;
+    client: EvalLlmClient;
+    /**
+     * Override the SKILL.md loader. Primarily a test hook so unit tests
+     * can swap a canned system prompt without creating fixtures on disk.
+     */
+    loadSkill?: (stage: FlowStage) => Promise<string>;
+}
+export interface SingleShotOutput {
+    artifact: string;
+    usage: ChatUsage;
+    usageUsd: number;
+    model: string;
+    durationMs: number;
+    attempts: number;
+    systemPrompt: string;
+    userPrompt: string;
+}
+export declare function loadStageSkill(projectRoot: string, stage: FlowStage): Promise<string>;
+/** Run the Tier A single-shot AUT and return the produced artifact. */
+export declare function runSingleShot(input: SingleShotInput): Promise<SingleShotOutput>;
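The `loadSkill` hook makes the agent testable without disk fixtures, as its doc comment says; a minimal test-style sketch (the import path is a guess, and the stub only approximates the one `chat()` call `runSingleShot` makes, hence the `as any` casts):

```ts
import { runSingleShot } from "cclaw-cli/dist/evals/agents/single-shot.js"; // path is a guess

// Stub client: returns a fixed artifact. Field names on usage other than
// totalTokens are assumptions about ChatUsage's shape.
const client = {
  async chat() {
    return {
      content: "# Brainstorm\n...",
      model: "glm-5.1",
      attempts: 1,
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
  }
} as any;

const out = await runSingleShot({
  caseEntry: { id: "brainstorm-01", stage: "brainstorm", inputPrompt: "One short paragraph." } as any,
  config: { model: "glm-5.1", timeoutMs: 120_000 } as any,
  projectRoot: process.cwd(),
  client,
  loadSkill: async () => "You are the brainstorm stage." // canned SKILL.md, no fixtures
});
// out.systemPrompt === "You are the brainstorm stage."
```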
@@ -0,0 +1,79 @@
+/**
+ * Tier A single-shot agent.
+ *
+ * Simplest realistic AUT: one LLM call with the stage's SKILL.md as the
+ * system prompt and the case's `inputPrompt` as the user message. Output
+ * is the raw assistant content, returned as the artifact for the judge
+ * pipeline.
+ *
+ * Design notes:
+ *
+ * - No tools. No multi-turn. No reads of the project beyond the one
+ *   SKILL.md. Tier B/C layer complexity on top in later steps.
+ * - Errors are propagated as-is (`EvalLlmError` subclasses) so the
+ *   runner can surface them as verifier failures without swallowing the
+ *   cause.
+ * - Usage and USD cost are surfaced so the runner can commit them to
+ *   the cost guard + case-level `costUsd`.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import { RUNTIME_ROOT } from "../../constants.js";
+import { stageSkillFolder } from "../../content/skills.js";
+import { exists } from "../../fs-utils.js";
+import { computeUsageUsd } from "../cost-guard.js";
+export async function loadStageSkill(projectRoot, stage) {
+    const folder = stageSkillFolder(stage);
+    const file = path.join(projectRoot, RUNTIME_ROOT, "skills", folder, "SKILL.md");
+    if (!(await exists(file))) {
+        throw new Error(`Stage skill not found: ${path.relative(projectRoot, file)}. ` +
+            `Run \`cclaw init\` (or \`cclaw sync\`) before \`cclaw eval --tier=A --judge\`.`);
+    }
+    return fs.readFile(file, "utf8");
+}
+function buildMessages(systemPrompt, userPrompt) {
+    return [
+        { role: "system", content: systemPrompt },
+        { role: "user", content: userPrompt }
+    ];
+}
+function buildUserPrompt(caseEntry) {
+    const lines = [];
+    lines.push(`Stage: ${caseEntry.stage}`);
+    lines.push(`Case id: ${caseEntry.id}`);
+    lines.push(``);
+    lines.push(`Task:`);
+    lines.push(caseEntry.inputPrompt.trim());
+    lines.push(``);
+    lines.push(`Produce the artifact required by this stage using the SKILL.md above. ` +
+        `Output the artifact directly (markdown with optional YAML frontmatter). ` +
+        `Do not wrap in code fences, do not add commentary before or after.`);
+    return lines.join("\n");
+}
+/** Run the Tier A single-shot AUT and return the produced artifact. */
+export async function runSingleShot(input) {
+    const { caseEntry, config, projectRoot, client } = input;
+    const started = Date.now();
+    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
+    const systemPrompt = await loader(caseEntry.stage);
+    const userPrompt = buildUserPrompt(caseEntry);
+    const response = await client.chat({
+        model: config.model,
+        messages: buildMessages(systemPrompt, userPrompt),
+        temperature: config.agentTemperature ?? 0.2,
+        timeoutMs: config.timeoutMs
+    });
+    const usageUsd = computeUsageUsd(response.model, response.usage, {
+        tokenPricing: config.tokenPricing
+    });
+    return {
+        artifact: response.content.trim(),
+        usage: response.usage,
+        usageUsd,
+        model: response.model,
+        attempts: response.attempts,
+        durationMs: Date.now() - started,
+        systemPrompt,
+        userPrompt
+    };
+}
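Per the design notes above, failures bubble out as `EvalLlmError` subclasses rather than being swallowed. A hedged sketch of how a runner might surface them as case-level verifier failures (the `EvalLlmError` import path and the failure-record shape are assumptions; only the class name is attested by the module's doc comment):

```ts
import { runSingleShot } from "cclaw-cli/dist/evals/agents/single-shot.js"; // path is a guess
import { EvalLlmError } from "cclaw-cli/dist/evals/llm-client.js";          // path is a guess

// Illustrative runner fragment: convert an LLM failure into a verifier
// failure for this one case instead of crashing the whole eval run.
async function runCase(input: Parameters<typeof runSingleShot>[0]) {
  try {
    return { ok: true as const, output: await runSingleShot(input) };
  } catch (err) {
    if (err instanceof EvalLlmError) {
      return { ok: false as const, failure: `llm: ${err.message}` };
    }
    throw err; // genuine bugs still crash loudly
  }
}
```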
@@ -20,13 +20,19 @@ export const DEFAULT_EVAL_CONFIG = {
         failIfCriticalBelow: 3.0
     },
     timeoutMs: 120_000,
-    maxRetries: 2
+    maxRetries: 2,
+    judgeSamples: 3,
+    judgeTemperature: 0,
+    agentTemperature: 0.2
 };
 const EVAL_TIER_SET = new Set(EVAL_TIERS);
 const NUMERIC_ENVS = new Set([
     "CCLAW_EVAL_DAILY_USD_CAP",
     "CCLAW_EVAL_TIMEOUT_MS",
-    "CCLAW_EVAL_MAX_RETRIES"
+    "CCLAW_EVAL_MAX_RETRIES",
+    "CCLAW_EVAL_JUDGE_SAMPLES",
+    "CCLAW_EVAL_JUDGE_TEMPERATURE",
+    "CCLAW_EVAL_AGENT_TEMPERATURE"
]);
function evalConfigError(configFilePath, reason) {
    return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
@@ -93,6 +99,59 @@ function validateFileConfig(raw, configFilePath) {
         }
         out.maxRetries = raw.maxRetries;
     }
+    if (raw.judgeSamples !== undefined) {
+        const value = raw.judgeSamples;
+        if (!Number.isInteger(value) || value < 1) {
+            throw evalConfigError(configFilePath, `"judgeSamples" must be a positive integer`);
+        }
+        if (value % 2 === 0) {
+            throw evalConfigError(configFilePath, `"judgeSamples" must be odd (so median-of-N is a true integer)`);
+        }
+        out.judgeSamples = value;
+    }
+    if (raw.judgeTemperature !== undefined) {
+        if (typeof raw.judgeTemperature !== "number" || !Number.isFinite(raw.judgeTemperature)) {
+            throw evalConfigError(configFilePath, `"judgeTemperature" must be a finite number`);
+        }
+        if (raw.judgeTemperature < 0 || raw.judgeTemperature > 2) {
+            throw evalConfigError(configFilePath, `"judgeTemperature" must be within [0, 2]`);
+        }
+        out.judgeTemperature = raw.judgeTemperature;
+    }
+    if (raw.agentTemperature !== undefined) {
+        if (typeof raw.agentTemperature !== "number" || !Number.isFinite(raw.agentTemperature)) {
+            throw evalConfigError(configFilePath, `"agentTemperature" must be a finite number`);
+        }
+        if (raw.agentTemperature < 0 || raw.agentTemperature > 2) {
+            throw evalConfigError(configFilePath, `"agentTemperature" must be within [0, 2]`);
+        }
+        out.agentTemperature = raw.agentTemperature;
+    }
+    if (raw.tokenPricing !== undefined) {
+        if (!isRecord(raw.tokenPricing)) {
+            throw evalConfigError(configFilePath, `"tokenPricing" must be a mapping`);
+        }
+        const pricing = {};
+        for (const [model, value] of Object.entries(raw.tokenPricing)) {
+            if (!isRecord(value)) {
+                throw evalConfigError(configFilePath, `"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
+            }
+            const input = value.input;
+            const output = value.output;
+            if (typeof input !== "number" || input < 0) {
+                throw evalConfigError(configFilePath, `"tokenPricing.${model}.input" must be a non-negative number`);
+            }
+            if (typeof output !== "number" || output < 0) {
+                throw evalConfigError(configFilePath, `"tokenPricing.${model}.output" must be a non-negative number`);
+            }
+            const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
+            if (extraneous.length > 0) {
+                throw evalConfigError(configFilePath, `"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
+            }
+            pricing[model] = { input, output };
+        }
+        out.tokenPricing = pricing;
+    }
     if (raw.regression !== undefined) {
         if (!isRecord(raw.regression)) {
             throw evalConfigError(configFilePath, `"regression" must be a mapping`);
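Written out, the raw config shape this validator accepts for the new keys looks like the object below (shown as the parsed object `validateFileConfig` receives; model name and rates are illustrative, not real prices):

```ts
// Parsed eval config as validateFileConfig sees it. Under each model,
// keys other than `input`/`output` are rejected as unknown.
const raw = {
  judgeSamples: 5,        // must be a positive odd integer
  judgeTemperature: 0,    // finite, within [0, 2]
  agentTemperature: 0.2,  // finite, within [0, 2]
  tokenPricing: {
    "glm-5.1": { input: 0.0006, output: 0.0022 } // USD per 1K tokens (made-up numbers)
  }
};
```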
@@ -123,7 +182,11 @@ function validateFileConfig(raw, configFilePath) {
         "dailyUsdCap",
         "timeoutMs",
         "maxRetries",
-        "regression"
+        "regression",
+        "judgeSamples",
+        "judgeTemperature",
+        "agentTemperature",
+        "tokenPricing"
     ]);
     const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
     if (unknown.length > 0) {
@@ -203,6 +266,36 @@ function applyEnvOverrides(base, env) {
         patched.maxRetries = parseNumericEnv("CCLAW_EVAL_MAX_RETRIES", retries);
         overridden = true;
     }
+    const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
+    if (judgeSamples) {
+        const value = parseNumericEnv("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
+        if (!Number.isInteger(value) || value < 1) {
+            throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be a positive integer, got: ${judgeSamples}`);
+        }
+        if (value % 2 === 0) {
+            throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
+        }
+        patched.judgeSamples = value;
+        overridden = true;
+    }
+    const judgeTemp = read("CCLAW_EVAL_JUDGE_TEMPERATURE");
+    if (judgeTemp) {
+        const value = parseNumericEnv("CCLAW_EVAL_JUDGE_TEMPERATURE", judgeTemp);
+        if (value < 0 || value > 2) {
+            throw new Error(`Environment variable CCLAW_EVAL_JUDGE_TEMPERATURE must be within [0, 2], got: ${judgeTemp}`);
+        }
+        patched.judgeTemperature = value;
+        overridden = true;
+    }
+    const agentTemp = read("CCLAW_EVAL_AGENT_TEMPERATURE");
+    if (agentTemp) {
+        const value = parseNumericEnv("CCLAW_EVAL_AGENT_TEMPERATURE", agentTemp);
+        if (value < 0 || value > 2) {
+            throw new Error(`Environment variable CCLAW_EVAL_AGENT_TEMPERATURE must be within [0, 2], got: ${agentTemp}`);
+        }
+        patched.agentTemperature = value;
+        overridden = true;
+    }
    const apiKey = read("CCLAW_EVAL_API_KEY");
    return { patched, overridden, apiKey };
}
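Environment overrides win over both the file config and the defaults ("env wins", per the comment in EVAL_CONFIG_YAML). A quick illustration of the precedence as this code implements it (the surrounding resolve function is not shown in this diff, so the scenario below is descriptive, not a verbatim call):

```ts
// With the defaults above (judgeSamples: 3), this env patches the resolved
// config to judgeSamples: 5 and agentTemperature: 0.7.
const env = {
  CCLAW_EVAL_JUDGE_SAMPLES: "5",       // accepted: positive and odd
  CCLAW_EVAL_AGENT_TEMPERATURE: "0.7"  // accepted: within [0, 2]
};
// CCLAW_EVAL_JUDGE_SAMPLES="4" would throw: even sample counts are rejected,
// so the median is always a single sample's score rather than an average.
```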
@@ -0,0 +1,80 @@
+import type { ChatUsage } from "./llm-client.js";
+import type { ResolvedEvalConfig, TokenPricing } from "./types.js";
+/**
+ * Builtin pricing fallback. Intentionally conservative: when the user
+ * hasn't configured pricing and we don't know the model, we default to a
+ * "small model" USD schedule so the cap can still do something useful.
+ *
+ * Values are USD per 1K tokens. Sources are public pricing pages as of
+ * 2026-04; update by editing this constant, not the guard logic.
+ */
+export declare const DEFAULT_TOKEN_PRICING: Readonly<Record<string, TokenPricing>>;
+/** Hard default when neither config nor builtins know the model. */
+export declare const UNKNOWN_MODEL_PRICING: TokenPricing;
+export interface SpendLedger {
+    /** ISO date (`YYYY-MM-DD` in UTC) — also embedded in the file name. */
+    date: string;
+    /** USD spent so far today across every call that hit the guard. */
+    totalUsd: number;
+    /** Number of `chat()` calls accounted for. */
+    calls: number;
+    /** Per-model breakdown for the report. */
+    byModel: Record<string, {
+        tokensIn: number;
+        tokensOut: number;
+        usd: number;
+    }>;
+}
+export declare class DailyCostCapExceededError extends Error {
+    readonly capUsd: number;
+    readonly projectedUsd: number;
+    readonly currentUsd: number;
+    constructor(opts: {
+        capUsd: number;
+        projectedUsd: number;
+        currentUsd: number;
+    });
+}
+declare function utcDate(now?: Date): string;
+declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
+/**
+ * Compute USD cost of a single `ChatUsage` using the given `model` pricing
+ * schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
+ * before first token).
+ */
+export declare function computeUsageUsd(model: string, usage: ChatUsage, config: Pick<ResolvedEvalConfig, "tokenPricing">): number;
+declare function ledgerPath(projectRoot: string, date: string): string;
+declare function readLedger(file: string, date: string): Promise<SpendLedger>;
+declare function writeLedger(file: string, ledger: SpendLedger): Promise<void>;
+/**
+ * Guard a single LLM call against the daily USD cap. Returns the updated
+ * ledger on success; throws `DailyCostCapExceededError` when the projected
+ * total would cross the cap. When `config.dailyUsdCap` is unset, the guard
+ * is a no-op — no file writes, no ledger — so non-judge runs never touch
+ * the filesystem.
+ */
+export interface CostGuard {
+    /**
+     * Commit the USD cost of a finished call to the ledger. When `dailyUsdCap`
+     * is set, refuses the commit if the projected total would exceed the cap.
+     */
+    commit(model: string, usage: ChatUsage): Promise<number>;
+    /** Snapshot the current ledger (or undefined when no cap is set). */
+    snapshot(): Promise<SpendLedger | undefined>;
+}
+export interface CreateCostGuardOptions {
+    /** Clock injection for tests. */
+    now?: () => Date;
+    /** Override the default filesystem root for the ledger. */
+    ledgerPath?: string;
+}
+export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
+/** Exposed for tests. */
+export declare const __internal: {
+    utcDate: typeof utcDate;
+    pricingFor: typeof pricingFor;
+    ledgerPath: typeof ledgerPath;
+    readLedger: typeof readLedger;
+    writeLedger: typeof writeLedger;
+};
+export {};
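Putting the guard together with the Tier A agent, a hedged end-to-end sketch of how a runner might commit each call's spend (import paths are guesses; the pricing numbers are made up; `usage` stands in for a finished call's `ChatUsage`, e.g. `SingleShotOutput.usage`):

```ts
import { createCostGuard, DailyCostCapExceededError } from "cclaw-cli/dist/evals/cost-guard.js"; // path is a guess
import type { ChatUsage } from "cclaw-cli/dist/evals/llm-client.js";                             // path is a guess

declare const usage: ChatUsage; // e.g. SingleShotOutput.usage from a finished run

const guard = createCostGuard(process.cwd(), {
  dailyUsdCap: 5, // mirrors the commented-out "# dailyUsdCap: 5" config example
  tokenPricing: { "glm-5.1": { input: 0.0006, output: 0.0022 } } // made-up USD-per-1K rates
});

try {
  const usd = await guard.commit("glm-5.1", usage); // a USD figure, per the declaration above
  console.log(`committed $${usd.toFixed(4)}`);
} catch (err) {
  if (err instanceof DailyCostCapExceededError) {
    console.error(`daily cap hit: $${err.projectedUsd} projected > $${err.capUsd} cap`);
  } else {
    throw err; // non-cap failures propagate unchanged
  }
}
```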