cclaw-cli 0.49.0 → 0.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +57 -84
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +55 -17
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +44 -21
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +34 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/language-policy.d.ts +2 -0
  28. package/dist/content/language-policy.js +13 -0
  29. package/dist/content/learnings.d.ts +3 -4
  30. package/dist/content/learnings.js +26 -50
  31. package/dist/content/meta-skill.js +33 -22
  32. package/dist/content/next-command.js +41 -38
  33. package/dist/content/node-hooks.js +17 -345
  34. package/dist/content/opencode-plugin.js +5 -103
  35. package/dist/content/research-playbooks.js +14 -14
  36. package/dist/content/review-loop.d.ts +2 -0
  37. package/dist/content/review-loop.js +8 -0
  38. package/dist/content/session-hooks.js +15 -47
  39. package/dist/content/skills.d.ts +0 -5
  40. package/dist/content/skills.js +55 -128
  41. package/dist/content/stage-common-guidance.d.ts +0 -1
  42. package/dist/content/stage-common-guidance.js +17 -14
  43. package/dist/content/stage-schema.d.ts +26 -1
  44. package/dist/content/stage-schema.js +121 -40
  45. package/dist/content/stages/_lint-metadata/index.js +9 -15
  46. package/dist/content/stages/brainstorm.js +22 -43
  47. package/dist/content/stages/design.js +37 -57
  48. package/dist/content/stages/plan.js +22 -13
  49. package/dist/content/stages/review.js +24 -27
  50. package/dist/content/stages/scope.js +34 -46
  51. package/dist/content/stages/ship.js +7 -4
  52. package/dist/content/stages/spec.js +20 -9
  53. package/dist/content/stages/tdd.js +64 -44
  54. package/dist/content/start-command.js +13 -12
  55. package/dist/content/status-command.d.ts +2 -7
  56. package/dist/content/status-command.js +19 -146
  57. package/dist/content/subagents.d.ts +0 -5
  58. package/dist/content/subagents.js +51 -28
  59. package/dist/content/templates.d.ts +1 -1
  60. package/dist/content/templates.js +126 -135
  61. package/dist/content/track-render-context.d.ts +17 -0
  62. package/dist/content/track-render-context.js +44 -0
  63. package/dist/content/tree-command.d.ts +1 -2
  64. package/dist/content/tree-command.js +4 -87
  65. package/dist/content/utility-skills.d.ts +2 -29
  66. package/dist/content/utility-skills.js +2 -1534
  67. package/dist/content/view-command.js +31 -11
  68. package/dist/delegation.d.ts +1 -1
  69. package/dist/delegation.js +5 -15
  70. package/dist/doctor-registry.js +20 -21
  71. package/dist/doctor.js +88 -344
  72. package/dist/flow-state.d.ts +3 -0
  73. package/dist/flow-state.js +2 -0
  74. package/dist/harness-adapters.d.ts +1 -1
  75. package/dist/harness-adapters.js +51 -58
  76. package/dist/install.js +128 -358
  77. package/dist/internal/advance-stage.js +3 -9
  78. package/dist/internal/compound-readiness.d.ts +1 -1
  79. package/dist/internal/compound-readiness.js +1 -1
  80. package/dist/internal/tdd-loop-status.d.ts +1 -1
  81. package/dist/internal/tdd-loop-status.js +1 -1
  82. package/dist/knowledge-store.d.ts +16 -10
  83. package/dist/knowledge-store.js +51 -15
  84. package/dist/policy.js +16 -105
  85. package/dist/run-archive.d.ts +4 -6
  86. package/dist/run-archive.js +15 -20
  87. package/dist/run-persistence.d.ts +2 -2
  88. package/dist/run-persistence.js +3 -9
  89. package/package.json +1 -2
  90. package/dist/content/archive-command.d.ts +0 -2
  91. package/dist/content/archive-command.js +0 -124
  92. package/dist/content/compound-command.d.ts +0 -5
  93. package/dist/content/compound-command.js +0 -193
  94. package/dist/content/contexts.d.ts +0 -18
  95. package/dist/content/contexts.js +0 -24
  96. package/dist/content/contracts.d.ts +0 -2
  97. package/dist/content/contracts.js +0 -51
  98. package/dist/content/doctor-references.d.ts +0 -2
  99. package/dist/content/doctor-references.js +0 -150
  100. package/dist/content/eval-scaffold.d.ts +0 -15
  101. package/dist/content/eval-scaffold.js +0 -370
  102. package/dist/content/feature-command.d.ts +0 -2
  103. package/dist/content/feature-command.js +0 -123
  104. package/dist/content/flow-map.d.ts +0 -23
  105. package/dist/content/flow-map.js +0 -134
  106. package/dist/content/harness-doc.d.ts +0 -2
  107. package/dist/content/harness-doc.js +0 -202
  108. package/dist/content/harness-playbooks.d.ts +0 -24
  109. package/dist/content/harness-playbooks.js +0 -393
  110. package/dist/content/harness-tool-refs.d.ts +0 -20
  111. package/dist/content/harness-tool-refs.js +0 -268
  112. package/dist/content/ops-command.d.ts +0 -2
  113. package/dist/content/ops-command.js +0 -71
  114. package/dist/content/protocols.d.ts +0 -7
  115. package/dist/content/protocols.js +0 -215
  116. package/dist/content/retro-command.d.ts +0 -2
  117. package/dist/content/retro-command.js +0 -165
  118. package/dist/content/rewind-command.d.ts +0 -2
  119. package/dist/content/rewind-command.js +0 -106
  120. package/dist/content/tdd-log-command.d.ts +0 -2
  121. package/dist/content/tdd-log-command.js +0 -85
  122. package/dist/eval/agents/single-shot.d.ts +0 -27
  123. package/dist/eval/agents/single-shot.js +0 -79
  124. package/dist/eval/agents/with-tools.d.ts +0 -44
  125. package/dist/eval/agents/with-tools.js +0 -261
  126. package/dist/eval/agents/workflow.d.ts +0 -31
  127. package/dist/eval/agents/workflow.js +0 -155
  128. package/dist/eval/baseline.d.ts +0 -38
  129. package/dist/eval/baseline.js +0 -282
  130. package/dist/eval/config-loader.d.ts +0 -14
  131. package/dist/eval/config-loader.js +0 -395
  132. package/dist/eval/corpus.d.ts +0 -30
  133. package/dist/eval/corpus.js +0 -330
  134. package/dist/eval/cost-guard.d.ts +0 -102
  135. package/dist/eval/cost-guard.js +0 -190
  136. package/dist/eval/diff.d.ts +0 -64
  137. package/dist/eval/diff.js +0 -323
  138. package/dist/eval/llm-client.d.ts +0 -176
  139. package/dist/eval/llm-client.js +0 -267
  140. package/dist/eval/mode.d.ts +0 -28
  141. package/dist/eval/mode.js +0 -61
  142. package/dist/eval/progress.d.ts +0 -83
  143. package/dist/eval/progress.js +0 -59
  144. package/dist/eval/report.d.ts +0 -11
  145. package/dist/eval/report.js +0 -181
  146. package/dist/eval/rubric-loader.d.ts +0 -20
  147. package/dist/eval/rubric-loader.js +0 -143
  148. package/dist/eval/runner.d.ts +0 -81
  149. package/dist/eval/runner.js +0 -746
  150. package/dist/eval/runs.d.ts +0 -41
  151. package/dist/eval/runs.js +0 -114
  152. package/dist/eval/sandbox.d.ts +0 -38
  153. package/dist/eval/sandbox.js +0 -137
  154. package/dist/eval/tools/glob.d.ts +0 -2
  155. package/dist/eval/tools/glob.js +0 -163
  156. package/dist/eval/tools/grep.d.ts +0 -2
  157. package/dist/eval/tools/grep.js +0 -152
  158. package/dist/eval/tools/index.d.ts +0 -7
  159. package/dist/eval/tools/index.js +0 -35
  160. package/dist/eval/tools/read.d.ts +0 -2
  161. package/dist/eval/tools/read.js +0 -122
  162. package/dist/eval/tools/types.d.ts +0 -49
  163. package/dist/eval/tools/types.js +0 -41
  164. package/dist/eval/tools/write.d.ts +0 -2
  165. package/dist/eval/tools/write.js +0 -92
  166. package/dist/eval/types.d.ts +0 -561
  167. package/dist/eval/types.js +0 -47
  168. package/dist/eval/verifiers/judge.d.ts +0 -40
  169. package/dist/eval/verifiers/judge.js +0 -256
  170. package/dist/eval/verifiers/rules.d.ts +0 -24
  171. package/dist/eval/verifiers/rules.js +0 -218
  172. package/dist/eval/verifiers/structural.d.ts +0 -14
  173. package/dist/eval/verifiers/structural.js +0 -171
  174. package/dist/eval/verifiers/traceability.d.ts +0 -23
  175. package/dist/eval/verifiers/traceability.js +0 -84
  176. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  177. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  178. package/dist/eval/workflow-corpus.d.ts +0 -7
  179. package/dist/eval/workflow-corpus.js +0 -207
  180. package/dist/feature-system.d.ts +0 -42
  181. package/dist/feature-system.js +0 -432
  182. package/dist/internal/knowledge-digest.d.ts +0 -7
  183. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,282 +0,0 @@
1
- /**
2
- * Baseline I/O + regression comparison for the eval subsystem.
3
- *
4
- * Layout on disk (committed):
5
- *
6
- * .cclaw/evals/baselines/<stage>.json
7
- *
8
- * Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
9
- * regressions by comparing per-verifier `ok` flags across runs: any verifier
10
- * that was `ok:true` in the baseline and is `ok:false` now counts as a
11
- * critical failure. A case whose aggregate `passed` flipped from true to
12
- * false is flagged as `case-now-failing` regardless of per-verifier churn.
13
- *
14
- * Writes are gated behind an explicit `--update-baseline --confirm` pair at
15
- * the CLI layer so accidental resets do not slip into PRs.
16
- */
17
- import { createHash } from "node:crypto";
18
- import fs from "node:fs/promises";
19
- import path from "node:path";
20
- import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
21
- import { exists } from "../fs-utils.js";
22
- import { FLOW_STAGES } from "../types.js";
23
- export const BASELINE_SCHEMA_VERSION = 1;
24
- /**
25
- * Thrown when a signed baseline's on-disk digest does not match the
26
- * canonical encoding of its `{ schemaVersion, stage, cases }` block.
27
- * Callers should treat this as a hard failure: the baseline was either
28
- * hand-edited or corrupted and cannot be trusted for regression gating.
29
- */
30
- export class BaselineSignatureError extends Error {
31
- file;
32
- expected;
33
- actual;
34
- constructor(opts) {
35
- super(`Baseline signature mismatch at ${opts.file}: expected ${opts.expected}, got ${opts.actual}. ` +
36
- `The file was modified outside of \`cclaw eval --update-baseline\`. ` +
37
- `Re-run with --update-baseline --confirm to re-sign a known-good snapshot.`);
38
- this.name = "BaselineSignatureError";
39
- this.file = opts.file;
40
- this.expected = opts.expected;
41
- this.actual = opts.actual;
42
- }
43
- }
44
- function baselinePath(projectRoot, stage) {
45
- return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
46
- }
47
- /**
48
- * Produce a deterministic sha256 digest over the signable portion of a
49
- * baseline. We intentionally exclude `generatedAt` and `cclawVersion`
50
- * from the digest so that rebuilding the same baseline from identical
51
- * case results on a new CLI version doesn't invalidate the signature —
52
- * only changes to the observed pass/ok/score payloads do.
53
- */
54
- export function computeBaselineDigest(snapshot) {
55
- const canonical = canonicalJson({
56
- schemaVersion: snapshot.schemaVersion,
57
- stage: snapshot.stage,
58
- cases: snapshot.cases
59
- });
60
- return createHash("sha256").update(canonical).digest("hex");
61
- }
62
- /**
63
- * JSON.stringify with object keys sorted recursively so the digest is
64
- * stable across filesystem / serializer variations.
65
- */
66
- function canonicalJson(value) {
67
- if (value === null || typeof value !== "object") {
68
- return JSON.stringify(value);
69
- }
70
- if (Array.isArray(value)) {
71
- return `[${value.map((v) => canonicalJson(v)).join(",")}]`;
72
- }
73
- const record = value;
74
- const keys = Object.keys(record).sort();
75
- const parts = keys.map((k) => `${JSON.stringify(k)}:${canonicalJson(record[k])}`);
76
- return `{${parts.join(",")}}`;
77
- }
78
- export async function loadBaseline(projectRoot, stage) {
79
- const filePath = baselinePath(projectRoot, stage);
80
- if (!(await exists(filePath)))
81
- return null;
82
- const raw = await fs.readFile(filePath, "utf8");
83
- let parsed;
84
- try {
85
- parsed = JSON.parse(raw);
86
- }
87
- catch (err) {
88
- throw new Error(`Invalid baseline at ${filePath}: ${err instanceof Error ? err.message : String(err)}`);
89
- }
90
- if (!isBaseline(parsed, stage)) {
91
- throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
92
- }
93
- const signature = parsed.signature;
94
- if (signature) {
95
- if (signature.algorithm !== "sha256") {
96
- throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
97
- }
98
- const actual = computeBaselineDigest(parsed);
99
- if (actual !== signature.digest) {
100
- throw new BaselineSignatureError({
101
- file: filePath,
102
- expected: signature.digest,
103
- actual
104
- });
105
- }
106
- }
107
- return parsed;
108
- }
109
- function isBaseline(value, stage) {
110
- if (!value || typeof value !== "object")
111
- return false;
112
- const candidate = value;
113
- if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION)
114
- return false;
115
- if (candidate.stage !== stage)
116
- return false;
117
- if (typeof candidate.generatedAt !== "string")
118
- return false;
119
- if (typeof candidate.cclawVersion !== "string")
120
- return false;
121
- if (!candidate.cases || typeof candidate.cases !== "object")
122
- return false;
123
- return true;
124
- }
125
- export async function loadBaselinesByStage(projectRoot, stages) {
126
- const out = new Map();
127
- for (const stage of stages) {
128
- const snapshot = await loadBaseline(projectRoot, stage);
129
- if (snapshot)
130
- out.set(stage, snapshot);
131
- }
132
- return out;
133
- }
134
- function entryFromResult(result) {
135
- const verifierResults = result.verifierResults.map((v) => ({
136
- id: v.id,
137
- kind: v.kind,
138
- ok: v.ok,
139
- ...(v.score !== undefined ? { score: v.score } : {})
140
- }));
141
- return { passed: result.passed, verifierResults };
142
- }
143
- export function buildBaselineForStage(stage, report) {
144
- const stageCases = report.cases.filter((c) => c.stage === stage);
145
- const cases = {};
146
- for (const c of stageCases) {
147
- cases[c.caseId] = entryFromResult(c);
148
- }
149
- const now = new Date().toISOString();
150
- const unsigned = {
151
- schemaVersion: BASELINE_SCHEMA_VERSION,
152
- stage,
153
- generatedAt: now,
154
- cclawVersion: CCLAW_VERSION,
155
- cases
156
- };
157
- unsigned.signature = {
158
- algorithm: "sha256",
159
- digest: computeBaselineDigest(unsigned),
160
- signedAt: now
161
- };
162
- return unsigned;
163
- }
164
- export async function writeBaselinesFromReport(projectRoot, report) {
165
- const written = [];
166
- const stages = new Set(report.cases.map((c) => c.stage));
167
- for (const stage of stages) {
168
- const snapshot = buildBaselineForStage(stage, report);
169
- const file = baselinePath(projectRoot, stage);
170
- await fs.mkdir(path.dirname(file), { recursive: true });
171
- await fs.writeFile(file, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
172
- written.push(file);
173
- }
174
- return written.sort();
175
- }
176
- function verifierMap(entries) {
177
- const out = new Map();
178
- for (const entry of entries) {
179
- out.set(entry.id, entry);
180
- }
181
- return out;
182
- }
183
- function computePassRate(cases) {
184
- if (cases.length === 0)
185
- return 1;
186
- const passed = cases.filter((c) => c.passed).length;
187
- return passed / cases.length;
188
- }
189
- function baselinePassRate(snapshot) {
190
- const entries = Object.values(snapshot.cases);
191
- if (entries.length === 0)
192
- return 1;
193
- const passed = entries.filter((e) => e.passed).length;
194
- return passed / entries.length;
195
- }
196
- /**
197
- * Compare a freshly computed report against loaded baselines. If no baseline
198
- * exists for a stage covered by the report, that stage contributes zero
199
- * regressions (first run of that stage). Current is the source of truth.
200
- */
201
- export function compareAgainstBaselines(report, baselines) {
202
- if (baselines.size === 0)
203
- return undefined;
204
- const regressions = [];
205
- const caseResultsByStage = new Map();
206
- for (const c of report.cases) {
207
- const bucket = caseResultsByStage.get(c.stage) ?? [];
208
- bucket.push(c);
209
- caseResultsByStage.set(c.stage, bucket);
210
- }
211
- let baselineTotalPassRate = 0;
212
- let baselineStagesCounted = 0;
213
- for (const [stage, snapshot] of baselines) {
214
- const current = caseResultsByStage.get(stage) ?? [];
215
- baselineTotalPassRate += baselinePassRate(snapshot);
216
- baselineStagesCounted += 1;
217
- for (const caseResult of current) {
218
- const baselineEntry = snapshot.cases[caseResult.caseId];
219
- if (!baselineEntry)
220
- continue;
221
- if (baselineEntry.passed && !caseResult.passed) {
222
- regressions.push({
223
- caseId: caseResult.caseId,
224
- stage,
225
- verifierId: "<case>",
226
- reason: "case-now-failing",
227
- previousScore: 1,
228
- currentScore: 0
229
- });
230
- }
231
- const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
232
- for (const currentVerifier of caseResult.verifierResults) {
233
- const prev = baselineVerifiers.get(currentVerifier.id);
234
- if (!prev)
235
- continue;
236
- if (prev.ok && !currentVerifier.ok) {
237
- regressions.push({
238
- caseId: caseResult.caseId,
239
- stage,
240
- verifierId: currentVerifier.id,
241
- reason: "newly-failing",
242
- previousScore: prev.score ?? 1,
243
- currentScore: currentVerifier.score ?? 0
244
- });
245
- }
246
- else if (prev.score !== undefined &&
247
- currentVerifier.score !== undefined &&
248
- currentVerifier.score < prev.score) {
249
- regressions.push({
250
- caseId: caseResult.caseId,
251
- stage,
252
- verifierId: currentVerifier.id,
253
- reason: "score-drop",
254
- previousScore: prev.score,
255
- currentScore: currentVerifier.score
256
- });
257
- }
258
- }
259
- }
260
- }
261
- const currentPassRate = computePassRate(report.cases);
262
- const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
263
- const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
264
- const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
265
- const baselineStages = [...baselines.keys()].sort().join(",");
266
- return {
267
- baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
268
- scoreDelta,
269
- criticalFailures,
270
- regressions
271
- };
272
- }
273
- export function listBaselineStages(projectRoot) {
274
- const root = path.join(projectRoot, EVALS_ROOT, "baselines");
275
- return fs
276
- .readdir(root, { withFileTypes: true })
277
- .then((entries) => entries
278
- .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
279
- .map((entry) => entry.name.replace(/\.json$/, ""))
280
- .filter((name) => FLOW_STAGES.includes(name)))
281
- .catch(() => []);
282
- }
@@ -1,14 +0,0 @@
1
- import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
2
- /**
3
- * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
4
- * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
5
- * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
6
- * variables (env wins last).
7
- */
8
- export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
9
- /**
10
- * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
11
- * Returns a fully-populated config plus a provenance marker so `--dry-run` can
12
- * surface where each setting came from.
13
- */
14
- export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
@@ -1,395 +0,0 @@
1
- import fs from "node:fs/promises";
2
- import path from "node:path";
3
- import { parse } from "yaml";
4
- import { EVALS_CONFIG_PATH } from "../constants.js";
5
- import { exists } from "../fs-utils.js";
6
- import { EVAL_MODES } from "./types.js";
7
- import { parseModeInput } from "./mode.js";
8
- /**
9
- * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
10
- * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
11
- * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
12
- * variables (env wins last).
13
- */
14
- export const DEFAULT_EVAL_CONFIG = {
15
- provider: "zai",
16
- baseUrl: "https://api.z.ai/api/coding/paas/v4",
17
- model: "glm-5.1",
18
- defaultMode: "fixture",
19
- regression: {
20
- failIfDeltaBelow: -0.15,
21
- failIfCriticalBelow: 3.0
22
- },
23
- timeoutMs: 120_000,
24
- maxRetries: 2,
25
- judgeSamples: 3,
26
- judgeTemperature: 0,
27
- agentTemperature: 0.2
28
- };
29
- const NUMERIC_ENVS = new Set([
30
- "CCLAW_EVAL_DAILY_USD_CAP",
31
- "CCLAW_EVAL_TIMEOUT_MS",
32
- "CCLAW_EVAL_MAX_RETRIES",
33
- "CCLAW_EVAL_JUDGE_SAMPLES",
34
- "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
- "CCLAW_EVAL_AGENT_TEMPERATURE",
36
- "CCLAW_EVAL_TOOL_MAX_TURNS",
37
- "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
- "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
39
- "CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
40
- ]);
41
- function evalConfigError(configFilePath, reason) {
42
- return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
43
- `Supported modes: ${EVAL_MODES.join(", ")} (legacy tier values A|B|C also accepted).\n` +
44
- `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`);
45
- }
46
- function isRecord(value) {
47
- return typeof value === "object" && value !== null && !Array.isArray(value);
48
- }
49
- function parseNumericEnv(name, raw) {
50
- const value = Number(raw);
51
- if (!Number.isFinite(value)) {
52
- throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
53
- }
54
- return value;
55
- }
56
- function parseModeEnv(raw, envName) {
57
- return parseModeInput(envName === "CCLAW_EVAL_TIER" ? raw.toUpperCase() : raw, {
58
- source: "env",
59
- raw: `${envName}=${raw}`
60
- });
61
- }
62
- function validateFileConfig(raw, configFilePath) {
63
- if (raw === undefined || raw === null)
64
- return {};
65
- if (!isRecord(raw)) {
66
- throw evalConfigError(configFilePath, "top-level value must be a mapping");
67
- }
68
- const out = {};
69
- const assignString = (key, value) => {
70
- if (value === undefined)
71
- return;
72
- if (typeof value !== "string" || value.trim().length === 0) {
73
- throw evalConfigError(configFilePath, `"${String(key)}" must be a non-empty string`);
74
- }
75
- out[key] = value.trim();
76
- };
77
- assignString("provider", raw.provider);
78
- assignString("baseUrl", raw.baseUrl);
79
- assignString("model", raw.model);
80
- assignString("judgeModel", raw.judgeModel);
81
- if (raw.defaultMode !== undefined) {
82
- if (typeof raw.defaultMode !== "string") {
83
- throw evalConfigError(configFilePath, `"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
84
- }
85
- try {
86
- out.defaultMode = parseModeInput(raw.defaultMode, {
87
- source: "config",
88
- raw: `defaultMode: ${raw.defaultMode}`
89
- });
90
- }
91
- catch (err) {
92
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
93
- }
94
- }
95
- else if (raw.defaultTier !== undefined) {
96
- if (typeof raw.defaultTier !== "string") {
97
- throw evalConfigError(configFilePath, `"defaultTier" must be a string (legacy; prefer "defaultMode")`);
98
- }
99
- try {
100
- out.defaultMode = parseModeInput(raw.defaultTier, {
101
- source: "config",
102
- raw: `defaultTier: ${raw.defaultTier}`
103
- });
104
- }
105
- catch (err) {
106
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
107
- }
108
- }
109
- if (raw.dailyUsdCap !== undefined) {
110
- if (typeof raw.dailyUsdCap !== "number" || raw.dailyUsdCap < 0) {
111
- throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
112
- }
113
- out.dailyUsdCap = raw.dailyUsdCap;
114
- }
115
- if (raw.timeoutMs !== undefined) {
116
- if (typeof raw.timeoutMs !== "number" || raw.timeoutMs <= 0) {
117
- throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
118
- }
119
- out.timeoutMs = raw.timeoutMs;
120
- }
121
- if (raw.maxRetries !== undefined) {
122
- if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
123
- throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
124
- }
125
- out.maxRetries = raw.maxRetries;
126
- }
127
- if (raw.judgeSamples !== undefined) {
128
- const value = raw.judgeSamples;
129
- if (!Number.isInteger(value) || value < 1) {
130
- throw evalConfigError(configFilePath, `"judgeSamples" must be a positive integer`);
131
- }
132
- if (value % 2 === 0) {
133
- throw evalConfigError(configFilePath, `"judgeSamples" must be odd (so median-of-N is a true integer)`);
134
- }
135
- out.judgeSamples = value;
136
- }
137
- if (raw.judgeTemperature !== undefined) {
138
- if (typeof raw.judgeTemperature !== "number" || !Number.isFinite(raw.judgeTemperature)) {
139
- throw evalConfigError(configFilePath, `"judgeTemperature" must be a finite number`);
140
- }
141
- if (raw.judgeTemperature < 0 || raw.judgeTemperature > 2) {
142
- throw evalConfigError(configFilePath, `"judgeTemperature" must be within [0, 2]`);
143
- }
144
- out.judgeTemperature = raw.judgeTemperature;
145
- }
146
- if (raw.agentTemperature !== undefined) {
147
- if (typeof raw.agentTemperature !== "number" || !Number.isFinite(raw.agentTemperature)) {
148
- throw evalConfigError(configFilePath, `"agentTemperature" must be a finite number`);
149
- }
150
- if (raw.agentTemperature < 0 || raw.agentTemperature > 2) {
151
- throw evalConfigError(configFilePath, `"agentTemperature" must be within [0, 2]`);
152
- }
153
- out.agentTemperature = raw.agentTemperature;
154
- }
155
- if (raw.tokenPricing !== undefined) {
156
- if (!isRecord(raw.tokenPricing)) {
157
- throw evalConfigError(configFilePath, `"tokenPricing" must be a mapping`);
158
- }
159
- const pricing = {};
160
- for (const [model, value] of Object.entries(raw.tokenPricing)) {
161
- if (!isRecord(value)) {
162
- throw evalConfigError(configFilePath, `"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
163
- }
164
- const input = value.input;
165
- const output = value.output;
166
- if (typeof input !== "number" || input < 0) {
167
- throw evalConfigError(configFilePath, `"tokenPricing.${model}.input" must be a non-negative number`);
168
- }
169
- if (typeof output !== "number" || output < 0) {
170
- throw evalConfigError(configFilePath, `"tokenPricing.${model}.output" must be a non-negative number`);
171
- }
172
- const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
173
- if (extraneous.length > 0) {
174
- throw evalConfigError(configFilePath, `"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
175
- }
176
- pricing[model] = { input, output };
177
- }
178
- out.tokenPricing = pricing;
179
- }
180
- const assignPositiveInt = (key, value, label) => {
181
- if (value === undefined)
182
- return;
183
- if (!Number.isInteger(value) || value < 1) {
184
- throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
185
- }
186
- out[key] = value;
187
- };
188
- assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
189
- assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
190
- assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
191
- assignPositiveInt("workflowMaxTotalTurns", raw.workflowMaxTotalTurns, "workflowMaxTotalTurns");
192
- if (raw.regression !== undefined) {
193
- if (!isRecord(raw.regression)) {
194
- throw evalConfigError(configFilePath, `"regression" must be a mapping`);
195
- }
196
- const failIfDeltaBelow = raw.regression.failIfDeltaBelow;
197
- const failIfCriticalBelow = raw.regression.failIfCriticalBelow;
198
- if (failIfDeltaBelow !== undefined && typeof failIfDeltaBelow !== "number") {
199
- throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
200
- }
201
- if (failIfCriticalBelow !== undefined && typeof failIfCriticalBelow !== "number") {
202
- throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
203
- }
204
- out.regression = {
205
- failIfDeltaBelow: typeof failIfDeltaBelow === "number"
206
- ? failIfDeltaBelow
207
- : DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
208
- failIfCriticalBelow: typeof failIfCriticalBelow === "number"
209
- ? failIfCriticalBelow
210
- : DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
211
- };
212
- }
213
- const knownKeys = new Set([
214
- "provider",
215
- "baseUrl",
216
- "model",
217
- "judgeModel",
218
- "defaultMode",
219
- "defaultTier",
220
- "dailyUsdCap",
221
- "timeoutMs",
222
- "maxRetries",
223
- "regression",
224
- "judgeSamples",
225
- "judgeTemperature",
226
- "agentTemperature",
227
- "tokenPricing",
228
- "toolMaxTurns",
229
- "toolMaxArgumentsBytes",
230
- "toolMaxResultBytes",
231
- "workflowMaxTotalTurns"
232
- ]);
233
- const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
234
- if (unknown.length > 0) {
235
- throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
236
- }
237
- return out;
238
- }
239
- async function readFileConfig(projectRoot) {
240
- const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
241
- if (!(await exists(configFilePath))) {
242
- return { patch: {}, source: "default" };
243
- }
244
- let parsed;
245
- try {
246
- parsed = parse(await fs.readFile(configFilePath, "utf8"));
247
- }
248
- catch (err) {
249
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
250
- }
251
- const patch = validateFileConfig(parsed, configFilePath);
252
- return { patch, source: "file" };
253
- }
254
- function applyEnvOverrides(base, env) {
255
- let overridden = false;
256
- const patched = {
257
- ...base,
258
- regression: { ...base.regression }
259
- };
260
- for (const name of Object.keys(env)) {
261
- if (!name.startsWith("CCLAW_EVAL_"))
262
- continue;
263
- if (NUMERIC_ENVS.has(name) && typeof env[name] === "string") {
264
- // validated below when applied
265
- }
266
- }
267
- const read = (name) => {
268
- const value = env[name];
269
- return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
270
- };
271
- const baseUrl = read("CCLAW_EVAL_BASE_URL");
272
- if (baseUrl) {
273
- patched.baseUrl = baseUrl;
274
- overridden = true;
275
- }
276
- const model = read("CCLAW_EVAL_MODEL");
277
- if (model) {
278
- patched.model = model;
279
- overridden = true;
280
- }
281
- const judgeModel = read("CCLAW_EVAL_JUDGE_MODEL");
282
- if (judgeModel) {
283
- patched.judgeModel = judgeModel;
284
- overridden = true;
285
- }
286
- const provider = read("CCLAW_EVAL_PROVIDER");
287
- if (provider) {
288
- patched.provider = provider;
289
- overridden = true;
290
- }
291
- const modeEnv = read("CCLAW_EVAL_MODE");
292
- if (modeEnv) {
293
- patched.defaultMode = parseModeEnv(modeEnv, "CCLAW_EVAL_MODE");
294
- overridden = true;
295
- }
296
- else {
297
- const legacyTier = read("CCLAW_EVAL_TIER");
298
- if (legacyTier) {
299
- patched.defaultMode = parseModeEnv(legacyTier, "CCLAW_EVAL_TIER");
300
- overridden = true;
301
- }
302
- }
303
- const cap = read("CCLAW_EVAL_DAILY_USD_CAP");
304
- if (cap) {
305
- patched.dailyUsdCap = parseNumericEnv("CCLAW_EVAL_DAILY_USD_CAP", cap);
306
- overridden = true;
307
- }
308
- const timeout = read("CCLAW_EVAL_TIMEOUT_MS");
309
- if (timeout) {
310
- patched.timeoutMs = parseNumericEnv("CCLAW_EVAL_TIMEOUT_MS", timeout);
311
- overridden = true;
312
- }
313
- const retries = read("CCLAW_EVAL_MAX_RETRIES");
314
- if (retries) {
315
- patched.maxRetries = parseNumericEnv("CCLAW_EVAL_MAX_RETRIES", retries);
316
- overridden = true;
317
- }
318
- const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
319
- if (judgeSamples) {
320
- const value = parseNumericEnv("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
321
- if (!Number.isInteger(value) || value < 1) {
322
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be a positive integer, got: ${judgeSamples}`);
323
- }
324
- if (value % 2 === 0) {
325
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
326
- }
327
- patched.judgeSamples = value;
328
- overridden = true;
329
- }
330
- const judgeTemp = read("CCLAW_EVAL_JUDGE_TEMPERATURE");
331
- if (judgeTemp) {
332
- const value = parseNumericEnv("CCLAW_EVAL_JUDGE_TEMPERATURE", judgeTemp);
333
- if (value < 0 || value > 2) {
334
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_TEMPERATURE must be within [0, 2], got: ${judgeTemp}`);
335
- }
336
- patched.judgeTemperature = value;
337
- overridden = true;
338
- }
339
- const agentTemp = read("CCLAW_EVAL_AGENT_TEMPERATURE");
340
- if (agentTemp) {
341
- const value = parseNumericEnv("CCLAW_EVAL_AGENT_TEMPERATURE", agentTemp);
342
- if (value < 0 || value > 2) {
343
- throw new Error(`Environment variable CCLAW_EVAL_AGENT_TEMPERATURE must be within [0, 2], got: ${agentTemp}`);
344
- }
345
- patched.agentTemperature = value;
346
- overridden = true;
347
- }
348
- const readPositiveInt = (name, key, label) => {
349
- const raw = read(name);
350
- if (!raw)
351
- return;
352
- const value = parseNumericEnv(name, raw);
353
- if (!Number.isInteger(value) || value < 1) {
354
- throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
355
- }
356
- patched[key] = value;
357
- overridden = true;
358
- void label;
359
- };
360
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
361
- readPositiveInt("CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns", "workflowMaxTotalTurns");
362
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
363
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
364
- const apiKey = read("CCLAW_EVAL_API_KEY");
365
- return { patched, overridden, apiKey };
366
- }
367
- /**
368
- * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
369
- * Returns a fully-populated config plus a provenance marker so `--dry-run` can
370
- * surface where each setting came from.
371
- */
372
- export async function loadEvalConfig(projectRoot, env = process.env) {
373
- const { patch, source: fileSource } = await readFileConfig(projectRoot);
374
- const merged = {
375
- ...DEFAULT_EVAL_CONFIG,
376
- ...patch,
377
- regression: {
378
- ...DEFAULT_EVAL_CONFIG.regression,
379
- ...(patch.regression ?? {})
380
- }
381
- };
382
- const { patched, overridden, apiKey } = applyEnvOverrides(merged, env);
383
- let source = "default";
384
- if (fileSource === "file" && overridden)
385
- source = "file+env";
386
- else if (fileSource === "file")
387
- source = "file";
388
- else if (overridden)
389
- source = "env";
390
- return {
391
- ...patched,
392
- apiKey,
393
- source
394
- };
395
- }