cclaw-cli 0.48.35 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/README.md +54 -82
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -495
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -46
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +51 -9
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +2 -0
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +31 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/learnings.d.ts +3 -4
  28. package/dist/content/learnings.js +24 -50
  29. package/dist/content/meta-skill.js +31 -24
  30. package/dist/content/next-command.js +38 -38
  31. package/dist/content/node-hooks.js +17 -343
  32. package/dist/content/opencode-plugin.js +2 -100
  33. package/dist/content/research-playbooks.js +14 -14
  34. package/dist/content/review-loop.d.ts +2 -0
  35. package/dist/content/review-loop.js +8 -0
  36. package/dist/content/session-hooks.js +14 -46
  37. package/dist/content/skills.d.ts +0 -5
  38. package/dist/content/skills.js +53 -128
  39. package/dist/content/stage-common-guidance.d.ts +0 -1
  40. package/dist/content/stage-common-guidance.js +15 -14
  41. package/dist/content/stage-schema.d.ts +26 -1
  42. package/dist/content/stage-schema.js +121 -40
  43. package/dist/content/stages/_lint-metadata/index.js +9 -15
  44. package/dist/content/stages/brainstorm.js +22 -43
  45. package/dist/content/stages/design.js +37 -57
  46. package/dist/content/stages/plan.js +22 -13
  47. package/dist/content/stages/review.js +24 -27
  48. package/dist/content/stages/scope.js +34 -46
  49. package/dist/content/stages/ship.js +7 -4
  50. package/dist/content/stages/spec.js +20 -9
  51. package/dist/content/stages/tdd.js +64 -44
  52. package/dist/content/start-command.js +10 -12
  53. package/dist/content/status-command.d.ts +2 -7
  54. package/dist/content/status-command.js +19 -146
  55. package/dist/content/subagents.d.ts +0 -5
  56. package/dist/content/subagents.js +47 -28
  57. package/dist/content/templates.d.ts +1 -1
  58. package/dist/content/templates.js +126 -135
  59. package/dist/content/track-render-context.d.ts +17 -0
  60. package/dist/content/track-render-context.js +44 -0
  61. package/dist/content/tree-command.d.ts +1 -2
  62. package/dist/content/tree-command.js +4 -87
  63. package/dist/content/utility-skills.d.ts +2 -29
  64. package/dist/content/utility-skills.js +2 -1533
  65. package/dist/content/view-command.js +29 -11
  66. package/dist/delegation.d.ts +1 -1
  67. package/dist/delegation.js +5 -15
  68. package/dist/doctor-registry.js +20 -21
  69. package/dist/doctor.js +88 -408
  70. package/dist/flow-state.d.ts +3 -0
  71. package/dist/flow-state.js +2 -0
  72. package/dist/harness-adapters.d.ts +1 -1
  73. package/dist/harness-adapters.js +48 -57
  74. package/dist/install.js +128 -520
  75. package/dist/internal/advance-stage.js +3 -9
  76. package/dist/internal/compound-readiness.d.ts +1 -1
  77. package/dist/internal/compound-readiness.js +1 -1
  78. package/dist/internal/tdd-loop-status.d.ts +1 -1
  79. package/dist/internal/tdd-loop-status.js +1 -1
  80. package/dist/knowledge-store.d.ts +16 -10
  81. package/dist/knowledge-store.js +51 -15
  82. package/dist/policy.js +16 -109
  83. package/dist/run-archive.d.ts +4 -6
  84. package/dist/run-archive.js +15 -20
  85. package/dist/run-persistence.d.ts +2 -2
  86. package/dist/run-persistence.js +3 -9
  87. package/package.json +1 -2
  88. package/dist/content/archive-command.d.ts +0 -2
  89. package/dist/content/archive-command.js +0 -124
  90. package/dist/content/compound-command.d.ts +0 -5
  91. package/dist/content/compound-command.js +0 -193
  92. package/dist/content/contexts.d.ts +0 -9
  93. package/dist/content/contexts.js +0 -65
  94. package/dist/content/contracts.d.ts +0 -2
  95. package/dist/content/contracts.js +0 -51
  96. package/dist/content/doctor-references.d.ts +0 -2
  97. package/dist/content/doctor-references.js +0 -150
  98. package/dist/content/eval-scaffold.d.ts +0 -15
  99. package/dist/content/eval-scaffold.js +0 -370
  100. package/dist/content/feature-command.d.ts +0 -2
  101. package/dist/content/feature-command.js +0 -123
  102. package/dist/content/flow-map.d.ts +0 -23
  103. package/dist/content/flow-map.js +0 -134
  104. package/dist/content/harness-doc.d.ts +0 -2
  105. package/dist/content/harness-doc.js +0 -202
  106. package/dist/content/harness-playbooks.d.ts +0 -24
  107. package/dist/content/harness-playbooks.js +0 -393
  108. package/dist/content/harness-tool-refs.d.ts +0 -20
  109. package/dist/content/harness-tool-refs.js +0 -268
  110. package/dist/content/ops-command.d.ts +0 -2
  111. package/dist/content/ops-command.js +0 -71
  112. package/dist/content/protocols.d.ts +0 -7
  113. package/dist/content/protocols.js +0 -215
  114. package/dist/content/retro-command.d.ts +0 -2
  115. package/dist/content/retro-command.js +0 -165
  116. package/dist/content/rewind-command.d.ts +0 -2
  117. package/dist/content/rewind-command.js +0 -106
  118. package/dist/content/tdd-log-command.d.ts +0 -2
  119. package/dist/content/tdd-log-command.js +0 -85
  120. package/dist/eval/agents/single-shot.d.ts +0 -27
  121. package/dist/eval/agents/single-shot.js +0 -79
  122. package/dist/eval/agents/with-tools.d.ts +0 -44
  123. package/dist/eval/agents/with-tools.js +0 -261
  124. package/dist/eval/agents/workflow.d.ts +0 -31
  125. package/dist/eval/agents/workflow.js +0 -155
  126. package/dist/eval/baseline.d.ts +0 -38
  127. package/dist/eval/baseline.js +0 -282
  128. package/dist/eval/config-loader.d.ts +0 -14
  129. package/dist/eval/config-loader.js +0 -395
  130. package/dist/eval/corpus.d.ts +0 -30
  131. package/dist/eval/corpus.js +0 -330
  132. package/dist/eval/cost-guard.d.ts +0 -102
  133. package/dist/eval/cost-guard.js +0 -190
  134. package/dist/eval/diff.d.ts +0 -64
  135. package/dist/eval/diff.js +0 -323
  136. package/dist/eval/llm-client.d.ts +0 -176
  137. package/dist/eval/llm-client.js +0 -267
  138. package/dist/eval/mode.d.ts +0 -28
  139. package/dist/eval/mode.js +0 -61
  140. package/dist/eval/progress.d.ts +0 -83
  141. package/dist/eval/progress.js +0 -59
  142. package/dist/eval/report.d.ts +0 -11
  143. package/dist/eval/report.js +0 -181
  144. package/dist/eval/rubric-loader.d.ts +0 -20
  145. package/dist/eval/rubric-loader.js +0 -143
  146. package/dist/eval/runner.d.ts +0 -81
  147. package/dist/eval/runner.js +0 -746
  148. package/dist/eval/runs.d.ts +0 -41
  149. package/dist/eval/runs.js +0 -114
  150. package/dist/eval/sandbox.d.ts +0 -38
  151. package/dist/eval/sandbox.js +0 -137
  152. package/dist/eval/tools/glob.d.ts +0 -2
  153. package/dist/eval/tools/glob.js +0 -163
  154. package/dist/eval/tools/grep.d.ts +0 -2
  155. package/dist/eval/tools/grep.js +0 -152
  156. package/dist/eval/tools/index.d.ts +0 -7
  157. package/dist/eval/tools/index.js +0 -35
  158. package/dist/eval/tools/read.d.ts +0 -2
  159. package/dist/eval/tools/read.js +0 -122
  160. package/dist/eval/tools/types.d.ts +0 -49
  161. package/dist/eval/tools/types.js +0 -41
  162. package/dist/eval/tools/write.d.ts +0 -2
  163. package/dist/eval/tools/write.js +0 -92
  164. package/dist/eval/types.d.ts +0 -561
  165. package/dist/eval/types.js +0 -47
  166. package/dist/eval/verifiers/judge.d.ts +0 -40
  167. package/dist/eval/verifiers/judge.js +0 -256
  168. package/dist/eval/verifiers/rules.d.ts +0 -24
  169. package/dist/eval/verifiers/rules.js +0 -218
  170. package/dist/eval/verifiers/structural.d.ts +0 -14
  171. package/dist/eval/verifiers/structural.js +0 -171
  172. package/dist/eval/verifiers/traceability.d.ts +0 -23
  173. package/dist/eval/verifiers/traceability.js +0 -84
  174. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  175. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  176. package/dist/eval/workflow-corpus.d.ts +0 -7
  177. package/dist/eval/workflow-corpus.js +0 -207
  178. package/dist/feature-system.d.ts +0 -42
  179. package/dist/feature-system.js +0 -432
  180. package/dist/internal/knowledge-digest.d.ts +0 -7
  181. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,282 +0,0 @@
1
- /**
2
- * Baseline I/O + regression comparison for the eval subsystem.
3
- *
4
- * Layout on disk (committed):
5
- *
6
- * .cclaw/evals/baselines/<stage>.json
7
- *
8
- * Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
9
- * regressions by comparing per-verifier `ok` flags across runs: any verifier
10
- * that was `ok:true` in the baseline and is `ok:false` now counts as a
11
- * critical failure. A case whose aggregate `passed` flipped from true to
12
- * false is flagged as `case-now-failing` regardless of per-verifier churn.
13
- *
14
- * Writes are gated behind an explicit `--update-baseline --confirm` pair at
15
- * the CLI layer so accidental resets do not slip into PRs.
16
- */
17
- import { createHash } from "node:crypto";
18
- import fs from "node:fs/promises";
19
- import path from "node:path";
20
- import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
21
- import { exists } from "../fs-utils.js";
22
- import { FLOW_STAGES } from "../types.js";
23
- export const BASELINE_SCHEMA_VERSION = 1;
24
- /**
25
- * Thrown when a signed baseline's on-disk digest does not match the
26
- * canonical encoding of its `{ schemaVersion, stage, cases }` block.
27
- * Callers should treat this as a hard failure: the baseline was either
28
- * hand-edited or corrupted and cannot be trusted for regression gating.
29
- */
30
- export class BaselineSignatureError extends Error {
31
- file;
32
- expected;
33
- actual;
34
- constructor(opts) {
35
- super(`Baseline signature mismatch at ${opts.file}: expected ${opts.expected}, got ${opts.actual}. ` +
36
- `The file was modified outside of \`cclaw eval --update-baseline\`. ` +
37
- `Re-run with --update-baseline --confirm to re-sign a known-good snapshot.`);
38
- this.name = "BaselineSignatureError";
39
- this.file = opts.file;
40
- this.expected = opts.expected;
41
- this.actual = opts.actual;
42
- }
43
- }
44
- function baselinePath(projectRoot, stage) {
45
- return path.join(projectRoot, EVALS_ROOT, "baselines", `${stage}.json`);
46
- }
47
- /**
48
- * Produce a deterministic sha256 digest over the signable portion of a
49
- * baseline. We intentionally exclude `generatedAt` and `cclawVersion`
50
- * from the digest so that rebuilding the same baseline from identical
51
- * case results on a new CLI version doesn't invalidate the signature —
52
- * only changes to the observed pass/ok/score payloads do.
53
- */
54
- export function computeBaselineDigest(snapshot) {
55
- const canonical = canonicalJson({
56
- schemaVersion: snapshot.schemaVersion,
57
- stage: snapshot.stage,
58
- cases: snapshot.cases
59
- });
60
- return createHash("sha256").update(canonical).digest("hex");
61
- }
62
- /**
63
- * JSON.stringify with object keys sorted recursively so the digest is
64
- * stable across filesystem / serializer variations.
65
- */
66
- function canonicalJson(value) {
67
- if (value === null || typeof value !== "object") {
68
- return JSON.stringify(value);
69
- }
70
- if (Array.isArray(value)) {
71
- return `[${value.map((v) => canonicalJson(v)).join(",")}]`;
72
- }
73
- const record = value;
74
- const keys = Object.keys(record).sort();
75
- const parts = keys.map((k) => `${JSON.stringify(k)}:${canonicalJson(record[k])}`);
76
- return `{${parts.join(",")}}`;
77
- }
78
- export async function loadBaseline(projectRoot, stage) {
79
- const filePath = baselinePath(projectRoot, stage);
80
- if (!(await exists(filePath)))
81
- return null;
82
- const raw = await fs.readFile(filePath, "utf8");
83
- let parsed;
84
- try {
85
- parsed = JSON.parse(raw);
86
- }
87
- catch (err) {
88
- throw new Error(`Invalid baseline at ${filePath}: ${err instanceof Error ? err.message : String(err)}`);
89
- }
90
- if (!isBaseline(parsed, stage)) {
91
- throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
92
- }
93
- const signature = parsed.signature;
94
- if (signature) {
95
- if (signature.algorithm !== "sha256") {
96
- throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
97
- }
98
- const actual = computeBaselineDigest(parsed);
99
- if (actual !== signature.digest) {
100
- throw new BaselineSignatureError({
101
- file: filePath,
102
- expected: signature.digest,
103
- actual
104
- });
105
- }
106
- }
107
- return parsed;
108
- }
109
- function isBaseline(value, stage) {
110
- if (!value || typeof value !== "object")
111
- return false;
112
- const candidate = value;
113
- if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION)
114
- return false;
115
- if (candidate.stage !== stage)
116
- return false;
117
- if (typeof candidate.generatedAt !== "string")
118
- return false;
119
- if (typeof candidate.cclawVersion !== "string")
120
- return false;
121
- if (!candidate.cases || typeof candidate.cases !== "object")
122
- return false;
123
- return true;
124
- }
125
- export async function loadBaselinesByStage(projectRoot, stages) {
126
- const out = new Map();
127
- for (const stage of stages) {
128
- const snapshot = await loadBaseline(projectRoot, stage);
129
- if (snapshot)
130
- out.set(stage, snapshot);
131
- }
132
- return out;
133
- }
134
- function entryFromResult(result) {
135
- const verifierResults = result.verifierResults.map((v) => ({
136
- id: v.id,
137
- kind: v.kind,
138
- ok: v.ok,
139
- ...(v.score !== undefined ? { score: v.score } : {})
140
- }));
141
- return { passed: result.passed, verifierResults };
142
- }
143
- export function buildBaselineForStage(stage, report) {
144
- const stageCases = report.cases.filter((c) => c.stage === stage);
145
- const cases = {};
146
- for (const c of stageCases) {
147
- cases[c.caseId] = entryFromResult(c);
148
- }
149
- const now = new Date().toISOString();
150
- const unsigned = {
151
- schemaVersion: BASELINE_SCHEMA_VERSION,
152
- stage,
153
- generatedAt: now,
154
- cclawVersion: CCLAW_VERSION,
155
- cases
156
- };
157
- unsigned.signature = {
158
- algorithm: "sha256",
159
- digest: computeBaselineDigest(unsigned),
160
- signedAt: now
161
- };
162
- return unsigned;
163
- }
164
- export async function writeBaselinesFromReport(projectRoot, report) {
165
- const written = [];
166
- const stages = new Set(report.cases.map((c) => c.stage));
167
- for (const stage of stages) {
168
- const snapshot = buildBaselineForStage(stage, report);
169
- const file = baselinePath(projectRoot, stage);
170
- await fs.mkdir(path.dirname(file), { recursive: true });
171
- await fs.writeFile(file, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
172
- written.push(file);
173
- }
174
- return written.sort();
175
- }
176
- function verifierMap(entries) {
177
- const out = new Map();
178
- for (const entry of entries) {
179
- out.set(entry.id, entry);
180
- }
181
- return out;
182
- }
183
- function computePassRate(cases) {
184
- if (cases.length === 0)
185
- return 1;
186
- const passed = cases.filter((c) => c.passed).length;
187
- return passed / cases.length;
188
- }
189
- function baselinePassRate(snapshot) {
190
- const entries = Object.values(snapshot.cases);
191
- if (entries.length === 0)
192
- return 1;
193
- const passed = entries.filter((e) => e.passed).length;
194
- return passed / entries.length;
195
- }
196
- /**
197
- * Compare a freshly computed report against loaded baselines. If no baseline
198
- * exists for a stage covered by the report, that stage contributes zero
199
- * regressions (first run of that stage). Current is the source of truth.
200
- */
201
- export function compareAgainstBaselines(report, baselines) {
202
- if (baselines.size === 0)
203
- return undefined;
204
- const regressions = [];
205
- const caseResultsByStage = new Map();
206
- for (const c of report.cases) {
207
- const bucket = caseResultsByStage.get(c.stage) ?? [];
208
- bucket.push(c);
209
- caseResultsByStage.set(c.stage, bucket);
210
- }
211
- let baselineTotalPassRate = 0;
212
- let baselineStagesCounted = 0;
213
- for (const [stage, snapshot] of baselines) {
214
- const current = caseResultsByStage.get(stage) ?? [];
215
- baselineTotalPassRate += baselinePassRate(snapshot);
216
- baselineStagesCounted += 1;
217
- for (const caseResult of current) {
218
- const baselineEntry = snapshot.cases[caseResult.caseId];
219
- if (!baselineEntry)
220
- continue;
221
- if (baselineEntry.passed && !caseResult.passed) {
222
- regressions.push({
223
- caseId: caseResult.caseId,
224
- stage,
225
- verifierId: "<case>",
226
- reason: "case-now-failing",
227
- previousScore: 1,
228
- currentScore: 0
229
- });
230
- }
231
- const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
232
- for (const currentVerifier of caseResult.verifierResults) {
233
- const prev = baselineVerifiers.get(currentVerifier.id);
234
- if (!prev)
235
- continue;
236
- if (prev.ok && !currentVerifier.ok) {
237
- regressions.push({
238
- caseId: caseResult.caseId,
239
- stage,
240
- verifierId: currentVerifier.id,
241
- reason: "newly-failing",
242
- previousScore: prev.score ?? 1,
243
- currentScore: currentVerifier.score ?? 0
244
- });
245
- }
246
- else if (prev.score !== undefined &&
247
- currentVerifier.score !== undefined &&
248
- currentVerifier.score < prev.score) {
249
- regressions.push({
250
- caseId: caseResult.caseId,
251
- stage,
252
- verifierId: currentVerifier.id,
253
- reason: "score-drop",
254
- previousScore: prev.score,
255
- currentScore: currentVerifier.score
256
- });
257
- }
258
- }
259
- }
260
- }
261
- const currentPassRate = computePassRate(report.cases);
262
- const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
263
- const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
264
- const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
265
- const baselineStages = [...baselines.keys()].sort().join(",");
266
- return {
267
- baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
268
- scoreDelta,
269
- criticalFailures,
270
- regressions
271
- };
272
- }
273
- export function listBaselineStages(projectRoot) {
274
- const root = path.join(projectRoot, EVALS_ROOT, "baselines");
275
- return fs
276
- .readdir(root, { withFileTypes: true })
277
- .then((entries) => entries
278
- .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
279
- .map((entry) => entry.name.replace(/\.json$/, ""))
280
- .filter((name) => FLOW_STAGES.includes(name)))
281
- .catch(() => []);
282
- }
@@ -1,14 +0,0 @@
1
- import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
2
- /**
3
- * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
4
- * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
5
- * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
6
- * variables (env wins last).
7
- */
8
- export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
9
- /**
10
- * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
11
- * Returns a fully-populated config plus a provenance marker so `--dry-run` can
12
- * surface where each setting came from.
13
- */
14
- export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
@@ -1,395 +0,0 @@
1
- import fs from "node:fs/promises";
2
- import path from "node:path";
3
- import { parse } from "yaml";
4
- import { EVALS_CONFIG_PATH } from "../constants.js";
5
- import { exists } from "../fs-utils.js";
6
- import { EVAL_MODES } from "./types.js";
7
- import { parseModeInput } from "./mode.js";
8
- /**
9
- * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
10
- * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
11
- * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
12
- * variables (env wins last).
13
- */
14
- export const DEFAULT_EVAL_CONFIG = {
15
- provider: "zai",
16
- baseUrl: "https://api.z.ai/api/coding/paas/v4",
17
- model: "glm-5.1",
18
- defaultMode: "fixture",
19
- regression: {
20
- failIfDeltaBelow: -0.15,
21
- failIfCriticalBelow: 3.0
22
- },
23
- timeoutMs: 120_000,
24
- maxRetries: 2,
25
- judgeSamples: 3,
26
- judgeTemperature: 0,
27
- agentTemperature: 0.2
28
- };
29
- const NUMERIC_ENVS = new Set([
30
- "CCLAW_EVAL_DAILY_USD_CAP",
31
- "CCLAW_EVAL_TIMEOUT_MS",
32
- "CCLAW_EVAL_MAX_RETRIES",
33
- "CCLAW_EVAL_JUDGE_SAMPLES",
34
- "CCLAW_EVAL_JUDGE_TEMPERATURE",
35
- "CCLAW_EVAL_AGENT_TEMPERATURE",
36
- "CCLAW_EVAL_TOOL_MAX_TURNS",
37
- "CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
38
- "CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
39
- "CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
40
- ]);
41
- function evalConfigError(configFilePath, reason) {
42
- return new Error(`Invalid cclaw eval config at ${configFilePath}: ${reason}\n` +
43
- `Supported modes: ${EVAL_MODES.join(", ")} (legacy tier values A|B|C also accepted).\n` +
44
- `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`);
45
- }
46
- function isRecord(value) {
47
- return typeof value === "object" && value !== null && !Array.isArray(value);
48
- }
49
- function parseNumericEnv(name, raw) {
50
- const value = Number(raw);
51
- if (!Number.isFinite(value)) {
52
- throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
53
- }
54
- return value;
55
- }
56
- function parseModeEnv(raw, envName) {
57
- return parseModeInput(envName === "CCLAW_EVAL_TIER" ? raw.toUpperCase() : raw, {
58
- source: "env",
59
- raw: `${envName}=${raw}`
60
- });
61
- }
62
- function validateFileConfig(raw, configFilePath) {
63
- if (raw === undefined || raw === null)
64
- return {};
65
- if (!isRecord(raw)) {
66
- throw evalConfigError(configFilePath, "top-level value must be a mapping");
67
- }
68
- const out = {};
69
- const assignString = (key, value) => {
70
- if (value === undefined)
71
- return;
72
- if (typeof value !== "string" || value.trim().length === 0) {
73
- throw evalConfigError(configFilePath, `"${String(key)}" must be a non-empty string`);
74
- }
75
- out[key] = value.trim();
76
- };
77
- assignString("provider", raw.provider);
78
- assignString("baseUrl", raw.baseUrl);
79
- assignString("model", raw.model);
80
- assignString("judgeModel", raw.judgeModel);
81
- if (raw.defaultMode !== undefined) {
82
- if (typeof raw.defaultMode !== "string") {
83
- throw evalConfigError(configFilePath, `"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
84
- }
85
- try {
86
- out.defaultMode = parseModeInput(raw.defaultMode, {
87
- source: "config",
88
- raw: `defaultMode: ${raw.defaultMode}`
89
- });
90
- }
91
- catch (err) {
92
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
93
- }
94
- }
95
- else if (raw.defaultTier !== undefined) {
96
- if (typeof raw.defaultTier !== "string") {
97
- throw evalConfigError(configFilePath, `"defaultTier" must be a string (legacy; prefer "defaultMode")`);
98
- }
99
- try {
100
- out.defaultMode = parseModeInput(raw.defaultTier, {
101
- source: "config",
102
- raw: `defaultTier: ${raw.defaultTier}`
103
- });
104
- }
105
- catch (err) {
106
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
107
- }
108
- }
109
- if (raw.dailyUsdCap !== undefined) {
110
- if (typeof raw.dailyUsdCap !== "number" || raw.dailyUsdCap < 0) {
111
- throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
112
- }
113
- out.dailyUsdCap = raw.dailyUsdCap;
114
- }
115
- if (raw.timeoutMs !== undefined) {
116
- if (typeof raw.timeoutMs !== "number" || raw.timeoutMs <= 0) {
117
- throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
118
- }
119
- out.timeoutMs = raw.timeoutMs;
120
- }
121
- if (raw.maxRetries !== undefined) {
122
- if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
123
- throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
124
- }
125
- out.maxRetries = raw.maxRetries;
126
- }
127
- if (raw.judgeSamples !== undefined) {
128
- const value = raw.judgeSamples;
129
- if (!Number.isInteger(value) || value < 1) {
130
- throw evalConfigError(configFilePath, `"judgeSamples" must be a positive integer`);
131
- }
132
- if (value % 2 === 0) {
133
- throw evalConfigError(configFilePath, `"judgeSamples" must be odd (so median-of-N is a true integer)`);
134
- }
135
- out.judgeSamples = value;
136
- }
137
- if (raw.judgeTemperature !== undefined) {
138
- if (typeof raw.judgeTemperature !== "number" || !Number.isFinite(raw.judgeTemperature)) {
139
- throw evalConfigError(configFilePath, `"judgeTemperature" must be a finite number`);
140
- }
141
- if (raw.judgeTemperature < 0 || raw.judgeTemperature > 2) {
142
- throw evalConfigError(configFilePath, `"judgeTemperature" must be within [0, 2]`);
143
- }
144
- out.judgeTemperature = raw.judgeTemperature;
145
- }
146
- if (raw.agentTemperature !== undefined) {
147
- if (typeof raw.agentTemperature !== "number" || !Number.isFinite(raw.agentTemperature)) {
148
- throw evalConfigError(configFilePath, `"agentTemperature" must be a finite number`);
149
- }
150
- if (raw.agentTemperature < 0 || raw.agentTemperature > 2) {
151
- throw evalConfigError(configFilePath, `"agentTemperature" must be within [0, 2]`);
152
- }
153
- out.agentTemperature = raw.agentTemperature;
154
- }
155
- if (raw.tokenPricing !== undefined) {
156
- if (!isRecord(raw.tokenPricing)) {
157
- throw evalConfigError(configFilePath, `"tokenPricing" must be a mapping`);
158
- }
159
- const pricing = {};
160
- for (const [model, value] of Object.entries(raw.tokenPricing)) {
161
- if (!isRecord(value)) {
162
- throw evalConfigError(configFilePath, `"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
163
- }
164
- const input = value.input;
165
- const output = value.output;
166
- if (typeof input !== "number" || input < 0) {
167
- throw evalConfigError(configFilePath, `"tokenPricing.${model}.input" must be a non-negative number`);
168
- }
169
- if (typeof output !== "number" || output < 0) {
170
- throw evalConfigError(configFilePath, `"tokenPricing.${model}.output" must be a non-negative number`);
171
- }
172
- const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
173
- if (extraneous.length > 0) {
174
- throw evalConfigError(configFilePath, `"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
175
- }
176
- pricing[model] = { input, output };
177
- }
178
- out.tokenPricing = pricing;
179
- }
180
- const assignPositiveInt = (key, value, label) => {
181
- if (value === undefined)
182
- return;
183
- if (!Number.isInteger(value) || value < 1) {
184
- throw evalConfigError(configFilePath, `"${label}" must be a positive integer`);
185
- }
186
- out[key] = value;
187
- };
188
- assignPositiveInt("toolMaxTurns", raw.toolMaxTurns, "toolMaxTurns");
189
- assignPositiveInt("toolMaxArgumentsBytes", raw.toolMaxArgumentsBytes, "toolMaxArgumentsBytes");
190
- assignPositiveInt("toolMaxResultBytes", raw.toolMaxResultBytes, "toolMaxResultBytes");
191
- assignPositiveInt("workflowMaxTotalTurns", raw.workflowMaxTotalTurns, "workflowMaxTotalTurns");
192
- if (raw.regression !== undefined) {
193
- if (!isRecord(raw.regression)) {
194
- throw evalConfigError(configFilePath, `"regression" must be a mapping`);
195
- }
196
- const failIfDeltaBelow = raw.regression.failIfDeltaBelow;
197
- const failIfCriticalBelow = raw.regression.failIfCriticalBelow;
198
- if (failIfDeltaBelow !== undefined && typeof failIfDeltaBelow !== "number") {
199
- throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
200
- }
201
- if (failIfCriticalBelow !== undefined && typeof failIfCriticalBelow !== "number") {
202
- throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
203
- }
204
- out.regression = {
205
- failIfDeltaBelow: typeof failIfDeltaBelow === "number"
206
- ? failIfDeltaBelow
207
- : DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
208
- failIfCriticalBelow: typeof failIfCriticalBelow === "number"
209
- ? failIfCriticalBelow
210
- : DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
211
- };
212
- }
213
- const knownKeys = new Set([
214
- "provider",
215
- "baseUrl",
216
- "model",
217
- "judgeModel",
218
- "defaultMode",
219
- "defaultTier",
220
- "dailyUsdCap",
221
- "timeoutMs",
222
- "maxRetries",
223
- "regression",
224
- "judgeSamples",
225
- "judgeTemperature",
226
- "agentTemperature",
227
- "tokenPricing",
228
- "toolMaxTurns",
229
- "toolMaxArgumentsBytes",
230
- "toolMaxResultBytes",
231
- "workflowMaxTotalTurns"
232
- ]);
233
- const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
234
- if (unknown.length > 0) {
235
- throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
236
- }
237
- return out;
238
- }
239
- async function readFileConfig(projectRoot) {
240
- const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
241
- if (!(await exists(configFilePath))) {
242
- return { patch: {}, source: "default" };
243
- }
244
- let parsed;
245
- try {
246
- parsed = parse(await fs.readFile(configFilePath, "utf8"));
247
- }
248
- catch (err) {
249
- throw evalConfigError(configFilePath, err instanceof Error ? err.message : String(err));
250
- }
251
- const patch = validateFileConfig(parsed, configFilePath);
252
- return { patch, source: "file" };
253
- }
254
- function applyEnvOverrides(base, env) {
255
- let overridden = false;
256
- const patched = {
257
- ...base,
258
- regression: { ...base.regression }
259
- };
260
- for (const name of Object.keys(env)) {
261
- if (!name.startsWith("CCLAW_EVAL_"))
262
- continue;
263
- if (NUMERIC_ENVS.has(name) && typeof env[name] === "string") {
264
- // validated below when applied
265
- }
266
- }
267
- const read = (name) => {
268
- const value = env[name];
269
- return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
270
- };
271
- const baseUrl = read("CCLAW_EVAL_BASE_URL");
272
- if (baseUrl) {
273
- patched.baseUrl = baseUrl;
274
- overridden = true;
275
- }
276
- const model = read("CCLAW_EVAL_MODEL");
277
- if (model) {
278
- patched.model = model;
279
- overridden = true;
280
- }
281
- const judgeModel = read("CCLAW_EVAL_JUDGE_MODEL");
282
- if (judgeModel) {
283
- patched.judgeModel = judgeModel;
284
- overridden = true;
285
- }
286
- const provider = read("CCLAW_EVAL_PROVIDER");
287
- if (provider) {
288
- patched.provider = provider;
289
- overridden = true;
290
- }
291
- const modeEnv = read("CCLAW_EVAL_MODE");
292
- if (modeEnv) {
293
- patched.defaultMode = parseModeEnv(modeEnv, "CCLAW_EVAL_MODE");
294
- overridden = true;
295
- }
296
- else {
297
- const legacyTier = read("CCLAW_EVAL_TIER");
298
- if (legacyTier) {
299
- patched.defaultMode = parseModeEnv(legacyTier, "CCLAW_EVAL_TIER");
300
- overridden = true;
301
- }
302
- }
303
- const cap = read("CCLAW_EVAL_DAILY_USD_CAP");
304
- if (cap) {
305
- patched.dailyUsdCap = parseNumericEnv("CCLAW_EVAL_DAILY_USD_CAP", cap);
306
- overridden = true;
307
- }
308
- const timeout = read("CCLAW_EVAL_TIMEOUT_MS");
309
- if (timeout) {
310
- patched.timeoutMs = parseNumericEnv("CCLAW_EVAL_TIMEOUT_MS", timeout);
311
- overridden = true;
312
- }
313
- const retries = read("CCLAW_EVAL_MAX_RETRIES");
314
- if (retries) {
315
- patched.maxRetries = parseNumericEnv("CCLAW_EVAL_MAX_RETRIES", retries);
316
- overridden = true;
317
- }
318
- const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
319
- if (judgeSamples) {
320
- const value = parseNumericEnv("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
321
- if (!Number.isInteger(value) || value < 1) {
322
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be a positive integer, got: ${judgeSamples}`);
323
- }
324
- if (value % 2 === 0) {
325
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
326
- }
327
- patched.judgeSamples = value;
328
- overridden = true;
329
- }
330
- const judgeTemp = read("CCLAW_EVAL_JUDGE_TEMPERATURE");
331
- if (judgeTemp) {
332
- const value = parseNumericEnv("CCLAW_EVAL_JUDGE_TEMPERATURE", judgeTemp);
333
- if (value < 0 || value > 2) {
334
- throw new Error(`Environment variable CCLAW_EVAL_JUDGE_TEMPERATURE must be within [0, 2], got: ${judgeTemp}`);
335
- }
336
- patched.judgeTemperature = value;
337
- overridden = true;
338
- }
339
- const agentTemp = read("CCLAW_EVAL_AGENT_TEMPERATURE");
340
- if (agentTemp) {
341
- const value = parseNumericEnv("CCLAW_EVAL_AGENT_TEMPERATURE", agentTemp);
342
- if (value < 0 || value > 2) {
343
- throw new Error(`Environment variable CCLAW_EVAL_AGENT_TEMPERATURE must be within [0, 2], got: ${agentTemp}`);
344
- }
345
- patched.agentTemperature = value;
346
- overridden = true;
347
- }
348
- const readPositiveInt = (name, key, label) => {
349
- const raw = read(name);
350
- if (!raw)
351
- return;
352
- const value = parseNumericEnv(name, raw);
353
- if (!Number.isInteger(value) || value < 1) {
354
- throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
355
- }
356
- patched[key] = value;
357
- overridden = true;
358
- void label;
359
- };
360
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns", "toolMaxTurns");
361
- readPositiveInt("CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns", "workflowMaxTotalTurns");
362
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes", "toolMaxArgumentsBytes");
363
- readPositiveInt("CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes", "toolMaxResultBytes");
364
- const apiKey = read("CCLAW_EVAL_API_KEY");
365
- return { patched, overridden, apiKey };
366
- }
367
- /**
368
- * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
369
- * Returns a fully-populated config plus a provenance marker so `--dry-run` can
370
- * surface where each setting came from.
371
- */
372
- export async function loadEvalConfig(projectRoot, env = process.env) {
373
- const { patch, source: fileSource } = await readFileConfig(projectRoot);
374
- const merged = {
375
- ...DEFAULT_EVAL_CONFIG,
376
- ...patch,
377
- regression: {
378
- ...DEFAULT_EVAL_CONFIG.regression,
379
- ...(patch.regression ?? {})
380
- }
381
- };
382
- const { patched, overridden, apiKey } = applyEnvOverrides(merged, env);
383
- let source = "default";
384
- if (fileSource === "file" && overridden)
385
- source = "file+env";
386
- else if (fileSource === "file")
387
- source = "file";
388
- else if (overridden)
389
- source = "env";
390
- return {
391
- ...patched,
392
- apiKey,
393
- source
394
- };
395
- }