cclaw-cli 0.49.0 → 0.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -82
- package/dist/artifact-linter.d.ts +4 -0
- package/dist/artifact-linter.js +24 -3
- package/dist/cli.d.ts +1 -19
- package/dist/cli.js +49 -491
- package/dist/constants.d.ts +2 -13
- package/dist/constants.js +1 -43
- package/dist/content/closeout-guidance.d.ts +14 -0
- package/dist/content/closeout-guidance.js +42 -0
- package/dist/content/core-agents.js +51 -9
- package/dist/content/decision-protocol.d.ts +12 -0
- package/dist/content/decision-protocol.js +20 -0
- package/dist/content/diff-command.d.ts +1 -2
- package/dist/content/diff-command.js +8 -94
- package/dist/content/examples.d.ts +4 -10
- package/dist/content/examples.js +10 -20
- package/dist/content/hook-events.js +2 -2
- package/dist/content/hook-inline-snippets.d.ts +5 -2
- package/dist/content/hook-inline-snippets.js +33 -1
- package/dist/content/hook-manifest.d.ts +3 -4
- package/dist/content/hook-manifest.js +11 -12
- package/dist/content/hooks.js +2 -0
- package/dist/content/ideate-command.d.ts +2 -0
- package/dist/content/ideate-command.js +31 -25
- package/dist/content/iron-laws.d.ts +5 -5
- package/dist/content/iron-laws.js +5 -5
- package/dist/content/learnings.d.ts +3 -4
- package/dist/content/learnings.js +24 -50
- package/dist/content/meta-skill.js +31 -21
- package/dist/content/next-command.js +38 -38
- package/dist/content/node-hooks.js +17 -343
- package/dist/content/opencode-plugin.js +2 -100
- package/dist/content/research-playbooks.js +14 -14
- package/dist/content/review-loop.d.ts +2 -0
- package/dist/content/review-loop.js +8 -0
- package/dist/content/session-hooks.js +14 -46
- package/dist/content/skills.d.ts +0 -5
- package/dist/content/skills.js +53 -128
- package/dist/content/stage-common-guidance.d.ts +0 -1
- package/dist/content/stage-common-guidance.js +15 -14
- package/dist/content/stage-schema.d.ts +26 -1
- package/dist/content/stage-schema.js +121 -40
- package/dist/content/stages/_lint-metadata/index.js +9 -15
- package/dist/content/stages/brainstorm.js +22 -43
- package/dist/content/stages/design.js +37 -57
- package/dist/content/stages/plan.js +22 -13
- package/dist/content/stages/review.js +24 -27
- package/dist/content/stages/scope.js +34 -46
- package/dist/content/stages/ship.js +7 -4
- package/dist/content/stages/spec.js +20 -9
- package/dist/content/stages/tdd.js +64 -44
- package/dist/content/start-command.js +10 -12
- package/dist/content/status-command.d.ts +2 -7
- package/dist/content/status-command.js +19 -146
- package/dist/content/subagents.d.ts +0 -5
- package/dist/content/subagents.js +47 -28
- package/dist/content/templates.d.ts +1 -1
- package/dist/content/templates.js +126 -135
- package/dist/content/track-render-context.d.ts +17 -0
- package/dist/content/track-render-context.js +44 -0
- package/dist/content/tree-command.d.ts +1 -2
- package/dist/content/tree-command.js +4 -87
- package/dist/content/utility-skills.d.ts +2 -29
- package/dist/content/utility-skills.js +2 -1534
- package/dist/content/view-command.js +29 -11
- package/dist/delegation.d.ts +1 -1
- package/dist/delegation.js +5 -15
- package/dist/doctor-registry.js +20 -21
- package/dist/doctor.js +88 -344
- package/dist/flow-state.d.ts +3 -0
- package/dist/flow-state.js +2 -0
- package/dist/harness-adapters.d.ts +1 -1
- package/dist/harness-adapters.js +48 -57
- package/dist/install.js +128 -358
- package/dist/internal/advance-stage.js +3 -9
- package/dist/internal/compound-readiness.d.ts +1 -1
- package/dist/internal/compound-readiness.js +1 -1
- package/dist/internal/tdd-loop-status.d.ts +1 -1
- package/dist/internal/tdd-loop-status.js +1 -1
- package/dist/knowledge-store.d.ts +16 -10
- package/dist/knowledge-store.js +51 -15
- package/dist/policy.js +16 -105
- package/dist/run-archive.d.ts +4 -6
- package/dist/run-archive.js +15 -20
- package/dist/run-persistence.d.ts +2 -2
- package/dist/run-persistence.js +3 -9
- package/package.json +1 -2
- package/dist/content/archive-command.d.ts +0 -2
- package/dist/content/archive-command.js +0 -124
- package/dist/content/compound-command.d.ts +0 -5
- package/dist/content/compound-command.js +0 -193
- package/dist/content/contexts.d.ts +0 -18
- package/dist/content/contexts.js +0 -24
- package/dist/content/contracts.d.ts +0 -2
- package/dist/content/contracts.js +0 -51
- package/dist/content/doctor-references.d.ts +0 -2
- package/dist/content/doctor-references.js +0 -150
- package/dist/content/eval-scaffold.d.ts +0 -15
- package/dist/content/eval-scaffold.js +0 -370
- package/dist/content/feature-command.d.ts +0 -2
- package/dist/content/feature-command.js +0 -123
- package/dist/content/flow-map.d.ts +0 -23
- package/dist/content/flow-map.js +0 -134
- package/dist/content/harness-doc.d.ts +0 -2
- package/dist/content/harness-doc.js +0 -202
- package/dist/content/harness-playbooks.d.ts +0 -24
- package/dist/content/harness-playbooks.js +0 -393
- package/dist/content/harness-tool-refs.d.ts +0 -20
- package/dist/content/harness-tool-refs.js +0 -268
- package/dist/content/ops-command.d.ts +0 -2
- package/dist/content/ops-command.js +0 -71
- package/dist/content/protocols.d.ts +0 -7
- package/dist/content/protocols.js +0 -215
- package/dist/content/retro-command.d.ts +0 -2
- package/dist/content/retro-command.js +0 -165
- package/dist/content/rewind-command.d.ts +0 -2
- package/dist/content/rewind-command.js +0 -106
- package/dist/content/tdd-log-command.d.ts +0 -2
- package/dist/content/tdd-log-command.js +0 -85
- package/dist/eval/agents/single-shot.d.ts +0 -27
- package/dist/eval/agents/single-shot.js +0 -79
- package/dist/eval/agents/with-tools.d.ts +0 -44
- package/dist/eval/agents/with-tools.js +0 -261
- package/dist/eval/agents/workflow.d.ts +0 -31
- package/dist/eval/agents/workflow.js +0 -155
- package/dist/eval/baseline.d.ts +0 -38
- package/dist/eval/baseline.js +0 -282
- package/dist/eval/config-loader.d.ts +0 -14
- package/dist/eval/config-loader.js +0 -395
- package/dist/eval/corpus.d.ts +0 -30
- package/dist/eval/corpus.js +0 -330
- package/dist/eval/cost-guard.d.ts +0 -102
- package/dist/eval/cost-guard.js +0 -190
- package/dist/eval/diff.d.ts +0 -64
- package/dist/eval/diff.js +0 -323
- package/dist/eval/llm-client.d.ts +0 -176
- package/dist/eval/llm-client.js +0 -267
- package/dist/eval/mode.d.ts +0 -28
- package/dist/eval/mode.js +0 -61
- package/dist/eval/progress.d.ts +0 -83
- package/dist/eval/progress.js +0 -59
- package/dist/eval/report.d.ts +0 -11
- package/dist/eval/report.js +0 -181
- package/dist/eval/rubric-loader.d.ts +0 -20
- package/dist/eval/rubric-loader.js +0 -143
- package/dist/eval/runner.d.ts +0 -81
- package/dist/eval/runner.js +0 -746
- package/dist/eval/runs.d.ts +0 -41
- package/dist/eval/runs.js +0 -114
- package/dist/eval/sandbox.d.ts +0 -38
- package/dist/eval/sandbox.js +0 -137
- package/dist/eval/tools/glob.d.ts +0 -2
- package/dist/eval/tools/glob.js +0 -163
- package/dist/eval/tools/grep.d.ts +0 -2
- package/dist/eval/tools/grep.js +0 -152
- package/dist/eval/tools/index.d.ts +0 -7
- package/dist/eval/tools/index.js +0 -35
- package/dist/eval/tools/read.d.ts +0 -2
- package/dist/eval/tools/read.js +0 -122
- package/dist/eval/tools/types.d.ts +0 -49
- package/dist/eval/tools/types.js +0 -41
- package/dist/eval/tools/write.d.ts +0 -2
- package/dist/eval/tools/write.js +0 -92
- package/dist/eval/types.d.ts +0 -561
- package/dist/eval/types.js +0 -47
- package/dist/eval/verifiers/judge.d.ts +0 -40
- package/dist/eval/verifiers/judge.js +0 -256
- package/dist/eval/verifiers/rules.d.ts +0 -24
- package/dist/eval/verifiers/rules.js +0 -218
- package/dist/eval/verifiers/structural.d.ts +0 -14
- package/dist/eval/verifiers/structural.js +0 -171
- package/dist/eval/verifiers/traceability.d.ts +0 -23
- package/dist/eval/verifiers/traceability.js +0 -84
- package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
- package/dist/eval/verifiers/workflow-consistency.js +0 -225
- package/dist/eval/workflow-corpus.d.ts +0 -7
- package/dist/eval/workflow-corpus.js +0 -207
- package/dist/feature-system.d.ts +0 -42
- package/dist/feature-system.js +0 -432
- package/dist/internal/knowledge-digest.d.ts +0 -7
- package/dist/internal/knowledge-digest.js +0 -93
package/dist/eval/baseline.js
DELETED
|
@@ -1,282 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Baseline I/O + regression comparison for the eval subsystem.
|
|
3
|
-
*
|
|
4
|
-
* Layout on disk (committed):
|
|
5
|
-
*
|
|
6
|
-
* .cclaw/evals/baselines/<stage>.json
|
|
7
|
-
*
|
|
8
|
-
* Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
|
|
9
|
-
* regressions by comparing per-verifier `ok` flags across runs: any verifier
|
|
10
|
-
* that was `ok:true` in the baseline and is `ok:false` now counts as a
|
|
11
|
-
* critical failure. A case whose aggregate `passed` flipped from true to
|
|
12
|
-
* false is flagged as `case-now-failing` regardless of per-verifier churn.
|
|
13
|
-
*
|
|
14
|
-
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
|
-
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
|
-
*/
|
|
17
|
-
import { createHash } from "node:crypto";
|
|
18
|
-
import fs from "node:fs/promises";
|
|
19
|
-
import path from "node:path";
|
|
20
|
-
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
21
|
-
import { exists } from "../fs-utils.js";
|
|
22
|
-
import { FLOW_STAGES } from "../types.js";
|
|
23
|
-
export const BASELINE_SCHEMA_VERSION = 1;
|
|
24
|
-
/**
 * Error describing a baseline file whose stored signature digest differs
 * from the digest recomputed over its content.
 *
 * Besides the human-readable message (which explains how to re-sign), the
 * instance exposes the offending `file` plus the `expected` (stored) and
 * `actual` (recomputed) digests.
 */
export class BaselineSignatureError extends Error {
    file;
    expected;
    actual;
    constructor(opts) {
        const { file, expected, actual } = opts;
        const sentences = [
            `Baseline signature mismatch at ${file}: expected ${expected}, got ${actual}. `,
            `The file was modified outside of \`cclaw eval --update-baseline\`. `,
            `Re-run with --update-baseline --confirm to re-sign a known-good snapshot.`
        ];
        super(sentences.join(""));
        Object.assign(this, { name: "BaselineSignatureError", file, expected, actual });
    }
}
|
|
44
|
-
// Build the path of the baseline snapshot file for one stage:
// <projectRoot>/<EVALS_ROOT>/baselines/<stage>.json
function baselinePath(projectRoot, stage) {
    const fileName = `${stage}.json`;
    return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
}
|
|
47
|
-
/**
 * Hash the signable part of a baseline snapshot.
 *
 * Only `schemaVersion`, `stage` and `cases` are fed into the digest. Other
 * fields (for example `generatedAt` and `cclawVersion`) are left out, so
 * they may change without altering the result.
 *
 * @param snapshot Baseline snapshot, signed or unsigned.
 * @returns Hex-encoded sha256 digest of the canonical JSON encoding.
 */
export function computeBaselineDigest(snapshot) {
    const { schemaVersion, stage, cases } = snapshot;
    const hasher = createHash("sha256");
    hasher.update(canonicalJson({ schemaVersion, stage, cases }));
    return hasher.digest("hex");
}
|
|
62
|
-
/**
 * Serialize a value as JSON with object keys sorted recursively, so the
 * output does not depend on property insertion order.
 *
 * Values JSON cannot represent are encoded the way `JSON.stringify` treats
 * them: object properties whose value is `undefined`, a function or a
 * symbol are omitted, and such values inside arrays (or at the top level)
 * become `null`. This keeps the text, and any digest derived from it,
 * identical before and after a `JSON.stringify` / `JSON.parse` round trip.
 *
 * @param value Any JSON-compatible value.
 * @returns Canonical JSON text.
 */
function canonicalJson(value) {
    if (value === null || typeof value !== "object") {
        // JSON.stringify returns undefined (not a string) for undefined,
        // functions and symbols; encode those as null, as arrays do.
        return JSON.stringify(value) ?? "null";
    }
    if (Array.isArray(value)) {
        return `[${value.map((item) => canonicalJson(item)).join(",")}]`;
    }
    const parts = [];
    for (const key of Object.keys(value).sort()) {
        const member = value[key];
        if (member === undefined || typeof member === "function" || typeof member === "symbol") {
            // JSON.stringify drops these properties, so they must not
            // contribute to the canonical form either.
            continue;
        }
        parts.push(`${JSON.stringify(key)}:${canonicalJson(member)}`);
    }
    return `{${parts.join(",")}}`;
}
|
|
78
|
-
/**
 * Read and validate the baseline snapshot stored for `stage`.
 *
 * @param projectRoot Project root directory.
 * @param stage Stage whose baseline file should be read.
 * @returns The parsed snapshot, or `null` when no baseline file exists.
 * @throws Error when the file is not valid JSON, fails the shape check, or
 *   declares a signature algorithm other than "sha256".
 * @throws BaselineSignatureError when the stored digest differs from the
 *   digest recomputed from the parsed content.
 */
export async function loadBaseline(projectRoot, stage) {
    const filePath = baselinePath(projectRoot, stage);
    const present = await exists(filePath);
    if (!present) {
        return null;
    }
    const text = await fs.readFile(filePath, "utf8");
    let snapshot;
    try {
        snapshot = JSON.parse(text);
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        throw new Error(`Invalid baseline at ${filePath}: ${detail}`);
    }
    if (!isBaseline(snapshot, stage)) {
        throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
    }
    const { signature } = snapshot;
    if (!signature) {
        // No signature block: the snapshot is returned without a digest check.
        return snapshot;
    }
    if (signature.algorithm !== "sha256") {
        throw new Error(`Invalid baseline at ${filePath}: unsupported signature algorithm "${signature.algorithm}".`);
    }
    const recomputed = computeBaselineDigest(snapshot);
    if (recomputed !== signature.digest) {
        throw new BaselineSignatureError({
            file: filePath,
            expected: signature.digest,
            actual: recomputed
        });
    }
    return snapshot;
}
|
|
109
|
-
// Shape check: true when `value` is an object carrying the current schema
// version, the requested stage, string `generatedAt` / `cclawVersion`
// fields and an object-typed `cases` member.
function isBaseline(value, stage) {
    if (!value || typeof value !== "object") {
        return false;
    }
    const { schemaVersion, stage: storedStage, generatedAt, cclawVersion, cases } = value;
    return (schemaVersion === BASELINE_SCHEMA_VERSION &&
        storedStage === stage &&
        typeof generatedAt === "string" &&
        typeof cclawVersion === "string" &&
        Boolean(cases) &&
        typeof cases === "object");
}
|
|
125
|
-
/**
 * Load the baselines for several stages, one stage after another.
 *
 * @param projectRoot Project root directory.
 * @param stages Iterable of stage names.
 * @returns Map from stage to snapshot; stages for which `loadBaseline`
 *   yields nothing are absent from the map.
 */
export async function loadBaselinesByStage(projectRoot, stages) {
    const byStage = new Map();
    for (const stage of stages) {
        const loaded = await loadBaseline(projectRoot, stage);
        if (!loaded) {
            continue;
        }
        byStage.set(stage, loaded);
    }
    return byStage;
}
|
|
134
|
-
// Reduce a case result to what a baseline entry stores: the aggregate
// `passed` flag plus id / kind / ok (and score, when defined) per verifier.
function entryFromResult(result) {
    const verifierResults = result.verifierResults.map(({ id, kind, ok, score }) => {
        const slim = { id, kind, ok };
        if (score !== undefined) {
            slim.score = score;
        }
        return slim;
    });
    return { passed: result.passed, verifierResults };
}
|
|
143
|
-
/**
 * Build a signed baseline snapshot from the report's cases for one stage.
 *
 * Cases are keyed by `caseId`. The sha256 signature is computed before the
 * `signature` member is attached, and `generatedAt` / `signedAt` share one
 * timestamp.
 *
 * @param stage Stage to snapshot.
 * @param report Report whose `cases` are filtered by `stage`.
 * @returns The snapshot including its `signature` block.
 */
export function buildBaselineForStage(stage, report) {
    const cases = {};
    for (const result of report.cases) {
        if (result.stage === stage) {
            cases[result.caseId] = entryFromResult(result);
        }
    }
    const timestamp = new Date().toISOString();
    const snapshot = {
        schemaVersion: BASELINE_SCHEMA_VERSION,
        stage,
        generatedAt: timestamp,
        cclawVersion: CCLAW_VERSION,
        cases
    };
    const digest = computeBaselineDigest(snapshot);
    snapshot.signature = { algorithm: "sha256", digest, signedAt: timestamp };
    return snapshot;
}
|
|
164
|
-
/**
 * Write one baseline file per stage present in the report.
 *
 * Files are written sequentially as pretty-printed JSON with a trailing
 * newline; parent directories are created as needed.
 *
 * @param projectRoot Project root directory.
 * @param report Report whose cases determine the stages to write.
 * @returns The written file paths, sorted.
 */
export async function writeBaselinesFromReport(projectRoot, report) {
    const stages = new Set();
    for (const result of report.cases) {
        stages.add(result.stage);
    }
    const written = [];
    for (const stage of stages) {
        const snapshot = buildBaselineForStage(stage, report);
        const target = baselinePath(projectRoot, stage);
        await fs.mkdir(path.dirname(target), { recursive: true });
        const body = `${JSON.stringify(snapshot, null, 2)}\n`;
        await fs.writeFile(target, body, "utf8");
        written.push(target);
    }
    written.sort();
    return written;
}
|
|
176
|
-
// Index verifier entries by `id`. When an id repeats, the later entry
// replaces the earlier one.
function verifierMap(entries) {
    return new Map(Array.from(entries, (entry) => [entry.id, entry]));
}
|
|
183
|
-
// Fraction of case results whose `passed` flag is truthy; an empty list
// counts as a pass rate of 1.
function computePassRate(cases) {
    const total = cases.length;
    if (total === 0) {
        return 1;
    }
    const passedCount = cases.reduce((count, c) => (c.passed ? count + 1 : count), 0);
    return passedCount / total;
}
|
|
189
|
-
// Fraction of baseline entries whose `passed` flag is truthy; a snapshot
// without cases counts as a pass rate of 1.
function baselinePassRate(snapshot) {
    const entries = Object.values(snapshot.cases);
    const total = entries.length;
    if (total === 0) {
        return 1;
    }
    const passedCount = entries.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);
    return passedCount / total;
}
|
|
196
|
-
/**
 * Compare a fresh report with the loaded baselines.
 *
 * For every baseline stage, each current case that also exists in the
 * baseline is checked:
 *  - `case-now-failing`: the case passed in the baseline and fails now;
 *  - `newly-failing`: a verifier was ok in the baseline and is not ok now;
 *  - `score-drop`: both runs carry a score for a verifier and the current
 *    one is lower (only considered when the verifier is not newly failing).
 * Cases and verifiers missing from the baseline contribute nothing.
 *
 * `scoreDelta` is the report-wide pass rate minus the mean of the
 * per-stage baseline pass rates, rounded to four decimals.
 * `criticalFailures` counts the first two regression kinds.
 *
 * @param report Current report with a `cases` array.
 * @param baselines Map from stage to baseline snapshot.
 * @returns The comparison summary, or `undefined` when `baselines` is empty.
 */
export function compareAgainstBaselines(report, baselines) {
    if (baselines.size === 0) {
        return undefined;
    }
    // Group the current results by stage.
    const currentByStage = new Map();
    for (const result of report.cases) {
        const group = currentByStage.get(result.stage);
        if (group) {
            group.push(result);
        }
        else {
            currentByStage.set(result.stage, [result]);
        }
    }
    const regressions = [];
    let passRateSum = 0;
    let stageCount = 0;
    for (const [stage, snapshot] of baselines) {
        const currentCases = currentByStage.get(stage) ?? [];
        passRateSum += baselinePassRate(snapshot);
        stageCount += 1;
        for (const caseResult of currentCases) {
            const { caseId } = caseResult;
            const baselineEntry = snapshot.cases[caseId];
            if (!baselineEntry) {
                continue;
            }
            if (baselineEntry.passed && !caseResult.passed) {
                regressions.push({
                    caseId,
                    stage,
                    verifierId: "<case>",
                    reason: "case-now-failing",
                    previousScore: 1,
                    currentScore: 0
                });
            }
            const previousById = verifierMap(baselineEntry.verifierResults);
            for (const verifier of caseResult.verifierResults) {
                const prev = previousById.get(verifier.id);
                if (!prev) {
                    continue;
                }
                if (prev.ok && !verifier.ok) {
                    regressions.push({
                        caseId,
                        stage,
                        verifierId: verifier.id,
                        reason: "newly-failing",
                        previousScore: prev.score ?? 1,
                        currentScore: verifier.score ?? 0
                    });
                    continue;
                }
                const bothScored = prev.score !== undefined && verifier.score !== undefined;
                if (bothScored && verifier.score < prev.score) {
                    regressions.push({
                        caseId,
                        stage,
                        verifierId: verifier.id,
                        reason: "score-drop",
                        previousScore: prev.score,
                        currentScore: verifier.score
                    });
                }
            }
        }
    }
    const currentPassRate = computePassRate(report.cases);
    const baselineAveragePassRate = stageCount === 0 ? currentPassRate : passRateSum / stageCount;
    const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
    let criticalFailures = 0;
    for (const { reason } of regressions) {
        if (reason === "newly-failing" || reason === "case-now-failing") {
            criticalFailures += 1;
        }
    }
    const baselineStages = [...baselines.keys()].sort().join(",");
    return {
        baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
        scoreDelta,
        criticalFailures,
        regressions
    };
}
|
|
273
|
-
/**
 * List the stages that have a baseline file on disk.
 *
 * @param projectRoot Project root directory.
 * @returns Promise of the `*.json` file names (extension removed) in the
 *   baselines directory that are members of `FLOW_STAGES`. Resolves to an
 *   empty array when reading or filtering the directory fails.
 */
export function listBaselineStages(projectRoot) {
    const baselinesDir = path.join(projectRoot, EVALS_ROOT, "baselines");
    const toStages = (entries) => {
        const stages = [];
        for (const entry of entries) {
            if (!entry.isFile() || !entry.name.endsWith(".json")) {
                continue;
            }
            const stage = entry.name.replace(/\.json$/, "");
            if (FLOW_STAGES.includes(stage)) {
                stages.push(stage);
            }
        }
        return stages;
    };
    return fs
        .readdir(baselinesDir, { withFileTypes: true })
        .then(toStages)
        .catch(() => []);
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
|
|
2
|
-
/**
|
|
3
|
-
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
4
|
-
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
5
|
-
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
6
|
-
* variables (env wins last).
|
|
7
|
-
*/
|
|
8
|
-
export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
|
|
9
|
-
/**
|
|
10
|
-
* Resolve eval config in layered order: defaults -> config.yaml -> env vars.
|
|
11
|
-
* Returns a fully-populated config plus a provenance marker so `--dry-run` can
|
|
12
|
-
* surface where each setting came from.
|
|
13
|
-
*/
|
|
14
|
-
export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
|
|
@@ -1,395 +0,0 @@
|
|
|
1
|
-
import fs from "node:fs/promises";
|
|
2
|
-
import path from "node:path";
|
|
3
|
-
import { parse } from "yaml";
|
|
4
|
-
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
|
-
import { exists } from "../fs-utils.js";
|
|
6
|
-
import { EVAL_MODES } from "./types.js";
|
|
7
|
-
import { parseModeInput } from "./mode.js";
|
|
8
|
-
/**
|
|
9
|
-
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
10
|
-
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
11
|
-
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
12
|
-
* variables (env wins last).
|
|
13
|
-
*/
|
|
14
|
-
export const DEFAULT_EVAL_CONFIG = {
|
|
15
|
-
provider: "zai",
|
|
16
|
-
baseUrl: "https://api.z.ai/api/coding/paas/v4",
|
|
17
|
-
model: "glm-5.1",
|
|
18
|
-
defaultMode: "fixture",
|
|
19
|
-
regression: {
|
|
20
|
-
failIfDeltaBelow: -0.15,
|
|
21
|
-
failIfCriticalBelow: 3.0
|
|
22
|
-
},
|
|
23
|
-
timeoutMs: 120_000,
|
|
24
|
-
maxRetries: 2,
|
|
25
|
-
judgeSamples: 3,
|
|
26
|
-
judgeTemperature: 0,
|
|
27
|
-
agentTemperature: 0.2
|
|
28
|
-
};
|
|
29
|
-
const NUMERIC_ENVS = new Set([
|
|
30
|
-
"CCLAW_EVAL_DAILY_USD_CAP",
|
|
31
|
-
"CCLAW_EVAL_TIMEOUT_MS",
|
|
32
|
-
"CCLAW_EVAL_MAX_RETRIES",
|
|
33
|
-
"CCLAW_EVAL_JUDGE_SAMPLES",
|
|
34
|
-
"CCLAW_EVAL_JUDGE_TEMPERATURE",
|
|
35
|
-
"CCLAW_EVAL_AGENT_TEMPERATURE",
|
|
36
|
-
"CCLAW_EVAL_TOOL_MAX_TURNS",
|
|
37
|
-
"CCLAW_EVAL_TOOL_MAX_ARG_BYTES",
|
|
38
|
-
"CCLAW_EVAL_TOOL_MAX_RESULT_BYTES",
|
|
39
|
-
"CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS"
|
|
40
|
-
]);
|
|
41
|
-
// Build (but do not throw) the Error used for every config validation
// failure: the reason, the supported modes, and a pointer to the docs.
function evalConfigError(configFilePath, reason) {
    const lines = [
        `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
        `Supported modes: ${EVAL_MODES.join(", ")} (legacy tier values A|B|C also accepted).`,
        `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`
    ];
    return new Error(lines.join("\n"));
}
|
|
46
|
-
// True for non-null, non-array values whose typeof is "object".
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
49
|
-
/**
 * Parse the raw text of a numeric environment variable.
 *
 * @param name Variable name, used only in the error message.
 * @param raw Raw value of the variable.
 * @returns The parsed finite number.
 * @throws Error when `raw` is blank or does not convert to a finite number.
 */
function parseNumericEnv(name, raw) {
    // Number("") and Number("   ") evaluate to 0, so a blank variable would
    // otherwise be accepted silently as 0 instead of being reported.
    const isBlank = typeof raw === "string" && raw.trim().length === 0;
    const value = isBlank ? Number.NaN : Number(raw);
    if (!Number.isFinite(value)) {
        throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
    }
    return value;
}
|
|
56
|
-
// Parse a mode taken from the environment. Values of CCLAW_EVAL_TIER are
// upper-cased before parsing; other variables are passed through as-is.
function parseModeEnv(raw, envName) {
    const isTierVariable = envName === "CCLAW_EVAL_TIER";
    const candidate = isTierVariable ? raw.toUpperCase() : raw;
    return parseModeInput(candidate, { source: "env", raw: `${envName}=${raw}` });
}
|
|
62
|
-
/**
 * Validate the parsed content of the eval config file and return the subset
 * of settings it defines.
 *
 * Checks run in a fixed order (strings, mode, numeric limits, temperatures,
 * token pricing, tool/workflow limits, regression thresholds, unknown keys)
 * and the first violation is thrown as an `evalConfigError`.
 *
 * Numeric fields additionally reject NaN, and `timeoutMs` and token prices
 * reject ±Infinity: a plain `typeof value === "number"` test accepts those
 * values (a config can produce them, e.g. YAML `.nan` / `.inf`), after
 * which every later comparison against them is false.
 *
 * @param raw Parsed config document; `undefined` / `null` means "empty".
 * @param configFilePath Path used in error messages.
 * @returns A partial config containing only the keys present in `raw`.
 * @throws Error (from `evalConfigError`) on the first invalid or unknown key.
 */
function validateFileConfig(raw, configFilePath) {
    if (raw === undefined || raw === null) {
        return {};
    }
    if (!isRecord(raw)) {
        throw evalConfigError(configFilePath, "top-level value must be a mapping");
    }
    const fail = (reason) => evalConfigError(configFilePath, reason);
    const out = {};
    // --- plain string settings (stored trimmed) ---
    for (const key of ["provider", "baseUrl", "model", "judgeModel"]) {
        const value = raw[key];
        if (value === undefined) {
            continue;
        }
        if (typeof value !== "string" || value.trim().length === 0) {
            throw fail(`"${key}" must be a non-empty string`);
        }
        out[key] = value.trim();
    }
    // --- mode: "defaultMode" wins; "defaultTier" is the legacy spelling ---
    const parseMode = (key, value) => {
        try {
            return parseModeInput(value, { source: "config", raw: `${key}: ${value}` });
        }
        catch (err) {
            throw fail(err instanceof Error ? err.message : String(err));
        }
    };
    if (raw.defaultMode !== undefined) {
        if (typeof raw.defaultMode !== "string") {
            throw fail(`"defaultMode" must be one of: ${EVAL_MODES.join(", ")}`);
        }
        out.defaultMode = parseMode("defaultMode", raw.defaultMode);
    }
    else if (raw.defaultTier !== undefined) {
        if (typeof raw.defaultTier !== "string") {
            throw fail(`"defaultTier" must be a string (legacy; prefer "defaultMode")`);
        }
        out.defaultMode = parseMode("defaultTier", raw.defaultTier);
    }
    // --- numeric limits ---
    if (raw.dailyUsdCap !== undefined) {
        // NaN is rejected; +Infinity is still accepted as before.
        if (typeof raw.dailyUsdCap !== "number" || Number.isNaN(raw.dailyUsdCap) || raw.dailyUsdCap < 0) {
            throw fail(`"dailyUsdCap" must be a non-negative number`);
        }
        out.dailyUsdCap = raw.dailyUsdCap;
    }
    if (raw.timeoutMs !== undefined) {
        // Number.isFinite is false for non-numbers, NaN and ±Infinity.
        if (!Number.isFinite(raw.timeoutMs) || raw.timeoutMs <= 0) {
            throw fail(`"timeoutMs" must be a positive number`);
        }
        out.timeoutMs = raw.timeoutMs;
    }
    if (raw.maxRetries !== undefined) {
        if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
            throw fail(`"maxRetries" must be a non-negative integer`);
        }
        out.maxRetries = raw.maxRetries;
    }
    if (raw.judgeSamples !== undefined) {
        const samples = raw.judgeSamples;
        if (!Number.isInteger(samples) || samples < 1) {
            throw fail(`"judgeSamples" must be a positive integer`);
        }
        if (samples % 2 === 0) {
            throw fail(`"judgeSamples" must be odd (so median-of-N is a true integer)`);
        }
        out.judgeSamples = samples;
    }
    // --- temperatures: finite and within [0, 2] ---
    for (const key of ["judgeTemperature", "agentTemperature"]) {
        const value = raw[key];
        if (value === undefined) {
            continue;
        }
        if (typeof value !== "number" || !Number.isFinite(value)) {
            throw fail(`"${key}" must be a finite number`);
        }
        if (value < 0 || value > 2) {
            throw fail(`"${key}" must be within [0, 2]`);
        }
        out[key] = value;
    }
    // --- per-model token pricing: { input, output } and nothing else ---
    if (raw.tokenPricing !== undefined) {
        if (!isRecord(raw.tokenPricing)) {
            throw fail(`"tokenPricing" must be a mapping`);
        }
        const pricing = {};
        for (const [model, value] of Object.entries(raw.tokenPricing)) {
            if (!isRecord(value)) {
                throw fail(`"tokenPricing.${model}" must be a mapping with numeric input + output keys`);
            }
            const { input, output } = value;
            if (!Number.isFinite(input) || input < 0) {
                throw fail(`"tokenPricing.${model}.input" must be a non-negative number`);
            }
            if (!Number.isFinite(output) || output < 0) {
                throw fail(`"tokenPricing.${model}.output" must be a non-negative number`);
            }
            const extraneous = Object.keys(value).filter((key) => key !== "input" && key !== "output");
            if (extraneous.length > 0) {
                throw fail(`"tokenPricing.${model}" has unknown key(s): ${extraneous.join(", ")}`);
            }
            pricing[model] = { input, output };
        }
        out.tokenPricing = pricing;
    }
    // --- tool / workflow limits: positive integers ---
    for (const key of ["toolMaxTurns", "toolMaxArgumentsBytes", "toolMaxResultBytes", "workflowMaxTotalTurns"]) {
        const value = raw[key];
        if (value === undefined) {
            continue;
        }
        if (!Number.isInteger(value) || value < 1) {
            throw fail(`"${key}" must be a positive integer`);
        }
        out[key] = value;
    }
    // --- regression thresholds; missing members fall back to the defaults ---
    if (raw.regression !== undefined) {
        if (!isRecord(raw.regression)) {
            throw fail(`"regression" must be a mapping`);
        }
        const { failIfDeltaBelow, failIfCriticalBelow } = raw.regression;
        // NaN would make every threshold comparison false, so reject it.
        const isUsableNumber = (value) => typeof value === "number" && !Number.isNaN(value);
        if (failIfDeltaBelow !== undefined && !isUsableNumber(failIfDeltaBelow)) {
            throw fail(`"regression.failIfDeltaBelow" must be a number`);
        }
        if (failIfCriticalBelow !== undefined && !isUsableNumber(failIfCriticalBelow)) {
            throw fail(`"regression.failIfCriticalBelow" must be a number`);
        }
        out.regression = {
            failIfDeltaBelow: failIfDeltaBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
            failIfCriticalBelow: failIfCriticalBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
        };
    }
    // --- anything not recognised above is an error ---
    const knownKeys = new Set([
        "provider",
        "baseUrl",
        "model",
        "judgeModel",
        "defaultMode",
        "defaultTier",
        "dailyUsdCap",
        "timeoutMs",
        "maxRetries",
        "regression",
        "judgeSamples",
        "judgeTemperature",
        "agentTemperature",
        "tokenPricing",
        "toolMaxTurns",
        "toolMaxArgumentsBytes",
        "toolMaxResultBytes",
        "workflowMaxTotalTurns"
    ]);
    const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
    if (unknown.length > 0) {
        throw fail(`unknown top-level key(s): ${unknown.join(", ")}`);
    }
    return out;
}
|
|
239
|
-
/**
 * Read the eval config file located at `EVALS_CONFIG_PATH` under `projectRoot`.
 *
 * @param {string} projectRoot - directory the config path is resolved against.
 * @returns {Promise<{patch: object, source: "default" | "file"}>} an empty patch
 *   tagged "default" when the file does not exist, otherwise the validated
 *   patch tagged "file".
 * @throws whatever `evalConfigError` builds when reading or parsing fails, and
 *   whatever `validateFileConfig` throws for invalid content.
 */
async function readFileConfig(projectRoot) {
    const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
    const isPresent = await exists(configFilePath);
    if (!isPresent) {
        return { patch: {}, source: "default" };
    }
    let document;
    try {
        // Read and parse share one handler so both failure modes are reported
        // against the config file path.
        const text = await fs.readFile(configFilePath, "utf8");
        document = parse(text);
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        throw evalConfigError(configFilePath, reason);
    }
    return {
        patch: validateFileConfig(document, configFilePath),
        source: "file"
    };
}
|
|
254
|
-
/**
 * Layer `CCLAW_EVAL_*` environment variables on top of a resolved eval config.
 *
 * A variable counts as set only when it is a string that is non-empty after
 * trimming; the trimmed value is what gets applied. `base` is not mutated: the
 * result is a shallow copy with its own `regression` object.
 *
 * @param {object} base - config to start from; must carry a `regression` object.
 * @param {object} env - environment mapping (e.g. `process.env`).
 * @returns {{patched: object, overridden: boolean, apiKey: (string|undefined)}}
 *   `overridden` is true when at least one config field was replaced from the
 *   environment. `apiKey` is returned separately and does not affect `overridden`.
 * @throws {Error} when an integer or temperature variable fails its range check;
 *   `parseModeEnv` / `parseNumericEnv` may also throw for malformed values.
 */
function applyEnvOverrides(base, env) {
    let overridden = false;
    const patched = {
        ...base,
        regression: { ...base.regression }
    };
    const read = (name) => {
        const value = env[name];
        return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
    };
    // Single place that records an override so the provenance flag cannot drift.
    const assign = (key, value) => {
        patched[key] = value;
        overridden = true;
    };
    const parsePositiveInt = (name, raw) => {
        const value = parseNumericEnv(name, raw);
        if (!Number.isInteger(value) || value < 1) {
            throw new Error(`Environment variable ${name} must be a positive integer, got: ${raw}`);
        }
        return value;
    };
    const parseTemperature = (name, raw) => {
        const value = parseNumericEnv(name, raw);
        if (value < 0 || value > 2) {
            throw new Error(`Environment variable ${name} must be within [0, 2], got: ${raw}`);
        }
        return value;
    };

    // Plain string settings are copied through verbatim.
    const stringOverrides = [
        ["CCLAW_EVAL_BASE_URL", "baseUrl"],
        ["CCLAW_EVAL_MODEL", "model"],
        ["CCLAW_EVAL_JUDGE_MODEL", "judgeModel"],
        ["CCLAW_EVAL_PROVIDER", "provider"]
    ];
    for (const [name, key] of stringOverrides) {
        const raw = read(name);
        if (raw !== undefined) {
            assign(key, raw);
        }
    }

    // CCLAW_EVAL_MODE wins; the legacy CCLAW_EVAL_TIER is consulted only when
    // the former is not set.
    for (const name of ["CCLAW_EVAL_MODE", "CCLAW_EVAL_TIER"]) {
        const raw = read(name);
        if (raw !== undefined) {
            assign("defaultMode", parseModeEnv(raw, name));
            break;
        }
    }

    // Numeric settings with no range check beyond what parseNumericEnv enforces.
    const numericOverrides = [
        ["CCLAW_EVAL_DAILY_USD_CAP", "dailyUsdCap"],
        ["CCLAW_EVAL_TIMEOUT_MS", "timeoutMs"],
        ["CCLAW_EVAL_MAX_RETRIES", "maxRetries"]
    ];
    for (const [name, key] of numericOverrides) {
        const raw = read(name);
        if (raw !== undefined) {
            assign(key, parseNumericEnv(name, raw));
        }
    }

    // Judge sample count must be a positive integer and additionally odd.
    const judgeSamples = read("CCLAW_EVAL_JUDGE_SAMPLES");
    if (judgeSamples !== undefined) {
        const value = parsePositiveInt("CCLAW_EVAL_JUDGE_SAMPLES", judgeSamples);
        if (value % 2 === 0) {
            throw new Error(`Environment variable CCLAW_EVAL_JUDGE_SAMPLES must be odd, got: ${judgeSamples}`);
        }
        assign("judgeSamples", value);
    }

    const temperatureOverrides = [
        ["CCLAW_EVAL_JUDGE_TEMPERATURE", "judgeTemperature"],
        ["CCLAW_EVAL_AGENT_TEMPERATURE", "agentTemperature"]
    ];
    for (const [name, key] of temperatureOverrides) {
        const raw = read(name);
        if (raw !== undefined) {
            assign(key, parseTemperature(name, raw));
        }
    }

    const positiveIntOverrides = [
        ["CCLAW_EVAL_TOOL_MAX_TURNS", "toolMaxTurns"],
        ["CCLAW_EVAL_WORKFLOW_MAX_TOTAL_TURNS", "workflowMaxTotalTurns"],
        ["CCLAW_EVAL_TOOL_MAX_ARG_BYTES", "toolMaxArgumentsBytes"],
        ["CCLAW_EVAL_TOOL_MAX_RESULT_BYTES", "toolMaxResultBytes"]
    ];
    for (const [name, key] of positiveIntOverrides) {
        const raw = read(name);
        if (raw !== undefined) {
            assign(key, parsePositiveInt(name, raw));
        }
    }

    // The API key is handed back on its own rather than stored in the config.
    const apiKey = read("CCLAW_EVAL_API_KEY");
    return { patched, overridden, apiKey };
}
|
|
367
|
-
/**
 * Build the effective eval config by layering, in increasing priority:
 * built-in defaults, the project's config file, then environment variables.
 *
 * @param {string} projectRoot - project directory passed to `readFileConfig`.
 * @param {object} [env=process.env] - environment mapping to read overrides from.
 * @returns {Promise<object>} the resolved config plus `apiKey` and a `source`
 *   provenance marker: "default", "file", "env" or "file+env".
 */
export async function loadEvalConfig(projectRoot, env = process.env) {
    const fileResult = await readFileConfig(projectRoot);
    const filePatch = fileResult.patch;
    // `regression` is nested, so it is merged separately to keep defaults for
    // any key the file patch leaves out.
    const regression = Object.assign({}, DEFAULT_EVAL_CONFIG.regression, filePatch.regression ?? {});
    const layered = Object.assign({}, DEFAULT_EVAL_CONFIG, filePatch, { regression });
    const envResult = applyEnvOverrides(layered, env);
    const cameFromFile = fileResult.source === "file";
    let source;
    if (cameFromFile) {
        source = envResult.overridden ? "file+env" : "file";
    }
    else {
        source = envResult.overridden ? "env" : "default";
    }
    return Object.assign({}, envResult.patched, { apiKey: envResult.apiKey, source });
}
|