cclaw-cli 0.49.0 → 0.51.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -84
- package/dist/artifact-linter.d.ts +4 -0
- package/dist/artifact-linter.js +24 -3
- package/dist/cli.d.ts +1 -19
- package/dist/cli.js +49 -491
- package/dist/constants.d.ts +2 -13
- package/dist/constants.js +1 -43
- package/dist/content/closeout-guidance.d.ts +14 -0
- package/dist/content/closeout-guidance.js +42 -0
- package/dist/content/core-agents.js +55 -17
- package/dist/content/decision-protocol.d.ts +12 -0
- package/dist/content/decision-protocol.js +20 -0
- package/dist/content/diff-command.d.ts +1 -2
- package/dist/content/diff-command.js +8 -94
- package/dist/content/examples.d.ts +4 -10
- package/dist/content/examples.js +10 -20
- package/dist/content/hook-events.js +2 -2
- package/dist/content/hook-inline-snippets.d.ts +5 -2
- package/dist/content/hook-inline-snippets.js +33 -1
- package/dist/content/hook-manifest.d.ts +3 -4
- package/dist/content/hook-manifest.js +11 -12
- package/dist/content/hooks.js +44 -21
- package/dist/content/ideate-command.d.ts +2 -0
- package/dist/content/ideate-command.js +34 -25
- package/dist/content/iron-laws.d.ts +5 -5
- package/dist/content/iron-laws.js +5 -5
- package/dist/content/language-policy.d.ts +2 -0
- package/dist/content/language-policy.js +13 -0
- package/dist/content/learnings.d.ts +3 -4
- package/dist/content/learnings.js +26 -50
- package/dist/content/meta-skill.js +33 -22
- package/dist/content/next-command.js +41 -38
- package/dist/content/node-hooks.js +17 -345
- package/dist/content/opencode-plugin.js +5 -103
- package/dist/content/research-playbooks.js +14 -14
- package/dist/content/review-loop.d.ts +2 -0
- package/dist/content/review-loop.js +8 -0
- package/dist/content/session-hooks.js +15 -47
- package/dist/content/skills.d.ts +0 -5
- package/dist/content/skills.js +55 -128
- package/dist/content/stage-common-guidance.d.ts +0 -1
- package/dist/content/stage-common-guidance.js +17 -14
- package/dist/content/stage-schema.d.ts +26 -1
- package/dist/content/stage-schema.js +121 -40
- package/dist/content/stages/_lint-metadata/index.js +9 -15
- package/dist/content/stages/brainstorm.js +22 -43
- package/dist/content/stages/design.js +37 -57
- package/dist/content/stages/plan.js +22 -13
- package/dist/content/stages/review.js +24 -27
- package/dist/content/stages/scope.js +34 -46
- package/dist/content/stages/ship.js +7 -4
- package/dist/content/stages/spec.js +20 -9
- package/dist/content/stages/tdd.js +64 -44
- package/dist/content/start-command.js +13 -12
- package/dist/content/status-command.d.ts +2 -7
- package/dist/content/status-command.js +19 -146
- package/dist/content/subagents.d.ts +0 -5
- package/dist/content/subagents.js +51 -28
- package/dist/content/templates.d.ts +1 -1
- package/dist/content/templates.js +126 -135
- package/dist/content/track-render-context.d.ts +17 -0
- package/dist/content/track-render-context.js +44 -0
- package/dist/content/tree-command.d.ts +1 -2
- package/dist/content/tree-command.js +4 -87
- package/dist/content/utility-skills.d.ts +2 -29
- package/dist/content/utility-skills.js +2 -1534
- package/dist/content/view-command.js +31 -11
- package/dist/delegation.d.ts +1 -1
- package/dist/delegation.js +5 -15
- package/dist/doctor-registry.js +20 -21
- package/dist/doctor.js +88 -344
- package/dist/flow-state.d.ts +3 -0
- package/dist/flow-state.js +2 -0
- package/dist/harness-adapters.d.ts +1 -1
- package/dist/harness-adapters.js +51 -58
- package/dist/install.js +128 -358
- package/dist/internal/advance-stage.js +3 -9
- package/dist/internal/compound-readiness.d.ts +1 -1
- package/dist/internal/compound-readiness.js +1 -1
- package/dist/internal/tdd-loop-status.d.ts +1 -1
- package/dist/internal/tdd-loop-status.js +1 -1
- package/dist/knowledge-store.d.ts +16 -10
- package/dist/knowledge-store.js +51 -15
- package/dist/policy.js +16 -105
- package/dist/run-archive.d.ts +4 -6
- package/dist/run-archive.js +15 -20
- package/dist/run-persistence.d.ts +2 -2
- package/dist/run-persistence.js +3 -9
- package/package.json +1 -2
- package/dist/content/archive-command.d.ts +0 -2
- package/dist/content/archive-command.js +0 -124
- package/dist/content/compound-command.d.ts +0 -5
- package/dist/content/compound-command.js +0 -193
- package/dist/content/contexts.d.ts +0 -18
- package/dist/content/contexts.js +0 -24
- package/dist/content/contracts.d.ts +0 -2
- package/dist/content/contracts.js +0 -51
- package/dist/content/doctor-references.d.ts +0 -2
- package/dist/content/doctor-references.js +0 -150
- package/dist/content/eval-scaffold.d.ts +0 -15
- package/dist/content/eval-scaffold.js +0 -370
- package/dist/content/feature-command.d.ts +0 -2
- package/dist/content/feature-command.js +0 -123
- package/dist/content/flow-map.d.ts +0 -23
- package/dist/content/flow-map.js +0 -134
- package/dist/content/harness-doc.d.ts +0 -2
- package/dist/content/harness-doc.js +0 -202
- package/dist/content/harness-playbooks.d.ts +0 -24
- package/dist/content/harness-playbooks.js +0 -393
- package/dist/content/harness-tool-refs.d.ts +0 -20
- package/dist/content/harness-tool-refs.js +0 -268
- package/dist/content/ops-command.d.ts +0 -2
- package/dist/content/ops-command.js +0 -71
- package/dist/content/protocols.d.ts +0 -7
- package/dist/content/protocols.js +0 -215
- package/dist/content/retro-command.d.ts +0 -2
- package/dist/content/retro-command.js +0 -165
- package/dist/content/rewind-command.d.ts +0 -2
- package/dist/content/rewind-command.js +0 -106
- package/dist/content/tdd-log-command.d.ts +0 -2
- package/dist/content/tdd-log-command.js +0 -85
- package/dist/eval/agents/single-shot.d.ts +0 -27
- package/dist/eval/agents/single-shot.js +0 -79
- package/dist/eval/agents/with-tools.d.ts +0 -44
- package/dist/eval/agents/with-tools.js +0 -261
- package/dist/eval/agents/workflow.d.ts +0 -31
- package/dist/eval/agents/workflow.js +0 -155
- package/dist/eval/baseline.d.ts +0 -38
- package/dist/eval/baseline.js +0 -282
- package/dist/eval/config-loader.d.ts +0 -14
- package/dist/eval/config-loader.js +0 -395
- package/dist/eval/corpus.d.ts +0 -30
- package/dist/eval/corpus.js +0 -330
- package/dist/eval/cost-guard.d.ts +0 -102
- package/dist/eval/cost-guard.js +0 -190
- package/dist/eval/diff.d.ts +0 -64
- package/dist/eval/diff.js +0 -323
- package/dist/eval/llm-client.d.ts +0 -176
- package/dist/eval/llm-client.js +0 -267
- package/dist/eval/mode.d.ts +0 -28
- package/dist/eval/mode.js +0 -61
- package/dist/eval/progress.d.ts +0 -83
- package/dist/eval/progress.js +0 -59
- package/dist/eval/report.d.ts +0 -11
- package/dist/eval/report.js +0 -181
- package/dist/eval/rubric-loader.d.ts +0 -20
- package/dist/eval/rubric-loader.js +0 -143
- package/dist/eval/runner.d.ts +0 -81
- package/dist/eval/runner.js +0 -746
- package/dist/eval/runs.d.ts +0 -41
- package/dist/eval/runs.js +0 -114
- package/dist/eval/sandbox.d.ts +0 -38
- package/dist/eval/sandbox.js +0 -137
- package/dist/eval/tools/glob.d.ts +0 -2
- package/dist/eval/tools/glob.js +0 -163
- package/dist/eval/tools/grep.d.ts +0 -2
- package/dist/eval/tools/grep.js +0 -152
- package/dist/eval/tools/index.d.ts +0 -7
- package/dist/eval/tools/index.js +0 -35
- package/dist/eval/tools/read.d.ts +0 -2
- package/dist/eval/tools/read.js +0 -122
- package/dist/eval/tools/types.d.ts +0 -49
- package/dist/eval/tools/types.js +0 -41
- package/dist/eval/tools/write.d.ts +0 -2
- package/dist/eval/tools/write.js +0 -92
- package/dist/eval/types.d.ts +0 -561
- package/dist/eval/types.js +0 -47
- package/dist/eval/verifiers/judge.d.ts +0 -40
- package/dist/eval/verifiers/judge.js +0 -256
- package/dist/eval/verifiers/rules.d.ts +0 -24
- package/dist/eval/verifiers/rules.js +0 -218
- package/dist/eval/verifiers/structural.d.ts +0 -14
- package/dist/eval/verifiers/structural.js +0 -171
- package/dist/eval/verifiers/traceability.d.ts +0 -23
- package/dist/eval/verifiers/traceability.js +0 -84
- package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
- package/dist/eval/verifiers/workflow-consistency.js +0 -225
- package/dist/eval/workflow-corpus.d.ts +0 -7
- package/dist/eval/workflow-corpus.js +0 -207
- package/dist/feature-system.d.ts +0 -42
- package/dist/feature-system.js +0 -432
- package/dist/internal/knowledge-digest.d.ts +0 -7
- package/dist/internal/knowledge-digest.js +0 -93
package/dist/eval/report.js
DELETED
|
@@ -1,181 +0,0 @@
|
|
|
1
|
-
import path from "node:path";
|
|
2
|
-
import { EVALS_ROOT } from "../constants.js";
|
|
3
|
-
import { writeFileSafe } from "../fs-utils.js";
|
|
4
|
-
export function reportsDir(projectRoot) {
|
|
5
|
-
return path.join(projectRoot, EVALS_ROOT, "reports");
|
|
6
|
-
}
|
|
7
|
-
export function defaultReportBasename(report) {
|
|
8
|
-
const ts = report.generatedAt.replace(/[:.]/g, "-");
|
|
9
|
-
return `eval-${ts}-${report.runId.slice(0, 8)}`;
|
|
10
|
-
}
|
|
11
|
-
/**
|
|
12
|
-
* Format a report as a human-readable Markdown document. Keeping the layout
|
|
13
|
-
* stable matters: CI posts diffs against earlier reports, and unit tests use
|
|
14
|
-
* the output as a regression guard.
|
|
15
|
-
*/
|
|
16
|
-
export function formatMarkdownReport(report) {
|
|
17
|
-
const { summary } = report;
|
|
18
|
-
const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
|
|
19
|
-
const lines = [];
|
|
20
|
-
lines.push(`# cclaw eval report`);
|
|
21
|
-
lines.push(``);
|
|
22
|
-
lines.push(`- generated: ${report.generatedAt}`);
|
|
23
|
-
lines.push(`- runId: ${report.runId}`);
|
|
24
|
-
lines.push(`- cclaw version: ${report.cclawVersion}`);
|
|
25
|
-
lines.push(`- provider: ${report.provider}`);
|
|
26
|
-
lines.push(`- model: ${report.model}`);
|
|
27
|
-
lines.push(`- mode: ${report.mode}`);
|
|
28
|
-
lines.push(`- stages: ${stages}`);
|
|
29
|
-
lines.push(``);
|
|
30
|
-
lines.push(`## Summary`);
|
|
31
|
-
lines.push(``);
|
|
32
|
-
lines.push(`| metric | value |`);
|
|
33
|
-
lines.push(`| --- | --- |`);
|
|
34
|
-
lines.push(`| total cases | ${summary.totalCases} |`);
|
|
35
|
-
lines.push(`| passed | ${summary.passed} |`);
|
|
36
|
-
lines.push(`| failed | ${summary.failed} |`);
|
|
37
|
-
lines.push(`| skipped | ${summary.skipped} |`);
|
|
38
|
-
lines.push(`| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`);
|
|
39
|
-
lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
|
|
40
|
-
lines.push(``);
|
|
41
|
-
if (report.baselineDelta) {
|
|
42
|
-
const delta = report.baselineDelta;
|
|
43
|
-
lines.push(`## Baseline delta`);
|
|
44
|
-
lines.push(``);
|
|
45
|
-
lines.push(`- baseline: ${delta.baselineId}`);
|
|
46
|
-
lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
|
|
47
|
-
lines.push(`- critical failures: ${delta.criticalFailures}`);
|
|
48
|
-
lines.push(``);
|
|
49
|
-
if (delta.regressions.length > 0) {
|
|
50
|
-
lines.push(`### Regressions`);
|
|
51
|
-
lines.push(``);
|
|
52
|
-
lines.push(`| stage | case id | verifier | reason | prev | curr |`);
|
|
53
|
-
lines.push(`| --- | --- | --- | --- | --- | --- |`);
|
|
54
|
-
for (const reg of delta.regressions) {
|
|
55
|
-
const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
|
|
56
|
-
const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
|
|
57
|
-
lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
|
|
58
|
-
}
|
|
59
|
-
lines.push(``);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
if (report.cases.length === 0) {
|
|
63
|
-
lines.push(`## Cases`);
|
|
64
|
-
lines.push(``);
|
|
65
|
-
lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
|
|
66
|
-
lines.push(``);
|
|
67
|
-
return `${lines.join("\n")}\n`;
|
|
68
|
-
}
|
|
69
|
-
lines.push(`## Cases`);
|
|
70
|
-
lines.push(``);
|
|
71
|
-
lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`);
|
|
72
|
-
lines.push(`| --- | --- | --- | --- | --- |`);
|
|
73
|
-
for (const item of report.cases) {
|
|
74
|
-
const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
|
|
75
|
-
lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
|
|
76
|
-
}
|
|
77
|
-
lines.push(``);
|
|
78
|
-
const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
|
|
79
|
-
if (toolCases.length > 0) {
|
|
80
|
-
lines.push(`## Tool use`);
|
|
81
|
-
lines.push(``);
|
|
82
|
-
lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
|
|
83
|
-
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
84
|
-
for (const item of toolCases) {
|
|
85
|
-
const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
|
|
86
|
-
const toolUse = verifier?.details?.toolUse;
|
|
87
|
-
if (!toolUse)
|
|
88
|
-
continue;
|
|
89
|
-
const byTool = Object.entries(toolUse.byTool)
|
|
90
|
-
.map(([name, count]) => `${name}=${count}`)
|
|
91
|
-
.join(", ");
|
|
92
|
-
const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
|
|
93
|
-
lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
|
|
94
|
-
}
|
|
95
|
-
lines.push(``);
|
|
96
|
-
}
|
|
97
|
-
const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
|
|
98
|
-
if (judgeCases.length > 0) {
|
|
99
|
-
lines.push(`## Judge scores`);
|
|
100
|
-
lines.push(``);
|
|
101
|
-
lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
|
|
102
|
-
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
103
|
-
for (const item of judgeCases) {
|
|
104
|
-
for (const verifier of item.verifierResults) {
|
|
105
|
-
if (verifier.kind !== "judge")
|
|
106
|
-
continue;
|
|
107
|
-
if (verifier.id === "judge:required-checks")
|
|
108
|
-
continue;
|
|
109
|
-
if (verifier.id === "judge:rubric:missing")
|
|
110
|
-
continue;
|
|
111
|
-
if (verifier.id === "judge:invocation:error")
|
|
112
|
-
continue;
|
|
113
|
-
const details = verifier.details ?? {};
|
|
114
|
-
const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
|
|
115
|
-
const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
|
|
116
|
-
const coverage = details.coverage === true ? "yes" : "no";
|
|
117
|
-
const checkId = verifier.id.replace(/^judge:/, "");
|
|
118
|
-
lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
lines.push(``);
|
|
122
|
-
}
|
|
123
|
-
const workflowCases = report.cases.filter((item) => !!item.workflow);
|
|
124
|
-
if (workflowCases.length > 0) {
|
|
125
|
-
lines.push(`## Workflow stages`);
|
|
126
|
-
lines.push(``);
|
|
127
|
-
lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
|
|
128
|
-
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
129
|
-
for (const item of workflowCases) {
|
|
130
|
-
const wf = item.workflow;
|
|
131
|
-
for (const stage of wf.stages) {
|
|
132
|
-
const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
|
|
133
|
-
const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
|
|
134
|
-
lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
|
|
135
|
-
`${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
lines.push(``);
|
|
139
|
-
}
|
|
140
|
-
const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
|
|
141
|
-
if (consistencyCases.length > 0) {
|
|
142
|
-
lines.push(`## Consistency checks`);
|
|
143
|
-
lines.push(``);
|
|
144
|
-
lines.push(`| case id | check id | ok | message |`);
|
|
145
|
-
lines.push(`| --- | --- | --- | --- |`);
|
|
146
|
-
for (const item of consistencyCases) {
|
|
147
|
-
for (const verifier of item.verifierResults) {
|
|
148
|
-
if (verifier.kind !== "consistency")
|
|
149
|
-
continue;
|
|
150
|
-
const message = verifier.message
|
|
151
|
-
? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
|
|
152
|
-
: "-";
|
|
153
|
-
lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
lines.push(``);
|
|
157
|
-
}
|
|
158
|
-
lines.push(`## Verifier details`);
|
|
159
|
-
lines.push(``);
|
|
160
|
-
for (const item of report.cases) {
|
|
161
|
-
lines.push(`### ${item.stage} / ${item.caseId}`);
|
|
162
|
-
lines.push(``);
|
|
163
|
-
for (const verifier of item.verifierResults) {
|
|
164
|
-
const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
|
|
165
|
-
lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}` +
|
|
166
|
-
(verifier.message ? ` — ${verifier.message}` : ""));
|
|
167
|
-
}
|
|
168
|
-
lines.push(``);
|
|
169
|
-
}
|
|
170
|
-
return `${lines.join("\n")}\n`;
|
|
171
|
-
}
|
|
172
|
-
export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
|
|
173
|
-
const outPath = path.join(reportsDir(projectRoot), `${basename}.json`);
|
|
174
|
-
await writeFileSafe(outPath, `${JSON.stringify(report, null, 2)}\n`);
|
|
175
|
-
return outPath;
|
|
176
|
-
}
|
|
177
|
-
export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
|
|
178
|
-
const outPath = path.join(reportsDir(projectRoot), `${basename}.md`);
|
|
179
|
-
await writeFileSafe(outPath, formatMarkdownReport(report));
|
|
180
|
-
return outPath;
|
|
181
|
-
}
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
import type { FlowStage } from "../types.js";
|
|
2
|
-
import type { RubricCheck, RubricDoc } from "./types.js";
|
|
3
|
-
export declare function rubricsDir(projectRoot: string): string;
|
|
4
|
-
export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
|
|
5
|
-
declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
|
|
6
|
-
declare function validateRubric(raw: unknown, file: string): RubricDoc;
|
|
7
|
-
/**
|
|
8
|
-
* Load the rubric for `stage`. Returns `undefined` when the file is
|
|
9
|
-
* missing so callers can emit a "no rubric" verifier result rather than
|
|
10
|
-
* crashing — authors are expected to grow rubrics incrementally.
|
|
11
|
-
*/
|
|
12
|
-
export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
|
|
13
|
-
/** Load every rubric present in the given rubrics directory. */
|
|
14
|
-
export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
|
|
15
|
-
/** Exposed for tests. */
|
|
16
|
-
export declare const __internal: {
|
|
17
|
-
validateRubric: typeof validateRubric;
|
|
18
|
-
validateCheck: typeof validateCheck;
|
|
19
|
-
};
|
|
20
|
-
export {};
|
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
3
|
-
*
|
|
4
|
-
* Each file maps to exactly one `RubricDoc` that drives the LLM judge.
|
|
5
|
-
* Validation is strict: unknown top-level keys, missing required fields,
|
|
6
|
-
* duplicate check ids, and malformed weights all surface as actionable
|
|
7
|
-
* errors rather than turning into silent "judge had nothing to score"
|
|
8
|
-
* passes.
|
|
9
|
-
*/
|
|
10
|
-
import fs from "node:fs/promises";
|
|
11
|
-
import path from "node:path";
|
|
12
|
-
import { parse } from "yaml";
|
|
13
|
-
import { EVALS_ROOT } from "../constants.js";
|
|
14
|
-
import { exists } from "../fs-utils.js";
|
|
15
|
-
import { FLOW_STAGES } from "../types.js";
|
|
16
|
-
export function rubricsDir(projectRoot) {
|
|
17
|
-
return path.join(projectRoot, EVALS_ROOT, "rubrics");
|
|
18
|
-
}
|
|
19
|
-
export function rubricPath(projectRoot, stage) {
|
|
20
|
-
return path.join(rubricsDir(projectRoot), `${stage}.yaml`);
|
|
21
|
-
}
|
|
22
|
-
function rubricError(file, reason) {
|
|
23
|
-
return new Error(`Invalid rubric at ${file}: ${reason}\n` +
|
|
24
|
-
`See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`);
|
|
25
|
-
}
|
|
26
|
-
function isRecord(value) {
|
|
27
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
28
|
-
}
|
|
29
|
-
function validateCheck(raw, index, file) {
|
|
30
|
-
if (!isRecord(raw)) {
|
|
31
|
-
throw rubricError(file, `checks[${index}] must be a mapping`);
|
|
32
|
-
}
|
|
33
|
-
const id = raw.id;
|
|
34
|
-
if (typeof id !== "string" || id.trim().length === 0) {
|
|
35
|
-
throw rubricError(file, `checks[${index}].id must be a non-empty string`);
|
|
36
|
-
}
|
|
37
|
-
if (!/^[a-z][a-z0-9-]*$/.test(id)) {
|
|
38
|
-
throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
|
|
39
|
-
}
|
|
40
|
-
const prompt = raw.prompt;
|
|
41
|
-
if (typeof prompt !== "string" || prompt.trim().length === 0) {
|
|
42
|
-
throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
|
|
43
|
-
}
|
|
44
|
-
const check = {
|
|
45
|
-
id,
|
|
46
|
-
prompt: prompt.trim()
|
|
47
|
-
};
|
|
48
|
-
if (raw.scale !== undefined) {
|
|
49
|
-
if (typeof raw.scale !== "string" || raw.scale.trim().length === 0) {
|
|
50
|
-
throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
|
|
51
|
-
}
|
|
52
|
-
check.scale = raw.scale.trim();
|
|
53
|
-
}
|
|
54
|
-
if (raw.weight !== undefined) {
|
|
55
|
-
if (typeof raw.weight !== "number" || !Number.isFinite(raw.weight) || raw.weight < 0) {
|
|
56
|
-
throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
|
|
57
|
-
}
|
|
58
|
-
check.weight = raw.weight;
|
|
59
|
-
}
|
|
60
|
-
if (raw.critical !== undefined) {
|
|
61
|
-
if (typeof raw.critical !== "boolean") {
|
|
62
|
-
throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
|
|
63
|
-
}
|
|
64
|
-
check.critical = raw.critical;
|
|
65
|
-
}
|
|
66
|
-
const known = new Set(["id", "prompt", "scale", "weight", "critical"]);
|
|
67
|
-
const unknown = Object.keys(raw).filter((key) => !known.has(key));
|
|
68
|
-
if (unknown.length > 0) {
|
|
69
|
-
throw rubricError(file, `checks[${index}] has unknown key(s): ${unknown.join(", ")}`);
|
|
70
|
-
}
|
|
71
|
-
return check;
|
|
72
|
-
}
|
|
73
|
-
function validateRubric(raw, file) {
|
|
74
|
-
if (!isRecord(raw)) {
|
|
75
|
-
throw rubricError(file, "top-level value must be a mapping");
|
|
76
|
-
}
|
|
77
|
-
const stage = raw.stage;
|
|
78
|
-
if (typeof stage !== "string" || !FLOW_STAGES.includes(stage)) {
|
|
79
|
-
throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
|
|
80
|
-
}
|
|
81
|
-
const id = raw.id;
|
|
82
|
-
let rubricId = stage;
|
|
83
|
-
if (id !== undefined) {
|
|
84
|
-
if (typeof id !== "string" || id.trim().length === 0) {
|
|
85
|
-
throw rubricError(file, `"id" must be a non-empty string when provided`);
|
|
86
|
-
}
|
|
87
|
-
rubricId = id.trim();
|
|
88
|
-
}
|
|
89
|
-
const checks = raw.checks;
|
|
90
|
-
if (!Array.isArray(checks) || checks.length === 0) {
|
|
91
|
-
throw rubricError(file, `"checks" must be a non-empty array`);
|
|
92
|
-
}
|
|
93
|
-
const parsed = [];
|
|
94
|
-
const seen = new Set();
|
|
95
|
-
for (let i = 0; i < checks.length; i += 1) {
|
|
96
|
-
const check = validateCheck(checks[i], i, file);
|
|
97
|
-
if (seen.has(check.id)) {
|
|
98
|
-
throw rubricError(file, `duplicate check id: "${check.id}"`);
|
|
99
|
-
}
|
|
100
|
-
seen.add(check.id);
|
|
101
|
-
parsed.push(check);
|
|
102
|
-
}
|
|
103
|
-
const known = new Set(["stage", "id", "checks"]);
|
|
104
|
-
const unknown = Object.keys(raw).filter((key) => !known.has(key));
|
|
105
|
-
if (unknown.length > 0) {
|
|
106
|
-
throw rubricError(file, `unknown top-level key(s): ${unknown.join(", ")}`);
|
|
107
|
-
}
|
|
108
|
-
return {
|
|
109
|
-
stage: stage,
|
|
110
|
-
id: rubricId,
|
|
111
|
-
checks: parsed
|
|
112
|
-
};
|
|
113
|
-
}
|
|
114
|
-
/**
|
|
115
|
-
* Load the rubric for `stage`. Returns `undefined` when the file is
|
|
116
|
-
* missing so callers can emit a "no rubric" verifier result rather than
|
|
117
|
-
* crashing — authors are expected to grow rubrics incrementally.
|
|
118
|
-
*/
|
|
119
|
-
export async function loadRubric(projectRoot, stage) {
|
|
120
|
-
const file = rubricPath(projectRoot, stage);
|
|
121
|
-
if (!(await exists(file)))
|
|
122
|
-
return undefined;
|
|
123
|
-
let parsed;
|
|
124
|
-
try {
|
|
125
|
-
parsed = parse(await fs.readFile(file, "utf8"));
|
|
126
|
-
}
|
|
127
|
-
catch (err) {
|
|
128
|
-
throw rubricError(file, err instanceof Error ? err.message : String(err));
|
|
129
|
-
}
|
|
130
|
-
return validateRubric(parsed, file);
|
|
131
|
-
}
|
|
132
|
-
/** Load every rubric present in the given rubrics directory. */
|
|
133
|
-
export async function loadAllRubrics(projectRoot) {
|
|
134
|
-
const out = new Map();
|
|
135
|
-
for (const stage of FLOW_STAGES) {
|
|
136
|
-
const doc = await loadRubric(projectRoot, stage);
|
|
137
|
-
if (doc)
|
|
138
|
-
out.set(stage, doc);
|
|
139
|
-
}
|
|
140
|
-
return out;
|
|
141
|
-
}
|
|
142
|
-
/** Exposed for tests. */
|
|
143
|
-
export const __internal = { validateRubric, validateCheck };
|
package/dist/eval/runner.d.ts
DELETED
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import type { FlowStage } from "../types.js";
|
|
2
|
-
import { type EvalLlmClient } from "./llm-client.js";
|
|
3
|
-
import { type ProgressLogger } from "./progress.js";
|
|
4
|
-
import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
|
|
5
|
-
export interface RunEvalOptions {
|
|
6
|
-
projectRoot: string;
|
|
7
|
-
stage?: FlowStage;
|
|
8
|
-
mode?: EvalMode;
|
|
9
|
-
/** When true, run only structural verifiers (Step 1). */
|
|
10
|
-
schemaOnly?: boolean;
|
|
11
|
-
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
12
|
-
rules?: boolean;
|
|
13
|
-
/** When true, also run LLM judge verifiers. Step 3 wires judging. */
|
|
14
|
-
judge?: boolean;
|
|
15
|
-
/** When true, load config + corpus and return a summary without running any verifier. */
|
|
16
|
-
dryRun?: boolean;
|
|
17
|
-
/** Override process.env during tests. */
|
|
18
|
-
env?: NodeJS.ProcessEnv;
|
|
19
|
-
/**
|
|
20
|
-
* Optional LLM client injection. Primary use case: unit and
|
|
21
|
-
* integration tests that want deterministic judge + agent behavior
|
|
22
|
-
* without hitting the network.
|
|
23
|
-
*/
|
|
24
|
-
llmClient?: EvalLlmClient;
|
|
25
|
-
/**
|
|
26
|
-
* Optional progress logger. The CLI wires a stderr-backed logger by
|
|
27
|
-
* default so users see one-line updates during long runs; tests and
|
|
28
|
-
* programmatic callers can inject a silent (noop) logger or capture
|
|
29
|
-
* events for assertions. When omitted, progress is silenced.
|
|
30
|
-
*/
|
|
31
|
-
progress?: ProgressLogger;
|
|
32
|
-
/**
|
|
33
|
-
* Per-run USD cap. Enforced in-memory; independent from the daily cap
|
|
34
|
-
* (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
|
|
35
|
-
* invocations. Undefined means no cap.
|
|
36
|
-
*/
|
|
37
|
-
maxCostUsd?: number;
|
|
38
|
-
/**
|
|
39
|
-
* Override the configured `model` (and `judgeModel`) for this run.
|
|
40
|
-
* Used by `cclaw eval --compare-model` to replay the same corpus
|
|
41
|
-
* against an alternative model without editing `config.yaml`.
|
|
42
|
-
*/
|
|
43
|
-
modelOverride?: string;
|
|
44
|
-
}
|
|
45
|
-
export interface DryRunSummary {
|
|
46
|
-
kind: "dry-run";
|
|
47
|
-
config: ResolvedEvalConfig;
|
|
48
|
-
corpus: {
|
|
49
|
-
total: number;
|
|
50
|
-
byStage: Record<string, number>;
|
|
51
|
-
cases: Array<{
|
|
52
|
-
id: string;
|
|
53
|
-
stage: FlowStage;
|
|
54
|
-
}>;
|
|
55
|
-
};
|
|
56
|
-
/** Only populated in `workflow` mode; empty for fixture / agent modes. */
|
|
57
|
-
workflowCorpus: {
|
|
58
|
-
total: number;
|
|
59
|
-
cases: Array<{
|
|
60
|
-
id: string;
|
|
61
|
-
stages: WorkflowStageName[];
|
|
62
|
-
}>;
|
|
63
|
-
};
|
|
64
|
-
plannedMode: EvalMode;
|
|
65
|
-
verifiersAvailable: {
|
|
66
|
-
structural: boolean;
|
|
67
|
-
rules: boolean;
|
|
68
|
-
judge: boolean;
|
|
69
|
-
workflow: boolean;
|
|
70
|
-
consistency: boolean;
|
|
71
|
-
};
|
|
72
|
-
notes: string[];
|
|
73
|
-
}
|
|
74
|
-
/**
|
|
75
|
-
* Main eval runner. Dispatches between fixture-backed verification, the
|
|
76
|
-
* single-stage agent-with-tools loop, and the multi-stage workflow
|
|
77
|
-
* orchestrator based on `options.mode`. Per-stage baselines are loaded for
|
|
78
|
-
* regression comparison. Cases without a `fixture` path in the yaml are
|
|
79
|
-
* marked skipped (not failed) when no LLM drafting runs.
|
|
80
|
-
*/
|
|
81
|
-
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|