cclaw-cli 0.49.0 → 0.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. package/README.md +57 -84
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +55 -17
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +44 -21
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +34 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/language-policy.d.ts +2 -0
  28. package/dist/content/language-policy.js +13 -0
  29. package/dist/content/learnings.d.ts +3 -4
  30. package/dist/content/learnings.js +26 -50
  31. package/dist/content/meta-skill.js +33 -22
  32. package/dist/content/next-command.js +41 -38
  33. package/dist/content/node-hooks.js +17 -345
  34. package/dist/content/opencode-plugin.js +5 -103
  35. package/dist/content/research-playbooks.js +14 -14
  36. package/dist/content/review-loop.d.ts +2 -0
  37. package/dist/content/review-loop.js +8 -0
  38. package/dist/content/session-hooks.js +15 -47
  39. package/dist/content/skills.d.ts +0 -5
  40. package/dist/content/skills.js +55 -128
  41. package/dist/content/stage-common-guidance.d.ts +0 -1
  42. package/dist/content/stage-common-guidance.js +17 -14
  43. package/dist/content/stage-schema.d.ts +26 -1
  44. package/dist/content/stage-schema.js +121 -40
  45. package/dist/content/stages/_lint-metadata/index.js +9 -15
  46. package/dist/content/stages/brainstorm.js +22 -43
  47. package/dist/content/stages/design.js +37 -57
  48. package/dist/content/stages/plan.js +22 -13
  49. package/dist/content/stages/review.js +24 -27
  50. package/dist/content/stages/scope.js +34 -46
  51. package/dist/content/stages/ship.js +7 -4
  52. package/dist/content/stages/spec.js +20 -9
  53. package/dist/content/stages/tdd.js +64 -44
  54. package/dist/content/start-command.js +13 -12
  55. package/dist/content/status-command.d.ts +2 -7
  56. package/dist/content/status-command.js +19 -146
  57. package/dist/content/subagents.d.ts +0 -5
  58. package/dist/content/subagents.js +51 -28
  59. package/dist/content/templates.d.ts +1 -1
  60. package/dist/content/templates.js +126 -135
  61. package/dist/content/track-render-context.d.ts +17 -0
  62. package/dist/content/track-render-context.js +44 -0
  63. package/dist/content/tree-command.d.ts +1 -2
  64. package/dist/content/tree-command.js +4 -87
  65. package/dist/content/utility-skills.d.ts +2 -29
  66. package/dist/content/utility-skills.js +2 -1534
  67. package/dist/content/view-command.js +31 -11
  68. package/dist/delegation.d.ts +1 -1
  69. package/dist/delegation.js +5 -15
  70. package/dist/doctor-registry.js +20 -21
  71. package/dist/doctor.js +88 -344
  72. package/dist/flow-state.d.ts +3 -0
  73. package/dist/flow-state.js +2 -0
  74. package/dist/harness-adapters.d.ts +1 -1
  75. package/dist/harness-adapters.js +51 -58
  76. package/dist/install.js +128 -358
  77. package/dist/internal/advance-stage.js +3 -9
  78. package/dist/internal/compound-readiness.d.ts +1 -1
  79. package/dist/internal/compound-readiness.js +1 -1
  80. package/dist/internal/tdd-loop-status.d.ts +1 -1
  81. package/dist/internal/tdd-loop-status.js +1 -1
  82. package/dist/knowledge-store.d.ts +16 -10
  83. package/dist/knowledge-store.js +51 -15
  84. package/dist/policy.js +16 -105
  85. package/dist/run-archive.d.ts +4 -6
  86. package/dist/run-archive.js +15 -20
  87. package/dist/run-persistence.d.ts +2 -2
  88. package/dist/run-persistence.js +3 -9
  89. package/package.json +1 -2
  90. package/dist/content/archive-command.d.ts +0 -2
  91. package/dist/content/archive-command.js +0 -124
  92. package/dist/content/compound-command.d.ts +0 -5
  93. package/dist/content/compound-command.js +0 -193
  94. package/dist/content/contexts.d.ts +0 -18
  95. package/dist/content/contexts.js +0 -24
  96. package/dist/content/contracts.d.ts +0 -2
  97. package/dist/content/contracts.js +0 -51
  98. package/dist/content/doctor-references.d.ts +0 -2
  99. package/dist/content/doctor-references.js +0 -150
  100. package/dist/content/eval-scaffold.d.ts +0 -15
  101. package/dist/content/eval-scaffold.js +0 -370
  102. package/dist/content/feature-command.d.ts +0 -2
  103. package/dist/content/feature-command.js +0 -123
  104. package/dist/content/flow-map.d.ts +0 -23
  105. package/dist/content/flow-map.js +0 -134
  106. package/dist/content/harness-doc.d.ts +0 -2
  107. package/dist/content/harness-doc.js +0 -202
  108. package/dist/content/harness-playbooks.d.ts +0 -24
  109. package/dist/content/harness-playbooks.js +0 -393
  110. package/dist/content/harness-tool-refs.d.ts +0 -20
  111. package/dist/content/harness-tool-refs.js +0 -268
  112. package/dist/content/ops-command.d.ts +0 -2
  113. package/dist/content/ops-command.js +0 -71
  114. package/dist/content/protocols.d.ts +0 -7
  115. package/dist/content/protocols.js +0 -215
  116. package/dist/content/retro-command.d.ts +0 -2
  117. package/dist/content/retro-command.js +0 -165
  118. package/dist/content/rewind-command.d.ts +0 -2
  119. package/dist/content/rewind-command.js +0 -106
  120. package/dist/content/tdd-log-command.d.ts +0 -2
  121. package/dist/content/tdd-log-command.js +0 -85
  122. package/dist/eval/agents/single-shot.d.ts +0 -27
  123. package/dist/eval/agents/single-shot.js +0 -79
  124. package/dist/eval/agents/with-tools.d.ts +0 -44
  125. package/dist/eval/agents/with-tools.js +0 -261
  126. package/dist/eval/agents/workflow.d.ts +0 -31
  127. package/dist/eval/agents/workflow.js +0 -155
  128. package/dist/eval/baseline.d.ts +0 -38
  129. package/dist/eval/baseline.js +0 -282
  130. package/dist/eval/config-loader.d.ts +0 -14
  131. package/dist/eval/config-loader.js +0 -395
  132. package/dist/eval/corpus.d.ts +0 -30
  133. package/dist/eval/corpus.js +0 -330
  134. package/dist/eval/cost-guard.d.ts +0 -102
  135. package/dist/eval/cost-guard.js +0 -190
  136. package/dist/eval/diff.d.ts +0 -64
  137. package/dist/eval/diff.js +0 -323
  138. package/dist/eval/llm-client.d.ts +0 -176
  139. package/dist/eval/llm-client.js +0 -267
  140. package/dist/eval/mode.d.ts +0 -28
  141. package/dist/eval/mode.js +0 -61
  142. package/dist/eval/progress.d.ts +0 -83
  143. package/dist/eval/progress.js +0 -59
  144. package/dist/eval/report.d.ts +0 -11
  145. package/dist/eval/report.js +0 -181
  146. package/dist/eval/rubric-loader.d.ts +0 -20
  147. package/dist/eval/rubric-loader.js +0 -143
  148. package/dist/eval/runner.d.ts +0 -81
  149. package/dist/eval/runner.js +0 -746
  150. package/dist/eval/runs.d.ts +0 -41
  151. package/dist/eval/runs.js +0 -114
  152. package/dist/eval/sandbox.d.ts +0 -38
  153. package/dist/eval/sandbox.js +0 -137
  154. package/dist/eval/tools/glob.d.ts +0 -2
  155. package/dist/eval/tools/glob.js +0 -163
  156. package/dist/eval/tools/grep.d.ts +0 -2
  157. package/dist/eval/tools/grep.js +0 -152
  158. package/dist/eval/tools/index.d.ts +0 -7
  159. package/dist/eval/tools/index.js +0 -35
  160. package/dist/eval/tools/read.d.ts +0 -2
  161. package/dist/eval/tools/read.js +0 -122
  162. package/dist/eval/tools/types.d.ts +0 -49
  163. package/dist/eval/tools/types.js +0 -41
  164. package/dist/eval/tools/write.d.ts +0 -2
  165. package/dist/eval/tools/write.js +0 -92
  166. package/dist/eval/types.d.ts +0 -561
  167. package/dist/eval/types.js +0 -47
  168. package/dist/eval/verifiers/judge.d.ts +0 -40
  169. package/dist/eval/verifiers/judge.js +0 -256
  170. package/dist/eval/verifiers/rules.d.ts +0 -24
  171. package/dist/eval/verifiers/rules.js +0 -218
  172. package/dist/eval/verifiers/structural.d.ts +0 -14
  173. package/dist/eval/verifiers/structural.js +0 -171
  174. package/dist/eval/verifiers/traceability.d.ts +0 -23
  175. package/dist/eval/verifiers/traceability.js +0 -84
  176. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  177. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  178. package/dist/eval/workflow-corpus.d.ts +0 -7
  179. package/dist/eval/workflow-corpus.js +0 -207
  180. package/dist/feature-system.d.ts +0 -42
  181. package/dist/feature-system.js +0 -432
  182. package/dist/internal/knowledge-digest.d.ts +0 -7
  183. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,181 +0,0 @@
1
- import path from "node:path";
2
- import { EVALS_ROOT } from "../constants.js";
3
- import { writeFileSafe } from "../fs-utils.js";
4
- export function reportsDir(projectRoot) {
5
- return path.join(projectRoot, EVALS_ROOT, "reports");
6
- }
7
- export function defaultReportBasename(report) {
8
- const ts = report.generatedAt.replace(/[:.]/g, "-");
9
- return `eval-${ts}-${report.runId.slice(0, 8)}`;
10
- }
11
- /**
12
- * Format a report as a human-readable Markdown document. Keeping the layout
13
- * stable matters: CI posts diffs against earlier reports, and unit tests use
14
- * the output as a regression guard.
15
- */
16
- export function formatMarkdownReport(report) {
17
- const { summary } = report;
18
- const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
19
- const lines = [];
20
- lines.push(`# cclaw eval report`);
21
- lines.push(``);
22
- lines.push(`- generated: ${report.generatedAt}`);
23
- lines.push(`- runId: ${report.runId}`);
24
- lines.push(`- cclaw version: ${report.cclawVersion}`);
25
- lines.push(`- provider: ${report.provider}`);
26
- lines.push(`- model: ${report.model}`);
27
- lines.push(`- mode: ${report.mode}`);
28
- lines.push(`- stages: ${stages}`);
29
- lines.push(``);
30
- lines.push(`## Summary`);
31
- lines.push(``);
32
- lines.push(`| metric | value |`);
33
- lines.push(`| --- | --- |`);
34
- lines.push(`| total cases | ${summary.totalCases} |`);
35
- lines.push(`| passed | ${summary.passed} |`);
36
- lines.push(`| failed | ${summary.failed} |`);
37
- lines.push(`| skipped | ${summary.skipped} |`);
38
- lines.push(`| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`);
39
- lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
40
- lines.push(``);
41
- if (report.baselineDelta) {
42
- const delta = report.baselineDelta;
43
- lines.push(`## Baseline delta`);
44
- lines.push(``);
45
- lines.push(`- baseline: ${delta.baselineId}`);
46
- lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
47
- lines.push(`- critical failures: ${delta.criticalFailures}`);
48
- lines.push(``);
49
- if (delta.regressions.length > 0) {
50
- lines.push(`### Regressions`);
51
- lines.push(``);
52
- lines.push(`| stage | case id | verifier | reason | prev | curr |`);
53
- lines.push(`| --- | --- | --- | --- | --- | --- |`);
54
- for (const reg of delta.regressions) {
55
- const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
56
- const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
57
- lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
58
- }
59
- lines.push(``);
60
- }
61
- }
62
- if (report.cases.length === 0) {
63
- lines.push(`## Cases`);
64
- lines.push(``);
65
- lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
66
- lines.push(``);
67
- return `${lines.join("\n")}\n`;
68
- }
69
- lines.push(`## Cases`);
70
- lines.push(``);
71
- lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`);
72
- lines.push(`| --- | --- | --- | --- | --- |`);
73
- for (const item of report.cases) {
74
- const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
75
- lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
76
- }
77
- lines.push(``);
78
- const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
79
- if (toolCases.length > 0) {
80
- lines.push(`## Tool use`);
81
- lines.push(``);
82
- lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
83
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
84
- for (const item of toolCases) {
85
- const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
86
- const toolUse = verifier?.details?.toolUse;
87
- if (!toolUse)
88
- continue;
89
- const byTool = Object.entries(toolUse.byTool)
90
- .map(([name, count]) => `${name}=${count}`)
91
- .join(", ");
92
- const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
93
- lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
94
- }
95
- lines.push(``);
96
- }
97
- const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
98
- if (judgeCases.length > 0) {
99
- lines.push(`## Judge scores`);
100
- lines.push(``);
101
- lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
102
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
103
- for (const item of judgeCases) {
104
- for (const verifier of item.verifierResults) {
105
- if (verifier.kind !== "judge")
106
- continue;
107
- if (verifier.id === "judge:required-checks")
108
- continue;
109
- if (verifier.id === "judge:rubric:missing")
110
- continue;
111
- if (verifier.id === "judge:invocation:error")
112
- continue;
113
- const details = verifier.details ?? {};
114
- const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
115
- const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
116
- const coverage = details.coverage === true ? "yes" : "no";
117
- const checkId = verifier.id.replace(/^judge:/, "");
118
- lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
119
- }
120
- }
121
- lines.push(``);
122
- }
123
- const workflowCases = report.cases.filter((item) => !!item.workflow);
124
- if (workflowCases.length > 0) {
125
- lines.push(`## Workflow stages`);
126
- lines.push(``);
127
- lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
128
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
129
- for (const item of workflowCases) {
130
- const wf = item.workflow;
131
- for (const stage of wf.stages) {
132
- const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
133
- const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
134
- lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
135
- `${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
136
- }
137
- }
138
- lines.push(``);
139
- }
140
- const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
141
- if (consistencyCases.length > 0) {
142
- lines.push(`## Consistency checks`);
143
- lines.push(``);
144
- lines.push(`| case id | check id | ok | message |`);
145
- lines.push(`| --- | --- | --- | --- |`);
146
- for (const item of consistencyCases) {
147
- for (const verifier of item.verifierResults) {
148
- if (verifier.kind !== "consistency")
149
- continue;
150
- const message = verifier.message
151
- ? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
152
- : "-";
153
- lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
154
- }
155
- }
156
- lines.push(``);
157
- }
158
- lines.push(`## Verifier details`);
159
- lines.push(``);
160
- for (const item of report.cases) {
161
- lines.push(`### ${item.stage} / ${item.caseId}`);
162
- lines.push(``);
163
- for (const verifier of item.verifierResults) {
164
- const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
165
- lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}` +
166
- (verifier.message ? ` — ${verifier.message}` : ""));
167
- }
168
- lines.push(``);
169
- }
170
- return `${lines.join("\n")}\n`;
171
- }
172
- export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
173
- const outPath = path.join(reportsDir(projectRoot), `${basename}.json`);
174
- await writeFileSafe(outPath, `${JSON.stringify(report, null, 2)}\n`);
175
- return outPath;
176
- }
177
- export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
178
- const outPath = path.join(reportsDir(projectRoot), `${basename}.md`);
179
- await writeFileSafe(outPath, formatMarkdownReport(report));
180
- return outPath;
181
- }
@@ -1,20 +0,0 @@
1
- import type { FlowStage } from "../types.js";
2
- import type { RubricCheck, RubricDoc } from "./types.js";
3
- export declare function rubricsDir(projectRoot: string): string;
4
- export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
5
- declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
6
- declare function validateRubric(raw: unknown, file: string): RubricDoc;
7
- /**
8
- * Load the rubric for `stage`. Returns `undefined` when the file is
9
- * missing so callers can emit a "no rubric" verifier result rather than
10
- * crashing — authors are expected to grow rubrics incrementally.
11
- */
12
- export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
13
- /** Load every rubric present in the given rubrics directory. */
14
- export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
15
- /** Exposed for tests. */
16
- export declare const __internal: {
17
- validateRubric: typeof validateRubric;
18
- validateCheck: typeof validateCheck;
19
- };
20
- export {};
@@ -1,143 +0,0 @@
1
- /**
2
- * Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
3
- *
4
- * Each file maps to exactly one `RubricDoc` that drives the LLM judge.
5
- * Validation is strict: unknown top-level keys, missing required fields,
6
- * duplicate check ids, and malformed weights all surface as actionable
7
- * errors rather than turning into silent "judge had nothing to score"
8
- * passes.
9
- */
10
- import fs from "node:fs/promises";
11
- import path from "node:path";
12
- import { parse } from "yaml";
13
- import { EVALS_ROOT } from "../constants.js";
14
- import { exists } from "../fs-utils.js";
15
- import { FLOW_STAGES } from "../types.js";
16
- export function rubricsDir(projectRoot) {
17
- return path.join(projectRoot, EVALS_ROOT, "rubrics");
18
- }
19
- export function rubricPath(projectRoot, stage) {
20
- return path.join(rubricsDir(projectRoot), `${stage}.yaml`);
21
- }
22
- function rubricError(file, reason) {
23
- return new Error(`Invalid rubric at ${file}: ${reason}\n` +
24
- `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`);
25
- }
26
- function isRecord(value) {
27
- return typeof value === "object" && value !== null && !Array.isArray(value);
28
- }
29
- function validateCheck(raw, index, file) {
30
- if (!isRecord(raw)) {
31
- throw rubricError(file, `checks[${index}] must be a mapping`);
32
- }
33
- const id = raw.id;
34
- if (typeof id !== "string" || id.trim().length === 0) {
35
- throw rubricError(file, `checks[${index}].id must be a non-empty string`);
36
- }
37
- if (!/^[a-z][a-z0-9-]*$/.test(id)) {
38
- throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
39
- }
40
- const prompt = raw.prompt;
41
- if (typeof prompt !== "string" || prompt.trim().length === 0) {
42
- throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
43
- }
44
- const check = {
45
- id,
46
- prompt: prompt.trim()
47
- };
48
- if (raw.scale !== undefined) {
49
- if (typeof raw.scale !== "string" || raw.scale.trim().length === 0) {
50
- throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
51
- }
52
- check.scale = raw.scale.trim();
53
- }
54
- if (raw.weight !== undefined) {
55
- if (typeof raw.weight !== "number" || !Number.isFinite(raw.weight) || raw.weight < 0) {
56
- throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
57
- }
58
- check.weight = raw.weight;
59
- }
60
- if (raw.critical !== undefined) {
61
- if (typeof raw.critical !== "boolean") {
62
- throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
63
- }
64
- check.critical = raw.critical;
65
- }
66
- const known = new Set(["id", "prompt", "scale", "weight", "critical"]);
67
- const unknown = Object.keys(raw).filter((key) => !known.has(key));
68
- if (unknown.length > 0) {
69
- throw rubricError(file, `checks[${index}] has unknown key(s): ${unknown.join(", ")}`);
70
- }
71
- return check;
72
- }
73
- function validateRubric(raw, file) {
74
- if (!isRecord(raw)) {
75
- throw rubricError(file, "top-level value must be a mapping");
76
- }
77
- const stage = raw.stage;
78
- if (typeof stage !== "string" || !FLOW_STAGES.includes(stage)) {
79
- throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
80
- }
81
- const id = raw.id;
82
- let rubricId = stage;
83
- if (id !== undefined) {
84
- if (typeof id !== "string" || id.trim().length === 0) {
85
- throw rubricError(file, `"id" must be a non-empty string when provided`);
86
- }
87
- rubricId = id.trim();
88
- }
89
- const checks = raw.checks;
90
- if (!Array.isArray(checks) || checks.length === 0) {
91
- throw rubricError(file, `"checks" must be a non-empty array`);
92
- }
93
- const parsed = [];
94
- const seen = new Set();
95
- for (let i = 0; i < checks.length; i += 1) {
96
- const check = validateCheck(checks[i], i, file);
97
- if (seen.has(check.id)) {
98
- throw rubricError(file, `duplicate check id: "${check.id}"`);
99
- }
100
- seen.add(check.id);
101
- parsed.push(check);
102
- }
103
- const known = new Set(["stage", "id", "checks"]);
104
- const unknown = Object.keys(raw).filter((key) => !known.has(key));
105
- if (unknown.length > 0) {
106
- throw rubricError(file, `unknown top-level key(s): ${unknown.join(", ")}`);
107
- }
108
- return {
109
- stage: stage,
110
- id: rubricId,
111
- checks: parsed
112
- };
113
- }
114
- /**
115
- * Load the rubric for `stage`. Returns `undefined` when the file is
116
- * missing so callers can emit a "no rubric" verifier result rather than
117
- * crashing — authors are expected to grow rubrics incrementally.
118
- */
119
- export async function loadRubric(projectRoot, stage) {
120
- const file = rubricPath(projectRoot, stage);
121
- if (!(await exists(file)))
122
- return undefined;
123
- let parsed;
124
- try {
125
- parsed = parse(await fs.readFile(file, "utf8"));
126
- }
127
- catch (err) {
128
- throw rubricError(file, err instanceof Error ? err.message : String(err));
129
- }
130
- return validateRubric(parsed, file);
131
- }
132
- /** Load every rubric present in the given rubrics directory. */
133
- export async function loadAllRubrics(projectRoot) {
134
- const out = new Map();
135
- for (const stage of FLOW_STAGES) {
136
- const doc = await loadRubric(projectRoot, stage);
137
- if (doc)
138
- out.set(stage, doc);
139
- }
140
- return out;
141
- }
142
- /** Exposed for tests. */
143
- export const __internal = { validateRubric, validateCheck };
@@ -1,81 +0,0 @@
1
- import type { FlowStage } from "../types.js";
2
- import { type EvalLlmClient } from "./llm-client.js";
3
- import { type ProgressLogger } from "./progress.js";
4
- import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
5
- export interface RunEvalOptions {
6
- projectRoot: string;
7
- stage?: FlowStage;
8
- mode?: EvalMode;
9
- /** When true, run only structural verifiers (Step 1). */
10
- schemaOnly?: boolean;
11
- /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
12
- rules?: boolean;
13
- /** When true, also run LLM judge verifiers. Step 3 wires judging. */
14
- judge?: boolean;
15
- /** When true, load config + corpus and return a summary without running any verifier. */
16
- dryRun?: boolean;
17
- /** Override process.env during tests. */
18
- env?: NodeJS.ProcessEnv;
19
- /**
20
- * Optional LLM client injection. Primary use case: unit and
21
- * integration tests that want deterministic judge + agent behavior
22
- * without hitting the network.
23
- */
24
- llmClient?: EvalLlmClient;
25
- /**
26
- * Optional progress logger. The CLI wires a stderr-backed logger by
27
- * default so users see one-line updates during long runs; tests and
28
- * programmatic callers can inject a silent (noop) logger or capture
29
- * events for assertions. When omitted, progress is silenced.
30
- */
31
- progress?: ProgressLogger;
32
- /**
33
- * Per-run USD cap. Enforced in-memory; independent from the daily cap
34
- * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
35
- * invocations. Undefined means no cap.
36
- */
37
- maxCostUsd?: number;
38
- /**
39
- * Override the configured `model` (and `judgeModel`) for this run.
40
- * Used by `cclaw eval --compare-model` to replay the same corpus
41
- * against an alternative model without editing `config.yaml`.
42
- */
43
- modelOverride?: string;
44
- }
45
- export interface DryRunSummary {
46
- kind: "dry-run";
47
- config: ResolvedEvalConfig;
48
- corpus: {
49
- total: number;
50
- byStage: Record<string, number>;
51
- cases: Array<{
52
- id: string;
53
- stage: FlowStage;
54
- }>;
55
- };
56
- /** Only populated in `workflow` mode; empty for fixture / agent modes. */
57
- workflowCorpus: {
58
- total: number;
59
- cases: Array<{
60
- id: string;
61
- stages: WorkflowStageName[];
62
- }>;
63
- };
64
- plannedMode: EvalMode;
65
- verifiersAvailable: {
66
- structural: boolean;
67
- rules: boolean;
68
- judge: boolean;
69
- workflow: boolean;
70
- consistency: boolean;
71
- };
72
- notes: string[];
73
- }
74
- /**
75
- * Main eval runner. Dispatches between fixture-backed verification, the
76
- * single-stage agent-with-tools loop, and the multi-stage workflow
77
- * orchestrator based on `options.mode`. Per-stage baselines are loaded for
78
- * regression comparison. Cases without a `fixture` path in the yaml are
79
- * marked skipped (not failed) when no LLM drafting runs.
80
- */
81
- export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;