cclaw-cli 0.48.35 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/README.md +54 -82
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -495
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -46
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +51 -9
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +2 -0
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +31 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/learnings.d.ts +3 -4
  28. package/dist/content/learnings.js +24 -50
  29. package/dist/content/meta-skill.js +31 -24
  30. package/dist/content/next-command.js +38 -38
  31. package/dist/content/node-hooks.js +17 -343
  32. package/dist/content/opencode-plugin.js +2 -100
  33. package/dist/content/research-playbooks.js +14 -14
  34. package/dist/content/review-loop.d.ts +2 -0
  35. package/dist/content/review-loop.js +8 -0
  36. package/dist/content/session-hooks.js +14 -46
  37. package/dist/content/skills.d.ts +0 -5
  38. package/dist/content/skills.js +53 -128
  39. package/dist/content/stage-common-guidance.d.ts +0 -1
  40. package/dist/content/stage-common-guidance.js +15 -14
  41. package/dist/content/stage-schema.d.ts +26 -1
  42. package/dist/content/stage-schema.js +121 -40
  43. package/dist/content/stages/_lint-metadata/index.js +9 -15
  44. package/dist/content/stages/brainstorm.js +22 -43
  45. package/dist/content/stages/design.js +37 -57
  46. package/dist/content/stages/plan.js +22 -13
  47. package/dist/content/stages/review.js +24 -27
  48. package/dist/content/stages/scope.js +34 -46
  49. package/dist/content/stages/ship.js +7 -4
  50. package/dist/content/stages/spec.js +20 -9
  51. package/dist/content/stages/tdd.js +64 -44
  52. package/dist/content/start-command.js +10 -12
  53. package/dist/content/status-command.d.ts +2 -7
  54. package/dist/content/status-command.js +19 -146
  55. package/dist/content/subagents.d.ts +0 -5
  56. package/dist/content/subagents.js +47 -28
  57. package/dist/content/templates.d.ts +1 -1
  58. package/dist/content/templates.js +126 -135
  59. package/dist/content/track-render-context.d.ts +17 -0
  60. package/dist/content/track-render-context.js +44 -0
  61. package/dist/content/tree-command.d.ts +1 -2
  62. package/dist/content/tree-command.js +4 -87
  63. package/dist/content/utility-skills.d.ts +2 -29
  64. package/dist/content/utility-skills.js +2 -1533
  65. package/dist/content/view-command.js +29 -11
  66. package/dist/delegation.d.ts +1 -1
  67. package/dist/delegation.js +5 -15
  68. package/dist/doctor-registry.js +20 -21
  69. package/dist/doctor.js +88 -408
  70. package/dist/flow-state.d.ts +3 -0
  71. package/dist/flow-state.js +2 -0
  72. package/dist/harness-adapters.d.ts +1 -1
  73. package/dist/harness-adapters.js +48 -57
  74. package/dist/install.js +128 -520
  75. package/dist/internal/advance-stage.js +3 -9
  76. package/dist/internal/compound-readiness.d.ts +1 -1
  77. package/dist/internal/compound-readiness.js +1 -1
  78. package/dist/internal/tdd-loop-status.d.ts +1 -1
  79. package/dist/internal/tdd-loop-status.js +1 -1
  80. package/dist/knowledge-store.d.ts +16 -10
  81. package/dist/knowledge-store.js +51 -15
  82. package/dist/policy.js +16 -109
  83. package/dist/run-archive.d.ts +4 -6
  84. package/dist/run-archive.js +15 -20
  85. package/dist/run-persistence.d.ts +2 -2
  86. package/dist/run-persistence.js +3 -9
  87. package/package.json +1 -2
  88. package/dist/content/archive-command.d.ts +0 -2
  89. package/dist/content/archive-command.js +0 -124
  90. package/dist/content/compound-command.d.ts +0 -5
  91. package/dist/content/compound-command.js +0 -193
  92. package/dist/content/contexts.d.ts +0 -9
  93. package/dist/content/contexts.js +0 -65
  94. package/dist/content/contracts.d.ts +0 -2
  95. package/dist/content/contracts.js +0 -51
  96. package/dist/content/doctor-references.d.ts +0 -2
  97. package/dist/content/doctor-references.js +0 -150
  98. package/dist/content/eval-scaffold.d.ts +0 -15
  99. package/dist/content/eval-scaffold.js +0 -370
  100. package/dist/content/feature-command.d.ts +0 -2
  101. package/dist/content/feature-command.js +0 -123
  102. package/dist/content/flow-map.d.ts +0 -23
  103. package/dist/content/flow-map.js +0 -134
  104. package/dist/content/harness-doc.d.ts +0 -2
  105. package/dist/content/harness-doc.js +0 -202
  106. package/dist/content/harness-playbooks.d.ts +0 -24
  107. package/dist/content/harness-playbooks.js +0 -393
  108. package/dist/content/harness-tool-refs.d.ts +0 -20
  109. package/dist/content/harness-tool-refs.js +0 -268
  110. package/dist/content/ops-command.d.ts +0 -2
  111. package/dist/content/ops-command.js +0 -71
  112. package/dist/content/protocols.d.ts +0 -7
  113. package/dist/content/protocols.js +0 -215
  114. package/dist/content/retro-command.d.ts +0 -2
  115. package/dist/content/retro-command.js +0 -165
  116. package/dist/content/rewind-command.d.ts +0 -2
  117. package/dist/content/rewind-command.js +0 -106
  118. package/dist/content/tdd-log-command.d.ts +0 -2
  119. package/dist/content/tdd-log-command.js +0 -85
  120. package/dist/eval/agents/single-shot.d.ts +0 -27
  121. package/dist/eval/agents/single-shot.js +0 -79
  122. package/dist/eval/agents/with-tools.d.ts +0 -44
  123. package/dist/eval/agents/with-tools.js +0 -261
  124. package/dist/eval/agents/workflow.d.ts +0 -31
  125. package/dist/eval/agents/workflow.js +0 -155
  126. package/dist/eval/baseline.d.ts +0 -38
  127. package/dist/eval/baseline.js +0 -282
  128. package/dist/eval/config-loader.d.ts +0 -14
  129. package/dist/eval/config-loader.js +0 -395
  130. package/dist/eval/corpus.d.ts +0 -30
  131. package/dist/eval/corpus.js +0 -330
  132. package/dist/eval/cost-guard.d.ts +0 -102
  133. package/dist/eval/cost-guard.js +0 -190
  134. package/dist/eval/diff.d.ts +0 -64
  135. package/dist/eval/diff.js +0 -323
  136. package/dist/eval/llm-client.d.ts +0 -176
  137. package/dist/eval/llm-client.js +0 -267
  138. package/dist/eval/mode.d.ts +0 -28
  139. package/dist/eval/mode.js +0 -61
  140. package/dist/eval/progress.d.ts +0 -83
  141. package/dist/eval/progress.js +0 -59
  142. package/dist/eval/report.d.ts +0 -11
  143. package/dist/eval/report.js +0 -181
  144. package/dist/eval/rubric-loader.d.ts +0 -20
  145. package/dist/eval/rubric-loader.js +0 -143
  146. package/dist/eval/runner.d.ts +0 -81
  147. package/dist/eval/runner.js +0 -746
  148. package/dist/eval/runs.d.ts +0 -41
  149. package/dist/eval/runs.js +0 -114
  150. package/dist/eval/sandbox.d.ts +0 -38
  151. package/dist/eval/sandbox.js +0 -137
  152. package/dist/eval/tools/glob.d.ts +0 -2
  153. package/dist/eval/tools/glob.js +0 -163
  154. package/dist/eval/tools/grep.d.ts +0 -2
  155. package/dist/eval/tools/grep.js +0 -152
  156. package/dist/eval/tools/index.d.ts +0 -7
  157. package/dist/eval/tools/index.js +0 -35
  158. package/dist/eval/tools/read.d.ts +0 -2
  159. package/dist/eval/tools/read.js +0 -122
  160. package/dist/eval/tools/types.d.ts +0 -49
  161. package/dist/eval/tools/types.js +0 -41
  162. package/dist/eval/tools/write.d.ts +0 -2
  163. package/dist/eval/tools/write.js +0 -92
  164. package/dist/eval/types.d.ts +0 -561
  165. package/dist/eval/types.js +0 -47
  166. package/dist/eval/verifiers/judge.d.ts +0 -40
  167. package/dist/eval/verifiers/judge.js +0 -256
  168. package/dist/eval/verifiers/rules.d.ts +0 -24
  169. package/dist/eval/verifiers/rules.js +0 -218
  170. package/dist/eval/verifiers/structural.d.ts +0 -14
  171. package/dist/eval/verifiers/structural.js +0 -171
  172. package/dist/eval/verifiers/traceability.d.ts +0 -23
  173. package/dist/eval/verifiers/traceability.js +0 -84
  174. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  175. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  176. package/dist/eval/workflow-corpus.d.ts +0 -7
  177. package/dist/eval/workflow-corpus.js +0 -207
  178. package/dist/feature-system.d.ts +0 -42
  179. package/dist/feature-system.js +0 -432
  180. package/dist/internal/knowledge-digest.d.ts +0 -7
  181. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,181 +0,0 @@
1
- import path from "node:path";
2
- import { EVALS_ROOT } from "../constants.js";
3
- import { writeFileSafe } from "../fs-utils.js";
4
- export function reportsDir(projectRoot) {
5
- return path.join(projectRoot, EVALS_ROOT, "reports");
6
- }
7
- export function defaultReportBasename(report) {
8
- const ts = report.generatedAt.replace(/[:.]/g, "-");
9
- return `eval-${ts}-${report.runId.slice(0, 8)}`;
10
- }
11
- /**
12
- * Format a report as a human-readable Markdown document. Keeping the layout
13
- * stable matters: CI posts diffs against earlier reports, and unit tests use
14
- * the output as a regression guard.
15
- */
16
- export function formatMarkdownReport(report) {
17
- const { summary } = report;
18
- const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
19
- const lines = [];
20
- lines.push(`# cclaw eval report`);
21
- lines.push(``);
22
- lines.push(`- generated: ${report.generatedAt}`);
23
- lines.push(`- runId: ${report.runId}`);
24
- lines.push(`- cclaw version: ${report.cclawVersion}`);
25
- lines.push(`- provider: ${report.provider}`);
26
- lines.push(`- model: ${report.model}`);
27
- lines.push(`- mode: ${report.mode}`);
28
- lines.push(`- stages: ${stages}`);
29
- lines.push(``);
30
- lines.push(`## Summary`);
31
- lines.push(``);
32
- lines.push(`| metric | value |`);
33
- lines.push(`| --- | --- |`);
34
- lines.push(`| total cases | ${summary.totalCases} |`);
35
- lines.push(`| passed | ${summary.passed} |`);
36
- lines.push(`| failed | ${summary.failed} |`);
37
- lines.push(`| skipped | ${summary.skipped} |`);
38
- lines.push(`| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`);
39
- lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
40
- lines.push(``);
41
- if (report.baselineDelta) {
42
- const delta = report.baselineDelta;
43
- lines.push(`## Baseline delta`);
44
- lines.push(``);
45
- lines.push(`- baseline: ${delta.baselineId}`);
46
- lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
47
- lines.push(`- critical failures: ${delta.criticalFailures}`);
48
- lines.push(``);
49
- if (delta.regressions.length > 0) {
50
- lines.push(`### Regressions`);
51
- lines.push(``);
52
- lines.push(`| stage | case id | verifier | reason | prev | curr |`);
53
- lines.push(`| --- | --- | --- | --- | --- | --- |`);
54
- for (const reg of delta.regressions) {
55
- const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
56
- const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
57
- lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
58
- }
59
- lines.push(``);
60
- }
61
- }
62
- if (report.cases.length === 0) {
63
- lines.push(`## Cases`);
64
- lines.push(``);
65
- lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
66
- lines.push(``);
67
- return `${lines.join("\n")}\n`;
68
- }
69
- lines.push(`## Cases`);
70
- lines.push(``);
71
- lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`);
72
- lines.push(`| --- | --- | --- | --- | --- |`);
73
- for (const item of report.cases) {
74
- const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
75
- lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
76
- }
77
- lines.push(``);
78
- const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
79
- if (toolCases.length > 0) {
80
- lines.push(`## Tool use`);
81
- lines.push(``);
82
- lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
83
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
84
- for (const item of toolCases) {
85
- const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
86
- const toolUse = verifier?.details?.toolUse;
87
- if (!toolUse)
88
- continue;
89
- const byTool = Object.entries(toolUse.byTool)
90
- .map(([name, count]) => `${name}=${count}`)
91
- .join(", ");
92
- const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
93
- lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
94
- }
95
- lines.push(``);
96
- }
97
- const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
98
- if (judgeCases.length > 0) {
99
- lines.push(`## Judge scores`);
100
- lines.push(``);
101
- lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
102
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
103
- for (const item of judgeCases) {
104
- for (const verifier of item.verifierResults) {
105
- if (verifier.kind !== "judge")
106
- continue;
107
- if (verifier.id === "judge:required-checks")
108
- continue;
109
- if (verifier.id === "judge:rubric:missing")
110
- continue;
111
- if (verifier.id === "judge:invocation:error")
112
- continue;
113
- const details = verifier.details ?? {};
114
- const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
115
- const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
116
- const coverage = details.coverage === true ? "yes" : "no";
117
- const checkId = verifier.id.replace(/^judge:/, "");
118
- lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
119
- }
120
- }
121
- lines.push(``);
122
- }
123
- const workflowCases = report.cases.filter((item) => !!item.workflow);
124
- if (workflowCases.length > 0) {
125
- lines.push(`## Workflow stages`);
126
- lines.push(``);
127
- lines.push(`| case id | stage | duration (ms) | cost (USD) | turns | tool calls | judge ok |`);
128
- lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
129
- for (const item of workflowCases) {
130
- const wf = item.workflow;
131
- for (const stage of wf.stages) {
132
- const cost = stage.usageUsd > 0 ? stage.usageUsd.toFixed(4) : "-";
133
- const judgeOk = stage.judgeOk === true ? "yes" : stage.judgeOk === false ? "no" : "-";
134
- lines.push(`| ${item.caseId} | ${stage.stage} | ${stage.durationMs} | ${cost} | ` +
135
- `${stage.toolUse.turns} | ${stage.toolUse.calls} | ${judgeOk} |`);
136
- }
137
- }
138
- lines.push(``);
139
- }
140
- const consistencyCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "consistency"));
141
- if (consistencyCases.length > 0) {
142
- lines.push(`## Consistency checks`);
143
- lines.push(``);
144
- lines.push(`| case id | check id | ok | message |`);
145
- lines.push(`| --- | --- | --- | --- |`);
146
- for (const item of consistencyCases) {
147
- for (const verifier of item.verifierResults) {
148
- if (verifier.kind !== "consistency")
149
- continue;
150
- const message = verifier.message
151
- ? verifier.message.replace(/\|/g, "\\|").slice(0, 160)
152
- : "-";
153
- lines.push(`| ${item.caseId} | ${verifier.id} | ${verifier.ok ? "yes" : "no"} | ${message} |`);
154
- }
155
- }
156
- lines.push(``);
157
- }
158
- lines.push(`## Verifier details`);
159
- lines.push(``);
160
- for (const item of report.cases) {
161
- lines.push(`### ${item.stage} / ${item.caseId}`);
162
- lines.push(``);
163
- for (const verifier of item.verifierResults) {
164
- const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
165
- lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}` +
166
- (verifier.message ? ` — ${verifier.message}` : ""));
167
- }
168
- lines.push(``);
169
- }
170
- return `${lines.join("\n")}\n`;
171
- }
172
- export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
173
- const outPath = path.join(reportsDir(projectRoot), `${basename}.json`);
174
- await writeFileSafe(outPath, `${JSON.stringify(report, null, 2)}\n`);
175
- return outPath;
176
- }
177
- export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
178
- const outPath = path.join(reportsDir(projectRoot), `${basename}.md`);
179
- await writeFileSafe(outPath, formatMarkdownReport(report));
180
- return outPath;
181
- }
@@ -1,20 +0,0 @@
1
- import type { FlowStage } from "../types.js";
2
- import type { RubricCheck, RubricDoc } from "./types.js";
3
- export declare function rubricsDir(projectRoot: string): string;
4
- export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
5
- declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
6
- declare function validateRubric(raw: unknown, file: string): RubricDoc;
7
- /**
8
- * Load the rubric for `stage`. Returns `undefined` when the file is
9
- * missing so callers can emit a "no rubric" verifier result rather than
10
- * crashing — authors are expected to grow rubrics incrementally.
11
- */
12
- export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
13
- /** Load every rubric present in the given rubrics directory. */
14
- export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
15
- /** Exposed for tests. */
16
- export declare const __internal: {
17
- validateRubric: typeof validateRubric;
18
- validateCheck: typeof validateCheck;
19
- };
20
- export {};
@@ -1,143 +0,0 @@
1
- /**
2
- * Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
3
- *
4
- * Each file maps to exactly one `RubricDoc` that drives the LLM judge.
5
- * Validation is strict: unknown top-level keys, missing required fields,
6
- * duplicate check ids, and malformed weights all surface as actionable
7
- * errors rather than turning into silent "judge had nothing to score"
8
- * passes.
9
- */
10
- import fs from "node:fs/promises";
11
- import path from "node:path";
12
- import { parse } from "yaml";
13
- import { EVALS_ROOT } from "../constants.js";
14
- import { exists } from "../fs-utils.js";
15
- import { FLOW_STAGES } from "../types.js";
16
- export function rubricsDir(projectRoot) {
17
- return path.join(projectRoot, EVALS_ROOT, "rubrics");
18
- }
19
- export function rubricPath(projectRoot, stage) {
20
- return path.join(rubricsDir(projectRoot), `${stage}.yaml`);
21
- }
22
- function rubricError(file, reason) {
23
- return new Error(`Invalid rubric at ${file}: ${reason}\n` +
24
- `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`);
25
- }
26
- function isRecord(value) {
27
- return typeof value === "object" && value !== null && !Array.isArray(value);
28
- }
29
- function validateCheck(raw, index, file) {
30
- if (!isRecord(raw)) {
31
- throw rubricError(file, `checks[${index}] must be a mapping`);
32
- }
33
- const id = raw.id;
34
- if (typeof id !== "string" || id.trim().length === 0) {
35
- throw rubricError(file, `checks[${index}].id must be a non-empty string`);
36
- }
37
- if (!/^[a-z][a-z0-9-]*$/.test(id)) {
38
- throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
39
- }
40
- const prompt = raw.prompt;
41
- if (typeof prompt !== "string" || prompt.trim().length === 0) {
42
- throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
43
- }
44
- const check = {
45
- id,
46
- prompt: prompt.trim()
47
- };
48
- if (raw.scale !== undefined) {
49
- if (typeof raw.scale !== "string" || raw.scale.trim().length === 0) {
50
- throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
51
- }
52
- check.scale = raw.scale.trim();
53
- }
54
- if (raw.weight !== undefined) {
55
- if (typeof raw.weight !== "number" || !Number.isFinite(raw.weight) || raw.weight < 0) {
56
- throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
57
- }
58
- check.weight = raw.weight;
59
- }
60
- if (raw.critical !== undefined) {
61
- if (typeof raw.critical !== "boolean") {
62
- throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
63
- }
64
- check.critical = raw.critical;
65
- }
66
- const known = new Set(["id", "prompt", "scale", "weight", "critical"]);
67
- const unknown = Object.keys(raw).filter((key) => !known.has(key));
68
- if (unknown.length > 0) {
69
- throw rubricError(file, `checks[${index}] has unknown key(s): ${unknown.join(", ")}`);
70
- }
71
- return check;
72
- }
73
- function validateRubric(raw, file) {
74
- if (!isRecord(raw)) {
75
- throw rubricError(file, "top-level value must be a mapping");
76
- }
77
- const stage = raw.stage;
78
- if (typeof stage !== "string" || !FLOW_STAGES.includes(stage)) {
79
- throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
80
- }
81
- const id = raw.id;
82
- let rubricId = stage;
83
- if (id !== undefined) {
84
- if (typeof id !== "string" || id.trim().length === 0) {
85
- throw rubricError(file, `"id" must be a non-empty string when provided`);
86
- }
87
- rubricId = id.trim();
88
- }
89
- const checks = raw.checks;
90
- if (!Array.isArray(checks) || checks.length === 0) {
91
- throw rubricError(file, `"checks" must be a non-empty array`);
92
- }
93
- const parsed = [];
94
- const seen = new Set();
95
- for (let i = 0; i < checks.length; i += 1) {
96
- const check = validateCheck(checks[i], i, file);
97
- if (seen.has(check.id)) {
98
- throw rubricError(file, `duplicate check id: "${check.id}"`);
99
- }
100
- seen.add(check.id);
101
- parsed.push(check);
102
- }
103
- const known = new Set(["stage", "id", "checks"]);
104
- const unknown = Object.keys(raw).filter((key) => !known.has(key));
105
- if (unknown.length > 0) {
106
- throw rubricError(file, `unknown top-level key(s): ${unknown.join(", ")}`);
107
- }
108
- return {
109
- stage: stage,
110
- id: rubricId,
111
- checks: parsed
112
- };
113
- }
114
- /**
115
- * Load the rubric for `stage`. Returns `undefined` when the file is
116
- * missing so callers can emit a "no rubric" verifier result rather than
117
- * crashing — authors are expected to grow rubrics incrementally.
118
- */
119
- export async function loadRubric(projectRoot, stage) {
120
- const file = rubricPath(projectRoot, stage);
121
- if (!(await exists(file)))
122
- return undefined;
123
- let parsed;
124
- try {
125
- parsed = parse(await fs.readFile(file, "utf8"));
126
- }
127
- catch (err) {
128
- throw rubricError(file, err instanceof Error ? err.message : String(err));
129
- }
130
- return validateRubric(parsed, file);
131
- }
132
- /** Load every rubric present in the given rubrics directory. */
133
- export async function loadAllRubrics(projectRoot) {
134
- const out = new Map();
135
- for (const stage of FLOW_STAGES) {
136
- const doc = await loadRubric(projectRoot, stage);
137
- if (doc)
138
- out.set(stage, doc);
139
- }
140
- return out;
141
- }
142
- /** Exposed for tests. */
143
- export const __internal = { validateRubric, validateCheck };
@@ -1,81 +0,0 @@
1
- import type { FlowStage } from "../types.js";
2
- import { type EvalLlmClient } from "./llm-client.js";
3
- import { type ProgressLogger } from "./progress.js";
4
- import type { EvalMode, EvalReport, ResolvedEvalConfig, WorkflowStageName } from "./types.js";
5
- export interface RunEvalOptions {
6
- projectRoot: string;
7
- stage?: FlowStage;
8
- mode?: EvalMode;
9
- /** When true, run only structural verifiers (Step 1). */
10
- schemaOnly?: boolean;
11
- /** When true, run structural + rule-based verifiers. Step 2 wires rules. */
12
- rules?: boolean;
13
- /** When true, also run LLM judge verifiers. Step 3 wires judging. */
14
- judge?: boolean;
15
- /** When true, load config + corpus and return a summary without running any verifier. */
16
- dryRun?: boolean;
17
- /** Override process.env during tests. */
18
- env?: NodeJS.ProcessEnv;
19
- /**
20
- * Optional LLM client injection. Primary use case: unit and
21
- * integration tests that want deterministic judge + agent behavior
22
- * without hitting the network.
23
- */
24
- llmClient?: EvalLlmClient;
25
- /**
26
- * Optional progress logger. The CLI wires a stderr-backed logger by
27
- * default so users see one-line updates during long runs; tests and
28
- * programmatic callers can inject a silent (noop) logger or capture
29
- * events for assertions. When omitted, progress is silenced.
30
- */
31
- progress?: ProgressLogger;
32
- /**
33
- * Per-run USD cap. Enforced in-memory; independent from the daily cap
34
- * (`dailyUsdCap` / `CCLAW_EVAL_DAILY_USD_CAP`) that persists across
35
- * invocations. Undefined means no cap.
36
- */
37
- maxCostUsd?: number;
38
- /**
39
- * Override the configured `model` (and `judgeModel`) for this run.
40
- * Used by `cclaw eval --compare-model` to replay the same corpus
41
- * against an alternative model without editing `config.yaml`.
42
- */
43
- modelOverride?: string;
44
- }
45
- export interface DryRunSummary {
46
- kind: "dry-run";
47
- config: ResolvedEvalConfig;
48
- corpus: {
49
- total: number;
50
- byStage: Record<string, number>;
51
- cases: Array<{
52
- id: string;
53
- stage: FlowStage;
54
- }>;
55
- };
56
- /** Only populated in `workflow` mode; empty for fixture / agent modes. */
57
- workflowCorpus: {
58
- total: number;
59
- cases: Array<{
60
- id: string;
61
- stages: WorkflowStageName[];
62
- }>;
63
- };
64
- plannedMode: EvalMode;
65
- verifiersAvailable: {
66
- structural: boolean;
67
- rules: boolean;
68
- judge: boolean;
69
- workflow: boolean;
70
- consistency: boolean;
71
- };
72
- notes: string[];
73
- }
74
- /**
75
- * Main eval runner. Dispatches between fixture-backed verification, the
76
- * single-stage agent-with-tools loop, and the multi-stage workflow
77
- * orchestrator based on `options.mode`. Per-stage baselines are loaded for
78
- * regression comparison. Cases without a `fixture` path in the yaml are
79
- * marked skipped (not failed) when no LLM drafting runs.
80
- */
81
- export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;