cclaw-cli 0.49.0 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/README.md +54 -82
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +51 -9
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +2 -0
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +31 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/learnings.d.ts +3 -4
  28. package/dist/content/learnings.js +24 -50
  29. package/dist/content/meta-skill.js +31 -21
  30. package/dist/content/next-command.js +38 -38
  31. package/dist/content/node-hooks.js +17 -343
  32. package/dist/content/opencode-plugin.js +2 -100
  33. package/dist/content/research-playbooks.js +14 -14
  34. package/dist/content/review-loop.d.ts +2 -0
  35. package/dist/content/review-loop.js +8 -0
  36. package/dist/content/session-hooks.js +14 -46
  37. package/dist/content/skills.d.ts +0 -5
  38. package/dist/content/skills.js +53 -128
  39. package/dist/content/stage-common-guidance.d.ts +0 -1
  40. package/dist/content/stage-common-guidance.js +15 -14
  41. package/dist/content/stage-schema.d.ts +26 -1
  42. package/dist/content/stage-schema.js +121 -40
  43. package/dist/content/stages/_lint-metadata/index.js +9 -15
  44. package/dist/content/stages/brainstorm.js +22 -43
  45. package/dist/content/stages/design.js +37 -57
  46. package/dist/content/stages/plan.js +22 -13
  47. package/dist/content/stages/review.js +24 -27
  48. package/dist/content/stages/scope.js +34 -46
  49. package/dist/content/stages/ship.js +7 -4
  50. package/dist/content/stages/spec.js +20 -9
  51. package/dist/content/stages/tdd.js +64 -44
  52. package/dist/content/start-command.js +10 -12
  53. package/dist/content/status-command.d.ts +2 -7
  54. package/dist/content/status-command.js +19 -146
  55. package/dist/content/subagents.d.ts +0 -5
  56. package/dist/content/subagents.js +47 -28
  57. package/dist/content/templates.d.ts +1 -1
  58. package/dist/content/templates.js +126 -135
  59. package/dist/content/track-render-context.d.ts +17 -0
  60. package/dist/content/track-render-context.js +44 -0
  61. package/dist/content/tree-command.d.ts +1 -2
  62. package/dist/content/tree-command.js +4 -87
  63. package/dist/content/utility-skills.d.ts +2 -29
  64. package/dist/content/utility-skills.js +2 -1534
  65. package/dist/content/view-command.js +29 -11
  66. package/dist/delegation.d.ts +1 -1
  67. package/dist/delegation.js +5 -15
  68. package/dist/doctor-registry.js +20 -21
  69. package/dist/doctor.js +88 -344
  70. package/dist/flow-state.d.ts +3 -0
  71. package/dist/flow-state.js +2 -0
  72. package/dist/harness-adapters.d.ts +1 -1
  73. package/dist/harness-adapters.js +48 -57
  74. package/dist/install.js +128 -358
  75. package/dist/internal/advance-stage.js +3 -9
  76. package/dist/internal/compound-readiness.d.ts +1 -1
  77. package/dist/internal/compound-readiness.js +1 -1
  78. package/dist/internal/tdd-loop-status.d.ts +1 -1
  79. package/dist/internal/tdd-loop-status.js +1 -1
  80. package/dist/knowledge-store.d.ts +16 -10
  81. package/dist/knowledge-store.js +51 -15
  82. package/dist/policy.js +16 -105
  83. package/dist/run-archive.d.ts +4 -6
  84. package/dist/run-archive.js +15 -20
  85. package/dist/run-persistence.d.ts +2 -2
  86. package/dist/run-persistence.js +3 -9
  87. package/package.json +1 -2
  88. package/dist/content/archive-command.d.ts +0 -2
  89. package/dist/content/archive-command.js +0 -124
  90. package/dist/content/compound-command.d.ts +0 -5
  91. package/dist/content/compound-command.js +0 -193
  92. package/dist/content/contexts.d.ts +0 -18
  93. package/dist/content/contexts.js +0 -24
  94. package/dist/content/contracts.d.ts +0 -2
  95. package/dist/content/contracts.js +0 -51
  96. package/dist/content/doctor-references.d.ts +0 -2
  97. package/dist/content/doctor-references.js +0 -150
  98. package/dist/content/eval-scaffold.d.ts +0 -15
  99. package/dist/content/eval-scaffold.js +0 -370
  100. package/dist/content/feature-command.d.ts +0 -2
  101. package/dist/content/feature-command.js +0 -123
  102. package/dist/content/flow-map.d.ts +0 -23
  103. package/dist/content/flow-map.js +0 -134
  104. package/dist/content/harness-doc.d.ts +0 -2
  105. package/dist/content/harness-doc.js +0 -202
  106. package/dist/content/harness-playbooks.d.ts +0 -24
  107. package/dist/content/harness-playbooks.js +0 -393
  108. package/dist/content/harness-tool-refs.d.ts +0 -20
  109. package/dist/content/harness-tool-refs.js +0 -268
  110. package/dist/content/ops-command.d.ts +0 -2
  111. package/dist/content/ops-command.js +0 -71
  112. package/dist/content/protocols.d.ts +0 -7
  113. package/dist/content/protocols.js +0 -215
  114. package/dist/content/retro-command.d.ts +0 -2
  115. package/dist/content/retro-command.js +0 -165
  116. package/dist/content/rewind-command.d.ts +0 -2
  117. package/dist/content/rewind-command.js +0 -106
  118. package/dist/content/tdd-log-command.d.ts +0 -2
  119. package/dist/content/tdd-log-command.js +0 -85
  120. package/dist/eval/agents/single-shot.d.ts +0 -27
  121. package/dist/eval/agents/single-shot.js +0 -79
  122. package/dist/eval/agents/with-tools.d.ts +0 -44
  123. package/dist/eval/agents/with-tools.js +0 -261
  124. package/dist/eval/agents/workflow.d.ts +0 -31
  125. package/dist/eval/agents/workflow.js +0 -155
  126. package/dist/eval/baseline.d.ts +0 -38
  127. package/dist/eval/baseline.js +0 -282
  128. package/dist/eval/config-loader.d.ts +0 -14
  129. package/dist/eval/config-loader.js +0 -395
  130. package/dist/eval/corpus.d.ts +0 -30
  131. package/dist/eval/corpus.js +0 -330
  132. package/dist/eval/cost-guard.d.ts +0 -102
  133. package/dist/eval/cost-guard.js +0 -190
  134. package/dist/eval/diff.d.ts +0 -64
  135. package/dist/eval/diff.js +0 -323
  136. package/dist/eval/llm-client.d.ts +0 -176
  137. package/dist/eval/llm-client.js +0 -267
  138. package/dist/eval/mode.d.ts +0 -28
  139. package/dist/eval/mode.js +0 -61
  140. package/dist/eval/progress.d.ts +0 -83
  141. package/dist/eval/progress.js +0 -59
  142. package/dist/eval/report.d.ts +0 -11
  143. package/dist/eval/report.js +0 -181
  144. package/dist/eval/rubric-loader.d.ts +0 -20
  145. package/dist/eval/rubric-loader.js +0 -143
  146. package/dist/eval/runner.d.ts +0 -81
  147. package/dist/eval/runner.js +0 -746
  148. package/dist/eval/runs.d.ts +0 -41
  149. package/dist/eval/runs.js +0 -114
  150. package/dist/eval/sandbox.d.ts +0 -38
  151. package/dist/eval/sandbox.js +0 -137
  152. package/dist/eval/tools/glob.d.ts +0 -2
  153. package/dist/eval/tools/glob.js +0 -163
  154. package/dist/eval/tools/grep.d.ts +0 -2
  155. package/dist/eval/tools/grep.js +0 -152
  156. package/dist/eval/tools/index.d.ts +0 -7
  157. package/dist/eval/tools/index.js +0 -35
  158. package/dist/eval/tools/read.d.ts +0 -2
  159. package/dist/eval/tools/read.js +0 -122
  160. package/dist/eval/tools/types.d.ts +0 -49
  161. package/dist/eval/tools/types.js +0 -41
  162. package/dist/eval/tools/write.d.ts +0 -2
  163. package/dist/eval/tools/write.js +0 -92
  164. package/dist/eval/types.d.ts +0 -561
  165. package/dist/eval/types.js +0 -47
  166. package/dist/eval/verifiers/judge.d.ts +0 -40
  167. package/dist/eval/verifiers/judge.js +0 -256
  168. package/dist/eval/verifiers/rules.d.ts +0 -24
  169. package/dist/eval/verifiers/rules.js +0 -218
  170. package/dist/eval/verifiers/structural.d.ts +0 -14
  171. package/dist/eval/verifiers/structural.js +0 -171
  172. package/dist/eval/verifiers/traceability.d.ts +0 -23
  173. package/dist/eval/verifiers/traceability.js +0 -84
  174. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  175. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  176. package/dist/eval/workflow-corpus.d.ts +0 -7
  177. package/dist/eval/workflow-corpus.js +0 -207
  178. package/dist/feature-system.d.ts +0 -42
  179. package/dist/feature-system.js +0 -432
  180. package/dist/internal/knowledge-digest.d.ts +0 -7
  181. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,746 +0,0 @@
1
- import { randomUUID } from "node:crypto";
2
- import { CCLAW_VERSION } from "../constants.js";
3
- import { FLOW_STAGES } from "../types.js";
4
- import { runSingleShot } from "./agents/single-shot.js";
5
- import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
6
- import { runWorkflow } from "./agents/workflow.js";
7
- import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
8
- import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
9
- import { loadWorkflowCorpus } from "./workflow-corpus.js";
10
- import { loadEvalConfig } from "./config-loader.js";
11
- import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
12
- import { createEvalClient, EvalLlmError } from "./llm-client.js";
13
- import { noopProgressLogger } from "./progress.js";
14
- import { loadAllRubrics } from "./rubric-loader.js";
15
- import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
16
- import { verifyRules } from "./verifiers/rules.js";
17
- import { verifyStructural } from "./verifiers/structural.js";
18
- import { verifyTraceability } from "./verifiers/traceability.js";
19
- import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
20
/**
 * Count how many cases belong to each stage.
 *
 * The tally is kept in a Map and only converted to a plain object at the end,
 * so stage names that collide with Object.prototype members ("constructor",
 * "toString", "__proto__") are counted like any other key instead of picking
 * up the inherited value.
 *
 * @param {Array<{ stage: string }>} cases - Entries exposing a `stage` property.
 * @returns {Record<string, number>} Stage name -> number of cases.
 */
function groupByStage(cases) {
  const counts = new Map();
  for (const item of cases) {
    counts.set(item.stage, (counts.get(item.stage) ?? 0) + 1);
  }
  // Object.fromEntries defines own data properties, so even "__proto__"
  // ends up as a regular key on the result.
  return Object.fromEntries(counts);
}
26
/**
 * Build the placeholder "structural" verifier result used when there is
 * nothing to check. It always reports success with a full score.
 *
 * @param {string} message - Human-readable explanation.
 * @param {unknown} [details] - Attached as `details` unless it is `undefined`.
 * @returns {object} Verifier result with id "structural:no-expectations".
 */
function skeletonVerifierResult(message, details) {
  const placeholder = {
    kind: "structural",
    id: "structural:no-expectations",
    ok: true,
    score: 1,
    message
  };
  // Only `undefined` suppresses the key; `null` is still attached.
  if (details !== undefined) {
    placeholder.details = details;
  }
  return placeholder;
}
36
/**
 * Translate the run options into the set of passes that should execute.
 *
 * - Structural verification is always on.
 * - `rules: true` enables the rules and traceability verifiers.
 * - `judge: true` enables the LLM judge and, for the `fixture` / `agent`
 *   modes, the agent-under-test.
 * - `mode: "workflow"` enables the agent without requiring `judge`; the judge
 *   itself stays gated separately.
 * - `schemaOnly: true` overrides everything above except structural, so a
 *   schema-only run never enables an LLM-backed pass.
 *
 * Only the literal `true` switches an option on; `mode` defaults to "fixture".
 *
 * @param {{ rules?: boolean, schemaOnly?: boolean, judge?: boolean, mode?: string }} options
 * @returns {{ runStructural: boolean, runRules: boolean, runTraceability: boolean, runJudge: boolean, runAgent: boolean }}
 */
function resolveRunFlags(options) {
  const beyondSchema = options.schemaOnly !== true;
  const rulesEnabled = options.rules === true && beyondSchema;
  const judgeEnabled = options.judge === true && beyondSchema;
  const mode = options.mode ?? "fixture";

  let agentEnabled;
  if (mode === "workflow") {
    // Workflow cases have no fixture fallback, so the agent runs whenever
    // the run is not schema-only.
    agentEnabled = beyondSchema;
  } else {
    agentEnabled = judgeEnabled && ["fixture", "agent"].includes(mode);
  }

  return {
    runStructural: true,
    runRules: rulesEnabled,
    runTraceability: rulesEnabled,
    runJudge: judgeEnabled,
    runAgent: agentEnabled
  };
}
65
/**
 * Decorate a chat client so that each response is committed to the cost
 * guard before the caller sees it.
 *
 * If `costGuard.commit` rejects (for example because a spending cap would be
 * crossed), that rejection propagates and the response is not returned.
 *
 * @param {{ chat: Function }} client - Underlying client.
 * @param {{ commit: Function }} costGuard - Receives `(model, usage)` per call.
 * @param {string} fallbackModel - Used when the response carries no model name.
 * @returns {{ chat: (request: object) => Promise<object> }}
 */
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
  const chat = async (request) => {
    const reply = await client.chat(request);
    const billedModel = reply.model || fallbackModel;
    await costGuard.commit(billedModel, reply.usage);
    return reply;
  };
  return { chat };
}
81
/**
 * Read the fixture artifact for a case, converting a read failure into a
 * failed verifier result instead of an exception.
 *
 * @param {string} projectRoot - Forwarded to `readFixtureArtifact`.
 * @param {{ fixture?: string }} caseEntry - Case whose fixture is read.
 * @param {object[]} verifierResults - Mutated: receives a
 *   "structural:fixture:missing" entry when the read throws.
 * @returns {Promise<unknown>} Whatever `readFixtureArtifact` resolves to, or
 *   `undefined` after a recorded failure.
 */
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
  let loaded;
  try {
    loaded = await readFixtureArtifact(projectRoot, caseEntry);
  }
  catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    verifierResults.push({
      kind: "structural",
      id: "structural:fixture:missing",
      ok: false,
      score: 0,
      message: reason,
      details: { fixture: caseEntry.fixture }
    });
    return undefined;
  }
  return loaded;
}
97
/**
 * Extract the judge-related overrides from a workflow step.
 *
 * Copies `rubric`, `requiredChecks` and `minimumScores` (in that order) onto
 * a fresh object, skipping any that are falsy on the step.
 *
 * @param {object} step - Workflow step definition.
 * @returns {object} Hint object containing only the truthy fields.
 */
function stageJudgeHint(step) {
  const hint = {};
  for (const field of ["rubric", "requiredChecks", "minimumScores"]) {
    const value = step[field];
    if (value) {
      hint[field] = value;
    }
  }
  return hint;
}
107
/**
 * Judge one workflow stage and append the outcome to `verifierResults`.
 *
 * Records a failed verifier result (instead of throwing) when the workflow
 * produced no result for the stage, when no rubric is registered for the
 * stage, or when the judge invocation fails. Cost-cap errors are rethrown so
 * the whole run aborts.
 *
 * @param {object} args
 * @param {object} args.step - Declared workflow stage (`name` plus optional hints).
 * @param {object|undefined} args.stageResult - Agent result for the stage; mutated
 *   with `judgeOk` / `judgeMedians` when present.
 * @param {Map<string, object>} args.rubrics - Rubrics keyed by stage name.
 * @param {object} args.config - Eval config forwarded to the judge.
 * @param {object} args.client - LLM client forwarded to the judge.
 * @param {object[]} args.verifierResults - Mutated: receives the judge results.
 * @returns {Promise<{ ok: boolean, usageUsd: number }>} Whether every judge
 *   check passed, and the spend reported by the judge invocation.
 */
async function judgeWorkflowStage({ step, stageResult, rubrics, config, client, verifierResults }) {
    const outcome = { ok: false, usageUsd: 0 };
    if (!stageResult) {
        // The declared stage has no matching agent result; report it rather
        // than dereferencing `undefined` below.
        verifierResults.push({
            kind: "judge",
            id: `judge:stage:missing:${step.name}`,
            ok: false,
            score: 0,
            message: `Workflow produced no result for stage ${step.name}; nothing to judge.`,
            details: { stage: step.name }
        });
        return outcome;
    }
    const rubric = rubrics.get(step.name);
    if (!rubric) {
        verifierResults.push({
            kind: "judge",
            id: `judge:rubric:missing:${step.name}`,
            ok: false,
            score: 0,
            message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
            details: { stage: step.name }
        });
        stageResult.judgeOk = false;
        return outcome;
    }
    const hint = stageJudgeHint(step);
    try {
        const invocation = await runJudge({
            artifact: stageResult.artifact,
            rubric,
            config,
            client,
            caseHint: hint
        });
        // Record the spend first so it is kept even if post-processing throws.
        outcome.usageUsd = invocation.usageUsd;
        const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
        const medians = {};
        for (const agg of invocation.aggregates) {
            medians[agg.checkId] = agg.median;
        }
        stageResult.judgeMedians = medians;
        outcome.ok = judgeVerifiers.every((v) => v.ok);
        stageResult.judgeOk = outcome.ok;
        for (const v of judgeVerifiers) {
            // Suffix ids with the stage so results from different stages stay distinct.
            verifierResults.push({
                ...v,
                id: `${v.id}:${step.name}`,
                details: { ...(v.details ?? {}), stage: step.name }
            });
        }
    }
    catch (err) {
        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
            throw err;
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        verifierResults.push({
            kind: "judge",
            id: `judge:invocation:error:${step.name}`,
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: { retryable, rubricId: rubric.id, stage: step.name }
        });
        stageResult.judgeOk = false;
        outcome.ok = false;
    }
    return outcome;
}
/**
 * Execute one multi-stage workflow case: run the agent workflow, optionally
 * judge each stage, then run the cross-stage consistency verifier.
 *
 * The case is reported under the name of the workflow's last stage ("plan"
 * when the workflow declares none). Agent and judge failures become failed
 * verifier results; only cost-cap errors propagate.
 *
 * @param {object} ctx
 * @param {string} ctx.projectRoot
 * @param {object} ctx.workflow - Workflow definition (`id`, `stages`, `consistency`).
 * @param {string} ctx.plannedMode - Echoed into the result as `mode`.
 * @param {{ runAgent: boolean, runJudge: boolean }} ctx.flags
 * @param {object} ctx.config
 * @param {object} [ctx.client] - LLM client; the case fails fast without one.
 * @param {Map<string, object>} ctx.rubrics
 * @param {{ emit: Function }} ctx.progress - Receives stage-start / stage-end events.
 * @param {number} ctx.caseIndex - Position reported in progress events.
 * @param {number} ctx.totalCases - Total reported in progress events.
 * @returns {Promise<object>} Case result; includes a `workflow` summary and
 *   `costUsd` when the workflow actually ran.
 */
async function runWorkflowCase(ctx) {
    const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
    const started = Date.now();
    const verifierResults = [];
    let caseCostUsd = 0;
    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
        "plan";
    // Shared shape for the two early exits below.
    const failedCase = () => ({
        caseId: workflow.id,
        stage: lastStage,
        mode: plannedMode,
        passed: false,
        durationMs: Date.now() - started,
        verifierResults
    });
    if (!flags.runAgent || !client) {
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:disabled",
            ok: false,
            score: 0,
            message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                "Re-run with credentials to execute the workflow.",
            details: { stages: workflow.stages.map((s) => s.name) }
        });
        return failedCase();
    }
    let workflowResult;
    try {
        workflowResult = await runWorkflow({
            workflow,
            config,
            projectRoot,
            client,
            onStageStart: (stage) => progress.emit({
                kind: "stage-start",
                caseId: workflow.id,
                stage,
                index: caseIndex,
                total: totalCases
            }),
            onStageEnd: (stage, stageResult) => progress.emit({
                kind: "stage-end",
                caseId: workflow.id,
                stage,
                index: caseIndex,
                total: totalCases,
                passed: true,
                durationMs: stageResult.durationMs,
                ...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
            })
        });
    }
    catch (err) {
        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
            throw err;
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:error",
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
        return failedCase();
    }
    caseCostUsd += workflowResult.totalUsageUsd;
    const stageResults = [...workflowResult.stages];
    verifierResults.push({
        kind: "workflow",
        id: "workflow:agent",
        ok: true,
        score: 1,
        message: `workflow ran ${stageResults.length} stage(s) in ` +
            `${workflowResult.totalDurationMs}ms ` +
            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
        details: {
            stages: stageResults.map((s) => ({
                name: s.stage,
                durationMs: s.durationMs,
                usageUsd: s.usageUsd,
                turns: s.toolUse.turns,
                calls: s.toolUse.calls
            }))
        }
    });
    let allJudgeOk = true;
    if (flags.runJudge) {
        // Stages are judged one after another, pairing each declared stage
        // with the agent result at the same position.
        for (let i = 0; i < workflow.stages.length; i += 1) {
            const judged = await judgeWorkflowStage({
                step: workflow.stages[i],
                stageResult: stageResults[i],
                rubrics,
                config,
                client,
                verifierResults
            });
            caseCostUsd += judged.usageUsd;
            if (!judged.ok)
                allJudgeOk = false;
        }
    }
    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
    verifierResults.push(...consistencyResults);
    // Results flagged `details.skipped` do not count towards pass/fail unless
    // every result is skipped.
    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
    const decisive = nonSkipped.length === 0 ? verifierResults : nonSkipped;
    const allOk = decisive.every((r) => r.ok);
    const workflowSummary = {
        caseId: workflow.id,
        stages: stageResults,
        totalUsageUsd: workflowResult.totalUsageUsd,
        totalDurationMs: workflowResult.totalDurationMs,
        allJudgeOk: flags.runJudge ? allJudgeOk : true
    };
    return {
        caseId: workflow.id,
        stage: lastStage,
        mode: plannedMode,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults,
        workflow: workflowSummary
    };
}
292
/**
 * Let the agent under test draft the artifact for a case.
 *
 * Uses the with-tools agent when `plannedMode` is "agent" and the single-shot
 * agent otherwise. Success and failure are both recorded in `verifierResults`;
 * only cost-cap errors are rethrown.
 *
 * @param {string} plannedMode - "agent" selects the with-tools agent.
 * @param {object} agentInput - `{ caseEntry, config, projectRoot, client }`.
 * @param {object[]} verifierResults - Mutated: receives the agent outcome.
 * @returns {Promise<{ artifact: unknown, usageUsd: number }>} The drafted
 *   artifact (`undefined` when the agent failed before producing one) and the
 *   spend the agent reported.
 */
async function draftArtifactWithAgent(plannedMode, agentInput, verifierResults) {
    const withTools = plannedMode === "agent";
    const id = withTools ? "agent:with-tools" : "agent:single-shot";
    let artifact;
    let usageUsd = 0;
    try {
        const produced = withTools
            ? await runWithTools(agentInput)
            : await runSingleShot(agentInput);
        // Capture output and spend before building the report entry so they
        // are kept even if that entry cannot be built.
        artifact = produced.artifact;
        usageUsd = produced.usageUsd;
        const message = withTools
            ? `with-tools agent produced ${produced.artifact.length} char(s) in ` +
                `${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
                `(${produced.toolUse.calls} tool call(s))`
            : `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`;
        verifierResults.push({
            kind: "workflow",
            id,
            ok: true,
            score: 1,
            message,
            details: {
                model: produced.model,
                tokensIn: produced.usage.promptTokens,
                tokensOut: produced.usage.completionTokens,
                usageUsd: produced.usageUsd,
                attempts: produced.attempts,
                ...(withTools ? { toolUse: produced.toolUse } : {})
            }
        });
    }
    catch (err) {
        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
            throw err;
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const maxTurns = withTools && err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id,
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
    }
    return { artifact, usageUsd };
}
/**
 * Evaluate a single corpus case.
 *
 * Obtains the artifact (drafted by the agent when the judge is requested and
 * a client exists, otherwise read from the case fixture), then runs whichever
 * of the structural, rules, traceability and judge verifiers are both enabled
 * by `flags` and declared in `caseEntry.expected`.
 *
 * A judge that was requested but cannot run because no client was supplied is
 * reported as a failed "judge:client:missing" result rather than being
 * skipped silently.
 *
 * @param {object} ctx
 * @param {string} ctx.projectRoot
 * @param {object} ctx.caseEntry - Corpus case (`id`, `stage`, `expected`, ...).
 * @param {string} ctx.plannedMode - Echoed into the result as `mode`.
 * @param {object} ctx.flags - Output of `resolveRunFlags`.
 * @param {object} ctx.config
 * @param {object} [ctx.client] - LLM client, when one was created.
 * @param {Map<string, object>} ctx.rubrics - Rubrics keyed by stage.
 * @returns {Promise<object>} Case result with `passed`, `durationMs`,
 *   optional `costUsd` and the collected `verifierResults`.
 */
async function runCase(ctx) {
    const { projectRoot, caseEntry, plannedMode, flags, config, client, rubrics } = ctx;
    const started = Date.now();
    const verifierResults = [];
    const expected = caseEntry.expected;
    let caseCostUsd = 0;
    const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
    const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
    const hasTraceability = flags.runTraceability && !!expected?.traceability;
    const judgeRequested = flags.runJudge && !!expected?.judge;
    const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
    let artifact;
    if (needsArtifact) {
        const agentDrafts = flags.runAgent && judgeRequested && client &&
            (plannedMode === "fixture" || plannedMode === "agent");
        if (agentDrafts) {
            const drafted = await draftArtifactWithAgent(plannedMode, { caseEntry, config, projectRoot, client }, verifierResults);
            artifact = drafted.artifact;
            caseCostUsd += drafted.usageUsd;
        }
        else {
            artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
        }
        // No artifact and nothing recorded yet: report the absence explicitly.
        if (artifact === undefined && verifierResults.length === 0) {
            verifierResults.push({
                kind: "structural",
                id: "structural:fixture:absent",
                ok: false,
                score: 0,
                message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
                details: { fixtureProvided: false }
            });
        }
    }
    if (flags.runStructural) {
        if (!hasStructural) {
            verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
        }
        else if (artifact !== undefined) {
            const results = verifyStructural(artifact, expected.structural);
            if (results.length === 0) {
                verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
            }
            else {
                verifierResults.push(...results);
            }
        }
    }
    if (hasRules && artifact !== undefined) {
        verifierResults.push(...verifyRules(artifact, expected.rules));
    }
    if (hasTraceability && artifact !== undefined) {
        try {
            const extras = await readExtraFixtures(projectRoot, caseEntry);
            verifierResults.push(...verifyTraceability(artifact, extras, expected.traceability));
        }
        catch (err) {
            verifierResults.push({
                kind: "rules",
                id: "traceability:fixture:missing",
                ok: false,
                score: 0,
                message: err instanceof Error ? err.message : String(err),
                details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
            });
        }
    }
    if (judgeRequested && artifact !== undefined) {
        const rubric = client ? rubrics.get(caseEntry.stage) : undefined;
        if (!client) {
            // Without this entry the case could pass although the judge the
            // caller asked for never ran.
            verifierResults.push({
                kind: "judge",
                id: "judge:client:missing",
                ok: false,
                score: 0,
                message: "Judge requested but no LLM client is available; set CCLAW_EVAL_API_KEY or inject a client.",
                details: { stage: caseEntry.stage }
            });
        }
        else if (!rubric) {
            verifierResults.push({
                kind: "judge",
                id: "judge:rubric:missing",
                ok: false,
                score: 0,
                message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
                details: { stage: caseEntry.stage }
            });
        }
        else {
            try {
                const invocation = await runJudge({
                    artifact,
                    rubric,
                    config,
                    client,
                    caseHint: expected.judge
                });
                caseCostUsd += invocation.usageUsd;
                verifierResults.push(...judgeResultsToVerifiers(rubric, invocation, config, expected.judge));
            }
            catch (err) {
                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError)
                    throw err;
                const retryable = err instanceof EvalLlmError ? err.retryable : false;
                verifierResults.push({
                    kind: "judge",
                    id: "judge:invocation:error",
                    ok: false,
                    score: 0,
                    message: err instanceof Error ? err.message : String(err),
                    details: { retryable, rubricId: rubric.id }
                });
            }
        }
    }
    // Results flagged `details.skipped` do not count towards pass/fail unless
    // every result is skipped.
    const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
    const decisive = nonSkippedResults.length === 0 ? verifierResults : nonSkippedResults;
    return {
        caseId: caseEntry.id,
        stage: caseEntry.stage,
        mode: plannedMode,
        passed: decisive.every((r) => r.ok),
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults
    };
}
493
/**
 * Aggregate per-case results into run totals.
 *
 * A case counts as skipped when its only verifier result is flagged
 * `details.skipped === true`; otherwise it counts as passed or failed
 * according to `passed`. Cost is summed over cases that report one and
 * rounded to six decimals.
 *
 * @param {object[]} caseResults
 * @returns {{ totalCases: number, passed: number, failed: number, skipped: number, totalCostUsd: number, totalDurationMs: number }}
 */
function reduceSummary(caseResults) {
    const tally = { passed: 0, failed: 0, skipped: 0 };
    let costSum = 0;
    let durationSum = 0;
    for (const entry of caseResults) {
        durationSum += entry.durationMs;
        costSum += entry.costUsd ?? 0;
        const [only] = entry.verifierResults;
        const wasSkipped = entry.verifierResults.length === 1 && only?.details?.skipped === true;
        let bucket = "failed";
        if (wasSkipped) {
            bucket = "skipped";
        }
        else if (entry.passed) {
            bucket = "passed";
        }
        tally[bucket] += 1;
    }
    return {
        totalCases: caseResults.length,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        totalCostUsd: Number(costSum.toFixed(6)),
        totalDurationMs: durationSum
    };
}
521
/**
 * List the stages that occur in the given case results, in the order in
 * which they appear in `FLOW_STAGES`. Stages not in `FLOW_STAGES` are dropped.
 *
 * @param {Array<{ stage: string }>} caseResults
 * @returns {string[]}
 */
function stagesInResults(caseResults) {
    const seen = new Set(caseResults.map((entry) => entry.stage));
    return FLOW_STAGES.filter((stage) => seen.has(stage));
}
527
const MAX_PARALLEL_CASES = 4;
/**
 * Map `worker` over `items` with at most `concurrency` invocations in flight,
 * returning the results in input order.
 *
 * The effective limit is clamped to `[1, items.length]`; a `concurrency` that
 * is not a number (e.g. `undefined` or `NaN`) is treated as 1 so every item
 * is still processed. Once any invocation rejects, no further items are
 * started; invocations already in flight are left to settle and the returned
 * promise rejects with the first error.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrency - Requested maximum number of parallel workers.
 * @param {(item: any, index: number) => Promise<any>} worker
 * @returns {Promise<Array>} Results, index-aligned with `items`.
 */
async function runCasesWithBoundedConcurrency(items, concurrency, worker) {
    if (items.length === 0) {
        return [];
    }
    const numeric = Number(concurrency);
    const requested = Number.isNaN(numeric) ? 1 : Math.floor(numeric);
    const limit = Math.max(1, Math.min(requested, items.length));
    const results = new Array(items.length);
    let cursor = 0;
    let aborted = false;
    // Each lane claims the next unprocessed index until the list is
    // exhausted or a sibling lane has failed.
    const runLane = async () => {
        while (!aborted && cursor < items.length) {
            const index = cursor;
            cursor += 1;
            try {
                results[index] = await worker(items[index], index);
            }
            catch (err) {
                aborted = true;
                throw err;
            }
        }
    };
    await Promise.all(Array.from({ length: limit }, () => runLane()));
    return results;
}
555
/**
 * Main eval runner.
 *
 * Loads the eval config (applying `options.modelOverride` to both the model
 * and the judge model), resolves the run mode as
 * `options.mode ?? config.defaultMode`, and then either
 *   - returns a "dry-run" summary describing what would run, or
 *   - executes every case (workflow cases sequentially; corpus cases with
 *     bounded concurrency, serialised whenever an LLM pass is enabled) and
 *     returns a report including a baseline comparison when one is produced.
 *
 * The run flags are resolved from the same mode the runner dispatches on, so
 * a configured default mode of "workflow" enables the workflow agent even
 * when no mode was passed explicitly.
 *
 * @param {object} options
 * @param {string} options.projectRoot
 * @param {object} [options.env] - Environment for config loading; defaults to `process.env`.
 * @param {string} [options.mode] - Overrides `config.defaultMode`.
 * @param {string} [options.stage] - Forwarded to the corpus loader.
 * @param {string} [options.modelOverride]
 * @param {boolean} [options.rules]
 * @param {boolean} [options.judge]
 * @param {boolean} [options.schemaOnly]
 * @param {boolean} [options.dryRun]
 * @param {number} [options.maxCostUsd] - Per-run cost cap handed to the cost guard.
 * @param {object} [options.llmClient] - Injected client (e.g. for tests).
 * @param {{ emit: Function }} [options.progress] - Progress sink; defaults to a no-op logger.
 * @returns {Promise<object>} Dry-run summary or full report.
 */
export async function runEval(options) {
    const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
    const config = options.modelOverride
        ? {
            ...baseConfig,
            model: options.modelOverride,
            judgeModel: options.modelOverride
        }
        : baseConfig;
    const plannedMode = options.mode ?? config.defaultMode;
    const isWorkflow = plannedMode === "workflow";
    const corpus = isWorkflow ? [] : await loadCorpus(options.projectRoot, options.stage);
    const workflowCorpus = isWorkflow ? await loadWorkflowCorpus(options.projectRoot) : [];
    const notes = [];
    if (!isWorkflow && corpus.length === 0) {
        notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
    }
    if (isWorkflow && workflowCorpus.length === 0) {
        notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
    }
    // Resolve flags against the mode actually being run, not just the one
    // given on the command line; otherwise a config-level default of
    // "workflow" would leave the agent disabled.
    const flags = resolveRunFlags({ ...options, mode: plannedMode });
    const hasCredentials = Boolean(config.apiKey) || Boolean(options.llmClient);
    if (flags.runJudge && !hasCredentials) {
        notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
    }
    if (isWorkflow && !hasCredentials) {
        notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
    }
    if (options.dryRun === true) {
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
            },
            workflowCorpus: {
                total: workflowCorpus.length,
                cases: workflowCorpus.map((item) => ({
                    id: item.id,
                    stages: item.stages.map((s) => s.name)
                }))
            },
            plannedMode,
            verifiersAvailable: {
                structural: flags.runStructural,
                rules: flags.runRules,
                judge: flags.runJudge,
                workflow: flags.runAgent,
                consistency: isWorkflow
            },
            notes
        };
    }
    const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
    const progress = options.progress ?? noopProgressLogger();
    let wrappedClient;
    if (flags.runJudge || isWorkflow) {
        const base = options.llmClient ??
            createEvalClient(config, {
                onRetry: (event) => progress.emit({
                    kind: "retry",
                    caseId: "llm",
                    attempt: event.attempt,
                    maxAttempts: event.maxAttempts,
                    waitMs: event.waitMs,
                    reason: event.error.message
                })
            });
        // Every chat call is committed to the cost guard before its response is used.
        wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
    }
    const rubrics = flags.runJudge
        ? await loadAllRubrics(options.projectRoot)
        : new Map();
    const now = new Date().toISOString();
    const caseResults = [];
    const totalPlannedCases = isWorkflow ? workflowCorpus.length : corpus.length;
    const runStarted = Date.now();
    progress.emit({
        kind: "run-start",
        mode: plannedMode,
        totalCases: totalPlannedCases
    });
    // Shared "case-end" progress event for both execution paths.
    const emitCaseEnd = (caseId, stage, index, total, result) => progress.emit({
        kind: "case-end",
        caseId,
        stage,
        index,
        total,
        passed: result.passed,
        durationMs: result.durationMs,
        ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
    });
    if (isWorkflow) {
        for (let i = 0; i < workflowCorpus.length; i += 1) {
            const wf = workflowCorpus[i];
            progress.emit({
                kind: "case-start",
                caseId: wf.id,
                stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
                index: i + 1,
                total: workflowCorpus.length
            });
            const result = await runWorkflowCase({
                projectRoot: options.projectRoot,
                workflow: wf,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics,
                progress,
                caseIndex: i + 1,
                totalCases: workflowCorpus.length
            });
            emitCaseEnd(wf.id, result.stage, i + 1, workflowCorpus.length, result);
            caseResults.push(result);
        }
    }
    else {
        // Only parallelize fixture/rules verification passes that do not depend on
        // LLM judge/agent loops. Those modes touch cost guards and retries where
        // ordered execution is safer.
        const caseConcurrency = flags.runJudge || flags.runAgent ? 1 : MAX_PARALLEL_CASES;
        const results = await runCasesWithBoundedConcurrency(corpus, caseConcurrency, async (item, i) => {
            progress.emit({
                kind: "case-start",
                caseId: item.id,
                stage: item.stage,
                index: i + 1,
                total: corpus.length
            });
            const result = await runCase({
                projectRoot: options.projectRoot,
                caseEntry: item,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics
            });
            emitCaseEnd(item.id, item.stage, i + 1, corpus.length, result);
            return result;
        });
        caseResults.push(...results);
    }
    const stages = stagesInResults(caseResults);
    const baselines = await loadBaselinesByStage(options.projectRoot, stages);
    const summary = reduceSummary(caseResults);
    const report = {
        schemaVersion: 1,
        generatedAt: now,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        mode: plannedMode,
        stages,
        cases: caseResults,
        summary
    };
    const baselineDelta = compareAgainstBaselines(report, baselines);
    if (baselineDelta)
        report.baselineDelta = baselineDelta;
    progress.emit({
        kind: "run-end",
        totalCases: summary.totalCases,
        passed: summary.passed,
        failed: summary.failed,
        durationMs: Date.now() - runStarted
    });
    return report;
}