cclaw-cli 0.49.0 → 0.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. package/README.md +57 -84
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +55 -17
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +44 -21
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +34 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/language-policy.d.ts +2 -0
  28. package/dist/content/language-policy.js +13 -0
  29. package/dist/content/learnings.d.ts +3 -4
  30. package/dist/content/learnings.js +26 -50
  31. package/dist/content/meta-skill.js +33 -22
  32. package/dist/content/next-command.js +41 -38
  33. package/dist/content/node-hooks.js +17 -345
  34. package/dist/content/opencode-plugin.js +5 -103
  35. package/dist/content/research-playbooks.js +14 -14
  36. package/dist/content/review-loop.d.ts +2 -0
  37. package/dist/content/review-loop.js +8 -0
  38. package/dist/content/session-hooks.js +15 -47
  39. package/dist/content/skills.d.ts +0 -5
  40. package/dist/content/skills.js +55 -128
  41. package/dist/content/stage-common-guidance.d.ts +0 -1
  42. package/dist/content/stage-common-guidance.js +17 -14
  43. package/dist/content/stage-schema.d.ts +26 -1
  44. package/dist/content/stage-schema.js +121 -40
  45. package/dist/content/stages/_lint-metadata/index.js +9 -15
  46. package/dist/content/stages/brainstorm.js +22 -43
  47. package/dist/content/stages/design.js +37 -57
  48. package/dist/content/stages/plan.js +22 -13
  49. package/dist/content/stages/review.js +24 -27
  50. package/dist/content/stages/scope.js +34 -46
  51. package/dist/content/stages/ship.js +7 -4
  52. package/dist/content/stages/spec.js +20 -9
  53. package/dist/content/stages/tdd.js +64 -44
  54. package/dist/content/start-command.js +13 -12
  55. package/dist/content/status-command.d.ts +2 -7
  56. package/dist/content/status-command.js +19 -146
  57. package/dist/content/subagents.d.ts +0 -5
  58. package/dist/content/subagents.js +51 -28
  59. package/dist/content/templates.d.ts +1 -1
  60. package/dist/content/templates.js +126 -135
  61. package/dist/content/track-render-context.d.ts +17 -0
  62. package/dist/content/track-render-context.js +44 -0
  63. package/dist/content/tree-command.d.ts +1 -2
  64. package/dist/content/tree-command.js +4 -87
  65. package/dist/content/utility-skills.d.ts +2 -29
  66. package/dist/content/utility-skills.js +2 -1534
  67. package/dist/content/view-command.js +31 -11
  68. package/dist/delegation.d.ts +1 -1
  69. package/dist/delegation.js +5 -15
  70. package/dist/doctor-registry.js +20 -21
  71. package/dist/doctor.js +88 -344
  72. package/dist/flow-state.d.ts +3 -0
  73. package/dist/flow-state.js +2 -0
  74. package/dist/harness-adapters.d.ts +1 -1
  75. package/dist/harness-adapters.js +51 -58
  76. package/dist/install.js +128 -358
  77. package/dist/internal/advance-stage.js +3 -9
  78. package/dist/internal/compound-readiness.d.ts +1 -1
  79. package/dist/internal/compound-readiness.js +1 -1
  80. package/dist/internal/tdd-loop-status.d.ts +1 -1
  81. package/dist/internal/tdd-loop-status.js +1 -1
  82. package/dist/knowledge-store.d.ts +16 -10
  83. package/dist/knowledge-store.js +51 -15
  84. package/dist/policy.js +16 -105
  85. package/dist/run-archive.d.ts +4 -6
  86. package/dist/run-archive.js +15 -20
  87. package/dist/run-persistence.d.ts +2 -2
  88. package/dist/run-persistence.js +3 -9
  89. package/package.json +1 -2
  90. package/dist/content/archive-command.d.ts +0 -2
  91. package/dist/content/archive-command.js +0 -124
  92. package/dist/content/compound-command.d.ts +0 -5
  93. package/dist/content/compound-command.js +0 -193
  94. package/dist/content/contexts.d.ts +0 -18
  95. package/dist/content/contexts.js +0 -24
  96. package/dist/content/contracts.d.ts +0 -2
  97. package/dist/content/contracts.js +0 -51
  98. package/dist/content/doctor-references.d.ts +0 -2
  99. package/dist/content/doctor-references.js +0 -150
  100. package/dist/content/eval-scaffold.d.ts +0 -15
  101. package/dist/content/eval-scaffold.js +0 -370
  102. package/dist/content/feature-command.d.ts +0 -2
  103. package/dist/content/feature-command.js +0 -123
  104. package/dist/content/flow-map.d.ts +0 -23
  105. package/dist/content/flow-map.js +0 -134
  106. package/dist/content/harness-doc.d.ts +0 -2
  107. package/dist/content/harness-doc.js +0 -202
  108. package/dist/content/harness-playbooks.d.ts +0 -24
  109. package/dist/content/harness-playbooks.js +0 -393
  110. package/dist/content/harness-tool-refs.d.ts +0 -20
  111. package/dist/content/harness-tool-refs.js +0 -268
  112. package/dist/content/ops-command.d.ts +0 -2
  113. package/dist/content/ops-command.js +0 -71
  114. package/dist/content/protocols.d.ts +0 -7
  115. package/dist/content/protocols.js +0 -215
  116. package/dist/content/retro-command.d.ts +0 -2
  117. package/dist/content/retro-command.js +0 -165
  118. package/dist/content/rewind-command.d.ts +0 -2
  119. package/dist/content/rewind-command.js +0 -106
  120. package/dist/content/tdd-log-command.d.ts +0 -2
  121. package/dist/content/tdd-log-command.js +0 -85
  122. package/dist/eval/agents/single-shot.d.ts +0 -27
  123. package/dist/eval/agents/single-shot.js +0 -79
  124. package/dist/eval/agents/with-tools.d.ts +0 -44
  125. package/dist/eval/agents/with-tools.js +0 -261
  126. package/dist/eval/agents/workflow.d.ts +0 -31
  127. package/dist/eval/agents/workflow.js +0 -155
  128. package/dist/eval/baseline.d.ts +0 -38
  129. package/dist/eval/baseline.js +0 -282
  130. package/dist/eval/config-loader.d.ts +0 -14
  131. package/dist/eval/config-loader.js +0 -395
  132. package/dist/eval/corpus.d.ts +0 -30
  133. package/dist/eval/corpus.js +0 -330
  134. package/dist/eval/cost-guard.d.ts +0 -102
  135. package/dist/eval/cost-guard.js +0 -190
  136. package/dist/eval/diff.d.ts +0 -64
  137. package/dist/eval/diff.js +0 -323
  138. package/dist/eval/llm-client.d.ts +0 -176
  139. package/dist/eval/llm-client.js +0 -267
  140. package/dist/eval/mode.d.ts +0 -28
  141. package/dist/eval/mode.js +0 -61
  142. package/dist/eval/progress.d.ts +0 -83
  143. package/dist/eval/progress.js +0 -59
  144. package/dist/eval/report.d.ts +0 -11
  145. package/dist/eval/report.js +0 -181
  146. package/dist/eval/rubric-loader.d.ts +0 -20
  147. package/dist/eval/rubric-loader.js +0 -143
  148. package/dist/eval/runner.d.ts +0 -81
  149. package/dist/eval/runner.js +0 -746
  150. package/dist/eval/runs.d.ts +0 -41
  151. package/dist/eval/runs.js +0 -114
  152. package/dist/eval/sandbox.d.ts +0 -38
  153. package/dist/eval/sandbox.js +0 -137
  154. package/dist/eval/tools/glob.d.ts +0 -2
  155. package/dist/eval/tools/glob.js +0 -163
  156. package/dist/eval/tools/grep.d.ts +0 -2
  157. package/dist/eval/tools/grep.js +0 -152
  158. package/dist/eval/tools/index.d.ts +0 -7
  159. package/dist/eval/tools/index.js +0 -35
  160. package/dist/eval/tools/read.d.ts +0 -2
  161. package/dist/eval/tools/read.js +0 -122
  162. package/dist/eval/tools/types.d.ts +0 -49
  163. package/dist/eval/tools/types.js +0 -41
  164. package/dist/eval/tools/write.d.ts +0 -2
  165. package/dist/eval/tools/write.js +0 -92
  166. package/dist/eval/types.d.ts +0 -561
  167. package/dist/eval/types.js +0 -47
  168. package/dist/eval/verifiers/judge.d.ts +0 -40
  169. package/dist/eval/verifiers/judge.js +0 -256
  170. package/dist/eval/verifiers/rules.d.ts +0 -24
  171. package/dist/eval/verifiers/rules.js +0 -218
  172. package/dist/eval/verifiers/structural.d.ts +0 -14
  173. package/dist/eval/verifiers/structural.js +0 -171
  174. package/dist/eval/verifiers/traceability.d.ts +0 -23
  175. package/dist/eval/verifiers/traceability.js +0 -84
  176. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  177. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  178. package/dist/eval/workflow-corpus.d.ts +0 -7
  179. package/dist/eval/workflow-corpus.js +0 -207
  180. package/dist/feature-system.d.ts +0 -42
  181. package/dist/feature-system.js +0 -432
  182. package/dist/internal/knowledge-digest.d.ts +0 -7
  183. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,746 +0,0 @@
1
- import { randomUUID } from "node:crypto";
2
- import { CCLAW_VERSION } from "../constants.js";
3
- import { FLOW_STAGES } from "../types.js";
4
- import { runSingleShot } from "./agents/single-shot.js";
5
- import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
6
- import { runWorkflow } from "./agents/workflow.js";
7
- import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
8
- import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
9
- import { loadWorkflowCorpus } from "./workflow-corpus.js";
10
- import { loadEvalConfig } from "./config-loader.js";
11
- import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
12
- import { createEvalClient, EvalLlmError } from "./llm-client.js";
13
- import { noopProgressLogger } from "./progress.js";
14
- import { loadAllRubrics } from "./rubric-loader.js";
15
- import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
16
- import { verifyRules } from "./verifiers/rules.js";
17
- import { verifyStructural } from "./verifiers/structural.js";
18
- import { verifyTraceability } from "./verifiers/traceability.js";
19
- import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
/**
 * Count how many corpus cases belong to each stage.
 *
 * @param {Array<{stage: string}>} cases - Case entries; only `stage` is read.
 * @returns {Object<string, number>} Plain object keyed by stage name whose
 *   values are the number of cases seen for that stage.
 */
function groupByStage(cases) {
    const tally = {};
    cases.forEach((entry) => {
        const seenSoFar = tally[entry.stage] ?? 0;
        tally[entry.stage] = seenSoFar + 1;
    });
    return tally;
}
/**
 * Build the passing placeholder result used when the structural verifier has
 * nothing to check for a case.
 *
 * @param {string} message - Explanation stored on the result.
 * @param {object} [details] - Optional payload; the `details` key is omitted
 *   entirely when this argument is `undefined`.
 * @returns {object} A structural verifier result with `ok: true` and `score: 1`.
 */
function skeletonVerifierResult(message, details) {
    const placeholder = {
        kind: "structural",
        id: "structural:no-expectations",
        ok: true,
        score: 1,
        message
    };
    if (details !== undefined) {
        placeholder.details = details;
    }
    return placeholder;
}
/**
 * Translate CLI-style options into the set of verifier passes to execute.
 *
 * - The structural pass is always on.
 * - `rules: true` enables both the rules and the traceability passes.
 * - `judge: true` enables the LLM judge.
 * - `schemaOnly: true` overrides everything except the structural pass, so a
 *   schema-only run never enables an LLM-backed pass even if other flags are
 *   also set.
 * - The agent loop runs in `workflow` mode whenever `schemaOnly` is off (no
 *   `judge` flag needed); in `fixture` / `agent` mode it runs only together
 *   with the judge; any other mode leaves it off.
 *
 * @param {{rules?: boolean, schemaOnly?: boolean, judge?: boolean, mode?: string}} options
 *   Only strict `true` values count as set; `mode` defaults to `"fixture"`.
 * @returns {{runStructural: boolean, runRules: boolean, runTraceability: boolean,
 *   runJudge: boolean, runAgent: boolean}}
 */
function resolveRunFlags(options) {
    const unrestricted = options.schemaOnly !== true;
    const runRules = unrestricted && options.rules === true;
    const runJudge = unrestricted && options.judge === true;
    let runAgent;
    switch (options.mode ?? "fixture") {
        case "workflow":
            // Workflow runs have no fixture fallback, so the agent loop is
            // needed even when the judge is off.
            runAgent = unrestricted;
            break;
        case "fixture":
        case "agent":
            runAgent = runJudge;
            break;
        default:
            runAgent = false;
    }
    return {
        runStructural: true,
        runRules,
        runTraceability: runRules,
        runJudge,
        runAgent
    };
}
/**
 * Decorate an LLM client so that each `chat()` response is committed to the
 * cost guard before the caller sees it.
 *
 * The guard's `commit` is awaited, so anything it throws (for example a
 * cost-cap error) rejects the `chat()` call instead of being swallowed.
 *
 * @param {{chat: Function}} client - Underlying client; only `chat` is used.
 * @param {{commit: Function}} costGuard - Receives `(model, usage)` per response.
 * @param {string} fallbackModel - Model name billed when the response carries
 *   no (or an empty) `model`.
 * @returns {{chat: Function}} Client exposing the same `chat(request)` call.
 */
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
    const guarded = {
        async chat(request) {
            const reply = await client.chat(request);
            const billedModel = reply.model || fallbackModel;
            await costGuard.commit(billedModel, reply.usage);
            return reply;
        }
    };
    return guarded;
}
/**
 * Read the fixture artifact for a case, converting a read failure into a
 * failed verifier result instead of an exception.
 *
 * @param {string} projectRoot - Forwarded to `readFixtureArtifact`.
 * @param {object} caseEntry - Corpus case; `fixture` is echoed in the failure details.
 * @param {Array<object>} verifierResults - Mutated: on failure a
 *   `structural:fixture:missing` result is appended.
 * @returns {Promise<*>} Whatever `readFixtureArtifact` resolves to, or
 *   `undefined` when it throws.
 */
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
    let artifact;
    try {
        artifact = await readFixtureArtifact(projectRoot, caseEntry);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        verifierResults.push({
            kind: "structural",
            id: "structural:fixture:missing",
            ok: false,
            score: 0,
            message: reason,
            details: { fixture: caseEntry.fixture }
        });
    }
    return artifact;
}
/**
 * Extract the judge-related overrides declared on a workflow step.
 *
 * @param {object} step - Workflow stage definition.
 * @returns {object} Object holding only those of `rubric`, `requiredChecks`
 *   and `minimumScores` that are truthy on `step`.
 */
function stageJudgeHint(step) {
    const hint = {};
    for (const field of ["rubric", "requiredChecks", "minimumScores"]) {
        if (step[field]) {
            hint[field] = step[field];
        }
    }
    return hint;
}
/** True when `err` is a cost-cap error, which must abort the run rather than fail one case. */
function isWorkflowCostCapError(err) {
    return err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError;
}
/** Render any thrown value as a message string. */
function workflowErrorMessage(err) {
    return err instanceof Error ? err.message : String(err);
}
/**
 * Judge a single workflow stage and append the resulting verifier entries.
 *
 * @param {object} step - Declared workflow stage (`name` plus optional judge hints).
 * @param {object|undefined} stageResult - Result the workflow run produced for
 *   this stage; mutated with `judgeOk` / `judgeMedians`. May be missing when
 *   the run returned fewer stage results than the workflow declares.
 * @param {{rubrics: Map, config: object, client: object}} judgeCtx
 * @param {Array<object>} verifierResults - Mutated with the stage's results.
 * @returns {Promise<{ok: boolean, usageUsd: number}>} Whether every judge
 *   verifier passed, and the judge cost to add to the case total.
 * @throws Cost-cap errors are rethrown untouched.
 */
async function judgeWorkflowStage(step, stageResult, judgeCtx, verifierResults) {
    const { rubrics, config, client } = judgeCtx;
    const rubric = rubrics.get(step.name);
    if (!rubric) {
        verifierResults.push({
            kind: "judge",
            id: `judge:rubric:missing:${step.name}`,
            ok: false,
            score: 0,
            message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
            details: { stage: step.name }
        });
        if (stageResult) {
            stageResult.judgeOk = false;
        }
        return { ok: false, usageUsd: 0 };
    }
    if (!stageResult) {
        // Without this guard the dereferences below throw a TypeError that
        // escapes the case and aborts the whole eval run.
        verifierResults.push({
            kind: "judge",
            id: `judge:artifact:missing:${step.name}`,
            ok: false,
            score: 0,
            message: `Workflow produced no result for stage ${step.name}; nothing to judge.`,
            details: { stage: step.name }
        });
        return { ok: false, usageUsd: 0 };
    }
    const hint = stageJudgeHint(step);
    // Tracked outside the try so a judge call that was paid for still counts
    // when post-processing of its result throws.
    let usageUsd = 0;
    try {
        const invocation = await runJudge({
            artifact: stageResult.artifact,
            rubric,
            config,
            client,
            caseHint: hint
        });
        usageUsd = invocation.usageUsd;
        const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
        const medians = {};
        for (const agg of invocation.aggregates) {
            medians[agg.checkId] = agg.median;
        }
        stageResult.judgeMedians = medians;
        const stageOk = judgeVerifiers.every((v) => v.ok);
        stageResult.judgeOk = stageOk;
        for (const v of judgeVerifiers) {
            // Suffix ids with the stage so results from different stages stay distinct.
            verifierResults.push({
                ...v,
                id: `${v.id}:${step.name}`,
                details: { ...(v.details ?? {}), stage: step.name }
            });
        }
        return { ok: stageOk, usageUsd };
    }
    catch (err) {
        if (isWorkflowCostCapError(err)) {
            throw err;
        }
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        verifierResults.push({
            kind: "judge",
            id: `judge:invocation:error:${step.name}`,
            ok: false,
            score: 0,
            message: workflowErrorMessage(err),
            details: { retryable, rubricId: rubric.id, stage: step.name }
        });
        stageResult.judgeOk = false;
        return { ok: false, usageUsd };
    }
}
/**
 * Execute one multi-stage workflow case: run the agent workflow, optionally
 * judge each stage, then run the cross-stage consistency verifier.
 *
 * The reported `stage` of the case is the name of the workflow's last stage
 * (`"plan"` when the workflow declares none).
 *
 * @param {object} ctx - `projectRoot`, `workflow`, `plannedMode`, `flags`,
 *   `config`, `client`, `rubrics`, `progress`, `caseIndex`, `totalCases`.
 * @returns {Promise<object>} Case result with `caseId`, `stage`, `mode`,
 *   `passed`, `durationMs`, `verifierResults` and, when the workflow actually
 *   ran, `costUsd` and a `workflow` summary.
 * @throws DailyCostCapExceededError / RunCostCapExceededError are propagated;
 *   any other workflow or judge failure is recorded as a failed verifier result.
 */
async function runWorkflowCase(ctx) {
    const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
    const started = Date.now();
    const verifierResults = [];
    let caseCostUsd = 0;
    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ?? "plan";
    // Shape shared by both early-exit failure paths.
    const failedCase = () => ({
        caseId: workflow.id,
        stage: lastStage,
        mode: plannedMode,
        passed: false,
        durationMs: Date.now() - started,
        verifierResults
    });
    if (!flags.runAgent || !client) {
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:disabled",
            ok: false,
            score: 0,
            message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                "Re-run with credentials to execute the workflow.",
            details: { stages: workflow.stages.map((s) => s.name) }
        });
        return failedCase();
    }
    let workflowResult;
    try {
        workflowResult = await runWorkflow({
            workflow,
            config,
            projectRoot,
            client,
            onStageStart: (stage) => progress.emit({
                kind: "stage-start",
                caseId: workflow.id,
                stage,
                index: caseIndex,
                total: totalCases
            }),
            onStageEnd: (stage, stageResult) => progress.emit({
                kind: "stage-end",
                caseId: workflow.id,
                stage,
                index: caseIndex,
                total: totalCases,
                passed: true,
                durationMs: stageResult.durationMs,
                ...(stageResult.usageUsd > 0 ? { costUsd: stageResult.usageUsd } : {})
            })
        });
    }
    catch (err) {
        if (isWorkflowCostCapError(err)) {
            throw err;
        }
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:error",
            ok: false,
            score: 0,
            message: workflowErrorMessage(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
        return failedCase();
    }
    caseCostUsd += workflowResult.totalUsageUsd;
    const stageResults = [...workflowResult.stages];
    verifierResults.push({
        kind: "workflow",
        id: "workflow:agent",
        ok: true,
        score: 1,
        message: `workflow ran ${stageResults.length} stage(s) in ` +
            `${workflowResult.totalDurationMs}ms ` +
            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
        details: {
            stages: stageResults.map((s) => ({
                name: s.stage,
                durationMs: s.durationMs,
                usageUsd: s.usageUsd,
                turns: s.toolUse.turns,
                calls: s.toolUse.calls
            }))
        }
    });
    let allJudgeOk = true;
    if (flags.runJudge) {
        // Stages are judged one after another, pairing declared stage i with
        // stage result i.
        for (let i = 0; i < workflow.stages.length; i += 1) {
            const outcome = await judgeWorkflowStage(workflow.stages[i], stageResults[i], { rubrics, config, client }, verifierResults);
            caseCostUsd += outcome.usageUsd;
            if (!outcome.ok) {
                allJudgeOk = false;
            }
        }
    }
    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
    verifierResults.push(...consistencyResults);
    // Skipped results do not count against the case unless nothing else ran.
    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
    const allOk = nonSkipped.length === 0
        ? verifierResults.every((r) => r.ok)
        : nonSkipped.every((r) => r.ok);
    const workflowSummary = {
        caseId: workflow.id,
        stages: stageResults,
        totalUsageUsd: workflowResult.totalUsageUsd,
        totalDurationMs: workflowResult.totalDurationMs,
        allJudgeOk: flags.runJudge ? allJudgeOk : true
    };
    return {
        caseId: workflow.id,
        stage: lastStage,
        mode: plannedMode,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults,
        workflow: workflowSummary
    };
}
/**
 * Ask an agent to draft the artifact for a case and record the outcome.
 *
 * @param {string} plannedMode - `"agent"` selects the with-tools agent; any
 *   other value selects the single-shot agent.
 * @param {{caseEntry: object, config: object, projectRoot: string, client: object}} request
 *   Passed unchanged to the agent runner.
 * @param {Array<object>} verifierResults - Mutated with one `agent:*` result
 *   (passing on success, failing on error).
 * @returns {Promise<{artifact: *, usageUsd: number}>} Drafted artifact
 *   (`undefined` when the agent failed) and the cost to add to the case.
 * @throws Cost-cap errors are rethrown untouched.
 */
async function draftCaseArtifact(plannedMode, request, verifierResults) {
    const withTools = plannedMode === "agent";
    const id = withTools ? "agent:with-tools" : "agent:single-shot";
    let artifact;
    let usageUsd = 0;
    try {
        const produced = withTools ? await runWithTools(request) : await runSingleShot(request);
        // Captured before the result is formatted so a failure while building
        // the message still leaves the artifact and its cost accounted for.
        artifact = produced.artifact;
        usageUsd = produced.usageUsd;
        const message = withTools
            ? `with-tools agent produced ${produced.artifact.length} char(s) in ` +
                `${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
                `(${produced.toolUse.calls} tool call(s))`
            : `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`;
        verifierResults.push({
            kind: "workflow",
            id,
            ok: true,
            score: 1,
            message,
            details: {
                model: produced.model,
                tokensIn: produced.usage.promptTokens,
                tokensOut: produced.usage.completionTokens,
                usageUsd: produced.usageUsd,
                attempts: produced.attempts,
                ...(withTools ? { toolUse: produced.toolUse } : {})
            }
        });
    }
    catch (err) {
        if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError) {
            throw err;
        }
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        // Only the with-tools loop reports an exhausted turn budget.
        const maxTurns = withTools && err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id,
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
    }
    return { artifact, usageUsd };
}
/**
 * Execute one single-stage corpus case.
 *
 * The artifact under test is drafted by an agent when the agent loop and the
 * judge are both active for this case (single-shot in `fixture` mode,
 * with-tools in `agent` mode); otherwise it is read from the case's fixture.
 * The structural, rules, traceability and judge verifiers then run according
 * to `flags` and to which expectations the case declares.
 *
 * @param {object} ctx - `projectRoot`, `caseEntry`, `plannedMode`, `flags`,
 *   `config`, `client`, `rubrics` (other keys are ignored).
 * @returns {Promise<object>} Case result with `caseId`, `stage`, `mode`,
 *   `passed`, `durationMs`, `costUsd` (`undefined` when nothing was spent) and
 *   `verifierResults`.
 * @throws DailyCostCapExceededError / RunCostCapExceededError are propagated;
 *   other agent, fixture and judge failures become failed verifier results.
 */
async function runCase(ctx) {
    const { projectRoot, caseEntry, plannedMode, flags, config, client, rubrics } = ctx;
    const started = Date.now();
    const verifierResults = [];
    const expected = caseEntry.expected;
    let caseCostUsd = 0;
    const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
    const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
    const hasTraceability = flags.runTraceability && !!expected?.traceability;
    const judgeRequested = flags.runJudge && !!expected?.judge;
    const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
    let artifact;
    if (needsArtifact) {
        const agentDrafts = flags.runAgent && judgeRequested && client &&
            (plannedMode === "fixture" || plannedMode === "agent");
        if (agentDrafts) {
            const draft = await draftCaseArtifact(plannedMode, { caseEntry, config, projectRoot, client }, verifierResults);
            artifact = draft.artifact;
            caseCostUsd += draft.usageUsd;
        }
        else {
            artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
        }
        // Nothing produced and nothing recorded: no fixture was available.
        if (artifact === undefined && verifierResults.length === 0) {
            verifierResults.push({
                kind: "structural",
                id: "structural:fixture:absent",
                ok: false,
                score: 0,
                message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
                details: { fixtureProvided: false }
            });
        }
    }
    if (flags.runStructural) {
        if (!hasStructural) {
            verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
        }
        else if (artifact !== undefined) {
            const results = verifyStructural(artifact, expected.structural);
            if (results.length === 0) {
                verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
            }
            else {
                verifierResults.push(...results);
            }
        }
    }
    if (hasRules && artifact !== undefined) {
        verifierResults.push(...verifyRules(artifact, expected.rules));
    }
    if (hasTraceability && artifact !== undefined) {
        try {
            const extras = await readExtraFixtures(projectRoot, caseEntry);
            verifierResults.push(...verifyTraceability(artifact, extras, expected.traceability));
        }
        catch (err) {
            verifierResults.push({
                kind: "rules",
                id: "traceability:fixture:missing",
                ok: false,
                score: 0,
                message: err instanceof Error ? err.message : String(err),
                details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
            });
        }
    }
    if (judgeRequested && artifact !== undefined && client) {
        const rubric = rubrics.get(caseEntry.stage);
        if (!rubric) {
            verifierResults.push({
                kind: "judge",
                id: "judge:rubric:missing",
                ok: false,
                score: 0,
                message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
                details: { stage: caseEntry.stage }
            });
        }
        else {
            try {
                const invocation = await runJudge({
                    artifact,
                    rubric,
                    config,
                    client,
                    caseHint: expected.judge
                });
                caseCostUsd += invocation.usageUsd;
                verifierResults.push(...judgeResultsToVerifiers(rubric, invocation, config, expected.judge));
            }
            catch (err) {
                if (err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError) {
                    throw err;
                }
                const retryable = err instanceof EvalLlmError ? err.retryable : false;
                verifierResults.push({
                    kind: "judge",
                    id: "judge:invocation:error",
                    ok: false,
                    score: 0,
                    message: err instanceof Error ? err.message : String(err),
                    details: { retryable, rubricId: rubric.id }
                });
            }
        }
    }
    // Skipped results do not count against the case unless nothing else ran.
    const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
    const allOk = nonSkippedResults.length === 0
        ? verifierResults.every((r) => r.ok)
        : nonSkippedResults.every((r) => r.ok);
    return {
        caseId: caseEntry.id,
        stage: caseEntry.stage,
        mode: plannedMode,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults
    };
}
/**
 * Aggregate per-case results into run totals.
 *
 * A case counts as skipped only when its single verifier result is flagged
 * `details.skipped === true`; every other case is counted as passed or failed
 * from its `passed` flag. Duration and cost are summed over all cases,
 * skipped ones included.
 *
 * @param {Array<object>} caseResults - Case results carrying `durationMs`,
 *   optional `costUsd`, `passed` and `verifierResults`.
 * @returns {{totalCases: number, passed: number, failed: number, skipped: number,
 *   totalCostUsd: number, totalDurationMs: number}} Totals, with the cost
 *   rounded to six decimals.
 */
function reduceSummary(caseResults) {
    const counts = { passed: 0, failed: 0, skipped: 0 };
    let costSum = 0;
    let durationSum = 0;
    for (const outcome of caseResults) {
        durationSum += outcome.durationMs;
        if (outcome.costUsd !== undefined) {
            costSum += outcome.costUsd;
        }
        const checks = outcome.verifierResults;
        const onlySkipped = checks.length === 1 && checks[0]?.details?.skipped === true;
        let bucket;
        if (onlySkipped) {
            bucket = "skipped";
        }
        else {
            bucket = outcome.passed ? "passed" : "failed";
        }
        counts[bucket] += 1;
    }
    return {
        totalCases: caseResults.length,
        passed: counts.passed,
        failed: counts.failed,
        skipped: counts.skipped,
        totalCostUsd: Number(costSum.toFixed(6)),
        totalDurationMs: durationSum
    };
}
/**
 * List the stages that appear in the given case results, in `FLOW_STAGES` order.
 *
 * @param {Iterable<{stage: string}>} caseResults - Case results; only `stage` is read.
 * @returns {Array<string>} The entries of `FLOW_STAGES` that at least one case
 *   used; stage names not present in `FLOW_STAGES` are dropped.
 */
function stagesInResults(caseResults) {
    const present = new Set(Array.from(caseResults, (outcome) => outcome.stage));
    return FLOW_STAGES.filter((stage) => present.has(stage));
}
/** Number of cases `runEval` verifies in parallel when no LLM judge/agent loop is active. */
const MAX_PARALLEL_CASES = 4;
/**
 * Run `worker` over every item with at most `concurrency` invocations in
 * flight, returning the results in input order.
 *
 * @param {Array} items - Inputs; an empty array resolves to `[]` immediately.
 * @param {number} concurrency - Desired parallelism. Clamped to
 *   `[1, items.length]`; a missing or non-numeric value falls back to 1
 *   (strictly sequential).
 * @param {(item: *, index: number) => Promise<*>} worker - Called once per item.
 * @returns {Promise<Array>} `results[i]` is what `worker(items[i], i)` resolved to.
 * @throws Rejects with the first worker rejection (via `Promise.all`); a
 *   runner that hits a rejection picks up no further items.
 */
async function runCasesWithBoundedConcurrency(items, concurrency, worker) {
    if (items.length === 0) {
        return [];
    }
    // Math.min/Math.max propagate NaN, so without this normalization an
    // undefined or NaN concurrency produced zero runners and the function
    // resolved to an array of holes without ever calling the worker.
    const numeric = Number(concurrency);
    const requested = Number.isNaN(numeric) ? 1 : Math.floor(numeric);
    const limit = Math.max(1, Math.min(requested, items.length));
    const results = new Array(items.length);
    let cursor = 0;
    // Each runner repeatedly claims the next unclaimed index. Claiming is
    // synchronous, so two runners never take the same item. A single runner
    // degenerates to plain sequential execution.
    const runners = Array.from({ length: limit }, async () => {
        while (cursor < items.length) {
            const index = cursor;
            cursor += 1;
            results[index] = await worker(items[index], index);
        }
    });
    await Promise.all(runners);
    return results;
}
/**
 * Main eval runner.
 *
 * Loads the eval config, resolves the run mode (`options.mode`, falling back
 * to `config.defaultMode`), loads the matching corpus, and then either
 * returns a dry-run summary or executes every case and builds a report.
 * In `workflow` mode the workflow corpus is run case by case; in the other
 * modes the single-stage corpus is run, in parallel only when neither the
 * judge nor the agent loop is active. The report is compared against the
 * per-stage baselines of the stages that actually produced results.
 *
 * @param {object} options - Reads `projectRoot`, `env`, `modelOverride`,
 *   `mode`, `stage`, `rules`, `schemaOnly`, `judge`, `dryRun`, `maxCostUsd`,
 *   `progress` and `llmClient`.
 * @returns {Promise<object>} A `kind: "dry-run"` summary when
 *   `options.dryRun === true`, otherwise the run report (`schemaVersion`,
 *   `runId`, `cases`, `summary`, and `baselineDelta` when one was computed).
 * @throws Cost-cap errors raised while running cases propagate to the caller.
 */
export async function runEval(options) {
    const baseConfig = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
    const config = options.modelOverride
        ? {
            ...baseConfig,
            model: options.modelOverride,
            judgeModel: options.modelOverride
        }
        : baseConfig;
    const plannedMode = options.mode ?? config.defaultMode;
    const isWorkflow = plannedMode === "workflow";
    const corpus = isWorkflow ? [] : await loadCorpus(options.projectRoot, options.stage);
    const workflowCorpus = isWorkflow ? await loadWorkflowCorpus(options.projectRoot) : [];
    const notes = [];
    if (!isWorkflow && corpus.length === 0) {
        notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
    }
    if (isWorkflow && workflowCorpus.length === 0) {
        notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
    }
    // Resolve flags against the mode that will actually run. Passing `options`
    // alone made the flags assume "fixture" whenever the mode came from
    // config.defaultMode, which disabled the agent loop for config-selected
    // workflow runs and misreported it in the dry-run summary.
    const flags = resolveRunFlags({ ...options, mode: plannedMode });
    const hasCredentials = Boolean(config.apiKey) || Boolean(options.llmClient);
    if (flags.runJudge && !hasCredentials) {
        notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
    }
    if (isWorkflow && !hasCredentials) {
        notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
    }
    if (options.dryRun === true) {
        const summary = {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
            },
            workflowCorpus: {
                total: workflowCorpus.length,
                cases: workflowCorpus.map((item) => ({
                    id: item.id,
                    stages: item.stages.map((s) => s.name)
                }))
            },
            plannedMode,
            verifiersAvailable: {
                structural: flags.runStructural,
                rules: flags.runRules,
                judge: flags.runJudge,
                workflow: flags.runAgent,
                consistency: isWorkflow
            },
            notes
        };
        return summary;
    }
    const costGuard = createCostGuard(options.projectRoot, config, options.maxCostUsd !== undefined ? { runCapUsd: options.maxCostUsd } : {});
    const progress = options.progress ?? noopProgressLogger();
    let wrappedClient;
    const clientNeeded = flags.runJudge || isWorkflow;
    if (clientNeeded) {
        const base = options.llmClient ??
            createEvalClient(config, {
                onRetry: (event) => progress.emit({
                    kind: "retry",
                    caseId: "llm",
                    attempt: event.attempt,
                    maxAttempts: event.maxAttempts,
                    waitMs: event.waitMs,
                    reason: event.error.message
                })
            });
        wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
    }
    // Rubrics are only read when the judge will run.
    const rubrics = flags.runJudge
        ? await loadAllRubrics(options.projectRoot)
        : new Map();
    const now = new Date().toISOString();
    const caseResults = [];
    const totalPlannedCases = isWorkflow ? workflowCorpus.length : corpus.length;
    const runStarted = Date.now();
    progress.emit({
        kind: "run-start",
        mode: plannedMode,
        totalCases: totalPlannedCases
    });
    if (isWorkflow) {
        for (let i = 0; i < workflowCorpus.length; i += 1) {
            const wf = workflowCorpus[i];
            progress.emit({
                kind: "case-start",
                caseId: wf.id,
                stage: wf.stages[wf.stages.length - 1]?.name ?? "workflow",
                index: i + 1,
                total: workflowCorpus.length
            });
            const result = await runWorkflowCase({
                projectRoot: options.projectRoot,
                workflow: wf,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics,
                progress,
                caseIndex: i + 1,
                totalCases: workflowCorpus.length
            });
            progress.emit({
                kind: "case-end",
                caseId: wf.id,
                stage: result.stage,
                index: i + 1,
                total: workflowCorpus.length,
                passed: result.passed,
                durationMs: result.durationMs,
                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
            });
            caseResults.push(result);
        }
    }
    else {
        // Only parallelize fixture/rules verification passes that do not depend on
        // LLM judge/agent loops. Those modes touch cost guards and retries where
        // ordered execution is safer.
        const caseConcurrency = flags.runJudge || flags.runAgent ? 1 : MAX_PARALLEL_CASES;
        const results = await runCasesWithBoundedConcurrency(corpus, caseConcurrency, async (item, i) => {
            progress.emit({
                kind: "case-start",
                caseId: item.id,
                stage: item.stage,
                index: i + 1,
                total: corpus.length
            });
            const result = await runCase({
                projectRoot: options.projectRoot,
                caseEntry: item,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics
            });
            progress.emit({
                kind: "case-end",
                caseId: item.id,
                stage: item.stage,
                index: i + 1,
                total: corpus.length,
                passed: result.passed,
                durationMs: result.durationMs,
                ...(result.costUsd !== undefined ? { costUsd: result.costUsd } : {})
            });
            return result;
        });
        caseResults.push(...results);
    }
    const stages = stagesInResults(caseResults);
    const baselines = await loadBaselinesByStage(options.projectRoot, stages);
    const summary = reduceSummary(caseResults);
    const report = {
        schemaVersion: 1,
        generatedAt: now,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        mode: plannedMode,
        stages,
        cases: caseResults,
        summary
    };
    const baselineDelta = compareAgainstBaselines(report, baselines);
    if (baselineDelta) {
        report.baselineDelta = baselineDelta;
    }
    progress.emit({
        kind: "run-end",
        totalCases: summary.totalCases,
        passed: summary.passed,
        failed: summary.failed,
        durationMs: Date.now() - runStarted
    });
    return report;
}