cclaw-cli 0.49.0 → 0.51.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -84
- package/dist/artifact-linter.d.ts +4 -0
- package/dist/artifact-linter.js +24 -3
- package/dist/cli.d.ts +1 -19
- package/dist/cli.js +49 -491
- package/dist/constants.d.ts +2 -13
- package/dist/constants.js +1 -43
- package/dist/content/closeout-guidance.d.ts +14 -0
- package/dist/content/closeout-guidance.js +42 -0
- package/dist/content/core-agents.js +55 -17
- package/dist/content/decision-protocol.d.ts +12 -0
- package/dist/content/decision-protocol.js +20 -0
- package/dist/content/diff-command.d.ts +1 -2
- package/dist/content/diff-command.js +8 -94
- package/dist/content/examples.d.ts +4 -10
- package/dist/content/examples.js +10 -20
- package/dist/content/hook-events.js +2 -2
- package/dist/content/hook-inline-snippets.d.ts +5 -2
- package/dist/content/hook-inline-snippets.js +33 -1
- package/dist/content/hook-manifest.d.ts +3 -4
- package/dist/content/hook-manifest.js +11 -12
- package/dist/content/hooks.js +44 -21
- package/dist/content/ideate-command.d.ts +2 -0
- package/dist/content/ideate-command.js +34 -25
- package/dist/content/iron-laws.d.ts +5 -5
- package/dist/content/iron-laws.js +5 -5
- package/dist/content/language-policy.d.ts +2 -0
- package/dist/content/language-policy.js +13 -0
- package/dist/content/learnings.d.ts +3 -4
- package/dist/content/learnings.js +26 -50
- package/dist/content/meta-skill.js +33 -22
- package/dist/content/next-command.js +41 -38
- package/dist/content/node-hooks.js +17 -345
- package/dist/content/opencode-plugin.js +5 -103
- package/dist/content/research-playbooks.js +14 -14
- package/dist/content/review-loop.d.ts +2 -0
- package/dist/content/review-loop.js +8 -0
- package/dist/content/session-hooks.js +15 -47
- package/dist/content/skills.d.ts +0 -5
- package/dist/content/skills.js +55 -128
- package/dist/content/stage-common-guidance.d.ts +0 -1
- package/dist/content/stage-common-guidance.js +17 -14
- package/dist/content/stage-schema.d.ts +26 -1
- package/dist/content/stage-schema.js +121 -40
- package/dist/content/stages/_lint-metadata/index.js +9 -15
- package/dist/content/stages/brainstorm.js +22 -43
- package/dist/content/stages/design.js +37 -57
- package/dist/content/stages/plan.js +22 -13
- package/dist/content/stages/review.js +24 -27
- package/dist/content/stages/scope.js +34 -46
- package/dist/content/stages/ship.js +7 -4
- package/dist/content/stages/spec.js +20 -9
- package/dist/content/stages/tdd.js +64 -44
- package/dist/content/start-command.js +13 -12
- package/dist/content/status-command.d.ts +2 -7
- package/dist/content/status-command.js +19 -146
- package/dist/content/subagents.d.ts +0 -5
- package/dist/content/subagents.js +51 -28
- package/dist/content/templates.d.ts +1 -1
- package/dist/content/templates.js +126 -135
- package/dist/content/track-render-context.d.ts +17 -0
- package/dist/content/track-render-context.js +44 -0
- package/dist/content/tree-command.d.ts +1 -2
- package/dist/content/tree-command.js +4 -87
- package/dist/content/utility-skills.d.ts +2 -29
- package/dist/content/utility-skills.js +2 -1534
- package/dist/content/view-command.js +31 -11
- package/dist/delegation.d.ts +1 -1
- package/dist/delegation.js +5 -15
- package/dist/doctor-registry.js +20 -21
- package/dist/doctor.js +88 -344
- package/dist/flow-state.d.ts +3 -0
- package/dist/flow-state.js +2 -0
- package/dist/harness-adapters.d.ts +1 -1
- package/dist/harness-adapters.js +51 -58
- package/dist/install.js +128 -358
- package/dist/internal/advance-stage.js +3 -9
- package/dist/internal/compound-readiness.d.ts +1 -1
- package/dist/internal/compound-readiness.js +1 -1
- package/dist/internal/tdd-loop-status.d.ts +1 -1
- package/dist/internal/tdd-loop-status.js +1 -1
- package/dist/knowledge-store.d.ts +16 -10
- package/dist/knowledge-store.js +51 -15
- package/dist/policy.js +16 -105
- package/dist/run-archive.d.ts +4 -6
- package/dist/run-archive.js +15 -20
- package/dist/run-persistence.d.ts +2 -2
- package/dist/run-persistence.js +3 -9
- package/package.json +1 -2
- package/dist/content/archive-command.d.ts +0 -2
- package/dist/content/archive-command.js +0 -124
- package/dist/content/compound-command.d.ts +0 -5
- package/dist/content/compound-command.js +0 -193
- package/dist/content/contexts.d.ts +0 -18
- package/dist/content/contexts.js +0 -24
- package/dist/content/contracts.d.ts +0 -2
- package/dist/content/contracts.js +0 -51
- package/dist/content/doctor-references.d.ts +0 -2
- package/dist/content/doctor-references.js +0 -150
- package/dist/content/eval-scaffold.d.ts +0 -15
- package/dist/content/eval-scaffold.js +0 -370
- package/dist/content/feature-command.d.ts +0 -2
- package/dist/content/feature-command.js +0 -123
- package/dist/content/flow-map.d.ts +0 -23
- package/dist/content/flow-map.js +0 -134
- package/dist/content/harness-doc.d.ts +0 -2
- package/dist/content/harness-doc.js +0 -202
- package/dist/content/harness-playbooks.d.ts +0 -24
- package/dist/content/harness-playbooks.js +0 -393
- package/dist/content/harness-tool-refs.d.ts +0 -20
- package/dist/content/harness-tool-refs.js +0 -268
- package/dist/content/ops-command.d.ts +0 -2
- package/dist/content/ops-command.js +0 -71
- package/dist/content/protocols.d.ts +0 -7
- package/dist/content/protocols.js +0 -215
- package/dist/content/retro-command.d.ts +0 -2
- package/dist/content/retro-command.js +0 -165
- package/dist/content/rewind-command.d.ts +0 -2
- package/dist/content/rewind-command.js +0 -106
- package/dist/content/tdd-log-command.d.ts +0 -2
- package/dist/content/tdd-log-command.js +0 -85
- package/dist/eval/agents/single-shot.d.ts +0 -27
- package/dist/eval/agents/single-shot.js +0 -79
- package/dist/eval/agents/with-tools.d.ts +0 -44
- package/dist/eval/agents/with-tools.js +0 -261
- package/dist/eval/agents/workflow.d.ts +0 -31
- package/dist/eval/agents/workflow.js +0 -155
- package/dist/eval/baseline.d.ts +0 -38
- package/dist/eval/baseline.js +0 -282
- package/dist/eval/config-loader.d.ts +0 -14
- package/dist/eval/config-loader.js +0 -395
- package/dist/eval/corpus.d.ts +0 -30
- package/dist/eval/corpus.js +0 -330
- package/dist/eval/cost-guard.d.ts +0 -102
- package/dist/eval/cost-guard.js +0 -190
- package/dist/eval/diff.d.ts +0 -64
- package/dist/eval/diff.js +0 -323
- package/dist/eval/llm-client.d.ts +0 -176
- package/dist/eval/llm-client.js +0 -267
- package/dist/eval/mode.d.ts +0 -28
- package/dist/eval/mode.js +0 -61
- package/dist/eval/progress.d.ts +0 -83
- package/dist/eval/progress.js +0 -59
- package/dist/eval/report.d.ts +0 -11
- package/dist/eval/report.js +0 -181
- package/dist/eval/rubric-loader.d.ts +0 -20
- package/dist/eval/rubric-loader.js +0 -143
- package/dist/eval/runner.d.ts +0 -81
- package/dist/eval/runner.js +0 -746
- package/dist/eval/runs.d.ts +0 -41
- package/dist/eval/runs.js +0 -114
- package/dist/eval/sandbox.d.ts +0 -38
- package/dist/eval/sandbox.js +0 -137
- package/dist/eval/tools/glob.d.ts +0 -2
- package/dist/eval/tools/glob.js +0 -163
- package/dist/eval/tools/grep.d.ts +0 -2
- package/dist/eval/tools/grep.js +0 -152
- package/dist/eval/tools/index.d.ts +0 -7
- package/dist/eval/tools/index.js +0 -35
- package/dist/eval/tools/read.d.ts +0 -2
- package/dist/eval/tools/read.js +0 -122
- package/dist/eval/tools/types.d.ts +0 -49
- package/dist/eval/tools/types.js +0 -41
- package/dist/eval/tools/write.d.ts +0 -2
- package/dist/eval/tools/write.js +0 -92
- package/dist/eval/types.d.ts +0 -561
- package/dist/eval/types.js +0 -47
- package/dist/eval/verifiers/judge.d.ts +0 -40
- package/dist/eval/verifiers/judge.js +0 -256
- package/dist/eval/verifiers/rules.d.ts +0 -24
- package/dist/eval/verifiers/rules.js +0 -218
- package/dist/eval/verifiers/structural.d.ts +0 -14
- package/dist/eval/verifiers/structural.js +0 -171
- package/dist/eval/verifiers/traceability.d.ts +0 -23
- package/dist/eval/verifiers/traceability.js +0 -84
- package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
- package/dist/eval/verifiers/workflow-consistency.js +0 -225
- package/dist/eval/workflow-corpus.d.ts +0 -7
- package/dist/eval/workflow-corpus.js +0 -207
- package/dist/feature-system.d.ts +0 -42
- package/dist/feature-system.js +0 -432
- package/dist/internal/knowledge-digest.d.ts +0 -7
- package/dist/internal/knowledge-digest.js +0 -93
package/dist/eval/runner.js
DELETED
|
@@ -1,746 +0,0 @@
|
|
|
1
|
-
import { randomUUID } from "node:crypto";
|
|
2
|
-
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
-
import { FLOW_STAGES } from "../types.js";
|
|
4
|
-
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
|
-
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
6
|
-
import { runWorkflow } from "./agents/workflow.js";
|
|
7
|
-
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
8
|
-
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
9
|
-
import { loadWorkflowCorpus } from "./workflow-corpus.js";
|
|
10
|
-
import { loadEvalConfig } from "./config-loader.js";
|
|
11
|
-
import { createCostGuard, DailyCostCapExceededError, RunCostCapExceededError } from "./cost-guard.js";
|
|
12
|
-
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
13
|
-
import { noopProgressLogger } from "./progress.js";
|
|
14
|
-
import { loadAllRubrics } from "./rubric-loader.js";
|
|
15
|
-
import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
16
|
-
import { verifyRules } from "./verifiers/rules.js";
|
|
17
|
-
import { verifyStructural } from "./verifiers/structural.js";
|
|
18
|
-
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
19
|
-
import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
|
|
20
|
-
/**
 * Count how many corpus cases belong to each stage.
 *
 * @param {Array<{ stage: string }>} cases - Entries carrying a `stage` key.
 * @returns {Record<string, number>} Plain object mapping stage name to the
 *   number of entries that reported it.
 */
function groupByStage(cases) {
    const counts = {};
    for (const item of cases) {
        const stage = item.stage;
        // Only own properties count as "already seen": a stage whose name
        // matches an Object.prototype member ("constructor", "toString", ...)
        // must start from 0 instead of picking up the inherited function.
        const seen = Object.prototype.hasOwnProperty.call(counts, stage)
            ? counts[stage]
            : 0;
        counts[stage] = seen + 1;
    }
    return counts;
}
|
|
26
|
-
/**
 * Build a passing structural verifier entry for the "nothing to check" case.
 *
 * @param {string} message - Human-readable explanation stored on the result.
 * @param {*} [details] - Optional payload; the `details` key is only present
 *   on the result when this argument is not `undefined`.
 * @returns {object} Verifier result with id `structural:no-expectations`,
 *   `ok: true` and `score: 1`.
 */
function skeletonVerifierResult(message, details) {
    const result = {
        kind: "structural",
        id: "structural:no-expectations",
        ok: true,
        score: 1,
        message
    };
    if (details !== undefined) {
        result.details = details;
    }
    return result;
}
|
|
36
|
-
/**
 * Translate CLI-style options into the set of verifiers/agents to run.
 *
 * - The structural verifier is always on.
 * - `rules: true` enables both the rules and the traceability verifiers.
 * - `judge: true` enables the LLM judge.
 * - `schemaOnly: true` overrides `rules` and `judge` and also disables the
 *   agent loop, so a schema-only run never reaches an LLM-backed step.
 * - In `workflow` mode the agent loop runs whenever `schemaOnly` is off, even
 *   without `judge`; in `fixture` / `agent` mode the agent loop runs only
 *   when the judge is enabled.
 *
 * Only the literal `true` switches a flag on. `options.mode` falls back to
 * `"fixture"` when it is null/undefined.
 *
 * @param {{ rules?: boolean, schemaOnly?: boolean, judge?: boolean, mode?: string }} options
 * @returns {{ runStructural: boolean, runRules: boolean, runTraceability: boolean, runJudge: boolean, runAgent: boolean }}
 */
function resolveRunFlags(options) {
    const wantsRules = options.rules === true;
    const schemaOnly = options.schemaOnly === true;
    const wantsJudge = options.judge === true;
    const mode = options.mode ?? "fixture";
    const rulesEnabled = wantsRules && !schemaOnly;
    const judgeEnabled = wantsJudge && !schemaOnly;
    let agentEnabled;
    if (mode === "workflow") {
        // Workflow cases have no fixture fallback, so the agent loop is
        // needed regardless of whether the judge was requested.
        agentEnabled = !schemaOnly;
    }
    else {
        agentEnabled = judgeEnabled && (mode === "fixture" || mode === "agent");
    }
    return {
        runStructural: true,
        runRules: rulesEnabled,
        runTraceability: rulesEnabled,
        runJudge: judgeEnabled,
        runAgent: agentEnabled
    };
}
|
|
65
|
-
/**
 * Decorate an LLM client so that every `chat()` response is reported to the
 * cost guard before the caller sees it.
 *
 * The response's own `model` is used for accounting; when it is falsy the
 * supplied `fallbackModel` is used instead. Any error raised by
 * `costGuard.commit` (for example a cost-cap error) propagates out of
 * `chat()`, so the caller never receives a response that was not accounted.
 *
 * @param {{ chat: Function }} client - Underlying client to delegate to.
 * @param {{ commit: Function }} costGuard - Receives `(model, usage)` per call.
 * @param {string} fallbackModel - Model name used when the response has none.
 * @returns {{ chat: Function }} Wrapper exposing only `chat`.
 */
function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
    const chat = async (request) => {
        const response = await client.chat(request);
        const billedModel = response.model || fallbackModel;
        await costGuard.commit(billedModel, response.usage);
        return response;
    };
    return { chat };
}
|
|
81
|
-
/**
 * Read the fixture artifact for a case, converting a read failure into a
 * failed verifier entry instead of an exception.
 *
 * @param {string} projectRoot - Forwarded to `readFixtureArtifact`.
 * @param {object} caseEntry - Corpus case; its `fixture` value is echoed in
 *   the failure details.
 * @param {Array<object>} verifierResults - Mutated: on failure a
 *   `structural:fixture:missing` entry is appended.
 * @returns {Promise<*>} Whatever `readFixtureArtifact` resolves to, or
 *   `undefined` when it threw.
 */
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
    let loaded;
    try {
        loaded = await readFixtureArtifact(projectRoot, caseEntry);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        verifierResults.push({
            kind: "structural",
            id: "structural:fixture:missing",
            ok: false,
            score: 0,
            message: reason,
            details: { fixture: caseEntry.fixture }
        });
        return undefined;
    }
    return loaded;
}
|
|
97
|
-
/**
 * Extract the judge-related overrides from a workflow stage definition.
 *
 * Copies `rubric`, `requiredChecks` and `minimumScores` (in that order) onto
 * a fresh object, skipping any whose value is falsy.
 *
 * @param {object} step - Workflow stage entry.
 * @returns {object} Hint object containing only the truthy overrides.
 */
function stageJudgeHint(step) {
    const hint = {};
    for (const key of ["rubric", "requiredChecks", "minimumScores"]) {
        const value = step[key];
        if (value) {
            hint[key] = value;
        }
    }
    return hint;
}
|
|
107
|
-
/**
 * Execute one multi-stage workflow case and collect its verifier results.
 *
 * Steps, in order:
 *  1. If the agent loop is disabled or no client is available, return a
 *     failed case carrying a `workflow:agent:disabled` entry.
 *  2. Run the workflow through `runWorkflow`, forwarding stage start/end
 *     events to `progress`. Any error other than a cost-cap error becomes a
 *     `workflow:agent:error` entry and a failed case.
 *  3. When `flags.runJudge` is set, judge every stage against the rubric
 *     registered under the stage name and record per-stage medians/outcome on
 *     the stage result objects.
 *  4. Append the results of `verifyWorkflowConsistency`.
 *
 * Cost-cap errors (`DailyCostCapExceededError`, `RunCostCapExceededError`)
 * are always rethrown so the whole run aborts.
 *
 * @param {object} ctx - `{ projectRoot, workflow, plannedMode, flags, config,
 *   client, rubrics, progress, caseIndex, totalCases }`.
 * @returns {Promise<object>} Case result; on the success path it also carries
 *   `costUsd` and a `workflow` summary.
 */
async function runWorkflowCase(ctx) {
    const { projectRoot, workflow, plannedMode, flags, config, client, rubrics, progress, caseIndex, totalCases } = ctx;
    const started = Date.now();
    const verifierResults = [];
    let caseCostUsd = 0;
    const finalStep = workflow.stages[workflow.stages.length - 1];
    const lastStage = finalStep?.name ?? "plan";
    const isCostCap = (err) => err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError;
    const messageOf = (err) => (err instanceof Error ? err.message : String(err));
    // Record a failing entry and produce the early-exit case result.
    const failCase = (entry) => {
        verifierResults.push(entry);
        return {
            caseId: workflow.id,
            stage: lastStage,
            mode: plannedMode,
            passed: false,
            durationMs: Date.now() - started,
            verifierResults
        };
    };
    if (!flags.runAgent || !client) {
        return failCase({
            kind: "workflow",
            id: "workflow:agent:disabled",
            ok: false,
            score: 0,
            message: "workflow mode requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                "Re-run with credentials to execute the workflow.",
            details: { stages: workflow.stages.map((s) => s.name) }
        });
    }
    let workflowResult;
    try {
        const progressBase = { caseId: workflow.id, index: caseIndex, total: totalCases };
        workflowResult = await runWorkflow({
            workflow,
            config,
            projectRoot,
            client,
            onStageStart: (stage) => progress.emit({
                kind: "stage-start",
                caseId: progressBase.caseId,
                stage,
                index: progressBase.index,
                total: progressBase.total
            }),
            onStageEnd: (stage, stageResult) => {
                const event = {
                    kind: "stage-end",
                    caseId: progressBase.caseId,
                    stage,
                    index: progressBase.index,
                    total: progressBase.total,
                    passed: true,
                    durationMs: stageResult.durationMs
                };
                if (stageResult.usageUsd > 0) {
                    event.costUsd = stageResult.usageUsd;
                }
                return progress.emit(event);
            }
        });
    }
    catch (err) {
        if (isCostCap(err)) {
            throw err;
        }
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const details = { retryable };
        if (err instanceof MaxTurnsExceededError && err.turns !== undefined) {
            details.maxTurnsExceeded = err.turns;
        }
        return failCase({
            kind: "workflow",
            id: "workflow:agent:error",
            ok: false,
            score: 0,
            message: messageOf(err),
            details
        });
    }
    caseCostUsd += workflowResult.totalUsageUsd;
    // Shallow copy: the stage objects themselves are shared with
    // workflowResult.stages and are annotated in place by the judge pass.
    const stageResults = [...workflowResult.stages];
    verifierResults.push({
        kind: "workflow",
        id: "workflow:agent",
        ok: true,
        score: 1,
        message: `workflow ran ${stageResults.length} stage(s) in ${workflowResult.totalDurationMs}ms (spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
        details: {
            stages: stageResults.map((s) => ({
                name: s.stage,
                durationMs: s.durationMs,
                usageUsd: s.usageUsd,
                turns: s.toolUse.turns,
                calls: s.toolUse.calls
            }))
        }
    });
    let allJudgeOk = true;
    if (flags.runJudge) {
        for (let position = 0; position < workflow.stages.length; position += 1) {
            const step = workflow.stages[position];
            const stageResult = stageResults[position];
            const rubric = rubrics.get(step.name);
            if (!rubric) {
                verifierResults.push({
                    kind: "judge",
                    id: `judge:rubric:missing:${step.name}`,
                    ok: false,
                    score: 0,
                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
                    details: { stage: step.name }
                });
                allJudgeOk = false;
                stageResult.judgeOk = false;
                continue;
            }
            const hint = stageJudgeHint(step);
            try {
                const invocation = await runJudge({
                    artifact: stageResult.artifact,
                    rubric,
                    config,
                    client,
                    caseHint: hint
                });
                caseCostUsd += invocation.usageUsd;
                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
                const medians = {};
                for (const aggregate of invocation.aggregates) {
                    medians[aggregate.checkId] = aggregate.median;
                }
                stageResult.judgeMedians = medians;
                const stageOk = judgeVerifiers.every((v) => v.ok);
                stageResult.judgeOk = stageOk;
                allJudgeOk = allJudgeOk && stageOk;
                // Suffix ids with the stage name so entries from different
                // stages stay distinguishable in the flat result list.
                for (const verdict of judgeVerifiers) {
                    verifierResults.push({
                        ...verdict,
                        id: `${verdict.id}:${step.name}`,
                        details: { ...(verdict.details ?? {}), stage: step.name }
                    });
                }
            }
            catch (err) {
                if (isCostCap(err)) {
                    throw err;
                }
                const retryable = err instanceof EvalLlmError ? err.retryable : false;
                verifierResults.push({
                    kind: "judge",
                    id: `judge:invocation:error:${step.name}`,
                    ok: false,
                    score: 0,
                    message: messageOf(err),
                    details: { retryable, rubricId: rubric.id, stage: step.name }
                });
                stageResult.judgeOk = false;
                allJudgeOk = false;
            }
        }
    }
    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
    verifierResults.push(...consistencyResults);
    // Skipped entries do not influence the verdict unless nothing else ran.
    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
    const decisive = nonSkipped.length === 0 ? verifierResults : nonSkipped;
    const allOk = decisive.every((r) => r.ok);
    const workflowSummary = {
        caseId: workflow.id,
        stages: stageResults,
        totalUsageUsd: workflowResult.totalUsageUsd,
        totalDurationMs: workflowResult.totalDurationMs,
        allJudgeOk: flags.runJudge ? allJudgeOk : true
    };
    return {
        caseId: workflow.id,
        stage: lastStage,
        mode: plannedMode,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults,
        workflow: workflowSummary
    };
}
|
|
292
|
-
/**
 * Evaluate one single-stage corpus case.
 *
 * An artifact is obtained only when at least one enabled verifier needs it:
 *  - `fixture` mode with agent + judge enabled and a client: produced by
 *    `runSingleShot`;
 *  - `agent` mode under the same conditions: produced by `runWithTools`;
 *  - otherwise: read from the case fixture via `loadArtifactOrRecord`.
 * The structural, rules, traceability and judge verifiers then run against
 * that artifact according to `flags` and the case's `expected` block.
 *
 * Cost-cap errors (`DailyCostCapExceededError`, `RunCostCapExceededError`)
 * are rethrown; other agent/judge errors are recorded as failed entries.
 *
 * @param {object} ctx - `{ projectRoot, caseEntry, plannedMode, flags, config,
 *   client, rubrics }` (other keys are ignored here).
 * @returns {Promise<object>} Case result with `passed`, `durationMs`,
 *   `costUsd` (undefined when nothing was spent) and `verifierResults`.
 */
async function runCase(ctx) {
    const { projectRoot, caseEntry, plannedMode, flags, config, client, rubrics } = ctx;
    const started = Date.now();
    const verifierResults = [];
    const expected = caseEntry.expected;
    let caseCostUsd = 0;
    const isCostCap = (err) => err instanceof DailyCostCapExceededError || err instanceof RunCostCapExceededError;
    const messageOf = (err) => (err instanceof Error ? err.message : String(err));
    const retryableOf = (err) => (err instanceof EvalLlmError ? err.retryable : false);
    const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
    const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
    const hasTraceability = flags.runTraceability && !!expected?.traceability;
    const judgeRequested = flags.runJudge && !!expected?.judge;
    const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
    let artifact;
    if (needsArtifact) {
        // The agent drafts the artifact only when the judge will grade it
        // and a client exists; every other combination reads the fixture.
        const agentEligible = flags.runAgent && judgeRequested && client;
        if (agentEligible && plannedMode === "fixture") {
            try {
                const produced = await runSingleShot({ caseEntry, config, projectRoot, client });
                artifact = produced.artifact;
                caseCostUsd += produced.usageUsd;
                verifierResults.push({
                    kind: "workflow",
                    id: "agent:single-shot",
                    ok: true,
                    score: 1,
                    message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
                    details: {
                        model: produced.model,
                        tokensIn: produced.usage.promptTokens,
                        tokensOut: produced.usage.completionTokens,
                        usageUsd: produced.usageUsd,
                        attempts: produced.attempts
                    }
                });
            }
            catch (err) {
                if (isCostCap(err)) {
                    throw err;
                }
                const retryable = retryableOf(err);
                verifierResults.push({
                    kind: "workflow",
                    id: "agent:single-shot",
                    ok: false,
                    score: 0,
                    message: messageOf(err),
                    details: { retryable }
                });
            }
        }
        else if (agentEligible && plannedMode === "agent") {
            try {
                const produced = await runWithTools({ caseEntry, config, projectRoot, client });
                artifact = produced.artifact;
                caseCostUsd += produced.usageUsd;
                verifierResults.push({
                    kind: "workflow",
                    id: "agent:with-tools",
                    ok: true,
                    score: 1,
                    message: `with-tools agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) (${produced.toolUse.calls} tool call(s))`,
                    details: {
                        model: produced.model,
                        tokensIn: produced.usage.promptTokens,
                        tokensOut: produced.usage.completionTokens,
                        usageUsd: produced.usageUsd,
                        attempts: produced.attempts,
                        toolUse: produced.toolUse
                    }
                });
            }
            catch (err) {
                if (isCostCap(err)) {
                    throw err;
                }
                const retryable = retryableOf(err);
                const details = { retryable };
                if (err instanceof MaxTurnsExceededError && err.turns !== undefined) {
                    details.maxTurnsExceeded = err.turns;
                }
                verifierResults.push({
                    kind: "workflow",
                    id: "agent:with-tools",
                    ok: false,
                    score: 0,
                    message: messageOf(err),
                    details
                });
            }
        }
        else {
            artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
        }
        // No artifact and nothing recorded yet means no step above reported
        // why, so record an explicit failure for the missing fixture.
        if (artifact === undefined && verifierResults.length === 0) {
            verifierResults.push({
                kind: "structural",
                id: "structural:fixture:absent",
                ok: false,
                score: 0,
                message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
                details: { fixtureProvided: false }
            });
        }
    }
    if (flags.runStructural) {
        if (!hasStructural) {
            verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
        }
        else if (artifact !== undefined) {
            const structuralChecks = verifyStructural(artifact, expected.structural);
            if (structuralChecks.length > 0) {
                verifierResults.push(...structuralChecks);
            }
            else {
                verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
            }
        }
    }
    if (hasRules && artifact !== undefined) {
        verifierResults.push(...verifyRules(artifact, expected.rules));
    }
    if (hasTraceability && artifact !== undefined) {
        try {
            const extras = await readExtraFixtures(projectRoot, caseEntry);
            const traceChecks = verifyTraceability(artifact, extras, expected.traceability);
            verifierResults.push(...traceChecks);
        }
        catch (err) {
            verifierResults.push({
                kind: "rules",
                id: "traceability:fixture:missing",
                ok: false,
                score: 0,
                message: messageOf(err),
                details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
            });
        }
    }
    if (judgeRequested && artifact !== undefined && client) {
        const rubric = rubrics.get(caseEntry.stage);
        if (rubric) {
            try {
                const invocation = await runJudge({
                    artifact,
                    rubric,
                    config,
                    client,
                    caseHint: expected.judge
                });
                caseCostUsd += invocation.usageUsd;
                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
                verifierResults.push(...judgeVerifiers);
            }
            catch (err) {
                if (isCostCap(err)) {
                    throw err;
                }
                const retryable = retryableOf(err);
                verifierResults.push({
                    kind: "judge",
                    id: "judge:invocation:error",
                    ok: false,
                    score: 0,
                    message: messageOf(err),
                    details: { retryable, rubricId: rubric.id }
                });
            }
        }
        else {
            verifierResults.push({
                kind: "judge",
                id: "judge:rubric:missing",
                ok: false,
                score: 0,
                message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
                details: { stage: caseEntry.stage }
            });
        }
    }
    // Skipped entries do not influence the verdict unless nothing else ran.
    const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
    const decisive = nonSkippedResults.length === 0 ? verifierResults : nonSkippedResults;
    const allOk = decisive.every((r) => r.ok);
    return {
        caseId: caseEntry.id,
        stage: caseEntry.stage,
        mode: plannedMode,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults
    };
}
|
|
493
|
-
/**
 * Aggregate per-case results into run-level totals.
 *
 * A case counts as `skipped` when its only verifier entry is flagged
 * `details.skipped === true`; otherwise it counts as `passed` or `failed`
 * according to its `passed` field. Duration and cost are summed over every
 * case, including skipped ones; the cost total is rounded to 6 decimals.
 *
 * @param {Array<object>} caseResults - Case results with `durationMs`,
 *   optional `costUsd`, `passed` and `verifierResults`.
 * @returns {{ totalCases: number, passed: number, failed: number, skipped: number, totalCostUsd: number, totalDurationMs: number }}
 */
function reduceSummary(caseResults) {
    const tally = { passed: 0, failed: 0, skipped: 0 };
    let costSum = 0;
    let durationSum = 0;
    caseResults.forEach((entry) => {
        durationSum += entry.durationMs;
        if (entry.costUsd !== undefined) {
            costSum += entry.costUsd;
        }
        const checks = entry.verifierResults;
        const onlySkipped = checks.length === 1 && checks[0]?.details?.skipped === true;
        let bucket;
        if (onlySkipped) {
            bucket = "skipped";
        }
        else {
            bucket = entry.passed ? "passed" : "failed";
        }
        tally[bucket] += 1;
    });
    return {
        totalCases: caseResults.length,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        totalCostUsd: Number(costSum.toFixed(6)),
        totalDurationMs: durationSum
    };
}
|
|
521
|
-
/**
 * List the stages that appear in the given case results, ordered as they
 * appear in `FLOW_STAGES`. Stages not present in `FLOW_STAGES` are dropped.
 *
 * @param {Iterable<{ stage: string }>} caseResults
 * @returns {string[]} Subset of `FLOW_STAGES` seen in the results.
 */
function stagesInResults(caseResults) {
    const seen = new Set(Array.from(caseResults, (entry) => entry.stage));
    return FLOW_STAGES.filter((stage) => seen.has(stage));
}
|
|
527
|
-
/** Upper bound on the number of cases evaluated at the same time. */
const MAX_PARALLEL_CASES = 4;
/**
 * Run `worker` over every item with at most `concurrency` calls in flight,
 * returning the results in input order.
 *
 * Behavior:
 *  - The effective limit is clamped to `[1, items.length]`; a `concurrency`
 *    that is not a number (NaN / undefined) is treated as 1 instead of
 *    silently producing an array of holes.
 *  - With a limit of 1 the items are processed strictly sequentially.
 *  - As soon as any worker rejects, no further items are started. Calls that
 *    are already in flight are allowed to settle, and the first rejection is
 *    propagated to the caller. This keeps an aborting error (e.g. a cost cap)
 *    from being followed by additional work.
 *
 * @template T, R
 * @param {T[]} items - Work items.
 * @param {number} concurrency - Requested maximum number of parallel calls.
 * @param {(item: T, index: number) => Promise<R>} worker - Invoked once per
 *   item with the item and its index.
 * @returns {Promise<R[]>} Results positioned at the index of their item.
 */
async function runCasesWithBoundedConcurrency(items, concurrency, worker) {
    if (items.length === 0) {
        return [];
    }
    const requested = Number(concurrency);
    const limit = Math.max(1, Math.min(Number.isNaN(requested) ? 1 : requested, items.length));
    if (limit === 1) {
        const results = [];
        for (let i = 0; i < items.length; i += 1) {
            results.push(await worker(items[i], i));
        }
        return results;
    }
    const results = new Array(items.length);
    let cursor = 0;
    let failed = false;
    const runners = Array.from({ length: limit }, async () => {
        // Re-check the flag before claiming each item so a failure in one
        // runner stops the others from picking up new work.
        while (!failed) {
            const index = cursor;
            cursor += 1;
            if (index >= items.length) {
                return;
            }
            try {
                results[index] = await worker(items[index], index);
            }
            catch (err) {
                failed = true;
                throw err;
            }
        }
    });
    await Promise.all(runners);
    return results;
}
|
|
555
|
-
/**
 * Entry point for an eval run.
 *
 * The effective mode is `options.mode`, falling back to the configured default
 * mode. In "workflow" mode the workflow corpus is loaded and each case is run
 * through `runWorkflowCase`, one after another. In every other mode the stage
 * corpus is loaded and each case is run through `runCase`, in parallel (up to
 * `MAX_PARALLEL_CASES`) only when neither the judge nor the agent flag is set.
 *
 * When `options.dryRun === true` nothing is executed; a "dry-run" summary of
 * the resolved config, corpora, planned mode, verifier flags and notes is
 * returned instead. Otherwise baselines for the stages present in the results
 * are loaded and compared against the report for regression detection.
 *
 * Per the original notes for this function, cases without a `fixture` path in
 * the yaml are marked skipped (not failed) when no LLM drafting runs — that
 * handling is not visible here and is presumably done by the per-case runner.
 *
 * @param {object} options Run options. Fields read here: `projectRoot`, `env`,
 *   `modelOverride`, `mode`, `stage`, `dryRun`, `maxCostUsd`, `progress`,
 *   `llmClient`; the whole object is also passed to `resolveRunFlags`.
 * @returns {Promise<object>} The dry-run summary, or the eval report (with
 *   `baselineDelta` attached when the baseline comparison yields one).
 */
export async function runEval(options) {
    const { projectRoot } = options;
    const loadedConfig = await loadEvalConfig(projectRoot, options.env ?? process.env);
    let config = loadedConfig;
    if (options.modelOverride) {
        // A model override applies to both the main model and the judge model.
        config = {
            ...loadedConfig,
            model: options.modelOverride,
            judgeModel: options.modelOverride
        };
    }
    const plannedMode = options.mode ?? config.defaultMode;
    const isWorkflow = plannedMode === "workflow";

    // Only the corpus relevant to the planned mode is loaded; the other stays empty.
    let corpus = [];
    let workflowCorpus = [];
    if (isWorkflow) {
        workflowCorpus = await loadWorkflowCorpus(projectRoot);
    }
    else {
        corpus = await loadCorpus(projectRoot, options.stage);
    }

    const notes = [];
    if (isWorkflow) {
        if (workflowCorpus.length === 0) {
            notes.push("Workflow corpus is empty. Workflow-mode cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
        }
    }
    else if (corpus.length === 0) {
        notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
    }
    const flags = resolveRunFlags(options);
    const lacksLlmAccess = !config.apiKey && !options.llmClient;
    if (lacksLlmAccess && flags.runJudge) {
        notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
    }
    if (lacksLlmAccess && isWorkflow) {
        notes.push("workflow mode requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
    }

    if (options.dryRun === true) {
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map((entry) => ({ id: entry.id, stage: entry.stage }))
            },
            workflowCorpus: {
                total: workflowCorpus.length,
                cases: workflowCorpus.map((entry) => ({
                    id: entry.id,
                    stages: entry.stages.map((stageDef) => stageDef.name)
                }))
            },
            plannedMode,
            verifiersAvailable: {
                structural: flags.runStructural,
                rules: flags.runRules,
                judge: flags.runJudge,
                workflow: flags.runAgent,
                consistency: isWorkflow
            },
            notes
        };
    }

    const guardOverrides = options.maxCostUsd === undefined
        ? {}
        : { runCapUsd: options.maxCostUsd };
    const costGuard = createCostGuard(projectRoot, config, guardOverrides);
    const progress = options.progress ?? noopProgressLogger();

    // An LLM client is only built when the judge or workflow mode will use it.
    let wrappedClient;
    if (flags.runJudge || isWorkflow) {
        const reportRetry = (event) => progress.emit({
            kind: "retry",
            caseId: "llm",
            attempt: event.attempt,
            maxAttempts: event.maxAttempts,
            waitMs: event.waitMs,
            reason: event.error.message
        });
        const baseClient = options.llmClient ?? createEvalClient(config, { onRetry: reportRetry });
        wrappedClient = wrapClientWithCostGuard(baseClient, costGuard, config.judgeModel ?? config.model);
    }

    let rubrics;
    if (flags.runJudge) {
        rubrics = await loadAllRubrics(projectRoot);
    }
    else {
        rubrics = new Map();
    }

    // Captured before any case runs, so `generatedAt` reflects the start of execution.
    const now = new Date().toISOString();
    const caseResults = [];
    const totalPlannedCases = isWorkflow ? workflowCorpus.length : corpus.length;
    const runStarted = Date.now();
    progress.emit({
        kind: "run-start",
        mode: plannedMode,
        totalCases: totalPlannedCases
    });

    // Shared "case-end" emitter; `costUsd` is attached only when the result has one.
    const emitCaseEnd = (caseId, stage, index, total, outcome) => {
        const event = {
            kind: "case-end",
            caseId,
            stage,
            index,
            total,
            passed: outcome.passed,
            durationMs: outcome.durationMs
        };
        if (outcome.costUsd !== undefined) {
            event.costUsd = outcome.costUsd;
        }
        progress.emit(event);
    };

    if (isWorkflow) {
        let position = 0;
        for (const wf of workflowCorpus) {
            position += 1;
            const lastStage = wf.stages[wf.stages.length - 1];
            progress.emit({
                kind: "case-start",
                caseId: wf.id,
                stage: lastStage?.name ?? "workflow",
                index: position,
                total: workflowCorpus.length
            });
            const outcome = await runWorkflowCase({
                projectRoot,
                workflow: wf,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics,
                progress,
                caseIndex: position,
                totalCases: workflowCorpus.length
            });
            emitCaseEnd(wf.id, outcome.stage, position, workflowCorpus.length, outcome);
            caseResults.push(outcome);
        }
    }
    else {
        // Only parallelize fixture/rules verification passes that do not depend on
        // LLM judge/agent loops. Those modes touch cost guards and retries where
        // ordered execution is safer.
        const needsOrderedExecution = flags.runJudge || flags.runAgent;
        const caseConcurrency = needsOrderedExecution ? 1 : MAX_PARALLEL_CASES;
        const outcomes = await runCasesWithBoundedConcurrency(corpus, caseConcurrency, async (entry, offset) => {
            const position = offset + 1;
            progress.emit({
                kind: "case-start",
                caseId: entry.id,
                stage: entry.stage,
                index: position,
                total: corpus.length
            });
            const outcome = await runCase({
                projectRoot,
                caseEntry: entry,
                plannedMode,
                flags,
                config,
                client: wrappedClient,
                costGuard,
                rubrics
            });
            emitCaseEnd(entry.id, entry.stage, position, corpus.length, outcome);
            return outcome;
        });
        for (const outcome of outcomes) {
            caseResults.push(outcome);
        }
    }

    const stages = stagesInResults(caseResults);
    const baselines = await loadBaselinesByStage(projectRoot, stages);
    const summary = reduceSummary(caseResults);
    const report = {
        schemaVersion: 1,
        generatedAt: now,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        mode: plannedMode,
        stages,
        cases: caseResults,
        summary
    };
    const baselineDelta = compareAgainstBaselines(report, baselines);
    if (baselineDelta) {
        report.baselineDelta = baselineDelta;
    }
    progress.emit({
        kind: "run-end",
        totalCases: summary.totalCases,
        passed: summary.passed,
        failed: summary.failed,
        durationMs: Date.now() - runStarted
    });
    return report;
}
|