cclaw-cli 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +78 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +17 -11
- package/dist/eval/agents/workflow.d.ts +24 -0
- package/dist/eval/agents/workflow.js +133 -0
- package/dist/eval/config-loader.js +6 -2
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/report.js +35 -0
- package/dist/eval/runner.d.ts +10 -1
- package/dist/eval/runner.js +236 -19
- package/dist/eval/types.d.ts +117 -1
- package/dist/eval/types.js +21 -1
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/eval/runner.js
CHANGED
|
@@ -3,8 +3,10 @@ import { CCLAW_VERSION } from "../constants.js";
|
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
4
|
import { runSingleShot } from "./agents/single-shot.js";
|
|
5
5
|
import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
|
|
6
|
+
import { runWorkflow } from "./agents/workflow.js";
|
|
6
7
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
7
8
|
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
9
|
+
import { loadWorkflowCorpus } from "./workflow-corpus.js";
|
|
8
10
|
import { loadEvalConfig } from "./config-loader.js";
|
|
9
11
|
import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
|
|
10
12
|
import { createEvalClient, EvalLlmError } from "./llm-client.js";
|
|
@@ -13,6 +15,7 @@ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
|
|
|
13
15
|
import { verifyRules } from "./verifiers/rules.js";
|
|
14
16
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
15
17
|
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
18
|
+
import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
|
|
16
19
|
function groupByStage(cases) {
|
|
17
20
|
return cases.reduce((acc, item) => {
|
|
18
21
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
@@ -42,7 +45,13 @@ function resolveRunFlags(options) {
|
|
|
42
45
|
const judgeRequested = options.judge === true;
|
|
43
46
|
const tier = options.tier ?? "A";
|
|
44
47
|
const runJudge = judgeRequested && !schemaOnly;
|
|
45
|
-
|
|
48
|
+
// Tier C always needs the agent loop (no fixture fallback for workflows),
|
|
49
|
+
// so we still require an LLM client but we do NOT require --judge on the
|
|
50
|
+
// CLI to produce a workflow run. The judge piece itself stays gated by
|
|
51
|
+
// `runJudge`, so a run without --judge pays only for the agent loop; the consistency checks themselves are deterministic.
|
|
52
|
+
const runAgent = tier === "C"
|
|
53
|
+
? !schemaOnly
|
|
54
|
+
: runJudge && (tier === "A" || tier === "B");
|
|
46
55
|
return {
|
|
47
56
|
runStructural: true,
|
|
48
57
|
runRules: rulesRequested && !schemaOnly,
|
|
@@ -83,6 +92,184 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
|
|
|
83
92
|
return undefined;
|
|
84
93
|
}
|
|
85
94
|
}
|
|
95
|
+
/**
 * Build the per-stage judge hint for a workflow step.
 *
 * Copies `rubric`, `requiredChecks` and `minimumScores` from the step, in that
 * order, leaving out any of them that is falsy. A step with none of the three
 * yields an empty object.
 *
 * @param step Workflow stage step to read the optional judge settings from.
 * @returns A fresh object holding only the settings present on the step.
 */
function stageJudgeHint(step) {
    const { rubric, requiredChecks, minimumScores } = step;
    return {
        ...(rubric ? { rubric } : {}),
        ...(requiredChecks ? { requiredChecks } : {}),
        ...(minimumScores ? { minimumScores } : {})
    };
}
|
|
105
|
+
/**
 * Execute one Tier C workflow case: run every stage through the workflow
 * agent, optionally judge each stage artifact, then apply the cross-stage
 * consistency checks.
 *
 * Agent and judge failures are recorded as failing verifier results instead
 * of being thrown; the one exception is `DailyCostCapExceededError`, which is
 * rethrown so the caller can stop the run.
 *
 * @param ctx Case context. Reads `projectRoot`, `workflow`, `plannedTier`,
 *   `flags` (`runAgent`, `runJudge`), `config`, `client` and `rubrics`
 *   (a map looked up by stage name).
 * @returns The case result. `workflow` is only present once the agent has
 *   run, and `costUsd` only when some spend was recorded.
 */
async function runWorkflowCase(ctx) {
    const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
    const started = Date.now();
    const verifierResults = [];
    let caseCostUsd = 0;
    // A case result carries a single stage: use the workflow's final stage,
    // falling back to "plan" when the stage list is empty.
    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ?? "plan";
    // Shared shape for every early exit taken before the agent produced output.
    const failedCase = () => ({
        caseId: workflow.id,
        stage: lastStage,
        tier: plannedTier,
        passed: false,
        durationMs: Date.now() - started,
        verifierResults
    });
    if (!flags.runAgent || !client) {
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:disabled",
            ok: false,
            score: 0,
            message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
                "Re-run with credentials to execute the workflow.",
            details: {
                stages: workflow.stages.map((s) => s.name),
                // The message above only talks about credentials; record which
                // of the two conditions actually disabled the run.
                reason: flags.runAgent ? "missing-client" : "agent-disabled"
            }
        });
        return failedCase();
    }
    let workflowResult;
    try {
        workflowResult = await runWorkflow({
            workflow,
            config,
            projectRoot,
            client
        });
    }
    catch (err) {
        // The daily cap must abort the whole run, not just this case.
        if (err instanceof DailyCostCapExceededError)
            throw err;
        const retryable = err instanceof EvalLlmError ? err.retryable : false;
        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
        verifierResults.push({
            kind: "workflow",
            id: "workflow:agent:error",
            ok: false,
            score: 0,
            message: err instanceof Error ? err.message : String(err),
            details: {
                retryable,
                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
            }
        });
        return failedCase();
    }
    caseCostUsd += workflowResult.totalUsageUsd;
    const stageResults = [...workflowResult.stages];
    verifierResults.push({
        kind: "workflow",
        id: "workflow:agent",
        ok: true,
        score: 1,
        message: `workflow ran ${stageResults.length} stage(s) in ` +
            `${workflowResult.totalDurationMs}ms ` +
            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
        details: {
            stages: stageResults.map((s) => ({
                name: s.stage,
                durationMs: s.durationMs,
                usageUsd: s.usageUsd,
                turns: s.toolUse.turns,
                calls: s.toolUse.calls
            }))
        }
    });
    let allJudgeOk = true;
    if (flags.runJudge) {
        for (let i = 0; i < workflow.stages.length; i += 1) {
            const step = workflow.stages[i];
            // Stage results are paired with the declared steps by position.
            const stageResult = stageResults[i];
            if (!stageResult) {
                // Fewer stage results than declared steps: report it as a
                // failing verifier instead of crashing on an undefined result.
                verifierResults.push({
                    kind: "judge",
                    id: `judge:stage:missing:${step.name}`,
                    ok: false,
                    score: 0,
                    message: `Workflow agent returned no result for stage "${step.name}"; nothing to judge.`,
                    details: { stage: step.name }
                });
                allJudgeOk = false;
                continue;
            }
            const rubric = rubrics.get(step.name);
            if (!rubric) {
                verifierResults.push({
                    kind: "judge",
                    id: `judge:rubric:missing:${step.name}`,
                    ok: false,
                    score: 0,
                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
                    details: { stage: step.name }
                });
                allJudgeOk = false;
                stageResult.judgeOk = false;
                continue;
            }
            const hint = stageJudgeHint(step);
            try {
                const invocation = await runJudge({
                    artifact: stageResult.artifact,
                    rubric,
                    config,
                    client,
                    caseHint: hint
                });
                caseCostUsd += invocation.usageUsd;
                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
                const medians = {};
                for (const agg of invocation.aggregates) {
                    medians[agg.checkId] = agg.median;
                }
                stageResult.judgeMedians = medians;
                const stageOk = judgeVerifiers.every((v) => v.ok);
                stageResult.judgeOk = stageOk;
                if (!stageOk)
                    allJudgeOk = false;
                // Suffix ids with the stage name so the same rubric check run
                // on several stages stays distinguishable in the results.
                for (const v of judgeVerifiers) {
                    verifierResults.push({
                        ...v,
                        id: `${v.id}:${step.name}`,
                        details: { ...(v.details ?? {}), stage: step.name }
                    });
                }
            }
            catch (err) {
                if (err instanceof DailyCostCapExceededError)
                    throw err;
                const retryable = err instanceof EvalLlmError ? err.retryable : false;
                verifierResults.push({
                    kind: "judge",
                    id: `judge:invocation:error:${step.name}`,
                    ok: false,
                    score: 0,
                    message: err instanceof Error ? err.message : String(err),
                    details: { retryable, rubricId: rubric.id, stage: step.name }
                });
                stageResult.judgeOk = false;
                allJudgeOk = false;
            }
        }
    }
    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
    verifierResults.push(...consistencyResults);
    // Results flagged `details.skipped` do not count towards pass/fail unless
    // every result is skipped, in which case all results are considered.
    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
    const allOk = nonSkipped.length === 0
        ? verifierResults.every((r) => r.ok)
        : nonSkipped.every((r) => r.ok);
    const workflowSummary = {
        caseId: workflow.id,
        stages: stageResults,
        totalUsageUsd: workflowResult.totalUsageUsd,
        totalDurationMs: workflowResult.totalDurationMs,
        allJudgeOk: flags.runJudge ? allJudgeOk : true
    };
    return {
        caseId: workflow.id,
        stage: lastStage,
        tier: plannedTier,
        passed: allOk,
        durationMs: Date.now() - started,
        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
        verifierResults,
        workflow: workflowSummary
    };
}
|
|
86
273
|
async function runCase(ctx) {
|
|
87
274
|
const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
|
|
88
275
|
const started = Date.now();
|
|
@@ -327,18 +514,22 @@ function stagesInResults(caseResults) {
|
|
|
327
514
|
*/
|
|
328
515
|
export async function runEval(options) {
|
|
329
516
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
330
|
-
const corpus = await loadCorpus(options.projectRoot, options.stage);
|
|
331
517
|
const plannedTier = options.tier ?? config.defaultTier;
|
|
518
|
+
const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
|
|
519
|
+
const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
|
|
332
520
|
const notes = [];
|
|
333
|
-
if (corpus.length === 0) {
|
|
521
|
+
if (plannedTier !== "C" && corpus.length === 0) {
|
|
334
522
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
335
523
|
}
|
|
524
|
+
if (plannedTier === "C" && workflowCorpus.length === 0) {
|
|
525
|
+
notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
|
|
526
|
+
}
|
|
336
527
|
const flags = resolveRunFlags(options);
|
|
337
528
|
if (flags.runJudge && !config.apiKey && !options.llmClient) {
|
|
338
529
|
notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
|
|
339
530
|
}
|
|
340
|
-
if (
|
|
341
|
-
notes.push("Tier
|
|
531
|
+
if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
|
|
532
|
+
notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
|
|
342
533
|
}
|
|
343
534
|
if (options.dryRun === true) {
|
|
344
535
|
const summary = {
|
|
@@ -349,12 +540,20 @@ export async function runEval(options) {
|
|
|
349
540
|
byStage: groupByStage(corpus),
|
|
350
541
|
cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
|
|
351
542
|
},
|
|
543
|
+
workflowCorpus: {
|
|
544
|
+
total: workflowCorpus.length,
|
|
545
|
+
cases: workflowCorpus.map((item) => ({
|
|
546
|
+
id: item.id,
|
|
547
|
+
stages: item.stages.map((s) => s.name)
|
|
548
|
+
}))
|
|
549
|
+
},
|
|
352
550
|
plannedTier,
|
|
353
551
|
verifiersAvailable: {
|
|
354
552
|
structural: flags.runStructural,
|
|
355
553
|
rules: flags.runRules,
|
|
356
554
|
judge: flags.runJudge,
|
|
357
|
-
workflow: flags.runAgent
|
|
555
|
+
workflow: flags.runAgent,
|
|
556
|
+
consistency: plannedTier === "C"
|
|
358
557
|
},
|
|
359
558
|
notes
|
|
360
559
|
};
|
|
@@ -362,26 +561,44 @@ export async function runEval(options) {
|
|
|
362
561
|
}
|
|
363
562
|
const costGuard = createCostGuard(options.projectRoot, config);
|
|
364
563
|
let wrappedClient;
|
|
365
|
-
|
|
564
|
+
const clientNeeded = flags.runJudge || plannedTier === "C";
|
|
565
|
+
if (clientNeeded) {
|
|
366
566
|
const base = options.llmClient ?? createEvalClient(config);
|
|
367
567
|
wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
|
|
368
568
|
}
|
|
369
|
-
const
|
|
569
|
+
const rubricsNeeded = flags.runJudge;
|
|
570
|
+
const rubrics = rubricsNeeded
|
|
370
571
|
? await loadAllRubrics(options.projectRoot)
|
|
371
572
|
: new Map();
|
|
372
573
|
const now = new Date().toISOString();
|
|
373
574
|
const caseResults = [];
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
575
|
+
if (plannedTier === "C") {
|
|
576
|
+
for (const wf of workflowCorpus) {
|
|
577
|
+
caseResults.push(await runWorkflowCase({
|
|
578
|
+
projectRoot: options.projectRoot,
|
|
579
|
+
workflow: wf,
|
|
580
|
+
plannedTier,
|
|
581
|
+
flags,
|
|
582
|
+
config,
|
|
583
|
+
client: wrappedClient,
|
|
584
|
+
costGuard,
|
|
585
|
+
rubrics
|
|
586
|
+
}));
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
else {
|
|
590
|
+
for (const item of corpus) {
|
|
591
|
+
caseResults.push(await runCase({
|
|
592
|
+
projectRoot: options.projectRoot,
|
|
593
|
+
caseEntry: item,
|
|
594
|
+
plannedTier,
|
|
595
|
+
flags,
|
|
596
|
+
config,
|
|
597
|
+
client: wrappedClient,
|
|
598
|
+
costGuard,
|
|
599
|
+
rubrics
|
|
600
|
+
}));
|
|
601
|
+
}
|
|
385
602
|
}
|
|
386
603
|
const stages = stagesInResults(caseResults);
|
|
387
604
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -24,8 +24,10 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
|
24
24
|
/**
|
|
25
25
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
26
26
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
27
|
+
* `consistency` is the Tier C cross-artifact family; it is deterministic, so it
|
|
28
|
+
* is the exception to the ordering above, and it spans several artifacts at once.
|
|
27
29
|
*/
|
|
28
|
-
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
30
|
+
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
|
|
29
31
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
32
|
/**
|
|
31
33
|
* Structural expectations — deterministic, LLM-free checks against a single
|
|
@@ -199,6 +201,11 @@ export interface EvalCaseResult {
|
|
|
199
201
|
durationMs: number;
|
|
200
202
|
costUsd?: number;
|
|
201
203
|
verifierResults: VerifierResult[];
|
|
204
|
+
/**
|
|
205
|
+
* Tier C only: the per-stage breakdown collected by the workflow
|
|
206
|
+
* agent. Unset for Tier A/B cases so the on-disk JSON stays small.
|
|
207
|
+
*/
|
|
208
|
+
workflow?: WorkflowRunSummary;
|
|
202
209
|
}
|
|
203
210
|
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
|
|
204
211
|
export interface EvalReport {
|
|
@@ -286,6 +293,14 @@ export interface EvalConfig {
|
|
|
286
293
|
* marker so the model sees the cutoff.
|
|
287
294
|
*/
|
|
288
295
|
toolMaxResultBytes?: number;
|
|
296
|
+
/**
|
|
297
|
+
* Maximum total turns a single Tier C workflow case may consume
|
|
298
|
+
* across all stages combined. Defaults to 40 (stages × toolMaxTurns).
|
|
299
|
+
* Runs that exceed the cap fail the current stage with a
|
|
300
|
+
* `MaxTurnsExceededError` propagated from the underlying with-tools
|
|
301
|
+
* loop rather than a dedicated workflow-level error.
|
|
302
|
+
*/
|
|
303
|
+
workflowMaxTotalTurns?: number;
|
|
289
304
|
}
|
|
290
305
|
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
291
306
|
export interface TokenPricing {
|
|
@@ -416,3 +431,104 @@ export interface ToolUseSummary {
|
|
|
416
431
|
/** Per-tool call counts, keyed by tool name. */
|
|
417
432
|
byTool: Record<string, number>;
|
|
418
433
|
}
|
|
434
|
+
/**
|
|
435
|
+
* Cross-stage consistency expectations for a Tier C workflow case. Every
|
|
436
|
+
* sub-check is optional so authors can opt in incrementally; an empty
|
|
437
|
+
* block produces zero verifier results.
|
|
438
|
+
*/
|
|
439
|
+
export interface WorkflowConsistencyExpected {
|
|
440
|
+
/**
|
|
441
|
+
* For each rule, every id extracted from the `from` stage must appear in
|
|
442
|
+
* every listed `to` stage. Typical entry: `{ idPattern: "D-\\d+", from:
|
|
443
|
+
* "scope", to: ["plan"] }`. Guards the "decisions flow downstream" rule.
|
|
444
|
+
*/
|
|
445
|
+
idsFlow?: Array<{
|
|
446
|
+
idPattern: string;
|
|
447
|
+
idFlags?: string;
|
|
448
|
+
from: WorkflowStageName;
|
|
449
|
+
to: WorkflowStageName[];
|
|
450
|
+
}>;
|
|
451
|
+
/**
|
|
452
|
+
* Stages that must not contain any of the listed phrases (matched
|
|
453
|
+
* case-insensitively). `phrases` falls back to `["TBD", "TODO", "placeholder"]`
|
|
454
|
+
* when it is an empty array; omit `placeholderFree` entirely to skip the check.
|
|
455
|
+
*/
|
|
456
|
+
placeholderFree?: {
|
|
457
|
+
stages: WorkflowStageName[];
|
|
458
|
+
phrases?: string[];
|
|
459
|
+
};
|
|
460
|
+
/**
|
|
461
|
+
* Free-form substring pairs: for every entry, if `must` appears in the
|
|
462
|
+
* named stage, `forbid` must NOT appear anywhere in the listed
|
|
463
|
+
* `stages`. Useful for "v1 decided in scope, plan must not say v2".
|
|
464
|
+
*/
|
|
465
|
+
noContradictions?: Array<{
|
|
466
|
+
stage: WorkflowStageName;
|
|
467
|
+
must: string;
|
|
468
|
+
forbid: string;
|
|
469
|
+
stages: WorkflowStageName[];
|
|
470
|
+
}>;
|
|
471
|
+
}
|
|
472
|
+
/**
|
|
473
|
+
* A single stage step inside a Tier C workflow case. The stage's
|
|
474
|
+
* `inputPrompt` is handed to the Tier B with-tools agent with prior-stage
|
|
475
|
+
* artifacts seeded into the sandbox under `stages/<name>.md`.
|
|
476
|
+
*/
|
|
477
|
+
export interface WorkflowStageStep {
|
|
478
|
+
name: WorkflowStageName;
|
|
479
|
+
inputPrompt: string;
|
|
480
|
+
/** Per-stage rubric id override (defaults to the stage name). */
|
|
481
|
+
rubric?: string;
|
|
482
|
+
/** Per-stage required rubric check ids (mirror of JudgeExpected.requiredChecks). */
|
|
483
|
+
requiredChecks?: string[];
|
|
484
|
+
/** Per-stage minimum rubric scores (mirror of JudgeExpected.minimumScores). */
|
|
485
|
+
minimumScores?: Record<string, number>;
|
|
486
|
+
}
|
|
487
|
+
/**
|
|
488
|
+
* Supported workflow stages. Deliberately a subset of `FlowStage` —
|
|
489
|
+
* Tier C covers the early "design" arc of a project. TDD/review/ship
|
|
490
|
+
* are out of scope (they require real code execution).
|
|
491
|
+
*/
|
|
492
|
+
export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
|
|
493
|
+
export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
|
|
494
|
+
/**
|
|
495
|
+
* A Tier C workflow case. Lives under
|
|
496
|
+
* `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
|
|
497
|
+
* through the with-tools agent.
|
|
498
|
+
*/
|
|
499
|
+
export interface WorkflowCase {
|
|
500
|
+
id: string;
|
|
501
|
+
/** Short human-readable description (rendered in reports). */
|
|
502
|
+
description?: string;
|
|
503
|
+
/** Project files seeded into the sandbox before stage 1 runs. */
|
|
504
|
+
contextFiles?: string[];
|
|
505
|
+
/** Ordered list of stages to run. Must be non-empty. */
|
|
506
|
+
stages: WorkflowStageStep[];
|
|
507
|
+
/** Cross-stage consistency checks (Tier C-specific verifier family). */
|
|
508
|
+
consistency?: WorkflowConsistencyExpected;
|
|
509
|
+
}
|
|
510
|
+
/** Per-stage record inside a Tier C workflow run. */
|
|
511
|
+
export interface WorkflowStageResult {
|
|
512
|
+
stage: WorkflowStageName;
|
|
513
|
+
artifact: string;
|
|
514
|
+
durationMs: number;
|
|
515
|
+
usageUsd: number;
|
|
516
|
+
toolUse: ToolUseSummary;
|
|
517
|
+
attempts: number;
|
|
518
|
+
model: string;
|
|
519
|
+
promptTokens: number;
|
|
520
|
+
completionTokens: number;
|
|
521
|
+
/** True when every judge verifier for this stage passed; false when the stage's rubric is missing or the judge call failed; unset when the judge was not requested. */
|
|
522
|
+
judgeOk?: boolean;
|
|
523
|
+
/** Per-rubric-check medians keyed by check id (for the report). */
|
|
524
|
+
judgeMedians?: Record<string, number>;
|
|
525
|
+
}
|
|
526
|
+
/** Tier C orchestration output collected by the runner. */
|
|
527
|
+
export interface WorkflowRunSummary {
|
|
528
|
+
caseId: string;
|
|
529
|
+
stages: WorkflowStageResult[];
|
|
530
|
+
totalUsageUsd: number;
|
|
531
|
+
totalDurationMs: number;
|
|
532
|
+
/** True when every stage judge was ok (or judge was skipped everywhere). */
|
|
533
|
+
allJudgeOk: boolean;
|
|
534
|
+
}
|
package/dist/eval/types.js
CHANGED
|
@@ -11,5 +11,25 @@ export const EVAL_TIERS = ["A", "B", "C"];
|
|
|
11
11
|
/**
|
|
12
12
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
13
13
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
14
|
+
* `consistency` is the Tier C cross-artifact family; it is deterministic, so it
|
|
15
|
+
* is the exception to the ordering above, and it spans several artifacts at once.
|
|
14
16
|
*/
|
|
15
|
-
export const VERIFIER_KINDS = [
|
|
17
|
+
export const VERIFIER_KINDS = [
|
|
18
|
+
"structural",
|
|
19
|
+
"rules",
|
|
20
|
+
"judge",
|
|
21
|
+
"workflow",
|
|
22
|
+
"consistency"
|
|
23
|
+
];
|
|
24
|
+
/**
|
|
25
|
+
* Supported workflow stages. Deliberately a subset of `FlowStage` —
|
|
26
|
+
* Tier C covers the early "design" arc of a project. TDD/review/ship
|
|
27
|
+
* are out of scope (they require real code execution).
|
|
28
|
+
*/
|
|
29
|
+
export const WORKFLOW_STAGES = [
|
|
30
|
+
"brainstorm",
|
|
31
|
+
"scope",
|
|
32
|
+
"design",
|
|
33
|
+
"spec",
|
|
34
|
+
"plan"
|
|
35
|
+
];
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-artifact consistency verifier for Tier C.
|
|
3
|
+
*
|
|
4
|
+
* Operates over a `{ stage → artifact }` map produced by the workflow
|
|
5
|
+
* agent and emits deterministic verifier results for:
|
|
6
|
+
*
|
|
7
|
+
* - `ids_flow`: every id extracted from `from` must appear in every
|
|
8
|
+
* `to` stage. Typical use — `D-\d+` from scope must all land in plan.
|
|
9
|
+
* - `placeholder_free`: none of the listed phrases
|
|
10
|
+
* (default `TBD`/`TODO`/`placeholder`) appear in any of the named
|
|
11
|
+
* stages.
|
|
12
|
+
* - `no_contradictions`: for each entry, if `must` is present in the
|
|
13
|
+
* declaring stage, `forbid` must not appear in any of the listed
|
|
14
|
+
* `stages`.
|
|
15
|
+
*
|
|
16
|
+
* Each sub-check contributes zero or more `VerifierResult`s with
|
|
17
|
+
* `kind: "consistency"`. An empty `WorkflowConsistencyExpected` produces
|
|
18
|
+
* zero results so authors can opt in incrementally.
|
|
19
|
+
*/
|
|
20
|
+
import type { VerifierResult, WorkflowConsistencyExpected, WorkflowStageName } from "../types.js";
|
|
21
|
+
export declare function verifyWorkflowConsistency(artifacts: Map<WorkflowStageName, string>, expected: WorkflowConsistencyExpected | undefined): VerifierResult[];
|