cclaw-cli 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,8 +3,10 @@ import { CCLAW_VERSION } from "../constants.js";
3
3
  import { FLOW_STAGES } from "../types.js";
4
4
  import { runSingleShot } from "./agents/single-shot.js";
5
5
  import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
6
+ import { runWorkflow } from "./agents/workflow.js";
6
7
  import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
7
8
  import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
9
+ import { loadWorkflowCorpus } from "./workflow-corpus.js";
8
10
  import { loadEvalConfig } from "./config-loader.js";
9
11
  import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
10
12
  import { createEvalClient, EvalLlmError } from "./llm-client.js";
@@ -13,6 +15,7 @@ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
13
15
  import { verifyRules } from "./verifiers/rules.js";
14
16
  import { verifyStructural } from "./verifiers/structural.js";
15
17
  import { verifyTraceability } from "./verifiers/traceability.js";
18
+ import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
16
19
  function groupByStage(cases) {
17
20
  return cases.reduce((acc, item) => {
18
21
  acc[item.stage] = (acc[item.stage] ?? 0) + 1;
@@ -42,7 +45,13 @@ function resolveRunFlags(options) {
42
45
  const judgeRequested = options.judge === true;
43
46
  const tier = options.tier ?? "A";
44
47
  const runJudge = judgeRequested && !schemaOnly;
45
- const runAgent = runJudge && (tier === "A" || tier === "B");
48
+ // Tier C always needs the agent loop (no fixture fallback for workflows),
49
+ // so we still require an LLM client but we do NOT require --judge on the
50
+ // CLI to produce a workflow run. The judge piece itself stays gated by
51
+ // `runJudge` so consistency-only runs are cheap and deterministic.
52
+ const runAgent = tier === "C"
53
+ ? !schemaOnly
54
+ : runJudge && (tier === "A" || tier === "B");
46
55
  return {
47
56
  runStructural: true,
48
57
  runRules: rulesRequested && !schemaOnly,
@@ -83,6 +92,184 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
83
92
  return undefined;
84
93
  }
85
94
  }
95
/**
 * Collects the optional judge overrides declared on a workflow stage step.
 *
 * Only truthy `rubric`, `requiredChecks` and `minimumScores` values are
 * carried over (in that key order); anything unset is left off the hint.
 *
 * @param step Workflow stage step to read the overrides from.
 * @returns A fresh hint object holding just the overrides that were set.
 */
function stageJudgeHint(step) {
    const { rubric, requiredChecks, minimumScores } = step;
    return {
        ...(rubric ? { rubric } : {}),
        ...(requiredChecks ? { requiredChecks } : {}),
        ...(minimumScores ? { minimumScores } : {})
    };
}
105
+ async function runWorkflowCase(ctx) {
106
+ const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
107
+ const started = Date.now();
108
+ const verifierResults = [];
109
+ let caseCostUsd = 0;
110
+ const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
111
+ "plan";
112
+ if (!flags.runAgent || !client) {
113
+ verifierResults.push({
114
+ kind: "workflow",
115
+ id: "workflow:agent:disabled",
116
+ ok: false,
117
+ score: 0,
118
+ message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
119
+ "Re-run with credentials to execute the workflow.",
120
+ details: { stages: workflow.stages.map((s) => s.name) }
121
+ });
122
+ return {
123
+ caseId: workflow.id,
124
+ stage: lastStage,
125
+ tier: plannedTier,
126
+ passed: false,
127
+ durationMs: Date.now() - started,
128
+ verifierResults
129
+ };
130
+ }
131
+ let workflowResult;
132
+ try {
133
+ workflowResult = await runWorkflow({
134
+ workflow,
135
+ config,
136
+ projectRoot,
137
+ client
138
+ });
139
+ }
140
+ catch (err) {
141
+ if (err instanceof DailyCostCapExceededError)
142
+ throw err;
143
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
144
+ const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
145
+ verifierResults.push({
146
+ kind: "workflow",
147
+ id: "workflow:agent:error",
148
+ ok: false,
149
+ score: 0,
150
+ message: err instanceof Error ? err.message : String(err),
151
+ details: {
152
+ retryable,
153
+ ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
154
+ }
155
+ });
156
+ return {
157
+ caseId: workflow.id,
158
+ stage: lastStage,
159
+ tier: plannedTier,
160
+ passed: false,
161
+ durationMs: Date.now() - started,
162
+ verifierResults
163
+ };
164
+ }
165
+ caseCostUsd += workflowResult.totalUsageUsd;
166
+ const stageResults = [...workflowResult.stages];
167
+ verifierResults.push({
168
+ kind: "workflow",
169
+ id: "workflow:agent",
170
+ ok: true,
171
+ score: 1,
172
+ message: `workflow ran ${stageResults.length} stage(s) in ` +
173
+ `${workflowResult.totalDurationMs}ms ` +
174
+ `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
175
+ details: {
176
+ stages: stageResults.map((s) => ({
177
+ name: s.stage,
178
+ durationMs: s.durationMs,
179
+ usageUsd: s.usageUsd,
180
+ turns: s.toolUse.turns,
181
+ calls: s.toolUse.calls
182
+ }))
183
+ }
184
+ });
185
+ let allJudgeOk = true;
186
+ if (flags.runJudge) {
187
+ for (let i = 0; i < workflow.stages.length; i += 1) {
188
+ const step = workflow.stages[i];
189
+ const stageResult = stageResults[i];
190
+ const rubric = rubrics.get(step.name);
191
+ if (!rubric) {
192
+ verifierResults.push({
193
+ kind: "judge",
194
+ id: `judge:rubric:missing:${step.name}`,
195
+ ok: false,
196
+ score: 0,
197
+ message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
198
+ details: { stage: step.name }
199
+ });
200
+ allJudgeOk = false;
201
+ stageResult.judgeOk = false;
202
+ continue;
203
+ }
204
+ const hint = stageJudgeHint(step);
205
+ try {
206
+ const invocation = await runJudge({
207
+ artifact: stageResult.artifact,
208
+ rubric,
209
+ config,
210
+ client,
211
+ caseHint: hint
212
+ });
213
+ caseCostUsd += invocation.usageUsd;
214
+ const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
215
+ const medians = {};
216
+ for (const agg of invocation.aggregates) {
217
+ medians[agg.checkId] = agg.median;
218
+ }
219
+ stageResult.judgeMedians = medians;
220
+ const stageOk = judgeVerifiers.every((v) => v.ok);
221
+ stageResult.judgeOk = stageOk;
222
+ if (!stageOk)
223
+ allJudgeOk = false;
224
+ for (const v of judgeVerifiers) {
225
+ verifierResults.push({
226
+ ...v,
227
+ id: `${v.id}:${step.name}`,
228
+ details: { ...(v.details ?? {}), stage: step.name }
229
+ });
230
+ }
231
+ }
232
+ catch (err) {
233
+ if (err instanceof DailyCostCapExceededError)
234
+ throw err;
235
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
236
+ verifierResults.push({
237
+ kind: "judge",
238
+ id: `judge:invocation:error:${step.name}`,
239
+ ok: false,
240
+ score: 0,
241
+ message: err instanceof Error ? err.message : String(err),
242
+ details: { retryable, rubricId: rubric.id, stage: step.name }
243
+ });
244
+ stageResult.judgeOk = false;
245
+ allJudgeOk = false;
246
+ }
247
+ }
248
+ }
249
+ const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
250
+ verifierResults.push(...consistencyResults);
251
+ const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
252
+ const allOk = nonSkipped.length === 0
253
+ ? verifierResults.every((r) => r.ok)
254
+ : nonSkipped.every((r) => r.ok);
255
+ const workflowSummary = {
256
+ caseId: workflow.id,
257
+ stages: stageResults,
258
+ totalUsageUsd: workflowResult.totalUsageUsd,
259
+ totalDurationMs: workflowResult.totalDurationMs,
260
+ allJudgeOk: flags.runJudge ? allJudgeOk : true
261
+ };
262
+ return {
263
+ caseId: workflow.id,
264
+ stage: lastStage,
265
+ tier: plannedTier,
266
+ passed: allOk,
267
+ durationMs: Date.now() - started,
268
+ costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
269
+ verifierResults,
270
+ workflow: workflowSummary
271
+ };
272
+ }
86
273
  async function runCase(ctx) {
87
274
  const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
88
275
  const started = Date.now();
@@ -327,18 +514,22 @@ function stagesInResults(caseResults) {
327
514
  */
328
515
  export async function runEval(options) {
329
516
  const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
330
- const corpus = await loadCorpus(options.projectRoot, options.stage);
331
517
  const plannedTier = options.tier ?? config.defaultTier;
518
+ const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
519
+ const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
332
520
  const notes = [];
333
- if (corpus.length === 0) {
521
+ if (plannedTier !== "C" && corpus.length === 0) {
334
522
  notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
335
523
  }
524
+ if (plannedTier === "C" && workflowCorpus.length === 0) {
525
+ notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
526
+ }
336
527
  const flags = resolveRunFlags(options);
337
528
  if (flags.runJudge && !config.apiKey && !options.llmClient) {
338
529
  notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
339
530
  }
340
- if ((options.tier ?? "A") !== "A" && flags.runJudge) {
341
- notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
531
+ if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
532
+ notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
342
533
  }
343
534
  if (options.dryRun === true) {
344
535
  const summary = {
@@ -349,12 +540,20 @@ export async function runEval(options) {
349
540
  byStage: groupByStage(corpus),
350
541
  cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
351
542
  },
543
+ workflowCorpus: {
544
+ total: workflowCorpus.length,
545
+ cases: workflowCorpus.map((item) => ({
546
+ id: item.id,
547
+ stages: item.stages.map((s) => s.name)
548
+ }))
549
+ },
352
550
  plannedTier,
353
551
  verifiersAvailable: {
354
552
  structural: flags.runStructural,
355
553
  rules: flags.runRules,
356
554
  judge: flags.runJudge,
357
- workflow: flags.runAgent
555
+ workflow: flags.runAgent,
556
+ consistency: plannedTier === "C"
358
557
  },
359
558
  notes
360
559
  };
@@ -362,26 +561,44 @@ export async function runEval(options) {
362
561
  }
363
562
  const costGuard = createCostGuard(options.projectRoot, config);
364
563
  let wrappedClient;
365
- if (flags.runJudge) {
564
+ const clientNeeded = flags.runJudge || plannedTier === "C";
565
+ if (clientNeeded) {
366
566
  const base = options.llmClient ?? createEvalClient(config);
367
567
  wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
368
568
  }
369
- const rubrics = flags.runJudge
569
+ const rubricsNeeded = flags.runJudge;
570
+ const rubrics = rubricsNeeded
370
571
  ? await loadAllRubrics(options.projectRoot)
371
572
  : new Map();
372
573
  const now = new Date().toISOString();
373
574
  const caseResults = [];
374
- for (const item of corpus) {
375
- caseResults.push(await runCase({
376
- projectRoot: options.projectRoot,
377
- caseEntry: item,
378
- plannedTier,
379
- flags,
380
- config,
381
- client: wrappedClient,
382
- costGuard,
383
- rubrics
384
- }));
575
+ if (plannedTier === "C") {
576
+ for (const wf of workflowCorpus) {
577
+ caseResults.push(await runWorkflowCase({
578
+ projectRoot: options.projectRoot,
579
+ workflow: wf,
580
+ plannedTier,
581
+ flags,
582
+ config,
583
+ client: wrappedClient,
584
+ costGuard,
585
+ rubrics
586
+ }));
587
+ }
588
+ }
589
+ else {
590
+ for (const item of corpus) {
591
+ caseResults.push(await runCase({
592
+ projectRoot: options.projectRoot,
593
+ caseEntry: item,
594
+ plannedTier,
595
+ flags,
596
+ config,
597
+ client: wrappedClient,
598
+ costGuard,
599
+ rubrics
600
+ }));
601
+ }
385
602
  }
386
603
  const stages = stagesInResults(caseResults);
387
604
  const baselines = await loadBaselinesByStage(options.projectRoot, stages);
@@ -24,8 +24,10 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
24
24
  /**
25
25
  * Verifier kinds, in increasing cost and decreasing determinism:
26
26
  * structural and rules run without LLM; judge and workflow use the configured model.
27
+ * `consistency` is the Tier C cross-artifact family (deterministic but
28
+ * operates over multiple artifacts at once).
27
29
  */
28
- export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
30
+ export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
29
31
  export type VerifierKind = (typeof VERIFIER_KINDS)[number];
30
32
  /**
31
33
  * Structural expectations — deterministic, LLM-free checks against a single
@@ -199,6 +201,11 @@ export interface EvalCaseResult {
199
201
  durationMs: number;
200
202
  costUsd?: number;
201
203
  verifierResults: VerifierResult[];
204
+ /**
205
+ * Tier C only: the per-stage breakdown collected by the workflow
206
+ * agent. Unset for Tier A/B cases so the on-disk JSON stays small.
207
+ */
208
+ workflow?: WorkflowRunSummary;
202
209
  }
203
210
  /** Top-level eval report, serialized to JSON and rendered to Markdown. */
204
211
  export interface EvalReport {
@@ -286,6 +293,14 @@ export interface EvalConfig {
286
293
  * marker so the model sees the cutoff.
287
294
  */
288
295
  toolMaxResultBytes?: number;
296
+ /**
297
+ * Maximum total turns a single Tier C workflow case may consume
298
+ * across all stages combined. Defaults to 40 (stages × toolMaxTurns).
299
+ * Runs that exceed the cap fail the current stage with a
300
+ * `MaxTurnsExceededError` propagated from the underlying with-tools
301
+ * loop rather than a dedicated workflow-level error.
302
+ */
303
+ workflowMaxTotalTurns?: number;
289
304
  }
290
305
  /** Per-model pricing schedule, expressed as USD per 1K tokens. */
291
306
  export interface TokenPricing {
@@ -416,3 +431,104 @@ export interface ToolUseSummary {
416
431
  /** Per-tool call counts, keyed by tool name. */
417
432
  byTool: Record<string, number>;
418
433
  }
434
+ /**
435
+ * Cross-stage consistency expectations for a Tier C workflow case. Every
436
+ * sub-check is optional so authors can opt in incrementally; an empty
437
+ * block produces zero verifier results.
438
+ */
439
+ export interface WorkflowConsistencyExpected {
440
+ /**
441
+ * For each rule, every id extracted from the `from` stage must appear in
442
+ * every listed `to` stage. Typical entry: `{ idPattern: "D-\\d+", from:
443
+ * "scope", to: ["plan"] }`. Guards the "decisions flow downstream" rule.
444
+ */
445
+ idsFlow?: Array<{
446
+ idPattern: string;
447
+ idFlags?: string;
448
+ from: WorkflowStageName;
449
+ to: WorkflowStageName[];
450
+ }>;
451
+ /**
452
+ * Stages that must not contain any of the listed case-insensitive
453
+ * phrases. An empty `phrases` array falls back to `["TBD", "TODO", "placeholder"]`
454
+ * (presumably an omitted one too); omit `placeholderFree` entirely to skip the check.
455
+ */
456
+ placeholderFree?: {
457
+ stages: WorkflowStageName[];
458
+ phrases?: string[];
459
+ };
460
+ /**
461
+ * Free-form substring pairs: for every entry, if `must` appears in the
462
+ * named stage, `forbid` must NOT appear anywhere in the listed
463
+ * `stages`. Useful for "v1 decided in scope, plan must not say v2".
464
+ */
465
+ noContradictions?: Array<{
466
+ stage: WorkflowStageName;
467
+ must: string;
468
+ forbid: string;
469
+ stages: WorkflowStageName[];
470
+ }>;
471
+ }
472
+ /**
473
+ * A single stage step inside a Tier C workflow case. The stage's
474
+ * `inputPrompt` is handed to the Tier B with-tools agent with prior-stage
475
+ * artifacts seeded into the sandbox under `stages/<name>.md`.
476
+ */
477
+ export interface WorkflowStageStep {
478
+ name: WorkflowStageName;
479
+ inputPrompt: string;
480
+ /** Per-stage rubric id override (defaults to the stage name). */
481
+ rubric?: string;
482
+ /** Per-stage required rubric check ids (mirror of JudgeExpected.requiredChecks). */
483
+ requiredChecks?: string[];
484
+ /** Per-stage minimum rubric scores (mirror of JudgeExpected.minimumScores). */
485
+ minimumScores?: Record<string, number>;
486
+ }
487
+ /**
488
+ * Supported workflow stages. Deliberately a subset of `FlowStage` —
489
+ * Tier C covers the early "design" arc of a project. TDD/review/ship
490
+ * are out of scope (they require real code execution).
491
+ */
492
+ export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
493
+ export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
494
+ /**
495
+ * A Tier C workflow case. Lives under
496
+ * `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
497
+ * through the with-tools agent.
498
+ */
499
+ export interface WorkflowCase {
500
+ id: string;
501
+ /** Short human-readable description (rendered in reports). */
502
+ description?: string;
503
+ /** Project files seeded into the sandbox before stage 1 runs. */
504
+ contextFiles?: string[];
505
+ /** Ordered list of stages to run. Must be non-empty. */
506
+ stages: WorkflowStageStep[];
507
+ /** Cross-stage consistency checks (Tier C-specific verifier family). */
508
+ consistency?: WorkflowConsistencyExpected;
509
+ }
510
+ /** Per-stage record inside a Tier C workflow run. */
511
+ export interface WorkflowStageResult {
512
+ stage: WorkflowStageName;
513
+ artifact: string;
514
+ durationMs: number;
515
+ usageUsd: number;
516
+ toolUse: ToolUseSummary;
517
+ attempts: number;
518
+ model: string;
519
+ promptTokens: number;
520
+ completionTokens: number;
521
+ /** True when the judge (when requested) produced `ok:true` for every required check. */
522
+ judgeOk?: boolean;
523
+ /** Per-rubric-check medians keyed by check id (for the report). */
524
+ judgeMedians?: Record<string, number>;
525
+ }
526
+ /** Tier C orchestration output collected by the runner. */
527
+ export interface WorkflowRunSummary {
528
+ caseId: string;
529
+ stages: WorkflowStageResult[];
530
+ totalUsageUsd: number;
531
+ totalDurationMs: number;
532
+ /** True when every stage judge was ok (or judge was skipped everywhere). */
533
+ allJudgeOk: boolean;
534
+ }
@@ -11,5 +11,25 @@ export const EVAL_TIERS = ["A", "B", "C"];
11
11
  /**
12
12
  * Verifier kinds, in increasing cost and decreasing determinism:
13
13
  * structural and rules run without LLM; judge and workflow use the configured model.
14
+ * `consistency` is the Tier C cross-artifact family (deterministic but
15
+ * operates over multiple artifacts at once).
14
16
  */
15
- export const VERIFIER_KINDS = ["structural", "rules", "judge", "workflow"];
17
+ export const VERIFIER_KINDS = [
18
+ "structural",
19
+ "rules",
20
+ "judge",
21
+ "workflow",
22
+ "consistency"
23
+ ];
24
+ /**
25
+ * Supported workflow stages. Deliberately a subset of `FlowStage` —
26
+ * Tier C covers the early "design" arc of a project. TDD/review/ship
27
+ * are out of scope (they require real code execution).
28
+ */
29
+ export const WORKFLOW_STAGES = [
30
+ "brainstorm",
31
+ "scope",
32
+ "design",
33
+ "spec",
34
+ "plan"
35
+ ];
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Cross-artifact consistency verifier for Tier C.
3
+ *
4
+ * Operates over a `{ stage → artifact }` map produced by the workflow
5
+ * agent and emits deterministic verifier results for:
6
+ *
7
+ * - `ids_flow`: every id extracted from `from` must appear in every
8
+ * `to` stage. Typical use — `D-\d+` from scope must all land in plan.
9
+ * - `placeholder_free`: none of the listed phrases
10
+ * (default `TBD`/`TODO`/`placeholder`) appear in any of the named
11
+ * stages.
12
+ * - `no_contradictions`: for each entry, if `must` is present in the
13
+ * declaring stage, `forbid` must not appear in any of the listed
14
+ * `stages`.
15
+ *
16
+ * Each sub-check contributes zero or more `VerifierResult`s with
17
+ * `kind: "consistency"`. An empty `WorkflowConsistencyExpected` produces
18
+ * zero results so authors can opt in incrementally.
19
+ */
20
+ import type { VerifierResult, WorkflowConsistencyExpected, WorkflowStageName } from "../types.js";
21
+ export declare function verifyWorkflowConsistency(artifacts: Map<WorkflowStageName, string>, expected: WorkflowConsistencyExpected | undefined): VerifierResult[];