cclaw-cli 0.25.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +79 -4
- package/dist/eval/agents/with-tools.d.ts +44 -0
- package/dist/eval/agents/with-tools.js +261 -0
- package/dist/eval/agents/workflow.d.ts +24 -0
- package/dist/eval/agents/workflow.js +133 -0
- package/dist/eval/config-loader.js +38 -2
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +10 -0
- package/dist/eval/llm-client.js +10 -1
- package/dist/eval/report.js +54 -0
- package/dist/eval/runner.d.ts +10 -1
- package/dist/eval/runner.js +285 -20
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +152 -1
- package/dist/eval/types.js +21 -1
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
package/dist/eval/types.d.ts
CHANGED
|
@@ -24,8 +24,10 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
|
24
24
|
/**
|
|
25
25
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
26
26
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
27
|
+
* `consistency` is the Tier C cross-artifact family (deterministic but
|
|
28
|
+
* operates over multiple artifacts at once).
|
|
27
29
|
*/
|
|
28
|
-
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
30
|
+
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
|
|
29
31
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
32
|
/**
|
|
31
33
|
* Structural expectations — deterministic, LLM-free checks against a single
|
|
@@ -199,6 +201,11 @@ export interface EvalCaseResult {
|
|
|
199
201
|
durationMs: number;
|
|
200
202
|
costUsd?: number;
|
|
201
203
|
verifierResults: VerifierResult[];
|
|
204
|
+
/**
|
|
205
|
+
* Tier C only: the per-stage breakdown collected by the workflow
|
|
206
|
+
* agent. Unset for Tier A/B cases so the on-disk JSON stays small.
|
|
207
|
+
*/
|
|
208
|
+
workflow?: WorkflowRunSummary;
|
|
202
209
|
}
|
|
203
210
|
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
|
|
204
211
|
export interface EvalReport {
|
|
@@ -268,6 +275,32 @@ export interface EvalConfig {
|
|
|
268
275
|
* `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
|
|
269
276
|
*/
|
|
270
277
|
tokenPricing?: Record<string, TokenPricing>;
|
|
278
|
+
/**
|
|
279
|
+
* Maximum assistant turns (tool_calls → tool result cycles) allowed by
|
|
280
|
+
* the Tier B with-tools agent. Defaults to 8 when unset. Runs that
|
|
281
|
+
* exceed the cap fail with a `MaxTurnsExceededError` and surface as a
|
|
282
|
+
* workflow verifier result.
|
|
283
|
+
*/
|
|
284
|
+
toolMaxTurns?: number;
|
|
285
|
+
/**
|
|
286
|
+
* Per-invocation ceiling on tool call arguments bytes. Defends against
|
|
287
|
+
* runaway writes. Defaults to 64 KiB.
|
|
288
|
+
*/
|
|
289
|
+
toolMaxArgumentsBytes?: number;
|
|
290
|
+
/**
|
|
291
|
+
* Per-invocation ceiling on tool call result bytes returned to the
|
|
292
|
+
* model. Defaults to 32 KiB; longer results are truncated with a
|
|
293
|
+
* marker so the model sees the cutoff.
|
|
294
|
+
*/
|
|
295
|
+
toolMaxResultBytes?: number;
|
|
296
|
+
/**
|
|
297
|
+
* Maximum total turns a single Tier C workflow case may consume
|
|
298
|
+
* across all stages combined. Defaults to 40 (stages × toolMaxTurns).
|
|
299
|
+
* Runs that exceed the cap fail the current stage with a
|
|
300
|
+
* `MaxTurnsExceededError` propagated from the underlying with-tools
|
|
301
|
+
* loop rather than a dedicated workflow-level error.
|
|
302
|
+
*/
|
|
303
|
+
workflowMaxTotalTurns?: number;
|
|
271
304
|
}
|
|
272
305
|
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
273
306
|
export interface TokenPricing {
|
|
@@ -381,3 +414,121 @@ export interface JudgeInvocation {
|
|
|
381
414
|
usageUsd: number;
|
|
382
415
|
durationMs: number;
|
|
383
416
|
}
|
|
417
|
+
/**
 * Summary of tool usage emitted by the Tier B with-tools agent. It is
 * recorded so the runner can show per-case tool metrics in the markdown
 * report (number of calls, depth, error rate, denied paths).
 */
export interface ToolUseSummary {
    /** How many turns were used before the agent produced a terminal assistant message. */
    turns: number;
    /** Count of successful tool invocations, summed over all turns. */
    calls: number;
    /** Count of tool invocations that came back with an error (bad args, denied path, etc.). */
    errors: number;
    /** Paths the sandbox refused to resolve (escape attempts, missing files). */
    deniedPaths: Array<string>;
    /** Call count per tool; the key is the tool name. */
    byTool: Record<string, number>;
}
|
|
434
|
+
/**
 * Cross-stage consistency expectations attached to a Tier C workflow case.
 * All three sub-checks are optional, so authors can adopt them one at a
 * time; a block with none of them set yields no verifier results.
 */
export interface WorkflowConsistencyExpected {
    /**
     * "Decisions flow downstream" rules. For each rule, every id matched by
     * `idPattern` in the `from` stage has to show up in each stage listed in
     * `to`. Example: `{ idPattern: "D-\\d+", from: "scope", to: ["plan"] }`.
     * `idFlags` are regex flags; the verifier uses `"g"` when they are omitted.
     */
    idsFlow?: {
        idPattern: string;
        idFlags?: string;
        from: WorkflowStageName;
        to: WorkflowStageName[];
    }[];
    /**
     * Stages that must contain none of the given phrases (compared
     * case-insensitively). When `phrases` is omitted or empty the verifier
     * falls back to `["TBD", "TODO", "placeholder"]`. Leave the whole
     * `placeholderFree` block out to skip this check.
     */
    placeholderFree?: {
        stages: WorkflowStageName[];
        phrases?: string[];
    };
    /**
     * Free-form substring pairs. For each entry: when `must` occurs in
     * `stage`, `forbid` may NOT occur in any of the listed `stages`.
     * Handy for "scope decided v1, so plan must not mention v2".
     */
    noContradictions?: {
        stage: WorkflowStageName;
        must: string;
        forbid: string;
        stages: WorkflowStageName[];
    }[];
}
|
|
472
|
+
/**
 * One stage step of a Tier C workflow case. Its `inputPrompt` is given to
 * the Tier B with-tools agent, with artifacts from earlier stages seeded
 * into the sandbox as `stages/<name>.md`.
 */
export interface WorkflowStageStep {
    /** Which workflow stage this step runs. */
    name: WorkflowStageName;
    /** Prompt text handed to the agent for this stage. */
    inputPrompt: string;
    /** Rubric id override for this stage; the stage name is used when absent. */
    rubric?: string;
    /** Rubric check ids required for this stage (mirrors JudgeExpected.requiredChecks). */
    requiredChecks?: Array<string>;
    /** Minimum rubric scores for this stage (mirrors JudgeExpected.minimumScores). */
    minimumScores?: Record<string, number>;
}
|
|
487
|
+
/**
 * Stage names a Tier C workflow may use. This is intentionally only part
 * of `FlowStage`: Tier C exercises the early "design" arc of a project,
 * while TDD/review/ship are excluded because they need real code execution.
 */
export declare const WORKFLOW_STAGES: readonly [
    "brainstorm",
    "scope",
    "design",
    "spec",
    "plan"
];
/** Union of the stage-name literals listed in `WORKFLOW_STAGES`. */
export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
|
|
494
|
+
/**
 * Definition of a Tier C workflow case. Case files are stored at
 * `.cclaw/evals/corpus/workflows/<id>.yaml` and describe a multi-stage run
 * driven through the with-tools agent.
 */
export interface WorkflowCase {
    /** Case identifier. */
    id: string;
    /** Brief human-readable description, shown in reports. */
    description?: string;
    /** Project files placed in the sandbox before the first stage runs. */
    contextFiles?: Array<string>;
    /** Stages to run, in order. Must contain at least one entry. */
    stages: Array<WorkflowStageStep>;
    /** Cross-stage consistency checks (the Tier C-specific verifier family). */
    consistency?: WorkflowConsistencyExpected;
}
|
|
510
|
+
/** Record of a single stage within a Tier C workflow run. */
export interface WorkflowStageResult {
    /** Stage this record belongs to. */
    stage: WorkflowStageName;
    artifact: string;
    durationMs: number;
    usageUsd: number;
    /** Tool-use summary for this stage. */
    toolUse: ToolUseSummary;
    attempts: number;
    model: string;
    promptTokens: number;
    completionTokens: number;
    /** Set to true when the judge (if requested) returned `ok:true` for every required check. */
    judgeOk?: boolean;
    /** Median score per rubric check, keyed by check id (used by the report). */
    judgeMedians?: Record<string, number>;
}
|
|
526
|
+
/** Output of a Tier C orchestration, as collected by the runner. */
export interface WorkflowRunSummary {
    /** Id of the workflow case this summary belongs to. */
    caseId: string;
    /** One record per stage. */
    stages: Array<WorkflowStageResult>;
    totalUsageUsd: number;
    totalDurationMs: number;
    /** True if every stage judge was ok, or if the judge was skipped for every stage. */
    allJudgeOk: boolean;
}
|
package/dist/eval/types.js
CHANGED
|
@@ -11,5 +11,25 @@ export const EVAL_TIERS = ["A", "B", "C"];
|
|
|
11
11
|
/**
|
|
12
12
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
13
13
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
14
|
+
* `consistency` is the Tier C cross-artifact family (deterministic but
|
|
15
|
+
* operates over multiple artifacts at once).
|
|
14
16
|
*/
|
|
15
|
-
export const VERIFIER_KINDS = [
|
|
17
|
+
export const VERIFIER_KINDS = ["structural", "rules", "judge", "workflow", "consistency"];
|
|
24
|
+
/**
 * Stage names a Tier C workflow may use. This is intentionally only part
 * of `FlowStage`: Tier C exercises the early "design" arc of a project,
 * while TDD/review/ship are excluded because they need real code execution.
 */
export const WORKFLOW_STAGES = ["brainstorm", "scope", "design", "spec", "plan"];
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-artifact consistency verifier for Tier C.
|
|
3
|
+
*
|
|
4
|
+
* Operates over a `{ stage → artifact }` map produced by the workflow
|
|
5
|
+
* agent and emits deterministic verifier results for:
|
|
6
|
+
*
|
|
7
|
+
* - `ids_flow`: every id extracted from `from` must appear in every
|
|
8
|
+
* `to` stage. Typical use — `D-\d+` from scope must all land in plan.
|
|
9
|
+
* - `placeholder_free`: none of the listed phrases
|
|
10
|
+
* (default `TBD`/`TODO`/`placeholder`) appear in any of the named
|
|
11
|
+
* stages.
|
|
12
|
+
* - `no_contradictions`: for each entry, if `must` is present in the
|
|
13
|
+
* declaring stage, `forbid` must not appear in any of the listed
|
|
14
|
+
* `stages`.
|
|
15
|
+
*
|
|
16
|
+
* Each sub-check contributes zero or more `VerifierResult`s with
|
|
17
|
+
* `kind: "consistency"`. An empty `WorkflowConsistencyExpected` produces
|
|
18
|
+
* zero results so authors can opt in incrementally.
|
|
19
|
+
*/
|
|
20
|
+
import type { VerifierResult, WorkflowConsistencyExpected, WorkflowStageName } from "../types.js";
|
|
21
|
+
/**
 * Run the configured cross-stage consistency sub-checks.
 *
 * @param artifacts Map from workflow stage name to that stage's artifact text.
 * @param expected Consistency expectations for the case, or `undefined` when the case declares none.
 * @returns The verifier results emitted by the sub-checks.
 */
export declare function verifyWorkflowConsistency(artifacts: Map<WorkflowStageName, string>, expected: WorkflowConsistencyExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/** Phrases used by the placeholder-free check when a case lists none of its own. */
const DEFAULT_PLACEHOLDERS = ["TBD", "TODO", "placeholder"];
/**
 * Run every configured cross-stage consistency sub-check and gather the results.
 *
 * @param artifacts Map from stage name to that stage's artifact text.
 * @param expected Consistency expectations; a falsy value produces no results.
 * @returns Results in a fixed order: all ids-flow rules, then the
 *   placeholder-free check, then all no-contradiction rules.
 */
export function verifyWorkflowConsistency(artifacts, expected) {
    if (!expected) {
        return [];
    }
    const collected = [];
    for (const rule of expected.idsFlow || []) {
        collected.push(...checkIdsFlow(artifacts, rule));
    }
    const placeholderFree = expected.placeholderFree;
    if (placeholderFree) {
        // An omitted or empty phrase list means "use the defaults".
        const custom = placeholderFree.phrases;
        const phrases = custom && custom.length > 0 ? custom : DEFAULT_PLACEHOLDERS;
        collected.push(...checkPlaceholderFree(artifacts, placeholderFree.stages, phrases));
    }
    for (const rule of expected.noContradictions || []) {
        collected.push(...checkNoContradiction(artifacts, rule));
    }
    return collected;
}
|
|
23
|
+
/**
 * Turn arbitrary text into a short id fragment: lowercased, with every run
 * of characters outside `a-z0-9` collapsed to a single hyphen, no hyphen at
 * either end from that collapsing, then cut to at most 48 characters.
 */
function slug(value) {
    const words = value
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter((word) => word.length > 0);
    return words.join("-").slice(0, 48);
}
|
|
30
|
+
/**
 * Produce the failing result reported when a stage has no artifact.
 *
 * @param artifacts Map from stage name to artifact text.
 * @param stage Stage whose presence is being checked.
 * @param verifierId Id to put on the failure result.
 * @param label Short tag naming the check, embedded in the message.
 * @returns `undefined` when `artifacts` has an entry for `stage`;
 *   otherwise a failing `consistency` result.
 */
function missingStage(artifacts, stage, verifierId, label) {
    const present = artifacts.has(stage);
    return present
        ? undefined
        : {
            kind: "consistency",
            id: verifierId,
            ok: false,
            score: 0,
            message: `Workflow artifact for stage "${stage}" is missing (${label}).`,
            details: { stage, missing: true }
        };
}
|
|
42
|
+
/**
 * Collect the distinct, non-empty substrings of `text` matched by a regex.
 *
 * @param text Text to scan.
 * @param pattern Regex source.
 * @param flags Regex flags; `g` is appended when absent so every match is visited.
 * @returns The unique full-match strings, sorted with `localeCompare`.
 * @throws SyntaxError (from `RegExp`) when `pattern` or `flags` is invalid.
 */
function extractIds(text, pattern, flags) {
    const globalFlags = flags.includes("g") ? flags : `${flags}g`;
    const regex = new RegExp(pattern, globalFlags);
    // matchAll steps past zero-length matches by itself, so no manual
    // lastIndex bookkeeping is needed.
    const matched = Array.from(text.matchAll(regex), (match) => match[0]);
    // A zero-length match is not an id: the empty string is a substring of
    // every text, so keeping it would make any containment check pass.
    const unique = [...new Set(matched.filter((id) => id.length > 0))];
    return unique.sort((a, b) => a.localeCompare(b));
}
|
|
54
|
+
/**
 * Tell whether `id` occurs in `body` as a standalone token.
 *
 * A raw substring test would accept `D-10` as an occurrence of `D-1`. When
 * the id starts (or ends) with a letter or digit, the character just before
 * (or after) the occurrence must therefore not be a letter or digit. Ids
 * that start/end with punctuation keep plain substring semantics on that side.
 */
function containsStandaloneId(body, id) {
    const isAlnum = (ch) => ch !== undefined && /^[\p{L}\p{N}]$/u.test(ch);
    const guardLeft = isAlnum(id[0]);
    const guardRight = isAlnum(id[id.length - 1]);
    for (let at = body.indexOf(id); at !== -1; at = body.indexOf(id, at + 1)) {
        const leftOk = !guardLeft || !isAlnum(body[at - 1]);
        const rightOk = !guardRight || !isAlnum(body[at + id.length]);
        if (leftOk && rightOk) {
            return true;
        }
    }
    return false;
}
/**
 * Evaluate one ids-flow rule: every id matched by `rule.idPattern` in the
 * `rule.from` artifact must appear in each `rule.to` artifact.
 *
 * @param artifacts Map from stage name to artifact text.
 * @param rule Rule with `idPattern`, optional `idFlags` (default `"g"`), `from` and `to`.
 * @returns A single failing result when the source artifact is missing, the
 *   regex is invalid, or no id matches in the source; otherwise one result
 *   per target stage (pass, fail with the missing ids, or target-missing).
 */
function checkIdsFlow(artifacts, rule) {
    const baseId = `consistency:ids-flow:${rule.from}:${slug(rule.idPattern)}`;
    const sourceMissing = missingStage(artifacts, rule.from, `${baseId}:source-missing`, "ids-flow source");
    if (sourceMissing) {
        return [sourceMissing];
    }
    let sourceIds;
    try {
        sourceIds = extractIds(artifacts.get(rule.from), rule.idPattern, rule.idFlags ?? "g");
    }
    catch (err) {
        // A bad pattern or flag string is reported as a failing result
        // rather than thrown.
        return [{
                kind: "consistency",
                id: `${baseId}:regex`,
                ok: false,
                score: 0,
                message: `Invalid id regex "${rule.idPattern}": ${err instanceof Error ? err.message : String(err)}`,
                details: { from: rule.from }
            }];
    }
    if (sourceIds.length === 0) {
        // Nothing to propagate: fail so the rule is not vacuously satisfied.
        return [{
                kind: "consistency",
                id: `${baseId}:source-empty`,
                ok: false,
                score: 0,
                message: `No ids matched "${rule.idPattern}" in stage "${rule.from}".`,
                details: { from: rule.from, pattern: rule.idPattern }
            }];
    }
    const results = [];
    for (const target of rule.to) {
        const targetMissing = missingStage(artifacts, target, `${baseId}:${target}:target-missing`, "ids-flow target");
        if (targetMissing) {
            results.push(targetMissing);
            continue;
        }
        const body = artifacts.get(target);
        const missing = sourceIds.filter((id) => !containsStandaloneId(body, id));
        const verifierId = `${baseId}:${target}`;
        if (missing.length === 0) {
            results.push({
                kind: "consistency",
                id: verifierId,
                ok: true,
                score: 1,
                message: `All ${sourceIds.length} id(s) from "${rule.from}" appear in "${target}".`,
                details: { from: rule.from, to: target, ids: sourceIds }
            });
        }
        else {
            // Only the first five missing ids go in the message; the full
            // list is kept in `details.missing`.
            results.push({
                kind: "consistency",
                id: verifierId,
                ok: false,
                score: 0,
                message: `Missing in "${target}": ${missing.slice(0, 5).join(", ")}` +
                    (missing.length > 5 ? ` (+${missing.length - 5} more)` : ""),
                details: {
                    from: rule.from,
                    to: target,
                    ids: sourceIds,
                    missing
                }
            });
        }
    }
    return results;
}
|
|
128
|
+
/**
 * Check that none of `phrases` occurs (case-insensitively, as a substring)
 * in the artifact of each listed stage.
 *
 * @param artifacts Map from stage name to artifact text.
 * @param stages Stages to inspect.
 * @param phrases Phrases that must not appear.
 * @returns One result per stage: a pass, a failure naming the phrases that
 *   were found, or a missing-artifact failure.
 */
function checkPlaceholderFree(artifacts, stages, phrases) {
    const results = [];
    for (const stage of stages) {
        const verifierId = `consistency:placeholder-free:${stage}`;
        const absent = missingStage(artifacts, stage, verifierId, "placeholder-free");
        if (absent !== undefined) {
            results.push(absent);
            continue;
        }
        const haystack = artifacts.get(stage).toLowerCase();
        const hits = phrases.filter((phrase) => haystack.includes(phrase.toLowerCase()));
        const clean = hits.length === 0;
        results.push(clean
            ? {
                kind: "consistency",
                id: verifierId,
                ok: true,
                score: 1,
                message: `No placeholder phrases found in "${stage}".`,
                details: { stage, phrases }
            }
            : {
                kind: "consistency",
                id: verifierId,
                ok: false,
                score: 0,
                message: `Placeholder phrases in "${stage}": ${hits.join(", ")}.`,
                details: { stage, phrases, hits }
            });
    }
    return results;
}
|
|
163
|
+
/**
 * Evaluate one no-contradiction rule: when the anchor stage contains
 * `rule.must`, none of `rule.stages` may contain `rule.forbid`. Both
 * comparisons are case-insensitive substring tests.
 *
 * @param artifacts Map from stage name to artifact text.
 * @param rule Rule with `stage`, `must`, `forbid` and `stages`.
 * @returns A single result when the anchor artifact is missing (failure) or
 *   does not contain `must` (pass, marked skipped); otherwise one result per
 *   target stage.
 */
function checkNoContradiction(artifacts, rule) {
    const { stage: anchorStage, must, forbid } = rule;
    const baseId = `consistency:no-contradiction:${anchorStage}:${slug(must)}-vs-${slug(forbid)}`;
    const anchorGone = missingStage(artifacts, anchorStage, `${baseId}:anchor-missing`, "no-contradiction anchor");
    if (anchorGone) {
        return [anchorGone];
    }
    const anchorAsserted = artifacts.get(anchorStage).toLowerCase().includes(must.toLowerCase());
    if (!anchorAsserted) {
        // The anchor stage never states `must`, so the rule holds vacuously.
        return [{
                kind: "consistency",
                id: `${baseId}:anchor-inactive`,
                ok: true,
                score: 1,
                message: `Anchor "${must}" not present in "${anchorStage}"; contradiction check skipped.`,
                details: { stage: anchorStage, anchor: must, skipped: true }
            }];
    }
    const results = [];
    for (const target of rule.stages) {
        const verifierId = `${baseId}:${target}`;
        const targetGone = missingStage(artifacts, target, `${verifierId}:target-missing`, "no-contradiction target");
        if (targetGone) {
            results.push(targetGone);
            continue;
        }
        const contradicted = artifacts.get(target).toLowerCase().includes(forbid.toLowerCase());
        results.push({
            kind: "consistency",
            id: verifierId,
            ok: !contradicted,
            score: contradicted ? 0 : 1,
            message: contradicted
                ? `"${anchorStage}" asserts "${must}" but "${target}" contains "${forbid}".`
                : `"${target}" does not contradict "${anchorStage}" on "${must}".`,
            details: {
                stage: anchorStage,
                anchor: must,
                forbid,
                target
            }
        });
    }
    return results;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { WorkflowCase } from "./types.js";
|
|
2
|
+
/**
 * Load all Tier C workflow cases found in
 * `.cclaw/evals/corpus/workflows/*.yaml`.
 *
 * @param projectRoot Project root directory the corpus path is resolved against.
 * @returns The loaded cases; an empty array when the directory does not
 *   exist, which is the state right after a fresh `cclaw init`.
 */
export declare function loadWorkflowCorpus(projectRoot: string): Promise<WorkflowCase[]>;
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Workflow corpus loader for Tier C.
|
|
3
|
+
*
|
|
4
|
+
* Tier C cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
|
|
5
|
+
* describe a multi-stage run that chains the with-tools agent across
|
|
6
|
+
* `brainstorm → scope → design → spec → plan`. Unlike single-stage
|
|
7
|
+
* cases (which are keyed by stage folder), workflow cases ship as a
|
|
8
|
+
* single YAML that embeds each stage's prompt + expectations.
|
|
9
|
+
*
|
|
10
|
+
* The loader is intentionally separate from `loadCorpus` so the
|
|
11
|
+
* structural / rules CI paths never walk the workflow directory — those
|
|
12
|
+
* paths are single-stage only.
|
|
13
|
+
*/
|
|
14
|
+
import fs from "node:fs/promises";
|
|
15
|
+
import path from "node:path";
|
|
16
|
+
import { parse } from "yaml";
|
|
17
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
18
|
+
import { exists } from "../fs-utils.js";
|
|
19
|
+
import { WORKFLOW_STAGES } from "./types.js";
|
|
20
|
+
/** Set form of `WORKFLOW_STAGES`, used for stage-name membership tests. */
const WORKFLOW_STAGE_SET = new Set(WORKFLOW_STAGES);
/**
 * Build (not throw) the error used for every invalid-workflow-case failure.
 * The message names the file and the reason, then lists the supported
 * stage names on a second line.
 */
function workflowCorpusError(filePath, reason) {
    const headline = `Invalid workflow case at ${filePath}: ${reason}\n`;
    const hint = `Supported workflow stages: ${WORKFLOW_STAGES.join(", ")}`;
    return new Error(headline + hint);
}
|
|
25
|
+
/** True for any non-null object that is not an array. */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
28
|
+
/**
 * Validate an optional array-of-strings field.
 *
 * @param filePath Case file path, used in the error message.
 * @param context Field name, used in the error message.
 * @param value Raw value from the parsed YAML.
 * @returns `undefined` when the field is absent, otherwise `value` itself.
 * @throws Error when `value` is present but is not an array of strings.
 */
function readStringArray(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    const allStrings = Array.isArray(value) && value.every((item) => typeof item === "string");
    if (!allStrings) {
        throw workflowCorpusError(filePath, `"${context}" must be an array of strings`);
    }
    return value;
}
|
|
36
|
+
/**
 * Validate a single stage name.
 *
 * @returns `value` unchanged when it is a string present in `WORKFLOW_STAGE_SET`.
 * @throws Error naming `context` when it is not.
 */
function parseStageName(filePath, context, value) {
    const known = typeof value === "string" && WORKFLOW_STAGE_SET.has(value);
    if (known) {
        return value;
    }
    throw workflowCorpusError(filePath, `"${context}" must be one of: ${WORKFLOW_STAGES.join(", ")}`);
}
|
|
42
|
+
/**
 * Validate a required, non-empty list of stage names.
 *
 * @returns The validated names, in input order.
 * @throws Error when `value` is not a non-empty array, or when any entry is
 *   not a valid stage name (the entry is identified as `context[index]`).
 */
function parseStageArray(filePath, context, value) {
    const usable = Array.isArray(value) && value.length > 0;
    if (!usable) {
        throw workflowCorpusError(filePath, `"${context}" must be a non-empty array of stage names`);
    }
    const checkEntry = (entry, index) => parseStageName(filePath, `${context}[${index}]`, entry);
    return value.map(checkEntry);
}
|
|
48
|
+
/**
 * Validate one `stages[index]` entry of a workflow case.
 *
 * Each field is read from its snake_case key first, then from its camelCase
 * key. `name` and the prompt are required; `rubric`, required checks and
 * minimum scores are optional and only set on the result when supplied.
 *
 * @param filePath Case file path, used in error messages.
 * @param index Position of the entry inside `stages`.
 * @param raw Raw entry from the parsed YAML.
 * @returns The step, with `inputPrompt` and `rubric` trimmed.
 * @throws Error on the first field that fails validation.
 */
function parseStageStep(filePath, index, raw) {
    if (!isRecord(raw)) {
        throw workflowCorpusError(filePath, `stages[${index}] must be a mapping`);
    }
    const name = parseStageName(filePath, `stages[${index}].name`, raw.name);
    const promptRaw = raw.input_prompt ?? raw.inputPrompt;
    const promptOk = typeof promptRaw === "string" && promptRaw.trim().length > 0;
    if (!promptOk) {
        throw workflowCorpusError(filePath, `stages[${index}].input_prompt must be a non-empty string`);
    }
    const step = { name, inputPrompt: promptRaw.trim() };
    const rubricRaw = raw.rubric;
    if (rubricRaw !== undefined) {
        const rubricOk = typeof rubricRaw === "string" && rubricRaw.trim().length > 0;
        if (!rubricOk) {
            throw workflowCorpusError(filePath, `stages[${index}].rubric must be a non-empty string`);
        }
        step.rubric = rubricRaw.trim();
    }
    const requiredChecks = readStringArray(filePath, `stages[${index}].required_checks`, raw.required_checks ?? raw.requiredChecks);
    if (requiredChecks) {
        step.requiredChecks = requiredChecks;
    }
    const scoresRaw = raw.minimum_scores ?? raw.minimumScores;
    if (scoresRaw !== undefined) {
        if (!isRecord(scoresRaw)) {
            throw workflowCorpusError(filePath, `stages[${index}].minimum_scores must be a mapping of check id → number`);
        }
        const minimumScores = {};
        for (const [checkId, score] of Object.entries(scoresRaw)) {
            // Only finite numbers from 1 to 5 inclusive are accepted.
            const inRange = typeof score === "number" && Number.isFinite(score) && score >= 1 && score <= 5;
            if (!inRange) {
                throw workflowCorpusError(filePath, `stages[${index}].minimum_scores.${checkId} must be a number in [1,5]`);
            }
            minimumScores[checkId] = score;
        }
        step.minimumScores = minimumScores;
    }
    return step;
}
|
|
83
|
+
/**
 * Validate the optional `consistency` block of a workflow case.
 *
 * Each sub-block is read from its snake_case key first, then from its
 * camelCase key. Sub-blocks are validated in the order ids_flow,
 * placeholder_free, no_contradictions.
 *
 * @param filePath Case file path, used in error messages.
 * @param raw Raw `consistency` value from the parsed YAML.
 * @returns The validated block, or `undefined` when `raw` is `undefined`
 *   or none of the three sub-blocks is present.
 * @throws Error on the first value that fails validation.
 */
function parseConsistency(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw workflowCorpusError(filePath, `"consistency" must be a mapping`);
    }
    // Validates one ids_flow rule; `idFlags` is only set when supplied.
    const parseIdsFlowRule = (entry, index) => {
        if (!isRecord(entry)) {
            throw workflowCorpusError(filePath, `consistency.ids_flow[${index}] must be a mapping`);
        }
        const idPattern = entry.id_pattern ?? entry.idPattern;
        const patternOk = typeof idPattern === "string" && idPattern.length > 0;
        if (!patternOk) {
            throw workflowCorpusError(filePath, `consistency.ids_flow[${index}].id_pattern must be a non-empty regex source`);
        }
        const idFlags = entry.id_flags ?? entry.idFlags;
        const flagsOk = idFlags === undefined || typeof idFlags === "string";
        if (!flagsOk) {
            throw workflowCorpusError(filePath, `consistency.ids_flow[${index}].id_flags must be a string`);
        }
        const from = parseStageName(filePath, `consistency.ids_flow[${index}].from`, entry.from);
        const to = parseStageArray(filePath, `consistency.ids_flow[${index}].to`, entry.to);
        const rule = { idPattern, from, to };
        if (idFlags !== undefined) {
            rule.idFlags = idFlags;
        }
        return rule;
    };
    // Validates one no_contradictions rule.
    const parseContradictionRule = (entry, index) => {
        if (!isRecord(entry)) {
            throw workflowCorpusError(filePath, `consistency.no_contradictions[${index}] must be a mapping`);
        }
        const stage = parseStageName(filePath, `consistency.no_contradictions[${index}].stage`, entry.stage);
        const { must, forbid } = entry;
        if (!(typeof must === "string" && must.length > 0)) {
            throw workflowCorpusError(filePath, `consistency.no_contradictions[${index}].must must be a non-empty string`);
        }
        if (!(typeof forbid === "string" && forbid.length > 0)) {
            throw workflowCorpusError(filePath, `consistency.no_contradictions[${index}].forbid must be a non-empty string`);
        }
        const stages = parseStageArray(filePath, `consistency.no_contradictions[${index}].stages`, entry.stages);
        return { stage, must, forbid, stages };
    };
    const parsed = {};
    const idsFlowRaw = raw.ids_flow ?? raw.idsFlow;
    if (idsFlowRaw !== undefined) {
        if (!Array.isArray(idsFlowRaw)) {
            throw workflowCorpusError(filePath, `"consistency.ids_flow" must be an array`);
        }
        parsed.idsFlow = idsFlowRaw.map(parseIdsFlowRule);
    }
    const placeholderRaw = raw.placeholder_free ?? raw.placeholderFree;
    if (placeholderRaw !== undefined) {
        if (!isRecord(placeholderRaw)) {
            throw workflowCorpusError(filePath, `"consistency.placeholder_free" must be a mapping`);
        }
        const stages = parseStageArray(filePath, "consistency.placeholder_free.stages", placeholderRaw.stages);
        const phrases = readStringArray(filePath, "consistency.placeholder_free.phrases", placeholderRaw.phrases);
        parsed.placeholderFree = phrases ? { stages, phrases } : { stages };
    }
    const contradictionsRaw = raw.no_contradictions ?? raw.noContradictions;
    if (contradictionsRaw !== undefined) {
        if (!Array.isArray(contradictionsRaw)) {
            throw workflowCorpusError(filePath, `"consistency.no_contradictions" must be an array`);
        }
        parsed.noContradictions = contradictionsRaw.map(parseContradictionRule);
    }
    // A mapping with none of the three sub-blocks counts as "no expectations".
    const hasAny = Object.keys(parsed).length > 0;
    return hasAny ? parsed : undefined;
}
|
|
154
|
+
/**
 * Validate the parsed content of one workflow case file.
 *
 * `id` and a non-empty `stages` list are required. `context_files`
 * (or `contextFiles`), `consistency` and `description` are optional and are
 * only set on the result when they carry a value. A `description` that is
 * not a string, or is blank after trimming, is dropped without an error.
 *
 * @param filePath Case file path, used in error messages.
 * @param raw Parsed YAML document.
 * @returns The validated case, with `id` and `description` trimmed.
 * @throws Error on the first value that fails validation.
 */
function validateWorkflowCase(filePath, raw) {
    if (!isRecord(raw)) {
        throw workflowCorpusError(filePath, `top-level value must be a mapping`);
    }
    const { id, stages: stagesRaw } = raw;
    const idOk = typeof id === "string" && id.trim().length > 0;
    if (!idOk) {
        throw workflowCorpusError(filePath, `"id" must be a non-empty string`);
    }
    const stagesOk = Array.isArray(stagesRaw) && stagesRaw.length > 0;
    if (!stagesOk) {
        throw workflowCorpusError(filePath, `"stages" must be a non-empty array`);
    }
    const stages = stagesRaw.map((entry, index) => parseStageStep(filePath, index, entry));
    const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
    const consistency = parseConsistency(filePath, raw.consistency);
    let description;
    if (typeof raw.description === "string") {
        description = raw.description.trim();
    }
    const workflowCase = { id: id.trim(), stages };
    if (description) {
        workflowCase.description = description;
    }
    if (contextFiles) {
        workflowCase.contextFiles = contextFiles;
    }
    if (consistency) {
        workflowCase.consistency = consistency;
    }
    return workflowCase;
}
|
|
179
|
+
/**
 * Load all Tier C workflow cases found in
 * `.cclaw/evals/corpus/workflows/*.yaml`.
 *
 * Only regular files whose name ends in `.yaml` or `.yml` are read. Files
 * are read and validated one after another, and the first failure aborts
 * the load.
 *
 * @param projectRoot Project root directory the corpus path is resolved against.
 * @returns The validated cases sorted by `id`; an empty array when the
 *   directory does not exist, which is the state right after a fresh
 *   `cclaw init`.
 * @throws Error when a file cannot be read or parsed, or fails validation.
 */
export async function loadWorkflowCorpus(projectRoot) {
    const dir = path.join(projectRoot, EVALS_ROOT, "corpus", "workflows");
    const present = await exists(dir);
    if (!present) {
        return [];
    }
    const listing = await fs.readdir(dir, { withFileTypes: true });
    const yamlFiles = listing.filter((entry) => entry.isFile() && [".yaml", ".yml"].some((ext) => entry.name.endsWith(ext)));
    const cases = [];
    for (const { name } of yamlFiles) {
        const filePath = path.join(dir, name);
        let parsed;
        try {
            const text = await fs.readFile(filePath, "utf8");
            parsed = parse(text);
        }
        catch (err) {
            // Read and YAML-syntax failures are both reported against the file.
            throw workflowCorpusError(filePath, err instanceof Error ? err.message : String(err));
        }
        cases.push(validateWorkflowCase(filePath, parsed));
    }
    cases.sort((left, right) => left.id.localeCompare(right.id));
    return cases;
}
|