cclaw-cli 0.22.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +42 -11
- package/dist/constants.d.ts +4 -4
- package/dist/constants.js +4 -4
- package/dist/content/eval-scaffold.d.ts +4 -4
- package/dist/content/eval-scaffold.js +13 -14
- package/dist/content/examples.js +11 -11
- package/dist/content/hooks.js +1 -1
- package/dist/content/skills.d.ts +3 -3
- package/dist/content/skills.js +19 -19
- package/dist/content/stage-schema.js +2 -2
- package/dist/content/stages/plan.js +18 -18
- package/dist/content/stages/schema-types.d.ts +2 -2
- package/dist/content/stages/tdd.js +1 -1
- package/dist/content/subagents.js +1 -1
- package/dist/content/templates.js +8 -8
- package/dist/content/utility-skills.js +19 -19
- package/dist/doctor.js +2 -2
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/corpus.d.ts +13 -2
- package/dist/eval/corpus.js +97 -13
- package/dist/eval/llm-client.d.ts +10 -10
- package/dist/eval/llm-client.js +5 -5
- package/dist/eval/report.js +17 -4
- package/dist/eval/runner.d.ts +8 -16
- package/dist/eval/runner.js +124 -42
- package/dist/eval/types.d.ts +94 -14
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/dist/install.js +3 -3
- package/dist/policy.js +1 -1
- package/package.json +1 -1
package/dist/eval/runner.d.ts
CHANGED
|
@@ -4,11 +4,11 @@ export interface RunEvalOptions {
|
|
|
4
4
|
projectRoot: string;
|
|
5
5
|
stage?: FlowStage;
|
|
6
6
|
tier?: EvalTier;
|
|
7
|
-
/** When true, run only structural verifiers
|
|
7
|
+
/** When true, run only structural verifiers (Step 1). */
|
|
8
8
|
schemaOnly?: boolean;
|
|
9
|
-
/** When true, run structural + rule-based verifiers.
|
|
9
|
+
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
10
10
|
rules?: boolean;
|
|
11
|
-
/** When true, also run LLM judge verifiers.
|
|
11
|
+
/** When true, also run LLM judge verifiers. Step 3 wires judging. */
|
|
12
12
|
judge?: boolean;
|
|
13
13
|
/** When true, load config + corpus and return a summary without running any verifier. */
|
|
14
14
|
dryRun?: boolean;
|
|
@@ -27,10 +27,6 @@ export interface DryRunSummary {
|
|
|
27
27
|
}>;
|
|
28
28
|
};
|
|
29
29
|
plannedTier: EvalTier;
|
|
30
|
-
/**
|
|
31
|
-
* Waves 7.1–7.3 progressively flip these to `true`. Wave 7.0 is `false`
|
|
32
|
-
* across the board because no verifier is implemented yet.
|
|
33
|
-
*/
|
|
34
30
|
verifiersAvailable: {
|
|
35
31
|
structural: boolean;
|
|
36
32
|
rules: boolean;
|
|
@@ -40,14 +36,10 @@ export interface DryRunSummary {
|
|
|
40
36
|
notes: string[];
|
|
41
37
|
}
|
|
42
38
|
/**
|
|
43
|
-
*
|
|
44
|
-
*
|
|
45
|
-
* -
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
* Waves 7.1+ will replace the "no verifiers available" branch with the real
|
|
50
|
-
* verifier dispatch pipeline. The signature stays stable so CLI wiring does
|
|
51
|
-
* not churn.
|
|
39
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
40
|
+
* active), runs structural verifiers against fixture-backed cases and loads
|
|
41
|
+
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
42
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
43
|
+
* skipped rather than failing.
|
|
52
44
|
*/
|
|
53
45
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -1,23 +1,121 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
-
import {
|
|
3
|
+
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
|
+
import { loadCorpus, readFixtureArtifact } from "./corpus.js";
|
|
4
6
|
import { loadEvalConfig } from "./config-loader.js";
|
|
7
|
+
import { verifyStructural } from "./verifiers/structural.js";
|
|
5
8
|
function groupByStage(cases) {
|
|
6
9
|
return cases.reduce((acc, item) => {
|
|
7
10
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
8
11
|
return acc;
|
|
9
12
|
}, {});
|
|
10
13
|
}
|
|
14
|
+
function skeletonVerifierResult(message, details) {
|
|
15
|
+
return {
|
|
16
|
+
kind: "structural",
|
|
17
|
+
id: "structural:no-expectations",
|
|
18
|
+
ok: true,
|
|
19
|
+
score: 1,
|
|
20
|
+
message,
|
|
21
|
+
...(details !== undefined ? { details } : {})
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
|
|
25
|
+
const started = Date.now();
|
|
26
|
+
const structuralExpected = caseEntry.expected?.structural;
|
|
27
|
+
const verifierResults = [];
|
|
28
|
+
if (!structuralExpected || Object.keys(structuralExpected).length === 0) {
|
|
29
|
+
// No structural expectations declared — case is treated as "N/A" for this
|
|
30
|
+
// verifier kind; a placeholder pass keeps downstream math simple while
|
|
31
|
+
// making the situation visible in the report.
|
|
32
|
+
verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
|
|
33
|
+
}
|
|
34
|
+
else {
|
|
35
|
+
let artifact;
|
|
36
|
+
try {
|
|
37
|
+
artifact = await readFixtureArtifact(projectRoot, caseEntry);
|
|
38
|
+
}
|
|
39
|
+
catch (err) {
|
|
40
|
+
verifierResults.push({
|
|
41
|
+
kind: "structural",
|
|
42
|
+
id: "structural:fixture:missing",
|
|
43
|
+
ok: false,
|
|
44
|
+
score: 0,
|
|
45
|
+
message: err instanceof Error ? err.message : String(err),
|
|
46
|
+
details: { fixture: caseEntry.fixture }
|
|
47
|
+
});
|
|
48
|
+
}
|
|
49
|
+
if (artifact !== undefined) {
|
|
50
|
+
const results = verifyStructural(artifact, structuralExpected);
|
|
51
|
+
if (results.length === 0) {
|
|
52
|
+
verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
|
|
53
|
+
}
|
|
54
|
+
else {
|
|
55
|
+
verifierResults.push(...results);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
else if (verifierResults.length === 0) {
|
|
59
|
+
verifierResults.push({
|
|
60
|
+
kind: "structural",
|
|
61
|
+
id: "structural:fixture:absent",
|
|
62
|
+
ok: false,
|
|
63
|
+
score: 0,
|
|
64
|
+
message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
|
|
65
|
+
details: { fixtureProvided: false }
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
const allOk = verifierResults.every((r) => r.ok);
|
|
70
|
+
return {
|
|
71
|
+
caseId: caseEntry.id,
|
|
72
|
+
stage: caseEntry.stage,
|
|
73
|
+
tier: plannedTier,
|
|
74
|
+
passed: allOk,
|
|
75
|
+
durationMs: Date.now() - started,
|
|
76
|
+
verifierResults
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
function reduceSummary(caseResults) {
|
|
80
|
+
let passed = 0;
|
|
81
|
+
let failed = 0;
|
|
82
|
+
let skipped = 0;
|
|
83
|
+
let totalCostUsd = 0;
|
|
84
|
+
let totalDurationMs = 0;
|
|
85
|
+
for (const c of caseResults) {
|
|
86
|
+
totalDurationMs += c.durationMs;
|
|
87
|
+
if (c.costUsd !== undefined)
|
|
88
|
+
totalCostUsd += c.costUsd;
|
|
89
|
+
if (c.verifierResults.length === 1 && c.verifierResults[0]?.details?.skipped === true) {
|
|
90
|
+
skipped += 1;
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
if (c.passed)
|
|
94
|
+
passed += 1;
|
|
95
|
+
else
|
|
96
|
+
failed += 1;
|
|
97
|
+
}
|
|
98
|
+
return {
|
|
99
|
+
totalCases: caseResults.length,
|
|
100
|
+
passed,
|
|
101
|
+
failed,
|
|
102
|
+
skipped,
|
|
103
|
+
totalCostUsd: Number(totalCostUsd.toFixed(6)),
|
|
104
|
+
totalDurationMs
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
function stagesInResults(caseResults) {
|
|
108
|
+
const set = new Set();
|
|
109
|
+
for (const c of caseResults)
|
|
110
|
+
set.add(c.stage);
|
|
111
|
+
return FLOW_STAGES.filter((s) => set.has(s));
|
|
112
|
+
}
|
|
11
113
|
/**
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* -
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
* Waves 7.1+ will replace the "no verifiers available" branch with the real
|
|
19
|
-
* verifier dispatch pipeline. The signature stays stable so CLI wiring does
|
|
20
|
-
* not churn.
|
|
114
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
115
|
+
* active), runs structural verifiers against fixture-backed cases and loads
|
|
116
|
+
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
117
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
118
|
+
* skipped rather than failing.
|
|
21
119
|
*/
|
|
22
120
|
export async function runEval(options) {
|
|
23
121
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
@@ -25,16 +123,13 @@ export async function runEval(options) {
|
|
|
25
123
|
const plannedTier = options.tier ?? config.defaultTier;
|
|
26
124
|
const notes = [];
|
|
27
125
|
if (corpus.length === 0) {
|
|
28
|
-
notes.push("Corpus is empty. Seed cases
|
|
29
|
-
}
|
|
30
|
-
if (options.schemaOnly) {
|
|
31
|
-
notes.push("--schema-only is accepted; structural verifiers wire up in Wave 7.1.");
|
|
126
|
+
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
32
127
|
}
|
|
33
128
|
if (options.rules) {
|
|
34
|
-
notes.push("--rules is accepted; rule verifiers
|
|
129
|
+
notes.push("--rules is accepted; rule verifiers are not wired yet.");
|
|
35
130
|
}
|
|
36
131
|
if (options.judge) {
|
|
37
|
-
notes.push("--judge is accepted; LLM judging
|
|
132
|
+
notes.push("--judge is accepted; LLM judging is not wired yet.");
|
|
38
133
|
}
|
|
39
134
|
if (options.dryRun === true) {
|
|
40
135
|
const summary = {
|
|
@@ -47,7 +142,7 @@ export async function runEval(options) {
|
|
|
47
142
|
},
|
|
48
143
|
plannedTier,
|
|
49
144
|
verifiersAvailable: {
|
|
50
|
-
structural:
|
|
145
|
+
structural: true,
|
|
51
146
|
rules: false,
|
|
52
147
|
judge: false,
|
|
53
148
|
workflow: false
|
|
@@ -57,22 +152,13 @@ export async function runEval(options) {
|
|
|
57
152
|
return summary;
|
|
58
153
|
}
|
|
59
154
|
const now = new Date().toISOString();
|
|
60
|
-
const caseResults =
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
{
|
|
68
|
-
kind: "structural",
|
|
69
|
-
id: "wave-7-0-skeleton",
|
|
70
|
-
ok: false,
|
|
71
|
-
message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
|
|
72
|
-
details: { skipped: true }
|
|
73
|
-
}
|
|
74
|
-
]
|
|
75
|
-
}));
|
|
155
|
+
const caseResults = [];
|
|
156
|
+
for (const item of corpus) {
|
|
157
|
+
caseResults.push(await runCaseStructural(options.projectRoot, item, plannedTier));
|
|
158
|
+
}
|
|
159
|
+
const stages = stagesInResults(caseResults);
|
|
160
|
+
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|
|
161
|
+
const summary = reduceSummary(caseResults);
|
|
76
162
|
const report = {
|
|
77
163
|
schemaVersion: 1,
|
|
78
164
|
generatedAt: now,
|
|
@@ -81,16 +167,12 @@ export async function runEval(options) {
|
|
|
81
167
|
provider: config.provider,
|
|
82
168
|
model: config.model,
|
|
83
169
|
tier: plannedTier,
|
|
84
|
-
stages
|
|
170
|
+
stages,
|
|
85
171
|
cases: caseResults,
|
|
86
|
-
summary
|
|
87
|
-
totalCases: caseResults.length,
|
|
88
|
-
passed: 0,
|
|
89
|
-
failed: 0,
|
|
90
|
-
skipped: caseResults.length,
|
|
91
|
-
totalCostUsd: 0,
|
|
92
|
-
totalDurationMs: 0
|
|
93
|
-
}
|
|
172
|
+
summary
|
|
94
173
|
};
|
|
174
|
+
const baselineDelta = compareAgainstBaselines(report, baselines);
|
|
175
|
+
if (baselineDelta)
|
|
176
|
+
report.baselineDelta = baselineDelta;
|
|
95
177
|
return report;
|
|
96
178
|
}
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
7
|
*
|
|
8
8
|
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
-
* - The verifier / rubric / LLM stack evolves on its own release cadence (
|
|
9
|
+
* - The verifier / rubric / LLM stack evolves on its own release cadence (Steps 0-6).
|
|
10
10
|
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
11
|
*/
|
|
12
12
|
import type { FlowStage } from "../types.js";
|
|
@@ -27,11 +27,50 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
|
27
27
|
*/
|
|
28
28
|
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
29
29
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
|
+
/**
|
|
31
|
+
* Structural expectations — deterministic, LLM-free checks against a single
|
|
32
|
+
* text artifact. Step 1 implements all fields below; Step 2 adds the
|
|
33
|
+
* sibling `rules` shape, Step 3 adds `judge`.
|
|
34
|
+
*/
|
|
35
|
+
export interface StructuralExpected {
|
|
36
|
+
/**
|
|
37
|
+
* Case-insensitive substrings that must each appear on at least one markdown
|
|
38
|
+
* heading line (line starting with `#`). Useful for "required sections".
|
|
39
|
+
*/
|
|
40
|
+
requiredSections?: string[];
|
|
41
|
+
/**
|
|
42
|
+
* Case-insensitive substrings that must NOT appear anywhere in the body
|
|
43
|
+
* (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
|
|
44
|
+
*/
|
|
45
|
+
forbiddenPatterns?: string[];
|
|
46
|
+
/** Inclusive minimum line count of the artifact body (frontmatter excluded). */
|
|
47
|
+
minLines?: number;
|
|
48
|
+
/** Inclusive maximum line count of the artifact body (frontmatter excluded). */
|
|
49
|
+
maxLines?: number;
|
|
50
|
+
/** Inclusive minimum character count of the artifact body. */
|
|
51
|
+
minChars?: number;
|
|
52
|
+
/** Inclusive maximum character count of the artifact body. */
|
|
53
|
+
maxChars?: number;
|
|
54
|
+
/**
|
|
55
|
+
* Keys that must appear in the leading YAML frontmatter (between a pair of
|
|
56
|
+
* `---` delimiters at the very top of the file). An artifact without
|
|
57
|
+
* frontmatter will fail every entry.
|
|
58
|
+
*/
|
|
59
|
+
requiredFrontmatterKeys?: string[];
|
|
60
|
+
}
|
|
61
|
+
/** Superset of per-verifier expectation shapes. Only `structural` is wired in Step 1. */
|
|
62
|
+
export interface ExpectedShape {
|
|
63
|
+
structural?: StructuralExpected;
|
|
64
|
+
/** Rule-based (keyword/regex/traceability) checks — Step 2. */
|
|
65
|
+
rules?: Record<string, unknown>;
|
|
66
|
+
/** LLM-judge rubrics — Step 3. */
|
|
67
|
+
judge?: Record<string, unknown>;
|
|
68
|
+
}
|
|
30
69
|
/**
|
|
31
70
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
32
71
|
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
33
|
-
* fixture artifact for verifier development (
|
|
34
|
-
* exists (
|
|
72
|
+
* fixture artifact for verifier development (Step 1) before the agent loop
|
|
73
|
+
* exists (Step 3+).
|
|
35
74
|
*/
|
|
36
75
|
export interface EvalCase {
|
|
37
76
|
id: string;
|
|
@@ -40,14 +79,14 @@ export interface EvalCase {
|
|
|
40
79
|
/** Project files copied into the Tier B/C sandbox before the agent runs. */
|
|
41
80
|
contextFiles?: string[];
|
|
42
81
|
/**
|
|
43
|
-
*
|
|
44
|
-
*
|
|
82
|
+
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
83
|
+
* Each sub-shape is optional; missing sub-shapes skip that verifier tier.
|
|
45
84
|
*/
|
|
46
|
-
expected?:
|
|
85
|
+
expected?: ExpectedShape;
|
|
47
86
|
/**
|
|
48
87
|
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
49
|
-
* when verifiers are exercised without a live agent loop. Primarily a
|
|
50
|
-
*
|
|
88
|
+
* when verifiers are exercised without a live agent loop. Primarily a
|
|
89
|
+
* Step 1 development aid.
|
|
51
90
|
*/
|
|
52
91
|
fixture?: string;
|
|
53
92
|
}
|
|
@@ -90,12 +129,8 @@ export interface EvalReport {
|
|
|
90
129
|
totalCostUsd: number;
|
|
91
130
|
totalDurationMs: number;
|
|
92
131
|
};
|
|
93
|
-
/** Present when comparing against a saved baseline (
|
|
94
|
-
baselineDelta?:
|
|
95
|
-
baselineId: string;
|
|
96
|
-
scoreDelta: number;
|
|
97
|
-
criticalFailures: number;
|
|
98
|
-
};
|
|
132
|
+
/** Present when comparing against a saved baseline (Step 1+). */
|
|
133
|
+
baselineDelta?: BaselineDelta;
|
|
99
134
|
}
|
|
100
135
|
/**
|
|
101
136
|
* Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
|
|
@@ -134,3 +169,48 @@ export interface ResolvedEvalConfig extends EvalConfig {
|
|
|
134
169
|
apiKey?: string;
|
|
135
170
|
source: "default" | "file" | "env" | "file+env";
|
|
136
171
|
}
|
|
172
|
+
/**
|
|
173
|
+
* Frozen per-stage baseline used by regression gating (Step 1). Baselines
|
|
174
|
+
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
175
|
+
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
176
|
+
* changed between runs.
|
|
177
|
+
*/
|
|
178
|
+
export interface BaselineSnapshot {
|
|
179
|
+
schemaVersion: 1;
|
|
180
|
+
stage: FlowStage;
|
|
181
|
+
generatedAt: string;
|
|
182
|
+
cclawVersion: string;
|
|
183
|
+
/** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
|
|
184
|
+
cases: Record<string, BaselineCaseEntry>;
|
|
185
|
+
}
|
|
186
|
+
export interface BaselineCaseEntry {
|
|
187
|
+
passed: boolean;
|
|
188
|
+
verifierResults: BaselineVerifierEntry[];
|
|
189
|
+
}
|
|
190
|
+
export interface BaselineVerifierEntry {
|
|
191
|
+
id: string;
|
|
192
|
+
kind: VerifierKind;
|
|
193
|
+
ok: boolean;
|
|
194
|
+
score?: number;
|
|
195
|
+
}
|
|
196
|
+
/**
|
|
197
|
+
* Delta between a fresh report and the saved baseline. Populated when
|
|
198
|
+
* baselines exist on disk and the run covers matching cases.
|
|
199
|
+
*/
|
|
200
|
+
export interface BaselineDelta {
|
|
201
|
+
baselineId: string;
|
|
202
|
+
/** Fresh-score − baseline-score, bounded to [-1, 1]. */
|
|
203
|
+
scoreDelta: number;
|
|
204
|
+
/** Count of checks that flipped from `ok:true` to `ok:false`. */
|
|
205
|
+
criticalFailures: number;
|
|
206
|
+
/** Per-case regression details for the Markdown report. */
|
|
207
|
+
regressions: BaselineRegression[];
|
|
208
|
+
}
|
|
209
|
+
export interface BaselineRegression {
|
|
210
|
+
caseId: string;
|
|
211
|
+
stage: FlowStage;
|
|
212
|
+
verifierId: string;
|
|
213
|
+
reason: "newly-failing" | "case-now-failing" | "score-drop";
|
|
214
|
+
previousScore?: number;
|
|
215
|
+
currentScore?: number;
|
|
216
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { StructuralExpected, VerifierResult } from "../types.js";
|
|
2
|
+
export interface ArtifactSplit {
|
|
3
|
+
hasFrontmatter: boolean;
|
|
4
|
+
frontmatterRaw: string;
|
|
5
|
+
frontmatterParsed?: Record<string, unknown>;
|
|
6
|
+
body: string;
|
|
7
|
+
}
|
|
8
|
+
export declare function splitFrontmatter(artifact: string): ArtifactSplit;
|
|
9
|
+
/**
|
|
10
|
+
* Run every configured structural check against the artifact text.
|
|
11
|
+
* Returns [] when `expected` is undefined/empty so the runner can treat
|
|
12
|
+
* "no structural expectations" as "no verifier results" rather than "pass".
|
|
13
|
+
*/
|
|
14
|
+
export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural verifier: deterministic, zero-LLM checks against a
|
|
3
|
+
* single markdown artifact. Each structural expectation produces one
|
|
4
|
+
* `VerifierResult` so baselines diff cleanly at the check level rather than
|
|
5
|
+
* lumping everything into a single boolean.
|
|
6
|
+
*
|
|
7
|
+
* Design notes:
|
|
8
|
+
*
|
|
9
|
+
* - All pattern matching is case-insensitive. Authoring a check as
|
|
10
|
+
* `"Directions"` matches `## Directions` and `### directions-suggested`.
|
|
11
|
+
* - Frontmatter detection is permissive: it must start at byte 0 with `---\n`
|
|
12
|
+
* and close on a subsequent `---` line. Anything else is treated as "no
|
|
13
|
+
* frontmatter", which fails every `requiredFrontmatterKeys` entry
|
|
14
|
+
* deterministically.
|
|
15
|
+
* - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
|
|
16
|
+
* adds metadata does not accidentally drop the body below the floor.
|
|
17
|
+
* - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
|
|
18
|
+
* all individual `ok` flags. This keeps the structural verifier
|
|
19
|
+
* deterministic; the 0..1 rubric scale shows up later in the LLM judge.
|
|
20
|
+
*/
|
|
21
|
+
import { parse as parseYaml } from "yaml";
|
|
22
|
+
const FRONTMATTER_OPEN = /^---\r?\n/;
|
|
23
|
+
const FRONTMATTER_CLOSE = /\r?\n---\r?(?:\n|$)/;
|
|
24
|
+
function slugify(input) {
|
|
25
|
+
return input
|
|
26
|
+
.toLowerCase()
|
|
27
|
+
.replace(/[^a-z0-9]+/g, "-")
|
|
28
|
+
.replace(/(^-|-$)/g, "")
|
|
29
|
+
.slice(0, 64);
|
|
30
|
+
}
|
|
31
|
+
export function splitFrontmatter(artifact) {
|
|
32
|
+
if (!FRONTMATTER_OPEN.test(artifact)) {
|
|
33
|
+
return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
|
|
34
|
+
}
|
|
35
|
+
const afterOpen = artifact.replace(FRONTMATTER_OPEN, "");
|
|
36
|
+
const closeMatch = afterOpen.match(FRONTMATTER_CLOSE);
|
|
37
|
+
if (!closeMatch || closeMatch.index === undefined) {
|
|
38
|
+
return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
|
|
39
|
+
}
|
|
40
|
+
const frontmatterRaw = afterOpen.slice(0, closeMatch.index);
|
|
41
|
+
const body = afterOpen.slice(closeMatch.index + closeMatch[0].length);
|
|
42
|
+
let frontmatterParsed;
|
|
43
|
+
try {
|
|
44
|
+
const parsed = parseYaml(frontmatterRaw);
|
|
45
|
+
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
|
46
|
+
frontmatterParsed = parsed;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
catch {
|
|
50
|
+
frontmatterParsed = undefined;
|
|
51
|
+
}
|
|
52
|
+
return {
|
|
53
|
+
hasFrontmatter: true,
|
|
54
|
+
frontmatterRaw,
|
|
55
|
+
frontmatterParsed,
|
|
56
|
+
body
|
|
57
|
+
};
|
|
58
|
+
}
|
|
59
|
+
function extractHeadingLines(body) {
|
|
60
|
+
return body
|
|
61
|
+
.split(/\r?\n/)
|
|
62
|
+
.map((line) => line.trimStart())
|
|
63
|
+
.filter((line) => /^#{1,6}\s+\S/.test(line));
|
|
64
|
+
}
|
|
65
|
+
function result(id, ok, message, details) {
|
|
66
|
+
return {
|
|
67
|
+
kind: "structural",
|
|
68
|
+
id,
|
|
69
|
+
ok,
|
|
70
|
+
score: ok ? 1 : 0,
|
|
71
|
+
message,
|
|
72
|
+
...(details !== undefined ? { details } : {})
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
function checkRequiredSections(sections, body) {
|
|
76
|
+
const headings = extractHeadingLines(body).map((line) => line.toLowerCase());
|
|
77
|
+
return sections.map((section) => {
|
|
78
|
+
const needle = section.toLowerCase().trim();
|
|
79
|
+
const found = headings.some((heading) => heading.includes(needle));
|
|
80
|
+
return result(`structural:section:${slugify(section)}`, found, found
|
|
81
|
+
? `Section matching "${section}" present.`
|
|
82
|
+
: `No heading contains "${section}".`, { pattern: section, searchedHeadings: headings.length });
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
function checkForbiddenPatterns(patterns, body) {
|
|
86
|
+
const bodyLower = body.toLowerCase();
|
|
87
|
+
return patterns.map((pattern) => {
|
|
88
|
+
const needle = pattern.toLowerCase();
|
|
89
|
+
const hits = countOccurrences(bodyLower, needle);
|
|
90
|
+
const ok = hits === 0;
|
|
91
|
+
return result(`structural:forbidden:${slugify(pattern)}`, ok, ok
|
|
92
|
+
? `Pattern "${pattern}" absent (as required).`
|
|
93
|
+
: `Pattern "${pattern}" appears ${hits} time(s); remove.`, { pattern, occurrences: hits });
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
function countOccurrences(haystack, needle) {
|
|
97
|
+
if (needle.length === 0)
|
|
98
|
+
return 0;
|
|
99
|
+
let index = 0;
|
|
100
|
+
let count = 0;
|
|
101
|
+
while (true) {
|
|
102
|
+
const at = haystack.indexOf(needle, index);
|
|
103
|
+
if (at < 0)
|
|
104
|
+
return count;
|
|
105
|
+
count += 1;
|
|
106
|
+
index = at + needle.length;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
function checkLengthBounds(expected, body) {
|
|
110
|
+
const results = [];
|
|
111
|
+
const lineCount = body.length === 0 ? 0 : body.split(/\r?\n/).length;
|
|
112
|
+
const charCount = body.length;
|
|
113
|
+
if (expected.minLines !== undefined || expected.maxLines !== undefined) {
|
|
114
|
+
const min = expected.minLines;
|
|
115
|
+
const max = expected.maxLines;
|
|
116
|
+
const withinMin = min === undefined || lineCount >= min;
|
|
117
|
+
const withinMax = max === undefined || lineCount <= max;
|
|
118
|
+
const ok = withinMin && withinMax;
|
|
119
|
+
results.push(result("structural:length:lines", ok, ok
|
|
120
|
+
? `Body has ${lineCount} line(s), within bounds.`
|
|
121
|
+
: buildOutOfRangeMessage("line", lineCount, min, max), { lineCount, minLines: min, maxLines: max }));
|
|
122
|
+
}
|
|
123
|
+
if (expected.minChars !== undefined || expected.maxChars !== undefined) {
|
|
124
|
+
const min = expected.minChars;
|
|
125
|
+
const max = expected.maxChars;
|
|
126
|
+
const withinMin = min === undefined || charCount >= min;
|
|
127
|
+
const withinMax = max === undefined || charCount <= max;
|
|
128
|
+
const ok = withinMin && withinMax;
|
|
129
|
+
results.push(result("structural:length:chars", ok, ok
|
|
130
|
+
? `Body has ${charCount} char(s), within bounds.`
|
|
131
|
+
: buildOutOfRangeMessage("char", charCount, min, max), { charCount, minChars: min, maxChars: max }));
|
|
132
|
+
}
|
|
133
|
+
return results;
|
|
134
|
+
}
|
|
135
|
+
function buildOutOfRangeMessage(unit, actual, min, max) {
|
|
136
|
+
const lo = min === undefined ? "0" : String(min);
|
|
137
|
+
const hi = max === undefined ? "∞" : String(max);
|
|
138
|
+
return `Body has ${actual} ${unit}(s); expected ${lo}..${hi}.`;
|
|
139
|
+
}
|
|
140
|
+
function checkFrontmatterKeys(keys, split) {
|
|
141
|
+
if (!split.hasFrontmatter || !split.frontmatterParsed) {
|
|
142
|
+
return keys.map((key) => result(`structural:frontmatter:${slugify(key)}`, false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: split.hasFrontmatter }));
|
|
143
|
+
}
|
|
144
|
+
const present = new Set(Object.keys(split.frontmatterParsed));
|
|
145
|
+
return keys.map((key) => {
|
|
146
|
+
const ok = present.has(key);
|
|
147
|
+
return result(`structural:frontmatter:${slugify(key)}`, ok, ok ? `Frontmatter key "${key}" present.` : `Frontmatter key "${key}" missing.`, { key });
|
|
148
|
+
});
|
|
149
|
+
}
|
|
150
|
+
/**
|
|
151
|
+
* Run every configured structural check against the artifact text.
|
|
152
|
+
* Returns [] when `expected` is undefined/empty so the runner can treat
|
|
153
|
+
* "no structural expectations" as "no verifier results" rather than "pass".
|
|
154
|
+
*/
|
|
155
|
+
export function verifyStructural(artifact, expected) {
|
|
156
|
+
if (!expected)
|
|
157
|
+
return [];
|
|
158
|
+
const split = splitFrontmatter(artifact);
|
|
159
|
+
const results = [];
|
|
160
|
+
if (expected.requiredSections?.length) {
|
|
161
|
+
results.push(...checkRequiredSections(expected.requiredSections, split.body));
|
|
162
|
+
}
|
|
163
|
+
if (expected.forbiddenPatterns?.length) {
|
|
164
|
+
results.push(...checkForbiddenPatterns(expected.forbiddenPatterns, split.body));
|
|
165
|
+
}
|
|
166
|
+
results.push(...checkLengthBounds(expected, split.body));
|
|
167
|
+
if (expected.requiredFrontmatterKeys?.length) {
|
|
168
|
+
results.push(...checkFrontmatterKeys(expected.requiredFrontmatterKeys, split));
|
|
169
|
+
}
|
|
170
|
+
return results;
|
|
171
|
+
}
|
package/dist/install.js
CHANGED
|
@@ -29,7 +29,7 @@ import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.j
|
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
31
|
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
32
|
-
import {
|
|
32
|
+
import { TDD_BATCH_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
33
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
34
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
35
35
|
import { LANGUAGE_RULE_PACK_DIR, LANGUAGE_RULE_PACK_FILES, LANGUAGE_RULE_PACK_GENERATORS, LEGACY_LANGUAGE_RULE_PACK_FOLDERS, UTILITY_SKILL_FOLDERS, UTILITY_SKILL_MAP } from "./content/utility-skills.js";
|
|
@@ -218,11 +218,11 @@ async function writeSkills(projectRoot, config) {
|
|
|
218
218
|
await writeFileSafe(runtimePath(projectRoot, ...referenceDir, `${stage}-examples.md`), referenceMarkdown);
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
|
-
// Progressive disclosure for the TDD
|
|
221
|
+
// Progressive disclosure for the TDD Batch Execution walkthrough (A.1#1).
|
|
222
222
|
// The detailed 3-task transcript lives next to stage examples so the
|
|
223
223
|
// always-rendered TDD skill stays under the line-budget and the reference
|
|
224
224
|
// is loaded on demand.
|
|
225
|
-
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-
|
|
225
|
+
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-batch-walkthrough.md"), TDD_BATCH_WALKTHROUGH_MARKDOWN);
|
|
226
226
|
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "common-guidance.md"), stageCommonGuidanceMarkdown());
|
|
227
227
|
// Utility skills (not flow stages)
|
|
228
228
|
await writeFileSafe(runtimePath(projectRoot, "skills", "learnings", "SKILL.md"), learnSkillMarkdown());
|
package/dist/policy.js
CHANGED
|
@@ -161,7 +161,7 @@ export async function policyChecks(projectRoot, options = {}) {
|
|
|
161
161
|
{ file: runtimeFile("skills/docs/SKILL.md"), needle: "## README Guidance", name: "utility_skill:docs:readme" },
|
|
162
162
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:executing_plans:hard_gate" },
|
|
163
163
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Execution Protocol", name: "utility_skill:executing_plans:protocol" },
|
|
164
|
-
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "##
|
|
164
|
+
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Batch Checklist", name: "utility_skill:executing_plans:batches" },
|
|
165
165
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:verification_before_completion:hard_gate" },
|
|
166
166
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## Protocol", name: "utility_skill:verification_before_completion:protocol" },
|
|
167
167
|
{ file: runtimeFile("skills/finishing-a-development-branch/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:finishing_branch:hard_gate" },
|