cclaw-cli 0.21.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +11 -1
- package/dist/cli.js +154 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +19 -0
- package/dist/eval/corpus.js +175 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +101 -0
- package/dist/eval/runner.d.ts +45 -0
- package/dist/eval/runner.js +178 -0
- package/dist/eval/types.d.ts +216 -0
- package/dist/eval/types.js +15 -0
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/dist/install.js +22 -0
- package/package.json +1 -1
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
+
import { FLOW_STAGES } from "../types.js";
|
|
4
|
+
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
|
+
import { loadCorpus, readFixtureArtifact } from "./corpus.js";
|
|
6
|
+
import { loadEvalConfig } from "./config-loader.js";
|
|
7
|
+
import { verifyStructural } from "./verifiers/structural.js";
|
|
8
|
+
/**
 * Tally how many corpus cases belong to each stage.
 *
 * @param {Array<{stage: string}>} cases corpus entries; only `stage` is read
 * @returns {Record<string, number>} stage name mapped to the number of cases seen
 */
function groupByStage(cases) {
    const counts = {};
    for (const entry of cases) {
        const seenSoFar = counts[entry.stage] ?? 0;
        counts[entry.stage] = seenSoFar + 1;
    }
    return counts;
}
|
|
14
|
+
/**
 * Build the always-passing placeholder result used when the structural
 * verifier has nothing to check for a case.
 *
 * @param {string} message human-readable explanation stored on the result
 * @param {object} [details] optional payload; the `details` key is omitted when undefined
 * @returns {object} structural verifier result with `ok: true` and `score: 1`
 */
function skeletonVerifierResult(message, details) {
    const placeholder = {
        kind: "structural",
        id: "wave-7-1-no-structural-expected",
        ok: true,
        score: 1,
        message
    };
    if (details !== undefined) {
        placeholder.details = details;
    }
    return placeholder;
}
|
|
24
|
+
/**
 * Run the structural verifier for one corpus case and package the outcome.
 *
 * Outcomes, in the order they are decided:
 * - no (or empty) `expected.structural`: one passing placeholder flagged
 *   `details.skipped`;
 * - reading the fixture throws: one failing `structural:fixture:missing`
 *   result carrying the error message;
 * - the fixture read yields `undefined` without throwing: one failing
 *   `structural:fixture:absent` result;
 * - otherwise: the results of `verifyStructural`, or a skipped placeholder
 *   when it produced none.
 *
 * @param {string} projectRoot forwarded to `readFixtureArtifact`
 * @param {object} caseEntry corpus case (`id`, `stage`, optional `expected` / `fixture`)
 * @param {string} plannedTier tier label copied onto the result
 * @returns {Promise<object>} `{ caseId, stage, tier, passed, durationMs, verifierResults }`
 */
async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
    const startedAt = Date.now();
    const expectations = caseEntry.expected?.structural;
    const hasExpectations = Boolean(expectations) && Object.keys(expectations).length > 0;
    const collected = [];
    if (hasExpectations) {
        let artifactText;
        let readFailed = false;
        try {
            artifactText = await readFixtureArtifact(projectRoot, caseEntry);
        }
        catch (error) {
            readFailed = true;
            collected.push({
                kind: "structural",
                id: "structural:fixture:missing",
                ok: false,
                score: 0,
                message: error instanceof Error ? error.message : String(error),
                details: { fixture: caseEntry.fixture }
            });
        }
        if (artifactText === undefined) {
            // A read error has already been reported above; only flag "absent"
            // when nothing was recorded for this case yet.
            if (!readFailed) {
                collected.push({
                    kind: "structural",
                    id: "structural:fixture:absent",
                    ok: false,
                    score: 0,
                    message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
                    details: { fixtureProvided: false }
                });
            }
        }
        else {
            const checks = verifyStructural(artifactText, expectations);
            if (checks.length > 0) {
                collected.push(...checks);
            }
            else {
                collected.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
            }
        }
    }
    else {
        // Nothing to verify: record a visible placeholder pass so the report
        // shows the case while downstream aggregation stays simple.
        collected.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
    }
    const everyCheckOk = collected.every((entry) => entry.ok);
    return {
        caseId: caseEntry.id,
        stage: caseEntry.stage,
        tier: plannedTier,
        passed: everyCheckOk,
        durationMs: Date.now() - startedAt,
        verifierResults: collected
    };
}
|
|
79
|
+
/**
 * Collapse per-case results into the report summary.
 *
 * A case counts as skipped when it carries exactly one verifier result whose
 * `details.skipped` is `true`; such cases are counted in neither `passed`
 * nor `failed`. Duration and cost are summed over every case, skipped or not.
 *
 * @param {Array<object>} caseResults case results (`durationMs`, optional `costUsd`, `passed`, `verifierResults`)
 * @returns {object} `{ totalCases, passed, failed, skipped, totalCostUsd, totalDurationMs }`
 */
function reduceSummary(caseResults) {
    const isSkipped = (entry) => entry.verifierResults.length === 1
        && entry.verifierResults[0]?.details?.skipped === true;
    const tally = { passed: 0, failed: 0, skipped: 0 };
    let costSum = 0;
    let durationSum = 0;
    for (const entry of caseResults) {
        durationSum += entry.durationMs;
        if (entry.costUsd !== undefined) {
            costSum += entry.costUsd;
        }
        if (isSkipped(entry)) {
            tally.skipped += 1;
        }
        else if (entry.passed) {
            tally.passed += 1;
        }
        else {
            tally.failed += 1;
        }
    }
    return {
        totalCases: caseResults.length,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        // Round to six decimals so accumulated float noise stays out of the report.
        totalCostUsd: Number(costSum.toFixed(6)),
        totalDurationMs: durationSum
    };
}
|
|
107
|
+
/**
 * List the stages that occur in the given case results, ordered as they
 * appear in `FLOW_STAGES` (stages not listed there are dropped).
 *
 * @param {Array<{stage: string}>} caseResults
 * @returns {string[]} the entries of `FLOW_STAGES` that at least one case refers to
 */
function stagesInResults(caseResults) {
    const seenStages = new Set(caseResults.map((entry) => entry.stage));
    return FLOW_STAGES.filter((stage) => seenStages.has(stage));
}
|
|
113
|
+
/**
 * Wave 7.1 runner.
 *
 * Loads the eval config and the corpus, then either:
 * - returns a `kind: "dry-run"` summary (config, corpus counts, planned tier,
 *   verifier availability, notes) when `options.dryRun === true`; or
 * - runs the structural verifier over every corpus case, one after another,
 *   loads the per-stage baselines for the stages that occurred, and returns
 *   a report with an optional `baselineDelta`.
 *
 * `--rules` and `--judge` are accepted but only add a note; this function
 * does not read a `schemaOnly` option, so structural verification always
 * runs. Cases that declare no structural expectations are reported as
 * skipped; cases that declare them but provide no fixture fail. Notes are
 * only included in the dry-run summary.
 *
 * @param {object} options `projectRoot`, plus optional `env`, `stage`,
 *   `tier`, `rules`, `judge`, `dryRun`
 * @returns {Promise<object>} the dry-run summary or the eval report
 */
export async function runEval(options) {
    const { projectRoot } = options;
    const config = await loadEvalConfig(projectRoot, options.env ?? process.env);
    const corpus = await loadCorpus(projectRoot, options.stage);
    const plannedTier = options.tier ?? config.defaultTier;
    const notes = [];
    if (corpus.length === 0) {
        notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
    }
    if (options.rules) {
        notes.push("--rules is accepted; rule verifiers wire up in Wave 7.2.");
    }
    if (options.judge) {
        notes.push("--judge is accepted; LLM judging wires up in Wave 7.3.");
    }
    if (options.dryRun === true) {
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map(({ id, stage }) => ({ id, stage }))
            },
            plannedTier,
            verifiersAvailable: {
                structural: true,
                rules: false,
                judge: false,
                workflow: false
            },
            notes
        };
    }
    // Timestamp is taken before the cases run, so it marks the start of the run.
    const generatedAt = new Date().toISOString();
    const caseResults = [];
    for (const caseEntry of corpus) {
        const outcome = await runCaseStructural(projectRoot, caseEntry, plannedTier);
        caseResults.push(outcome);
    }
    const stages = stagesInResults(caseResults);
    const baselines = await loadBaselinesByStage(projectRoot, stages);
    const report = {
        schemaVersion: 1,
        generatedAt,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        tier: plannedTier,
        stages,
        cases: caseResults,
        summary: reduceSummary(caseResults)
    };
    const delta = compareAgainstBaselines(report, baselines);
    if (delta) {
        report.baselineDelta = delta;
    }
    return report;
}
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core types for the cclaw eval subsystem (Phase 7).
|
|
3
|
+
*
|
|
4
|
+
* The eval subsystem lets us measure whether a change to a prompt, skill, or
|
|
5
|
+
* stage contract improves or regresses the quality of agent output. It is
|
|
6
|
+
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
|
+
*
|
|
8
|
+
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
+
* - The verifier / rubric / LLM stack evolves on its own release cadence (Waves 7.0-7.6).
|
|
10
|
+
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
|
+
*/
|
|
12
|
+
import type { FlowStage } from "../types.js";
|
|
13
|
+
/**
|
|
14
|
+
* Fidelity tier for the agent-under-test.
|
|
15
|
+
*
|
|
16
|
+
* - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
|
|
17
|
+
* - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
|
|
18
|
+
* - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
|
|
19
|
+
* artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
|
|
20
|
+
* (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
|
|
21
|
+
*/
|
|
22
|
+
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
|
|
23
|
+
export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
24
|
+
/**
|
|
25
|
+
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
26
|
+
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
27
|
+
*/
|
|
28
|
+
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
29
|
+
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
|
+
/**
|
|
31
|
+
* Structural expectations — deterministic, LLM-free checks against a single
|
|
32
|
+
* text artifact. Wave 7.1 implements all fields below; Wave 7.2 adds the
|
|
33
|
+
* sibling `rules` shape, Wave 7.3 adds `judge`.
|
|
34
|
+
*/
|
|
35
|
+
export interface StructuralExpected {
|
|
36
|
+
/**
|
|
37
|
+
* Case-insensitive substrings that must each appear on at least one markdown
|
|
38
|
+
* heading line (line starting with `#`). Useful for "required sections".
|
|
39
|
+
*/
|
|
40
|
+
requiredSections?: string[];
|
|
41
|
+
/**
|
|
42
|
+
* Case-insensitive substrings that must NOT appear anywhere in the body
|
|
43
|
+
* (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
|
|
44
|
+
*/
|
|
45
|
+
forbiddenPatterns?: string[];
|
|
46
|
+
/** Inclusive minimum line count of the artifact body (frontmatter excluded). */
|
|
47
|
+
minLines?: number;
|
|
48
|
+
/** Inclusive maximum line count of the artifact body (frontmatter excluded). */
|
|
49
|
+
maxLines?: number;
|
|
50
|
+
/** Inclusive minimum character count of the artifact body. */
|
|
51
|
+
minChars?: number;
|
|
52
|
+
/** Inclusive maximum character count of the artifact body. */
|
|
53
|
+
maxChars?: number;
|
|
54
|
+
/**
|
|
55
|
+
* Keys that must appear in the leading YAML frontmatter (between a pair of
|
|
56
|
+
* `---` delimiters at the very top of the file). An artifact without
|
|
57
|
+
* frontmatter will fail every entry.
|
|
58
|
+
*/
|
|
59
|
+
requiredFrontmatterKeys?: string[];
|
|
60
|
+
}
|
|
61
|
+
/** Superset of per-verifier expectation shapes. Only `structural` is wired in Wave 7.1. */
|
|
62
|
+
export interface ExpectedShape {
|
|
63
|
+
structural?: StructuralExpected;
|
|
64
|
+
/** Rule-based (keyword/regex/traceability) checks — Wave 7.2. */
|
|
65
|
+
rules?: Record<string, unknown>;
|
|
66
|
+
/** LLM-judge rubrics — Wave 7.3. */
|
|
67
|
+
judge?: Record<string, unknown>;
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* A single eval case describes one input scenario for one stage. Cases live in
|
|
71
|
+
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
72
|
+
* fixture artifact for verifier development (Wave 7.1) before the agent loop
|
|
73
|
+
* exists (Wave 7.3+).
|
|
74
|
+
*/
|
|
75
|
+
export interface EvalCase {
|
|
76
|
+
id: string;
|
|
77
|
+
stage: FlowStage;
|
|
78
|
+
inputPrompt: string;
|
|
79
|
+
/** Project files copied into the Tier B/C sandbox before the agent runs. */
|
|
80
|
+
contextFiles?: string[];
|
|
81
|
+
/**
|
|
82
|
+
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
83
|
+
* Each sub-shape is optional; missing sub-shapes skip that verifier tier.
|
|
84
|
+
*/
|
|
85
|
+
expected?: ExpectedShape;
|
|
86
|
+
/**
|
|
87
|
+
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
88
|
+
* when verifiers are exercised without a live agent loop. Primarily a Wave
|
|
89
|
+
* 7.1 development aid.
|
|
90
|
+
*/
|
|
91
|
+
fixture?: string;
|
|
92
|
+
}
|
|
93
|
+
/** Result of one verifier applied to one case. */
|
|
94
|
+
export interface VerifierResult {
|
|
95
|
+
kind: VerifierKind;
|
|
96
|
+
id: string;
|
|
97
|
+
ok: boolean;
|
|
98
|
+
/** Normalized 0..1 score when the verifier produces a numeric signal. */
|
|
99
|
+
score?: number;
|
|
100
|
+
message?: string;
|
|
101
|
+
details?: Record<string, unknown>;
|
|
102
|
+
}
|
|
103
|
+
/** Aggregate result for one case after all verifiers run. */
|
|
104
|
+
export interface EvalCaseResult {
|
|
105
|
+
caseId: string;
|
|
106
|
+
stage: FlowStage;
|
|
107
|
+
tier: EvalTier;
|
|
108
|
+
passed: boolean;
|
|
109
|
+
durationMs: number;
|
|
110
|
+
costUsd?: number;
|
|
111
|
+
verifierResults: VerifierResult[];
|
|
112
|
+
}
|
|
113
|
+
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
|
|
114
|
+
export interface EvalReport {
|
|
115
|
+
schemaVersion: 1;
|
|
116
|
+
generatedAt: string;
|
|
117
|
+
runId: string;
|
|
118
|
+
cclawVersion: string;
|
|
119
|
+
provider: string;
|
|
120
|
+
model: string;
|
|
121
|
+
tier: EvalTier;
|
|
122
|
+
stages: FlowStage[];
|
|
123
|
+
cases: EvalCaseResult[];
|
|
124
|
+
summary: {
|
|
125
|
+
totalCases: number;
|
|
126
|
+
passed: number;
|
|
127
|
+
failed: number;
|
|
128
|
+
skipped: number;
|
|
129
|
+
totalCostUsd: number;
|
|
130
|
+
totalDurationMs: number;
|
|
131
|
+
};
|
|
132
|
+
/** Present when comparing against a saved baseline (Wave 7.1+). */
|
|
133
|
+
baselineDelta?: BaselineDelta;
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
|
|
137
|
+
* with `CCLAW_EVAL_*` environment variables at runtime.
|
|
138
|
+
*/
|
|
139
|
+
export interface EvalConfig {
|
|
140
|
+
/**
|
|
141
|
+
* Free-form provider name used in reports. The actual HTTP protocol is
|
|
142
|
+
* determined by `baseUrl`, which is expected to be OpenAI-compatible.
|
|
143
|
+
*/
|
|
144
|
+
provider: string;
|
|
145
|
+
/** OpenAI-compatible base URL, e.g. `https://api.z.ai/api/coding/paas/v4`. */
|
|
146
|
+
baseUrl: string;
|
|
147
|
+
/** Model identifier for both agent-under-test and judge unless `judgeModel` overrides. */
|
|
148
|
+
model: string;
|
|
149
|
+
/** Optional separate model for the judge role. Defaults to `model`. */
|
|
150
|
+
judgeModel?: string;
|
|
151
|
+
/** Default tier when `--tier` is not supplied. */
|
|
152
|
+
defaultTier: EvalTier;
|
|
153
|
+
/** Optional hard stop on estimated USD spend per day. Unset = no cap. */
|
|
154
|
+
dailyUsdCap?: number;
|
|
155
|
+
/** Regression thresholds for CI gates. */
|
|
156
|
+
regression: {
|
|
157
|
+
/** Fail when overall score drops by more than this fraction (e.g. 0.15 = 15%). */
|
|
158
|
+
failIfDeltaBelow: number;
|
|
159
|
+
/** Fail when any single critical rubric drops below this absolute score. */
|
|
160
|
+
failIfCriticalBelow: number;
|
|
161
|
+
};
|
|
162
|
+
/** Per-agent-run timeout in milliseconds. */
|
|
163
|
+
timeoutMs: number;
|
|
164
|
+
/** Max retries per API call on transient failures. */
|
|
165
|
+
maxRetries: number;
|
|
166
|
+
}
|
|
167
|
+
/** Resolved config with env overrides applied. */
|
|
168
|
+
export interface ResolvedEvalConfig extends EvalConfig {
|
|
169
|
+
apiKey?: string;
|
|
170
|
+
source: "default" | "file" | "env" | "file+env";
|
|
171
|
+
}
|
|
172
|
+
/**
|
|
173
|
+
* Frozen per-stage baseline used by regression gating (Wave 7.1). Baselines
|
|
174
|
+
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
175
|
+
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
176
|
+
* changed between runs.
|
|
177
|
+
*/
|
|
178
|
+
export interface BaselineSnapshot {
|
|
179
|
+
schemaVersion: 1;
|
|
180
|
+
stage: FlowStage;
|
|
181
|
+
generatedAt: string;
|
|
182
|
+
cclawVersion: string;
|
|
183
|
+
/** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
|
|
184
|
+
cases: Record<string, BaselineCaseEntry>;
|
|
185
|
+
}
|
|
186
|
+
export interface BaselineCaseEntry {
|
|
187
|
+
passed: boolean;
|
|
188
|
+
verifierResults: BaselineVerifierEntry[];
|
|
189
|
+
}
|
|
190
|
+
export interface BaselineVerifierEntry {
|
|
191
|
+
id: string;
|
|
192
|
+
kind: VerifierKind;
|
|
193
|
+
ok: boolean;
|
|
194
|
+
score?: number;
|
|
195
|
+
}
|
|
196
|
+
/**
|
|
197
|
+
* Delta between a fresh report and the saved baseline. Populated when
|
|
198
|
+
* baselines exist on disk and the run covers matching cases.
|
|
199
|
+
*/
|
|
200
|
+
export interface BaselineDelta {
|
|
201
|
+
baselineId: string;
|
|
202
|
+
/** Fresh-score − baseline-score, bounded to [-1, 1]. */
|
|
203
|
+
scoreDelta: number;
|
|
204
|
+
/** Count of checks that flipped from `ok:true` to `ok:false`. */
|
|
205
|
+
criticalFailures: number;
|
|
206
|
+
/** Per-case regression details for the Markdown report. */
|
|
207
|
+
regressions: BaselineRegression[];
|
|
208
|
+
}
|
|
209
|
+
export interface BaselineRegression {
|
|
210
|
+
caseId: string;
|
|
211
|
+
stage: FlowStage;
|
|
212
|
+
verifierId: string;
|
|
213
|
+
reason: "newly-failing" | "case-now-failing" | "score-drop";
|
|
214
|
+
previousScore?: number;
|
|
215
|
+
currentScore?: number;
|
|
216
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fidelity tier for the agent-under-test.
|
|
3
|
+
*
|
|
4
|
+
* - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
|
|
5
|
+
* - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
|
|
6
|
+
* - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
|
|
7
|
+
* artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
|
|
8
|
+
* (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
|
|
9
|
+
*/
|
|
10
|
+
// Frozen so the runtime value honours the `readonly` tuple declared in the
// type definitions: an importer can no longer mutate the shared tier list.
export const EVAL_TIERS = Object.freeze(["A", "B", "C"]);
|
|
11
|
+
/**
|
|
12
|
+
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
13
|
+
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
14
|
+
*/
|
|
15
|
+
// Frozen so the runtime value honours the `readonly` tuple declared in the
// type definitions: an importer can no longer mutate the shared kind list.
export const VERIFIER_KINDS = Object.freeze(["structural", "rules", "judge", "workflow"]);
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { StructuralExpected, VerifierResult } from "../types.js";
|
|
2
|
+
export interface ArtifactSplit {
|
|
3
|
+
hasFrontmatter: boolean;
|
|
4
|
+
frontmatterRaw: string;
|
|
5
|
+
frontmatterParsed?: Record<string, unknown>;
|
|
6
|
+
body: string;
|
|
7
|
+
}
|
|
8
|
+
export declare function splitFrontmatter(artifact: string): ArtifactSplit;
|
|
9
|
+
/**
|
|
10
|
+
* Run every configured structural check against the artifact text.
|
|
11
|
+
* Returns [] when `expected` is undefined/empty so the runner can treat
|
|
12
|
+
* "no structural expectations" as "no verifier results" rather than "pass".
|
|
13
|
+
*/
|
|
14
|
+
export declare function verifyStructural(artifact: string, expected: StructuralExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural verifier (Wave 7.1): deterministic, zero-LLM checks against a
|
|
3
|
+
* single markdown artifact. Each structural expectation produces one
|
|
4
|
+
* `VerifierResult` so baselines diff cleanly at the check level rather than
|
|
5
|
+
* lumping everything into a single boolean.
|
|
6
|
+
*
|
|
7
|
+
* Design notes:
|
|
8
|
+
*
|
|
9
|
+
* - All pattern matching is case-insensitive. Authoring a check as
|
|
10
|
+
* `"Directions"` matches `## Directions` and `### directions-suggested`.
|
|
11
|
+
* - Frontmatter detection is permissive: it must start at byte 0 with `---\n`
|
|
12
|
+
* and close on a subsequent `---` line. Anything else is treated as "no
|
|
13
|
+
* frontmatter", which fails every `requiredFrontmatterKeys` entry
|
|
14
|
+
* deterministically.
|
|
15
|
+
* - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
|
|
16
|
+
* adds metadata does not accidentally drop the body below the floor.
|
|
17
|
+
* - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
|
|
18
|
+
* all individual `ok` flags. This keeps Wave 7.1 deterministic; the 0..1
|
|
19
|
+
* rubric scale shows up in Wave 7.3 (judge).
|
|
20
|
+
*/
|
|
21
|
+
import { parse as parseYaml } from "yaml";
|
|
22
|
+
// Opening delimiter: a `---` line at the very start of the artifact.
const FRONTMATTER_OPEN = /^---\r?\n/;
// Closing delimiter: a line consisting of `---`. It is matched against the
// text left after the opening delimiter has been stripped, so it may sit at
// offset 0 of that text (empty frontmatter, i.e. `---` directly followed by
// `---`) or after a newline. Without the `^` alternative an empty frontmatter
// block was not recognised and the split could land on a later `---` rule in
// the body instead.
const FRONTMATTER_CLOSE = /(?:^|\r?\n)---\r?(?:\n|$)/;
|
|
24
|
+
/**
 * Turn free text into an id fragment: lower-case, every run of characters
 * outside `a-z0-9` collapsed to one dash, a leading/trailing dash removed,
 * then cut to at most 64 characters.
 *
 * @param {string} input
 * @returns {string} slug (may be empty)
 */
function slugify(input) {
    const lowered = input.toLowerCase();
    const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
    const trimmed = dashed.replace(/(^-|-$)/g, "");
    return trimmed.slice(0, 64);
}
|
|
31
|
+
/**
 * Separate a leading YAML frontmatter block from the rest of an artifact.
 *
 * When the artifact does not start with the opening delimiter, or no closing
 * delimiter follows, the whole artifact is returned as `body` with
 * `hasFrontmatter: false`. Otherwise the text between the delimiters is
 * returned raw and also parsed as YAML; `frontmatterParsed` is set only when
 * parsing succeeds and yields a non-array object.
 *
 * @param {string} artifact full artifact text
 * @returns {{hasFrontmatter: boolean, frontmatterRaw: string, frontmatterParsed?: object, body: string}}
 */
export function splitFrontmatter(artifact) {
    const opening = FRONTMATTER_OPEN.exec(artifact);
    if (opening === null) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    // FRONTMATTER_OPEN is anchored at the start, so dropping the matched
    // prefix leaves everything after the opening delimiter line.
    const remainder = artifact.slice(opening[0].length);
    const closing = FRONTMATTER_CLOSE.exec(remainder);
    if (closing === null) {
        return { hasFrontmatter: false, frontmatterRaw: "", body: artifact };
    }
    const frontmatterRaw = remainder.slice(0, closing.index);
    const body = remainder.slice(closing.index + closing[0].length);
    let frontmatterParsed;
    try {
        const candidate = parseYaml(frontmatterRaw);
        const isMapping = candidate && typeof candidate === "object" && !Array.isArray(candidate);
        if (isMapping) {
            frontmatterParsed = candidate;
        }
    }
    catch {
        // Unparseable YAML is reported as "frontmatter present, nothing parsed".
        frontmatterParsed = undefined;
    }
    return { hasFrontmatter: true, frontmatterRaw, frontmatterParsed, body };
}
|
|
59
|
+
/**
 * Collect the lines of `body` that look like markdown ATX headings: after
 * leading whitespace is removed, one to six `#` characters, then whitespace,
 * then at least one non-whitespace character. Lines are returned with the
 * leading whitespace stripped (trailing whitespace is kept).
 *
 * @param {string} body
 * @returns {string[]} heading lines in document order
 */
function extractHeadingLines(body) {
    const headingPattern = /^#{1,6}\s+\S/;
    const headings = [];
    for (const rawLine of body.split(/\r?\n/)) {
        const candidate = rawLine.trimStart();
        if (headingPattern.test(candidate)) {
            headings.push(candidate);
        }
    }
    return headings;
}
|
|
65
|
+
/**
 * Assemble one structural verifier result. The score is binary: 1 when `ok`
 * is truthy, 0 otherwise.
 *
 * @param {string} id check identifier
 * @param {boolean} ok whether the check passed
 * @param {string} message human-readable outcome
 * @param {object} [details] optional payload; the key is omitted when undefined
 * @returns {object} `{ kind: "structural", id, ok, score, message[, details] }`
 */
function result(id, ok, message, details) {
    const entry = {
        kind: "structural",
        id,
        ok,
        score: ok ? 1 : 0,
        message
    };
    if (details !== undefined) {
        entry.details = details;
    }
    return entry;
}
|
|
75
|
+
/**
 * Produce one result per required section: the check passes when the
 * lower-cased, trimmed section text is a substring of at least one
 * lower-cased heading line of `body`.
 *
 * @param {string[]} sections required section names
 * @param {string} body artifact body
 * @returns {object[]} results with ids `structural:section:<slug>`
 */
function checkRequiredSections(sections, body) {
    const loweredHeadings = extractHeadingLines(body).map((heading) => heading.toLowerCase());
    const outcomes = [];
    for (const section of sections) {
        const wanted = section.toLowerCase().trim();
        const present = loweredHeadings.some((heading) => heading.includes(wanted));
        const message = present
            ? `Section matching "${section}" present.`
            : `No heading contains "${section}".`;
        const details = { pattern: section, searchedHeadings: loweredHeadings.length };
        outcomes.push(result(`structural:section:${slugify(section)}`, present, message, details));
    }
    return outcomes;
}
|
|
85
|
+
/**
 * Produce one result per forbidden pattern: the check passes when the
 * lower-cased pattern does not occur anywhere in the lower-cased body.
 *
 * @param {string[]} patterns substrings that must not appear
 * @param {string} body artifact body
 * @returns {object[]} results with ids `structural:forbidden:<slug>`
 */
function checkForbiddenPatterns(patterns, body) {
    const loweredBody = body.toLowerCase();
    const outcomes = [];
    for (const pattern of patterns) {
        const occurrences = countOccurrences(loweredBody, pattern.toLowerCase());
        const absent = occurrences === 0;
        const message = absent
            ? `Pattern "${pattern}" absent (as required).`
            : `Pattern "${pattern}" appears ${occurrences} time(s); remove.`;
        outcomes.push(result(`structural:forbidden:${slugify(pattern)}`, absent, message, { pattern, occurrences }));
    }
    return outcomes;
}
|
|
96
|
+
/**
 * Count non-overlapping occurrences of `needle` in `haystack`, scanning left
 * to right. An empty needle counts as zero occurrences.
 *
 * @param {string} haystack
 * @param {string} needle
 * @returns {number}
 */
function countOccurrences(haystack, needle) {
    if (needle.length === 0) {
        return 0;
    }
    let total = 0;
    // Each search resumes just past the previous hit, so hits never overlap.
    for (let hit = haystack.indexOf(needle, 0); hit >= 0; hit = haystack.indexOf(needle, hit + needle.length)) {
        total += 1;
    }
    return total;
}
|
|
109
|
+
/**
 * Check the body against the optional inclusive line and character bounds.
 *
 * Emits a `structural:length:lines` result when `minLines` or `maxLines` is
 * set and a `structural:length:chars` result when `minChars` or `maxChars`
 * is set; emits nothing when no bound is configured.
 *
 * The line count is the number of segments produced by splitting on line
 * breaks (0 for an empty body), so a body ending in a newline counts the
 * trailing empty segment as a line.
 *
 * @param {object} expected structural expectations (`minLines`, `maxLines`, `minChars`, `maxChars`)
 * @param {string} body artifact body
 * @returns {object[]} zero, one or two results
 */
function checkLengthBounds(expected, body) {
    const inRange = (value, lower, upper) => (lower === undefined || value >= lower) && (upper === undefined || value <= upper);
    const { minLines, maxLines, minChars, maxChars } = expected;
    const charCount = body.length;
    const lineCount = charCount === 0 ? 0 : body.split(/\r?\n/).length;
    const outcomes = [];
    if (!(minLines === undefined && maxLines === undefined)) {
        const ok = inRange(lineCount, minLines, maxLines);
        const message = ok
            ? `Body has ${lineCount} line(s), within bounds.`
            : buildOutOfRangeMessage("line", lineCount, minLines, maxLines);
        outcomes.push(result("structural:length:lines", ok, message, { lineCount, minLines, maxLines }));
    }
    if (!(minChars === undefined && maxChars === undefined)) {
        const ok = inRange(charCount, minChars, maxChars);
        const message = ok
            ? `Body has ${charCount} char(s), within bounds.`
            : buildOutOfRangeMessage("char", charCount, minChars, maxChars);
        outcomes.push(result("structural:length:chars", ok, message, { charCount, minChars, maxChars }));
    }
    return outcomes;
}
|
|
135
|
+
/**
 * Format the failure message for a length check. A missing lower bound is
 * shown as `0`, a missing upper bound as `∞`.
 *
 * @param {string} unit unit label, e.g. "line" or "char"
 * @param {number} actual measured count
 * @param {number} [min] inclusive lower bound
 * @param {number} [max] inclusive upper bound
 * @returns {string}
 */
function buildOutOfRangeMessage(unit, actual, min, max) {
    const lowerLabel = min !== undefined ? String(min) : "0";
    const upperLabel = max !== undefined ? String(max) : "∞";
    return `Body has ${actual} ${unit}(s); expected ${lowerLabel}..${upperLabel}.`;
}
|
|
140
|
+
/**
 * Produce one result per required frontmatter key. Every key fails when the
 * artifact has no frontmatter or the frontmatter could not be parsed into an
 * object; otherwise a key passes when it is one of the parsed object's own
 * enumerable keys.
 *
 * @param {string[]} keys required frontmatter keys
 * @param {object} split output of `splitFrontmatter`
 * @returns {object[]} results with ids `structural:frontmatter:<slug>`
 */
function checkFrontmatterKeys(keys, split) {
    if (split.hasFrontmatter && split.frontmatterParsed) {
        const declaredKeys = Object.keys(split.frontmatterParsed);
        return keys.map((key) => {
            const found = declaredKeys.includes(key);
            const message = found ? `Frontmatter key "${key}" present.` : `Frontmatter key "${key}" missing.`;
            return result(`structural:frontmatter:${slugify(key)}`, found, message, { key });
        });
    }
    const failures = [];
    for (const key of keys) {
        failures.push(result(`structural:frontmatter:${slugify(key)}`, false, `Frontmatter key "${key}" missing (no parseable frontmatter).`, { key, frontmatterPresent: split.hasFrontmatter }));
    }
    return failures;
}
|
|
150
|
+
/**
 * Apply every configured structural check to the artifact text.
 *
 * Results are ordered: required sections, forbidden patterns, length bounds,
 * required frontmatter keys. Section, pattern and length checks look at the
 * body only (frontmatter removed). Returns `[]` when `expected` is falsy or
 * configures no check, which lets the caller tell "nothing to verify" apart
 * from "passed".
 *
 * @param {string} artifact full artifact text
 * @param {object} [expected] structural expectations
 * @returns {object[]} one result per executed check
 */
export function verifyStructural(artifact, expected) {
    if (!expected) {
        return [];
    }
    const split = splitFrontmatter(artifact);
    const { requiredSections, forbiddenPatterns, requiredFrontmatterKeys } = expected;
    return [
        ...(requiredSections?.length ? checkRequiredSections(requiredSections, split.body) : []),
        ...(forbiddenPatterns?.length ? checkForbiddenPatterns(forbiddenPatterns, split.body) : []),
        ...checkLengthBounds(expected, split.body),
        ...(requiredFrontmatterKeys?.length ? checkFrontmatterKeys(requiredFrontmatterKeys, split) : [])
    ];
}
|
package/dist/install.js
CHANGED
|
@@ -28,6 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
|
|
|
28
28
|
import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
|
+
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
31
32
|
import { TDD_WAVE_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
32
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
33
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
@@ -184,6 +185,26 @@ async function writeArtifactTemplates(projectRoot) {
|
|
|
184
185
|
await writeFileSafe(runtimePath(projectRoot, "templates", fileName), content);
|
|
185
186
|
}
|
|
186
187
|
}
|
|
188
|
+
/**
 * Seed the `.cclaw/evals/` scaffold. A target is written only when nothing
 * exists at its path yet, so user-authored config.yaml / corpus / rubrics /
 * baselines are never clobbered by `cclaw sync`.
 *
 * @param {string} projectRoot project directory passed to `runtimePath`
 * @returns {Promise<void>}
 */
async function writeEvalScaffold(projectRoot) {
    const scaffoldFiles = [
        ["evals/config.yaml", EVAL_CONFIG_YAML],
        ["evals/corpus/README.md", EVAL_CORPUS_README],
        ["evals/rubrics/README.md", EVAL_RUBRICS_README],
        ["evals/baselines/README.md", EVAL_BASELINES_README],
        ["evals/reports/README.md", EVAL_REPORTS_README]
    ];
    for (const [relativePath, content] of scaffoldFiles) {
        const absolute = runtimePath(projectRoot, ...relativePath.split("/"));
        const alreadyThere = await exists(absolute);
        if (!alreadyThere) {
            await writeFileSafe(absolute, content);
        }
    }
}
|
|
187
208
|
async function writeSkills(projectRoot, config) {
|
|
188
209
|
for (const stage of COMMAND_FILE_ORDER) {
|
|
189
210
|
const folder = stageSkillFolder(stage);
|
|
@@ -1044,6 +1065,7 @@ async function materializeRuntime(projectRoot, config, forceStateReset) {
|
|
|
1044
1065
|
await writeSkills(projectRoot, config);
|
|
1045
1066
|
await writeContextModes(projectRoot);
|
|
1046
1067
|
await writeArtifactTemplates(projectRoot);
|
|
1068
|
+
await writeEvalScaffold(projectRoot);
|
|
1047
1069
|
await writeRulebook(projectRoot);
|
|
1048
1070
|
await writeState(projectRoot, config, forceStateReset);
|
|
1049
1071
|
await ensureRunSystem(projectRoot, { createIfMissing: false });
|