cclaw-cli 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +4 -4
- package/dist/constants.d.ts +4 -4
- package/dist/constants.js +4 -4
- package/dist/content/eval-scaffold.d.ts +4 -4
- package/dist/content/eval-scaffold.js +13 -14
- package/dist/content/examples.js +11 -11
- package/dist/content/hooks.js +1 -1
- package/dist/content/skills.d.ts +3 -3
- package/dist/content/skills.js +19 -19
- package/dist/content/stage-schema.js +2 -2
- package/dist/content/stages/plan.js +18 -18
- package/dist/content/stages/schema-types.d.ts +2 -2
- package/dist/content/stages/tdd.js +1 -1
- package/dist/content/subagents.js +1 -1
- package/dist/content/templates.js +8 -8
- package/dist/content/utility-skills.js +19 -19
- package/dist/doctor.js +2 -2
- package/dist/eval/baseline.js +1 -1
- package/dist/eval/corpus.d.ts +12 -1
- package/dist/eval/corpus.js +163 -8
- package/dist/eval/llm-client.d.ts +10 -10
- package/dist/eval/llm-client.js +5 -5
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +6 -6
- package/dist/eval/runner.js +83 -37
- package/dist/eval/types.d.ts +78 -13
- package/dist/eval/verifiers/rules.d.ts +24 -0
- package/dist/eval/verifiers/rules.js +218 -0
- package/dist/eval/verifiers/structural.js +3 -3
- package/dist/eval/verifiers/traceability.d.ts +23 -0
- package/dist/eval/verifiers/traceability.js +84 -0
- package/dist/install.js +3 -3
- package/dist/policy.js +1 -1
- package/package.json +1 -1
|
@@ -482,7 +482,7 @@ description: "Execute approved plans with disciplined batching, explicit checkpo
|
|
|
482
482
|
## Quick Start
|
|
483
483
|
|
|
484
484
|
> 1. Confirm the plan and stage gates are approved before execution.
|
|
485
|
-
> 2. Execute in batches
|
|
485
|
+
> 2. Execute in batches, not as one giant untracked stream.
|
|
486
486
|
> 3. Stop at checkpoint boundaries for verification and user visibility.
|
|
487
487
|
|
|
488
488
|
## HARD-GATE
|
|
@@ -492,47 +492,47 @@ Do not start implementation execution without an approved plan artifact and expl
|
|
|
492
492
|
## Execution Protocol
|
|
493
493
|
|
|
494
494
|
1. **Load plan source of truth** from \`.cclaw/artifacts/05-plan.md\` (canonical run copy when available).
|
|
495
|
-
2. **Group tasks into
|
|
496
|
-
3. **Run one
|
|
497
|
-
4. **Checkpoint each
|
|
495
|
+
2. **Group tasks into batches** by dependency order and risk.
|
|
496
|
+
3. **Run one batch at a time** with evidence after each task (tests, build, lint, or review evidence as applicable).
|
|
497
|
+
4. **Checkpoint each batch** by updating stage artifact evidence and unresolved blockers.
|
|
498
498
|
5. **Stop immediately** on any hard blocker, failing gate, or unresolved critical finding.
|
|
499
499
|
|
|
500
|
-
##
|
|
500
|
+
## Batch Checklist
|
|
501
501
|
|
|
502
|
-
-
|
|
502
|
+
- Batch scope is explicit (task IDs + expected outputs).
|
|
503
503
|
- Verification command for each task is predetermined.
|
|
504
504
|
- Machine-only checks are delegated to subagents when supported.
|
|
505
505
|
- User approvals are requested only at required gate boundaries.
|
|
506
506
|
|
|
507
|
-
## Fresh Context Protocol (between
|
|
507
|
+
## Fresh Context Protocol (between batches)
|
|
508
508
|
|
|
509
|
-
After a
|
|
510
|
-
the #1 cause of degraded execution quality. Before starting the **next
|
|
509
|
+
After a batch completes — especially after long agent turns — context drift is
|
|
510
|
+
the #1 cause of degraded execution quality. Before starting the **next batch**,
|
|
511
511
|
prefer a **fresh agent context** over continuing in a saturated session:
|
|
512
512
|
|
|
513
|
-
1. **Snapshot
|
|
514
|
-
(\`###
|
|
513
|
+
1. **Snapshot batch outcome** — append a short summary to the plan artifact
|
|
514
|
+
(\`### Batch <N> outcome\` with: tasks done, evidence files, blockers, next-batch inputs).
|
|
515
515
|
2. **Capture handoff facts** — the minimum information the next agent needs:
|
|
516
516
|
- Stage and run id (from \`.cclaw/state/flow-state.json\`)
|
|
517
517
|
- List of completed task IDs from the plan
|
|
518
518
|
- Open blockers / failing gates by name
|
|
519
|
-
- File paths the next
|
|
519
|
+
- File paths the next batch will touch (no full diffs)
|
|
520
520
|
3. **Decide: continue or rotate**
|
|
521
|
-
- **Rotate** (start a new agent session) when: prior
|
|
522
|
-
- **Continue** when: next
|
|
521
|
+
- **Rotate** (start a new agent session) when: prior batch consumed > ~50% of the context budget, the prior batch required deep investigation that the next batch does not need, or you are about to cross a stage boundary.
|
|
522
|
+
- **Continue** when: next batch is a tiny follow-up (≤ 1 task) and the prior context is directly relevant.
|
|
523
523
|
4. **Resume** in the new session via \`/cc-next\` — the session-start hook will restore flow state, checkpoint, and digest automatically.
|
|
524
524
|
|
|
525
|
-
This is the same intuition as Compound Engineering's "fresh context per iteration": every
|
|
525
|
+
This is the same intuition as Compound Engineering's "fresh context per iteration": every batch starts with a clean, intentionally-loaded context, not a degraded carry-over.
|
|
526
526
|
|
|
527
527
|
### Handoff template (paste into next session)
|
|
528
528
|
|
|
529
529
|
\`\`\`markdown
|
|
530
|
-
##
|
|
530
|
+
## Batch <N> handoff
|
|
531
531
|
- Stage: <stage>
|
|
532
532
|
- Run: <runId>
|
|
533
533
|
- Completed task IDs: <list>
|
|
534
534
|
- Blockers: <list or none>
|
|
535
|
-
- Files next
|
|
535
|
+
- Files next batch will touch: <list>
|
|
536
536
|
- Verification command(s) used: <list>
|
|
537
537
|
\`\`\`
|
|
538
538
|
|
|
@@ -542,7 +542,7 @@ This is the same intuition as Compound Engineering's "fresh context per iteratio
|
|
|
542
542
|
- Marking tasks done without command evidence.
|
|
543
543
|
- Reordering critical dependencies for speed.
|
|
544
544
|
- Continuing after a gate failure hoping later tasks fix it.
|
|
545
|
-
- Carrying a saturated context across
|
|
545
|
+
- Carrying a saturated context across batch boundaries because "it has all the history" — saturated context is a liability, not an asset.
|
|
546
546
|
`;
|
|
547
547
|
}
|
|
548
548
|
export function contextEngineeringSkill() {
|
|
@@ -1338,7 +1338,7 @@ For each lens, write either a knowledge entry **or** the explicit string
|
|
|
1338
1338
|
|
|
1339
1339
|
### 2. What slowed us down?
|
|
1340
1340
|
|
|
1341
|
-
- Repeated context loss between
|
|
1341
|
+
- Repeated context loss between batches → \`[compound]\` accelerator.
|
|
1342
1342
|
- Re-derivation of a fact already in upstream artifacts → \`[pattern]\` "re-read X first".
|
|
1343
1343
|
- Tooling friction (slow test loop, flaky CI) → \`[compound]\` follow-up.
|
|
1344
1344
|
|
package/dist/doctor.js
CHANGED
|
@@ -283,8 +283,8 @@ export async function doctorChecks(projectRoot, options = {}) {
|
|
|
283
283
|
const skillContent = await fs.readFile(skillPath, "utf8");
|
|
284
284
|
const lineCount = skillContent.split("\n").length;
|
|
285
285
|
const MIN_SKILL_LINES = 110;
|
|
286
|
-
// Soft max tightened
|
|
287
|
-
//
|
|
286
|
+
// Soft max tightened from 650 → 500 after externalising the TDD
|
|
287
|
+
// batch-execution walkthrough and collapsing the duplicate "what
|
|
288
288
|
// goes wrong" lists. Stage skills beyond 500 lines drift into unread
|
|
289
289
|
// bloat; long-form content belongs under `.cclaw/references/` instead.
|
|
290
290
|
const MAX_SKILL_LINES = 500;
|
package/dist/eval/baseline.js
CHANGED
package/dist/eval/corpus.d.ts
CHANGED
|
@@ -14,6 +14,17 @@ export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase)
|
|
|
14
14
|
/**
|
|
15
15
|
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
16
16
|
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
17
|
-
* the case but not on disk —
|
|
17
|
+
* the case but not on disk — structural fixtures ship alongside cases.
|
|
18
18
|
*/
|
|
19
19
|
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
|
|
20
|
+
/**
|
|
21
|
+
* Resolve an entry from `extraFixtures` to an absolute filesystem path,
|
|
22
|
+
* relative to the case's stage directory (same convention as `fixture`).
|
|
23
|
+
*/
|
|
24
|
+
export declare function extraFixturePath(projectRoot: string, caseEntry: EvalCase, label: string): string | undefined;
|
|
25
|
+
/**
|
|
26
|
+
* Read every declared extra fixture for a case into a `{ label → text }`
|
|
27
|
+
* map. Missing files throw so authoring mistakes surface immediately rather
|
|
28
|
+
* than being silently skipped by cross-artifact verifiers.
|
|
29
|
+
*/
|
|
30
|
+
export declare function readExtraFixtures(projectRoot: string, caseEntry: EvalCase): Promise<Record<string, string>>;
|
package/dist/eval/corpus.js
CHANGED
|
@@ -58,6 +58,128 @@ function parseStructural(filePath, raw) {
|
|
|
58
58
|
structural.maxChars = maxChars;
|
|
59
59
|
return structural;
|
|
60
60
|
}
|
|
61
|
+
/**
 * Normalise a single regex rule entry into `{ pattern, flags?, description? }`.
 *
 * Two input forms are accepted:
 *   - a bare string, which is used as the pattern;
 *   - a mapping with a required non-empty "pattern" string and optional
 *     "flags" / "description" strings.
 *
 * The mapping form rejects an empty pattern, so the string shorthand is held
 * to the same rule instead of letting `""` through unchecked.
 *
 * @param {string} filePath - forwarded to `corpusError` with every failure.
 * @param {string} context - label quoted at the start of each error message.
 * @param {unknown} value - the raw entry to validate.
 * @returns {{ pattern: string, flags?: string, description?: string }}
 * @throws the error built by `corpusError` when the entry is malformed.
 */
function parseRegexRule(filePath, context, value) {
    if (typeof value === "string") {
        // Shorthand form: the string itself is the pattern.
        if (value.length === 0) {
            throw corpusError(filePath, `"${context}" pattern must be a non-empty string`);
        }
        return { pattern: value };
    }
    if (!isRecord(value)) {
        throw corpusError(filePath, `"${context}" entries must be either a string or a mapping with "pattern"`);
    }
    const { pattern, flags, description } = value;
    if (typeof pattern !== "string" || pattern.length === 0) {
        throw corpusError(filePath, `"${context}" mapping entry must include a non-empty "pattern" string`);
    }
    if (flags !== undefined && typeof flags !== "string") {
        throw corpusError(filePath, `"${context}" flags must be a string`);
    }
    if (description !== undefined && typeof description !== "string") {
        throw corpusError(filePath, `"${context}" description must be a string`);
    }
    // Optional keys are only copied when present so the result never carries
    // explicit `undefined` properties.
    const rule = { pattern };
    if (flags !== undefined) {
        rule.flags = flags;
    }
    if (description !== undefined) {
        rule.description = description;
    }
    return rule;
}
|
|
87
|
+
/**
 * Validate a list of regex rule entries.
 *
 * `undefined` passes straight through. Anything that is not an array is
 * rejected. Each element is handed to `parseRegexRule` with an indexed
 * context (`<context>[<index>]`) so error messages name the offending entry.
 *
 * @param {string} filePath - forwarded to `corpusError` / `parseRegexRule`.
 * @param {string} context - label for the list, used in error messages.
 * @param {unknown} value - the raw list, or `undefined` when absent.
 * @returns {Array<object> | undefined} the parsed rules, or `undefined`.
 */
function parseRegexRules(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    if (Array.isArray(value)) {
        const toRule = (entry, index) => {
            const entryContext = `${context}[${index}]`;
            return parseRegexRule(filePath, entryContext, entry);
        };
        return value.map(toRule);
    }
    throw corpusError(filePath, `"${context}" must be an array`);
}
|
|
95
|
+
/**
 * Validate a `phrase → count` mapping of occurrence bounds.
 *
 * `undefined` passes straight through. Every count must be a non-negative
 * integer. The result is a fresh plain object with the same entries in the
 * same order.
 *
 * @param {string} filePath - forwarded to `corpusError` with every failure.
 * @param {string} context - label quoted in error messages.
 * @param {unknown} value - the raw mapping, or `undefined` when absent.
 * @returns {Record<string, number> | undefined}
 * @throws the error built by `corpusError` when the mapping is malformed.
 */
function parseOccurrenceBounds(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    if (!isRecord(value)) {
        throw corpusError(filePath, `"${context}" must be a mapping of phrase → integer`);
    }
    const entries = Object.entries(value);
    for (const [phrase, count] of entries) {
        // Number.isInteger is already false for non-numbers, NaN and ±Infinity,
        // so no separate typeof / isFinite checks are needed.
        if (!Number.isInteger(count) || count < 0) {
            throw corpusError(filePath, `"${context}.${phrase}" must be a non-negative integer`);
        }
    }
    // Object.fromEntries defines own data properties. Plain assignment
    // (`out[phrase] = count`) would silently discard a phrase named
    // "__proto__", because assigning a number through that setter is a no-op.
    return Object.fromEntries(entries);
}
|
|
110
|
+
/**
 * Parse the `expected.rules` mapping of a corpus case.
 *
 * Each supported rule family is read under its snake_case key first and its
 * camelCase key second, validated by the matching helper, and copied into
 * the result under the camelCase name when the helper returns a truthy
 * value.
 *
 * @param {string} filePath - forwarded to the helpers for error reporting.
 * @param {unknown} raw - the raw `expected.rules` value, or `undefined`.
 * @returns {object | undefined} the populated rules, or `undefined` when
 *   `raw` is `undefined` or no family produced a value.
 * @throws the error built by `corpusError` when `raw` is not a mapping, plus
 *   anything the per-family helpers throw.
 */
function parseRules(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.rules" must be a mapping`);
    }
    // Declaration order fixes both the validation order and the key order of
    // the returned object.
    const families = [
        { key: "mustContain", alias: "must_contain", context: "expected.rules.must_contain", parse: readStringArray },
        { key: "mustNotContain", alias: "must_not_contain", context: "expected.rules.must_not_contain", parse: readStringArray },
        { key: "regexRequired", alias: "regex_required", context: "expected.rules.regex_required", parse: parseRegexRules },
        { key: "regexForbidden", alias: "regex_forbidden", context: "expected.rules.regex_forbidden", parse: parseRegexRules },
        { key: "minOccurrences", alias: "min_occurrences", context: "expected.rules.min_occurrences", parse: parseOccurrenceBounds },
        { key: "maxOccurrences", alias: "max_occurrences", context: "expected.rules.max_occurrences", parse: parseOccurrenceBounds },
        { key: "uniqueBulletsInSection", alias: "unique_bullets_in_section", context: "expected.rules.unique_bullets_in_section", parse: readStringArray }
    ];
    const rules = {};
    for (const { key, alias, context, parse } of families) {
        const parsed = parse(filePath, context, raw[alias] ?? raw[key]);
        if (parsed) {
            rules[key] = parsed;
        }
    }
    return Object.keys(rules).length > 0 ? rules : undefined;
}
|
|
140
|
+
/**
 * Parse the `expected.traceability` mapping of a corpus case.
 *
 * Required: a non-empty `id_pattern` (or `idPattern`), a non-empty `source`,
 * and a non-empty `require_in` (or `requireIn`) list. Optional: `id_flags`
 * (or `idFlags`), which must be a string when present. snake_case keys win
 * over their camelCase aliases.
 *
 * @param {string} filePath - forwarded to `corpusError` / `readStringArray`.
 * @param {unknown} raw - the raw `expected.traceability` value, or `undefined`.
 * @returns {{ idPattern: string, source: string, requireIn: string[], idFlags?: string } | undefined}
 * @throws the error built by `corpusError` on the first failed check.
 */
function parseTraceability(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.traceability" must be a mapping`);
    }
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;

    const idPattern = raw.id_pattern ?? raw.idPattern;
    if (!isNonEmptyString(idPattern)) {
        throw corpusError(filePath, `"expected.traceability.id_pattern" must be a non-empty regex source`);
    }

    const idFlags = raw.id_flags ?? raw.idFlags;
    const idFlagsAcceptable = idFlags === undefined || typeof idFlags === "string";
    if (!idFlagsAcceptable) {
        throw corpusError(filePath, `"expected.traceability.id_flags" must be a string`);
    }

    const { source } = raw;
    if (!isNonEmptyString(source)) {
        throw corpusError(filePath, `"expected.traceability.source" must be "self" or an extra_fixtures label`);
    }

    const requireIn = readStringArray(filePath, "expected.traceability.require_in", raw.require_in ?? raw.requireIn);
    if (!requireIn || requireIn.length === 0) {
        throw corpusError(filePath, `"expected.traceability.require_in" must be a non-empty array`);
    }

    // idFlags is appended last, and only when it was actually supplied.
    return idFlags === undefined
        ? { idPattern, source, requireIn }
        : { idPattern, source, requireIn, idFlags };
}
|
|
168
|
+
/**
 * Validate the `extra_fixtures` mapping (`label → path`) of a corpus case.
 *
 * Every value must be a non-empty string. An absent or empty mapping yields
 * `undefined`.
 *
 * @param {string} filePath - forwarded to `corpusError` with every failure.
 * @param {unknown} raw - the raw mapping, or `undefined` when absent.
 * @returns {Record<string, string> | undefined}
 * @throws the error built by `corpusError` when the mapping is malformed.
 */
function parseExtraFixtures(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"extra_fixtures" must be a mapping of label → path`);
    }
    const fixtures = {};
    Object.entries(raw).forEach(([label, value]) => {
        const usable = typeof value === "string" && value.length > 0;
        if (!usable) {
            throw corpusError(filePath, `"extra_fixtures.${label}" must be a non-empty path string`);
        }
        fixtures[label] = value;
    });
    return Object.keys(fixtures).length > 0 ? fixtures : undefined;
}
|
|
61
183
|
function parseExpected(filePath, raw) {
|
|
62
184
|
if (raw === undefined)
|
|
63
185
|
return undefined;
|
|
@@ -68,12 +190,12 @@ function parseExpected(filePath, raw) {
|
|
|
68
190
|
const structural = parseStructural(filePath, raw.structural);
|
|
69
191
|
if (structural)
|
|
70
192
|
shape.structural = structural;
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
193
|
+
const rules = parseRules(filePath, raw.rules);
|
|
194
|
+
if (rules)
|
|
195
|
+
shape.rules = rules;
|
|
196
|
+
const traceability = parseTraceability(filePath, raw.traceability);
|
|
197
|
+
if (traceability)
|
|
198
|
+
shape.traceability = traceability;
|
|
77
199
|
if (raw.judge !== undefined) {
|
|
78
200
|
if (!isRecord(raw.judge)) {
|
|
79
201
|
throw corpusError(filePath, `"expected.judge" must be a mapping`);
|
|
@@ -101,13 +223,15 @@ function validateCase(filePath, raw) {
|
|
|
101
223
|
const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
|
|
102
224
|
const expected = parseExpected(filePath, raw.expected);
|
|
103
225
|
const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
|
|
226
|
+
const extraFixtures = parseExtraFixtures(filePath, raw.extra_fixtures ?? raw.extraFixtures);
|
|
104
227
|
return {
|
|
105
228
|
id: id.trim(),
|
|
106
229
|
stage: stageRaw,
|
|
107
230
|
inputPrompt: inputPrompt.trim(),
|
|
108
231
|
contextFiles,
|
|
109
232
|
expected,
|
|
110
|
-
fixture
|
|
233
|
+
fixture,
|
|
234
|
+
extraFixtures
|
|
111
235
|
};
|
|
112
236
|
}
|
|
113
237
|
/**
|
|
@@ -162,7 +286,7 @@ export function fixturePathFor(projectRoot, caseEntry) {
|
|
|
162
286
|
/**
|
|
163
287
|
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
164
288
|
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
165
|
-
* the case but not on disk —
|
|
289
|
+
* the case but not on disk — structural fixtures ship alongside cases.
|
|
166
290
|
*/
|
|
167
291
|
export async function readFixtureArtifact(projectRoot, caseEntry) {
|
|
168
292
|
const fixturePath = fixturePathFor(projectRoot, caseEntry);
|
|
@@ -173,3 +297,34 @@ export async function readFixtureArtifact(projectRoot, caseEntry) {
|
|
|
173
297
|
}
|
|
174
298
|
return fs.readFile(fixturePath, "utf8");
|
|
175
299
|
}
|
|
300
|
+
/**
 * Turn one `extraFixtures` label into an absolute path.
 *
 * The stored value is resolved under
 * `<projectRoot>/<EVALS_ROOT>/corpus/<stage>/`. A label with no entry, or
 * with a falsy entry, yields `undefined`.
 *
 * @param {string} projectRoot - base directory for resolution.
 * @param {object} caseEntry - case supplying `stage` and `extraFixtures`.
 * @param {string} label - key to look up in `caseEntry.extraFixtures`.
 * @returns {string | undefined}
 */
export function extraFixturePath(projectRoot, caseEntry, label) {
    const relativePath = caseEntry.extraFixtures?.[label];
    return relativePath
        ? path.resolve(projectRoot, EVALS_ROOT, "corpus", caseEntry.stage, relativePath)
        : undefined;
}
|
|
310
|
+
/**
 * Load the text of every extra fixture declared on a case.
 *
 * Labels are processed one at a time in `Object.keys` order. A label whose
 * path cannot be resolved is skipped. A resolved path that does not exist
 * aborts the whole read with an error naming the case, the label and the
 * path, so a bad reference is reported instead of being quietly left out.
 *
 * @param {string} projectRoot - forwarded to `extraFixturePath`.
 * @param {object} caseEntry - case supplying `stage`, `id` and `extraFixtures`.
 * @returns {Promise<Record<string, string>>} `{ label → file text }`; empty
 *   when the case declares no extra fixtures.
 */
export async function readExtraFixtures(projectRoot, caseEntry) {
    const texts = {};
    const labels = caseEntry.extraFixtures ? Object.keys(caseEntry.extraFixtures) : [];
    for (const label of labels) {
        const filePath = extraFixturePath(projectRoot, caseEntry, label);
        if (filePath) {
            const present = await exists(filePath);
            if (!present) {
                throw new Error(`Extra fixture missing for ${caseEntry.stage}/${caseEntry.id} (label="${label}"): ${filePath}`);
            }
            texts[label] = await fs.readFile(filePath, "utf8");
        }
    }
    return texts;
}
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* runtime dependency. The real implementation
|
|
4
|
+
* This module declares the shape of the client without pulling in the
|
|
5
|
+
* `openai` runtime dependency. The real implementation lands when
|
|
6
6
|
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users
|
|
8
|
-
*
|
|
7
|
+
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
+
* install an extra dependency or receive network egress warnings.
|
|
9
9
|
*/
|
|
10
10
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
11
|
/**
|
|
12
12
|
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
13
|
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
*
|
|
14
|
+
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
15
|
*/
|
|
16
16
|
export interface ChatMessage {
|
|
17
17
|
role: "system" | "user" | "assistant" | "tool";
|
|
@@ -26,8 +26,8 @@ export interface ChatRequest {
|
|
|
26
26
|
temperature?: number;
|
|
27
27
|
timeoutMs?: number;
|
|
28
28
|
/**
|
|
29
|
-
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
-
*
|
|
29
|
+
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
+
* by Tier B. Ignored by the Tier A single-shot path.
|
|
31
31
|
*/
|
|
32
32
|
tools?: unknown[];
|
|
33
33
|
toolChoice?: "auto" | "none";
|
|
@@ -52,11 +52,11 @@ export interface EvalLlmClient {
|
|
|
52
52
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
53
|
}
|
|
54
54
|
export declare class EvalLlmNotWiredError extends Error {
|
|
55
|
-
constructor(
|
|
55
|
+
constructor();
|
|
56
56
|
}
|
|
57
57
|
/**
|
|
58
|
-
* Factory stub. Throws with a clear message so accidental
|
|
59
|
-
* easy to diagnose. The
|
|
58
|
+
* Factory stub. Throws with a clear message so accidental early usage is
|
|
59
|
+
* easy to diagnose. The real implementation will replace this body with
|
|
60
60
|
* `new OpenAI({ apiKey, baseURL }) ... adapter`.
|
|
61
61
|
*/
|
|
62
62
|
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
/**
 * Error raised when code tries to use the eval LLM client before a real
 * implementation exists. The message points at the offline eval modes.
 */
export class EvalLlmNotWiredError extends Error {
    constructor() {
        const messageLines = [
            "LLM client is not wired yet.",
            "Run `cclaw eval --dry-run` or `cclaw eval --schema-only` for offline evals."
        ];
        super(messageLines.join("\n"));
        this.name = "EvalLlmNotWiredError";
    }
}
|
|
8
8
|
/**
 * Placeholder factory for the eval LLM client.
 *
 * The returned client's `chat()` always rejects with `EvalLlmNotWiredError`,
 * so premature use fails with an explicit message. `_config` is accepted
 * but not read by this stub.
 *
 * @param {object} _config - resolved eval configuration (currently unused).
 * @returns {{ chat: () => Promise<never> }}
 */
export function createEvalClient(_config) {
    const chat = async () => {
        throw new EvalLlmNotWiredError();
    };
    return { chat };
}
|
package/dist/eval/report.js
CHANGED
|
@@ -62,7 +62,7 @@ export function formatMarkdownReport(report) {
|
|
|
62
62
|
if (report.cases.length === 0) {
|
|
63
63
|
lines.push(`## Cases`);
|
|
64
64
|
lines.push(``);
|
|
65
|
-
lines.push(`No cases were executed. See \`docs/evals.md\` for the
|
|
65
|
+
lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
|
|
66
66
|
lines.push(``);
|
|
67
67
|
return `${lines.join("\n")}\n`;
|
|
68
68
|
}
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -4,11 +4,11 @@ export interface RunEvalOptions {
|
|
|
4
4
|
projectRoot: string;
|
|
5
5
|
stage?: FlowStage;
|
|
6
6
|
tier?: EvalTier;
|
|
7
|
-
/** When true, run only structural verifiers (
|
|
7
|
+
/** When true, run only structural verifiers (Step 1). */
|
|
8
8
|
schemaOnly?: boolean;
|
|
9
|
-
/** When true, run structural + rule-based verifiers.
|
|
9
|
+
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
10
10
|
rules?: boolean;
|
|
11
|
-
/** When true, also run LLM judge verifiers.
|
|
11
|
+
/** When true, also run LLM judge verifiers. Step 3 wires judging. */
|
|
12
12
|
judge?: boolean;
|
|
13
13
|
/** When true, load config + corpus and return a summary without running any verifier. */
|
|
14
14
|
dryRun?: boolean;
|
|
@@ -36,10 +36,10 @@ export interface DryRunSummary {
|
|
|
36
36
|
notes: string[];
|
|
37
37
|
}
|
|
38
38
|
/**
|
|
39
|
-
*
|
|
39
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
40
40
|
* active), runs structural verifiers against fixture-backed cases and loads
|
|
41
41
|
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
42
|
-
*
|
|
43
|
-
*
|
|
42
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
43
|
+
* skipped rather than failing.
|
|
44
44
|
*/
|
|
45
45
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -2,9 +2,11 @@ import { randomUUID } from "node:crypto";
|
|
|
2
2
|
import { CCLAW_VERSION } from "../constants.js";
|
|
3
3
|
import { FLOW_STAGES } from "../types.js";
|
|
4
4
|
import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
|
|
5
|
-
import { loadCorpus, readFixtureArtifact } from "./corpus.js";
|
|
5
|
+
import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
|
|
6
6
|
import { loadEvalConfig } from "./config-loader.js";
|
|
7
|
+
import { verifyRules } from "./verifiers/rules.js";
|
|
7
8
|
import { verifyStructural } from "./verifiers/structural.js";
|
|
9
|
+
import { verifyTraceability } from "./verifiers/traceability.js";
|
|
8
10
|
function groupByStage(cases) {
|
|
9
11
|
return cases.reduce((acc, item) => {
|
|
10
12
|
acc[item.stage] = (acc[item.stage] ?? 0) + 1;
|
|
@@ -14,40 +16,72 @@ function groupByStage(cases) {
|
|
|
14
16
|
/**
 * Build a passing placeholder verifier result (`ok: true`, `score: 1`) of
 * kind "structural" with the id "structural:no-expectations".
 *
 * @param {string} message - text carried on the result.
 * @param {object} [details] - attached as `details` only when not `undefined`.
 * @returns {object} the verifier result.
 */
function skeletonVerifierResult(message, details) {
    const result = {
        kind: "structural",
        id: "structural:no-expectations",
        ok: true,
        score: 1,
        message
    };
    if (details !== undefined) {
        result.details = details;
    }
    return result;
}
|
|
24
|
-
|
|
26
|
+
/**
 * Decide which verifier families run for this invocation.
 *
 * Structural verification is always on. Rules and traceability switch on
 * together, and only when `options.rules` is exactly `true` while
 * `options.schemaOnly` is not exactly `true`. With neither flag set the
 * result is therefore the same as for schema-only.
 *
 * @param {{ rules?: boolean, schemaOnly?: boolean }} options
 * @returns {{ runStructural: boolean, runRules: boolean, runTraceability: boolean }}
 */
function resolveRunFlags(options) {
    const ruleFamilyEnabled = options.rules === true && options.schemaOnly !== true;
    return {
        runStructural: true,
        runRules: ruleFamilyEnabled,
        runTraceability: ruleFamilyEnabled
    };
}
|
|
41
|
+
/**
 * Read a case's fixture artifact, converting a read failure into a failed
 * verifier result instead of letting it propagate.
 *
 * On failure a "structural:fixture:missing" entry (`ok: false`, `score: 0`)
 * carrying the error message and the case's `fixture` value is appended to
 * `verifierResults`, and `undefined` is returned.
 *
 * @param {string} projectRoot - forwarded to `readFixtureArtifact`.
 * @param {object} caseEntry - the case whose fixture should be read.
 * @param {object[]} verifierResults - mutated: receives the failure entry.
 * @returns {Promise<string | undefined>} whatever `readFixtureArtifact`
 *   resolves to, or `undefined` after a recorded failure.
 */
async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
    let artifact;
    try {
        artifact = await readFixtureArtifact(projectRoot, caseEntry);
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        const failure = {
            kind: "structural",
            id: "structural:fixture:missing",
            ok: false,
            score: 0,
            message,
            details: { fixture: caseEntry.fixture }
        };
        verifierResults.push(failure);
    }
    return artifact;
}
|
|
57
|
+
async function runCase(projectRoot, caseEntry, plannedTier, flags) {
|
|
25
58
|
const started = Date.now();
|
|
26
|
-
const structuralExpected = caseEntry.expected?.structural;
|
|
27
59
|
const verifierResults = [];
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
artifact = await readFixtureArtifact(projectRoot, caseEntry);
|
|
38
|
-
}
|
|
39
|
-
catch (err) {
|
|
60
|
+
const expected = caseEntry.expected;
|
|
61
|
+
const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
|
|
62
|
+
const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
|
|
63
|
+
const hasTraceability = flags.runTraceability && !!expected?.traceability;
|
|
64
|
+
const needsArtifact = hasStructural || hasRules || hasTraceability;
|
|
65
|
+
let artifact;
|
|
66
|
+
if (needsArtifact) {
|
|
67
|
+
artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
|
|
68
|
+
if (artifact === undefined && verifierResults.length === 0) {
|
|
40
69
|
verifierResults.push({
|
|
41
70
|
kind: "structural",
|
|
42
|
-
id: "structural:fixture:
|
|
71
|
+
id: "structural:fixture:absent",
|
|
43
72
|
ok: false,
|
|
44
73
|
score: 0,
|
|
45
|
-
message:
|
|
46
|
-
details: {
|
|
74
|
+
message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
|
|
75
|
+
details: { fixtureProvided: false }
|
|
47
76
|
});
|
|
48
77
|
}
|
|
49
|
-
|
|
50
|
-
|
|
78
|
+
}
|
|
79
|
+
if (flags.runStructural) {
|
|
80
|
+
if (!hasStructural) {
|
|
81
|
+
verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
|
|
82
|
+
}
|
|
83
|
+
else if (artifact !== undefined) {
|
|
84
|
+
const results = verifyStructural(artifact, expected.structural);
|
|
51
85
|
if (results.length === 0) {
|
|
52
86
|
verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
|
|
53
87
|
}
|
|
@@ -55,18 +89,32 @@ async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
|
|
|
55
89
|
verifierResults.push(...results);
|
|
56
90
|
}
|
|
57
91
|
}
|
|
58
|
-
|
|
92
|
+
}
|
|
93
|
+
if (hasRules && artifact !== undefined) {
|
|
94
|
+
const results = verifyRules(artifact, expected.rules);
|
|
95
|
+
verifierResults.push(...results);
|
|
96
|
+
}
|
|
97
|
+
if (hasTraceability && artifact !== undefined) {
|
|
98
|
+
try {
|
|
99
|
+
const extras = await readExtraFixtures(projectRoot, caseEntry);
|
|
100
|
+
const results = verifyTraceability(artifact, extras, expected.traceability);
|
|
101
|
+
verifierResults.push(...results);
|
|
102
|
+
}
|
|
103
|
+
catch (err) {
|
|
59
104
|
verifierResults.push({
|
|
60
|
-
kind: "
|
|
61
|
-
id: "
|
|
105
|
+
kind: "rules",
|
|
106
|
+
id: "traceability:fixture:missing",
|
|
62
107
|
ok: false,
|
|
63
108
|
score: 0,
|
|
64
|
-
message:
|
|
65
|
-
details: {
|
|
109
|
+
message: err instanceof Error ? err.message : String(err),
|
|
110
|
+
details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
|
|
66
111
|
});
|
|
67
112
|
}
|
|
68
113
|
}
|
|
69
|
-
const
|
|
114
|
+
const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
|
|
115
|
+
const allOk = nonSkippedResults.length === 0
|
|
116
|
+
? verifierResults.every((r) => r.ok)
|
|
117
|
+
: nonSkippedResults.every((r) => r.ok);
|
|
70
118
|
return {
|
|
71
119
|
caseId: caseEntry.id,
|
|
72
120
|
stage: caseEntry.stage,
|
|
@@ -111,11 +159,11 @@ function stagesInResults(caseResults) {
|
|
|
111
159
|
return FLOW_STAGES.filter((s) => set.has(s));
|
|
112
160
|
}
|
|
113
161
|
/**
|
|
114
|
-
*
|
|
162
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
115
163
|
* active), runs structural verifiers against fixture-backed cases and loads
|
|
116
164
|
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
117
|
-
*
|
|
118
|
-
*
|
|
165
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
166
|
+
* skipped rather than failing.
|
|
119
167
|
*/
|
|
120
168
|
export async function runEval(options) {
|
|
121
169
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
@@ -125,12 +173,10 @@ export async function runEval(options) {
|
|
|
125
173
|
if (corpus.length === 0) {
|
|
126
174
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
127
175
|
}
|
|
128
|
-
if (options.rules) {
|
|
129
|
-
notes.push("--rules is accepted; rule verifiers wire up in Wave 7.2.");
|
|
130
|
-
}
|
|
131
176
|
if (options.judge) {
|
|
132
|
-
notes.push("--judge is accepted; LLM judging
|
|
177
|
+
notes.push("--judge is accepted; LLM judging is not wired yet.");
|
|
133
178
|
}
|
|
179
|
+
const flags = resolveRunFlags(options);
|
|
134
180
|
if (options.dryRun === true) {
|
|
135
181
|
const summary = {
|
|
136
182
|
kind: "dry-run",
|
|
@@ -142,8 +188,8 @@ export async function runEval(options) {
|
|
|
142
188
|
},
|
|
143
189
|
plannedTier,
|
|
144
190
|
verifiersAvailable: {
|
|
145
|
-
structural:
|
|
146
|
-
rules:
|
|
191
|
+
structural: flags.runStructural,
|
|
192
|
+
rules: flags.runRules,
|
|
147
193
|
judge: false,
|
|
148
194
|
workflow: false
|
|
149
195
|
},
|
|
@@ -154,7 +200,7 @@ export async function runEval(options) {
|
|
|
154
200
|
const now = new Date().toISOString();
|
|
155
201
|
const caseResults = [];
|
|
156
202
|
for (const item of corpus) {
|
|
157
|
-
caseResults.push(await
|
|
203
|
+
caseResults.push(await runCase(options.projectRoot, item, plannedTier, flags));
|
|
158
204
|
}
|
|
159
205
|
const stages = stagesInResults(caseResults);
|
|
160
206
|
const baselines = await loadBaselinesByStage(options.projectRoot, stages);
|