cclaw-cli 0.22.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +42 -11
- package/dist/constants.d.ts +4 -4
- package/dist/constants.js +4 -4
- package/dist/content/eval-scaffold.d.ts +4 -4
- package/dist/content/eval-scaffold.js +13 -14
- package/dist/content/examples.js +11 -11
- package/dist/content/hooks.js +1 -1
- package/dist/content/skills.d.ts +3 -3
- package/dist/content/skills.js +19 -19
- package/dist/content/stage-schema.js +2 -2
- package/dist/content/stages/plan.js +18 -18
- package/dist/content/stages/schema-types.d.ts +2 -2
- package/dist/content/stages/tdd.js +1 -1
- package/dist/content/subagents.js +1 -1
- package/dist/content/templates.js +8 -8
- package/dist/content/utility-skills.js +19 -19
- package/dist/doctor.js +2 -2
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/corpus.d.ts +13 -2
- package/dist/eval/corpus.js +97 -13
- package/dist/eval/llm-client.d.ts +10 -10
- package/dist/eval/llm-client.js +5 -5
- package/dist/eval/report.js +17 -4
- package/dist/eval/runner.d.ts +8 -16
- package/dist/eval/runner.js +124 -42
- package/dist/eval/types.d.ts +94 -14
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/dist/install.js +3 -3
- package/dist/policy.js +1 -1
- package/package.json +1 -1
|
@@ -309,23 +309,23 @@ inputs_hash: sha256:pending
|
|
|
309
309
|
## Dependency Graph
|
|
310
310
|
-
|
|
311
311
|
|
|
312
|
-
## Dependency
|
|
312
|
+
## Dependency Batches
|
|
313
313
|
|
|
314
|
-
###
|
|
314
|
+
### Batch 1 (foundation)
|
|
315
315
|
- Task IDs:
|
|
316
316
|
- Verification gate:
|
|
317
317
|
|
|
318
|
-
###
|
|
318
|
+
### Batch 2 (dependent)
|
|
319
319
|
- Task IDs:
|
|
320
320
|
- Depends on:
|
|
321
321
|
- Verification gate:
|
|
322
322
|
|
|
323
|
-
###
|
|
323
|
+
### Batch 3 (integration)
|
|
324
324
|
- Task IDs:
|
|
325
325
|
- Depends on:
|
|
326
326
|
- Verification gate:
|
|
327
327
|
|
|
328
|
-
Execution rule: complete and verify each wave before starting the next wave.
|
|
328
|
+
Execution rule: complete and verify each batch before starting the next batch.
|
|
329
329
|
|
|
330
330
|
## Task List
|
|
331
331
|
|
|
@@ -333,7 +333,7 @@ Execution rule: complete and verify each wave before starting the next wave.
|
|
|
333
333
|
- Every task fits the **2-5 minute budget**. If \`[~Nm]\` is >5, split the task.
|
|
334
334
|
- **No placeholders.** Forbidden tokens anywhere in this table: \`TODO\`, \`TBD\`, \`FIXME\`, \`<fill-in>\`, \`<your-*-here>\`, \`xxx\`, bare ellipsis. Every file path, test, and verification command must be copy-pasteable as written.
|
|
335
335
|
- **No silent scope reduction.** Forbidden phrasing when locked decisions exist: \`v1\`, \`for now\`, \`later\`, \`temporary\`, \`placeholder\`, \`mock for now\`, \`hardcoded for now\`, \`will improve later\`.
|
|
336
|
-
- If an estimate is genuinely uncertain (new library, unfamiliar subsystem), add a **spike task in
|
|
336
|
+
- If an estimate is genuinely uncertain (new library, unfamiliar subsystem), add a **spike task in batch 0** to de-risk — do NOT hide the uncertainty inside a large estimate.
|
|
337
337
|
|
|
338
338
|
| Task ID | Description | Acceptance criterion | Verification command | Effort (S/M/L) | Minutes |
|
|
339
339
|
|---|---|---|---|---|---|
|
|
@@ -350,12 +350,12 @@ Execution rule: complete and verify each wave before starting the next wave.
|
|
|
350
350
|
| D-01 | 02-scope.md > Locked Decisions | T-1 | covered |
|
|
351
351
|
|
|
352
352
|
## Risk Assessment
|
|
353
|
-
| Task/
|
|
353
|
+
| Task/Batch | Risk | Likelihood | Impact | Mitigation |
|
|
354
354
|
|---|---|---|---|---|
|
|
355
355
|
| | | | | |
|
|
356
356
|
|
|
357
357
|
## Boundary Map
|
|
358
|
-
| Task/
|
|
358
|
+
| Task/Batch | Produces (exports) | Consumes (imports from) |
|
|
359
359
|
|---|---|---|
|
|
360
360
|
| | | |
|
|
361
361
|
|
|
@@ -482,7 +482,7 @@ description: "Execute approved plans with disciplined batching, explicit checkpo
|
|
|
482
482
|
## Quick Start
|
|
483
483
|
|
|
484
484
|
> 1. Confirm the plan and stage gates are approved before execution.
|
|
485
|
-
> 2. Execute in batches
|
|
485
|
+
> 2. Execute in batches, not as one giant untracked stream.
|
|
486
486
|
> 3. Stop at checkpoint boundaries for verification and user visibility.
|
|
487
487
|
|
|
488
488
|
## HARD-GATE
|
|
@@ -492,47 +492,47 @@ Do not start implementation execution without an approved plan artifact and expl
|
|
|
492
492
|
## Execution Protocol
|
|
493
493
|
|
|
494
494
|
1. **Load plan source of truth** from \`.cclaw/artifacts/05-plan.md\` (canonical run copy when available).
|
|
495
|
-
2. **Group tasks into
|
|
496
|
-
3. **Run one
|
|
497
|
-
4. **Checkpoint each
|
|
495
|
+
2. **Group tasks into batches** by dependency order and risk.
|
|
496
|
+
3. **Run one batch at a time** with evidence after each task (tests, build, lint, or review evidence as applicable).
|
|
497
|
+
4. **Checkpoint each batch** by updating stage artifact evidence and unresolved blockers.
|
|
498
498
|
5. **Stop immediately** on any hard blocker, failing gate, or unresolved critical finding.
|
|
499
499
|
|
|
500
|
-
##
|
|
500
|
+
## Batch Checklist
|
|
501
501
|
|
|
502
|
-
-
|
|
502
|
+
- Batch scope is explicit (task IDs + expected outputs).
|
|
503
503
|
- Verification command for each task is predetermined.
|
|
504
504
|
- Machine-only checks are delegated to subagents when supported.
|
|
505
505
|
- User approvals are requested only at required gate boundaries.
|
|
506
506
|
|
|
507
|
-
## Fresh Context Protocol (between
|
|
507
|
+
## Fresh Context Protocol (between batches)
|
|
508
508
|
|
|
509
|
-
After a
|
|
510
|
-
the #1 cause of degraded execution quality. Before starting the **next
|
|
509
|
+
After a batch completes — especially after long agent turns — context drift is
|
|
510
|
+
the #1 cause of degraded execution quality. Before starting the **next batch**,
|
|
511
511
|
prefer a **fresh agent context** over continuing in a saturated session:
|
|
512
512
|
|
|
513
|
-
1. **Snapshot
|
|
514
|
-
(\`###
|
|
513
|
+
1. **Snapshot batch outcome** — append a short summary to the plan artifact
|
|
514
|
+
(\`### Batch <N> outcome\` with: tasks done, evidence files, blockers, next-batch inputs).
|
|
515
515
|
2. **Capture handoff facts** — the minimum information the next agent needs:
|
|
516
516
|
- Stage and run id (from \`.cclaw/state/flow-state.json\`)
|
|
517
517
|
- List of completed task IDs from the plan
|
|
518
518
|
- Open blockers / failing gates by name
|
|
519
|
-
- File paths the next
|
|
519
|
+
- File paths the next batch will touch (no full diffs)
|
|
520
520
|
3. **Decide: continue or rotate**
|
|
521
|
-
- **Rotate** (start a new agent session) when: prior
|
|
522
|
-
- **Continue** when: next
|
|
521
|
+
- **Rotate** (start a new agent session) when: prior batch consumed > ~50% of the context budget, the prior batch required deep investigation that the next batch does not need, or you are about to cross a stage boundary.
|
|
522
|
+
- **Continue** when: next batch is a tiny follow-up (≤ 1 task) and the prior context is directly relevant.
|
|
523
523
|
4. **Resume** in the new session via \`/cc-next\` — the session-start hook will restore flow state, checkpoint, and digest automatically.
|
|
524
524
|
|
|
525
|
-
This is the same intuition as Compound Engineering's "fresh context per iteration": every
|
|
525
|
+
This is the same intuition as Compound Engineering's "fresh context per iteration": every batch starts with a clean, intentionally-loaded context, not a degraded carry-over.
|
|
526
526
|
|
|
527
527
|
### Handoff template (paste into next session)
|
|
528
528
|
|
|
529
529
|
\`\`\`markdown
|
|
530
|
-
##
|
|
530
|
+
## Batch <N> handoff
|
|
531
531
|
- Stage: <stage>
|
|
532
532
|
- Run: <runId>
|
|
533
533
|
- Completed task IDs: <list>
|
|
534
534
|
- Blockers: <list or none>
|
|
535
|
-
- Files next
|
|
535
|
+
- Files next batch will touch: <list>
|
|
536
536
|
- Verification command(s) used: <list>
|
|
537
537
|
\`\`\`
|
|
538
538
|
|
|
@@ -542,7 +542,7 @@ This is the same intuition as Compound Engineering's "fresh context per iteratio
|
|
|
542
542
|
- Marking tasks done without command evidence.
|
|
543
543
|
- Reordering critical dependencies for speed.
|
|
544
544
|
- Continuing after a gate failure hoping later tasks fix it.
|
|
545
|
-
- Carrying a saturated context across
|
|
545
|
+
- Carrying a saturated context across batch boundaries because "it has all the history" — saturated context is a liability, not an asset.
|
|
546
546
|
`;
|
|
547
547
|
}
|
|
548
548
|
export function contextEngineeringSkill() {
|
|
@@ -1338,7 +1338,7 @@ For each lens, write either a knowledge entry **or** the explicit string
|
|
|
1338
1338
|
|
|
1339
1339
|
### 2. What slowed us down?
|
|
1340
1340
|
|
|
1341
|
-
- Repeated context loss between
|
|
1341
|
+
- Repeated context loss between batches → \`[compound]\` accelerator.
|
|
1342
1342
|
- Re-derivation of a fact already in upstream artifacts → \`[pattern]\` "re-read X first".
|
|
1343
1343
|
- Tooling friction (slow test loop, flaky CI) → \`[compound]\` follow-up.
|
|
1344
1344
|
|
package/dist/doctor.js
CHANGED
|
@@ -283,8 +283,8 @@ export async function doctorChecks(projectRoot, options = {}) {
|
|
|
283
283
|
const skillContent = await fs.readFile(skillPath, "utf8");
|
|
284
284
|
const lineCount = skillContent.split("\n").length;
|
|
285
285
|
const MIN_SKILL_LINES = 110;
|
|
286
|
-
// Soft max tightened
|
|
287
|
-
//
|
|
286
|
+
// Soft max tightened from 650 → 500 after externalising the TDD
|
|
287
|
+
// batch-execution walkthrough and collapsing the duplicate "what
|
|
288
288
|
// goes wrong" lists. Stage skills beyond 500 lines drift into unread
|
|
289
289
|
// bloat; long-form content belongs under `.cclaw/references/` instead.
|
|
290
290
|
const MAX_SKILL_LINES = 500;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
|
+
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
+
export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
|
|
5
|
+
export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
|
|
6
|
+
export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
|
|
7
|
+
export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Compare a freshly computed report against loaded baselines. If no baseline
|
|
10
|
+
* exists for a stage covered by the report, that stage contributes zero
|
|
11
|
+
* regressions (first run of that stage). Current is the source of truth.
|
|
12
|
+
*/
|
|
13
|
+
export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
|
|
14
|
+
export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline I/O + regression comparison for the eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Layout on disk (committed):
|
|
5
|
+
*
|
|
6
|
+
* .cclaw/evals/baselines/<stage>.json
|
|
7
|
+
*
|
|
8
|
+
* Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
|
|
9
|
+
* regressions by comparing per-verifier `ok` flags across runs: any verifier
|
|
10
|
+
* that was `ok:true` in the baseline and is `ok:false` now counts as a
|
|
11
|
+
* critical failure. A case whose aggregate `passed` flipped from true to
|
|
12
|
+
* false is flagged as `case-now-failing` regardless of per-verifier churn.
|
|
13
|
+
*
|
|
14
|
+
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
|
+
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
|
+
*/
|
|
17
|
+
import fs from "node:fs/promises";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
20
|
+
import { exists } from "../fs-utils.js";
|
|
21
|
+
import { FLOW_STAGES } from "../types.js";
|
|
22
|
+
export const BASELINE_SCHEMA_VERSION = 1;
|
|
23
|
+
/**
 * Build the on-disk location of the baseline snapshot for one stage.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {string} stage - Flow stage name; used as the JSON file's basename.
 * @returns {string} `<projectRoot>/<EVALS_ROOT>/baselines/<stage>.json`.
 */
function baselinePath(projectRoot, stage) {
  const fileName = `${stage}.json`;
  return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
}
|
|
26
|
+
/**
 * Load and validate the committed baseline snapshot for a single stage.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {string} stage - Flow stage whose baseline should be read.
 * @returns {Promise<object | null>} The parsed snapshot, or `null` when no
 *   baseline file exists for the stage.
 * @throws {Error} When the file is not valid JSON or fails the `isBaseline`
 *   shape check; other read errors are rethrown unchanged.
 */
export async function loadBaseline(projectRoot, stage) {
  const filePath = baselinePath(projectRoot, stage);
  if (!(await exists(filePath))) {
    return null;
  }

  let raw;
  try {
    raw = await fs.readFile(filePath, "utf8");
  } catch (err) {
    // The file can disappear between the existence check and the read; treat
    // that the same as "no baseline" rather than surfacing a raw ENOENT.
    if (err?.code === "ENOENT") {
      return null;
    }
    throw err;
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    // Keep the original parse error reachable for debugging.
    throw new Error(`Invalid baseline at ${filePath}: ${detail}`, { cause: err });
  }

  if (!isBaseline(parsed, stage)) {
    throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
  }
  return parsed;
}
|
|
43
|
+
/**
 * Shape check for a parsed baseline file.
 *
 * @param {unknown} value - Result of `JSON.parse` on the baseline file.
 * @param {string} stage - Stage the snapshot is expected to belong to.
 * @returns {boolean} `true` when `value` is an object whose `schemaVersion`
 *   equals `BASELINE_SCHEMA_VERSION`, whose `stage` equals the requested
 *   stage, whose `generatedAt` and `cclawVersion` are strings, and whose
 *   `cases` is a non-array object.
 */
function isBaseline(value, stage) {
  if (!value || typeof value !== "object") {
    return false;
  }
  const { schemaVersion, stage: snapshotStage, generatedAt, cclawVersion, cases } = value;
  return (
    schemaVersion === BASELINE_SCHEMA_VERSION &&
    snapshotStage === stage &&
    typeof generatedAt === "string" &&
    typeof cclawVersion === "string" &&
    typeof cases === "object" &&
    cases !== null &&
    // `cases` is looked up by case id, so an array is never a valid shape
    // even though `typeof [] === "object"`.
    !Array.isArray(cases)
  );
}
|
|
59
|
+
/**
 * Load the baselines for several stages, keyed by stage.
 *
 * Stages are loaded one at a time in the order given, so the first invalid
 * baseline encountered is the one whose error is reported.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {Iterable<string>} stages - Stages to look up.
 * @returns {Promise<Map<string, object>>} Only stages for which
 *   `loadBaseline` returned a snapshot appear in the map.
 */
export async function loadBaselinesByStage(projectRoot, stages) {
  const byStage = new Map();
  for (const stage of stages) {
    const snapshot = await loadBaseline(projectRoot, stage);
    if (!snapshot) {
      continue;
    }
    byStage.set(stage, snapshot);
  }
  return byStage;
}
|
|
68
|
+
/**
 * Reduce a case result to the fields persisted in a baseline entry.
 *
 * @param {{ passed: boolean, verifierResults: Array<object> }} result - One
 *   case result from an eval report.
 * @returns {{ passed: boolean, verifierResults: Array<object> }} The case's
 *   `passed` flag plus, per verifier, `id`, `kind`, `ok` and — only when it is
 *   defined — `score`. All other verifier fields are dropped.
 */
function entryFromResult(result) {
  const verifierResults = result.verifierResults.map(({ id, kind, ok, score }) => {
    const entry = { id, kind, ok };
    if (score !== undefined) {
      entry.score = score;
    }
    return entry;
  });
  return { passed: result.passed, verifierResults };
}
|
|
77
|
+
/**
 * Build a baseline snapshot for one stage from an eval report.
 *
 * @param {string} stage - Stage to snapshot.
 * @param {{ cases: Array<object> }} report - Report whose cases are filtered
 *   by `stage`.
 * @returns {object} Snapshot with the current schema version, the stage, an
 *   ISO timestamp taken at call time, the cclaw version, and a `cases` record
 *   keyed by `caseId`.
 */
export function buildBaselineForStage(stage, report) {
  const cases = {};
  for (const caseResult of report.cases) {
    if (caseResult.stage !== stage) {
      continue;
    }
    cases[caseResult.caseId] = entryFromResult(caseResult);
  }
  return {
    schemaVersion: BASELINE_SCHEMA_VERSION,
    stage,
    generatedAt: new Date().toISOString(),
    cclawVersion: CCLAW_VERSION,
    cases
  };
}
|
|
91
|
+
/**
 * Write one baseline file per stage that appears in the report.
 *
 * Each snapshot is serialised as 2-space-indented JSON followed by a newline.
 * The baselines directory is created on demand, so a report without cases
 * writes nothing and creates nothing.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {{ cases: Array<{ stage: string }> }} report - Source report.
 * @returns {Promise<string[]>} Paths of the files written, sorted.
 */
export async function writeBaselinesFromReport(projectRoot, report) {
  const stagesInReport = new Set(report.cases.map((caseResult) => caseResult.stage));
  const written = [];
  for (const stage of stagesInReport) {
    const file = baselinePath(projectRoot, stage);
    const body = `${JSON.stringify(buildBaselineForStage(stage, report), null, 2)}\n`;
    await fs.mkdir(path.dirname(file), { recursive: true });
    await fs.writeFile(file, body, "utf8");
    written.push(file);
  }
  return written.sort();
}
|
|
103
|
+
/**
 * Index verifier entries by their `id`.
 *
 * @param {Iterable<{ id: string }>} entries - Verifier results.
 * @returns {Map<string, object>} Map from verifier id to entry; when ids
 *   repeat, the last entry with that id wins.
 */
function verifierMap(entries) {
  return new Map(Array.from(entries, (entry) => [entry.id, entry]));
}
|
|
110
|
+
/**
 * Fraction of case results whose `passed` flag is truthy.
 *
 * @param {Array<{ passed: boolean }>} cases - Case results.
 * @returns {number} Value in `[0, 1]`; an empty list yields `1`.
 */
function computePassRate(cases) {
  const total = cases.length;
  if (total === 0) {
    return 1;
  }
  let passedCount = 0;
  for (const caseResult of cases) {
    if (caseResult.passed) {
      passedCount += 1;
    }
  }
  return passedCount / total;
}
|
|
116
|
+
/**
 * Fraction of baseline entries whose `passed` flag is truthy.
 *
 * @param {{ cases: Record<string, { passed: boolean }> }} snapshot - Baseline
 *   snapshot for one stage.
 * @returns {number} Value in `[0, 1]`; a snapshot without cases yields `1`.
 */
function baselinePassRate(snapshot) {
  const entries = Object.values(snapshot.cases);
  const total = entries.length;
  if (total === 0) {
    return 1;
  }
  let passedCount = 0;
  for (const entry of entries) {
    if (entry.passed) {
      passedCount += 1;
    }
  }
  return passedCount / total;
}
|
|
123
|
+
/**
 * Compare a freshly computed report against loaded baselines. If no baseline
 * exists for a stage covered by the report, that stage contributes zero
 * regressions (first run of that stage). Current is the source of truth.
 *
 * Regression reasons emitted:
 * - `case-now-failing`: the case passed in the baseline and fails now
 *   (reported with verifier id `<case>`, scores 1 → 0).
 * - `newly-failing`: a verifier was `ok` in the baseline and is not now.
 * - `score-drop`: a verifier that is not newly failing has a lower score than
 *   in the baseline (only when both scores are defined).
 * Cases and verifiers that are absent from the baseline are ignored.
 *
 * @param {{ cases: Array<object> }} report - Current eval report.
 * @param {Map<string, object>} baselines - Baseline snapshots keyed by stage.
 * @returns {object | undefined} `undefined` when no baselines were loaded;
 *   otherwise `{ baselineId, scoreDelta, criticalFailures, regressions }`.
 */
export function compareAgainstBaselines(report, baselines) {
  if (baselines.size === 0) {
    return undefined;
  }

  // Bucket the current results per stage so each baseline only sees its own.
  const caseResultsByStage = new Map();
  for (const caseResult of report.cases) {
    const bucket = caseResultsByStage.get(caseResult.stage);
    if (bucket) {
      bucket.push(caseResult);
    } else {
      caseResultsByStage.set(caseResult.stage, [caseResult]);
    }
  }

  const regressions = [];
  let baselineTotalPassRate = 0;
  for (const [stage, snapshot] of baselines) {
    baselineTotalPassRate += baselinePassRate(snapshot);
    for (const caseResult of caseResultsByStage.get(stage) ?? []) {
      const { caseId } = caseResult;
      // Own-property check: a case id such as "constructor" must not resolve
      // to an inherited Object.prototype member.
      if (!Object.prototype.hasOwnProperty.call(snapshot.cases, caseId)) {
        continue;
      }
      const baselineEntry = snapshot.cases[caseId];
      if (!baselineEntry) {
        continue;
      }

      if (baselineEntry.passed && !caseResult.passed) {
        regressions.push({
          caseId,
          stage,
          verifierId: "<case>",
          reason: "case-now-failing",
          previousScore: 1,
          currentScore: 0
        });
      }

      const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
      for (const currentVerifier of caseResult.verifierResults) {
        const prev = baselineVerifiers.get(currentVerifier.id);
        if (!prev) {
          continue;
        }
        if (prev.ok && !currentVerifier.ok) {
          regressions.push({
            caseId,
            stage,
            verifierId: currentVerifier.id,
            reason: "newly-failing",
            // Verifiers without a numeric score are treated as 1 (ok) / 0 (failed).
            previousScore: prev.score ?? 1,
            currentScore: currentVerifier.score ?? 0
          });
          continue;
        }
        const bothScored = prev.score !== undefined && currentVerifier.score !== undefined;
        if (bothScored && currentVerifier.score < prev.score) {
          regressions.push({
            caseId,
            stage,
            verifierId: currentVerifier.id,
            reason: "score-drop",
            previousScore: prev.score,
            currentScore: currentVerifier.score
          });
        }
      }
    }
  }

  // NOTE(review): the current side is the pass rate over every case in the
  // report, while the baseline side is the unweighted mean of per-stage pass
  // rates — confirm this asymmetry is intended.
  const currentPassRate = computePassRate(report.cases);
  // `baselines.size` is > 0 here because of the early return above.
  const baselineAveragePassRate = baselineTotalPassRate / baselines.size;
  const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));

  const criticalReasons = new Set(["newly-failing", "case-now-failing"]);
  const criticalFailures = regressions.filter((r) => criticalReasons.has(r.reason)).length;

  const baselineStages = [...baselines.keys()].sort().join(",");
  return {
    baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
    scoreDelta,
    criticalFailures,
    regressions
  };
}
|
|
200
|
+
/**
 * List the stages that have a baseline file on disk.
 *
 * A stage is reported when `<EVALS_ROOT>/baselines/` contains a regular file
 * named `<stage>.json` and `<stage>` is one of `FLOW_STAGES`.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {Promise<string[]>} Stage names in directory-listing order. Any
 *   failure while listing (for example a missing directory) deliberately
 *   resolves to an empty list instead of rejecting.
 */
export async function listBaselineStages(projectRoot) {
  const root = path.join(projectRoot, EVALS_ROOT, "baselines");
  try {
    const entries = await fs.readdir(root, { withFileTypes: true });
    const stages = [];
    for (const entry of entries) {
      if (!entry.isFile() || !entry.name.endsWith(".json")) {
        continue;
      }
      const name = entry.name.replace(/\.json$/, "");
      if (FLOW_STAGES.includes(name)) {
        stages.push(name);
      }
    }
    return stages;
  } catch {
    // Best-effort by design: "cannot list baselines" is reported as "none".
    return [];
  }
}
|
package/dist/eval/corpus.d.ts
CHANGED
|
@@ -2,7 +2,18 @@ import type { FlowStage } from "../types.js";
|
|
|
2
2
|
import type { EvalCase } from "./types.js";
|
|
3
3
|
/**
|
|
4
4
|
* Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
|
|
5
|
-
* single stage. Returns an empty array for a fresh install
|
|
6
|
-
* without seed cases; corpus is authored in Wave 7.1+).
|
|
5
|
+
* single stage. Returns an empty array for a fresh install.
|
|
7
6
|
*/
|
|
8
7
|
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Resolve a case's `fixture` path to an absolute filesystem path. The fixture
|
|
10
|
+
* field is interpreted relative to the case's stage directory (i.e., a
|
|
11
|
+
* sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
|
|
12
|
+
*/
|
|
13
|
+
export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase): string | undefined;
|
|
14
|
+
/**
|
|
15
|
+
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
16
|
+
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
17
|
+
* the case but not on disk — structural fixtures ship alongside cases.
|
|
18
|
+
*/
|
|
19
|
+
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
|
package/dist/eval/corpus.js
CHANGED
|
@@ -12,6 +12,76 @@ function corpusError(filePath, reason) {
|
|
|
12
12
|
/**
 * Check whether a value is a plain mapping candidate.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` for any non-null object that is not an array.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
15
|
+
/**
 * Validate an optional array-of-strings field from a corpus file.
 *
 * @param {string} filePath - Corpus file being validated (for error text).
 * @param {string} context - Field name quoted in the error message.
 * @param {unknown} value - Raw field value.
 * @returns {string[] | undefined} `undefined` when the field is absent,
 *   otherwise the same array instance.
 * @throws The error built by `corpusError` when `value` is defined but is not
 *   an array whose items are all strings.
 */
function readStringArray(filePath, context, value) {
  if (value === undefined) {
    return undefined;
  }
  const isStringArray = Array.isArray(value) && value.every((item) => typeof item === "string");
  if (!isStringArray) {
    throw corpusError(filePath, `"${context}" must be an array of strings`);
  }
  return value;
}
|
|
23
|
+
/**
 * Validate an optional non-negative integer field from a corpus file.
 *
 * @param {string} filePath - Corpus file being validated (for error text).
 * @param {string} context - Field name quoted in the error message.
 * @param {unknown} value - Raw field value.
 * @returns {number | undefined} `undefined` when the field is absent,
 *   otherwise the validated number.
 * @throws The error built by `corpusError` when `value` is defined but is not
 *   an integer `>= 0`.
 */
function readNonNegativeInteger(filePath, context, value) {
  if (value === undefined) {
    return undefined;
  }
  // Number.isInteger is false for non-numbers, NaN and ±Infinity, so it
  // covers the type and finiteness checks as well.
  if (!Number.isInteger(value) || value < 0) {
    throw corpusError(filePath, `"${context}" must be a non-negative integer`);
  }
  return value;
}
|
|
31
|
+
/**
 * Parse the optional `expected.structural` mapping of a corpus case.
 *
 * Each field may be spelled in snake_case or camelCase; the snake_case value
 * is used unless it is `null`/`undefined`. Error messages always quote the
 * snake_case name. Fields are validated in the order listed below and only
 * fields that are present appear in the result. Unknown keys are ignored.
 *
 * @param {string} filePath - Corpus file being validated (for error text).
 * @param {unknown} raw - Raw `expected.structural` value.
 * @returns {object | undefined} `undefined` when `raw` is absent, otherwise an
 *   object with camelCase keys (possibly empty).
 * @throws The error built by `corpusError` when `raw` is not a mapping or a
 *   field has the wrong type.
 */
function parseStructural(filePath, raw) {
  if (raw === undefined) {
    return undefined;
  }
  if (!isRecord(raw)) {
    throw corpusError(filePath, `"expected.structural" must be a mapping`);
  }

  // [snake_case key, camelCase key, validator]
  const fields = [
    ["required_sections", "requiredSections", readStringArray],
    ["forbidden_patterns", "forbiddenPatterns", readStringArray],
    ["required_frontmatter_keys", "requiredFrontmatterKeys", readStringArray],
    ["min_lines", "minLines", readNonNegativeInteger],
    ["max_lines", "maxLines", readNonNegativeInteger],
    ["min_chars", "minChars", readNonNegativeInteger],
    ["max_chars", "maxChars", readNonNegativeInteger]
  ];

  const structural = {};
  for (const [snakeKey, camelKey, read] of fields) {
    const parsed = read(filePath, `expected.structural.${snakeKey}`, raw[snakeKey] ?? raw[camelKey]);
    if (parsed !== undefined) {
      structural[camelKey] = parsed;
    }
  }
  return structural;
}
|
|
61
|
+
/**
 * Parse the optional `expected` mapping of a corpus case.
 *
 * `structural` is validated by `parseStructural`; `rules` and `judge` are only
 * checked to be mappings and are passed through by reference. Other keys are
 * ignored.
 *
 * @param {string} filePath - Corpus file being validated (for error text).
 * @param {unknown} raw - Raw `expected` value.
 * @returns {object | undefined} `undefined` when `raw` is absent or none of
 *   the three sections is present; otherwise an object holding the sections
 *   that were present.
 * @throws The error built by `corpusError` when `raw` or one of its sections
 *   is not a mapping.
 */
function parseExpected(filePath, raw) {
  if (raw === undefined) {
    return undefined;
  }
  if (!isRecord(raw)) {
    throw corpusError(filePath, `"expected" must be a mapping`);
  }

  const shape = {};
  const structural = parseStructural(filePath, raw.structural);
  if (structural) {
    shape.structural = structural;
  }
  for (const key of ["rules", "judge"]) {
    const section = raw[key];
    if (section === undefined) {
      continue;
    }
    if (!isRecord(section)) {
      throw corpusError(filePath, `"expected.${key}" must be a mapping`);
    }
    shape[key] = section;
  }
  return Object.keys(shape).length === 0 ? undefined : shape;
}
|
|
15
85
|
function validateCase(filePath, raw) {
|
|
16
86
|
if (!isRecord(raw)) {
|
|
17
87
|
throw corpusError(filePath, "top-level value must be a mapping");
|
|
@@ -28,17 +98,8 @@ function validateCase(filePath, raw) {
|
|
|
28
98
|
if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
|
|
29
99
|
throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
|
|
30
100
|
}
|
|
31
|
-
const
|
|
32
|
-
|
|
33
|
-
if (contextFilesRaw !== undefined) {
|
|
34
|
-
if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
|
|
35
|
-
throw corpusError(filePath, `"context_files" must be an array of strings`);
|
|
36
|
-
}
|
|
37
|
-
contextFiles = contextFilesRaw;
|
|
38
|
-
}
|
|
39
|
-
const expected = raw.expected !== undefined && isRecord(raw.expected)
|
|
40
|
-
? raw.expected
|
|
41
|
-
: undefined;
|
|
101
|
+
const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
|
|
102
|
+
const expected = parseExpected(filePath, raw.expected);
|
|
42
103
|
const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
|
|
43
104
|
return {
|
|
44
105
|
id: id.trim(),
|
|
@@ -51,8 +112,7 @@ function validateCase(filePath, raw) {
|
|
|
51
112
|
}
|
|
52
113
|
/**
|
|
53
114
|
* Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
|
|
54
|
-
* single stage. Returns an empty array for a fresh install
|
|
55
|
-
* without seed cases; corpus is authored in Wave 7.1+).
|
|
115
|
+
* single stage. Returns an empty array for a fresh install.
|
|
56
116
|
*/
|
|
57
117
|
export async function loadCorpus(projectRoot, stage) {
|
|
58
118
|
const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
|
|
@@ -89,3 +149,27 @@ export async function loadCorpus(projectRoot, stage) {
|
|
|
89
149
|
cases.sort((a, b) => a.stage.localeCompare(b.stage) || a.id.localeCompare(b.id));
|
|
90
150
|
return cases;
|
|
91
151
|
}
|
|
152
|
+
/**
 * Resolve a case's `fixture` path to an absolute filesystem path. The fixture
 * field is interpreted relative to the case's stage directory (i.e., a
 * sibling subdirectory or file inside `.cclaw/evals/corpus/<stage>/`).
 *
 * NOTE(review): the path is resolved, not confined — an absolute or
 * `..`-prefixed `fixture` lands outside the stage directory. Confirm whether
 * that is intended.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {{ stage: string, fixture?: string }} caseEntry - Corpus case.
 * @returns {string | undefined} Absolute path, or `undefined` when the case
 *   has no (or an empty) `fixture`.
 */
export function fixturePathFor(projectRoot, caseEntry) {
  const { fixture, stage } = caseEntry;
  if (!fixture) {
    return undefined;
  }
  return path.resolve(projectRoot, EVALS_ROOT, "corpus", stage, fixture);
}
|
|
162
|
+
/**
 * Read the fixture artifact text for a case. Returns `undefined` if the case
 * has no fixture reference. Throws a descriptive error if the path exists in
 * the case but not on disk — structural fixtures ship alongside cases.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {{ id: string, stage: string, fixture?: string }} caseEntry - Corpus
 *   case.
 * @returns {Promise<string | undefined>} UTF-8 contents of the fixture.
 * @throws {Error} `Fixture missing for case <stage>/<id>: <path>` when the
 *   fixture is absent; other read errors are rethrown unchanged.
 */
export async function readFixtureArtifact(projectRoot, caseEntry) {
  const fixturePath = fixturePathFor(projectRoot, caseEntry);
  if (!fixturePath) {
    return undefined;
  }
  const missingMessage = `Fixture missing for case ${caseEntry.stage}/${caseEntry.id}: ${fixturePath}`;
  if (!(await exists(fixturePath))) {
    throw new Error(missingMessage);
  }
  try {
    return await fs.readFile(fixturePath, "utf8");
  } catch (err) {
    // The file can vanish between the existence check and the read; report it
    // with the same descriptive message rather than a bare ENOENT.
    if (err?.code === "ENOENT") {
      throw new Error(missingMessage, { cause: err });
    }
    throw err;
  }
}
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* runtime dependency. The real implementation
|
|
4
|
+
* This module declares the shape of the client without pulling in the
|
|
5
|
+
* `openai` runtime dependency. The real implementation lands when
|
|
6
6
|
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users
|
|
8
|
-
*
|
|
7
|
+
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
+
* install an extra dependency or receive network egress warnings.
|
|
9
9
|
*/
|
|
10
10
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
11
|
/**
|
|
12
12
|
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
13
|
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
*
|
|
14
|
+
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
15
|
*/
|
|
16
16
|
export interface ChatMessage {
|
|
17
17
|
role: "system" | "user" | "assistant" | "tool";
|
|
@@ -26,8 +26,8 @@ export interface ChatRequest {
|
|
|
26
26
|
temperature?: number;
|
|
27
27
|
timeoutMs?: number;
|
|
28
28
|
/**
|
|
29
|
-
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
-
*
|
|
29
|
+
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
+
* by Tier B. Ignored by the Tier A single-shot path.
|
|
31
31
|
*/
|
|
32
32
|
tools?: unknown[];
|
|
33
33
|
toolChoice?: "auto" | "none";
|
|
@@ -52,11 +52,11 @@ export interface EvalLlmClient {
|
|
|
52
52
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
53
|
}
|
|
54
54
|
export declare class EvalLlmNotWiredError extends Error {
|
|
55
|
-
constructor(
|
|
55
|
+
constructor();
|
|
56
56
|
}
|
|
57
57
|
/**
|
|
58
|
-
* Factory stub. Throws with a clear message so accidental
|
|
59
|
-
* easy to diagnose. The
|
|
58
|
+
* Factory stub. Throws with a clear message so accidental early usage is
|
|
59
|
+
* easy to diagnose. The real implementation will replace this body with
|
|
60
60
|
* `new OpenAI({ apiKey, baseURL }) ... adapter`.
|
|
61
61
|
*/
|
|
62
62
|
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
/**
 * Error raised when eval code tries to use the LLM client before a real
 * implementation exists. The message points at the offline eval modes.
 */
export class EvalLlmNotWiredError extends Error {
  constructor() {
    super(
      "LLM client is not wired yet.\n" +
        "Run `cclaw eval --dry-run` or `cclaw eval --schema-only` for offline evals."
    );
    // Set explicitly so the subclass name shows up in logs and stack traces.
    this.name = "EvalLlmNotWiredError";
  }
}
|
|
8
8
|
/**
 * Factory stub. Throws with a clear message so accidental early usage is
 * easy to diagnose. The real implementation will replace this body with
 * `new OpenAI({ apiKey, baseURL }) ... adapter`.
 *
 * @param {object} _config - Resolved eval config; unused by the stub.
 * @returns {{ chat: () => Promise<never> }} Client whose `chat()` always
 *   rejects with `EvalLlmNotWiredError`.
 */
export function createEvalClient(_config) {
  const chat = async () => {
    throw new EvalLlmNotWiredError();
  };
  return { chat };
}
|
package/dist/eval/report.js
CHANGED
|
@@ -39,17 +39,30 @@ export function formatMarkdownReport(report) {
|
|
|
39
39
|
lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
|
|
40
40
|
lines.push(``);
|
|
41
41
|
if (report.baselineDelta) {
|
|
42
|
+
const delta = report.baselineDelta;
|
|
42
43
|
lines.push(`## Baseline delta`);
|
|
43
44
|
lines.push(``);
|
|
44
|
-
lines.push(`- baseline: ${
|
|
45
|
-
lines.push(`- score delta: ${
|
|
46
|
-
lines.push(`- critical failures: ${
|
|
45
|
+
lines.push(`- baseline: ${delta.baselineId}`);
|
|
46
|
+
lines.push(`- score delta: ${delta.scoreDelta.toFixed(4)}`);
|
|
47
|
+
lines.push(`- critical failures: ${delta.criticalFailures}`);
|
|
47
48
|
lines.push(``);
|
|
49
|
+
if (delta.regressions.length > 0) {
|
|
50
|
+
lines.push(`### Regressions`);
|
|
51
|
+
lines.push(``);
|
|
52
|
+
lines.push(`| stage | case id | verifier | reason | prev | curr |`);
|
|
53
|
+
lines.push(`| --- | --- | --- | --- | --- | --- |`);
|
|
54
|
+
for (const reg of delta.regressions) {
|
|
55
|
+
const prev = reg.previousScore !== undefined ? reg.previousScore.toFixed(2) : "-";
|
|
56
|
+
const curr = reg.currentScore !== undefined ? reg.currentScore.toFixed(2) : "-";
|
|
57
|
+
lines.push(`| ${reg.stage} | ${reg.caseId} | ${reg.verifierId} | ${reg.reason} | ${prev} | ${curr} |`);
|
|
58
|
+
}
|
|
59
|
+
lines.push(``);
|
|
60
|
+
}
|
|
48
61
|
}
|
|
49
62
|
if (report.cases.length === 0) {
|
|
50
63
|
lines.push(`## Cases`);
|
|
51
64
|
lines.push(``);
|
|
52
|
-
lines.push(`No cases were executed. See \`docs/evals.md\` for the
|
|
65
|
+
lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
|
|
53
66
|
lines.push(``);
|
|
54
67
|
return `${lines.join("\n")}\n`;
|
|
55
68
|
}
|