cclaw-cli 0.21.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +11 -1
- package/dist/cli.js +154 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/eval/baseline.d.ts +14 -0
- package/dist/eval/baseline.js +209 -0
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +19 -0
- package/dist/eval/corpus.js +175 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +101 -0
- package/dist/eval/runner.d.ts +45 -0
- package/dist/eval/runner.js +178 -0
- package/dist/eval/types.d.ts +216 -0
- package/dist/eval/types.js +15 -0
- package/dist/eval/verifiers/structural.d.ts +14 -0
- package/dist/eval/verifiers/structural.js +171 -0
- package/dist/install.js +22 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
|
|
3
|
-
type
|
|
3
|
+
import type { EvalTier } from "./eval/types.js";
|
|
4
|
+
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
|
|
4
5
|
interface ParsedArgs {
|
|
5
6
|
command?: CommandName;
|
|
6
7
|
harnesses?: HarnessId[];
|
|
@@ -16,6 +17,15 @@ interface ParsedArgs {
|
|
|
16
17
|
archiveName?: string;
|
|
17
18
|
archiveSkipRetro?: boolean;
|
|
18
19
|
archiveSkipRetroReason?: string;
|
|
20
|
+
evalStage?: string;
|
|
21
|
+
evalTier?: EvalTier;
|
|
22
|
+
evalSchemaOnly?: boolean;
|
|
23
|
+
evalRules?: boolean;
|
|
24
|
+
evalJudge?: boolean;
|
|
25
|
+
evalJson?: boolean;
|
|
26
|
+
evalNoWrite?: boolean;
|
|
27
|
+
evalUpdateBaseline?: boolean;
|
|
28
|
+
evalConfirm?: boolean;
|
|
19
29
|
showHelp?: boolean;
|
|
20
30
|
showVersion?: boolean;
|
|
21
31
|
}
|
package/dist/cli.js
CHANGED
|
@@ -13,7 +13,20 @@ import { RUNTIME_ROOT } from "./constants.js";
|
|
|
13
13
|
import { createDefaultConfig, createProfileConfig } from "./config.js";
|
|
14
14
|
import { detectHarnesses } from "./init-detect.js";
|
|
15
15
|
import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
16
|
-
|
|
16
|
+
import { runEval } from "./eval/runner.js";
|
|
17
|
+
import { writeBaselinesFromReport } from "./eval/baseline.js";
|
|
18
|
+
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
19
|
+
import { EVAL_TIERS } from "./eval/types.js";
|
|
20
|
+
import { FLOW_STAGES } from "./types.js";
|
|
21
|
+
const INSTALLER_COMMANDS = [
|
|
22
|
+
"init",
|
|
23
|
+
"sync",
|
|
24
|
+
"doctor",
|
|
25
|
+
"upgrade",
|
|
26
|
+
"uninstall",
|
|
27
|
+
"archive",
|
|
28
|
+
"eval"
|
|
29
|
+
];
|
|
17
30
|
export function usage() {
|
|
18
31
|
return `cclaw - installer-first flow toolkit
|
|
19
32
|
|
|
@@ -41,6 +54,17 @@ Commands:
|
|
|
41
54
|
Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
|
|
42
55
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
43
56
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
57
|
+
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.1: structural verifier).
|
|
58
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
59
|
+
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
60
|
+
--schema-only Run only structural verifiers (Wave 7.1, default).
|
|
61
|
+
--rules Run structural + rule verifiers (Wave 7.2).
|
|
62
|
+
--judge Include LLM judging (Wave 7.3; requires API key).
|
|
63
|
+
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
|
+
--json Emit machine-readable JSON on stdout.
|
|
65
|
+
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
66
|
+
--update-baseline Overwrite baselines from the current run (requires --confirm).
|
|
67
|
+
--confirm Acknowledge --update-baseline (prevents accidental resets).
|
|
44
68
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
45
69
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
46
70
|
|
|
@@ -52,6 +76,8 @@ Examples:
|
|
|
52
76
|
cclaw init --harnesses=claude,cursor
|
|
53
77
|
cclaw doctor --reconcile-gates
|
|
54
78
|
cclaw archive --name=payments-revamp
|
|
79
|
+
cclaw eval --dry-run
|
|
80
|
+
cclaw eval --stage=brainstorm --schema-only
|
|
55
81
|
|
|
56
82
|
Docs: https://github.com/zuevrs/cclaw
|
|
57
83
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -107,6 +133,20 @@ function parseProfile(raw) {
|
|
|
107
133
|
}
|
|
108
134
|
return trimmed;
|
|
109
135
|
}
|
|
136
|
+
/**
 * Normalize and validate the value of a `--tier=` flag.
 *
 * @param raw Text that followed `--tier=`; surrounding whitespace and letter
 *   case are ignored.
 * @returns The trimmed, upper-cased tier when it appears in `EVAL_TIERS`.
 * @throws Error quoting the raw input and listing `EVAL_TIERS` otherwise.
 */
function parseEvalTier(raw) {
    const tier = raw.trim().toUpperCase();
    if (EVAL_TIERS.includes(tier)) {
        return tier;
    }
    throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
}
|
|
143
|
+
/**
 * Validate the value of a `--stage=` flag.
 *
 * @param raw Text that followed `--stage=`; only surrounding whitespace is
 *   stripped, so the comparison against `FLOW_STAGES` is case-sensitive.
 * @returns The trimmed stage id when it appears in `FLOW_STAGES`.
 * @throws Error quoting the raw input and listing `FLOW_STAGES` otherwise.
 */
function parseEvalStage(raw) {
    const stage = raw.trim();
    if (FLOW_STAGES.includes(stage)) {
        return stage;
    }
    throw new Error(`Unknown eval stage: ${raw}. Supported: ${FLOW_STAGES.join(", ")}`);
}
|
|
110
150
|
function isInitPromptAllowed(ctx) {
|
|
111
151
|
return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
|
|
112
152
|
}
|
|
@@ -390,7 +430,45 @@ function parseArgs(argv) {
|
|
|
390
430
|
}
|
|
391
431
|
if (flag.startsWith("--retro-reason=")) {
|
|
392
432
|
parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
|
|
433
|
+
continue;
|
|
434
|
+
}
|
|
435
|
+
if (flag.startsWith("--stage=")) {
|
|
436
|
+
parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
|
|
437
|
+
continue;
|
|
438
|
+
}
|
|
439
|
+
if (flag.startsWith("--tier=")) {
|
|
440
|
+
parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
|
|
441
|
+
continue;
|
|
442
|
+
}
|
|
443
|
+
if (flag === "--schema-only") {
|
|
444
|
+
parsed.evalSchemaOnly = true;
|
|
445
|
+
continue;
|
|
446
|
+
}
|
|
447
|
+
if (flag === "--rules") {
|
|
448
|
+
parsed.evalRules = true;
|
|
449
|
+
continue;
|
|
450
|
+
}
|
|
451
|
+
if (flag === "--judge") {
|
|
452
|
+
parsed.evalJudge = true;
|
|
453
|
+
continue;
|
|
454
|
+
}
|
|
455
|
+
if (flag === "--no-write") {
|
|
456
|
+
parsed.evalNoWrite = true;
|
|
457
|
+
continue;
|
|
458
|
+
}
|
|
459
|
+
if (flag === "--update-baseline") {
|
|
460
|
+
parsed.evalUpdateBaseline = true;
|
|
461
|
+
continue;
|
|
393
462
|
}
|
|
463
|
+
if (flag === "--confirm") {
|
|
464
|
+
parsed.evalConfirm = true;
|
|
465
|
+
continue;
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
// `--json` is shared between doctor and eval. Disambiguate by command.
|
|
469
|
+
if (parsed.command === "eval" && parsed.doctorJson === true) {
|
|
470
|
+
parsed.evalJson = true;
|
|
471
|
+
parsed.doctorJson = undefined;
|
|
394
472
|
}
|
|
395
473
|
return parsed;
|
|
396
474
|
}
|
|
@@ -487,6 +565,81 @@ async function runCommand(parsed, ctx) {
|
|
|
487
565
|
info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
|
|
488
566
|
return 0;
|
|
489
567
|
}
|
|
568
|
+
if (command === "eval") {
|
|
569
|
+
const result = await runEval({
|
|
570
|
+
projectRoot: ctx.cwd,
|
|
571
|
+
stage: parsed.evalStage,
|
|
572
|
+
tier: parsed.evalTier,
|
|
573
|
+
schemaOnly: parsed.evalSchemaOnly === true,
|
|
574
|
+
rules: parsed.evalRules === true,
|
|
575
|
+
judge: parsed.evalJudge === true,
|
|
576
|
+
dryRun: parsed.dryRun === true
|
|
577
|
+
});
|
|
578
|
+
if ("kind" in result) {
|
|
579
|
+
if (parsed.evalJson === true) {
|
|
580
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
581
|
+
return 0;
|
|
582
|
+
}
|
|
583
|
+
ctx.stdout.write(`cclaw eval dry-run\n`);
|
|
584
|
+
ctx.stdout.write(` provider: ${result.config.provider}\n`);
|
|
585
|
+
ctx.stdout.write(` baseUrl: ${result.config.baseUrl}\n`);
|
|
586
|
+
ctx.stdout.write(` model: ${result.config.model}\n`);
|
|
587
|
+
ctx.stdout.write(` source: ${result.config.source}\n`);
|
|
588
|
+
ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
|
|
589
|
+
ctx.stdout.write(` tier: ${result.plannedTier}\n`);
|
|
590
|
+
ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
|
|
591
|
+
for (const [stage, count] of Object.entries(result.corpus.byStage)) {
|
|
592
|
+
ctx.stdout.write(` - ${stage}: ${count}\n`);
|
|
593
|
+
}
|
|
594
|
+
ctx.stdout.write(` verifiers available:\n`);
|
|
595
|
+
for (const [key, value] of Object.entries(result.verifiersAvailable)) {
|
|
596
|
+
ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
|
|
597
|
+
}
|
|
598
|
+
if (result.notes.length > 0) {
|
|
599
|
+
ctx.stdout.write(` notes:\n`);
|
|
600
|
+
for (const note of result.notes) {
|
|
601
|
+
ctx.stdout.write(` - ${note}\n`);
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
return 0;
|
|
605
|
+
}
|
|
606
|
+
if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
|
|
607
|
+
error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
|
|
608
|
+
return 1;
|
|
609
|
+
}
|
|
610
|
+
if (parsed.evalUpdateBaseline === true) {
|
|
611
|
+
if (result.summary.failed > 0) {
|
|
612
|
+
error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
|
|
613
|
+
return 1;
|
|
614
|
+
}
|
|
615
|
+
const written = await writeBaselinesFromReport(ctx.cwd, result);
|
|
616
|
+
for (const file of written) {
|
|
617
|
+
info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
if (parsed.evalNoWrite !== true) {
|
|
621
|
+
const jsonPath = await writeJsonReport(ctx.cwd, result);
|
|
622
|
+
const mdPath = await writeMarkdownReport(ctx.cwd, result);
|
|
623
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
|
|
624
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
|
|
625
|
+
}
|
|
626
|
+
const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
|
|
627
|
+
if (parsed.evalJson === true) {
|
|
628
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
629
|
+
}
|
|
630
|
+
else {
|
|
631
|
+
const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
|
|
632
|
+
ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
|
|
633
|
+
`${result.summary.passed} passed, ` +
|
|
634
|
+
`${result.summary.failed} failed, ` +
|
|
635
|
+
`${result.summary.skipped} skipped${regressionNote}\n`);
|
|
636
|
+
}
|
|
637
|
+
if (result.summary.failed > 0)
|
|
638
|
+
return 1;
|
|
639
|
+
if (regressionCount > 0)
|
|
640
|
+
return 1;
|
|
641
|
+
return 0;
|
|
642
|
+
}
|
|
490
643
|
if (command === "archive") {
|
|
491
644
|
const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
|
|
492
645
|
skipRetro: parsed.archiveSkipRetro === true,
|
package/dist/constants.d.ts
CHANGED
|
@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
|
|
|
4
4
|
export declare const CCLAW_VERSION = "0.1.1";
|
|
5
5
|
export declare const FLOW_VERSION = "1.0.0";
|
|
6
6
|
export declare const DEFAULT_HARNESSES: HarnessId[];
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
/**
|
|
8
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
9
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
10
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
11
|
+
* does not affect non-eval cclaw behavior.
|
|
12
|
+
*/
|
|
13
|
+
export declare const EVALS_ROOT = ".cclaw/evals";
|
|
14
|
+
export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
|
|
15
|
+
export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
16
|
+
export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
17
|
+
export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
|
|
9
18
|
export declare const COMMAND_FILE_ORDER: FlowStage[];
|
|
10
19
|
export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
|
|
11
20
|
export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];
|
package/dist/constants.js
CHANGED
|
@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
|
|
|
8
8
|
"opencode",
|
|
9
9
|
"codex"
|
|
10
10
|
];
|
|
11
|
+
/**
|
|
12
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
13
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
14
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
15
|
+
* does not affect non-eval cclaw behavior.
|
|
16
|
+
*/
|
|
17
|
+
export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
|
|
18
|
+
export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
|
|
19
|
+
export const EVALS_DIRS = [
|
|
20
|
+
EVALS_ROOT,
|
|
21
|
+
`${EVALS_ROOT}/corpus`,
|
|
22
|
+
`${EVALS_ROOT}/rubrics`,
|
|
23
|
+
`${EVALS_ROOT}/baselines`,
|
|
24
|
+
`${EVALS_ROOT}/reports`
|
|
25
|
+
];
|
|
11
26
|
export const REQUIRED_DIRS = [
|
|
12
27
|
RUNTIME_ROOT,
|
|
13
28
|
`${RUNTIME_ROOT}/commands`,
|
|
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
|
|
|
22
37
|
`${RUNTIME_ROOT}/adapters`,
|
|
23
38
|
`${RUNTIME_ROOT}/agents`,
|
|
24
39
|
`${RUNTIME_ROOT}/hooks`,
|
|
25
|
-
`${RUNTIME_ROOT}/custom-skills
|
|
40
|
+
`${RUNTIME_ROOT}/custom-skills`,
|
|
41
|
+
...EVALS_DIRS
|
|
26
42
|
];
|
|
27
43
|
export const REQUIRED_GITIGNORE_PATTERNS = [
|
|
28
44
|
"# cclaw generated artifacts",
|
|
29
45
|
`${RUNTIME_ROOT}/`,
|
|
46
|
+
"# cclaw evals: user-owned, track in git",
|
|
47
|
+
`!${EVALS_ROOT}/`,
|
|
48
|
+
`!${EVALS_ROOT}/config.yaml`,
|
|
49
|
+
`!${EVALS_ROOT}/corpus/`,
|
|
50
|
+
`!${EVALS_ROOT}/corpus/**`,
|
|
51
|
+
`!${EVALS_ROOT}/rubrics/`,
|
|
52
|
+
`!${EVALS_ROOT}/rubrics/**`,
|
|
53
|
+
`!${EVALS_ROOT}/baselines/`,
|
|
54
|
+
`!${EVALS_ROOT}/baselines/**`,
|
|
30
55
|
".claude/commands/cc-*.md",
|
|
31
56
|
".claude/commands/cc.md",
|
|
32
57
|
".cursor/commands/cc-*.md",
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
|
|
8
|
+
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
|
|
9
|
+
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
|
|
10
|
+
export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
|
|
11
|
+
export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export const EVAL_CONFIG_YAML = `# cclaw eval config
|
|
8
|
+
# See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
|
|
9
|
+
#
|
|
10
|
+
# All values can be overridden at runtime with CCLAW_EVAL_* environment
|
|
11
|
+
# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
|
|
12
|
+
provider: zai
|
|
13
|
+
baseUrl: https://api.z.ai/api/coding/paas/v4
|
|
14
|
+
model: glm-5.1
|
|
15
|
+
|
|
16
|
+
# Default fidelity tier when --tier is not supplied.
|
|
17
|
+
# A = single-shot API call (cheap, Wave 7.3)
|
|
18
|
+
# B = SDK with tool use (realistic, Wave 7.4)
|
|
19
|
+
# C = multi-stage workflow (end-to-end, Wave 7.5)
|
|
20
|
+
defaultTier: A
|
|
21
|
+
|
|
22
|
+
# Per-call timeout and retry budget.
|
|
23
|
+
timeoutMs: 120000
|
|
24
|
+
maxRetries: 2
|
|
25
|
+
|
|
26
|
+
# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
|
|
27
|
+
# dailyUsdCap: 5
|
|
28
|
+
|
|
29
|
+
# Regression thresholds used by CI (Wave 7.3+).
|
|
30
|
+
regression:
|
|
31
|
+
# Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
|
|
32
|
+
failIfDeltaBelow: -0.15
|
|
33
|
+
# Fail when any single critical rubric drops below this absolute score.
|
|
34
|
+
failIfCriticalBelow: 3.0
|
|
35
|
+
`;
|
|
36
|
+
export const EVAL_CORPUS_README = `# Eval Corpus
|
|
37
|
+
|
|
38
|
+
Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
|
|
39
|
+
See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
|
|
40
|
+
|
|
41
|
+
Minimal shape:
|
|
42
|
+
|
|
43
|
+
\`\`\`yaml
|
|
44
|
+
id: brainstorm-01
|
|
45
|
+
stage: brainstorm
|
|
46
|
+
input_prompt: |
|
|
47
|
+
One short paragraph describing the user's task.
|
|
48
|
+
context_files: []
|
|
49
|
+
expected:
|
|
50
|
+
# verifier-specific hints; optional in Wave 7.0
|
|
51
|
+
\`\`\`
|
|
52
|
+
|
|
53
|
+
Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
|
|
54
|
+
stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
|
|
55
|
+
projects to exercise Tier B/C sandboxes.
|
|
56
|
+
`;
|
|
57
|
+
export const EVAL_RUBRICS_README = `# Eval Rubrics
|
|
58
|
+
|
|
59
|
+
LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
|
|
60
|
+
scored on a \`1–5\` scale with a rationale:
|
|
61
|
+
|
|
62
|
+
\`\`\`yaml
|
|
63
|
+
stage: brainstorm
|
|
64
|
+
checks:
|
|
65
|
+
- id: distinctness
|
|
66
|
+
prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
|
|
67
|
+
scale: "1-5 where 5=fully distinct approaches"
|
|
68
|
+
weight: 1.0
|
|
69
|
+
\`\`\`
|
|
70
|
+
|
|
71
|
+
Rubric authoring happens when Tier A runs start producing artifacts, so we
|
|
72
|
+
score the *right* properties rather than retrofitting generic quality checks.
|
|
73
|
+
See \`docs/evals.md\` for the full schema.
|
|
74
|
+
`;
|
|
75
|
+
export const EVAL_BASELINES_README = `# Eval Baselines
|
|
76
|
+
|
|
77
|
+
Frozen score snapshots used by regression gates. Baselines are committed to
|
|
78
|
+
git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
|
|
79
|
+
(wired in Wave 7.1).
|
|
80
|
+
|
|
81
|
+
Each baseline file is a JSON document keyed by stage and case id. Do not edit
|
|
82
|
+
by hand; CI will flag baseline churn.
|
|
83
|
+
`;
|
|
84
|
+
export const EVAL_REPORTS_README = `# Eval Reports
|
|
85
|
+
|
|
86
|
+
Generated reports (JSON + Markdown) land here. This directory is gitignored.
|
|
87
|
+
Run \`cclaw eval --dry-run\` to preview configuration without producing a
|
|
88
|
+
report.
|
|
89
|
+
`;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
|
+
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
+
export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
|
|
5
|
+
export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
|
|
6
|
+
export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
|
|
7
|
+
export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Compare a freshly computed report against loaded baselines. If no baseline
|
|
10
|
+
* exists for a stage covered by the report, that stage contributes zero
|
|
11
|
+
* regressions (first run of that stage). Current is the source of truth.
|
|
12
|
+
*/
|
|
13
|
+
export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
|
|
14
|
+
export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline I/O + regression comparison (Wave 7.1).
|
|
3
|
+
*
|
|
4
|
+
* Layout on disk (committed):
|
|
5
|
+
*
|
|
6
|
+
* .cclaw/evals/baselines/<stage>.json
|
|
7
|
+
*
|
|
8
|
+
* Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
|
|
9
|
+
* regressions by comparing per-verifier `ok` flags across runs: any verifier
|
|
10
|
+
* that was `ok:true` in the baseline and is `ok:false` now counts as a
|
|
11
|
+
* critical failure. A case whose aggregate `passed` flipped from true to
|
|
12
|
+
* false is flagged as `case-now-failing` regardless of per-verifier churn.
|
|
13
|
+
*
|
|
14
|
+
* Writes are gated behind an explicit `--update-baseline --confirm` pair at
|
|
15
|
+
* the CLI layer so accidental resets do not slip into PRs.
|
|
16
|
+
*/
|
|
17
|
+
import fs from "node:fs/promises";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
|
|
20
|
+
import { exists } from "../fs-utils.js";
|
|
21
|
+
import { FLOW_STAGES } from "../types.js";
|
|
22
|
+
export const BASELINE_SCHEMA_VERSION = 1;
|
|
23
|
+
/**
 * Build the on-disk location of one stage's baseline file:
 * `<projectRoot>/<EVALS_ROOT>/baselines/<stage>.json`.
 */
function baselinePath(projectRoot, stage) {
    const fileName = `${stage}.json`;
    return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
}
|
|
26
|
+
/**
 * Load and validate the baseline snapshot for a single stage.
 *
 * The file is read directly instead of probing for existence first, so a file
 * that disappears between the probe and the read cannot surface as a raw
 * ENOENT.
 *
 * @param projectRoot Project directory that contains the evals tree.
 * @param stage Stage whose `<stage>.json` baseline should be loaded.
 * @returns The parsed snapshot, or `null` when no baseline file is present.
 * @throws Error when the file is not valid JSON or fails `isBaseline`; any
 *   read error other than a missing path is rethrown unchanged.
 */
export async function loadBaseline(projectRoot, stage) {
    const filePath = baselinePath(projectRoot, stage);
    let raw;
    try {
        raw = await fs.readFile(filePath, "utf8");
    }
    catch (err) {
        // A missing file (or a missing/non-directory parent) means "no baseline yet".
        const code = err && typeof err === "object" ? err.code : undefined;
        if (code === "ENOENT" || code === "ENOTDIR")
            return null;
        throw err;
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (err) {
        throw new Error(`Invalid baseline at ${filePath}: ${err instanceof Error ? err.message : String(err)}`);
    }
    if (!isBaseline(parsed, stage)) {
        throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
    }
    return parsed;
}
|
|
43
|
+
/**
 * Structural check that parsed JSON can be used as a baseline for `stage`.
 *
 * Beyond the header fields, `cases` must be a plain (non-array) object and
 * every entry must carry a `verifierResults` array of objects, because the
 * regression comparison iterates that array and reads `id` from each element.
 *
 * @param value Result of `JSON.parse` on a baseline file.
 * @param stage Stage the snapshot is expected to describe.
 * @returns `true` when every check passes, `false` otherwise.
 */
function isBaseline(value, stage) {
    if (!value || typeof value !== "object")
        return false;
    const candidate = value;
    if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION)
        return false;
    if (candidate.stage !== stage)
        return false;
    if (typeof candidate.generatedAt !== "string")
        return false;
    if (typeof candidate.cclawVersion !== "string")
        return false;
    const cases = candidate.cases;
    // `typeof [] === "object"`, so arrays have to be rejected explicitly.
    if (!cases || typeof cases !== "object" || Array.isArray(cases))
        return false;
    const isObject = (item) => item !== null && typeof item === "object";
    return Object.values(cases).every((entry) => isObject(entry) &&
        Array.isArray(entry.verifierResults) &&
        entry.verifierResults.every(isObject));
}
|
|
59
|
+
/**
 * Load the baselines for several stages, one after another.
 *
 * @param projectRoot Project directory that contains the evals tree.
 * @param stages Stages to look up, visited in the given order.
 * @returns Map from stage to snapshot containing only the stages for which
 *   `loadBaseline` returned a snapshot.
 */
export async function loadBaselinesByStage(projectRoot, stages) {
    const snapshotsByStage = new Map();
    for (const stage of stages) {
        const loaded = await loadBaseline(projectRoot, stage);
        if (!loaded) {
            continue;
        }
        snapshotsByStage.set(stage, loaded);
    }
    return snapshotsByStage;
}
|
|
68
|
+
function entryFromResult(result) {
|
|
69
|
+
const verifierResults = result.verifierResults.map((v) => ({
|
|
70
|
+
id: v.id,
|
|
71
|
+
kind: v.kind,
|
|
72
|
+
ok: v.ok,
|
|
73
|
+
...(v.score !== undefined ? { score: v.score } : {})
|
|
74
|
+
}));
|
|
75
|
+
return { passed: result.passed, verifierResults };
|
|
76
|
+
}
|
|
77
|
+
/**
 * Assemble the baseline snapshot for one stage from a report.
 *
 * @param stage Stage to snapshot; report cases from other stages are ignored.
 * @param report Report whose `cases` are scanned.
 * @returns Snapshot stamped with the schema version, the current time and
 *   `CCLAW_VERSION`, with one entry per case id (a repeated id keeps the
 *   later case).
 */
export function buildBaselineForStage(stage, report) {
    const cases = {};
    for (const caseResult of report.cases) {
        if (caseResult.stage !== stage) {
            continue;
        }
        cases[caseResult.caseId] = entryFromResult(caseResult);
    }
    const generatedAt = new Date().toISOString();
    return {
        schemaVersion: BASELINE_SCHEMA_VERSION,
        stage,
        generatedAt,
        cclawVersion: CCLAW_VERSION,
        cases
    };
}
|
|
91
|
+
/**
 * Write one baseline file per stage that appears in the report.
 *
 * Stages are processed in order of first appearance; each file is the
 * pretty-printed snapshot followed by a newline, and its parent directory is
 * created if needed.
 *
 * @param projectRoot Project directory that contains the evals tree.
 * @param report Report to snapshot.
 * @returns Paths of the files written, sorted.
 */
export async function writeBaselinesFromReport(projectRoot, report) {
    const writtenFiles = [];
    const distinctStages = new Set();
    for (const caseResult of report.cases) {
        distinctStages.add(caseResult.stage);
    }
    for (const stage of distinctStages) {
        const payload = `${JSON.stringify(buildBaselineForStage(stage, report), null, 2)}\n`;
        const target = baselinePath(projectRoot, stage);
        await fs.mkdir(path.dirname(target), { recursive: true });
        await fs.writeFile(target, payload, "utf8");
        writtenFiles.push(target);
    }
    writtenFiles.sort();
    return writtenFiles;
}
|
|
103
|
+
function verifierMap(entries) {
|
|
104
|
+
const out = new Map();
|
|
105
|
+
for (const entry of entries) {
|
|
106
|
+
out.set(entry.id, entry);
|
|
107
|
+
}
|
|
108
|
+
return out;
|
|
109
|
+
}
|
|
110
|
+
function computePassRate(cases) {
|
|
111
|
+
if (cases.length === 0)
|
|
112
|
+
return 1;
|
|
113
|
+
const passed = cases.filter((c) => c.passed).length;
|
|
114
|
+
return passed / cases.length;
|
|
115
|
+
}
|
|
116
|
+
function baselinePassRate(snapshot) {
|
|
117
|
+
const entries = Object.values(snapshot.cases);
|
|
118
|
+
if (entries.length === 0)
|
|
119
|
+
return 1;
|
|
120
|
+
const passed = entries.filter((e) => e.passed).length;
|
|
121
|
+
return passed / entries.length;
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Compare a freshly computed report against loaded baselines. If no baseline
|
|
125
|
+
* exists for a stage covered by the report, that stage contributes zero
|
|
126
|
+
* regressions (first run of that stage). Current is the source of truth.
|
|
127
|
+
*/
|
|
128
|
+
export function compareAgainstBaselines(report, baselines) {
|
|
129
|
+
if (baselines.size === 0)
|
|
130
|
+
return undefined;
|
|
131
|
+
const regressions = [];
|
|
132
|
+
const caseResultsByStage = new Map();
|
|
133
|
+
for (const c of report.cases) {
|
|
134
|
+
const bucket = caseResultsByStage.get(c.stage) ?? [];
|
|
135
|
+
bucket.push(c);
|
|
136
|
+
caseResultsByStage.set(c.stage, bucket);
|
|
137
|
+
}
|
|
138
|
+
let baselineTotalPassRate = 0;
|
|
139
|
+
let baselineStagesCounted = 0;
|
|
140
|
+
for (const [stage, snapshot] of baselines) {
|
|
141
|
+
const current = caseResultsByStage.get(stage) ?? [];
|
|
142
|
+
baselineTotalPassRate += baselinePassRate(snapshot);
|
|
143
|
+
baselineStagesCounted += 1;
|
|
144
|
+
for (const caseResult of current) {
|
|
145
|
+
const baselineEntry = snapshot.cases[caseResult.caseId];
|
|
146
|
+
if (!baselineEntry)
|
|
147
|
+
continue;
|
|
148
|
+
if (baselineEntry.passed && !caseResult.passed) {
|
|
149
|
+
regressions.push({
|
|
150
|
+
caseId: caseResult.caseId,
|
|
151
|
+
stage,
|
|
152
|
+
verifierId: "<case>",
|
|
153
|
+
reason: "case-now-failing",
|
|
154
|
+
previousScore: 1,
|
|
155
|
+
currentScore: 0
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
const baselineVerifiers = verifierMap(baselineEntry.verifierResults);
|
|
159
|
+
for (const currentVerifier of caseResult.verifierResults) {
|
|
160
|
+
const prev = baselineVerifiers.get(currentVerifier.id);
|
|
161
|
+
if (!prev)
|
|
162
|
+
continue;
|
|
163
|
+
if (prev.ok && !currentVerifier.ok) {
|
|
164
|
+
regressions.push({
|
|
165
|
+
caseId: caseResult.caseId,
|
|
166
|
+
stage,
|
|
167
|
+
verifierId: currentVerifier.id,
|
|
168
|
+
reason: "newly-failing",
|
|
169
|
+
previousScore: prev.score ?? 1,
|
|
170
|
+
currentScore: currentVerifier.score ?? 0
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
else if (prev.score !== undefined &&
|
|
174
|
+
currentVerifier.score !== undefined &&
|
|
175
|
+
currentVerifier.score < prev.score) {
|
|
176
|
+
regressions.push({
|
|
177
|
+
caseId: caseResult.caseId,
|
|
178
|
+
stage,
|
|
179
|
+
verifierId: currentVerifier.id,
|
|
180
|
+
reason: "score-drop",
|
|
181
|
+
previousScore: prev.score,
|
|
182
|
+
currentScore: currentVerifier.score
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
const currentPassRate = computePassRate(report.cases);
|
|
189
|
+
const baselineAveragePassRate = baselineStagesCounted === 0 ? currentPassRate : baselineTotalPassRate / baselineStagesCounted;
|
|
190
|
+
const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));
|
|
191
|
+
const criticalFailures = regressions.filter((r) => r.reason === "newly-failing" || r.reason === "case-now-failing").length;
|
|
192
|
+
const baselineStages = [...baselines.keys()].sort().join(",");
|
|
193
|
+
return {
|
|
194
|
+
baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
|
|
195
|
+
scoreDelta,
|
|
196
|
+
criticalFailures,
|
|
197
|
+
regressions
|
|
198
|
+
};
|
|
199
|
+
}
|
|
200
|
+
/**
 * List the stages that have a baseline file on disk.
 *
 * Scans `<projectRoot>/<EVALS_ROOT>/baselines` for regular files ending in
 * `.json` and keeps the base names that appear in `FLOW_STAGES`.
 *
 * @param projectRoot Project directory that contains the evals tree.
 * @returns Promise of the matching stage ids; it resolves to an empty array
 *   when the directory cannot be read or the scan fails for any reason.
 */
export function listBaselineStages(projectRoot) {
    const baselinesDir = path.join(projectRoot, EVALS_ROOT, "baselines");
    const stagesFrom = (entries) => {
        const stages = [];
        for (const entry of entries) {
            if (!entry.isFile() || !entry.name.endsWith(".json")) {
                continue;
            }
            const stem = entry.name.replace(/\.json$/, "");
            if (FLOW_STAGES.includes(stem)) {
                stages.push(stem);
            }
        }
        return stages;
    };
    return fs
        .readdir(baselinesDir, { withFileTypes: true })
        .then(stagesFrom)
        .catch(() => []);
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
4
|
+
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
5
|
+
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
6
|
+
* variables (env wins last).
|
|
7
|
+
*/
|
|
8
|
+
export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
|
|
9
|
+
/**
|
|
10
|
+
* Resolve eval config in layered order: defaults -> config.yaml -> env vars.
|
|
11
|
+
* Returns a fully-populated config plus a provenance marker so `--dry-run` can
|
|
12
|
+
* surface where each setting came from.
|
|
13
|
+
*/
|
|
14
|
+
export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
|