cclaw-cli 0.21.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
3
- type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive";
3
+ import type { EvalTier } from "./eval/types.js";
4
+ type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
4
5
  interface ParsedArgs {
5
6
  command?: CommandName;
6
7
  harnesses?: HarnessId[];
@@ -16,6 +17,15 @@ interface ParsedArgs {
16
17
  archiveName?: string;
17
18
  archiveSkipRetro?: boolean;
18
19
  archiveSkipRetroReason?: string;
20
+ evalStage?: string;
21
+ evalTier?: EvalTier;
22
+ evalSchemaOnly?: boolean;
23
+ evalRules?: boolean;
24
+ evalJudge?: boolean;
25
+ evalJson?: boolean;
26
+ evalNoWrite?: boolean;
27
+ evalUpdateBaseline?: boolean;
28
+ evalConfirm?: boolean;
19
29
  showHelp?: boolean;
20
30
  showVersion?: boolean;
21
31
  }
package/dist/cli.js CHANGED
@@ -13,7 +13,20 @@ import { RUNTIME_ROOT } from "./constants.js";
13
13
  import { createDefaultConfig, createProfileConfig } from "./config.js";
14
14
  import { detectHarnesses } from "./init-detect.js";
15
15
  import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
- const INSTALLER_COMMANDS = ["init", "sync", "doctor", "upgrade", "uninstall", "archive"];
16
+ import { runEval } from "./eval/runner.js";
17
+ import { writeBaselinesFromReport } from "./eval/baseline.js";
18
+ import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
19
+ import { EVAL_TIERS } from "./eval/types.js";
20
+ import { FLOW_STAGES } from "./types.js";
21
+ const INSTALLER_COMMANDS = [
22
+ "init",
23
+ "sync",
24
+ "doctor",
25
+ "upgrade",
26
+ "uninstall",
27
+ "archive",
28
+ "eval"
29
+ ];
17
30
  export function usage() {
18
31
  return `cclaw - installer-first flow toolkit
19
32
 
@@ -41,6 +54,17 @@ Commands:
41
54
  Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
42
55
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
43
56
  --retro-reason=<t> Reason for bypassing retro gate.
57
+ eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.1: structural verifier).
58
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
59
+ --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
60
+ --schema-only Run only structural verifiers (Wave 7.1, default).
61
+ --rules Run structural + rule verifiers (Wave 7.2).
62
+ --judge Include LLM judging (Wave 7.3; requires API key).
63
+ --dry-run Validate config + corpus, print summary, do not execute.
64
+ --json Emit machine-readable JSON on stdout.
65
+ --no-write Skip writing the report to .cclaw/evals/reports/.
66
+ --update-baseline Overwrite baselines from the current run (requires --confirm).
67
+ --confirm Acknowledge --update-baseline (prevents accidental resets).
44
68
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
45
69
  uninstall Remove .cclaw runtime and the generated harness shim files.
46
70
 
@@ -52,6 +76,8 @@ Examples:
52
76
  cclaw init --harnesses=claude,cursor
53
77
  cclaw doctor --reconcile-gates
54
78
  cclaw archive --name=payments-revamp
79
+ cclaw eval --dry-run
80
+ cclaw eval --stage=brainstorm --schema-only
55
81
 
56
82
  Docs: https://github.com/zuevrs/cclaw
57
83
  Issues: https://github.com/zuevrs/cclaw/issues
@@ -107,6 +133,20 @@ function parseProfile(raw) {
107
133
  }
108
134
  return trimmed;
109
135
  }
136
/**
 * Normalize and validate the value passed to `--tier=`.
 *
 * The raw value is trimmed and upper-cased before it is checked against
 * `EVAL_TIERS`, so `" a "` and `"A"` are treated the same.
 *
 * @param {string} raw Text that followed `--tier=` on the command line.
 * @returns {string} The normalized tier.
 * @throws {Error} When the normalized value is not listed in `EVAL_TIERS`.
 *   The message echoes the raw (un-normalized) input.
 */
function parseEvalTier(raw) {
  const tier = raw.trim().toUpperCase();
  if (EVAL_TIERS.includes(tier)) {
    return tier;
  }
  throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
}
143
/**
 * Validate the value passed to `--stage=`.
 *
 * The raw value is trimmed (case is left untouched) and must be one of the
 * entries in `FLOW_STAGES`.
 *
 * @param {string} raw Text that followed `--stage=` on the command line.
 * @returns {string} The trimmed stage id.
 * @throws {Error} When the trimmed value is not listed in `FLOW_STAGES`.
 *   The message echoes the raw (untrimmed) input.
 */
function parseEvalStage(raw) {
  const stage = raw.trim();
  if (FLOW_STAGES.includes(stage)) {
    return stage;
  }
  throw new Error(`Unknown eval stage: ${raw}. Supported: ${FLOW_STAGES.join(", ")}`);
}
110
150
  function isInitPromptAllowed(ctx) {
111
151
  return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
112
152
  }
@@ -390,7 +430,45 @@ function parseArgs(argv) {
390
430
  }
391
431
  if (flag.startsWith("--retro-reason=")) {
392
432
  parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
433
+ continue;
434
+ }
435
+ if (flag.startsWith("--stage=")) {
436
+ parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
437
+ continue;
438
+ }
439
+ if (flag.startsWith("--tier=")) {
440
+ parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
441
+ continue;
442
+ }
443
+ if (flag === "--schema-only") {
444
+ parsed.evalSchemaOnly = true;
445
+ continue;
446
+ }
447
+ if (flag === "--rules") {
448
+ parsed.evalRules = true;
449
+ continue;
450
+ }
451
+ if (flag === "--judge") {
452
+ parsed.evalJudge = true;
453
+ continue;
454
+ }
455
+ if (flag === "--no-write") {
456
+ parsed.evalNoWrite = true;
457
+ continue;
458
+ }
459
+ if (flag === "--update-baseline") {
460
+ parsed.evalUpdateBaseline = true;
461
+ continue;
393
462
  }
463
+ if (flag === "--confirm") {
464
+ parsed.evalConfirm = true;
465
+ continue;
466
+ }
467
+ }
468
+ // `--json` is shared between doctor and eval. Disambiguate by command.
469
+ if (parsed.command === "eval" && parsed.doctorJson === true) {
470
+ parsed.evalJson = true;
471
+ parsed.doctorJson = undefined;
394
472
  }
395
473
  return parsed;
396
474
  }
@@ -487,6 +565,81 @@ async function runCommand(parsed, ctx) {
487
565
  info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
488
566
  return 0;
489
567
  }
568
+ if (command === "eval") {
569
+ const result = await runEval({
570
+ projectRoot: ctx.cwd,
571
+ stage: parsed.evalStage,
572
+ tier: parsed.evalTier,
573
+ schemaOnly: parsed.evalSchemaOnly === true,
574
+ rules: parsed.evalRules === true,
575
+ judge: parsed.evalJudge === true,
576
+ dryRun: parsed.dryRun === true
577
+ });
578
+ if ("kind" in result) {
579
+ if (parsed.evalJson === true) {
580
+ ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
581
+ return 0;
582
+ }
583
+ ctx.stdout.write(`cclaw eval dry-run\n`);
584
+ ctx.stdout.write(` provider: ${result.config.provider}\n`);
585
+ ctx.stdout.write(` baseUrl: ${result.config.baseUrl}\n`);
586
+ ctx.stdout.write(` model: ${result.config.model}\n`);
587
+ ctx.stdout.write(` source: ${result.config.source}\n`);
588
+ ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
589
+ ctx.stdout.write(` tier: ${result.plannedTier}\n`);
590
+ ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
591
+ for (const [stage, count] of Object.entries(result.corpus.byStage)) {
592
+ ctx.stdout.write(` - ${stage}: ${count}\n`);
593
+ }
594
+ ctx.stdout.write(` verifiers available:\n`);
595
+ for (const [key, value] of Object.entries(result.verifiersAvailable)) {
596
+ ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
597
+ }
598
+ if (result.notes.length > 0) {
599
+ ctx.stdout.write(` notes:\n`);
600
+ for (const note of result.notes) {
601
+ ctx.stdout.write(` - ${note}\n`);
602
+ }
603
+ }
604
+ return 0;
605
+ }
606
+ if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
607
+ error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
608
+ return 1;
609
+ }
610
+ if (parsed.evalUpdateBaseline === true) {
611
+ if (result.summary.failed > 0) {
612
+ error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
613
+ return 1;
614
+ }
615
+ const written = await writeBaselinesFromReport(ctx.cwd, result);
616
+ for (const file of written) {
617
+ info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
618
+ }
619
+ }
620
+ if (parsed.evalNoWrite !== true) {
621
+ const jsonPath = await writeJsonReport(ctx.cwd, result);
622
+ const mdPath = await writeMarkdownReport(ctx.cwd, result);
623
+ info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
624
+ info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
625
+ }
626
+ const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
627
+ if (parsed.evalJson === true) {
628
+ ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
629
+ }
630
+ else {
631
+ const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
632
+ ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
633
+ `${result.summary.passed} passed, ` +
634
+ `${result.summary.failed} failed, ` +
635
+ `${result.summary.skipped} skipped${regressionNote}\n`);
636
+ }
637
+ if (result.summary.failed > 0)
638
+ return 1;
639
+ if (regressionCount > 0)
640
+ return 1;
641
+ return 0;
642
+ }
490
643
  if (command === "archive") {
491
644
  const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
492
645
  skipRetro: parsed.archiveSkipRetro === true,
@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
4
4
  export declare const CCLAW_VERSION = "0.1.1";
5
5
  export declare const FLOW_VERSION = "1.0.0";
6
6
  export declare const DEFAULT_HARNESSES: HarnessId[];
7
- export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills"];
8
- export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
7
+ /**
8
+ * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
9
+ * verifiers and LLM wiring arrive in Waves 7.1–7.5. EVALS_DIRS is exported as its own
10
+ * list to make it explicit that the evals runtime is additive and does not affect
11
+ * non-eval cclaw behavior; the same directories are also part of REQUIRED_DIRS.
12
+ */
13
+ export declare const EVALS_ROOT = ".cclaw/evals";
14
+ export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
15
+ export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
16
+ export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
17
+ export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
9
18
  export declare const COMMAND_FILE_ORDER: FlowStage[];
10
19
  export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
11
20
  export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];
package/dist/constants.js CHANGED
@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
8
8
  "opencode",
9
9
  "codex"
10
10
  ];
11
/**
 * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
 * verifiers and LLM wiring arrive in Waves 7.1–7.5.
 *
 * EVALS_DIRS is exported as its own list so the eval layout can be referenced
 * independently of the rest of the runtime. Note that REQUIRED_DIRS spreads
 * this list, so the same directories are also part of the required layout.
 */
export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
export const EVALS_DIRS = [
  EVALS_ROOT,
  // Sub-directories of the evals root, in the order they are listed to callers.
  ...["corpus", "rubrics", "baselines", "reports"].map((name) => `${EVALS_ROOT}/${name}`)
];
11
26
  export const REQUIRED_DIRS = [
12
27
  RUNTIME_ROOT,
13
28
  `${RUNTIME_ROOT}/commands`,
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
22
37
  `${RUNTIME_ROOT}/adapters`,
23
38
  `${RUNTIME_ROOT}/agents`,
24
39
  `${RUNTIME_ROOT}/hooks`,
25
- `${RUNTIME_ROOT}/custom-skills`
40
+ `${RUNTIME_ROOT}/custom-skills`,
41
+ ...EVALS_DIRS
26
42
  ];
27
43
  export const REQUIRED_GITIGNORE_PATTERNS = [
28
44
  "# cclaw generated artifacts",
29
45
  `${RUNTIME_ROOT}/`,
46
+ "# cclaw evals: user-owned, track in git",
47
+ `!${EVALS_ROOT}/`,
48
+ `!${EVALS_ROOT}/config.yaml`,
49
+ `!${EVALS_ROOT}/corpus/`,
50
+ `!${EVALS_ROOT}/corpus/**`,
51
+ `!${EVALS_ROOT}/rubrics/`,
52
+ `!${EVALS_ROOT}/rubrics/**`,
53
+ `!${EVALS_ROOT}/baselines/`,
54
+ `!${EVALS_ROOT}/baselines/**`,
30
55
  ".claude/commands/cc-*.md",
31
56
  ".claude/commands/cc.md",
32
57
  ".cursor/commands/cc-*.md",
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
3
+ * on `cclaw sync` only if the files are missing (user content wins). The
4
+ * scaffold is intentionally minimal: a usable default config plus short
5
+ * READMEs that point at `docs/evals.md` for authoring guidance.
6
+ */
7
+ export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
+ export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
9
+ export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
10
+ export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
11
+ export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
3
+ * on `cclaw sync` only if the files are missing (user content wins). The
4
+ * scaffold is intentionally minimal: a usable default config plus short
5
+ * READMEs that point at `docs/evals.md` for authoring guidance.
6
+ */
7
+ export const EVAL_CONFIG_YAML = `# cclaw eval config
8
+ # See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
9
+ #
10
+ # All values can be overridden at runtime with CCLAW_EVAL_* environment
11
+ # variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
12
+ provider: zai
13
+ baseUrl: https://api.z.ai/api/coding/paas/v4
14
+ model: glm-5.1
15
+
16
+ # Default fidelity tier when --tier is not supplied.
17
+ # A = single-shot API call (cheap, Wave 7.3)
18
+ # B = SDK with tool use (realistic, Wave 7.4)
19
+ # C = multi-stage workflow (end-to-end, Wave 7.5)
20
+ defaultTier: A
21
+
22
+ # Per-call timeout and retry budget.
23
+ timeoutMs: 120000
24
+ maxRetries: 2
25
+
26
+ # Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
27
+ # dailyUsdCap: 5
28
+
29
+ # Regression thresholds used by CI (Wave 7.3+).
30
+ regression:
31
+ # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
32
+ failIfDeltaBelow: -0.15
33
+ # Fail when any single critical rubric drops below this absolute score.
34
+ failIfCriticalBelow: 3.0
35
+ `;
36
+ export const EVAL_CORPUS_README = `# Eval Corpus
37
+
38
+ Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
39
+ See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
40
+
41
+ Minimal shape:
42
+
43
+ \`\`\`yaml
44
+ id: brainstorm-01
45
+ stage: brainstorm
46
+ input_prompt: |
47
+ One short paragraph describing the user's task.
48
+ context_files: []
49
+ expected:
50
+ # verifier-specific hints; optional in Wave 7.0
51
+ \`\`\`
52
+
53
+ Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
54
+ stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
55
+ projects to exercise Tier B/C sandboxes.
56
+ `;
57
+ export const EVAL_RUBRICS_README = `# Eval Rubrics
58
+
59
+ LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
60
+ scored on a \`1–5\` scale with a rationale:
61
+
62
+ \`\`\`yaml
63
+ stage: brainstorm
64
+ checks:
65
+ - id: distinctness
66
+ prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
67
+ scale: "1-5 where 5=fully distinct approaches"
68
+ weight: 1.0
69
+ \`\`\`
70
+
71
+ Rubric authoring happens when Tier A runs start producing artifacts, so we
72
+ score the *right* properties rather than retrofitting generic quality checks.
73
+ See \`docs/evals.md\` for the full schema.
74
+ `;
75
+ export const EVAL_BASELINES_README = `# Eval Baselines
76
+
77
+ Frozen score snapshots used by regression gates. Baselines are committed to
78
+ git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
79
+ (wired in Wave 7.1).
80
+
81
+ Each baseline file is a JSON document keyed by stage and case id. Do not edit
82
+ by hand; CI will flag baseline churn.
83
+ `;
84
+ export const EVAL_REPORTS_README = `# Eval Reports
85
+
86
+ Generated reports (JSON + Markdown) land here. This directory is gitignored.
87
+ Run \`cclaw eval --dry-run\` to preview configuration without producing a
88
+ report.
89
+ `;
@@ -0,0 +1,14 @@
1
+ import type { FlowStage } from "../types.js";
2
+ import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
3
+ export declare const BASELINE_SCHEMA_VERSION = 1;
4
+ export declare function loadBaseline(projectRoot: string, stage: FlowStage): Promise<BaselineSnapshot | null>;
5
+ export declare function loadBaselinesByStage(projectRoot: string, stages: readonly FlowStage[]): Promise<Map<FlowStage, BaselineSnapshot>>;
6
+ export declare function buildBaselineForStage(stage: FlowStage, report: EvalReport): BaselineSnapshot;
7
+ export declare function writeBaselinesFromReport(projectRoot: string, report: EvalReport): Promise<string[]>;
8
+ /**
9
+ * Compare a freshly computed report against loaded baselines. If no baseline
10
+ * exists for a stage covered by the report, that stage contributes zero
11
+ * regressions (first run of that stage). Current is the source of truth.
12
+ */
13
+ export declare function compareAgainstBaselines(report: EvalReport, baselines: Map<FlowStage, BaselineSnapshot>): BaselineDelta | undefined;
14
+ export declare function listBaselineStages(projectRoot: string): Promise<FlowStage[]>;
@@ -0,0 +1,209 @@
1
+ /**
2
+ * Baseline I/O + regression comparison (Wave 7.1).
3
+ *
4
+ * Layout on disk (committed):
5
+ *
6
+ * .cclaw/evals/baselines/<stage>.json
7
+ *
8
+ * Each file contains a `BaselineSnapshot` keyed by `EvalCase.id`. We compute
9
+ * regressions by comparing per-verifier `ok` flags across runs: any verifier
10
+ * that was `ok:true` in the baseline and is `ok:false` now counts as a
11
+ * critical failure. A case whose aggregate `passed` flipped from true to
12
+ * false is flagged as `case-now-failing` regardless of per-verifier churn.
13
+ *
14
+ * Writes are gated behind an explicit `--update-baseline --confirm` pair at
15
+ * the CLI layer so accidental resets do not slip into PRs.
16
+ */
17
+ import fs from "node:fs/promises";
18
+ import path from "node:path";
19
+ import { EVALS_ROOT, CCLAW_VERSION } from "../constants.js";
20
+ import { exists } from "../fs-utils.js";
21
+ import { FLOW_STAGES } from "../types.js";
22
+ export const BASELINE_SCHEMA_VERSION = 1;
23
/**
 * Build the on-disk location of the baseline file for one stage:
 * `<projectRoot>/<EVALS_ROOT>/baselines/<stage>.json`.
 *
 * @param {string} projectRoot Root directory of the project.
 * @param {string} stage Stage id used as the file's base name.
 * @returns {string} Path joined with the platform separator.
 */
function baselinePath(projectRoot, stage) {
  const fileName = `${stage}.json`;
  return path.join(projectRoot, EVALS_ROOT, "baselines", fileName);
}
26
/**
 * Read and validate the baseline snapshot stored for a single stage.
 *
 * @param {string} projectRoot Root directory of the project.
 * @param {string} stage Stage whose baseline file should be read.
 * @returns {Promise<object|null>} The parsed snapshot, or `null` when
 *   `exists()` reports that the baseline file is not there.
 * @throws {Error} When the file is not valid JSON, or when the parsed value
 *   is rejected by `isBaseline` for this stage. Both messages start with
 *   `Invalid baseline at <path>`.
 */
export async function loadBaseline(projectRoot, stage) {
  const filePath = baselinePath(projectRoot, stage);
  const present = await exists(filePath);
  if (!present) {
    return null;
  }
  const text = await fs.readFile(filePath, "utf8");
  let snapshot;
  try {
    snapshot = JSON.parse(text);
  } catch (err) {
    // Re-throw with the file path so the user can tell which baseline is broken.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid baseline at ${filePath}: ${detail}`);
  }
  if (isBaseline(snapshot, stage)) {
    return snapshot;
  }
  throw new Error(`Invalid baseline at ${filePath}: shape mismatch (expected schemaVersion=${BASELINE_SCHEMA_VERSION}, stage=${stage})`);
}
43
/**
 * Check that a parsed JSON value has the shape of a baseline snapshot for
 * the given stage.
 *
 * Besides the header fields, every entry under `cases` is checked for the
 * parts the regression comparison reads: the entry must be an object and its
 * `verifierResults` must be an array of objects. Without this, a malformed
 * file would pass validation and only fail later, while being compared, with
 * an unrelated TypeError instead of the "Invalid baseline" error.
 *
 * @param {unknown} value Result of `JSON.parse` on a baseline file.
 * @param {string} stage Stage the file is expected to describe.
 * @returns {boolean} `true` only when the value is usable as a snapshot.
 */
function isBaseline(value, stage) {
  const isObject = (candidate) => candidate !== null && typeof candidate === "object";
  // A case entry is usable when its verifier list can be iterated and read.
  const isCaseEntry = (entry) =>
    isObject(entry) &&
    Array.isArray(entry.verifierResults) &&
    entry.verifierResults.every(isObject);

  if (!isObject(value)) {
    return false;
  }
  const candidate = value;
  if (candidate.schemaVersion !== BASELINE_SCHEMA_VERSION) {
    return false;
  }
  if (candidate.stage !== stage) {
    return false;
  }
  if (typeof candidate.generatedAt !== "string") {
    return false;
  }
  if (typeof candidate.cclawVersion !== "string") {
    return false;
  }
  // `cases` is a dictionary keyed by case id; an array is also "object" to
  // typeof, so it has to be excluded explicitly.
  if (!isObject(candidate.cases) || Array.isArray(candidate.cases)) {
    return false;
  }
  return Object.values(candidate.cases).every(isCaseEntry);
}
59
/**
 * Load the baselines for several stages, one after another.
 *
 * @param {string} projectRoot Root directory of the project.
 * @param {readonly string[]} stages Stages to look up, in the order they
 *   should appear in the result.
 * @returns {Promise<Map<string, object>>} Map from stage to snapshot. Stages
 *   for which `loadBaseline` resolves to a falsy value are left out.
 */
export async function loadBaselinesByStage(projectRoot, stages) {
  const snapshots = new Map();
  for (const stage of stages) {
    const loaded = await loadBaseline(projectRoot, stage);
    if (!loaded) {
      continue;
    }
    snapshots.set(stage, loaded);
  }
  return snapshots;
}
68
/**
 * Reduce one case result to the fields stored in a baseline entry.
 *
 * Each verifier result keeps `id`, `kind` and `ok`; `score` is copied only
 * when it is not `undefined`, so unscored verifiers carry no `score` key.
 *
 * @param {object} result Case result with `passed` and `verifierResults`.
 * @returns {{passed: *, verifierResults: object[]}} The baseline entry.
 */
function entryFromResult(result) {
  const verifierResults = result.verifierResults.map((verifier) => {
    const { id, kind, ok, score } = verifier;
    return score === undefined ? { id, kind, ok } : { id, kind, ok, score };
  });
  return { passed: result.passed, verifierResults };
}
77
/**
 * Build the snapshot for one stage from a report.
 *
 * Only report cases whose `stage` equals the requested stage are included,
 * keyed by `caseId`. The snapshot is stamped with the current time and the
 * `CCLAW_VERSION` constant.
 *
 * @param {string} stage Stage to snapshot.
 * @param {object} report Report whose `cases` are filtered by stage.
 * @returns {object} Snapshot with `schemaVersion`, `stage`, `generatedAt`,
 *   `cclawVersion` and `cases`.
 */
export function buildBaselineForStage(stage, report) {
  const cases = {};
  for (const result of report.cases) {
    if (result.stage !== stage) {
      continue;
    }
    cases[result.caseId] = entryFromResult(result);
  }
  const generatedAt = new Date().toISOString();
  return {
    schemaVersion: BASELINE_SCHEMA_VERSION,
    stage,
    generatedAt,
    cclawVersion: CCLAW_VERSION,
    cases
  };
}
91
/**
 * Write one baseline file per stage that appears in the report.
 *
 * Files are written sequentially as pretty-printed JSON followed by a
 * newline; the parent directory is created if needed.
 *
 * @param {string} projectRoot Root directory of the project.
 * @param {object} report Report whose cases determine the stages to write.
 * @returns {Promise<string[]>} Paths of the written files, sorted.
 */
export async function writeBaselinesFromReport(projectRoot, report) {
  // A Set keeps each stage once, in first-seen order.
  const stages = new Set();
  for (const result of report.cases) {
    stages.add(result.stage);
  }
  const files = [];
  for (const stage of stages) {
    const snapshot = buildBaselineForStage(stage, report);
    const target = baselinePath(projectRoot, stage);
    await fs.mkdir(path.dirname(target), { recursive: true });
    await fs.writeFile(target, `${JSON.stringify(snapshot, null, 2)}\n`, "utf8");
    files.push(target);
  }
  files.sort();
  return files;
}
103
/**
 * Index verifier entries by their `id`.
 *
 * When two entries share an id, the later one replaces the earlier one.
 *
 * @param {Iterable<object>} entries Verifier entries, each with an `id`.
 * @returns {Map<*, object>} Map from id to entry.
 */
function verifierMap(entries) {
  const byId = new Map();
  [...entries].forEach((entry) => {
    byId.set(entry.id, entry);
  });
  return byId;
}
110
/**
 * Fraction of case results whose `passed` flag is truthy.
 *
 * @param {object[]} cases Case results.
 * @returns {number} Value between 0 and 1; an empty list counts as 1.
 */
function computePassRate(cases) {
  const total = cases.length;
  if (total === 0) {
    return 1;
  }
  const passedCount = cases.reduce((count, item) => (item.passed ? count + 1 : count), 0);
  return passedCount / total;
}
116
/**
 * Fraction of entries in a baseline snapshot whose `passed` flag is truthy.
 *
 * @param {{cases: Object<string, object>}} snapshot Baseline snapshot.
 * @returns {number} Value between 0 and 1; a snapshot without cases counts as 1.
 */
function baselinePassRate(snapshot) {
  const entries = Object.values(snapshot.cases);
  const total = entries.length;
  if (total === 0) {
    return 1;
  }
  const passedCount = entries.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);
  return passedCount / total;
}
123
/**
 * Compare a freshly computed report against the loaded baselines.
 *
 * For every baseline stage, each current case that also exists in the
 * baseline is checked:
 *  - a case that passed in the baseline and fails now is recorded as
 *    `case-now-failing` (verifierId `"<case>"`);
 *  - a verifier that was ok and is not ok now is recorded as `newly-failing`;
 *  - otherwise, a verifier whose score is defined on both sides and went
 *    down is recorded as `score-drop`.
 * Cases and verifiers that are missing from the baseline are skipped, so a
 * stage without a baseline contributes no regressions.
 *
 * @param {object} report Current report; `cases` are read.
 * @param {Map<string, object>} baselines Snapshots keyed by stage.
 * @returns {object|undefined} `undefined` when no baselines were loaded,
 *   otherwise `{ baselineId, scoreDelta, criticalFailures, regressions }`.
 *   `criticalFailures` counts the `newly-failing` and `case-now-failing`
 *   records; `score-drop` records are listed but not counted.
 */
export function compareAgainstBaselines(report, baselines) {
  if (baselines.size === 0) {
    return undefined;
  }

  // Group the current results per stage, keeping report order inside a group.
  const currentByStage = new Map();
  for (const result of report.cases) {
    const group = currentByStage.get(result.stage);
    if (group === undefined) {
      currentByStage.set(result.stage, [result]);
    } else {
      group.push(result);
    }
  }

  const regressions = [];
  let passRateSum = 0;
  let stagesSeen = 0;

  baselines.forEach((snapshot, stage) => {
    passRateSum += baselinePassRate(snapshot);
    stagesSeen += 1;

    for (const caseResult of currentByStage.get(stage) ?? []) {
      const { caseId } = caseResult;
      const baselineEntry = snapshot.cases[caseId];
      if (!baselineEntry) {
        continue; // case has no baseline yet: nothing to regress against
      }

      if (baselineEntry.passed && !caseResult.passed) {
        regressions.push({
          caseId,
          stage,
          verifierId: "<case>",
          reason: "case-now-failing",
          previousScore: 1,
          currentScore: 0
        });
      }

      const previousById = verifierMap(baselineEntry.verifierResults);
      for (const verifier of caseResult.verifierResults) {
        const previous = previousById.get(verifier.id);
        if (!previous) {
          continue; // verifier did not exist when the baseline was taken
        }
        const location = { caseId, stage, verifierId: verifier.id };
        const brokeSinceBaseline = previous.ok && !verifier.ok;
        if (brokeSinceBaseline) {
          regressions.push({
            ...location,
            reason: "newly-failing",
            // Unscored verifiers are reported as 1 (was ok) -> 0 (now failing).
            previousScore: previous.score ?? 1,
            currentScore: verifier.score ?? 0
          });
          continue;
        }
        const bothScored = previous.score !== undefined && verifier.score !== undefined;
        if (bothScored && verifier.score < previous.score) {
          regressions.push({
            ...location,
            reason: "score-drop",
            previousScore: previous.score,
            currentScore: verifier.score
          });
        }
      }
    }
  });

  // NOTE(review): the current rate is taken over every case in the report,
  // while the baseline rate is an unweighted mean over the baseline stages
  // (including stages the report may not cover) — confirm this is intended.
  const currentPassRate = computePassRate(report.cases);
  const baselineAveragePassRate = stagesSeen === 0 ? currentPassRate : passRateSum / stagesSeen;
  const scoreDelta = Number((currentPassRate - baselineAveragePassRate).toFixed(4));

  const criticalReasons = ["newly-failing", "case-now-failing"];
  const criticalFailures = regressions.filter((entry) => criticalReasons.includes(entry.reason)).length;

  const baselineStages = [...baselines.keys()].sort().join(",");
  return {
    baselineId: baselineStages.length > 0 ? baselineStages : "(empty)",
    scoreDelta,
    criticalFailures,
    regressions
  };
}
200
/**
 * List the stages that have a baseline file on disk.
 *
 * Looks at `<projectRoot>/<EVALS_ROOT>/baselines`, keeps regular files named
 * `<name>.json`, and returns the names that are entries of `FLOW_STAGES`.
 * Any failure while reading the directory (for example, it does not exist)
 * results in an empty list rather than a rejection.
 *
 * @param {string} projectRoot Root directory of the project.
 * @returns {Promise<string[]>} Stage ids, in directory-listing order.
 */
export function listBaselineStages(projectRoot) {
  const baselinesDir = path.join(projectRoot, EVALS_ROOT, "baselines");
  const isJsonFile = (entry) => entry.isFile() && entry.name.endsWith(".json");
  const stripExtension = (entry) => entry.name.replace(/\.json$/, "");
  const isKnownStage = (name) => FLOW_STAGES.includes(name);
  return fs
    .readdir(baselinesDir, { withFileTypes: true })
    .then((entries) => entries.filter(isJsonFile).map(stripExtension).filter(isKnownStage))
    .catch(() => []);
}
@@ -0,0 +1,14 @@
1
+ import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
2
+ /**
3
+ * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
4
+ * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
5
+ * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
6
+ * variables (env wins last).
7
+ */
8
+ export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
9
+ /**
10
+ * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
11
+ * Returns a fully-populated config plus a provenance marker so `--dry-run` can
12
+ * surface where each setting came from.
13
+ */
14
+ export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;