cclaw-cli 0.21.2 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +9 -1
- package/dist/cli.js +123 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +8 -0
- package/dist/eval/corpus.js +91 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +88 -0
- package/dist/eval/runner.d.ts +53 -0
- package/dist/eval/runner.js +96 -0
- package/dist/eval/types.d.ts +136 -0
- package/dist/eval/types.js +15 -0
- package/dist/install.js +22 -0
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
|
|
3
|
-
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive";
|
|
3
|
+
import type { EvalTier } from "./eval/types.js";
|
|
4
|
+
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
|
|
4
5
|
interface ParsedArgs {
|
|
5
6
|
command?: CommandName;
|
|
6
7
|
harnesses?: HarnessId[];
|
|
@@ -16,6 +17,13 @@ interface ParsedArgs {
|
|
|
16
17
|
archiveName?: string;
|
|
17
18
|
archiveSkipRetro?: boolean;
|
|
18
19
|
archiveSkipRetroReason?: string;
|
|
20
|
+
evalStage?: string;
|
|
21
|
+
evalTier?: EvalTier;
|
|
22
|
+
evalSchemaOnly?: boolean;
|
|
23
|
+
evalRules?: boolean;
|
|
24
|
+
evalJudge?: boolean;
|
|
25
|
+
evalJson?: boolean;
|
|
26
|
+
evalNoWrite?: boolean;
|
|
19
27
|
showHelp?: boolean;
|
|
20
28
|
showVersion?: boolean;
|
|
21
29
|
}
|
package/dist/cli.js
CHANGED
|
@@ -13,7 +13,19 @@ import { RUNTIME_ROOT } from "./constants.js";
|
|
|
13
13
|
import { createDefaultConfig, createProfileConfig } from "./config.js";
|
|
14
14
|
import { detectHarnesses } from "./init-detect.js";
|
|
15
15
|
import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
16
|
-
|
|
16
|
+
import { runEval } from "./eval/runner.js";
|
|
17
|
+
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
18
|
+
import { EVAL_TIERS } from "./eval/types.js";
|
|
19
|
+
import { FLOW_STAGES } from "./types.js";
|
|
20
|
+
const INSTALLER_COMMANDS = [
|
|
21
|
+
"init",
|
|
22
|
+
"sync",
|
|
23
|
+
"doctor",
|
|
24
|
+
"upgrade",
|
|
25
|
+
"uninstall",
|
|
26
|
+
"archive",
|
|
27
|
+
"eval"
|
|
28
|
+
];
|
|
17
29
|
export function usage() {
|
|
18
30
|
return `cclaw - installer-first flow toolkit
|
|
19
31
|
|
|
@@ -41,6 +53,15 @@ Commands:
|
|
|
41
53
|
Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
|
|
42
54
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
43
55
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
56
|
+
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
|
|
57
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
58
|
+
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
59
|
+
--schema-only Run only structural verifiers (Wave 7.1).
|
|
60
|
+
--rules Run structural + rule verifiers (Wave 7.2).
|
|
61
|
+
--judge Include LLM judging (Wave 7.3; requires API key).
|
|
62
|
+
--dry-run Validate config + corpus, print summary, do not execute.
|
|
63
|
+
--json Emit machine-readable JSON on stdout.
|
|
64
|
+
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
44
65
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
45
66
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
46
67
|
|
|
@@ -52,6 +73,8 @@ Examples:
|
|
|
52
73
|
cclaw init --harnesses=claude,cursor
|
|
53
74
|
cclaw doctor --reconcile-gates
|
|
54
75
|
cclaw archive --name=payments-revamp
|
|
76
|
+
cclaw eval --dry-run
|
|
77
|
+
cclaw eval --stage=brainstorm --schema-only
|
|
55
78
|
|
|
56
79
|
Docs: https://github.com/zuevrs/cclaw
|
|
57
80
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -107,6 +130,20 @@ function parseProfile(raw) {
|
|
|
107
130
|
}
|
|
108
131
|
return trimmed;
|
|
109
132
|
}
|
|
133
|
+
/**
 * Normalize the value of a `--tier=` flag.
 *
 * The raw text is trimmed and upper-cased, then checked against `EVAL_TIERS`.
 *
 * @param {string} raw - Text that followed `--tier=` on the command line.
 * @returns {string} The normalized tier identifier.
 * @throws {Error} When the normalized value is not listed in `EVAL_TIERS`.
 */
function parseEvalTier(raw) {
  const tier = raw.trim().toUpperCase();
  if (EVAL_TIERS.includes(tier)) {
    return tier;
  }
  const supported = EVAL_TIERS.join(", ");
  throw new Error(`Unknown eval tier: ${raw}. Supported: ${supported}`);
}
|
|
140
|
+
/**
 * Validate the value of a `--stage=` flag.
 *
 * The raw text is trimmed (case is preserved) and must match one of the
 * entries in `FLOW_STAGES`.
 *
 * @param {string} raw - Text that followed `--stage=` on the command line.
 * @returns {string} The trimmed stage identifier.
 * @throws {Error} When the trimmed value is not listed in `FLOW_STAGES`.
 */
function parseEvalStage(raw) {
  const stage = raw.trim();
  if (FLOW_STAGES.includes(stage)) {
    return stage;
  }
  const supported = FLOW_STAGES.join(", ");
  throw new Error(`Unknown eval stage: ${raw}. Supported: ${supported}`);
}
|
|
110
147
|
function isInitPromptAllowed(ctx) {
|
|
111
148
|
return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
|
|
112
149
|
}
|
|
@@ -390,7 +427,37 @@ function parseArgs(argv) {
|
|
|
390
427
|
}
|
|
391
428
|
if (flag.startsWith("--retro-reason=")) {
|
|
392
429
|
parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
|
|
430
|
+
continue;
|
|
431
|
+
}
|
|
432
|
+
if (flag.startsWith("--stage=")) {
|
|
433
|
+
parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
|
|
434
|
+
continue;
|
|
435
|
+
}
|
|
436
|
+
if (flag.startsWith("--tier=")) {
|
|
437
|
+
parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
|
|
438
|
+
continue;
|
|
439
|
+
}
|
|
440
|
+
if (flag === "--schema-only") {
|
|
441
|
+
parsed.evalSchemaOnly = true;
|
|
442
|
+
continue;
|
|
443
|
+
}
|
|
444
|
+
if (flag === "--rules") {
|
|
445
|
+
parsed.evalRules = true;
|
|
446
|
+
continue;
|
|
447
|
+
}
|
|
448
|
+
if (flag === "--judge") {
|
|
449
|
+
parsed.evalJudge = true;
|
|
450
|
+
continue;
|
|
393
451
|
}
|
|
452
|
+
if (flag === "--no-write") {
|
|
453
|
+
parsed.evalNoWrite = true;
|
|
454
|
+
continue;
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
// `--json` is shared between doctor and eval. Disambiguate by command.
|
|
458
|
+
if (parsed.command === "eval" && parsed.doctorJson === true) {
|
|
459
|
+
parsed.evalJson = true;
|
|
460
|
+
parsed.doctorJson = undefined;
|
|
394
461
|
}
|
|
395
462
|
return parsed;
|
|
396
463
|
}
|
|
@@ -487,6 +554,61 @@ async function runCommand(parsed, ctx) {
|
|
|
487
554
|
info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
|
|
488
555
|
return 0;
|
|
489
556
|
}
|
|
557
|
+
if (command === "eval") {
|
|
558
|
+
const result = await runEval({
|
|
559
|
+
projectRoot: ctx.cwd,
|
|
560
|
+
stage: parsed.evalStage,
|
|
561
|
+
tier: parsed.evalTier,
|
|
562
|
+
schemaOnly: parsed.evalSchemaOnly === true,
|
|
563
|
+
rules: parsed.evalRules === true,
|
|
564
|
+
judge: parsed.evalJudge === true,
|
|
565
|
+
dryRun: parsed.dryRun === true
|
|
566
|
+
});
|
|
567
|
+
if ("kind" in result) {
|
|
568
|
+
if (parsed.evalJson === true) {
|
|
569
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
570
|
+
return 0;
|
|
571
|
+
}
|
|
572
|
+
ctx.stdout.write(`cclaw eval dry-run\n`);
|
|
573
|
+
ctx.stdout.write(` provider: ${result.config.provider}\n`);
|
|
574
|
+
ctx.stdout.write(` baseUrl: ${result.config.baseUrl}\n`);
|
|
575
|
+
ctx.stdout.write(` model: ${result.config.model}\n`);
|
|
576
|
+
ctx.stdout.write(` source: ${result.config.source}\n`);
|
|
577
|
+
ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
|
|
578
|
+
ctx.stdout.write(` tier: ${result.plannedTier}\n`);
|
|
579
|
+
ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
|
|
580
|
+
for (const [stage, count] of Object.entries(result.corpus.byStage)) {
|
|
581
|
+
ctx.stdout.write(` - ${stage}: ${count}\n`);
|
|
582
|
+
}
|
|
583
|
+
ctx.stdout.write(` verifiers available:\n`);
|
|
584
|
+
for (const [key, value] of Object.entries(result.verifiersAvailable)) {
|
|
585
|
+
ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
|
|
586
|
+
}
|
|
587
|
+
if (result.notes.length > 0) {
|
|
588
|
+
ctx.stdout.write(` notes:\n`);
|
|
589
|
+
for (const note of result.notes) {
|
|
590
|
+
ctx.stdout.write(` - ${note}\n`);
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
return 0;
|
|
594
|
+
}
|
|
595
|
+
if (parsed.evalNoWrite !== true) {
|
|
596
|
+
const jsonPath = await writeJsonReport(ctx.cwd, result);
|
|
597
|
+
const mdPath = await writeMarkdownReport(ctx.cwd, result);
|
|
598
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
|
|
599
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
|
|
600
|
+
}
|
|
601
|
+
if (parsed.evalJson === true) {
|
|
602
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
603
|
+
}
|
|
604
|
+
else {
|
|
605
|
+
ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
|
|
606
|
+
`${result.summary.passed} passed, ` +
|
|
607
|
+
`${result.summary.failed} failed, ` +
|
|
608
|
+
`${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
|
|
609
|
+
}
|
|
610
|
+
return result.summary.failed > 0 ? 1 : 0;
|
|
611
|
+
}
|
|
490
612
|
if (command === "archive") {
|
|
491
613
|
const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
|
|
492
614
|
skipRetro: parsed.archiveSkipRetro === true,
|
package/dist/constants.d.ts
CHANGED
|
@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
|
|
|
4
4
|
export declare const CCLAW_VERSION = "0.1.1";
|
|
5
5
|
export declare const FLOW_VERSION = "1.0.0";
|
|
6
6
|
export declare const DEFAULT_HARNESSES: HarnessId[];
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
/**
|
|
8
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
9
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
10
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
11
|
+
* does not affect non-eval cclaw behavior.
|
|
12
|
+
*/
|
|
13
|
+
export declare const EVALS_ROOT = ".cclaw/evals";
|
|
14
|
+
export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
|
|
15
|
+
export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
16
|
+
export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
17
|
+
export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
|
|
9
18
|
export declare const COMMAND_FILE_ORDER: FlowStage[];
|
|
10
19
|
export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
|
|
11
20
|
export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];
|
package/dist/constants.js
CHANGED
|
@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
|
|
|
8
8
|
"opencode",
|
|
9
9
|
"codex"
|
|
10
10
|
];
|
|
11
|
+
/**
|
|
12
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
13
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
14
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
15
|
+
* does not affect non-eval cclaw behavior.
|
|
16
|
+
*/
|
|
17
|
+
export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
|
|
18
|
+
export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
|
|
19
|
+
export const EVALS_DIRS = [
|
|
20
|
+
EVALS_ROOT,
|
|
21
|
+
`${EVALS_ROOT}/corpus`,
|
|
22
|
+
`${EVALS_ROOT}/rubrics`,
|
|
23
|
+
`${EVALS_ROOT}/baselines`,
|
|
24
|
+
`${EVALS_ROOT}/reports`
|
|
25
|
+
];
|
|
11
26
|
export const REQUIRED_DIRS = [
|
|
12
27
|
RUNTIME_ROOT,
|
|
13
28
|
`${RUNTIME_ROOT}/commands`,
|
|
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
|
|
|
22
37
|
`${RUNTIME_ROOT}/adapters`,
|
|
23
38
|
`${RUNTIME_ROOT}/agents`,
|
|
24
39
|
`${RUNTIME_ROOT}/hooks`,
|
|
25
|
-
`${RUNTIME_ROOT}/custom-skills`
|
|
40
|
+
`${RUNTIME_ROOT}/custom-skills`,
|
|
41
|
+
...EVALS_DIRS
|
|
26
42
|
];
|
|
27
43
|
export const REQUIRED_GITIGNORE_PATTERNS = [
|
|
28
44
|
"# cclaw generated artifacts",
|
|
29
45
|
`${RUNTIME_ROOT}/`,
|
|
46
|
+
"# cclaw evals: user-owned, track in git",
|
|
47
|
+
`!${EVALS_ROOT}/`,
|
|
48
|
+
`!${EVALS_ROOT}/config.yaml`,
|
|
49
|
+
`!${EVALS_ROOT}/corpus/`,
|
|
50
|
+
`!${EVALS_ROOT}/corpus/**`,
|
|
51
|
+
`!${EVALS_ROOT}/rubrics/`,
|
|
52
|
+
`!${EVALS_ROOT}/rubrics/**`,
|
|
53
|
+
`!${EVALS_ROOT}/baselines/`,
|
|
54
|
+
`!${EVALS_ROOT}/baselines/**`,
|
|
30
55
|
".claude/commands/cc-*.md",
|
|
31
56
|
".claude/commands/cc.md",
|
|
32
57
|
".cursor/commands/cc-*.md",
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
|
|
8
|
+
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
|
|
9
|
+
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
|
|
10
|
+
export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
|
|
11
|
+
export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export const EVAL_CONFIG_YAML = `# cclaw eval config
|
|
8
|
+
# See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
|
|
9
|
+
#
|
|
10
|
+
# All values can be overridden at runtime with CCLAW_EVAL_* environment
|
|
11
|
+
# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
|
|
12
|
+
provider: zai
|
|
13
|
+
baseUrl: https://api.z.ai/api/coding/paas/v4
|
|
14
|
+
model: glm-5.1
|
|
15
|
+
|
|
16
|
+
# Default fidelity tier when --tier is not supplied.
|
|
17
|
+
# A = single-shot API call (cheap, Wave 7.3)
|
|
18
|
+
# B = SDK with tool use (realistic, Wave 7.4)
|
|
19
|
+
# C = multi-stage workflow (end-to-end, Wave 7.5)
|
|
20
|
+
defaultTier: A
|
|
21
|
+
|
|
22
|
+
# Per-call timeout and retry budget.
|
|
23
|
+
timeoutMs: 120000
|
|
24
|
+
maxRetries: 2
|
|
25
|
+
|
|
26
|
+
# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
|
|
27
|
+
# dailyUsdCap: 5
|
|
28
|
+
|
|
29
|
+
# Regression thresholds used by CI (Wave 7.3+).
|
|
30
|
+
regression:
|
|
31
|
+
# Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
|
|
32
|
+
failIfDeltaBelow: -0.15
|
|
33
|
+
# Fail when any single critical rubric drops below this absolute score.
|
|
34
|
+
failIfCriticalBelow: 3.0
|
|
35
|
+
`;
|
|
36
|
+
export const EVAL_CORPUS_README = `# Eval Corpus
|
|
37
|
+
|
|
38
|
+
Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
|
|
39
|
+
See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
|
|
40
|
+
|
|
41
|
+
Minimal shape:
|
|
42
|
+
|
|
43
|
+
\`\`\`yaml
|
|
44
|
+
id: brainstorm-01
|
|
45
|
+
stage: brainstorm
|
|
46
|
+
input_prompt: |
|
|
47
|
+
One short paragraph describing the user's task.
|
|
48
|
+
context_files: []
|
|
49
|
+
expected:
|
|
50
|
+
# verifier-specific hints; optional in Wave 7.0
|
|
51
|
+
\`\`\`
|
|
52
|
+
|
|
53
|
+
Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
|
|
54
|
+
stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
|
|
55
|
+
projects to exercise Tier B/C sandboxes.
|
|
56
|
+
`;
|
|
57
|
+
export const EVAL_RUBRICS_README = `# Eval Rubrics
|
|
58
|
+
|
|
59
|
+
LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
|
|
60
|
+
scored on a \`1–5\` scale with a rationale:
|
|
61
|
+
|
|
62
|
+
\`\`\`yaml
|
|
63
|
+
stage: brainstorm
|
|
64
|
+
checks:
|
|
65
|
+
- id: distinctness
|
|
66
|
+
prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
|
|
67
|
+
scale: "1-5 where 5=fully distinct approaches"
|
|
68
|
+
weight: 1.0
|
|
69
|
+
\`\`\`
|
|
70
|
+
|
|
71
|
+
Rubric authoring happens when Tier A runs start producing artifacts, so we
|
|
72
|
+
score the *right* properties rather than retrofitting generic quality checks.
|
|
73
|
+
See \`docs/evals.md\` for the full schema.
|
|
74
|
+
`;
|
|
75
|
+
export const EVAL_BASELINES_README = `# Eval Baselines
|
|
76
|
+
|
|
77
|
+
Frozen score snapshots used by regression gates. Baselines are committed to
|
|
78
|
+
git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
|
|
79
|
+
(wired in Wave 7.1).
|
|
80
|
+
|
|
81
|
+
Each baseline file is a JSON document keyed by stage and case id. Do not edit
|
|
82
|
+
by hand; CI will flag baseline churn.
|
|
83
|
+
`;
|
|
84
|
+
export const EVAL_REPORTS_README = `# Eval Reports
|
|
85
|
+
|
|
86
|
+
Generated reports (JSON + Markdown) land here. This directory is gitignored.
|
|
87
|
+
Run \`cclaw eval --dry-run\` to preview configuration without producing a
|
|
88
|
+
report.
|
|
89
|
+
`;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
4
|
+
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
5
|
+
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
6
|
+
* variables (env wins last).
|
|
7
|
+
*/
|
|
8
|
+
export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
|
|
9
|
+
/**
|
|
10
|
+
* Resolve eval config in layered order: defaults -> config.yaml -> env vars.
|
|
11
|
+
* Returns a fully-populated config plus a provenance marker so `--dry-run` can
|
|
12
|
+
* surface where each setting came from.
|
|
13
|
+
*/
|
|
14
|
+
export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_CONFIG_PATH } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { EVAL_TIERS } from "./types.js";
|
|
7
|
+
/**
|
|
8
|
+
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
9
|
+
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
10
|
+
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
11
|
+
* variables (env wins last).
|
|
12
|
+
*/
|
|
13
|
+
export const DEFAULT_EVAL_CONFIG = {
|
|
14
|
+
provider: "zai",
|
|
15
|
+
baseUrl: "https://api.z.ai/api/coding/paas/v4",
|
|
16
|
+
model: "glm-5.1",
|
|
17
|
+
defaultTier: "A",
|
|
18
|
+
regression: {
|
|
19
|
+
failIfDeltaBelow: -0.15,
|
|
20
|
+
failIfCriticalBelow: 3.0
|
|
21
|
+
},
|
|
22
|
+
timeoutMs: 120_000,
|
|
23
|
+
maxRetries: 2
|
|
24
|
+
};
|
|
25
|
+
const EVAL_TIER_SET = new Set(EVAL_TIERS);
|
|
26
|
+
const NUMERIC_ENVS = new Set([
|
|
27
|
+
"CCLAW_EVAL_DAILY_USD_CAP",
|
|
28
|
+
"CCLAW_EVAL_TIMEOUT_MS",
|
|
29
|
+
"CCLAW_EVAL_MAX_RETRIES"
|
|
30
|
+
]);
|
|
31
|
+
/**
 * Build (but do not throw) the error reported for a bad eval config file.
 *
 * The message has three lines: the failure itself, the list of supported
 * tiers, and a pointer to the docs plus the command to re-check the config.
 *
 * @param {string} configFilePath - Path of the offending config file.
 * @param {string} reason - Human-readable description of what is wrong.
 * @returns {Error} Error carrying the assembled multi-line message.
 */
function evalConfigError(configFilePath, reason) {
  const messageLines = [
    `Invalid cclaw eval config at ${configFilePath}: ${reason}`,
    `Supported tiers: ${EVAL_TIERS.join(", ")}`,
    `See docs/evals.md for the full schema. After fixing, run: cclaw eval --dry-run`
  ];
  return new Error(messageLines.join("\n"));
}
|
|
36
|
+
/**
 * Tell whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
39
|
+
/**
 * Convert an environment variable's text to a finite number.
 *
 * @param {string} name - Variable name, used only in the error message.
 * @param {string} raw - Raw text of the variable.
 * @returns {number} The result of `Number(raw)`.
 * @throws {Error} When the conversion yields NaN or an infinity.
 */
function parseNumericEnv(name, raw) {
  const parsed = Number(raw);
  if (Number.isFinite(parsed)) {
    return parsed;
  }
  throw new Error(`Environment variable ${name} must be numeric, got: ${raw}`);
}
|
|
46
|
+
/**
 * Normalize the value of the `CCLAW_EVAL_TIER` environment variable.
 *
 * The text is trimmed and upper-cased before being looked up in
 * `EVAL_TIER_SET`.
 *
 * @param {string} raw - Raw text of the variable.
 * @returns {string} The normalized tier identifier.
 * @throws {Error} When the normalized value is not a member of `EVAL_TIER_SET`.
 */
function parseTierEnv(raw) {
  const candidate = raw.trim().toUpperCase();
  if (EVAL_TIER_SET.has(candidate)) {
    return candidate;
  }
  throw new Error(`Environment variable CCLAW_EVAL_TIER must be one of ${EVAL_TIERS.join("/")}, got: ${raw}`);
}
|
|
53
|
+
/**
 * Validate the parsed contents of the eval config file and return a patch
 * containing only the recognized fields that were present.
 *
 * Numeric fields must be finite: a bare `typeof === "number"` test would let
 * NaN and ±Infinity through (NaN also slips past `< 0` / `<= 0` comparisons),
 * so `Number.isFinite` is used instead.
 *
 * @param {unknown} raw - Value produced by the YAML parser.
 * @param {string} configFilePath - Path of the file, used in error messages.
 * @returns {object} Partial config; empty when `raw` is `null`/`undefined`.
 * @throws {Error} When the top level is not a mapping, a field has the wrong
 *   type or range, or an unknown top-level key is present.
 */
function validateFileConfig(raw, configFilePath) {
  if (raw === undefined || raw === null) {
    return {};
  }
  if (!isRecord(raw)) {
    throw evalConfigError(configFilePath, "top-level value must be a mapping");
  }
  const out = {};

  // Optional string fields: absent is fine, anything else must be non-blank text.
  const assignString = (key, value) => {
    if (value === undefined) {
      return;
    }
    if (typeof value !== "string" || value.trim().length === 0) {
      throw evalConfigError(configFilePath, `"${String(key)}" must be a non-empty string`);
    }
    out[key] = value.trim();
  };
  assignString("provider", raw.provider);
  assignString("baseUrl", raw.baseUrl);
  assignString("model", raw.model);
  assignString("judgeModel", raw.judgeModel);

  if (raw.defaultTier !== undefined) {
    if (typeof raw.defaultTier !== "string" || !EVAL_TIER_SET.has(raw.defaultTier)) {
      throw evalConfigError(configFilePath, `"defaultTier" must be one of: ${EVAL_TIERS.join(", ")}`);
    }
    out.defaultTier = raw.defaultTier;
  }

  if (raw.dailyUsdCap !== undefined) {
    // Number.isFinite is false for non-numbers, NaN and ±Infinity alike.
    if (!Number.isFinite(raw.dailyUsdCap) || raw.dailyUsdCap < 0) {
      throw evalConfigError(configFilePath, `"dailyUsdCap" must be a non-negative number`);
    }
    out.dailyUsdCap = raw.dailyUsdCap;
  }

  if (raw.timeoutMs !== undefined) {
    if (!Number.isFinite(raw.timeoutMs) || raw.timeoutMs <= 0) {
      throw evalConfigError(configFilePath, `"timeoutMs" must be a positive number`);
    }
    out.timeoutMs = raw.timeoutMs;
  }

  if (raw.maxRetries !== undefined) {
    if (!Number.isInteger(raw.maxRetries) || raw.maxRetries < 0) {
      throw evalConfigError(configFilePath, `"maxRetries" must be a non-negative integer`);
    }
    out.maxRetries = raw.maxRetries;
  }

  if (raw.regression !== undefined) {
    if (!isRecord(raw.regression)) {
      throw evalConfigError(configFilePath, `"regression" must be a mapping`);
    }
    const { failIfDeltaBelow, failIfCriticalBelow } = raw.regression;
    if (failIfDeltaBelow !== undefined && !Number.isFinite(failIfDeltaBelow)) {
      throw evalConfigError(configFilePath, `"regression.failIfDeltaBelow" must be a number`);
    }
    if (failIfCriticalBelow !== undefined && !Number.isFinite(failIfCriticalBelow)) {
      throw evalConfigError(configFilePath, `"regression.failIfCriticalBelow" must be a number`);
    }
    // A threshold that was omitted falls back to the built-in default.
    out.regression = {
      failIfDeltaBelow: failIfDeltaBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfDeltaBelow,
      failIfCriticalBelow: failIfCriticalBelow ?? DEFAULT_EVAL_CONFIG.regression.failIfCriticalBelow
    };
  }

  // Unknown keys are rejected last, after the known fields have been checked.
  const knownKeys = new Set([
    "provider",
    "baseUrl",
    "model",
    "judgeModel",
    "defaultTier",
    "dailyUsdCap",
    "timeoutMs",
    "maxRetries",
    "regression"
  ]);
  const unknown = Object.keys(raw).filter((key) => !knownKeys.has(key));
  if (unknown.length > 0) {
    throw evalConfigError(configFilePath, `unknown top-level key(s): ${unknown.join(", ")}`);
  }
  return out;
}
|
|
134
|
+
/**
 * Read and validate the eval config file under `projectRoot`, if it exists.
 *
 * @param {string} projectRoot - Directory that `EVALS_CONFIG_PATH` is joined to.
 * @returns {Promise<{patch: object, source: string}>} An empty patch with
 *   source `"default"` when the file is absent; otherwise the validated patch
 *   with source `"file"`.
 * @throws {Error} When reading or parsing fails, or validation rejects the
 *   parsed content.
 */
async function readFileConfig(projectRoot) {
  const configFilePath = path.join(projectRoot, EVALS_CONFIG_PATH);
  const present = await exists(configFilePath);
  if (!present) {
    return { patch: {}, source: "default" };
  }

  let document;
  try {
    const text = await fs.readFile(configFilePath, "utf8");
    document = parse(text);
  } catch (err) {
    // Read and parse failures are both reported as config errors.
    const reason = err instanceof Error ? err.message : String(err);
    throw evalConfigError(configFilePath, reason);
  }

  return {
    patch: validateFileConfig(document, configFilePath),
    source: "file"
  };
}
|
|
149
|
+
/**
 * Apply `CCLAW_EVAL_*` environment overrides on top of a resolved config.
 *
 * `base` is not mutated: the result is a shallow copy with its own
 * `regression` object. A variable counts as set only when it is a string with
 * non-whitespace content; its trimmed text is what gets applied.
 *
 * Numeric overrides are held to the same ranges the config file enforces, so
 * a negative timeout or a fractional retry count is rejected here rather than
 * surfacing later.
 *
 * @param {object} base - Config after defaults and the file patch are merged.
 * @param {Record<string, string | undefined>} env - Environment to read from.
 * @returns {{patched: object, overridden: boolean, apiKey: (string|undefined)}}
 *   The patched copy, whether any config field was overridden, and the trimmed
 *   `CCLAW_EVAL_API_KEY` (which does not count as an override).
 * @throws {Error} When a tier or numeric variable has an invalid value.
 */
function applyEnvOverrides(base, env) {
  const patched = {
    ...base,
    regression: { ...base.regression }
  };
  let overridden = false;

  const read = (name) => {
    const value = env[name];
    return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
  };

  // Copies one variable onto `patched` when it is set, converting it first.
  const override = (name, key, convert = (text) => text) => {
    const text = read(name);
    if (text === undefined) {
      return;
    }
    patched[key] = convert(text);
    overridden = true;
  };

  // Parses a numeric variable and enforces the range check for its field.
  const numeric = (name, isValid, requirement) => (text) => {
    const value = parseNumericEnv(name, text);
    if (!isValid(value)) {
      throw new Error(`Environment variable ${name} must be ${requirement}, got: ${text}`);
    }
    return value;
  };

  override("CCLAW_EVAL_BASE_URL", "baseUrl");
  override("CCLAW_EVAL_MODEL", "model");
  override("CCLAW_EVAL_JUDGE_MODEL", "judgeModel");
  override("CCLAW_EVAL_PROVIDER", "provider");
  override("CCLAW_EVAL_TIER", "defaultTier", (text) => parseTierEnv(text));
  override(
    "CCLAW_EVAL_DAILY_USD_CAP",
    "dailyUsdCap",
    numeric("CCLAW_EVAL_DAILY_USD_CAP", (value) => value >= 0, "a non-negative number")
  );
  override(
    "CCLAW_EVAL_TIMEOUT_MS",
    "timeoutMs",
    numeric("CCLAW_EVAL_TIMEOUT_MS", (value) => value > 0, "a positive number")
  );
  override(
    "CCLAW_EVAL_MAX_RETRIES",
    "maxRetries",
    numeric(
      "CCLAW_EVAL_MAX_RETRIES",
      (value) => Number.isInteger(value) && value >= 0,
      "a non-negative integer"
    )
  );

  const apiKey = read("CCLAW_EVAL_API_KEY");
  return { patched, overridden, apiKey };
}
|
|
209
|
+
/**
 * Resolve the eval config by layering, in order: built-in defaults, the
 * project's config file, then `CCLAW_EVAL_*` environment variables.
 *
 * @param {string} projectRoot - Project directory the config file is read from.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment
 *   consulted for overrides.
 * @returns {Promise<object>} The merged config plus `apiKey` and a `source`
 *   marker: `"default"`, `"file"`, `"env"` or `"file+env"`.
 */
export async function loadEvalConfig(projectRoot, env = process.env) {
  const fileLayer = await readFileConfig(projectRoot);

  // `regression` is merged one level deep so a partial mapping keeps defaults.
  const withFile = {
    ...DEFAULT_EVAL_CONFIG,
    ...fileLayer.patch,
    regression: {
      ...DEFAULT_EVAL_CONFIG.regression,
      ...(fileLayer.patch.regression ?? {})
    }
  };

  const envLayer = applyEnvOverrides(withFile, env);

  let source;
  if (fileLayer.source === "file") {
    source = envLayer.overridden ? "file+env" : "file";
  } else {
    source = envLayer.overridden ? "env" : "default";
  }

  return {
    ...envLayer.patched,
    apiKey: envLayer.apiKey,
    source
  };
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalCase } from "./types.js";
|
|
3
|
+
/**
 * Load all eval cases under `.cclaw/evals/corpus/**`. Optionally restrict to a
 * single stage. Returns an empty array for a fresh install (Wave 7.0 ships
 * without seed cases; corpus is authored in Wave 7.1+).
 *
 * Only `.yaml` / `.yml` files located directly inside a stage directory are
 * read. The result is ordered by stage, then by case id.
 *
 * @param projectRoot Project directory that contains the evals root.
 * @param stage When given, only `<corpus>/<stage>` is scanned.
 * @returns The validated cases; rejects with an `Error` naming the offending
 *   file when a case cannot be read, parsed, or validated.
 */
export declare function loadCorpus(projectRoot: string, stage?: FlowStage): Promise<EvalCase[]>;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parse } from "yaml";
|
|
4
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
5
|
+
import { exists } from "../fs-utils.js";
|
|
6
|
+
import { FLOW_STAGES } from "../types.js";
|
|
7
|
+
const FLOW_STAGE_SET = new Set(FLOW_STAGES);
|
|
8
|
+
/**
 * Create (without throwing) the error used for every corpus problem.
 * The message names the offending file and the reason, followed by a second
 * line listing the supported stages.
 */
function corpusError(filePath, reason) {
    const headline = `Invalid eval case at ${filePath}: ${reason}`;
    const stageHint = `Supported stages: ${FLOW_STAGES.join(", ")}`;
    return new Error([headline, stageHint].join("\n"));
}
|
|
12
|
+
/**
 * True for any non-null object that is not an array (plain objects, Maps,
 * class instances, ...); false for primitives, `null`, and arrays.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
15
|
+
/**
 * Validate one parsed corpus document and normalize it into an eval case.
 *
 * Both snake_case (`input_prompt`, `context_files`) and camelCase
 * (`inputPrompt`, `contextFiles`) keys are accepted; the snake_case key wins
 * when both are present. `id` and the prompt are trimmed.
 *
 * Optional fields (`context_files`, `expected`, `fixture`) may be omitted or
 * left empty, but a value of the wrong type is rejected instead of being
 * silently discarded, so authoring mistakes surface at load time.
 *
 * @param filePath Path of the case file, used only in error messages.
 * @param raw The value produced by the YAML parser.
 * @returns `{ id, stage, inputPrompt, contextFiles, expected, fixture }`;
 *   absent optional fields are `undefined`.
 * @throws {Error} (built by `corpusError`) when a field is missing or malformed.
 */
function validateCase(filePath, raw) {
    if (!isRecord(raw)) {
        throw corpusError(filePath, "top-level value must be a mapping");
    }
    const id = raw.id;
    if (typeof id !== "string" || id.trim().length === 0) {
        throw corpusError(filePath, `"id" must be a non-empty string`);
    }
    const stageRaw = raw.stage;
    if (typeof stageRaw !== "string" || !FLOW_STAGE_SET.has(stageRaw)) {
        throw corpusError(filePath, `"stage" must be one of: ${FLOW_STAGES.join(", ")}`);
    }
    const inputPrompt = raw.input_prompt ?? raw.inputPrompt;
    if (typeof inputPrompt !== "string" || inputPrompt.trim().length === 0) {
        throw corpusError(filePath, `"input_prompt" must be a non-empty string`);
    }
    const contextFilesRaw = raw.context_files ?? raw.contextFiles;
    let contextFiles;
    if (contextFilesRaw !== undefined) {
        if (!Array.isArray(contextFilesRaw) || contextFilesRaw.some((f) => typeof f !== "string")) {
            throw corpusError(filePath, `"context_files" must be an array of strings`);
        }
        contextFiles = contextFilesRaw;
    }
    // An empty YAML value parses to null; treat it like an omitted key.
    const expectedRaw = raw.expected;
    let expected;
    if (expectedRaw !== undefined && expectedRaw !== null) {
        if (!isRecord(expectedRaw)) {
            throw corpusError(filePath, `"expected" must be a mapping`);
        }
        expected = expectedRaw;
    }
    const fixtureRaw = raw.fixture;
    let fixture;
    if (fixtureRaw !== undefined && fixtureRaw !== null) {
        if (typeof fixtureRaw !== "string") {
            throw corpusError(filePath, `"fixture" must be a string`);
        }
        fixture = fixtureRaw;
    }
    return {
        id: id.trim(),
        stage: stageRaw,
        inputPrompt: inputPrompt.trim(),
        contextFiles,
        expected,
        fixture
    };
}
|
|
52
|
+
/**
 * Read and validate the eval corpus stored under `<EVALS_ROOT>/corpus`.
 *
 * With `stage` set, only that stage's directory is scanned; otherwise every
 * sub-directory whose name is a known flow stage is scanned. Only `.yaml` and
 * `.yml` files directly inside a stage directory are considered. A missing
 * corpus root or stage directory simply contributes no cases.
 *
 * @returns Validated cases ordered by stage, then by id. Read, parse and
 *   validation failures reject with the error built by `corpusError`.
 */
export async function loadCorpus(projectRoot, stage) {
    const corpusRoot = path.join(projectRoot, EVALS_ROOT, "corpus");
    const hasCorpus = await exists(corpusRoot);
    if (!hasCorpus) {
        return [];
    }
    let stageDirs;
    if (stage) {
        stageDirs = [path.join(corpusRoot, stage)];
    }
    else {
        stageDirs = [];
        const rootEntries = await fs.readdir(corpusRoot, { withFileTypes: true });
        for (const dirent of rootEntries) {
            if (dirent.isDirectory() && FLOW_STAGE_SET.has(dirent.name)) {
                stageDirs.push(path.join(corpusRoot, dirent.name));
            }
        }
    }
    const loaded = [];
    for (const stageDir of stageDirs) {
        const present = await exists(stageDir);
        if (!present) {
            continue;
        }
        const dirents = await fs.readdir(stageDir, { withFileTypes: true });
        const yamlFiles = dirents.filter((dirent) => dirent.isFile() &&
            (dirent.name.endsWith(".yaml") || dirent.name.endsWith(".yml")));
        for (const dirent of yamlFiles) {
            const filePath = path.join(stageDir, dirent.name);
            let document;
            try {
                const text = await fs.readFile(filePath, "utf8");
                document = parse(text);
            }
            catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                throw corpusError(filePath, reason);
            }
            loaded.push(validateCase(filePath, document));
        }
    }
    loaded.sort((left, right) => {
        const byStage = left.stage.localeCompare(right.stage);
        return byStage !== 0 ? byStage : left.id.localeCompare(right.id);
    });
    return loaded;
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Wave 7.0 declares the shape of the client without pulling in the `openai`
|
|
5
|
+
* runtime dependency. The real implementation is wired in Wave 7.3 when
|
|
6
|
+
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
+
* separate means users of Waves 7.0–7.2 (structural + rule-based verifiers)
|
|
8
|
+
* never install an extra dependency or receive network egress warnings.
|
|
9
|
+
*/
|
|
10
|
+
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
+
/**
 * Minimal chat interface the rest of the eval code will depend on. It is
 * intentionally a subset of OpenAI's Chat Completions surface so that the
 * Wave 7.3 implementation is a thin adapter around `OpenAI.chat.completions.create`.
 *
 * One message in a chat transcript.
 */
export interface ChatMessage {
    /** Speaker of the message. */
    role: "system" | "user" | "assistant" | "tool";
    /** Message text. */
    content: string;
    /** Optional participant name. NOTE(review): exact semantics are fixed by the Wave 7.3 adapter — confirm. */
    name?: string;
    /** Presumably the id of the tool call a `tool` message answers — TODO confirm in Wave 7.4. */
    toolCallId?: string;
}
|
|
22
|
+
/** Parameters for a single `EvalLlmClient.chat` call. */
export interface ChatRequest {
    /** Model identifier to send the request to. */
    model: string;
    /** Conversation so far, in order. */
    messages: ChatMessage[];
    /** Optional cap on generated tokens. */
    maxTokens?: number;
    /** Optional sampling temperature. */
    temperature?: number;
    /** Optional per-request timeout in milliseconds. */
    timeoutMs?: number;
    /**
     * Tool/function-calling definitions in OpenAI wire format. Populated only by
     * Wave 7.4 (Tier B). Ignored by the Wave 7.3 single-shot path.
     */
    tools?: unknown[];
    /** Tool selection mode; only `"auto"` and `"none"` are modelled. */
    toolChoice?: "auto" | "none";
}
|
|
35
|
+
/** Token accounting reported with a chat response. */
export interface ChatUsage {
    /** Tokens consumed by the request messages. */
    promptTokens: number;
    /** Tokens produced in the reply. */
    completionTokens: number;
    /** Presumably `promptTokens + completionTokens` — verify against the provider adapter. */
    totalTokens: number;
}
|
|
40
|
+
/** Result of one chat call. */
export interface ChatResponse {
    /** Text of the reply. */
    content: string;
    /** Tool invocations requested by the model, when any. */
    toolCalls?: Array<{
        id: string;
        name: string;
        /** Call arguments as a string (presumably JSON-encoded, as in the OpenAI wire format — confirm). */
        arguments: string;
    }>;
    /** Token accounting for this call. */
    usage: ChatUsage;
    /** Why generation ended. */
    finishReason: "stop" | "length" | "tool_calls" | "content_filter";
}
|
|
50
|
+
/**
 * Lightweight client abstraction shared across eval runners.
 *
 * The Wave 7.0 implementation returned by `createEvalClient` rejects every
 * `chat` call with `EvalLlmNotWiredError`.
 */
export interface EvalLlmClient {
    /** Send one chat request and resolve with the model's reply. */
    chat(request: ChatRequest): Promise<ChatResponse>;
}
|
|
54
|
+
/**
 * Error raised when LLM functionality is requested before it is implemented.
 * The message states that the client is not wired in Wave 7.0, names the wave
 * in which it arrives, and points at the offline `--dry-run` / `--schema-only`
 * modes.
 */
export declare class EvalLlmNotWiredError extends Error {
    /** @param wave Wave label interpolated into the message, e.g. `"7.3"`. */
    constructor(wave: string);
}
|
|
57
|
+
/**
 * Factory stub. Throws with a clear message so accidental Wave 7.0 usage is
 * easy to diagnose. The Wave 7.3 implementation will replace this body with
 * `new OpenAI({ apiKey, baseURL }) ... adapter`.
 *
 * Creating the client never throws; the failure surfaces as a rejected
 * promise from `chat()`.
 *
 * @param _config Resolved eval config; unused by the stub.
 */
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Signals that an LLM-backed feature was used before it exists.
 * `wave` is the release in which the client becomes available; it is
 * interpolated into the first line of the message.
 */
export class EvalLlmNotWiredError extends Error {
    constructor(wave) {
        const explanation = `LLM client is not wired in Wave 7.0. It arrives in Wave ${wave}.`;
        const remedy = `Run \`cclaw eval --dry-run\` or \`cclaw eval --schema-only\` for offline evals.`;
        super([explanation, remedy].join("\n"));
        this.name = "EvalLlmNotWiredError";
    }
}
|
|
8
|
+
/**
 * Placeholder client factory for Wave 7.0.
 *
 * The returned object satisfies the client shape, but its `chat` method always
 * rejects with `EvalLlmNotWiredError("7.3")`, so accidental LLM usage fails
 * with an actionable message. `_config` is accepted for signature stability
 * and is not read.
 */
export function createEvalClient(_config) {
    const client = {
        chat() {
            return Promise.reject(new EvalLlmNotWiredError("7.3"));
        }
    };
    return client;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { EvalReport } from "./types.js";
|
|
2
|
+
/** Directory reports are written to: the `reports` folder under the project's evals root. */
export declare function reportsDir(projectRoot: string): string;
/**
 * Default file stem for a report: `eval-<generatedAt>-<runId prefix>`, where
 * every `:` and `.` in `generatedAt` is replaced by `-` and the run id is cut
 * to its first 8 characters. No extension is included.
 */
export declare function defaultReportBasename(report: EvalReport): string;
/**
 * Format a report as a human-readable Markdown document. Keeping the layout
 * stable matters: CI posts diffs against earlier reports, and unit tests use
 * the output as a regression guard.
 */
export declare function formatMarkdownReport(report: EvalReport): string;
/**
 * Write the report as 2-space-indented JSON (with a trailing newline) to
 * `<reportsDir>/<basename>.json`.
 *
 * @param basename File stem; defaults to `defaultReportBasename(report)`.
 * @returns The path that was written.
 */
export declare function writeJsonReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
/**
 * Write the Markdown rendering of the report to `<reportsDir>/<basename>.md`.
 *
 * @param basename File stem; defaults to `defaultReportBasename(report)`.
 * @returns The path that was written.
 */
export declare function writeMarkdownReport(projectRoot: string, report: EvalReport, basename?: string): Promise<string>;
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
3
|
+
import { writeFileSafe } from "../fs-utils.js";
|
|
4
|
+
/** Resolve the folder that holds generated eval reports for `projectRoot`. */
export function reportsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "reports"];
    return path.join(...segments);
}
|
|
7
|
+
/**
 * Derive a file stem from a report: the timestamp with `:` and `.` swapped
 * for `-`, followed by the first eight characters of the run id.
 */
export function defaultReportBasename(report) {
    const stamp = report.generatedAt.split(/[:.]/).join("-");
    const shortRunId = report.runId.slice(0, 8);
    return `eval-${stamp}-${shortRunId}`;
}
|
|
11
|
+
/**
 * Make a value safe for a Markdown table cell: `|` is escaped so it is not
 * read as a column separator, and line breaks are folded into spaces so the
 * row stays on a single line.
 */
function escapeTableCell(value) {
    return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}
/**
 * Format a report as a human-readable Markdown document. Keeping the layout
 * stable matters: CI posts diffs against earlier reports, and unit tests use
 * the output as a regression guard.
 *
 * Sections, in order: header bullets, `## Summary` table, optional
 * `## Baseline delta`, `## Cases`, and (when cases exist)
 * `## Verifier details`. An empty `stages` list is rendered as `all`.
 * Case ids and stages are escaped inside the cases table so an unusual id
 * cannot break the table layout.
 *
 * @param report The report to render.
 * @returns Markdown text ending with a blank line.
 */
export function formatMarkdownReport(report) {
    const { summary } = report;
    const stages = report.stages.length > 0 ? report.stages.join(", ") : "all";
    const lines = [];
    lines.push(`# cclaw eval report`);
    lines.push(``);
    lines.push(`- generated: ${report.generatedAt}`);
    lines.push(`- runId: ${report.runId}`);
    lines.push(`- cclaw version: ${report.cclawVersion}`);
    lines.push(`- provider: ${report.provider}`);
    lines.push(`- model: ${report.model}`);
    lines.push(`- tier: ${report.tier}`);
    lines.push(`- stages: ${stages}`);
    lines.push(``);
    lines.push(`## Summary`);
    lines.push(``);
    lines.push(`| metric | value |`);
    lines.push(`| --- | --- |`);
    lines.push(`| total cases | ${summary.totalCases} |`);
    lines.push(`| passed | ${summary.passed} |`);
    lines.push(`| failed | ${summary.failed} |`);
    lines.push(`| skipped | ${summary.skipped} |`);
    lines.push(`| total cost (USD) | ${summary.totalCostUsd.toFixed(4)} |`);
    lines.push(`| total duration (ms) | ${summary.totalDurationMs} |`);
    lines.push(``);
    if (report.baselineDelta) {
        lines.push(`## Baseline delta`);
        lines.push(``);
        lines.push(`- baseline: ${report.baselineDelta.baselineId}`);
        lines.push(`- score delta: ${report.baselineDelta.scoreDelta.toFixed(4)}`);
        lines.push(`- critical failures: ${report.baselineDelta.criticalFailures}`);
        lines.push(``);
    }
    lines.push(`## Cases`);
    lines.push(``);
    if (report.cases.length === 0) {
        lines.push(`No cases were executed. See \`docs/evals.md\` for the Wave rollout plan.`);
        lines.push(``);
        return `${lines.join("\n")}\n`;
    }
    lines.push(`| stage | case id | passed | duration (ms) | cost (USD) |`);
    lines.push(`| --- | --- | --- | --- | --- |`);
    for (const item of report.cases) {
        const cost = item.costUsd !== undefined ? item.costUsd.toFixed(4) : "-";
        // Free-form text goes through escapeTableCell; the rest is numeric or fixed.
        lines.push(`| ${escapeTableCell(item.stage)} | ${escapeTableCell(item.caseId)} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
    }
    lines.push(``);
    lines.push(`## Verifier details`);
    lines.push(``);
    for (const item of report.cases) {
        lines.push(`### ${item.stage} / ${item.caseId}`);
        lines.push(``);
        for (const verifier of item.verifierResults) {
            const score = verifier.score !== undefined ? ` (score=${verifier.score.toFixed(2)})` : "";
            lines.push(`- ${verifier.kind} / ${verifier.id}: ${verifier.ok ? "ok" : "fail"}${score}` +
                (verifier.message ? ` — ${verifier.message}` : ""));
        }
        lines.push(``);
    }
    return `${lines.join("\n")}\n`;
}
|
|
79
|
+
/**
 * Persist `report` as pretty-printed JSON (2-space indent, trailing newline)
 * at `<reportsDir>/<basename>.json` and resolve with that path.
 * `basename` defaults to `defaultReportBasename(report)`.
 */
export async function writeJsonReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.json`;
    const destination = path.join(reportsDir(projectRoot), fileName);
    const serialized = JSON.stringify(report, null, 2);
    await writeFileSafe(destination, `${serialized}\n`);
    return destination;
}
|
|
84
|
+
/**
 * Persist the Markdown rendering of `report` at `<reportsDir>/<basename>.md`
 * and resolve with that path. `basename` defaults to
 * `defaultReportBasename(report)`.
 */
export async function writeMarkdownReport(projectRoot, report, basename = defaultReportBasename(report)) {
    const fileName = `${basename}.md`;
    const destination = path.join(reportsDir(projectRoot), fileName);
    const markdown = formatMarkdownReport(report);
    await writeFileSafe(destination, markdown);
    return destination;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
|
+
/** Inputs accepted by `runEval`. */
export interface RunEvalOptions {
    /** Project directory whose eval config and corpus are loaded. */
    projectRoot: string;
    /** Restrict the corpus to one stage; omitted = all stages. */
    stage?: FlowStage;
    /** Tier to plan for; falls back to the config's `defaultTier`. */
    tier?: EvalTier;
    /** When true, run only structural verifiers. Wave 7.1 wires actual verifiers. */
    schemaOnly?: boolean;
    /** When true, run structural + rule-based verifiers. Wave 7.2 wires rules. */
    rules?: boolean;
    /** When true, also run LLM judge verifiers. Wave 7.3 wires judging. */
    judge?: boolean;
    /** When true, load config + corpus and return a summary without running any verifier. */
    dryRun?: boolean;
    /** Override process.env during tests. */
    env?: NodeJS.ProcessEnv;
}
|
|
18
|
+
/** Value returned by `runEval` when `dryRun` is true. */
export interface DryRunSummary {
    /** Discriminator separating a dry-run summary from an `EvalReport`. */
    kind: "dry-run";
    /** The fully resolved configuration (includes `apiKey` when one was supplied). */
    config: ResolvedEvalConfig;
    /** Overview of the loaded corpus. */
    corpus: {
        /** Number of cases loaded. */
        total: number;
        /** Case count keyed by stage name. */
        byStage: Record<string, number>;
        /** Id and stage of every loaded case. */
        cases: Array<{
            id: string;
            stage: FlowStage;
        }>;
    };
    /** Tier that would be used: the `tier` option, else the config default. */
    plannedTier: EvalTier;
    /**
     * Waves 7.1–7.3 progressively flip these to `true`. Wave 7.0 is `false`
     * across the board because no verifier is implemented yet.
     */
    verifiersAvailable: {
        structural: boolean;
        rules: boolean;
        judge: boolean;
        workflow: boolean;
    };
    /** Informational messages, e.g. an empty corpus or flags that are not wired yet. */
    notes: string[];
}
|
|
42
|
+
/**
 * Wave 7.0 runner. Responsibilities:
 * - Load resolved config (defaults + file + env).
 * - Load corpus (empty on a fresh install).
 * - Validate that no verifier flag asks for a capability that does not exist yet.
 * - Return either a dry-run summary or an empty report.
 *
 * Waves 7.1+ will replace the "no verifiers available" branch with the real
 * verifier dispatch pipeline. The signature stays stable so CLI wiring does
 * not churn.
 *
 * In Wave 7.0 the verifier flags are accepted without error; they only add
 * entries to the dry-run summary's `notes`.
 *
 * @returns A `DryRunSummary` when `options.dryRun` is `true`, otherwise an
 *   `EvalReport` in which every loaded case is counted as skipped.
 */
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { CCLAW_VERSION } from "../constants.js";
|
|
3
|
+
import { loadCorpus } from "./corpus.js";
|
|
4
|
+
import { loadEvalConfig } from "./config-loader.js";
|
|
5
|
+
/** Count cases per stage; returns a plain object keyed by stage name. */
function groupByStage(cases) {
    const counts = {};
    for (const item of cases) {
        const seen = counts[item.stage] ?? 0;
        counts[item.stage] = seen + 1;
    }
    return counts;
}
|
|
11
|
+
/**
 * Wave 7.0 eval entry point.
 *
 * Loads the resolved config and the (optionally stage-filtered) corpus, then
 * either:
 * - returns a dry-run summary (`options.dryRun === true`) describing config,
 *   corpus, planned tier, verifier availability (all `false` in this wave)
 *   and informational notes; or
 * - returns a report in which every case carries a single placeholder
 *   verifier result and is counted as skipped.
 *
 * The verifier flags (`schemaOnly`, `rules`, `judge`) are accepted and only
 * produce notes; nothing is executed yet.
 */
export async function runEval(options) {
    const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
    const corpus = await loadCorpus(options.projectRoot, options.stage);
    const plannedTier = options.tier ?? config.defaultTier;
    // Ordered [condition, message] pairs; active ones become the notes list.
    const noteRules = [
        [corpus.length === 0, "Corpus is empty. Seed cases land in Wave 7.1 (`.cclaw/evals/corpus/<stage>/*.yaml`)."],
        [options.schemaOnly, "--schema-only is accepted; structural verifiers wire up in Wave 7.1."],
        [options.rules, "--rules is accepted; rule verifiers wire up in Wave 7.2."],
        [options.judge, "--judge is accepted; LLM judging wires up in Wave 7.3."]
    ];
    const notes = noteRules.filter(([active]) => active).map(([, text]) => text);
    if (options.dryRun === true) {
        return {
            kind: "dry-run",
            config,
            corpus: {
                total: corpus.length,
                byStage: groupByStage(corpus),
                cases: corpus.map(({ id, stage }) => ({ id, stage }))
            },
            plannedTier,
            verifiersAvailable: {
                structural: false,
                rules: false,
                judge: false,
                workflow: false
            },
            notes
        };
    }
    const generatedAt = new Date().toISOString();
    const cases = [];
    for (const item of corpus) {
        // Placeholder result: no verifier exists yet, so the case is marked skipped.
        const placeholder = {
            kind: "structural",
            id: "wave-7-0-skeleton",
            ok: false,
            message: "Verifiers are not implemented in Wave 7.0; run with --dry-run.",
            details: { skipped: true }
        };
        cases.push({
            caseId: item.id,
            stage: item.stage,
            tier: plannedTier,
            passed: false,
            durationMs: 0,
            verifierResults: [placeholder]
        });
    }
    const caseCount = cases.length;
    return {
        schemaVersion: 1,
        generatedAt,
        runId: randomUUID(),
        cclawVersion: CCLAW_VERSION,
        provider: config.provider,
        model: config.model,
        tier: plannedTier,
        stages: options.stage ? [options.stage] : [],
        cases,
        summary: {
            totalCases: caseCount,
            passed: 0,
            failed: 0,
            skipped: caseCount,
            totalCostUsd: 0,
            totalDurationMs: 0
        }
    };
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core types for the cclaw eval subsystem (Phase 7).
|
|
3
|
+
*
|
|
4
|
+
* The eval subsystem lets us measure whether a change to a prompt, skill, or
|
|
5
|
+
* stage contract improves or regresses the quality of agent output. It is
|
|
6
|
+
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
|
+
*
|
|
8
|
+
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
+
* - The verifier / rubric / LLM stack evolves on its own release cadence (Waves 7.0-7.6).
|
|
10
|
+
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
|
+
*/
|
|
12
|
+
import type { FlowStage } from "../types.js";
|
|
13
|
+
/**
 * Fidelity tier for the agent-under-test.
 *
 * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
 * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
 * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
 *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
 *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
 */
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
/** Union of the tier labels listed in `EVAL_TIERS`. */
export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
24
|
+
/**
 * Verifier kinds, in increasing cost and decreasing determinism:
 * structural and rules run without LLM; judge and workflow use the configured model.
 */
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
/** Union of the kind labels listed in `VERIFIER_KINDS`. */
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
|
+
/**
 * A single eval case describes one input scenario for one stage. Cases live in
 * `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
 * fixture artifact for verifier development (Wave 7.1) before the agent loop
 * exists (Wave 7.3+).
 */
export interface EvalCase {
    /** Case identifier (non-empty; surrounding whitespace is trimmed on load). */
    id: string;
    /** Flow stage the case exercises. */
    stage: FlowStage;
    /** Prompt given to the agent (YAML key `input_prompt` or `inputPrompt`). */
    inputPrompt: string;
    /** Project files copied into the Tier B/C sandbox before the agent runs. */
    contextFiles?: string[];
    /**
     * Optional expected-shape hints consumed by structural/rule verifiers.
     * Left intentionally loose; verifiers in Waves 7.1–7.2 will narrow this.
     */
    expected?: Record<string, unknown>;
    /**
     * Path (relative to the corpus case file) of a pre-generated artifact used
     * when verifiers are exercised without a live agent loop. Primarily a Wave
     * 7.1 development aid.
     */
    fixture?: string;
}
|
|
54
|
+
/** Result of one verifier applied to one case. */
export interface VerifierResult {
    /** Which family of verifier produced this result. */
    kind: VerifierKind;
    /** Identifier of the individual verifier within its kind. */
    id: string;
    /** Whether the check succeeded; rendered as `ok` / `fail` in the Markdown report. */
    ok: boolean;
    /** Normalized 0..1 score when the verifier produces a numeric signal. */
    score?: number;
    /** Optional human-readable explanation, appended to the report line when present. */
    message?: string;
    /** Optional structured extras (the Wave 7.0 placeholder sets `{ skipped: true }`). */
    details?: Record<string, unknown>;
}
|
|
64
|
+
/** Aggregate result for one case after all verifiers run. */
export interface EvalCaseResult {
    /** `id` of the case this result belongs to. */
    caseId: string;
    /** Stage of the case. */
    stage: FlowStage;
    /** Tier the case was (or would have been) run at. */
    tier: EvalTier;
    /** Overall verdict for the case. */
    passed: boolean;
    /** Time spent on the case in milliseconds. */
    durationMs: number;
    /** Cost of the case in USD; rendered as `-` in the Markdown report when absent. */
    costUsd?: number;
    /** Individual verifier outcomes for the case. */
    verifierResults: VerifierResult[];
}
|
|
74
|
+
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
export interface EvalReport {
    /** Version of this report layout. */
    schemaVersion: 1;
    /** Creation timestamp; the runner writes an ISO-8601 string. */
    generatedAt: string;
    /** Unique id of the run (a random UUID in the runner). */
    runId: string;
    /** Version of cclaw that produced the report. */
    cclawVersion: string;
    /** Provider name taken from the resolved config. */
    provider: string;
    /** Model name taken from the resolved config. */
    model: string;
    /** Tier the run was planned for. */
    tier: EvalTier;
    /** Stage filter used for the run; empty means no filter (rendered as `all`). */
    stages: FlowStage[];
    /** Per-case results. */
    cases: EvalCaseResult[];
    /** Totals across `cases`. */
    summary: {
        totalCases: number;
        passed: number;
        failed: number;
        skipped: number;
        totalCostUsd: number;
        totalDurationMs: number;
    };
    /** Present when comparing against a saved baseline (Wave 7.1+). */
    baselineDelta?: {
        /** Identifier of the baseline compared against. */
        baselineId: string;
        /** Score difference relative to the baseline. */
        scoreDelta: number;
        /** Number of critical failures found in the comparison. */
        criticalFailures: number;
    };
}
|
|
100
|
+
/**
 * Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
 * with `CCLAW_EVAL_*` environment variables at runtime.
 */
export interface EvalConfig {
    /**
     * Free-form provider name used in reports. The actual HTTP protocol is
     * determined by `baseUrl`, which is expected to be OpenAI-compatible.
     */
    provider: string;
    /** OpenAI-compatible base URL, e.g. `https://api.z.ai/api/coding/paas/v4`. */
    baseUrl: string;
    /** Model identifier for both agent-under-test and judge unless `judgeModel` overrides. */
    model: string;
    /** Optional separate model for the judge role. Defaults to `model`. */
    judgeModel?: string;
    /** Default tier when `--tier` is not supplied. */
    defaultTier: EvalTier;
    /**
     * Optional hard stop on estimated USD spend per day. Unset = no cap.
     * Overridable via `CCLAW_EVAL_DAILY_USD_CAP`.
     */
    dailyUsdCap?: number;
    /** Regression thresholds for CI gates. */
    regression: {
        /** Fail when overall score drops by more than this fraction (e.g. 0.15 = 15%). */
        failIfDeltaBelow: number;
        /** Fail when any single critical rubric drops below this absolute score. */
        failIfCriticalBelow: number;
    };
    /** Per-agent-run timeout in milliseconds. Overridable via `CCLAW_EVAL_TIMEOUT_MS`. */
    timeoutMs: number;
    /** Max retries per API call on transient failures. Overridable via `CCLAW_EVAL_MAX_RETRIES`. */
    maxRetries: number;
}
|
|
132
|
+
/** Resolved config with env overrides applied. */
export interface ResolvedEvalConfig extends EvalConfig {
    /** API key read from `CCLAW_EVAL_API_KEY`; treat as a secret when logging or serializing. */
    apiKey?: string;
    /**
     * Which layers contributed beyond the built-in defaults: the config file,
     * environment overrides, both, or neither (`"default"`).
     */
    source: "default" | "file" | "env" | "file+env";
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Fidelity tier for the agent-under-test.
 *
 * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
 * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
 * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
 *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
 *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
 *
 * Frozen so the shared list cannot be altered at runtime, matching its
 * `readonly` type declaration.
 */
export const EVAL_TIERS = Object.freeze(["A", "B", "C"]);
|
|
11
|
+
/**
 * Verifier kinds, in increasing cost and decreasing determinism:
 * structural and rules run without LLM; judge and workflow use the configured model.
 *
 * Frozen so the shared list cannot be altered at runtime, matching its
 * `readonly` type declaration.
 */
export const VERIFIER_KINDS = Object.freeze(["structural", "rules", "judge", "workflow"]);
|
package/dist/install.js
CHANGED
|
@@ -28,6 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
|
|
|
28
28
|
import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
|
+
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
31
32
|
import { TDD_WAVE_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
32
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
33
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
@@ -184,6 +185,26 @@ async function writeArtifactTemplates(projectRoot) {
|
|
|
184
185
|
await writeFileSafe(runtimePath(projectRoot, "templates", fileName), content);
|
|
185
186
|
}
|
|
186
187
|
}
|
|
188
|
+
/**
 * Create the initial `evals/` files (config plus one README per sub-folder)
 * inside the cclaw runtime directory.
 *
 * A file is written only when nothing exists at its path yet, so anything the
 * user has authored or edited survives repeated runs.
 */
async function writeEvalScaffold(projectRoot) {
    const scaffold = new Map([
        ["evals/config.yaml", EVAL_CONFIG_YAML],
        ["evals/corpus/README.md", EVAL_CORPUS_README],
        ["evals/rubrics/README.md", EVAL_RUBRICS_README],
        ["evals/baselines/README.md", EVAL_BASELINES_README],
        ["evals/reports/README.md", EVAL_REPORTS_README]
    ]);
    for (const [rel, content] of scaffold) {
        const absolute = runtimePath(projectRoot, ...rel.split("/"));
        const alreadyThere = await exists(absolute);
        if (!alreadyThere) {
            await writeFileSafe(absolute, content);
        }
    }
}
|
|
187
208
|
async function writeSkills(projectRoot, config) {
|
|
188
209
|
for (const stage of COMMAND_FILE_ORDER) {
|
|
189
210
|
const folder = stageSkillFolder(stage);
|
|
@@ -1044,6 +1065,7 @@ async function materializeRuntime(projectRoot, config, forceStateReset) {
|
|
|
1044
1065
|
await writeSkills(projectRoot, config);
|
|
1045
1066
|
await writeContextModes(projectRoot);
|
|
1046
1067
|
await writeArtifactTemplates(projectRoot);
|
|
1068
|
+
await writeEvalScaffold(projectRoot);
|
|
1047
1069
|
await writeRulebook(projectRoot);
|
|
1048
1070
|
await writeState(projectRoot, config, forceStateReset);
|
|
1049
1071
|
await ensureRunSystem(projectRoot, { createIfMissing: false });
|