cclaw-cli 0.21.1 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +9 -1
- package/dist/cli.js +123 -1
- package/dist/constants.d.ts +11 -2
- package/dist/constants.js +26 -1
- package/dist/content/eval-scaffold.d.ts +11 -0
- package/dist/content/eval-scaffold.js +89 -0
- package/dist/content/skills.js +1 -1
- package/dist/content/stages/brainstorm.js +3 -7
- package/dist/content/stages/design.js +2 -5
- package/dist/content/stages/plan.js +2 -4
- package/dist/content/stages/review.js +2 -4
- package/dist/content/stages/schema-types.d.ts +8 -2
- package/dist/content/stages/scope.js +2 -6
- package/dist/content/stages/ship.js +2 -4
- package/dist/content/stages/spec.js +2 -5
- package/dist/content/stages/tdd.js +2 -4
- package/dist/eval/config-loader.d.ts +14 -0
- package/dist/eval/config-loader.js +237 -0
- package/dist/eval/corpus.d.ts +8 -0
- package/dist/eval/corpus.js +91 -0
- package/dist/eval/llm-client.d.ts +62 -0
- package/dist/eval/llm-client.js +19 -0
- package/dist/eval/report.d.ts +11 -0
- package/dist/eval/report.js +88 -0
- package/dist/eval/runner.d.ts +53 -0
- package/dist/eval/runner.js +96 -0
- package/dist/eval/types.d.ts +136 -0
- package/dist/eval/types.js +15 -0
- package/dist/install.js +22 -0
- package/dist/runs.d.ts +0 -18
- package/dist/runs.js +1 -188
- package/package.json +1 -1
package/dist/cli.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
|
|
3
|
-
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive";
|
|
3
|
+
import type { EvalTier } from "./eval/types.js";
|
|
4
|
+
type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
|
|
4
5
|
interface ParsedArgs {
|
|
5
6
|
command?: CommandName;
|
|
6
7
|
harnesses?: HarnessId[];
|
|
@@ -16,6 +17,13 @@ interface ParsedArgs {
|
|
|
16
17
|
archiveName?: string;
|
|
17
18
|
archiveSkipRetro?: boolean;
|
|
18
19
|
archiveSkipRetroReason?: string;
|
|
20
|
+
evalStage?: string;
|
|
21
|
+
evalTier?: EvalTier;
|
|
22
|
+
evalSchemaOnly?: boolean;
|
|
23
|
+
evalRules?: boolean;
|
|
24
|
+
evalJudge?: boolean;
|
|
25
|
+
evalJson?: boolean;
|
|
26
|
+
evalNoWrite?: boolean;
|
|
19
27
|
showHelp?: boolean;
|
|
20
28
|
showVersion?: boolean;
|
|
21
29
|
}
|
package/dist/cli.js
CHANGED
|
@@ -13,7 +13,19 @@ import { RUNTIME_ROOT } from "./constants.js";
|
|
|
13
13
|
import { createDefaultConfig, createProfileConfig } from "./config.js";
|
|
14
14
|
import { detectHarnesses } from "./init-detect.js";
|
|
15
15
|
import { HARNESS_ADAPTERS } from "./harness-adapters.js";
|
|
16
|
-
|
|
16
|
+
import { runEval } from "./eval/runner.js";
|
|
17
|
+
import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
|
|
18
|
+
import { EVAL_TIERS } from "./eval/types.js";
|
|
19
|
+
import { FLOW_STAGES } from "./types.js";
|
|
20
|
+
const INSTALLER_COMMANDS = [
|
|
21
|
+
"init",
|
|
22
|
+
"sync",
|
|
23
|
+
"doctor",
|
|
24
|
+
"upgrade",
|
|
25
|
+
"uninstall",
|
|
26
|
+
"archive",
|
|
27
|
+
"eval"
|
|
28
|
+
];
|
|
17
29
|
export function usage() {
|
|
18
30
|
return `cclaw - installer-first flow toolkit
|
|
19
31
|
|
|
@@ -41,6 +53,15 @@ Commands:
|
|
|
41
53
|
Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
|
|
42
54
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
43
55
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
56
|
+
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
|
|
57
|
+
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
58
|
+
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
59
|
+
--schema-only Run only structural verifiers (Wave 7.1).
|
|
60
|
+
--rules Run structural + rule verifiers (Wave 7.2).
|
|
61
|
+
--judge Include LLM judging (Wave 7.3; requires API key).
|
|
62
|
+
--dry-run Validate config + corpus, print summary, do not execute.
|
|
63
|
+
--json Emit machine-readable JSON on stdout.
|
|
64
|
+
--no-write Skip writing the report to .cclaw/evals/reports/.
|
|
44
65
|
upgrade Refresh generated files in .cclaw without modifying user artifacts.
|
|
45
66
|
uninstall Remove .cclaw runtime and the generated harness shim files.
|
|
46
67
|
|
|
@@ -52,6 +73,8 @@ Examples:
|
|
|
52
73
|
cclaw init --harnesses=claude,cursor
|
|
53
74
|
cclaw doctor --reconcile-gates
|
|
54
75
|
cclaw archive --name=payments-revamp
|
|
76
|
+
cclaw eval --dry-run
|
|
77
|
+
cclaw eval --stage=brainstorm --schema-only
|
|
55
78
|
|
|
56
79
|
Docs: https://github.com/zuevrs/cclaw
|
|
57
80
|
Issues: https://github.com/zuevrs/cclaw/issues
|
|
@@ -107,6 +130,20 @@ function parseProfile(raw) {
|
|
|
107
130
|
}
|
|
108
131
|
return trimmed;
|
|
109
132
|
}
|
|
133
|
+
function parseEvalTier(raw) {
|
|
134
|
+
const trimmed = raw.trim().toUpperCase();
|
|
135
|
+
if (!EVAL_TIERS.includes(trimmed)) {
|
|
136
|
+
throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
|
|
137
|
+
}
|
|
138
|
+
return trimmed;
|
|
139
|
+
}
|
|
140
|
+
function parseEvalStage(raw) {
|
|
141
|
+
const trimmed = raw.trim();
|
|
142
|
+
if (!FLOW_STAGES.includes(trimmed)) {
|
|
143
|
+
throw new Error(`Unknown eval stage: ${raw}. Supported: ${FLOW_STAGES.join(", ")}`);
|
|
144
|
+
}
|
|
145
|
+
return trimmed;
|
|
146
|
+
}
|
|
110
147
|
function isInitPromptAllowed(ctx) {
|
|
111
148
|
return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
|
|
112
149
|
}
|
|
@@ -390,7 +427,37 @@ function parseArgs(argv) {
|
|
|
390
427
|
}
|
|
391
428
|
if (flag.startsWith("--retro-reason=")) {
|
|
392
429
|
parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
|
|
430
|
+
continue;
|
|
431
|
+
}
|
|
432
|
+
if (flag.startsWith("--stage=")) {
|
|
433
|
+
parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
|
|
434
|
+
continue;
|
|
435
|
+
}
|
|
436
|
+
if (flag.startsWith("--tier=")) {
|
|
437
|
+
parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
|
|
438
|
+
continue;
|
|
439
|
+
}
|
|
440
|
+
if (flag === "--schema-only") {
|
|
441
|
+
parsed.evalSchemaOnly = true;
|
|
442
|
+
continue;
|
|
443
|
+
}
|
|
444
|
+
if (flag === "--rules") {
|
|
445
|
+
parsed.evalRules = true;
|
|
446
|
+
continue;
|
|
447
|
+
}
|
|
448
|
+
if (flag === "--judge") {
|
|
449
|
+
parsed.evalJudge = true;
|
|
450
|
+
continue;
|
|
393
451
|
}
|
|
452
|
+
if (flag === "--no-write") {
|
|
453
|
+
parsed.evalNoWrite = true;
|
|
454
|
+
continue;
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
// `--json` is shared between doctor and eval. Disambiguate by command.
|
|
458
|
+
if (parsed.command === "eval" && parsed.doctorJson === true) {
|
|
459
|
+
parsed.evalJson = true;
|
|
460
|
+
parsed.doctorJson = undefined;
|
|
394
461
|
}
|
|
395
462
|
return parsed;
|
|
396
463
|
}
|
|
@@ -487,6 +554,61 @@ async function runCommand(parsed, ctx) {
|
|
|
487
554
|
info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
|
|
488
555
|
return 0;
|
|
489
556
|
}
|
|
557
|
+
if (command === "eval") {
|
|
558
|
+
const result = await runEval({
|
|
559
|
+
projectRoot: ctx.cwd,
|
|
560
|
+
stage: parsed.evalStage,
|
|
561
|
+
tier: parsed.evalTier,
|
|
562
|
+
schemaOnly: parsed.evalSchemaOnly === true,
|
|
563
|
+
rules: parsed.evalRules === true,
|
|
564
|
+
judge: parsed.evalJudge === true,
|
|
565
|
+
dryRun: parsed.dryRun === true
|
|
566
|
+
});
|
|
567
|
+
if ("kind" in result) {
|
|
568
|
+
if (parsed.evalJson === true) {
|
|
569
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
570
|
+
return 0;
|
|
571
|
+
}
|
|
572
|
+
ctx.stdout.write(`cclaw eval dry-run\n`);
|
|
573
|
+
ctx.stdout.write(` provider: ${result.config.provider}\n`);
|
|
574
|
+
ctx.stdout.write(` baseUrl: ${result.config.baseUrl}\n`);
|
|
575
|
+
ctx.stdout.write(` model: ${result.config.model}\n`);
|
|
576
|
+
ctx.stdout.write(` source: ${result.config.source}\n`);
|
|
577
|
+
ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
|
|
578
|
+
ctx.stdout.write(` tier: ${result.plannedTier}\n`);
|
|
579
|
+
ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
|
|
580
|
+
for (const [stage, count] of Object.entries(result.corpus.byStage)) {
|
|
581
|
+
ctx.stdout.write(` - ${stage}: ${count}\n`);
|
|
582
|
+
}
|
|
583
|
+
ctx.stdout.write(` verifiers available:\n`);
|
|
584
|
+
for (const [key, value] of Object.entries(result.verifiersAvailable)) {
|
|
585
|
+
ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
|
|
586
|
+
}
|
|
587
|
+
if (result.notes.length > 0) {
|
|
588
|
+
ctx.stdout.write(` notes:\n`);
|
|
589
|
+
for (const note of result.notes) {
|
|
590
|
+
ctx.stdout.write(` - ${note}\n`);
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
return 0;
|
|
594
|
+
}
|
|
595
|
+
if (parsed.evalNoWrite !== true) {
|
|
596
|
+
const jsonPath = await writeJsonReport(ctx.cwd, result);
|
|
597
|
+
const mdPath = await writeMarkdownReport(ctx.cwd, result);
|
|
598
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
|
|
599
|
+
info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
|
|
600
|
+
}
|
|
601
|
+
if (parsed.evalJson === true) {
|
|
602
|
+
ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
|
603
|
+
}
|
|
604
|
+
else {
|
|
605
|
+
ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
|
|
606
|
+
`${result.summary.passed} passed, ` +
|
|
607
|
+
`${result.summary.failed} failed, ` +
|
|
608
|
+
`${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
|
|
609
|
+
}
|
|
610
|
+
return result.summary.failed > 0 ? 1 : 0;
|
|
611
|
+
}
|
|
490
612
|
if (command === "archive") {
|
|
491
613
|
const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
|
|
492
614
|
skipRetro: parsed.archiveSkipRetro === true,
|
package/dist/constants.d.ts
CHANGED
|
@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
|
|
|
4
4
|
export declare const CCLAW_VERSION = "0.1.1";
|
|
5
5
|
export declare const FLOW_VERSION = "1.0.0";
|
|
6
6
|
export declare const DEFAULT_HARNESSES: HarnessId[];
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
/**
|
|
8
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
9
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
10
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
11
|
+
* does not affect non-eval cclaw behavior.
|
|
12
|
+
*/
|
|
13
|
+
export declare const EVALS_ROOT = ".cclaw/evals";
|
|
14
|
+
export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
|
|
15
|
+
export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
16
|
+
export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
|
|
17
|
+
export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
|
|
9
18
|
export declare const COMMAND_FILE_ORDER: FlowStage[];
|
|
10
19
|
export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
|
|
11
20
|
export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];
|
package/dist/constants.js
CHANGED
|
@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
|
|
|
8
8
|
"opencode",
|
|
9
9
|
"codex"
|
|
10
10
|
];
|
|
11
|
+
/**
|
|
12
|
+
* Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
|
|
13
|
+
* verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
|
|
14
|
+
* main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
|
|
15
|
+
* does not affect non-eval cclaw behavior.
|
|
16
|
+
*/
|
|
17
|
+
export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
|
|
18
|
+
export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
|
|
19
|
+
export const EVALS_DIRS = [
|
|
20
|
+
EVALS_ROOT,
|
|
21
|
+
`${EVALS_ROOT}/corpus`,
|
|
22
|
+
`${EVALS_ROOT}/rubrics`,
|
|
23
|
+
`${EVALS_ROOT}/baselines`,
|
|
24
|
+
`${EVALS_ROOT}/reports`
|
|
25
|
+
];
|
|
11
26
|
export const REQUIRED_DIRS = [
|
|
12
27
|
RUNTIME_ROOT,
|
|
13
28
|
`${RUNTIME_ROOT}/commands`,
|
|
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
|
|
|
22
37
|
`${RUNTIME_ROOT}/adapters`,
|
|
23
38
|
`${RUNTIME_ROOT}/agents`,
|
|
24
39
|
`${RUNTIME_ROOT}/hooks`,
|
|
25
|
-
`${RUNTIME_ROOT}/custom-skills`
|
|
40
|
+
`${RUNTIME_ROOT}/custom-skills`,
|
|
41
|
+
...EVALS_DIRS
|
|
26
42
|
];
|
|
27
43
|
export const REQUIRED_GITIGNORE_PATTERNS = [
|
|
28
44
|
"# cclaw generated artifacts",
|
|
29
45
|
`${RUNTIME_ROOT}/`,
|
|
46
|
+
"# cclaw evals: user-owned, track in git",
|
|
47
|
+
`!${EVALS_ROOT}/`,
|
|
48
|
+
`!${EVALS_ROOT}/config.yaml`,
|
|
49
|
+
`!${EVALS_ROOT}/corpus/`,
|
|
50
|
+
`!${EVALS_ROOT}/corpus/**`,
|
|
51
|
+
`!${EVALS_ROOT}/rubrics/`,
|
|
52
|
+
`!${EVALS_ROOT}/rubrics/**`,
|
|
53
|
+
`!${EVALS_ROOT}/baselines/`,
|
|
54
|
+
`!${EVALS_ROOT}/baselines/**`,
|
|
30
55
|
".claude/commands/cc-*.md",
|
|
31
56
|
".claude/commands/cc.md",
|
|
32
57
|
".cursor/commands/cc-*.md",
|
package/dist/content/eval-scaffold.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
|
|
8
|
+
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
|
|
9
|
+
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
|
|
10
|
+
export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
|
|
11
|
+
export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
|
package/dist/content/eval-scaffold.js
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
|
|
3
|
+
* on `cclaw sync` only if the files are missing (user content wins). The
|
|
4
|
+
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
|
+
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
|
+
*/
|
|
7
|
+
export const EVAL_CONFIG_YAML = `# cclaw eval config
|
|
8
|
+
# See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
|
|
9
|
+
#
|
|
10
|
+
# All values can be overridden at runtime with CCLAW_EVAL_* environment
|
|
11
|
+
# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
|
|
12
|
+
provider: zai
|
|
13
|
+
baseUrl: https://api.z.ai/api/coding/paas/v4
|
|
14
|
+
model: glm-5.1
|
|
15
|
+
|
|
16
|
+
# Default fidelity tier when --tier is not supplied.
|
|
17
|
+
# A = single-shot API call (cheap, Wave 7.3)
|
|
18
|
+
# B = SDK with tool use (realistic, Wave 7.4)
|
|
19
|
+
# C = multi-stage workflow (end-to-end, Wave 7.5)
|
|
20
|
+
defaultTier: A
|
|
21
|
+
|
|
22
|
+
# Per-call timeout and retry budget.
|
|
23
|
+
timeoutMs: 120000
|
|
24
|
+
maxRetries: 2
|
|
25
|
+
|
|
26
|
+
# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
|
|
27
|
+
# dailyUsdCap: 5
|
|
28
|
+
|
|
29
|
+
# Regression thresholds used by CI (Wave 7.3+).
|
|
30
|
+
regression:
|
|
31
|
+
# Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
|
|
32
|
+
failIfDeltaBelow: -0.15
|
|
33
|
+
# Fail when any single critical rubric drops below this absolute score.
|
|
34
|
+
failIfCriticalBelow: 3.0
|
|
35
|
+
`;
|
|
36
|
+
export const EVAL_CORPUS_README = `# Eval Corpus
|
|
37
|
+
|
|
38
|
+
Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
|
|
39
|
+
See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
|
|
40
|
+
|
|
41
|
+
Minimal shape:
|
|
42
|
+
|
|
43
|
+
\`\`\`yaml
|
|
44
|
+
id: brainstorm-01
|
|
45
|
+
stage: brainstorm
|
|
46
|
+
input_prompt: |
|
|
47
|
+
One short paragraph describing the user's task.
|
|
48
|
+
context_files: []
|
|
49
|
+
expected:
|
|
50
|
+
# verifier-specific hints; optional in Wave 7.0
|
|
51
|
+
\`\`\`
|
|
52
|
+
|
|
53
|
+
Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
|
|
54
|
+
stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
|
|
55
|
+
projects to exercise Tier B/C sandboxes.
|
|
56
|
+
`;
|
|
57
|
+
export const EVAL_RUBRICS_README = `# Eval Rubrics
|
|
58
|
+
|
|
59
|
+
LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
|
|
60
|
+
scored on a \`1–5\` scale with a rationale:
|
|
61
|
+
|
|
62
|
+
\`\`\`yaml
|
|
63
|
+
stage: brainstorm
|
|
64
|
+
checks:
|
|
65
|
+
- id: distinctness
|
|
66
|
+
prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
|
|
67
|
+
scale: "1-5 where 5=fully distinct approaches"
|
|
68
|
+
weight: 1.0
|
|
69
|
+
\`\`\`
|
|
70
|
+
|
|
71
|
+
Rubric authoring happens when Tier A runs start producing artifacts, so we
|
|
72
|
+
score the *right* properties rather than retrofitting generic quality checks.
|
|
73
|
+
See \`docs/evals.md\` for the full schema.
|
|
74
|
+
`;
|
|
75
|
+
export const EVAL_BASELINES_README = `# Eval Baselines
|
|
76
|
+
|
|
77
|
+
Frozen score snapshots used by regression gates. Baselines are committed to
|
|
78
|
+
git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
|
|
79
|
+
(wired in Wave 7.1).
|
|
80
|
+
|
|
81
|
+
Each baseline file is a JSON document keyed by stage and case id. Do not edit
|
|
82
|
+
by hand; CI will flag baseline churn.
|
|
83
|
+
`;
|
|
84
|
+
export const EVAL_REPORTS_README = `# Eval Reports
|
|
85
|
+
|
|
86
|
+
Generated reports (JSON + Markdown) land here. This directory is gitignored.
|
|
87
|
+
Run \`cclaw eval --dry-run\` to preview configuration without producing a
|
|
88
|
+
report.
|
|
89
|
+
`;
|
package/dist/content/skills.js
CHANGED
|
@@ -157,7 +157,7 @@ ${rows}
|
|
|
157
157
|
function mergedAntiPatterns(schema) {
|
|
158
158
|
const merged = [];
|
|
159
159
|
const seen = new Set();
|
|
160
|
-
for (const item of [...schema.
|
|
160
|
+
for (const item of [...schema.commonRationalizations, ...schema.blockers]) {
|
|
161
161
|
const key = item.trim().toLowerCase();
|
|
162
162
|
if (seen.has(key))
|
|
163
163
|
continue;
|
package/dist/content/stages/brainstorm.js
CHANGED
|
@@ -94,18 +94,14 @@ export const BRAINSTORM = {
|
|
|
94
94
|
"no implementation action taken",
|
|
95
95
|
"artifact reviewed by user"
|
|
96
96
|
],
|
|
97
|
-
|
|
97
|
+
commonRationalizations: [
|
|
98
98
|
"Asking questions without exploring existing project context first",
|
|
99
99
|
"Asking bundled or purely informational questions that don't change decisions",
|
|
100
100
|
"Proposing cosmetic option variants instead of architecturally distinct approaches",
|
|
101
101
|
"Jumping directly into implementation",
|
|
102
|
-
"Requesting approval without stating what decision is being approved"
|
|
103
|
-
],
|
|
104
|
-
redFlags: [
|
|
105
|
-
"No project context exploration before questions",
|
|
102
|
+
"Requesting approval without stating what decision is being approved",
|
|
106
103
|
"Questions that only gather preferences without design impact",
|
|
107
|
-
"Options that are variants of one approach, not distinct alternatives",
|
|
108
|
-
"Approval requested without explicit decision context"
|
|
104
|
+
"Options that are variants of one approach, not distinct alternatives"
|
|
109
105
|
],
|
|
110
106
|
policyNeedles: [
|
|
111
107
|
"Explore project context",
|
package/dist/content/stages/design.js
CHANGED
|
@@ -106,20 +106,17 @@ export const DESIGN = {
|
|
|
106
106
|
"completion dashboard present with all review-section statuses",
|
|
107
107
|
"artifact complete for spec handoff"
|
|
108
108
|
],
|
|
109
|
-
|
|
109
|
+
commonRationalizations: [
|
|
110
110
|
"Architecture deferred to implementation phase",
|
|
111
111
|
"Missing data-flow edge cases",
|
|
112
112
|
"No performance budget for critical path",
|
|
113
113
|
"Batching multiple design issues into one question",
|
|
114
114
|
"Skipping review sections because plan seems simple",
|
|
115
115
|
"Agreeing with user's architecture choice without evaluating alternatives",
|
|
116
|
-
"Hedging every recommendation with 'it depends' instead of taking a position"
|
|
117
|
-
],
|
|
118
|
-
redFlags: [
|
|
116
|
+
"Hedging every recommendation with 'it depends' instead of taking a position",
|
|
119
117
|
"No explicit architecture boundary section",
|
|
120
118
|
"No failure recovery strategy",
|
|
121
119
|
"No defined test/perf baseline",
|
|
122
|
-
"Review sections skipped or condensed",
|
|
123
120
|
"No NOT-in-scope output section",
|
|
124
121
|
"No What-already-exists output section",
|
|
125
122
|
"Design decisions made without reading the actual code first"
|
package/dist/content/stages/plan.js
CHANGED
|
@@ -83,14 +83,12 @@ export const PLAN = {
|
|
|
83
83
|
"artifact ready for TDD execution",
|
|
84
84
|
"acceptance mapping complete"
|
|
85
85
|
],
|
|
86
|
-
|
|
86
|
+
commonRationalizations: [
|
|
87
87
|
"Horizontal decomposition without end-to-end slices",
|
|
88
88
|
"Tasks without verification steps",
|
|
89
89
|
"Starting execution before approval",
|
|
90
90
|
"Tasks that touch multiple unrelated areas",
|
|
91
|
-
"Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions"
|
|
92
|
-
],
|
|
93
|
-
redFlags: [
|
|
91
|
+
"Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions",
|
|
94
92
|
"No dependency graph",
|
|
95
93
|
"No WAIT_FOR_CONFIRM marker",
|
|
96
94
|
"No explicit dependency waves",
|
package/dist/content/stages/review.js
CHANGED
|
@@ -88,14 +88,12 @@ export const REVIEW = {
|
|
|
88
88
|
"critical blockers resolved",
|
|
89
89
|
"ship readiness explicitly stated"
|
|
90
90
|
],
|
|
91
|
-
|
|
91
|
+
commonRationalizations: [
|
|
92
92
|
"Single generic review without layered structure",
|
|
93
93
|
"No severity classification",
|
|
94
94
|
"Shipping with open criticals",
|
|
95
95
|
"Batching multiple findings into one report without individual resolution",
|
|
96
|
-
"Skipping Layer 2 sections because Layer 1 passed"
|
|
97
|
-
],
|
|
98
|
-
redFlags: [
|
|
96
|
+
"Skipping Layer 2 sections because Layer 1 passed",
|
|
99
97
|
"No separate Layer 1/Layer 2 outcomes",
|
|
100
98
|
"No structured review-army reconciliation artifact",
|
|
101
99
|
"No critical bucket",
|
package/dist/content/stages/schema-types.d.ts
CHANGED
|
@@ -74,8 +74,14 @@ export interface StageSchema {
|
|
|
74
74
|
outputs: string[];
|
|
75
75
|
blockers: string[];
|
|
76
76
|
exitCriteria: string[];
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
/**
|
|
78
|
+
* Consolidated "Common Rationalizations" list — things an agent is likely to
|
|
79
|
+
* talk itself into that should stop the stage. Rendered under the
|
|
80
|
+
* "Anti-Patterns & Red Flags" heading in the generated SKILL.md. Replaces
|
|
81
|
+
* the former split between `antiPatterns` and `redFlags`, which produced
|
|
82
|
+
* near-duplicate entries and forced downstream code to merge them anyway.
|
|
83
|
+
*/
|
|
84
|
+
commonRationalizations: string[];
|
|
79
85
|
policyNeedles: string[];
|
|
80
86
|
artifactFile: string;
|
|
81
87
|
next: FlowStage | "done";
|
package/dist/content/stages/scope.js
CHANGED
|
@@ -96,7 +96,7 @@ export const SCOPE = {
|
|
|
96
96
|
"completion dashboard produced",
|
|
97
97
|
"scope summary produced"
|
|
98
98
|
],
|
|
99
|
-
|
|
99
|
+
commonRationalizations: [
|
|
100
100
|
"Scope silently expanded during discussion",
|
|
101
101
|
"No explicit out-of-scope section",
|
|
102
102
|
"Premise accepted without challenge",
|
|
@@ -104,16 +104,12 @@ export const SCOPE = {
|
|
|
104
104
|
"Hedged recommendations that avoid taking a position",
|
|
105
105
|
"Batching multiple scope issues into one question",
|
|
106
106
|
"Re-arguing for smaller scope after user rejects reduction",
|
|
107
|
-
"Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries"
|
|
108
|
-
],
|
|
109
|
-
redFlags: [
|
|
107
|
+
"Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries",
|
|
110
108
|
"No selected mode in artifact",
|
|
111
109
|
"Mode selected without heuristic justification",
|
|
112
110
|
"No discretion section (or explicit `None`) in artifact",
|
|
113
111
|
"No deferred/not-in-scope section",
|
|
114
112
|
"No user approval marker",
|
|
115
|
-
"Premise challenge missing or superficial",
|
|
116
|
-
"No implementation alternatives evaluated",
|
|
117
113
|
"Missing Locked Decisions section or decisions without D-XX IDs"
|
|
118
114
|
],
|
|
119
115
|
policyNeedles: ["Scope mode", "In Scope", "Out of Scope", "Discretion Areas", "NOT in scope", "Premise Challenge", "Locked Decisions"],
|
package/dist/content/stages/ship.js
CHANGED
|
@@ -77,14 +77,12 @@ export const SHIP = {
|
|
|
77
77
|
"rollback and release notes complete",
|
|
78
78
|
"finalization action explicitly chosen and executed"
|
|
79
79
|
],
|
|
80
|
-
|
|
80
|
+
commonRationalizations: [
|
|
81
81
|
"Shipping without rollback strategy",
|
|
82
82
|
"Implicit finalization decision",
|
|
83
83
|
"Bypassing preflight due to urgency",
|
|
84
84
|
"Selecting multiple finalization modes",
|
|
85
|
-
"Shipping with BLOCKED review verdict"
|
|
86
|
-
],
|
|
87
|
-
redFlags: [
|
|
85
|
+
"Shipping with BLOCKED review verdict",
|
|
88
86
|
"No rollback trigger/steps",
|
|
89
87
|
"More than one finalization mode implied",
|
|
90
88
|
"No explicit preflight result",
|
package/dist/content/stages/spec.js
CHANGED
|
@@ -77,14 +77,11 @@ export const SPEC = {
|
|
|
77
77
|
"plan-ready acceptance mapping exists",
|
|
78
78
|
"testability confirmed for all criteria"
|
|
79
79
|
],
|
|
80
|
-
|
|
80
|
+
commonRationalizations: [
|
|
81
81
|
"High-level goals without measurable outcomes",
|
|
82
82
|
"Implicit assumptions",
|
|
83
83
|
"Proceeding to plan before approval",
|
|
84
|
-
"Using vague adjectives (fast, intuitive, robust) without thresholds"
|
|
85
|
-
],
|
|
86
|
-
redFlags: [
|
|
87
|
-
"Criteria use vague language (fast, intuitive, robust) without thresholds",
|
|
84
|
+
"Using vague adjectives (fast, intuitive, robust) without thresholds",
|
|
88
85
|
"No explicit assumptions section",
|
|
89
86
|
"No approval record",
|
|
90
87
|
"No testability mapping",
|
package/dist/content/stages/tdd.js
CHANGED
|
@@ -88,16 +88,14 @@ export const TDD = {
|
|
|
88
88
|
"required gates marked satisfied",
|
|
89
89
|
"traceability annotated"
|
|
90
90
|
],
|
|
91
|
-
|
|
91
|
+
commonRationalizations: [
|
|
92
92
|
"Writing code before failing test",
|
|
93
93
|
"Asserting implementation details instead of behavior",
|
|
94
94
|
"Big-bang implementation across multiple slices",
|
|
95
95
|
"Partial test runs presented as GREEN",
|
|
96
96
|
"Skipping evidence capture",
|
|
97
97
|
"Undocumented refactor changes",
|
|
98
|
-
"Adding features beyond what RED tests require"
|
|
99
|
-
],
|
|
100
|
-
redFlags: [
|
|
98
|
+
"Adding features beyond what RED tests require",
|
|
101
99
|
"No failing test output (RED missing)",
|
|
102
100
|
"Implementation edits appear before RED evidence",
|
|
103
101
|
"No full-suite GREEN evidence",
|
package/dist/eval/config-loader.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
|
|
4
|
+
* with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
|
|
5
|
+
* be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
|
|
6
|
+
* variables (env wins last).
|
|
7
|
+
*/
|
|
8
|
+
export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
|
|
9
|
+
/**
|
|
10
|
+
* Resolve eval config in layered order: defaults -> config.yaml -> env vars.
|
|
11
|
+
* Returns a fully-populated config plus a provenance marker so `--dry-run` can
|
|
12
|
+
* surface where each setting came from.
|
|
13
|
+
*/
|
|
14
|
+
export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;
|