cclaw-cli 0.21.1 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
3
- type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive";
3
+ import type { EvalTier } from "./eval/types.js";
4
+ type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
4
5
  interface ParsedArgs {
5
6
  command?: CommandName;
6
7
  harnesses?: HarnessId[];
@@ -16,6 +17,13 @@ interface ParsedArgs {
16
17
  archiveName?: string;
17
18
  archiveSkipRetro?: boolean;
18
19
  archiveSkipRetroReason?: string;
20
+ evalStage?: string;
21
+ evalTier?: EvalTier;
22
+ evalSchemaOnly?: boolean;
23
+ evalRules?: boolean;
24
+ evalJudge?: boolean;
25
+ evalJson?: boolean;
26
+ evalNoWrite?: boolean;
19
27
  showHelp?: boolean;
20
28
  showVersion?: boolean;
21
29
  }
package/dist/cli.js CHANGED
@@ -13,7 +13,19 @@ import { RUNTIME_ROOT } from "./constants.js";
13
13
  import { createDefaultConfig, createProfileConfig } from "./config.js";
14
14
  import { detectHarnesses } from "./init-detect.js";
15
15
  import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
- const INSTALLER_COMMANDS = ["init", "sync", "doctor", "upgrade", "uninstall", "archive"];
16
+ import { runEval } from "./eval/runner.js";
17
+ import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
18
+ import { EVAL_TIERS } from "./eval/types.js";
19
+ import { FLOW_STAGES } from "./types.js";
20
+ const INSTALLER_COMMANDS = [
21
+ "init",
22
+ "sync",
23
+ "doctor",
24
+ "upgrade",
25
+ "uninstall",
26
+ "archive",
27
+ "eval"
28
+ ];
17
29
  export function usage() {
18
30
  return `cclaw - installer-first flow toolkit
19
31
 
@@ -41,6 +53,15 @@ Commands:
41
53
  Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
42
54
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
43
55
  --retro-reason=<t> Reason for bypassing retro gate.
56
+ eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
57
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
58
+ --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
59
+ --schema-only Run only structural verifiers (Wave 7.1).
60
+ --rules Run structural + rule verifiers (Wave 7.2).
61
+ --judge Include LLM judging (Wave 7.3; requires API key).
62
+ --dry-run Validate config + corpus, print summary, do not execute.
63
+ --json Emit machine-readable JSON on stdout.
64
+ --no-write Skip writing the report to .cclaw/evals/reports/.
44
65
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
45
66
  uninstall Remove .cclaw runtime and the generated harness shim files.
46
67
 
@@ -52,6 +73,8 @@ Examples:
52
73
  cclaw init --harnesses=claude,cursor
53
74
  cclaw doctor --reconcile-gates
54
75
  cclaw archive --name=payments-revamp
76
+ cclaw eval --dry-run
77
+ cclaw eval --stage=brainstorm --schema-only
55
78
 
56
79
  Docs: https://github.com/zuevrs/cclaw
57
80
  Issues: https://github.com/zuevrs/cclaw/issues
@@ -107,6 +130,20 @@ function parseProfile(raw) {
107
130
  }
108
131
  return trimmed;
109
132
  }
133
+ function parseEvalTier(raw) {
134
+ const trimmed = raw.trim().toUpperCase();
135
+ if (!EVAL_TIERS.includes(trimmed)) {
136
+ throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
137
+ }
138
+ return trimmed;
139
+ }
140
+ function parseEvalStage(raw) {
141
+ const trimmed = raw.trim();
142
+ if (!FLOW_STAGES.includes(trimmed)) {
143
+ throw new Error(`Unknown eval stage: ${raw}. Supported: ${FLOW_STAGES.join(", ")}`);
144
+ }
145
+ return trimmed;
146
+ }
110
147
  function isInitPromptAllowed(ctx) {
111
148
  return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
112
149
  }
@@ -390,7 +427,37 @@ function parseArgs(argv) {
390
427
  }
391
428
  if (flag.startsWith("--retro-reason=")) {
392
429
  parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
430
+ continue;
431
+ }
432
+ if (flag.startsWith("--stage=")) {
433
+ parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
434
+ continue;
435
+ }
436
+ if (flag.startsWith("--tier=")) {
437
+ parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
438
+ continue;
439
+ }
440
+ if (flag === "--schema-only") {
441
+ parsed.evalSchemaOnly = true;
442
+ continue;
443
+ }
444
+ if (flag === "--rules") {
445
+ parsed.evalRules = true;
446
+ continue;
447
+ }
448
+ if (flag === "--judge") {
449
+ parsed.evalJudge = true;
450
+ continue;
393
451
  }
452
+ if (flag === "--no-write") {
453
+ parsed.evalNoWrite = true;
454
+ continue;
455
+ }
456
+ }
457
+ // `--json` is shared between doctor and eval. Disambiguate by command.
458
+ if (parsed.command === "eval" && parsed.doctorJson === true) {
459
+ parsed.evalJson = true;
460
+ parsed.doctorJson = undefined;
394
461
  }
395
462
  return parsed;
396
463
  }
@@ -487,6 +554,61 @@ async function runCommand(parsed, ctx) {
487
554
  info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
488
555
  return 0;
489
556
  }
557
+ if (command === "eval") {
558
+ const result = await runEval({
559
+ projectRoot: ctx.cwd,
560
+ stage: parsed.evalStage,
561
+ tier: parsed.evalTier,
562
+ schemaOnly: parsed.evalSchemaOnly === true,
563
+ rules: parsed.evalRules === true,
564
+ judge: parsed.evalJudge === true,
565
+ dryRun: parsed.dryRun === true
566
+ });
567
+ if ("kind" in result) {
568
+ if (parsed.evalJson === true) {
569
+ ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
570
+ return 0;
571
+ }
572
+ ctx.stdout.write(`cclaw eval dry-run\n`);
573
+ ctx.stdout.write(` provider: ${result.config.provider}\n`);
574
+ ctx.stdout.write(` baseUrl: ${result.config.baseUrl}\n`);
575
+ ctx.stdout.write(` model: ${result.config.model}\n`);
576
+ ctx.stdout.write(` source: ${result.config.source}\n`);
577
+ ctx.stdout.write(` apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
578
+ ctx.stdout.write(` tier: ${result.plannedTier}\n`);
579
+ ctx.stdout.write(` corpus: ${result.corpus.total} case(s)\n`);
580
+ for (const [stage, count] of Object.entries(result.corpus.byStage)) {
581
+ ctx.stdout.write(` - ${stage}: ${count}\n`);
582
+ }
583
+ ctx.stdout.write(` verifiers available:\n`);
584
+ for (const [key, value] of Object.entries(result.verifiersAvailable)) {
585
+ ctx.stdout.write(` - ${key}: ${value ? "yes" : "no"}\n`);
586
+ }
587
+ if (result.notes.length > 0) {
588
+ ctx.stdout.write(` notes:\n`);
589
+ for (const note of result.notes) {
590
+ ctx.stdout.write(` - ${note}\n`);
591
+ }
592
+ }
593
+ return 0;
594
+ }
595
+ if (parsed.evalNoWrite !== true) {
596
+ const jsonPath = await writeJsonReport(ctx.cwd, result);
597
+ const mdPath = await writeMarkdownReport(ctx.cwd, result);
598
+ info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
599
+ info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
600
+ }
601
+ if (parsed.evalJson === true) {
602
+ ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
603
+ }
604
+ else {
605
+ ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
606
+ `${result.summary.passed} passed, ` +
607
+ `${result.summary.failed} failed, ` +
608
+ `${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
609
+ }
610
+ return result.summary.failed > 0 ? 1 : 0;
611
+ }
490
612
  if (command === "archive") {
491
613
  const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
492
614
  skipRetro: parsed.archiveSkipRetro === true,
@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
4
4
  export declare const CCLAW_VERSION = "0.1.1";
5
5
  export declare const FLOW_VERSION = "1.0.0";
6
6
  export declare const DEFAULT_HARNESSES: HarnessId[];
7
- export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills"];
8
- export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
7
+ /**
8
+ * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
9
+ * verifiers and LLM wiring arrive in Waves 7.1–7.5. Declaring these paths as their
10
+ * own EVALS_DIRS list (still included in REQUIRED_DIRS) makes it explicit that the evals runtime is additive and
11
+ * does not affect non-eval cclaw behavior.
12
+ */
13
+ export declare const EVALS_ROOT = ".cclaw/evals";
14
+ export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
15
+ export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
16
+ export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
17
+ export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
9
18
  export declare const COMMAND_FILE_ORDER: FlowStage[];
10
19
  export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
11
20
  export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];
package/dist/constants.js CHANGED
@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
8
8
  "opencode",
9
9
  "codex"
10
10
  ];
11
+ /**
12
+ * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
13
+ * verifiers and LLM wiring arrive in Waves 7.1–7.5. Declaring these paths as their
14
+ * own EVALS_DIRS list (spread into REQUIRED_DIRS below) makes it explicit that the evals runtime is additive and
15
+ * does not affect non-eval cclaw behavior.
16
+ */
17
+ export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
18
+ export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
19
+ export const EVALS_DIRS = [
20
+ EVALS_ROOT,
21
+ `${EVALS_ROOT}/corpus`,
22
+ `${EVALS_ROOT}/rubrics`,
23
+ `${EVALS_ROOT}/baselines`,
24
+ `${EVALS_ROOT}/reports`
25
+ ];
11
26
  export const REQUIRED_DIRS = [
12
27
  RUNTIME_ROOT,
13
28
  `${RUNTIME_ROOT}/commands`,
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
22
37
  `${RUNTIME_ROOT}/adapters`,
23
38
  `${RUNTIME_ROOT}/agents`,
24
39
  `${RUNTIME_ROOT}/hooks`,
25
- `${RUNTIME_ROOT}/custom-skills`
40
+ `${RUNTIME_ROOT}/custom-skills`,
41
+ ...EVALS_DIRS
26
42
  ];
27
43
  export const REQUIRED_GITIGNORE_PATTERNS = [
28
44
  "# cclaw generated artifacts",
29
45
  `${RUNTIME_ROOT}/`,
46
+ "# cclaw evals: user-owned, track in git",
47
+ `!${EVALS_ROOT}/`,
48
+ `!${EVALS_ROOT}/config.yaml`,
49
+ `!${EVALS_ROOT}/corpus/`,
50
+ `!${EVALS_ROOT}/corpus/**`,
51
+ `!${EVALS_ROOT}/rubrics/`,
52
+ `!${EVALS_ROOT}/rubrics/**`,
53
+ `!${EVALS_ROOT}/baselines/`,
54
+ `!${EVALS_ROOT}/baselines/**`,
30
55
  ".claude/commands/cc-*.md",
31
56
  ".claude/commands/cc.md",
32
57
  ".cursor/commands/cc-*.md",
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
3
+ * on `cclaw sync` only if the files are missing (user content wins). The
4
+ * scaffold is intentionally minimal: a usable default config plus short
5
+ * READMEs that point at `docs/evals.md` for authoring guidance.
6
+ */
7
+ export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
+ export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
9
+ export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
10
+ export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
11
+ export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
3
+ * on `cclaw sync` only if the files are missing (user content wins). The
4
+ * scaffold is intentionally minimal: a usable default config plus short
5
+ * READMEs that point at `docs/evals.md` for authoring guidance.
6
+ */
7
+ export const EVAL_CONFIG_YAML = `# cclaw eval config
8
+ # See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
9
+ #
10
+ # All values can be overridden at runtime with CCLAW_EVAL_* environment
11
+ # variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
12
+ provider: zai
13
+ baseUrl: https://api.z.ai/api/coding/paas/v4
14
+ model: glm-5.1
15
+
16
+ # Default fidelity tier when --tier is not supplied.
17
+ # A = single-shot API call (cheap, Wave 7.3)
18
+ # B = SDK with tool use (realistic, Wave 7.4)
19
+ # C = multi-stage workflow (end-to-end, Wave 7.5)
20
+ defaultTier: A
21
+
22
+ # Per-call timeout and retry budget.
23
+ timeoutMs: 120000
24
+ maxRetries: 2
25
+
26
+ # Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
27
+ # dailyUsdCap: 5
28
+
29
+ # Regression thresholds used by CI (Wave 7.3+).
30
+ regression:
31
+ # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
32
+ failIfDeltaBelow: -0.15
33
+ # Fail when any single critical rubric drops below this absolute score.
34
+ failIfCriticalBelow: 3.0
35
+ `;
36
+ export const EVAL_CORPUS_README = `# Eval Corpus
37
+
38
+ Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
39
+ See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
40
+
41
+ Minimal shape:
42
+
43
+ \`\`\`yaml
44
+ id: brainstorm-01
45
+ stage: brainstorm
46
+ input_prompt: |
47
+ One short paragraph describing the user's task.
48
+ context_files: []
49
+ expected:
50
+ # verifier-specific hints; optional in Wave 7.0
51
+ \`\`\`
52
+
53
+ Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
54
+ stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
55
+ projects to exercise Tier B/C sandboxes.
56
+ `;
57
+ export const EVAL_RUBRICS_README = `# Eval Rubrics
58
+
59
+ LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
60
+ scored on a \`1–5\` scale with a rationale:
61
+
62
+ \`\`\`yaml
63
+ stage: brainstorm
64
+ checks:
65
+ - id: distinctness
66
+ prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
67
+ scale: "1-5 where 5=fully distinct approaches"
68
+ weight: 1.0
69
+ \`\`\`
70
+
71
+ Rubric authoring happens when Tier A runs start producing artifacts, so we
72
+ score the *right* properties rather than retrofitting generic quality checks.
73
+ See \`docs/evals.md\` for the full schema.
74
+ `;
75
+ export const EVAL_BASELINES_README = `# Eval Baselines
76
+
77
+ Frozen score snapshots used by regression gates. Baselines are committed to
78
+ git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
79
+ (wired in Wave 7.1).
80
+
81
+ Each baseline file is a JSON document keyed by stage and case id. Do not edit
82
+ by hand; CI will flag baseline churn.
83
+ `;
84
+ export const EVAL_REPORTS_README = `# Eval Reports
85
+
86
+ Generated reports (JSON + Markdown) land here. This directory is gitignored.
87
+ Run \`cclaw eval --dry-run\` to preview configuration without producing a
88
+ report.
89
+ `;
@@ -157,7 +157,7 @@ ${rows}
157
157
  function mergedAntiPatterns(schema) {
158
158
  const merged = [];
159
159
  const seen = new Set();
160
- for (const item of [...schema.antiPatterns, ...schema.blockers, ...schema.redFlags]) {
160
+ for (const item of [...schema.commonRationalizations, ...schema.blockers]) {
161
161
  const key = item.trim().toLowerCase();
162
162
  if (seen.has(key))
163
163
  continue;
@@ -94,18 +94,14 @@ export const BRAINSTORM = {
94
94
  "no implementation action taken",
95
95
  "artifact reviewed by user"
96
96
  ],
97
- antiPatterns: [
97
+ commonRationalizations: [
98
98
  "Asking questions without exploring existing project context first",
99
99
  "Asking bundled or purely informational questions that don't change decisions",
100
100
  "Proposing cosmetic option variants instead of architecturally distinct approaches",
101
101
  "Jumping directly into implementation",
102
- "Requesting approval without stating what decision is being approved"
103
- ],
104
- redFlags: [
105
- "No project context exploration before questions",
102
+ "Requesting approval without stating what decision is being approved",
106
103
  "Questions that only gather preferences without design impact",
107
- "Options that are variants of one approach, not distinct alternatives",
108
- "Approval requested without explicit decision context"
104
+ "Options that are variants of one approach, not distinct alternatives"
109
105
  ],
110
106
  policyNeedles: [
111
107
  "Explore project context",
@@ -106,20 +106,17 @@ export const DESIGN = {
106
106
  "completion dashboard present with all review-section statuses",
107
107
  "artifact complete for spec handoff"
108
108
  ],
109
- antiPatterns: [
109
+ commonRationalizations: [
110
110
  "Architecture deferred to implementation phase",
111
111
  "Missing data-flow edge cases",
112
112
  "No performance budget for critical path",
113
113
  "Batching multiple design issues into one question",
114
114
  "Skipping review sections because plan seems simple",
115
115
  "Agreeing with user's architecture choice without evaluating alternatives",
116
- "Hedging every recommendation with 'it depends' instead of taking a position"
117
- ],
118
- redFlags: [
116
+ "Hedging every recommendation with 'it depends' instead of taking a position",
119
117
  "No explicit architecture boundary section",
120
118
  "No failure recovery strategy",
121
119
  "No defined test/perf baseline",
122
- "Review sections skipped or condensed",
123
120
  "No NOT-in-scope output section",
124
121
  "No What-already-exists output section",
125
122
  "Design decisions made without reading the actual code first"
@@ -83,14 +83,12 @@ export const PLAN = {
83
83
  "artifact ready for TDD execution",
84
84
  "acceptance mapping complete"
85
85
  ],
86
- antiPatterns: [
86
+ commonRationalizations: [
87
87
  "Horizontal decomposition without end-to-end slices",
88
88
  "Tasks without verification steps",
89
89
  "Starting execution before approval",
90
90
  "Tasks that touch multiple unrelated areas",
91
- "Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions"
92
- ],
93
- redFlags: [
91
+ "Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions",
94
92
  "No dependency graph",
95
93
  "No WAIT_FOR_CONFIRM marker",
96
94
  "No explicit dependency waves",
@@ -88,14 +88,12 @@ export const REVIEW = {
88
88
  "critical blockers resolved",
89
89
  "ship readiness explicitly stated"
90
90
  ],
91
- antiPatterns: [
91
+ commonRationalizations: [
92
92
  "Single generic review without layered structure",
93
93
  "No severity classification",
94
94
  "Shipping with open criticals",
95
95
  "Batching multiple findings into one report without individual resolution",
96
- "Skipping Layer 2 sections because Layer 1 passed"
97
- ],
98
- redFlags: [
96
+ "Skipping Layer 2 sections because Layer 1 passed",
99
97
  "No separate Layer 1/Layer 2 outcomes",
100
98
  "No structured review-army reconciliation artifact",
101
99
  "No critical bucket",
@@ -74,8 +74,14 @@ export interface StageSchema {
74
74
  outputs: string[];
75
75
  blockers: string[];
76
76
  exitCriteria: string[];
77
- antiPatterns: string[];
78
- redFlags: string[];
77
+ /**
78
+ * Consolidated "Common Rationalizations" list — things an agent is likely to
79
+ * talk itself into that should stop the stage. Rendered under the
80
+ * "Anti-Patterns & Red Flags" heading in the generated SKILL.md. Replaces
81
+ * the former split between `antiPatterns` and `redFlags`, which produced
82
+ * near-duplicate entries and forced downstream code to merge them anyway.
83
+ */
84
+ commonRationalizations: string[];
79
85
  policyNeedles: string[];
80
86
  artifactFile: string;
81
87
  next: FlowStage | "done";
@@ -96,7 +96,7 @@ export const SCOPE = {
96
96
  "completion dashboard produced",
97
97
  "scope summary produced"
98
98
  ],
99
- antiPatterns: [
99
+ commonRationalizations: [
100
100
  "Scope silently expanded during discussion",
101
101
  "No explicit out-of-scope section",
102
102
  "Premise accepted without challenge",
@@ -104,16 +104,12 @@ export const SCOPE = {
104
104
  "Hedged recommendations that avoid taking a position",
105
105
  "Batching multiple scope issues into one question",
106
106
  "Re-arguing for smaller scope after user rejects reduction",
107
- "Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries"
108
- ],
109
- redFlags: [
107
+ "Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries",
110
108
  "No selected mode in artifact",
111
109
  "Mode selected without heuristic justification",
112
110
  "No discretion section (or explicit `None`) in artifact",
113
111
  "No deferred/not-in-scope section",
114
112
  "No user approval marker",
115
- "Premise challenge missing or superficial",
116
- "No implementation alternatives evaluated",
117
113
  "Missing Locked Decisions section or decisions without D-XX IDs"
118
114
  ],
119
115
  policyNeedles: ["Scope mode", "In Scope", "Out of Scope", "Discretion Areas", "NOT in scope", "Premise Challenge", "Locked Decisions"],
@@ -77,14 +77,12 @@ export const SHIP = {
77
77
  "rollback and release notes complete",
78
78
  "finalization action explicitly chosen and executed"
79
79
  ],
80
- antiPatterns: [
80
+ commonRationalizations: [
81
81
  "Shipping without rollback strategy",
82
82
  "Implicit finalization decision",
83
83
  "Bypassing preflight due to urgency",
84
84
  "Selecting multiple finalization modes",
85
- "Shipping with BLOCKED review verdict"
86
- ],
87
- redFlags: [
85
+ "Shipping with BLOCKED review verdict",
88
86
  "No rollback trigger/steps",
89
87
  "More than one finalization mode implied",
90
88
  "No explicit preflight result",
@@ -77,14 +77,11 @@ export const SPEC = {
77
77
  "plan-ready acceptance mapping exists",
78
78
  "testability confirmed for all criteria"
79
79
  ],
80
- antiPatterns: [
80
+ commonRationalizations: [
81
81
  "High-level goals without measurable outcomes",
82
82
  "Implicit assumptions",
83
83
  "Proceeding to plan before approval",
84
- "Using vague adjectives (fast, intuitive, robust) without thresholds"
85
- ],
86
- redFlags: [
87
- "Criteria use vague language (fast, intuitive, robust) without thresholds",
84
+ "Using vague adjectives (fast, intuitive, robust) without thresholds",
88
85
  "No explicit assumptions section",
89
86
  "No approval record",
90
87
  "No testability mapping",
@@ -88,16 +88,14 @@ export const TDD = {
88
88
  "required gates marked satisfied",
89
89
  "traceability annotated"
90
90
  ],
91
- antiPatterns: [
91
+ commonRationalizations: [
92
92
  "Writing code before failing test",
93
93
  "Asserting implementation details instead of behavior",
94
94
  "Big-bang implementation across multiple slices",
95
95
  "Partial test runs presented as GREEN",
96
96
  "Skipping evidence capture",
97
97
  "Undocumented refactor changes",
98
- "Adding features beyond what RED tests require"
99
- ],
100
- redFlags: [
98
+ "Adding features beyond what RED tests require",
101
99
  "No failing test output (RED missing)",
102
100
  "Implementation edits appear before RED evidence",
103
101
  "No full-suite GREEN evidence",
@@ -0,0 +1,14 @@
1
+ import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
2
+ /**
3
+ * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
4
+ * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
5
+ * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
6
+ * variables (env is applied last, so it wins).
7
+ */
8
+ export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
9
+ /**
10
+ * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
11
+ * Returns a fully-populated config plus a provenance marker so `--dry-run` can
12
+ * surface where each setting came from.
13
+ */
14
+ export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;