npm - cclaw-cli - Versions diffs - 0.21.1 → 0.22.0 - Mend

cclaw-cli 0.21.1 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/dist/cli.d.ts +9 -1
package/dist/cli.js +123 -1
package/dist/constants.d.ts +11 -2
package/dist/constants.js +26 -1
package/dist/content/eval-scaffold.d.ts +11 -0
package/dist/content/eval-scaffold.js +89 -0
package/dist/content/skills.js +1 -1
package/dist/content/stages/brainstorm.js +3 -7
package/dist/content/stages/design.js +2 -5
package/dist/content/stages/plan.js +2 -4
package/dist/content/stages/review.js +2 -4
package/dist/content/stages/schema-types.d.ts +8 -2
package/dist/content/stages/scope.js +2 -6
package/dist/content/stages/ship.js +2 -4
package/dist/content/stages/spec.js +2 -5
package/dist/content/stages/tdd.js +2 -4
package/dist/eval/config-loader.d.ts +14 -0
package/dist/eval/config-loader.js +237 -0
package/dist/eval/corpus.d.ts +8 -0
package/dist/eval/corpus.js +91 -0
package/dist/eval/llm-client.d.ts +62 -0
package/dist/eval/llm-client.js +19 -0
package/dist/eval/report.d.ts +11 -0
package/dist/eval/report.js +88 -0
package/dist/eval/runner.d.ts +53 -0
package/dist/eval/runner.js +96 -0
package/dist/eval/types.d.ts +136 -0
package/dist/eval/types.js +15 -0
package/dist/install.js +22 -0
package/dist/runs.d.ts +0 -18
package/dist/runs.js +1 -188
package/package.json +1 -1

package/dist/cli.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 import type { FlowTrack, HarnessId, InitProfile } from "./types.js";
-type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive";
+import type { EvalTier } from "./eval/types.js";
+type CommandName = "init" | "sync" | "doctor" | "upgrade" | "uninstall" | "archive" | "eval";
 interface ParsedArgs {
     command?: CommandName;
     harnesses?: HarnessId[];
@@ -16,6 +17,13 @@ interface ParsedArgs {
     archiveName?: string;
     archiveSkipRetro?: boolean;
     archiveSkipRetroReason?: string;
+    evalStage?: string;
+    evalTier?: EvalTier;
+    evalSchemaOnly?: boolean;
+    evalRules?: boolean;
+    evalJudge?: boolean;
+    evalJson?: boolean;
+    evalNoWrite?: boolean;
     showHelp?: boolean;
     showVersion?: boolean;
 }

package/dist/cli.js CHANGED Viewed

@@ -13,7 +13,19 @@ import { RUNTIME_ROOT } from "./constants.js";
 import { createDefaultConfig, createProfileConfig } from "./config.js";
 import { detectHarnesses } from "./init-detect.js";
 import { HARNESS_ADAPTERS } from "./harness-adapters.js";
-const INSTALLER_COMMANDS = ["init", "sync", "doctor", "upgrade", "uninstall", "archive"];
+import { runEval } from "./eval/runner.js";
+import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
+import { EVAL_TIERS } from "./eval/types.js";
+import { FLOW_STAGES } from "./types.js";
+const INSTALLER_COMMANDS = [
+    "init",
+    "sync",
+    "doctor",
+    "upgrade",
+    "uninstall",
+    "archive",
+    "eval"
+];
 export function usage() {
     return `cclaw - installer-first flow toolkit
@@ -41,6 +53,15 @@ Commands:
              Flags: --name=<feature>    Feature slug (default: inferred from 00-idea.md).
                     --skip-retro       Bypass mandatory retro gate (requires --retro-reason).
                     --retro-reason=<t> Reason for bypassing retro gate.
+  eval       Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
+             Flags: --stage=<id>        Limit to one flow stage (${FLOW_STAGES.join("|")}).
+                    --tier=<A|B|C>      Fidelity tier (A=single-shot, B=tools, C=workflow).
+                    --schema-only       Run only structural verifiers (Wave 7.1).
+                    --rules             Run structural + rule verifiers (Wave 7.2).
+                    --judge             Include LLM judging (Wave 7.3; requires API key).
+                    --dry-run           Validate config + corpus, print summary, do not execute.
+                    --json              Emit machine-readable JSON on stdout.
+                    --no-write          Skip writing the report to .cclaw/evals/reports/.
   upgrade    Refresh generated files in .cclaw without modifying user artifacts.
   uninstall  Remove .cclaw runtime and the generated harness shim files.
@@ -52,6 +73,8 @@ Examples:
   cclaw init --harnesses=claude,cursor
   cclaw doctor --reconcile-gates
   cclaw archive --name=payments-revamp
+  cclaw eval --dry-run
+  cclaw eval --stage=brainstorm --schema-only
 Docs:   https://github.com/zuevrs/cclaw
 Issues: https://github.com/zuevrs/cclaw/issues
@@ -107,6 +130,20 @@ function parseProfile(raw) {
     }
     return trimmed;
 }
+function parseEvalTier(raw) {
+    const trimmed = raw.trim().toUpperCase();
+    if (!EVAL_TIERS.includes(trimmed)) {
+        throw new Error(`Unknown eval tier: ${raw}. Supported: ${EVAL_TIERS.join(", ")}`);
+    }
+    return trimmed;
+}
+function parseEvalStage(raw) {
+    const trimmed = raw.trim();
+    if (!FLOW_STAGES.includes(trimmed)) {
+        throw new Error(`Unknown eval stage: ${raw}. Supported: ${FLOW_STAGES.join(", ")}`);
+    }
+    return trimmed;
+}
 function isInitPromptAllowed(ctx) {
     return Boolean(process.stdin.isTTY && ctx.stdout.isTTY);
 }
@@ -390,7 +427,37 @@ function parseArgs(argv) {
         }
         if (flag.startsWith("--retro-reason=")) {
             parsed.archiveSkipRetroReason = flag.replace("--retro-reason=", "").trim();
+            continue;
+        }
+        if (flag.startsWith("--stage=")) {
+            parsed.evalStage = parseEvalStage(flag.replace("--stage=", ""));
+            continue;
+        }
+        if (flag.startsWith("--tier=")) {
+            parsed.evalTier = parseEvalTier(flag.replace("--tier=", ""));
+            continue;
+        }
+        if (flag === "--schema-only") {
+            parsed.evalSchemaOnly = true;
+            continue;
+        }
+        if (flag === "--rules") {
+            parsed.evalRules = true;
+            continue;
+        }
+        if (flag === "--judge") {
+            parsed.evalJudge = true;
+            continue;
         }
+        if (flag === "--no-write") {
+            parsed.evalNoWrite = true;
+            continue;
+        }
+    }
+    // `--json` is shared between doctor and eval. Disambiguate by command.
+    if (parsed.command === "eval" && parsed.doctorJson === true) {
+        parsed.evalJson = true;
+        parsed.doctorJson = undefined;
     }
     return parsed;
 }
@@ -487,6 +554,61 @@ async function runCommand(parsed, ctx) {
         info(ctx, "Upgraded .cclaw runtime and regenerated generated files");
         return 0;
     }
+    if (command === "eval") {
+        const result = await runEval({
+            projectRoot: ctx.cwd,
+            stage: parsed.evalStage,
+            tier: parsed.evalTier,
+            schemaOnly: parsed.evalSchemaOnly === true,
+            rules: parsed.evalRules === true,
+            judge: parsed.evalJudge === true,
+            dryRun: parsed.dryRun === true
+        });
+        if ("kind" in result) {
+            if (parsed.evalJson === true) {
+                ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
+                return 0;
+            }
+            ctx.stdout.write(`cclaw eval dry-run\n`);
+            ctx.stdout.write(`  provider: ${result.config.provider}\n`);
+            ctx.stdout.write(`  baseUrl: ${result.config.baseUrl}\n`);
+            ctx.stdout.write(`  model: ${result.config.model}\n`);
+            ctx.stdout.write(`  source: ${result.config.source}\n`);
+            ctx.stdout.write(`  apiKey: ${result.config.apiKey ? "set" : "unset"}\n`);
+            ctx.stdout.write(`  tier: ${result.plannedTier}\n`);
+            ctx.stdout.write(`  corpus: ${result.corpus.total} case(s)\n`);
+            for (const [stage, count] of Object.entries(result.corpus.byStage)) {
+                ctx.stdout.write(`    - ${stage}: ${count}\n`);
+            }
+            ctx.stdout.write(`  verifiers available:\n`);
+            for (const [key, value] of Object.entries(result.verifiersAvailable)) {
+                ctx.stdout.write(`    - ${key}: ${value ? "yes" : "no"}\n`);
+            }
+            if (result.notes.length > 0) {
+                ctx.stdout.write(`  notes:\n`);
+                for (const note of result.notes) {
+                    ctx.stdout.write(`    - ${note}\n`);
+                }
+            }
+            return 0;
+        }
+        if (parsed.evalNoWrite !== true) {
+            const jsonPath = await writeJsonReport(ctx.cwd, result);
+            const mdPath = await writeMarkdownReport(ctx.cwd, result);
+            info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
+            info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
+        }
+        if (parsed.evalJson === true) {
+            ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
+        }
+        else {
+            ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
+                `${result.summary.passed} passed, ` +
+                `${result.summary.failed} failed, ` +
+                `${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
+        }
+        return result.summary.failed > 0 ? 1 : 0;
+    }
     if (command === "archive") {
         const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
             skipRetro: parsed.archiveSkipRetro === true,

package/dist/constants.d.ts CHANGED Viewed

@@ -4,8 +4,17 @@ export declare const RUNTIME_ROOT = ".cclaw";
 export declare const CCLAW_VERSION = "0.1.1";
 export declare const FLOW_VERSION = "1.0.0";
 export declare const DEFAULT_HARNESSES: HarnessId[];
-export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills"];
-export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
+/**
+ * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
+ * verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
+ * main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
+ * does not affect non-eval cclaw behavior.
+ */
+export declare const EVALS_ROOT = ".cclaw/evals";
+export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
+export declare const EVALS_DIRS: readonly [".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
+export declare const REQUIRED_DIRS: readonly [".cclaw", ".cclaw/commands", ".cclaw/skills", ".cclaw/contexts", ".cclaw/templates", ".cclaw/artifacts", ".cclaw/worktrees", ".cclaw/state", ".cclaw/runs", ".cclaw/rules", ".cclaw/adapters", ".cclaw/agents", ".cclaw/hooks", ".cclaw/custom-skills", ".cclaw/evals", ".cclaw/evals/corpus", ".cclaw/evals/rubrics", ".cclaw/evals/baselines", ".cclaw/evals/reports"];
+export declare const REQUIRED_GITIGNORE_PATTERNS: readonly ["# cclaw generated artifacts", ".cclaw/", "# cclaw evals: user-owned, track in git", "!.cclaw/evals/", "!.cclaw/evals/config.yaml", "!.cclaw/evals/corpus/", "!.cclaw/evals/corpus/**", "!.cclaw/evals/rubrics/", "!.cclaw/evals/rubrics/**", "!.cclaw/evals/baselines/", "!.cclaw/evals/baselines/**", ".claude/commands/cc-*.md", ".claude/commands/cc.md", ".cursor/commands/cc-*.md", ".cursor/commands/cc.md", ".opencode/commands/cc-*.md", ".opencode/commands/cc.md", ".codex/commands/cc-*.md", ".codex/commands/cc.md", ".claude/hooks/hooks.json", ".cursor/hooks.json", ".codex/hooks.json", ".opencode/plugins/cclaw-plugin.mjs", ".cursor/rules/cclaw-workflow.mdc"];
 export declare const COMMAND_FILE_ORDER: FlowStage[];
 export declare const UTILITY_COMMANDS: readonly ["learn", "next", "ideate", "view", "status", "tree", "diff", "ops", "feature", "tdd-log", "retro", "compound", "archive", "rewind"];
 export declare const SUBAGENT_SKILL_FOLDERS: readonly ["subagent-dev", "parallel-dispatch"];

package/dist/constants.js CHANGED Viewed

@@ -8,6 +8,21 @@ export const DEFAULT_HARNESSES = [
     "opencode",
     "codex"
 ];
+/**
+ * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
+ * verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
+ * main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
+ * does not affect non-eval cclaw behavior.
+ */
+export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
+export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
+export const EVALS_DIRS = [
+    EVALS_ROOT,
+    `${EVALS_ROOT}/corpus`,
+    `${EVALS_ROOT}/rubrics`,
+    `${EVALS_ROOT}/baselines`,
+    `${EVALS_ROOT}/reports`
+];
 export const REQUIRED_DIRS = [
     RUNTIME_ROOT,
     `${RUNTIME_ROOT}/commands`,
@@ -22,11 +37,21 @@ export const REQUIRED_DIRS = [
     `${RUNTIME_ROOT}/adapters`,
     `${RUNTIME_ROOT}/agents`,
     `${RUNTIME_ROOT}/hooks`,
-    `${RUNTIME_ROOT}/custom-skills`
+    `${RUNTIME_ROOT}/custom-skills`,
+    ...EVALS_DIRS
 ];
 export const REQUIRED_GITIGNORE_PATTERNS = [
     "# cclaw generated artifacts",
     `${RUNTIME_ROOT}/`,
+    "# cclaw evals: user-owned, track in git",
+    `!${EVALS_ROOT}/`,
+    `!${EVALS_ROOT}/config.yaml`,
+    `!${EVALS_ROOT}/corpus/`,
+    `!${EVALS_ROOT}/corpus/**`,
+    `!${EVALS_ROOT}/rubrics/`,
+    `!${EVALS_ROOT}/rubrics/**`,
+    `!${EVALS_ROOT}/baselines/`,
+    `!${EVALS_ROOT}/baselines/**`,
     ".claude/commands/cc-*.md",
     ".claude/commands/cc.md",
     ".cursor/commands/cc-*.md",

package/dist/content/eval-scaffold.d.ts ADDED Viewed

@@ -0,0 +1,11 @@
+/**
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
+ * on `cclaw sync` only if the files are missing (user content wins). The
+ * scaffold is intentionally minimal: a usable default config plus short
+ * READMEs that point at `docs/evals.md` for authoring guidance.
+ */
+export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n#   A = single-shot API call (cheap, Wave 7.3)\n#   B = SDK with tool use     (realistic, Wave 7.4)\n#   C = multi-stage workflow  (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n  # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n  failIfDeltaBelow: -0.15\n  # Fail when any single critical rubric drops below this absolute score.\n  failIfCriticalBelow: 3.0\n";
+export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n  One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n  # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
+export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n  - id: distinctness\n    prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n    scale: \"1-5 where 5=fully distinct approaches\"\n    weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
+export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
+export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";

package/dist/content/eval-scaffold.js ADDED Viewed

@@ -0,0 +1,89 @@
+/**
+ * Static scaffold for `.cclaw/evals/`. Written on `cclaw init` and refreshed
+ * on `cclaw sync` only if the files are missing (user content wins). The
+ * scaffold is intentionally minimal: a usable default config plus short
+ * READMEs that point at `docs/evals.md` for authoring guidance.
+ */
+export const EVAL_CONFIG_YAML = `# cclaw eval config
+# See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
+#
+# All values can be overridden at runtime with CCLAW_EVAL_* environment
+# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
+provider: zai
+baseUrl: https://api.z.ai/api/coding/paas/v4
+model: glm-5.1
+# Default fidelity tier when --tier is not supplied.
+#   A = single-shot API call (cheap, Wave 7.3)
+#   B = SDK with tool use     (realistic, Wave 7.4)
+#   C = multi-stage workflow  (end-to-end, Wave 7.5)
+defaultTier: A
+# Per-call timeout and retry budget.
+timeoutMs: 120000
+maxRetries: 2
+# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
+# dailyUsdCap: 5
+# Regression thresholds used by CI (Wave 7.3+).
+regression:
+  # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
+  failIfDeltaBelow: -0.15
+  # Fail when any single critical rubric drops below this absolute score.
+  failIfCriticalBelow: 3.0
+`;
+export const EVAL_CORPUS_README = `# Eval Corpus
+Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
+See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
+Minimal shape:
+\`\`\`yaml
+id: brainstorm-01
+stage: brainstorm
+input_prompt: |
+  One short paragraph describing the user's task.
+context_files: []
+expected:
+  # verifier-specific hints; optional in Wave 7.0
+\`\`\`
+Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
+stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
+projects to exercise Tier B/C sandboxes.
+`;
+export const EVAL_RUBRICS_README = `# Eval Rubrics
+LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
+scored on a \`1–5\` scale with a rationale:
+\`\`\`yaml
+stage: brainstorm
+checks:
+  - id: distinctness
+    prompt: "Are the proposed directions genuinely distinct (not rephrasings)?"
+    scale: "1-5 where 5=fully distinct approaches"
+    weight: 1.0
+\`\`\`
+Rubric authoring happens when Tier A runs start producing artifacts, so we
+score the *right* properties rather than retrofitting generic quality checks.
+See \`docs/evals.md\` for the full schema.
+`;
+export const EVAL_BASELINES_README = `# Eval Baselines
+Frozen score snapshots used by regression gates. Baselines are committed to
+git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
+(wired in Wave 7.1).
+Each baseline file is a JSON document keyed by stage and case id. Do not edit
+by hand; CI will flag baseline churn.
+`;
+export const EVAL_REPORTS_README = `# Eval Reports
+Generated reports (JSON + Markdown) land here. This directory is gitignored.
+Run \`cclaw eval --dry-run\` to preview configuration without producing a
+report.
+`;

package/dist/content/skills.js CHANGED Viewed

@@ -157,7 +157,7 @@ ${rows}
 function mergedAntiPatterns(schema) {
     const merged = [];
     const seen = new Set();
-    for (const item of [...schema.antiPatterns, ...schema.blockers, ...schema.redFlags]) {
+    for (const item of [...schema.commonRationalizations, ...schema.blockers]) {
         const key = item.trim().toLowerCase();
         if (seen.has(key))
             continue;

package/dist/content/stages/brainstorm.js CHANGED Viewed

@@ -94,18 +94,14 @@ export const BRAINSTORM = {
         "no implementation action taken",
         "artifact reviewed by user"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Asking questions without exploring existing project context first",
         "Asking bundled or purely informational questions that don't change decisions",
         "Proposing cosmetic option variants instead of architecturally distinct approaches",
         "Jumping directly into implementation",
-        "Requesting approval without stating what decision is being approved"
-    ],
-    redFlags: [
-        "No project context exploration before questions",
+        "Requesting approval without stating what decision is being approved",
         "Questions that only gather preferences without design impact",
-        "Options that are variants of one approach, not distinct alternatives",
-        "Approval requested without explicit decision context"
+        "Options that are variants of one approach, not distinct alternatives"
     ],
     policyNeedles: [
         "Explore project context",

package/dist/content/stages/design.js CHANGED Viewed

@@ -106,20 +106,17 @@ export const DESIGN = {
         "completion dashboard present with all review-section statuses",
         "artifact complete for spec handoff"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Architecture deferred to implementation phase",
         "Missing data-flow edge cases",
         "No performance budget for critical path",
         "Batching multiple design issues into one question",
         "Skipping review sections because plan seems simple",
         "Agreeing with user's architecture choice without evaluating alternatives",
-        "Hedging every recommendation with 'it depends' instead of taking a position"
-    ],
-    redFlags: [
+        "Hedging every recommendation with 'it depends' instead of taking a position",
         "No explicit architecture boundary section",
         "No failure recovery strategy",
         "No defined test/perf baseline",
-        "Review sections skipped or condensed",
         "No NOT-in-scope output section",
         "No What-already-exists output section",
         "Design decisions made without reading the actual code first"

package/dist/content/stages/plan.js CHANGED Viewed

@@ -83,14 +83,12 @@ export const PLAN = {
         "artifact ready for TDD execution",
         "acceptance mapping complete"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Horizontal decomposition without end-to-end slices",
         "Tasks without verification steps",
         "Starting execution before approval",
         "Tasks that touch multiple unrelated areas",
-        "Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions"
-    ],
-    redFlags: [
+        "Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions",
         "No dependency graph",
         "No WAIT_FOR_CONFIRM marker",
         "No explicit dependency waves",

package/dist/content/stages/review.js CHANGED Viewed

@@ -88,14 +88,12 @@ export const REVIEW = {
         "critical blockers resolved",
         "ship readiness explicitly stated"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Single generic review without layered structure",
         "No severity classification",
         "Shipping with open criticals",
         "Batching multiple findings into one report without individual resolution",
-        "Skipping Layer 2 sections because Layer 1 passed"
-    ],
-    redFlags: [
+        "Skipping Layer 2 sections because Layer 1 passed",
         "No separate Layer 1/Layer 2 outcomes",
         "No structured review-army reconciliation artifact",
         "No critical bucket",

package/dist/content/stages/schema-types.d.ts CHANGED Viewed

@@ -74,8 +74,14 @@ export interface StageSchema {
     outputs: string[];
     blockers: string[];
     exitCriteria: string[];
-    antiPatterns: string[];
-    redFlags: string[];
+    /**
+     * Consolidated "Common Rationalizations" list — things an agent is likely to
+     * talk itself into that should stop the stage. Rendered under the
+     * "Anti-Patterns & Red Flags" heading in the generated SKILL.md. Replaces
+     * the former split between `antiPatterns` and `redFlags`, which produced
+     * near-duplicate entries and forced downstream code to merge them anyway.
+     */
+    commonRationalizations: string[];
     policyNeedles: string[];
     artifactFile: string;
     next: FlowStage | "done";

package/dist/content/stages/scope.js CHANGED Viewed

@@ -96,7 +96,7 @@ export const SCOPE = {
         "completion dashboard produced",
         "scope summary produced"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Scope silently expanded during discussion",
         "No explicit out-of-scope section",
         "Premise accepted without challenge",
@@ -104,16 +104,12 @@ export const SCOPE = {
         "Hedged recommendations that avoid taking a position",
         "Batching multiple scope issues into one question",
         "Re-arguing for smaller scope after user rejects reduction",
-        "Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries"
-    ],
-    redFlags: [
+        "Using scope-reduction placeholders (`v1`, `for now`, `we can do later`) instead of explicit user-approved boundaries",
         "No selected mode in artifact",
         "Mode selected without heuristic justification",
         "No discretion section (or explicit `None`) in artifact",
         "No deferred/not-in-scope section",
         "No user approval marker",
-        "Premise challenge missing or superficial",
-        "No implementation alternatives evaluated",
         "Missing Locked Decisions section or decisions without D-XX IDs"
     ],
     policyNeedles: ["Scope mode", "In Scope", "Out of Scope", "Discretion Areas", "NOT in scope", "Premise Challenge", "Locked Decisions"],

package/dist/content/stages/ship.js CHANGED Viewed

@@ -77,14 +77,12 @@ export const SHIP = {
         "rollback and release notes complete",
         "finalization action explicitly chosen and executed"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Shipping without rollback strategy",
         "Implicit finalization decision",
         "Bypassing preflight due to urgency",
         "Selecting multiple finalization modes",
-        "Shipping with BLOCKED review verdict"
-    ],
-    redFlags: [
+        "Shipping with BLOCKED review verdict",
         "No rollback trigger/steps",
         "More than one finalization mode implied",
         "No explicit preflight result",

package/dist/content/stages/spec.js CHANGED Viewed

@@ -77,14 +77,11 @@ export const SPEC = {
         "plan-ready acceptance mapping exists",
         "testability confirmed for all criteria"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "High-level goals without measurable outcomes",
         "Implicit assumptions",
         "Proceeding to plan before approval",
-        "Using vague adjectives (fast, intuitive, robust) without thresholds"
-    ],
-    redFlags: [
-        "Criteria use vague language (fast, intuitive, robust) without thresholds",
+        "Using vague adjectives (fast, intuitive, robust) without thresholds",
         "No explicit assumptions section",
         "No approval record",
         "No testability mapping",

package/dist/content/stages/tdd.js CHANGED Viewed

@@ -88,16 +88,14 @@ export const TDD = {
         "required gates marked satisfied",
         "traceability annotated"
     ],
-    antiPatterns: [
+    commonRationalizations: [
         "Writing code before failing test",
         "Asserting implementation details instead of behavior",
         "Big-bang implementation across multiple slices",
         "Partial test runs presented as GREEN",
         "Skipping evidence capture",
         "Undocumented refactor changes",
-        "Adding features beyond what RED tests require"
-    ],
-    redFlags: [
+        "Adding features beyond what RED tests require",
         "No failing test output (RED missing)",
         "Implementation edits appear before RED evidence",
         "No full-suite GREEN evidence",

package/dist/eval/config-loader.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import type { EvalConfig, ResolvedEvalConfig } from "./types.js";
+/**
+ * Default eval config. Optimized for the z.ai OpenAI-compatible coding endpoint
+ * with GLM 5.1 per the roadmap locked decisions (D-EVAL-01..05). Any field can
+ * be overridden by `.cclaw/evals/config.yaml` and then by `CCLAW_EVAL_*` env
+ * variables (env wins last).
+ */
+export declare const DEFAULT_EVAL_CONFIG: EvalConfig;
+/**
+ * Resolve eval config in layered order: defaults -> config.yaml -> env vars.
+ * Returns a fully-populated config plus a provenance marker so `--dry-run` can
+ * surface where each setting came from.
+ */
+export declare function loadEvalConfig(projectRoot: string, env?: NodeJS.ProcessEnv): Promise<ResolvedEvalConfig>;