cclaw-cli 0.22.0 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts CHANGED
@@ -24,6 +24,8 @@ interface ParsedArgs {
24
24
  evalJudge?: boolean;
25
25
  evalJson?: boolean;
26
26
  evalNoWrite?: boolean;
27
+ evalUpdateBaseline?: boolean;
28
+ evalConfirm?: boolean;
27
29
  showHelp?: boolean;
28
30
  showVersion?: boolean;
29
31
  }
package/dist/cli.js CHANGED
@@ -14,6 +14,7 @@ import { createDefaultConfig, createProfileConfig } from "./config.js";
14
14
  import { detectHarnesses } from "./init-detect.js";
15
15
  import { HARNESS_ADAPTERS } from "./harness-adapters.js";
16
16
  import { runEval } from "./eval/runner.js";
17
+ import { writeBaselinesFromReport } from "./eval/baseline.js";
17
18
  import { writeJsonReport, writeMarkdownReport } from "./eval/report.js";
18
19
  import { EVAL_TIERS } from "./eval/types.js";
19
20
  import { FLOW_STAGES } from "./types.js";
@@ -53,15 +54,17 @@ Commands:
53
54
  Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
54
55
  --skip-retro Bypass mandatory retro gate (requires --retro-reason).
55
56
  --retro-reason=<t> Reason for bypassing retro gate.
56
- eval Run cclaw evals against .cclaw/evals/corpus (Phase 7, Wave 7.0 foundations).
57
- Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
58
- --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
59
- --schema-only Run only structural verifiers (Wave 7.1).
60
- --rules Run structural + rule verifiers (Wave 7.2).
61
- --judge Include LLM judging (Wave 7.3; requires API key).
62
- --dry-run Validate config + corpus, print summary, do not execute.
63
- --json Emit machine-readable JSON on stdout.
64
- --no-write Skip writing the report to .cclaw/evals/reports/.
57
+ eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
58
+ Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
59
+ --tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
60
+ --schema-only Run only structural verifiers (default).
61
+ --rules Run structural + rule verifiers (not wired yet).
62
+ --judge Include LLM judging (not wired yet; requires API key).
63
+ --dry-run Validate config + corpus, print summary, do not execute.
64
+ --json Emit machine-readable JSON on stdout.
65
+ --no-write Skip writing the report to .cclaw/evals/reports/.
66
+ --update-baseline Overwrite baselines from the current run (requires --confirm).
67
+ --confirm Acknowledge --update-baseline (prevents accidental resets).
65
68
  upgrade Refresh generated files in .cclaw without modifying user artifacts.
66
69
  uninstall Remove .cclaw runtime and the generated harness shim files.
67
70
 
@@ -453,6 +456,14 @@ function parseArgs(argv) {
453
456
  parsed.evalNoWrite = true;
454
457
  continue;
455
458
  }
459
+ if (flag === "--update-baseline") {
460
+ parsed.evalUpdateBaseline = true;
461
+ continue;
462
+ }
463
+ if (flag === "--confirm") {
464
+ parsed.evalConfirm = true;
465
+ continue;
466
+ }
456
467
  }
457
468
  // `--json` is shared between doctor and eval. Disambiguate by command.
458
469
  if (parsed.command === "eval" && parsed.doctorJson === true) {
@@ -592,22 +603,42 @@ async function runCommand(parsed, ctx) {
592
603
  }
593
604
  return 0;
594
605
  }
606
+ if (parsed.evalUpdateBaseline === true && parsed.evalConfirm !== true) {
607
+ error(ctx, "--update-baseline requires --confirm to prevent accidental baseline resets.");
608
+ return 1;
609
+ }
610
+ if (parsed.evalUpdateBaseline === true) {
611
+ if (result.summary.failed > 0) {
612
+ error(ctx, `Refusing to update baselines: ${result.summary.failed} case(s) currently failing. Fix structural checks first.`);
613
+ return 1;
614
+ }
615
+ const written = await writeBaselinesFromReport(ctx.cwd, result);
616
+ for (const file of written) {
617
+ info(ctx, `Baseline written: ${path.relative(ctx.cwd, file)}`);
618
+ }
619
+ }
595
620
  if (parsed.evalNoWrite !== true) {
596
621
  const jsonPath = await writeJsonReport(ctx.cwd, result);
597
622
  const mdPath = await writeMarkdownReport(ctx.cwd, result);
598
623
  info(ctx, `Report written: ${path.relative(ctx.cwd, jsonPath)}`);
599
624
  info(ctx, `Report written: ${path.relative(ctx.cwd, mdPath)}`);
600
625
  }
626
+ const regressionCount = result.baselineDelta?.criticalFailures ?? 0;
601
627
  if (parsed.evalJson === true) {
602
628
  ctx.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
603
629
  }
604
630
  else {
631
+ const regressionNote = regressionCount > 0 ? `, ${regressionCount} regression(s)` : "";
605
632
  ctx.stdout.write(`cclaw eval: ${result.summary.totalCases} case(s), ` +
606
633
  `${result.summary.passed} passed, ` +
607
634
  `${result.summary.failed} failed, ` +
608
- `${result.summary.skipped} skipped (Wave 7.0 skeleton — verifiers land in Wave 7.1+)\n`);
635
+ `${result.summary.skipped} skipped${regressionNote}\n`);
609
636
  }
610
- return result.summary.failed > 0 ? 1 : 0;
637
+ if (result.summary.failed > 0)
638
+ return 1;
639
+ if (regressionCount > 0)
640
+ return 1;
641
+ return 0;
611
642
  }
612
643
  if (command === "archive") {
613
644
  const archived = await archiveRun(ctx.cwd, parsed.archiveName, {
@@ -5,10 +5,10 @@ export declare const CCLAW_VERSION = "0.1.1";
5
5
  export declare const FLOW_VERSION = "1.0.0";
6
6
  export declare const DEFAULT_HARNESSES: HarnessId[];
7
7
  /**
8
- * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
9
- * verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
10
- * main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
11
- * does not affect non-eval cclaw behavior.
8
+ * Evals subtree. Scaffolds the directory layout and a default config.yaml; the
9
+ * structural verifier, rule verifiers, and LLM wiring layer on incrementally.
10
+ * Keeping this separate from the main REQUIRED_DIRS list makes it explicit that
11
+ * the evals runtime is additive and does not affect non-eval cclaw behavior.
12
12
  */
13
13
  export declare const EVALS_ROOT = ".cclaw/evals";
14
14
  export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
package/dist/constants.js CHANGED
@@ -9,10 +9,10 @@ export const DEFAULT_HARNESSES = [
9
9
  "codex"
10
10
  ];
11
11
  /**
12
- * Evals subtree. Wave 7.0 scaffolds the directory layout and a default config.yaml;
13
- * verifiers and LLM wiring arrive in Waves 7.1–7.5. Keeping this separate from the
14
- * main REQUIRED_DIRS list makes it explicit that the evals runtime is additive and
15
- * does not affect non-eval cclaw behavior.
12
+ * Evals subtree. Scaffolds the directory layout and a default config.yaml; the
13
+ * structural verifier, rule verifiers, and LLM wiring layer on incrementally.
14
+ * Keeping this separate from the main REQUIRED_DIRS list makes it explicit that
15
+ * the evals runtime is additive and does not affect non-eval cclaw behavior.
16
16
  */
17
17
  export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
18
18
  export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
@@ -4,8 +4,8 @@
4
4
  * scaffold is intentionally minimal: a usable default config plus short
5
5
  * READMEs that point at `docs/evals.md` for authoring guidance.
6
6
  */
7
- export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and Wave 7.1\u20137.6 rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap, Wave 7.3)\n# B = SDK with tool use (realistic, Wave 7.4)\n# C = multi-stage workflow (end-to-end, Wave 7.5)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI (Wave 7.3+).\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
- export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema; authoring begins in Wave 7.1.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional in Wave 7.0\n```\n\nWave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per\nstage (40 total). Wave 7.4/7.5 may add `context_files` pulled from real\nprojects to exercise Tier B/C sandboxes.\n";
9
- export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks\nscored on a `1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
10
- export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`\n(wired in Wave 7.1).\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
7
+ export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap)\n# B = SDK with tool use (realistic)\n# C = multi-stage workflow (end-to-end)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
8
+ export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Tier B/C runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
9
+ export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
10
+ export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`.\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
11
11
  export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
@@ -5,7 +5,7 @@
5
5
  * READMEs that point at `docs/evals.md` for authoring guidance.
6
6
  */
7
7
  export const EVAL_CONFIG_YAML = `# cclaw eval config
8
- # See docs/evals.md for the full schema and Wave 7.1–7.6 rollout plan.
8
+ # See docs/evals.md for the full schema and rollout plan.
9
9
  #
10
10
  # All values can be overridden at runtime with CCLAW_EVAL_* environment
11
11
  # variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
@@ -14,9 +14,9 @@ baseUrl: https://api.z.ai/api/coding/paas/v4
14
14
  model: glm-5.1
15
15
 
16
16
  # Default fidelity tier when --tier is not supplied.
17
- # A = single-shot API call (cheap, Wave 7.3)
18
- # B = SDK with tool use (realistic, Wave 7.4)
19
- # C = multi-stage workflow (end-to-end, Wave 7.5)
17
+ # A = single-shot API call (cheap)
18
+ # B = SDK with tool use (realistic)
19
+ # C = multi-stage workflow (end-to-end)
20
20
  defaultTier: A
21
21
 
22
22
  # Per-call timeout and retry budget.
@@ -26,7 +26,7 @@ maxRetries: 2
26
26
  # Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
27
27
  # dailyUsdCap: 5
28
28
 
29
- # Regression thresholds used by CI (Wave 7.3+).
29
+ # Regression thresholds used by CI.
30
30
  regression:
31
31
  # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
32
32
  failIfDeltaBelow: -0.15
@@ -36,7 +36,7 @@ regression:
36
36
  export const EVAL_CORPUS_README = `# Eval Corpus
37
37
 
38
38
  Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
39
- See \`docs/evals.md\` for the schema; authoring begins in Wave 7.1.
39
+ See \`docs/evals.md\` for the schema.
40
40
 
41
41
  Minimal shape:
42
42
 
@@ -47,17 +47,17 @@ input_prompt: |
47
47
  One short paragraph describing the user's task.
48
48
  context_files: []
49
49
  expected:
50
- # verifier-specific hints; optional in Wave 7.0
50
+ # verifier-specific hints; optional
51
51
  \`\`\`
52
52
 
53
- Wave 7.1 will add 3 cases per stage (24 total). Wave 7.2 will expand to 5 per
54
- stage (40 total). Wave 7.4/7.5 may add \`context_files\` pulled from real
55
- projects to exercise Tier B/C sandboxes.
53
+ Start with 3 structural cases per stage (24 total), then expand to 5 per
54
+ stage (40 total) once rule verifiers land. Tier B/C runs may add
55
+ \`context_files\` pulled from real projects to exercise the sandbox.
56
56
  `;
57
57
  export const EVAL_RUBRICS_README = `# Eval Rubrics
58
58
 
59
- LLM-judge rubrics land in Wave 7.3. Each rubric is a short list of checks
60
- scored on a \`1–5\` scale with a rationale:
59
+ LLM-judge rubrics. Each rubric is a short list of checks scored on a
60
+ \`1–5\` scale with a rationale:
61
61
 
62
62
  \`\`\`yaml
63
63
  stage: brainstorm
@@ -75,8 +75,7 @@ See \`docs/evals.md\` for the full schema.
75
75
  export const EVAL_BASELINES_README = `# Eval Baselines
76
76
 
77
77
  Frozen score snapshots used by regression gates. Baselines are committed to
78
- git and updated explicitly via \`cclaw eval --update-baseline --confirm\`
79
- (wired in Wave 7.1).
78
+ git and updated explicitly via \`cclaw eval --update-baseline --confirm\`.
80
79
 
81
80
  Each baseline file is a JSON document keyed by stage and case id. Do not edit
82
81
  by hand; CI will flag baseline churn.
@@ -277,23 +277,23 @@ T-1 ──▶ T-2 ──▶ T-3
277
277
 
278
278
  Parallel opportunity: T-1 is a prerequisite for both T-2 and T-3 (T-3 also needs T-2).
279
279
 
280
- ## Dependency Waves
280
+ ## Dependency Batches
281
281
 
282
- #### Wave 1 (foundation)
282
+ #### Batch 1 (foundation)
283
283
  - Task IDs: T-1
284
284
  - Verification gate: schema tests pass, dedupe key fixtures validated
285
285
 
286
- #### Wave 2 (core logic)
286
+ #### Batch 2 (core logic)
287
287
  - Task IDs: T-2
288
- - Depends on: Wave 1 (T-1 complete)
288
+ - Depends on: Batch 1 (T-1 complete)
289
289
  - Verification gate: integration test proves publish-to-outbox path
290
290
 
291
- #### Wave 3 (integration)
291
+ #### Batch 3 (integration)
292
292
  - Task IDs: T-3
293
- - Depends on: Wave 2 (T-2 complete)
293
+ - Depends on: Batch 2 (T-2 complete)
294
294
  - Verification gate: e2e tests pass for delivery, dedupe, and degraded mode
295
295
 
296
- Execution rule: complete and verify each wave before starting the next wave.
296
+ Execution rule: complete and verify each batch before starting the next batch.
297
297
 
298
298
  ## Task List
299
299
 
@@ -313,10 +313,10 @@ Execution rule: complete and verify each wave before starting the next wave.
313
313
 
314
314
  ## Risk Assessment
315
315
 
316
- | Task/Wave | Risk | Likelihood | Impact | Mitigation |
316
+ | Task/Batch | Risk | Likelihood | Impact | Mitigation |
317
317
  | --- | --- | --- | --- | --- |
318
- | T-3 (Wave 3) | SSE reconnect logic complex | Medium | High | Spike reconnect in isolation before integrating with feed UI |
319
- | Wave 2 → 3 | Publisher API contract may shift | Low | Medium | Pin contract in T-1 schema; T-2 integration test validates |
318
+ | T-3 (Batch 3) | SSE reconnect logic complex | Medium | High | Spike reconnect in isolation before integrating with feed UI |
319
+ | Batch 2 → 3 | Publisher API contract may shift | Low | Medium | Pin contract in T-1 schema; T-2 integration test validates |
320
320
 
321
321
  ## WAIT_FOR_CONFIRM
322
322
  - Status: pending
@@ -682,7 +682,7 @@ const STAGE_EXAMPLE_SECTION_HEADINGS = {
682
682
  "Approval block"
683
683
  ],
684
684
  plan: [
685
- "Dependency graph + dependency waves",
685
+ "Dependency graph + dependency batches",
686
686
  "Task list with effort + minutes estimate per task",
687
687
  "Acceptance mapping (every AC → task IDs)",
688
688
  "No-Placeholder scan row + WAIT_FOR_CONFIRM marker"
@@ -296,7 +296,7 @@ if [ "$SUGGESTIONS_ENABLED" = "true" ] && [ "$STAGE_MUTED" != "true" ]; then
296
296
  scope) STAGE_SUGGESTION="Suggestion: lock explicit in-scope/out-of-scope boundaries and choose one scope mode." ;;
297
297
  design) STAGE_SUGGESTION="Suggestion: map failure modes per new codepath and confirm architecture boundaries before moving forward." ;;
298
298
  spec) STAGE_SUGGESTION="Suggestion: ensure every acceptance criterion is measurable and mapped to a concrete test." ;;
299
- plan) STAGE_SUGGESTION="Suggestion: group tasks into dependency waves and keep WAIT_FOR_CONFIRM pending until approval." ;;
299
+ plan) STAGE_SUGGESTION="Suggestion: group tasks into dependency batches and keep WAIT_FOR_CONFIRM pending until approval." ;;
300
300
  tdd) STAGE_SUGGESTION="Suggestion: execute RED → GREEN → REFACTOR for each selected slice and capture evidence per cycle." ;;
301
301
  review) STAGE_SUGGESTION="Suggestion: run Layer 1 before Layer 2 and reconcile findings into 07-review-army.json." ;;
302
302
  ship) STAGE_SUGGESTION="Suggestion: verify preflight + rollback plan before selecting exactly one finalization mode." ;;
@@ -1,8 +1,8 @@
1
1
  import type { FlowStage } from "../types.js";
2
2
  /**
3
- * Long-form Wave Execution walkthrough. Rendered once into
4
- * \`.cclaw/references/stages/tdd-wave-walkthrough.md\` by the installer.
3
+ * Long-form Batch Execution walkthrough. Rendered once into
4
+ * \`.cclaw/references/stages/tdd-batch-walkthrough.md\` by the installer.
5
5
  */
6
- export declare const TDD_WAVE_WALKTHROUGH_MARKDOWN = "# TDD \u2014 Wave Execution Walkthrough\n\nDetailed RED / GREEN / REFACTOR transcript for a 3-task wave. Illustrative\nonly \u2014 do not copy the command names blindly, match them to your stack.\n\n## Wave 1 example tasks\n\n| Task ID | Description | AC | Verification |\n|---|---|---|---|\n| T-1 `[~3m]` | Add `User.emailNormalized` column | AC-1 | `npm test -- users/schema` |\n| T-2 `[~4m]` | Normalize on write in `UserRepo.save` | AC-1 | `npm test -- users/repo` |\n| T-3 `[~3m]` | Reject duplicates in `UserService.signup` | AC-2 | `npm test -- users/service` |\n\n## Execution transcript\n\n### T-1 \u2014 RED\n\n> Run: `npm test -- users/schema` \u2192 **FAIL** (missing column: `emailNormalized`). Captured the failure stack as RED evidence. No production code touched yet.\n\n### T-1 \u2014 GREEN\n\n> Added the column in the schema module. Re-ran `npm test -- users/schema` \u2192 **PASS**. Ran the full suite `npm test` \u2192 **PASS**. Captured both outputs as GREEN evidence.\n\n### T-1 \u2014 REFACTOR\n\n> Extracted the column definition into a shared `NormalizedEmail` type used by T-2/T-3. Re-ran `npm test` \u2192 **PASS**. Captured REFACTOR note: \"Extracted NormalizedEmail type to keep T-2/T-3 DRY; zero behavior change, all tests still green.\"\n\n### T-2 \u2014 RED / GREEN / REFACTOR\n\nWrite the repo test that expects normalised writes, watch it fail (RED), implement normalisation inside `UserRepo.save` only (GREEN), then refactor the normaliser out of the repo into a helper shared with T-3 (REFACTOR).\n\n### T-3 \u2014 RED / GREEN / REFACTOR\n\nWrite the service-level duplicate test that expects a rejection, watch it fail (RED), add the duplicate check in `UserService.signup` (GREEN), refactor the error message into a named constant (REFACTOR).\n\n## Wave gate check\n\nAfter T-3 REFACTOR, before declaring Wave 1 done:\n\n1. 
Run the full suite (`npm test`) one final time \u2192 **PASS** captured as wave-exit evidence.\n2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial waves.\n3. Only now mark Wave 1 complete. Wave 2 cannot start until this step.\n\n## When to stop mid-wave (do NOT push through)\n\n- A RED test fails for a reason you did not predict (e.g. an unrelated flaky test) \u2192 **pause**, diagnose, log an operational-self-improvement entry, and decide with the user before proceeding.\n- A GREEN step would require touching code outside the task's acceptance criterion \u2192 **pause**, the task is scoped wrong; adjust the plan or open a follow-up task.\n- The same RED failure reappears after a GREEN change \u2192 **escalate** per the 3-attempts rule; do not keep patching.\n";
6
+ export declare const TDD_BATCH_WALKTHROUGH_MARKDOWN = "# TDD \u2014 Batch Execution Walkthrough\n\nDetailed RED / GREEN / REFACTOR transcript for a 3-task batch. Illustrative\nonly \u2014 do not copy the command names blindly, match them to your stack.\n\n## Batch 1 example tasks\n\n| Task ID | Description | AC | Verification |\n|---|---|---|---|\n| T-1 `[~3m]` | Add `User.emailNormalized` column | AC-1 | `npm test -- users/schema` |\n| T-2 `[~4m]` | Normalize on write in `UserRepo.save` | AC-1 | `npm test -- users/repo` |\n| T-3 `[~3m]` | Reject duplicates in `UserService.signup` | AC-2 | `npm test -- users/service` |\n\n## Execution transcript\n\n### T-1 \u2014 RED\n\n> Run: `npm test -- users/schema` \u2192 **FAIL** (missing column: `emailNormalized`). Captured the failure stack as RED evidence. No production code touched yet.\n\n### T-1 \u2014 GREEN\n\n> Added the column in the schema module. Re-ran `npm test -- users/schema` \u2192 **PASS**. Ran the full suite `npm test` \u2192 **PASS**. Captured both outputs as GREEN evidence.\n\n### T-1 \u2014 REFACTOR\n\n> Extracted the column definition into a shared `NormalizedEmail` type used by T-2/T-3. Re-ran `npm test` \u2192 **PASS**. Captured REFACTOR note: \"Extracted NormalizedEmail type to keep T-2/T-3 DRY; zero behavior change, all tests still green.\"\n\n### T-2 \u2014 RED / GREEN / REFACTOR\n\nWrite the repo test that expects normalised writes, watch it fail (RED), implement normalisation inside `UserRepo.save` only (GREEN), then refactor the normaliser out of the repo into a helper shared with T-3 (REFACTOR).\n\n### T-3 \u2014 RED / GREEN / REFACTOR\n\nWrite the service-level duplicate test that expects a rejection, watch it fail (RED), add the duplicate check in `UserService.signup` (GREEN), refactor the error message into a named constant (REFACTOR).\n\n## Batch gate check\n\nAfter T-3 REFACTOR, before declaring Batch 1 done:\n\n1. 
Run the full suite (`npm test`) one final time \u2192 **PASS** captured as batch-exit evidence.\n2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial batches.\n3. Only now mark Batch 1 complete. Batch 2 cannot start until this step.\n\n## When to stop mid-batch (do NOT push through)\n\n- A RED test fails for a reason you did not predict (e.g. an unrelated flaky test) \u2192 **pause**, diagnose, log an operational-self-improvement entry, and decide with the user before proceeding.\n- A GREEN step would require touching code outside the task's acceptance criterion \u2192 **pause**, the task is scoped wrong; adjust the plan or open a follow-up task.\n- The same RED failure reappears after a GREEN change \u2192 **escalate** per the 3-attempts rule; do not keep patching.\n";
7
7
  export declare function stageSkillFolder(stage: FlowStage): string;
8
8
  export declare function stageSkillMarkdown(stage: FlowStage): string;
@@ -103,19 +103,19 @@ Reference utility skill:
103
103
  \`.cclaw/skills/verification-before-completion/SKILL.md\`
104
104
  `;
105
105
  }
106
- function waveExecutionModeBlock(stage) {
106
+ function batchExecutionModeBlock(stage) {
107
107
  const schema = stageSchema(stage);
108
- if (!schema.waveExecutionAllowed)
108
+ if (!schema.batchExecutionAllowed)
109
109
  return "";
110
- return `## Wave Execution Mode
110
+ return `## Batch Execution Mode
111
111
 
112
- Execute the current dependency wave task-by-task (RED -> GREEN -> REFACTOR).
112
+ Execute the current dependency batch task-by-task (RED -> GREEN -> REFACTOR).
113
113
  Stop on BLOCKED status or when user input is required.
114
- Apply concise turn announces: one announce per wave boundary (or when risk/plan
114
+ Apply concise turn announces: one announce per batch boundary (or when risk/plan
115
115
  changes materially), then execute tasks without repetitive boilerplate.
116
116
 
117
117
  Detailed walkthrough:
118
- \`.cclaw/${STAGE_EXAMPLES_REFERENCE_DIR}/tdd-wave-walkthrough.md\`
118
+ \`.cclaw/${STAGE_EXAMPLES_REFERENCE_DIR}/tdd-batch-walkthrough.md\`
119
119
  `;
120
120
  }
121
121
  function crossStageTraceBlock(stage) {
@@ -190,7 +190,7 @@ function stageSpecificSeeAlso(stage) {
190
190
  ],
191
191
  tdd: [
192
192
  `- \`${RUNTIME_ROOT}/skills/debugging/SKILL.md\``,
193
- `- \`${RUNTIME_ROOT}/references/stages/tdd-wave-walkthrough.md\``
193
+ `- \`${RUNTIME_ROOT}/references/stages/tdd-batch-walkthrough.md\``
194
194
  ],
195
195
  review: [
196
196
  `- \`${RUNTIME_ROOT}/skills/security/SKILL.md\``,
@@ -239,15 +239,15 @@ function quickStartBlock(stage) {
239
239
  `;
240
240
  }
241
241
  /**
242
- * Long-form Wave Execution walkthrough. Rendered once into
243
- * \`.cclaw/references/stages/tdd-wave-walkthrough.md\` by the installer.
242
+ * Long-form Batch Execution walkthrough. Rendered once into
243
+ * \`.cclaw/references/stages/tdd-batch-walkthrough.md\` by the installer.
244
244
  */
245
- export const TDD_WAVE_WALKTHROUGH_MARKDOWN = `# TDD — Wave Execution Walkthrough
245
+ export const TDD_BATCH_WALKTHROUGH_MARKDOWN = `# TDD — Batch Execution Walkthrough
246
246
 
247
- Detailed RED / GREEN / REFACTOR transcript for a 3-task wave. Illustrative
247
+ Detailed RED / GREEN / REFACTOR transcript for a 3-task batch. Illustrative
248
248
  only — do not copy the command names blindly, match them to your stack.
249
249
 
250
- ## Wave 1 example tasks
250
+ ## Batch 1 example tasks
251
251
 
252
252
  | Task ID | Description | AC | Verification |
253
253
  |---|---|---|---|
@@ -277,15 +277,15 @@ Write the repo test that expects normalised writes, watch it fail (RED), impleme
277
277
 
278
278
  Write the service-level duplicate test that expects a rejection, watch it fail (RED), add the duplicate check in \`UserService.signup\` (GREEN), refactor the error message into a named constant (REFACTOR).
279
279
 
280
- ## Wave gate check
280
+ ## Batch gate check
281
281
 
282
- After T-3 REFACTOR, before declaring Wave 1 done:
282
+ After T-3 REFACTOR, before declaring Batch 1 done:
283
283
 
284
- 1. Run the full suite (\`npm test\`) one final time → **PASS** captured as wave-exit evidence.
285
- 2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial waves.
286
- 3. Only now mark Wave 1 complete. Wave 2 cannot start until this step.
284
+ 1. Run the full suite (\`npm test\`) one final time → **PASS** captured as batch-exit evidence.
285
+ 2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial batches.
286
+ 3. Only now mark Batch 1 complete. Batch 2 cannot start until this step.
287
287
 
288
- ## When to stop mid-wave (do NOT push through)
288
+ ## When to stop mid-batch (do NOT push through)
289
289
 
290
290
  - A RED test fails for a reason you did not predict (e.g. an unrelated flaky test) → **pause**, diagnose, log an operational-self-improvement entry, and decide with the user before proceeding.
291
291
  - A GREEN step would require touching code outside the task's acceptance criterion → **pause**, the task is scoped wrong; adjust the plan or open a follow-up task.
@@ -362,7 +362,7 @@ ${schema.interactionProtocol.map((item, i) => `${i + 1}. ${item}`).join("\n")}
362
362
  Shared decision/ask-user protocol:
363
363
  \`${DECISION_PROTOCOL_PATH}\`
364
364
 
365
- ${waveExecutionModeBlock(stage)}
365
+ ${batchExecutionModeBlock(stage)}
366
366
  ## Required Gates
367
367
  ${gateList}
368
368
 
@@ -36,7 +36,7 @@ const REQUIRED_GATE_IDS = {
36
36
  ],
37
37
  plan: [
38
38
  "plan_tasks_sliced_2_5_min",
39
- "plan_dependency_waves_defined",
39
+ "plan_dependency_batches_defined",
40
40
  "plan_acceptance_mapped",
41
41
  "plan_wait_for_confirm"
42
42
  ],
@@ -64,7 +64,7 @@ const REQUIRED_ARTIFACT_SECTIONS = {
64
64
  scope: ["Scope Mode", "In Scope / Out of Scope", "Completion Dashboard", "Scope Summary"],
65
65
  design: ["Architecture Boundaries", "Architecture Diagram", "Failure Mode Table", "Completion Dashboard"],
66
66
  spec: ["Acceptance Criteria", "Edge Cases", "Testability Map", "Approval"],
67
- plan: ["Task List", "Dependency Waves", "Acceptance Mapping", "WAIT_FOR_CONFIRM"],
67
+ plan: ["Task List", "Dependency Batches", "Acceptance Mapping", "WAIT_FOR_CONFIRM"],
68
68
  tdd: ["RED Evidence", "GREEN Evidence", "REFACTOR Notes", "Traceability"],
69
69
  review: ["Layer 1 Verdict", "Review Army Contract", "Severity Summary", "Final Verdict"],
70
70
  ship: ["Preflight Results", "Release Notes", "Rollback Plan", "Finalization"]
@@ -22,7 +22,7 @@ export const PLAN = {
22
22
  checklist: [
23
23
  "Read upstream — load spec, design, and scope artifacts. Cross-reference acceptance criteria.",
24
24
  "Build dependency graph — identify task ordering, parallel opportunities, and blocking dependencies.",
25
- "Group tasks into dependency waves — wave N+1 cannot start until wave N has verification evidence.",
25
+ "Group tasks into dependency batches — batch N+1 cannot start until batch N has verification evidence.",
26
26
  "Slice into vertical tasks — each task targets 2-5 minutes, produces one testable outcome, and touches one coherent area.",
27
27
  "Attach verification — every task has an acceptance criterion mapping and a concrete verification command.",
28
28
  "Map scope Locked Decisions — every D-XX from scope is referenced by at least one plan task (or explicitly marked deferred with reason).",
@@ -33,7 +33,7 @@ export const PLAN = {
33
33
  interactionProtocol: [
34
34
  "Plan in read-only mode relative to implementation.",
35
35
  "Split work into small vertical slices (target 2-5 minute tasks).",
36
- "Publish explicit dependency waves with entry and exit checks for each wave.",
36
+ "Publish explicit dependency batches with entry and exit checks for each batch.",
37
37
  "Attach verification step to every task.",
38
38
  "Preserve locked scope boundaries: no silent scope reduction language in task rows.",
39
39
  "Enforce WAIT_FOR_CONFIRM: present the plan summary with options (A) Approve / (B) Revise / (C) Reject.",
@@ -41,7 +41,7 @@ export const PLAN = {
41
41
  ],
42
42
  process: [
43
43
  "Build dependency graph and ordered slices.",
44
- "Group slices into execution waves and define gate criteria per wave.",
44
+ "Group slices into execution batches and define gate criteria per batch.",
45
45
  "Define each task with acceptance mapping and verification commands.",
46
46
  "Trace every locked decision (D-XX) to plan tasks or explicit defer rationale.",
47
47
  "Record checkpoints and blockers.",
@@ -50,7 +50,7 @@ export const PLAN = {
50
50
  requiredGates: [
51
51
  { id: "plan_tasks_sliced_2_5_min", description: "Tasks are small, executable slices." },
52
52
  { id: "plan_dependency_graph_written", description: "Dependency graph and order are explicit." },
53
- { id: "plan_dependency_waves_defined", description: "Tasks are grouped into executable waves with gate checks." },
53
+ { id: "plan_dependency_batches_defined", description: "Tasks are grouped into executable batches with gate checks." },
54
54
  { id: "plan_verification_steps_defined", description: "Each task has verification guidance." },
55
55
  { id: "plan_acceptance_mapped", description: "Each task maps to a spec acceptance criterion." },
56
56
  { id: "plan_wait_for_confirm", description: "Execution blocked until explicit user confirmation." }
@@ -60,7 +60,7 @@ export const PLAN = {
60
60
  "Task list includes acceptance mapping.",
61
61
  "Locked decision coverage table present with D-XX trace links.",
62
62
  "Dependency graph documented.",
63
- "Dependency waves documented with wave-by-wave verification gates.",
63
+ "Dependency batches documented with batch-by-batch verification gates.",
64
64
  "WAIT_FOR_CONFIRM status recorded."
65
65
  ],
66
66
  inputs: ["approved spec", "codebase context", "delivery constraints"],
@@ -69,11 +69,11 @@ export const PLAN = {
69
69
  "current architecture",
70
70
  "known technical debt and dependencies"
71
71
  ],
72
- outputs: ["task graph", "dependency wave plan", "ordered plan", "explicit confirmation checkpoint"],
72
+ outputs: ["task graph", "dependency batch plan", "ordered plan", "explicit confirmation checkpoint"],
73
73
  blockers: [
74
74
  "tasks too broad",
75
75
  "dependency uncertainty unresolved",
76
- "wave boundaries are unclear",
76
+ "batch boundaries are unclear",
77
77
  "locked decisions from scope are not mapped to tasks",
78
78
  "no explicit confirmation"
79
79
  ],
@@ -91,13 +91,13 @@ export const PLAN = {
91
91
  "Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions",
92
92
  "No dependency graph",
93
93
  "No WAIT_FOR_CONFIRM marker",
94
- "No explicit dependency waves",
94
+ "No explicit dependency batches",
95
95
  "Tasks exceed one coherent outcome",
96
96
  "No acceptance mapping",
97
97
  "Locked decisions are missing or not mapped",
98
98
  "Scope-reduction language appears without explicit approved defer decision"
99
99
  ],
100
- policyNeedles: ["WAIT_FOR_CONFIRM", "Task Graph", "Dependency Waves", "Acceptance Mapping", "verification steps", "Locked Decision Coverage"],
100
+ policyNeedles: ["WAIT_FOR_CONFIRM", "Task Graph", "Dependency Batches", "Acceptance Mapping", "verification steps", "Locked Decision Coverage"],
101
101
  artifactFile: "05-plan.md",
102
102
  next: "tdd",
103
103
  reviewSections: [
@@ -113,13 +113,13 @@ export const PLAN = {
113
113
  stopGate: true
114
114
  },
115
115
  {
116
- title: "Wave Completeness Audit",
116
+ title: "Batch Completeness Audit",
117
117
  evaluationPoints: [
118
- "Does every task belong to exactly one wave?",
119
- "Does each wave have a verification gate?",
120
- "Are wave dependencies explicit and acyclic?",
118
+ "Does every task belong to exactly one batch?",
119
+ "Does each batch have a verification gate?",
120
+ "Are batch dependencies explicit and acyclic?",
121
121
  "Is the acceptance mapping complete — every spec criterion covered?",
122
- "Are there hidden dependencies between tasks in different waves?"
122
+ "Are there hidden dependencies between tasks in different batches?"
123
123
  ],
124
124
  stopGate: true
125
125
  },
@@ -129,7 +129,7 @@ export const PLAN = {
129
129
  "Does every task carry an explicit minutes estimate (e.g. `[~3m]`) and does every estimate fit the 2-to-5-minute budget? Estimates >5 minutes must be split.",
130
130
  "Are all file paths, test commands, and verification commands copy-pasteable as written — no `TODO`, `TBD`, `FIXME`, `<fill-in>`, `<your-*-here>`, `xxx`, or ellipsis standing in for omitted args?",
131
131
  "Does every acceptance-criterion reference resolve to a real R# / AC-### in the spec (not a blank link)?",
132
- "If an estimate is genuinely uncertain (first-time integration, unfamiliar library), is the uncertainty named explicitly and scheduled as a spike task in wave 0, rather than hidden behind a large estimate?"
132
+ "If an estimate is genuinely uncertain (first-time integration, unfamiliar library), is the uncertainty named explicitly and scheduled as a spike task in batch 0, rather than hidden behind a large estimate?"
133
133
  ],
134
134
  stopGate: true
135
135
  }
@@ -142,12 +142,12 @@ export const PLAN = {
142
142
  },
143
143
  artifactValidation: [
144
144
  { section: "Dependency Graph", required: true, validationRule: "Ordering and parallel opportunities explicit. No circular dependencies." },
145
- { section: "Dependency Waves", required: true, validationRule: "Every task belongs to a wave. Each wave has an exit gate and dependency statement." },
145
+ { section: "Dependency Batches", required: true, validationRule: "Every task belongs to a batch. Each batch has an exit gate and dependency statement." },
146
146
  { section: "Task List", required: true, validationRule: "Each task row includes ID, description, acceptance criterion, verification command, and effort estimate (S/M/L). Every task must also carry a minutes estimate within the 2-5 minute budget." },
147
147
  { section: "Acceptance Mapping", required: true, validationRule: "Every spec criterion is covered by at least one task." },
148
148
  { section: "Locked Decision Coverage", required: false, validationRule: "Every locked decision ID (D-XX) from scope is listed with linked task IDs or explicit defer rationale." },
149
- { section: "Risk Assessment", required: false, validationRule: "If present: per-task or per-wave risk identification with likelihood, impact, and mitigation strategy." },
150
- { section: "Boundary Map", required: false, validationRule: "If present: per-wave or per-task interface contracts listing what each task produces (exports) and consumes (imports) from other tasks." },
149
+ { section: "Risk Assessment", required: false, validationRule: "If present: per-task or per-batch risk identification with likelihood, impact, and mitigation strategy." },
150
+ { section: "Boundary Map", required: false, validationRule: "If present: per-batch or per-task interface contracts listing what each task produces (exports) and consumes (imports) from other tasks." },
151
151
  { section: "WAIT_FOR_CONFIRM", required: true, validationRule: "Explicit marker present. Status: pending until user approves." },
152
152
  { section: "No-Placeholder Scan", required: false, validationRule: "Confirmation that a text scan for `TODO`, `TBD`, `FIXME`, `<fill-in>`, `<your-*-here>`, `xxx`, or bare ellipses has zero hits in the task list. A placeholder is a deferred decision masquerading as a plan." },
153
153
  { section: "No Scope Reduction Language Scan", required: false, validationRule: "Confirmation that scope-reduction phrases (`v1`, `for now`, `later`, `temporary`, `placeholder`) are absent from task rows when locked decisions exist." }
@@ -90,8 +90,8 @@ export interface StageSchema {
90
90
  completionStatus: string[];
91
91
  crossStageTrace: CrossStageTrace;
92
92
  artifactValidation: ArtifactValidation[];
93
- /** When true, stage skill includes wave auto-execute guidance (tdd). */
94
- waveExecutionAllowed?: boolean;
93
+ /** When true, stage skill includes batch auto-execute guidance (tdd). */
94
+ batchExecutionAllowed?: boolean;
95
95
  /** Sections that remain required even when the trivial-change escape hatch is active (design only). */
96
96
  trivialOverrideSections?: string[];
97
97
  /** Agent names that MUST be dispatched (or waived) before stage transition — derived from mandatory auto-subagent rows. */
@@ -179,5 +179,5 @@ export const TDD = {
179
179
  { section: "Test Pyramid Shape", required: false, validationRule: "If present: per-slice count of Small/Medium/Large tests added, to let reviewers verify the suite is not drifting top-heavy." },
180
180
  { section: "Prove-It Reproduction", required: false, validationRule: "Required for bug-fix slices: original failing reproduction test (RED without fix), passing output with fix (GREEN), and a note confirming the test fails again if the fix is reverted." }
181
181
  ],
182
- waveExecutionAllowed: true
182
+ batchExecutionAllowed: true
183
183
  };
@@ -129,7 +129,7 @@ If you catch yourself writing “read PLAN.md Task 3” or “implement the next
129
129
  | Status | Meaning | Controller action |
130
130
  |---|---|---|
131
131
  | DONE | Implementation complete; tests orchestrated per prompt; no known material risks | Proceed to reviewers |
132
- | DONE_WITH_CONCERNS | Shippable but with documented tradeoffs/risks | Proceed with reviewer + explicit notes; do not “hand-wave” concerns |
132
+ | DONE_WITH_CONCERNS | Shippable but with documented tradeoffs/risks | Proceed with reviewer + explicit notes; do not dismiss concerns |
133
133
  | NEEDS_CONTEXT | Missing authoritative information only the parent/user can supply | Parent gathers context, then re-dispatch implementer with augmented prompt |
134
134
  | BLOCKED | Hard stop (permissions, tool failure, conflicting requirements, unsafe state) | Parent escalates to user; do not stack speculative guesses |
135
135