cclaw-cli 0.23.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +4 -4
- package/dist/constants.d.ts +4 -4
- package/dist/constants.js +4 -4
- package/dist/content/eval-scaffold.d.ts +4 -4
- package/dist/content/eval-scaffold.js +13 -14
- package/dist/content/examples.js +11 -11
- package/dist/content/hooks.js +1 -1
- package/dist/content/skills.d.ts +3 -3
- package/dist/content/skills.js +19 -19
- package/dist/content/stage-schema.js +2 -2
- package/dist/content/stages/plan.js +18 -18
- package/dist/content/stages/schema-types.d.ts +2 -2
- package/dist/content/stages/tdd.js +1 -1
- package/dist/content/subagents.js +1 -1
- package/dist/content/templates.js +8 -8
- package/dist/content/utility-skills.js +19 -19
- package/dist/doctor.js +2 -2
- package/dist/eval/baseline.js +1 -1
- package/dist/eval/corpus.d.ts +1 -1
- package/dist/eval/corpus.js +1 -1
- package/dist/eval/llm-client.d.ts +10 -10
- package/dist/eval/llm-client.js +5 -5
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +6 -6
- package/dist/eval/runner.js +6 -6
- package/dist/eval/types.d.ts +12 -12
- package/dist/eval/verifiers/structural.js +3 -3
- package/dist/install.js +3 -3
- package/dist/policy.js +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -54,12 +54,12 @@ Commands:
|
|
|
54
54
|
Flags: --name=<feature> Feature slug (default: inferred from 00-idea.md).
|
|
55
55
|
--skip-retro Bypass mandatory retro gate (requires --retro-reason).
|
|
56
56
|
--retro-reason=<t> Reason for bypassing retro gate.
|
|
57
|
-
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7
|
|
57
|
+
eval Run cclaw evals against .cclaw/evals/corpus (Phase 7: structural verifier + baselines).
|
|
58
58
|
Flags: --stage=<id> Limit to one flow stage (${FLOW_STAGES.join("|")}).
|
|
59
59
|
--tier=<A|B|C> Fidelity tier (A=single-shot, B=tools, C=workflow).
|
|
60
|
-
--schema-only Run only structural verifiers (
|
|
61
|
-
--rules Run structural + rule verifiers (
|
|
62
|
-
--judge Include LLM judging (
|
|
60
|
+
--schema-only Run only structural verifiers (default).
|
|
61
|
+
--rules Run structural + rule verifiers (not wired yet).
|
|
62
|
+
--judge Include LLM judging (not wired yet; requires API key).
|
|
63
63
|
--dry-run Validate config + corpus, print summary, do not execute.
|
|
64
64
|
--json Emit machine-readable JSON on stdout.
|
|
65
65
|
--no-write Skip writing the report to .cclaw/evals/reports/.
|
package/dist/constants.d.ts
CHANGED
|
@@ -5,10 +5,10 @@ export declare const CCLAW_VERSION = "0.1.1";
|
|
|
5
5
|
export declare const FLOW_VERSION = "1.0.0";
|
|
6
6
|
export declare const DEFAULT_HARNESSES: HarnessId[];
|
|
7
7
|
/**
|
|
8
|
-
* Evals subtree.
|
|
9
|
-
* verifiers and LLM wiring
|
|
10
|
-
* main REQUIRED_DIRS list makes it explicit that
|
|
11
|
-
* does not affect non-eval cclaw behavior.
|
|
8
|
+
* Evals subtree. Scaffolds the directory layout and a default config.yaml; the
|
|
9
|
+
* structural verifier, rule verifiers, and LLM wiring layer on incrementally.
|
|
10
|
+
* Keeping this separate from the main REQUIRED_DIRS list makes it explicit that
|
|
11
|
+
* the evals runtime is additive and does not affect non-eval cclaw behavior.
|
|
12
12
|
*/
|
|
13
13
|
export declare const EVALS_ROOT = ".cclaw/evals";
|
|
14
14
|
export declare const EVALS_CONFIG_PATH = ".cclaw/evals/config.yaml";
|
package/dist/constants.js
CHANGED
|
@@ -9,10 +9,10 @@ export const DEFAULT_HARNESSES = [
|
|
|
9
9
|
"codex"
|
|
10
10
|
];
|
|
11
11
|
/**
|
|
12
|
-
* Evals subtree.
|
|
13
|
-
* verifiers and LLM wiring
|
|
14
|
-
* main REQUIRED_DIRS list makes it explicit that
|
|
15
|
-
* does not affect non-eval cclaw behavior.
|
|
12
|
+
* Evals subtree. Scaffolds the directory layout and a default config.yaml; the
|
|
13
|
+
* structural verifier, rule verifiers, and LLM wiring layer on incrementally.
|
|
14
|
+
* Keeping this separate from the main REQUIRED_DIRS list makes it explicit that
|
|
15
|
+
* the evals runtime is additive and does not affect non-eval cclaw behavior.
|
|
16
16
|
*/
|
|
17
17
|
export const EVALS_ROOT = `${RUNTIME_ROOT}/evals`;
|
|
18
18
|
export const EVALS_CONFIG_PATH = `${EVALS_ROOT}/config.yaml`;
|
|
package/dist/content/eval-scaffold.d.ts
CHANGED
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
* scaffold is intentionally minimal: a usable default config plus short
|
|
5
5
|
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
6
|
*/
|
|
7
|
-
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and
|
|
8
|
-
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema
|
|
9
|
-
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics
|
|
10
|
-
export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm
|
|
7
|
+
export declare const EVAL_CONFIG_YAML = "# cclaw eval config\n# See docs/evals.md for the full schema and rollout plan.\n#\n# All values can be overridden at runtime with CCLAW_EVAL_* environment\n# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.\nprovider: zai\nbaseUrl: https://api.z.ai/api/coding/paas/v4\nmodel: glm-5.1\n\n# Default fidelity tier when --tier is not supplied.\n# A = single-shot API call (cheap)\n# B = SDK with tool use (realistic)\n# C = multi-stage workflow (end-to-end)\ndefaultTier: A\n\n# Per-call timeout and retry budget.\ntimeoutMs: 120000\nmaxRetries: 2\n\n# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.\n# dailyUsdCap: 5\n\n# Regression thresholds used by CI.\nregression:\n # Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).\n failIfDeltaBelow: -0.15\n # Fail when any single critical rubric drops below this absolute score.\n failIfCriticalBelow: 3.0\n";
|
|
8
|
+
export declare const EVAL_CORPUS_README = "# Eval Corpus\n\nSeed cases live in `./<stage>/<id>.yaml`, one file per case.\nSee `docs/evals.md` for the schema.\n\nMinimal shape:\n\n```yaml\nid: brainstorm-01\nstage: brainstorm\ninput_prompt: |\n One short paragraph describing the user's task.\ncontext_files: []\nexpected:\n # verifier-specific hints; optional\n```\n\nStart with 3 structural cases per stage (24 total), then expand to 5 per\nstage (40 total) once rule verifiers land. Tier B/C runs may add\n`context_files` pulled from real projects to exercise the sandbox.\n";
|
|
9
|
+
export declare const EVAL_RUBRICS_README = "# Eval Rubrics\n\nLLM-judge rubrics. Each rubric is a short list of checks scored on a\n`1\u20135` scale with a rationale:\n\n```yaml\nstage: brainstorm\nchecks:\n - id: distinctness\n prompt: \"Are the proposed directions genuinely distinct (not rephrasings)?\"\n scale: \"1-5 where 5=fully distinct approaches\"\n weight: 1.0\n```\n\nRubric authoring happens when Tier A runs start producing artifacts, so we\nscore the *right* properties rather than retrofitting generic quality checks.\nSee `docs/evals.md` for the full schema.\n";
|
|
10
|
+
export declare const EVAL_BASELINES_README = "# Eval Baselines\n\nFrozen score snapshots used by regression gates. Baselines are committed to\ngit and updated explicitly via `cclaw eval --update-baseline --confirm`.\n\nEach baseline file is a JSON document keyed by stage and case id. Do not edit\nby hand; CI will flag baseline churn.\n";
|
|
11
11
|
export declare const EVAL_REPORTS_README = "# Eval Reports\n\nGenerated reports (JSON + Markdown) land here. This directory is gitignored.\nRun `cclaw eval --dry-run` to preview configuration without producing a\nreport.\n";
|
|
package/dist/content/eval-scaffold.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* READMEs that point at `docs/evals.md` for authoring guidance.
|
|
6
6
|
*/
|
|
7
7
|
export const EVAL_CONFIG_YAML = `# cclaw eval config
|
|
8
|
-
# See docs/evals.md for the full schema and
|
|
8
|
+
# See docs/evals.md for the full schema and rollout plan.
|
|
9
9
|
#
|
|
10
10
|
# All values can be overridden at runtime with CCLAW_EVAL_* environment
|
|
11
11
|
# variables (env wins). Secrets like CCLAW_EVAL_API_KEY never live here.
|
|
@@ -14,9 +14,9 @@ baseUrl: https://api.z.ai/api/coding/paas/v4
|
|
|
14
14
|
model: glm-5.1
|
|
15
15
|
|
|
16
16
|
# Default fidelity tier when --tier is not supplied.
|
|
17
|
-
# A = single-shot API call (cheap
|
|
18
|
-
# B = SDK with tool use (realistic
|
|
19
|
-
# C = multi-stage workflow (end-to-end
|
|
17
|
+
# A = single-shot API call (cheap)
|
|
18
|
+
# B = SDK with tool use (realistic)
|
|
19
|
+
# C = multi-stage workflow (end-to-end)
|
|
20
20
|
defaultTier: A
|
|
21
21
|
|
|
22
22
|
# Per-call timeout and retry budget.
|
|
@@ -26,7 +26,7 @@ maxRetries: 2
|
|
|
26
26
|
# Optional hard-stop on estimated USD spend per day. Leave unset for no cap.
|
|
27
27
|
# dailyUsdCap: 5
|
|
28
28
|
|
|
29
|
-
# Regression thresholds used by CI
|
|
29
|
+
# Regression thresholds used by CI.
|
|
30
30
|
regression:
|
|
31
31
|
# Fail when overall score drops by more than this fraction (e.g. -0.15 = 15%).
|
|
32
32
|
failIfDeltaBelow: -0.15
|
|
@@ -36,7 +36,7 @@ regression:
|
|
|
36
36
|
export const EVAL_CORPUS_README = `# Eval Corpus
|
|
37
37
|
|
|
38
38
|
Seed cases live in \`./<stage>/<id>.yaml\`, one file per case.
|
|
39
|
-
See \`docs/evals.md\` for the schema
|
|
39
|
+
See \`docs/evals.md\` for the schema.
|
|
40
40
|
|
|
41
41
|
Minimal shape:
|
|
42
42
|
|
|
@@ -47,17 +47,17 @@ input_prompt: |
|
|
|
47
47
|
One short paragraph describing the user's task.
|
|
48
48
|
context_files: []
|
|
49
49
|
expected:
|
|
50
|
-
# verifier-specific hints; optional
|
|
50
|
+
# verifier-specific hints; optional
|
|
51
51
|
\`\`\`
|
|
52
52
|
|
|
53
|
-
|
|
54
|
-
stage (40 total).
|
|
55
|
-
projects to exercise
|
|
53
|
+
Start with 3 structural cases per stage (24 total), then expand to 5 per
|
|
54
|
+
stage (40 total) once rule verifiers land. Tier B/C runs may add
|
|
55
|
+
\`context_files\` pulled from real projects to exercise the sandbox.
|
|
56
56
|
`;
|
|
57
57
|
export const EVAL_RUBRICS_README = `# Eval Rubrics
|
|
58
58
|
|
|
59
|
-
LLM-judge rubrics
|
|
60
|
-
|
|
59
|
+
LLM-judge rubrics. Each rubric is a short list of checks scored on a
|
|
60
|
+
\`1–5\` scale with a rationale:
|
|
61
61
|
|
|
62
62
|
\`\`\`yaml
|
|
63
63
|
stage: brainstorm
|
|
@@ -75,8 +75,7 @@ See \`docs/evals.md\` for the full schema.
|
|
|
75
75
|
export const EVAL_BASELINES_README = `# Eval Baselines
|
|
76
76
|
|
|
77
77
|
Frozen score snapshots used by regression gates. Baselines are committed to
|
|
78
|
-
git and updated explicitly via \`cclaw eval --update-baseline --confirm
|
|
79
|
-
(wired in Wave 7.1).
|
|
78
|
+
git and updated explicitly via \`cclaw eval --update-baseline --confirm\`.
|
|
80
79
|
|
|
81
80
|
Each baseline file is a JSON document keyed by stage and case id. Do not edit
|
|
82
81
|
by hand; CI will flag baseline churn.
|
package/dist/content/examples.js
CHANGED
|
@@ -277,23 +277,23 @@ T-1 ──▶ T-2 ──▶ T-3
|
|
|
277
277
|
|
|
278
278
|
Parallel opportunity: T-1 is a prerequisite for both T-2 and T-3 (T-3 also needs T-2).
|
|
279
279
|
|
|
280
|
-
## Dependency
|
|
280
|
+
## Dependency Batches
|
|
281
281
|
|
|
282
|
-
####
|
|
282
|
+
#### Batch 1 (foundation)
|
|
283
283
|
- Task IDs: T-1
|
|
284
284
|
- Verification gate: schema tests pass, dedupe key fixtures validated
|
|
285
285
|
|
|
286
|
-
####
|
|
286
|
+
#### Batch 2 (core logic)
|
|
287
287
|
- Task IDs: T-2
|
|
288
|
-
- Depends on:
|
|
288
|
+
- Depends on: Batch 1 (T-1 complete)
|
|
289
289
|
- Verification gate: integration test proves publish-to-outbox path
|
|
290
290
|
|
|
291
|
-
####
|
|
291
|
+
#### Batch 3 (integration)
|
|
292
292
|
- Task IDs: T-3
|
|
293
|
-
- Depends on:
|
|
293
|
+
- Depends on: Batch 2 (T-2 complete)
|
|
294
294
|
- Verification gate: e2e tests pass for delivery, dedupe, and degraded mode
|
|
295
295
|
|
|
296
|
-
Execution rule: complete and verify each wave before starting the next wave.
|
|
296
|
+
Execution rule: complete and verify each batch before starting the next batch.
|
|
297
297
|
|
|
298
298
|
## Task List
|
|
299
299
|
|
|
@@ -313,10 +313,10 @@ Execution rule: complete and verify each wave before starting the next wave.
|
|
|
313
313
|
|
|
314
314
|
## Risk Assessment
|
|
315
315
|
|
|
316
|
-
| Task/
|
|
316
|
+
| Task/Batch | Risk | Likelihood | Impact | Mitigation |
|
|
317
317
|
| --- | --- | --- | --- | --- |
|
|
318
|
-
| T-3 (
|
|
319
|
-
|
|
|
318
|
+
| T-3 (Batch 3) | SSE reconnect logic complex | Medium | High | Spike reconnect in isolation before integrating with feed UI |
|
|
319
|
+
| Batch 2 → 3 | Publisher API contract may shift | Low | Medium | Pin contract in T-1 schema; T-2 integration test validates |
|
|
320
320
|
|
|
321
321
|
## WAIT_FOR_CONFIRM
|
|
322
322
|
- Status: pending
|
|
@@ -682,7 +682,7 @@ const STAGE_EXAMPLE_SECTION_HEADINGS = {
|
|
|
682
682
|
"Approval block"
|
|
683
683
|
],
|
|
684
684
|
plan: [
|
|
685
|
-
"Dependency graph + dependency
|
|
685
|
+
"Dependency graph + dependency batches",
|
|
686
686
|
"Task list with effort + minutes estimate per task",
|
|
687
687
|
"Acceptance mapping (every AC → task IDs)",
|
|
688
688
|
"No-Placeholder scan row + WAIT_FOR_CONFIRM marker"
|
package/dist/content/hooks.js
CHANGED
|
@@ -296,7 +296,7 @@ if [ "$SUGGESTIONS_ENABLED" = "true" ] && [ "$STAGE_MUTED" != "true" ]; then
|
|
|
296
296
|
scope) STAGE_SUGGESTION="Suggestion: lock explicit in-scope/out-of-scope boundaries and choose one scope mode." ;;
|
|
297
297
|
design) STAGE_SUGGESTION="Suggestion: map failure modes per new codepath and confirm architecture boundaries before moving forward." ;;
|
|
298
298
|
spec) STAGE_SUGGESTION="Suggestion: ensure every acceptance criterion is measurable and mapped to a concrete test." ;;
|
|
299
|
-
plan) STAGE_SUGGESTION="Suggestion: group tasks into dependency
|
|
299
|
+
plan) STAGE_SUGGESTION="Suggestion: group tasks into dependency batches and keep WAIT_FOR_CONFIRM pending until approval." ;;
|
|
300
300
|
tdd) STAGE_SUGGESTION="Suggestion: execute RED → GREEN → REFACTOR for each selected slice and capture evidence per cycle." ;;
|
|
301
301
|
review) STAGE_SUGGESTION="Suggestion: run Layer 1 before Layer 2 and reconcile findings into 07-review-army.json." ;;
|
|
302
302
|
ship) STAGE_SUGGESTION="Suggestion: verify preflight + rollback plan before selecting exactly one finalization mode." ;;
|
package/dist/content/skills.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
2
|
/**
|
|
3
|
-
* Long-form
|
|
4
|
-
* \`.cclaw/references/stages/tdd-
|
|
3
|
+
* Long-form Batch Execution walkthrough. Rendered once into
|
|
4
|
+
* \`.cclaw/references/stages/tdd-batch-walkthrough.md\` by the installer.
|
|
5
5
|
*/
|
|
6
|
-
export declare const
|
|
6
|
+
export declare const TDD_BATCH_WALKTHROUGH_MARKDOWN = "# TDD \u2014 Batch Execution Walkthrough\n\nDetailed RED / GREEN / REFACTOR transcript for a 3-task batch. Illustrative\nonly \u2014 do not copy the command names blindly, match them to your stack.\n\n## Batch 1 example tasks\n\n| Task ID | Description | AC | Verification |\n|---|---|---|---|\n| T-1 `[~3m]` | Add `User.emailNormalized` column | AC-1 | `npm test -- users/schema` |\n| T-2 `[~4m]` | Normalize on write in `UserRepo.save` | AC-1 | `npm test -- users/repo` |\n| T-3 `[~3m]` | Reject duplicates in `UserService.signup` | AC-2 | `npm test -- users/service` |\n\n## Execution transcript\n\n### T-1 \u2014 RED\n\n> Run: `npm test -- users/schema` \u2192 **FAIL** (missing column: `emailNormalized`). Captured the failure stack as RED evidence. No production code touched yet.\n\n### T-1 \u2014 GREEN\n\n> Added the column in the schema module. Re-ran `npm test -- users/schema` \u2192 **PASS**. Ran the full suite `npm test` \u2192 **PASS**. Captured both outputs as GREEN evidence.\n\n### T-1 \u2014 REFACTOR\n\n> Extracted the column definition into a shared `NormalizedEmail` type used by T-2/T-3. Re-ran `npm test` \u2192 **PASS**. Captured REFACTOR note: \"Extracted NormalizedEmail type to keep T-2/T-3 DRY; zero behavior change, all tests still green.\"\n\n### T-2 \u2014 RED / GREEN / REFACTOR\n\nWrite the repo test that expects normalised writes, watch it fail (RED), implement normalisation inside `UserRepo.save` only (GREEN), then refactor the normaliser out of the repo into a helper shared with T-3 (REFACTOR).\n\n### T-3 \u2014 RED / GREEN / REFACTOR\n\nWrite the service-level duplicate test that expects a rejection, watch it fail (RED), add the duplicate check in `UserService.signup` (GREEN), refactor the error message into a named constant (REFACTOR).\n\n## Batch gate check\n\nAfter T-3 REFACTOR, before declaring Batch 1 done:\n\n1. Run the full suite (`npm test`) one final time \u2192 **PASS** captured as batch-exit evidence.\n2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial batches.\n3. Only now mark Batch 1 complete. Batch 2 cannot start until this step.\n\n## When to stop mid-batch (do NOT push through)\n\n- A RED test fails for a reason you did not predict (e.g. an unrelated flaky test) \u2192 **pause**, diagnose, log an operational-self-improvement entry, and decide with the user before proceeding.\n- A GREEN step would require touching code outside the task's acceptance criterion \u2192 **pause**, the task is scoped wrong; adjust the plan or open a follow-up task.\n- The same RED failure reappears after a GREEN change \u2192 **escalate** per the 3-attempts rule; do not keep patching.\n";
|
|
7
7
|
export declare function stageSkillFolder(stage: FlowStage): string;
|
|
8
8
|
export declare function stageSkillMarkdown(stage: FlowStage): string;
|
package/dist/content/skills.js
CHANGED
|
@@ -103,19 +103,19 @@ Reference utility skill:
|
|
|
103
103
|
\`.cclaw/skills/verification-before-completion/SKILL.md\`
|
|
104
104
|
`;
|
|
105
105
|
}
|
|
106
|
-
function
|
|
106
|
+
function batchExecutionModeBlock(stage) {
|
|
107
107
|
const schema = stageSchema(stage);
|
|
108
|
-
if (!schema.
|
|
108
|
+
if (!schema.batchExecutionAllowed)
|
|
109
109
|
return "";
|
|
110
|
-
return `##
|
|
110
|
+
return `## Batch Execution Mode
|
|
111
111
|
|
|
112
|
-
Execute the current dependency
|
|
112
|
+
Execute the current dependency batch task-by-task (RED -> GREEN -> REFACTOR).
|
|
113
113
|
Stop on BLOCKED status or when user input is required.
|
|
114
|
-
Apply concise turn announces: one announce per
|
|
114
|
+
Apply concise turn announces: one announce per batch boundary (or when risk/plan
|
|
115
115
|
changes materially), then execute tasks without repetitive boilerplate.
|
|
116
116
|
|
|
117
117
|
Detailed walkthrough:
|
|
118
|
-
\`.cclaw/${STAGE_EXAMPLES_REFERENCE_DIR}/tdd-
|
|
118
|
+
\`.cclaw/${STAGE_EXAMPLES_REFERENCE_DIR}/tdd-batch-walkthrough.md\`
|
|
119
119
|
`;
|
|
120
120
|
}
|
|
121
121
|
function crossStageTraceBlock(stage) {
|
|
@@ -190,7 +190,7 @@ function stageSpecificSeeAlso(stage) {
|
|
|
190
190
|
],
|
|
191
191
|
tdd: [
|
|
192
192
|
`- \`${RUNTIME_ROOT}/skills/debugging/SKILL.md\``,
|
|
193
|
-
`- \`${RUNTIME_ROOT}/references/stages/tdd-
|
|
193
|
+
`- \`${RUNTIME_ROOT}/references/stages/tdd-batch-walkthrough.md\``
|
|
194
194
|
],
|
|
195
195
|
review: [
|
|
196
196
|
`- \`${RUNTIME_ROOT}/skills/security/SKILL.md\``,
|
|
@@ -239,15 +239,15 @@ function quickStartBlock(stage) {
|
|
|
239
239
|
`;
|
|
240
240
|
}
|
|
241
241
|
/**
|
|
242
|
-
* Long-form
|
|
243
|
-
* \`.cclaw/references/stages/tdd-
|
|
242
|
+
* Long-form Batch Execution walkthrough. Rendered once into
|
|
243
|
+
* \`.cclaw/references/stages/tdd-batch-walkthrough.md\` by the installer.
|
|
244
244
|
*/
|
|
245
|
-
export const
|
|
245
|
+
export const TDD_BATCH_WALKTHROUGH_MARKDOWN = `# TDD — Batch Execution Walkthrough
|
|
246
246
|
|
|
247
|
-
Detailed RED / GREEN / REFACTOR transcript for a 3-task
|
|
247
|
+
Detailed RED / GREEN / REFACTOR transcript for a 3-task batch. Illustrative
|
|
248
248
|
only — do not copy the command names blindly, match them to your stack.
|
|
249
249
|
|
|
250
|
-
##
|
|
250
|
+
## Batch 1 example tasks
|
|
251
251
|
|
|
252
252
|
| Task ID | Description | AC | Verification |
|
|
253
253
|
|---|---|---|---|
|
|
@@ -277,15 +277,15 @@ Write the repo test that expects normalised writes, watch it fail (RED), impleme
|
|
|
277
277
|
|
|
278
278
|
Write the service-level duplicate test that expects a rejection, watch it fail (RED), add the duplicate check in \`UserService.signup\` (GREEN), refactor the error message into a named constant (REFACTOR).
|
|
279
279
|
|
|
280
|
-
##
|
|
280
|
+
## Batch gate check
|
|
281
281
|
|
|
282
|
-
After T-3 REFACTOR, before declaring
|
|
282
|
+
After T-3 REFACTOR, before declaring Batch 1 done:
|
|
283
283
|
|
|
284
|
-
1. Run the full suite (\`npm test\`) one final time → **PASS** captured as
|
|
285
|
-
2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial
|
|
286
|
-
3. Only now mark
|
|
284
|
+
1. Run the full suite (\`npm test\`) one final time → **PASS** captured as batch-exit evidence.
|
|
285
|
+
2. Verify the TDD artifact contains RED, GREEN, and REFACTOR evidence for T-1, T-2, **and** T-3. No partial batches.
|
|
286
|
+
3. Only now mark Batch 1 complete. Batch 2 cannot start until this step.
|
|
287
287
|
|
|
288
|
-
## When to stop mid-
|
|
288
|
+
## When to stop mid-batch (do NOT push through)
|
|
289
289
|
|
|
290
290
|
- A RED test fails for a reason you did not predict (e.g. an unrelated flaky test) → **pause**, diagnose, log an operational-self-improvement entry, and decide with the user before proceeding.
|
|
291
291
|
- A GREEN step would require touching code outside the task's acceptance criterion → **pause**, the task is scoped wrong; adjust the plan or open a follow-up task.
|
|
@@ -362,7 +362,7 @@ ${schema.interactionProtocol.map((item, i) => `${i + 1}. ${item}`).join("\n")}
|
|
|
362
362
|
Shared decision/ask-user protocol:
|
|
363
363
|
\`${DECISION_PROTOCOL_PATH}\`
|
|
364
364
|
|
|
365
|
-
${
|
|
365
|
+
${batchExecutionModeBlock(stage)}
|
|
366
366
|
## Required Gates
|
|
367
367
|
${gateList}
|
|
368
368
|
|
|
package/dist/content/stage-schema.js
CHANGED
|
@@ -36,7 +36,7 @@ const REQUIRED_GATE_IDS = {
|
|
|
36
36
|
],
|
|
37
37
|
plan: [
|
|
38
38
|
"plan_tasks_sliced_2_5_min",
|
|
39
|
-
"
|
|
39
|
+
"plan_dependency_batches_defined",
|
|
40
40
|
"plan_acceptance_mapped",
|
|
41
41
|
"plan_wait_for_confirm"
|
|
42
42
|
],
|
|
@@ -64,7 +64,7 @@ const REQUIRED_ARTIFACT_SECTIONS = {
|
|
|
64
64
|
scope: ["Scope Mode", "In Scope / Out of Scope", "Completion Dashboard", "Scope Summary"],
|
|
65
65
|
design: ["Architecture Boundaries", "Architecture Diagram", "Failure Mode Table", "Completion Dashboard"],
|
|
66
66
|
spec: ["Acceptance Criteria", "Edge Cases", "Testability Map", "Approval"],
|
|
67
|
-
plan: ["Task List", "Dependency
|
|
67
|
+
plan: ["Task List", "Dependency Batches", "Acceptance Mapping", "WAIT_FOR_CONFIRM"],
|
|
68
68
|
tdd: ["RED Evidence", "GREEN Evidence", "REFACTOR Notes", "Traceability"],
|
|
69
69
|
review: ["Layer 1 Verdict", "Review Army Contract", "Severity Summary", "Final Verdict"],
|
|
70
70
|
ship: ["Preflight Results", "Release Notes", "Rollback Plan", "Finalization"]
|
|
package/dist/content/stages/plan.js
CHANGED
|
@@ -22,7 +22,7 @@ export const PLAN = {
|
|
|
22
22
|
checklist: [
|
|
23
23
|
"Read upstream — load spec, design, and scope artifacts. Cross-reference acceptance criteria.",
|
|
24
24
|
"Build dependency graph — identify task ordering, parallel opportunities, and blocking dependencies.",
|
|
25
|
-
"Group tasks into dependency
|
|
25
|
+
"Group tasks into dependency batches — batch N+1 cannot start until batch N has verification evidence.",
|
|
26
26
|
"Slice into vertical tasks — each task targets 2-5 minutes, produces one testable outcome, and touches one coherent area.",
|
|
27
27
|
"Attach verification — every task has an acceptance criterion mapping and a concrete verification command.",
|
|
28
28
|
"Map scope Locked Decisions — every D-XX from scope is referenced by at least one plan task (or explicitly marked deferred with reason).",
|
|
@@ -33,7 +33,7 @@ export const PLAN = {
|
|
|
33
33
|
interactionProtocol: [
|
|
34
34
|
"Plan in read-only mode relative to implementation.",
|
|
35
35
|
"Split work into small vertical slices (target 2-5 minute tasks).",
|
|
36
|
-
"Publish explicit dependency
|
|
36
|
+
"Publish explicit dependency batches with entry and exit checks for each batch.",
|
|
37
37
|
"Attach verification step to every task.",
|
|
38
38
|
"Preserve locked scope boundaries: no silent scope reduction language in task rows.",
|
|
39
39
|
"Enforce WAIT_FOR_CONFIRM: present the plan summary with options (A) Approve / (B) Revise / (C) Reject.",
|
|
@@ -41,7 +41,7 @@ export const PLAN = {
|
|
|
41
41
|
],
|
|
42
42
|
process: [
|
|
43
43
|
"Build dependency graph and ordered slices.",
|
|
44
|
-
"Group slices into execution
|
|
44
|
+
"Group slices into execution batches and define gate criteria per batch.",
|
|
45
45
|
"Define each task with acceptance mapping and verification commands.",
|
|
46
46
|
"Trace every locked decision (D-XX) to plan tasks or explicit defer rationale.",
|
|
47
47
|
"Record checkpoints and blockers.",
|
|
@@ -50,7 +50,7 @@ export const PLAN = {
|
|
|
50
50
|
requiredGates: [
|
|
51
51
|
{ id: "plan_tasks_sliced_2_5_min", description: "Tasks are small, executable slices." },
|
|
52
52
|
{ id: "plan_dependency_graph_written", description: "Dependency graph and order are explicit." },
|
|
53
|
-
{ id: "
|
|
53
|
+
{ id: "plan_dependency_batches_defined", description: "Tasks are grouped into executable batches with gate checks." },
|
|
54
54
|
{ id: "plan_verification_steps_defined", description: "Each task has verification guidance." },
|
|
55
55
|
{ id: "plan_acceptance_mapped", description: "Each task maps to a spec acceptance criterion." },
|
|
56
56
|
{ id: "plan_wait_for_confirm", description: "Execution blocked until explicit user confirmation." }
|
|
@@ -60,7 +60,7 @@ export const PLAN = {
|
|
|
60
60
|
"Task list includes acceptance mapping.",
|
|
61
61
|
"Locked decision coverage table present with D-XX trace links.",
|
|
62
62
|
"Dependency graph documented.",
|
|
63
|
-
"Dependency
|
|
63
|
+
"Dependency batches documented with batch-by-batch verification gates.",
|
|
64
64
|
"WAIT_FOR_CONFIRM status recorded."
|
|
65
65
|
],
|
|
66
66
|
inputs: ["approved spec", "codebase context", "delivery constraints"],
|
|
@@ -69,11 +69,11 @@ export const PLAN = {
|
|
|
69
69
|
"current architecture",
|
|
70
70
|
"known technical debt and dependencies"
|
|
71
71
|
],
|
|
72
|
-
outputs: ["task graph", "dependency
|
|
72
|
+
outputs: ["task graph", "dependency batch plan", "ordered plan", "explicit confirmation checkpoint"],
|
|
73
73
|
blockers: [
|
|
74
74
|
"tasks too broad",
|
|
75
75
|
"dependency uncertainty unresolved",
|
|
76
|
-
"
|
|
76
|
+
"batch boundaries are unclear",
|
|
77
77
|
"locked decisions from scope are not mapped to tasks",
|
|
78
78
|
"no explicit confirmation"
|
|
79
79
|
],
|
|
@@ -91,13 +91,13 @@ export const PLAN = {
|
|
|
91
91
|
"Using placeholder tokens or scope-reduction phrases (`v1`, `for now`, `later`) in task definitions",
|
|
92
92
|
"No dependency graph",
|
|
93
93
|
"No WAIT_FOR_CONFIRM marker",
|
|
94
|
-
"No explicit dependency
|
|
94
|
+
"No explicit dependency batches",
|
|
95
95
|
"Tasks exceed one coherent outcome",
|
|
96
96
|
"No acceptance mapping",
|
|
97
97
|
"Locked decisions are missing or not mapped",
|
|
98
98
|
"Scope-reduction language appears without explicit approved defer decision"
|
|
99
99
|
],
|
|
100
|
-
policyNeedles: ["WAIT_FOR_CONFIRM", "Task Graph", "Dependency
|
|
100
|
+
policyNeedles: ["WAIT_FOR_CONFIRM", "Task Graph", "Dependency Batches", "Acceptance Mapping", "verification steps", "Locked Decision Coverage"],
|
|
101
101
|
artifactFile: "05-plan.md",
|
|
102
102
|
next: "tdd",
|
|
103
103
|
reviewSections: [
|
|
@@ -113,13 +113,13 @@ export const PLAN = {
|
|
|
113
113
|
stopGate: true
|
|
114
114
|
},
|
|
115
115
|
{
|
|
116
|
-
title: "
|
|
116
|
+
title: "Batch Completeness Audit",
|
|
117
117
|
evaluationPoints: [
|
|
118
|
-
"Does every task belong to exactly one
|
|
119
|
-
"Does each
|
|
120
|
-
"Are
|
|
118
|
+
"Does every task belong to exactly one batch?",
|
|
119
|
+
"Does each batch have a verification gate?",
|
|
120
|
+
"Are batch dependencies explicit and acyclic?",
|
|
121
121
|
"Is the acceptance mapping complete — every spec criterion covered?",
|
|
122
|
-
"Are there hidden dependencies between tasks in different
|
|
122
|
+
"Are there hidden dependencies between tasks in different batches?"
|
|
123
123
|
],
|
|
124
124
|
stopGate: true
|
|
125
125
|
},
|
|
@@ -129,7 +129,7 @@ export const PLAN = {
|
|
|
129
129
|
"Does every task carry an explicit minutes estimate (e.g. `[~3m]`) and does every estimate fit the 2-to-5-minute budget? Estimates >5 minutes must be split.",
|
|
130
130
|
"Are all file paths, test commands, and verification commands copy-pasteable as written — no `TODO`, `TBD`, `FIXME`, `<fill-in>`, `<your-*-here>`, `xxx`, or ellipsis standing in for omitted args?",
|
|
131
131
|
"Does every acceptance-criterion reference resolve to a real R# / AC-### in the spec (not a blank link)?",
|
|
132
|
-
"If an estimate is genuinely uncertain (first-time integration, unfamiliar library), is the uncertainty named explicitly and scheduled as a spike task in
|
|
132
|
+
"If an estimate is genuinely uncertain (first-time integration, unfamiliar library), is the uncertainty named explicitly and scheduled as a spike task in batch 0, rather than hidden behind a large estimate?"
|
|
133
133
|
],
|
|
134
134
|
stopGate: true
|
|
135
135
|
}
|
|
@@ -142,12 +142,12 @@ export const PLAN = {
|
|
|
142
142
|
},
|
|
143
143
|
artifactValidation: [
|
|
144
144
|
{ section: "Dependency Graph", required: true, validationRule: "Ordering and parallel opportunities explicit. No circular dependencies." },
|
|
145
|
-
{ section: "Dependency
|
|
145
|
+
{ section: "Dependency Batches", required: true, validationRule: "Every task belongs to a batch. Each batch has an exit gate and dependency statement." },
|
|
146
146
|
{ section: "Task List", required: true, validationRule: "Each task row includes ID, description, acceptance criterion, verification command, and effort estimate (S/M/L). Every task must also carry a minutes estimate within the 2-5 minute budget." },
|
|
147
147
|
{ section: "Acceptance Mapping", required: true, validationRule: "Every spec criterion is covered by at least one task." },
|
|
148
148
|
{ section: "Locked Decision Coverage", required: false, validationRule: "Every locked decision ID (D-XX) from scope is listed with linked task IDs or explicit defer rationale." },
|
|
149
|
-
{ section: "Risk Assessment", required: false, validationRule: "If present: per-task or per-
|
|
150
|
-
{ section: "Boundary Map", required: false, validationRule: "If present: per-
|
|
149
|
+
{ section: "Risk Assessment", required: false, validationRule: "If present: per-task or per-batch risk identification with likelihood, impact, and mitigation strategy." },
|
|
150
|
+
{ section: "Boundary Map", required: false, validationRule: "If present: per-batch or per-task interface contracts listing what each task produces (exports) and consumes (imports) from other tasks." },
|
|
151
151
|
{ section: "WAIT_FOR_CONFIRM", required: true, validationRule: "Explicit marker present. Status: pending until user approves." },
|
|
152
152
|
{ section: "No-Placeholder Scan", required: false, validationRule: "Confirmation that a text scan for `TODO`, `TBD`, `FIXME`, `<fill-in>`, `<your-*-here>`, `xxx`, or bare ellipses has zero hits in the task list. A placeholder is a deferred decision masquerading as a plan." },
|
|
153
153
|
{ section: "No Scope Reduction Language Scan", required: false, validationRule: "Confirmation that scope-reduction phrases (`v1`, `for now`, `later`, `temporary`, `placeholder`) are absent from task rows when locked decisions exist." }
|
|
@@ -90,8 +90,8 @@ export interface StageSchema {
|
|
|
90
90
|
completionStatus: string[];
|
|
91
91
|
crossStageTrace: CrossStageTrace;
|
|
92
92
|
artifactValidation: ArtifactValidation[];
|
|
93
|
-
/** When true, stage skill includes
|
|
94
|
-
|
|
93
|
+
/** When true, stage skill includes batch auto-execute guidance (tdd). */
|
|
94
|
+
batchExecutionAllowed?: boolean;
|
|
95
95
|
/** Sections that remain required even when the trivial-change escape hatch is active (design only). */
|
|
96
96
|
trivialOverrideSections?: string[];
|
|
97
97
|
/** Agent names that MUST be dispatched (or waived) before stage transition — derived from mandatory auto-subagent rows. */
|
|
@@ -179,5 +179,5 @@ export const TDD = {
|
|
|
179
179
|
{ section: "Test Pyramid Shape", required: false, validationRule: "If present: per-slice count of Small/Medium/Large tests added, to let reviewers verify the suite is not drifting top-heavy." },
|
|
180
180
|
{ section: "Prove-It Reproduction", required: false, validationRule: "Required for bug-fix slices: original failing reproduction test (RED without fix), passing output with fix (GREEN), and a note confirming the test fails again if the fix is reverted." }
|
|
181
181
|
],
|
|
182
|
-
|
|
182
|
+
batchExecutionAllowed: true
|
|
183
183
|
};
|
|
@@ -129,7 +129,7 @@ If you catch yourself writing “read PLAN.md Task 3” or “implement the next
|
|
|
129
129
|
| Status | Meaning | Controller action |
|
|
130
130
|
|---|---|---|
|
|
131
131
|
| DONE | Implementation complete; tests orchestrated per prompt; no known material risks | Proceed to reviewers |
|
|
132
|
-
| DONE_WITH_CONCERNS | Shippable but with documented tradeoffs/risks | Proceed with reviewer + explicit notes; do not
|
|
132
|
+
| DONE_WITH_CONCERNS | Shippable but with documented tradeoffs/risks | Proceed with reviewer + explicit notes; do not dismiss concerns |
|
|
133
133
|
| NEEDS_CONTEXT | Missing authoritative information only the parent/user can supply | Parent gathers context, then re-dispatch implementer with augmented prompt |
|
|
134
134
|
| BLOCKED | Hard stop (permissions, tool failure, conflicting requirements, unsafe state) | Parent escalates to user; do not stack speculative guesses |
|
|
135
135
|
|
|
@@ -309,23 +309,23 @@ inputs_hash: sha256:pending
|
|
|
309
309
|
## Dependency Graph
|
|
310
310
|
-
|
|
311
311
|
|
|
312
|
-
## Dependency
|
|
312
|
+
## Dependency Batches
|
|
313
313
|
|
|
314
|
-
###
|
|
314
|
+
### Batch 1 (foundation)
|
|
315
315
|
- Task IDs:
|
|
316
316
|
- Verification gate:
|
|
317
317
|
|
|
318
|
-
###
|
|
318
|
+
### Batch 2 (dependent)
|
|
319
319
|
- Task IDs:
|
|
320
320
|
- Depends on:
|
|
321
321
|
- Verification gate:
|
|
322
322
|
|
|
323
|
-
###
|
|
323
|
+
### Batch 3 (integration)
|
|
324
324
|
- Task IDs:
|
|
325
325
|
- Depends on:
|
|
326
326
|
- Verification gate:
|
|
327
327
|
|
|
328
|
-
Execution rule: complete and verify each wave before starting the next wave.
|
|
328
|
+
Execution rule: complete and verify each batch before starting the next batch.
|
|
329
329
|
|
|
330
330
|
## Task List
|
|
331
331
|
|
|
@@ -333,7 +333,7 @@ Execution rule: complete and verify each wave before starting the next wave.
|
|
|
333
333
|
- Every task fits the **2-5 minute budget**. If \`[~Nm]\` is >5, split the task.
|
|
334
334
|
- **No placeholders.** Forbidden tokens anywhere in this table: \`TODO\`, \`TBD\`, \`FIXME\`, \`<fill-in>\`, \`<your-*-here>\`, \`xxx\`, bare ellipsis. Every file path, test, and verification command must be copy-pasteable as written.
|
|
335
335
|
- **No silent scope reduction.** Forbidden phrasing when locked decisions exist: \`v1\`, \`for now\`, \`later\`, \`temporary\`, \`placeholder\`, \`mock for now\`, \`hardcoded for now\`, \`will improve later\`.
|
|
336
|
-
- If an estimate is genuinely uncertain (new library, unfamiliar subsystem), add a **spike task in
|
|
336
|
+
- If an estimate is genuinely uncertain (new library, unfamiliar subsystem), add a **spike task in batch 0** to de-risk — do NOT hide the uncertainty inside a large estimate.
|
|
337
337
|
|
|
338
338
|
| Task ID | Description | Acceptance criterion | Verification command | Effort (S/M/L) | Minutes |
|
|
339
339
|
|---|---|---|---|---|---|
|
|
@@ -350,12 +350,12 @@ Execution rule: complete and verify each wave before starting the next wave.
|
|
|
350
350
|
| D-01 | 02-scope.md > Locked Decisions | T-1 | covered |
|
|
351
351
|
|
|
352
352
|
## Risk Assessment
|
|
353
|
-
| Task/
|
|
353
|
+
| Task/Batch | Risk | Likelihood | Impact | Mitigation |
|
|
354
354
|
|---|---|---|---|---|
|
|
355
355
|
| | | | | |
|
|
356
356
|
|
|
357
357
|
## Boundary Map
|
|
358
|
-
| Task/
|
|
358
|
+
| Task/Batch | Produces (exports) | Consumes (imports from) |
|
|
359
359
|
|---|---|---|
|
|
360
360
|
| | | |
|
|
361
361
|
|
|
@@ -482,7 +482,7 @@ description: "Execute approved plans with disciplined batching, explicit checkpo
|
|
|
482
482
|
## Quick Start
|
|
483
483
|
|
|
484
484
|
> 1. Confirm the plan and stage gates are approved before execution.
|
|
485
|
-
> 2. Execute in batches
|
|
485
|
+
> 2. Execute in batches, not as one giant untracked stream.
|
|
486
486
|
> 3. Stop at checkpoint boundaries for verification and user visibility.
|
|
487
487
|
|
|
488
488
|
## HARD-GATE
|
|
@@ -492,47 +492,47 @@ Do not start implementation execution without an approved plan artifact and expl
|
|
|
492
492
|
## Execution Protocol
|
|
493
493
|
|
|
494
494
|
1. **Load plan source of truth** from \`.cclaw/artifacts/05-plan.md\` (canonical run copy when available).
|
|
495
|
-
2. **Group tasks into
|
|
496
|
-
3. **Run one
|
|
497
|
-
4. **Checkpoint each
|
|
495
|
+
2. **Group tasks into batches** by dependency order and risk.
|
|
496
|
+
3. **Run one batch at a time** with evidence after each task (tests, build, lint, or review evidence as applicable).
|
|
497
|
+
4. **Checkpoint each batch** by updating stage artifact evidence and unresolved blockers.
|
|
498
498
|
5. **Stop immediately** on any hard blocker, failing gate, or unresolved critical finding.
|
|
499
499
|
|
|
500
|
-
##
|
|
500
|
+
## Batch Checklist
|
|
501
501
|
|
|
502
|
-
-
|
|
502
|
+
- Batch scope is explicit (task IDs + expected outputs).
|
|
503
503
|
- Verification command for each task is predetermined.
|
|
504
504
|
- Machine-only checks are delegated to subagents when supported.
|
|
505
505
|
- User approvals are requested only at required gate boundaries.
|
|
506
506
|
|
|
507
|
-
## Fresh Context Protocol (between waves)
|
|
507
|
+
## Fresh Context Protocol (between batches)
|
|
508
508
|
|
|
509
|
-
After a
|
|
510
|
-
the #1 cause of degraded execution quality. Before starting the **next
|
|
509
|
+
After a batch completes — especially after long agent turns — context drift is
|
|
510
|
+
the #1 cause of degraded execution quality. Before starting the **next batch**,
|
|
511
511
|
prefer a **fresh agent context** over continuing in a saturated session:
|
|
512
512
|
|
|
513
|
-
1. **Snapshot
|
|
514
|
-
(\`###
|
|
513
|
+
1. **Snapshot batch outcome** — append a short summary to the plan artifact
|
|
514
|
+
(\`### Batch <N> outcome\` with: tasks done, evidence files, blockers, next-batch inputs).
|
|
515
515
|
2. **Capture handoff facts** — the minimum information the next agent needs:
|
|
516
516
|
- Stage and run id (from \`.cclaw/state/flow-state.json\`)
|
|
517
517
|
- List of completed task IDs from the plan
|
|
518
518
|
- Open blockers / failing gates by name
|
|
519
|
-
- File paths the next
|
|
519
|
+
- File paths the next batch will touch (no full diffs)
|
|
520
520
|
3. **Decide: continue or rotate**
|
|
521
|
-
- **Rotate** (start a new agent session) when: prior
|
|
522
|
-
- **Continue** when: next
|
|
521
|
+
- **Rotate** (start a new agent session) when: prior batch consumed > ~50% of the context budget, the prior batch required deep investigation that the next batch does not need, or you are about to cross a stage boundary.
|
|
522
|
+
- **Continue** when: next batch is a tiny follow-up (≤ 1 task) and the prior context is directly relevant.
|
|
523
523
|
4. **Resume** in the new session via \`/cc-next\` — the session-start hook will restore flow state, checkpoint, and digest automatically.
|
|
524
524
|
|
|
525
|
-
This is the same intuition as Compound Engineering's "fresh context per iteration": every
|
|
525
|
+
This is the same intuition as Compound Engineering's "fresh context per iteration": every batch starts with a clean, intentionally-loaded context, not a degraded carry-over.
|
|
526
526
|
|
|
527
527
|
### Handoff template (paste into next session)
|
|
528
528
|
|
|
529
529
|
\`\`\`markdown
|
|
530
|
-
##
|
|
530
|
+
## Batch <N> handoff
|
|
531
531
|
- Stage: <stage>
|
|
532
532
|
- Run: <runId>
|
|
533
533
|
- Completed task IDs: <list>
|
|
534
534
|
- Blockers: <list or none>
|
|
535
|
-
- Files next
|
|
535
|
+
- Files next batch will touch: <list>
|
|
536
536
|
- Verification command(s) used: <list>
|
|
537
537
|
\`\`\`
|
|
538
538
|
|
|
@@ -542,7 +542,7 @@ This is the same intuition as Compound Engineering's "fresh context per iteratio
|
|
|
542
542
|
- Marking tasks done without command evidence.
|
|
543
543
|
- Reordering critical dependencies for speed.
|
|
544
544
|
- Continuing after a gate failure hoping later tasks fix it.
|
|
545
|
-
- Carrying a saturated context across
|
|
545
|
+
- Carrying a saturated context across batch boundaries because "it has all the history" — saturated context is a liability, not an asset.
|
|
546
546
|
`;
|
|
547
547
|
}
|
|
548
548
|
export function contextEngineeringSkill() {
|
|
@@ -1338,7 +1338,7 @@ For each lens, write either a knowledge entry **or** the explicit string
|
|
|
1338
1338
|
|
|
1339
1339
|
### 2. What slowed us down?
|
|
1340
1340
|
|
|
1341
|
-
- Repeated context loss between waves → \`[compound]\` accelerator.
|
|
1341
|
+
- Repeated context loss between batches → \`[compound]\` accelerator.
|
|
1342
1342
|
- Re-derivation of a fact already in upstream artifacts → \`[pattern]\` "re-read X first".
|
|
1343
1343
|
- Tooling friction (slow test loop, flaky CI) → \`[compound]\` follow-up.
|
|
1344
1344
|
|
package/dist/doctor.js
CHANGED
|
@@ -283,8 +283,8 @@ export async function doctorChecks(projectRoot, options = {}) {
|
|
|
283
283
|
const skillContent = await fs.readFile(skillPath, "utf8");
|
|
284
284
|
const lineCount = skillContent.split("\n").length;
|
|
285
285
|
const MIN_SKILL_LINES = 110;
|
|
286
|
-
// Soft max tightened
|
|
287
|
-
//
|
|
286
|
+
// Soft max tightened from 650 → 500 after externalising the TDD
|
|
287
|
+
// batch-execution walkthrough and collapsing the duplicate "what
|
|
288
288
|
// goes wrong" lists. Stage skills beyond 500 lines drift into unread
|
|
289
289
|
// bloat; long-form content belongs under `.cclaw/references/` instead.
|
|
290
290
|
const MAX_SKILL_LINES = 500;
|
package/dist/eval/baseline.js
CHANGED
package/dist/eval/corpus.d.ts
CHANGED
|
@@ -14,6 +14,6 @@ export declare function fixturePathFor(projectRoot: string, caseEntry: EvalCase)
|
|
|
14
14
|
/**
|
|
15
15
|
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
16
16
|
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
17
|
-
* the case but not on disk —
|
|
17
|
+
* the case but not on disk — structural fixtures ship alongside cases.
|
|
18
18
|
*/
|
|
19
19
|
export declare function readFixtureArtifact(projectRoot: string, caseEntry: EvalCase): Promise<string | undefined>;
|
package/dist/eval/corpus.js
CHANGED
|
@@ -162,7 +162,7 @@ export function fixturePathFor(projectRoot, caseEntry) {
|
|
|
162
162
|
/**
|
|
163
163
|
* Read the fixture artifact text for a case. Returns `undefined` if the case
|
|
164
164
|
* has no fixture reference. Throws a descriptive error if the path exists in
|
|
165
|
-
* the case but not on disk —
|
|
165
|
+
* the case but not on disk — structural fixtures ship alongside cases.
|
|
166
166
|
*/
|
|
167
167
|
export async function readFixtureArtifact(projectRoot, caseEntry) {
|
|
168
168
|
const fixturePath = fixturePathFor(projectRoot, caseEntry);
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* runtime dependency. The real implementation
|
|
4
|
+
* This module declares the shape of the client without pulling in the
|
|
5
|
+
* `openai` runtime dependency. The real implementation lands when
|
|
6
6
|
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users
|
|
8
|
-
*
|
|
7
|
+
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
+
* install an extra dependency or receive network egress warnings.
|
|
9
9
|
*/
|
|
10
10
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
11
|
/**
|
|
12
12
|
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
13
|
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
*
|
|
14
|
+
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
15
|
*/
|
|
16
16
|
export interface ChatMessage {
|
|
17
17
|
role: "system" | "user" | "assistant" | "tool";
|
|
@@ -26,8 +26,8 @@ export interface ChatRequest {
|
|
|
26
26
|
temperature?: number;
|
|
27
27
|
timeoutMs?: number;
|
|
28
28
|
/**
|
|
29
|
-
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
-
*
|
|
29
|
+
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
|
+
* by Tier B. Ignored by the Tier A single-shot path.
|
|
31
31
|
*/
|
|
32
32
|
tools?: unknown[];
|
|
33
33
|
toolChoice?: "auto" | "none";
|
|
@@ -52,11 +52,11 @@ export interface EvalLlmClient {
|
|
|
52
52
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
53
|
}
|
|
54
54
|
export declare class EvalLlmNotWiredError extends Error {
|
|
55
|
-
constructor(
|
|
55
|
+
constructor();
|
|
56
56
|
}
|
|
57
57
|
/**
|
|
58
|
-
* Factory stub. Throws with a clear message so accidental
|
|
59
|
-
* easy to diagnose. The
|
|
58
|
+
* Factory stub. Throws with a clear message so accidental early usage is
|
|
59
|
+
* easy to diagnose. The real implementation will replace this body with
|
|
60
60
|
* `new OpenAI({ apiKey, baseURL }) ... adapter`.
|
|
61
61
|
*/
|
|
62
62
|
export declare function createEvalClient(_config: ResolvedEvalConfig): EvalLlmClient;
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
export class EvalLlmNotWiredError extends Error {
|
|
2
|
-
constructor(
|
|
3
|
-
super(`LLM client is not wired
|
|
2
|
+
constructor() {
|
|
3
|
+
super(`LLM client is not wired yet.\n` +
|
|
4
4
|
`Run \`cclaw eval --dry-run\` or \`cclaw eval --schema-only\` for offline evals.`);
|
|
5
5
|
this.name = "EvalLlmNotWiredError";
|
|
6
6
|
}
|
|
7
7
|
}
|
|
8
8
|
/**
|
|
9
|
-
* Factory stub. Throws with a clear message so accidental
|
|
10
|
-
* easy to diagnose. The
|
|
9
|
+
* Factory stub. Throws with a clear message so accidental early usage is
|
|
10
|
+
* easy to diagnose. The real implementation will replace this body with
|
|
11
11
|
* `new OpenAI({ apiKey, baseURL }) ... adapter`.
|
|
12
12
|
*/
|
|
13
13
|
export function createEvalClient(_config) {
|
|
14
14
|
return {
|
|
15
15
|
async chat() {
|
|
16
|
-
throw new EvalLlmNotWiredError(
|
|
16
|
+
throw new EvalLlmNotWiredError();
|
|
17
17
|
}
|
|
18
18
|
};
|
|
19
19
|
}
|
package/dist/eval/report.js
CHANGED
|
@@ -62,7 +62,7 @@ export function formatMarkdownReport(report) {
|
|
|
62
62
|
if (report.cases.length === 0) {
|
|
63
63
|
lines.push(`## Cases`);
|
|
64
64
|
lines.push(``);
|
|
65
|
-
lines.push(`No cases were executed. See \`docs/evals.md\` for the
|
|
65
|
+
lines.push(`No cases were executed. See \`docs/evals.md\` for the rollout plan.`);
|
|
66
66
|
lines.push(``);
|
|
67
67
|
return `${lines.join("\n")}\n`;
|
|
68
68
|
}
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -4,11 +4,11 @@ export interface RunEvalOptions {
|
|
|
4
4
|
projectRoot: string;
|
|
5
5
|
stage?: FlowStage;
|
|
6
6
|
tier?: EvalTier;
|
|
7
|
-
/** When true, run only structural verifiers (
|
|
7
|
+
/** When true, run only structural verifiers (Step 1). */
|
|
8
8
|
schemaOnly?: boolean;
|
|
9
|
-
/** When true, run structural + rule-based verifiers.
|
|
9
|
+
/** When true, run structural + rule-based verifiers. Step 2 wires rules. */
|
|
10
10
|
rules?: boolean;
|
|
11
|
-
/** When true, also run LLM judge verifiers.
|
|
11
|
+
/** When true, also run LLM judge verifiers. Step 3 wires judging. */
|
|
12
12
|
judge?: boolean;
|
|
13
13
|
/** When true, load config + corpus and return a summary without running any verifier. */
|
|
14
14
|
dryRun?: boolean;
|
|
@@ -36,10 +36,10 @@ export interface DryRunSummary {
|
|
|
36
36
|
notes: string[];
|
|
37
37
|
}
|
|
38
38
|
/**
|
|
39
|
-
*
|
|
39
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
40
40
|
* active), runs structural verifiers against fixture-backed cases and loads
|
|
41
41
|
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
42
|
-
*
|
|
43
|
-
*
|
|
42
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
43
|
+
* skipped rather than failing.
|
|
44
44
|
*/
|
|
45
45
|
export declare function runEval(options: RunEvalOptions): Promise<DryRunSummary | EvalReport>;
|
package/dist/eval/runner.js
CHANGED
|
@@ -14,7 +14,7 @@ function groupByStage(cases) {
|
|
|
14
14
|
function skeletonVerifierResult(message, details) {
|
|
15
15
|
return {
|
|
16
16
|
kind: "structural",
|
|
17
|
-
id: "
|
|
17
|
+
id: "structural:no-expectations",
|
|
18
18
|
ok: true,
|
|
19
19
|
score: 1,
|
|
20
20
|
message,
|
|
@@ -111,11 +111,11 @@ function stagesInResults(caseResults) {
|
|
|
111
111
|
return FLOW_STAGES.filter((s) => set.has(s));
|
|
112
112
|
}
|
|
113
113
|
/**
|
|
114
|
-
*
|
|
114
|
+
* Structural runner. When `schemaOnly` is set (or no other verifier flags are
|
|
115
115
|
* active), runs structural verifiers against fixture-backed cases and loads
|
|
116
116
|
* per-stage baselines for regression comparison. Tier A/B/C agent loops
|
|
117
|
-
*
|
|
118
|
-
*
|
|
117
|
+
* arrive in later steps; until then cases without `fixture` are marked as
|
|
118
|
+
* skipped rather than failing.
|
|
119
119
|
*/
|
|
120
120
|
export async function runEval(options) {
|
|
121
121
|
const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
|
|
@@ -126,10 +126,10 @@ export async function runEval(options) {
|
|
|
126
126
|
notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
|
|
127
127
|
}
|
|
128
128
|
if (options.rules) {
|
|
129
|
-
notes.push("--rules is accepted; rule verifiers
|
|
129
|
+
notes.push("--rules is accepted; rule verifiers are not wired yet.");
|
|
130
130
|
}
|
|
131
131
|
if (options.judge) {
|
|
132
|
-
notes.push("--judge is accepted; LLM judging
|
|
132
|
+
notes.push("--judge is accepted; LLM judging is not wired yet.");
|
|
133
133
|
}
|
|
134
134
|
if (options.dryRun === true) {
|
|
135
135
|
const summary = {
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
7
|
*
|
|
8
8
|
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
-
* - The verifier / rubric / LLM stack evolves on its own release cadence (
|
|
9
|
+
* - The verifier / rubric / LLM stack evolves on its own release cadence (Steps 0-6).
|
|
10
10
|
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
11
|
*/
|
|
12
12
|
import type { FlowStage } from "../types.js";
|
|
@@ -29,8 +29,8 @@ export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "
|
|
|
29
29
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
30
|
/**
|
|
31
31
|
* Structural expectations — deterministic, LLM-free checks against a single
|
|
32
|
-
* text artifact.
|
|
33
|
-
* sibling `rules` shape,
|
|
32
|
+
* text artifact. Step 1 implements all fields below; Step 2 adds the
|
|
33
|
+
* sibling `rules` shape, Step 3 adds `judge`.
|
|
34
34
|
*/
|
|
35
35
|
export interface StructuralExpected {
|
|
36
36
|
/**
|
|
@@ -58,19 +58,19 @@ export interface StructuralExpected {
|
|
|
58
58
|
*/
|
|
59
59
|
requiredFrontmatterKeys?: string[];
|
|
60
60
|
}
|
|
61
|
-
/** Superset of per-verifier expectation shapes. Only `structural` is wired in
|
|
61
|
+
/** Superset of per-verifier expectation shapes. Only `structural` is wired in Step 1. */
|
|
62
62
|
export interface ExpectedShape {
|
|
63
63
|
structural?: StructuralExpected;
|
|
64
|
-
/** Rule-based (keyword/regex/traceability) checks —
|
|
64
|
+
/** Rule-based (keyword/regex/traceability) checks — Step 2. */
|
|
65
65
|
rules?: Record<string, unknown>;
|
|
66
|
-
/** LLM-judge rubrics —
|
|
66
|
+
/** LLM-judge rubrics — Step 3. */
|
|
67
67
|
judge?: Record<string, unknown>;
|
|
68
68
|
}
|
|
69
69
|
/**
|
|
70
70
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
71
71
|
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
72
|
-
* fixture artifact for verifier development (
|
|
73
|
-
* exists (
|
|
72
|
+
* fixture artifact for verifier development (Step 1) before the agent loop
|
|
73
|
+
* exists (Step 3+).
|
|
74
74
|
*/
|
|
75
75
|
export interface EvalCase {
|
|
76
76
|
id: string;
|
|
@@ -85,8 +85,8 @@ export interface EvalCase {
|
|
|
85
85
|
expected?: ExpectedShape;
|
|
86
86
|
/**
|
|
87
87
|
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
88
|
-
* when verifiers are exercised without a live agent loop. Primarily a
|
|
89
|
-
*
|
|
88
|
+
* when verifiers are exercised without a live agent loop. Primarily a
|
|
89
|
+
* Step 1 development aid.
|
|
90
90
|
*/
|
|
91
91
|
fixture?: string;
|
|
92
92
|
}
|
|
@@ -129,7 +129,7 @@ export interface EvalReport {
|
|
|
129
129
|
totalCostUsd: number;
|
|
130
130
|
totalDurationMs: number;
|
|
131
131
|
};
|
|
132
|
-
/** Present when comparing against a saved baseline (
|
|
132
|
+
/** Present when comparing against a saved baseline (Step 1+). */
|
|
133
133
|
baselineDelta?: BaselineDelta;
|
|
134
134
|
}
|
|
135
135
|
/**
|
|
@@ -170,7 +170,7 @@ export interface ResolvedEvalConfig extends EvalConfig {
|
|
|
170
170
|
source: "default" | "file" | "env" | "file+env";
|
|
171
171
|
}
|
|
172
172
|
/**
|
|
173
|
-
* Frozen per-stage baseline used by regression gating (
|
|
173
|
+
* Frozen per-stage baseline used by regression gating (Step 1). Baselines
|
|
174
174
|
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
175
175
|
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
176
176
|
* changed between runs.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Structural verifier
|
|
2
|
+
* Structural verifier: deterministic, zero-LLM checks against a
|
|
3
3
|
* single markdown artifact. Each structural expectation produces one
|
|
4
4
|
* `VerifierResult` so baselines diff cleanly at the check level rather than
|
|
5
5
|
* lumping everything into a single boolean.
|
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
* - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
|
|
16
16
|
* adds metadata does not accidentally drop the body below the floor.
|
|
17
17
|
* - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
|
|
18
|
-
* all individual `ok` flags. This keeps
|
|
19
|
-
* rubric scale shows up in
|
|
18
|
+
* all individual `ok` flags. This keeps the structural verifier
|
|
19
|
+
* deterministic; the 0..1 rubric scale shows up later in the LLM judge.
|
|
20
20
|
*/
|
|
21
21
|
import { parse as parseYaml } from "yaml";
|
|
22
22
|
const FRONTMATTER_OPEN = /^---\r?\n/;
|
package/dist/install.js
CHANGED
|
@@ -29,7 +29,7 @@ import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.j
|
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
31
|
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
32
|
-
import {
|
|
32
|
+
import { TDD_BATCH_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
33
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
34
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
35
35
|
import { LANGUAGE_RULE_PACK_DIR, LANGUAGE_RULE_PACK_FILES, LANGUAGE_RULE_PACK_GENERATORS, LEGACY_LANGUAGE_RULE_PACK_FOLDERS, UTILITY_SKILL_FOLDERS, UTILITY_SKILL_MAP } from "./content/utility-skills.js";
|
|
@@ -218,11 +218,11 @@ async function writeSkills(projectRoot, config) {
|
|
|
218
218
|
await writeFileSafe(runtimePath(projectRoot, ...referenceDir, `${stage}-examples.md`), referenceMarkdown);
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
|
-
// Progressive disclosure for the TDD
|
|
221
|
+
// Progressive disclosure for the TDD Batch Execution walkthrough (A.1#1).
|
|
222
222
|
// The detailed 3-task transcript lives next to stage examples so the
|
|
223
223
|
// always-rendered TDD skill stays under the line-budget and the reference
|
|
224
224
|
// is loaded on demand.
|
|
225
|
-
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-
|
|
225
|
+
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-batch-walkthrough.md"), TDD_BATCH_WALKTHROUGH_MARKDOWN);
|
|
226
226
|
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "common-guidance.md"), stageCommonGuidanceMarkdown());
|
|
227
227
|
// Utility skills (not flow stages)
|
|
228
228
|
await writeFileSafe(runtimePath(projectRoot, "skills", "learnings", "SKILL.md"), learnSkillMarkdown());
|
package/dist/policy.js
CHANGED
|
@@ -161,7 +161,7 @@ export async function policyChecks(projectRoot, options = {}) {
|
|
|
161
161
|
{ file: runtimeFile("skills/docs/SKILL.md"), needle: "## README Guidance", name: "utility_skill:docs:readme" },
|
|
162
162
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:executing_plans:hard_gate" },
|
|
163
163
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Execution Protocol", name: "utility_skill:executing_plans:protocol" },
|
|
164
|
-
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "##
|
|
164
|
+
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Batch Checklist", name: "utility_skill:executing_plans:batches" },
|
|
165
165
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:verification_before_completion:hard_gate" },
|
|
166
166
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## Protocol", name: "utility_skill:verification_before_completion:protocol" },
|
|
167
167
|
{ file: runtimeFile("skills/finishing-a-development-branch/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:finishing_branch:hard_gate" },
|