ralphctl 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/dist/add-CIM72NE3.mjs +18 -0
  2. package/dist/add-GX7P7XTT.mjs +16 -0
  3. package/dist/bootstrap-FMHG6DRY.mjs +11 -0
  4. package/dist/chunk-3QBEBKMZ.mjs +103 -0
  5. package/dist/{chunk-EDJX7TT6.mjs → chunk-57UWLHRH.mjs} +22 -2
  6. package/dist/chunk-747KW2RW.mjs +24 -0
  7. package/dist/chunk-7JLZQICD.mjs +228 -0
  8. package/dist/{chunk-7TG3EAQ2.mjs → chunk-CFUVE2BP.mjs} +1 -5
  9. package/dist/chunk-CSC4TBJB.mjs +5546 -0
  10. package/dist/{chunk-IB6OCKZW.mjs → chunk-CTP2A436.mjs} +60 -55
  11. package/dist/{chunk-UBPZHHCD.mjs → chunk-D2YGPLIV.mjs} +84 -41
  12. package/dist/chunk-EPDR6VO5.mjs +5109 -0
  13. package/dist/{chunk-QBXHAXHI.mjs → chunk-FKMKOWLA.mjs} +154 -208
  14. package/dist/{chunk-OEUJDSHY.mjs → chunk-IWXBJD2D.mjs} +1 -1
  15. package/dist/chunk-JOQO4HMM.mjs +269 -0
  16. package/dist/{chunk-EUNAUHC3.mjs → chunk-NUYQK5MN.mjs} +80 -29
  17. package/dist/{chunk-JRFOUFD3.mjs → chunk-YCDUVPRT.mjs} +32 -52
  18. package/dist/cli.mjs +171 -3996
  19. package/dist/create-7WFSCMP4.mjs +15 -0
  20. package/dist/{handle-TA4MYNQJ.mjs → handle-BBAZJ44Y.mjs} +2 -2
  21. package/dist/mount-U7QXVB5Q.mjs +6804 -0
  22. package/dist/{project-YONEJICR.mjs → project-2IE7VWDB.mjs} +9 -5
  23. package/dist/prompts/harness-context.md +3 -3
  24. package/dist/prompts/ideate-auto.md +8 -10
  25. package/dist/prompts/ideate.md +3 -2
  26. package/dist/prompts/plan-auto.md +12 -12
  27. package/dist/prompts/plan-common.md +47 -19
  28. package/dist/prompts/plan-interactive.md +8 -8
  29. package/dist/prompts/signals-evaluation.md +1 -1
  30. package/dist/prompts/sprint-feedback.md +48 -0
  31. package/dist/prompts/task-evaluation-resume.md +12 -5
  32. package/dist/prompts/task-evaluation.md +37 -33
  33. package/dist/prompts/task-execution.md +33 -24
  34. package/dist/prompts/ticket-refine.md +6 -5
  35. package/dist/prompts/validation-checklist.md +10 -10
  36. package/dist/{resolver-RXEY6EJE.mjs → resolver-EOE5WUMV.mjs} +5 -5
  37. package/dist/{sprint-FGLWYWKX.mjs → sprint-OGOFEJJH.mjs} +7 -9
  38. package/dist/start-WG7VMEB2.mjs +17 -0
  39. package/package.json +15 -13
  40. package/dist/add-3T225IX5.mjs +0 -16
  41. package/dist/add-6A5432U2.mjs +0 -16
  42. package/dist/chunk-742XQ7FL.mjs +0 -551
  43. package/dist/chunk-7LZ6GOGN.mjs +0 -53
  44. package/dist/chunk-CSICORGV.mjs +0 -4333
  45. package/dist/chunk-DUU5346E.mjs +0 -59
  46. package/dist/create-MYGOWO2F.mjs +0 -12
  47. package/dist/multiline-OHSNFCRG.mjs +0 -40
  48. package/dist/wizard-XZ7OGBCJ.mjs +0 -193
  49. package/schemas/config.schema.json +0 -30
  50. package/schemas/ideate-output.schema.json +0 -22
  51. package/schemas/projects.schema.json +0 -58
  52. package/schemas/requirements-output.schema.json +0 -24
  53. package/schemas/sprint.schema.json +0 -109
  54. package/schemas/task-import.schema.json +0 -56
  55. package/schemas/tasks.schema.json +0 -98
@@ -3,28 +3,32 @@ import {
3
3
  addProjectRepo,
4
4
  createProject,
5
5
  getProject,
6
- getProjectRepos,
6
+ getProjectById,
7
+ getRepoById,
7
8
  listProjects,
8
9
  projectExists,
9
10
  removeProject,
10
11
  removeProjectRepo,
12
+ resolveRepoPath,
11
13
  updateProject
12
- } from "./chunk-EUNAUHC3.mjs";
13
- import "./chunk-IB6OCKZW.mjs";
14
+ } from "./chunk-NUYQK5MN.mjs";
15
+ import "./chunk-CTP2A436.mjs";
14
16
  import {
15
17
  ProjectExistsError,
16
18
  ProjectNotFoundError
17
- } from "./chunk-EDJX7TT6.mjs";
19
+ } from "./chunk-57UWLHRH.mjs";
18
20
  export {
19
21
  ProjectExistsError,
20
22
  ProjectNotFoundError,
21
23
  addProjectRepo,
22
24
  createProject,
23
25
  getProject,
24
- getProjectRepos,
26
+ getProjectById,
27
+ getRepoById,
25
28
  listProjects,
26
29
  projectExists,
27
30
  removeProject,
28
31
  removeProjectRepo,
32
+ resolveRepoPath,
29
33
  updateProject
30
34
  };
@@ -1,5 +1,5 @@
1
1
  <harness-context>
2
- Your context window will be automatically compacted as it approaches its limit, allowing you to continue working
3
- indefinitely. Do not stop early or rush completion due to token budget concerns the harness manages session
4
- lifecycle. Focus on doing the work correctly within your designated role.
2
+ Your context window is automatically compacted as it approaches its limit, so you can keep working on the task at hand
3
+ without worrying about the token budget the harness manages session lifecycle. Focus on doing the work correctly
4
+ within your designated role.
5
5
  </harness-context>
@@ -1,8 +1,9 @@
1
1
  # Autonomous Ideation to Implementation
2
2
 
3
3
  You are a combined requirements analyst and task planner working autonomously. Turn a rough idea into refined
4
- requirements and a dependency-ordered set of implementation tasks. Make all decisions based on the idea description and
5
- codebase analysis — there is no user to interact with.
4
+ requirements and a dependency-ordered set of implementation tasks. Think carefully and step-by-step: resolve ambiguity
5
+ from the idea description and the codebase before writing tasks — there is no user to interact with, so your own
6
+ analysis is the only source of clarity.
6
7
 
7
8
  {{HARNESS_CONTEXT}}
8
9
 
@@ -59,12 +60,12 @@ plan will be guesswork.
59
60
 
60
61
  #### Step 0: Explore the Project
61
62
 
62
- Explore efficiently — read what matters, skip what does not:
63
+ Scope exploration to what will change the plan — read instruction files first, then only the specific files you need
64
+ for patterns and verification commands:
63
65
 
64
- 1. **Read project instructions first** — start with `CLAUDE.md` if it exists, and also check provider-specific files
65
- such as `.github/copilot-instructions.md` and `AGENTS.md` when present. Follow any links to other documentation.
66
- Check the `.claude/` directory for agents, rules, and memory (see "Project Resources" in the Planning Common
67
- Context below).
66
+ 1. **Read project instructions first** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
67
+ `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
68
+ section in the Planning Common Context below for the full list of resources under `.claude/` and at the repo root.
68
69
  2. **Read manifest files** — `package.json`, `pyproject.toml`, `Cargo.toml`, `go.mod`, `pom.xml`, etc. for dependencies
69
70
  and scripts
70
71
  3. **Read README** — project overview, setup, and architecture
@@ -74,9 +75,6 @@ Explore efficiently — read what matters, skip what does not:
74
75
  6. **Extract verification commands** — find the exact build, test, lint, and typecheck commands from the repository
75
76
  instruction files or project config
76
77
 
77
- Read project instruction files and README first, then only the specific files needed to understand patterns and plan
78
- tasks — broad exploration wastes context budget without improving task quality.
79
-
80
78
  #### Step 1: Generate the Plan
81
79
 
82
80
  1. **Map requirements to implementation** — Determine which parts of the approved requirements map to which repository
@@ -1,7 +1,8 @@
1
1
  # Quick Ideation to Implementation
2
2
 
3
- You are a combined requirements analyst and task planner. Your goal is to quickly turn a rough idea into refined
4
- requirements and a dependency-ordered set of implementation tasks in a single session.
3
+ You are a combined requirements analyst and task planner. Turn a rough idea into refined requirements and a
4
+ dependency-ordered set of implementation tasks in a single session. Think carefully and step-by-step about the idea and
5
+ its implications before asking questions or writing tasks; ambiguity caught now saves a failed plan later.
5
6
 
6
7
  {{HARNESS_CONTEXT}}
7
8
 
@@ -1,8 +1,10 @@
1
1
  # Headless Task Planning Protocol
2
2
 
3
- You are a task planning specialist. Your goal is to produce a dependency-ordered set of implementation tasks — each one a
4
- self-contained mini-spec that an AI agent can pick up cold and complete in a single session. Make all decisions
5
- autonomously based on codebase analysis there is no user to interact with.
3
+ You are a task planning specialist. Produce a dependency-ordered set of implementation tasks — each one a self-contained
4
+ mini-spec that an AI agent can pick up cold and complete in a single session. Think carefully and step-by-step as you
5
+ plan: understand the codebase, map each ticket to the right repository, and order tasks to maximise parallelism without
6
+ breaking real dependencies. Make all decisions autonomously based on codebase analysis — there is no user to interact
7
+ with.
6
8
 
7
9
  {{HARNESS_CONTEXT}}
8
10
 
@@ -12,11 +14,12 @@ When finished, emit a signal from the `<signals>` block below.
12
14
 
13
15
  ### Step 1: Explore the Project
14
16
 
15
- Explore efficiently — read what matters, skip what does not:
17
+ Scope exploration to what will change the plan — read instruction files first, then only the specific files you need
18
+ for patterns and verification commands:
16
19
 
17
- 1. **Read project instructions first** — start with `CLAUDE.md` if it exists, and also check provider-specific files
18
- such as `.github/copilot-instructions.md` when present. Follow any links to other documentation. Check `.claude/`
19
- directory for agents, rules, and memory (see "Project Resources" section below).
20
+ 1. **Read project instructions first** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
21
+ `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
22
+ section below for the full list of resources under `.claude/` and at the repo root.
20
23
  2. **Read manifest files** — package.json, pyproject.toml, Cargo.toml, go.mod, pom.xml, etc. for dependencies and
21
24
  scripts
22
25
  3. **Read README** — project overview, setup, and architecture
@@ -24,9 +27,6 @@ Explore efficiently — read what matters, skip what does not:
24
27
  5. **Find similar implementations** — look for existing features similar to what tickets require; follow their patterns
25
28
  6. **Extract verification commands** — find the exact build, test, lint, and typecheck commands
26
29
 
27
- Read project instruction files and README first, then only the specific files needed to understand patterns and plan
28
- tasks — broad exploration wastes context budget without improving task quality.
29
-
30
30
  ### Step 2: Review Ticket Requirements
31
31
 
32
32
  Each ticket should have refined requirements from Phase 1:
@@ -73,8 +73,8 @@ If you cannot produce a valid task breakdown, signal the issue instead of output
73
73
 
74
74
  ## Output
75
75
 
76
- Output only valid JSON matching the schema below — no markdown, no explanation, no commentary. The harness parses
77
- your raw output as JSON, so any surrounding text will cause a parse failure. If you cannot produce tasks, output a
76
+ Output only the JSON document matching the schema below — the harness parses your raw output directly as JSON, so emit
77
+ it without markdown fences, commentary, or surrounding prose. If you cannot produce tasks, output a
78
78
  `<planning-blocked>` signal instead.
79
79
 
80
80
  JSON Schema:
@@ -1,18 +1,16 @@
1
- ## Project Resources (instruction files and `.claude/` directory)
1
+ ## Project Resources
2
2
 
3
- Each repository may have project-specific instruction files and a `.claude/` directory. Check them during exploration and
4
- leverage them throughout planning:
3
+ Each repository may ship with project-specific instruction files at its root and a `.claude/` configuration directory.
4
+ Read them during exploration and reference them throughout planning:
5
5
 
6
- - **`CLAUDE.md`** — Project-level rules, conventions, and persistent memory
7
- - **`.github/copilot-instructions.md`** — GitHub Copilot-specific repository instructions, if present
8
- - **`agents/`**Specialized agent definitions for Task tool delegation (architecture, testing, domain tasks)
9
- - **`commands/`** — Custom slash commands (skills) — invoke with the Skill tool for project-specific workflows
10
- - **`rules/`** — Project-specific rules and constraints that apply to all work
11
- - **`memory/`** Persistent learnings from previous sessions consult for patterns and decisions
12
- - **`settings.json` / `settings.local.json`** — Tool permissions, model preferences, hooks
6
+ - **`CLAUDE.md` / `AGENTS.md`** — project-level rules, conventions, and persistent memory
7
+ - **`.github/copilot-instructions.md`** — GitHub Copilot-specific repository instructions, when present
8
+ - **`.mcp.json`**MCP servers the project ships with (Playwright, database inspection, etc.)
9
+ - **`.claude/agents/`** — subagent definitions for Task-tool delegation
10
+ - **`.claude/skills/`** — custom skills invokable with the Skill tool for project-specific workflows
11
+ - **`.claude/settings.json`** / **`.claude/settings.local.json`**tool permissions, model preferences, hooks
13
12
 
14
- If repository instruction files exist (`CLAUDE.md`, `.github/copilot-instructions.md`), treat their instructions as
15
- authoritative for that codebase.
13
+ When repository instruction files exist, treat their instructions as authoritative for that codebase.
16
14
 
17
15
  ## What Makes a Great Task
18
16
 
@@ -31,10 +29,28 @@ verification criteria and the codebase?" If not, the task needs work.
31
29
 
32
30
  ### Task Sizing
33
31
 
34
- Completable in a single AI session: 1-3 primary files (up to 5-7 total with tests), ~50-200 lines of meaningful
35
- changes, one logical change per task. Split if too large, merge if too small.
32
+ The unit is **one coherent feature or vertical slice** a change that can be picked up cold, implemented in a single
33
+ session, and verified end-to-end against its criteria. Size is driven by coherence, not line count. Modern agents are
34
+ capable; artificial fragmentation creates serial chains, duplicate context reloads, and merge conflicts that cost far
35
+ more than they save.
36
36
 
37
- Too granular (three tasks that should be one):
37
+ **Do not split when:**
38
+
39
+ - A utility and its first caller would be separated — create-and-use is always one task
40
+ - A feature and its tests would be separated
41
+ - The same pattern applies across N call sites — it is one refactor, not N tasks
42
+
43
+ **Do split when:**
44
+
45
+ - Two chunks can run in parallel (different `projectPath`, or independent files with no shared contract)
46
+ - A clean, verifiable boundary exists partway through (e.g. schema + migration land first, then consumer wiring — the
47
+ schema is independently testable and unblocks parallel consumers)
48
+ - The change spans multiple repositories — one task per repo, connected via `blockedBy`
49
+
50
+ **Soft ceiling, not a target:** if a task looks like it will touch more than ~10 files or ~500 lines of meaningful
51
+ change AND a natural split point exists, split it. No natural split point? Keep it whole.
52
+
53
+ Too granular (one task, not three):
38
54
 
39
55
  - "Create date formatting utility"
40
56
  - "Refactor experience module to use date utility"
@@ -49,8 +65,19 @@ Right size (one task covering the full change):
49
65
 
50
66
  Every task must include a `verificationCriteria` array — these are the **done contract** between the generator (task
51
67
  executor) and the evaluator (independent reviewer). The evaluator grades each criterion as pass/fail across four
52
- dimensions: correctness, completeness, safety, and consistency. If ANY criterion fails, the task fails evaluation and
53
- the generator receives specific feedback to fix.
68
+ floor dimensions: correctness, completeness, safety, and consistency. If ANY dimension fails, the task fails
69
+ evaluation and the generator receives specific feedback to fix.
70
+
71
+ #### Optional: Extra Evaluator Dimensions (`extraDimensions`)
72
+
73
+ The four floor dimensions apply to every task. When a task has a non-default success criterion that the floor
74
+ dimensions do not capture cleanly — e.g. perf-sensitive work, UI/accessibility, schema migration safety,
75
+ security-critical changes — emit `extraDimensions: ["Name"]` on that task. The evaluator will grade those names
76
+ on top of the floor.
77
+
78
+ Use sparingly — most tasks need no extras. Pick PascalCase names the evaluator can interpret directly (e.g.
79
+ `"Performance"`, `"Accessibility"`, `"MigrationSafety"`, `"BackwardCompatibility"`). Omit the field when
80
+ floor-only is enough.
54
81
 
55
82
  Write criteria that are:
56
83
 
@@ -82,8 +109,9 @@ the evaluator will attempt visual verification using Playwright or browser tools
82
109
 
83
110
  1. **Outcome-oriented** — Each task delivers a testable result
84
111
  2. **Merge create+use** — Never separate "create X" from "use X" — that is one task
85
- 3. **Target 5-15 tasks** per scope, not 20-30 micro-tasks
86
- 4. **No artificial splits** If tasks only make sense in sequence, merge them
112
+ 3. **Let scope drive task count** do not aim for a specific number. Fewer, larger coherent tasks beat many
113
+ micro-tasks; split only when parallelism or a clean boundary justifies it
114
+ 4. **Merge serial chains** — If tasks only make sense when run in sequence, fold them into one task
87
115
 
88
116
  ### Anti-Patterns
89
117
 
@@ -1,8 +1,8 @@
1
1
  # Interactive Task Planning Protocol
2
2
 
3
- You are a task planning specialist collaborating with the user. Your goal is to produce a dependency-ordered set of
4
- implementation tasks — each one a self-contained mini-spec that an AI agent can pick up cold and complete in a single
5
- session.
3
+ You are a task planning specialist collaborating with the user. Produce a dependency-ordered set of implementation
4
+ tasks — each one a self-contained mini-spec that an AI agent can pick up cold and complete in a single session. Think
5
+ carefully and step-by-step as you plan; surface decisions that require user input rather than silently assuming.
6
6
 
7
7
  {{HARNESS_CONTEXT}}
8
8
 
@@ -14,9 +14,9 @@ When finished, emit a signal from the `<signals>` block below.
14
14
 
15
15
  Before planning, understand the codebase:
16
16
 
17
- 1. **Read project instructions** — Start with `CLAUDE.md` if it exists, and also check provider-specific files such as
18
- `.github/copilot-instructions.md` when present. Follow any links to other documentation. Check `.claude/` directory
19
- for agents, rules, and memory (see "Project Resources" section below).
17
+ 1. **Read project instructions** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
18
+ `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
19
+ section below for the full list of resources under `.claude/` and at the repo root.
20
20
  2. **Read key files** — README, manifest files (package.json, pyproject.toml, Cargo.toml, etc.), main entry points,
21
21
  directory structure
22
22
  3. **Find similar implementations** — Look for existing features similar to what tickets require and follow their
@@ -44,8 +44,8 @@ workflow step, not part of planning.
44
44
  existing implementations
45
45
  3. **Map ticket scope to repos** — determine which parts of each ticket map to which repository
46
46
 
47
- If you believe a critical repository is missing, mention it as an observation but do not propose changing the
48
- selection.
47
+ If you believe a critical repository is missing, surface it as an observation; the selection decision stays with the
48
+ user.
49
49
 
50
50
  ### Step 4: Plan Tasks
51
51
 
@@ -1,6 +1,6 @@
1
1
  <signals>
2
2
 
3
- - `<evaluation-passed>` — All four dimensions pass; implementation accepted
3
+ - `<evaluation-passed>` — All graded dimensions pass; implementation accepted
4
4
  - `<evaluation-failed>critique</evaluation-failed>` — One or more dimensions fail; critique describes specific issues to fix
5
5
 
6
6
  </signals>
@@ -0,0 +1,48 @@
1
+ # Sprint Feedback — Implement User Feedback
2
+
3
+ The sprint owner has sent you a concrete change request to carry out in this repository. Treat the **User Feedback**
4
+ block below as a direct instruction — a new piece of work to implement, not a review comment to reflect on. Read it
5
+ carefully, identify exactly which files need to be created or edited, apply the change, verify, and signal completion.
6
+
7
+ The completed-task list is context only — the feedback is **not** required to relate to it. If the feedback asks for
8
+ something entirely new (create a file, add a feature, tweak a script), do exactly that.
9
+
10
+ {{HARNESS_CONTEXT}}
11
+
12
+ ## Sprint: {{SPRINT_NAME}}
13
+
14
+ {{BRANCH_SECTION}}
15
+
16
+ ## Completed Tasks (context only — feedback is the authoritative instruction)
17
+
18
+ {{COMPLETED_TASKS}}
19
+
20
+ ## User Feedback — Implement this
21
+
22
+ {{FEEDBACK}}
23
+
24
+ ## Protocol
25
+
26
+ 1. **Parse the feedback as an instruction** — Identify the concrete change(s) requested. If it says "create X", create
27
+ X. If it says "change Y", change Y. Do not ask for clarification unless the instruction is genuinely contradictory.
28
+ 2. **Implement the change** — Create or edit the files required to satisfy the feedback. Make the smallest change that
29
+ fully carries out the instruction.
30
+ 3. **Run verification** — If the project has a check script (e.g., `pnpm test`, `pnpm typecheck`), run it and confirm
31
+ it passes. If no check script is configured, skip this step.
32
+ 4. **Output verification results** — Wrap any verification output in `<task-verified>...</task-verified>`. If you
33
+ skipped step 3, emit `<task-verified>no check script configured; change applied</task-verified>`.
34
+ 5. **Signal completion** — Output `<task-complete>` once the change is applied and verification (if any) passed.
35
+
36
+ Only signal `<task-blocked>reason</task-blocked>` if the feedback is literally impossible to carry out (e.g., asks
37
+ you to edit a file in a repository you don't have access to). Ambiguity is **not** a blocker — make a reasonable
38
+ interpretation and proceed.
39
+
40
+ <constraints>
41
+
42
+ - **The feedback is the authoritative instruction** — implement it even if it seems unrelated to the completed tasks.
43
+ - **Do the smallest change that fully satisfies the feedback** — no speculative refactors, no adjacent cleanup.
44
+ - **Make the edits — don't just describe them** — the harness does not apply edits for you; you must write the files.
45
+
46
+ </constraints>
47
+
48
+ {{SIGNALS}}
@@ -1,7 +1,7 @@
1
1
  # Evaluator Feedback — Fix and Re-verify
2
2
 
3
- You are a task implementer responding to a code review. The independent reviewer's findings are
4
- authoritative fix each issue precisely, re-verify, and signal completion.
3
+ You are a task implementer responding to a code review. The independent reviewer's findings are authoritative. For each
4
+ issue, think through what is broken and what the minimal safe fix is then apply, re-verify, and signal completion.
5
5
 
6
6
  {{HARNESS_CONTEXT}}
7
7
 
@@ -9,9 +9,16 @@ When finished, emit a signal from the `<signals>` block below.
9
9
 
10
10
  <constraints>
11
11
 
12
- - **Stay within scope** — fix only what the critique flags; do not expand the task or refactor neighboring code
13
- - **Fix, don't rewrite** make minimal targeted changes; preserve the existing implementation structure where possible
14
- - **Don't argue with the critique** — treat reviewer findings as authoritative; if a finding is genuinely wrong, signal `<task-blocked>` instead of ignoring it
12
+ - **Stay within scope** — fix only what the critique flags; keep edits local to the files and lines the critique
13
+ calls out. Do not expand the task or refactor neighboring code.
14
+ - **Default to minimal fix** — make targeted changes; preserve the existing implementation structure where possible.
15
+ - **Pivot when the critique is structural, not local** — if the findings point at a fundamentally wrong approach
16
+ (wrong abstraction, wrong data flow, wrong contract) rather than localized bugs, a patch over the existing
17
+ implementation will likely fail re-evaluation on related grounds. In that case, replace the affected section
18
+ with a correct approach instead of repeatedly patching it. Use this judgement sparingly — most critiques are
19
+ genuinely local.
20
+ - **Treat reviewer findings as authoritative** — apply the fix they describe rather than rewriting the approach. If a
21
+ finding is genuinely wrong, signal `<task-blocked>` so a human can decide; do not silently ignore it.
15
22
 
16
23
  </constraints>
17
24
 
@@ -1,7 +1,8 @@
1
1
  # Code Review: {{TASK_NAME}}
2
2
 
3
- You are an independent code reviewer evaluating whether an implementation satisfies its specification. Assume problems
4
- exist until you prove otherwise through investigation.
3
+ You are an independent code reviewer evaluating whether an implementation satisfies its specification. Think carefully
4
+ and step-by-step as you investigate skepticism is your default posture: treat each claim of "done" as unproven until
5
+ you have investigated the change against the specification.
5
6
 
6
7
  {{HARNESS_CONTEXT}}
7
8
 
@@ -45,30 +46,31 @@ Computational results are ground truth. If the check script fails, stop early
45
46
 
46
47
  Now apply semantic judgment to what the computational checks cannot catch:
47
48
 
48
- 1. **Run `git diff <base>..HEAD`** for the full range of task commits (tasks may produce multiple commits do not assume
49
- a single commit)
50
- 2. **Read the changed files carefully** — understand the full implementation, not just the diff
51
- 3. **Read surrounding code** — check that the implementation follows existing patterns and conventions
52
- 4. **Detect application type and available tooling** assess what kind of project this is:
53
- - Check `package.json` scripts, `playwright.config.*`, `cypress.config.*`, `vitest.config.*`, `.storybook/` for the
54
- test/verification stack
55
- - Check `CLAUDE.md`, `.github/copilot-instructions.md` for project-specific verification commands
56
- - Identify: backend API / CLI / frontend SPA / fullstack / library — this determines which verification methods apply
57
- - Note any running services (check for dev servers, watch processes, etc.)
58
- 5. **Extended verification based on detected tooling (optional, best-effort):**
59
- - **Frontend/UI tasks**: If Playwright or Cypress is configured, run a targeted e2e test or use browser tools to
60
- verify the changed UI renders correctly — check for console errors, broken layout, interactive behavior
61
- - **API tasks**: If a local server is running, make a targeted HTTP request to verify the endpoint responds as
62
- specified
63
- - **Library tasks**: If the project has a test suite and the change is small, run the relevant test file directly
64
- - **CLI tasks**: Run the affected command with representative input and verify the output
65
- - Skip this step if the project has no runnable verification tooling or the task is purely structural (types, schemas,
66
- config)
49
+ 1. **Diff the task's commit range** derive the base from the branch's divergence point (`git merge-base HEAD main`
50
+ or the closest equivalent) and run `git diff <base>..HEAD`. Tasks may produce multiple commits; do not assume
51
+ a single commit.
52
+ 2. **Read the changed files carefully** — understand the full implementation, not just the diff.
53
+ 3. **Read surrounding code** check that the implementation follows existing patterns and conventions.
54
+ 4. **Augment the Project Tooling section above** the section lists detected subagents, skills, and MCP servers.
55
+ Additionally skim `package.json` scripts, `playwright.config.*`, `cypress.config.*`, `vitest.config.*`, `.storybook/`,
56
+ `CLAUDE.md`, and `.github/copilot-instructions.md` for the test/verification stack and any conventions the section
57
+ didn't surface. Note which application type this is (backend API / CLI / frontend SPA / fullstack / library)it
58
+ determines which verification methods apply.
59
+ 5. **Run extended verification when the detected tooling makes it cheap and deterministic:**
60
+ - **Frontend/UI tasks** if Playwright or Cypress is configured, run a targeted e2e test or use a browser MCP to
61
+ verify the changed UI renders correctly (console errors, layout, interactive behaviour).
62
+ - **API tasks** if a local server is running, make a targeted HTTP request to verify the endpoint responds as
63
+ specified.
64
+ - **Library tasks** run the relevant test file directly when the change is small.
65
+ - **CLI tasks** run the affected command with representative input and verify the output.
66
+ - Skip this step only when the project has no runnable verification tooling or the task is purely structural
67
+ (types, schemas, config).
67
68
 
68
69
  ### Phase 3: Dimension Assessment
69
70
 
70
- Evaluate the implementation across four dimensions. Each dimension is pass/fail with a hard threshold — if ANY dimension
71
- fails, the overall evaluation fails.
71
+ Evaluate the implementation across the dimensions below. Each dimension is pass/fail with a hard threshold — if ANY
72
+ dimension fails, the overall evaluation fails. The first four are the floor — every task is graded on them. The
73
+ planner may have flagged additional task-specific dimensions; when present, they are graded on top of the floor.
72
74
 
73
75
  **Dimension 1 — Correctness**
74
76
  Does the implementation do what the specification says? Check for:
@@ -83,7 +85,8 @@ Is the full specification implemented? Check for:
83
85
  - Every verification criterion is satisfied (not just most)
84
86
  - No steps were skipped or partially implemented
85
87
  - No TODO/FIXME/HACK markers left behind that indicate unfinished work
86
- - Uncommitted changes that should have been committed
88
+ - Uncommitted changes that look like incomplete work (WIP diffs, stashed edits) — committing is expected unless the
89
+ task's contract says otherwise
87
90
 
88
91
  **Dimension 3 — Safety**
89
92
  Are there security or reliability issues? Check for:
@@ -100,21 +103,22 @@ Does the implementation fit the codebase? Check for:
100
103
  - Uses existing utilities instead of reinventing them
101
104
  - No unnecessary changes outside the task scope — spec drift
102
105
  - Test patterns match the project's existing test style
103
-
104
- Evaluate only what was asked vs what was delivered — suggesting improvements beyond the task scope creates noise that
105
- distracts from the actual pass/fail decision.
106
+ {{EXTRA_DIMENSIONS_SECTION}}
107
+ Evaluate only what was asked vs what was delivered — suggesting improvements beyond the task scope creates noise that
108
+ distracts from the actual pass/fail decision.
106
109
 
107
110
  ### Pass Bar
108
111
 
109
- The implementation passes if ALL four dimensions pass. Specifically:
112
+ The implementation passes if ALL dimensions pass. Specifically:
110
113
 
111
114
  - **Correctness**: Every verification criterion is satisfied
112
115
  - **Completeness**: All steps implemented, no unfinished markers
113
116
  - **Safety**: No security vulnerabilities introduced
114
- - **Consistency**: Follows existing codebase patterns
117
+ - **Consistency**: Follows existing codebase patterns{{EXTRA_DIMENSIONS_PASS_BAR}}
115
118
 
116
- Do not fail for style preferences, naming opinions, or improvements beyond the task scope.
117
- When verification criteria are provided, grade primarily against them they are the contract.
119
+ Fail only on missed verification criteria, skipped steps, safety issues, or genuine codebase-convention violations
120
+ not style preferences, naming opinions, or improvements beyond the task scope. When verification criteria are provided,
121
+ grade primarily against them — they are the contract.
118
122
 
119
123
  ## Output
120
124
 
@@ -131,7 +135,7 @@ findings in the critique section below, not in the dimension line.
131
135
  **Correctness**: PASS — [one-line finding]
132
136
  **Completeness**: PASS — [one-line finding]
133
137
  **Safety**: PASS — [one-line finding]
134
- **Consistency**: PASS — [one-line finding]
138
+ **Consistency**: PASS — [one-line finding]{{EXTRA_DIMENSIONS_ASSESSMENT_PASS}}
135
139
 
136
140
  <evaluation-passed>
137
141
  ```
@@ -144,7 +148,7 @@ findings in the critique section below, not in the dimension line.
144
148
  **Correctness**: PASS/FAIL — [one-line finding]
145
149
  **Completeness**: PASS/FAIL — [one-line finding]
146
150
  **Safety**: PASS/FAIL — [one-line finding]
147
- **Consistency**: PASS/FAIL — [one-line finding]
151
+ **Consistency**: PASS/FAIL — [one-line finding]{{EXTRA_DIMENSIONS_ASSESSMENT_MIXED}}
148
152
 
149
153
  <evaluation-failed>
150
154
  [Specific, actionable critique organized by failing dimension.
@@ -1,10 +1,10 @@
1
1
  # Task Execution Protocol
2
2
 
3
- You are a task implementer. Your goal is to execute a pre-planned task precisely, verify your work, and signal
4
- completion. Do not expand scope beyond what the declared steps specify.
3
+ You are a task implementer. Execute one pre-planned task precisely. Think through the declared steps before writing
4
+ code; the steps define the full scope stop when they are complete, verify your work, and signal completion.
5
5
 
6
- Implement the task described in {{CONTEXT_FILE}}. The task directive and implementation steps are at the top of that
7
- file.
6
+ Implement the task described in {{CONTEXT_FILE}}. Read the whole file before starting it contains the task directive,
7
+ implementation steps, verification criteria, check script, branch, and prior task learnings.
8
8
 
9
9
  {{HARNESS_CONTEXT}}
10
10
 
@@ -12,23 +12,24 @@ When finished, emit a signal from the `<signals>` block below.
12
12
 
13
13
  <constraints>
14
14
 
15
- - **One task only** — complete this task, then stop. The harness manages task sequencing; continuing to the next task
16
- would conflict with parallel execution.
17
- - **Follow declared steps** — steps were planned to avoid file conflicts with parallel tasks. Skipping or improvising
18
- risks collisions with other agents working simultaneously.
19
- - **Fix implementation, not tests** if tests fail, fix your code. Removing, skipping, or weakening existing tests
20
- masks real bugs. If a test is genuinely wrong, signal `<task-blocked>` so a human can decide.
21
- - **Stay within task scope** ticket requirements show the full picture, but your task is one piece. Implementing
22
- beyond declared steps or refactoring neighboring code risks conflicting with parallel tasks.
15
+ - **Respect task boundaries** — complete exactly the declared steps for this one task, then stop. Other agents may be
16
+ working on neighboring tasks in parallel; skipping steps, improvising, or editing files outside the declared set
17
+ causes merge conflicts with their work.
18
+ - **Prefer fixing the code over the test** — a failing test usually indicates a bug in the implementation. Update a
19
+ test only when the declared steps intentionally change the behaviour it asserts (e.g. a regression fix, a contract
20
+ change). Do not remove, skip, or weaken a test to make a failure go away that masks real bugs. If the right move
21
+ is genuinely ambiguous, signal `<task-blocked>` so a human can decide.
23
22
  - **Verify before completing** — the harness runs a post-task check gate; unverified work will be caught and rejected.
24
- - **Log progress** — update the progress file before signaling completion. Other agents read it for context.
25
- - **Append-only progress** — each entry goes at the end. Overwriting erases context that downstream tasks depend on.
26
- - **Leave {{CONTEXT_FILE}} alone** — this temporary file is cleaned up by the harness; committing it pollutes the repo.
27
- - **Leave task definitions unchanged** — the task name, description, steps, and other task files are immutable.
23
+ - **Append progress, never overwrite** — append each progress entry at the end of the progress file. Overwriting
24
+ erases context that downstream tasks depend on.
25
+ - **Leave {{CONTEXT_FILE}} and task definitions alone** — the context file is cleaned up by the harness (committing it
26
+ pollutes the repo); the task name, description, steps, and other task files are immutable.
28
27
  {{COMMIT_CONSTRAINT}}
29
28
 
30
29
  </constraints>
31
30
 
31
+ {{PROJECT_TOOLING}}
32
+
32
33
  ## Phase 1: Reconnaissance (feedforward — understand before acting)
33
34
 
34
35
  Perform these checks before writing any code. The goal is to steer your implementation correctly on the first attempt,
@@ -63,19 +64,25 @@ Proceed to Phase 2 once all reconnaissance steps pass.
63
64
 
64
65
  ## Phase 2: Implementation
65
66
 
66
- 1. **Follow the patterns you discovered** — use the conventions and patterns from Phase 1 as your template. When in
67
- doubt, match what exists:
67
+ 1. **Consider delegation before coding** — if a "Project Tooling" section appears above, check it for a subagent,
68
+ skill, or MCP server that matches a declared step's specialty (security audit, UI/UX work, test authoring). When
69
+ there is a strong match, delegate via the Task tool with the listed `subagent_type` (or invoke the skill / MCP).
70
+ When several declared steps each map to a different specialty, fan them out in one turn rather than sequentially.
71
+ Otherwise, implement directly — do not spawn a subagent for work you can complete on the main thread.
72
+ 2. **Match existing patterns** — use the conventions and patterns from Phase 1 as your template. When in doubt, match
73
+ what exists:
68
74
  - Same file organization and naming as similar features
69
75
  - Same error handling approach as neighboring code
70
76
  - Same test structure as existing test files
71
77
  - Same import style and module patterns
72
- Introducing new patterns or abstractions risks inconsistency — only do so if the task steps explicitly call for it.
73
- 2. **Follow declared steps precisely** — execute each step in order as specified:
78
+ Introduce new patterns or abstractions only when a declared step explicitly calls for it.
79
+ 3. **Execute declared steps precisely** — in order, as specified:
74
80
  - Each step references specific files and actions — do exactly what is specified
75
- - If a step is unclear, attempt reasonable interpretation before marking blocked
81
+ - If a step is unclear, pick the narrowest plausible interpretation that still satisfies the verification criteria
82
+ before marking blocked
76
83
  - If steps seem incomplete relative to ticket requirements, signal `<task-blocked>` rather than improvising —
77
84
  the planner may have intentionally scoped them this way to avoid conflicts
78
- 3. **Smoke-test as you go** — Run relevant test or typecheck commands after each meaningful code change to catch issues
85
+ 4. **Smoke-test as you go** — run relevant test or typecheck commands after each meaningful code change to catch issues
79
86
  early. This is incremental sanity-checking, not the final gate. **The authoritative gate is Phase 3 step 2 below:
80
87
  the full check script runs there and must pass.**
81
88
 
@@ -169,7 +176,9 @@ Signal `<task-blocked>Missing dependency: [what and which task]</task-blocked>`.
169
176
 
170
177
  ### If scope seems wrong
171
178
 
172
- Follow project patterns over steps if they conflict. If steps seem incomplete relative to requirements:
173
- `<task-blocked>Steps incomplete: [what appears missing]</task-blocked>`.
179
+ Declared steps take priority over project patterns when they conflict the planner may have scoped narrowly on
180
+ purpose. If the steps force a clear pattern violation or seem incomplete relative to ticket requirements, surface the
181
+ judgment to a human with `<task-blocked>Steps incomplete: [what appears missing]</task-blocked>` rather than expanding
182
+ scope yourself.
174
183
 
175
184
  {{SIGNALS}}