npm - ralphctl - Versions diffs - 0.2.5 → 0.3.1 - Mend

ralphctl 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

package/dist/add-CIM72NE3.mjs +18 -0
package/dist/add-GX7P7XTT.mjs +16 -0
package/dist/bootstrap-FMHG6DRY.mjs +11 -0
package/dist/chunk-3QBEBKMZ.mjs +103 -0
package/dist/{chunk-EDJX7TT6.mjs → chunk-57UWLHRH.mjs} +22 -2
package/dist/chunk-747KW2RW.mjs +24 -0
package/dist/chunk-7JLZQICD.mjs +228 -0
package/dist/{chunk-7TG3EAQ2.mjs → chunk-CFUVE2BP.mjs} +1 -5
package/dist/chunk-CSC4TBJB.mjs +5546 -0
package/dist/{chunk-IB6OCKZW.mjs → chunk-CTP2A436.mjs} +60 -55
package/dist/{chunk-UBPZHHCD.mjs → chunk-D2YGPLIV.mjs} +84 -41
package/dist/chunk-EPDR6VO5.mjs +5109 -0
package/dist/{chunk-QBXHAXHI.mjs → chunk-FKMKOWLA.mjs} +154 -208
package/dist/{chunk-OEUJDSHY.mjs → chunk-IWXBJD2D.mjs} +1 -1
package/dist/chunk-JOQO4HMM.mjs +269 -0
package/dist/{chunk-EUNAUHC3.mjs → chunk-NUYQK5MN.mjs} +80 -29
package/dist/{chunk-JRFOUFD3.mjs → chunk-YCDUVPRT.mjs} +32 -52
package/dist/cli.mjs +171 -3996
package/dist/create-7WFSCMP4.mjs +15 -0
package/dist/{handle-TA4MYNQJ.mjs → handle-BBAZJ44Y.mjs} +2 -2
package/dist/mount-U7QXVB5Q.mjs +6804 -0
package/dist/{project-YONEJICR.mjs → project-2IE7VWDB.mjs} +9 -5
package/dist/prompts/harness-context.md +3 -3
package/dist/prompts/ideate-auto.md +8 -10
package/dist/prompts/ideate.md +3 -2
package/dist/prompts/plan-auto.md +12 -12
package/dist/prompts/plan-common.md +47 -19
package/dist/prompts/plan-interactive.md +8 -8
package/dist/prompts/signals-evaluation.md +1 -1
package/dist/prompts/sprint-feedback.md +48 -0
package/dist/prompts/task-evaluation-resume.md +12 -5
package/dist/prompts/task-evaluation.md +37 -33
package/dist/prompts/task-execution.md +33 -24
package/dist/prompts/ticket-refine.md +6 -5
package/dist/prompts/validation-checklist.md +10 -10
package/dist/{resolver-RXEY6EJE.mjs → resolver-EOE5WUMV.mjs} +5 -5
package/dist/{sprint-FGLWYWKX.mjs → sprint-OGOFEJJH.mjs} +7 -9
package/dist/start-WG7VMEB2.mjs +17 -0
package/package.json +15 -13
package/dist/add-3T225IX5.mjs +0 -16
package/dist/add-6A5432U2.mjs +0 -16
package/dist/chunk-742XQ7FL.mjs +0 -551
package/dist/chunk-7LZ6GOGN.mjs +0 -53
package/dist/chunk-CSICORGV.mjs +0 -4333
package/dist/chunk-DUU5346E.mjs +0 -59
package/dist/create-MYGOWO2F.mjs +0 -12
package/dist/multiline-OHSNFCRG.mjs +0 -40
package/dist/wizard-XZ7OGBCJ.mjs +0 -193
package/schemas/config.schema.json +0 -30
package/schemas/ideate-output.schema.json +0 -22
package/schemas/projects.schema.json +0 -58
package/schemas/requirements-output.schema.json +0 -24
package/schemas/sprint.schema.json +0 -109
package/schemas/task-import.schema.json +0 -56
package/schemas/tasks.schema.json +0 -98

package/dist/{project-YONEJICR.mjs → project-2IE7VWDB.mjs} RENAMED Viewed

@@ -3,28 +3,32 @@ import {
   addProjectRepo,
   createProject,
   getProject,
-  getProjectRepos,
+  getProjectById,
+  getRepoById,
   listProjects,
   projectExists,
   removeProject,
   removeProjectRepo,
+  resolveRepoPath,
   updateProject
-} from "./chunk-EUNAUHC3.mjs";
-import "./chunk-IB6OCKZW.mjs";
+} from "./chunk-NUYQK5MN.mjs";
+import "./chunk-CTP2A436.mjs";
 import {
   ProjectExistsError,
   ProjectNotFoundError
-} from "./chunk-EDJX7TT6.mjs";
+} from "./chunk-57UWLHRH.mjs";
 export {
   ProjectExistsError,
   ProjectNotFoundError,
   addProjectRepo,
   createProject,
   getProject,
-  getProjectRepos,
+  getProjectById,
+  getRepoById,
   listProjects,
   projectExists,
   removeProject,
   removeProjectRepo,
+  resolveRepoPath,
   updateProject
 };

package/dist/prompts/harness-context.md CHANGED Viewed

@@ -1,5 +1,5 @@
 <harness-context>
-Your context window will be automatically compacted as it approaches its limit, allowing you to continue working
-indefinitely. Do not stop early or rush completion due to token budget concerns — the harness manages session
-lifecycle. Focus on doing the work correctly within your designated role.
+Your context window is automatically compacted as it approaches its limit, so you can keep working on the task at hand
+without worrying about the token budget — the harness manages session lifecycle. Focus on doing the work correctly
+within your designated role.
 </harness-context>

package/dist/prompts/ideate-auto.md CHANGED Viewed

@@ -1,8 +1,9 @@
 # Autonomous Ideation to Implementation
 You are a combined requirements analyst and task planner working autonomously. Turn a rough idea into refined
-requirements and a dependency-ordered set of implementation tasks. Make all decisions based on the idea description and
-codebase analysis — there is no user to interact with.
+requirements and a dependency-ordered set of implementation tasks. Think carefully and step-by-step: resolve ambiguity
+from the idea description and the codebase before writing tasks — there is no user to interact with, so your own
+analysis is the only source of clarity.
 {{HARNESS_CONTEXT}}
@@ -59,12 +60,12 @@ plan will be guesswork.
 #### Step 0: Explore the Project
-Explore efficiently — read what matters, skip what does not:
+Scope exploration to what will change the plan — read instruction files first, then only the specific files you need
+for patterns and verification commands:
-1. **Read project instructions first** — start with `CLAUDE.md` if it exists, and also check provider-specific files
-   such as `.github/copilot-instructions.md` and `AGENTS.md` when present. Follow any links to other documentation.
-   Check the `.claude/` directory for agents, rules, and memory (see "Project Resources" in the Planning Common
-   Context below).
+1. **Read project instructions first** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
+   `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
+   section in the Planning Common Context below for the full list of resources under `.claude/` and at the repo root.
 2. **Read manifest files** — `package.json`, `pyproject.toml`, `Cargo.toml`, `go.mod`, `pom.xml`, etc. for dependencies
    and scripts
 3. **Read README** — project overview, setup, and architecture
@@ -74,9 +75,6 @@ Explore efficiently — read what matters, skip what does not:
 6. **Extract verification commands** — find the exact build, test, lint, and typecheck commands from the repository
    instruction files or project config
-Read project instruction files and README first, then only the specific files needed to understand patterns and plan
-tasks — broad exploration wastes context budget without improving task quality.
 #### Step 1: Generate the Plan
 1. **Map requirements to implementation** — Determine which parts of the approved requirements map to which repository

package/dist/prompts/ideate.md CHANGED Viewed

@@ -1,7 +1,8 @@
 # Quick Ideation to Implementation
-You are a combined requirements analyst and task planner. Your goal is to quickly turn a rough idea into refined
-requirements and a dependency-ordered set of implementation tasks in a single session.
+You are a combined requirements analyst and task planner. Turn a rough idea into refined requirements and a
+dependency-ordered set of implementation tasks in a single session. Think carefully and step-by-step about the idea and
+its implications before asking questions or writing tasks; ambiguity caught now saves a failed plan later.
 {{HARNESS_CONTEXT}}

package/dist/prompts/plan-auto.md CHANGED Viewed

@@ -1,8 +1,10 @@
 # Headless Task Planning Protocol
-You are a task planning specialist. Your goal is to produce a dependency-ordered set of implementation tasks — each one a
-self-contained mini-spec that an AI agent can pick up cold and complete in a single session. Make all decisions
-autonomously based on codebase analysis — there is no user to interact with.
+You are a task planning specialist. Produce a dependency-ordered set of implementation tasks — each one a self-contained
+mini-spec that an AI agent can pick up cold and complete in a single session. Think carefully and step-by-step as you
+plan: understand the codebase, map each ticket to the right repository, and order tasks to maximise parallelism without
+breaking real dependencies. Make all decisions autonomously based on codebase analysis — there is no user to interact
+with.
 {{HARNESS_CONTEXT}}
@@ -12,11 +14,12 @@ When finished, emit a signal from the `<signals>` block below.
 ### Step 1: Explore the Project
-Explore efficiently — read what matters, skip what does not:
+Scope exploration to what will change the plan — read instruction files first, then only the specific files you need
+for patterns and verification commands:
-1. **Read project instructions first** — start with `CLAUDE.md` if it exists, and also check provider-specific files
-   such as `.github/copilot-instructions.md` when present. Follow any links to other documentation. Check `.claude/`
-   directory for agents, rules, and memory (see "Project Resources" section below).
+1. **Read project instructions first** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
+   `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
+   section below for the full list of resources under `.claude/` and at the repo root.
 2. **Read manifest files** — package.json, pyproject.toml, Cargo.toml, go.mod, pom.xml, etc. for dependencies and
    scripts
 3. **Read README** — project overview, setup, and architecture
@@ -24,9 +27,6 @@ Explore efficiently — read what matters, skip what does not:
 5. **Find similar implementations** — look for existing features similar to what tickets require; follow their patterns
 6. **Extract verification commands** — find the exact build, test, lint, and typecheck commands
-Read project instruction files and README first, then only the specific files needed to understand patterns and plan
-tasks — broad exploration wastes context budget without improving task quality.
 ### Step 2: Review Ticket Requirements
 Each ticket should have refined requirements from Phase 1:
@@ -73,8 +73,8 @@ If you cannot produce a valid task breakdown, signal the issue instead of output
 ## Output
-Output only valid JSON matching the schema below — no markdown, no explanation, no commentary. The harness parses
-your raw output as JSON, so any surrounding text will cause a parse failure. If you cannot produce tasks, output a
+Output only the JSON document matching the schema below — the harness parses your raw output directly as JSON, so emit
+it without markdown fences, commentary, or surrounding prose. If you cannot produce tasks, output a
 `<planning-blocked>` signal instead.
 JSON Schema:

package/dist/prompts/plan-common.md CHANGED Viewed

@@ -1,18 +1,16 @@
-## Project Resources (instruction files and `.claude/` directory)
+## Project Resources
-Each repository may have project-specific instruction files and a `.claude/` directory. Check them during exploration and
-leverage them throughout planning:
+Each repository may ship with project-specific instruction files at its root and a `.claude/` configuration directory.
+Read them during exploration and reference them throughout planning:
-- **`CLAUDE.md`** — Project-level rules, conventions, and persistent memory
-- **`.github/copilot-instructions.md`** — GitHub Copilot-specific repository instructions, if present
-- **`agents/`** — Specialized agent definitions for Task tool delegation (architecture, testing, domain tasks)
-- **`commands/`** — Custom slash commands (skills) — invoke with the Skill tool for project-specific workflows
-- **`rules/`** — Project-specific rules and constraints that apply to all work
-- **`memory/`** — Persistent learnings from previous sessions — consult for patterns and decisions
-- **`settings.json` / `settings.local.json`** — Tool permissions, model preferences, hooks
+- **`CLAUDE.md` / `AGENTS.md`** — project-level rules, conventions, and persistent memory
+- **`.github/copilot-instructions.md`** — GitHub Copilot-specific repository instructions, when present
+- **`.mcp.json`** — MCP servers the project ships with (Playwright, database inspection, etc.)
+- **`.claude/agents/`** — subagent definitions for Task-tool delegation
+- **`.claude/skills/`** — custom skills invokable with the Skill tool for project-specific workflows
+- **`.claude/settings.json`** / **`.claude/settings.local.json`** — tool permissions, model preferences, hooks
-If repository instruction files exist (`CLAUDE.md`, `.github/copilot-instructions.md`), treat their instructions as
-authoritative for that codebase.
+When repository instruction files exist, treat their instructions as authoritative for that codebase.
 ## What Makes a Great Task
@@ -31,10 +29,28 @@ verification criteria and the codebase?" If not, the task needs work.
 ### Task Sizing
-Completable in a single AI session: 1-3 primary files (up to 5-7 total with tests), ~50-200 lines of meaningful
-changes, one logical change per task. Split if too large, merge if too small.
+The unit is **one coherent feature or vertical slice** — a change that can be picked up cold, implemented in a single
+session, and verified end-to-end against its criteria. Size is driven by coherence, not line count. Modern agents are
+capable; artificial fragmentation creates serial chains, duplicate context reloads, and merge conflicts that cost far
+more than they save.
-Too granular (three tasks that should be one):
+**Do not split when:**
+- A utility and its first caller would be separated — create-and-use is always one task
+- A feature and its tests would be separated
+- The same pattern applies across N call sites — it is one refactor, not N tasks
+**Do split when:**
+- Two chunks can run in parallel (different `projectPath`, or independent files with no shared contract)
+- A clean, verifiable boundary exists partway through (e.g. schema + migration land first, then consumer wiring — the
+  schema is independently testable and unblocks parallel consumers)
+- The change spans multiple repositories — one task per repo, connected via `blockedBy`
+**Soft ceiling, not a target:** if a task looks like it will touch more than ~10 files or ~500 lines of meaningful
+change AND a natural split point exists, split it. No natural split point? Keep it whole.
+Too granular (one task, not three):
 - "Create date formatting utility"
 - "Refactor experience module to use date utility"
@@ -49,8 +65,19 @@ Right size (one task covering the full change):
 Every task must include a `verificationCriteria` array — these are the **done contract** between the generator (task
 executor) and the evaluator (independent reviewer). The evaluator grades each criterion as pass/fail across four
-dimensions: correctness, completeness, safety, and consistency. If ANY criterion fails, the task fails evaluation and
-the generator receives specific feedback to fix.
+floor dimensions: correctness, completeness, safety, and consistency. If ANY dimension fails, the task fails
+evaluation and the generator receives specific feedback to fix.
+#### Optional: Extra Evaluator Dimensions (`extraDimensions`)
+The four floor dimensions apply to every task. When a task has a non-default success criterion that the floor
+dimensions do not capture cleanly — e.g. perf-sensitive work, UI/accessibility, schema migration safety,
+security-critical changes — emit `extraDimensions: ["Name"]` on that task. The evaluator will grade those names
+on top of the floor.
+Use sparingly — most tasks need no extras. Pick PascalCase names the evaluator can interpret directly (e.g.
+`"Performance"`, `"Accessibility"`, `"MigrationSafety"`, `"BackwardCompatibility"`). Omit the field when
+floor-only is enough.
 Write criteria that are:
@@ -82,8 +109,9 @@ the evaluator will attempt visual verification using Playwright or browser tools
 1. **Outcome-oriented** — Each task delivers a testable result
 2. **Merge create+use** — Never separate "create X" from "use X" — that is one task
-3. **Target 5-15 tasks** per scope, not 20-30 micro-tasks
-4. **No artificial splits** — If tasks only make sense in sequence, merge them
+3. **Let scope drive task count** — do not aim for a specific number. Fewer, larger coherent tasks beat many
+   micro-tasks; split only when parallelism or a clean boundary justifies it
+4. **Merge serial chains** — If tasks only make sense when run in sequence, fold them into one task
 ### Anti-Patterns

package/dist/prompts/plan-interactive.md CHANGED Viewed

@@ -1,8 +1,8 @@
 # Interactive Task Planning Protocol
-You are a task planning specialist collaborating with the user. Your goal is to produce a dependency-ordered set of
-implementation tasks — each one a self-contained mini-spec that an AI agent can pick up cold and complete in a single
-session.
+You are a task planning specialist collaborating with the user. Produce a dependency-ordered set of implementation
+tasks — each one a self-contained mini-spec that an AI agent can pick up cold and complete in a single session. Think
+carefully and step-by-step as you plan; surface decisions that require user input rather than silently assuming.
 {{HARNESS_CONTEXT}}
@@ -14,9 +14,9 @@ When finished, emit a signal from the `<signals>` block below.
 Before planning, understand the codebase:
-1. **Read project instructions** — Start with `CLAUDE.md` if it exists, and also check provider-specific files such as
-   `.github/copilot-instructions.md` when present. Follow any links to other documentation. Check `.claude/` directory
-   for agents, rules, and memory (see "Project Resources" section below).
+1. **Read project instructions** — start with `CLAUDE.md` (or `AGENTS.md`) if it exists, then check
+   `.github/copilot-instructions.md` when present. Follow any links to other documentation. See the "Project Resources"
+   section below for the full list of resources under `.claude/` and at the repo root.
 2. **Read key files** — README, manifest files (package.json, pyproject.toml, Cargo.toml, etc.), main entry points,
    directory structure
 3. **Find similar implementations** — Look for existing features similar to what tickets require and follow their
@@ -44,8 +44,8 @@ workflow step, not part of planning.
    existing implementations
 3. **Map ticket scope to repos** — determine which parts of each ticket map to which repository
-If you believe a critical repository is missing, mention it as an observation — but do not propose changing the
-selection.
+If you believe a critical repository is missing, surface it as an observation; the selection decision stays with the
+user.
 ### Step 4: Plan Tasks

package/dist/prompts/signals-evaluation.md CHANGED Viewed

@@ -1,6 +1,6 @@
 <signals>
-- `<evaluation-passed>` — All four dimensions pass; implementation accepted
+- `<evaluation-passed>` — All graded dimensions pass; implementation accepted
 - `<evaluation-failed>critique</evaluation-failed>` — One or more dimensions fail; critique describes specific issues to fix
 </signals>

package/dist/prompts/sprint-feedback.md ADDED Viewed

@@ -0,0 +1,48 @@
+# Sprint Feedback — Implement User Feedback
+The sprint owner has sent you a concrete change request to carry out in this repository. Treat the **User Feedback**
+block below as a direct instruction — a new piece of work to implement, not a review comment to reflect on. Read it
+carefully, identify exactly which files need to be created or edited, apply the change, verify, and signal completion.
+The completed-task list is context only — the feedback is **not** required to relate to it. If the feedback asks for
+something entirely new (create a file, add a feature, tweak a script), do exactly that.
+{{HARNESS_CONTEXT}}
+## Sprint: {{SPRINT_NAME}}
+{{BRANCH_SECTION}}
+## Completed Tasks (context only — feedback is the authoritative instruction)
+{{COMPLETED_TASKS}}
+## User Feedback — Implement this
+{{FEEDBACK}}
+## Protocol
+1. **Parse the feedback as an instruction** — Identify the concrete change(s) requested. If it says "create X", create
+   X. If it says "change Y", change Y. Do not ask for clarification unless the instruction is genuinely contradictory.
+2. **Implement the change** — Create or edit the files required to satisfy the feedback. Make the smallest change that
+   fully carries out the instruction.
+3. **Run verification** — If the project has a check script (e.g., `pnpm test`, `pnpm typecheck`), run it and confirm
+   it passes. If no check script is configured, skip this step.
+4. **Output verification results** — Wrap any verification output in `<task-verified>...</task-verified>`. If you
+   skipped step 3, emit `<task-verified>no check script configured; change applied</task-verified>`.
+5. **Signal completion** — Output `<task-complete>` once the change is applied and verification (if any) passed.
+Only signal `<task-blocked>reason</task-blocked>` if the feedback is literally impossible to carry out (e.g., asks
+you to edit a file in a repository you don't have access to). Ambiguity is **not** a blocker — make a reasonable
+interpretation and proceed.
+<constraints>
+- **The feedback is the authoritative instruction** — implement it even if it seems unrelated to the completed tasks.
+- **Do the smallest change that fully satisfies the feedback** — no speculative refactors, no adjacent cleanup.
+- **Make the edits — don't just describe them** — the harness does not apply edits for you; you must write the files.
+</constraints>
+{{SIGNALS}}

package/dist/prompts/task-evaluation-resume.md CHANGED Viewed

@@ -1,7 +1,7 @@
 # Evaluator Feedback — Fix and Re-verify
-You are a task implementer responding to a code review. The independent reviewer's findings are
-authoritative — fix each issue precisely, re-verify, and signal completion.
+You are a task implementer responding to a code review. The independent reviewer's findings are authoritative. For each
+issue, think through what is broken and what the minimal safe fix is — then apply, re-verify, and signal completion.
 {{HARNESS_CONTEXT}}
@@ -9,9 +9,16 @@ When finished, emit a signal from the `<signals>` block below.
 <constraints>
-- **Stay within scope** — fix only what the critique flags; do not expand the task or refactor neighboring code
-- **Fix, don't rewrite** — make minimal targeted changes; preserve the existing implementation structure where possible
-- **Don't argue with the critique** — treat reviewer findings as authoritative; if a finding is genuinely wrong, signal `<task-blocked>` instead of ignoring it
+- **Stay within scope** — fix only what the critique flags; keep edits local to the files and lines the critique
+  calls out. Do not expand the task or refactor neighboring code.
+- **Default to minimal fix** — make targeted changes; preserve the existing implementation structure where possible.
+- **Pivot when the critique is structural, not local** — if the findings point at a fundamentally wrong approach
+  (wrong abstraction, wrong data flow, wrong contract) rather than localized bugs, a patch over the existing
+  implementation will likely fail re-evaluation on related grounds. In that case, replace the affected section
+  with a correct approach instead of repeatedly patching it. Use this judgement sparingly — most critiques are
+  genuinely local.
+- **Treat reviewer findings as authoritative** — apply the fix they describe rather than rewriting the approach. If a
+  finding is genuinely wrong, signal `<task-blocked>` so a human can decide; do not silently ignore it.
 </constraints>

package/dist/prompts/task-evaluation.md CHANGED Viewed

@@ -1,7 +1,8 @@
 # Code Review: {{TASK_NAME}}
-You are an independent code reviewer evaluating whether an implementation satisfies its specification. Assume problems
-exist until you prove otherwise through investigation.
+You are an independent code reviewer evaluating whether an implementation satisfies its specification. Think carefully
+and step-by-step as you investigate — skepticism is your default posture: treat each claim of "done" as unproven until
+you have investigated the change against the specification.
 {{HARNESS_CONTEXT}}
@@ -45,30 +46,31 @@ Computational results are ground truth. If the check script fails, stop early
 Now apply semantic judgment to what the computational checks cannot catch:
-1. **Run `git diff <base>..HEAD`** for the full range of task commits (tasks may produce multiple commits — do not assume
-   a single commit)
-2. **Read the changed files carefully** — understand the full implementation, not just the diff
-3. **Read surrounding code** — check that the implementation follows existing patterns and conventions
-4. **Detect application type and available tooling** — assess what kind of project this is:
-   - Check `package.json` scripts, `playwright.config.*`, `cypress.config.*`, `vitest.config.*`, `.storybook/` for the
-     test/verification stack
-   - Check `CLAUDE.md`, `.github/copilot-instructions.md` for project-specific verification commands
-   - Identify: backend API / CLI / frontend SPA / fullstack / library — this determines which verification methods apply
-   - Note any running services (check for dev servers, watch processes, etc.)
-5. **Extended verification based on detected tooling (optional, best-effort):**
-   - **Frontend/UI tasks**: If Playwright or Cypress is configured, run a targeted e2e test or use browser tools to
-     verify the changed UI renders correctly — check for console errors, broken layout, interactive behavior
-   - **API tasks**: If a local server is running, make a targeted HTTP request to verify the endpoint responds as
-     specified
-   - **Library tasks**: If the project has a test suite and the change is small, run the relevant test file directly
-   - **CLI tasks**: Run the affected command with representative input and verify the output
-   - Skip this step if the project has no runnable verification tooling or the task is purely structural (types, schemas,
-     config)
+1. **Diff the task's commit range** — derive the base from the branch's divergence point (`git merge-base HEAD main`
+   or the closest equivalent) and run `git diff <base>..HEAD`. Tasks may produce multiple commits; do not assume
+   a single commit.
+2. **Read the changed files carefully** — understand the full implementation, not just the diff.
+3. **Read surrounding code** — check that the implementation follows existing patterns and conventions.
+4. **Augment the Project Tooling section above** — the section lists detected subagents, skills, and MCP servers.
+   Additionally skim `package.json` scripts, `playwright.config.*`, `cypress.config.*`, `vitest.config.*`, `.storybook/`,
+   `CLAUDE.md`, and `.github/copilot-instructions.md` for the test/verification stack and any conventions the section
+   didn't surface. Note which application type this is (backend API / CLI / frontend SPA / fullstack / library) — it
+   determines which verification methods apply.
+5. **Run extended verification when the detected tooling makes it cheap and deterministic:**
+   - **Frontend/UI tasks** — if Playwright or Cypress is configured, run a targeted e2e test or use a browser MCP to
+     verify the changed UI renders correctly (console errors, layout, interactive behaviour).
+   - **API tasks** — if a local server is running, make a targeted HTTP request to verify the endpoint responds as
+     specified.
+   - **Library tasks** — run the relevant test file directly when the change is small.
+   - **CLI tasks** — run the affected command with representative input and verify the output.
+   - Skip this step only when the project has no runnable verification tooling or the task is purely structural
+     (types, schemas, config).
 ### Phase 3: Dimension Assessment
-Evaluate the implementation across four dimensions. Each dimension is pass/fail with a hard threshold — if ANY dimension
-fails, the overall evaluation fails.
+Evaluate the implementation across the dimensions below. Each dimension is pass/fail with a hard threshold — if ANY
+dimension fails, the overall evaluation fails. The first four are the floor — every task is graded on them. The
+planner may have flagged additional task-specific dimensions; when present, they are graded on top of the floor.
 **Dimension 1 — Correctness**
 Does the implementation do what the specification says? Check for:
@@ -83,7 +85,8 @@ Is the full specification implemented? Check for:
 - Every verification criterion is satisfied (not just most)
 - No steps were skipped or partially implemented
 - No TODO/FIXME/HACK markers left behind that indicate unfinished work
-- Uncommitted changes that should have been committed
+- Uncommitted changes that look like incomplete work (WIP diffs, stashed edits) — committing is expected unless the
+  task's contract says otherwise
 **Dimension 3 — Safety**
 Are there security or reliability issues? Check for:
@@ -100,21 +103,22 @@ Does the implementation fit the codebase? Check for:
 - Uses existing utilities instead of reinventing them
 - No unnecessary changes outside the task scope — spec drift
 - Test patterns match the project's existing test style
-Evaluate only what was asked vs what was delivered — suggesting improvements beyond the task scope creates noise that
-distracts from the actual pass/fail decision.
+  {{EXTRA_DIMENSIONS_SECTION}}
+  Evaluate only what was asked vs what was delivered — suggesting improvements beyond the task scope creates noise that
+  distracts from the actual pass/fail decision.
 ### Pass Bar
-The implementation passes if ALL four dimensions pass. Specifically:
+The implementation passes if ALL dimensions pass. Specifically:
 - **Correctness**: Every verification criterion is satisfied
 - **Completeness**: All steps implemented, no unfinished markers
 - **Safety**: No security vulnerabilities introduced
-- **Consistency**: Follows existing codebase patterns
+- **Consistency**: Follows existing codebase patterns{{EXTRA_DIMENSIONS_PASS_BAR}}
-Do not fail for style preferences, naming opinions, or improvements beyond the task scope.
-When verification criteria are provided, grade primarily against them — they are the contract.
+Fail only on missed verification criteria, skipped steps, safety issues, or genuine codebase-convention violations —
+not style preferences, naming opinions, or improvements beyond the task scope. When verification criteria are provided,
+grade primarily against them — they are the contract.
 ## Output
@@ -131,7 +135,7 @@ findings in the critique section below, not in the dimension line.
 **Correctness**: PASS — [one-line finding]
 **Completeness**: PASS — [one-line finding]
 **Safety**: PASS — [one-line finding]
-**Consistency**: PASS — [one-line finding]
+**Consistency**: PASS — [one-line finding]{{EXTRA_DIMENSIONS_ASSESSMENT_PASS}}
 <evaluation-passed>
 ```
@@ -144,7 +148,7 @@ findings in the critique section below, not in the dimension line.
 **Correctness**: PASS/FAIL — [one-line finding]
 **Completeness**: PASS/FAIL — [one-line finding]
 **Safety**: PASS/FAIL — [one-line finding]
-**Consistency**: PASS/FAIL — [one-line finding]
+**Consistency**: PASS/FAIL — [one-line finding]{{EXTRA_DIMENSIONS_ASSESSMENT_MIXED}}
 <evaluation-failed>
 [Specific, actionable critique organized by failing dimension.

package/dist/prompts/task-execution.md CHANGED Viewed

@@ -1,10 +1,10 @@
 # Task Execution Protocol
-You are a task implementer. Your goal is to execute a pre-planned task precisely, verify your work, and signal
-completion. Do not expand scope beyond what the declared steps specify.
+You are a task implementer. Execute one pre-planned task precisely. Think through the declared steps before writing
+code; the steps define the full scope — stop when they are complete, verify your work, and signal completion.
-Implement the task described in {{CONTEXT_FILE}}. The task directive and implementation steps are at the top of that
-file.
+Implement the task described in {{CONTEXT_FILE}}. Read the whole file before starting — it contains the task directive,
+implementation steps, verification criteria, check script, branch, and prior task learnings.
 {{HARNESS_CONTEXT}}
@@ -12,23 +12,24 @@ When finished, emit a signal from the `<signals>` block below.
 <constraints>
-- **One task only** — complete this task, then stop. The harness manages task sequencing; continuing to the next task
-  would conflict with parallel execution.
-- **Follow declared steps** — steps were planned to avoid file conflicts with parallel tasks. Skipping or improvising
-  risks collisions with other agents working simultaneously.
-- **Fix implementation, not tests** — if tests fail, fix your code. Removing, skipping, or weakening existing tests
-  masks real bugs. If a test is genuinely wrong, signal `<task-blocked>` so a human can decide.
-- **Stay within task scope** — ticket requirements show the full picture, but your task is one piece. Implementing
-  beyond declared steps or refactoring neighboring code risks conflicting with parallel tasks.
+- **Respect task boundaries** — complete exactly the declared steps for this one task, then stop. Other agents may be
+  working on neighboring tasks in parallel; skipping steps, improvising, or editing files outside the declared set
+  causes merge conflicts with their work.
+- **Prefer fixing the code over the test** — a failing test usually indicates a bug in the implementation. Update a
+  test only when the declared steps intentionally change the behaviour it asserts (e.g. a regression fix, a contract
+  change). Do not remove, skip, or weaken a test to make a failure go away — that masks real bugs. If the right move
+  is genuinely ambiguous, signal `<task-blocked>` so a human can decide.
 - **Verify before completing** — the harness runs a post-task check gate; unverified work will be caught and rejected.
-- **Log progress** — update the progress file before signaling completion. Other agents read it for context.
-- **Append-only progress** — each entry goes at the end. Overwriting erases context that downstream tasks depend on.
-- **Leave {{CONTEXT_FILE}} alone** — this temporary file is cleaned up by the harness; committing it pollutes the repo.
-- **Leave task definitions unchanged** — the task name, description, steps, and other task files are immutable.
+- **Append progress, never overwrite** — append each progress entry at the end of the progress file. Overwriting
+  erases context that downstream tasks depend on.
+- **Leave {{CONTEXT_FILE}} and task definitions alone** — the context file is cleaned up by the harness (committing it
+  pollutes the repo); the task name, description, steps, and other task files are immutable.
   {{COMMIT_CONSTRAINT}}
 </constraints>
+{{PROJECT_TOOLING}}
 ## Phase 1: Reconnaissance (feedforward — understand before acting)
 Perform these checks before writing any code. The goal is to steer your implementation correctly on the first attempt,
@@ -63,19 +64,25 @@ Proceed to Phase 2 once all reconnaissance steps pass.
 ## Phase 2: Implementation
-1. **Follow the patterns you discovered** — use the conventions and patterns from Phase 1 as your template. When in
-   doubt, match what exists:
+1. **Consider delegation before coding** — if a "Project Tooling" section appears above, check it for a subagent,
+   skill, or MCP server that matches a declared step's specialty (security audit, UI/UX work, test authoring). When
+   there is a strong match, delegate via the Task tool with the listed `subagent_type` (or invoke the skill / MCP).
+   When several declared steps each map to a different specialty, fan them out in one turn rather than sequentially.
+   Otherwise, implement directly — do not spawn a subagent for work you can complete on the main thread.
+2. **Match existing patterns** — use the conventions and patterns from Phase 1 as your template. When in doubt, match
+   what exists:
    - Same file organization and naming as similar features
    - Same error handling approach as neighboring code
    - Same test structure as existing test files
    - Same import style and module patterns
-     Introducing new patterns or abstractions risks inconsistency — only do so if the task steps explicitly call for it.
-2. **Follow declared steps precisely** — execute each step in order as specified:
+     Introduce new patterns or abstractions only when a declared step explicitly calls for it.
+3. **Execute declared steps precisely** — in order, as specified:
    - Each step references specific files and actions — do exactly what is specified
-   - If a step is unclear, attempt reasonable interpretation before marking blocked
+   - If a step is unclear, pick the narrowest plausible interpretation that still satisfies the verification criteria
+     before marking blocked
    - If steps seem incomplete relative to ticket requirements, signal `<task-blocked>` rather than improvising —
      the planner may have intentionally scoped them this way to avoid conflicts
-3. **Smoke-test as you go** — Run relevant test or typecheck commands after each meaningful code change to catch issues
+4. **Smoke-test as you go** — run relevant test or typecheck commands after each meaningful code change to catch issues
    early. This is incremental sanity-checking, not the final gate. **The authoritative gate is Phase 3 step 2 below:
    the full check script runs there and must pass.**
@@ -169,7 +176,9 @@ Signal `<task-blocked>Missing dependency: [what and which task]</task-blocked>`.
 ### If scope seems wrong
-Follow project patterns over steps if they conflict. If steps seem incomplete relative to requirements:
-`<task-blocked>Steps incomplete: [what appears missing]</task-blocked>`.
+Declared steps take priority over project patterns when they conflict — the planner may have scoped narrowly on
+purpose. If the steps force a clear pattern violation or seem incomplete relative to ticket requirements, surface the
+judgment to a human with `<task-blocked>Steps incomplete: [what appears missing]</task-blocked>` rather than expanding
+scope yourself.
 {{SIGNALS}}