@exaudeus/workrail 3.78.2 → 3.79.1

@@ -621,6 +621,18 @@ function validateAndResolveTrigger(raw, env, workspaces = {}) {
621
621
  let agentConfig;
622
622
  if (raw.agentConfig) {
623
623
  const model = raw.agentConfig.model?.trim() || undefined;
624
+ if (model !== undefined) {
625
+ const slashIdx = model.indexOf('/');
626
+ const provider = slashIdx === -1 ? '' : model.slice(0, slashIdx);
627
+ const modelId = slashIdx === -1 ? '' : model.slice(slashIdx + 1);
628
+ if (!provider || !modelId) {
629
+ return (0, result_js_1.err)({
630
+ kind: 'invalid_field_value',
631
+ field: `agentConfig.model (must be in 'provider/model-id' format, e.g. amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0, got: "${model}")`,
632
+ triggerId: rawId,
633
+ });
634
+ }
635
+ }
624
636
  let maxSessionMinutes;
625
637
  if (raw.agentConfig.maxSessionMinutes !== undefined) {
626
638
  const asNumber = Number(raw.agentConfig.maxSessionMinutes);
@@ -1064,6 +1076,22 @@ function validateTriggerStrict(trigger) {
1064
1076
  suggestedFix: 'branchPrefix: worktrain/',
1065
1077
  });
1066
1078
  }
1079
+ if (trigger.agentConfig?.model !== undefined) {
1080
+ const m = trigger.agentConfig.model;
1081
+ const slashIdx = m.indexOf('/');
1082
+ const provider = slashIdx === -1 ? '' : m.slice(0, slashIdx);
1083
+ const modelId = slashIdx === -1 ? '' : m.slice(slashIdx + 1);
1084
+ if (!provider || !modelId) {
1085
+ issues.push({
1086
+ rule: 'invalid-model-format',
1087
+ severity: 'error',
1088
+ triggerId: id,
1089
+ message: `agentConfig.model "${m}" is not in 'provider/model-id' format -- ` +
1090
+ 'both provider and model-id must be non-empty (e.g. amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0)',
1091
+ suggestedFix: 'model: amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0',
1092
+ });
1093
+ }
1094
+ }
1067
1095
  if (trigger.concurrencyMode === 'parallel' &&
1068
1096
  (!trigger.branchStrategy || trigger.branchStrategy === 'none') &&
1069
1097
  (trigger.autoCommit || trigger.autoOpenPR)) {
@@ -115,7 +115,7 @@ export interface WebhookEvent {
115
115
  readonly payload: Readonly<Record<string, unknown>>;
116
116
  readonly signature?: string;
117
117
  }
118
- export type TriggerValidationRule = 'autocommit-needs-worktree' | 'autoopenpr-needs-autocommit' | 'worktree-needs-base-branch' | 'worktree-needs-prefix' | 'parallel-without-worktree' | 'missing-goal-template' | 'missing-max-session-minutes' | 'missing-max-turns' | 'autocommit-on-main-checkout' | 'missing-max-queue-depth';
118
+ export type TriggerValidationRule = 'autocommit-needs-worktree' | 'autoopenpr-needs-autocommit' | 'worktree-needs-base-branch' | 'worktree-needs-prefix' | 'parallel-without-worktree' | 'missing-goal-template' | 'missing-max-session-minutes' | 'missing-max-turns' | 'autocommit-on-main-checkout' | 'missing-max-queue-depth' | 'invalid-model-format';
119
119
  export interface TriggerValidationIssue {
120
120
  readonly rule: TriggerValidationRule;
121
121
  readonly severity: 'error' | 'warning' | 'info';
@@ -610,6 +610,64 @@ Tier 0 injection needs a dedicated system prompt section separate from `assemble
610
610
 
611
611
  ---
612
612
 
613
+ ### Interpretation checkpoint for coding workflow: Candidate 5 (May 6, 2026)
614
+
615
+ **Status: done** | Shipped in PR #962 (feat/etienneb/interpretation-checkpoint, May 7, 2026)
616
+
617
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
618
+
619
+ Added a `phase-0c-assumption-verification` step to `wr.coding-task` (v1.3.0 → v1.4.0) between Phase 0 (classify) and Phase 0.5 (upstream context). The step requires the coding agent to state a one-sentence interpretation before listing any assumptions, produce exactly 3 codebase assumptions with predicted locations and severity labels, verify each assumption by reading the predicted location, and output an `InterpretationArtifact` context key with `ambiguityLevel: clear | uncertain`. High-severity refutations surface to the operator via `report_issue`. Also appended a Subtype A/B classification prompt to the retrospective step for distribution measurement.
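A minimal sketch of what the `InterpretationArtifact` context key could look like; the artifact's top-level fields (interpretation, assumptionList, ambiguityLevel) come from the step's verify block, while the per-assumption field names are inferred from the procedure text rather than taken from a published schema:

```typescript
// Sketch only -- per-assumption field names are assumptions, not a committed schema.
interface VerifiedAssumption {
  claim: string;                 // the design pattern / architectural invariant being asserted
  predictedLocation: string;     // file or code location that would corroborate it
  severity: 'high' | 'low';      // labeled at statement time, before verification
  result: 'confirmed' | 'refuted' | 'unresolvable';
}

interface InterpretationArtifact {
  interpretation: string;        // "I am building [X]."
  assumptionList: [VerifiedAssumption, VerifiedAssumption, VerifiedAssumption]; // exactly 3
  ambiguityLevel: 'clear' | 'uncertain';
}
```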
620
+
621
+ This is the first step of the intent gap intervention sequence: Candidate 5 (shipped) → Candidate 4 (git-grounded context, next) → Candidate 1 or 3 gated on Subtype A/B empirical data.
622
+
623
+ ---
624
+
625
+ ### External assumption ranking for interpretation checkpoint (May 6, 2026)
626
+
627
+ **Status: idea** | Priority: medium
628
+
629
+ **Score: 10** | Cor:2 Cap:2 Eff:2 Lev:2 Con:2 | Blocked: no (Candidate 5 shipped PR #962, May 7, 2026)
630
+
631
+ The interpretation checkpoint (Candidate 5) asks the coding agent to label each of its own assumptions as `severity: high` or `severity: low`. This self-labeling is a known weak point: an agent with a confident wrong prior may mislabel its most dangerous architectural assumption as low-severity to avoid triggering the gate. Self-assessed severity is the single lowest-confidence element in the pitch (confidence: 0.55).
632
+
633
+ An external agent -- one that did not produce the assumptions -- can independently rank them by actual risk before verification runs. The external agent receives only the ticket and the assumption list (not the producing agent's full context or reasoning) and answers: which of these assumptions is most load-bearing? Which, if wrong, would cause the most damage? Are there high-risk areas this agent didn't surface at all?
634
+
635
+ The producing agent then verifies in order of externally-ranked risk rather than self-assessed severity. Severity classification moves from self-labeling to an independent signal, removing the 0.55-confidence gap entirely.
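A sketch of the request/response shape for the ranking call; every field name here is an assumption made to pin down the protocol described above, not an existing API:

```typescript
// The ranker receives only the ticket and the bare assumption list --
// deliberately nothing from the producing agent's context or reasoning.
interface RankingRequest {
  ticket: string;
  assumptions: { id: string; claim: string }[];
}

interface ExternalRanking {
  rankedAssumptionIds: string[];       // most load-bearing first; verification follows this order
  unsurfacedRiskAreas: string[];       // high-risk areas the producing agent did not mention
  severityDisagreements: string[];     // shown to the producing agent so it can explain the delta
}
```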
636
+
637
+ **Relationship to targeted session review:** the external ranking agent's output is also a high-signal review moment -- if the external agent flags assumptions the producing agent didn't think to surface, that delta is direct evidence of an interpretation gap.
638
+
639
+ **Things to hash out:**
640
+ - What context does the ranking agent receive? Ticket + assumption list only, or also the affected file list and design lock references? More context improves ranking quality but risks compromising the ranker's independence.
641
+ - Is this a lightweight parallel call (runs simultaneously with verification setup) or a blocking step?
642
+ - How are conflicts between self-assessed severity and external ranking resolved? External ranking should win, but the producing agent should see the disagreement and explain it.
643
+ - Cost: one additional inference call per session. Acceptable for standard/thorough sessions; probably skip for QUICK mode.
644
+
645
+ ---
646
+
647
+ ### Intent gap correction: fix the interpretation after assumption refutation (May 6, 2026)
648
+
649
+ **Status: idea** | Priority: medium
650
+
651
+ **Score: 10** | Cor:2 Cap:2 Eff:2 Lev:2 Con:2 | Blocked: no (Candidate 5 shipped PR #962, May 7, 2026)
652
+
653
+ When an agent's assumption-surfacing step (Candidate 5) refutes a high-severity assumption, the current scoped fix is to surface the refutation to the operator and halt. But the real problem is deeper: the wrong prior that caused the refuted assumption may have already contaminated earlier context -- the upstream context harvest, the problem framing, the `reframedProblem` and `challengedAssumptions` context keys. A simple "re-read the file and try again" doesn't fix a wrong model; it patches the symptom in one step while leaving the contaminated context intact. Long-term, a refuted assumption that reflects a codebase-specific wrong prior (Subtype B) should also update the Memory store and eventually the knowledge graph so future sessions don't repeat the mistake.
654
+
655
+ This is explicitly out of scope for the Candidate 5 pitch -- detection is the right first boundary. Correction is a separate, larger problem that depends on session context rollback, Memory store integration, and eventually the knowledge graph.
656
+
657
+ **Done looks like:** when a high-severity assumption is refuted mid-session, the system can: (1) identify which prior context keys were formed under the wrong prior, (2) trigger a targeted correction sub-flow that re-derives those keys with the corrected interpretation, (3) write the correction back to the Memory store so future sessions in this workspace start with the right prior.
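A sketch of the record a correction sub-flow might produce, with all field names assumed; the two context keys in the comment are the ones named above:

```typescript
// Illustrative only -- the entry deliberately leaves the design open.
interface RefutedAssumptionCorrection {
  refutedClaim: string;
  correctedClaim: string;
  contaminatedContextKeys: string[];   // e.g. 'reframedProblem', 'challengedAssumptions'
  scope: 'local' | 'systematic';       // wrong about one file vs. a codebase-wide wrong prior
  writeToMemoryStore: boolean;         // presumably only systematic (Subtype B) corrections persist
}
```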
658
+
659
+ **Things to hash out:**
660
+ - What is the right granularity for context rollback? Rolling back individual keys vs. re-running entire prior phases are very different costs.
661
+ - How do you distinguish "assumption was wrong about this specific file" (local fix) from "assumption reflects a systematic wrong prior about this codebase pattern" (Memory store update warranted)?
662
+ - What is the trigger for a Memory store write -- every refuted high-severity assumption, or only ones confirmed as Subtype B by retrospective labeling?
663
+ - How does this interact with the knowledge graph when it ships? The assumption store (Candidate 2 from the intent gap discovery) and the knowledge graph are both candidates for receiving the correction signal.
664
+
665
+ **Relationship to existing entries:**
666
+ - Blocked by: Candidate 5 (assumption surfacing step) -- detection must exist before correction can be designed
667
+ - Related to: Subtype B intent failure (below), Knowledge graph (backlog), Memory store / living work context (shipped PR #939, #948, #952)
668
+
669
+ ---
670
+
613
671
  ### Subtype B intent failure: agent has a wrong prior about what this codebase does (May 5, 2026)
614
672
 
615
673
  **Status: idea -- needs empirical study before design** | Priority: high
@@ -1652,6 +1710,77 @@ Combined with the `DEFAULT_MAX_TURNS` cap, this provides defense-in-depth agains
1652
1710
 
1653
1711
  The durable session store, v2 engine, and workflow authoring features shared by all three systems.
1654
1712
 
1713
+ ### Coordinator-managed typed output vocabulary: agent emits typed events, coordinator reacts per type (May 7, 2026)
1714
+
1715
+ **Status: idea** | Priority: high
1716
+
1717
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
1718
+
1719
+ Today, agent output is largely untyped -- notes, artifacts, context keys. The coordinator reacts to typed handoff artifacts at phase boundaries, but within a session the agent's observations, decisions, findings, and suggestions are all prose. The coordinator cannot programmatically react to them.
1720
+
1721
+ The idea: the coordinator owns a vocabulary of typed output kinds that it supports. Before a session starts, it injects that vocabulary into the agent's context -- the agent knows exactly what typed things it can emit and what each one means. When the agent emits a typed output, the coordinator reacts with the appropriate process for that type. The reaction is deterministic coordinator logic (not LLM reasoning), specified per type.
1722
+
1723
+ **Examples of typed output kinds and coordinator reactions:**
1724
+
1725
+ - `suggestion(kind: "abstraction_extraction")` → coordinator fires targeted verification: "what are the three future cases this serves?"
1726
+ - `finding(severity: "critical", area: "security")` → coordinator routes to immediate review, may block merge
1727
+ - `decision(chose: X, over: Y, rationale: ...)` → coordinator checks for conflicts with prior decisions in the session store
1728
+ - `scope_change(direction: "larger", reason: ...)` → coordinator re-evaluates task complexity, may re-route to a heavier workflow
1729
+ - `blocker(kind: "missing_context", what: ...)` → coordinator attempts to resolve the blocker from known sources before surfacing to operator
1730
+ - `learning(claim: ..., area: ..., confidence: ...)` → coordinator writes to the assumption store for future sessions
1731
+ - `assumption(claim: ..., severity: ...)` → coordinator gates on verification before proceeding (Candidate 5 is a specific instance of this)
1732
+
1733
+ **What makes this powerful:**
1734
+ The agent doesn't need to know what happens next when it emits a typed output -- that's the coordinator's job. The agent just has to recognize "this is an assumption I'm making" or "this is a scope change I'm noticing" and emit the right type. The coordinator's reaction logic handles the rest deterministically, without LLM turns.
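A sketch of what a closed vocabulary plus deterministic dispatch could look like, assuming a TypeScript discriminated union; the field names and reaction strings below are illustrative, not the engine's actual API:

```typescript
// Hedged sketch: one possible shape for the coordinator-owned vocabulary.
type TypedAgentOutput =
  | { type: 'suggestion'; kind: string; detail: string }
  | { type: 'finding'; severity: 'critical' | 'major' | 'minor'; area: string; detail: string }
  | { type: 'decision'; chose: string; over: string; rationale: string }
  | { type: 'scope_change'; direction: 'larger' | 'smaller'; reason: string }
  | { type: 'blocker'; kind: 'missing_context'; what: string }
  | { type: 'learning'; claim: string; area: string; confidence: number }
  | { type: 'assumption'; claim: string; severity: 'high' | 'low' };

interface CoordinatorReaction {
  action: string;   // which deterministic coordinator process runs next
}

// Per-type reaction logic is plain code, not LLM reasoning.
function reactTo(output: TypedAgentOutput): CoordinatorReaction {
  switch (output.type) {
    case 'suggestion':
      return { action: `fire targeted verification for kind "${output.kind}"` };
    case 'finding':
      return { action: output.severity === 'critical' ? 'route to immediate review' : 'log for review summary' };
    case 'decision':
      return { action: 'check session store for conflicting prior decisions' };
    case 'scope_change':
      return { action: 're-evaluate task complexity and routing' };
    case 'blocker':
      return { action: 'attempt resolution from known sources before surfacing' };
    case 'learning':
      return { action: 'write to assumption store' };
    case 'assumption':
      return { action: 'gate on verification before proceeding' };
    default:
      return { action: 'unhandled output type' };
  }
}
```

The `default` branch is where the "agent emits a type the coordinator doesn't handle" question from the hash-out list below would get answered.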
1735
+
1736
+ **Relationship to existing entries:**
1737
+ - "Typed suggestion artifacts with workflow-directed verification" (below): a specific application of this pattern to suggestions
1738
+ - "Coordinator mid-session hooks": the coordinator's reaction to typed outputs is exactly a mid-session hook triggered by a specific event type
1739
+ - "Candidate 5 / interpretation checkpoint": the assumption verification step is a manually-implemented instance of this pattern for one output type
1740
+ - "Coordinator session store awareness": the coordinator's reaction to a `learning` or `decision` type can write to the session store for future sessions
1741
+
1742
+ **Things to hash out:**
1743
+ - Who defines the vocabulary of supported types -- the engine (closed set), the workflow author (per-workflow), or the coordinator (per-deployment)?
1744
+ - How does the agent learn what types are available? Injected in the system prompt, declared in the workflow, or both?
1745
+ - What is the API surface for emitting a typed output? A dedicated tool, a structured artifact field, a reserved context key pattern?
1746
+ - How are reactions defined? TypeScript in the coordinator script, declarative rules in triggers.yml, or something else?
1747
+ - What happens when the agent emits a type the coordinator doesn't handle? Silent drop, warning, or error?
1748
+ - Should typed outputs be visible in the console as first-class events, or only in the raw session log?
1749
+
1750
+ ---
1751
+
1752
+ ### Typed suggestion artifacts with workflow-directed verification (May 7, 2026)
1753
+
1754
+ **Status: idea** | Priority: medium
1755
+
1756
+ **Score: 11** | Cor:2 Cap:3 Eff:2 Lev:2 Con:2 | Blocked: no
1757
+
1758
+ Agents frequently make suggestions mid-workflow -- propose an abstraction, recommend a deferral, flag a scope expansion, suggest a performance optimization. Today these live in plain prose notes. The workflow cannot distinguish one type of suggestion from another, cannot apply targeted follow-up logic, and cannot verify that the suggestion was actually scrutinized before being accepted. A suggestion that warrants architectural review gets the same treatment as one that warrants nothing.
1759
+
1760
+ The idea: a typed `suggestion` tool call that the agent makes instead of embedding the suggestion in prose. The artifact carries a `kind` field (closed enum, workflow-declared) that tells the engine what type of suggestion this is. The workflow author declares, per suggestion kind, what verification the engine should require before the suggestion is accepted.
1761
+
1762
+ **Example suggestion kinds and their natural follow-up scrutiny:**
1763
+ - `abstraction_extraction` -- "is this premature? what are the three concrete future cases this serves? does any of them exist in the current backlog? does this introduce coupling that didn't exist before?"
1764
+ - `architectural_change` -- "does this conflict with any design locks? what breaks downstream?"
1765
+ - `scope_expansion` -- "is this actually in scope? is this the scope rationalization failure mode -- the agent declaring it's a separate ticket to avoid doing the work?"
1766
+ - `deferral` -- "is this genuinely separate work, or is the agent completing checkboxes while leaving real work undone?"
1767
+ - `performance_optimization` -- "is this premature? what is the actual measured bottleneck? what evidence justifies this now?"
1768
+
1769
+ **Mechanism:** fits naturally with the assessment gate system. A `suggestion_quality` assessment with dimensions specific to the suggestion kind. The workflow author declares which dimensions apply to each kind. When the agent emits a typed suggestion, the engine fires a `require_followup` consequence requiring the agent to answer the verification criteria for that kind before proceeding. If the agent cannot answer them satisfactorily, the suggestion does not pass.
1770
+
1771
+ **API shape is open:** the typed suggestion could be a dedicated tool call (`suggest(type: "abstraction_extraction", ...)`), a structured artifact field in `continue_workflow`, a special context key, or something else entirely. The key property is that it is machine-readable and has a `kind` field the engine can act on -- not prose. The exact surface needs design work.
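One possible shape, assuming a dedicated artifact type plus a workflow-declared criteria map; the kind list mirrors the examples above, but every field name and criterion string here is a placeholder for discussion, not a committed surface:

```typescript
// Sketch only -- not the actual API surface, which still needs design work.
type SuggestionKind =
  | 'abstraction_extraction'
  | 'architectural_change'
  | 'scope_expansion'
  | 'deferral'
  | 'performance_optimization';

interface TypedSuggestion {
  kind: SuggestionKind;
  summary: string;     // one-sentence statement of the suggestion
  rationale: string;   // why the agent believes it is warranted
}

// Workflow-declared verification criteria, keyed by kind. The engine would turn
// these into a require_followup consequence before the suggestion is accepted.
const verificationCriteria: Record<SuggestionKind, readonly string[]> = {
  abstraction_extraction: [
    'What are the three concrete future cases this abstraction serves?',
    'Does this introduce coupling that did not exist before?',
  ],
  architectural_change: [
    'Does this conflict with any design locks?',
    'What breaks downstream?',
  ],
  scope_expansion: ['Is this actually in scope, or is it scope rationalization?'],
  deferral: ['Is this genuinely separate work, or real work left undone?'],
  performance_optimization: ['What is the actual measured bottleneck that justifies this now?'],
};

function followupFor(suggestion: TypedSuggestion): readonly string[] {
  return verificationCriteria[suggestion.kind];
}
```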
1772
+
1773
+ **The friction concern:** if suggestions require too much overhead, agents will stop surfacing them or bury them in prose to avoid the gate. The verification criteria must be targeted and lightweight -- not a full review pass, just the specific questions that matter for that kind. "What are the three future cases this abstraction serves?" is lightweight. "Run a full architecture review" is not.
1774
+
1775
+ **Things to hash out:**
1776
+ - What is the closed set of suggestion kinds for the initial version? Too many kinds creates complexity; too few misses the point.
1777
+ - Should suggestion kinds be workflow-declared (each workflow author defines their own) or engine-owned (a closed set the engine enforces)? Engine-owned is more consistent but less flexible.
1778
+ - How does the agent signal that a suggestion was considered and rejected, not just overlooked? A declined suggestion should be as visible as an accepted one.
1779
+ - Does the verification happen inline (a `require_followup` on the same step) or as a separate verification step? Inline is lower friction; a separate step is more auditable.
1780
+ - How does this interact with the existing `report_issue` mechanism? Some suggestions that fail verification should surface to the operator, not just loop back to the agent.
1781
+
1782
+ ---
1783
+
1655
1784
  ### WorkTrain as the canonical workflow author -- MCP as a derived runtime (Apr 30, 2026)
1656
1785
 
1657
1786
  **Status: idea** | Priority: high
@@ -1954,6 +2083,37 @@ Surface in: `worktrain status`, `worktrain health <sessionId>`, console session
1954
2083
 
1955
2084
  Coordinator design patterns for WorkTrain's autonomous pipeline.
1956
2085
 
2086
+ ### Reliable synthetic human gates: mimicking operator approval and refusal in autonomous pipelines (May 6, 2026)
2087
+
2088
+ **Status: idea** | Priority: high
2089
+
2090
+ **Score: 13** | Cor:3 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2091
+
2092
+ WorkTrain's pipeline has several points where a human operator would naturally approve, reject, or redirect -- confirming an interpretation before coding starts, approving a direction from discovery, accepting a shaped pitch. In guided MCP sessions these gates fire as `requireConfirmation` steps. In fully autonomous daemon sessions, they either don't fire or surface to the operator outbox and wait indefinitely. There is currently no reliable mechanism for the coordinator to make these gate decisions autonomously in a way that is trustworthy enough to substitute for human judgment.
2093
+
2094
+ The problem is not just "add an LLM to make the decision." An LLM making approval decisions is subject to the same sycophancy, self-enhancement bias, and overconfidence problems the rest of the pipeline has. A naïve "spawn an agent to approve this" produces rubber-stamping, not genuine gatekeeping. What is needed is a structured, auditable, multi-signal gate that approximates what a careful human reviewer would do -- checking specific criteria, flagging specific concerns, requiring specific evidence before proceeding.
2095
+
2096
+ **What a strong synthetic gate needs:**
2097
+ - Typed criteria against which the artifact is evaluated (not free-form "does this look good?")
2098
+ - An independent agent that did not produce the artifact being evaluated
2099
+ - A cross-family challenger where possible (different model family = different correlated blind spots)
2100
+ - A structured verdict with explicit rationale tied to the criteria, not a confidence score
2101
+ - An escalation path when the synthetic gate is uncertain -- surface to operator rather than rubber-stamp
2102
+
2103
+ **Use cases that need this:**
2104
+ - Interpretation checkpoint: does the coded assumption set actually cover the architectural risks for this ticket?
2105
+ - Shaping approval: does the pitch have genuine acceptance criteria or are they vague enough to accept anything?
2106
+ - Discovery direction: is the selected direction actually distinct from the runner-up, or are they the same approach with different labels?
2107
+ - Review verdict: is this finding severe enough to block merge, or is it a style preference being inflated?
2108
+
2109
+ **Things to hash out:**
2110
+ - What is the right abstraction? A reusable `synthetic-gate` routine that takes typed criteria + artifact and returns a structured verdict? Or specialized gates per use case?
2111
+ - How do you prevent the synthetic gate from being gamed by the same agent that produced the artifact? The gate agent must not have access to the producing agent's reasoning, only its output.
2112
+ - What is the confidence threshold below which the synthetic gate escalates to a human rather than deciding? And how is that threshold configured per trigger?
2113
+ - How do you validate that a synthetic gate is actually performing the function of a human gate -- not just producing confident verdicts? Requires a calibration dataset of known-correct and known-incorrect artifacts with human ground truth.
2114
+ - Relationship to the `requireConfirmation` gate mechanism: the synthetic gate is the autonomous equivalent. It should produce the same typed routing signal the human confirmation gate produces, so the coordinator routing logic doesn't need to know which kind of gate fired.
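Picking up that last point, a rough sketch of the structured verdict and the escalation check, under the assumption that the gate returns per-criterion findings; the type and field names are invented here and would need to line up with whatever routing signal the `requireConfirmation` gate already emits:

```typescript
// Sketch under assumptions -- not the existing gate contract.
interface CriterionFinding {
  criterionId: string;     // e.g. 'acceptance-criteria-are-falsifiable' (hypothetical)
  met: boolean;
  rationale: string;       // explicit rationale tied to the criterion, not a confidence score
}

type GateDecision = 'approve' | 'reject' | 'escalate_to_operator';

interface SyntheticGateVerdict {
  decision: GateDecision;
  findings: readonly CriterionFinding[];
  evaluator: { modelFamily: string; independentOfProducer: boolean };
}

// Escalate rather than rubber-stamp: an approval with unmet criteria goes to a human.
function shouldEscalate(verdict: SyntheticGateVerdict): boolean {
  return verdict.decision === 'approve' && verdict.findings.some((f) => !f.met);
}
```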
2115
+
2116
+ ---
1957
2117
 
1958
2118
  ### Agents must not perform delivery actions -- only the coordinator's delivery layer can (Apr 30, 2026)
1959
2119
 
@@ -2464,6 +2624,67 @@ Ghost nodes represent steps that were compiled into the DAG but skipped at runti
2464
2624
 
2465
2625
  ## Workflow Library
2466
2626
 
2627
+ ### Pre-specialized expert agents: on-demand consultants for main agents (May 7, 2026)
2628
+
2629
+ **Status: idea** | Priority: high
2630
+
2631
+ **Score: 13** | Cor:2 Cap:3 Eff:2 Lev:3 Con:3 | Blocked: no
2632
+
2633
+ The main agent running a coding, review, or investigation workflow is not the expert. It is the orchestrator. When it needs specialized input -- "is this Kotlin idiomatic?", "does this violate any payments module invariants?", "what are the FP patterns this codebase uses for this?" -- it should be able to ask a pre-specialized consultant agent and get a bounded, expert answer back.
2634
+
2635
+ These expert agents are not running the main workflow. They do not own any phase or make any final decisions. They are consulted: spawned with a specific question, pre-loaded with dense expertise in a specific domain, and expected to return a bounded answer. The main agent synthesizes the input and retains full ownership.
2636
+
2637
+ **Examples:**
2638
+ - A Kotlin idioms expert pre-loaded with Kotlin best practices, common pitfalls, and idiomatic patterns -- queried when the coding or review agent wants to know "is this idiomatic Kotlin?"
2639
+ - A functional programming expert pre-loaded with the FP philosophy and patterns relevant to this codebase (from CLAUDE.md, design docs, etc.) -- queried when the agent is making decisions that touch FP style
2640
+ - A payments module expert pre-loaded with the payments execution paths, known invariants, and past design decisions -- queried when the task touches payments code
2641
+ - A security expert pre-loaded with the codebase's auth model, known vulnerabilities, and security invariants -- queried during review of auth-adjacent changes
2642
+
2643
+ **Two distinct usage patterns -- both valid:**
2644
+
2645
+ *Consultant mode:* The main agent mid-task asks a specific question ("is this Kotlin idiomatic?"), a pre-specialized agent is spawned with that question and its expertise briefing, it returns a bounded answer, the main agent synthesizes and moves on. Lightweight, on-demand, the main agent drives the interaction.
2646
+
2647
+ *Parallel specialist mode:* The coordinator spawns multiple pre-specialized agents simultaneously for a phase of work -- e.g. an MR review that launches a Kotlin expert, a payments module expert, and an FP patterns expert in parallel, each reviewing the same diff through their lens. The main agent or coordinator synthesizes. This is the 3-angle executor pattern from wr.discovery applied to expertise curation rather than framing angles. Each specialist contributes their perspective; no single agent has to cover everything.
2648
+
2649
+ The parallel specialist mode is conceptually similar to the existing reviewer families in wr.mr-review, but with expertise injection replacing role prompts. "You are a correctness reviewer" and "you are an agent briefed on this codebase's actual invariants, the past bugs in this module, and the specific patterns we use here" are very different levels of specificity.
2650
+
2651
+ **What makes expert consultants distinct from existing reviewer families (MR review):**
2652
+ Existing reviewer families are top-level sessions running the full review workflow independently. Expert consultants (in consultant mode) are lightweight bounded spawns -- more like calling a function than running a parallel pipeline. In parallel specialist mode they are closer to reviewer families, but curated for the specific task rather than generically role-assigned.
2653
+
2654
+ **What makes this distinct from existing context injection:**
2655
+ Existing context injection (living work context, assembledContextSummary) threads pipeline state between phases -- history of what happened. Expert consultants carry curated domain expertise -- best practices, idioms, invariants, patterns. The content type is different: not "what was done" but "what is true about this domain."
2656
+
2657
+ **Implementation shape -- specialized workflows, not just context injection:**
2658
+
2659
+ The most powerful form of a specialist is not an agent that receives a big expertise briefing at spawn time and then works freely. It is an agent running a purpose-built specialized workflow that contains both the expertise and the process for applying it systematically.
2660
+
2661
+ A `wr.kotlin-review` workflow contains: the Kotlin expertise in `metaGuidance` and `references`, and a structured procedure -- "step 1: check null safety patterns at these call sites; step 2: evaluate coroutine usage against these criteria; step 3: check data class conventions..." Breaking the domain into steps ensures the specialist covers everything the domain requires, in the right order, with the right depth. A pure context dump leaves coverage to chance; a workflow enforces it.
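A hypothetical fragment of such a specialist workflow, shaped loosely like the bundled workflow JSON; the step ids, guidance strings, and goals below are invented to illustrate the structure, not proposed content:

```typescript
// Illustrative sketch of a specialist workflow definition (names are placeholders).
const kotlinReviewWorkflow = {
  id: 'wr.kotlin-review',
  name: 'Kotlin Idioms Review (specialist)',
  version: '0.1.0',
  metaGuidance: [
    'Prefer null-safety constructs over defensive runtime checks.',
    'Name the specific idiom violated; do not give generic style feedback.',
  ],
  steps: [
    {
      id: 'kotlin-null-safety',
      title: 'Step 1: Null safety patterns at changed call sites',
      promptBlocks: { goal: 'Check every changed call site for nullable handling that fights the type system.' },
    },
    {
      id: 'kotlin-coroutines',
      title: 'Step 2: Coroutine usage against structured-concurrency criteria',
      promptBlocks: { goal: 'Evaluate launch/async usage, cancellation, and dispatcher choice in the diff.' },
    },
    {
      id: 'kotlin-data-classes',
      title: 'Step 3: Data class and immutability conventions',
      promptBlocks: { goal: 'Check data class conventions and copy/equals usage in the diff.' },
    },
  ],
} as const;
```

The point of the shape is coverage: a briefing dump leaves the order and depth of checks to chance, while an enumerated step list makes them auditable in the session store.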
2662
+
2663
+ This also makes specialists auditable: you can see in the session store exactly which steps the specialist ran, what it found, and whether it covered all required dimensions. And specialized workflows improve over time via `wr.workflow-for-workflows`, compounding quality the same way all bundled workflows do.
2664
+
2665
+ For dynamic specialists (payments module expert, specific subsystem expert), the workflow defines the process for generating the briefing dynamically -- walk these execution paths, read these design docs, extract these invariants -- rather than containing a static briefing.
2666
+
2667
+ **What needs to be built:**
2668
+ - A catalog of specialized workflows: static domain specialists (wr.kotlin-review, wr.fp-patterns-review) and dynamic module specialists (wr.module-expert with a briefing-generation phase)
2669
+ - A matching mechanism: given the task's affected files and domains, which specialist workflows are relevant?
2670
+ - A consultation protocol: how does the main agent query a specialist? How does the specialist return a typed artifact the main agent can act on?
2671
+ - Dynamic briefing generation: for module-specific specialists, a workflow phase that walks affected execution paths and generates the curated briefing before the expert work begins
2672
+
2673
+ **Relationship to existing entries:**
2674
+ - "Knowledge graph": the long-term structural ground truth version of this. Expert briefings are the lower-cost precursor that doesn't require the full graph.
2675
+ - "Assumption store": verified codebase facts are one input to the module expert briefing.
2676
+ - "Coordinator mid-session hooks": expert consultation could be triggered mid-session by the coordinator when specific signals fire (e.g. agent touches a known-tricky module).
2677
+
2678
+ **Things to hash out:**
2679
+ - What is the right format for an expertise briefing? Prose vs structured facts vs a combination?
2680
+ - How are static briefings maintained? They go stale as language versions change and codebases evolve.
2681
+ - How are dynamic briefings generated? Static analysis? LLM-assisted code walk? What is the cost and freshness guarantee?
2682
+ - How does the main agent know which experts are available and when to consult them? Explicit workflow step, or opportunistic mid-task consultation?
2683
+ - Token budget: expert consultation adds turns and tokens. When is the cost worth it vs. the main agent just proceeding with its own judgment?
2684
+ - How does the consultation differ from just giving the main agent a bigger context window? The answer should be "specificity and freshness" -- a consultant briefed on this specific module is better than a general agent with everything injected.
2685
+
2686
+ ---
2687
+
2467
2688
  ### Automatic root cause analysis when MR review finds issues post-coding (Apr 30, 2026)
2468
2689
 
2469
2690
  **Status: idea** | Priority: high
@@ -2480,6 +2701,8 @@ When an MR review session (run by a WorkTrain agent) finds issues in a coding se
2480
2701
 
2481
2702
  **Why this matters**: every finding that slips through is a signal about a workflow or process gap. Today that signal is lost. Capturing it systematically and feeding it back into workflow improvement closes the quality loop.
2482
2703
 
2704
+ **Concrete model:** CodeRabbit does this for MR reviews -- when a human reviewer corrects a CodeRabbit finding or points out something it missed, CodeRabbit extracts a structured learning (`{ claim, repo, file context, timestamp }`) and injects it into future review sessions for the same repo. WorkTrain should do the same, and broader: learnings from coding corrections (not just review corrections) feed into the per-workspace codebase assumption store, which directly addresses Subtype B intent failures. Human feedback on WorkTrain's PRs is the write path for that store.
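A sketch of what a per-workspace learning entry could carry, assuming the four fields the CodeRabbit comparison names plus the `lastVerified` staleness handling raised in the hash-out list below; everything beyond those is an assumption:

```typescript
// Sketch only -- not an existing store schema.
interface CodebaseLearning {
  claim: string;                  // the distilled correction, e.g. "actually X, because Y"
  repo: string;
  fileContext?: string;           // file or path the correction referred to
  source: 'human_pr_comment' | 'refuted_assumption';
  createdAt: string;              // ISO timestamp
  lastVerified?: string;          // staleness handling: learnings go stale after refactors
}

// Hypothetical staleness check: re-verify entries that have not been confirmed recently.
function isStale(learning: CodebaseLearning, maxAgeDays: number, now = new Date()): boolean {
  const anchor = new Date(learning.lastVerified ?? learning.createdAt);
  return (now.getTime() - anchor.getTime()) / 86_400_000 > maxAgeDays;
}
```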
2705
+
2483
2706
  **Things to hash out:**
2484
2707
  - How does WorkTrain detect that a human has commented on a PR post-review? This requires monitoring the PR for new review activity after WorkTrain's session completed -- either webhook events or polling.
2485
2708
  - What does the analysis session actually produce? A structured finding about the gap? A concrete proposal for workflow improvement? Both?
@@ -2487,6 +2710,21 @@ When an MR review session (run by a WorkTrain agent) finds issues in a coding se
2487
2710
  - How do you distinguish "the workflow is fine but this was a genuinely hard edge case" from "the workflow has a systematic gap"? A single miss doesn't prove a gap; multiple misses of the same kind do.
2488
2711
  - Should the analysis result feed directly into `workflow-effectiveness-assessment`, or is it a separate concern?
2489
2712
  - For the "coding agent missed it" case: is the right fix to change the coding workflow, or to make the review workflow more adversarial?
2713
+ - How are codebase-specific learnings extracted from free-form human review comments? A structured extraction step (similar to CodeRabbit's learning extraction) is needed to turn "actually this is wrong because X" into a typed store entry.
2714
+ - How are extracted learnings scoped and invalidated over time? Per-repo scope is right for codebase-specific facts, but learnings go stale after refactors. A `lastVerified` + staleness mechanism is needed.
2715
+ - Relationship to the assumption store (Candidate 2 from the intent gap discovery): human PR corrections are the primary write path for the per-workspace codebase assumption store. These two entries should be designed together.
2716
+
2717
+ ---
2718
+
2719
+ ### wr.discovery recommendation quality improvements v3.5 (May 6, 2026)
2720
+
2721
+ **Status: done** | Shipped in PR #951 (feat/etienneb/discovery-workflow-v35, May 6, 2026)
2722
+
2723
+ **Score: 13** | Cor:2 Cap:3 Eff:2 Lev:3 Con:3 | Blocked: no
2724
+
2725
+ Evidence-based redesign of `wr.discovery` (v3.4.0 → v3.5.0) addressing three failure modes -- coverage (right answer never generated), quality (wrong answer selected), and selection (right answer not selected). Key changes: all three assessment gates now have `assessmentConsequences` that block on failure; Phase 3d/3e split isolates external challenge from fresh-context selection; typed `SelectionOutput` tier (`strong_recommendation | provisional_recommendation | insufficient_signal`) driven by observable signals; `FrameValidityCheck` at landscape-to-frame transition; verbalized sampling + ordinary persona rotation in executor goal strings; `recommendationConfidenceBand` downgrade-only invariant across resolution phases; Phase 6 restructured as falsification-shaped fresh-context validator; `selectionTier` added to `wr.discovery_handoff` artifact.
2726
+
2727
+ Full audit at `.workrail/discovery-workflow-audit.md`, implementation plan at `.workrail/discovery-workflow-implementation-plan.md`.
2490
2728
 
2491
2729
  ---
2492
2730
 
@@ -2601,6 +2839,45 @@ Some workflows want notes to consistently capture current understanding, key fin
2601
2839
 
2602
2840
  ---
2603
2841
 
2842
+ ### Targeted session review: extract high-signal moments instead of reviewing full transcripts (May 6, 2026)
2843
+
2844
+ **Status: idea** | Priority: high
2845
+
2846
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2847
+
2848
+ Reviewing a full agent session transcript to evaluate quality is prohibitively expensive -- long sessions have hundreds of tool calls, file reads, and reasoning steps. But most of the signal about whether a session went well lives in a small number of high-signal moments: confirmation gates, places where the agent flagged uncertainty or divergence, steps where the agent's output failed to match the expected contract, and points where the agent encountered reality and had to adapt. Reviewing those moments selectively is 10-50x cheaper than reading the full transcript and captures most of the quality signal.
2849
+
2850
+ **High-signal moments worth targeting:**
2851
+
2852
+ 1. **Confirmation gate outcomes** -- when a `requireConfirmation` gate fired, what did the agent report? Did it accurately represent the state of the work? Was the decision the right one in hindsight?
2853
+
2854
+ 2. **Agent self-reported issues** -- calls to `report_issue` or `signal_coordinator` during the session. These are the agent's own flags that something was wrong. Each one warrants inspection: was the issue real, was the agent's characterization accurate, was the resolution appropriate?
2855
+
2856
+ 3. **Contract validation failures** -- steps where the engine returned a `blocked` or `require_followup` response. The agent's output failed the output contract. What did it produce, and why?
2857
+
2858
+ 4. **Agent-workflow friction points** -- places where the agent deviated from the expected step procedure, added divergence markers, or explicitly noted a gap between the workflow instructions and the reality it encountered. These are the inputs to workflow improvement.
2859
+
2860
+ 5. **Interpretation vs outcome delta** -- the gap between what the agent stated it was building (interpretation checkpoint, once it exists) and what it actually produced. The delta is the intent gap in concrete form.
2861
+
2862
+ 6. **Sycophancy signals** -- position changes without new evidence, position reversals after challenge, confidence-accuracy mismatches visible in the notes.
2863
+
2864
+ **Why this matters:** without targeted review, session quality is only observable at the PR level (did the output pass review?). That's a lagging indicator that catches failures after they've shipped cost. Targeted review of high-signal moments catches failures mid-session or immediately post-session, before the cost compounds.
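As a sketch of the extraction mechanism, assuming the session log is the append-only JSONL described elsewhere in this document and that each event carries a `type` field; the event names in the filter set are placeholders standing in for the moments listed above:

```typescript
import { createReadStream } from 'node:fs';
import { createInterface } from 'node:readline';

// Placeholder event names -- the real vocabulary would come from the session store schema.
const HIGH_SIGNAL_EVENTS = new Set([
  'confirmation_gate',   // requireConfirmation outcomes
  'report_issue',        // agent self-reported issues
  'contract_blocked',    // blocked / require_followup responses
  'divergence_marker',   // agent-workflow friction points
]);

interface SessionEvent {
  type: string;
  stepId?: string;
  payload?: unknown;
}

// Stream the append-only JSONL session log and keep only the high-signal moments,
// instead of handing a reviewer the full transcript.
async function extractHighSignalMoments(logPath: string): Promise<SessionEvent[]> {
  const moments: SessionEvent[] = [];
  const rl = createInterface({ input: createReadStream(logPath, 'utf8') });
  for await (const line of rl) {
    if (!line.trim()) continue;
    const event = JSON.parse(line) as SessionEvent;
    if (HIGH_SIGNAL_EVENTS.has(event.type)) moments.push(event);
  }
  return moments;
}
```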
2865
+
2866
+ **Relationship to existing entries:**
2867
+ - "Agent-reportable workflow bugs" (below) -- the agent's own flags are one of the primary review targets
2868
+ - "Synthetic human gates" -- the targeted review output is what a synthetic gate would consume to make an approval decision
2869
+ - "Automatic root cause analysis" -- targeted review is the cheaper precursor that identifies which sessions warrant full root cause analysis
2870
+ - "Per-run workflow improvement retrospective" -- the session retrospective is one moment in the targeted review; this entry is about the full set of moments across a session
2871
+
2872
+ **Things to hash out:**
2873
+ - What is the right extraction mechanism? The session event log already records every tool call, step advance, and artifact. A targeted review agent reads selected event types rather than the full log. What is the right query interface?
2874
+ - Which moments are always reviewed vs. sampled? Confirmation gates and `report_issue` calls probably warrant 100% review; routine step advances can be sampled.
2875
+ - Should targeted review happen synchronously (coordinator waits before proceeding) or asynchronously (review happens in parallel, findings surface to operator outbox)?
2876
+ - How are review findings acted on? They could feed into: (a) the synthetic gate decision for the current session, (b) the workflow improvement retrospective, (c) the assumption store if codebase-specific learnings are extracted.
2877
+ - What does the targeted review agent actually produce? A structured verdict per moment reviewed, a severity-tagged list of concerns, or a binary pass/fail?
2878
+
2879
+ ---
2880
+
2604
2881
  ### Agent-reportable workflow bugs (Apr 28, 2026)
2605
2882
 
2606
2883
  **Status: idea** | Priority: high
@@ -2617,6 +2894,7 @@ A mechanism for agents to report problems with the WorkRail system itself during
2617
2894
  - Should reports survive session cleanup, or is their lifetime tied to the session?
2618
2895
  - Who owns acting on these reports -- the operator, the workflow author, or an automated system?
2619
2896
  - Should this be available in interactive (MCP) sessions, or daemon sessions only?
2897
+ - Relationship to "Targeted session review": agent-reported workflow bugs are one of the primary high-signal moments that targeted session review would extract and inspect.
2620
2898
 
2621
2899
  ---
2622
2900
 
@@ -2673,6 +2951,40 @@ A proof record contains: `prNumber`, `goal`, `verificationChain` (array of `{ ki
2673
2951
 
2674
2952
  ---
2675
2953
 
2954
+ ### Coordinator mid-session hooks: react to workflow events without waiting for session completion (May 6, 2026)
2955
+
2956
+ **Status: idea** | Priority: high
2957
+
2958
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2959
+
2960
+ The coordinator currently acts only between sessions -- it spawns a session, awaits its completion, reads the typed output artifact, and decides what to do next. It has no mechanism to react to events that happen inside a running session. This means the coordinator cannot spawn helper agents mid-session (e.g. an external assumption ranker when the interpretation checkpoint fires), cannot intercept a confirmation gate and satisfy it autonomously, and cannot act on a step completion artifact before the full session finishes.
2961
+
2962
+ The gap: workflow lifecycle events (step completed, gate fired, artifact emitted, `report_issue` called) are currently only visible after the session ends via the session store. The coordinator needs a way to subscribe to these events as they happen and act on them -- spawning agents, injecting steer messages, or making routing decisions -- without waiting for session completion.
2963
+
2964
+ **Concrete use cases this unlocks:**
2965
+ - Spawn an external assumption-ranking agent when the interpretation checkpoint step completes, inject its ranking back into the session before verification runs
2966
+ - Auto-satisfy a `requireConfirmation` gate in autonomous mode by running a synthetic gate evaluation and steering the session with the result
2967
+ - Spawn a targeted review agent when a specific step artifact is emitted, surface findings before the session proceeds to the next phase
2968
+ - React to a `report_issue` call mid-session by spawning an investigation agent immediately rather than waiting for the full session to fail
2969
+
2970
+ **What this requires:**
2971
+ - A real-time or near-real-time event subscription mechanism from the coordinator to the session event log (the append-only JSONL already has all the events; the coordinator needs a watch/poll interface on it)
2972
+ - A `steer` injection path from the coordinator into a running session (the steer endpoint already exists at `POST /sessions/:id/steer`)
2973
+ - A coordinator hook registry: declarative rules of the form "when session X emits event type Y with artifact kind Z, execute hook H"
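A sketch of the declarative variant of that registry, assuming rules match on event type and artifact kind; only the steer endpoint path is taken from this entry, and every other name (event types, hook ids, base URL) is a placeholder:

```typescript
// Sketch only -- rule shape and names are assumptions.
interface SessionEvent {
  sessionId: string;
  type: string;            // e.g. 'step_completed', 'report_issue' (placeholders)
  artifactKind?: string;
}

interface HookRule {
  eventType: string;
  artifactKind?: string;
  hookId: string;          // which coordinator script runs when the rule matches
}

// "When event type Y with artifact kind Z fires, execute hook H."
const hookRegistry: HookRule[] = [
  { eventType: 'step_completed', artifactKind: 'interpretation_artifact', hookId: 'external-assumption-ranker' },
  { eventType: 'report_issue', hookId: 'spawn-investigation-agent' },
];

function matchingHooks(event: SessionEvent): HookRule[] {
  return hookRegistry.filter(
    (rule) =>
      rule.eventType === event.type &&
      (rule.artifactKind === undefined || rule.artifactKind === event.artifactKind),
  );
}

// A hook that needs to influence the running session would use the existing steer
// endpoint; the base URL here is a placeholder assumption.
async function steer(baseUrl: string, sessionId: string, message: string): Promise<void> {
  await fetch(`${baseUrl}/sessions/${sessionId}/steer`, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify({ message }),
  });
}
```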
2974
+
2975
+ **Relationship to existing entries:**
2976
+ - "Scripts-first coordinator" (below): the hooks would be coordinator scripts reacting to events, not LLM reasoning
2977
+ - "Native multi-agent orchestration": `spawn_session` + `await_sessions` handles between-session orchestration; this handles within-session coordination
2978
+ - "Workflow runtime adapter": mid-session hooks are how the daemon adapter satisfies `requireConfirmation` gates autonomously
2979
+
2980
+ **Things to hash out:**
2981
+ - Poll vs push: the session event log is append-only JSONL. The coordinator can poll it efficiently (tail -f equivalent), but a proper event bus (the daemon event emitter already exists) would be cleaner. Which is the right mechanism?
2982
+ - Hook registry format: declarative JSON rules in `triggers.yml`, or imperative TypeScript in the coordinator script? The declarative approach is more auditable; the imperative approach is more flexible.
2983
+ - Ordering guarantees: if the coordinator injects a steer message in response to a step completion, does the session engine guarantee the steer is processed before the next step begins? Race condition risk.
2984
+ - Blast radius: a hook that fires incorrectly (wrong event matched, wrong steer injected) could derail a running session in a hard-to-debug way. What are the rollback and auditability guarantees?
2985
+
2986
+ ---
2987
+
2676
2988
  ### Scripts-first coordinator: avoid the main agent wherever possible (Apr 15, 2026)
2677
2989
 
2678
2990
  **Status: partial** | Foundation shipped PR #908 (Apr 30, 2026)
@@ -4758,7 +5070,25 @@ The agent is expensive, inconsistent, and slow. Scripts are free, deterministic,
4758
5070
 
4759
5071
  ### Dynamic model selection
4760
5072
 
4761
- **Status: done** -- shipped in `triggers.yml` `agentConfig.model`
5073
+ **Status: partial** -- raw model ID (`agentConfig.model`) shipped in `triggers.yml`. Two gaps remain: (1) no validation at trigger parse or startup -- a bad model ID is only caught when the first LLM call fires; (2) every trigger hardcodes a provider-specific ID, which breaks when the inference profile naming convention changes (e.g. `us.anthropic.claude-haiku-4-5-20251001` vs `us.anthropic.claude-haiku-4-5-20251001-v1:0`).
5074
+
5075
+ ### Model tier abstraction: cheap / medium / expensive (May 7, 2026)
5076
+
5077
+ **Status: idea** | Priority: medium
5078
+
5079
+ **Score: 11** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
5080
+
5081
+ **The problem:** Triggers hardcode provider-specific model IDs (`amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0`). When inference profile naming conventions change, or when switching providers/regions, every trigger must be updated manually. The daemon's adaptive coordinator already makes implicit cost/quality tradeoffs (Haiku for routing, Sonnet for coding) but has no first-class mechanism to express them -- it's locked to whatever IDs are in `agentConfig.model`.
5082
+
5083
+ **The idea:** Introduce a tier abstraction. Triggers and workflow phases declare a tier (`cheap | medium | expensive`). The daemon resolves tiers to concrete model IDs from a tier map in `~/.workrail/config.json`. The adaptive coordinator picks tiers per phase: cheap for classification and routing, medium for coding, expensive for architectural review. Changing provider or region means updating the tier map once.
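A sketch of the tier map and the resolution step, assuming the map lives in `~/.workrail/config.json`; the Haiku ID is the one used elsewhere in this document, and the medium/expensive IDs are placeholders:

```typescript
// Sketch only -- key names and schema are not committed.
type ModelTier = 'cheap' | 'medium' | 'expensive';

type TierMap = Record<ModelTier, string>;   // tier -> 'provider/model-id'

const exampleTierMap: TierMap = {
  cheap: 'amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0',
  medium: 'amazon-bedrock/example-medium-model-id',       // placeholder
  expensive: 'amazon-bedrock/example-expensive-model-id', // placeholder
};

// Triggers declare either a tier or a raw model ID; the daemon resolves tiers once,
// so a provider or region change only touches the tier map.
function resolveModel(spec: ModelTier | string, tiers: TierMap): string {
  return spec === 'cheap' || spec === 'medium' || spec === 'expensive'
    ? tiers[spec]
    : spec; // already a concrete 'provider/model-id'
}
```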
5084
+
5085
+ **Validation is a prerequisite.** Before tiers make sense, bad model IDs need to be caught at startup rather than at first LLM call. See "Model ID validation at daemon startup" below.
5086
+
5087
+ **Things to hash out:**
5088
+ - Where does the tier map live? `~/.workrail/config.json` (global) vs. `triggers.yml` (per-workspace) vs. both with cascade.
5089
+ - Does the tier map need to carry both a Bedrock and a direct-API model per tier, or does one path own the daemon?
5090
+ - Should the adaptive coordinator receive the tier map as a dependency, or should it always spawn sessions with explicit `agentConfig.model` set by the coordinator?
5091
+ - How do you handle models that exist on one provider but not another (e.g. Opus available on Bedrock but not direct API under certain rate limits)?
4762
5092
 
4763
5093
  ### Multi-agent support (spawn_agent + coordinator sessions)
4764
5094
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@exaudeus/workrail",
3
- "version": "3.78.2",
3
+ "version": "3.79.1",
4
4
  "description": "Step-by-step workflow enforcement for AI agents via MCP",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "wr.coding-task",
3
3
  "name": "Agentic Task Dev Workflow",
4
- "version": "1.3.0",
4
+ "version": "1.4.0",
5
5
  "description": "Use this to implement a software feature or task. Follows a plan-then-execute approach with architecture decisions, invariant tracking, and final verification.",
6
6
  "about": "## Agentic Coding Task Workflow\n\nThis workflow structures the full lifecycle of a software implementation task: from understanding and classifying the work, through architecture decisions and incremental implementation, to final verification and handoff.\n\n### What it does\n\nThe workflow guides an AI agent through a disciplined plan-then-execute process. It begins by analyzing the task to determine complexity, risk, and the right level of rigor (QUICK, STANDARD, or THOROUGH). For non-trivial tasks, it then gathers codebase context, surfaces invariants and non-goals, generates competing design candidates, and selects an approach before writing a single line of code. Implementation proceeds slice by slice, with built-in verification gates after each slice. A final integration verification pass confirms acceptance criteria are met before handoff.\n\n### Upstream context (Phase 0.5)\n\nPhase 0.5 looks for any upstream document that has already defined what to build -- a Shape Up pitch, PRD, BRD, RFC, design doc, user story with acceptance criteria, Jira epic, or equivalent. The agent uses whatever tools are available (repo search, WebFetch, Confluence/Notion/Glean MCPs, Memory MCP) to find it. If found, two flags are set: `upstreamSpecDetected` (something exists) and `solutionFixed` (whether the document commits to a specific technical direction). When `solutionFixed = true`, design ideation phases (1a-1c) are skipped and Phase 1d translates the upstream constraints directly into an engineering approach. When `solutionFixed = false`, design ideation runs normally but is constrained by whatever the upstream document does specify. The plan audit (Phase 4) checks for drift against `upstreamBoundaries` whenever an upstream document was found.\n\n### When to use it\n\nUse this workflow whenever you are implementing a feature, fixing a non-trivial bug, or making an architectural change in a real codebase. It is especially valuable when:\n- The task touches multiple files or systems\n- There is meaningful risk of regressions or invariant violations\n- You want the agent to surface trade-offs and commit to a reasoned design decision rather than guessing\n- You need a resumable, auditable record of what was decided and why\n\nFor quick one-liner fixes or very small changes, the workflow includes a fast path that skips heavyweight planning.\n\n### What it produces\n\n- An `implementation_plan.md` artifact covering the selected approach, vertical slices, test design, and philosophy alignment\n- A `spec.md` for large or high-risk tasks, capturing observable behavior and acceptance criteria\n- Step-level notes in WorkRail that serve as a durable execution log\n- A PR-ready handoff summary with acceptance criteria status, invariant proofs, and follow-up tickets\n\n### How to get good results\n\n- Provide a clear task description and at least partial acceptance criteria before starting\n- If you have coding philosophy or project conventions configured in session rules or Memory MCP, the workflow will apply them automatically as a design lens\n- Let the workflow classify complexity and rigor itself; override only if the classification is clearly wrong\n- For large or high-risk tasks, review the architecture decision step before implementation begins",
7
7
  "examples": [
@@ -143,7 +143,7 @@
143
143
  "SUBAGENT SYNTHESIS: treat subagent output as evidence, not conclusions. State your hypothesis before delegating, then interrogate what came back: what was missed, wrong, or new? Say what changed your mind or what you still reject, and why.",
144
144
  "PARALLELISM: when reads, audits, or delegations are independent, run them in parallel inside the phase. Parallelize cognition; serialize synthesis and canonical writes.",
145
145
  "PHILOSOPHY LENS: apply the user's coding philosophy (from active session rules) as the evaluation lens. Flag violations by principle name, not as generic feedback. If principles conflict, surface the tension explicitly instead of silently choosing.",
146
- "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness \u2014 in that order of reliability.",
146
+ "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness in that order of reliability.",
147
147
  "DRIFT HANDLING: when reality diverges from the plan, update the plan artifact and re-audit deliberately rather than accumulating undocumented drift.",
148
148
  "NEVER COMMIT MARKDOWN FILES UNLESS USER EXPLICITLY ASKS.",
149
149
  "SLICE DISCIPLINE: Phase 6 is a loop -- implement ONE slice per iteration. Do not implement multiple slices at once. The verification loop exists to catch drift per slice, not retroactively."
@@ -166,6 +166,47 @@
166
166
  ]
167
167
  }
168
168
  },
169
+ {
170
+ "id": "phase-0c-assumption-verification",
171
+ "title": "Phase 0c: Interpretation & Assumption Verification",
172
+ "promptBlocks": {
173
+ "goal": "State your interpretation of what you are building, make three explicit assumptions about codebase state, verify each by reading the predicted location, and produce an InterpretationArtifact that the coordinator can inspect and route on.",
174
+ "constraints": [
175
+ "The interpretation statement MUST appear before any assumption listing.",
176
+ "You MUST produce exactly 3 assumptions -- no more, no fewer.",
177
+ "Each assumption MUST be about a design pattern, architectural invariant, or module behavior. Trivially checkable facts are disqualified: do NOT assume things like 'file X exists', 'import Y is present', or 'function Z has signature W'. Those are not assumptions -- they are lookups. An assumption must assert something about how the system is designed or how a module behaves.",
178
+ "Each assumption MUST name a specific predicted file or code location that would corroborate it.",
179
+ "Each assumption MUST be labeled with severity AT STATEMENT TIME (before verification): 'high' for core architectural invariants (e.g. token protocol, session model, phase boundary contract) or 'low' for pattern preferences or conventions.",
180
+ "Do NOT attempt automated correction of context when an assumption is refuted. Record the refutation and route per the procedure.",
181
+ "Do NOT propagate InterpretationArtifact to coordinator-spawned subagents."
182
+ ],
183
+ "procedure": [
184
+ "1. Write one sentence: 'I am building [X].' This is your interpretation. Write it before you list any assumptions.",
185
+ "2. State assumption 1: name the design pattern or architectural invariant you are asserting, name the predicted corroborating location, and label severity (high or low).",
186
+ "3. State assumption 2: same format.",
187
+ "4. State assumption 3: same format.",
188
+ "5. Verify assumption 1: read the predicted location. Record whether it is confirmed, refuted, or unresolvable.",
189
+ "6. Verify assumption 2: same.",
190
+ "7. Verify assumption 3: same.",
191
+ "8. Determine ambiguityLevel: 'clear' if all three assumptions are confirmed. 'uncertain' if any assumption is refuted or unresolvable.",
192
+ "9. Route on refuted assumptions: for any low-severity refuted assumption, downgrade ambiguityLevel to 'uncertain' and proceed. For any high-severity refuted assumption, call report_issue with kind: 'needs_human' and the specific refuted claim as the message, then stop.",
193
+ "10. Set these keys in the next continue_workflow call's context object: interpretationArtifact (the full artifact object), ambiguityLevel (the string 'clear' or 'uncertain')."
194
+ ],
195
+ "outputRequired": {
196
+ "notesMarkdown": "Summary of the interpretation, all three assumptions with their verification results, and the final ambiguityLevel."
197
+ },
198
+ "verify": [
199
+ "The interpretation sentence appears before the assumption list.",
200
+ "Exactly 3 assumptions are stated, each with a named predicted location and a severity label.",
201
+ "Each assumption is about a design pattern, invariant, or module behavior -- not a trivially checkable fact.",
202
+ "Each assumption has been verified by reading the predicted location.",
203
+ "ambiguityLevel is 'clear' only if all 3 assumptions are confirmed.",
204
+ "Any high-severity refuted assumption triggered report_issue before this step was completed.",
205
+ "interpretationArtifact context key is set with fields: interpretation, assumptionList, ambiguityLevel."
206
+ ]
207
+ },
208
+ "requireConfirmation": false
209
+ },
169
210
  {
170
211
  "id": "phase-0-5-upstream-context",
171
212
  "title": "Phase 0.5: Locate Upstream Context",
@@ -218,7 +259,7 @@
218
259
  },
219
260
  {
220
261
  "id": "phase-1b-design-deep",
221
- "title": "Phase 1b: Design Generation (Injected Routine \u2014 Tension-Driven Design)",
262
+ "title": "Phase 1b: Design Generation (Injected Routine Tension-Driven Design)",
222
263
  "runCondition": {
223
264
  "and": [
224
265
  {
@@ -257,7 +298,7 @@
257
298
  }
258
299
  ]
259
300
  },
260
- "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` \u2014 chosen design with rationale tied to tensions\n- `runnerUpApproach` \u2014 next-best option and why it lost\n- `architectureRationale` \u2014 tensions resolved vs accepted\n- `pivotTriggers` \u2014 conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` \u2014 failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
301
+ "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` chosen design with rationale tied to tensions\n- `runnerUpApproach` next-best option and why it lost\n- `architectureRationale` tensions resolved vs accepted\n- `pivotTriggers` conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
261
302
  "promptFragments": [
262
303
  {
263
304
  "id": "phase-1c-challenge-standard",
@@ -429,7 +470,7 @@
429
470
  "var": "taskComplexity",
430
471
  "not_equals": "Small"
431
472
  },
432
- "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` \u2014 count of open questions that would materially affect implementation quality\n- `planConfidenceBand` \u2014 Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
473
+ "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` count of open questions that would materially affect implementation quality\n- `planConfidenceBand` Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
433
474
  "assessmentRefs": [
434
475
  "plan-completeness-gate",
435
476
  "invariant-clarity-gate",
@@ -543,7 +584,7 @@
543
584
  {
544
585
  "id": "phase-4b-loop-decision",
545
586
  "title": "Loop Exit Decision",
546
- "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop \u2014 but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
587
+ "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
547
588
  "requireConfirmation": true,
548
589
  "outputContract": {
549
590
  "contractRef": "wr.contracts.loop_control"
@@ -706,7 +747,7 @@
706
747
  "id": "phase-8-retrospective",
707
748
  "title": "Phase 8: Retrospective",
708
749
  "requireConfirmation": false,
709
- "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)\n\nBefore completing this step, emit a wr.coding_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.coding_handoff\",\n \"version\": 1,\n \"branchName\": \"<git branch name containing your changes>\",\n \"keyDecisions\": [\"<architectural decision + WHY>\", ...],\n \"knownLimitations\": [\"<known gap or deliberate shortcut>\", ...],\n \"testsAdded\": [\"<test file or test name added>\", ...],\n \"filesChanged\": [\"<primary file path changed>\", ...]\n}\nNote: correctedAssumptions is populated ONLY by fix/retry agents when correcting assumptions from a prior coding session. On a first-run coding session, omit this field entirely.",
750
+ "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)\n\nBefore completing this step, emit a wr.coding_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.coding_handoff\",\n \"version\": 1,\n \"branchName\": \"<git branch name containing your changes>\",\n \"keyDecisions\": [\"<architectural decision + WHY>\", ...],\n \"knownLimitations\": [\"<known gap or deliberate shortcut>\", ...],\n \"testsAdded\": [\"<test file or test name added>\", ...],\n \"filesChanged\": [\"<primary file path changed>\", ...]\n}\nNote: correctedAssumptions is populated ONLY by fix/retry agents when correcting assumptions from a prior coding session. On a first-run coding session, omit this field entirely.\n\n5. **Interpretation accuracy check.** Did this session build what was intended? If there was a gap between what you set out to build and what you actually built, classify it:\n- Subtype A: you misread or misunderstood the ticket (the task description said one thing and you built something else)\n- Subtype B: you had a wrong model of how this codebase works (you understood the task but your assumption about a design pattern, invariant, or module behavior turned out to be incorrect)\n- Unclear: you cannot confidently attribute the gap to one type\n\nIf there was no gap, say so explicitly. Add this classification to `retrospectiveObservations`.",
710
751
  "outputContract": {
711
752
  "contractRef": "wr.contracts.coding_handoff"
712
753
  }
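
For readers wiring consumers to the Phase 8 handoff, the prompt above enumerates the artifact fields in full. A TypeScript sketch with types inferred from that text (the interface name is ours; note that the new interpretation-accuracy classification is recorded as prose in `retrospectiveObservations`, not as a structured field on the artifact):

```ts
// Field names mirror the wr.coding_handoff block in the Phase 8 prompt;
// the TypeScript types are inferred, not published by the package.
interface CodingHandoffArtifact {
  kind: 'wr.coding_handoff';
  version: 1;
  branchName: string;         // git branch containing the changes
  keyDecisions: string[];     // architectural decision + WHY
  knownLimitations: string[]; // known gaps or deliberate shortcuts
  testsAdded: string[];       // test files or test names added
  filesChanged: string[];     // primary file paths changed
  // Set only by fix/retry agents correcting assumptions from a prior coding
  // session; a first-run coding session omits the field entirely.
  correctedAssumptions?: string[];
}
```
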