@exaudeus/workrail 3.78.2 → 3.79.1

@@ -621,6 +621,18 @@ function validateAndResolveTrigger(raw, env, workspaces = {}) {
621
621
  let agentConfig;
622
622
  if (raw.agentConfig) {
623
623
  const model = raw.agentConfig.model?.trim() || undefined;
624
+ if (model !== undefined) {
625
+ const slashIdx = model.indexOf('/');
626
+ const provider = slashIdx === -1 ? '' : model.slice(0, slashIdx);
627
+ const modelId = slashIdx === -1 ? '' : model.slice(slashIdx + 1);
628
+ if (!provider || !modelId) {
629
+ return (0, result_js_1.err)({
630
+ kind: 'invalid_field_value',
631
+ field: `agentConfig.model (must be in 'provider/model-id' format, e.g. amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0, got: "${model}")`,
632
+ triggerId: rawId,
633
+ });
634
+ }
635
+ }
624
636
  let maxSessionMinutes;
625
637
  if (raw.agentConfig.maxSessionMinutes !== undefined) {
626
638
  const asNumber = Number(raw.agentConfig.maxSessionMinutes);
@@ -1064,6 +1076,22 @@ function validateTriggerStrict(trigger) {
1064
1076
  suggestedFix: 'branchPrefix: worktrain/',
1065
1077
  });
1066
1078
  }
1079
+ if (trigger.agentConfig?.model !== undefined) {
1080
+ const m = trigger.agentConfig.model;
1081
+ const slashIdx = m.indexOf('/');
1082
+ const provider = slashIdx === -1 ? '' : m.slice(0, slashIdx);
1083
+ const modelId = slashIdx === -1 ? '' : m.slice(slashIdx + 1);
1084
+ if (!provider || !modelId) {
1085
+ issues.push({
1086
+ rule: 'invalid-model-format',
1087
+ severity: 'error',
1088
+ triggerId: id,
1089
+ message: `agentConfig.model "${m}" is not in 'provider/model-id' format -- ` +
1090
+ 'both provider and model-id must be non-empty (e.g. amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0)',
1091
+ suggestedFix: 'model: amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0',
1092
+ });
1093
+ }
1094
+ }
1067
1095
  if (trigger.concurrencyMode === 'parallel' &&
1068
1096
  (!trigger.branchStrategy || trigger.branchStrategy === 'none') &&
1069
1097
  (trigger.autoCommit || trigger.autoOpenPR)) {
@@ -115,7 +115,7 @@ export interface WebhookEvent {
115
115
  readonly payload: Readonly<Record<string, unknown>>;
116
116
  readonly signature?: string;
117
117
  }
118
- export type TriggerValidationRule = 'autocommit-needs-worktree' | 'autoopenpr-needs-autocommit' | 'worktree-needs-base-branch' | 'worktree-needs-prefix' | 'parallel-without-worktree' | 'missing-goal-template' | 'missing-max-session-minutes' | 'missing-max-turns' | 'autocommit-on-main-checkout' | 'missing-max-queue-depth';
118
+ export type TriggerValidationRule = 'autocommit-needs-worktree' | 'autoopenpr-needs-autocommit' | 'worktree-needs-base-branch' | 'worktree-needs-prefix' | 'parallel-without-worktree' | 'missing-goal-template' | 'missing-max-session-minutes' | 'missing-max-turns' | 'autocommit-on-main-checkout' | 'missing-max-queue-depth' | 'invalid-model-format';
119
119
  export interface TriggerValidationIssue {
120
120
  readonly rule: TriggerValidationRule;
121
121
  readonly severity: 'error' | 'warning' | 'info';
@@ -610,6 +610,64 @@ Tier 0 injection needs a dedicated system prompt section separate from `assemble
610
610
 
611
611
  ---
612
612
 
613
+ ### Interpretation checkpoint for coding workflow: Candidate 5 (May 6, 2026)
614
+
615
+ **Status: done** | Shipped in PR #962 (feat/etienneb/interpretation-checkpoint, May 7, 2026)
616
+
617
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
618
+
619
+ Added a `phase-0c-assumption-verification` step to `wr.coding-task` (v1.3.0 → v1.4.0) between Phase 0 (classify) and Phase 0.5 (upstream context). The step requires the coding agent to state a one-sentence interpretation before listing any assumptions, produce exactly 3 codebase assumptions with predicted locations and severity labels, verify each assumption by reading the predicted location, and output an `InterpretationArtifact` context key with `ambiguityLevel: clear | uncertain`. High-severity refutations surface to the operator via `report_issue`. Also appended a Subtype A/B classification prompt to the retrospective step for distribution measurement.
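A minimal sketch of what the `InterpretationArtifact` context key could look like; the artifact's top-level fields (interpretation, assumptionList, ambiguityLevel) come from the step's verify block, while the per-assumption field names are inferred from the procedure text rather than taken from a published schema:

```typescript
// Sketch only -- per-assumption field names are assumptions, not a committed schema.
interface VerifiedAssumption {
  claim: string;                 // the design pattern / architectural invariant being asserted
  predictedLocation: string;     // file or code location that would corroborate it
  severity: 'high' | 'low';      // labeled at statement time, before verification
  result: 'confirmed' | 'refuted' | 'unresolvable';
}

interface InterpretationArtifact {
  interpretation: string;        // "I am building [X]."
  assumptionList: [VerifiedAssumption, VerifiedAssumption, VerifiedAssumption]; // exactly 3
  ambiguityLevel: 'clear' | 'uncertain';
}
```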
620
+
621
+ This is the first step of the intent gap intervention sequence: Candidate 5 (shipped) → Candidate 4 (git-grounded context, next) → Candidate 1 or 3 gated on Subtype A/B empirical data.
622
+
623
+ ---
624
+
625
+ ### External assumption ranking for interpretation checkpoint (May 6, 2026)
626
+
627
+ **Status: idea** | Priority: medium
628
+
629
+ **Score: 10** | Cor:2 Cap:2 Eff:2 Lev:2 Con:2 | Blocked: no (Candidate 5 shipped PR #962, May 7, 2026)
630
+
631
+ The interpretation checkpoint (Candidate 5) asks the coding agent to label each of its own assumptions as `severity: high` or `severity: low`. This self-labeling is a known weak point: an agent with a confident wrong prior may mislabel its most dangerous architectural assumption as low-severity to avoid triggering the gate. Self-assessed severity is the single lowest-confidence element in the pitch (confidence: 0.55).
632
+
633
+ An external agent -- one that did not produce the assumptions -- can independently rank them by actual risk before verification runs. The external agent receives only the ticket and the assumption list (not the producing agent's full context or reasoning) and answers: which of these assumptions is most load-bearing? Which, if wrong, would cause the most damage? Are there high-risk areas this agent didn't surface at all?
634
+
635
+ The producing agent then verifies in order of externally-ranked risk rather than self-assessed severity. Severity classification moves from self-labeling to an independent signal, removing the 0.55-confidence gap entirely.
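A sketch of the request/response shape for the ranking call; every field name here is an assumption made to pin down the protocol described above, not an existing API:

```typescript
// The ranker receives only the ticket and the bare assumption list --
// deliberately nothing from the producing agent's context or reasoning.
interface RankingRequest {
  ticket: string;
  assumptions: { id: string; claim: string }[];
}

interface ExternalRanking {
  rankedAssumptionIds: string[];       // most load-bearing first; verification follows this order
  unsurfacedRiskAreas: string[];       // high-risk areas the producing agent did not mention
  severityDisagreements: string[];     // shown to the producing agent so it can explain the delta
}
```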
636
+
637
+ **Relationship to targeted session review:** the external ranking agent's output is also a high-signal review moment -- if the external agent flags assumptions the producing agent didn't think to surface, that delta is direct evidence of an interpretation gap.
638
+
639
+ **Things to hash out:**
640
+ - What context does the ranking agent receive? Ticket + assumption list only, or also the affected file list and design lock references? More context improves ranking quality but risks compromising the ranker's independence.
641
+ - Is this a lightweight parallel call (runs simultaneously with verification setup) or a blocking step?
642
+ - How are conflicts between self-assessed severity and external ranking resolved? External ranking should win, but the producing agent should see the disagreement and explain it.
643
+ - Cost: one additional inference call per session. Acceptable for standard/thorough sessions; probably skip for QUICK mode.
644
+
645
+ ---
646
+
647
+ ### Intent gap correction: fix the interpretation after assumption refutation (May 6, 2026)
648
+
649
+ **Status: idea** | Priority: medium
650
+
651
+ **Score: 10** | Cor:2 Cap:2 Eff:2 Lev:2 Con:2 | Blocked: no (Candidate 5 shipped PR #962, May 7, 2026)
652
+
653
+ When an agent's assumption-surfacing step (Candidate 5) refutes a high-severity assumption, the current scoped fix is to surface the refutation to the operator and halt. But the real problem is deeper: the wrong prior that caused the refuted assumption may have already contaminated earlier context -- the upstream context harvest, the problem framing, the `reframedProblem` and `challengedAssumptions` context keys. A simple "re-read the file and try again" doesn't fix a wrong model; it patches the symptom in one step while leaving the contaminated context intact. Long-term, a refuted assumption that reflects a codebase-specific wrong prior (Subtype B) should also update the Memory store and eventually the knowledge graph so future sessions don't repeat the mistake.
654
+
655
+ This is explicitly out of scope for the Candidate 5 pitch -- detection is the right first boundary. Correction is a separate, larger problem that depends on session context rollback, Memory store integration, and eventually the knowledge graph.
656
+
657
+ **Done looks like:** when a high-severity assumption is refuted mid-session, the system can: (1) identify which prior context keys were formed under the wrong prior, (2) trigger a targeted correction sub-flow that re-derives those keys with the corrected interpretation, (3) write the correction back to the Memory store so future sessions in this workspace start with the right prior.
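A sketch of the record a correction sub-flow might produce, with all field names assumed; the two context keys in the comment are the ones named above:

```typescript
// Illustrative only -- the entry deliberately leaves the design open.
interface RefutedAssumptionCorrection {
  refutedClaim: string;
  correctedClaim: string;
  contaminatedContextKeys: string[];   // e.g. 'reframedProblem', 'challengedAssumptions'
  scope: 'local' | 'systematic';       // wrong about one file vs. a codebase-wide wrong prior
  writeToMemoryStore: boolean;         // presumably only systematic (Subtype B) corrections persist
}
```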
658
+
659
+ **Things to hash out:**
660
+ - What is the right granularity for context rollback? Rolling back individual keys vs. re-running entire prior phases are very different costs.
661
+ - How do you distinguish "assumption was wrong about this specific file" (local fix) from "assumption reflects a systematic wrong prior about this codebase pattern" (Memory store update warranted)?
662
+ - What is the trigger for a Memory store write -- every refuted high-severity assumption, or only ones confirmed as Subtype B by retrospective labeling?
663
+ - How does this interact with the knowledge graph when it ships? The assumption store (Candidate 2 from the intent gap discovery) and the knowledge graph are both candidates for receiving the correction signal.
664
+
665
+ **Relationship to existing entries:**
666
+ - Blocked by: Candidate 5 (assumption surfacing step) -- detection must exist before correction can be designed
667
+ - Related to: Subtype B intent failure (below), Knowledge graph (backlog), Memory store / living work context (shipped PR #939, #948, #952)
668
+
669
+ ---
670
+
613
671
  ### Subtype B intent failure: agent has a wrong prior about what this codebase does (May 5, 2026)
614
672
 
615
673
  **Status: idea -- needs empirical study before design** | Priority: high
@@ -1652,6 +1710,77 @@ Combined with the `DEFAULT_MAX_TURNS` cap, this provides defense-in-depth agains
1652
1710
 
1653
1711
  The durable session store, v2 engine, and workflow authoring features shared by all three systems.
1654
1712
 
1713
+ ### Coordinator-managed typed output vocabulary: agent emits typed events, coordinator reacts per type (May 7, 2026)
1714
+
1715
+ **Status: idea** | Priority: high
1716
+
1717
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
1718
+
1719
+ Today, agent output is largely untyped -- notes, artifacts, context keys. The coordinator reacts to typed handoff artifacts at phase boundaries, but within a session the agent's observations, decisions, findings, and suggestions are all prose. The coordinator cannot programmatically react to them.
1720
+
1721
+ The idea: the coordinator owns a vocabulary of typed output kinds that it supports. Before a session starts, it injects that vocabulary into the agent's context -- the agent knows exactly what typed things it can emit and what each one means. When the agent emits a typed output, the coordinator reacts with the appropriate process for that type. The reaction is deterministic coordinator logic (not LLM reasoning), specified per type.
1722
+
1723
+ **Examples of typed output kinds and coordinator reactions:**
1724
+
1725
+ - `suggestion(kind: "abstraction_extraction")` → coordinator fires targeted verification: "what are the three future cases this serves?"
1726
+ - `finding(severity: "critical", area: "security")` → coordinator routes to immediate review, may block merge
1727
+ - `decision(chose: X, over: Y, rationale: ...)` → coordinator checks for conflicts with prior decisions in the session store
1728
+ - `scope_change(direction: "larger", reason: ...)` → coordinator re-evaluates task complexity, may re-route to a heavier workflow
1729
+ - `blocker(kind: "missing_context", what: ...)` → coordinator attempts to resolve the blocker from known sources before surfacing to operator
1730
+ - `learning(claim: ..., area: ..., confidence: ...)` → coordinator writes to the assumption store for future sessions
1731
+ - `assumption(claim: ..., severity: ...)` → coordinator gates on verification before proceeding (Candidate 5 is a specific instance of this)
1732
+
1733
+ **What makes this powerful:**
1734
+ The agent doesn't need to know what happens next when it emits a typed output -- that's the coordinator's job. The agent just has to recognize "this is an assumption I'm making" or "this is a scope change I'm noticing" and emit the right type. The coordinator's reaction logic handles the rest deterministically, without LLM turns.
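A sketch of what a closed vocabulary plus deterministic dispatch could look like, assuming a TypeScript discriminated union; the field names and reaction strings below are illustrative, not the engine's actual API:

```typescript
// Hedged sketch: one possible shape for the coordinator-owned vocabulary.
type TypedAgentOutput =
  | { type: 'suggestion'; kind: string; detail: string }
  | { type: 'finding'; severity: 'critical' | 'major' | 'minor'; area: string; detail: string }
  | { type: 'decision'; chose: string; over: string; rationale: string }
  | { type: 'scope_change'; direction: 'larger' | 'smaller'; reason: string }
  | { type: 'blocker'; kind: 'missing_context'; what: string }
  | { type: 'learning'; claim: string; area: string; confidence: number }
  | { type: 'assumption'; claim: string; severity: 'high' | 'low' };

interface CoordinatorReaction {
  action: string;   // which deterministic coordinator process runs next
}

// Per-type reaction logic is plain code, not LLM reasoning.
function reactTo(output: TypedAgentOutput): CoordinatorReaction {
  switch (output.type) {
    case 'suggestion':
      return { action: `fire targeted verification for kind "${output.kind}"` };
    case 'finding':
      return { action: output.severity === 'critical' ? 'route to immediate review' : 'log for review summary' };
    case 'decision':
      return { action: 'check session store for conflicting prior decisions' };
    case 'scope_change':
      return { action: 're-evaluate task complexity and routing' };
    case 'blocker':
      return { action: 'attempt resolution from known sources before surfacing' };
    case 'learning':
      return { action: 'write to assumption store' };
    case 'assumption':
      return { action: 'gate on verification before proceeding' };
    default:
      return { action: 'unhandled output type' };
  }
}
```

The `default` branch is where the "agent emits a type the coordinator doesn't handle" question from the hash-out list below would get answered.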
1735
+
1736
+ **Relationship to existing entries:**
1737
+ - "Typed suggestion artifacts with workflow-directed verification" (below): a specific application of this pattern to suggestions
1738
+ - "Coordinator mid-session hooks": the coordinator's reaction to typed outputs is exactly a mid-session hook triggered by a specific event type
1739
+ - "Candidate 5 / interpretation checkpoint": the assumption verification step is a manually-implemented instance of this pattern for one output type
1740
+ - "Coordinator session store awareness": the coordinator's reaction to a `learning` or `decision` type can write to the session store for future sessions
1741
+
1742
+ **Things to hash out:**
1743
+ - Who defines the vocabulary of supported types -- the engine (closed set), the workflow author (per-workflow), or the coordinator (per-deployment)?
1744
+ - How does the agent learn what types are available? Injected in the system prompt, declared in the workflow, or both?
1745
+ - What is the API surface for emitting a typed output? A dedicated tool, a structured artifact field, a reserved context key pattern?
1746
+ - How are reactions defined? TypeScript in the coordinator script, declarative rules in triggers.yml, or something else?
1747
+ - What happens when the agent emits a type the coordinator doesn't handle? Silent drop, warning, or error?
1748
+ - Should typed outputs be visible in the console as first-class events, or only in the raw session log?
1749
+
1750
+ ---
1751
+
1752
+ ### Typed suggestion artifacts with workflow-directed verification (May 7, 2026)
1753
+
1754
+ **Status: idea** | Priority: medium
1755
+
1756
+ **Score: 11** | Cor:2 Cap:3 Eff:2 Lev:2 Con:2 | Blocked: no
1757
+
1758
+ Agents frequently make suggestions mid-workflow -- propose an abstraction, recommend a deferral, flag a scope expansion, suggest a performance optimization. Today these live in plain prose notes. The workflow cannot distinguish one type of suggestion from another, cannot apply targeted follow-up logic, and cannot verify that the suggestion was actually scrutinized before being accepted. A suggestion that warrants architectural review gets the same treatment as one that warrants nothing.
1759
+
1760
+ The idea: a typed `suggestion` tool call that the agent makes instead of embedding the suggestion in prose. The artifact carries a `kind` field (closed enum, workflow-declared) that tells the engine what type of suggestion this is. The workflow author declares, per suggestion kind, what verification the engine should require before the suggestion is accepted.
1761
+
1762
+ **Example suggestion kinds and their natural follow-up scrutiny:**
1763
+ - `abstraction_extraction` -- "is this premature? what are the three concrete future cases this serves? does any of them exist in the current backlog? does this introduce coupling that didn't exist before?"
1764
+ - `architectural_change` -- "does this conflict with any design locks? what breaks downstream?"
1765
+ - `scope_expansion` -- "is this actually in scope? is this the scope rationalization failure mode -- the agent declaring it's a separate ticket to avoid doing the work?"
1766
+ - `deferral` -- "is this genuinely separate work, or is the agent completing checkboxes while leaving real work undone?"
1767
+ - `performance_optimization` -- "is this premature? what is the actual measured bottleneck? what evidence justifies this now?"
1768
+
1769
+ **Mechanism:** fits naturally with the assessment gate system. A `suggestion_quality` assessment with dimensions specific to the suggestion kind. The workflow author declares which dimensions apply to each kind. When the agent emits a typed suggestion, the engine fires a `require_followup` consequence requiring the agent to answer the verification criteria for that kind before proceeding. If the agent cannot answer them satisfactorily, the suggestion does not pass.
1770
+
1771
+ **API shape is open:** the typed suggestion could be a dedicated tool call (`suggest(type: "abstraction_extraction", ...)`), a structured artifact field in `continue_workflow`, a special context key, or something else entirely. The key property is that it is machine-readable and has a `kind` field the engine can act on -- not prose. The exact surface needs design work.
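One possible shape, assuming a dedicated artifact type plus a workflow-declared criteria map; the kind list mirrors the examples above, but every field name and criterion string here is a placeholder for discussion, not a committed surface:

```typescript
// Sketch only -- not the actual API surface, which still needs design work.
type SuggestionKind =
  | 'abstraction_extraction'
  | 'architectural_change'
  | 'scope_expansion'
  | 'deferral'
  | 'performance_optimization';

interface TypedSuggestion {
  kind: SuggestionKind;
  summary: string;     // one-sentence statement of the suggestion
  rationale: string;   // why the agent believes it is warranted
}

// Workflow-declared verification criteria, keyed by kind. The engine would turn
// these into a require_followup consequence before the suggestion is accepted.
const verificationCriteria: Record<SuggestionKind, readonly string[]> = {
  abstraction_extraction: [
    'What are the three concrete future cases this abstraction serves?',
    'Does this introduce coupling that did not exist before?',
  ],
  architectural_change: [
    'Does this conflict with any design locks?',
    'What breaks downstream?',
  ],
  scope_expansion: ['Is this actually in scope, or is it scope rationalization?'],
  deferral: ['Is this genuinely separate work, or real work left undone?'],
  performance_optimization: ['What is the actual measured bottleneck that justifies this now?'],
};

function followupFor(suggestion: TypedSuggestion): readonly string[] {
  return verificationCriteria[suggestion.kind];
}
```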
1772
+
1773
+ **The friction concern:** if suggestions require too much overhead, agents will stop surfacing them or bury them in prose to avoid the gate. The verification criteria must be targeted and lightweight -- not a full review pass, just the specific questions that matter for that kind. "What are the three future cases this abstraction serves?" is lightweight. "Run a full architecture review" is not.
1774
+
1775
+ **Things to hash out:**
1776
+ - What is the closed set of suggestion kinds for the initial version? Too many kinds creates complexity; too few misses the point.
1777
+ - Should suggestion kinds be workflow-declared (each workflow author defines their own) or engine-owned (a closed set the engine enforces)? Engine-owned is more consistent but less flexible.
1778
+ - How does the agent signal that a suggestion was considered and rejected, not just overlooked? A declined suggestion should be as visible as an accepted one.
1779
+ - Does the verification happen inline (a `require_followup` on the same step) or as a separate verification step? Inline is lower friction; a separate step is more auditable.
1780
+ - How does this interact with the existing `report_issue` mechanism? Some suggestions that fail verification should surface to the operator, not just loop back to the agent.
1781
+
1782
+ ---
1783
+
1655
1784
  ### WorkTrain as the canonical workflow author -- MCP as a derived runtime (Apr 30, 2026)
1656
1785
 
1657
1786
  **Status: idea** | Priority: high
@@ -1954,6 +2083,37 @@ Surface in: `worktrain status`, `worktrain health <sessionId>`, console session
1954
2083
 
1955
2084
  Coordinator design patterns for WorkTrain's autonomous pipeline.
1956
2085
 
2086
+ ### Reliable synthetic human gates: mimicking operator approval and refusal in autonomous pipelines (May 6, 2026)
2087
+
2088
+ **Status: idea** | Priority: high
2089
+
2090
+ **Score: 13** | Cor:3 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2091
+
2092
+ WorkTrain's pipeline has several points where a human operator would naturally approve, reject, or redirect -- confirming an interpretation before coding starts, approving a direction from discovery, accepting a shaped pitch. In guided MCP sessions these gates fire as `requireConfirmation` steps. In fully autonomous daemon sessions, they either don't fire or surface to the operator outbox and wait indefinitely. There is currently no reliable mechanism for the coordinator to make these gate decisions autonomously in a way that is trustworthy enough to substitute for human judgment.
2093
+
2094
+ The problem is not just "add an LLM to make the decision." An LLM making approval decisions is subject to the same sycophancy, self-enhancement bias, and overconfidence problems the rest of the pipeline has. A naïve "spawn an agent to approve this" produces rubber-stamping, not genuine gatekeeping. What is needed is a structured, auditable, multi-signal gate that approximates what a careful human reviewer would do -- checking specific criteria, flagging specific concerns, requiring specific evidence before proceeding.
2095
+
2096
+ **What a strong synthetic gate needs:**
2097
+ - Typed criteria against which the artifact is evaluated (not free-form "does this look good?")
2098
+ - An independent agent that did not produce the artifact being evaluated
2099
+ - A cross-family challenger where possible (different model family = different correlated blind spots)
2100
+ - A structured verdict with explicit rationale tied to the criteria, not a confidence score
2101
+ - An escalation path when the synthetic gate is uncertain -- surface to operator rather than rubber-stamp
2102
+
2103
+ **Use cases that need this:**
2104
+ - Interpretation checkpoint: does the coded assumption set actually cover the architectural risks for this ticket?
2105
+ - Shaping approval: does the pitch have genuine acceptance criteria or are they vague enough to accept anything?
2106
+ - Discovery direction: is the selected direction actually distinct from the runner-up, or are they the same approach with different labels?
2107
+ - Review verdict: is this finding severe enough to block merge, or is it a style preference being inflated?
2108
+
2109
+ **Things to hash out:**
2110
+ - What is the right abstraction? A reusable `synthetic-gate` routine that takes typed criteria + artifact and returns a structured verdict? Or specialized gates per use case?
2111
+ - How do you prevent the synthetic gate from being gamed by the same agent that produced the artifact? The gate agent must not have access to the producing agent's reasoning, only its output.
2112
+ - What is the confidence threshold below which the synthetic gate escalates to a human rather than deciding? And how is that threshold configured per trigger?
2113
+ - How do you validate that a synthetic gate is actually performing the function of a human gate -- not just producing confident verdicts? Requires a calibration dataset of known-correct and known-incorrect artifacts with human ground truth.
2114
+ - Relationship to the `requireConfirmation` gate mechanism: the synthetic gate is the autonomous equivalent. It should produce the same typed routing signal the human confirmation gate produces, so the coordinator routing logic doesn't need to know which kind of gate fired.
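Picking up that last point, a rough sketch of the structured verdict and the escalation check, under the assumption that the gate returns per-criterion findings; the type and field names are invented here and would need to line up with whatever routing signal the `requireConfirmation` gate already emits:

```typescript
// Sketch under assumptions -- not the existing gate contract.
interface CriterionFinding {
  criterionId: string;     // e.g. 'acceptance-criteria-are-falsifiable' (hypothetical)
  met: boolean;
  rationale: string;       // explicit rationale tied to the criterion, not a confidence score
}

type GateDecision = 'approve' | 'reject' | 'escalate_to_operator';

interface SyntheticGateVerdict {
  decision: GateDecision;
  findings: readonly CriterionFinding[];
  evaluator: { modelFamily: string; independentOfProducer: boolean };
}

// Escalate rather than rubber-stamp: an approval with unmet criteria goes to a human.
function shouldEscalate(verdict: SyntheticGateVerdict): boolean {
  return verdict.decision === 'approve' && verdict.findings.some((f) => !f.met);
}
```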
2115
+
2116
+ ---
1957
2117
 
1958
2118
  ### Agents must not perform delivery actions -- only the coordinator's delivery layer can (Apr 30, 2026)
1959
2119
 
@@ -2464,6 +2624,67 @@ Ghost nodes represent steps that were compiled into the DAG but skipped at runti
2464
2624
 
2465
2625
  ## Workflow Library
2466
2626
 
2627
+ ### Pre-specialized expert agents: on-demand consultants for main agents (May 7, 2026)
2628
+
2629
+ **Status: idea** | Priority: high
2630
+
2631
+ **Score: 13** | Cor:2 Cap:3 Eff:2 Lev:3 Con:3 | Blocked: no
2632
+
2633
+ The main agent running a coding, review, or investigation workflow is not the expert. It is the orchestrator. When it needs specialized input -- "is this Kotlin idiomatic?", "does this violate any payments module invariants?", "what are the FP patterns this codebase uses for this?" -- it should be able to ask a pre-specialized consultant agent and get a bounded, expert answer back.
2634
+
2635
+ These expert agents are not running the main workflow. They do not own any phase or make any final decisions. They are consulted: spawned with a specific question, pre-loaded with dense expertise in a specific domain, and expected to return a bounded answer. The main agent synthesizes the input and retains full ownership.
2636
+
2637
+ **Examples:**
2638
+ - A Kotlin idioms expert pre-loaded with Kotlin best practices, common pitfalls, and idiomatic patterns -- queried when the coding or review agent wants to know "is this idiomatic Kotlin?"
2639
+ - A functional programming expert pre-loaded with the FP philosophy and patterns relevant to this codebase (from CLAUDE.md, design docs, etc.) -- queried when the agent is making decisions that touch FP style
2640
+ - A payments module expert pre-loaded with the payments execution paths, known invariants, and past design decisions -- queried when the task touches payments code
2641
+ - A security expert pre-loaded with the codebase's auth model, known vulnerabilities, and security invariants -- queried during review of auth-adjacent changes
2642
+
2643
+ **Two distinct usage patterns -- both valid:**
2644
+
2645
+ *Consultant mode:* The main agent mid-task asks a specific question ("is this Kotlin idiomatic?"), a pre-specialized agent is spawned with that question and its expertise briefing, it returns a bounded answer, the main agent synthesizes and moves on. Lightweight, on-demand, the main agent drives the interaction.
2646
+
2647
+ *Parallel specialist mode:* The coordinator spawns multiple pre-specialized agents simultaneously for a phase of work -- e.g. an MR review that launches a Kotlin expert, a payments module expert, and an FP patterns expert in parallel, each reviewing the same diff through their lens. The main agent or coordinator synthesizes. This is the 3-angle executor pattern from wr.discovery applied to expertise curation rather than framing angles. Each specialist contributes their perspective; no single agent has to cover everything.
2648
+
2649
+ The parallel specialist mode is conceptually similar to the existing reviewer families in wr.mr-review, but with expertise injection replacing role prompts. "You are a correctness reviewer" and "you are an agent briefed on this codebase's actual invariants, the past bugs in this module, and the specific patterns we use here" are very different levels of specificity.
2650
+
2651
+ **What makes expert consultants distinct from existing reviewer families (MR review):**
2652
+ Existing reviewer families are top-level sessions running the full review workflow independently. Expert consultants (in consultant mode) are lightweight bounded spawns -- more like calling a function than running a parallel pipeline. In parallel specialist mode they are closer to reviewer families, but curated for the specific task rather than generically role-assigned.
2653
+
2654
+ **What makes this distinct from existing context injection:**
2655
+ Existing context injection (living work context, assembledContextSummary) threads pipeline state between phases -- history of what happened. Expert consultants carry curated domain expertise -- best practices, idioms, invariants, patterns. The content type is different: not "what was done" but "what is true about this domain."
2656
+
2657
+ **Implementation shape -- specialized workflows, not just context injection:**
2658
+
2659
+ The most powerful form of a specialist is not an agent that receives a big expertise briefing at spawn time and then works freely. It is an agent running a purpose-built specialized workflow that contains both the expertise and the process for applying it systematically.
2660
+
2661
+ A `wr.kotlin-review` workflow contains: the Kotlin expertise in `metaGuidance` and `references`, and a structured procedure -- "step 1: check null safety patterns at these call sites; step 2: evaluate coroutine usage against these criteria; step 3: check data class conventions..." Breaking the domain into steps ensures the specialist covers everything the domain requires, in the right order, with the right depth. A pure context dump leaves coverage to chance; a workflow enforces it.
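A hypothetical fragment of such a specialist workflow, shaped loosely like the bundled workflow JSON; the step ids, guidance strings, and goals below are invented to illustrate the structure, not proposed content:

```typescript
// Illustrative sketch of a specialist workflow definition (names are placeholders).
const kotlinReviewWorkflow = {
  id: 'wr.kotlin-review',
  name: 'Kotlin Idioms Review (specialist)',
  version: '0.1.0',
  metaGuidance: [
    'Prefer null-safety constructs over defensive runtime checks.',
    'Name the specific idiom violated; do not give generic style feedback.',
  ],
  steps: [
    {
      id: 'kotlin-null-safety',
      title: 'Step 1: Null safety patterns at changed call sites',
      promptBlocks: { goal: 'Check every changed call site for nullable handling that fights the type system.' },
    },
    {
      id: 'kotlin-coroutines',
      title: 'Step 2: Coroutine usage against structured-concurrency criteria',
      promptBlocks: { goal: 'Evaluate launch/async usage, cancellation, and dispatcher choice in the diff.' },
    },
    {
      id: 'kotlin-data-classes',
      title: 'Step 3: Data class and immutability conventions',
      promptBlocks: { goal: 'Check data class conventions and copy/equals usage in the diff.' },
    },
  ],
} as const;
```

The point of the shape is coverage: a briefing dump leaves the order and depth of checks to chance, while an enumerated step list makes them auditable in the session store.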
2662
+
2663
+ This also makes specialists auditable: you can see in the session store exactly which steps the specialist ran, what it found, and whether it covered all required dimensions. And specialized workflows improve over time via `wr.workflow-for-workflows`, compounding quality the same way all bundled workflows do.
2664
+
2665
+ For dynamic specialists (payments module expert, specific subsystem expert), the workflow defines the process for generating the briefing dynamically -- walk these execution paths, read these design docs, extract these invariants -- rather than containing a static briefing.
2666
+
2667
+ **What needs to be built:**
2668
+ - A catalog of specialized workflows: static domain specialists (wr.kotlin-review, wr.fp-patterns-review) and dynamic module specialists (wr.module-expert with a briefing-generation phase)
2669
+ - A matching mechanism: given the task's affected files and domains, which specialist workflows are relevant?
2670
+ - A consultation protocol: how does the main agent query a specialist? How does the specialist return a typed artifact the main agent can act on?
2671
+ - Dynamic briefing generation: for module-specific specialists, a workflow phase that walks affected execution paths and generates the curated briefing before the expert work begins
2672
+
2673
+ **Relationship to existing entries:**
2674
+ - "Knowledge graph": the long-term structural ground truth version of this. Expert briefings are the lower-cost precursor that doesn't require the full graph.
2675
+ - "Assumption store": verified codebase facts are one input to the module expert briefing.
2676
+ - "Coordinator mid-session hooks": expert consultation could be triggered mid-session by the coordinator when specific signals fire (e.g. agent touches a known-tricky module).
2677
+
2678
+ **Things to hash out:**
2679
+ - What is the right format for an expertise briefing? Prose vs structured facts vs a combination?
2680
+ - How are static briefings maintained? They go stale as language versions change and codebases evolve.
2681
+ - How are dynamic briefings generated? Static analysis? LLM-assisted code walk? What is the cost and freshness guarantee?
2682
+ - How does the main agent know which experts are available and when to consult them? Explicit workflow step, or opportunistic mid-task consultation?
2683
+ - Token budget: expert consultation adds turns and tokens. When is the cost worth it vs. the main agent just proceeding with its own judgment?
2684
+ - How does the consultation differ from just giving the main agent a bigger context window? The answer should be "specificity and freshness" -- a consultant briefed on this specific module is better than a general agent with everything injected.
2685
+
2686
+ ---
2687
+
2467
2688
  ### Automatic root cause analysis when MR review finds issues post-coding (Apr 30, 2026)
2468
2689
 
2469
2690
  **Status: idea** | Priority: high
@@ -2480,6 +2701,8 @@ When an MR review session (run by a WorkTrain agent) finds issues in a coding se
2480
2701
 
2481
2702
  **Why this matters**: every finding that slips through is a signal about a workflow or process gap. Today that signal is lost. Capturing it systematically and feeding it back into workflow improvement closes the quality loop.
2482
2703
 
2704
+ **Concrete model:** CodeRabbit does this for MR reviews -- when a human reviewer corrects a CodeRabbit finding or points out something it missed, CodeRabbit extracts a structured learning (`{ claim, repo, file context, timestamp }`) and injects it into future review sessions for the same repo. WorkTrain should do the same, and broader: learnings from coding corrections (not just review corrections) feed into the per-workspace codebase assumption store, which directly addresses Subtype B intent failures. Human feedback on WorkTrain's PRs is the write path for that store.
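A sketch of what a per-workspace learning entry could carry, assuming the four fields the CodeRabbit comparison names plus the `lastVerified` staleness handling raised in the hash-out list below; everything beyond those is an assumption:

```typescript
// Sketch only -- not an existing store schema.
interface CodebaseLearning {
  claim: string;                  // the distilled correction, e.g. "actually X, because Y"
  repo: string;
  fileContext?: string;           // file or path the correction referred to
  source: 'human_pr_comment' | 'refuted_assumption';
  createdAt: string;              // ISO timestamp
  lastVerified?: string;          // staleness handling: learnings go stale after refactors
}

// Hypothetical staleness check: re-verify entries that have not been confirmed recently.
function isStale(learning: CodebaseLearning, maxAgeDays: number, now = new Date()): boolean {
  const anchor = new Date(learning.lastVerified ?? learning.createdAt);
  return (now.getTime() - anchor.getTime()) / 86_400_000 > maxAgeDays;
}
```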
2705
+
2483
2706
  **Things to hash out:**
2484
2707
  - How does WorkTrain detect that a human has commented on a PR post-review? This requires monitoring the PR for new review activity after WorkTrain's session completed -- either webhook events or polling.
2485
2708
  - What does the analysis session actually produce? A structured finding about the gap? A concrete proposal for workflow improvement? Both?
@@ -2487,6 +2710,21 @@ When an MR review session (run by a WorkTrain agent) finds issues in a coding se
2487
2710
  - How do you distinguish "the workflow is fine but this was a genuinely hard edge case" from "the workflow has a systematic gap"? A single miss doesn't prove a gap; multiple misses of the same kind do.
2488
2711
  - Should the analysis result feed directly into `workflow-effectiveness-assessment`, or is it a separate concern?
2489
2712
  - For the "coding agent missed it" case: is the right fix to change the coding workflow, or to make the review workflow more adversarial?
2713
+ - How are codebase-specific learnings extracted from free-form human review comments? A structured extraction step (similar to CodeRabbit's learning extraction) is needed to turn "actually this is wrong because X" into a typed store entry.
2714
+ - How are extracted learnings scoped and invalidated over time? Per-repo scope is right for codebase-specific facts, but learnings go stale after refactors. A `lastVerified` + staleness mechanism is needed.
2715
+ - Relationship to the assumption store (Candidate 2 from the intent gap discovery): human PR corrections are the primary write path for the per-workspace codebase assumption store. These two entries should be designed together.
2716
+
2717
+ ---
2718
+
2719
+ ### wr.discovery recommendation quality improvements v3.5 (May 6, 2026)
2720
+
2721
+ **Status: done** | Shipped in PR #951 (feat/etienneb/discovery-workflow-v35, May 6, 2026)
2722
+
2723
+ **Score: 13** | Cor:2 Cap:3 Eff:2 Lev:3 Con:3 | Blocked: no
2724
+
2725
+ Evidence-based redesign of `wr.discovery` (v3.4.0 → v3.5.0) addressing three failure modes -- coverage (right answer never generated), quality (wrong answer selected), and selection (right answer not selected). Key changes: all three assessment gates now have `assessmentConsequences` that block on failure; Phase 3d/3e split isolates external challenge from fresh-context selection; typed `SelectionOutput` tier (`strong_recommendation | provisional_recommendation | insufficient_signal`) driven by observable signals; `FrameValidityCheck` at landscape-to-frame transition; verbalized sampling + ordinary persona rotation in executor goal strings; `recommendationConfidenceBand` downgrade-only invariant across resolution phases; Phase 6 restructured as falsification-shaped fresh-context validator; `selectionTier` added to `wr.discovery_handoff` artifact.
2726
+
2727
+ Full audit at `.workrail/discovery-workflow-audit.md`, implementation plan at `.workrail/discovery-workflow-implementation-plan.md`.
2490
2728
 
2491
2729
  ---
2492
2730
 
@@ -2601,6 +2839,45 @@ Some workflows want notes to consistently capture current understanding, key fin
2601
2839
 
2602
2840
  ---
2603
2841
 
2842
+ ### Targeted session review: extract high-signal moments instead of reviewing full transcripts (May 6, 2026)
2843
+
2844
+ **Status: idea** | Priority: high
2845
+
2846
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2847
+
2848
+ Reviewing a full agent session transcript to evaluate quality is prohibitively expensive -- long sessions have hundreds of tool calls, file reads, and reasoning steps. But most of the signal about whether a session went well lives in a small number of high-signal moments: confirmation gates, places where the agent flagged uncertainty or divergence, steps where the agent's output failed to match the expected contract, and points where the agent encountered reality and had to adapt. Reviewing those moments selectively is 10-50x cheaper than reading the full transcript and captures most of the quality signal.
2849
+
2850
+ **High-signal moments worth targeting:**
2851
+
2852
+ 1. **Confirmation gate outcomes** -- when a `requireConfirmation` gate fired, what did the agent report? Did it accurately represent the state of the work? Was the decision the right one in hindsight?
2853
+
2854
+ 2. **Agent self-reported issues** -- calls to `report_issue` or `signal_coordinator` during the session. These are the agent's own flags that something was wrong. Each one warrants inspection: was the issue real, was the agent's characterization accurate, was the resolution appropriate?
2855
+
2856
+ 3. **Contract validation failures** -- steps where the engine returned a `blocked` or `require_followup` response. The agent's output failed the output contract. What did it produce, and why?
2857
+
2858
+ 4. **Agent-workflow friction points** -- places where the agent deviated from the expected step procedure, added divergence markers, or explicitly noted a gap between the workflow instructions and the reality it encountered. These are the inputs to workflow improvement.
2859
+
2860
+ 5. **Interpretation vs outcome delta** -- the gap between what the agent stated it was building (interpretation checkpoint, once it exists) and what it actually produced. The delta is the intent gap in concrete form.
2861
+
2862
+ 6. **Sycophancy signals** -- position changes without new evidence, position reversals after challenge, confidence-accuracy mismatches visible in the notes.
2863
+
2864
+ **Why this matters:** without targeted review, session quality is only observable at the PR level (did the output pass review?). That's a lagging indicator that catches failures after they've shipped cost. Targeted review of high-signal moments catches failures mid-session or immediately post-session, before the cost compounds.
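As a sketch of the extraction mechanism, assuming the session log is the append-only JSONL described elsewhere in this document and that each event carries a `type` field; the event names in the filter set are placeholders standing in for the moments listed above:

```typescript
import { createReadStream } from 'node:fs';
import { createInterface } from 'node:readline';

// Placeholder event names -- the real vocabulary would come from the session store schema.
const HIGH_SIGNAL_EVENTS = new Set([
  'confirmation_gate',   // requireConfirmation outcomes
  'report_issue',        // agent self-reported issues
  'contract_blocked',    // blocked / require_followup responses
  'divergence_marker',   // agent-workflow friction points
]);

interface SessionEvent {
  type: string;
  stepId?: string;
  payload?: unknown;
}

// Stream the append-only JSONL session log and keep only the high-signal moments,
// instead of handing a reviewer the full transcript.
async function extractHighSignalMoments(logPath: string): Promise<SessionEvent[]> {
  const moments: SessionEvent[] = [];
  const rl = createInterface({ input: createReadStream(logPath, 'utf8') });
  for await (const line of rl) {
    if (!line.trim()) continue;
    const event = JSON.parse(line) as SessionEvent;
    if (HIGH_SIGNAL_EVENTS.has(event.type)) moments.push(event);
  }
  return moments;
}
```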
2865
+
2866
+ **Relationship to existing entries:**
2867
+ - "Agent-reportable workflow bugs" (below) -- the agent's own flags are one of the primary review targets
2868
+ - "Synthetic human gates" -- the targeted review output is what a synthetic gate would consume to make an approval decision
2869
+ - "Automatic root cause analysis" -- targeted review is the cheaper precursor that identifies which sessions warrant full root cause analysis
2870
+ - "Per-run workflow improvement retrospective" -- the session retrospective is one moment in the targeted review; this entry is about the full set of moments across a session
2871
+
2872
+ **Things to hash out:**
2873
+ - What is the right extraction mechanism? The session event log already records every tool call, step advance, and artifact. A targeted review agent reads selected event types rather than the full log. What is the right query interface?
2874
+ - Which moments are always reviewed vs. sampled? Confirmation gates and `report_issue` calls probably warrant 100% review; routine step advances can be sampled.
2875
+ - Should targeted review happen synchronously (coordinator waits before proceeding) or asynchronously (review happens in parallel, findings surface to operator outbox)?
2876
+ - How are review findings acted on? They could feed into: (a) the synthetic gate decision for the current session, (b) the workflow improvement retrospective, (c) the assumption store if codebase-specific learnings are extracted.
2877
+ - What does the targeted review agent actually produce? A structured verdict per moment reviewed, a severity-tagged list of concerns, or a binary pass/fail?
2878
+
2879
+ ---
2880
+
2604
2881
  ### Agent-reportable workflow bugs (Apr 28, 2026)
2605
2882
 
2606
2883
  **Status: idea** | Priority: high
@@ -2617,6 +2894,7 @@ A mechanism for agents to report problems with the WorkRail system itself during
2617
2894
  - Should reports survive session cleanup, or is their lifetime tied to the session?
2618
2895
  - Who owns acting on these reports -- the operator, the workflow author, or an automated system?
2619
2896
  - Should this be available in interactive (MCP) sessions, or daemon sessions only?
2897
+ - Relationship to "Targeted session review": agent-reported workflow bugs are one of the primary high-signal moments that targeted session review would extract and inspect.
2620
2898
 
2621
2899
  ---
2622
2900
 
@@ -2673,6 +2951,40 @@ A proof record contains: `prNumber`, `goal`, `verificationChain` (array of `{ ki
2673
2951
 
2674
2952
  ---
2675
2953
 
2954
+ ### Coordinator mid-session hooks: react to workflow events without waiting for session completion (May 6, 2026)
2955
+
2956
+ **Status: idea** | Priority: high
2957
+
2958
+ **Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
2959
+
2960
+ The coordinator currently acts only between sessions -- it spawns a session, awaits its completion, reads the typed output artifact, and decides what to do next. It has no mechanism to react to events that happen inside a running session. This means the coordinator cannot spawn helper agents mid-session (e.g. an external assumption ranker when the interpretation checkpoint fires), cannot intercept a confirmation gate and satisfy it autonomously, and cannot act on a step completion artifact before the full session finishes.
2961
+
2962
+ The gap: workflow lifecycle events (step completed, gate fired, artifact emitted, `report_issue` called) are currently only visible after the session ends via the session store. The coordinator needs a way to subscribe to these events as they happen and act on them -- spawning agents, injecting steer messages, or making routing decisions -- without waiting for session completion.
2963
+
2964
+ **Concrete use cases this unlocks:**
2965
+ - Spawn an external assumption-ranking agent when the interpretation checkpoint step completes, inject its ranking back into the session before verification runs
2966
+ - Auto-satisfy a `requireConfirmation` gate in autonomous mode by running a synthetic gate evaluation and steering the session with the result
2967
+ - Spawn a targeted review agent when a specific step artifact is emitted, surface findings before the session proceeds to the next phase
2968
+ - React to a `report_issue` call mid-session by spawning an investigation agent immediately rather than waiting for the full session to fail
2969
+
2970
+ **What this requires:**
2971
+ - A real-time or near-real-time event subscription mechanism from the coordinator to the session event log (the append-only JSONL already has all the events; the coordinator needs a watch/poll interface on it)
2972
+ - A `steer` injection path from the coordinator into a running session (the steer endpoint already exists at `POST /sessions/:id/steer`)
2973
+ - A coordinator hook registry: declarative rules of the form "when session X emits event type Y with artifact kind Z, execute hook H"
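A sketch of the declarative variant of that registry, assuming rules match on event type and artifact kind; only the steer endpoint path is taken from this entry, and every other name (event types, hook ids, base URL) is a placeholder:

```typescript
// Sketch only -- rule shape and names are assumptions.
interface SessionEvent {
  sessionId: string;
  type: string;            // e.g. 'step_completed', 'report_issue' (placeholders)
  artifactKind?: string;
}

interface HookRule {
  eventType: string;
  artifactKind?: string;
  hookId: string;          // which coordinator script runs when the rule matches
}

// "When event type Y with artifact kind Z fires, execute hook H."
const hookRegistry: HookRule[] = [
  { eventType: 'step_completed', artifactKind: 'interpretation_artifact', hookId: 'external-assumption-ranker' },
  { eventType: 'report_issue', hookId: 'spawn-investigation-agent' },
];

function matchingHooks(event: SessionEvent): HookRule[] {
  return hookRegistry.filter(
    (rule) =>
      rule.eventType === event.type &&
      (rule.artifactKind === undefined || rule.artifactKind === event.artifactKind),
  );
}

// A hook that needs to influence the running session would use the existing steer
// endpoint; the base URL here is a placeholder assumption.
async function steer(baseUrl: string, sessionId: string, message: string): Promise<void> {
  await fetch(`${baseUrl}/sessions/${sessionId}/steer`, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify({ message }),
  });
}
```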
2974
+
2975
+ **Relationship to existing entries:**
2976
+ - "Scripts-first coordinator" (below): the hooks would be coordinator scripts reacting to events, not LLM reasoning
2977
+ - "Native multi-agent orchestration": `spawn_session` + `await_sessions` handles between-session orchestration; this handles within-session coordination
2978
+ - "Workflow runtime adapter": mid-session hooks are how the daemon adapter satisfies `requireConfirmation` gates autonomously
2979
+
2980
+ **Things to hash out:**
2981
+ - Poll vs push: the session event log is append-only JSONL. The coordinator can poll it efficiently (tail -f equivalent), but a proper event bus (the daemon event emitter already exists) would be cleaner. Which is the right mechanism?
2982
+ - Hook registry format: declarative JSON rules in `triggers.yml`, or imperative TypeScript in the coordinator script? The declarative approach is more auditable; the imperative approach is more flexible.
2983
+ - Ordering guarantees: if the coordinator injects a steer message in response to a step completion, does the session engine guarantee the steer is processed before the next step begins? Race condition risk.
2984
+ - Blast radius: a hook that fires incorrectly (wrong event matched, wrong steer injected) could derail a running session in a hard-to-debug way. What are the rollback and auditability guarantees?
2985
+
2986
+ ---
2987
+
2676
2988
  ### Scripts-first coordinator: avoid the main agent wherever possible (Apr 15, 2026)
2677
2989
 
2678
2990
  **Status: partial** | Foundation shipped PR #908 (Apr 30, 2026)
@@ -4758,7 +5070,25 @@ The agent is expensive, inconsistent, and slow. Scripts are free, deterministic,
4758
5070
 
4759
5071
  ### Dynamic model selection
4760
5072
 
4761
- **Status: done** -- shipped in `triggers.yml` `agentConfig.model`
5073
+ **Status: partial** -- raw model ID (`agentConfig.model`) shipped in `triggers.yml`. Two gaps remain: (1) no validation at trigger parse or startup -- a bad model ID is only caught when the first LLM call fires; (2) every trigger hardcodes a provider-specific ID, which breaks when the inference profile naming convention changes (e.g. `us.anthropic.claude-haiku-4-5-20251001` vs `us.anthropic.claude-haiku-4-5-20251001-v1:0`).
5074
+
5075
+ ### Model tier abstraction: cheap / medium / expensive (May 7, 2026)
5076
+
5077
+ **Status: idea** | Priority: medium
5078
+
5079
+ **Score: 11** | Cor:2 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
5080
+
5081
+ **The problem:** Triggers hardcode provider-specific model IDs (`amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0`). When inference profile naming conventions change, or when switching providers/regions, every trigger must be updated manually. The daemon's adaptive coordinator already makes implicit cost/quality tradeoffs (Haiku for routing, Sonnet for coding) but has no first-class mechanism to express them -- it's locked to whatever IDs are in `agentConfig.model`.
5082
+
5083
+ **The idea:** Introduce a tier abstraction. Triggers and workflow phases declare a tier (`cheap | medium | expensive`). The daemon resolves tiers to concrete model IDs from a tier map in `~/.workrail/config.json`. The adaptive coordinator picks tiers per phase: cheap for classification and routing, medium for coding, expensive for architectural review. Changing provider or region means updating the tier map once.
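A sketch of the tier map and the resolution step, assuming the map lives in `~/.workrail/config.json`; the Haiku ID is the one used elsewhere in this document, and the medium/expensive IDs are placeholders:

```typescript
// Sketch only -- key names and schema are not committed.
type ModelTier = 'cheap' | 'medium' | 'expensive';

type TierMap = Record<ModelTier, string>;   // tier -> 'provider/model-id'

const exampleTierMap: TierMap = {
  cheap: 'amazon-bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0',
  medium: 'amazon-bedrock/example-medium-model-id',       // placeholder
  expensive: 'amazon-bedrock/example-expensive-model-id', // placeholder
};

// Triggers declare either a tier or a raw model ID; the daemon resolves tiers once,
// so a provider or region change only touches the tier map.
function resolveModel(spec: ModelTier | string, tiers: TierMap): string {
  return spec === 'cheap' || spec === 'medium' || spec === 'expensive'
    ? tiers[spec]
    : spec; // already a concrete 'provider/model-id'
}
```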
5084
+
5085
+ **Validation is a prerequisite.** Before tiers make sense, bad model IDs need to be caught at startup rather than at first LLM call. See "Model ID validation at daemon startup" below.
5086
+
5087
+ **Things to hash out:**
5088
+ - Where does the tier map live? `~/.workrail/config.json` (global) vs. `triggers.yml` (per-workspace) vs. both with cascade.
5089
+ - Does the tier map need to carry both a Bedrock and a direct-API model per tier, or does one path own the daemon?
5090
+ - Should the adaptive coordinator receive the tier map as a dependency, or should it always spawn sessions with explicit `agentConfig.model` set by the coordinator?
5091
+ - How do you handle models that exist on one provider but not another (e.g. Opus available on Bedrock but not direct API under certain rate limits)?
4762
5092
 
4763
5093
  ### Multi-agent support (spawn_agent + coordinator sessions)
4764
5094
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@exaudeus/workrail",
3
- "version": "3.78.2",
3
+ "version": "3.79.1",
4
4
  "description": "Step-by-step workflow enforcement for AI agents via MCP",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "wr.coding-task",
3
3
  "name": "Agentic Task Dev Workflow",
4
- "version": "1.3.0",
4
+ "version": "1.4.0",
5
5
  "description": "Use this to implement a software feature or task. Follows a plan-then-execute approach with architecture decisions, invariant tracking, and final verification.",
6
6
  "about": "## Agentic Coding Task Workflow\n\nThis workflow structures the full lifecycle of a software implementation task: from understanding and classifying the work, through architecture decisions and incremental implementation, to final verification and handoff.\n\n### What it does\n\nThe workflow guides an AI agent through a disciplined plan-then-execute process. It begins by analyzing the task to determine complexity, risk, and the right level of rigor (QUICK, STANDARD, or THOROUGH). For non-trivial tasks, it then gathers codebase context, surfaces invariants and non-goals, generates competing design candidates, and selects an approach before writing a single line of code. Implementation proceeds slice by slice, with built-in verification gates after each slice. A final integration verification pass confirms acceptance criteria are met before handoff.\n\n### Upstream context (Phase 0.5)\n\nPhase 0.5 looks for any upstream document that has already defined what to build -- a Shape Up pitch, PRD, BRD, RFC, design doc, user story with acceptance criteria, Jira epic, or equivalent. The agent uses whatever tools are available (repo search, WebFetch, Confluence/Notion/Glean MCPs, Memory MCP) to find it. If found, two flags are set: `upstreamSpecDetected` (something exists) and `solutionFixed` (whether the document commits to a specific technical direction). When `solutionFixed = true`, design ideation phases (1a-1c) are skipped and Phase 1d translates the upstream constraints directly into an engineering approach. When `solutionFixed = false`, design ideation runs normally but is constrained by whatever the upstream document does specify. The plan audit (Phase 4) checks for drift against `upstreamBoundaries` whenever an upstream document was found.\n\n### When to use it\n\nUse this workflow whenever you are implementing a feature, fixing a non-trivial bug, or making an architectural change in a real codebase. It is especially valuable when:\n- The task touches multiple files or systems\n- There is meaningful risk of regressions or invariant violations\n- You want the agent to surface trade-offs and commit to a reasoned design decision rather than guessing\n- You need a resumable, auditable record of what was decided and why\n\nFor quick one-liner fixes or very small changes, the workflow includes a fast path that skips heavyweight planning.\n\n### What it produces\n\n- An `implementation_plan.md` artifact covering the selected approach, vertical slices, test design, and philosophy alignment\n- A `spec.md` for large or high-risk tasks, capturing observable behavior and acceptance criteria\n- Step-level notes in WorkRail that serve as a durable execution log\n- A PR-ready handoff summary with acceptance criteria status, invariant proofs, and follow-up tickets\n\n### How to get good results\n\n- Provide a clear task description and at least partial acceptance criteria before starting\n- If you have coding philosophy or project conventions configured in session rules or Memory MCP, the workflow will apply them automatically as a design lens\n- Let the workflow classify complexity and rigor itself; override only if the classification is clearly wrong\n- For large or high-risk tasks, review the architecture decision step before implementation begins",
7
7
  "examples": [
@@ -143,7 +143,7 @@
143
143
  "SUBAGENT SYNTHESIS: treat subagent output as evidence, not conclusions. State your hypothesis before delegating, then interrogate what came back: what was missed, wrong, or new? Say what changed your mind or what you still reject, and why.",
144
144
  "PARALLELISM: when reads, audits, or delegations are independent, run them in parallel inside the phase. Parallelize cognition; serialize synthesis and canonical writes.",
145
145
  "PHILOSOPHY LENS: apply the user's coding philosophy (from active session rules) as the evaluation lens. Flag violations by principle name, not as generic feedback. If principles conflict, surface the tension explicitly instead of silently choosing.",
146
- "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness \u2014 in that order of reliability.",
146
+ "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness in that order of reliability.",
147
147
  "DRIFT HANDLING: when reality diverges from the plan, update the plan artifact and re-audit deliberately rather than accumulating undocumented drift.",
148
148
  "NEVER COMMIT MARKDOWN FILES UNLESS USER EXPLICITLY ASKS.",
149
149
  "SLICE DISCIPLINE: Phase 6 is a loop -- implement ONE slice per iteration. Do not implement multiple slices at once. The verification loop exists to catch drift per slice, not retroactively."
@@ -166,6 +166,47 @@
166
166
  ]
167
167
  }
168
168
  },
169
+ {
170
+ "id": "phase-0c-assumption-verification",
171
+ "title": "Phase 0c: Interpretation & Assumption Verification",
172
+ "promptBlocks": {
173
+ "goal": "State your interpretation of what you are building, make three explicit assumptions about codebase state, verify each by reading the predicted location, and produce an InterpretationArtifact that the coordinator can inspect and route on.",
174
+ "constraints": [
175
+ "The interpretation statement MUST appear before any assumption listing.",
176
+ "You MUST produce exactly 3 assumptions -- no more, no fewer.",
177
+ "Each assumption MUST be about a design pattern, architectural invariant, or module behavior. Trivially checkable facts are disqualified: do NOT assume things like 'file X exists', 'import Y is present', or 'function Z has signature W'. Those are not assumptions -- they are lookups. An assumption must assert something about how the system is designed or how a module behaves.",
178
+ "Each assumption MUST name a specific predicted file or code location that would corroborate it.",
179
+ "Each assumption MUST be labeled with severity AT STATEMENT TIME (before verification): 'high' for core architectural invariants (e.g. token protocol, session model, phase boundary contract) or 'low' for pattern preferences or conventions.",
180
+ "Do NOT attempt automated correction of context when an assumption is refuted. Record the refutation and route per the procedure.",
181
+ "Do NOT propagate InterpretationArtifact to coordinator-spawned subagents."
182
+ ],
183
+ "procedure": [
184
+ "1. Write one sentence: 'I am building [X].' This is your interpretation. Write it before you list any assumptions.",
185
+ "2. State assumption 1: name the design pattern or architectural invariant you are asserting, name the predicted corroborating location, and label severity (high or low).",
186
+ "3. State assumption 2: same format.",
187
+ "4. State assumption 3: same format.",
188
+ "5. Verify assumption 1: read the predicted location. Record whether it is confirmed, refuted, or unresolvable.",
189
+ "6. Verify assumption 2: same.",
190
+ "7. Verify assumption 3: same.",
191
+ "8. Determine ambiguityLevel: 'clear' if all three assumptions are confirmed. 'uncertain' if any assumption is refuted or unresolvable.",
192
+ "9. Route on refuted assumptions: for any low-severity refuted assumption, downgrade ambiguityLevel to 'uncertain' and proceed. For any high-severity refuted assumption, call report_issue with kind: 'needs_human' and the specific refuted claim as the message, then stop.",
193
+ "10. Set these keys in the next continue_workflow call's context object: interpretationArtifact (the full artifact object), ambiguityLevel (the string 'clear' or 'uncertain')."
194
+ ],
195
+ "outputRequired": {
196
+ "notesMarkdown": "Summary of the interpretation, all three assumptions with their verification results, and the final ambiguityLevel."
197
+ },
198
+ "verify": [
199
+ "The interpretation sentence appears before the assumption list.",
200
+ "Exactly 3 assumptions are stated, each with a named predicted location and a severity label.",
201
+ "Each assumption is about a design pattern, invariant, or module behavior -- not a trivially checkable fact.",
202
+ "Each assumption has been verified by reading the predicted location.",
203
+ "ambiguityLevel is 'clear' only if all 3 assumptions are confirmed.",
204
+ "Any high-severity refuted assumption triggered report_issue before this step was completed.",
205
+ "interpretationArtifact context key is set with fields: interpretation, assumptionList, ambiguityLevel."
206
+ ]
207
+ },
208
+ "requireConfirmation": false
209
+ },
169
210
  {
170
211
  "id": "phase-0-5-upstream-context",
171
212
  "title": "Phase 0.5: Locate Upstream Context",
@@ -218,7 +259,7 @@
218
259
  },
219
260
  {
220
261
  "id": "phase-1b-design-deep",
221
- "title": "Phase 1b: Design Generation (Injected Routine \u2014 Tension-Driven Design)",
262
+ "title": "Phase 1b: Design Generation (Injected Routine Tension-Driven Design)",
222
263
  "runCondition": {
223
264
  "and": [
224
265
  {
@@ -257,7 +298,7 @@
257
298
  }
258
299
  ]
259
300
  },
260
- "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` \u2014 chosen design with rationale tied to tensions\n- `runnerUpApproach` \u2014 next-best option and why it lost\n- `architectureRationale` \u2014 tensions resolved vs accepted\n- `pivotTriggers` \u2014 conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` \u2014 failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
301
+ "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` chosen design with rationale tied to tensions\n- `runnerUpApproach` next-best option and why it lost\n- `architectureRationale` tensions resolved vs accepted\n- `pivotTriggers` conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
261
302
  "promptFragments": [
262
303
  {
263
304
  "id": "phase-1c-challenge-standard",
@@ -429,7 +470,7 @@
429
470
  "var": "taskComplexity",
430
471
  "not_equals": "Small"
431
472
  },
432
- "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` \u2014 count of open questions that would materially affect implementation quality\n- `planConfidenceBand` \u2014 Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
473
+ "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` count of open questions that would materially affect implementation quality\n- `planConfidenceBand` Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
433
474
  "assessmentRefs": [
434
475
  "plan-completeness-gate",
435
476
  "invariant-clarity-gate",
@@ -543,7 +584,7 @@
543
584
  {
544
585
  "id": "phase-4b-loop-decision",
545
586
  "title": "Loop Exit Decision",
546
- "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop \u2014 but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
587
+ "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
547
588
  "requireConfirmation": true,
548
589
  "outputContract": {
549
590
  "contractRef": "wr.contracts.loop_control"
@@ -706,7 +747,7 @@
706
747
  "id": "phase-8-retrospective",
707
748
  "title": "Phase 8: Retrospective",
708
749
  "requireConfirmation": false,
709
- "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)\n\nBefore completing this step, emit a wr.coding_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.coding_handoff\",\n \"version\": 1,\n \"branchName\": \"<git branch name containing your changes>\",\n \"keyDecisions\": [\"<architectural decision + WHY>\", ...],\n \"knownLimitations\": [\"<known gap or deliberate shortcut>\", ...],\n \"testsAdded\": [\"<test file or test name added>\", ...],\n \"filesChanged\": [\"<primary file path changed>\", ...]\n}\nNote: correctedAssumptions is populated ONLY by fix/retry agents when correcting assumptions from a prior coding session. On a first-run coding session, omit this field entirely.",
750
+ "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)\n\nBefore completing this step, emit a wr.coding_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.coding_handoff\",\n \"version\": 1,\n \"branchName\": \"<git branch name containing your changes>\",\n \"keyDecisions\": [\"<architectural decision + WHY>\", ...],\n \"knownLimitations\": [\"<known gap or deliberate shortcut>\", ...],\n \"testsAdded\": [\"<test file or test name added>\", ...],\n \"filesChanged\": [\"<primary file path changed>\", ...]\n}\nNote: correctedAssumptions is populated ONLY by fix/retry agents when correcting assumptions from a prior coding session. On a first-run coding session, omit this field entirely.\n\n5. **Interpretation accuracy check.** Did this session build what was intended? If there was a gap between what you set out to build and what you actually built, classify it:\n- Subtype A: you misread or misunderstood the ticket (the task description said one thing and you built something else)\n- Subtype B: you had a wrong model of how this codebase works (you understood the task but your assumption about a design pattern, invariant, or module behavior turned out to be incorrect)\n- Unclear: you cannot confidently attribute the gap to one type\n\nIf there was no gap, say so explicitly. Add this classification to `retrospectiveObservations`.",
710
751
  "outputContract": {
711
752
  "contractRef": "wr.contracts.coding_handoff"
712
753
  }
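
For readers wiring consumers to the Phase 8 handoff, the prompt above enumerates the artifact fields in full. A TypeScript sketch with types inferred from that text (the interface name is ours; note that the new interpretation-accuracy classification is recorded as prose in `retrospectiveObservations`, not as a structured field on the artifact):

```ts
// Field names mirror the wr.coding_handoff block in the Phase 8 prompt;
// the TypeScript types are inferred, not published by the package.
interface CodingHandoffArtifact {
  kind: 'wr.coding_handoff';
  version: 1;
  branchName: string;         // git branch containing the changes
  keyDecisions: string[];     // architectural decision + WHY
  knownLimitations: string[]; // known gaps or deliberate shortcuts
  testsAdded: string[];       // test files or test names added
  filesChanged: string[];     // primary file paths changed
  // Set only by fix/retry agents correcting assumptions from a prior coding
  // session; a first-run coding session omits the field entirely.
  correctedAssumptions?: string[];
}
```
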