@exaudeus/workrail 3.75.0 → 3.77.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/dist/console-ui/assets/index-D9pYbwS0.js +28 -0
  2. package/dist/console-ui/index.html +1 -1
  3. package/dist/coordinators/adaptive-pipeline.d.ts +8 -0
  4. package/dist/coordinators/context-assembly.d.ts +4 -0
  5. package/dist/coordinators/context-assembly.js +156 -0
  6. package/dist/coordinators/modes/full-pipeline.d.ts +1 -1
  7. package/dist/coordinators/modes/full-pipeline.js +140 -27
  8. package/dist/coordinators/modes/implement-shared.d.ts +3 -2
  9. package/dist/coordinators/modes/implement-shared.js +16 -6
  10. package/dist/coordinators/modes/implement.js +49 -3
  11. package/dist/coordinators/pipeline-run-context.d.ts +1811 -0
  12. package/dist/coordinators/pipeline-run-context.js +114 -0
  13. package/dist/daemon/context-loader.d.ts +1 -1
  14. package/dist/daemon/core/agent-client.d.ts +7 -0
  15. package/dist/daemon/core/agent-client.js +31 -0
  16. package/dist/daemon/core/index.d.ts +6 -0
  17. package/dist/daemon/core/index.js +19 -0
  18. package/dist/daemon/core/session-context.d.ts +14 -0
  19. package/dist/daemon/core/session-context.js +24 -0
  20. package/dist/daemon/core/session-result.d.ts +10 -0
  21. package/dist/daemon/core/session-result.js +92 -0
  22. package/dist/daemon/core/system-prompt.d.ts +6 -0
  23. package/dist/daemon/core/system-prompt.js +151 -0
  24. package/dist/daemon/io/conversation-log.d.ts +2 -0
  25. package/dist/daemon/io/conversation-log.js +45 -0
  26. package/dist/daemon/io/execution-stats.d.ts +7 -0
  27. package/dist/daemon/io/execution-stats.js +86 -0
  28. package/dist/daemon/io/index.d.ts +5 -0
  29. package/dist/daemon/io/index.js +24 -0
  30. package/dist/daemon/io/session-notes-loader.d.ts +4 -0
  31. package/dist/daemon/io/session-notes-loader.js +45 -0
  32. package/dist/daemon/io/soul-loader.d.ts +3 -0
  33. package/dist/daemon/io/soul-loader.js +68 -0
  34. package/dist/daemon/io/workspace-context-loader.d.ts +17 -0
  35. package/dist/daemon/io/workspace-context-loader.js +137 -0
  36. package/dist/daemon/runner/agent-loop-runner.d.ts +28 -0
  37. package/dist/daemon/runner/agent-loop-runner.js +250 -0
  38. package/dist/daemon/runner/construct-tools.d.ts +5 -0
  39. package/dist/daemon/runner/construct-tools.js +30 -0
  40. package/dist/daemon/runner/finalize-session.d.ts +3 -0
  41. package/dist/daemon/runner/finalize-session.js +75 -0
  42. package/dist/daemon/runner/index.d.ts +8 -0
  43. package/dist/daemon/runner/index.js +18 -0
  44. package/dist/daemon/runner/pre-agent-session.d.ts +7 -0
  45. package/dist/daemon/runner/pre-agent-session.js +227 -0
  46. package/dist/daemon/runner/runner-types.d.ts +73 -0
  47. package/dist/daemon/runner/runner-types.js +39 -0
  48. package/dist/daemon/runner/tool-schemas.d.ts +1 -0
  49. package/dist/daemon/runner/tool-schemas.js +151 -0
  50. package/dist/daemon/session-scope.d.ts +1 -1
  51. package/dist/daemon/startup-recovery.d.ts +20 -0
  52. package/dist/daemon/startup-recovery.js +323 -0
  53. package/dist/daemon/state/index.d.ts +6 -0
  54. package/dist/daemon/state/index.js +14 -0
  55. package/dist/daemon/state/session-state.d.ts +23 -0
  56. package/dist/daemon/state/session-state.js +44 -0
  57. package/dist/daemon/state/stuck-detection.d.ts +22 -0
  58. package/dist/daemon/state/stuck-detection.js +25 -0
  59. package/dist/daemon/state/terminal-signal.d.ts +9 -0
  60. package/dist/daemon/state/terminal-signal.js +10 -0
  61. package/dist/daemon/tools/file-tools.d.ts +1 -1
  62. package/dist/daemon/turn-end/detect-stuck.d.ts +2 -2
  63. package/dist/daemon/turn-end/detect-stuck.js +2 -2
  64. package/dist/daemon/turn-end/step-injector.d.ts +1 -1
  65. package/dist/daemon/types.d.ts +105 -0
  66. package/dist/daemon/types.js +11 -0
  67. package/dist/daemon/workflow-enricher.d.ts +16 -0
  68. package/dist/daemon/workflow-enricher.js +58 -0
  69. package/dist/daemon/workflow-runner.d.ts +13 -277
  70. package/dist/daemon/workflow-runner.js +63 -1421
  71. package/dist/manifest.json +280 -56
  72. package/dist/trigger/coordinator-deps.d.ts +1 -1
  73. package/dist/trigger/coordinator-deps.js +131 -0
  74. package/dist/trigger/delivery-client.d.ts +1 -1
  75. package/dist/trigger/delivery-pipeline.d.ts +1 -1
  76. package/dist/trigger/notification-service.d.ts +1 -1
  77. package/dist/trigger/trigger-listener.js +6 -2
  78. package/dist/trigger/trigger-router.d.ts +2 -2
  79. package/dist/v2/durable-core/domain/artifact-contract-validator.js +99 -0
  80. package/dist/v2/durable-core/schemas/artifacts/discovery-handoff.d.ts +39 -0
  81. package/dist/v2/durable-core/schemas/artifacts/discovery-handoff.js +10 -1
  82. package/dist/v2/durable-core/schemas/artifacts/index.d.ts +2 -1
  83. package/dist/v2/durable-core/schemas/artifacts/index.js +12 -1
  84. package/dist/v2/durable-core/schemas/artifacts/phase-handoff.d.ts +89 -0
  85. package/dist/v2/durable-core/schemas/artifacts/phase-handoff.js +56 -0
  86. package/docs/authoring-v2.md +12 -0
  87. package/docs/ideas/backlog.md +639 -25
  88. package/docs/reference/worktrain-daemon-invariants.md +33 -49
  89. package/docs/vision.md +5 -15
  90. package/package.json +2 -2
  91. package/workflows/coding-task-workflow-agentic.json +9 -6
  92. package/workflows/mr-review-workflow.agentic.v2.json +2 -2
  93. package/workflows/wr.discovery.json +2 -1
  94. package/workflows/wr.shaping.json +7 -4
  95. package/dist/console-ui/assets/index-BvBihscd.js +0 -28
package/docs/reference/worktrain-daemon-invariants.md CHANGED
@@ -14,7 +14,7 @@ See also: `tests/unit/workflow-runner-outcome-invariants.test.ts` -- the test fi

  **Why:** `'unknown'` in `execution-stats.jsonl` is silent data loss. Operators calibrate session timeouts and monitor health from this data.

- **How it breaks:** The `writeExecutionStats()` helper takes `outcome` by value. If called with a variable that hasn't been assigned yet, it silently records `'unknown'`. Every result path must call `writeExecutionStats()` with the correct outcome at the call site, not via a shared variable captured in a closure.
+ **How it breaks:** `writeExecutionStats()` takes `outcome` by value. If called with an unassigned variable, it silently records `'unknown'`. All result paths go through `finalizeSession()`, which calls `tagToStatsOutcome()` to derive the outcome -- there are no direct `writeExecutionStats()` calls outside `finalizeSession()`.

  ### 1.2 `delivery_failed` is never returned by `runWorkflow()` directly

@@ -32,13 +32,13 @@ See also: `tests/unit/workflow-runner-outcome-invariants.test.ts` -- the test fi
  | `'stuck'` | `'stuck'` |
  | `'delivery_failed'` | `'success'` (workflow succeeded; only the POST failed) |

- This mapping must be exhaustive. When `tagToStatsOutcome()` is extracted as a pure function (planned in the functional-core/imperative-shell refactor), it must use `assertNever` on the default case so the compiler enforces exhaustiveness.
+ This mapping is exhaustive. `tagToStatsOutcome()` is a pure function in `workflow-runner.ts` that uses `assertNever` on the default case -- the compiler enforces exhaustiveness when new `_tag` variants are added.
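For orientation, a minimal TypeScript sketch of this mapping. Only the `'stuck'` and `'delivery_failed'` rows are confirmed by the table above; the other variants and exact type names are assumptions.

```typescript
// Hedged sketch -- not the real workflow-runner.ts source. Variants beyond
// 'stuck' and 'delivery_failed' are assumed for illustration.
type ResultTag = 'success' | 'error' | 'timeout' | 'stuck' | 'delivery_failed';
type StatsOutcome = 'success' | 'error' | 'timeout' | 'stuck';

function assertNever(x: never): never {
  throw new Error(`Unhandled _tag: ${String(x)}`);
}

function tagToStatsOutcome(tag: ResultTag): StatsOutcome {
  switch (tag) {
    case 'success':
      return 'success';
    case 'error':
      return 'error';
    case 'timeout':
      return 'timeout';
    case 'stuck':
      return 'stuck';
    case 'delivery_failed':
      return 'success'; // workflow succeeded; only the POST failed
    default:
      return assertNever(tag); // compile error when a new _tag variant is added
  }
}
```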

  ### 1.4 Outcome priority when multiple signals fire

- If both `stuckReason` and `timeoutReason` are non-null at the same time (same turn), `stuck` takes priority over `timeout`. This is intentional: stuck is the more specific signal (the agent is looping, not just slow), and fires before the wall-clock limit.
+ `stuck` takes priority over `timeout`. This is enforced structurally by `TerminalSignal` and `setTerminalSignal()`: `setTerminalSignal()` is first-writer-wins -- the first signal to set `state.terminalSignal` wins, and subsequent calls are silent no-ops. Because stuck detection fires inside the turn-end subscriber (which runs before the wall-clock timeout handler), stuck always sets `terminalSignal` first when both conditions are present in the same turn.

- **Code location:** The `if (stuckReason !== null)` check precedes `if (timeoutReason !== null)` in `runWorkflow()`.
+ **Code location:** `setTerminalSignal()` in `workflow-runner.ts`. `buildSessionResult()` reads `state.terminalSignal` after the loop exits.
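A minimal sketch of the first-writer-wins latch, assuming this shape for `TerminalSignal` and the session state; the real types may carry more fields.

```typescript
// Hedged sketch of the first-writer-wins terminal signal latch.
type TerminalSignal =
  | { kind: 'stuck'; reason: string }
  | { kind: 'timeout'; reason: string };

interface SessionState {
  terminalSignal: TerminalSignal | null;
}

function setTerminalSignal(state: SessionState, signal: TerminalSignal): void {
  if (state.terminalSignal !== null) {
    return; // first writer wins; later signals are silent no-ops
  }
  state.terminalSignal = signal;
}

// Because the turn-end subscriber runs before the wall-clock timeout handler,
// a same-turn stuck signal lands here first and the timeout call is a no-op.
```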

  ### 1.5 stepCount reflects agent-loop advances only

@@ -56,7 +56,7 @@ Each `runWorkflow()` call writes a per-session sidecar file at `~/.workrail/daem

  `persistTokens()` returns `Promise<Result<void, PersistTokensError>>` (not throws). Callers in the setup phase treat `err` as fatal (abort); callers inside tool closures treat `err` as degraded-but-continue (log and still call `onAdvance`/`onTokenUpdate` -- see invariant 4.3).
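A sketch of the two caller disciplines described above. The `Result` shape and both call-site functions are illustrative assumptions; only the fatal-vs-degraded split is stated by the doc.

```typescript
// Hedged sketch: assumed Result shape and signatures, not the real API.
type Result<T, E> = { ok: true; value: T } | { ok: false; error: E };
type PersistTokensError = { message: string };

declare function persistTokens(
  continueToken: string
): Promise<Result<void, PersistTokensError>>;

// Setup phase: a persistence error is fatal -- abort the session.
async function setupPhase(continueToken: string): Promise<void> {
  const res = await persistTokens(continueToken);
  if (!res.ok) {
    throw new Error(`sidecar persist failed: ${res.error.message}`);
  }
}

// Tool closure: a persistence error is degraded-but-continue -- log it,
// then still advance (invariant 4.3).
async function onToolTurn(
  continueToken: string,
  onAdvance: () => void
): Promise<void> {
  const res = await persistTokens(continueToken);
  if (!res.ok) {
    console.warn('sidecar persist failed; continuing', res.error.message);
  }
  onAdvance();
}
```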

- **Exception:** If `continueToken` is undefined (instant single-step completion, or `_preAllocatedStartResponse` with no token), `persistTokens()` is skipped. There is nothing to recover.
+ **Exception:** If `continueToken` is undefined (instant single-step completion, or a `pre_allocated` `SessionSource` with no token), `persistTokens()` is skipped. There is nothing to recover.

  ### 2.2 Sidecar is deleted on every non-worktree terminal path

@@ -88,33 +88,34 @@ Since Phase B crash recovery (PR #811), `persistTokens()` also writes `workflowI

  ## 3. Registry invariants

- Three registries track in-flight daemon sessions:
+ Two registries track in-flight daemon sessions:

  | Registry | Key | Value | Purpose |
  |---|---|---|---|
  | `DaemonRegistry` | `workrailSessionId` | `{ workflowId, lastHeartbeatMs }` | Console `isLive` display |
- | `SteerRegistry` | `workrailSessionId` | `(text: string) => void` | Mid-session coordinator injection |
- | `AbortRegistry` | `workrailSessionId` | `() => void` | SIGTERM graceful shutdown |
+ | `ActiveSessionSet` | `workrailSessionId` | `SessionHandle` | Steer injection + SIGTERM abort |
+
+ `ActiveSessionSet` + `SessionHandle` (in `src/daemon/active-sessions.ts`) replaced the former separate `SteerRegistry` and `AbortRegistry` maps. A `SessionHandle` exposes `steer()`, `setAgent()`, `abort()`, and `dispose()` -- all session lifecycle operations on a single object.
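A hypothetical sketch of the `SessionHandle` shape described above; the real `src/daemon/active-sessions.ts` may differ in names and details.

```typescript
// Hedged sketch -- illustrative shape, not the real active-sessions.ts source.
interface AgentLike {
  abort(): void;
}

class SessionHandleImpl {
  private agent: AgentLike | null = null;

  constructor(
    private readonly sessions: Map<string, SessionHandleImpl>,
    private readonly workrailSessionId: string,
    private readonly injectSteer: (text: string) => void
  ) {}

  steer(text: string): void {
    this.injectSteer(text); // mid-session coordinator injection
  }

  setAgent(agent: AgentLike): void {
    this.agent = agent; // wires in abort capability after AgentLoop construction
  }

  abort(): void {
    this.agent?.abort(); // null check: abort() before setAgent() is a safe no-op
  }

  dispose(): void {
    // Remove from ActiveSessionSet so size decrements and shutdown drain ends.
    this.sessions.delete(this.workrailSessionId);
  }
}
```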

  ### 3.1 Registry registration and deregistration

- **Registration** happens in two places:
+ **Registration** happens in two phases:

- - `steerRegistry` and `DaemonRegistry` are registered inside `buildPreAgentSession()` -- AFTER all potentially-failing I/O (executeStartWorkflow, persistTokens, worktree creation). Error paths that return before registration have nothing to clean up. The single-step completion path (which returns success without running an agent loop) explicitly calls `steerRegistry.delete()` and `daemonRegistry.unregister()` before returning.
+ - `DaemonRegistry` and `ActiveSessionSet` are registered inside `buildPreAgentSession()` -- AFTER all potentially-failing I/O (executeStartWorkflow, persistTokens, worktree creation). Error paths that return before this point have nothing to clean up. The single-step completion path goes through `finalizeSession()` (which calls `daemonRegistry.unregister()`) and returns the handle via `PreAgentSessionResult` so the caller (`runWorkflow()`) can call `handle.dispose()`.

- - `abortRegistry` is registered in `runWorkflow()` immediately after `const agent = new AgentLoop(...)`. The closure `() => agent.abort()` references `agent` -- registering before agent construction would be a TDZ hazard.
+ - `handle.setAgent(agent)` is called in `buildAgentReadySession()` immediately after `const agent = new AgentLoop(...)`. This wires in abort capability. `abort()` before `setAgent()` is a safe no-op -- the TDZ hazard is eliminated by the null check inside `SessionHandleImpl.abort()`.

  **Deregistration**:

- - `steerRegistry.delete()` and `abortRegistry.delete()` are called in the `finally` block of `runWorkflow()`. This ensures cleanup happens even if an exception is thrown in the agent loop.
+ - `handle.dispose()` is called in the `finally` block of `runAgentLoop()`. This removes the handle from `ActiveSessionSet` so `size` decrements correctly and shutdown drain terminates.

- - `daemonRegistry.unregister()` is called at each result path (success, error, timeout, stuck) via `finalizeSession()`. It is NOT in `finally` because the completion status ('completed' vs 'failed') differs by path.
+ - `daemonRegistry.unregister()` is called via `finalizeSession()` at both result paths (early-exit and post-agent-loop). It is NOT in `finally` because the completion status ('completed' vs 'failed') differs by result.

- **Why stale entries are bugs:** A stale steer callback on a dead session makes `POST /sessions/:id/steer` return 200 (calling the closed-over callback) instead of 404. A stale abort callback makes the shutdown handler call `abort()` on an already-exited session. Both are silent correctness bugs.
+ **Why stale entries are bugs:** A stale steer handle on a dead session makes `POST /sessions/:id/steer` return 200 instead of 404. A stale abort handle makes the shutdown handler call `abort()` on an already-exited session. Both are silent correctness bugs.

  ### 3.2 `DaemonRegistry` is unregistered at every result path

- `daemonRegistry.unregister(workrailSessionId, 'completed' | 'failed')` is called at each of the four result paths (success, error, timeout, stuck). It is NOT in the `finally` block because the completion status ('completed' vs 'failed') differs by path.
+ `daemonRegistry.unregister(workrailSessionId, 'completed' | 'failed')` is called via `finalizeSession()` at both the early-exit path and the post-agent-loop path. It is NOT in `finally` because the completion status differs by result.

  ### 3.3 `workrailSessionId` is available before registry operations

@@ -124,11 +125,11 @@ If `parseContinueTokenOrFail()` fails (unusual -- the token just came from `exec

  ### 3.4 Registration gap is documented

- **SteerRegistry gap (~50ms):** There is a ~50ms window between `executeStartWorkflow()` returning and `steerRegistry.set()` being called (after `parseContinueTokenOrFail()` completes). A `POST /sessions/:id/steer` call in this window receives 404. Coordinators should retry once on 404 during session startup.
+ **Steer gap (~50ms):** There is a ~50ms window between `executeStartWorkflow()` returning and `activeSessionSet.register()` being called (after `parseContinueTokenOrFail()` completes). A `POST /sessions/:id/steer` call in this window receives 404. Coordinators should retry once on 404 during session startup.
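A sketch of the retry-once-on-404 recommended above for coordinators. The endpoint path comes from the doc; the payload shape and retry delay are assumptions.

```typescript
// Hedged sketch: payload shape and the 100ms delay are illustrative only.
async function steerWithStartupRetry(
  baseUrl: string,
  sessionId: string,
  text: string
): Promise<boolean> {
  const post = () =>
    fetch(`${baseUrl}/sessions/${sessionId}/steer`, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ text }),
    });

  let res = await post();
  if (res.status === 404) {
    // Likely inside the ~50ms registration gap -- retry once.
    await new Promise((resolve) => setTimeout(resolve, 100));
    res = await post();
  }
  return res.ok;
}
```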

- **AbortRegistry gap (~200-500ms):** `abortRegistry.set()` is registered _after_ `const agent = new AgentLoop(...)` is constructed, which happens after the context-loading phase (`loadDaemonSoul`, `loadWorkspaceContext`, `loadSessionNotes` in parallel). This means there is a ~200-500ms window where SIGTERM will not abort an in-flight session. Sessions in this window run to completion or hit the wall-clock timeout.
+ **Abort gap (~200-500ms):** `handle.setAgent(agent)` is called after `const agent = new AgentLoop(...)` is constructed, which happens after the context-loading phase (`loadDaemonSoul`, `loadWorkspaceContext`, `loadSessionNotes` in parallel). During this window, `handle.abort()` is a safe no-op -- SIGTERM will not abort the session. Sessions in this window run to completion or hit the wall-clock timeout.

- **Why the abort gap is wider than the steer gap:** `abortRegistry.set` registers `() => agent.abort()` which closes over `agent`. Registering this callback before `agent` is constructed would be a TDZ (Temporal Dead Zone) hazard -- `agent` is declared with `const` and would not yet be initialized if the shutdown handler fired on an early-exit path. Registering after `agent` construction eliminates the hazard at the cost of a wider registration window. The accepted tradeoff is the same as for the steer gap.
+ **Why the abort gap is wider than the steer gap:** `setAgent()` must be called after `agent` construction. Calling it before would be a TDZ hazard. The `SessionHandleImpl.abort()` null-checks `_agent`, making pre-`setAgent()` abort a safe no-op rather than a crash.

  ---

@@ -160,7 +161,7 @@ Both are guarded by the sequential tool execution invariant (no concurrent token

  All three stuck detection signals (`repeated_tool_call`, `no_progress`, `timeout_imminent`) emit `agent_stuck` events via `emitter?.emit()`, which is fire-and-forget. An event write failure never affects the session.

- Signals 1 and 2 abort the session (set `stuckReason`) subject to `stuckAbortPolicy`. Signal 3 (`timeout_imminent`) is purely observational -- the abort has already been triggered by the timeout handler.
+ Signals 1 and 2 call `setTerminalSignal(state, { kind: 'stuck', reason: ... })` subject to `stuckAbortPolicy`. Signal 3 (`timeout_imminent`) is purely observational -- the abort has already been triggered by the timeout handler.
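A sketch of a turn-end stuck check under the behavior described above. The policy values and all names other than `setTerminalSignal` and the signal names are assumptions.

```typescript
// Hedged sketch -- illustrative wiring, not the real turn-end/detect-stuck.ts.
type StuckSignalKind = 'repeated_tool_call' | 'no_progress';

interface StuckState {
  terminalSignal: { kind: 'stuck' | 'timeout'; reason: string } | null;
}

interface Emitter {
  emit(event: string, payload: unknown): void;
}

// Assumed to behave like the first-writer-wins latch sketched in section 1.4.
declare function setTerminalSignal(
  state: StuckState,
  signal: { kind: 'stuck'; reason: string }
): void;

function onTurnEndStuckCheck(
  state: StuckState,
  signal: StuckSignalKind,
  stuckAbortPolicy: 'abort' | 'observe',
  emitter?: Emitter
): void {
  // Fire-and-forget: an event write failure never affects the session.
  emitter?.emit('agent_stuck', { signal });

  // Signals 1 and 2 set the terminal signal, subject to policy; the third
  // signal, 'timeout_imminent', is observational only and not handled here.
  if (stuckAbortPolicy === 'abort') {
    setTerminalSignal(state, { kind: 'stuck', reason: signal });
  }
}
```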

  ### 4.5 `spawn_agent` depth is enforced at the call site

@@ -204,7 +205,7 @@ On failure/timeout/stuck paths, the worktree is left in place for debugging. `ru

  ### 6.3 Sessions with >= 1 step advance are resumed if sidecar has trigger context

- `evaluateRecovery({ stepAdvances: >= 1 })` returns `'resume'`. If the sidecar contains `workflowId` and `workspacePath`, `runStartupRecovery()` calls `executeContinueWorkflow({ intent: 'rehydrate' })` to get the current step prompt, builds a minimal `WorkflowTrigger` with `_preAllocatedStartResponse`, and calls `runWorkflow()` fire-and-forget.
+ `evaluateRecovery({ stepAdvances: >= 1 })` returns `'resume'`. If the sidecar contains `workflowId` and `workspacePath`, `runStartupRecovery()` calls `executeContinueWorkflow({ intent: 'rehydrate' })` to get the current step prompt, builds a minimal `WorkflowTrigger` and a `pre_allocated` `SessionSource`, and calls `runWorkflow()` fire-and-forget.

  **Old-format sidecars** (missing `workflowId`/`workspacePath`) fall through to discard regardless of step count.
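A minimal sketch of the recovery decision as stated here. Whether the trigger-context check lives in `evaluateRecovery()` or in `runStartupRecovery()` is not specified; this sketch folds both rules into one function for illustration.

```typescript
// Hedged sketch of the recovery decision -- startup-recovery.ts may differ.
interface SidecarSnapshot {
  stepAdvances: number;
  workflowId?: string;
  workspacePath?: string;
}

type RecoveryDecision = 'resume' | 'discard';

function evaluateRecovery(sidecar: SidecarSnapshot): RecoveryDecision {
  // Old-format sidecars fall through to discard regardless of step count.
  if (sidecar.workflowId === undefined || sidecar.workspacePath === undefined) {
    return 'discard';
  }
  return sidecar.stepAdvances >= 1 ? 'resume' : 'discard';
}
```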
@@ -214,32 +215,15 @@ Worktree sessions that are resumed set `branchStrategy: 'none'` and use the pers

  ---

- ## 7. Planned refactor: functional core / imperative shell
-
- The invariants above are currently enforced by convention (comments, code structure) rather than by the type system. The planned refactor will make them structurally enforced:
-
- **Core (pure functions, no I/O):**
- - `buildSessionConfig(trigger) SessionConfig` -- model, tools, limits, prompts
- - `evaluateAgentExitState(exitState) → WorkflowRunResult` -- replaces 4 scattered return sites
- - `tagToStatsOutcome(tag) → StatsOutcome` -- exhaustive via `assertNever`
- - `evaluateStuck(signals) StuckSignal | null` -- already nearly pure
-
- **Shell (one cleanup site for all I/O):**
- ```typescript
- async function runWorkflow(trigger, ctx, apiKey, ...): Promise<WorkflowRunResult> {
-   const startMs = Date.now();
-   const result = await _runWorkflowCore(trigger, ctx, apiKey, ...);
-   // All I/O in one place:
-   writeExecutionStats(statsDir, ..., tagToStatsOutcome(result._tag), result.stepCount);
-   await cleanupSidecar(sessionId, result._tag, trigger.branchStrategy);
-   emitSessionCompleted(emitter, sessionId, result._tag);
-   daemonRegistry?.unregister(workrailSessionId, result._tag === 'success' ? 'completed' : 'failed');
-   return result.workflowRunResult;
- }
- ```
-
- After the refactor, adding a new result path requires:
- 1. Adding it to the `WorkflowRunResult` union (compiler enforces exhaustiveness in `tagToStatsOutcome` via `assertNever`)
- 2. Returning the new variant from `_runWorkflowCore` (no I/O to add at the return site)
-
- The current pattern requires manually adding `writeExecutionStats()`, sidecar deletion, event emission, and registry deregistration at each new return site -- easily forgotten.
+ ## 7. Structural enforcement summary
+
+ The invariants above are enforced by a combination of type system guarantees and code structure:
+
+ - `tagToStatsOutcome()` -- pure function with `assertNever` default; compiler error on unhandled `_tag`
+ - `sidecardLifecycleFor()` -- pure function with `assertNever` default; compiler error on unhandled `_tag`
+ - `buildSessionResult()` -- pure function; reads `state.terminalSignal` after loop exits
+ - `finalizeSession()` -- single cleanup site for all result paths (event emission, registry cleanup, stats, sidecar deletion)
+ - `setTerminalSignal()` -- first-writer-wins; structurally prevents dual stuck+timeout state
+ - `SessionHandle` -- encapsulates steer/abort lifecycle; `abort()` before `setAgent()` is a safe no-op
+
+ Adding a new `WorkflowRunResult` variant requires updating `tagToStatsOutcome()` and `sidecardLifecycleFor()` -- the compiler enforces both via `assertNever`. No I/O needs to be added at the new return site.
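A hedged sketch of the single-cleanup-site shape that `finalizeSession()` is described as providing; parameter and dependency names are illustrative, not the real signature.

```typescript
// Hedged sketch -- illustrative only; the real finalizeSession() differs.
type ResultTag = 'success' | 'error' | 'timeout' | 'stuck' | 'delivery_failed';

// Assumed to be the exhaustive pure mapping sketched in section 1.3.
declare function tagToStatsOutcome(tag: ResultTag): string;

interface FinalizeDeps {
  writeExecutionStats(outcome: string, stepCount: number): void;
  cleanupSidecar(tag: ResultTag): Promise<void>;
  emitSessionCompleted(tag: ResultTag): void;
  daemonRegistry?: {
    unregister(id: string, status: 'completed' | 'failed'): void;
  };
}

async function finalizeSession(
  workrailSessionId: string,
  result: { _tag: ResultTag; stepCount: number },
  deps: FinalizeDeps
): Promise<void> {
  // All terminal I/O in one place, so a new result path cannot forget a step:
  deps.writeExecutionStats(tagToStatsOutcome(result._tag), result.stepCount);
  await deps.cleanupSidecar(result._tag);
  deps.emitSessionCompleted(result._tag);
  deps.daemonRegistry?.unregister(
    workrailSessionId,
    result._tag === 'success' ? 'completed' : 'failed'
  );
}
```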
package/docs/vision.md CHANGED
@@ -14,7 +14,7 @@ WorkTrain runs the workrail repository as one of its own workspaces. It picks up

  This creates a direct feedback loop: if WorkTrain's development pipeline is flawed, it will produce flawed changes to itself and catch them in review. If its context injection is thin, it will miss things in its own codebase that a well-briefed agent would catch. The quality of WorkTrain's output is the quality of WorkTrain.

- The self-improvement loop is not fully operational today. The pieces -- coordinator session chaining, full development pipeline, spec as ground truth, living work context -- are being built. But it is the north star. If WorkTrain cannot build WorkTrain well, it cannot be trusted to build anything else.
+ The self-improvement loop is not fully operational today, but it is the north star. If WorkTrain cannot build WorkTrain well, it cannot be trusted to build anything else.

  ## What success looks like

@@ -34,7 +34,7 @@ WorkTrain earns trust over time by doing this correctly, repeatedly, at scale --

  **Zero LLM turns for routing.** Coordinator decisions -- what workflow to run next, whether findings are blocking, when to merge -- are deterministic TypeScript code. LLM turns are used for cognitive work: understanding code, writing code, evaluating findings. Never for deciding "what do I do next?".

- **Structured outputs at every boundary.** Each phase produces a typed result. The next phase reads that result. Free-text scraping between phases is a design smell. `ChildSessionResult`, `wr.coordinator_result`, `wr.review_verdict` are the contracts that make phases composable without a main agent holding context.
+ **Structured outputs at every boundary.** Each phase produces a typed result. The next phase reads that result. Free-text scraping between phases is a design smell. Typed contracts at phase boundaries are what make phases composable without a main agent holding context.

  **Correctness over speed.** WorkTrain does not merge changes it is not confident in. Review findings are addressed. Tests pass. The right next step is not always the fastest one.

@@ -88,18 +88,6 @@ WorkTrain does not pause for: implementation decisions within a well-specified t

  This boundary is still being tested and refined through real usage. Where exactly "genuine ambiguity" begins is an open question.

- ## What is still being built
-
- WorkTrain is not finished. The vision above is where it is going, not where it is today. Key pieces still in progress:
-
- - **Living work context** -- shared knowledge store that accumulates across all phases so every agent starts informed (`docs/ideas/backlog.md`: "Living work context")
- - **Coordinator pipeline templates** -- actual coordinator scripts for full development pipeline, bug-fix, grooming (`docs/ideas/backlog.md`: "Scripts-first coordinator")
- - **`worktrain spawn`/`await` CLI** -- CLI surface for coordinator scripts
- - **Knowledge graph** -- per-workspace structural understanding so agents skip discovery on repeated tasks
- - **Spec as ground truth** -- wiring `wr.shaping` output into coordinator dispatch so coding/review agents work from the same spec
-
- For the current prioritized list, see `npm run backlog` or `docs/ideas/backlog.md`.
-
  ## Open questions

  These are genuinely unresolved. Any agent operating in this system should know they exist and not assume they are answered.

@@ -112,4 +100,6 @@ These are genuinely unresolved. Any agent operating in this system should know t

  - **What is the right granularity of tasks?** WorkTrain is being designed for ticket-sized work. Whether it handles epics (by decomposing them), hotfixes (by moving fast and deferring thoroughness), and architectural changes (which may require multiple sessions across multiple days) the same way is untested.

- - **Is "document" the right abstraction for the living work context?** A flat document implies agents read it linearly. Agents need to query it selectively -- the coding agent wants constraints relevant to a specific decision, the review agent wants what the coding agent said about a specific module. A structured knowledge store (typed facts, queryable by topic) may be more useful than a document. See `docs/ideas/backlog.md`: "Living work context".
+ - **Is typed-artifact-per-phase the right abstraction for inter-phase context?** The current model threads structured handoff artifacts between pipeline phases. Whether this is sufficient long-term, or whether a queryable per-workspace knowledge store (indexed by topic, accessible across pipeline runs and across tasks) is needed for things like codebase-specific priors and accumulated project memory, is an open question. See `docs/ideas/backlog.md`: "Knowledge graph".
+
+ For current priorities and status, run `npm run backlog` or read `docs/ideas/backlog.md`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@exaudeus/workrail",
- "version": "3.75.0",
+ "version": "3.77.0",
  "description": "Step-by-step workflow enforcement for AI agents via MCP",
  "license": "MIT",
  "repository": {
@@ -91,7 +91,7 @@
  },
  "dependencies": {
  "@anthropic-ai/bedrock-sdk": "^0.29.0",
- "@anthropic-ai/sdk": "^0.91.1",
+ "@anthropic-ai/sdk": "^0.94.0",
  "@modelcontextprotocol/sdk": "^1.24.0",
  "@scure/base": "2.2.0",
  "ajv": "^8.17.1",
package/workflows/coding-task-workflow-agentic.json CHANGED
@@ -143,7 +143,7 @@
  "SUBAGENT SYNTHESIS: treat subagent output as evidence, not conclusions. State your hypothesis before delegating, then interrogate what came back: what was missed, wrong, or new? Say what changed your mind or what you still reject, and why.",
  "PARALLELISM: when reads, audits, or delegations are independent, run them in parallel inside the phase. Parallelize cognition; serialize synthesis and canonical writes.",
  "PHILOSOPHY LENS: apply the user's coding philosophy (from active session rules) as the evaluation lens. Flag violations by principle name, not as generic feedback. If principles conflict, surface the tension explicitly instead of silently choosing.",
- "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness in that order of reliability.",
+ "VALIDATION: prefer static/compile-time safety over runtime checks. Use build, type-checking, and tests as the primary proof of correctness \u2014 in that order of reliability.",
  "DRIFT HANDLING: when reality diverges from the plan, update the plan artifact and re-audit deliberately rather than accumulating undocumented drift.",
  "NEVER COMMIT MARKDOWN FILES UNLESS USER EXPLICITLY ASKS.",
  "SLICE DISCIPLINE: Phase 6 is a loop -- implement ONE slice per iteration. Do not implement multiple slices at once. The verification loop exists to catch drift per slice, not retroactively."
@@ -218,7 +218,7 @@
  },
  {
  "id": "phase-1b-design-deep",
- "title": "Phase 1b: Design Generation (Injected Routine Tension-Driven Design)",
+ "title": "Phase 1b: Design Generation (Injected Routine \u2014 Tension-Driven Design)",
  "runCondition": {
  "and": [
  {
@@ -257,7 +257,7 @@
  }
  ]
  },
- "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` chosen design with rationale tied to tensions\n- `runnerUpApproach` next-best option and why it lost\n- `architectureRationale` tensions resolved vs accepted\n- `pivotTriggers` conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
+ "prompt": "Read `design-candidates.md`, compare it to your original guess, and make the call.\n\nBe explicit about three things:\n- what the design work confirmed\n- what changed your mind\n- what you missed the first time\n\nThen pressure-test the leading option:\n- what's the strongest case against it?\n- what assumption breaks it?\n\nAfter the challenge batch, say:\n- what changed your mind\n- what didn't\n- which findings you reject and why\n\nPick the approach yourself. Don't hide behind the artifact. If the simplest thing works, prefer it. If the front-runner stops looking right after challenge, switch.\n\nCapture:\n- `selectedApproach` \u2014 chosen design with rationale tied to tensions\n- `runnerUpApproach` \u2014 next-best option and why it lost\n- `architectureRationale` \u2014 tensions resolved vs accepted\n- `pivotTriggers` \u2014 conditions under which you'd switch to the runner-up\n- `keyRiskToMonitor` \u2014 failure mode of the selected approach\n- `acceptedTradeoffs`\n- `identifiedFailureModes`",
  "promptFragments": [
  {
  "id": "phase-1c-challenge-standard",
@@ -429,7 +429,7 @@
  "var": "taskComplexity",
  "not_equals": "Small"
  },
- "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` count of open questions that would materially affect implementation quality\n- `planConfidenceBand` Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
+ "prompt": "Turn the decision into a plan someone else could execute without guessing.\n\n**Open questions gate:** check `openQuestions` from Phase 0. If any remain unanswered and would materially affect implementation quality, either resolve them now with tools or record them in the risk register with an explicit decision about how to proceed without them. Do not silently carry unanswered questions into implementation.\n\nUpdate `implementation_plan.md`.\n\nIt should cover:\n1. Problem statement\n2. Acceptance criteria (mirror `spec.md` if it exists; `spec.md` owns observable behavior)\n3. Non-goals\n4. Philosophy-driven constraints\n5. Invariants\n6. Selected approach + rationale + runner-up\n7. Vertical slices\n8. Work packages only if they actually help\n9. Test design\n10. Risk register\n11. PR packaging strategy\n12. Philosophy alignment per slice:\n - [principle] -> [satisfied / tension / violated + 1-line why]\n\nCapture:\n- `implementationPlan`\n- `slices`\n- `testDesign`\n- `estimatedPRCount`\n- `followUpTickets` (initialize if needed)\n- `unresolvedUnknownCount` \u2014 count of open questions that would materially affect implementation quality\n- `planConfidenceBand` \u2014 Low / Medium / High\n\nThe plan is the deliverable for this step. Do not implement anything -- not a \"quick win\", not a file read that bleeds into edits, nothing. Execution begins in Phase 6, one slice at a time. If you find yourself writing code or editing source files right now, stop immediately.",
  "assessmentRefs": [
  "plan-completeness-gate",
  "invariant-clarity-gate",
@@ -543,7 +543,7 @@
  {
  "id": "phase-4b-loop-decision",
  "title": "Loop Exit Decision",
- "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
+ "prompt": "Decide whether the plan needs another pass.\n\nIf `planFindings` is non-empty, keep going.\nIf it's empty, stop \u2014 but say what you checked so the clean pass means something.\nIf you've hit the limit, stop and record what still bothers you.\n\nThen emit the required loop-control artifact in this shape (`decision` must be `continue` or `stop`):\n```json\n{\n \"artifacts\": [{\n \"kind\": \"wr.loop_control\",\n \"decision\": \"continue\"\n }]\n}\n```",
  "requireConfirmation": true,
  "outputContract": {
  "contractRef": "wr.contracts.loop_control"
@@ -706,7 +706,10 @@
  "id": "phase-8-retrospective",
  "title": "Phase 8: Retrospective",
  "requireConfirmation": false,
- "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)"
+ "prompt": "The implementation is done and verified. Now look back.\n\nThis is not a re-run of tests. It is a short honest look at the work you just did.\n\nAsk yourself:\n\n1. **What would you do differently?** Now that the implementation is real, what approach, boundary, or decision looks wrong in hindsight?\n\n2. **What adjacent problems did this reveal?** Did the implementation expose gaps, tech debt, or fragile assumptions in the surrounding code that were not in scope but are worth noting?\n\n3. **What follow-up work is now visible?** What is the natural next step that became clear only after doing this work?\n\n4. **What was harder or easier than expected?** Were there surprises -- good or bad -- that would change how similar tasks are approached next time?\n\nProduce 2-4 concrete observations. Each should be specific enough to act on.\n\nFor each observation:\n- **File as follow-up**: add to backlog or open a ticket if it warrants tracking\n- **Accept**: note it explicitly if it is a known limitation you are consciously leaving\n- **Fix now**: if it is small and low-risk, fix it before closing\n\nCapture:\n- `retrospectiveObservations`: list of observations with disposition (filed/accepted/fixed)\n- `followUpTickets`: any new tickets created (append to existing list)\n\nBefore completing this step, emit a wr.coding_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.coding_handoff\",\n \"version\": 1,\n \"branchName\": \"<git branch name containing your changes>\",\n \"keyDecisions\": [\"<architectural decision + WHY>\", ...],\n \"knownLimitations\": [\"<known gap or deliberate shortcut>\", ...],\n \"testsAdded\": [\"<test file or test name added>\", ...],\n \"filesChanged\": [\"<primary file path changed>\", ...]\n}\nNote: correctedAssumptions is populated ONLY by fix/retry agents when correcting assumptions from a prior coding session. On a first-run coding session, omit this field entirely.",
+ "outputContract": {
+ "contractRef": "wr.contracts.coding_handoff"
+ }
  }
  ],
  "validatedAgainstSpecVersion": 3
@@ -86,7 +86,7 @@
86
86
  {
87
87
  "id": "phase-0-understand-and-classify",
88
88
  "title": "Phase 0: Locate, Bound, Enrich & Classify",
89
- "prompt": "Build the review foundation in one pass.\n\nStep 1 \u2014 Early exit / minimum inputs:\nBefore exploring, verify that the review target is real and inspectable. If the diff, changed files, or equivalent review material are completely absent and cannot be inferred with tools, ask for the minimum missing artifact and stop. Do NOT ask questions you can resolve with tools.\n\nStep 2 \u2014 Locate and bound the review target:\nAttempt to determine the strongest available review target and boundary.\n\nAttempt to establish:\n- `reviewTargetKind` from the strongest available source such as PR/MR, branch, patch, diff, or local working tree changes\n- `reviewTargetSource` describing where the target came from\n- likely PR/MR identity when available (`prUrl`, `prNumber`)\n- likely base / ancestor reference (`baseCandidate`, `mergeBaseRef`) when available\n- whether the branch may include inherited or out-of-scope changes\n- `boundaryConfidence`: High / Medium / Low\n\nDo not over-prescribe your own investigation path. Use the strongest available evidence and record uncertainty honestly.\n\nStep 3 \u2014 Enrich with context:\nRecover the strongest available intent and policy context from whatever sources are actually available.\n\nAttempt to recover:\n- MR title and purpose\n- ticket / issue / acceptance context (`ticketRefs`, `ticketContext`)\n- supporting docs / specs / rollout context (`supportingDocsFound`)\n- repo or user policy/convention context when it is likely to affect review judgment (`policySourcesFound`)\n- `contextConfidence`: High / Medium / Low\n\nStep 4 \u2014 Review-surface hygiene:\nClassify the visible change into a minimal review surface.\n\nSet:\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- `likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n\nThe goal is not a giant ledger. 
The goal is to avoid treating every visible changed file as equally worthy of deep review by default.\n\nStep 5 \u2014 Classify the review:\nAfter exploration, classify the work.\n\nSet:\n- `reviewMode`: QUICK / STANDARD / THOROUGH\n- `riskLevel`: Low / Medium / High\n- `shapeProfile`: choose the best primary label from `isolated_change`, `crosscutting_change`, `mechanically_noisy_change`, or `ambiguous_boundary`\n- `changeTypeProfile`: choose the best primary label from `general_code_change`, `api_contract_change`, `data_model_or_migration`, `security_sensitive`, or `test_only`\n- `maxParallelism`: 0 / 3 / 5\n- `criticalSurfaceTouched`: true / false\n- `needsSimulation`: true / false\n- `needsBoundaryFollowup`: true / false\n- `needsContextFollowup`: true / false\n- `needsReviewerBundle`: true / false\n\nDecision guidance:\n- QUICK: very small, isolated, low-risk changes with little ambiguity\n- STANDARD: typical feature or bug-fix reviews with moderate ambiguity or moderate risk\n- THOROUGH: critical surfaces, architectural novelty, high risk, broad change sets, or strong need for independent reviewer perspectives\n\nMinimal routing guidance:\n- if `boundaryConfidence = Low`, bias toward boundary/context follow-up before strong recommendation confidence\n- if `changeTypeProfile = api_contract_change`, bias toward contract/consumer/backward-compatibility scrutiny\n- if `changeTypeProfile = data_model_or_migration`, bias toward rollout / compatibility / simulation scrutiny\n- if `changeTypeProfile = security_sensitive`, bias toward adversarial/runtime-risk scrutiny and lower tolerance for weak evidence\n- if `changeTypeProfile = test_only`, bias toward stronger false-positive suppression\n- if `shapeProfile = mechanically_noisy_change`, bias toward stronger noise filtering and lower appetite for style-only findings\n\nStep 6 \u2014 Optional deeper context:\nIf `reviewMode` is STANDARD or THOROUGH and context remains incomplete, and delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-context-gathering` with focus=COMPLETENESS and focus=DEPTH. Synthesize both outputs before finishing this step.\n\nStep 7 \u2014 Human-facing artifact:\nChoose `reviewDocPath` only if a live artifact will materially improve human readability. Default suggestion: `mr-review.md` at the project root. 
This artifact is optional and never canonical workflow state.\n\nFallback behavior:\n- if PR/MR is not found but a branch/diff is inspectable, continue with downgraded context confidence and disclose missing PR context later\n- if the branch is inspectable but merge-base / ancestor remains ambiguous, continue with downgraded boundary confidence, set `needsBoundaryFollowup = true`, and disclose the uncertainty later\n- if ticket or supporting docs are missing, continue with downgraded context confidence and avoid overclaiming intent-sensitive findings\n- if only a patch/diff is available, continue if it is inspectable, but keep lower confidence on intent/boundary-dependent conclusions\n- if the review target itself is missing, ask only for that missing artifact and stop\n\nSet these keys in the next `continue_workflow` call's `context` object:\n- `reviewTargetKind`\n- `reviewTargetSource`\n- `prUrl`\n- `prNumber`\n- `baseCandidate`\n- `mergeBaseRef`\n- `boundaryConfidence`\n- `contextConfidence`\n- `mrTitle`\n- `mrPurpose`\n- `ticketRefs`\n- `ticketContext`\n- `supportingDocsFound`\n- `policySourcesFound`\n- `accessibleContextSources`\n- `missingContextSources`\n- `focusAreas`\n- `changedFileCount`\n- `criticalSurfaceTouched`\n- `reviewMode`\n- `riskLevel`\n- `shapeProfile`\n- `changeTypeProfile`\n- `maxParallelism`\n- `reviewDocPath`\n- `contextSummary`\n- `candidateFiles`\n- `moduleRoots`\n- `contextUnknownCount`\n- `coverageGapCount`\n- `authorIntentUnclear`\n- `needsSimulation`\n- `needsBoundaryFollowup`\n- `needsContextFollowup`\n- `needsReviewerBundle`\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- `likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n- `openQuestions`\n\nRules:\n- answer your own questions with tools whenever possible\n- only keep true human-decision questions in `openQuestions`\n- keep `openQuestions` bounded to the minimum necessary\n- classify AFTER exploring, not before\n- before leaving this phase, either establish the likely review boundary or explicitly record why you could not\n\nAlso set in the context object: one sentence describing what you are trying to accomplish (e.g. \"implement OAuth refresh token rotation\", \"review PR #47 before merge\"). This populates the session title in the Workspace console immediately.",
89
+ "prompt": "Build the review foundation in one pass.\n\nStep 1 \u2014 Early exit / minimum inputs:\nBefore exploring, verify that the review target is real and inspectable. If the diff, changed files, or equivalent review material are completely absent and cannot be inferred with tools, ask for the minimum missing artifact and stop. Do NOT ask questions you can resolve with tools.\n\nStep 2 \u2014 Locate and bound the review target:\nAttempt to determine the strongest available review target and boundary.\n\nAttempt to establish:\n- `reviewTargetKind` from the strongest available source such as PR/MR, branch, patch, diff, or local working tree changes\n- `reviewTargetSource` describing where the target came from\n- likely PR/MR identity when available (`prUrl`, `prNumber`)\n- likely base / ancestor reference (`baseCandidate`, `mergeBaseRef`) when available\n- whether the branch may include inherited or out-of-scope changes\n- `boundaryConfidence`: High / Medium / Low\n\nDo not over-prescribe your own investigation path. Use the strongest available evidence and record uncertainty honestly.\n\nStep 3 \u2014 Enrich with context:\nRecover the strongest available intent and policy context from whatever sources are actually available.\n\nAttempt to recover:\n- MR title and purpose\n- ticket / issue / acceptance context (`ticketRefs`, `ticketContext`)\n- supporting docs / specs / rollout context (`supportingDocsFound`)\n- repo or user policy/convention context when it is likely to affect review judgment (`policySourcesFound`)\n- `contextConfidence`: High / Medium / Low\n\nStep 4 \u2014 Review-surface hygiene:\nClassify the visible change into a minimal review surface.\n\nSet:\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- `likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n\nThe goal is not a giant ledger. 
The goal is to avoid treating every visible changed file as equally worthy of deep review by default.\n\nStep 5 \u2014 Classify the review:\nAfter exploration, classify the work.\n\nSet:\n- `reviewMode`: QUICK / STANDARD / THOROUGH\n- `riskLevel`: Low / Medium / High\n- `shapeProfile`: choose the best primary label from `isolated_change`, `crosscutting_change`, `mechanically_noisy_change`, or `ambiguous_boundary`\n- `changeTypeProfile`: choose the best primary label from `general_code_change`, `api_contract_change`, `data_model_or_migration`, `security_sensitive`, or `test_only`\n- `maxParallelism`: 0 / 3 / 5\n- `criticalSurfaceTouched`: true / false\n- `needsSimulation`: true / false\n- `needsBoundaryFollowup`: true / false\n- `needsContextFollowup`: true / false\n- `needsReviewerBundle`: true / false\n\nDecision guidance:\n- QUICK: very small, isolated, low-risk changes with little ambiguity\n- STANDARD: typical feature or bug-fix reviews with moderate ambiguity or moderate risk\n- THOROUGH: critical surfaces, architectural novelty, high risk, broad change sets, or strong need for independent reviewer perspectives\n\nMinimal routing guidance:\n- if `boundaryConfidence = Low`, bias toward boundary/context follow-up before strong recommendation confidence\n- if `changeTypeProfile = api_contract_change`, bias toward contract/consumer/backward-compatibility scrutiny\n- if `changeTypeProfile = data_model_or_migration`, bias toward rollout / compatibility / simulation scrutiny\n- if `changeTypeProfile = security_sensitive`, bias toward adversarial/runtime-risk scrutiny and lower tolerance for weak evidence\n- if `changeTypeProfile = test_only`, bias toward stronger false-positive suppression\n- if `shapeProfile = mechanically_noisy_change`, bias toward stronger noise filtering and lower appetite for style-only findings\n\nStep 6 \u2014 Optional deeper context:\nIf `reviewMode` is STANDARD or THOROUGH and context remains incomplete, and delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-context-gathering` with focus=COMPLETENESS and focus=DEPTH. Synthesize both outputs before finishing this step.\n\nStep 7 \u2014 Human-facing artifact:\nChoose `reviewDocPath` only if a live artifact will materially improve human readability. Default suggestion: `mr-review.md` at the project root. 
This artifact is optional and never canonical workflow state.\n\nFallback behavior:\n- if PR/MR is not found but a branch/diff is inspectable, continue with downgraded context confidence and disclose missing PR context later\n- if the branch is inspectable but merge-base / ancestor remains ambiguous, continue with downgraded boundary confidence, set `needsBoundaryFollowup = true`, and disclose the uncertainty later\n- if ticket or supporting docs are missing, continue with downgraded context confidence and avoid overclaiming intent-sensitive findings\n- if only a patch/diff is available, continue if it is inspectable, but keep lower confidence on intent/boundary-dependent conclusions\n- if the review target itself is missing, ask only for that missing artifact and stop\n\nSet these keys in the next `continue_workflow` call's `context` object:\n- `reviewTargetKind`\n- `reviewTargetSource`\n- `prUrl`\n- `prNumber`\n- `baseCandidate`\n- `mergeBaseRef`\n- `boundaryConfidence`\n- `contextConfidence`\n- `mrTitle`\n- `mrPurpose`\n- `ticketRefs`\n- `ticketContext`\n- `supportingDocsFound`\n- `policySourcesFound`\n- `accessibleContextSources`\n- `missingContextSources`\n- `focusAreas`\n- `changedFileCount`\n- `criticalSurfaceTouched`\n- `reviewMode`\n- `riskLevel`\n- `shapeProfile`\n- `changeTypeProfile`\n- `maxParallelism`\n- `reviewDocPath`\n- `contextSummary`\n- `candidateFiles`\n- `moduleRoots`\n- `contextUnknownCount`\n- `coverageGapCount`\n- `authorIntentUnclear`\n- `needsSimulation`\n- `needsBoundaryFollowup`\n- `needsContextFollowup`\n- `needsReviewerBundle`\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- `likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n- `openQuestions`\n\nRules:\n- answer your own questions with tools whenever possible\n- only keep true human-decision questions in `openQuestions`\n- keep `openQuestions` bounded to the minimum necessary\n- classify AFTER exploring, not before\n- before leaving this phase, either establish the likely review boundary or explicitly record why you could not\n\nAlso set in the context object: one sentence describing what you are trying to accomplish (e.g. \"implement OAuth refresh token rotation\", \"review PR #47 before merge\"). This populates the session title in the Workspace console immediately.\n\nIf `validationChecklist` is provided in context (from the shaping phase), verify each item explicitly before proceeding to deeper review:\n- Each item is an acceptance criterion declared during shaping\n- A failing checklist item is a blocking finding regardless of other review depth\n- Record: which items passed, which failed, which could not be verified\n- Example: if checklist says \"Auth middleware is not modified\" and auth files changed, flag it as blocking\n\nThis is step 1b in your review process.",
90
90
  "requireConfirmation": {
91
91
  "or": [
92
92
  {
@@ -103,7 +103,7 @@
103
103
  {
104
104
  "id": "phase-0b-scope-and-completeness-gate",
105
105
  "title": "Phase 0b: Scope & Completeness Gate",
106
- "prompt": "Verify that the PR delivers what was asked and nothing more.\n\nThis step runs after context is established (Phase 0) and before forming a review hypothesis. Its output feeds the fact packet in Phase 2.\n\nStep 1 Enumerate acceptance criteria:\nFrom the ticket/issue/PR description recovered in Phase 0, extract a flat list of acceptance criteria. If no explicit criteria exist, infer them from the stated goal and the PR title/description. Mark each as `explicit` (stated in ticket/issue) or `inferred` (derived from goal).\n\nIf no ticket, issue, or PR description is available, record `acceptanceCriteriaSource: none` and set `scopeCheckConfidence: Low`. Continue with downgraded confidence -- do not block the review.\n\nStep 2 Check each criterion against the diff:\nFor each acceptance criterion, examine the diff and determine:\n- `met`: the diff clearly addresses this criterion\n- `partial`: the diff partially addresses it but something appears missing\n- `missing`: the diff does not appear to address this criterion at all\n- `unclear`: insufficient context to judge\n\nCite specific files or functions for `met` and `partial` judgments. Be concrete.\n\nStep 3 Check for scope creep:\nLook for changes in the diff that go beyond what any acceptance criterion requires. Flag any change that:\n- modifies behavior not mentioned in the ticket/goal\n- touches files unrelated to the stated purpose\n- introduces new abstractions or refactors not required by the task\n\nDistinguish necessary implementation details (e.g. extracting a helper to implement the feature) from genuine scope creep (e.g. rewriting unrelated logic while here).\n\nStep 4 Set context keys:\nSet these keys in the next `continue_workflow` call's `context` object:\n- `acceptanceCriteria`: array of `{ criterion, source: 'explicit'|'inferred', status: 'met'|'partial'|'missing'|'unclear', evidence? }`\n- `acceptanceCriteriaSource`: `'ticket'` | `'pr_description'` | `'inferred'` | `'none'`\n- `missingCriteriaCount`: number of criteria with status `missing` or `partial`\n- `scopeCreepFlags`: array of specific out-of-scope changes found (empty array if none)\n- `scopeCreepCount`: length of `scopeCreepFlags`\n- `scopeCheckConfidence`: `High` | `Medium` | `Low`\n\nRules:\n- do not block the review on unclear criteria -- record uncertainty and continue\n- a criterion is only `missing` if you can confirm the behavior is absent from the diff, not just absent from a single file\n- scope creep findings feed into the reviewer families as potential `patterns_architecture` or `philosophy_alignment` concerns -- do not duplicate them as standalone findings here",
106
+ "prompt": "Verify that the PR delivers what was asked and nothing more.\n\nThis step runs after context is established (Phase 0) and before forming a review hypothesis. Its output feeds the fact packet in Phase 2.\n\nStep 1 \u2014 Enumerate acceptance criteria:\nFrom the ticket/issue/PR description recovered in Phase 0, extract a flat list of acceptance criteria. If no explicit criteria exist, infer them from the stated goal and the PR title/description. Mark each as `explicit` (stated in ticket/issue) or `inferred` (derived from goal).\n\nIf no ticket, issue, or PR description is available, record `acceptanceCriteriaSource: none` and set `scopeCheckConfidence: Low`. Continue with downgraded confidence -- do not block the review.\n\nStep 2 \u2014 Check each criterion against the diff:\nFor each acceptance criterion, examine the diff and determine:\n- `met`: the diff clearly addresses this criterion\n- `partial`: the diff partially addresses it but something appears missing\n- `missing`: the diff does not appear to address this criterion at all\n- `unclear`: insufficient context to judge\n\nCite specific files or functions for `met` and `partial` judgments. Be concrete.\n\nStep 3 \u2014 Check for scope creep:\nLook for changes in the diff that go beyond what any acceptance criterion requires. Flag any change that:\n- modifies behavior not mentioned in the ticket/goal\n- touches files unrelated to the stated purpose\n- introduces new abstractions or refactors not required by the task\n\nDistinguish necessary implementation details (e.g. extracting a helper to implement the feature) from genuine scope creep (e.g. rewriting unrelated logic while here).\n\nStep 4 \u2014 Set context keys:\nSet these keys in the next `continue_workflow` call's `context` object:\n- `acceptanceCriteria`: array of `{ criterion, source: 'explicit'|'inferred', status: 'met'|'partial'|'missing'|'unclear', evidence? }`\n- `acceptanceCriteriaSource`: `'ticket'` | `'pr_description'` | `'inferred'` | `'none'`\n- `missingCriteriaCount`: number of criteria with status `missing` or `partial`\n- `scopeCreepFlags`: array of specific out-of-scope changes found (empty array if none)\n- `scopeCreepCount`: length of `scopeCreepFlags`\n- `scopeCheckConfidence`: `High` | `Medium` | `Low`\n\nRules:\n- do not block the review on unclear criteria -- record uncertainty and continue\n- a criterion is only `missing` if you can confirm the behavior is absent from the diff, not just absent from a single file\n- scope creep findings feed into the reviewer families as potential `patterns_architecture` or `philosophy_alignment` concerns -- do not duplicate them as standalone findings here",
  "requireConfirmation": false
  },
  {
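The hunk above rewrites the scope-check prompt (the only delta is encoding the step-heading dashes as `\u2014` escapes). The context keys the prompt asks the agent to set form a small schema; the sketch below restates it in TypeScript, assuming nothing beyond the key names and literal values in the prompt text. The interface names themselves are invented for illustration.

```typescript
// Hypothetical typing of the scope-check context keys. Only the key
// names and literal unions come from the prompt above; the type and
// interface names are illustrative.
type CriterionStatus = 'met' | 'partial' | 'missing' | 'unclear';

interface AcceptanceCriterion {
  criterion: string;
  source: 'explicit' | 'inferred'; // stated in ticket/issue vs. derived from goal
  status: CriterionStatus;
  evidence?: string; // file/function citation, expected for 'met' and 'partial'
}

interface ScopeCheckContext {
  acceptanceCriteria: AcceptanceCriterion[];
  acceptanceCriteriaSource: 'ticket' | 'pr_description' | 'inferred' | 'none';
  missingCriteriaCount: number; // criteria with status 'missing' or 'partial'
  scopeCreepFlags: string[];    // empty array when nothing is out of scope
  scopeCreepCount: number;      // scopeCreepFlags.length
  scopeCheckConfidence: 'High' | 'Medium' | 'Low';
}
```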
@@ -1146,7 +1146,8 @@
  ],
  "procedure": [
  "Update `designDocPath` with a final summary containing the selected path, problem framing, landscape takeaways, chosen direction, strongest alternative, why it lost, confidence band, residual risks, and next actions.",
- "In the final chat output, tell me the selected path, the chosen direction, the key reason it won, and where to find `designDocPath`."
+ "In the final chat output, tell me the selected path, the chosen direction, the key reason it won, and where to find `designDocPath`.",
+ "When writing the final answer, also emit an enriched wr.discovery_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.discovery_handoff\",\n \"version\": 1,\n \"selectedDirection\": \"<one sentence: the chosen approach>\",\n \"designDocPath\": \"<path to design doc, or empty string>\",\n \"confidenceBand\": \"high\" | \"medium\" | \"low\",\n \"keyInvariants\": [\"<invariant that must hold>\", ...],\n \"rejectedDirections\": [{\"direction\": \"<approach>\", \"reason\": \"<why rejected>\"}, ...],\n \"implementationConstraints\": [\"<thing the coding agent MUST NOT violate>\", ...],\n \"keyCodebaseLocations\": [{\"path\": \"<file path>\", \"relevance\": \"<why relevant>\"}, ...]\n}\nThe implementationConstraints and keyCodebaseLocations fields are especially important -- they orient the coding agent without requiring it to re-run discovery."
  ],
  "verify": [
  "The design doc reads like a coherent human artifact.",
@@ -95,7 +95,7 @@
  "var": "isTrivial",
  "not_equals": true
  },
- "prompt": "Generate 6 fat-marker solution sketches with genuine diversity.\n\nFor each sketch:\n- 3-5 elements described in one sentence each\n- How the elements connect\n- What it explicitly does NOT do\n- Whether it stays close to the obvious solution or deviates (be honest -- at least 2 sketches must deviate meaningfully from the most obvious approach)\n\nStay at the product level. Elements describe what the feature does -- screens, flows, policies, affordances -- not how it is built. No file paths, no function names, no system internals.\n\nUse breadboard notation for connection: **Place A Place B when [user action]**. All words, no visual layout.\n\nAfter generating 6, select the 4 most diverse. Explicitly include the unconventional ones -- they are what makes the divergence valuable.\n\nNote any shared blind spots: things all 6 sketches ignored.\n\nCapture:\n- `candidateShapes` (array of 4: {framing, elements[], notDoing, deviatesFromObvious: boolean, description})\n- `sharedBlindSpots`",
+ "prompt": "Generate 6 fat-marker solution sketches with genuine diversity.\n\nFor each sketch:\n- 3-5 elements described in one sentence each\n- How the elements connect\n- What it explicitly does NOT do\n- Whether it stays close to the obvious solution or deviates (be honest -- at least 2 sketches must deviate meaningfully from the most obvious approach)\n\nStay at the product level. Elements describe what the feature does -- screens, flows, policies, affordances -- not how it is built. No file paths, no function names, no system internals.\n\nUse breadboard notation for connection: **Place A \u2192 Place B when [user action]**. All words, no visual layout.\n\nAfter generating 6, select the 4 most diverse. Explicitly include the unconventional ones -- they are what makes the divergence valuable.\n\nNote any shared blind spots: things all 6 sketches ignored.\n\nCapture:\n- `candidateShapes` (array of 4: {framing, elements[], notDoing, deviatesFromObvious: boolean, description})\n- `sharedBlindSpots`",
  "requireConfirmation": false
  },
  {
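The capture keys at the end of the sketching prompt above imply a shape for `candidateShapes`; here it is as a TypeScript sketch. The prompt leaves the type of `sharedBlindSpots` open, so an array of strings is assumed.

```typescript
// Sketch of the capture shape for the solution-sketch step. Field names
// come from the prompt; sharedBlindSpots as string[] is an assumption.
interface CandidateShape {
  framing: string;
  elements: string[];           // 3-5 elements, one sentence each
  notDoing: string;             // what the sketch explicitly excludes
  deviatesFromObvious: boolean; // honest flag; at least 2 of 6 must deviate
  description: string;
}

interface SketchCapture {
  candidateShapes: CandidateShape[]; // the 4 most diverse of the 6 generated
  sharedBlindSpots: string[];        // things all 6 sketches ignored
}
```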
@@ -115,7 +115,7 @@
  "var": "isTrivial",
  "not_equals": true
  },
- "prompt": "Expand the chosen shape into a breadboard and element list. Stay at the product level throughout.\n\n**Breadboard (words only):**\n- **Places** -- screens, dialogs, states, endpoints (from the user's perspective)\n- **Affordances** -- buttons, fields, actions -- listed under their place\n- **Connections** -- 'Place A Place B when [user action]'\n\nNo visual layout. No code. No system internals. Words and arrows only.\n\n**Element list:**\nFor each element, one sentence classified as:\n- **Interface** -- something the user sees or interacts with (a surface, a flow, a visible state)\n- **Invariant** -- a behavioral constraint (a policy, a rule, what must always be true)\n- **Exclusion** -- functionality explicitly NOT included\n\nReject any element that:\n- Describes HOW to build something (implementation detail)\n- Uses vague modifiers without a concrete noun ('improve', 'better', 'scalable')\n\n**Structural validation for solution_roughness=high:** every element must be Interface, Invariant, or Exclusion describing product behavior -- not code structure, not technical implementation.\n\nCapture:\n- `breadboardMd` (breadboard in markdown)\n- `elements` (array: {name, description, classification: 'interface'|'invariant'|'exclusion'})",
+ "prompt": "Expand the chosen shape into a breadboard and element list. Stay at the product level throughout.\n\n**Breadboard (words only):**\n- **Places** -- screens, dialogs, states, endpoints (from the user's perspective)\n- **Affordances** -- buttons, fields, actions -- listed under their place\n- **Connections** -- 'Place A \u2192 Place B when [user action]'\n\nNo visual layout. No code. No system internals. Words and arrows only.\n\n**Element list:**\nFor each element, one sentence classified as:\n- **Interface** -- something the user sees or interacts with (a surface, a flow, a visible state)\n- **Invariant** -- a behavioral constraint (a policy, a rule, what must always be true)\n- **Exclusion** -- functionality explicitly NOT included\n\nReject any element that:\n- Describes HOW to build something (implementation detail)\n- Uses vague modifiers without a concrete noun ('improve', 'better', 'scalable')\n\n**Structural validation for solution_roughness=high:** every element must be Interface, Invariant, or Exclusion describing product behavior -- not code structure, not technical implementation.\n\nCapture:\n- `breadboardMd` (breadboard in markdown)\n- `elements` (array: {name, description, classification: 'interface'|'invariant'|'exclusion'})",
  "assessmentRefs": [
  "solution-roughness"
  ],
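The breadboard step's capture keys follow the same pattern. A minimal typing, assuming only the key names and the `classification` union given verbatim in the prompt:

```typescript
// Minimal typing of the breadboard capture; only the key names and the
// classification union come from the prompt above.
interface BreadboardCapture {
  breadboardMd: string; // breadboard in markdown: places, affordances, connections
  elements: Array<{
    name: string;
    description: string; // one sentence, product-level
    classification: 'interface' | 'invariant' | 'exclusion';
  }>;
}
```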
@@ -190,8 +190,11 @@
  {
  "id": "finalize",
  "title": "Step 9: Write pitch.md",
- "prompt": "Write the shaped pitch to disk.\n\n1. **If isTrivial=true:** write a minimal pitch.md using `trivialTaskDescription` from Step 1 as the problem (do not pick from `candidateProblems` -- Step 2 was skipped). Content: the raw task description, then 'Appetite: xs. Single bounded change, no design decisions required.' Record `divergenceMarker: 'efficiency_skip'`.\n\n2. **Otherwise:** write `.workrail/current-pitch.md` with the full pitch from Step 7. Also archive to `.workrail/pitches/YYYY-MM-DD-[slugified-problem].md`.\n\nFirst ensure the directory exists:\n```\nmkdir -p .workrail/pitches\n```\n\nSummary to print:\n- Problem: [one sentence]\n- Appetite: [sizingBucket, calendarDays days]\n- Solution: [element names]\n- Rabbit holes: [count] identified\n- Assumptions flagged for review: [count of confidence < 0.6 entries]\n- Files written: [paths]\n- Next: hand pitch.md to a human engineering team, or run coding-task-workflow-agentic",
- "requireConfirmation": false
+ "prompt": "Write the shaped pitch to disk.\n\n1. **If isTrivial=true:** write a minimal pitch.md using `trivialTaskDescription` from Step 1 as the problem (do not pick from `candidateProblems` -- Step 2 was skipped). Content: the raw task description, then 'Appetite: xs. Single bounded change, no design decisions required.' Record `divergenceMarker: 'efficiency_skip'`.\n\n2. **Otherwise:** write `.workrail/current-pitch.md` with the full pitch from Step 7. Also archive to `.workrail/pitches/YYYY-MM-DD-[slugified-problem].md`.\n\nFirst ensure the directory exists:\n```\nmkdir -p .workrail/pitches\n```\n\nSummary to print:\n- Problem: [one sentence]\n- Appetite: [sizingBucket, calendarDays days]\n- Solution: [element names]\n- Rabbit holes: [count] identified\n- Assumptions flagged for review: [count of confidence < 0.6 entries]\n- Files written: [paths]\n- Next: hand pitch.md to a human engineering team, or run coding-task-workflow-agentic\n\nAfter writing pitch.md, emit a wr.shaping_handoff artifact in your complete_step call:\n{\n \"kind\": \"wr.shaping_handoff\",\n \"version\": 1,\n \"pitchPath\": \"<absolute path to .workrail/current-pitch.md>\",\n \"selectedShape\": \"<one sentence: which solution shape was chosen>\",\n \"appetite\": \"<time budget: e.g. 'Small batch (1-2 days)', 'Medium (1 week)'>\",\n \"keyConstraints\": [\"<design constraint the coding agent must respect>\", ...],\n \"rabbitHoles\": [\"<scope trap to avoid during implementation>\", ...],\n \"outOfScope\": [\"<explicitly ruled out>\", ...],\n \"validationChecklist\": [\"<verifiable acceptance criterion for the review agent>\", ...]\n}\nThe validationChecklist items should be specific and verifiable: \"All existing tests pass\", \"No new DB columns added\", \"Auth middleware is not modified\".",
+ "requireConfirmation": false,
+ "outputContract": {
+ "contractRef": "wr.contracts.shaping_handoff"
+ }
  }
  ]
  }
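The finalize hunk above does two things: it extends the prompt with the `wr.shaping_handoff` artifact spec and adds an `outputContract` referencing `wr.contracts.shaping_handoff`. As with the discovery handoff, an invented instance shows the shape. The three `validationChecklist` strings are the prompt's own examples; the field names and `kind`/`version` literals come from the step text, and everything else below (paths included) is made up.

```typescript
// Invented wr.shaping_handoff example. Field names, kind/version, and
// the validationChecklist strings come from the prompt; all other
// values are illustrative.
const shapingHandoff = {
  kind: 'wr.shaping_handoff',
  version: 1,
  pitchPath: '/home/dev/project/.workrail/current-pitch.md', // absolute, per the spec
  selectedShape: 'Inline review queue with per-item dismissal.',
  appetite: 'Small batch (1-2 days)',
  keyConstraints: ['Reuse the existing notification channel.'],
  rabbitHoles: ['Do not redesign the settings page while wiring the new toggle.'],
  outOfScope: ['Bulk dismissal of queued items'],
  validationChecklist: [
    'All existing tests pass',
    'No new DB columns added',
    'Auth middleware is not modified',
  ],
};
```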