@bastani/atomic 0.6.4 → 0.6.5-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/create-spec/SKILL.md +6 -3
- package/.agents/skills/tdd/SKILL.md +107 -0
- package/.agents/skills/tdd/deep-modules.md +33 -0
- package/.agents/skills/tdd/interface-design.md +31 -0
- package/.agents/skills/tdd/mocking.md +59 -0
- package/.agents/skills/tdd/refactoring.md +10 -0
- package/.agents/skills/tdd/tests.md +61 -0
- package/.agents/skills/workflow-creator/SKILL.md +550 -0
- package/.agents/skills/workflow-creator/references/agent-sessions.md +891 -0
- package/.agents/skills/workflow-creator/references/agent-setup-recipe.md +266 -0
- package/.agents/skills/workflow-creator/references/computation-and-validation.md +201 -0
- package/.agents/skills/workflow-creator/references/control-flow.md +470 -0
- package/.agents/skills/workflow-creator/references/failure-modes.md +1014 -0
- package/.agents/skills/workflow-creator/references/getting-started.md +392 -0
- package/.agents/skills/workflow-creator/references/registry-and-validation.md +141 -0
- package/.agents/skills/workflow-creator/references/running-workflows.md +418 -0
- package/.agents/skills/workflow-creator/references/session-config.md +384 -0
- package/.agents/skills/workflow-creator/references/state-and-data-flow.md +356 -0
- package/.agents/skills/workflow-creator/references/user-input.md +234 -0
- package/.agents/skills/workflow-creator/references/workflow-inputs.md +392 -0
- package/.claude/agents/debugger.md +2 -2
- package/.claude/agents/reviewer.md +1 -1
- package/.claude/agents/worker.md +2 -2
- package/.github/agents/debugger.md +1 -1
- package/.github/agents/worker.md +1 -1
- package/.mcp.json +5 -1
- package/.opencode/agents/debugger.md +1 -1
- package/.opencode/agents/worker.md +1 -1
- package/README.md +236 -201
- package/dist/sdk/define-workflow.d.ts +11 -6
- package/dist/sdk/define-workflow.d.ts.map +1 -1
- package/dist/sdk/errors.d.ts +10 -0
- package/dist/sdk/errors.d.ts.map +1 -1
- package/dist/sdk/index.d.ts +21 -9
- package/dist/sdk/index.d.ts.map +1 -1
- package/dist/sdk/primitives/inputs.d.ts +36 -0
- package/dist/sdk/primitives/inputs.d.ts.map +1 -0
- package/dist/sdk/primitives/metadata.d.ts +40 -0
- package/dist/sdk/primitives/metadata.d.ts.map +1 -0
- package/dist/sdk/primitives/run.d.ts +57 -0
- package/dist/sdk/primitives/run.d.ts.map +1 -0
- package/dist/sdk/primitives/sessions.d.ts +128 -0
- package/dist/sdk/primitives/sessions.d.ts.map +1 -0
- package/dist/sdk/runtime/executor.d.ts +24 -56
- package/dist/sdk/runtime/executor.d.ts.map +1 -1
- package/dist/sdk/runtime/orchestrator-entry.d.ts +26 -0
- package/dist/sdk/runtime/orchestrator-entry.d.ts.map +1 -0
- package/dist/sdk/runtime/tmux.d.ts +20 -0
- package/dist/sdk/runtime/tmux.d.ts.map +1 -1
- package/dist/sdk/types.d.ts +26 -86
- package/dist/sdk/types.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/open-claude-design/claude/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/open-claude-design/copilot/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/open-claude-design/opencode/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/ralph/claude/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/ralph/copilot/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/ralph/opencode/index.d.ts.map +1 -1
- package/dist/sdk/workflows/index.d.ts +20 -12
- package/dist/sdk/workflows/index.d.ts.map +1 -1
- package/dist/services/config/additional-instructions.d.ts +1 -1
- package/dist/services/config/additional-instructions.d.ts.map +1 -1
- package/package.json +4 -4
- package/src/cli.ts +39 -56
- package/src/commands/builtin-registry.ts +37 -0
- package/src/commands/cli/chat/index.ts +1 -3
- package/src/{sdk → commands/cli}/management-commands.ts +15 -55
- package/src/commands/cli/session.ts +1 -1
- package/src/commands/cli/workflow-command.test.ts +250 -16
- package/src/commands/cli/workflow-inputs.test.ts +1 -0
- package/src/commands/cli/workflow-inputs.ts +13 -3
- package/src/commands/cli/workflow-list.test.ts +1 -0
- package/src/commands/cli/workflow-list.ts +0 -0
- package/src/commands/cli/workflow-status.ts +1 -1
- package/src/commands/cli/workflow.ts +191 -11
- package/src/sdk/define-workflow.test.ts +47 -16
- package/src/sdk/define-workflow.ts +24 -6
- package/src/sdk/errors.test.ts +11 -0
- package/src/sdk/errors.ts +13 -0
- package/src/sdk/index.test.ts +92 -0
- package/src/sdk/index.ts +71 -15
- package/src/sdk/primitives/inputs.ts +48 -0
- package/src/sdk/primitives/metadata.ts +63 -0
- package/src/sdk/primitives/run.ts +81 -0
- package/src/sdk/primitives/sessions.test.ts +594 -0
- package/src/sdk/primitives/sessions.ts +328 -0
- package/src/sdk/runtime/executor.ts +36 -115
- package/src/sdk/runtime/orchestrator-entry.ts +110 -0
- package/src/sdk/runtime/tmux.ts +33 -0
- package/src/sdk/types.ts +26 -91
- package/src/sdk/workflows/builtin/deep-research-codebase/claude/index.ts +1 -0
- package/src/sdk/workflows/builtin/deep-research-codebase/copilot/index.ts +1 -0
- package/src/sdk/workflows/builtin/deep-research-codebase/opencode/index.ts +1 -0
- package/src/sdk/workflows/builtin/open-claude-design/claude/index.ts +1 -0
- package/src/sdk/workflows/builtin/open-claude-design/copilot/index.ts +1 -0
- package/src/sdk/workflows/builtin/open-claude-design/opencode/index.ts +1 -0
- package/src/sdk/workflows/builtin/ralph/claude/index.ts +1 -0
- package/src/sdk/workflows/builtin/ralph/copilot/index.ts +1 -0
- package/src/sdk/workflows/builtin/ralph/opencode/index.ts +1 -0
- package/src/sdk/workflows/index.ts +68 -51
- package/src/services/config/additional-instructions.ts +1 -1
- package/.agents/skills/test-driven-development/SKILL.md +0 -371
- package/.agents/skills/test-driven-development/testing-anti-patterns.md +0 -299
- package/dist/commands/cli/session.d.ts +0 -67
- package/dist/commands/cli/session.d.ts.map +0 -1
- package/dist/commands/cli/workflow-status.d.ts +0 -63
- package/dist/commands/cli/workflow-status.d.ts.map +0 -1
- package/dist/sdk/commander.d.ts +0 -74
- package/dist/sdk/commander.d.ts.map +0 -1
- package/dist/sdk/management-commands.d.ts +0 -42
- package/dist/sdk/management-commands.d.ts.map +0 -1
- package/dist/sdk/workflow-cli.d.ts +0 -103
- package/dist/sdk/workflow-cli.d.ts.map +0 -1
- package/dist/sdk/workflows/builtin-registry.d.ts +0 -113
- package/dist/sdk/workflows/builtin-registry.d.ts.map +0 -1
- package/src/sdk/commander.ts +0 -161
- package/src/sdk/workflow-cli.ts +0 -409
- package/src/sdk/workflows/builtin-registry.ts +0 -23
|
@@ -0,0 +1,1014 @@
|
|
|
1
|
+
# Failure Modes
|
|
2
|
+
|
|
3
|
+
Common, **silent** ways workflows break across Claude Code, Copilot CLI, and
|
|
4
|
+
OpenCode — and the wrong-vs-right patterns to avoid them.
|
|
5
|
+
|
|
6
|
+
**Read this before you ship a multi-session workflow.** Most failures here
|
|
7
|
+
don't throw — they produce degraded output that looks plausible, which is
|
|
8
|
+
the hardest kind of bug to catch in review.
|
|
9
|
+
|
|
10
|
+
## When to consult
|
|
11
|
+
|
|
12
|
+
- Before writing a planner → orchestrator → reviewer handoff (Copilot / OpenCode)
|
|
13
|
+
- When a stage receives context from a prior stage and the output smells off
|
|
14
|
+
- When a review/fix loop works on small inputs but drifts on large ones
|
|
15
|
+
- When a JSON/markdown parser in a helper stops matching the model's output
|
|
16
|
+
- When you cannot explain where a particular sentence in a downstream prompt came from
|
|
17
|
+
|
|
18
|
+
## Silent vs. loud
|
|
19
|
+
|
|
20
|
+
| Severity | What happens | Detection |
|
|
21
|
+
|---|---|---|
|
|
22
|
+
| **Silent** | Wrong output, no exception. Downstream stages consume garbage. | Requires end-to-end observation. Easy to miss in review. |
|
|
23
|
+
| **Loud** | Exception thrown, stage aborts. | Stack trace surfaces in logs. |
|
|
24
|
+
|
|
25
|
+
Silent failures are catalogued first below. Loud failures are grouped at the end.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick reference
|
|
30
|
+
|
|
31
|
+
| # | Failure | Affected | Silent? |
|
|
32
|
+
|---|---|---|---|
|
|
33
|
+
| [F1](#f1-copilot-getlastassistanttext-returns-empty-string) | Copilot: `getLastAssistantText` returns empty string | Copilot | silent |
|
|
34
|
+
| [F2](#f2-copilot-subagent-messages-pollute-getmessages-stream) | Copilot: subagent messages pollute `getMessages()` stream | Copilot | silent |
|
|
35
|
+
| [F3](#f3-opencode-resultdataparts-contains-non-text-parts) | OpenCode: `result.data.parts` contains non-text parts | OpenCode | silent |
|
|
36
|
+
| [F4](#f4-claude-ssessionquery-returns-sessionmessage-extract-text-with-extractassistanttext) | Claude: `s.session.query()` returns `SessionMessage[]` — extract text with `extractAssistantText(result, 0)` | Claude | silent |
|
|
37
|
+
| [F5](#f5-fresh-session-wipes-prior-stage-context) | Fresh session wipes prior stage context | Copilot, OpenCode | silent |
|
|
38
|
+
| [F6](#f6-planner-prompts-that-dont-request-trailing-commentary-produce-empty-handoffs) | Planner prompts that don't request trailing commentary produce empty handoffs | all | silent |
|
|
39
|
+
| [F7](#f7-continued-sessions-accumulate-state-across-loop-iterations-lost-in-middle) | Continued sessions accumulate state across loop iterations (lost-in-middle) | all | silent |
|
|
40
|
+
| [F8](#f8-fenced-block-parsers-break-when-the-model-adds-prose) | Fenced-block parsers break when the model adds prose before/after | all | silent |
|
|
41
|
+
| [F9](#f9-ssave-receives-the-wrong-shape) | `s.save()` receives the wrong shape for the SDK | all | silent |
|
|
42
|
+
| [F10](#f10-copilot-sendandwait-default-60s-timeout-throws) | Copilot: `sendAndWait` default 60s timeout throws (use `send` by default) | Copilot | loud |
|
|
43
|
+
| [F11](#f11-provider-level-resume-tries-to-swap-agents) | Provider-level resume tries to swap agents | Copilot, OpenCode | loud |
|
|
44
|
+
| [F12](#f12-parallel-siblings-read-each-others-transcripts) | Parallel siblings read each other's transcripts | all | loud |
|
|
45
|
+
| [F13](#f13-forgetting-to-await-ctxstage) | Forgetting to `await` `ctx.stage()` | all | silent |
|
|
46
|
+
| [F14](#f14-using-a-pending-sessionhandle-before-completion) | Using a pending `SessionHandle` before completion | all | silent |
|
|
47
|
+
| [F15](#f15-headless-stage-errors-are-invisible-in-the-graph) | Headless stage errors are invisible in the graph | all | silent |
|
|
48
|
+
| [F16](#f16-claude-importing-sdk-query-inside-a-non-headless-stage) | Claude: importing the SDK `query()` inside a non-headless stage (anti-pattern) | Claude | silent |
|
|
49
|
+
| [F17](#f17-duplicate-registration-throws-at-composition-root) | Duplicate registration throws at composition root | all | loud |
|
|
50
|
+
| [F22](#f22-ctxstage-with-no-llm-query-spawns-an-empty-idle-pane) | `ctx.stage()` with no LLM query spawns an empty, idle pane | all | silent |
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Silent failures
|
|
55
|
+
|
|
56
|
+
### F1. Copilot: `getLastAssistantText` returns empty string
|
|
57
|
+
|
|
58
|
+
**Symptom.** The orchestrator (or any downstream stage) receives an empty
|
|
59
|
+
`plannerNotes` / `reviewerOutput` despite the prior agent running successfully
|
|
60
|
+
and producing visible output in the TUI.
|
|
61
|
+
|
|
62
|
+
**Root cause.** Copilot emits an **empty terminating `assistant.message` event**
|
|
63
|
+
after every turn that included a tool call. The actual prose + toolRequests
|
|
64
|
+
live in the earlier `assistant.message` event; the trailing one has
|
|
65
|
+
`content: ""` and no `toolRequests`. Picking `.at(-1).data.content` reliably
|
|
66
|
+
lands on the empty terminator and throws away the real content.
|
|
67
|
+
|
|
68
|
+
Verified empirically with a toy script against Copilot CLI 1.0.22: a
|
|
69
|
+
single-turn "think then call tool" prompt produced 2 `assistant.message`
|
|
70
|
+
events, `[{length: 512, toolRequests: 1}, {length: 0, toolRequests: 0}]`.
|
|
71
|
+
The second one is what `.at(-1)` returns.
|
|
72
|
+
|
|
73
|
+
The event type carries both `content: string` and `toolRequests?: [...]` —
|
|
74
|
+
see `node_modules/@github/copilot-sdk/dist/generated/session-events.d.ts:1408-1455`.
|
|
75
|
+
|
|
76
|
+
This means the bug affects **any** stage whose final turn includes a tool
|
|
77
|
+
call — not just tool-calls-only turns. Planner, reviewer, debugger, and
|
|
78
|
+
orchestrator stages all hit it if they end on a tool invocation.
|
|
79
|
+
|
|
80
|
+
**Affected SDKs.** Copilot only.
|
|
81
|
+
|
|
82
|
+
### ❌ Wrong
|
|
83
|
+
|
|
84
|
+
```ts
|
|
85
|
+
function getLastAssistantText(messages: SessionEvent[]): string {
|
|
86
|
+
const assistantMessages = messages.filter(
|
|
87
|
+
(m): m is Extract<SessionEvent, { type: "assistant.message" }> =>
|
|
88
|
+
m.type === "assistant.message",
|
|
89
|
+
);
|
|
90
|
+
return assistantMessages.at(-1)?.data.content ?? "";
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### ✅ Right
|
|
95
|
+
|
|
96
|
+
```ts
|
|
97
|
+
/** Concatenate every top-level assistant turn's non-empty content. */
|
|
98
|
+
function getAssistantText(messages: SessionEvent[]): string {
|
|
99
|
+
return messages
|
|
100
|
+
.filter(
|
|
101
|
+
(m): m is Extract<SessionEvent, { type: "assistant.message" }> =>
|
|
102
|
+
m.type === "assistant.message" && !m.data.parentToolCallId,
|
|
103
|
+
)
|
|
104
|
+
.map((m) => m.data.content)
|
|
105
|
+
.filter((c) => c.length > 0)
|
|
106
|
+
.join("\n\n");
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
**Detection.** Log the returned text length after every `getAssistantText`
|
|
111
|
+
call during development. An empty or surprisingly short string for a stage
|
|
112
|
+
that clearly ran is the signature.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
### F2. Copilot: subagent messages pollute `getMessages()` stream
|
|
117
|
+
|
|
118
|
+
**Symptom.** Downstream stages receive a snippet of text that doesn't match
|
|
119
|
+
what the top-level agent said — it looks like a subagent's output.
|
|
120
|
+
|
|
121
|
+
**Root cause.** `assistant.message` events carry a `parentToolCallId?: string`
|
|
122
|
+
field, documented as *"Tool call ID of the parent tool invocation when this
|
|
123
|
+
event originates from a subagent"*. When the top-level agent delegates,
|
|
124
|
+
`getMessages()` returns **the complete history including subagent messages**.
|
|
125
|
+
Filters that don't exclude `parentToolCallId` can pick a subagent's final
|
|
126
|
+
message via `.at(-1)`.
|
|
127
|
+
|
|
128
|
+
**Affected SDKs.** Copilot.
|
|
129
|
+
|
|
130
|
+
### ❌ Wrong
|
|
131
|
+
|
|
132
|
+
```ts
|
|
133
|
+
messages.filter((m) => m.type === "assistant.message")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### ✅ Right
|
|
137
|
+
|
|
138
|
+
```ts
|
|
139
|
+
messages.filter(
|
|
140
|
+
(m) => m.type === "assistant.message" && !m.data.parentToolCallId,
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Detection.** Log what you extract (as in F1), then diff it against the TUI
|
|
145
|
+
scrollback for the top-level agent.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
### F3. OpenCode: `result.data.parts` contains non-text parts
|
|
150
|
+
|
|
151
|
+
**Symptom.** Concatenated response text contains `[object Object]`,
|
|
152
|
+
truncated content, or swallows tool-call payloads into the prompt.
|
|
153
|
+
|
|
154
|
+
**Root cause.** `client.session.prompt()` returns `result.data.parts: Part[]`
|
|
155
|
+
where parts can be `type: "text" | "tool" | "file" | "reasoning" | ...`.
|
|
156
|
+
Naive `.map(p => p.text)` yields `undefined` for non-text parts, which `.join()` then renders as empty strings.
|
|
157
|
+
|
|
158
|
+
**Affected SDKs.** OpenCode.
|
|
159
|
+
|
|
160
|
+
### ❌ Wrong
|
|
161
|
+
|
|
162
|
+
```ts
|
|
163
|
+
const text = result.data!.parts.map((p) => p.text).join("\n");
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### ✅ Right
|
|
167
|
+
|
|
168
|
+
```ts
|
|
169
|
+
function extractResponseText(
|
|
170
|
+
parts: Array<{ type: string; [key: string]: unknown }>,
|
|
171
|
+
): string {
|
|
172
|
+
return parts
|
|
173
|
+
.filter((p) => p.type === "text")
|
|
174
|
+
.map((p) => (p as { type: string; text: string }).text)
|
|
175
|
+
.join("\n");
|
|
176
|
+
}
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
**Detection.** Grep extracted text for `[object Object]` or `undefined`.
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
### F4. Claude: `s.session.query()` returns `SessionMessage[]` — extract text with `extractAssistantText`
|
|
184
|
+
|
|
185
|
+
**Symptom.** Workflow code tries to access `.output` or `.text` on the
|
|
186
|
+
result of `s.session.query()` and gets `undefined`, or passes the result
|
|
187
|
+
directly to a string parser that throws.
|
|
188
|
+
|
|
189
|
+
**Root cause.** `s.session.query()` returns `SessionMessage[]` — the native
|
|
190
|
+
Claude Agent SDK type. It does NOT return a `{ output: string }` object or a
|
|
191
|
+
raw TUI scrollback string. The assistant's text lives inside structured content
|
|
192
|
+
blocks within those messages and must be extracted explicitly.
|
|
193
|
+
|
|
194
|
+
**Affected SDKs.** Claude.
|
|
195
|
+
|
|
196
|
+
### ❌ Wrong
|
|
197
|
+
|
|
198
|
+
```ts
|
|
199
|
+
// result is SessionMessage[], not { output: string }
|
|
200
|
+
const result = await s.session.query(prompt);
|
|
201
|
+
const parsed = JSON.parse(result.output); // SyntaxError: result.output is undefined, which is not valid JSON
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### ✅ Right — use `extractAssistantText(result, 0)`
|
|
205
|
+
|
|
206
|
+
```ts
|
|
207
|
+
import { extractAssistantText } from "@bastani/atomic/workflows";
|
|
208
|
+
|
|
209
|
+
const result = await s.session.query(prompt);
|
|
210
|
+
const text = extractAssistantText(result, 0);
|
|
211
|
+
// Now `text` is the concatenated assistant prose for this turn
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
`extractAssistantText(msgs, afterIndex)` walks `SessionMessage[]` from
|
|
215
|
+
`afterIndex` forward, pulls `TextBlock.text` from each `assistant` message's
|
|
216
|
+
content array, and joins them with newlines.
|
|
217
|
+
|
|
218
|
+
The ralph helpers in `src/sdk/workflows/builtin/ralph/helpers/prompts.ts`
|
|
219
|
+
(`parseReviewResult`, `extractMarkdownBlock`) use this pattern — always
|
|
220
|
+
extract text first, then parse.
|
|
221
|
+
|
|
222
|
+
**Detection.** Log `typeof result` after `s.session.query()`. If it's
|
|
223
|
+
`object` (an array), you need `extractAssistantText`. Accessing `.output`
|
|
224
|
+
on an array returns `undefined`.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
### F5. Fresh session wipes prior stage context
|
|
229
|
+
|
|
230
|
+
**Symptom.** The orchestrator says "I don't see a task list" or "what
|
|
231
|
+
specification are you referring to?" even though the planner clearly ran.
|
|
232
|
+
|
|
233
|
+
**Root cause.** `client.createSession()` / `client.session.create()` always
|
|
234
|
+
returns a **fresh, empty conversation**. The `client` object is just the
|
|
235
|
+
transport — each session is independent. The new session sees only what you
|
|
236
|
+
put in its first prompt.
|
|
237
|
+
|
|
238
|
+
**Affected SDKs.** Copilot, OpenCode. (Claude's session model is
|
|
239
|
+
different — context accumulates within the same SDK session, so this failure
|
|
240
|
+
mode does NOT apply to `s.session.query()`.)
|
|
241
|
+
|
|
242
|
+
### ❌ Wrong
|
|
243
|
+
|
|
244
|
+
```ts
|
|
245
|
+
await ctx.stage({ name: "planner" }, {}, { agent: "planner" }, async (s) => {
|
|
246
|
+
await s.session.send({ prompt: buildPlannerPrompt((s.inputs.prompt ?? "")) });
|
|
247
|
+
s.save(await s.session.getMessages());
|
|
248
|
+
});
|
|
249
|
+
// orchestrator is a fresh session — it has no idea what the planner produced
|
|
250
|
+
await ctx.stage({ name: "orchestrator" }, {}, { agent: "orchestrator" }, async (s) => {
|
|
251
|
+
await s.session.send({ prompt: buildOrchestratorPrompt() });
|
|
252
|
+
s.save(await s.session.getMessages());
|
|
253
|
+
});
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### ✅ Right — explicit handoff
|
|
257
|
+
|
|
258
|
+
```ts
|
|
259
|
+
const plannerHandle = await ctx.stage(
|
|
260
|
+
{ name: "planner" },
|
|
261
|
+
{},
|
|
262
|
+
{ agent: "planner" },
|
|
263
|
+
async (s) => {
|
|
264
|
+
await s.session.send({ prompt: buildPlannerPrompt((s.inputs.prompt ?? "")) });
|
|
265
|
+
const messages = await s.session.getMessages();
|
|
266
|
+
s.save(messages);
|
|
267
|
+
return getAssistantText(messages); // see F1 for getAssistantText
|
|
268
|
+
},
|
|
269
|
+
);
|
|
270
|
+
|
|
271
|
+
await ctx.stage(
|
|
272
|
+
{ name: "orchestrator" },
|
|
273
|
+
{},
|
|
274
|
+
{ agent: "orchestrator" },
|
|
275
|
+
async (s) => {
|
|
276
|
+
await s.session.send({
|
|
277
|
+
prompt: buildOrchestratorPrompt(
|
|
278
|
+
(s.inputs.prompt ?? ""),
|
|
279
|
+
{ plannerNotes: plannerHandle.result },
|
|
280
|
+
),
|
|
281
|
+
});
|
|
282
|
+
s.save(await s.session.getMessages());
|
|
283
|
+
},
|
|
284
|
+
);
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
Alternatives: write to shared state (`TaskCreate`/`TaskList`, files, git) and
|
|
288
|
+
have the next stage read from there, or keep the follow-up inside the same
|
|
289
|
+
stage callback when it needs the full live conversation. Provider-level resume
|
|
290
|
+
is an advanced same-role escape hatch, not the normal stage-to-stage handoff.
|
|
291
|
+
|
|
292
|
+
**Full write-up.** `agent-sessions.md` §"Critical pitfall: session lifecycle
|
|
293
|
+
controls what context is available".
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
### F6. Planner prompts that don't request trailing commentary produce empty handoffs
|
|
298
|
+
|
|
299
|
+
**Symptom.** F1 / F5 are fixed, extraction is correct — and the orchestrator
|
|
300
|
+
still receives empty `plannerNotes` because the planner's last turn legitimately
|
|
301
|
+
had no prose.
|
|
302
|
+
|
|
303
|
+
**Root cause.** This is a **prompt engineering** bug, not a code bug. When a
|
|
304
|
+
prompt ends with "call `TaskList` to verify" and does not explicitly ask for
|
|
305
|
+
trailing commentary, many models end the turn with just the tool call and
|
|
306
|
+
no text at all. There's nothing in any turn's `content` to extract because
|
|
307
|
+
the model never wrote any.
|
|
308
|
+
|
|
309
|
+
**Affected SDKs.** All three — though Claude's pane scrollback masks it by
|
|
310
|
+
still capturing something visible.
|
|
311
|
+
|
|
312
|
+
### ❌ Wrong — silent handoff
|
|
313
|
+
|
|
314
|
+
```ts
|
|
315
|
+
return `# Planning
|
|
316
|
+
|
|
317
|
+
${spec}
|
|
318
|
+
|
|
319
|
+
Decompose the specification into tasks via TaskCreate. After creating all
|
|
320
|
+
tasks, call TaskList to verify.`;
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### ✅ Right — explicit trailing commentary requirement
|
|
324
|
+
|
|
325
|
+
```ts
|
|
326
|
+
return `# Planning
|
|
327
|
+
|
|
328
|
+
${spec}
|
|
329
|
+
|
|
330
|
+
Decompose the specification into tasks via TaskCreate. After creating all
|
|
331
|
+
tasks, call TaskList to verify.
|
|
332
|
+
|
|
333
|
+
## Final output (required)
|
|
334
|
+
|
|
335
|
+
After the TaskList call, write a short "Handoff Notes" section with:
|
|
336
|
+
- Risks or ambiguities the orchestrator must know about
|
|
337
|
+
- Any assumptions you made that could be wrong
|
|
338
|
+
- Ordering constraints that don't fit into task bodies
|
|
339
|
+
|
|
340
|
+
The orchestrator will run in a fresh session — anything not in your
|
|
341
|
+
TaskCreate calls or this section will be lost.`;
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
**Pair this fix with F1.** Even with the correct extraction helper, you need
|
|
345
|
+
the model to actually produce text for the helper to extract.
|
|
346
|
+
|
|
347
|
+
**Detection.** Log the extracted handoff text during development. An empty
|
|
348
|
+
string + a correctly-fixed extraction helper = F6.
|
|
349
|
+
|
|
350
|
+
---
|
|
351
|
+
|
|
352
|
+
### F7. Continued sessions accumulate state across loop iterations (lost-in-middle)
|
|
353
|
+
|
|
354
|
+
**Symptom.** A review/fix loop works on iterations 1-3 then starts
|
|
355
|
+
producing worse output — misidentifying files, hallucinating line numbers,
|
|
356
|
+
or "forgetting" a requirement that was clearly stated in the original spec.
|
|
357
|
+
|
|
358
|
+
**Root cause.** Each loop iteration adds turns to the same continued
|
|
359
|
+
session, and context grows past the attention window. The model starts
|
|
360
|
+
dropping middle-of-context information (classic lost-in-middle).
|
|
361
|
+
|
|
362
|
+
**Affected SDKs.** All three. Claude's session transcript accumulates every
|
|
363
|
+
intermediate turn, so long loops grow the context window substantially.
|
|
364
|
+
|
|
365
|
+
### ❌ Wrong — long loop on a single session with unbounded context growth
|
|
366
|
+
|
|
367
|
+
```ts
|
|
368
|
+
await ctx.stage({ name: "review-loop" }, {}, {}, async (s) => {
|
|
369
|
+
for (let i = 0; i < 20; i++) {
|
|
370
|
+
await s.session.query(buildReviewPrompt());
|
|
371
|
+
await s.session.query(buildFixPrompt());
|
|
372
|
+
}
|
|
373
|
+
});
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
### ✅ Right — compact or reset between iterations
|
|
377
|
+
|
|
378
|
+
Options, in order of preference:
|
|
379
|
+
|
|
380
|
+
1. **Compact** — summarize prior turns via the SDK's compaction mechanism
|
|
381
|
+
(Claude's `/compact`, OpenCode's summarizer, a sidecar summarization call
|
|
382
|
+
for Copilot). Keeps decisions and file paths; drops verbose tool output.
|
|
383
|
+
2. **Offload to files** — write intermediate findings to files and reference
|
|
384
|
+
them by path in the next iteration's prompt (`filesystem-context` skill).
|
|
385
|
+
3. **Fresh session per iteration with explicit handoff** — see F5's pattern;
|
|
386
|
+
lose the in-session reasoning but gain a clean context window.
|
|
387
|
+
|
|
388
|
+
```ts
|
|
389
|
+
await ctx.stage({ name: "review-loop" }, {}, {}, async (s) => {
|
|
390
|
+
const MAX_TURNS_BEFORE_COMPACT = 10;
|
|
391
|
+
let turnsSinceCompact = 0;
|
|
392
|
+
|
|
393
|
+
for (let i = 0; i < MAX_ITERATIONS; i++) {
|
|
394
|
+
if (turnsSinceCompact >= MAX_TURNS_BEFORE_COMPACT) {
|
|
395
|
+
await s.session.query("/compact");
|
|
396
|
+
turnsSinceCompact = 0;
|
|
397
|
+
}
|
|
398
|
+
await s.session.query(buildReviewPrompt());
|
|
399
|
+
turnsSinceCompact += 1;
|
|
400
|
+
}
|
|
401
|
+
});
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
**Consult.** `context-degradation`, `context-compression`, `context-optimization`.
|
|
405
|
+
|
|
406
|
+
**Detection.** Chart output quality against iteration number. If quality degrades past
|
|
407
|
+
iteration N, N is your safe-turn budget before compaction.
|
|
408
|
+
|
|
409
|
+
---
|
|
410
|
+
|
|
411
|
+
### F8. Fenced-block parsers break when the model adds prose
|
|
412
|
+
|
|
413
|
+
**Symptom.** `JSON.parse(content)` throws, or a "matches the first fenced
|
|
414
|
+
block" regex picks up a code example inside prose instead of the actual
|
|
415
|
+
structured output.
|
|
416
|
+
|
|
417
|
+
**Root cause.** A prompt asks for `only JSON inside a single fenced block`
|
|
418
|
+
and the model adds a sentence of explanation, a "# Summary" heading, or
|
|
419
|
+
quotes a snippet of its own reasoning in a code fence earlier in the reply.
|
|
420
|
+
|
|
421
|
+
**Affected SDKs.** All three — this is a model-behavior issue, not
|
|
422
|
+
SDK-specific.
|
|
423
|
+
|
|
424
|
+
### ❌ Wrong
|
|
425
|
+
|
|
426
|
+
```ts
|
|
427
|
+
const parsed = JSON.parse(content);
|
|
428
|
+
// or:
|
|
429
|
+
const match = content.match(/```json\n([\s\S]*?)\n```/);
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
### ✅ Right — layered fallback: direct parse → last fenced block → last balanced object
|
|
433
|
+
|
|
434
|
+
```ts
|
|
435
|
+
export function parseReviewResult(content: string): ReviewResult | null {
|
|
436
|
+
// 1. Direct JSON
|
|
437
|
+
try {
|
|
438
|
+
const parsed = JSON.parse(content);
|
|
439
|
+
if (parsed?.findings && parsed?.overall_correctness) return parsed;
|
|
440
|
+
} catch { /* fall through */ }
|
|
441
|
+
|
|
442
|
+
// 2. LAST fenced code block (not the first — prose often quotes examples)
|
|
443
|
+
const blockRe = /```(?:json)?\s*\n([\s\S]*?)\n```/g;
|
|
444
|
+
let lastBlock: string | null = null;
|
|
445
|
+
let m: RegExpExecArray | null;
|
|
446
|
+
while ((m = blockRe.exec(content)) !== null) {
|
|
447
|
+
if (m[1]) lastBlock = m[1];
|
|
448
|
+
}
|
|
449
|
+
if (lastBlock) {
|
|
450
|
+
try {
|
|
451
|
+
const parsed = JSON.parse(lastBlock);
|
|
452
|
+
if (parsed?.findings && parsed?.overall_correctness) return parsed;
|
|
453
|
+
} catch { /* fall through */ }
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
// 3. Last balanced object containing the required key
|
|
457
|
+
// (implementation in src/sdk/workflows/builtin/ralph/helpers/prompts.ts)
|
|
458
|
+
return null;
|
|
459
|
+
}
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
**Detection.** Fuzz test the parser against real model output captured
|
|
463
|
+
over several runs. If 1 in 20 runs fails to parse, you have F8.
|
|
464
|
+
|
|
465
|
+
---
|
|
466
|
+
|
|
467
|
+
### F9. `s.save()` receives the wrong shape
|
|
468
|
+
|
|
469
|
+
**Symptom.** `s.transcript("stage-name")` returns an empty or malformed
|
|
470
|
+
`content` string in the next stage.
|
|
471
|
+
|
|
472
|
+
**Root cause.** Each SDK has a different contract for what `s.save()`
|
|
473
|
+
expects, and the runtime doesn't type-check the argument beyond "anything".
|
|
474
|
+
|
|
475
|
+
**Affected SDKs.** All three — the mistake is in the workflow author's code.
|
|
476
|
+
|
|
477
|
+
### Correct shapes
|
|
478
|
+
|
|
479
|
+
| SDK | Correct argument |
|
|
480
|
+
|---|---|
|
|
481
|
+
| Claude | `s.save(s.sessionId)` — pass the session ID; the runtime reads the transcript file |
|
|
482
|
+
| Copilot | `s.save(await s.session.getMessages())` — pass `SessionEvent[]` |
|
|
483
|
+
| OpenCode | `s.save(result.data!)` — pass the `{ info, parts }` object |
|
|
484
|
+
|
|
485
|
+
### ❌ Wrong
|
|
486
|
+
|
|
487
|
+
```ts
|
|
488
|
+
// Claude — saves the wrong thing (result is SessionMessage[], not { output: string })
|
|
489
|
+
s.save(result.output); // result.output is undefined (arrays have no .output), so nothing useful is saved; use s.save(s.sessionId)
|
|
490
|
+
|
|
491
|
+
// Copilot — calling getMessages() BEFORE send() returns an empty array
|
|
492
|
+
const earlyMessages = await s.session.getMessages(); // [] — no turns yet
|
|
493
|
+
s.save(earlyMessages);
|
|
494
|
+
|
|
495
|
+
// Copilot — saving a single message instead of the full array
|
|
496
|
+
s.save((await s.session.getMessages()).at(-1));
|
|
497
|
+
|
|
498
|
+
// OpenCode — missing the data unwrap
|
|
499
|
+
s.save(result);
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
### ✅ Right
|
|
503
|
+
|
|
504
|
+
See the per-SDK examples in `SKILL.md` §"Write the Workflow File" and the
|
|
505
|
+
`SessionContext` reference table.
|
|
506
|
+
|
|
507
|
+
**Detection.** Read `s.transcript(name).content` in the next stage and
|
|
508
|
+
log the length. A zero-length string, or raw JSON where you expected prose, is the signature of F9.
|
|
509
|
+
|
|
510
|
+
---
|
|
511
|
+
|
|
512
|
+
## Loud failures (throw, but still worth knowing)
|
|
513
|
+
|
|
514
|
+
### F10. Copilot: `sendAndWait` default 60s timeout throws
|
|
515
|
+
|
|
516
|
+
**Symptom.** `Timeout after 60000ms waiting for session.idle`. Every
|
|
517
|
+
subsequent `ctx.stage()` call never executes — the throw propagates out of
|
|
518
|
+
`run()` and halts the workflow.
|
|
519
|
+
|
|
520
|
+
**Root cause.** The raw Copilot SDK's `sendAndWait(options, timeout?)`
|
|
521
|
+
defaults to a 60-second timeout that throws on expiry. Real agent work
|
|
522
|
+
(planners, reviewers, orchestrators) routinely exceeds this.
|
|
523
|
+
|
|
524
|
+
**Fix.** Use `send` instead. Inside an Atomic stage the runtime wraps
|
|
525
|
+
`s.session.send()` so it blocks until `session.idle` with **no timeout** —
|
|
526
|
+
the same blocking semantics as Claude's `query()` and OpenCode's
|
|
527
|
+
`session.prompt()`. The wrapper lives in `wrapCopilotSend`
|
|
528
|
+
(`src/sdk/runtime/executor.ts`) and is installed per-stage.
|
|
529
|
+
|
|
530
|
+
```ts
|
|
531
|
+
// Correct: send() in an Atomic stage blocks until idle, no timeout.
|
|
532
|
+
await s.session.send({ prompt });
|
|
533
|
+
const messages = await s.session.getMessages(); // safe to read
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
**Do not reach for `sendAndWait` with a larger explicit timeout.** `send`
|
|
537
|
+
already waits for idle; `sendAndWait` just adds a throw-on-timeout failure
|
|
538
|
+
mode on top. If you catch yourself writing `sendAndWait(..., 5 * 60 * 1000)`
|
|
539
|
+
to "be safe", you want `send`.
|
|
540
|
+
|
|
541
|
+
---
|
|
542
|
+
|
|
543
|
+
### F11. Provider-level resume tries to swap agents
|
|
544
|
+
|
|
545
|
+
**Symptom.** Resumed Copilot / OpenCode session behaves as the original
|
|
546
|
+
agent instead of the requested new one — or the SDK throws "agent mismatch"
|
|
547
|
+
on resume.
|
|
548
|
+
|
|
549
|
+
**Root cause.** Each session is **bound to one agent at creation time**.
|
|
550
|
+
`resumeSession` reattaches the conversation but does not change the agent.
|
|
551
|
+
|
|
552
|
+
**Fix.** Use provider-level resume only for multi-turn work within the same
|
|
553
|
+
role. To swap agents, create a new session (fresh) and forward context via
|
|
554
|
+
F5's pattern. In normal workflow code, prefer a same-stage multi-turn session
|
|
555
|
+
over trying to reopen a prior stage.
|
|
556
|
+
|
|
557
|
+
---
|
|
558
|
+
|
|
559
|
+
### F12. Parallel siblings read each other's transcripts
|
|
560
|
+
|
|
561
|
+
**Symptom.** `s.transcript("sibling-name")` inside a parallel session
|
|
562
|
+
throws or returns empty.
|
|
563
|
+
|
|
564
|
+
**Root cause.** `s.transcript()` only exposes **prior completed sessions** —
|
|
565
|
+
ones whose callback has returned and whose saves have flushed. Sessions
|
|
566
|
+
launched concurrently via `Promise.all([ctx.stage(...), ctx.stage(...)])` run
|
|
567
|
+
at the same time; forward-only data flow is enforced.
|
|
568
|
+
|
|
569
|
+
**Fix.** Restructure into a linear chain or a "fan-out, then merge"
|
|
570
|
+
pattern where a subsequent session reads both, or use external
|
|
571
|
+
shared state (files, DB) if siblings genuinely need to coordinate.
|
|
572
|
+
|
|
573
|
+
```ts
|
|
574
|
+
// Fan-out → merge
|
|
575
|
+
// Handles are used here (s.transcript(handle)); prefer them over name strings whenever one is in scope.
|
|
576
|
+
const describe = await ctx.stage({ name: "describe" }, {}, {}, async (s) => { /* ... */ });
|
|
577
|
+
|
|
578
|
+
const [summarizeA, summarizeB] = await Promise.all([
|
|
579
|
+
ctx.stage({ name: "summarize-a" }, {}, {}, async (s) => {
|
|
580
|
+
const d = await s.transcript(describe); // OK — prior completed session (handle-based, preferred)
|
|
581
|
+
// s.transcript("summarize-b") would fail here — sibling not yet complete
|
|
582
|
+
}),
|
|
583
|
+
ctx.stage({ name: "summarize-b" }, {}, {}, async (s) => {
|
|
584
|
+
const d = await s.transcript(describe); // OK — prior completed session
|
|
585
|
+
}),
|
|
586
|
+
]);
|
|
587
|
+
|
|
588
|
+
await ctx.stage({ name: "merge" }, {}, {}, async (s) => {
|
|
589
|
+
const a = await s.transcript(summarizeA); // OK — handle-based, preferred over "summarize-a"
|
|
590
|
+
const b = await s.transcript(summarizeB);
|
|
591
|
+
});
|
|
592
|
+
```
|
|
593
|
+
|
|
594
|
+
---
|
|
595
|
+
|
|
596
|
+
### F13. Forgetting to `await` `ctx.stage()`
|
|
597
|
+
|
|
598
|
+
**Symptom.** A session runs (its tmux window opens, the agent does work)
|
|
599
|
+
but the orchestrator doesn't wait for it. Subsequent sessions that depend
|
|
600
|
+
on its output via `transcript()` or `getMessages()` see empty or missing
|
|
601
|
+
data. The workflow may finish "successfully" before the session's callback
|
|
602
|
+
has returned.
|
|
603
|
+
|
|
604
|
+
**Root cause.** `ctx.stage()` returns a `Promise<SessionHandle<T>>`.
|
|
605
|
+
Without `await`, the session is spawned but the `.run()` callback continues
|
|
606
|
+
immediately. The session's save may not have reached the `completedRegistry`
|
|
607
|
+
before downstream code tries to read it.
|
|
608
|
+
|
|
609
|
+
**Affected SDKs.** All three — this is a TypeScript control-flow bug, not
|
|
610
|
+
SDK-specific.
|
|
611
|
+
|
|
612
|
+
### ❌ Wrong
|
|
613
|
+
|
|
614
|
+
```ts
|
|
615
|
+
// Missing await — session fires but orchestrator doesn't wait
|
|
616
|
+
ctx.stage({ name: "research" }, {}, {}, async (s) => {
|
|
617
|
+
// ... agent work ...
|
|
618
|
+
s.save(s.sessionId);
|
|
619
|
+
});
|
|
620
|
+
|
|
621
|
+
// This runs before "research" completes
|
|
622
|
+
await ctx.stage({ name: "synthesize" }, {}, {}, async (s) => {
|
|
623
|
+
const r = await s.transcript("research"); // empty or throws
|
|
624
|
+
});
|
|
625
|
+
```
|
|
626
|
+
|
|
627
|
+
### ✅ Right
|
|
628
|
+
|
|
629
|
+
```ts
|
|
630
|
+
await ctx.stage({ name: "research" }, {}, {}, async (s) => {
|
|
631
|
+
// ... agent work ...
|
|
632
|
+
s.save(s.sessionId);
|
|
633
|
+
});
|
|
634
|
+
|
|
635
|
+
await ctx.stage({ name: "synthesize" }, {}, {}, async (s) => {
|
|
636
|
+
const r = await s.transcript("research"); // works
|
|
637
|
+
});
|
|
638
|
+
```
|
|
639
|
+
|
|
640
|
+
**Detection.** If a session's graph node shows as "running" while
|
|
641
|
+
downstream sessions are already executing, you likely dropped an `await`.
|
|
642
|
+
TypeScript's `@typescript-eslint/no-floating-promises` lint rule catches
|
|
643
|
+
this statically, at lint time.
|
|
644
|
+
|
|
645
|
+
---
|
|
646
|
+
|
|
647
|
+
### F14. Using a pending `SessionHandle` before completion
|
|
648
|
+
|
|
649
|
+
**Symptom.** `handle.result` is `undefined` or stale, or
|
|
650
|
+
`s.transcript(handle)` throws / returns empty even though the session
|
|
651
|
+
eventually completes.
|
|
652
|
+
|
|
653
|
+
**Root cause.** `ctx.stage()` resolves to a `SessionHandle<T>` whose
|
|
654
|
+
`.result` is only populated after the callback returns. If you store the
|
|
655
|
+
promise but access the handle before awaiting it, the result field is
|
|
656
|
+
not yet set and the session is not in the `completedRegistry`.
|
|
657
|
+
|
|
658
|
+
**Affected SDKs.** All three.
|
|
659
|
+
|
|
660
|
+
### ❌ Wrong
|
|
661
|
+
|
|
662
|
+
```ts
|
|
663
|
+
// Start both but access handles before awaiting
|
|
664
|
+
const handleA = ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return 42; });
|
|
665
|
+
const handleB = ctx.stage({ name: "b" }, {}, {}, async (s) => {
|
|
666
|
+
// handleA is a Promise, not a resolved SessionHandle
|
|
667
|
+
const transcript = await s.transcript(handleA); // fails
|
|
668
|
+
});
|
|
669
|
+
```
|
|
670
|
+
|
|
671
|
+
### ✅ Right
|
|
672
|
+
|
|
673
|
+
```ts
|
|
674
|
+
// Await first, then use the resolved handle
|
|
675
|
+
const handleA = await ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return 42; });
|
|
676
|
+
|
|
677
|
+
await ctx.stage({ name: "b" }, {}, {}, async (s) => {
|
|
678
|
+
const transcript = await s.transcript(handleA); // works — handleA is resolved
|
|
679
|
+
console.log(handleA.result); // 42
|
|
680
|
+
});
|
|
681
|
+
```
|
|
682
|
+
|
|
683
|
+
For parallel sessions, use `Promise.all()` and access handles only after
|
|
684
|
+
all promises resolve:
|
|
685
|
+
|
|
686
|
+
```ts
|
|
687
|
+
const [a, b] = await Promise.all([
|
|
688
|
+
ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return "x"; }),
|
|
689
|
+
ctx.stage({ name: "b" }, {}, {}, async (s) => { /* ... */ return "y"; }),
|
|
690
|
+
]);
|
|
691
|
+
// a.result === "x", b.result === "y"
|
|
692
|
+
```
|
|
693
|
+
|
|
694
|
+
**Detection.** TypeScript's type system helps — `ctx.stage()` returns
|
|
695
|
+
`Promise<SessionHandle<T>>`, not `SessionHandle<T>` directly. If you're
|
|
696
|
+
accessing `.result` without awaiting, the compiler reports that `result` does not exist on `Promise<SessionHandle<T>>`.
|
|
697
|
+
|
|
698
|
+
---
|
|
699
|
+
|
|
700
|
+
### F15. Headless stage errors are invisible in the graph
|
|
701
|
+
|
|
702
|
+
**Symptom.** A workflow fails but the graph shows all visible stages as
|
|
703
|
+
completed. The error message references a session name that doesn't appear
|
|
704
|
+
in the graph panel.
|
|
705
|
+
|
|
706
|
+
**Root cause.** Headless stages (`{ headless: true }`) are invisible in the
|
|
707
|
+
workflow graph — they have no graph node, no tmux window, and no pane
|
|
708
|
+
preview. When a headless stage throws, the error is recorded in the
|
|
709
|
+
`failedRegistry` and the workflow halts, but the failure is only visible in
|
|
710
|
+
the orchestrator's error output and the session's `error.txt` file on disk.
|
|
711
|
+
|
|
712
|
+
**Affected SDKs.** All three — this is an executor-level behavior, not
|
|
713
|
+
SDK-specific.
|
|
714
|
+
|
|
715
|
+
### ❌ Wrong — no error context for headless stages
|
|
716
|
+
|
|
717
|
+
```ts
|
|
718
|
+
// Headless stage fails silently in the graph
|
|
719
|
+
const [a, b, c] = await Promise.all([
|
|
720
|
+
ctx.stage({ name: "gather-a", headless: true }, {}, {}, async (s) => {
|
|
721
|
+
throw new Error("API key expired"); // Fails — no graph node to show red
|
|
722
|
+
}),
|
|
723
|
+
ctx.stage({ name: "gather-b", headless: true }, {}, {}, async (s) => { /* ... */ }),
|
|
724
|
+
ctx.stage({ name: "gather-c", headless: true }, {}, {}, async (s) => { /* ... */ }),
|
|
725
|
+
]);
|
|
726
|
+
```
|
|
727
|
+
|
|
728
|
+
### ✅ Right — wrap headless stages with descriptive error context
|
|
729
|
+
|
|
730
|
+
```ts
|
|
731
|
+
const [a, b, c] = await Promise.all([
|
|
732
|
+
ctx.stage({ name: "gather-a", headless: true }, {}, {}, async (s) => {
|
|
733
|
+
try {
|
|
734
|
+
return await doWork(s);
|
|
735
|
+
} catch (error) {
|
|
736
|
+
throw new Error(`[gather-a] ${error instanceof Error ? error.message : String(error)}`);
|
|
737
|
+
}
|
|
738
|
+
}),
|
|
739
|
+
// ... same pattern for b, c
|
|
740
|
+
]);
|
|
741
|
+
```
|
|
742
|
+
|
|
743
|
+
**Detection.** If a workflow fails and the graph shows no failed nodes,
|
|
744
|
+
check the orchestrator log (`orchestrator.log` in the session directory)
|
|
745
|
+
and look for `headless-<name>` in the error output. The file
|
|
746
|
+
at `~/.atomic/sessions/<run-id>/<name>-<id>/error.txt` contains the
|
|
747
|
+
full error for each failed headless stage.
|
|
748
|
+
|
|
749
|
+
---
|
|
750
|
+
|
|
751
|
+
### F16. Claude: importing the SDK `query()` inside a non-headless stage
|
|
752
|
+
|
|
753
|
+
**Symptom.** A reviewer / extractor / structured-output stage shows up in
|
|
754
|
+
the workflow graph as a tmux pane, but the pane sits idle on the Claude
|
|
755
|
+
welcome screen for the entire stage duration. The stage still produces a
|
|
756
|
+
result — but the visible session never moved. CPU and token cost double:
|
|
757
|
+
two Claude processes ran, one in the pane (idle) and one in-process (the
|
|
758
|
+
SDK call that actually did the work).
|
|
759
|
+
|
|
760
|
+
**Root cause.** The stage was registered without `headless: true`, so the
|
|
761
|
+
runtime spawned an interactive Claude TUI in a tmux pane and bound
|
|
762
|
+
`s.session` to it. The callback ignored that and called
|
|
763
|
+
`query()` from `@anthropic-ai/claude-agent-sdk` directly:
|
|
764
|
+
|
|
765
|
+
```ts
|
|
766
|
+
import { query } from "@anthropic-ai/claude-agent-sdk";
|
|
767
|
+
// ...
|
|
768
|
+
ctx.stage({ name: "review" }, {}, {}, async (s) => {
|
|
769
|
+
for await (const msg of query({ prompt, options: { outputFormat: ... } })) { /* ... */ }
|
|
770
|
+
});
|
|
771
|
+
```
|
|
772
|
+
|
|
773
|
+
That import bypasses `s.session` entirely. The runtime cannot route the
|
|
774
|
+
SDK call through the TUI it just started, so:
|
|
775
|
+
|
|
776
|
+
1. The visible pane never receives a prompt — the user sees a blank Claude
|
|
777
|
+
session in the graph.
|
|
778
|
+
2. A second Claude process spins up in the orchestrator process to service
|
|
779
|
+
the SDK call. Both processes count against rate limits and token spend.
|
|
780
|
+
3. Idle detection on the pane never fires because no prompt was ever sent;
|
|
781
|
+
the runtime relies on session-state events that won't arrive, and stage
|
|
782
|
+
completion happens only because the callback returned (not because the
|
|
783
|
+
pane finished work).
|
|
784
|
+
|
|
785
|
+
The runtime exposes exactly two routes for an SDK feature:
|
|
786
|
+
|
|
787
|
+
| You want to use… | Stage shape | Code in callback |
|
|
788
|
+
|---|---|---|
|
|
789
|
+
| `outputFormat`, custom `agents`, `maxBudgetUsd`, etc. **without** a visible pane | `{ headless: true }` | `s.session.query(prompt, sdkOptions)` — wraps `HeadlessClaudeSessionWrapper.query()` which forwards `options` to the SDK |
|
|
790
|
+
| The visible TUI with a subagent | omit `headless` and pass `chatFlags: ["--agent", "<name>", ...]` | `s.session.query(prompt)` — sends through tmux send-keys |
|
|
791
|
+
|
|
792
|
+
The one option that does **not** exist is "visible pane + in-process SDK call".
|
|
793
|
+
That combination is always wrong — pick one route or the other.
|
|
794
|
+
|
|
795
|
+
**Affected SDKs.** Claude only. Copilot and OpenCode don't expose a
|
|
796
|
+
parallel "import the bare SDK" foot-gun in this codebase.
|
|
797
|
+
|
|
798
|
+
### ❌ Wrong — visible pane + bypassed-SDK call
|
|
799
|
+
|
|
800
|
+
```ts
|
|
801
|
+
import { query as claudeSdkQuery } from "@anthropic-ai/claude-agent-sdk";
|
|
802
|
+
|
|
803
|
+
await ctx.stage({ name: "review" }, {}, {}, async (s) => {
|
|
804
|
+
// Visible TUI was started, but we're ignoring it.
|
|
805
|
+
for await (const msg of claudeSdkQuery({
|
|
806
|
+
prompt: reviewPrompt,
|
|
807
|
+
options: {
|
|
808
|
+
outputFormat: { type: "json_schema", schema: REVIEW_SCHEMA },
|
|
809
|
+
},
|
|
810
|
+
})) {
|
|
811
|
+
if (msg.type === "result") { /* ... */ }
|
|
812
|
+
}
|
|
813
|
+
s.save(s.sessionId);
|
|
814
|
+
});
|
|
815
|
+
```
|
|
816
|
+
|
|
817
|
+
### ✅ Right (a) — visible TUI with subagent + chatFlags
|
|
818
|
+
|
|
819
|
+
When you want the user to watch the review happen, run the subagent in
|
|
820
|
+
the pane via `--agent` and parse JSON out of the assistant text. The
|
|
821
|
+
prompt should enumerate the schema fields so the model emits matching
|
|
822
|
+
JSON; a tolerant parser (last-fenced-block + last-balanced-object
|
|
823
|
+
fallback, F8) handles any prose the model adds:
|
|
824
|
+
|
|
825
|
+
```ts
|
|
826
|
+
await ctx.stage(
|
|
827
|
+
{ name: "review" },
|
|
828
|
+
{ chatFlags: ["--agent", "reviewer", "--allow-dangerously-skip-permissions", "--dangerously-skip-permissions"] },
|
|
829
|
+
{},
|
|
830
|
+
async (s) => {
|
|
831
|
+
const messages = await s.session.query(reviewPrompt);
|
|
832
|
+
s.save(s.sessionId);
|
|
833
|
+
return parseReviewResult(extractAssistantText(messages, 0));
|
|
834
|
+
},
|
|
835
|
+
);
|
|
836
|
+
```
|
|
837
|
+
|
|
838
|
+
This is the pattern used by `src/sdk/workflows/builtin/ralph/claude/index.ts`
|
|
839
|
+
for its planner, orchestrator, reviewer, and debugger stages.
|
|
840
|
+
|
|
841
|
+
### ✅ Right (b) — headless stage with SDK options via `s.session.query()`
|
|
842
|
+
|
|
843
|
+
When you don't need the pane (e.g. background data gathering), set
|
|
844
|
+
`headless: true` and pass SDK options as the second argument to
|
|
845
|
+
`s.session.query()`. The runtime uses `HeadlessClaudeSessionWrapper`,
|
|
846
|
+
which calls the SDK's `query()` in-process and exposes the full options
|
|
847
|
+
surface (`agent`, `outputFormat`, `permissionMode`, `maxBudgetUsd`, etc.):
|
|
848
|
+
|
|
849
|
+
```ts
|
|
850
|
+
await ctx.stage(
|
|
851
|
+
{ name: "review", headless: true },
|
|
852
|
+
{}, {},
|
|
853
|
+
async (s) => {
|
|
854
|
+
const messages = await s.session.query(reviewPrompt, {
|
|
855
|
+
agent: "reviewer",
|
|
856
|
+
permissionMode: "bypassPermissions",
|
|
857
|
+
allowDangerouslySkipPermissions: true,
|
|
858
|
+
});
|
|
859
|
+
s.save(s.sessionId);
|
|
860
|
+
return extractAssistantText(messages, 0);
|
|
861
|
+
},
|
|
862
|
+
);
|
|
863
|
+
```
|
|
864
|
+
|
|
865
|
+
> **Note on `--json-schema`.** The CLI's `--json-schema` flag requires
|
|
866
|
+
> `-p` (print mode) and therefore can't be passed via `chatFlags` to the
|
|
867
|
+
> interactive TUI. If you need SDK-validated structured output, use route
|
|
868
|
+
> (b) — set `headless: true` and pass `outputFormat: { type: "json_schema", schema }`
|
|
869
|
+
> in the `s.session.query()` options. Pair (a)'s visible TUI with a
|
|
870
|
+
> tolerant JSON parser instead. (Note: `s.session.query()`'s headless
|
|
871
|
+
> wrapper currently returns `SessionMessage[]` and discards the SDK
|
|
872
|
+
> result event's `structured_output` field — for now, parse JSON out of
|
|
873
|
+
> the assistant text either way.)
|
|
874
|
+
|
|
875
|
+
**Detection.**
|
|
876
|
+
1. Grep your workflow for `from "@anthropic-ai/claude-agent-sdk"` —
|
|
877
|
+
`query`, `tool`, `createSdkMcpServer` and similar imports inside a
|
|
878
|
+
`.run()` callback are the smell. Workflow code should import from
|
|
879
|
+
`@bastani/atomic/workflows` and access the SDK exclusively through
|
|
880
|
+
`s.client` and `s.session`.
|
|
881
|
+
2. Watch the workflow run. If a visible pane shows the Claude welcome
|
|
882
|
+
screen for the entire duration of a stage and never receives a prompt,
|
|
883
|
+
you have F16.
|
|
884
|
+
3. Cost monitoring. F16 roughly doubles the Claude process count — if
|
|
885
|
+
stage spend looks 2× a single run, audit imports.
|
|
886
|
+
|
|
887
|
+
---
|
|
888
|
+
|
|
889
|
+
## Design checklist
|
|
890
|
+
|
|
891
|
+
Before shipping a multi-session workflow, walk the list:
|
|
892
|
+
|
|
893
|
+
- [ ] Copilot stages use `s.session.send` by default; `sendAndWait` only with an explicit user-requested timeout (F10)
|
|
894
|
+
- [ ] Every fresh-session handoff forwards context explicitly (F5)
|
|
895
|
+
- [ ] Every prompt whose output feeds a downstream stage explicitly requests trailing commentary (F6)
|
|
896
|
+
- [ ] Response-text extraction uses the per-SDK correct pattern (F1-F4)
|
|
897
|
+
- [ ] Structured-output parsers extract the LAST fenced block, not the first (F8)
|
|
898
|
+
- [ ] `s.save()` receives the per-SDK correct shape — Copilot saves the full `await s.session.getMessages()` array (F9)
|
|
899
|
+
- [ ] Loops over 10 iterations have a compaction / reset strategy (F7)
|
|
900
|
+
- [ ] Parallel groups only read from prior completed sessions, never siblings (F12)
|
|
901
|
+
- [ ] Every `ctx.stage()` call is `await`ed (F13)
|
|
902
|
+
- [ ] `SessionHandle` values are only used after the promise resolves (F14)
|
|
903
|
+
- [ ] If provider-level resume/fork is used at all, it stays within the same agent role (F11)
|
|
904
|
+
- [ ] Headless stage callbacks include descriptive error context so failures can be diagnosed without a graph node (F15)
|
|
905
|
+
- [ ] Claude stages never import `query` (or other entry points) from `@anthropic-ai/claude-agent-sdk` directly — go through `s.session.query()` so the runtime routes to the TUI (interactive) or the SDK (headless) consistently (F16)
|
|
906
|
+
- [ ] No duplicate `${agent}/${name}` registrations in the composition root (F17)
|
|
907
|
+
- [ ] Every `ctx.stage()` callback contains at least one LLM call (`s.session.query` / `s.session.send` / `s.client.session.prompt`); stages that are pure deterministic code have been demoted to plain TypeScript in `.run()` (F22)
|
|
908
|
+
|
|
909
|
+
---
|
|
910
|
+
|
|
911
|
+
### F17. Duplicate registration throws at composition root
|
|
912
|
+
|
|
913
|
+
**Symptom.** `createRegistry().register(wf)` throws immediately when
|
|
914
|
+
`wf` has the same `${agent}/${name}` key as an already-registered workflow:
|
|
915
|
+
|
|
916
|
+
```
|
|
917
|
+
[atomic] Duplicate workflow registration: "claude/my-workflow" is already registered.
|
|
918
|
+
Each (agent, name) pair must be unique.
|
|
919
|
+
```
|
|
920
|
+
|
|
921
|
+
**Fix.** Ensure each `(agent, name)` pair appears exactly once in the
|
|
922
|
+
composition root. Two cross-agent variants of the same logical workflow
|
|
923
|
+
(`"claude/ralph"` + `"copilot/ralph"`) are distinct keys — register both
|
|
924
|
+
without conflict.
|
|
925
|
+
|
|
926
|
+
---
|
|
927
|
+
|
|
928
|
+
### F22. `ctx.stage()` with no LLM query spawns an empty, idle pane
|
|
929
|
+
|
|
930
|
+
**Symptom.** A stage in the workflow graph opens a tmux window, the agent
|
|
931
|
+
CLI boots up, and then the pane just... sits there. No prompt ever gets
|
|
932
|
+
sent; the pane shows the Claude / Copilot / OpenCode welcome screen for
|
|
933
|
+
the entire stage duration. Users watching the graph see a completed stage
|
|
934
|
+
node whose pane was visibly empty, ask "why didn't it do anything?", and
|
|
935
|
+
lose trust in the workflow.
|
|
936
|
+
|
|
937
|
+
**Root cause.** A `ctx.stage()` callback that contains only deterministic
|
|
938
|
+
TypeScript — file I/O, `fetch()`, `child_process.exec`, `JSON.parse`, a
|
|
939
|
+
git command, a helper function — but no `s.session.query()` /
|
|
940
|
+
`s.session.send()` / `s.client.session.prompt()` call. The runtime sees
|
|
941
|
+
the stage as a valid unit of work (it spins up the pane, creates the SDK
|
|
942
|
+
session, runs the callback, tears everything down), but the session
|
|
943
|
+
itself never receives a prompt. Token cost is near-zero but the UX cost is
|
|
944
|
+
high: the empty pane is indistinguishable from a broken stage.
|
|
945
|
+
|
|
946
|
+
**Affected SDKs.** All three. The symptom is most obvious with Claude
|
|
947
|
+
because the pane is a full interactive TUI; Copilot and OpenCode show a
|
|
948
|
+
similarly idle welcome screen.
|
|
949
|
+
|
|
950
|
+
### ❌ Wrong — pure-TS work wrapped in a stage
|
|
951
|
+
|
|
952
|
+
```ts
|
|
953
|
+
// The previous stage returned a plan object. We want to write it to disk
|
|
954
|
+
// and set up some scratch directories before the next LLM call.
|
|
955
|
+
await ctx.stage({ name: "prepare-workspace" }, {}, {}, async (s) => {
|
|
956
|
+
await fs.mkdir(".atomic/scratch", { recursive: true });
|
|
957
|
+
await fs.writeFile(".atomic/scratch/plan.json", JSON.stringify(plan.result));
|
|
958
|
+
execSync("git checkout -b ralph/wip");
|
|
959
|
+
// ⚠️ No LLM call. A tmux pane opens, Claude boots, nothing ever
|
|
960
|
+
// gets typed into it, the stage "completes", pane tears down.
|
|
961
|
+
});
|
|
962
|
+
```
|
|
963
|
+
|
|
964
|
+
### ✅ Right (a) — lift pure-TS work into `.run()` directly
|
|
965
|
+
|
|
966
|
+
```ts
|
|
967
|
+
// Run deterministic setup at the orchestrator level, outside any stage.
|
|
968
|
+
// No pane, no graph node, no confusion.
|
|
969
|
+
await fs.mkdir(".atomic/scratch", { recursive: true });
|
|
970
|
+
await fs.writeFile(".atomic/scratch/plan.json", JSON.stringify(plan.result));
|
|
971
|
+
execSync("git checkout -b ralph/wip");
|
|
972
|
+
|
|
973
|
+
// Next stage actually uses the LLM — a pane here makes sense.
|
|
974
|
+
await ctx.stage({ name: "implement" }, {}, {}, async (s) => {
|
|
975
|
+
await s.session.query("Implement the plan in .atomic/scratch/plan.json.");
|
|
976
|
+
s.save(s.sessionId);
|
|
977
|
+
});
|
|
978
|
+
```
|
|
979
|
+
|
|
980
|
+
### ✅ Right (b) — bundle deterministic work into the nearest LLM stage
|
|
981
|
+
|
|
982
|
+
When the TS logic is conceptually bound to a specific LLM call (e.g.
|
|
983
|
+
validating the query's response, writing a derived artifact from the
|
|
984
|
+
assistant text), put it inside the same callback:
|
|
985
|
+
|
|
986
|
+
```ts
|
|
987
|
+
await ctx.stage({ name: "plan" }, {}, {}, async (s) => {
|
|
988
|
+
const messages = await s.session.query("Produce a plan as JSON.");
|
|
989
|
+
const text = extractAssistantText(messages, 0);
|
|
990
|
+
const plan = parsePlan(text); // deterministic — fine here
|
|
991
|
+
validatePlan(plan); // deterministic — fine here
|
|
992
|
+
await fs.writeFile("plan.json", JSON.stringify(plan)); // fine here
|
|
993
|
+
s.save(plan);
|
|
994
|
+
return plan;
|
|
995
|
+
});
|
|
996
|
+
```
|
|
997
|
+
|
|
998
|
+
**Detection.**
|
|
999
|
+
|
|
1000
|
+
1. Grep every `ctx.stage()` callback for at least one of:
|
|
1001
|
+
`s.session.query`, `s.session.send`, `s.client.session.prompt`. A
|
|
1002
|
+
callback with none is an F22 candidate.
|
|
1003
|
+
2. Watch the workflow run in the TUI. If a stage's pane shows only the
|
|
1004
|
+
agent welcome banner for the whole duration and closes without ever
|
|
1005
|
+
echoing a prompt, you have F22.
|
|
1006
|
+
3. Any stage whose only callback statements are `await fs.*`,
|
|
1007
|
+
`execSync`, `fetch`, `s.save(...)`, or pure data manipulation
|
|
1008
|
+
is almost certainly F22 — there's no reason to pay for a pane just to
|
|
1009
|
+
run TypeScript the orchestrator could run directly.
|
|
1010
|
+
|
|
1011
|
+
**Legitimate exception.** Stages that spawn subordinate LLM work via
|
|
1012
|
+
`s.stage()` (nested sub-sessions) are fine — the child stages carry the
|
|
1013
|
+
LLM calls and the parent acts as a grouping scope. This pattern is rare
|
|
1014
|
+
and usually better expressed with headless fan-out + `Promise.all`.
|