@bastani/atomic 0.5.34-0 → 0.6.0-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +329 -50
  2. package/dist/commands/cli/session.d.ts +67 -0
  3. package/dist/commands/cli/session.d.ts.map +1 -0
  4. package/dist/commands/cli/workflow-status.d.ts +63 -0
  5. package/dist/commands/cli/workflow-status.d.ts.map +1 -0
  6. package/dist/sdk/commander.d.ts +74 -0
  7. package/dist/sdk/commander.d.ts.map +1 -0
  8. package/dist/sdk/components/workflow-picker-panel.d.ts +14 -17
  9. package/dist/sdk/components/workflow-picker-panel.d.ts.map +1 -1
  10. package/dist/sdk/define-workflow.d.ts +18 -9
  11. package/dist/sdk/define-workflow.d.ts.map +1 -1
  12. package/dist/sdk/index.d.ts +4 -3
  13. package/dist/sdk/index.d.ts.map +1 -1
  14. package/dist/sdk/management-commands.d.ts +42 -0
  15. package/dist/sdk/management-commands.d.ts.map +1 -0
  16. package/dist/sdk/registry.d.ts +27 -0
  17. package/dist/sdk/registry.d.ts.map +1 -0
  18. package/dist/sdk/runtime/attached-footer.d.ts +1 -1
  19. package/dist/sdk/runtime/executor-env.d.ts +20 -0
  20. package/dist/sdk/runtime/executor-env.d.ts.map +1 -0
  21. package/dist/sdk/runtime/executor.d.ts +61 -10
  22. package/dist/sdk/runtime/executor.d.ts.map +1 -1
  23. package/dist/sdk/types.d.ts +147 -4
  24. package/dist/sdk/types.d.ts.map +1 -1
  25. package/dist/sdk/worker-shared.d.ts +42 -0
  26. package/dist/sdk/worker-shared.d.ts.map +1 -0
  27. package/dist/sdk/workflow-cli.d.ts +103 -0
  28. package/dist/sdk/workflow-cli.d.ts.map +1 -0
  29. package/dist/sdk/workflows/builtin-registry.d.ts +113 -0
  30. package/dist/sdk/workflows/builtin-registry.d.ts.map +1 -0
  31. package/dist/sdk/workflows/index.d.ts +5 -5
  32. package/dist/sdk/workflows/index.d.ts.map +1 -1
  33. package/package.json +12 -8
  34. package/src/cli.ts +85 -144
  35. package/src/commands/cli/chat/index.ts +10 -0
  36. package/src/commands/cli/workflow-command.test.ts +279 -938
  37. package/src/commands/cli/workflow-inputs.test.ts +41 -11
  38. package/src/commands/cli/workflow-inputs.ts +47 -12
  39. package/src/commands/cli/workflow-list.test.ts +234 -0
  40. package/src/commands/cli/workflow-list.ts +0 -0
  41. package/src/commands/cli/workflow.ts +11 -798
  42. package/src/scripts/constants.ts +2 -1
  43. package/src/sdk/commander.ts +161 -0
  44. package/src/sdk/components/workflow-picker-panel.tsx +78 -258
  45. package/src/sdk/define-workflow.test.ts +104 -11
  46. package/src/sdk/define-workflow.ts +47 -11
  47. package/src/sdk/errors.test.ts +16 -0
  48. package/src/sdk/index.ts +8 -8
  49. package/src/sdk/management-commands.ts +151 -0
  50. package/src/sdk/registry.ts +132 -0
  51. package/src/sdk/runtime/attached-footer.ts +1 -1
  52. package/src/sdk/runtime/executor-env.ts +45 -0
  53. package/src/sdk/runtime/executor.test.ts +37 -0
  54. package/src/sdk/runtime/executor.ts +147 -68
  55. package/src/sdk/types.ts +169 -4
  56. package/src/sdk/worker-shared.test.ts +163 -0
  57. package/src/sdk/worker-shared.ts +155 -0
  58. package/src/sdk/workflow-cli.ts +409 -0
  59. package/src/sdk/workflows/builtin/deep-research-codebase/claude/index.ts +1 -1
  60. package/src/sdk/workflows/builtin/deep-research-codebase/copilot/index.ts +1 -1
  61. package/src/sdk/workflows/builtin/deep-research-codebase/opencode/index.ts +1 -1
  62. package/src/sdk/workflows/builtin/open-claude-design/claude/index.ts +1 -1
  63. package/src/sdk/workflows/builtin/open-claude-design/copilot/index.ts +1 -1
  64. package/src/sdk/workflows/builtin/open-claude-design/opencode/index.ts +1 -1
  65. package/src/sdk/workflows/builtin/ralph/claude/index.ts +1 -1
  66. package/src/sdk/workflows/builtin/ralph/copilot/index.ts +1 -1
  67. package/src/sdk/workflows/builtin/ralph/opencode/index.ts +1 -1
  68. package/src/sdk/workflows/builtin-registry.ts +23 -0
  69. package/src/sdk/workflows/index.ts +10 -20
  70. package/src/services/system/auth.test.ts +63 -1
  71. package/.agents/skills/workflow-creator/SKILL.md +0 -334
  72. package/.agents/skills/workflow-creator/references/agent-sessions.md +0 -888
  73. package/.agents/skills/workflow-creator/references/computation-and-validation.md +0 -201
  74. package/.agents/skills/workflow-creator/references/control-flow.md +0 -470
  75. package/.agents/skills/workflow-creator/references/discovery-and-verification.md +0 -232
  76. package/.agents/skills/workflow-creator/references/failure-modes.md +0 -903
  77. package/.agents/skills/workflow-creator/references/getting-started.md +0 -275
  78. package/.agents/skills/workflow-creator/references/running-workflows.md +0 -235
  79. package/.agents/skills/workflow-creator/references/session-config.md +0 -384
  80. package/.agents/skills/workflow-creator/references/state-and-data-flow.md +0 -357
  81. package/.agents/skills/workflow-creator/references/user-input.md +0 -234
  82. package/.agents/skills/workflow-creator/references/workflow-inputs.md +0 -272
  83. package/dist/sdk/runtime/discovery.d.ts +0 -132
  84. package/dist/sdk/runtime/discovery.d.ts.map +0 -1
  85. package/dist/sdk/runtime/executor-entry.d.ts +0 -11
  86. package/dist/sdk/runtime/executor-entry.d.ts.map +0 -1
  87. package/dist/sdk/runtime/loader.d.ts +0 -70
  88. package/dist/sdk/runtime/loader.d.ts.map +0 -1
  89. package/dist/version.d.ts +0 -2
  90. package/dist/version.d.ts.map +0 -1
  91. package/src/commands/cli/workflow.test.ts +0 -317
  92. package/src/sdk/runtime/discovery.ts +0 -368
  93. package/src/sdk/runtime/executor-entry.ts +0 -18
  94. package/src/sdk/runtime/loader.ts +0 -267
@@ -1,903 +0,0 @@
1
- # Failure Modes
2
-
3
- Common, **silent** ways workflows break across Claude Code, Copilot CLI, and
4
- OpenCode — and the wrong-vs-right patterns to avoid them.
5
-
6
- **Read this before you ship a multi-session workflow.** Most failures here
7
- don't throw — they produce degraded output that looks plausible, which is
8
- the hardest kind of bug to catch in review.
9
-
10
- ## When to consult
11
-
12
- - Before writing a planner → orchestrator → reviewer handoff (Copilot / OpenCode)
13
- - When a stage receives context from a prior stage and the output smells off
14
- - When a review/fix loop works on small inputs but drifts on large ones
15
- - When a JSON/markdown parser in a helper stops matching the model's output
16
- - When you cannot explain where a particular sentence in a downstream prompt came from
17
-
18
- ## Silent vs. loud
19
-
20
- | Severity | What happens | Detection |
21
- |---|---|---|
22
- | **Silent** | Wrong output, no exception. Downstream stages consume garbage. | Requires end-to-end observation. Easy to miss in review. |
23
- | **Loud** | Exception thrown, stage aborts. | Stack trace surfaces in logs. |
24
-
25
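- Silent failures F1–F9 are catalogued first below, followed by the loud failures F10–F12. Later entries (F13 onward) continue in numeric order; check the quick reference table for each one's severity.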
26
-
27
- ---
28
-
29
- ## Quick reference
30
-
31
- | # | Failure | Affected | Silent? |
32
- |---|---|---|---|
33
- | [F1](#f1-copilot-getlastassistanttext-returns-empty-string) | Copilot: `getLastAssistantText` returns empty string | Copilot | silent |
34
- | [F2](#f2-copilot-subagent-messages-pollute-getmessages-stream) | Copilot: subagent messages pollute `getMessages()` stream | Copilot | silent |
35
- | [F3](#f3-opencode-resultdataparts-contains-non-text-parts) | OpenCode: `result.data.parts` contains non-text parts | OpenCode | silent |
36
- | [F4](#f4-claude-ssessionquery-returns-sessionmessage-extract-text-with-extractassistanttext) | Claude: `s.session.query()` returns `SessionMessage[]` — extract text with `extractAssistantText(result, 0)` | Claude | silent |
37
- | [F5](#f5-fresh-session-wipes-prior-stage-context) | Fresh session wipes prior stage context | Copilot, OpenCode | silent |
38
- | [F6](#f6-planner-prompts-that-dont-request-trailing-commentary-produce-empty-handoffs) | Planner prompts that don't request trailing commentary produce empty handoffs | all | silent |
39
- | [F7](#f7-continued-sessions-accumulate-state-across-loop-iterations-lost-in-middle) | Continued sessions accumulate state across loop iterations (lost-in-middle) | all | silent |
40
- | [F8](#f8-fenced-block-parsers-break-when-the-model-adds-prose) | Fenced-block parsers break when the model adds prose before/after | all | silent |
41
- | [F9](#f9-ssave-receives-the-wrong-shape) | `s.save()` receives the wrong shape for the SDK | all | silent |
42
- | [F10](#f10-copilot-sendandwait-default-60s-timeout-throws) | Copilot: `sendAndWait` default 60s timeout throws (use `send` by default) | Copilot | loud |
43
- | [F11](#f11-provider-level-resume-tries-to-swap-agents) | Provider-level resume tries to swap agents | Copilot, OpenCode | loud |
44
- | [F12](#f12-parallel-siblings-read-each-others-transcripts) | Parallel siblings read each other's transcripts | all | loud |
45
- | [F13](#f13-forgetting-to-await-ctxstage) | Forgetting to `await` `ctx.stage()` | all | silent |
46
- | [F14](#f14-using-a-pending-sessionhandle-before-completion) | Using a pending `SessionHandle` before completion | all | silent |
47
- | [F15](#f15-headless-stage-errors-are-invisible-in-the-graph) | Headless stage errors are invisible in the graph | all | silent |
48
- | [F16](#f16-claude-importing-sdk-query-inside-a-non-headless-stage) | Claude: importing the SDK `query()` inside a non-headless stage (anti-pattern) | Claude | silent |
49
-
50
- ---
51
-
52
- ## Silent failures
53
-
54
- ### F1. Copilot: `getLastAssistantText` returns empty string
55
-
56
- **Symptom.** The orchestrator (or any downstream stage) receives an empty
57
- `plannerNotes` / `reviewerOutput` despite the prior agent running successfully
58
- and producing visible output in the TUI.
59
-
60
- **Root cause.** Copilot emits an **empty terminating `assistant.message` event**
61
- after every turn that included a tool call. The actual prose + toolRequests
62
- live in the earlier `assistant.message` event; the trailing one has
63
- `content: ""` and no `toolRequests`. Picking `.at(-1).data.content` reliably
64
- lands on the empty terminator and throws away the real content.
65
-
66
- Verified empirically with a toy script against Copilot CLI 1.0.22: a
67
- single-turn "think then call tool" prompt produced 2 assistant.message
68
- events, `[{length: 512, toolRequests: 1}, {length: 0, toolRequests: 0}]`.
69
- The second one is what `.at(-1)` returns.
70
-
71
- The event type carries both `content: string` and `toolRequests?: [...]` —
72
- see `node_modules/@github/copilot-sdk/dist/generated/session-events.d.ts:1408-1455`.
73
-
74
- This means the bug affects **any** stage whose final turn includes a tool
75
- call — not just tool-calls-only turns. Planner, reviewer, debugger, and
76
- orchestrator stages all hit it if they end on a tool invocation.
77
-
78
- **Affected SDKs.** Copilot only.
79
-
80
- ### ❌ Wrong
81
-
82
- ```ts
83
- function getLastAssistantText(messages: SessionEvent[]): string {
84
- const assistantMessages = messages.filter(
85
- (m): m is Extract<SessionEvent, { type: "assistant.message" }> =>
86
- m.type === "assistant.message",
87
- );
88
- return assistantMessages.at(-1)?.data.content ?? "";
89
- }
90
- ```
91
-
92
- ### ✅ Right
93
-
94
- ```ts
95
- /** Concatenate every top-level assistant turn's non-empty content. */
96
- function getAssistantText(messages: SessionEvent[]): string {
97
- return messages
98
- .filter(
99
- (m): m is Extract<SessionEvent, { type: "assistant.message" }> =>
100
- m.type === "assistant.message" && !m.data.parentToolCallId,
101
- )
102
- .map((m) => m.data.content)
103
- .filter((c) => c.length > 0)
104
- .join("\n\n");
105
- }
106
- ```
107
-
108
- **Detection.** Log the returned text length after every `getAssistantText`
109
- call during development. An empty or surprisingly short string for a stage
110
- that clearly ran is the signature.
111
-
112
- ---
113
-
114
- ### F2. Copilot: subagent messages pollute `getMessages()` stream
115
-
116
- **Symptom.** Downstream stages receive a snippet of text that doesn't match
117
- what the top-level agent said — it looks like a subagent's output.
118
-
119
- **Root cause.** `assistant.message` events carry a `parentToolCallId?: string`
120
- field, documented as *"Tool call ID of the parent tool invocation when this
121
- event originates from a subagent"*. When the top-level agent delegates,
122
- `getMessages()` returns **the complete history including subagent messages**.
123
- Filters that don't exclude `parentToolCallId` can pick a subagent's final
124
- message via `.at(-1)`.
125
-
126
- **Affected SDKs.** Copilot.
127
-
128
- ### ❌ Wrong
129
-
130
- ```ts
131
- messages.filter((m) => m.type === "assistant.message")
132
- ```
133
-
134
- ### ✅ Right
135
-
136
- ```ts
137
- messages.filter(
138
- (m) => m.type === "assistant.message" && !m.data.parentToolCallId,
139
- )
140
- ```
141
-
142
- **Detection.** As with F1, inspect the extracted text during development: diff it against the TUI
142
- scrollback for the top-level agent.
144
-
145
- ---
146
-
147
- ### F3. OpenCode: `result.data.parts` contains non-text parts
148
-
149
- **Symptom.** Concatenated response text contains `[object Object]`,
150
- truncated content, or swallows tool-call payloads into the prompt.
151
-
152
- **Root cause.** `client.session.prompt()` returns `result.data.parts: Part[]`
153
- where parts can be `type: "text" | "tool" | "file" | "reasoning" | ...`.
154
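- Naive `.map(p => p.text).join()` yields `undefined` for each non-text part — `join` renders those as empty strings, while string concatenation or template literals emit the literal text `undefined`.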
155
-
156
- **Affected SDKs.** OpenCode.
157
-
158
- ### ❌ Wrong
159
-
160
- ```ts
161
- const text = result.data!.parts.map((p) => p.text).join("\n");
162
- ```
163
-
164
- ### ✅ Right
165
-
166
- ```ts
167
- function extractResponseText(
168
- parts: Array<{ type: string; [key: string]: unknown }>,
169
- ): string {
170
- return parts
171
- .filter((p) => p.type === "text")
172
- .map((p) => (p as { type: string; text: string }).text)
173
- .join("\n");
174
- }
175
- ```
176
-
177
- **Detection.** Grep extracted text for `[object Object]` or `undefined`.
178
-
179
- ---
180
-
181
- ### F4. Claude: `s.session.query()` returns `SessionMessage[]` — extract text with `extractAssistantText`
182
-
183
- **Symptom.** Workflow code tries to access `.output` or `.text` on the
184
- result of `s.session.query()` and gets `undefined`, or passes the result
185
- directly to a string parser that throws.
186
-
187
- **Root cause.** `s.session.query()` returns `SessionMessage[]` — the native
188
- Claude Agent SDK type. It does NOT return a `{ output: string }` object or a
189
- raw TUI scrollback string. The assistant's text lives inside structured content
190
- blocks within those messages and must be extracted explicitly.
191
-
192
- **Affected SDKs.** Claude.
193
-
194
- ### ❌ Wrong
195
-
196
- ```ts
197
- // result is SessionMessage[], not { output: string }
198
- const result = await s.session.query(prompt);
199
- const parsed = JSON.parse(result.output); // SyntaxError: result.output is undefined, which is not valid JSON
200
- ```
201
-
202
- ### ✅ Right — use `extractAssistantText(result, 0)`
203
-
204
- ```ts
205
- import { extractAssistantText } from "@bastani/atomic/workflows";
206
-
207
- const result = await s.session.query(prompt);
208
- const text = extractAssistantText(result, 0);
209
- // Now `text` is the concatenated assistant prose for this turn
210
- ```
211
-
212
- `extractAssistantText(msgs, afterIndex)` walks `SessionMessage[]` from
213
- `afterIndex` forward, pulls `TextBlock.text` from each `assistant` message's
214
- content array, and joins them with newlines.
215
-
216
- The ralph helpers in `src/sdk/workflows/builtin/ralph/helpers/prompts.ts`
217
- (`parseReviewResult`, `extractMarkdownBlock`) use this pattern — always
218
- extract text first, then parse.
219
-
220
- **Detection.** Log `typeof result` after `s.session.query()`. If it's
221
- `object` (an array), you need `extractAssistantText`. Accessing `.output`
222
- on an array returns `undefined`.
223
-
224
- ---
225
-
226
- ### F5. Fresh session wipes prior stage context
227
-
228
- **Symptom.** The orchestrator says "I don't see a task list" or "what
229
- specification are you referring to?" even though the planner clearly ran.
230
-
231
- **Root cause.** `client.createSession()` / `client.session.create()` always
232
- returns a **fresh, empty conversation**. The CLIENT object is just the
233
- transport — each session is independent. The new session sees only what you
234
- put in its first prompt.
235
-
236
- **Affected SDKs.** Copilot, OpenCode. (Claude's session model is
237
- different — context accumulates within the same SDK session, so this failure
238
- mode does NOT apply to `s.session.query()`.)
239
-
240
- ### ❌ Wrong
241
-
242
- ```ts
243
- await ctx.stage({ name: "planner" }, {}, { agent: "planner" }, async (s) => {
244
- await s.session.send({ prompt: buildPlannerPrompt((s.inputs.prompt ?? "")) });
245
- s.save(await s.session.getMessages());
246
- });
247
- // orchestrator is a fresh session — it has no idea what the planner produced
248
- await ctx.stage({ name: "orchestrator" }, {}, { agent: "orchestrator" }, async (s) => {
249
- await s.session.send({ prompt: buildOrchestratorPrompt() });
250
- s.save(await s.session.getMessages());
251
- });
252
- ```
253
-
254
- ### ✅ Right — explicit handoff
255
-
256
- ```ts
257
- const plannerHandle = await ctx.stage(
258
- { name: "planner" },
259
- {},
260
- { agent: "planner" },
261
- async (s) => {
262
- await s.session.send({ prompt: buildPlannerPrompt((s.inputs.prompt ?? "")) });
263
- const messages = await s.session.getMessages();
264
- s.save(messages);
265
- return getAssistantText(messages); // see F1 for getAssistantText
266
- },
267
- );
268
-
269
- await ctx.stage(
270
- { name: "orchestrator" },
271
- {},
272
- { agent: "orchestrator" },
273
- async (s) => {
274
- await s.session.send({
275
- prompt: buildOrchestratorPrompt(
276
- (s.inputs.prompt ?? ""),
277
- { plannerNotes: plannerHandle.result },
278
- ),
279
- });
280
- s.save(await s.session.getMessages());
281
- },
282
- );
283
- ```
284
-
285
- Alternatives: write to shared state (`TaskCreate`/`TaskList`, files, git) and
286
- have the next stage read from there, or keep the follow-up inside the same
287
- stage callback when it needs the full live conversation. Provider-level resume
288
- is an advanced same-role escape hatch, not the normal stage-to-stage handoff.
289
-
290
- **Full write-up.** `agent-sessions.md` §"Critical pitfall: session lifecycle
291
- controls what context is available".
292
-
293
- ---
294
-
295
- ### F6. Planner prompts that don't request trailing commentary produce empty handoffs
296
-
297
- **Symptom.** F1 / F5 are fixed, extraction is correct — and the orchestrator
298
- still receives empty `plannerNotes` because the planner's last turn legitimately
299
- had no prose.
300
-
301
- **Root cause.** This is a **prompt engineering** bug, not a code bug. When a
302
- prompt ends with "call `TaskList` to verify" and does not explicitly ask for
303
- trailing commentary, many models end the turn with just the tool call and
304
- no text at all. There's nothing in any turn's `content` to extract because
305
- the model never wrote any.
306
-
307
- **Affected SDKs.** All three — though Claude's pane scrollback masks it by
308
- still capturing something visible.
309
-
310
- ### ❌ Wrong — silent handoff
311
-
312
- ```ts
313
- return `# Planning
314
-
315
- ${spec}
316
-
317
- Decompose the specification into tasks via TaskCreate. After creating all
318
- tasks, call TaskList to verify.`;
319
- ```
320
-
321
- ### ✅ Right — explicit trailing commentary requirement
322
-
323
- ```ts
324
- return `# Planning
325
-
326
- ${spec}
327
-
328
- Decompose the specification into tasks via TaskCreate. After creating all
329
- tasks, call TaskList to verify.
330
-
331
- ## Final output (required)
332
-
333
- After the TaskList call, write a short "Handoff Notes" section with:
334
- - Risks or ambiguities the orchestrator must know about
335
- - Any assumptions you made that could be wrong
336
- - Ordering constraints that don't fit into task bodies
337
-
338
- The orchestrator will run in a fresh session — anything not in your
339
- TaskCreate calls or this section will be lost.`;
340
- ```
341
-
342
- **Pair this fix with F1.** Even with the correct extraction helper, you need
343
- the model to actually produce text for the helper to extract.
344
-
345
- **Detection.** Log the extracted handoff text during development. An empty
346
- string + a correctly-fixed extraction helper = F6.
347
-
348
- ---
349
-
350
- ### F7. Continued sessions accumulate state across loop iterations (lost-in-middle)
351
-
352
- **Symptom.** A review/fix loop works on iterations 1-3 then starts
353
- producing worse output — misidentifying files, hallucinating line numbers,
354
- or "forgetting" a requirement that was clearly stated in the original spec.
355
-
356
- **Root cause.** Each loop iteration adds turns to the same continued
357
- session, and context grows past the attention window. The model starts
358
- dropping middle-of-context information (classic lost-in-middle).
359
-
360
- **Affected SDKs.** All three. Claude's session transcript accumulates every
361
- intermediate turn, so long loops grow the context window substantially.
362
-
363
- ### ❌ Wrong — unbounded loop on a single session
364
-
365
- ```ts
366
- await ctx.stage({ name: "review-loop" }, {}, {}, async (s) => {
367
- for (let i = 0; i < 20; i++) {
368
- await s.session.query(buildReviewPrompt());
369
- await s.session.query(buildFixPrompt());
370
- }
371
- });
372
- ```
373
-
374
- ### ✅ Right — compact or reset between iterations
375
-
376
- Options, in order of preference:
377
-
378
- 1. **Compact** — summarize prior turns via the SDK's compaction mechanism
379
- (Claude's `/compact`, OpenCode's summarizer, a sidecar summarization call
380
- for Copilot). Keeps decisions and file paths; drops verbose tool output.
381
- 2. **Offload to files** — write intermediate findings to files and reference
382
- them by path in the next iteration's prompt (`filesystem-context` skill).
383
- 3. **Fresh session per iteration with explicit handoff** — see F5's pattern;
384
- lose the in-session reasoning but gain a clean context window.
385
-
386
- ```ts
387
- await ctx.stage({ name: "review-loop" }, {}, {}, async (s) => {
388
- const MAX_TURNS_BEFORE_COMPACT = 10;
389
- let turnsSinceCompact = 0;
390
-
391
- for (let i = 0; i < MAX_ITERATIONS; i++) {
392
- if (turnsSinceCompact >= MAX_TURNS_BEFORE_COMPACT) {
393
- await s.session.query("/compact");
394
- turnsSinceCompact = 0;
395
- }
396
- await s.session.query(buildReviewPrompt());
397
- turnsSinceCompact += 1;
398
- }
399
- });
400
- ```
401
-
402
- **Consult.** `context-degradation`, `context-compression`, `context-optimization`.
403
-
404
- **Detection.** Quality-vs-iteration chart. If quality degrades past
405
- iteration N, N is your safe-turn budget before compaction.
406
-
407
- ---
408
-
409
- ### F8. Fenced-block parsers break when the model adds prose
410
-
411
- **Symptom.** `JSON.parse(content)` throws, or a "matches the first fenced
412
- block" regex picks up a code example inside prose instead of the actual
413
- structured output.
414
-
415
- **Root cause.** A prompt asks for `only JSON inside a single fenced block`
416
- and the model adds a sentence of explanation, a "# Summary" heading, or
417
- quotes a snippet of its own reasoning in a code fence earlier in the reply.
418
-
419
- **Affected SDKs.** All three — this is a model-behavior issue, not
420
- SDK-specific.
421
-
422
- ### ❌ Wrong
423
-
424
- ```ts
425
- const parsed = JSON.parse(content);
426
- // or:
427
- const match = content.match(/```json\n([\s\S]*?)\n```/);
428
- ```
429
-
430
- ### ✅ Right — layered fallback: direct parse → last fenced block → last balanced object
431
-
432
- ```ts
433
- export function parseReviewResult(content: string): ReviewResult | null {
434
- // 1. Direct JSON
435
- try {
436
- const parsed = JSON.parse(content);
437
- if (parsed?.findings && parsed?.overall_correctness) return parsed;
438
- } catch { /* fall through */ }
439
-
440
- // 2. LAST fenced code block (not the first — prose often quotes examples)
441
- const blockRe = /```(?:json)?\s*\n([\s\S]*?)\n```/g;
442
- let lastBlock: string | null = null;
443
- let m: RegExpExecArray | null;
444
- while ((m = blockRe.exec(content)) !== null) {
445
- if (m[1]) lastBlock = m[1];
446
- }
447
- if (lastBlock) {
448
- try {
449
- const parsed = JSON.parse(lastBlock);
450
- if (parsed?.findings && parsed?.overall_correctness) return parsed;
451
- } catch { /* fall through */ }
452
- }
453
-
454
- // 3. Last balanced object containing the required key
455
- // (implementation in src/sdk/workflows/builtin/ralph/helpers/prompts.ts)
456
- return null;
457
- }
458
- ```
459
-
460
- **Detection.** Fuzz test the parser against real model output captured
461
- over several runs. If 1 in 20 runs fails to parse, you have F8.
462
-
463
- ---
464
-
465
- ### F9. `s.save()` receives the wrong shape
466
-
467
- **Symptom.** `s.transcript("stage-name")` returns an empty or malformed
468
- `content` string in the next stage.
469
-
470
- **Root cause.** Each SDK has a different contract for what `s.save()`
471
- expects, and the runtime doesn't type-check the argument beyond "anything".
472
-
473
- **Affected SDKs.** All three — the mistake is in the workflow author's code.
474
-
475
- ### Correct shapes
476
-
477
- | SDK | Correct argument |
478
- |---|---|
479
- | Claude | `s.save(s.sessionId)` — pass the session ID; the runtime reads the transcript file |
480
- | Copilot | `s.save(await s.session.getMessages())` — pass `SessionEvent[]` |
481
- | OpenCode | `s.save(result.data!)` — pass the `{ info, parts }` object |
482
-
483
- ### ❌ Wrong
484
-
485
- ```ts
486
- // Claude — saves the wrong thing (result is SessionMessage[], not { output: string })
487
- s.save(result.output); // no error is thrown: result.output is simply undefined; use s.save(s.sessionId)
488
-
489
- // Copilot — calling getMessages() BEFORE send() returns an empty array
490
- const earlyMessages = await s.session.getMessages(); // [] — no turns yet
491
- s.save(earlyMessages);
492
-
493
- // Copilot — saving a single message instead of the full array
494
- s.save((await s.session.getMessages()).at(-1));
495
-
496
- // OpenCode — missing the data unwrap
497
- s.save(result);
498
- ```
499
-
500
- ### ✅ Right
501
-
502
- See the per-SDK examples in `SKILL.md` §"Write the Workflow File" and the
503
- `SessionContext` reference table.
504
-
505
- **Detection.** Read `s.transcript(name).content` in the next stage and
506
- log the length. A 0-length or JSON-that-isn't-prose signature = F9.
507
-
508
- ---
509
-
510
- ## Loud failures (throw, but still worth knowing)
511
-
512
- ### F10. Copilot: `sendAndWait` default 60s timeout throws
513
-
514
- **Symptom.** `Timeout after 60000ms waiting for session.idle`. No
515
- subsequent `ctx.stage()` call executes — the throw propagates out of
516
- `run()` and halts the workflow.
517
-
518
- **Root cause.** The raw Copilot SDK's `sendAndWait(options, timeout?)`
519
- defaults to a 60-second timeout that throws on expiry. Real agent work
520
- (planners, reviewers, orchestrators) routinely exceeds this.
521
-
522
- **Fix.** Use `send` instead. Inside an Atomic stage the runtime wraps
523
- `s.session.send()` so it blocks until `session.idle` with **no timeout** —
524
- the same blocking semantics as Claude's `query()` and OpenCode's
525
- `session.prompt()`. The wrapper lives in `wrapCopilotSend`
526
- (`src/sdk/runtime/executor.ts`) and is installed per-stage.
527
-
528
- ```ts
529
- // Correct: send() in an Atomic stage blocks until idle, no timeout.
530
- await s.session.send({ prompt });
531
- const messages = await s.session.getMessages(); // safe to read
532
- ```
533
-
534
- **Do not reach for `sendAndWait` with a larger explicit timeout.** `send`
535
- already waits for idle; `sendAndWait` just adds a throw-on-timeout failure
536
- mode on top. If you catch yourself writing `sendAndWait(..., 5 * 60 * 1000)`
537
- to "be safe", you want `send`.
538
-
539
- ---
540
-
541
- ### F11. Provider-level resume tries to swap agents
542
-
543
- **Symptom.** Resumed Copilot / OpenCode session behaves as the original
544
- agent instead of the requested new one — or the SDK throws "agent mismatch"
545
- on resume.
546
-
547
- **Root cause.** Each session is **bound to one agent at creation time**.
548
- `resumeSession` reattaches the conversation but does not change the agent.
549
-
550
- **Fix.** Use provider-level resume only for multi-turn work within the same
551
- role. To swap agents, create a new session (fresh) and forward context via
552
- F5's pattern. In normal workflow code, prefer a same-stage multi-turn session
553
- over trying to reopen a prior stage.
554
-
555
- ---
556
-
557
- ### F12. Parallel siblings read each other's transcripts
558
-
559
- **Symptom.** `s.transcript("sibling-name")` inside a parallel session
560
- throws or returns empty.
561
-
562
- **Root cause.** `s.transcript()` only exposes **prior completed sessions** —
563
- ones whose callback has returned and whose saves have flushed. Sessions
564
- launched concurrently via `Promise.all([ctx.stage(...), ctx.stage(...)])` run
565
- at the same time; forward-only data flow is enforced.
566
-
567
- **Fix.** Restructure as a linear chain or as a "fan-out, then merge"
568
- pattern where a subsequent session reads both; alternatively, use external
569
- shared state (files, DB) if siblings genuinely need to coordinate.
570
-
571
- ```ts
572
- // Fan-out → merge
573
- // Handles are used below (s.transcript(handle)); prefer them over stage-name strings when one is in scope.
574
- const describe = await ctx.stage({ name: "describe" }, {}, {}, async (s) => { /* ... */ });
575
-
576
- const [summarizeA, summarizeB] = await Promise.all([
577
- ctx.stage({ name: "summarize-a" }, {}, {}, async (s) => {
578
- const d = await s.transcript(describe); // OK — prior completed session (handle-based, preferred)
579
- // s.transcript("summarize-b") would fail here — sibling not yet complete
580
- }),
581
- ctx.stage({ name: "summarize-b" }, {}, {}, async (s) => {
582
- const d = await s.transcript(describe); // OK — prior completed session
583
- }),
584
- ]);
585
-
586
- await ctx.stage({ name: "merge" }, {}, {}, async (s) => {
587
- const a = await s.transcript(summarizeA); // OK — handle-based, preferred over "summarize-a"
588
- const b = await s.transcript(summarizeB);
589
- });
590
- ```
591
-
592
- ---
593
-
594
- ### F13. Forgetting to `await` `ctx.stage()`
595
-
596
- **Symptom.** A session runs (its tmux window opens, the agent does work)
597
- but the orchestrator doesn't wait for it. Subsequent sessions that depend
598
- on its output via `transcript()` or `getMessages()` see empty or missing
599
- data. The workflow may finish "successfully" before the session's callback
600
- has returned.
601
-
602
- **Root cause.** `ctx.stage()` returns a `Promise<SessionHandle<T>>`.
603
- Without `await`, the session is spawned but the `.run()` callback continues
604
- immediately, so the session's save may not have reached the `completedRegistry`
605
- by the time downstream code tries to read it.
606
-
607
- **Affected SDKs.** All three — this is a TypeScript control-flow bug, not
608
- SDK-specific.
609
-
610
- ### ❌ Wrong
611
-
612
- ```ts
613
- // Missing await — session fires but orchestrator doesn't wait
614
- ctx.stage({ name: "research" }, {}, {}, async (s) => {
615
- // ... agent work ...
616
- s.save(s.sessionId);
617
- });
618
-
619
- // This runs before "research" completes
620
- await ctx.stage({ name: "synthesize" }, {}, {}, async (s) => {
621
- const r = await s.transcript("research"); // empty or throws
622
- });
623
- ```
624
-
625
- ### ✅ Right
626
-
627
- ```ts
628
- await ctx.stage({ name: "research" }, {}, {}, async (s) => {
629
- // ... agent work ...
630
- s.save(s.sessionId);
631
- });
632
-
633
- await ctx.stage({ name: "synthesize" }, {}, {}, async (s) => {
634
- const r = await s.transcript("research"); // works
635
- });
636
- ```
637
-
638
- **Detection.** If a session's graph node shows as "running" while
639
- downstream sessions are already executing, you likely dropped an `await`.
640
- The `@typescript-eslint/no-floating-promises` lint rule catches
641
- this at lint time.
642
-
643
- ---
644
-
645
- ### F14. Using a pending `SessionHandle` before completion
646
-
647
- **Symptom.** `handle.result` is `undefined` or stale, or
648
- `s.transcript(handle)` throws / returns empty even though the session
649
- eventually completes.
650
-
651
- **Root cause.** `ctx.stage()` returns a `Promise<SessionHandle<T>>`; the
652
- handle's `.result` is only populated after the callback returns. If you store the
653
- promise but use it as a handle before awaiting it, the result field is
654
- not yet set and the session is not in the `completedRegistry`.
655
-
656
- **Affected SDKs.** All three.
657
-
658
- ### ❌ Wrong
659
-
660
- ```ts
661
- // Start both but access handles before awaiting
662
- const handleA = ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return 42; });
663
- const handleB = ctx.stage({ name: "b" }, {}, {}, async (s) => {
664
- // handleA is a Promise, not a resolved SessionHandle
665
- const transcript = await s.transcript(handleA); // fails
666
- });
667
- ```
668
-
669
- ### ✅ Right
670
-
671
- ```ts
672
- // Await first, then use the resolved handle
673
- const handleA = await ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return 42; });
674
-
675
- await ctx.stage({ name: "b" }, {}, {}, async (s) => {
676
- const transcript = await s.transcript(handleA); // works — handleA is resolved
677
- console.log(handleA.result); // 42
678
- });
679
- ```
680
-
681
- For parallel sessions, use `Promise.all()` and access handles only after
682
- all promises resolve:
683
-
684
- ```ts
685
- const [a, b] = await Promise.all([
686
- ctx.stage({ name: "a" }, {}, {}, async (s) => { /* ... */ return "x"; }),
687
- ctx.stage({ name: "b" }, {}, {}, async (s) => { /* ... */ return "y"; }),
688
- ]);
689
- // a.result === "x", b.result === "y"
690
- ```
691
-
692
- **Detection.** TypeScript's type system helps — `ctx.stage()` returns
693
- `Promise<SessionHandle<T>>`, not `SessionHandle<T>` directly. Accessing
694
- `.result` without awaiting is a type error: `Promise` has no `result` property.
695
-
696
- ---
697
-
698
- ### F15. Headless stage errors are invisible in the graph
699
-
700
- **Symptom.** A workflow fails but the graph shows all visible stages as
701
- completed. The error message references a session name that doesn't appear
702
- in the graph panel.
703
-
704
- **Root cause.** Headless stages (`{ headless: true }`) are invisible in the
705
- workflow graph — they have no graph node, no tmux window, and no pane
706
- preview. When a headless stage throws, the error is recorded in the
707
- `failedRegistry` and the workflow halts, but the failure is only visible in
708
- the orchestrator's error output and the session's `error.txt` file on disk.
709
-
710
- **Affected SDKs.** All three — this is an executor-level behavior, not
711
- SDK-specific.
712
-
713
- ### ❌ Wrong — no error context for headless stages
714
-
715
- ```ts
716
- // Headless stage fails silently in the graph
717
- const [a, b, c] = await Promise.all([
718
- ctx.stage({ name: "gather-a", headless: true }, {}, {}, async (s) => {
719
- throw new Error("API key expired"); // Fails — no graph node to show red
720
- }),
721
- ctx.stage({ name: "gather-b", headless: true }, {}, {}, async (s) => { /* ... */ }),
722
- ctx.stage({ name: "gather-c", headless: true }, {}, {}, async (s) => { /* ... */ }),
723
- ]);
724
- ```
725
-
726
- ### ✅ Right — wrap headless stages with descriptive error context
727
-
728
- ```ts
729
- const [a, b, c] = await Promise.all([
730
- ctx.stage({ name: "gather-a", headless: true }, {}, {}, async (s) => {
731
- try {
732
- return await doWork(s);
733
- } catch (error) {
734
- throw new Error(`[gather-a] ${error instanceof Error ? error.message : String(error)}`);
735
- }
736
- }),
737
- // ... same pattern for b, c
738
- ]);
739
- ```
740
-
741
- **Detection.** If a workflow fails and the graph shows no failed nodes,
742
- check the orchestrator log (`orchestrator.log` in the session directory)
743
- and look for `headless-<name>` in the error output. The `error.txt` file
744
- at `~/.atomic/sessions/<run-id>/<name>-<id>/error.txt` contains the
745
- full error for each failed headless stage.
746
-
747
- ---
748
-
749
- ### F16. Claude: importing the SDK `query()` inside a non-headless stage
750
-
751
- **Symptom.** A reviewer / extractor / structured-output stage shows up in
752
- the workflow graph as a tmux pane, but the pane sits idle on the Claude
753
- welcome screen for the entire stage duration. The stage still produces a
754
- result — but the visible session never moved. CPU and token cost double:
755
- two Claude processes ran, one in the pane (idle) and one in-process (the
756
- SDK call that actually did the work).
757
-
758
- **Root cause.** The stage was registered without `headless: true`, so the
759
- runtime spawned an interactive Claude TUI in a tmux pane and bound
760
- `s.session` to it. The callback ignored that and called
761
- `query()` from `@anthropic-ai/claude-agent-sdk` directly:
762
-
763
- ```ts
764
- import { query } from "@anthropic-ai/claude-agent-sdk";
765
- // ...
766
- ctx.stage({ name: "review" }, {}, {}, async (s) => {
767
- for await (const msg of query({ prompt, options: { outputFormat: ... } })) { /* ... */ }
768
- });
769
- ```
770
-
771
- That import bypasses `s.session` entirely. The runtime cannot route the
772
- SDK call through the TUI it just started, so:
773
-
774
- 1. The visible pane never receives a prompt — the user sees a blank Claude
775
- session in the graph.
776
- 2. A second Claude process spins up in the orchestrator process to service
777
- the SDK call. Both processes count against rate limits and token spend.
778
- 3. Idle detection on the pane never fires because no prompt was ever sent;
779
- the runtime relies on session-state events that won't arrive, and stage
780
- completion happens only because the callback returned (not because the
781
- pane finished work).
782
-
783
- The runtime exposes exactly two routes for an SDK feature:
784
-
785
- | You want to use… | Stage shape | Code in callback |
786
- |---|---|---|
787
- | `outputFormat`, custom `agents`, `maxBudgetUsd`, etc. **without** a visible pane | `{ headless: true }` | `s.session.query(prompt, sdkOptions)` — wraps `HeadlessClaudeSessionWrapper.query()` which forwards `options` to the SDK |
788
- | The visible TUI with a subagent | omit `headless` and pass `chatFlags: ["--agent", "<name>", ...]` | `s.session.query(prompt)` — sends through tmux send-keys |
789
-
790
- The one option that does **not** exist is "visible pane + in-process SDK call".
791
- That combination is always wrong — pick one route or the other.
792
-
793
- **Affected SDKs.** Claude only. Copilot and OpenCode don't expose a
794
- parallel "import the bare SDK" foot-gun in this codebase.
795
-
796
- ### ❌ Wrong — visible pane + bypassed-SDK call
797
-
798
- ```ts
799
- import { query as claudeSdkQuery } from "@anthropic-ai/claude-agent-sdk";
800
-
801
- await ctx.stage({ name: "review" }, {}, {}, async (s) => {
802
- // Visible TUI was started, but we're ignoring it.
803
- for await (const msg of claudeSdkQuery({
804
- prompt: reviewPrompt,
805
- options: {
806
- outputFormat: { type: "json_schema", schema: REVIEW_SCHEMA },
807
- },
808
- })) {
809
- if (msg.type === "result") { /* ... */ }
810
- }
811
- s.save(s.sessionId);
812
- });
813
- ```
814
-
815
- ### ✅ Right (a) — visible TUI with subagent + chatFlags
816
-
817
- When you want the user to watch the review happen, run the subagent in
818
- the pane via `--agent` and parse JSON out of the assistant text. The
819
- prompt should enumerate the schema fields so the model emits matching
820
- JSON; a tolerant parser (last-fenced-block + last-balanced-object
821
- fallback, F8) handles any prose the model adds:
822
-
823
- ```ts
824
- await ctx.stage(
825
- { name: "review" },
826
- { chatFlags: ["--agent", "reviewer", "--allow-dangerously-skip-permissions", "--dangerously-skip-permissions"] },
827
- {},
828
- async (s) => {
829
- const messages = await s.session.query(reviewPrompt);
830
- s.save(s.sessionId);
831
- return parseReviewResult(extractAssistantText(messages, 0));
832
- },
833
- );
834
- ```
835
-
836
- This is the pattern used by `src/sdk/workflows/builtin/ralph/claude/index.ts`
837
- for its planner, orchestrator, reviewer, and debugger stages.
838
-
839
- ### ✅ Right (b) — headless stage with SDK options via `s.session.query()`
840
-
841
- When you don't need the pane (e.g. background data gathering), set
842
- `headless: true` and pass SDK options as the second argument to
843
- `s.session.query()`. The runtime uses `HeadlessClaudeSessionWrapper`,
844
- which calls the SDK's `query()` in-process and exposes the full options
845
- surface (`agent`, `outputFormat`, `permissionMode`, `maxBudgetUsd`, etc.):
846
-
847
- ```ts
848
- await ctx.stage(
849
- { name: "review", headless: true },
850
- {}, {},
851
- async (s) => {
852
- const messages = await s.session.query(reviewPrompt, {
853
- agent: "reviewer",
854
- permissionMode: "bypassPermissions",
855
- allowDangerouslySkipPermissions: true,
856
- });
857
- s.save(s.sessionId);
858
- return extractAssistantText(messages, 0);
859
- },
860
- );
861
- ```
862
-
863
- > **Note on `--json-schema`.** The CLI's `--json-schema` flag requires
864
- > `-p` (print mode) and therefore can't be passed via `chatFlags` to the
865
- > interactive TUI. If you need SDK-validated structured output, use route
866
- > (b) — set `headless: true` and pass `outputFormat: { type: "json_schema", schema }`
867
- > in the `s.session.query()` options. Pair (a)'s visible TUI with a
868
- > tolerant JSON parser instead. (Note: `s.session.query()`'s headless
869
- > wrapper currently returns `SessionMessage[]` and discards the SDK
870
- > result event's `structured_output` field — for now, parse JSON out of
871
- > the assistant text either way.)
872
-
873
- **Detection.**
874
- 1. Grep your workflow for `from "@anthropic-ai/claude-agent-sdk"` —
875
- `query`, `tool`, `createSdkMcpServer` and similar imports inside a
876
- `.run()` callback are the smell. Workflow code should import from
877
- `@bastani/atomic/workflows` and access the SDK exclusively through
878
- `s.client` and `s.session`.
879
- 2. Watch the workflow run. If a visible pane shows the Claude welcome
880
- screen for the entire duration of a stage and never receives a prompt,
881
- you have F16.
882
- 3. Cost monitoring. F16 roughly doubles the Claude process count — if
883
- stage spend looks 2× a single run, audit imports.
884
-
885
- ---
886
-
887
- ## Design checklist
888
-
889
- Before shipping a multi-session workflow, walk the list:
890
-
891
- - [ ] Copilot stages use `s.session.send` by default; `sendAndWait` only with an explicit user-requested timeout (F10)
892
- - [ ] Every fresh-session handoff forwards context explicitly (F5)
893
- - [ ] Every prompt whose output feeds a downstream stage explicitly requests trailing commentary (F6)
894
- - [ ] Response-text extraction uses the per-SDK correct pattern (F1-F4)
895
- - [ ] Structured-output parsers extract the LAST fenced block, not the first (F8)
896
- - [ ] `s.save()` receives the per-SDK correct shape — Copilot uses `s.session.getMessages()` (F9)
897
- - [ ] Loops over 10 iterations have a compaction / reset strategy (F7)
898
- - [ ] Parallel groups only read from prior completed sessions, never siblings (F12)
899
- - [ ] Every `ctx.stage()` call is `await`ed (F13)
900
- - [ ] `SessionHandle` values are only used after the promise resolves (F14)
901
- - [ ] If provider-level resume/fork is used at all, it stays within the same agent role (F11)
902
- - [ ] Headless stage callbacks include descriptive error context so failures can be diagnosed without a graph node (F15)
903
- - [ ] Claude stages never import `query` (or other entry points) from `@anthropic-ai/claude-agent-sdk` directly — go through `s.session.query()` so the runtime routes to the TUI (interactive) or the SDK (headless) consistently (F16)