@bastani/atomic 0.9.0-alpha.2 → 0.9.0-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/builtin/cursor/package.json +2 -2
  3. package/dist/builtin/intercom/package.json +1 -1
  4. package/dist/builtin/mcp/package.json +1 -1
  5. package/dist/builtin/subagents/package.json +1 -1
  6. package/dist/builtin/web-access/package.json +1 -1
  7. package/dist/builtin/workflows/CHANGELOG.md +24 -0
  8. package/dist/builtin/workflows/README.md +12 -12
  9. package/dist/builtin/workflows/builtin/goal-ledger.ts +2 -0
  10. package/dist/builtin/workflows/builtin/goal-prompts.ts +8 -0
  11. package/dist/builtin/workflows/builtin/goal-reports.ts +5 -0
  12. package/dist/builtin/workflows/builtin/goal-runner.ts +103 -4
  13. package/dist/builtin/workflows/builtin/goal-types.ts +4 -0
  14. package/dist/builtin/workflows/builtin/goal.d.ts +4 -0
  15. package/dist/builtin/workflows/builtin/goal.ts +14 -2
  16. package/dist/builtin/workflows/builtin/index.d.ts +8 -8
  17. package/dist/builtin/workflows/builtin/open-claude-design-feedback.ts +359 -0
  18. package/dist/builtin/workflows/builtin/open-claude-design-phases.ts +254 -352
  19. package/dist/builtin/workflows/builtin/open-claude-design-runner.ts +256 -414
  20. package/dist/builtin/workflows/builtin/open-claude-design-setup.ts +272 -0
  21. package/dist/builtin/workflows/builtin/open-claude-design-utils.ts +58 -68
  22. package/dist/builtin/workflows/builtin/open-claude-design.d.ts +5 -9
  23. package/dist/builtin/workflows/builtin/open-claude-design.ts +14 -26
  24. package/dist/builtin/workflows/builtin/prompt-refinement.ts +102 -0
  25. package/dist/builtin/workflows/builtin/ralph-core.ts +6 -4
  26. package/dist/builtin/workflows/builtin/ralph-runner.ts +22 -24
  27. package/dist/builtin/workflows/builtin/ralph.d.ts +2 -0
  28. package/dist/builtin/workflows/builtin/ralph.ts +3 -1
  29. package/dist/builtin/workflows/package.json +1 -1
  30. package/dist/builtin/workflows/skills/impeccable/SKILL.md +14 -23
  31. package/dist/builtin/workflows/skills/impeccable/reference/brand.md +2 -2
  32. package/dist/builtin/workflows/skills/impeccable/reference/live.md +25 -4
  33. package/dist/builtin/workflows/skills/impeccable/scripts/context-signals.mjs +1 -1
  34. package/dist/builtin/workflows/skills/impeccable/scripts/context.mjs +724 -29
  35. package/dist/builtin/workflows/skills/impeccable/scripts/critique-storage.mjs +1 -1
  36. package/dist/builtin/workflows/skills/impeccable/scripts/detector/browser/injected/index.mjs +219 -7
  37. package/dist/builtin/workflows/skills/impeccable/scripts/detector/cli/main.mjs +57 -11
  38. package/dist/builtin/workflows/skills/impeccable/scripts/detector/design-system.mjs +750 -0
  39. package/dist/builtin/workflows/skills/impeccable/scripts/detector/detect-antipatterns-browser.js +648 -53
  40. package/dist/builtin/workflows/skills/impeccable/scripts/detector/detect-antipatterns.mjs +7 -0
  41. package/dist/builtin/workflows/skills/impeccable/scripts/detector/engines/browser/detect-url.mjs +29 -4
  42. package/dist/builtin/workflows/skills/impeccable/scripts/detector/engines/regex/detect-text.mjs +44 -11
  43. package/dist/builtin/workflows/skills/impeccable/scripts/detector/engines/static-html/css-cascade.mjs +29 -0
  44. package/dist/builtin/workflows/skills/impeccable/scripts/detector/engines/static-html/detect-html.mjs +27 -1
  45. package/dist/builtin/workflows/skills/impeccable/scripts/detector/node/file-system.mjs +1 -1
  46. package/dist/builtin/workflows/skills/impeccable/scripts/detector/registry/antipatterns.mjs +29 -0
  47. package/dist/builtin/workflows/skills/impeccable/scripts/detector/rules/checks.mjs +401 -46
  48. package/dist/builtin/workflows/skills/impeccable/scripts/detector/shared/inline-ignores.mjs +148 -0
  49. package/dist/builtin/workflows/skills/impeccable/scripts/detector/shared/page.mjs +6 -6
  50. package/dist/builtin/workflows/skills/impeccable/scripts/{design-parser.mjs → lib/design-parser.mjs} +8 -1
  51. package/dist/builtin/workflows/skills/impeccable/scripts/lib/impeccable-config.mjs +638 -0
  52. package/dist/builtin/workflows/skills/impeccable/scripts/lib/impeccable-paths.mjs +128 -0
  53. package/dist/builtin/workflows/skills/impeccable/scripts/{is-generated.mjs → lib/is-generated.mjs} +2 -2
  54. package/dist/builtin/workflows/skills/impeccable/scripts/lib/target-args.mjs +42 -0
  55. package/dist/builtin/workflows/skills/impeccable/scripts/live/browser-script-parts.mjs +49 -0
  56. package/dist/builtin/workflows/skills/impeccable/scripts/{live-completion.mjs → live/completion.mjs} +1 -0
  57. package/dist/builtin/workflows/skills/impeccable/scripts/{live-event-validation.mjs → live/event-validation.mjs} +6 -5
  58. package/dist/builtin/workflows/skills/impeccable/scripts/live/manual-apply.mjs +939 -0
  59. package/dist/builtin/workflows/skills/impeccable/scripts/live/manual-edit-routes.mjs +357 -0
  60. package/dist/builtin/workflows/skills/impeccable/scripts/{live-manual-edits-buffer.mjs → live/manual-edits-buffer.mjs} +1 -1
  61. package/dist/builtin/workflows/skills/impeccable/scripts/{live-session-store.mjs → live/session-store.mjs} +21 -3
  62. package/dist/builtin/workflows/skills/impeccable/scripts/live/svelte-component.mjs +835 -0
  63. package/dist/builtin/workflows/skills/impeccable/scripts/live/sveltekit-adapter.mjs +274 -0
  64. package/dist/builtin/workflows/skills/impeccable/scripts/live/ui-core.mjs +180 -0
  65. package/dist/builtin/workflows/skills/impeccable/scripts/live/vocabulary.mjs +36 -0
  66. package/dist/builtin/workflows/skills/impeccable/scripts/live-accept.mjs +185 -60
  67. package/dist/builtin/workflows/skills/impeccable/scripts/live-browser-dom.js +146 -0
  68. package/dist/builtin/workflows/skills/impeccable/scripts/live-browser.js +3369 -1026
  69. package/dist/builtin/workflows/skills/impeccable/scripts/live-commit-manual-edits.mjs +2 -2
  70. package/dist/builtin/workflows/skills/impeccable/scripts/live-complete.mjs +2 -2
  71. package/dist/builtin/workflows/skills/impeccable/scripts/live-discard-manual-edits.mjs +1 -1
  72. package/dist/builtin/workflows/skills/impeccable/scripts/live-inject.mjs +133 -9
  73. package/dist/builtin/workflows/skills/impeccable/scripts/live-insert.mjs +42 -2
  74. package/dist/builtin/workflows/skills/impeccable/scripts/live-manual-edit-evidence.mjs +4 -4
  75. package/dist/builtin/workflows/skills/impeccable/scripts/live-poll.mjs +21 -15
  76. package/dist/builtin/workflows/skills/impeccable/scripts/live-resume.mjs +1 -1
  77. package/dist/builtin/workflows/skills/impeccable/scripts/live-server.mjs +205 -1269
  78. package/dist/builtin/workflows/skills/impeccable/scripts/live-status.mjs +2 -2
  79. package/dist/builtin/workflows/skills/impeccable/scripts/live-target.mjs +30 -0
  80. package/dist/builtin/workflows/skills/impeccable/scripts/live-wrap.mjs +69 -26
  81. package/dist/builtin/workflows/skills/impeccable/scripts/live.mjs +73 -22
  82. package/dist/builtin/workflows/src/extension/workflow-prompts.ts +3 -1
  83. package/dist/core/atomic-guide-command.d.ts.map +1 -1
  84. package/dist/core/atomic-guide-command.js +5 -5
  85. package/dist/core/atomic-guide-command.js.map +1 -1
  86. package/dist/core/system-prompt.d.ts.map +1 -1
  87. package/dist/core/system-prompt.js +0 -1
  88. package/dist/core/system-prompt.js.map +1 -1
  89. package/docs/index.md +2 -2
  90. package/docs/quickstart.md +9 -9
  91. package/docs/workflows.md +816 -47
  92. package/package.json +2 -2
  93. package/dist/builtin/workflows/skills/impeccable/scripts/cleanup-deprecated.mjs +0 -284
  94. package/dist/builtin/workflows/skills/impeccable/scripts/impeccable-paths.mjs +0 -126
  95. package/dist/builtin/workflows/skills/impeccable/scripts/{live-insert-ui.mjs → live/insert-ui.mjs} +0 -0
package/docs/workflows.md CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  # Workflows
4
4
 
5
- Workflows let Atomic run reusable multi-stage automation with tracked stages, parallel branches, artifacts, human input, live status, and resumable background execution.
5
+ Workflows are how Atomic runs executable engineering loops: reusable multi-stage automation with tracked stages, parallel branches, artifacts, human input, live status, checkpoints, and resumable background execution.
6
6
 
7
- Use a workflow when a task should be repeatable, inspectable, resumable, or split across multiple model sessions. For one-off work, the `workflow` tool can also run a tracked single task, parallel fan-out, or chain without creating a saved workflow file.
7
+ Use a workflow when a task should be repeatable, inspectable, resumable, or split across multiple model sessions. Markdown prompts can describe a loop; Atomic workflows run the loop with scoped context, tools, artifacts, verification, subagents, review gates, and human approvals. For one-off work, the `workflow` tool can also run a tracked single task, parallel fan-out, or chain without creating a saved workflow file.
8
8
 
9
9
  **Key capabilities:**
10
10
  - **Tracked stages** - Name each step and inspect it in workflow status and graph views
@@ -13,6 +13,7 @@ Use a workflow when a task should be repeatable, inspectable, resumable, or spli
13
13
  - **Human input** - Pause for `ctx.ui.input`, `confirm`, `select`, `editor`, or custom TUI widget decisions during a run
14
14
  - **Resumable control** - Interrupt, pause, resume, attach to, or kill workflow runs
15
15
  - **Artifacts** - Save large outputs to files instead of pushing everything through model context
16
+ - **Verification and gates** - Preserve evidence, run checks, and stop for human approval where reliability matters
16
17
  - **Model fallback chains** - Retry important stages on fallback models when providers fail
17
18
  - **Package distribution** - Ship workflows through Atomic packages, settings, or conventional directories
18
19
 
@@ -43,12 +44,14 @@ Use a workflow when a task should be repeatable, inspectable, resumable, or spli
43
44
  - [Direct One-Off Runs](#direct-one-off-runs)
44
45
  - [Fast Inference for Workflow Stages](#fast-inference-for-workflow-stages)
45
46
  - [Writing a Workflow](#writing-a-workflow)
47
+ - [Migrating from the `defineWorkflow()` Builder API](#migrating-from-the-defineworkflow-builder-api)
46
48
  - [Workflow Primitives](#workflow-primitives)
47
49
  - [Task and Stage Options](#task-and-stage-options)
48
50
  - [Programmatic Usage](#programmatic-usage)
49
51
  - [Context Engineering](#context-engineering)
50
52
  - [Design Checklist](#design-checklist)
51
53
  - [Common Mistakes](#common-mistakes)
54
+ - [Workflow Best Practices](#workflow-best-practices)
52
55
 
53
56
  ## Quick Start
54
57
 
@@ -155,9 +158,9 @@ For the builtin result tables below, `deep-research-codebase`, `goal`, and `ralp
155
158
  | Workflow | What it does | When to use |
156
159
  |---|---|---|
157
160
  | `deep-research-codebase` | Scout + research-history chain → parallel specialist waves → aggregator. Indexes the whole repo and synthesizes findings. | Broad or cross-cutting research before you decide what to change. Prefer `/skill:research-codebase` for one subsystem. |
158
- | `goal` | Persisted goal ledger → bounded worker turns → receipts → three-reviewer gate → deterministic reducer → final report. | Small-to-medium scope changes when you can identify the work surface, state the exact outcome, and name the validation that proves it is done — for example tests, lint/typecheck, docs builds, or observable behavior. |
159
- | `ralph` | Prompt-engineering → codebase/online research → sub-agent orchestration → multi-model parallel review → optional final-stage PR handoff. | Larger migrations, broad refactors, and multi-package changes where you want Atomic to transform the prompt into a research question, research the codebase before implementing, delegate through sub-agents, review, iterate, and optionally allow only the final `pull-request` stage to attempt PR creation with `create_pr=true`. |
160
- | `open-claude-design` | Design-system onboarding → reference import → HTML generation → impeccable-driven refinement → quality gate → rich HTML handoff. Renders a live `preview.html` you can iterate against (opens through `browser` when available). | UI, page, component, theme, or design-token work that benefits from generation + critique loops. |
161
+ | `goal` | Persisted goal ledger → bounded worker turns → receipts → three-reviewer gate → deterministic reducer → final report → optional final-stage PR handoff after approval. | Small-to-medium scope changes when you can identify the work surface, state the exact outcome, name the validation that proves it is done, and optionally allow only the final `pull-request` stage to attempt PR creation with `create_pr=true` after Goal reaches `complete`. |
162
+ | `ralph` | Prompt-refinement → research-prompt-refinement → codebase/online research → sub-agent orchestration → multi-model parallel review → optional final-stage PR handoff. | Larger migrations, broad refactors, and multi-package changes where you want Atomic to refine the prompt for clarity, transform it into a research question, research the codebase before implementing, delegate through sub-agents, review, iterate, and optionally allow only the final `pull-request` stage to attempt PR creation with `create_pr=true`. |
163
+ | `open-claude-design` | Combined discovery/init (`/skill:impeccable shape` + `/skill:impeccable init` in one `discovery` stage) → design-system/reference research (`ds-*`) → curated gallery reference-discovery using that context → a forked `generate-*` / `user-feedback-*` loop → rich HTML handoff (`exporter` → `final-display`). The discovery stage asks what to build, the output type, and which references to emulate, then lets impeccable init detect/create/reconcile `PRODUCT.md` and `DESIGN.md` (references take precedence over project context). Renders a live `preview.html` you can iterate against in the browser (opens through impeccable `live` / the `playwright-cli` skill when available). | UI, page, component, theme, or design-token work that benefits from a guided brief, beautiful references, and generation + user feedback loops. |
161
164
 
162
165
  ### `deep-research-codebase`
163
166
 
@@ -211,7 +214,8 @@ Inputs:
211
214
  |---|---|---|---|---|
212
215
  | `objective` | text | yes | — | Goal-runner objective. Include the desired end state, expected outcome, testing/validation instructions, and any explicit done criteria. |
213
216
  | `max_turns` | number | no | `10` | Maximum worker/review turns before human follow-up is needed. |
214
- | `base_branch` | string | no | `origin/main` | Branch reviewers compare the current code delta against. |
217
+ | `base_branch` | string | no | `origin/main` | Branch reviewers and the optional final stage compare the current code delta against. |
218
+ | `create_pr` | boolean | no | `false` | Safe-by-default PR creation flag. Omitted or `false` skips the final `pull-request` stage and omits `pr_report`; prompt text alone does not opt in, and only strict `true` authorizes the final `pull-request` stage to attempt provider-appropriate PR/MR/review creation after Goal reaches `complete`. |
215
219
 
216
220
  `goal` defaults to 10 worker/review turns. Reviewer quorum is fixed internally at 2 reviewer `complete` votes. The repeated-blocker threshold defaults to 3 consecutive same-blocker turns and is clamped to `max_turns` when you run fewer than 3 turns.
217
221
 
@@ -221,9 +225,10 @@ Run examples:
221
225
  /workflow goal objective="Implement specs/2026-03-rate-limit.md, add the requested regression tests, run bun test packages/api/rate-limit.test.ts, and finish only when burst traffic returns 429 with Retry-After"
222
226
  /workflow goal objective="Update the CLI docs to describe the new --json flag, include one usage example, and verify the docs build still passes" max_turns=3
223
227
  /workflow goal objective="Fix the settings form validation bug; add/adjust the focused test and consider it done when invalid emails show the inline error without submitting"
228
+ /workflow goal objective="Implement the focused docs fix, run the docs validation command, and open a PR when complete" create_pr=true
224
229
  ```
225
230
 
226
- `goal` creates an OS-temp `goal-ledger.json` artifact, renders goal-continuation context for each worker turn, writes each worker receipt to `work-turn-N.md`, and appends receipts, reviewer decisions, blockers, reducer decisions, and lifecycle events to the ledger. The objective is treated as user-provided data, not higher-priority instructions.
231
+ `goal` starts with a single `prompt-refinement` stage that invokes the `prompt-engineer` skill (`/skill:prompt-engineer`) to sharpen the raw objective into a clearer, more actionable form using the Workflow Best Practices prompt anatomy documented later in this guide; the refined objective becomes the operative one recorded in the ledger (the original is preserved as `original_objective` and shown in the final report when it differs). `goal` then creates an OS-temp `goal-ledger.json` artifact, renders goal-continuation context for each worker turn, writes each worker receipt to `work-turn-N.md`, and appends receipts, reviewer decisions, blockers, reducer decisions, and lifecycle events to the ledger. The objective is treated as user-provided data, not higher-priority instructions. By default `goal` does not start the final `pull-request` stage, and `pr_report` is omitted. Prompt text alone does not opt in. Pass `create_pr=true` only when you explicitly want the final stage to inspect provider credentials and attempt provider-appropriate PR/MR/review creation, such as GitHub `gh`, Azure Repos `az repos pr create`, or Sapling/Phabricator tooling, after Goal reaches `complete` within `max_turns`. Goal worker and reviewer prompts explicitly tell intermediate stages to ignore PR-creation requests; only the final `pull-request` stage may attempt that handoff.
227
232
 
228
233
  Write the `objective` like a compact acceptance spec. Say what should exist when the run is done, how you want testing handled, which command(s) or manual checks matter, and what outcome proves completion. The workflow is intentionally lean: it does not first generate an RFC or migration plan, so the developer-supplied objective is where scope, validation, and completion criteria belong.
229
234
 
@@ -237,13 +242,16 @@ Result fields:
237
242
  | `status` | Final reducer status: `complete`, `blocked`, or `needs_human` (or `active` only if externally interrupted). |
238
243
  | `approved` | Whether the reducer reached `complete`. |
239
244
  | `goal_id` | Per-run goal identifier stored in the ledger. |
240
- | `objective` | Normalized goal objective used by the run. |
245
+ | `objective` | Normalized goal objective used by the run (after the `prompt-refinement` stage refines the raw objective). |
246
+ | `original_objective` | The raw user-provided objective exactly as given, before `prompt-refinement`. Omitted when refinement left it unchanged. |
241
247
  | `ledger_path` | OS-temp path to `goal-ledger.json`, including receipts, reviewer decisions, reducer decisions, blockers, and lifecycle events. |
242
248
  | `turns_completed` | Worker/review turns completed. |
243
249
  | `iterations_completed` | Same value as `turns_completed`, retained for status summaries. |
244
250
  | `receipts` | Ledger receipt summaries and worker artifact paths. |
245
251
  | `remaining_work` | Remaining gaps/blockers when incomplete, or `none`. |
246
252
  | `review_report` | Markdown report containing the last structured reviewer decision payloads used by the reducer. |
253
+ | `review_report_path` | JSON artifact path for the latest Goal review round. |
254
+ | `pr_report` | Pull-request report emitted only when `create_pr=true`, Goal reaches `complete`, and the final `pull-request` stage runs. |
247
255
 
248
256
  ### `ralph`
249
257
 
@@ -265,7 +273,7 @@ Run examples:
265
273
  /workflow ralph prompt="Safely implement the API refactor" git_worktree_dir=../atomic-ralph-api-wt base_branch=main
266
274
  ```
267
275
 
268
- Each `ralph` iteration starts by prompt-engineering the user prompt with `/skill:prompt-engineer Transform the following user prompt to a codebase and online research question which can be thoroughly explored: ...`, then researches that transformed question with `/skill:research-codebase ...` and writes the findings under `research/`. The orchestrator treats that research artifact as its primary implementation context, initializes/updates an OS-temp implementation notes file while generating verifiable evidence for any claims it records in the notes and reviewer artifacts, delegates implementation through sub-agents, and asks three independent reviewers to inspect the patch directly against `base_branch`. The reviewer fan-out runs each reviewer on a different primary model family (with shared fallbacks) so the adversarial review gets cross-model coverage instead of three passes from one model. Ralph's orchestrator and reviewers are prompted to verify user-visible behavior end-to-end when practical, using `playwright-cli`-skilled subagents for web/frontend flows that may depend on backend/API behavior and tmux-skilled subagents for TUI or terminal-app scenarios. For UI-applicable or full-stack changes, the orchestrator runs a `playwright-cli` end-to-end QA pass and records a reviewable proof video (referenced in the implementation notes and surfaced as `qa_video_path`); when `create_pr=true`, the final `pull-request` stage attaches or links that video to the created PR/MR/review. If reviewers find issues, the next prompt-engineering and research stages receive the review artifact path so follow-up research can address unresolved findings, and research stages fork from prior research session data when available. The loop stops only when all three reviewers independently approve (each finds no issues) or `max_loops` is reached, so a P0–P3 finding from any single reviewer keeps Ralph iterating instead of being out-voted by a majority quorum. 
By default Ralph does not start the final `pull-request` stage, and `pr_report` is omitted. Prompt text alone does not opt in. Pass `create_pr=true` only when you explicitly want the final `pull-request` stage to inspect provider credentials and attempt provider-appropriate PR/MR/review creation, such as GitHub `gh`, Azure Repos `az repos pr create`, or Sapling/Phabricator tooling; Ralph's own PR-creation instructions live in that final stage.
276
+ Each `ralph` run starts with a single `prompt-refinement` stage that invokes the `prompt-engineer` skill (`/skill:prompt-engineer`) to sharpen the raw user prompt into a clearer, more actionable objective using the Workflow Best Practices prompt anatomy documented later in this guide; that refined prompt becomes the operative objective for research, orchestration, and review, while the original is surfaced as `original_prompt`. Each iteration then transforms the refined prompt with `/skill:prompt-engineer Transform the following refined user request into a codebase and online research question which can be thoroughly explored: ...` (`research-prompt-refinement`), researches that transformed question with `/skill:research-codebase ...`, and writes the findings under `research/`. The orchestrator treats that research artifact as its primary implementation context, initializes/updates an OS-temp implementation notes file while generating verifiable evidence for any claims it records in the notes and reviewer artifacts, delegates implementation through sub-agents, and asks three independent reviewers to inspect the patch directly against `base_branch`. The reviewer fan-out runs each reviewer on a different primary model family (with shared fallbacks) so the adversarial review gets cross-model coverage instead of three passes from one model. Ralph's orchestrator and reviewers are prompted to verify user-visible behavior end-to-end when practical, using `playwright-cli`-skilled subagents for web/frontend flows that may depend on backend/API behavior and tmux-skilled subagents for TUI or terminal-app scenarios. For UI-applicable or full-stack changes, the orchestrator runs a `playwright-cli` end-to-end QA pass and records a reviewable proof video (referenced in the implementation notes and surfaced as `qa_video_path`); when `create_pr=true`, the final `pull-request` stage attaches or links that video to the created PR/MR/review. 
If reviewers find issues, the next `research-prompt-refinement` and research stages receive the review artifact path so follow-up research can address unresolved findings, and research stages fork from prior research session data when available. The loop stops only when all three reviewers independently approve (each finds no issues) or `max_loops` is reached, so a P0–P3 finding from any single reviewer keeps Ralph iterating instead of being out-voted by a majority quorum. By default Ralph does not start the final `pull-request` stage, and `pr_report` is omitted. Prompt text alone does not opt in. Pass `create_pr=true` only when you explicitly want the final `pull-request` stage to inspect provider credentials and attempt provider-appropriate PR/MR/review creation, such as GitHub `gh`, Azure Repos `az repos pr create`, or Sapling/Phabricator tooling; Ralph's own PR-creation instructions live in that final stage.
269
277
 
270
278
  Set `git_worktree_dir` when you want Ralph's worker stages isolated in a reusable Git worktree. Relative paths resolve from the invoking repository root, existing same-repository worktree roots are reused, and missing paths are created from `base_branch`. Ralph preserves the invoking repo-relative cwd inside the worktree, so launching from `repo/packages/api` with `git_worktree_dir=../repo-wt` runs stages from `../repo-wt/packages/api`.
271
279
 
@@ -285,8 +293,10 @@ Result fields:
285
293
  | `iterations_completed` | Number of research/orchestrate/review loops completed. |
286
294
  | `review_report` | Compact reference to the latest reviewer payload artifact. |
287
295
  | `review_report_path` | JSON artifact path for the latest Ralph review round. |
296
+ | `original_prompt` | The raw user prompt exactly as provided, before the `prompt-refinement` stage. |
297
+ | `refined_prompt` | The clarity-refined prompt produced by the `prompt-refinement` stage and used as the operative objective for research, orchestration, and review. |
288
298
 
289
- A typical end-to-end flow is `/skill:research-codebase` → `/skill:create-spec` → `/workflow goal objective="Implement the researched rate-limit behavior, run the focused tests, and finish when the documented burst behavior is validated"` when you can identify the work surface, state the exact outcome, and name the validation that proves it is done. Keep using `/workflow ralph` for larger migrations, broad refactors, and multi-package changes where you want Atomic to research first, delegate through sub-agents, review, iterate, and optionally allow only the final `pull-request` stage to attempt PR creation with `create_pr=true`.
299
+ A typical planned flow is `/skill:research-codebase` → `/skill:create-spec` → `/workflow ralph prompt="Implement specs/2026-03-rate-limit.md and validate the documented burst behavior"`. Ralph can start from a spec path, GitHub issue, or crisp ticket description, then refines the prompt, researches as needed, delegates through sub-agents, reviews, records a QA proof video for UI/full-stack changes when practical, and iterates. For smaller one-off tasks, use `/workflow goal` with a concrete objective that identifies the work surface, states the exact outcome, and names the validation that proves it is done; add `create_pr=true` only when you want Goal's final `pull-request` stage after approval.
290
300
 
291
301
  ### `open-claude-design`
292
302
 
@@ -294,21 +304,21 @@ Inputs:
294
304
 
295
305
  | Input | Type | Required | Default | Description |
296
306
  |---|---|---|---|---|
297
- | `prompt` | text | yes | — | What to design (dashboard, page, component, prototype, …). |
298
- | `reference` | text | no | | URL, file path, screenshot path, or design doc to import as a reference. |
299
- | `output_type` | select | no | `prototype` | One of `prototype`, `wireframe`, `page`, `component`, `theme`, `tokens`. |
300
- | `design_system` | text | no | — | Path(s) or description of an existing design system (e.g. `DESIGN.md`, `PRODUCT.md`). Skips onboarding when provided. |
301
- | `max_refinements` | number | no | `3` | Maximum critique/apply refinement iterations. |
307
+ | `prompt` | text | yes | — | What to design (dashboard, page, component, prototype, …). The discovery stage refines this into a confirmed brief and asks for the output type and references. |
308
+ | `discover_references` | boolean | no | `true` | Discover beautiful, current reference designs (Awwwards, recent.design, Dribbble, Monet, Motionsites) and feed them to generation. Set `false` to skip the network/browser reference pass. |
309
+ | `max_refinements` | number | no | `3` | Maximum generate/user-feedback loop iterations. |
310
+
311
+ The output type (`prototype`, `wireframe`, `page`, `component`, `theme`, `tokens`) and any reference designs are **not** inputs — the discovery stage asks for them. There is no `design_system` input; the project's `DESIGN.md`/`PRODUCT.md` are established/loaded automatically.
302
312
 
303
313
  Result fields:
304
314
 
305
315
  | Field | Meaning |
306
316
  |---|---|
307
- | `output_type` | Kind of design artifact produced. |
308
- | `design_system` | Design system source used for generation: supplied input or project-derived design system. |
317
+ | `output_type` | Kind of design artifact produced (chosen during the discovery interview). |
318
+ | `design_system` | Design system source used for generation: the project-derived design system. |
309
319
  | `artifact` | Latest final design summary from the approved preview artifact. |
310
320
  | `handoff` | Final rich HTML spec and implementation handoff summary. |
311
- | `approved_for_export` | Whether refinement completed before the final export gate. |
321
+ | `approved_for_export` | Whether the latest user-feedback stage reported no further changes before export. |
312
322
  | `refinements_completed` | Number of refinement iterations completed. |
313
323
  | `import_context` | Reference-import context used during generation. |
314
324
  | `run_id` | Per-run design workflow artifact identifier. |
@@ -321,14 +331,28 @@ Result fields:
321
331
 
322
332
  `open-claude-design` has no `result` output; it exposes only the declared fields listed above. Use the declared `artifact` and `handoff` fields for generated content.
323
333
 
334
+ **Combined discovery/init.** The workflow's first and only front-door stage runs `/skill:impeccable shape` and `/skill:impeccable init` together. It interviews you (via the structured question tool) about what you want to build, the **output type** (`prototype`, `wireframe`, `page`, `component`, `theme`, or `tokens`), and which **references** to emulate (URLs, local file paths, screenshots, or design docs). Then, in the same `discovery` stage, impeccable init performs its own `PRODUCT.md`/`DESIGN.md` detection and creates or reconciles those files as needed. The references you name take **precedence over `DESIGN.md`/`PRODUCT.md`** during generation (the design system fills gaps the references don't cover, and `PRODUCT.md` still governs strategic register/voice). Headless runs infer a defensible brief, output type, references, and project-context assumptions rather than blocking.
335
+
336
+ **Context and reference phase.** Design-system/reference research runs first, then gallery reference discovery uses those findings before the generator consumes the combined context:
337
+
338
+ - *Design-system/reference research* — three parallel passes (`ds-locator` / `ds-analyzer` / `ds-patterns`) extract the project's design-system evidence and also handle user-provided references. URL references are captured with browser/screenshot tooling where available; local files, screenshots, and design docs are parsed by the applicable `ds-*` pass. Their extracted requirements feed the generator and **take precedence over `DESIGN.md`/`PRODUCT.md`**. There are no separate `web-capture-*`, `file-parser-*`, or `design-system-builder` stages.
339
+ - *Reference discovery* (gated by `discover_references=true`, the default) — after the `ds-*` passes complete, the `reference-discovery` stage receives their evidence plus the `PRODUCT.md`/`DESIGN.md` init summary. It uses the `playwright-cli` skill to browse five curated galleries — [Awwwards](https://www.awwwards.com/websites/), [recent.design](https://recent.design/), [Dribbble recents](https://dribbble.com/shots/recent), [Monet](https://www.monet.design/c), and [Motionsites](https://motionsites.ai/) — then **clicks into the standout work** and, ideally, **records a scroll-through video of each real design page so its animations are captured** (with a full-page screenshot as a supplement/fallback) plus the real destination URL (it does not just screenshot the gallery thumbnails, and it falls back to web search when the browser is unavailable). It then asks which curated reference direction you prefer; if none align, it asks you to provide a reference image, screenshot, URL, or local path for best results. The curated **references brief** is persisted to `<artifact_dir>/references.md` and threaded into the generator (`reference_inspiration`) and refinement. Set `discover_references=false` to skip it.
340
+
341
+ **Generate/user-feedback loop.** Refinement is intentionally simple and mirrors Ralph's implement/reviewer rhythm: `generate-1` writes the first `preview.html`, `user-feedback-1` opens that preview with `/skill:impeccable live`, and any captured `live_changes`, `user_notes`, or `annotated_snapshot` feed the next forked `generate-*` stage. Each later `generate-*` forks from the previous generate session, and each later `user-feedback-*` forks from the previous feedback session, preserving focused continuity without adding extra critique/screenshot/apply stages. When a `user-feedback-*` stage captures no meaningful feedback, the loop exports immediately. Export is deliberately just `exporter` followed by `final-display`; there is no pre-export scan, forced-fix stage, or export gate. Captured feedback is persisted as durable artifacts under `<artifact_dir>/feedback/iteration-<n>.md` / `.json` (plus a best-effort copy of the annotated snapshot, constrained to files within the project/artifact dir). If captured notes fail to thread into the next generate prompt, the run fails loudly rather than silently generating without user feedback.
342
+
343
+ **Browser requirement.** open-claude-design is browser-centric (the discovery/preview review and the `live` QA loop need the `playwright-cli` skill's browser). If the browser cannot be made available, the workflow exits cleanly up front — surfacing the would-be artifact paths and install instructions — rather than generating a design you could not review interactively. (This early exit is skipped under the test harness so headless test runs still complete.)
344
+
324
345
  Run examples:
325
346
 
326
347
  ```text
327
348
  /workflow open-claude-design prompt="Refresh the settings page hierarchy"
328
- /workflow open-claude-design prompt="Design a billing page" reference=https://stripe.com/billing output_type=page
329
- /workflow open-claude-design prompt="Generate spacing and color tokens" output_type=tokens design_system=./DESIGN.md
349
+ /workflow open-claude-design prompt="Design a billing page like Stripe's"
350
+ /workflow open-claude-design prompt="Generate spacing and color tokens"
351
+ /workflow open-claude-design prompt="Design a marketing landing page" discover_references=false
330
352
  ```
331
353
 
354
+ The discovery interview asks for the output type and any reference URLs/files, so you no longer pass `output_type`, `reference`, or `design_system` on the command line.
355
+
332
356
  ### Launching with natural language
333
357
 
334
358
  You can also kick off a built-in workflow by describing the task in chat. Atomic picks the matching workflow and fills in inputs from your request:
@@ -389,7 +413,7 @@ If the task is only deterministic TypeScript with no LLM/session stage, use a sc
389
413
  | User goal | Use |
390
414
  |-----------|-----|
391
415
  | Run, inspect, attach to, pause, interrupt, resume, or check status for an existing workflow | `/workflow ...` or `workflow({ action: ... })` |
392
- | Implement a small-to-medium scope change with an identifiable work surface, exact outcome, and named validation | `/workflow goal objective="..."` so Atomic keeps the run bounded, captures receipts in a goal ledger, gates completion through reviewers, and stops as `complete`, `blocked`, or `needs_human` |
416
+ | Implement a small-to-medium scope change with an identifiable work surface, exact outcome, and named validation | `/workflow goal objective="..."` so Atomic keeps the run bounded, captures receipts in a goal ledger, gates completion through reviewers, stops as `complete`, `blocked`, or `needs_human`, and can optionally run a final PR handoff with `create_pr=true` after approval |
393
417
  | Research and execute a larger migration, broad refactor, or multi-package change | `/workflow ralph prompt="..."` so Atomic can transform the prompt into a research question, research the codebase first, delegate implementation through sub-agents, review, and iterate; prompt text alone does not opt in to PR creation, so add `create_pr=true` only when you want the final `pull-request` stage and `pr_report` |
394
418
  | Create or edit reusable automation | a TypeScript workflow definition exported from `workflow({...})` |
395
419
  | Track one-off work without saving a workflow file | direct `workflow({ task })`, `workflow({ tasks })`, or `workflow({ chain })` calls |
@@ -570,13 +594,13 @@ Record the selected pattern in your spec or workflow README, then adapt the diag
570
594
 
571
595
  Claude Code Dynamic Workflows and Atomic are trying to solve a similar class of problem: important software engineering work is too large for one agent pass, so the system should split the job into stages, run agents in parallel, verify the result, and keep enough state to finish long-running work.
572
596
 
573
- The difference is where control lives.
597
+ Atomic's category is broader and more explicit: it is the loop engine for engineering work. The difference is where control lives and how much of the loop you can inspect, version, extend, and connect to your stack.
574
598
 
575
599
  | Dimension | Atomic | Claude Code Dynamic Workflows |
576
600
  | --- | --- | --- |
577
- | Core idea | Open-source, repo-native workflow automation for coding agents. You can run built-ins, tell the coding agent to use a workflow for a task, describe new workflows in natural language for Atomic to scaffold dynamically, or version them as explicit TypeScript files. | Claude dynamically creates orchestration scripts for a task and fans work out to many parallel Claude subagents. |
578
- | Best fit | Teams that want repeatable software engineering workflows they can inspect, version, extend, and run across providers. | Claude Code users who want Claude to decide when a task needs a larger dynamic workflow and orchestrate it automatically. |
579
- | Workflow control | The process is explicit: stages, inputs, handoffs, retries, artifacts, model choices, and human gates are part of the workflow definition. | The process is generated dynamically by Claude for the current task, with confirmation before the first workflow run. |
601
+ | Core idea | Open-source, repo-native loop engine for coding agents. You can run built-ins, tell the coding agent to use a workflow for a task, describe new loops in natural language for Atomic to scaffold dynamically, or version them as explicit TypeScript files. | Claude dynamically creates orchestration scripts for a task and fans work out to many parallel Claude subagents. |
602
+ | Best fit | Teams that want repeatable software engineering loops they can inspect, version, extend, connect to tools, and run across providers. | Claude Code users who want Claude to decide when a task needs a larger dynamic workflow and orchestrate it automatically. |
603
+ | Workflow control | The process is explicit: stages, inputs, handoffs, retries, artifacts, model choices, checkpoints, and human gates are part of the workflow definition. | The process is generated dynamically by Claude for the current task, with confirmation before the first workflow run. |
580
604
  | Models | Model-agnostic. Atomic connects directly to supported API-key and subscription providers, and workflows can use model fallback chains. | Claude-first. Availability is tied to Claude Code, Claude plans, and Anthropic-supported API/cloud channels. |
581
605
  | Extensibility | Built on Pi extensions: add tools, TUI, MCP, web access, intercom, skills, prompt templates, themes, custom providers, and packaged workflows. | Optimized for Claude Code's built-in dynamic orchestration experience rather than an open extension SDK you own in-repo. |
582
606
  | Artifacts and auditability | Research docs, specs, logs, transcripts, reviewer notes, check output, and final summaries can live in the repo or workflow run directory. | Progress is saved and resumable, but the orchestration is primarily a Claude Code runtime behavior. |
@@ -1071,27 +1095,7 @@ Authoring basics:
1071
1095
  - `outputs` declares typed outputs that parent workflows receive from `ctx.workflow(childWorkflow, ...)`.
1072
1096
  - `run: async (ctx) => { ... }` defines the workflow body.
1073
1097
 
1074
- Codemod-style migration from the removed builder API:
1075
-
1076
- ```diff
1077
- -import { defineWorkflow, Type } from "@bastani/workflows";
1078
- +import { workflow } from "@bastani/workflows";
1079
- +import { Type } from "typebox";
1080
-
1081
- -export default defineWorkflow("review-changes")
1082
- - .description("Run two reviewers and synthesize findings.")
1083
- - .input("target", Type.String())
1084
- - .output("decision", Type.String())
1085
- - .run(async (ctx) => ({ decision: await review(ctx) }))
1086
- - .compile();
1087
- +export default workflow({
1088
- + name: "review-changes",
1089
- + description: "Run two reviewers and synthesize findings.",
1090
- + inputs: { target: Type.String() },
1091
- + outputs: { decision: Type.String() },
1092
- + run: async (ctx) => ({ decision: await review(ctx) }),
1093
- +});
1094
- ```
1098
+ Migrating an existing file from the removed `defineWorkflow(...).compile()` builder? See [Migrating from the `defineWorkflow()` Builder API](#migrating-from-the-defineworkflow-builder-api) for the full method-to-key mapping, a before/after walkthrough, and a conversion checklist.
1095
1099
 
1096
1100
  `prompt` and `task` are aliases for task text. Prefer `prompt` inside authored workflow files because it mirrors lower-level `stage.prompt(...)`; `task` remains useful in direct tool calls and chain examples.
1097
1101
 
@@ -1383,7 +1387,7 @@ Common builtin import targets:
1383
1387
  | Workflow name | TypeScript export | Individual module path | Typical use inside another workflow |
1384
1388
  |---|---|---|---|
1385
1389
  | `deep-research-codebase` | `deepResearchCodebase` | `@bastani/workflows/builtin/deep-research-codebase` | Gather broad repo research before planning, synthesis, or implementation. |
1386
- | `goal` | `goal` | `@bastani/workflows/builtin/goal` | Run a bounded implementation/check loop with receipts and reviewer-gated completion. |
1390
+ | `goal` | `goal` | `@bastani/workflows/builtin/goal` | Run a bounded implementation/check loop with receipts and reviewer-gated completion; pass `create_pr=true` to authorize only the final PR-creation stage after approval. |
1387
1391
  | `ralph` | `ralph` | `@bastani/workflows/builtin/ralph` | Delegate a larger migration/refactor effort to Ralph's research/orchestrate/review loop; pass `create_pr=true` to authorize only the final PR-creation stage. |
1388
1392
  | `open-claude-design` | `openClaudeDesign` | `@bastani/workflows/builtin/open-claude-design` | Generate and refine a UI/design artifact and handoff spec. |
1389
1393
 
@@ -1503,6 +1507,121 @@ If a parent workflow exits through `ctx.exit(...)` while a child workflow is in
1503
1507
 
1504
1508
  Continuation replay treats the parent child-workflow boundary as the durable checkpoint: a previously completed child boundary replays with the original exposed outputs and without re-running the child, while a child that failed or was interrupted before completion starts again from the beginning on continuation. If `ctx.exit(...)` wins while a completed boundary is being replayed but before replay finalization, the boundary is finalized as skipped and its preloaded child metadata is omitted from store, persistence, restore, and expanded graph views.
1505
1509
 
1510
+ ## Migrating from the `defineWorkflow()` Builder API
1511
+
1512
+ The chained builder API — `defineWorkflow(name).description(...).input(...).output(...).worktreeFromInputs(...).run(...).compile()` — was removed in [#1457](https://github.com/bastani-inc/atomic/pull/1457). The single `workflow({ name?, description, inputs, outputs, run })` object form is now the only authoring door. There is no shim and no deprecation period: workflow files that still call `defineWorkflow(...).compile()` fail discovery with a module-load error until they are migrated.
1513
+
1514
+ This section is for workflow files written against the previous API. If you are authoring a new workflow, skip it and start from [Writing a Workflow](#writing-a-workflow).
1515
+
1516
+ ### What changed
1517
+
1518
+ - `import { defineWorkflow, Type } from "@bastani/workflows"` → `workflow` now comes from `@bastani/workflows`, and `Type` comes from the `typebox` package directly. `@bastani/workflows` no longer re-exports `Type`. The `Static` and `TSchema` *type* exports are still re-exported from `@bastani/workflows`, so `import type { Static } from "@bastani/workflows"` keeps working — only the runtime `Type` builder moved.
1519
+ - The fluent builder chain became one object literal passed to `workflow({ ... })`.
1520
+ - `name` moved from the `defineWorkflow(name)` argument into the object. It is now **optional** — omit it and discovery derives the name from the filename (the recommended style used by the builtins and most examples), or keep it when you want the name to differ from the file's basename.
1521
+ - `outputs` is now **required**. Workflows that declared no outputs before must now pass `outputs: {}`.
1522
+ - `.compile()` is gone. `workflow({ ... })` returns the frozen, branded definition directly; `export default` it.
1523
+ - The imperative object-form `runWorkflow(...)` runner is also removed (its export is now only a `never` placeholder that throws on access). Programmatic execution uses the exported `run(def, inputs)` helper or a registry — see [Programmatic Usage](#programmatic-usage).
1524
+
1525
+ ### Builder method → object key
1526
+
1527
+ | Removed builder API | New `workflow({ ... })` key |
1528
+ | --- | --- |
1529
+ | `defineWorkflow("name")` argument | `name: "name"` (optional; derived from the filename when omitted) |
1530
+ | `.description(text)` | `description: text` |
1531
+ | `.input(key, schema)` (repeatable) | `inputs: { key: schema, ... }` |
1532
+ | `.output(key, schema)` (repeatable) | `outputs: { key: schema, ... }` (required, even if `{}`) |
1533
+ | `.worktreeFromInputs(binding)` | `worktreeFromInputs: binding` (binding shape unchanged) |
1534
+ | `.run(fn)` callback | `run: fn` |
1535
+ | `.compile()` terminal | delete — `workflow({ ... })` returns the definition |
1536
+
1537
+ `ctx` and every primitive (`ctx.task`, `ctx.chain`, `ctx.parallel`, `ctx.stage`, `ctx.workflow`, `ctx.exit`, `ctx.ui`) are unchanged, so workflow **bodies do not need rewriting** — only the authoring wrapper changes.
1538
+
1539
+ ### Full before / after
1540
+
1541
+ Before (removed API):
1542
+
1543
+ ```ts
1544
+ import { defineWorkflow, Type } from "@bastani/workflows";
1545
+
1546
+ export default defineWorkflow("review-changes")
1547
+ .description("Run two reviewers in parallel and synthesize a decision.")
1548
+ .input("target", Type.String({ description: "Path or change target to review." }))
1549
+ .input("base_branch", Type.String({ default: "origin/main" }))
1550
+ .output("decision", Type.String())
1551
+ .output("concerns", Type.Optional(Type.Array(Type.String())))
1552
+ .worktreeFromInputs({ baseBranch: "base_branch" })
1553
+ .run(async (ctx) => {
1554
+ const target = String(ctx.inputs.target);
1555
+ const [quality, runtime] = await ctx.parallel(
1556
+ [
1557
+ { name: "quality", prompt: `Review quality of ${target}` },
1558
+ { name: "runtime", prompt: `Review runtime behavior of ${target}` },
1559
+ ],
1560
+ { concurrency: 2 },
1561
+ );
1562
+ return { decision: `${quality.text}\n${runtime.text}`, concerns: [] };
1563
+ })
1564
+ .compile();
1565
+ ```
1566
+
1567
+ After (current API):
1568
+
1569
+ ```ts
1570
+ import { workflow } from "@bastani/workflows";
1571
+ import { Type } from "typebox";
1572
+
1573
+ export default workflow({
1574
+ name: "review-changes", // optional — omit to derive from filename
1575
+ description: "Run two reviewers in parallel and synthesize a decision.",
1576
+ inputs: {
1577
+ target: Type.String({ description: "Path or change target to review." }),
1578
+ base_branch: Type.String({ default: "origin/main" }),
1579
+ },
1580
+ outputs: {
1581
+ decision: Type.String(),
1582
+ concerns: Type.Optional(Type.Array(Type.String())),
1583
+ },
1584
+ worktreeFromInputs: { baseBranch: "base_branch" },
1585
+ run: async (ctx) => {
1586
+ const target = String(ctx.inputs.target);
1587
+ const [quality, runtime] = await ctx.parallel(
1588
+ [
1589
+ { name: "quality", prompt: `Review quality of ${target}` },
1590
+ { name: "runtime", prompt: `Review runtime behavior of ${target}` },
1591
+ ],
1592
+ { concurrency: 2 },
1593
+ );
1594
+ return { decision: `${quality.text}\n${runtime.text}`, concerns: [] };
1595
+ },
1596
+ });
1597
+ ```
1598
+
1599
+ ### Conversion checklist
1600
+
1601
+ For each `.atomic/workflows/*.ts` (or workflow-package) file:
1602
+
1603
+ 1. Swap the import to `import { workflow } from "@bastani/workflows"` and add `import { Type } from "typebox"`. Drop `defineWorkflow` from the `@bastani/workflows` import. `import type { Static, TSchema }` can stay on the `@bastani/workflows` import if you use those types.
1604
+ 2. Replace `defineWorkflow("<name>")` with `workflow({`. You may keep `name: "<name>"` or drop the key entirely to derive the name from the filename.
1605
+ 3. Move `.description("<text>")` to a `description: "<text>",` property.
1606
+ 4. Collect every `.input(key, schema)` into one `inputs: { key: schema, ... },` map.
1607
+ 5. Collect every `.output(key, schema)` into one `outputs: { key: schema, ... },` map. If there were no `.output(...)` calls, add `outputs: {},` — it is now required.
1608
+ 6. Move `.worktreeFromInputs(binding)` to a `worktreeFromInputs: binding,` property (same binding shape, unchanged).
1609
+ 7. Move the `.run(fn)` callback to a `run: fn,` property; the body stays byte-for-byte the same.
1610
+ 8. Delete the trailing `.compile()`, close the object with `})`, and keep `export default`.
1611
+ 9. Run `/workflow reload` (or restart Atomic) and `/workflow list` to confirm the file loads. Because `ctx` and its primitives are unchanged, stage behavior, graph layout, resume/kill, and human-input prompts are unaffected.
1612
+
1613
+ ### Gotchas
1614
+
1615
+ - **`outputs` is required.** The old `.output(...)` calls were optional, and a workflow with none compiled fine. The new object form throws `workflow: outputs must be a schema map` when `outputs` is missing, so declare `outputs: {}` for outputless workflows.
1616
+ - **`Type` is no longer re-exported.** `import { Type } from "@bastani/workflows"` fails type-checking; import it from `typebox` instead. (`Static` and `TSchema` *types* are still re-exported from `@bastani/workflows`, so those imports do not need to change.)
1617
+ - **`.compile()` does not exist.** Leaving the call in place produces a runtime `TypeError`; `workflow({ ... })` already returns the frozen, branded definition.
1618
+ - **`name` is derived from the filename when omitted.** `review-changes.ts` becomes the `review-changes` workflow, so an explicit `name` is only needed when it should differ from the basename.
1619
+ - **No hand-rolled definitions.** Objects carrying `__piWorkflow: true` that you construct by hand are rejected by discovery and by `ctx.workflow(...)`. Only definitions minted by `workflow({ ... })` are accepted.
1620
+ - **The imperative `runWorkflow` runner is gone.** It is now a `never` placeholder that throws on access; use the exported `run(def, inputs)` helper or a registry for programmatic execution.
1621
+ - **Keep `outputs` inline for the strictest type checking.** The old builder enforced no-extra-output keys through a `NoExtraOutputs` generic on `.run(fn)`; the object form re-creates that check for inline `outputs` maps, but cannot recover output keys when a schema map is widened or built up before being passed to `workflow({ ... })`. Keep the `outputs` literal inline so the declared-key check stays exact.
1622
+
1623
+ Everything else — stage primitives, `ctx.inputs` typing, runtime validation, DAG inference, MCP scoping, resume/kill, worktree binding, model fallback, and the `/workflow` tool contract — is unchanged.
1624
+
1506
1625
  ## Workflow Primitives
1507
1626
 
1508
1627
  Prefer high-level primitives because they create tracked graph nodes, provide consistent handoff semantics, and keep workflow definitions easier to read.
@@ -1904,3 +2023,653 @@ Good workflows are information-flow systems, not just prompt sequences. Keep sta
1904
2023
  - Do not write stage prompts that depend on hidden workflow-wide awareness; make each model stage locally scoped and self-described.
1905
2024
  - Do not parse model gate decisions from ad-hoc prose with regular expressions; configure `schema` on a focused workflow item and consume `result.structured`.
1906
2025
  - Return compact structured decisions and save large artifacts to files; artifact handoffs should still use files when the next stage does not need the whole payload in context.
2026
+
2027
+ ## Workflow Best Practices
2028
+
2029
+ This is the playbook I use to get consistently better results from coding agents and workflow systems.
2030
+
2031
+ The core idea is simple: do not treat an agent like a magic box. Treat it like a capable engineering partner that needs a clear objective, tight scope, explicit validation, and occasional steering.
2032
+
2033
+ Most weak agent runs fail for predictable reasons: the goal is vague, the scope is too broad, validation is missing, or the agent keeps following the wrong signal. This playbook is about avoiding those failure modes.
2034
+
2035
+ The examples below are synthetic and intentionally generic. Replace placeholders like `[component]`, `[test command]`, and `[workflow]` with your own project details.
2036
+
2037
+ ---
2038
+
2039
+ ### The core loop
2040
+
2041
+ The workflow pattern I rely on most often is:
2042
+
2043
+ ```text
2044
+ Objective -> Scope -> Done criteria -> Run -> Inspect -> Steer -> Validate -> Summarize
2045
+ ```
2046
+
2047
+ In practice, that means:
2048
+
2049
+ 1. Define the end state.
2050
+ 2. Constrain the blast radius.
2051
+ 3. State what counts as done.
2052
+ 4. Let the agent or workflow work.
2053
+ 5. Inspect status before reading details.
2054
+ 6. Steer only when the run is off track, blocked, or missing criteria.
2055
+ 7. Require evidence before accepting the result.
2056
+ 8. Ask for a summary, handoff, or next-step plan.
2057
+
2058
+ A good workflow prompt does not just say what to try. It says what success looks like.
2059
+
2060
+ ---
2061
+
2062
+ ### Prompt anatomy
2063
+
2064
+ A strong workflow prompt usually has these parts:
2065
+
2066
+ #### Objective
2067
+
2068
+ What should be true when the work is complete?
2069
+
2070
+ ```text
2071
+ Implement `[specific behavior]` in `[component]`.
2072
+ ```
2073
+
2074
+ #### Context
2075
+
2076
+ What does the agent need to know before acting?
2077
+
2078
+ ```text
2079
+ This is needed because `[reason]`. The relevant code likely lives near `[area]`.
2080
+ ```
2081
+
2082
+ #### Scope
2083
+
2084
+ What is the agent allowed to change?
2085
+
2086
+ ```text
2087
+ Only touch files directly required for `[behavior]`.
2088
+ ```
2089
+
2090
+ #### Non-goals
2091
+
2092
+ What should the agent avoid?
2093
+
2094
+ ```text
2095
+ Do not redesign `[subsystem]`, refactor unrelated code, or change public behavior outside `[case]`.
2096
+ ```
2097
+
2098
+ #### Done criteria
2099
+
2100
+ How will we know the work is complete?
2101
+
2102
+ ```text
2103
+ Done means:
2104
+ - `[new behavior]` works.
2105
+ - `[existing behavior]` is unchanged.
2106
+ - `[test command]` passes.
2107
+ - The final response includes changed files, validation results, and remaining risks.
2108
+ ```
2109
+
2110
+ #### Stop conditions
2111
+
2112
+ When should the agent stop and ask instead of guessing?
2113
+
2114
+ ```text
2115
+ If this requires changing `[public API/security behavior/data migration]`, stop and ask first.
2116
+ ```
2117
+
2118
+ ---
2119
+
2120
+ ### Core principles
2121
+
2122
+ #### 1. Start with the end state
2123
+
2124
+ I try to describe what should be true at the end, not just what the agent should investigate.
2125
+
2126
+ Bad:
2127
+
2128
+ ```text
2129
+ Look into the login issue.
2130
+ ```
2131
+
2132
+ Better:
2133
+
2134
+ ```text
2135
+ Fix the login redirect regression. Done means users who sign in from `[page]` return to `[expected destination]`, and `[test command]` passes.
2136
+ ```
2137
+
2138
+ #### 2. Keep scope tight
2139
+
2140
+ Agents are often tempted to clean up nearby code. Sometimes that is useful, but most workflow runs should be bounded.
2141
+
2142
+ Use phrases like:
2143
+
2144
+ - `Only touch files required for this behavior.`
2145
+ - `Do not refactor unrelated code.`
2146
+ - `Preserve existing behavior for [case].`
2147
+ - `Make the smallest correct change.`
2148
+
2149
+ #### 3. Separate implementation from validation
2150
+
2151
+ A change is not done because the agent says it is done. It is done when the relevant evidence supports it.
2152
+
2153
+ That evidence can be:
2154
+
2155
+ - a targeted test,
2156
+ - a broader regression test,
2157
+ - a smoke command,
2158
+ - a typecheck or lint command,
2159
+ - a structured output contract check,
2160
+ - or a clear manual verification step.
2161
+
2162
+ #### 4. Prefer evidence over speculation
2163
+
2164
+ When something fails, I steer the agent back to the observable signal: the error, failing test, log line, user behavior, or broken contract.
2165
+
2166
+ ```text
2167
+ Treat the failing assertion as the source of truth. Do not guess from nearby code alone.
2168
+ ```
2169
+
2170
+ #### 5. Use staged thinking
2171
+
2172
+ For ambiguous work, I usually separate the flow into stages:
2173
+
2174
+ ```text
2175
+ Investigate -> identify root cause -> propose fix -> implement -> validate -> summarize
2176
+ ```
2177
+
2178
+ If the cause is not clear, I do not want the agent making broad changes just to see what happens.
2179
+
2180
+ #### 6. Steer, do not micromanage
2181
+
2182
+ The best steering messages are short and corrective. They add constraints, redirect attention, or provide a decision.
2183
+
2184
+ You usually do not need to rewrite the whole prompt. You need to say what changed.
2185
+
2186
+ #### 7. Treat failed validation as the next task
2187
+
2188
+ A failed test is not a footnote. It becomes the next objective.
2189
+
2190
+ ```text
2191
+ Validation failed on `[command]`. Treat that as the source of truth. Fix the root cause only, rerun the failing check, then report the result.
2192
+ ```
2193
+
2194
+ #### 8. Interrupt stale or wrong work
2195
+
2196
+ If a run is solving the wrong problem, based on outdated assumptions, or duplicating another run, stop it. Letting it continue usually creates more cleanup later.
2197
+
2198
+ #### 9. Inspect at the right level
2199
+
2200
+ For long-running workflows, I do not start by reading every log. I check:
2201
+
2202
+ 1. overall status,
2203
+ 2. current stage,
2204
+ 3. blocker or failure reason,
2205
+ 4. relevant stage details only if needed.
2206
+
2207
+ #### 10. Ask for synthesis before handoff
2208
+
2209
+ Before switching from investigation to implementation, or from implementation to review, I often ask for a concise synthesis:
2210
+
2211
+ ```text
2212
+ Summarize root cause, proposed fix, files involved, validation plan, and remaining risks.
2213
+ ```
2214
+
2215
+ ---
2216
+
2217
+ ### Common workflow patterns
2218
+
2219
+ #### Scoped implementation sprint
2220
+
2221
+ **Use when:** You have a clear feature, bug fix, or issue to delegate.
2222
+
2223
+ **Prompt shape:**
2224
+
2225
+ ```text
2226
+ Implement `[feature]` in `[component]`. Only touch files directly needed for this behavior. Done means the new behavior works, existing behavior is unchanged, and `[test command]` passes.
2227
+ ```
2228
+
2229
+ **Why it works:** The agent gets autonomy, but the objective and blast radius are bounded.
2230
+
2231
+ **Validation:** Run the most relevant targeted check first, then a broader nearby check if the change is risky.
2232
+
2233
+ ---
2234
+
2235
+ #### Regression repair loop
2236
+
2237
+ **Use when:** CI, tests, typecheck, lint, or smoke validation fails.
2238
+
2239
+ **Prompt shape:**
2240
+
2241
+ ```text
2242
+ Fix the failing `[test suite]` regression. Treat the failure output as the source of truth. Do not refactor unrelated code. Done means the failing test passes and no nearby tests regress.
2243
+ ```
2244
+
2245
+ **Why it works:** It anchors the run to observable evidence instead of speculation.
2246
+
2247
+ **Validation:** Reproduce the failure, fix the root cause, rerun the failing check, then run a nearby or broader check.
2248
+
2249
+ ---
2250
+
2251
+ #### Workflow or tooling smoke test
2252
+
2253
+ **Use when:** You changed a workflow definition, prompt contract, structured output, CLI behavior, or developer tool.
2254
+
2255
+ **Prompt shape:**
2256
+
2257
+ ```text
2258
+ Validate `[workflow/tool]` after the change. Run a minimal smoke case, confirm required outputs are present, and report whether it can be invoked with expected inputs.
2259
+ ```
2260
+
2261
+ **Why it works:** Workflow and tooling changes often fail at integration boundaries. A small smoke case catches those failures early.
2262
+
2263
+ **Validation:** Reload or rerun the tool, check the output shape, and report contract mismatches.
2264
+
2265
+ ---
2266
+
2267
+ #### Human-in-the-loop checkpoint
2268
+
2269
+ **Use when:** The workflow might need a product decision, API decision, migration choice, or risky approval.
2270
+
2271
+ **Prompt shape:**
2272
+
2273
+ ```text
2274
+ If the fix would require changing public API behavior, stop and ask before proceeding. Otherwise proceed with the smallest compatible fix.
2275
+ ```
2276
+
2277
+ **Why it works:** The agent keeps moving where it can, but does not guess on high-impact decisions.
2278
+
2279
+ **Validation:** Confirm the decision is reflected in the final behavior and summary.
2280
+
2281
+ ---
2282
+
2283
+ #### Release gate
2284
+
2285
+ **Use when:** Preparing a release, version bump, changelog, publish step, migration, or deployment-adjacent task.
2286
+
2287
+ **Prompt shape:**
2288
+
2289
+ ```text
2290
+ Prepare a `[release kind]` release for `[version]`. Do not publish unless validation passes. Report the exact checks performed and any unresolved blockers.
2291
+ ```
2292
+
2293
+ **Why it works:** Release work needs explicit gates and stop conditions.
2294
+
2295
+ **Validation:** Require changelog review, tests, build/package checks, and a clear publish/no-publish decision.
2296
+
2297
+ ---
2298
+
2299
+ #### Monitor-and-steer long run
2300
+
2301
+ **Use when:** A workflow runs asynchronously, has multiple stages, or may need supervision.
2302
+
2303
+ **Prompt shape:**
2304
+
2305
+ ```text
2306
+ Show the current stage and blocker. If implementation is complete, summarize validation status and remaining risks.
2307
+ ```
2308
+
2309
+ **Why it works:** It avoids both blind trust and excessive log-reading.
2310
+
2311
+ **Validation:** Inspect status first, then stages, then only the relevant details.
2312
+
2313
+ ---
2314
+
2315
+ #### Investigate before implementing
2316
+
2317
+ **Use when:** A bug or request is ambiguous.
2318
+
2319
+ **Prompt shape:**
2320
+
2321
+ ```text
2322
+ Investigate `[bug]`, identify root cause, and propose the smallest fix. Do not implement until the cause is clear.
2323
+ ```
2324
+
2325
+ **Why it works:** It prevents the agent from making changes before it understands the failure mode.
2326
+
2327
+ **Validation:** Ask for a reproduction, root-cause explanation, proposed fix, and test plan before implementation.
2328
+
2329
+ ---
2330
+
2331
+ ### Steering patterns
2332
+
2333
+ #### Tighten scope
2334
+
2335
+ **Signal:** The agent starts expanding into adjacent cleanup, unrelated files, or broad refactors.
2336
+
2337
+ **Steer:**
2338
+
2339
+ ```text
2340
+ Narrow this to `[specific behavior]` in `[component]`. Do not refactor unrelated code or change `[adjacent area]`. Done means `[specific acceptance criteria]`.
2341
+ ```
2342
+
2343
+ **Why:** Prevents risky changes and keeps the run reviewable.
2344
+
2345
+ ---
2346
+
2347
+ #### Add missing done criteria
2348
+
2349
+ **Signal:** The agent has a plan, but no clear finish line.
2350
+
2351
+ **Steer:**
2352
+
2353
+ ```text
2354
+ Use these done criteria:
2355
+ 1. `[behavior]` works.
2356
+ 2. `[existing behavior]` remains unchanged.
2357
+ 3. `[test command]` passes.
2358
+ 4. Report files changed and validation results.
2359
+ ```
2360
+
2361
+ **Why:** Makes completion verifiable.
2362
+
2363
+ ---
2364
+
2365
+ #### Redirect an off-track stage
2366
+
2367
+ **Signal:** The workflow is investigating the wrong area or solving the wrong problem.
2368
+
2369
+ **Steer:**
2370
+
2371
+ ```text
2372
+ Stop pursuing `[wrong direction]`. The relevant signal is `[error/test/user behavior]`. Re-focus on `[target area]` and continue from there.
2373
+ ```
2374
+
2375
+ **Why:** Saves time and prevents wrong assumptions from compounding.
2376
+
2377
+ ---
2378
+
2379
+ #### Respond to a blocked prompt
2380
+
2381
+ **Signal:** The workflow asks for approval, a choice, or clarification.
2382
+
2383
+ **Steer:**
2384
+
2385
+ ```text
2386
+ Choose `[option]`. Continue only if `[condition]`; otherwise stop and report the blocker.
2387
+ ```
2388
+
2389
+ **Why:** Keeps the workflow unblocked without adding ambiguity.
2390
+
2391
+ ---
2392
+
2393
+ #### Turn failed validation into the next task
2394
+
2395
+ **Signal:** Tests, typecheck, lint, build, or smoke checks fail.
2396
+
2397
+ **Steer:**
2398
+
2399
+ ```text
2400
+ Validation failed on `[command]`. Treat that as the source of truth. Fix the root cause only, rerun the failing check, then report the result.
2401
+ ```
2402
+
2403
+ **Why:** Prevents accepting partially working output.
2404
+
2405
+ ---
2406
+
2407
+ #### Ask for synthesis
2408
+
2409
+ **Signal:** The workflow has gathered information, but the next action is unclear.
2410
+
2411
+ **Steer:**
2412
+
2413
+ ```text
2414
+ Synthesize the current findings into: root cause, proposed fix, files likely involved, validation plan, and remaining risks.
2415
+ ```
2416
+
2417
+ **Why:** Converts exploration into a usable plan.
2418
+
2419
+ ---
2420
+
2421
+ #### Pause, kill, or rerun
2422
+
2423
+ **Signal:** A run is stale, duplicated, superseded, or based on outdated assumptions.
2424
+
2425
+ **Steer:**
2426
+
2427
+ ```text
2428
+ Pause this run; it has been superseded by `[new context]`. Resume only with `[updated objective]`, or stop and summarize current state.
2429
+ ```
2430
+
2431
+ **Why:** Avoids conflicting changes and wasted work.
2432
+
2433
+ ---
2434
+
2435
+ ### Copy-paste templates
2436
+
2437
+ #### Start a workflow
2438
+
2439
+ ```text
2440
+ Objective:
2441
+ Implement/fix `[specific behavior]` in `[component]`.
2442
+
2443
+ Context:
2444
+ `[short context about why this matters or where to look]`
2445
+
2446
+ Scope:
2447
+ - Only touch files required for `[behavior]`.
2448
+ - Do not refactor unrelated code.
2449
+ - Preserve existing behavior for `[existing case]`.
2450
+
2451
+ Done criteria:
2452
+ - `[new behavior]` works.
2453
+ - `[regression case]` still works.
2454
+ - `[test command]` passes.
2455
+ - Report changed files, validation results, and any risks.
2456
+
2457
+ Stop conditions:
2458
+ - If this requires `[risky decision]`, stop and ask first.
2459
+ ```
2460
+
2461
+ #### Tighten scope
2462
+
2463
+ ```text
2464
+ Tighten scope to `[specific target]`.
2465
+
2466
+ Do not work on:
2467
+ - `[excluded area 1]`
2468
+ - `[excluded area 2]`
2469
+ - broad cleanup or unrelated refactors
2470
+
2471
+ Continue only on the path needed to satisfy:
2472
+ `[acceptance criterion]`.
2473
+ ```
2474
+
2475
+ #### Add acceptance criteria
2476
+
2477
+ ```text
2478
+ Add these acceptance criteria before continuing:
2479
+
2480
+ 1. User can `[action]`.
2481
+ 2. System handles `[edge case]`.
2482
+ 3. Existing behavior `[existing behavior]` is unchanged.
2483
+ 4. `[test command]` passes.
2484
+ 5. Final response includes validation evidence.
2485
+ ```
2486
+
2487
+ #### Redirect a stage
2488
+
2489
+ ```text
2490
+ This stage is off track.
2491
+
2492
+ Stop investigating `[wrong area]`.
2493
+ The relevant signal is `[error/output/requirement]`.
2494
+ Refocus on `[correct area]`.
2495
+
2496
+ Next:
2497
+ 1. Reproduce or inspect `[signal]`.
2498
+ 2. Identify root cause.
2499
+ 3. Make the smallest fix.
2500
+ 4. Run `[validation command]`.
2501
+ ```
2502
+
2503
+ #### Handle failed validation
2504
+
2505
+ ```text
2506
+ Validation failed:
2507
+
2508
+ Command:
2509
+ `[command]`
2510
+
2511
+ Failure:
2512
+ `[short sanitized failure summary]`
2513
+
2514
+ Treat this as the source of truth.
2515
+ Fix only the root cause.
2516
+ Rerun the failing command.
2517
+ If it still fails, summarize the blocker and stop.
2518
+ ```
2519
+
2520
+ #### Ask for synthesis
2521
+
2522
+ ```text
2523
+ Synthesize current progress into:
2524
+
2525
+ - What was attempted
2526
+ - What changed
2527
+ - What evidence supports the result
2528
+ - What remains uncertain
2529
+ - Recommended next steps
2530
+ - Exact validation commands run
2531
+ ```
2532
+
2533
+ #### Turn findings into implementation steps
2534
+
2535
+ ```text
2536
+ Convert the findings into an implementation plan:
2537
+
2538
+ 1. Files/components to change
2539
+ 2. Order of changes
2540
+ 3. Tests to add or update
2541
+ 4. Validation commands
2542
+ 5. Risks or edge cases
2543
+ 6. Stop conditions
2544
+ ```
2545
+
2546
+ #### Prepare a release gate
2547
+
2548
+ ```text
2549
+ Prepare `[version]` as a `[release kind]` release.
2550
+
2551
+ Requirements:
2552
+ - Verify changelog entries are complete.
2553
+ - Run `[test command]`.
2554
+ - Run `[build/package command]`.
2555
+ - Do not publish unless all validation passes.
2556
+ - If any gate fails, stop and report blockers.
2557
+
2558
+ Final response should include:
2559
+ - Version
2560
+ - Checks run
2561
+ - Results
2562
+ - Files changed
2563
+ - Publish readiness
2564
+ ```
2565
+
2566
+ ---
2567
+
2568
+ ### Concrete examples
2569
+
2570
+ #### Example 1: Fixing a failing test
2571
+
2572
+ **Scenario:** A package has one failing unit test after a recent change.
2573
+
2574
+ **Initial objective:**
2575
+
2576
+ ```text
2577
+ Fix the failing `[unit test]`. Do not rewrite the module. Done means the test passes and nearby tests still pass.
2578
+ ```
2579
+
2580
+ **Steering message:**
2581
+
2582
+ ```text
2583
+ Stop exploring unrelated failures. Focus only on the assertion mismatch in `[test file]`.
2584
+ ```
2585
+
2586
+ **Validation:** Run `[targeted test command]`, then `[nearby test command]`.
2587
+
2588
+ **Outcome:** Small fix applied, regression test passes, and the workflow reports exact commands and results.
2589
+
2590
+ ---
2591
+
2592
+ #### Example 2: Repairing a workflow definition
2593
+
2594
+ **Scenario:** A custom workflow no longer returns the expected structured output.
2595
+
2596
+ **Initial objective:**
2597
+
2598
+ ```text
2599
+ Validate `[workflow]` and fix its output contract. Done means the smoke run returns `[required fields]`.
2600
+ ```
2601
+
2602
+ **Steering message:**
2603
+
2604
+ ```text
2605
+ Treat the missing output field as the root issue. Do not change unrelated stage prompts.
2606
+ ```
2607
+
2608
+ **Validation:** Reload the workflow, run a minimal smoke input, and inspect the structured result.
2609
+
2610
+ **Outcome:** Contract fixed, smoke test passes, and the workflow can be reused safely.
2611
+
2612
+ ---
2613
+
2614
+ #### Example 3: Investigating before implementing
2615
+
2616
+ **Scenario:** A user-reported bug is ambiguous.
2617
+
2618
+ **Initial objective:**
2619
+
2620
+ ```text
2621
+ Investigate `[bug]`, identify root cause, and propose the smallest fix. Do not implement until the cause is clear.
2622
+ ```
2623
+
2624
+ **Steering message:**
2625
+
2626
+ ```text
2627
+ Synthesize findings first: root cause, affected path, proposed fix, and validation plan.
2628
+ ```
2629
+
2630
+ **Validation:** Add or run a reproduction test before changing code.
2631
+
2632
+ **Outcome:** Clear implementation plan produced, then delegated as a scoped fix.
2633
+
2634
+ ---
2635
+
2636
+ ### Anti-patterns
2637
+
2638
+ | Anti-pattern | Better approach |
2639
+ | --- | --- |
2640
+ | `Fix this.` | `Fix [specific failure]; done means [test command] passes.` |
2641
+ | No validation step | Require tests, smoke checks, typecheck, or explicit manual verification. |
2642
+ | Broad refactors | Constrain the run to the files needed for the objective. |
2643
+ | Letting a wrong stage continue | Redirect or interrupt as soon as the agent follows the wrong signal. |
2644
+ | Accepting unverified summaries | Ask for changed files, commands run, results, and remaining risks. |
2645
+ | Mixing investigation and implementation too early | Ask for root cause and proposed fix before code changes. |
2646
+ | Ignoring blocked stages | Answer directly with one decision and any constraints. |
2647
+ | Continuing stale runs | Pause, kill, or rerun with updated context. |
2648
+ | Reading every log | Inspect status, then stages, then only relevant details. |
2649
+ | Publishing without gates | Require release validation and explicit stop conditions. |
2650
+
2651
+ ---
2652
+
2653
+ ### Quick reference
2654
+
2655
+ Before starting a workflow, include:
2656
+
2657
+ - [ ] Objective
2658
+ - [ ] Context
2659
+ - [ ] Scope
2660
+ - [ ] Non-goals
2661
+ - [ ] Done criteria
2662
+ - [ ] Validation command
2663
+ - [ ] Reporting requirements
2664
+ - [ ] Stop conditions
2665
+
2666
+ Before accepting a workflow result, ask:
2667
+
2668
+ - [ ] What changed?
2669
+ - [ ] Why was this the right fix?
2670
+ - [ ] What evidence supports it?
2671
+ - [ ] Which commands were run?
2672
+ - [ ] What still might be risky?
2673
+ - [ ] Is anything blocked or unresolved?
2674
+
2675
+ The better the prompt defines the game, the better the agent can play it.