@bastani/atomic 0.8.31-alpha.4 → 0.8.31-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -23,6 +23,7 @@
23
23
 
24
24
  ### Fixed
25
25
 
26
+ - Exposed `SessionManager.usesDefaultSessionDir()` through the read-only extension session-manager surface so bundled extensions can distinguish default global session storage from non-default `--session-dir`, `ATOMIC_CODING_AGENT_SESSION_DIR`, or settings-backed session directories without path guessing ([#1444](https://github.com/bastani-inc/atomic/issues/1444)).
26
27
  - Fixed `github-copilot/*` Gemini models (for example `github-copilot/gemini-3.1-pro-preview` and `github-copilot/gemini-3.5-flash`) failing **every** chat turn with `Error: 400 invalid request body`. These models are served through GitHub's Copilot API (CAPI), which translates the OpenAI chat-completions request into a Google GenAI `GenerateContent` request and forwards tool/function JSON Schema `anyOf`/`oneOf` verbatim into Gemini's `FunctionDeclaration` schema. Gemini rejects a union whose branch is a complex **object** schema, so Google returned HTTP 400 and CAPI relabelled it `{"error":{"code":"invalid_request_body"}}`. Because Atomic's bundled `workflow` tool — and any tool using the TypeBox `Type.Union([Type.Object(...), Type.String()])` pattern for fields such as `task`, `chain`, and `parallel` — is present in normal chat turns, the request failed before the model ever ran (it was previously masked only when a fallback model existed). Atomic now sanitizes outbound tool JSON Schemas for GitHub Copilot Gemini models into the subset CAPI/Gemini honors: it resolves object/array-bearing `anyOf`/`oneOf` to their most expressive branch, converts `const`/literal unions to `enum`, collapses nullable unions to `nullable`, prunes `required` to existing properties, and drops non-portable keywords (`additionalProperties`, `patternProperties`, `$schema`, `format`, `pattern`, numeric/length bounds, `default`, `title`, etc.). The transform is gated to `github-copilot` Gemini `openai-completions` models and runs last in the provider-payload pipeline (so it also covers extension/SDK-injected tools), leaving every other provider/model payload unchanged.
27
28
  - Fixed `github-copilot/*` Gemini models getting stuck in an infinite tool-call retry loop (most visibly on the workflow `structured_output` tool). Capturing the raw CAPI stream confirmed that Gemini serializes array/object function-call arguments as **flattened indexed keys** on the wire — for example `{ keywords: ["a", "b"] }` arrives as `{ "keywords[0]": "a", "keywords[1]": "b" }` — so schema validation failed (`keywords: must have required properties keywords` and `root: must not have additional properties`) and the model re-emitted the same shape forever. Atomic now reconstructs flattened tool-call arguments (`name[i]`, `name[i].sub`, `parent.child`) back into proper arrays/objects in each tool's `prepareArguments` step, before validation runs. Gated to GitHub Copilot Gemini models at call time and a no-op for well-formed arguments, so it covers built-in, extension, SDK, and MCP tools without affecting any other provider/model.
28
29
  - Fixed `github-copilot/*` Gemini models (for example `github-copilot/gemini-3.1-pro-preview`) silently dying mid-task instead of continuing the turn. Inspecting the affected sessions and confirming against GitHub's Copilot API (CAPI) source showed two distinct degenerate stream endings that Atomic was not recovering from: (1) CAPI's `getAzureFinishReason` maps several Gemini finish reasons — `MALFORMED_FUNCTION_CALL`, `OTHER`, `LANGUAGE`, and `UNEXPECTED_TOOL_CALL` — to a bare OpenAI `finish_reason: "error"`, which `pi-ai` surfaces as `"Provider finish_reason: error"`; the auto-retry classifier's regex did not match it, so the turn ended with an empty assistant message and no retry; and (2) Gemini intermittently ends the stream with `finish_reason: "stop"`, an **empty content array**, and **0 output tokens**, which Atomic treated as a successful (if empty) turn and stopped. Atomic now treats bare `finish_reason: error`/`content_filter` as retryable and detects degenerate empty completions (no text/tool-call/thinking content **and** zero output tokens on a `stop`/`toolUse` turn) as retryable, re-issuing the request with the existing exponential-backoff path. Empty `stop` completions also no longer reset the auto-retry counter, so repeated empties stay bounded by `maxRetries` instead of retrying forever.
@@ -4,7 +4,7 @@
4
4
 
5
5
  ### Changed
6
6
 
7
- - Published a synchronized Atomic 0.8.31-alpha.3 prerelease; no functional Cursor provider changes were made after 0.8.30.
7
+ - Published a synchronized Atomic 0.8.31-alpha.5 prerelease; no functional Cursor provider changes were made after 0.8.30.
8
8
 
9
9
  ## [0.8.30] - 2026-06-17
10
10
 
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/cursor",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Experimental first-party Atomic extension for Cursor OAuth, model discovery, and streaming provider registration.",
6
6
  "contributors": [
@@ -40,7 +40,7 @@
40
40
  }
41
41
  },
42
42
  "dependencies": {
43
- "@bastani/atomic-natives": "0.8.31-alpha.4",
43
+ "@bastani/atomic-natives": "0.8.31-alpha.5",
44
44
  "@bufbuild/protobuf": "^2.0.0"
45
45
  }
46
46
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/intercom",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Atomic extension providing a private coordination channel between parent and child agent sessions. Fork of: https://github.com/nicobailon/pi-intercom",
6
6
  "contributors": [
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/mcp",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Atomic extension that adapts MCP (Model Context Protocol) servers into the coding agent. Fork of: https://github.com/nicobailon/pi-mcp-adapter",
6
6
  "contributors": [
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/subagents",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Atomic extension for delegating tasks to subagents with chains, parallel execution, and TUI clarification. Fork of: https://github.com/nicobailon/pi-subagents",
6
6
  "contributors": [
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/web-access",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Atomic extension for web search, URL fetching, GitHub repo cloning, PDF/video extraction. Fork of: https://github.com/nicobailon/pi-web-access",
6
6
  "contributors": [
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
12
12
 
13
13
  ### Added
14
14
 
15
+ - Added a deterministic workflow-stage resume stop hook: after an interactive interrupt/pause is resumed with a message, the executor suppresses the #1099/#1264 readiness prompt for that resume-answer turn (including `ask_user_question` turns) and, when the stage remains promptable, sends `Continue where you left off.` in the same stage session once per resume; schema-backed stages that already finalized with `structured_output` consume the token without a second prompt ([#1407](https://github.com/bastani-inc/atomic/issues/1407)).
15
16
  - Added a QA end-to-end proof video to the builtin `ralph` workflow. For UI-applicable or full-stack changes, the orchestrator now runs a `playwright-cli` end-to-end QA pass that drives the running app like a user, records a reviewable video (`playwright-cli video-start`/`video-stop`) to a stable run path, references it in the implementation notes (`## QA E2E Video`), and exposes it as the new optional `qa_video_path` output so the proof is available when the orchestrator finishes. When `create_pr=true`, the final `pull-request` stage attaches or links that video to the created PR/MR/review (embedding/linking where the provider supports media uploads, otherwise surfacing the absolute path). When no user-visible UI scenario applies, no video is produced and the notes record why.
16
17
  - Added a per-model context-window authoring token to workflow model strings: a parenthesized size token placed in the model-name portion, *before* the optional `:reasoning` suffix, e.g. `github-copilot/claude-opus-4.8 (1m):xhigh`. Adopting GitHub Copilot's `Claude Opus 4.8 (1M context)` naming convention keeps the window separate from the reasoning level so the two never collide. The token is resolved against the candidate model's advertised windows — an exact match wins, otherwise the largest supported window not exceeding the request (so `(1m)` selects a model's ~936K long-context tier), and it falls back to the model's default (short) window when no larger tier is available. It applies only to the candidate that carries the token, leaving primary and other fallback models untouched. Also surfaced `contextWindow`/`contextWindowStrict` on `StageOptions` and the workflow tool's direct-task schema for stage-level selection.
17
18
 
@@ -24,6 +25,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
24
25
 
25
26
  ### Fixed
26
27
 
28
+ - Fixed workflow stage transcripts ignoring the host's resolved non-default session directory in headless runs. Stages without an explicit `sessionDir` now inherit the active main-session directory when it comes from `--session-dir`, `ATOMIC_CODING_AGENT_SESSION_DIR`, or settings; explicit per-stage `sessionDir` still wins, default host sessions keep writing stages to the global store, and forked stages inherit the non-default directory too ([#1444](https://github.com/bastani-inc/atomic/issues/1444)).
29
+ - Fixed a manual workflow pause/resume not updating the main-chat run status the way the `workflow` tool and `/workflow pause`/`/workflow resume` do. Pausing a stage from the attached stage chat (Escape) or any direct live-handle path recorded only the **stage** as paused (`recordStagePaused`) and never the **run** (`recordRunPaused`), so the below-editor status widget and `/workflow status` kept showing the run as `running` (`●`) even though work was paused; resume had the symmetric gap. The executor stage-control handle now records run-level pause/resume itself — marking the run paused once no stage is still actively running (mirroring `pauseRun`'s all-active-stages-paused rule) and restoring it on resume — so manual and tool-driven pause/resume update the main chat identically. Both run-level transitions are idempotent, so the tool/slash path and cascade re-entry stay safe.
30
+ - Fixed the builtin `ralph` workflow review loop iterating until `max_loops` even when reviewers judged the patch correct. The unanimous-approval gate required a literally empty `findings` array, so a single low-priority **P3** nit — or a placeholder/dummy finding a reviewer appended because it wrongly believed an empty array would fail schema validation — kept the loop spinning despite every reviewer reporting `overall_correctness: "patch is correct"`. Approval is now **severity-aware and deterministic**: a reviewer approves when it judged the patch correct, reported no `reviewer_error`, and filed no *blocking* finding, where blocking = **P0/P1/P2** (priority 0/1/2) and **P3** (priority 3) is a non-blocking nice-to-have; a finding without a determinable priority (`null`/`undefined`) is treated as blocking so ambiguity never silently approves. The decision is computed from finding priorities rather than the reviewer's self-reported `stop_review_loop` flag. Extracted the gate into `builtin/ralph-review-gate.ts` (`reviewDecisionApproved`, `isBlockingFinding`) with unit coverage, and updated the reviewer prompt so an empty `findings` array is explicitly valid and placeholder findings are never fabricated ([#1407](https://github.com/bastani-inc/atomic/issues/1407)).
27
31
  - Fixed workflow stage **model fallback misreporting configured providers as `No API key found`**. Each fallback candidate session was created with a fresh `AuthStorage`/`ModelRegistry`, so after a primary model failed (for example the Ralph `reviewer-a` chain hitting an unavailable `anthropic/claude-fable-5` and getting a real provider 404), every fallback candidate re-read `auth.json` from scratch. Under concurrent reviewer stages and OAuth token refreshes holding the `auth.json` lock, that fresh synchronous reload could fail and silently fall back to an empty credential set, reporting `No API key found` for `anthropic`/`openai-codex`/`github-copilot` even while sibling reviewer stages used those exact providers successfully. A stage now captures the `ModelRegistry` (and its already-loaded `AuthStorage`) from its first session and threads it into every subsequent fallback candidate, so a successfully-loaded credential store is reused across the whole fallback chain instead of being discarded and re-loaded per candidate. Combined with the coding-agent change that surfaces a real credential-store load failure instead of `No API key found`, a transient store-read failure remains a recoverable/retryable auth failure ([#1431](https://github.com/bastani-inc/atomic/issues/1431)).
28
32
  - Fixed post-completion workflow follow-ups replaying the entire model-fallback chain from an unavailable primary instead of resuming on the model the stage settled on. After model fallback succeeded, the stage kept its working `session` but left `sessionPromise` undefined, and `ensureSession()` only checked `sessionPromise` — so a follow-up (`ctx.followUp`/`ctx.steer`/`ensureAttached`, and post-completion `workflow send`/TUI prompts) created a brand-new session from `candidates[0]` (the primary), discarding the working fallback session. For a chain whose primary 404s (e.g. `anthropic/claude-fable-5`), every follow-up re-ran `primary -> 404 -> ... -> working model` and could leave the stage stuck on the unavailable primary. `ensureSession()` now reuses an already-attached session, and `promptWithFallback()` retries the last-settled model first (for both live retained sessions and disk-reattached sessions), restarting the full chain from the primary only if that model fails again retryably ([#1431](https://github.com/bastani-inc/atomic/issues/1431)).
29
33
 
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Review-gate severity logic for the builtin `ralph` workflow.
3
+ *
4
+ * The bounded review loop must stop as soon as the patch is judged correct, even
5
+ * when a reviewer leaves a low-priority nit (or, occasionally, appends a
6
+ * placeholder finding because it wrongly believed an empty `findings` array would
7
+ * fail schema validation). Requiring a literally empty `findings` array made the
8
+ * loop run to its `max_loops` cap in those cases despite unanimous "patch is correct"
9
+ * verdicts.
10
+ *
11
+ * Approval is therefore severity-aware and deterministic. A single reviewer
12
+ * approves when it judged the patch correct, reported no `reviewer_error`, and
13
+ * filed no *blocking* finding:
14
+ *
15
+ * - Blocking = P0/P1/P2 (numeric priority 0, 1, or 2).
16
+ * - Non-blocking = P3 (numeric priority 3) — a nice-to-have that should not keep
17
+ * the loop spinning.
18
+ * - A finding whose priority cannot be determined (`null`/`undefined`) is treated
19
+ * as blocking, so genuine ambiguity never silently approves.
20
+ *
21
+ * The decision is computed from the structured findings rather than the
22
+ * reviewer's self-reported `stop_review_loop` boolean, so the gate does not
23
+ * depend on the model correctly deriving that flag.
24
+ */
25
+
26
+ export type ReviewFinding = {
27
+ readonly title: string;
28
+ readonly body: string;
29
+ readonly confidence_score: number;
30
+ readonly priority?: number | null;
31
+ readonly code_location: {
32
+ readonly absolute_file_path: string;
33
+ readonly line_range: {
34
+ readonly start: number;
35
+ readonly end: number;
36
+ };
37
+ };
38
+ };
39
+
40
+ export type ReviewerError = {
41
+ readonly kind:
42
+ | "validation_unavailable"
43
+ | "dependency_unavailable"
44
+ | "tool_failure"
45
+ | "reviewer_failure";
46
+ readonly message: string;
47
+ readonly attempted_recovery: string;
48
+ };
49
+
50
+ export type ReviewDecision = {
51
+ readonly findings: readonly ReviewFinding[];
52
+ readonly overall_correctness: "patch is correct" | "patch is incorrect";
53
+ readonly overall_explanation: string;
54
+ readonly overall_confidence_score: number;
55
+ readonly stop_review_loop: boolean;
56
+ readonly reviewer_error?: ReviewerError | null;
57
+ };
58
+
59
+ /**
60
+ * Highest finding priority that still blocks approval. P0=0, P1=1, P2=2 block;
61
+ * P3=3 does not.
62
+ */
63
+ export const MAX_BLOCKING_PRIORITY = 2;
64
+
65
+ /**
66
+ * True when a finding must keep the review loop iterating. P0/P1/P2 block; P3 is
67
+ * a non-blocking nice-to-have. A finding without a determinable priority
68
+ * (`null`/`undefined`) is treated as blocking so ambiguity never silently
69
+ * approves.
70
+ */
71
+ export function isBlockingFinding(finding: ReviewFinding): boolean {
72
+ const priority = finding.priority;
73
+ if (priority === undefined || priority === null) return true;
74
+ return priority <= MAX_BLOCKING_PRIORITY;
75
+ }
76
+
77
+ /**
78
+ * A single reviewer approves (would stop the loop) when it judged the patch
79
+ * correct, surfaced no reviewer execution error, and filed no blocking
80
+ * (P0/P1/P2) finding. P3 nice-to-haves (including placeholder findings filed as P3) do not
81
+ * block approval.
82
+ */
83
+ export function reviewDecisionApproved(decision: ReviewDecision): boolean {
84
+ return (
85
+ decision.overall_correctness === "patch is correct" &&
86
+ decision.reviewer_error == null &&
87
+ !decision.findings.some(isBlockingFinding)
88
+ );
89
+ }
@@ -18,6 +18,7 @@ import type {
18
18
  WorkflowTaskResult,
19
19
  } from "../src/shared/types.js";
20
20
  import { E2E_VERIFICATION_GUIDANCE, WORKER_PREFLIGHT_CONTRACT } from "./shared-prompts.js";
21
+ import { reviewDecisionApproved, type ReviewDecision } from "./ralph-review-gate.js";
21
22
 
22
23
  const DEFAULT_MAX_LOOPS = 10;
23
24
  const DEFAULT_RESEARCH_DIR = "research";
@@ -25,44 +26,15 @@ const IMPLEMENTATION_NOTES_FILENAME = "implementation-notes.md";
25
26
  const QA_E2E_VIDEO_FILENAME = "qa-e2e-evidence.webm";
26
27
  const MAX_RESEARCH_SLUG_LENGTH = 80;
27
28
  // Reviewer fan-out launches three independent reviewers; the loop stops only when
28
- // all three reviewers independently approve (find no issues). Requiring unanimous
29
- // approval means a P0–P3 finding from any single reviewer keeps the loop iterating
30
- // instead of being out-voted by a majority, so lower-severity issues stay surfaced.
29
+ // all three reviewers independently approve. Approval is severity-aware: a
30
+ // reviewer approves when it judged the patch correct, reported no reviewer_error,
31
+ // and filed no *blocking* (P0/P1/P2) finding. P3 nice-to-haves no longer keep the
32
+ // loop iterating, so a single low-priority nit (or a placeholder finding) can no
33
+ // longer strand an otherwise-approved patch. Requiring unanimous approval still
34
+ // means a blocking finding from any one reviewer keeps the loop going. See
35
+ // ./ralph-review-gate.ts for the gate types and decision logic.
31
36
  const REVIEWER_COUNT = 3;
32
37
 
33
- type ReviewFinding = {
34
- readonly title: string;
35
- readonly body: string;
36
- readonly confidence_score: number;
37
- readonly priority?: number | null;
38
- readonly code_location: {
39
- readonly absolute_file_path: string;
40
- readonly line_range: {
41
- readonly start: number;
42
- readonly end: number;
43
- };
44
- };
45
- };
46
-
47
- type ReviewerError = {
48
- readonly kind:
49
- | "validation_unavailable"
50
- | "dependency_unavailable"
51
- | "tool_failure"
52
- | "reviewer_failure";
53
- readonly message: string;
54
- readonly attempted_recovery: string;
55
- };
56
-
57
- type ReviewDecision = {
58
- readonly findings: readonly ReviewFinding[];
59
- readonly overall_correctness: "patch is correct" | "patch is incorrect";
60
- readonly overall_explanation: string;
61
- readonly overall_confidence_score: number;
62
- readonly stop_review_loop: boolean;
63
- readonly reviewer_error?: ReviewerError | null;
64
- };
65
-
66
38
  const reviewFindingSchema = Type.Object(
67
39
  {
68
40
  title: Type.String(),
@@ -220,15 +192,6 @@ function reviewDecisionFromResult(result: WorkflowTaskResult): ReviewDecision |
220
192
  return result.structured as ReviewDecision | undefined;
221
193
  }
222
194
 
223
- function reviewDecisionApproved(decision: ReviewDecision): boolean {
224
- return (
225
- decision.stop_review_loop === true &&
226
- decision.overall_correctness === "patch is correct" &&
227
- decision.findings.length === 0 &&
228
- decision.reviewer_error == null
229
- );
230
- }
231
-
232
195
  function reviewerErrorDecision(error: string): ReviewDecision {
233
196
  return {
234
197
  findings: [],
@@ -785,14 +748,14 @@ async function runRalphWorkflow(
785
748
  "Speculation is insufficient: identify the code path, scenario, environment, or input that is provably affected.",
786
749
  "Do not flag intentional behavior changes as bugs unless they clearly violate the task or documented contract.",
787
750
  "Ignore trivial style unless it obscures meaning or violates documented standards in a way that affects correctness/security/maintainability.",
788
- "If no finding clears this bar, return an empty findings array, mark the patch correct, and set stop_review_loop true.",
751
+ "If no finding clears this bar, return an empty findings array, mark the patch correct, and set stop_review_loop true. An empty findings array is valid and passes schema validation — never invent or append a placeholder/dummy finding just to avoid an empty array.",
789
752
  ].join("\n"),
790
753
  ],
791
754
  [
792
755
  "comment_guidelines",
793
756
  [
794
757
  "Each finding title must start with a priority tag: [P0] drop-everything blocker, [P1] urgent next-cycle fix, [P2] normal fix, [P3] low-priority nice-to-have.",
795
- "Also include numeric priority: 0 for P0, 1 for P1, 2 for P2, 3 for P3; use null only if priority genuinely cannot be determined.",
758
+ "Also include numeric priority: 0 for P0, 1 for P1, 2 for P2, 3 for P3; use null only if priority genuinely cannot be determined. Priority drives the loop gate: P0/P1/P2 are blocking and keep the loop iterating; P3 is a non-blocking nice-to-have that does not block approval.",
796
759
  "The body must be one concise paragraph explaining why this is a bug and the exact scenario, environment, or inputs required for it to arise.",
797
760
  "Use a matter-of-fact, non-accusatory tone. Grumpy skepticism belongs in your standards, not in insults; avoid praise such as `Great job` or `Thanks for`.",
798
761
  "Keep code_location ranges as short as possible, ideally one line and never longer than 5-10 lines unless unavoidable.",
@@ -805,7 +768,7 @@ async function runRalphWorkflow(
805
768
  "how_many_findings",
806
769
  [
807
770
  "Return all findings the original author would definitely want to fix.",
808
- "If no such findings exist, return an empty findings array and mark the patch correct.",
771
+ "If no such findings exist, return an empty findings array and mark the patch correct. Do not pad the array with placeholder or speculative findings.",
809
772
  "Do not stop after the first qualifying finding; continue until every qualifying finding is listed.",
810
773
  ].join("\n"),
811
774
  ],
@@ -836,7 +799,7 @@ async function runRalphWorkflow(
836
799
  [
837
800
  "decision_rules",
838
801
  [
839
- "Set stop_review_loop=true only when findings is empty, overall_correctness is patch is correct, and reviewer_error is null/omitted.",
802
+ "Set stop_review_loop=true when the patch is correct, reviewer_error is null/omitted, and there are no blocking (P0/P1/P2) findings; remaining P3 nice-to-haves do not block approval. The loop gate is computed from finding priorities, so an unresolved P0/P1/P2 keeps the loop going regardless of this flag.",
840
803
  "If you hit a reviewer/tool/validation error, set stop_review_loop=false and populate reviewer_error instead of pretending the patch is approved.",
841
804
  ].join("\n"),
842
805
  ],
@@ -908,8 +871,9 @@ async function runRalphWorkflow(
908
871
  ).length;
909
872
  // Require unanimous approval: every reviewer must have run and independently
910
873
  // approved. A fan-out error that collapses to a single error entry (fewer than
911
- // REVIEWER_COUNT reviews) or any reviewer surfacing a finding keeps the loop
912
- // iterating rather than letting a majority paper over outstanding issues.
874
+ // REVIEWER_COUNT reviews) or any reviewer surfacing a blocking (P0/P1/P2)
875
+ // finding keeps the loop iterating rather than letting a majority paper over
876
+ // outstanding issues. P3 nice-to-haves do not block approval.
913
877
  approved =
914
878
  reviewEntries.length === REVIEWER_COUNT &&
915
879
  approvalCount === REVIEWER_COUNT;
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/workflows",
3
- "version": "0.8.31-alpha.4",
3
+ "version": "0.8.31-alpha.5",
4
4
  "private": true,
5
5
  "description": "Atomic extension for multi-stage workflow authoring and execution.",
6
6
  "contributors": [
@@ -69,6 +69,8 @@ export interface DispatcherOpts {
69
69
  policy?: WorkflowExecutionPolicy;
70
70
  /** Invocation cwd used for workflow execution. */
71
71
  cwd?: string;
72
+ /** Host-resolved non-default session directory inherited by stages without explicit sessionDir. */
73
+ defaultSessionDir?: string;
72
74
  }
73
75
 
74
76
  // ---------------------------------------------------------------------------
@@ -173,6 +175,7 @@ export async function dispatch(
173
175
  models: opts.models,
174
176
  executionMode: policy.mode,
175
177
  cwd: opts.cwd,
178
+ defaultSessionDir: opts.defaultSessionDir,
176
179
  });
177
180
  if (policy.awaitTerminalRun === true) {
178
181
  const tracker = opts.jobs ?? defaultJobTracker;
@@ -2562,6 +2562,17 @@ function factory(pi: ExtensionAPI): void {
2562
2562
  : undefined,
2563
2563
  parentSession: () => intercomParentSession ?? undefined,
2564
2564
  };
2565
+ const hostStageSessionDir: { current: string | undefined } = { current: undefined };
2566
+ const resolveDefaultStageSessionDir = (): string | undefined => hostStageSessionDir.current;
2567
+ const updateHostStageSessionDir = (sessionManager: SessionManager | undefined): void => {
2568
+ try {
2569
+ hostStageSessionDir.current = sessionManager?.usesDefaultSessionDir?.() === false
2570
+ ? sessionManager.getSessionDir?.()
2571
+ : undefined;
2572
+ } catch {
2573
+ hostStageSessionDir.current = undefined;
2574
+ }
2575
+ };
2565
2576
 
2566
2577
  const startupDiscovery = discoverStartupWorkflowsSync();
2567
2578
  const runtimeRef: { current: ExtensionRuntime } = {
@@ -2574,6 +2585,7 @@ function factory(pi: ExtensionAPI): void {
2574
2585
  mcp: mcpPort,
2575
2586
  intercom: intercomPort,
2576
2587
  config: runtimeConfigRef.current,
2588
+ resolveDefaultStageSessionDir,
2577
2589
  }),
2578
2590
  };
2579
2591
  const discoveryRef: { current: DiscoveryResult | null } = { current: null };
@@ -2641,6 +2653,7 @@ function factory(pi: ExtensionAPI): void {
2641
2653
  intercom: intercomPort,
2642
2654
  config: runtimeConfigRef.current,
2643
2655
  models,
2656
+ resolveDefaultStageSessionDir,
2644
2657
  });
2645
2658
  }
2646
2659
 
@@ -2735,6 +2748,7 @@ function factory(pi: ExtensionAPI): void {
2735
2748
  mcp: mcpPort,
2736
2749
  intercom: intercomPort,
2737
2750
  config: runtimeConfigRef.current,
2751
+ resolveDefaultStageSessionDir,
2738
2752
  });
2739
2753
  }
2740
2754
 
@@ -4025,6 +4039,7 @@ function factory(pi: ExtensionAPI): void {
4025
4039
  }
4026
4040
 
4027
4041
  const sessionManager = ctx?.sessionManager ?? pi.sessionManager;
4042
+ updateHostStageSessionDir(sessionManager);
4028
4043
  if (sessionManager) {
4029
4044
  const cfg = configLoadRef.current?.config;
4030
4045
  withWorkflowLifecycleNotificationsSuppressed(
@@ -86,6 +86,8 @@ export interface ExtensionRuntimeOpts {
86
86
  jobs?: JobTracker;
87
87
  /** Invocation cwd used for workflow execution. Defaults to process.cwd(). */
88
88
  cwd?: string;
89
+ /** Resolve the host's non-default session directory for workflow stage transcripts. */
90
+ resolveDefaultStageSessionDir?: () => string | undefined;
89
91
  }
90
92
 
91
93
  // ---------------------------------------------------------------------------
@@ -149,6 +151,7 @@ export function createExtensionRuntime(opts: ExtensionRuntimeOpts = {}): Extensi
149
151
  const models = opts.models;
150
152
  const jobs = opts.jobs;
151
153
  const runtimeCwd = opts.cwd ?? process.cwd();
154
+ const resolveDefaultStageSessionDir = opts.resolveDefaultStageSessionDir;
152
155
 
153
156
  function runOptions(args: WorkflowToolArgs, policy?: WorkflowExecutionPolicy): RunOpts {
154
157
  const argConcurrency =
@@ -166,6 +169,7 @@ export function createExtensionRuntime(opts: ExtensionRuntimeOpts = {}): Extensi
166
169
  ...(config?.statusFilePath !== undefined ? { statusFilePath: config.statusFilePath } : {}),
167
170
  resumeInFlight: config?.resumeInFlight ?? "ask",
168
171
  };
172
+ const defaultSessionDir = resolveDefaultStageSessionDir?.();
169
173
  return {
170
174
  adapters,
171
175
  store: activeStore,
@@ -174,6 +178,7 @@ export function createExtensionRuntime(opts: ExtensionRuntimeOpts = {}): Extensi
174
178
  mcp,
175
179
  config: effectiveConfig,
176
180
  models,
181
+ ...(defaultSessionDir !== undefined ? { defaultSessionDir } : {}),
177
182
  ...(policy !== undefined ? { executionMode: policy.mode } : {}),
178
183
  registry,
179
184
  cwd: runtimeCwd,
@@ -510,6 +515,7 @@ export function createExtensionRuntime(opts: ExtensionRuntimeOpts = {}): Extensi
510
515
  },
511
516
 
512
517
  dispatch(args: WorkflowToolArgs, options?: RuntimeDispatchOptions): Promise<WorkflowToolResult> {
518
+ const defaultSessionDir = resolveDefaultStageSessionDir?.();
513
519
  return dispatch(args, {
514
520
  registry,
515
521
  adapters,
@@ -522,6 +528,7 @@ export function createExtensionRuntime(opts: ExtensionRuntimeOpts = {}): Extensi
522
528
  models,
523
529
  policy: options?.policy,
524
530
  cwd: runtimeCwd,
531
+ ...(defaultSessionDir !== undefined ? { defaultSessionDir } : {}),
525
532
  });
526
533
  },
527
534
 
@@ -121,6 +121,8 @@ export interface RunOpts extends Omit<AuthoringContract.RunOpts, "adapters" | "s
121
121
  ui?: WorkflowUIAdapter;
122
122
  /** Runtime execution mode. Controls child session policy metadata. */
123
123
  executionMode?: WorkflowExecutionMode;
124
+ /** Host-resolved non-default session directory inherited by stages without explicit sessionDir. */
125
+ defaultSessionDir?: string;
124
126
  /** Internal detached-run mode: surface ctx.ui.* as node-local workflow prompt stages. */
125
127
  usePromptNodesForUi?: boolean;
126
128
  /**
@@ -1003,6 +1005,24 @@ export function toolResultHasChatAnswer(result: unknown): boolean {
1003
1005
  );
1004
1006
  }
1005
1007
 
1008
+ // ---------------------------------------------------------------------------
1009
+ // Resume continuation hook (#1407)
1010
+ // ---------------------------------------------------------------------------
1011
+ // When an interactive paused stage is resumed with a user message, the resumed
1012
+ // answer turn should be followed by one deterministic same-session nudge so the
1013
+ // model returns to the interrupted work without showing the readiness gate for
1014
+ // the resume-answer turn itself.
1015
+
1016
+ export const RESUME_CONTINUATION_PROMPT = "Continue where you left off.";
1017
+
1018
+ export function shouldInjectResumeContinuation(state: {
1019
+ readonly resumeOccurred: boolean;
1020
+ readonly gateEnabled: boolean;
1021
+ readonly aborted: boolean;
1022
+ }): boolean {
1023
+ return state.resumeOccurred && state.gateEnabled && !state.aborted;
1024
+ }
1025
+
1006
1026
  let cachedReadinessGateTool: ReturnType<typeof createAskUserQuestionToolDefinition> | undefined;
1007
1027
  function readinessGateTool(): ReturnType<typeof createAskUserQuestionToolDefinition> {
1008
1028
  return (cachedReadinessGateTool ??= createAskUserQuestionToolDefinition());
@@ -4199,6 +4219,7 @@ export async function run<TInputs extends WorkflowInputValues>(
4199
4219
  __requestPause: async () => rejectReplayMutation("pause"),
4200
4220
  __resume: async () => rejectReplayMutation("resume"),
4201
4221
  __isPaused: () => false,
4222
+ __structuredOutputFinalized: () => false,
4202
4223
  };
4203
4224
  return replayContext;
4204
4225
  }
@@ -4233,6 +4254,7 @@ export async function run<TInputs extends WorkflowInputValues>(
4233
4254
  stageOptions: stageOptionsForContext,
4234
4255
  models: opts.models,
4235
4256
  executionMode: opts.executionMode,
4257
+ defaultSessionDir: opts.defaultSessionDir,
4236
4258
  onModelFallbackMetaChange(meta) {
4237
4259
  applyModelFallbackMeta(meta);
4238
4260
  if (stageSnapshot.status === "running") {
@@ -4251,6 +4273,12 @@ export async function run<TInputs extends WorkflowInputValues>(
4251
4273
  // When true the readiness gate is bypassed — the stage stays in the
4252
4274
  // composer without showing an extra confirmation UI (#1264).
4253
4275
  let chatAnswerObservedThisTurn = false;
4276
+ // Saturated one-slot marker for the latest real pause->resume(message)
4277
+ // transition that still needs the deterministic same-session continuation
4278
+ // prompt (#1407). Later paused resumes before the same post-turn drain
4279
+ // supersede earlier unfinished resumes; the slot is consumed before
4280
+ // prompting so a pause/resume of the continuation turn can set it again.
4281
+ let resumeContinuationPending = false;
4254
4282
  const hasActiveAskUserQuestion = (): boolean =>
4255
4283
  activeAskUserQuestionCalls.size > 0 || activeAskUserQuestionAnonymousCalls > 0;
4256
4284
  const unsubscribeAskUserQuestionWatcher = innerCtx.subscribe((event) => {
@@ -4421,6 +4449,18 @@ export async function run<TInputs extends WorkflowInputValues>(
4421
4449
  if (changed) {
4422
4450
  ensureReleaseBarrier(stageId);
4423
4451
  await cascadePauseFrom(stageId);
4452
+ // Mark the run paused once no stage is still actively running,
4453
+ // mirroring pauseRun() (runs/background/status.ts). This keeps a
4454
+ // manual TUI/Escape pause updating run-level status — and therefore
4455
+ // the main-chat status widget and `/workflow status` — identically
4456
+ // to the `workflow` tool and `/workflow pause`. recordRunPaused is
4457
+ // idempotent, so double-recording from the tool/slash path or from
4458
+ // cascade re-entry is safe.
4459
+ const run = activeStore.runs().find((candidate) => candidate.id === runId);
4460
+ const stillActive = run?.stages.some(
4461
+ (s) => s.status === "running" && s.id !== stageId,
4462
+ ) ?? false;
4463
+ if (!stillActive) activeStore.recordRunPaused(runId);
4424
4464
  }
4425
4465
  if (statusBeforePause === "pending" || statusBeforePause === "running" || innerCtx.isStreaming) {
4426
4466
  await innerCtx.__requestPause();
@@ -4429,13 +4469,30 @@ export async function run<TInputs extends WorkflowInputValues>(
4429
4469
  async resume(message?: string) {
4430
4470
  throwIfStageMutationBlocked();
4431
4471
  await ensureMessagingSession();
4432
- const changed = activeStore.recordStageResumed(runId, stageId);
4433
- if (changed) {
4434
- releaseStageBarrier(stageId);
4435
- await cascadeResumeFrom(stageId);
4472
+ const wasPausedBeforeResume = innerCtx.__isPaused();
4473
+ const hasResumeContinuationMessage = typeof message === "string" && message.trim().length > 0;
4474
+ const previousResumeContinuationPending = resumeContinuationPending;
4475
+ const queuedResumeContinuation = wasPausedBeforeResume && hasResumeContinuationMessage;
4476
+ if (queuedResumeContinuation) {
4477
+ resumeContinuationPending = true;
4436
4478
  }
4437
4479
  try {
4480
+ const changed = activeStore.recordStageResumed(runId, stageId);
4481
+ if (changed) {
4482
+ releaseStageBarrier(stageId);
4483
+ await cascadeResumeFrom(stageId);
4484
+ // Restore run-level status so a manual resume updates the main chat
4485
+ // like the `workflow` tool / `/workflow resume`. recordRunResumed is
4486
+ // a no-op when the run is not paused, so this is safe under cascade
4487
+ // and the tool/slash path.
4488
+ activeStore.recordRunResumed(runId);
4489
+ }
4438
4490
  await innerCtx.__resume(message);
4491
+ } catch (err) {
4492
+ if (queuedResumeContinuation) {
4493
+ resumeContinuationPending = previousResumeContinuationPending;
4494
+ }
4495
+ throw err;
4439
4496
  } finally {
4440
4497
  captureStageSessionMeta();
4441
4498
  }
@@ -4584,7 +4641,44 @@ export async function run<TInputs extends WorkflowInputValues>(
4584
4641
  }
4585
4642
  };
4586
4643
 
4587
- const runTrackedStageCall = async (call: () => Promise<string>, eagerSession = false): Promise<string> => {
4644
+ const suppressReadinessForCurrentTurn = (): void => {
4645
+ askUserQuestionObservedThisTurn = false;
4646
+ chatAnswerObservedThisTurn = false;
4647
+ };
4648
+
4649
+ const skipResumeContinuationInjection = (): boolean => {
4650
+ if (stageFinalized) return true;
4651
+ if (skippedForParallelFailFast) return true;
4652
+ if (stageSnapshot.status === "skipped" && stageSnapshot.skippedReason === "fail-fast") return true;
4653
+ if (isTerminalStage(stageSnapshot)) return true;
4654
+ if (stageFailFastScope?.failed === true && stageFailFastScope.activeStages.has(stageId)) return true;
4655
+ // A schema-backed stage can finalize during the resumed answer turn by
4656
+ // calling structured_output. That consumes the resume slot and
4657
+ // suppresses readiness for the resume-answer turn, but a second prompt
4658
+ // would violate the one-prompt schema contract.
4659
+ if (innerCtx.__structuredOutputFinalized()) return true;
4660
+ return false;
4661
+ };
4662
+
4663
+ const drainResumeContinuations = async <T>(currentResult: T): Promise<T> => {
4664
+ let result = currentResult;
4665
+ while (resumeContinuationPending) {
4666
+ resumeContinuationPending = false;
4667
+ suppressReadinessForCurrentTurn();
4668
+ if (!shouldInjectResumeContinuation({
4669
+ resumeOccurred: true,
4670
+ gateEnabled: readinessGateEnabled,
4671
+ aborted: ownController.signal.aborted,
4672
+ })) {
4673
+ continue;
4674
+ }
4675
+ if (skipResumeContinuationInjection()) continue;
4676
+ result = await raceAbort(innerCtx.prompt(RESUME_CONTINUATION_PROMPT), ownController.signal) as T;
4677
+ }
4678
+ return result;
4679
+ };
4680
+
4681
+ const runTrackedStageCall = async <T>(call: () => Promise<T>, eagerSession = false): Promise<T> => {
4588
4682
  throwIfWorkflowExitSelected();
4589
4683
  await waitForStageRelease();
4590
4684
  if (stageFinalized) {
@@ -4661,12 +4755,13 @@ export async function run<TInputs extends WorkflowInputValues>(
4661
4755
  };
4662
4756
  if (ownController.signal.aborted) abortSession();
4663
4757
  else ownController.signal.addEventListener("abort", abortSession, { once: true });
4664
- let result = "";
4758
+ let result: T;
4665
4759
  try {
4666
4760
  // Run the stage's initial agent turn.
4667
4761
  askUserQuestionObservedThisTurn = false;
4668
4762
  chatAnswerObservedThisTurn = false;
4669
4763
  result = await raceAbort(call(), ownController.signal);
4764
+ result = await drainResumeContinuations(result);
4670
4765
 
4671
4766
  // Per-turn readiness gate (#1099). When an agent turn ENDS (control
4672
4767
  // returns to the user): if the turn issued no ask_user_question
@@ -4706,7 +4801,8 @@ export async function run<TInputs extends WorkflowInputValues>(
4706
4801
  ownController.signal,
4707
4802
  );
4708
4803
  if (ownController.signal.aborted) break;
4709
- result = innerCtx.__getLastAssistantText() ?? result;
4804
+ result = (innerCtx.__getLastAssistantText() ?? result) as T;
4805
+ result = await drainResumeContinuations(result);
4710
4806
  }
4711
4807
  } finally {
4712
4808
  resolveNextTurnEnd = null;