@stigmer/react 3.0.8-dev.20260612122433 → 3.0.8-dev.20260613041848

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/execution/useExecutionStream.d.ts +49 -5
  2. package/execution/useExecutionStream.d.ts.map +1 -1
  3. package/execution/useExecutionStream.js +118 -18
  4. package/execution/useExecutionStream.js.map +1 -1
  5. package/internal/backoff.d.ts +61 -0
  6. package/internal/backoff.d.ts.map +1 -0
  7. package/internal/backoff.js +79 -0
  8. package/internal/backoff.js.map +1 -0
  9. package/internal/store/conversation-store.d.ts +12 -0
  10. package/internal/store/conversation-store.d.ts.map +1 -1
  11. package/internal/store/conversation-store.js +7 -0
  12. package/internal/store/conversation-store.js.map +1 -1
  13. package/internal/store/workflow-execution-event-store.d.ts +12 -0
  14. package/internal/store/workflow-execution-event-store.d.ts.map +1 -1
  15. package/internal/store/workflow-execution-event-store.js +7 -0
  16. package/internal/store/workflow-execution-event-store.js.map +1 -1
  17. package/internal/stream-controller.d.ts +11 -19
  18. package/internal/stream-controller.d.ts.map +1 -1
  19. package/internal/stream-controller.js +24 -1
  20. package/internal/stream-controller.js.map +1 -1
  21. package/package.json +4 -4
  22. package/session/SessionViewer.js +4 -1
  23. package/session/SessionViewer.js.map +1 -1
  24. package/session/useSessionConversation.d.ts +7 -1
  25. package/session/useSessionConversation.d.ts.map +1 -1
  26. package/session/useSessionConversation.js +1 -0
  27. package/session/useSessionConversation.js.map +1 -1
  28. package/src/execution/__tests__/useExecutionStream.test.tsx +184 -0
  29. package/src/execution/useExecutionStream.ts +174 -30
  30. package/src/internal/__tests__/backoff.test.ts +99 -0
  31. package/src/internal/backoff.ts +100 -0
  32. package/src/internal/store/conversation-store.ts +22 -0
  33. package/src/internal/store/workflow-execution-event-store.ts +22 -0
  34. package/src/internal/stream-controller.ts +30 -25
  35. package/src/session/SessionViewer.tsx +27 -0
  36. package/src/session/useSessionConversation.ts +8 -1
  37. package/src/workflow/WorkflowExecutionHeader.tsx +4 -1
  38. package/src/workflow/WorkflowExecutionTimeline.tsx +2 -1
  39. package/src/workflow/__tests__/useWorkflowExecutionEventStream.test.tsx +117 -1
  40. package/src/workflow/execution/useWaterfallEntries.ts +2 -1
  41. package/src/workflow/useWorkflowExecutionEventStream.ts +122 -41
  42. package/src/workflow/waterfall/WaterfallTimeline.tsx +2 -1
  43. package/styles.css +1 -1
  44. package/workflow/WorkflowExecutionHeader.d.ts.map +1 -1
  45. package/workflow/WorkflowExecutionHeader.js +3 -1
  46. package/workflow/WorkflowExecutionHeader.js.map +1 -1
  47. package/workflow/WorkflowExecutionTimeline.d.ts.map +1 -1
  48. package/workflow/WorkflowExecutionTimeline.js +1 -1
  49. package/workflow/WorkflowExecutionTimeline.js.map +1 -1
  50. package/workflow/execution/useWaterfallEntries.d.ts.map +1 -1
  51. package/workflow/execution/useWaterfallEntries.js +1 -1
  52. package/workflow/execution/useWaterfallEntries.js.map +1 -1
  53. package/workflow/useWorkflowExecutionEventStream.d.ts +32 -4
  54. package/workflow/useWorkflowExecutionEventStream.d.ts.map +1 -1
  55. package/workflow/useWorkflowExecutionEventStream.js +75 -32
  56. package/workflow/useWorkflowExecutionEventStream.js.map +1 -1
  57. package/workflow/waterfall/WaterfallTimeline.d.ts.map +1 -1
  58. package/workflow/waterfall/WaterfallTimeline.js +1 -1
  59. package/workflow/waterfall/WaterfallTimeline.js.map +1 -1
@@ -11,6 +11,7 @@ import {
11
11
  } from "react";
12
12
  import type { AgentExecution } from "@stigmer/protos/ai/stigmer/agentic/agentexecution/v1/api_pb";
13
13
  import { ExecutionPhase } from "@stigmer/protos/ai/stigmer/agentic/agentexecution/v1/enum_pb";
14
+ import { isTransientStreamError } from "@stigmer/sdk";
14
15
  import { useStigmer } from "../hooks";
15
16
  import { toError } from "../internal/toError";
16
17
  import { useStreamRate } from "../internal/dev";
@@ -18,7 +19,13 @@ import {
18
19
  StreamController,
19
20
  type StreamControllerSink,
20
21
  } from "../internal/stream-controller";
21
- import { ConversationStore, type StreamState } from "../internal/store";
22
+ import {
23
+ computeBackoffDelay,
24
+ sleep,
25
+ DEFAULT_RECONNECT_MAX_ATTEMPTS,
26
+ type BackoffOptions,
27
+ } from "../internal/backoff";
28
+ import { ConversationStore } from "../internal/store";
22
29
  import { isTerminalPhase } from "./execution-phases";
23
30
 
24
31
  /** Return value of {@link useExecutionStream}. */
@@ -37,14 +44,31 @@ export interface UseExecutionStreamReturn {
37
44
  readonly isStreaming: boolean;
38
45
  /** `true` after subscription starts but before the first snapshot arrives. */
39
46
  readonly isConnecting: boolean;
40
- /** Error from the last failed stream attempt, or `null` when healthy. */
47
+ /**
48
+ * `true` while a transient drop is being retried automatically in the
49
+ * background. The last snapshot stays visible and `error` remains `null` —
50
+ * surface a subtle "Reconnecting…" affordance, not an error. Becomes
51
+ * `false` once a snapshot is received (back to `isStreaming`) or retries
52
+ * are exhausted (then `error` is set).
53
+ */
54
+ readonly isReconnecting: boolean;
55
+ /** 1-based count of the in-flight reconnect attempt; `0` when not reconnecting. */
56
+ readonly reconnectAttempt: number;
57
+ /**
58
+ * Error from the last failed stream attempt, or `null` when healthy.
59
+ *
60
+ * Only set once auto-reconnect has exhausted its attempts (or for a
61
+ * non-transient failure that is not retried). It stays `null` throughout
62
+ * background reconnection so a recoverable hiccup never shows as an error.
63
+ */
41
64
  readonly error: Error | null;
42
65
  /**
43
66
  * Reset error state and re-establish the stream subscription.
44
67
  *
45
- * Works in any lifecycle state error, complete, or mid-stream.
46
- * Uses the `connectKey` counter pattern consistent with `refetch()`
47
- * in other SDK hooks.
68
+ * The fallback after auto-reconnect exhausts, and a manual escape hatch in
69
+ * any lifecycle state: error, complete, or mid-stream. Resets the retry
70
+ * counter and preserves the last snapshot (no flash to empty). Uses the
71
+ * `connectKey` counter pattern consistent with `refetch()` in other SDK hooks.
48
72
  */
49
73
  readonly reconnect: () => void;
50
74
  }
@@ -65,6 +89,21 @@ export interface UseExecutionStreamOptions {
65
89
  * preserving backward compatibility for standalone usage.
66
90
  */
67
91
  readonly store?: ConversationStore;
92
+ /**
93
+ * Automatically re-establish the subscription with exponential backoff
94
+ * when a non-terminal stream drops (transport error, idle timeout, laptop
95
+ * sleep). Defaults to `true`. Set `false` to opt out and surface every
96
+ * drop as an immediate `error` for manual `reconnect()`.
97
+ */
98
+ readonly autoReconnect?: boolean;
99
+ /**
100
+ * Tune the auto-reconnect backoff schedule and attempt cap. Omitted fields
101
+ * fall back to SDK defaults (base 1s, ×2, max 30s, 10 attempts).
102
+ */
103
+ readonly reconnectOptions?: BackoffOptions & {
104
+ /** Max attempts before surfacing a terminal `error`. */
105
+ readonly maxAttempts?: number;
106
+ };
68
107
  }
69
108
 
70
109
  /**
@@ -73,7 +112,18 @@ export interface UseExecutionStreamOptions {
73
112
  *
74
113
  * Manages the full subscription lifecycle through a finite state
75
114
  * machine: connection establishment, rAF-coalesced snapshot streaming,
76
- * terminal-phase detection, error handling, and manual reconnection.
115
+ * terminal-phase detection, automatic reconnection with exponential
116
+ * backoff on transient drops, and manual reconnection as the fallback.
117
+ *
118
+ * **Resilience:** a non-terminal stream drop — whether a thrown transport
119
+ * error (WebKit "Load failed", `fetch failed`, `Unavailable`) or a graceful
120
+ * server close mid-run (idle timeout, load-balancer recycle) — is retried
121
+ * automatically with backoff. The last snapshot stays visible
122
+ * (`isReconnecting`), the access token is re-read on each attempt via the
123
+ * per-request interceptor, and `error` is surfaced only once attempts are
124
+ * exhausted. Completion is decided by the terminal phase, never by the
125
+ * stream merely ending (a graceful close of a running execution reconnects
126
+ * rather than falsely reporting "complete"). Opt out via `autoReconnect: false`.
77
127
  *
78
128
  * **Performance characteristics:**
79
129
  * - Non-terminal snapshots are coalesced via `requestAnimationFrame`
@@ -152,6 +202,17 @@ export function useExecutionStream(
152
202
  const streamRateRef = useRef(streamRate);
153
203
  streamRateRef.current = streamRate;
154
204
 
205
+ // -- Reconnect config (ref-backed so option identity churn never resubscribes)
206
+ const autoReconnect = options?.autoReconnect ?? true;
207
+ const reconnectOptions = options?.reconnectOptions;
208
+ const configRef = useRef({ autoReconnect, reconnectOptions });
209
+ configRef.current = { autoReconnect, reconnectOptions };
210
+
211
+ // Tracks the execution the store currently holds, so we reset the store on
212
+ // a genuine identity change (A → B) but preserve it across reconnects of the
213
+ // SAME execution. Mirrors useWorkflowExecutionEventStream / useFetch.
214
+ const prevExecutionIdRef = useRef<string | null>(null);
215
+
155
216
  // -- Subscription effect --------------------------------------------------
156
217
  // Note: controller, store, and streamRate are ref-backed stable objects —
157
218
  // they MUST NOT appear in the deps array. Including them would cause
@@ -160,45 +221,117 @@ export function useExecutionStream(
160
221
  if (!executionId) {
161
222
  controller.reset();
162
223
  store.reset();
224
+ prevExecutionIdRef.current = null;
163
225
  return;
164
226
  }
165
227
 
228
+ // Reset only when switching to a different execution. Crucially we do NOT
229
+ // reset the store on reconnect (connectKey bump) or on cleanup — that
230
+ // would wipe the conversation to an empty "Connecting…" on every retry.
231
+ // The full-snapshot subscribe re-delivers the entire state on reconnect,
232
+ // so keeping the last-known-good snapshot is both correct and seamless.
233
+ if (
234
+ prevExecutionIdRef.current !== null &&
235
+ prevExecutionIdRef.current !== executionId
236
+ ) {
237
+ store.reset();
238
+ }
239
+ prevExecutionIdRef.current = executionId;
240
+
166
241
  const abortController = new AbortController();
242
+ const signal = abortController.signal;
167
243
  controller.start(executionId);
168
244
 
169
245
  (async () => {
170
- try {
171
- for await (const snapshot of stigmer.agentExecution.subscribe(
172
- executionId,
173
- abortController.signal,
174
- )) {
175
- if (abortController.signal.aborted) return;
176
-
177
- controller.handleSnapshot(snapshot);
178
- streamRateRef.current.tick(
179
- snapshot.status?.messages?.length ?? 0,
180
- );
181
-
182
- const phase =
183
- snapshot.status?.phase ??
184
- ExecutionPhase.EXECUTION_PHASE_UNSPECIFIED;
185
- if (isTerminalPhase(phase)) break;
246
+ const { autoReconnect: auto, reconnectOptions: backoff } =
247
+ configRef.current;
248
+ const maxAttempts = backoff?.maxAttempts ?? DEFAULT_RECONNECT_MAX_ATTEMPTS;
249
+
250
+ // 1-based count of consecutive failed attempts. Reset to 0 by any
251
+ // successful snapshot, so each healthy stretch gets a fresh backoff
252
+ // budget rather than inheriting the previous outage's attempt count.
253
+ let attempt = 0;
254
+
255
+ // Schedule the next retry after `error`, or stop. Returns `true` when
256
+ // the loop should continue (a retry was scheduled), `false` when it
257
+ // should exit (opted out, exhausted, or aborted). Shared by the
258
+ // thrown-error and premature-end paths so both converge on one policy.
259
+ const scheduleRetry = async (error: Error): Promise<boolean> => {
260
+ if (!auto || attempt >= maxAttempts) {
261
+ controller.handleError(error);
262
+ return false;
186
263
  }
264
+ attempt += 1;
265
+ controller.handleReconnecting(attempt, error);
266
+ try {
267
+ await sleep(computeBackoffDelay(attempt, backoff), signal);
268
+ } catch {
269
+ return false; // aborted mid-backoff
270
+ }
271
+ return !signal.aborted;
272
+ };
273
+
274
+ while (!signal.aborted) {
275
+ let sawTerminal = false;
276
+ try {
277
+ for await (const snapshot of stigmer.agentExecution.subscribe(
278
+ executionId,
279
+ signal,
280
+ )) {
281
+ if (signal.aborted) return;
282
+
283
+ attempt = 0; // a snapshot proves the connection is healthy
284
+ controller.handleSnapshot(snapshot);
285
+ streamRateRef.current.tick(snapshot.status?.messages?.length ?? 0);
187
286
 
188
- if (!abortController.signal.aborted) {
287
+ const phase =
288
+ snapshot.status?.phase ??
289
+ ExecutionPhase.EXECUTION_PHASE_UNSPECIFIED;
290
+ if (isTerminalPhase(phase)) {
291
+ sawTerminal = true;
292
+ break;
293
+ }
294
+ }
295
+ } catch (err) {
296
+ if (signal.aborted) return;
297
+ const error = toError(err);
298
+ // Only known-transient transport noise is retried. A non-transient
299
+ // error (not-found, invalid-argument, …) is deterministic — the
300
+ // same request would fail identically, so surface it immediately.
301
+ if (!auto || !isTransientStreamError(error)) {
302
+ controller.handleError(error);
303
+ return;
304
+ }
305
+ if (await scheduleRetry(error)) continue;
306
+ return;
307
+ }
308
+
309
+ if (signal.aborted) return;
310
+
311
+ if (sawTerminal) {
312
+ // handleSnapshot already transitioned to `complete`; flush any
313
+ // buffered frame and finish. Completion is decided by the terminal
314
+ // phase, never by the stream merely ending.
189
315
  controller.handleStreamEnd();
190
316
  streamRateRef.current.summary();
317
+ return;
191
318
  }
192
- } catch (err) {
193
- if (abortController.signal.aborted) return;
194
- controller.handleError(toError(err));
319
+
320
+ // The iterator finished without a terminal phase: the server closed a
321
+ // still-running stream (idle timeout, load-balancer recycle, pod
322
+ // restart). This is transient by definition — reconnect and the next
323
+ // full snapshot reconciles whatever changed (including, if it ended
324
+ // meanwhile, the terminal state we missed).
325
+ if (await scheduleRetry(new Error("The connection was interrupted."))) {
326
+ continue;
327
+ }
328
+ return;
195
329
  }
196
330
  })();
197
331
 
198
332
  return () => {
199
333
  abortController.abort();
200
334
  controller.reset();
201
- store.reset();
202
335
  };
203
336
  }, [executionId, stigmer, connectKey]);
204
337
 
@@ -218,8 +351,19 @@ export function useExecutionStream(
218
351
 
219
352
  const isStreaming = streamState.stage === "streaming";
220
353
  const isConnecting = streamState.stage === "connecting";
221
- const error =
222
- streamState.stage === "error" ? streamState.error : null;
354
+ const isReconnecting = streamState.stage === "reconnecting";
355
+ const reconnectAttempt =
356
+ streamState.stage === "reconnecting" ? streamState.attempt : 0;
357
+ const error = streamState.stage === "error" ? streamState.error : null;
223
358
 
224
- return { execution, phase, isStreaming, isConnecting, error, reconnect };
359
+ return {
360
+ execution,
361
+ phase,
362
+ isStreaming,
363
+ isConnecting,
364
+ isReconnecting,
365
+ reconnectAttempt,
366
+ error,
367
+ reconnect,
368
+ };
225
369
  }
@@ -0,0 +1,99 @@
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
2
+ import {
3
+ computeBackoffDelay,
4
+ sleep,
5
+ AbortError,
6
+ DEFAULT_RECONNECT_BASE_DELAY_MS,
7
+ DEFAULT_RECONNECT_MAX_DELAY_MS,
8
+ } from "../backoff";
9
+
10
+ describe("computeBackoffDelay", () => {
11
+ // random=()=>1 collapses full jitter to its upper bound, exposing the raw
12
+ // exponential schedule for exact assertions.
13
+ const noJitter = () => 1;
14
+
15
+ it("grows exponentially from the base delay", () => {
16
+ expect(computeBackoffDelay(1, undefined, noJitter)).toBe(
17
+ DEFAULT_RECONNECT_BASE_DELAY_MS,
18
+ );
19
+ expect(computeBackoffDelay(2, undefined, noJitter)).toBe(2_000);
20
+ expect(computeBackoffDelay(3, undefined, noJitter)).toBe(4_000);
21
+ expect(computeBackoffDelay(5, undefined, noJitter)).toBe(16_000);
22
+ });
23
+
24
+ it("caps at maxDelayMs", () => {
25
+ // attempt 6 → 32_000 raw, clamped to the 30_000 ceiling.
26
+ expect(computeBackoffDelay(6, undefined, noJitter)).toBe(
27
+ DEFAULT_RECONNECT_MAX_DELAY_MS,
28
+ );
29
+ expect(computeBackoffDelay(50, undefined, noJitter)).toBe(
30
+ DEFAULT_RECONNECT_MAX_DELAY_MS,
31
+ );
32
+ });
33
+
34
+ it("applies full jitter within [0, capped]", () => {
35
+ expect(computeBackoffDelay(3, undefined, () => 0)).toBe(0);
36
+ expect(computeBackoffDelay(3, undefined, () => 0.5)).toBe(2_000);
37
+ for (let i = 0; i < 200; i++) {
38
+ const d = computeBackoffDelay(4); // real Math.random
39
+ expect(d).toBeGreaterThanOrEqual(0);
40
+ expect(d).toBeLessThanOrEqual(8_000);
41
+ }
42
+ });
43
+
44
+ it("honors custom options", () => {
45
+ const opts = { baseDelayMs: 100, factor: 3, maxDelayMs: 1_000 };
46
+ expect(computeBackoffDelay(1, opts, noJitter)).toBe(100);
47
+ expect(computeBackoffDelay(2, opts, noJitter)).toBe(300);
48
+ expect(computeBackoffDelay(3, opts, noJitter)).toBe(900);
49
+ expect(computeBackoffDelay(4, opts, noJitter)).toBe(1_000); // 2700 capped
50
+ });
51
+
52
+ it("treats attempt < 1 as the first attempt", () => {
53
+ expect(computeBackoffDelay(0, undefined, noJitter)).toBe(
54
+ DEFAULT_RECONNECT_BASE_DELAY_MS,
55
+ );
56
+ expect(computeBackoffDelay(-5, undefined, noJitter)).toBe(
57
+ DEFAULT_RECONNECT_BASE_DELAY_MS,
58
+ );
59
+ });
60
+ });
61
+
62
+ describe("sleep", () => {
63
+ beforeEach(() => vi.useFakeTimers());
64
+ afterEach(() => vi.useRealTimers());
65
+
66
+ it("resolves after the delay", async () => {
67
+ const settled = vi.fn();
68
+ const p = sleep(1_000).then(settled);
69
+ await vi.advanceTimersByTimeAsync(999);
70
+ expect(settled).not.toHaveBeenCalled();
71
+ await vi.advanceTimersByTimeAsync(1);
72
+ await p;
73
+ expect(settled).toHaveBeenCalledOnce();
74
+ });
75
+
76
+ it("rejects immediately with AbortError when the signal is already aborted", async () => {
77
+ const ac = new AbortController();
78
+ ac.abort();
79
+ await expect(sleep(1_000, ac.signal)).rejects.toBeInstanceOf(AbortError);
80
+ });
81
+
82
+ it("rejects when aborted mid-wait and leaves no pending timer", async () => {
83
+ const ac = new AbortController();
84
+ const p = sleep(10_000, ac.signal);
85
+ ac.abort();
86
+ await expect(p).rejects.toBeInstanceOf(AbortError);
87
+ // No timer should survive the abort — advancing time settles nothing.
88
+ expect(vi.getTimerCount()).toBe(0);
89
+ });
90
+
91
+ it("does not reject after resolving (listener removed on success)", async () => {
92
+ const ac = new AbortController();
93
+ const p = sleep(500, ac.signal);
94
+ await vi.advanceTimersByTimeAsync(500);
95
+ await expect(p).resolves.toBeUndefined();
96
+ // Aborting after the fact must not produce an unhandled rejection.
97
+ expect(() => ac.abort()).not.toThrow();
98
+ });
99
+ });
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Exponential-backoff scheduling for resilient stream reconnection.
3
+ *
4
+ * Pure and framework-agnostic — the timing math is a plain function and the
5
+ * wait is a cancelable promise, so both are exhaustively unit-testable
6
+ * without React or fake DOM (mirrors the codebase's extract-the-pure-core
7
+ * convention, e.g. `computeFollowCenter` / `isRecoveryTransition`).
8
+ *
9
+ * @internal Not part of the public `@stigmer/react` API.
10
+ */
11
+
12
+ /** Tunable backoff schedule. All fields optional — sensible defaults apply. */
13
+ export interface BackoffOptions {
14
+ /** Delay before the first retry, in milliseconds. */
15
+ readonly baseDelayMs?: number;
16
+ /** Upper bound on any single delay, in milliseconds. */
17
+ readonly maxDelayMs?: number;
18
+ /** Multiplier applied per attempt (`base * factor^(attempt-1)`). */
19
+ readonly factor?: number;
20
+ }
21
+
22
+ /** Delay before the first reconnect attempt. */
23
+ export const DEFAULT_RECONNECT_BASE_DELAY_MS = 1_000;
24
+ /** Ceiling for any single reconnect delay. */
25
+ export const DEFAULT_RECONNECT_MAX_DELAY_MS = 30_000;
26
+ /** Per-attempt growth multiplier. */
27
+ export const DEFAULT_RECONNECT_FACTOR = 2;
28
+ /**
29
+ * Attempts before giving up and surfacing a terminal error. With the
30
+ * defaults above this is up to ≈3 minutes of backoff (≈1.5 minutes on average with full jitter) before the user sees
31
+ * an error banner — long enough to ride out sleep/wake and network blips,
32
+ * bounded enough to avoid an unbounded background loop against a stream
33
+ * that will never recover (e.g. a deleted execution).
34
+ */
35
+ export const DEFAULT_RECONNECT_MAX_ATTEMPTS = 10;
36
+
37
+ /**
38
+ * Compute the backoff delay (ms) for a 1-based reconnect attempt.
39
+ *
40
+ * Exponential growth (`base * factor^(attempt-1)`) capped at `maxDelayMs`,
41
+ * then **full jitter** — a uniform random point in `[0, capped]`. Full
42
+ * jitter (AWS, "Exponential Backoff And Jitter") de-synchronizes a fleet of
43
+ * clients that all dropped at the same instant, preventing a reconnect
44
+ * thundering herd against a recovering server.
45
+ *
46
+ * `random` is injectable purely so tests can assert exact values; callers
47
+ * should omit it.
48
+ */
49
+ export function computeBackoffDelay(
50
+ attempt: number,
51
+ opts?: BackoffOptions,
52
+ random: () => number = Math.random,
53
+ ): number {
54
+ const base = opts?.baseDelayMs ?? DEFAULT_RECONNECT_BASE_DELAY_MS;
55
+ const max = opts?.maxDelayMs ?? DEFAULT_RECONNECT_MAX_DELAY_MS;
56
+ const factor = opts?.factor ?? DEFAULT_RECONNECT_FACTOR;
57
+
58
+ const safeAttempt = Math.max(1, Math.floor(attempt));
59
+ const exponential = base * factor ** (safeAttempt - 1);
60
+ const capped = Math.min(exponential, max);
61
+ return Math.round(random() * capped);
62
+ }
63
+
64
+ /** Rejection reason for an aborted {@link sleep}, distinguishable by name. */
65
+ export class AbortError extends Error {
66
+ constructor() {
67
+ super("The operation was aborted.");
68
+ this.name = "AbortError";
69
+ }
70
+ }
71
+
72
+ /**
73
+ * Promise-based delay that resolves after `ms`, or rejects immediately with
74
+ * {@link AbortError} if `signal` is (or becomes) aborted.
75
+ *
76
+ * The timer is cleared and the abort listener removed on every exit path, so
77
+ * a reconnect wait leaves nothing pending when a component unmounts or the
78
+ * subscription is torn down mid-backoff — no leaked timer, no resubscribe
79
+ * after teardown.
80
+ */
81
+ export function sleep(ms: number, signal?: AbortSignal): Promise<void> {
82
+ return new Promise<void>((resolve, reject) => {
83
+ if (signal?.aborted) {
84
+ reject(new AbortError());
85
+ return;
86
+ }
87
+
88
+ const onAbort = () => {
89
+ clearTimeout(timer);
90
+ reject(new AbortError());
91
+ };
92
+
93
+ const timer = setTimeout(() => {
94
+ signal?.removeEventListener("abort", onAbort);
95
+ resolve();
96
+ }, ms);
97
+
98
+ signal?.addEventListener("abort", onAbort, { once: true });
99
+ });
100
+ }
@@ -9,6 +9,19 @@ export type StreamState =
9
9
  | { readonly stage: "idle" }
10
10
  | { readonly stage: "connecting"; readonly executionId: string }
11
11
  | { readonly stage: "streaming"; readonly executionId: string }
12
+ | {
13
+ /**
14
+ * A non-terminal stream drop is being retried in the background. The
15
+ * last-known-good snapshot stays visible and no error is surfaced —
16
+ * the public `error` only appears once retries are exhausted. `attempt`
17
+ * is the 1-based retry count; `error` is the transient cause, retained
18
+ * for diagnostics (it is not shown to the user while reconnecting).
19
+ */
20
+ readonly stage: "reconnecting";
21
+ readonly executionId: string;
22
+ readonly attempt: number;
23
+ readonly error: Error;
24
+ }
12
25
  | { readonly stage: "complete"; readonly executionId: string }
13
26
  | {
14
27
  readonly stage: "error";
@@ -122,6 +135,15 @@ function streamStateEqual(a: StreamState, b: StreamState): boolean {
122
135
  a.error === b.error
123
136
  )
124
137
  return true;
138
+ // Each retry bumps `attempt`, so two reconnecting states are only equal
139
+ // when the attempt matches — every attempt must re-notify subscribers.
140
+ if (
141
+ a.stage === "reconnecting" &&
142
+ b.stage === "reconnecting" &&
143
+ a.executionId === b.executionId &&
144
+ a.attempt === b.attempt
145
+ )
146
+ return true;
125
147
  if ("executionId" in a && "executionId" in b)
126
148
  return a.executionId === b.executionId;
127
149
  return false;
@@ -10,6 +10,19 @@ export type WorkflowEventStreamState =
10
10
  | { readonly stage: "idle" }
11
11
  | { readonly stage: "connecting"; readonly executionId: string }
12
12
  | { readonly stage: "streaming"; readonly executionId: string }
13
+ | {
14
+ /**
15
+ * A non-terminal event stream drop is being retried in the background.
16
+ * Accumulated events stay visible and no error is surfaced until retries
17
+ * are exhausted. On reconnect the subscription resumes from the last
18
+ * received `sequence_number`, so no events are lost. `attempt` is the
19
+ * 1-based retry count; `error` is the transient cause (diagnostic only).
20
+ */
21
+ readonly stage: "reconnecting";
22
+ readonly executionId: string;
23
+ readonly attempt: number;
24
+ readonly error: Error;
25
+ }
13
26
  | { readonly stage: "complete"; readonly executionId: string }
14
27
  | {
15
28
  readonly stage: "error";
@@ -422,6 +435,15 @@ function streamStateEqual(
422
435
  a.error === b.error
423
436
  )
424
437
  return true;
438
+ // Each retry bumps `attempt`, so two reconnecting states are only equal
439
+ // when the attempt matches — every attempt must re-notify subscribers.
440
+ if (
441
+ a.stage === "reconnecting" &&
442
+ b.stage === "reconnecting" &&
443
+ a.executionId === b.executionId &&
444
+ a.attempt === b.attempt
445
+ )
446
+ return true;
425
447
  if ("executionId" in a && "executionId" in b)
426
448
  return a.executionId === b.executionId;
427
449
  return false;
@@ -1,5 +1,5 @@
1
1
  import type { AgentExecution } from "@stigmer/protos/ai/stigmer/agentic/agentexecution/v1/api_pb";
2
- import type { ConversationStore } from "./store/conversation-store";
2
+ import type { StreamState } from "./store/conversation-store";
3
3
  import { isTerminalPhase } from "../execution/execution-phases";
4
4
  import { ExecutionPhase } from "@stigmer/protos/ai/stigmer/agentic/agentexecution/v1/enum_pb";
5
5
 
@@ -7,25 +7,12 @@ import { ExecutionPhase } from "@stigmer/protos/ai/stigmer/agentic/agentexecutio
7
7
  // Types
8
8
  // ---------------------------------------------------------------------------
9
9
 
10
- export type ControllerStage =
11
- | "idle"
12
- | "connecting"
13
- | "streaming"
14
- | "complete"
15
- | "error";
16
-
17
- export type ControllerState =
18
- | { readonly stage: "idle" }
19
- | { readonly stage: "connecting"; readonly executionId: string }
20
- | { readonly stage: "streaming"; readonly executionId: string }
21
- | { readonly stage: "complete"; readonly executionId: string }
22
- | {
23
- readonly stage: "error";
24
- readonly executionId: string;
25
- readonly error: Error;
26
- };
27
-
28
- const IDLE: ControllerState = { stage: "idle" };
10
+ // The controller's FSM state is exactly the store's `StreamState` — they
11
+ // were once duplicated unions kept in lock-step by hand. The controller
12
+ // reuses the store's type so the lifecycle (including the `reconnecting`
13
+ // stage) is defined in one place and can never drift.
14
+
15
+ const IDLE: StreamState = { stage: "idle" };
29
16
 
30
17
  /**
31
18
  * Callback interface for the stream controller to communicate with
@@ -36,7 +23,7 @@ export interface StreamControllerSink {
36
23
  /** Ingest a snapshot into the store (applies structural sharing). */
37
24
  ingestSnapshot(snapshot: AgentExecution): void;
38
25
  /** Transition the store's stream lifecycle state. */
39
- setStreamState(state: ControllerState): void;
26
+ setStreamState(state: StreamState): void;
40
27
  }
41
28
 
42
29
  // ---------------------------------------------------------------------------
@@ -58,7 +45,7 @@ export interface StreamControllerSink {
58
45
  * (typically `requestAnimationFrame`).
59
46
  */
60
47
  export class StreamController {
61
- private _state: ControllerState = IDLE;
48
+ private _state: StreamState = IDLE;
62
49
  private _bufferedSnapshot: AgentExecution | null = null;
63
50
  private _rafId: number | null = null;
64
51
  private _sink: StreamControllerSink;
@@ -80,7 +67,7 @@ export class StreamController {
80
67
  }
81
68
 
82
69
  /** Current FSM state (read-only). */
83
- get state(): ControllerState {
70
+ get state(): StreamState {
84
71
  return this._state;
85
72
  }
86
73
 
@@ -113,7 +100,12 @@ export class StreamController {
113
100
  this._sink.ingestSnapshot(snapshot);
114
101
  this._transition({ stage: "complete", executionId });
115
102
  } else {
116
- if (this._state.stage === "connecting") {
103
+ // A snapshot proves the (re)connection is healthy: advance from either
104
+ // the initial `connecting` or a `reconnecting` retry into `streaming`.
105
+ if (
106
+ this._state.stage === "connecting" ||
107
+ this._state.stage === "reconnecting"
108
+ ) {
117
109
  this._transition({ stage: "streaming", executionId });
118
110
  }
119
111
  this._bufferedSnapshot = snapshot;
@@ -121,6 +113,19 @@ export class StreamController {
121
113
  }
122
114
  }
123
115
 
116
+ /**
117
+ * Enter the `reconnecting` stage after a transient drop. Unlike
118
+ * {@link start}, this preserves the buffered snapshot and never resets the
119
+ * store, so the last-known-good conversation stays on screen while the
120
+ * background retry is in flight. No-op once idle (the subscription is
121
+ * already torn down).
122
+ */
123
+ handleReconnecting(attempt: number, error: Error): void {
124
+ const executionId = this._activeExecutionId();
125
+ if (!executionId) return;
126
+ this._transition({ stage: "reconnecting", executionId, attempt, error });
127
+ }
128
+
124
129
  /**
125
130
  * Handle stream completion (iterator exhausted without error).
126
131
  * If we still have a buffered snapshot, flush it first.
@@ -171,7 +176,7 @@ export class StreamController {
171
176
  return this._state.executionId;
172
177
  }
173
178
 
174
- private _transition(next: ControllerState): void {
179
+ private _transition(next: StreamState): void {
175
180
  this._state = next;
176
181
  this._sink.setStreamState(next);
177
182
  }