deepline 0.1.90 → 0.1.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1356 -225
- package/dist/cli/index.mjs +1356 -225
- package/dist/index.d.mts +74 -5
- package/dist/index.d.ts +74 -5
- package/dist/index.js +1018 -62
- package/dist/index.mjs +1007 -62
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +87 -20
- package/dist/repo/apps/play-runner-workers/src/entry.ts +52 -14
- package/dist/repo/sdk/src/client.ts +289 -40
- package/dist/repo/sdk/src/index.ts +1 -0
- package/dist/repo/sdk/src/release.ts +2 -2
- package/dist/repo/sdk/src/runs/observe-transport.ts +481 -0
- package/dist/repo/sdk/src/stream-reconnect.ts +44 -0
- package/dist/repo/sdk/src/types.ts +10 -3
- package/dist/repo/shared_libs/play-runtime/live-events.ts +217 -0
- package/dist/repo/shared_libs/play-runtime/run-ledger.ts +1074 -0
- package/dist/repo/shared_libs/play-runtime/run-snapshot-stream.ts +581 -0
- package/package.json +5 -2
|
@@ -50,10 +50,10 @@ export type SdkRelease = {
|
|
|
50
50
|
};
|
|
51
51
|
|
|
52
52
|
export const SDK_RELEASE = {
|
|
53
|
-
version: '0.1.90',
|
|
53
|
+
version: '0.1.93',
|
|
54
54
|
apiContract: '2026-06-dataset-column-cell-stale-hard-cutover',
|
|
55
55
|
supportPolicy: {
|
|
56
|
-
latest: '0.1.90',
|
|
56
|
+
latest: '0.1.93',
|
|
57
57
|
minimumSupported: '0.1.53',
|
|
58
58
|
deprecatedBelow: '0.1.53',
|
|
59
59
|
},
|
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
import { DeeplineError } from '../errors.js';
|
|
2
|
+
import {
|
|
3
|
+
buildPlayRunStatusSnapshot,
|
|
4
|
+
diffPlayRunStreamEvents,
|
|
5
|
+
isTerminalPlayRunLiveStatus,
|
|
6
|
+
resolvePlayRunLogGap,
|
|
7
|
+
EMPTY_PLAY_RUN_STREAM_DIFF_STATE,
|
|
8
|
+
type LedgerBackedRunLike,
|
|
9
|
+
type PlayRunStreamDiffState,
|
|
10
|
+
type PlayRunStreamEvent,
|
|
11
|
+
} from '../../../shared_libs/play-runtime/run-snapshot-stream.js';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Run Observe Transport (ADR-0008).
|
|
15
|
+
*
|
|
16
|
+
* Watches one Play Run by subscribing directly to the Convex Run Snapshot:
|
|
17
|
+
*
|
|
18
|
+
* 1. `POST /api/v2/runs/:runId/observe-grant` mints a short-lived Run
|
|
19
|
+
* Observe Grant JWT (the Deepline app stays the only auth authority).
|
|
20
|
+
* 2. A lazily-imported `convex/browser` ConvexClient authenticates with the
|
|
21
|
+
* grant and subscribes to `runObservers.getPlayRunSnapshotForObserver`.
|
|
22
|
+
* 3. Each snapshot update is run through the shared snapshot differ, so the
|
|
23
|
+
* emitted `play.*` events are identical to the legacy SSE stream.
|
|
24
|
+
*
|
|
25
|
+
* Steady-state observation therefore holds no Vercel function open and never
|
|
26
|
+
* consults the coordinator. The legacy SSE reconnect loop remains only as the
|
|
27
|
+
* support-window fallback: callers catch
|
|
28
|
+
* {@link RunObserveTransportUnavailableError} (grant endpoint missing on an
|
|
29
|
+
* older/unconfigured server, or Convex unreachable) and fall back with a
|
|
30
|
+
* single visible notice.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
/**
 * Payload returned by `POST /api/v2/runs/:runId/observe-grant`.
 */
export type RunObserveGrantResponse = {
  /** Convex deployment URL the observer client connects to. */
  convexUrl: string;
  /** Grant token handed to the Convex client as its auth token. */
  token: string;
  /** Grant expiry; compared against `Date.now()`, i.e. epoch milliseconds. */
  expiresAt: number;
};
|
|
39
|
+
|
|
40
|
+
/**
 * Signals a bootstrap failure: the server/deployment is unable to serve the
 * subscription transport. Callers are expected to catch this and fall back
 * to the support-window SSE stream (ADR-0008).
 */
export class RunObserveTransportUnavailableError extends Error {
  /** Short machine-readable tag for the failure, e.g. `grant_endpoint_unavailable`. */
  public readonly reason: string;

  constructor(message: string, reason: string) {
    super(message);
    this.reason = reason;
    this.name = 'RunObserveTransportUnavailableError';
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * The only HTTP capability this transport needs: a `post` whose promise
 * resolves with the response payload.
 */
type ObserveHttp = {
  post<T = unknown>(path: string, body: unknown): Promise<T>;
};
|
|
57
|
+
|
|
58
|
+
export type ObserveRunEventsOptions = {
  /** HTTP client used to mint (and later re-mint) the observe grant. */
  http: ObserveHttp;
  /** Identifier of the run to observe. */
  runId: string;
  /** Optional cancellation signal; aborting ends the observation. */
  signal?: AbortSignal;
  /**
   * Receives display-only notices (human mode). Two are emitted: the
   * websocket has been reconnecting for more than 10s, or a non-terminal run
   * has produced no snapshot update for more than 120s.
   */
  onNotice?: (message: string) => void;
};
|
|
68
|
+
|
|
69
|
+
/** Bootstrap window (ms): no first snapshot within it means Convex is treated as unreachable. */
const OBSERVE_BOOTSTRAP_TIMEOUT_MS = 10_000;
/** Websocket downtime (ms) after which a "reconnecting" notice is surfaced. */
const OBSERVE_RECONNECT_NOTICE_MS = 10_000;
/** Quiet period (ms) after which a non-terminal run gets a display-only staleness warning. */
const OBSERVE_STALE_WARNING_MS = 120_000;
/** Interval (ms) between watchdog checks of connection state and staleness. */
const OBSERVE_WATCHDOG_TICK_MS = 5_000;
/** The grant is re-minted once it is within this many ms of its expiry. */
const GRANT_REFRESH_MARGIN_MS = 5 * 60 * 1_000;
/** Largest number of log entries requested per backfill page. */
const BACKFILL_PAGE_LIMIT = 1_000;
/** Upper bound on pages fetched during a single log-gap backfill. */
const BACKFILL_MAX_PAGES = 30;

/** Convex function path of the run snapshot query that is subscribed to. */
const OBSERVER_SNAPSHOT_QUERY = 'runObservers:getPlayRunSnapshotForObserver';
/** Convex function path of the log page query used for gap backfill. */
const OBSERVER_LOG_PAGE_QUERY = 'runObservers:getRunLogPageForObserver';
|
|
83
|
+
|
|
84
|
+
/**
 * Best-effort human-readable text for an arbitrary thrown value.
 *
 * - `Error` instances and error-like objects (anything carrying a string
 *   `message`, e.g. an error from another realm where `instanceof Error`
 *   fails) yield their message.
 * - Other objects are JSON-encoded instead of collapsing to the useless
 *   `[object Object]`; if that is impossible (circular data, BigInt fields)
 *   the generic tag is returned.
 * - Primitives go through `String()` unchanged.
 *
 * Never throws, so it is safe inside error-reporting paths.
 */
function errorText(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'object' && error !== null) {
    const { message } = error as { message?: unknown };
    if (typeof message === 'string') {
      return message;
    }
  }

  let text: string;
  try {
    text = String(error);
  } catch {
    // e.g. Object.create(null) has no toString and makes String() throw.
    text = '[object Object]';
  }
  if (text !== '[object Object]') {
    return text;
  }

  try {
    // JSON.stringify can return undefined at runtime (toJSON returning
    // undefined), hence the fallback.
    return JSON.stringify(error) ?? text;
  } catch {
    return text;
  }
}
|
|
87
|
+
|
|
88
|
+
/**
 * Mint a Run Observe Grant for `runId` via
 * `POST /api/v2/runs/:runId/observe-grant` and validate the payload.
 *
 * @param http - HTTP client used for the request.
 * @param runId - Run identifier; URI-encoded into the request path.
 * @returns The validated grant (the response object itself).
 * @throws DeeplineError when the server answers 401/403 (rethrown as-is).
 * @throws RunObserveTransportUnavailableError for every other request
 *   failure and for malformed payloads, so callers can fall back to SSE.
 */
async function mintRunObserveGrant(
  http: ObserveHttp,
  runId: string,
): Promise<RunObserveGrantResponse> {
  let response: unknown;
  try {
    response = await http.post(
      `/api/v2/runs/${encodeURIComponent(runId)}/observe-grant`,
      {},
    );
  } catch (error) {
    if (error instanceof DeeplineError) {
      // 401/403 are real auth failures on a server that HAS the endpoint —
      // surface them loudly. 404/405 mean an older server without the grant
      // route; 5xx/503 means the deployment cannot mint grants. Both of those
      // are bootstrap failures handled by the dated SSE fallback (ADR-0008).
      if (error.statusCode === 401 || error.statusCode === 403) {
        throw error;
      }
      throw new RunObserveTransportUnavailableError(
        `observe-grant endpoint unavailable (${error.statusCode ?? 'network'}): ${error.message}`,
        'grant_endpoint_unavailable',
      );
    }
    throw new RunObserveTransportUnavailableError(
      `observe-grant request failed: ${errorText(error)}`,
      'grant_request_failed',
    );
  }

  const grant =
    typeof response === 'object' && response !== null
      ? (response as Partial<RunObserveGrantResponse>)
      : null;
  const isValid =
    grant !== null &&
    typeof grant.convexUrl === 'string' &&
    grant.convexUrl.trim() !== '' &&
    typeof grant.token === 'string' &&
    grant.token.trim() !== '' &&
    // Number.isFinite (not just typeof 'number'): a NaN expiry would make the
    // "still fresh?" comparison always false and trigger a re-mint on every
    // auth fetch, while Infinity would disable proactive refresh entirely.
    typeof grant.expiresAt === 'number' &&
    Number.isFinite(grant.expiresAt);
  if (!isValid) {
    throw new RunObserveTransportUnavailableError(
      'observe-grant endpoint returned an invalid grant payload.',
      'grant_payload_invalid',
    );
  }
  return grant as RunObserveGrantResponse;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Run document as delivered by the snapshot subscription: a ledger-backed run
 * that additionally carries its `status` string.
 */
type ObserverRunDoc = LedgerBackedRunLike & {
  status: string;
};
|
|
135
|
+
|
|
136
|
+
/**
 * Result of the log page query. Nullable: `backfillLogGap` treats a `null`
 * page the same as a page with no entries.
 */
type ObserverRunLogPage = null | {
  runId: string;
  // NOTE(review): not read in this file — presumably the highest seq held in
  // durable storage; confirm against the Convex query.
  lastStoredSeq: number;
  /** Log lines paired with their absolute sequence numbers. */
  entries: { seq: number; line: string }[];
};
|
|
141
|
+
|
|
142
|
+
/**
 * Work item handed from the subscription/auth/abort callbacks to the
 * generator loop.
 */
type QueueItem =
  // A snapshot update; `run` is null when the subscription delivered no run.
  | { kind: 'run'; run: ObserverRunDoc | null }
  // A failure the generator loop should surface (or swallow once aborted).
  | { kind: 'error'; error: unknown };
|
|
145
|
+
|
|
146
|
+
/**
 * Recover log lines that fell out of the retained tail window by reading
 * them back from the durable Run Log Stream (ADR-0009).
 *
 * Line bodies live in `playRunLogChunks` under exact absolute sequence
 * numbers (they are no longer persisted to `playRunEvents`), so the
 * grant-scoped `getRunLogPageForObserver` query can return precisely the
 * missing seq range `(lastLogSeq, tailFirstSeq)`.
 *
 * @returns The missing lines in seq order, or `null` when a page read fails
 *   or the range is not fully and densely stored — in which case the
 *   differ's loud `[stream] N log lines not retained` marker stays the truth.
 */
async function backfillLogGap(input: {
  queryLogPage: (
    afterSeq: number,
    limit: number,
  ) => Promise<ObserverRunLogPage>;
  lastLogSeq: number;
  tailFirstSeq: number;
}): Promise<string[] | null> {
  const lastMissingSeq = input.tailFirstSeq - 1;
  const recovered: string[] = [];
  let highestSeq = input.lastLogSeq;
  let pagesLeft = BACKFILL_MAX_PAGES;

  while (pagesLeft > 0) {
    pagesLeft -= 1;
    if (highestSeq >= lastMissingSeq) {
      break;
    }

    let page: ObserverRunLogPage;
    try {
      const wanted = Math.min(BACKFILL_PAGE_LIMIT, lastMissingSeq - highestSeq);
      page = await input.queryLogPage(highestSeq, wanted);
    } catch {
      // Best-effort: a failed read simply means no backfill.
      return null;
    }

    const candidates = page?.entries ?? [];
    const inGap = candidates.filter(
      ({ seq }) => seq > highestSeq && seq < input.tailFirstSeq,
    );
    if (inGap.length === 0) {
      break;
    }

    for (const { seq, line } of inGap) {
      // Stored seqs must be dense for the emitted payload's `firstSeq` to
      // describe a contiguous run; a hole (e.g. a legacy run whose chunk
      // storage starts mid-run) falls back to the differ's loud marker.
      if (seq !== highestSeq + 1) {
        return null;
      }
      recovered.push(line);
      highestSeq = seq;
    }
  }

  // Anything short of the full range counts as "not retained".
  return highestSeq < lastMissingSeq ? null : recovered;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Subscribe to one run's snapshot and yield the same `play.*` live events the
 * SSE stream emits, ending after the terminal snapshot has been emitted.
 *
 * Failure contract:
 *   - bootstrap failure → {@link RunObserveTransportUnavailableError} (caller
 *     falls back to SSE with one notice)
 *   - snapshot `null` → loud `RUN_NOT_FOUND`
 *   - grant rejected after one re-mint → loud failure
 *   - websocket drops mid-run → Convex client auto-reconnects; a display-only
 *     "reconnecting" notice surfaces after 10s
 *   - `options.signal` aborted (before the call, during bootstrap, or while
 *     observing) → the generator ends quietly without throwing
 *
 * @param options - Run to watch, the HTTP client used to mint grants, and the
 *   optional abort signal / notice sink.
 * @returns An async generator of live events for the run.
 */
export async function* observeRunEvents(
  options: ObserveRunEventsOptions,
): AsyncGenerator<PlayRunStreamEvent> {
  const { http, runId } = options;

  // An already-aborted signal never dispatches another 'abort' event, so bail
  // out before minting a grant or opening a websocket.
  if (options.signal?.aborted) {
    return;
  }

  let grant = await mintRunObserveGrant(http, runId);

  let convexBrowser: typeof import('convex/browser');
  let convexServer: typeof import('convex/server');
  try {
    // Lazy: non-watch commands never load the Convex client.
    convexBrowser = await import('convex/browser');
    convexServer = await import('convex/server');
  } catch (error) {
    throw new RunObserveTransportUnavailableError(
      `convex client module unavailable: ${errorText(error)}`,
      'convex_module_unavailable',
    );
  }

  let webSocketConstructor: typeof WebSocket | undefined;
  if (typeof WebSocket === 'undefined') {
    // Node 18/20 have no global WebSocket; use the ws shim.
    try {
      // 'ws' lives in sdk/package.json (not the repo root, which typechecks
      // this file without sdk's node_modules) — resolve dynamically so the
      // root tsc does not need @types/ws.
      const wsModuleName = 'ws';
      const ws = (await import(wsModuleName)) as { default: unknown };
      webSocketConstructor = ws.default as typeof WebSocket;
    } catch (error) {
      throw new RunObserveTransportUnavailableError(
        `no WebSocket implementation available: ${errorText(error)}`,
        'websocket_unavailable',
      );
    }
  }

  const snapshotQuery = convexServer.makeFunctionReference<'query'>(
    OBSERVER_SNAPSHOT_QUERY,
  );
  const logPageQuery = convexServer.makeFunctionReference<'query'>(
    OBSERVER_LOG_PAGE_QUERY,
  );

  const client = new convexBrowser.ConvexClient(grant.convexUrl, {
    ...(webSocketConstructor ? { webSocketConstructor } : {}),
    unsavedChangesWarning: false,
  });

  // Callback → generator hand-off: callbacks push items, the loop below
  // drains them; `wake` resolves the loop's pending wait, if any.
  const queue: QueueItem[] = [];
  let wake: (() => void) | null = null;
  const push = (item: QueueItem) => {
    queue.push(item);
    wake?.();
    wake = null;
  };

  let lastForcedRefreshAt = 0;
  client.setAuth(async ({ forceRefreshToken }) => {
    const now = Date.now();
    // Reuse the current grant while it is comfortably inside its lifetime.
    if (!forceRefreshToken && grant.expiresAt - now > GRANT_REFRESH_MARGIN_MS) {
      return grant.token;
    }
    if (forceRefreshToken && now - lastForcedRefreshAt < 5_000) {
      // Convex rejected a token we just re-minted: one re-mint retry has
      // already happened, so fail loudly instead of looping silently.
      push({
        kind: 'error',
        error: new DeeplineError(
          `Run observe grant for ${runId} was rejected after a re-mint. ` +
            'The server and Convex deployment disagree on the grant issuer/JWKS.',
          401,
          'RUN_OBSERVE_GRANT_REJECTED',
        ),
      });
      return null;
    }
    if (forceRefreshToken) {
      lastForcedRefreshAt = now;
    }
    try {
      grant = await mintRunObserveGrant(http, runId);
      return grant.token;
    } catch (error) {
      push({ kind: 'error', error });
      return null;
    }
  });

  const unsubscribe = client.onUpdate(
    snapshotQuery,
    { workflowId: runId },
    (run) => push({ kind: 'run', run: (run ?? null) as ObserverRunDoc | null }),
    (error) => push({ kind: 'error', error }),
  );

  let lastUpdateAt = Date.now();
  let lastStatusTerminal = false;
  let disconnectedSince: number | null = null;
  let warnedReconnecting = false;
  let warnedStale = false;
  const watchdog = setInterval(() => {
    const now = Date.now();
    try {
      const connectionState = client.connectionState();
      if (connectionState.isWebSocketConnected) {
        disconnectedSince = null;
        warnedReconnecting = false;
      } else {
        disconnectedSince ??= now;
        if (
          !warnedReconnecting &&
          now - disconnectedSince >= OBSERVE_RECONNECT_NOTICE_MS
        ) {
          warnedReconnecting = true;
          options.onNotice?.(
            `[observe] connection lost; reconnecting to live run ${runId}…`,
          );
        }
      }
    } catch {
      // connectionState is diagnostic only.
    }
    if (
      !lastStatusTerminal &&
      !warnedStale &&
      now - lastUpdateAt >= OBSERVE_STALE_WARNING_MS
    ) {
      warnedStale = true;
      options.onNotice?.(
        `[observe] no live updates for ${Math.round((now - lastUpdateAt) / 1000)}s; ` +
          `run ${runId} may be stalled (status checks continue)`,
      );
    }
  }, OBSERVE_WATCHDOG_TICK_MS);
  // Don't keep the process alive for the watchdog.
  watchdog.unref?.();

  const abortListener = () =>
    push({
      kind: 'error',
      error: new DeeplineError(
        'Run observation aborted.',
        undefined,
        'ABORTED',
      ),
    });
  options.signal?.addEventListener('abort', abortListener, { once: true });
  // The signal may have aborted while the bootstrap awaits above were
  // pending; the listener would never fire for that, so enqueue the abort
  // item by hand and let the loop below unwind through `finally`.
  if (options.signal?.aborted) {
    abortListener();
  }

  let diffState: PlayRunStreamDiffState = EMPTY_PLAY_RUN_STREAM_DIFF_STATE;
  const streamId = ['observe', runId].join(':');
  let sawFirstSnapshot = false;

  try {
    for (;;) {
      if (queue.length === 0) {
        const waitForItem = new Promise<void>((resolve) => {
          wake = resolve;
        });
        if (!sawFirstSnapshot) {
          // Convex unreachable is a bootstrap failure: no first result within
          // the bootstrap window means the caller should fall back to SSE.
          let bootstrapTimer: ReturnType<typeof setTimeout> | undefined;
          const bootstrapDeadline = new Promise<boolean>((resolve) => {
            bootstrapTimer = setTimeout(
              () => resolve(true),
              OBSERVE_BOOTSTRAP_TIMEOUT_MS,
            );
          });
          let timedOut: boolean;
          try {
            timedOut = await Promise.race([
              waitForItem.then(() => false),
              bootstrapDeadline,
            ]);
          } finally {
            // Always cancel the deadline: a timer left pending would keep the
            // Node process alive for the rest of the bootstrap window after a
            // short observation had already finished.
            clearTimeout(bootstrapTimer);
          }
          if (timedOut && queue.length === 0) {
            throw new RunObserveTransportUnavailableError(
              `no snapshot from Convex at ${grant.convexUrl} within ${OBSERVE_BOOTSTRAP_TIMEOUT_MS}ms`,
              'convex_unreachable',
            );
          }
        } else {
          await waitForItem;
        }
        continue;
      }

      const item = queue.shift()!;
      if (item.kind === 'error') {
        // After an abort every queued error (including the abort marker
        // itself) ends the stream quietly.
        if (options.signal?.aborted) {
          return;
        }
        throw item.error;
      }

      sawFirstSnapshot = true;
      lastUpdateAt = Date.now();
      warnedStale = false;

      if (item.run === null) {
        throw new DeeplineError(
          `Run ${runId} was not found (or is not visible to this grant).`,
          404,
          'RUN_NOT_FOUND',
        );
      }

      const snapshot = buildPlayRunStatusSnapshot({ run: item.run });
      lastStatusTerminal = isTerminalPlayRunLiveStatus(snapshot.status);

      // Backfill log lines that rotated out of the retained tail window from
      // the durable Run Log Stream before diffing, so the differ emits the
      // retained tail without its loud gap marker. First-connect to an
      // in-flight run (lastLogSeq === 0) is intentionally NOT backfilled:
      // watchers get the tail plus the marker pointing at `runs logs`.
      const gap = resolvePlayRunLogGap(snapshot, diffState.lastLogSeq);
      if (gap && diffState.lastLogSeq > 0) {
        const backfilled = await backfillLogGap({
          queryLogPage: (afterSeq, limit) =>
            client.query(logPageQuery, {
              workflowId: runId,
              afterSeq,
              limit,
            }) as Promise<ObserverRunLogPage>,
          lastLogSeq: diffState.lastLogSeq,
          tailFirstSeq: gap.tailFirstSeq,
        });
        if (backfilled && backfilled.length > 0) {
          yield {
            cursor: String(snapshot.updatedAt ?? Date.now()),
            streamId,
            scope: 'play',
            type: 'play.run.log',
            at: new Date().toISOString(),
            payload: {
              runId: snapshot.runId,
              lines: backfilled,
              source: 'worker',
              firstSeq: diffState.lastLogSeq + 1,
              totalLogCount: snapshot.totalLogCount,
            },
          };
          // Advance past the recovered range so the differ continues from the
          // retained tail.
          diffState = { ...diffState, lastLogSeq: gap.tailFirstSeq - 1 };
        }
      }

      const { events, next } = diffPlayRunStreamEvents({
        streamId,
        snapshot,
        previous: diffState,
      });
      diffState = next;
      // Snapshot event first, mirroring the SSE stream's ordering.
      const ordered = [
        ...events.filter((event) => event.type === 'play.run.snapshot'),
        ...events.filter((event) => event.type !== 'play.run.snapshot'),
      ];
      for (const event of ordered) {
        yield event;
      }

      if (lastStatusTerminal) {
        return;
      }
    }
  } finally {
    clearInterval(watchdog);
    options.signal?.removeEventListener('abort', abortListener);
    try {
      unsubscribe();
    } catch {
      // Already closed.
    }
    await client.close().catch(() => undefined);
  }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { DeeplineError } from './errors.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Shared reconnect policy for the canonical run SSE stream.
|
|
5
|
+
*
|
|
6
|
+
* Server stream windows are finite: the platform ends them cleanly at the
|
|
7
|
+
* function ceiling even while the run keeps executing. Consumers that wait for
|
|
8
|
+
* a terminal status (`plays run --watch`, `runs tail`, `client.runs.tail`)
|
|
9
|
+
* therefore reconnect with full-jitter exponential backoff instead of treating
|
|
10
|
+
* a window end as a failure.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/** Base delay for the full-jitter exponential backoff between reconnects. */
export const STREAM_RECONNECT_BASE_DELAY_MS = 500;
/** Upper bound for a single reconnect delay. */
export const STREAM_RECONNECT_MAX_DELAY_MS = 15_000;
/**
 * A connection that stayed open at least this long (or delivered any event)
 * counts as healthy and resets the backoff sequence.
 */
export const STREAM_HEALTHY_CONNECTION_MS = 30_000;

/**
 * Full-jitter exponential backoff: an integer delay drawn uniformly from
 * `[1, min(cap, base * 2^attempt)]` milliseconds.
 *
 * @param attempt - Zero-based reconnect attempt. Negative values and `NaN`
 *   are treated as 0; very large values (including `Infinity`) saturate at
 *   the cap.
 * @returns A whole number of milliseconds, always finite and at least 1.
 */
export function streamReconnectDelayMs(attempt: number): number {
  // NaN would otherwise poison the whole computation and come back as a NaN
  // delay (Math.max/Math.min both propagate it).
  const safeAttempt = Number.isNaN(attempt) ? 0 : Math.max(0, attempt);
  const cappedExponentialMs = Math.min(
    STREAM_RECONNECT_MAX_DELAY_MS,
    STREAM_RECONNECT_BASE_DELAY_MS * 2 ** safeAttempt,
  );
  // Whole-millisecond ceiling so fractional attempts cannot overshoot it.
  const ceilingMs = Math.max(1, Math.floor(cappedExponentialMs));
  // 1 + floor(U * n) is exactly uniform over 1..n; clamping a 0..n draw up to
  // 1 would make a delay of 1ms twice as likely as any other value.
  return 1 + Math.floor(Math.random() * ceilingMs);
}
|
|
31
|
+
|
|
32
|
+
/**
 * Decide whether a stream failure is worth a reconnect attempt.
 *
 * @param error - Whatever the stream attempt threw.
 * @returns `true` for 5xx `DeeplineError`s and for failures whose message
 *   matches a known network-level pattern; `false` otherwise.
 */
export function isTransientPlayStreamError(error: unknown): boolean {
  if (!(error instanceof DeeplineError) || typeof error.statusCode !== 'number') {
    // No definite status code: classify by the failure text instead.
    const networkFailurePattern =
      /auth validation backend timed out|fetch failed|eaddrnotavail|econnreset|etimedout|eai_again|socket hang up/i;
    const message = error instanceof Error ? error.message : String(error);
    return networkFailurePattern.test(message);
  }

  // Server-shaped errors with a definite status code are NOT transient by
  // pattern — only network-level failures are. 5xx counts as transient
  // since the server may recover, but 4xx (especially 404 = run gone) is
  // terminal and should not be hidden behind a silent retry loop.
  const status = error.statusCode;
  return status < 600 && status >= 500;
}
|
|
@@ -540,12 +540,19 @@ export type PlayLiveEvent = LiveEventEnvelope<unknown> & {
|
|
|
540
540
|
* Result returned by {@link DeeplineClient.stopPlay}.
|
|
541
541
|
*/
|
|
542
542
|
export interface StopPlayRunResult {
|
|
543
|
-
/** Public play-run identifier
|
|
543
|
+
/** Public play-run identifier the stop request targeted. */
|
|
544
544
|
runId: string;
|
|
545
|
-
/**
|
|
546
|
-
stopped:
|
|
545
|
+
/** Whether the server confirmed the run was stopped. */
|
|
546
|
+
stopped: boolean;
|
|
547
547
|
/** Number of open HITL interactions marked cancelled. */
|
|
548
548
|
hitlCancelledCount: number;
|
|
549
|
+
/**
|
|
550
|
+
* True when the scheduler state for the run was stale and the stop could
|
|
551
|
+
* not be confirmed. Absent on older servers (treated as confirmed).
|
|
552
|
+
*/
|
|
553
|
+
staleSchedulerState?: boolean;
|
|
554
|
+
/** Server-side error detail when the stop was not confirmed. */
|
|
555
|
+
error?: string;
|
|
549
556
|
}
|
|
550
557
|
|
|
551
558
|
/**
|