deepline 0.1.90 → 0.1.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3211,8 +3211,22 @@ function formatTailLogPart(value: unknown): string {
3211
3211
  }
3212
3212
  }
3213
3213
 
3214
+ // Operator-diagnostic console lines that carry the [deepline-run:] prefix but
3215
+ // are not user-facing run output. The console scrape fans run-prefixed lines
3216
+ // back into the run's durable Run Log Stream ('system' channel), so harness/
3217
+ // coordinator plumbing noise is filtered at ingestion, never at read time.
3218
+ // User play log lines (runner-event echoes) intentionally pass through.
3219
+ const OPERATOR_NOISE_LOG_PATTERNS: readonly RegExp[] = [
3220
+ /\[perf-trace\]/,
3221
+ /\[harness-probe\]/,
3222
+ /TenantWorkflow\.run entered/,
3223
+ /TenantWorkflow\.run threw/,
3224
+ /failed to forward runner perf trace/,
3225
+ /failed to forward TenantWorkflow\.run error/,
3226
+ ];
3227
+
3214
3228
  function parseRunLogLine(line: string): { runId: string; line: string } | null {
3215
- if (line.includes('[perf-trace]')) {
3229
+ if (OPERATOR_NOISE_LOG_PATTERNS.some((pattern) => pattern.test(line))) {
3216
3230
  return null;
3217
3231
  }
3218
3232
  const prefixed = line.match(RUN_LOG_PREFIX_RE);
@@ -3661,26 +3675,79 @@ async function handleWorkflowRoute(input: {
3661
3675
  }
3662
3676
  try {
3663
3677
  if (action === 'cancel') {
3664
- if (!instance) {
3665
- return Response.json({ runId, status: 'cancelled' });
3678
+ if (instance) {
3679
+ try {
3680
+ await instance.terminate();
3681
+ } catch (error) {
3682
+ const message =
3683
+ error instanceof Error ? error.message : String(error);
3684
+ // Tolerate four classes of error here:
3685
+ // - already-terminal (complete / errored / terminated)
3686
+ // - "Cannot terminate instance since its on a finite state"
3687
+ // (the runtime's wording for "already finished")
3688
+ // - "not implemented" (wrangler dev local mode doesn't support
3689
+ // instance.terminate() yet — silently no-op there)
3690
+ // - "not found" (instance never existed)
3691
+ if (
3692
+ !/complete|terminated|errored|finite state|cannot[ _]terminate|not[ _]implemented|not[ _]found|404/i.test(
3693
+ message,
3694
+ )
3695
+ ) {
3696
+ throw error;
3697
+ }
3698
+ }
3666
3699
  }
3667
- try {
3668
- await instance.terminate();
3669
- } catch (error) {
3670
- const message = error instanceof Error ? error.message : String(error);
3671
- // Tolerate four classes of error here:
3672
- // - already-terminal (complete / errored / terminated)
3673
- // - "Cannot terminate instance since its on a finite state"
3674
- // (the runtime's wording for "already finished")
3675
- // - "not implemented" (wrangler dev local mode doesn't support
3676
- // instance.terminate() yet silently no-op there)
3677
- // - "not found" (instance never existed)
3678
- if (
3679
- !/complete|terminated|errored|finite state|cannot[ _]terminate|not[ _]implemented|not[ _]found|404/i.test(
3680
- message,
3681
- )
3682
- ) {
3683
- throw error;
3700
+ // terminate() kills the dynamic worker before its run() wrapper can
3701
+ // write terminal state (the only place completed/failed land), so
3702
+ // without this write /tail reports 'running' forever and any
3703
+ // start-stream watcher hangs after a cancel. Land the cancelled
3704
+ // terminal state here — terminal-set appends a 'terminal' run event
3705
+ // and wakes the dedup DO's long-poll waiters, which unblocks tails.
3706
+ //
3707
+ // Idempotency: first-wins from this side — if the run already went
3708
+ // terminal (completed/failed/cancelled) we keep that state. The DO
3709
+ // stores the cached terminal state under a single storage key
3710
+ // (last-wins on raw writes), but the run-event log is append-only
3711
+ // and /tail truncates at the FIRST terminal event, so a racing
3712
+ // completed/failed write from a dying worker can at worst replace
3713
+ // the cached key with another terminal status — it can never
3714
+ // resurrect 'running'.
3715
+ const existingTerminal = await readCoordinatorTerminalState(
3716
+ env,
3717
+ runId,
3718
+ ).catch((error: unknown) => {
3719
+ // Tolerated: better to risk a harmless terminal-over-terminal
3720
+ // overwrite than to skip the cancelled write and hang watchers.
3721
+ console.warn('[coordinator] terminal state read before cancel failed', {
3722
+ runId,
3723
+ error: error instanceof Error ? error.message : String(error),
3724
+ });
3725
+ return null;
3726
+ });
3727
+ if (!existingTerminal) {
3728
+ try {
3729
+ await writeCoordinatorTerminalState(env, {
3730
+ runId,
3731
+ status: 'cancelled',
3732
+ error: 'Run cancelled',
3733
+ });
3734
+ } catch (error) {
3735
+ // Fail loudly: the workflow was terminated but watchers would
3736
+ // hang on 'running' forever without the terminal event.
3737
+ const message =
3738
+ error instanceof Error ? error.message : String(error);
3739
+ console.error('[coordinator] cancel terminal state write failed', {
3740
+ runId,
3741
+ error: message,
3742
+ });
3743
+ return Response.json(
3744
+ {
3745
+ runId,
3746
+ status: 'error',
3747
+ error: `workflow terminated but cancelled terminal state write failed: ${message}`,
3748
+ },
3749
+ { status: 500 },
3750
+ );
3684
3751
  }
3685
3752
  }
3686
3753
  return Response.json({ runId, status: 'cancelled' });
@@ -1206,7 +1206,10 @@ async function waitForSyntheticIntegrationEvent(
1206
1206
  {
1207
1207
  type: 'log.appended',
1208
1208
  runId: req.runId,
1209
- source: 'worker',
1209
+ // 'system' (windowed text-dedupe channel), NOT 'worker': this line is
1210
+ // emitted outside the harness log buffer, so it has no positional
1211
+ // channelOffset and must not pollute the worker channel cursor.
1212
+ source: 'system',
1210
1213
  occurredAt: nowMs(),
1211
1214
  lines: [
1212
1215
  `Waiting for integration_event:${eventKey} for up to ${timeoutMs}ms.`,
@@ -5402,6 +5405,14 @@ async function executeRunRequest(
5402
5405
  const abortSignal = abortController.signal;
5403
5406
  let runLogBuffer: string[] = [];
5404
5407
  let pendingRunLogLines: string[] = [];
5408
+ // Monotonic count of every line ever appended to this run's worker log
5409
+ // channel. runLogBuffer/pendingRunLogLines are rotating tails of those
5410
+ // lines (RUN_LOG_BUFFER_LIMIT is the coordinator transport cache only), so
5411
+ // each log.appended batch can carry the absolute channelOffset of its first
5412
+ // line: totalEmittedLogLines - pendingRunLogLines.length. Run Log Stream
5413
+ // ingestion skips re-sent prefixes positionally (exactly-once, repeated
5414
+ // identical lines preserved) instead of text-deduping.
5415
+ let totalEmittedLogLines = 0;
5405
5416
  let stepProgressByNodeId: LiveNodeProgressMap = {};
5406
5417
  let dirtyProgressNodeIds = new Set<string>();
5407
5418
  let pendingLedgerEvents: PlayRunLedgerEvent[] = [
@@ -5424,6 +5435,7 @@ async function executeRunRequest(
5424
5435
  const appendRunLogLine = (line: string) => {
5425
5436
  const trimmed = redactSecretsFromLogString(line.trim());
5426
5437
  if (!trimmed) return;
5438
+ totalEmittedLogLines += 1;
5427
5439
  runLogBuffer = [...runLogBuffer, trimmed].slice(-RUN_LOG_BUFFER_LIMIT);
5428
5440
  pendingRunLogLines = [...pendingRunLogLines, trimmed].slice(
5429
5441
  -RUN_LOG_BUFFER_LIMIT,
@@ -5614,6 +5626,12 @@ async function executeRunRequest(
5614
5626
  source: 'worker',
5615
5627
  occurredAt,
5616
5628
  lines: pendingRunLogLines,
5629
+ // Positional cursor: pendingRunLogLines always holds the LAST
5630
+ // pending lines emitted on this channel, so the offset of its first
5631
+ // line is total-emitted minus pending length. This also covers the
5632
+ // terminal full-buffer re-send (pending = runLogBuffer), which
5633
+ // ingestion then skips positionally instead of via text dedupe.
5634
+ channelOffset: totalEmittedLogLines - pendingRunLogLines.length,
5617
5635
  });
5618
5636
  pendingRunLogLines = [];
5619
5637
  }
@@ -5709,6 +5727,9 @@ async function executeRunRequest(
5709
5727
  ): Promise<void> => {
5710
5728
  if (!options?.persistResultDatasets) return;
5711
5729
  const now = nowMs();
5730
+ // Terminal re-send of the full retained buffer. drainPendingLedgerEvents
5731
+ // stamps it with channelOffset = totalEmitted - buffer length, so Run Log
5732
+ // Stream ingestion drops the already-ingested prefix positionally.
5712
5733
  pendingRunLogLines = runLogBuffer;
5713
5734
  dirtyProgressNodeIds = new Set([
5714
5735
  ...dirtyProgressNodeIds,
@@ -5859,6 +5880,25 @@ async function executeRunRequest(
5859
5880
  ms: nowMs() - resultDatasetStartedAt,
5860
5881
  });
5861
5882
  const parentSignal = startParentTerminalSignal();
5883
+ // Capped runs settle compute billing BEFORE declaring run.completed: a
5884
+ // per-run cap denial (422 billing_cap_exceeded) must fail the run as
5885
+ // its ONLY terminal. Flushing completed first opens a race — watchers
5886
+ // stream the ledger snapshot and exit on the transient completed
5887
+ // before the demoting run.failed lands.
5888
+ const capped = extractMaxCreditsPerRun(req.contractSnapshot) !== null;
5889
+ if (capped) {
5890
+ const billingStartedAt = nowMs();
5891
+ await finalizeWorkerComputeBilling({
5892
+ req,
5893
+ success: true,
5894
+ actionEstimate: 4,
5895
+ });
5896
+ recordRunnerPerfTrace({
5897
+ req,
5898
+ phase: 'runner.compute_billing_finalize',
5899
+ ms: nowMs() - billingStartedAt,
5900
+ });
5901
+ }
5862
5902
  const terminalOccurredAt = nowMs();
5863
5903
  const terminalUpdateStartedAt = nowMs();
5864
5904
  await flushTerminalLedgerEvents({
@@ -5874,21 +5914,19 @@ async function executeRunRequest(
5874
5914
  ms: nowMs() - terminalUpdateStartedAt,
5875
5915
  });
5876
5916
 
5877
- const billingStartedAt = nowMs();
5878
- const billingPromise = finalizeWorkerComputeBilling({
5879
- req,
5880
- success: true,
5881
- actionEstimate: 4,
5882
- }).then(() => {
5883
- recordRunnerPerfTrace({
5917
+ if (!capped) {
5918
+ const billingStartedAt = nowMs();
5919
+ const billingPromise = finalizeWorkerComputeBilling({
5884
5920
  req,
5885
- phase: 'runner.compute_billing_finalize',
5886
- ms: nowMs() - billingStartedAt,
5921
+ success: true,
5922
+ actionEstimate: 4,
5923
+ }).then(() => {
5924
+ recordRunnerPerfTrace({
5925
+ req,
5926
+ phase: 'runner.compute_billing_finalize',
5927
+ ms: nowMs() - billingStartedAt,
5928
+ });
5887
5929
  });
5888
- });
5889
- if (extractMaxCreditsPerRun(req.contractSnapshot) !== null) {
5890
- await billingPromise;
5891
- } else {
5892
5930
  const nonBlockingBillingPromise = billingPromise.catch((error) => {
5893
5931
  console.error(
5894
5932
  `[play-harness] non-fatal compute billing finalize failed runId=${req.runId}: ${
@@ -36,6 +36,15 @@
36
36
  import { resolveConfig } from './config.js';
37
37
  import { DeeplineError } from './errors.js';
38
38
  import { HttpClient } from './http.js';
39
+ import {
40
+ STREAM_HEALTHY_CONNECTION_MS,
41
+ isTransientPlayStreamError,
42
+ streamReconnectDelayMs,
43
+ } from './stream-reconnect.js';
44
+ import {
45
+ observeRunEvents,
46
+ RunObserveTransportUnavailableError,
47
+ } from './runs/observe-transport.js';
39
48
  import type {
40
49
  DeeplineClientOptions,
41
50
  ResolvedConfig,
@@ -129,11 +138,29 @@ export type RunsListOptions = {
129
138
  /** Streaming options for `client.runs.tail(...)`. */
130
139
  export type RunsTailOptions = {
131
140
  signal?: AbortSignal;
141
+ /**
142
+ * Called before each stream reconnect. Server stream windows are finite, so
143
+ * long runs reconnect with backoff until a terminal status is observed.
144
+ */
145
+ onReconnect?: (info: {
146
+ attempt: number;
147
+ delayMs: number;
148
+ reason: string;
149
+ }) => void;
150
+ /**
151
+ * Display-only transport notices: subscription-transport reconnects,
152
+ * staleness warnings, and the one-time fallback notice when the server
153
+ * cannot serve the Convex subscription transport (ADR-0008).
154
+ */
155
+ onNotice?: (message: string) => void;
132
156
  };
133
157
 
134
158
  /** Log fetch options for `client.runs.logs(...)`. */
135
159
  export type RunsLogsOptions = {
160
+ /** Return the LAST `limit` stored log lines (default 200). */
136
161
  limit?: number;
162
+ /** Fetch every stored log line, paginating to the full totalCount. */
163
+ all?: boolean;
137
164
  };
138
165
 
139
166
  /** Persisted log response for one play run. */
@@ -146,6 +173,28 @@ export type RunsLogsResult = {
146
173
  truncated: boolean;
147
174
  hasMore: boolean;
148
175
  entries: string[];
176
+ /**
177
+ * True when the run crossed the Run Log Stream retention cap: `totalCount`
178
+ * keeps counting, but stored line bodies end at a loud truncation marker.
179
+ */
180
+ logsTruncated?: boolean;
181
+ };
182
+
183
+ /** Server page cap for GET /api/v2/runs/:runId/logs (ADR-0009). */
184
+ const RUN_LOGS_PAGE_LIMIT = 1_000;
185
+
186
+ /** Wire shape of one GET /api/v2/runs/:runId/logs page. */
187
+ type RunLogsPageResponse = {
188
+ runId: string;
189
+ totalLogCount: number;
190
+ logsTruncated: boolean;
191
+ lastStoredSeq: number;
192
+ afterSeq: number;
193
+ entries: Array<{ seq: number; line: string }>;
194
+ firstSeq: number | null;
195
+ lastSeq: number | null;
196
+ hasMore: boolean;
197
+ nextAfterSeq: number | null;
149
198
  };
150
199
 
151
200
  /** One persisted runtime-sheet row returned by `client.runs.exportDatasetRows(...)`. */
@@ -328,6 +377,13 @@ type PlayLiveStatusState = {
328
377
  runId: string;
329
378
  status: PlayStatus['status'];
330
379
  logs: string[];
380
+ /**
381
+ * Absolute (1-based) sequence number of the last log line appended to
382
+ * `logs`. play.run.log payloads carry `firstSeq` (ADR-0009), so overlapping
383
+ * re-deliveries are skipped positionally — repeated identical lines are
384
+ * preserved and snapshots never replace the accumulated log list.
385
+ */
386
+ lastLogSeq: number;
331
387
  result?: unknown;
332
388
  error?: string;
333
389
  latest: PlayStatus | null;
@@ -355,13 +411,52 @@ function normalizeLiveStatus(value: unknown): PlayStatus['status'] | null {
355
411
  return null;
356
412
  }
357
413
 
414
/**
 * Fold the `lines` of one log payload into `state.logs`, using the payload's
 * `firstSeq` (when present) to drop lines that were already accumulated.
 *
 * - No usable `firstSeq` (not a finite number >= 1): the lines are appended
 *   as-is, and `state.lastLogSeq` is raised to `payload.totalLogCount` when
 *   that field is a finite number.
 * - Usable `firstSeq`: the leading lines whose sequence numbers are at or
 *   below `state.lastLogSeq` are skipped; the remainder is appended and the
 *   cursor advances to the sequence number of the payload's last line.
 *
 * The cursor never moves backwards, and lines are compared by position only,
 * so repeated identical text is kept.
 */
function appendPlayLiveLogLines(
  state: PlayLiveStatusState,
  payload: Record<string, unknown>,
): void {
  const incoming = readStringArray(payload.lines);
  if (incoming.length === 0) {
    return;
  }

  const rawFirstSeq = payload.firstSeq;
  if (
    typeof rawFirstSeq !== 'number' ||
    !Number.isFinite(rawFirstSeq) ||
    rawFirstSeq < 1
  ) {
    // Unsequenced payload: keep every line, and only move the cursor when the
    // payload reports a cumulative count.
    state.logs.push(...incoming);
    const rawTotal = payload.totalLogCount;
    if (typeof rawTotal === 'number' && Number.isFinite(rawTotal)) {
      state.lastLogSeq = Math.max(state.lastLogSeq, Math.trunc(rawTotal));
    }
    return;
  }

  const firstSeq = Math.trunc(rawFirstSeq);
  // Number of leading lines whose sequence numbers were already consumed.
  const alreadySeen = Math.max(0, state.lastLogSeq + 1 - firstSeq);
  if (alreadySeen >= incoming.length) {
    return;
  }
  const fresh = incoming.slice(alreadySeen);
  state.logs.push(...fresh);
  const payloadLastSeq = firstSeq + incoming.length - 1;
  state.lastLogSeq = Math.max(state.lastLogSeq, payloadLastSeq);
}
452
+
358
453
  function updatePlayLiveStatusState(
359
454
  state: PlayLiveStatusState,
360
455
  event: PlayLiveEvent,
361
456
  ): PlayStatus | null {
362
457
  const payload = getPlayLiveEventPayload(event);
363
458
  if (event.type === 'play.run.log') {
364
- state.logs.push(...readStringArray(payload.lines));
459
+ appendPlayLiveLogLines(state, payload);
365
460
  return null;
366
461
  }
367
462
  if (
@@ -385,15 +480,23 @@ function updatePlayLiveStatusState(
385
480
  : null) ??
386
481
  state.status;
387
482
  const progressPayload = isRecord(payload.progress) ? payload.progress : {};
388
- const payloadLogs = readStringArray(payload.logs);
389
- const progressLogs = readStringArray(progressPayload.logs);
390
- const logs = payloadLogs.length > 0 ? payloadLogs : progressLogs;
483
+ // Snapshots no longer REPLACE accumulated logs (ADR-0009): the snapshot
484
+ // only retains a bounded tail, so replacing would clobber the seq-keyed
485
+ // log list built from play.run.log events (the stream differ always emits
486
+ // log lines through play.run.log, snapshot ticks included). A terminal
487
+ // final_status payload may still seed an EMPTY state — that is the only
488
+ // event some non-stream flows ever see.
391
489
  if (
392
- logs.length > 0 ||
393
- event.type === 'play.run.snapshot' ||
394
- (event.type === 'play.run.final_status' && !isPlayRunPackage(payload))
490
+ event.type === 'play.run.final_status' &&
491
+ state.logs.length === 0 &&
492
+ state.lastLogSeq === 0
395
493
  ) {
396
- state.logs = logs;
494
+ const payloadLogs = readStringArray(payload.logs);
495
+ const progressLogs = readStringArray(progressPayload.logs);
496
+ const seedLogs = payloadLogs.length > 0 ? payloadLogs : progressLogs;
497
+ if (seedLogs.length > 0) {
498
+ state.logs = seedLogs;
499
+ }
397
500
  }
398
501
  if ('result' in payload) {
399
502
  state.result = payload.result;
@@ -1560,44 +1663,161 @@ export class DeeplineClient {
1560
1663
  return response.runs ?? [];
1561
1664
  }
1562
1665
 
1563
- /** Read the canonical run stream and return the latest run snapshot. */
1564
- async tailRun(runId: string, options?: RunsTailOptions): Promise<PlayStatus> {
1666
/**
 * Observe one run's live events through the Convex Run Snapshot
 * subscription transport (ADR-0008).
 *
 * Delegates to the module-level `observeRunEvents` helper with this client's
 * HTTP transport, forwarding the optional abort signal and notice callback
 * unchanged. Per the transport's contract, the generator yields the same
 * `play.*` event envelopes as {@link streamPlayRunEvents}, ends after the
 * terminal snapshot, and throws {@link RunObserveTransportUnavailableError}
 * when the server cannot serve the transport — callers are expected to fall
 * back to the SSE stream with a notice.
 */
observeRunEvents(
  runId: string,
  options?: { signal?: AbortSignal; onNotice?: (message: string) => void },
): AsyncGenerator<PlayLiveEvent> {
  const { signal, onNotice } = options ?? {};
  // Inside the method body this name resolves to the imported helper, not to
  // this method.
  const events = observeRunEvents({
    http: this.http,
    runId,
    signal,
    onNotice,
  });
  return events as AsyncGenerator<PlayLiveEvent>;
}
1685
+
1686
+ /**
1687
+ * Tail one run through the subscription transport until terminal, then
1688
+ * return one durable REST status read (the final Run Response Package).
1689
+ */
1690
+ private async tailRunViaObserveTransport(
1691
+ runId: string,
1692
+ options?: RunsTailOptions,
1693
+ ): Promise<PlayStatus> {
1565
1694
  const state: PlayLiveStatusState = {
1566
1695
  runId,
1567
1696
  status: 'running',
1568
1697
  logs: [],
1698
+ lastLogSeq: 0,
1569
1699
  latest: null,
1570
1700
  };
1571
- let terminal = false;
1572
- for await (const event of this.streamPlayRunEvents(runId, {
1573
- mode: 'cli',
1701
+ for await (const event of this.observeRunEvents(runId, {
1574
1702
  signal: options?.signal,
1703
+ onNotice: options?.onNotice,
1575
1704
  })) {
1576
1705
  const status = updatePlayLiveStatusState(state, event);
1577
- if (!status) {
1706
+ if (!status || !TERMINAL_PLAY_STATUSES.has(status.status)) {
1578
1707
  continue;
1579
1708
  }
1580
- terminal = TERMINAL_PLAY_STATUSES.has(status.status);
1581
- if (terminal) {
1582
- break;
1583
- }
1584
- }
1585
- if (terminal && state.latest) {
1586
- return await this.getRunStatus(state.latest.runId || runId).catch(
1709
+ return await this.getRunStatus(status.runId || runId).catch(
1587
1710
  () => state.latest ?? playRunStatusFromState(state),
1588
1711
  );
1589
1712
  }
1590
- if (state.latest) {
1591
- return state.latest;
1713
+ if (options?.signal?.aborted) {
1714
+ throw new DeeplineError('Run observation aborted.', undefined, 'ABORTED');
1715
+ }
1716
+ // The transport ends only after a terminal snapshot; the differ always
1717
+ // emits a terminal `play.run.status` first, so reaching here means the
1718
+ // terminal package read raced — re-check durable status once, loudly.
1719
+ const refreshed = await this.getRunStatus(runId);
1720
+ if (TERMINAL_PLAY_STATUSES.has(refreshed.status)) {
1721
+ return refreshed;
1592
1722
  }
1593
1723
  throw new DeeplineError(
1594
- `Run stream for ${runId} ended before the initial snapshot.`,
1724
+ `Run observation for ${runId} ended before a terminal status.`,
1595
1725
  undefined,
1596
- 'PLAY_RUN_STREAM_EMPTY',
1597
- { runId },
1726
+ 'PLAY_LIVE_STREAM_ENDED',
1598
1727
  );
1599
1728
  }
1600
1729
 
1730
/**
 * Read the canonical run stream until a terminal run status is observed.
 *
 * Tries the Convex Run Snapshot subscription transport first (ADR-0008);
 * when the server cannot serve it (grant endpoint missing/unconfigured or
 * Convex unreachable) it falls back — with one `onNotice` message — to the
 * support-window SSE stream below.
 *
 * Server stream windows are finite: they end cleanly at the function
 * ceiling even while the run keeps executing. A window that ends (cleanly
 * or via transient network error) without a terminal event triggers one
 * durable-status re-check followed by a backed-off reconnect, so long runs
 * tail to completion. Abort via `options.signal` to stop waiting.
 *
 * @param runId - Run to tail.
 * @param options - Optional abort signal plus `onReconnect` / `onNotice`
 *   callbacks.
 * @returns The durable status read once a terminal status is observed; if
 *   that read fails after a terminal stream event, the last status built
 *   from the stream is returned instead.
 * @throws DeeplineError with code `ABORTED` when `options.signal` is already
 *   aborted at the point a stream window ends without throwing, or when the
 *   reconnect backoff finishes. Errors thrown by the stream itself are
 *   rethrown unchanged when the signal is aborted or the error is not
 *   transient, as are non-transient status-read failures.
 */
async tailRun(runId: string, options?: RunsTailOptions): Promise<PlayStatus> {
  try {
    return await this.tailRunViaObserveTransport(runId, options);
  } catch (error) {
    if (!(error instanceof RunObserveTransportUnavailableError)) {
      throw error;
    }
    options?.onNotice?.(
      `[observe] live subscription unavailable (${error.reason}); falling back to SSE tail (support window, ADR-0008)`,
    );
  }

  // Mirrors the abort handling of the subscription-transport path: once the
  // caller has aborted, stop instead of re-checking status and reconnecting.
  const throwIfAborted = (): void => {
    if (options?.signal?.aborted) {
      throw new DeeplineError('Run observation aborted.', undefined, 'ABORTED');
    }
  };

  const state: PlayLiveStatusState = {
    runId,
    status: 'running',
    logs: [],
    lastLogSeq: 0,
    latest: null,
  };
  let reconnectAttempt = 0;

  for (;;) {
    const connectedAt = Date.now();
    let sawEvent = false;
    let endedReason = 'stream window ended before a terminal event';
    try {
      for await (const event of this.streamPlayRunEvents(runId, {
        mode: 'cli',
        signal: options?.signal,
      })) {
        sawEvent = true;
        const status = updatePlayLiveStatusState(state, event);
        if (!status || !TERMINAL_PLAY_STATUSES.has(status.status)) {
          continue;
        }
        // Terminal event seen: prefer the durable status read, but fall back
        // to what the stream produced if that read fails.
        return await this.getRunStatus(status.runId || runId).catch(
          () => state.latest ?? playRunStatusFromState(state),
        );
      }
    } catch (error) {
      if (options?.signal?.aborted || !isTransientPlayStreamError(error)) {
        throw error;
      }
      endedReason = error instanceof Error ? error.message : String(error);
    }

    // NOTE(review): assumes the stream may end without throwing when the
    // signal is aborted — confirm against streamPlayRunEvents. Without this
    // check such an abort would fall through to the reconnect loop below and
    // never terminate.
    throwIfAborted();

    // Window ended without a terminal event. The run may have finished
    // during the gap — re-check durable status once before reconnecting.
    // Non-transient status failures (e.g. 404 = run gone) fail loudly.
    let refreshed: PlayStatus | null = null;
    try {
      refreshed = await this.getRunStatus(runId);
    } catch (error) {
      if (!isTransientPlayStreamError(error)) {
        throw error;
      }
    }
    if (refreshed && TERMINAL_PLAY_STATUSES.has(refreshed.status)) {
      return refreshed;
    }

    // A window that delivered events or stayed connected long enough counts
    // as healthy, so the backoff starts over.
    if (
      sawEvent ||
      Date.now() - connectedAt >= STREAM_HEALTHY_CONNECTION_MS
    ) {
      reconnectAttempt = 0;
    }
    const delayMs = streamReconnectDelayMs(reconnectAttempt);
    reconnectAttempt += 1;
    options?.onReconnect?.({
      attempt: reconnectAttempt,
      delayMs,
      reason: endedReason,
    });
    await sleep(delayMs);
    // The caller may have aborted during the backoff; do not open another
    // stream window in that case.
    throwIfAborted();
  }
}
1820
+
1601
1821
  /**
1602
1822
  * Fetch persisted logs for a run using the public runs resource model.
1603
1823
  *
@@ -1611,23 +1831,51 @@ export class DeeplineClient {
1611
1831
  runId: string,
1612
1832
  options?: RunsLogsOptions,
1613
1833
  ): Promise<RunsLogsResult> {
1614
- const status = await this.getRunStatus(runId, { full: true });
1615
- const logs = status.progress?.logs ?? [];
1616
- const limit =
1617
- typeof options?.limit === 'number' && Number.isFinite(options.limit)
1618
- ? Math.max(0, Math.trunc(options.limit))
1834
+ const limit = options?.all
1835
+ ? Number.MAX_SAFE_INTEGER
1836
+ : typeof options?.limit === 'number' &&
1837
+ Number.isFinite(options.limit) &&
1838
+ options.limit > 0
1839
+ ? Math.trunc(options.limit)
1619
1840
  : 200;
1620
- const entries = logs.slice(Math.max(0, logs.length - limit));
1841
+ const fetchPage = (afterSeq: number, pageLimit: number) =>
1842
+ this.http.get<RunLogsPageResponse>(
1843
+ `/api/v2/runs/${encodeURIComponent(runId)}/logs?afterSeq=${afterSeq}&limit=${pageLimit}`,
1844
+ );
1845
+ // Probe for the run's stored extent, then read the LAST `limit` stored
1846
+ // lines (matching the historical tail-slice semantics), paginating in
1847
+ // server-capped pages until the window is exhausted.
1848
+ const probe = await fetchPage(0, 1);
1849
+ const lastStoredSeq = probe.lastStoredSeq;
1850
+ let afterSeq = options?.all ? 0 : Math.max(0, lastStoredSeq - limit);
1851
+ const entries: Array<{ seq: number; line: string }> = [];
1852
+ while (entries.length < limit) {
1853
+ const page = await fetchPage(
1854
+ afterSeq,
1855
+ Math.min(RUN_LOGS_PAGE_LIMIT, limit - entries.length),
1856
+ );
1857
+ if (page.entries.length === 0) {
1858
+ break;
1859
+ }
1860
+ entries.push(...page.entries);
1861
+ afterSeq = page.entries[page.entries.length - 1]!.seq;
1862
+ if (!page.hasMore) {
1863
+ break;
1864
+ }
1865
+ }
1866
+ const firstSequence = entries.length > 0 ? entries[0]!.seq : null;
1867
+ const lastSequence =
1868
+ entries.length > 0 ? entries[entries.length - 1]!.seq : null;
1621
1869
  return {
1622
- runId: status.runId,
1623
- totalCount: logs.length,
1870
+ runId: probe.runId,
1871
+ totalCount: probe.totalLogCount,
1624
1872
  returnedCount: entries.length,
1625
- firstSequence:
1626
- logs.length === 0 ? null : logs.length - entries.length + 1,
1627
- lastSequence: logs.length === 0 ? null : logs.length,
1628
- truncated: logs.length > entries.length,
1629
- hasMore: logs.length > entries.length,
1630
- entries,
1873
+ firstSequence,
1874
+ lastSequence,
1875
+ truncated: entries.length < probe.totalLogCount,
1876
+ hasMore: lastSequence !== null && lastSequence < lastStoredSeq,
1877
+ entries: entries.map((entry) => entry.line),
1878
+ ...(probe.logsTruncated ? { logsTruncated: true } : {}),
1631
1879
  };
1632
1880
  }
1633
1881
 
@@ -1993,6 +2241,7 @@ export class DeeplineClient {
1993
2241
  runId: workflowId,
1994
2242
  status: 'running',
1995
2243
  logs: [],
2244
+ lastLogSeq: 0,
1996
2245
  latest: null,
1997
2246
  };
1998
2247
 
@@ -55,6 +55,7 @@
55
55
 
56
56
  // ——— Client ———
57
57
  export { DeeplineClient } from './client.js';
58
+ export { RunObserveTransportUnavailableError } from './runs/observe-transport.js';
58
59
  export type {
59
60
  PlayStatus,
60
61
  PlaySheetRow,