typeclaw 0.35.0 → 0.35.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "typeclaw",
3
- "version": "0.35.0",
3
+ "version": "0.35.1",
4
4
  "homepage": "https://github.com/typeclaw/typeclaw#readme",
5
5
  "bugs": {
6
6
  "url": "https://github.com/typeclaw/typeclaw/issues"
@@ -37,15 +37,16 @@ import type {
37
37
  } from '@/plugin'
38
38
  import {
39
39
  buildSandboxedCommand,
40
- canBindProcSafely,
41
40
  canMountRealProc,
42
41
  DEFAULT_SANDBOX_ENV,
43
42
  ensureBwrapAvailable,
44
43
  ensureSessionTmpDir,
44
+ getProcBindSafetyVerdict,
45
45
  isPackageInstallCommand,
46
46
  mapVirtualTmpPath,
47
47
  resolveHiddenPaths,
48
48
  resolvePackageInstallZones,
49
+ resolveProcBindSafetyWithRetry,
49
50
  resolveProcSelfExe,
50
51
  resolveProtectedZones,
51
52
  resolveSandboxSymlinks,
@@ -673,12 +674,12 @@ function subtractMaskedProtected(
673
674
  // the kernel permits the mount (canMountRealProc) — it adds PID isolation but
674
675
  // needs CAP_SYS_ADMIN (unshare --mount-proc), so it is a deliberate, narrow
675
676
  // opt-in; else 'proc-bind' (--ro-bind /proc, NO CAP_SYS_ADMIN) when its userns
676
- // leak-block is verified safe (canBindProcSafely); else 'tmpfs'. Because
677
- // sandbox.realProc DEFAULTS FALSE, the first branch is normally skipped and
678
- // proc-bind is the de-facto default — which is the point: the common path needs
679
- // no broad outer capability. 'tmpfs' is the last-resort degraded mode where
680
- // external packages can't run; reached only when BOTH probes fail (e.g. a kernel
681
- // that would leak cross-userns environ proc-bind fails closed there).
677
+ // leak-block is verified safe; else 'tmpfs'. Because sandbox.realProc DEFAULTS
678
+ // FALSE, the first branch is normally skipped and proc-bind is the de-facto
679
+ // default — which is the point: the common path needs no broad outer capability.
680
+ // 'tmpfs' is the last-resort degraded mode where external packages can't run;
681
+ // reached only when proc-bind is DEFINITIVELY unavailable (a real cross-userns
682
+ // environ leak, which fails closed) or its safety stays unverifiable after retries.
682
683
  //
683
684
  // Read from the boot-time `config` snapshot, NOT live getConfig(): sandbox is
684
685
  // restart-required, and the strategy MUST track the boot-time CAP_SYS_ADMIN
@@ -688,7 +689,16 @@ function subtractMaskedProtected(
688
689
  // container lifetime regardless of how many bash calls hit it.
689
690
  async function resolveProcStrategy(): Promise<SandboxProcStrategy> {
690
691
  if (config.sandbox.realProc && (await canMountRealProc())) return 'real-proc'
691
- if (await canBindProcSafely()) return 'proc-bind'
692
+ // Retry an 'inconclusive' proc-bind probe (transient under load) before
693
+ // degrading — a single such hiccup must not break external-package runs on a
694
+ // capable host. 'unsafe' still fails closed with no retry.
695
+ if (
696
+ await resolveProcBindSafetyWithRetry(
697
+ () => getProcBindSafetyVerdict(),
698
+ (ms) => Bun.sleep(ms),
699
+ )
700
+ )
701
+ return 'proc-bind'
692
702
  // Degraded last resort: no working /proc strategy. External package runners
693
703
  // (bunx/bun add/bun run <pkg-bin>) will fail with Bun's opaque "NotDir" because
694
704
  // /proc/self/{fd,maps} are absent. Warn once so an operator on such an exotic
@@ -11,9 +11,15 @@ export type StreamLiveOptions = {
11
11
  onSubscribed?: (live: boolean) => void
12
12
  onError?: (message: string) => void
13
13
  connectTimeoutMs?: number
14
+ heartbeatIntervalMs?: number
15
+ pongTimeoutMs?: number
16
+ bufferedAmountCeiling?: number
14
17
  }
15
18
 
16
19
  const DEFAULT_CONNECT_TIMEOUT_MS = 5_000
20
+ const DEFAULT_HEARTBEAT_INTERVAL_MS = 10_000
21
+ const DEFAULT_PONG_TIMEOUT_MS = 30_000
22
+ const DEFAULT_BUFFERED_AMOUNT_CEILING = 1_048_576
17
23
 
18
24
  export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<InspectEvent> {
19
25
  const WS = opts.WebSocketImpl ?? WebSocket
@@ -26,6 +32,17 @@ export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<Inspe
26
32
  const accumulators = new Map<string, string>()
27
33
  const thinkingAccumulators = new Map<string, string>()
28
34
 
35
+ let heartbeat: ReturnType<typeof setInterval> | null = null
36
+ let awaitingPongSince: number | null = null
37
+ let supportsPing = false
38
+
39
+ const stopHeartbeat = (): void => {
40
+ if (heartbeat !== null) {
41
+ clearInterval(heartbeat)
42
+ heartbeat = null
43
+ }
44
+ }
45
+
29
46
  const wake = (): void => {
30
47
  if (resolveNext !== null) {
31
48
  const fn = resolveNext
@@ -43,13 +60,19 @@ export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<Inspe
43
60
  return
44
61
  }
45
62
  if (msg.type === 'subscribed') {
63
+ supportsPing = msg.supportsPing === true
46
64
  opts.onSubscribed?.(msg.sessionLive)
47
65
  return
48
66
  }
67
+ if (msg.type === 'pong') {
68
+ awaitingPongSince = null
69
+ return
70
+ }
49
71
  if (msg.type === 'error') {
50
72
  opts.onError?.(msg.message)
51
73
  pendingError = msg.message
52
74
  closed = true
75
+ stopHeartbeat()
53
76
  try {
54
77
  ws.close()
55
78
  } catch {
@@ -84,6 +107,7 @@ export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<Inspe
84
107
  })
85
108
  ws.addEventListener('close', () => {
86
109
  closed = true
110
+ stopHeartbeat()
87
111
  wake()
88
112
  })
89
113
 
@@ -99,6 +123,7 @@ export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<Inspe
99
123
  'abort',
100
124
  () => {
101
125
  closed = true
126
+ stopHeartbeat()
102
127
  try {
103
128
  ws.close()
104
129
  } catch {
@@ -134,25 +159,115 @@ export async function* streamLive(opts: StreamLiveOptions): AsyncGenerator<Inspe
134
159
  }
135
160
  ws.send(JSON.stringify(subscribe))
136
161
 
137
- while (true) {
138
- if (buffer.length > 0) {
139
- const next = buffer.shift()!
140
- yield next
141
- continue
162
+ startHeartbeat({
163
+ ws,
164
+ intervalMs: opts.heartbeatIntervalMs ?? DEFAULT_HEARTBEAT_INTERVAL_MS,
165
+ pongTimeoutMs: opts.pongTimeoutMs ?? DEFAULT_PONG_TIMEOUT_MS,
166
+ bufferedAmountCeiling: opts.bufferedAmountCeiling ?? DEFAULT_BUFFERED_AMOUNT_CEILING,
167
+ supportsPing: () => supportsPing,
168
+ isAwaitingPongSince: () => awaitingPongSince,
169
+ setAwaitingPongSince: (at) => {
170
+ awaitingPongSince = at
171
+ },
172
+ setTimer: (timer) => {
173
+ heartbeat = timer
174
+ },
175
+ onDead: () => {
176
+ closed = true
177
+ stopHeartbeat()
178
+ try {
179
+ ws.close()
180
+ } catch {
181
+ /* ignore */
182
+ }
183
+ wake()
184
+ },
185
+ })
186
+
187
+ try {
188
+ while (true) {
189
+ if (buffer.length > 0) {
190
+ const next = buffer.shift()!
191
+ yield next
192
+ continue
193
+ }
194
+ if (closed) {
195
+ if (pendingError !== null) throw new Error(pendingError)
196
+ return
197
+ }
198
+ const { event, done } = await new Promise<{ event: InspectEvent | null; done: boolean }>((resolve) => {
199
+ resolveNext = resolve
200
+ })
201
+ if (event !== null) yield event
202
+ if (done) {
203
+ if (pendingError !== null) throw new Error(pendingError)
204
+ return
205
+ }
206
+ }
207
+ } finally {
208
+ // Also fired when the consumer abandons the generator (break from a
209
+ // `for await` calls .return()): close the socket so it can't outlive the
210
+ // viewer, not just the heartbeat timer.
211
+ stopHeartbeat()
212
+ closed = true
213
+ try {
214
+ ws.close()
215
+ } catch {
216
+ /* ignore */
142
217
  }
143
- if (closed) {
144
- if (pendingError !== null) throw new Error(pendingError)
218
+ }
219
+ }
220
+
221
+ type HeartbeatOptions = {
222
+ ws: WebSocket
223
+ intervalMs: number
224
+ pongTimeoutMs: number
225
+ bufferedAmountCeiling: number
226
+ // Read live: the `subscribed` reply that sets it arrives after the timer is
227
+ // armed, so a snapshot taken at startHeartbeat time would always be false.
228
+ supportsPing: () => boolean
229
+ isAwaitingPongSince: () => number | null
230
+ setAwaitingPongSince: (at: number | null) => void
231
+ setTimer: (timer: ReturnType<typeof setInterval>) => void
232
+ onDead: () => void
233
+ }
234
+
235
+ // Steady-state liveness watchdog. The connect gate only bounds the OPENING
236
+ // phase; once subscribed, a wedged socket (send queue not draining, no
237
+ // 'close'/'error') would park the read loop forever. The interval fires on the
238
+ // event-loop timer queue independent of the dead socket, so it always runs.
239
+ // Two death signals, both treated as a clean close (return, never throw) so the
240
+ // viewer recovers to the picker:
241
+ // 1. bufferedAmount past a ceiling — our writes are not draining. Always on:
242
+ // it needs no server cooperation, so it works against any server version.
243
+ // 2. a ping with no pong within the deadline — round-trip liveness lost,
244
+ // which also covers idle tails (a quiet-but-healthy tail still pongs). Only
245
+ // armed when the server advertised supportsPing; a pre-heartbeat server
246
+ // answers an unknown ping with error+close, so probing it would kill the
247
+ // tail. Such a server degrades to bufferedAmount-only detection.
248
+ function startHeartbeat(opts: HeartbeatOptions): void {
249
+ let pingId = 0
250
+ const tick = (): void => {
251
+ if (opts.ws.bufferedAmount >= opts.bufferedAmountCeiling) {
252
+ opts.onDead()
145
253
  return
146
254
  }
147
- const { event, done } = await new Promise<{ event: InspectEvent | null; done: boolean }>((resolve) => {
148
- resolveNext = resolve
149
- })
150
- if (event !== null) yield event
151
- if (done) {
152
- if (pendingError !== null) throw new Error(pendingError)
255
+ if (!opts.supportsPing()) return
256
+ const awaiting = opts.isAwaitingPongSince()
257
+ if (awaiting !== null) {
258
+ if (Date.now() - awaiting >= opts.pongTimeoutMs) opts.onDead()
153
259
  return
154
260
  }
261
+ pingId += 1
262
+ const ping: InspectClientMessage = { type: 'ping', id: pingId }
263
+ try {
264
+ opts.ws.send(JSON.stringify(ping))
265
+ opts.setAwaitingPongSince(Date.now())
266
+ } catch {
267
+ opts.onDead()
268
+ }
155
269
  }
270
+ opts.setTimer(setInterval(tick, opts.intervalMs))
156
271
  }
157
272
 
158
273
  function frameToEvent(
@@ -138,6 +138,27 @@ export function _resetRealProcProbeCacheForTests(): void {
138
138
  // future bwrap flag change, would turn this strategy into a secret leak. So we
139
139
  // PROBE it directly before ever selecting it — plant a real secret in a sibling
140
140
  // process's env and assert the sandbox cannot read it back.
141
+ // The probe has THREE outcomes, not two — collapsing them to a boolean is what
142
+ // caused the silent-degrade bug this verdict type fixes. 'safe'/'unsafe' are definitive capability
143
+ // facts (the userns block held / a leak was observed); 'inconclusive' is a
144
+ // transient local failure (probe timeout under CPU/IO contention, sentinel dying
145
+ // mid-probe, a bwrap startup hiccup) that proves NOTHING about the host. A caller
146
+ // deciding the /proc strategy must tell these apart: an inconclusive probe must
147
+ // trigger a RETRY, never a fall-through to tmpfs that breaks the whole bash call
148
+ // on a host that is actually capable. 'unsafe' must still fail closed with no
149
+ // retry. canBindProcSafely() keeps the old boolean shape for callers that only
150
+ // need "is proc-bind selectable right now"; getProcBindSafetyVerdict() exposes
151
+ // the third state for the retry-owning strategy resolver.
152
+ export type ProcBindSafetyVerdict = 'safe' | 'unsafe' | 'inconclusive'
153
+
154
+ // Only DEFINITIVE verdicts are process-global facts worth caching. Caching
155
+ // 'inconclusive' (i.e. its boolean `false`) would PERMANENTLY disable proc-bind
156
+ // for the process — a single slow first bash call would silently break every
157
+ // later bunx until container restart (the exact "works after restart" symptom
158
+ // this whole machinery exists to kill). So the cache type structurally excludes
159
+ // it.
160
+ type CacheableProcBindSafetyVerdict = Exclude<ProcBindSafetyVerdict, 'inconclusive'>
161
+
141
162
  // Keyed by resolved bwrapPath, like ensureBwrapAvailable: the safety answer is a
142
163
  // fact about a SPECIFIC bwrap binary, so a caller pinning a non-default path
143
164
  // (tests, or a future deployment) must re-probe rather than inherit the default
@@ -145,19 +166,21 @@ export function _resetRealProcProbeCacheForTests(): void {
145
166
  // concurrent first callers for one path share a single probe. Both cached
146
167
  // process-globally (the answer is a per-container capability fact). Not abortable
147
168
  // (see canMountRealProc).
148
- const procBindProbeCache = new Map<string, boolean>()
149
- const procBindProbeInFlight = new Map<string, Promise<boolean>>()
150
-
151
- // `safe` is the answer; `cacheable` is false for INCONCLUSIVE outcomes (a probe
152
- // timeout under load, or the sentinel dying mid-probe). Those are transient
153
- // failure modes, not capability facts, so caching their `safe=false` would
154
- // PERMANENTLY disable proc-bind for the process — a single slow first bash call
155
- // would silently break every later bunx until container restart (the exact
156
- // "works after restart" symptom this whole fix exists to kill). Only a probe that
157
- // ran to a verdict (definitively safe OR definitively leaking) is cached.
158
- type ProcBindProbe = { safe: boolean; cacheable: boolean }
169
+ const procBindProbeCache = new Map<string, CacheableProcBindSafetyVerdict>()
170
+ const procBindProbeInFlight = new Map<string, Promise<ProcBindSafetyVerdict>>()
159
171
 
160
- export function canBindProcSafely(options?: { bwrapPath?: string }): Promise<boolean> {
172
+ // `verdict` is the answer; only definitive verdicts are `cacheable`. INCONCLUSIVE
173
+ // outcomes (a probe timeout under load, or the sentinel dying mid-probe) are
174
+ // transient failure modes, not capability facts — see the cache rationale above.
175
+ type ProcBindProbe =
176
+ | { verdict: CacheableProcBindSafetyVerdict; cacheable: true }
177
+ | { verdict: 'inconclusive'; cacheable: false }
178
+
179
+ // The three-state probe, deduped in flight and cached per bwrap path. The strategy
180
+ // resolver (resolveProcStrategy in plugin-tools.ts) consumes this so it can RETRY
181
+ // an 'inconclusive' result before degrading the bash call to tmpfs, while still
182
+ // failing closed on 'unsafe'.
183
+ export function getProcBindSafetyVerdict(options?: { bwrapPath?: string }): Promise<ProcBindSafetyVerdict> {
161
184
  const bwrap = options?.bwrapPath ?? 'bwrap'
162
185
  const cached = procBindProbeCache.get(bwrap)
163
186
  if (cached !== undefined) return Promise.resolve(cached)
@@ -165,9 +188,9 @@ export function canBindProcSafely(options?: { bwrapPath?: string }): Promise<boo
165
188
  if (existing !== undefined) return existing
166
189
 
167
190
  const promise = probeProcBind(bwrap)
168
- .then(({ safe, cacheable }) => {
169
- if (cacheable) procBindProbeCache.set(bwrap, safe)
170
- return safe
191
+ .then(({ verdict, cacheable }) => {
192
+ if (cacheable) procBindProbeCache.set(bwrap, verdict)
193
+ return verdict
171
194
  })
172
195
  .finally(() => {
173
196
  procBindProbeInFlight.delete(bwrap)
@@ -176,9 +199,53 @@ export function canBindProcSafely(options?: { bwrapPath?: string }): Promise<boo
176
199
  return promise
177
200
  }
178
201
 
202
+ // Boolean convenience wrapper: 'safe' is the ONLY verdict that makes proc-bind
203
+ // selectable. 'unsafe' AND 'inconclusive' both map to false — callers that only
204
+ // take a boolean (and do not own a retry budget) must fail closed on either.
205
+ // Derives from the deduped verdict probe, so concurrent callers still share one
206
+ // spawn even though this wrapper's own promise identity differs per call.
207
+ export function canBindProcSafely(options?: { bwrapPath?: string }): Promise<boolean> {
208
+ return getProcBindSafetyVerdict(options).then((verdict) => verdict === 'safe')
209
+ }
210
+
211
+ // Default backoff between proc-bind safety re-probes, in ms. Array length = retry
212
+ // count (2 retries after the initial attempt = 3 probes total). The probe is
213
+ // normally sub-ms; it only returns 'inconclusive' under transient CPU/IO
214
+ // contention (e.g. a boot-time storm of concurrent LLM calls saturating the box
215
+ // and tripping the probe's own timeout), so a short staggered wait lets the spike
216
+ // pass before re-proving.
217
+ export const PROC_BIND_RETRY_BACKOFF_MS = [250, 1_000] as const
218
+
219
+ // proc-bind selection must distinguish "definitely unavailable" from "couldn't
220
+ // verify right now". A DEFINITIVE verdict is final: 'safe'→true; a real userns
221
+ // leak ('unsafe')→false with NO retry. Only an 'inconclusive' verdict (transient
222
+ // probe failure that proves nothing about the host) is retried, because degrading
223
+ // the bash call to tmpfs over a transient hiccup is what silently broke
224
+ // external-package runs on capable hosts. 'inconclusive' is never cached
225
+ // (see the cache type), so each retry re-probes from scratch. After the backoff
226
+ // budget is exhausted we fail CLOSED — an unverified leak-block is never treated
227
+ // as safe. Pure and dependency-injected (probe + sleep) so the retry policy is
228
+ // unit-testable without spawning processes; production passes
229
+ // getProcBindSafetyVerdict and Bun.sleep.
230
+ export async function resolveProcBindSafetyWithRetry(
231
+ probe: () => Promise<ProcBindSafetyVerdict>,
232
+ sleep: (ms: number) => Promise<void>,
233
+ backoffMs: readonly number[] = PROC_BIND_RETRY_BACKOFF_MS,
234
+ ): Promise<boolean> {
235
+ for (let attempt = 0; ; attempt++) {
236
+ const verdict = await probe()
237
+ if (verdict === 'safe') return true
238
+ if (verdict === 'unsafe') return false
239
+
240
+ const backoff = backoffMs[attempt]
241
+ if (backoff === undefined) return false
242
+ await sleep(backoff)
243
+ }
244
+ }
245
+
179
246
  const PROC_BIND_PROBE_SECRET = 'TYPECLAW_PROCBIND_PROBE_SECRET'
180
247
 
181
- const INCONCLUSIVE: ProcBindProbe = { safe: false, cacheable: false }
248
+ const INCONCLUSIVE: ProcBindProbe = { verdict: 'inconclusive', cacheable: false }
182
249
 
183
250
  async function probeProcBind(bwrap: string): Promise<ProcBindProbe> {
184
251
  // The sentinel must model the REAL threat geometry: the agent runtime holds
@@ -277,13 +344,13 @@ async function probeProcBind(bwrap: string): Promise<ProcBindProbe> {
277
344
  // "non-zero" — a non-zero exit also covers script setup failures (a bwrap that
278
345
  // started but couldn't read /proc/self/fd), bwrap startup failures (missing
279
346
  // lib, transient mount EBUSY → bwrap's own exit), and an external SIGKILL.
280
- // Caching any of those transient failures as a definitive safe=false would
347
+ // Caching any of those transient failures as a definitive 'unsafe' would
281
348
  // PERMANENTLY disable proc-bind — the same cache-poisoning class as the
282
349
  // timeout bug. So only the script's two designated codes are cacheable:
283
350
  // PROC_BIND_SAFE (clean run, every open blocked) and PROC_BIND_LEAK (an open
284
351
  // SUCCEEDED — a real leak). Setup failures use PROC_BIND_SETUP_FAILED, and any
285
352
  // other code (bwrap startup, signals, 127) is treated as inconclusive.
286
- if (proc.exitCode === PROC_BIND_LEAK) return { safe: false, cacheable: true }
353
+ if (proc.exitCode === PROC_BIND_LEAK) return { verdict: 'unsafe', cacheable: true }
287
354
  if (proc.exitCode !== PROC_BIND_SAFE) return INCONCLUSIVE
288
355
  // Final liveness: the in-sandbox blocked-open assertions are only meaningful
289
356
  // if the sentinel was alive throughout. Re-read its MARKER from the PARENT —
@@ -293,12 +360,13 @@ async function probeProcBind(bwrap: string): Promise<ProcBindProbe> {
293
360
  // kernel liveness, so this marker re-read is the stronger postcondition. A
294
361
  // failure here means the sentinel vanished mid-probe → inconclusive.
295
362
  if (!(await parentReadsSentinelMarker(sentinelPid))) return INCONCLUSIVE
296
- return { safe: true, cacheable: true }
363
+ return { verdict: 'safe', cacheable: true }
297
364
  } catch {
298
365
  return INCONCLUSIVE
299
366
  } finally {
300
367
  try {
301
368
  sentinel?.kill()
369
+ await sentinel?.exited.catch(() => {})
302
370
  } catch {
303
371
  // killing an already-exited sentinel can throw on some runtimes; cleanup
304
372
  // must never propagate out of the probe.
@@ -4,7 +4,11 @@ export {
4
4
  canBindProcSafely,
5
5
  canMountRealProc,
6
6
  ensureBwrapAvailable,
7
+ getProcBindSafetyVerdict,
8
+ PROC_BIND_RETRY_BACKOFF_MS,
9
+ resolveProcBindSafetyWithRetry,
7
10
  resolveProcSelfExe,
11
+ type ProcBindSafetyVerdict,
8
12
  _resetBwrapAvailabilityCacheForTests,
9
13
  _resetProcBindProbeCacheForTests,
10
14
  _resetRealProcProbeCacheForTests,
@@ -1265,6 +1265,10 @@ function handleInspectMessage(
1265
1265
  ws.close()
1266
1266
  return
1267
1267
  }
1268
+ if (msg.type === 'ping') {
1269
+ sendInspect(ws, { type: 'pong', id: msg.id })
1270
+ return
1271
+ }
1268
1272
  if (msg.type !== 'subscribe' || typeof msg.sessionId !== 'string' || msg.sessionId === '') {
1269
1273
  sendInspect(ws, { type: 'error', message: 'invalid inspect subscription' })
1270
1274
  ws.close()
@@ -1314,7 +1318,7 @@ function handleInspectMessage(
1314
1318
  })
1315
1319
  }
1316
1320
 
1317
- sendInspect(ws, { type: 'subscribed', sessionId: msg.sessionId, sessionLive: live !== undefined })
1321
+ sendInspect(ws, { type: 'subscribed', sessionId: msg.sessionId, sessionLive: live !== undefined, supportsPing: true })
1318
1322
  }
1319
1323
 
1320
1324
  function extractJobId(target: StreamMessage['target']): string {
@@ -44,16 +44,22 @@ export type TunnelLogsServerMessage =
44
44
  | { type: 'error'; message: string }
45
45
  | { type: 'end' }
46
46
 
47
- export type InspectClientMessage = {
48
- type: 'subscribe'
49
- sessionId: string
50
- // sinceMs is a wall-clock cutoff for backfilling broadcasts from the
51
- // in-process Stream ring buffer. The client uses Date.now() - duration;
52
- // omit to skip broadcast backfill. AgentSession events are NEVER
53
- // backfilled (the session's pi-coding-agent subscribe API delivers
54
- // future events only).
55
- sinceMs?: number
56
- }
47
+ export type InspectClientMessage =
48
+ | {
49
+ type: 'subscribe'
50
+ sessionId: string
51
+ // sinceMs is a wall-clock cutoff for backfilling broadcasts from the
52
+ // in-process Stream ring buffer. The client uses Date.now() - duration;
53
+ // omit to skip broadcast backfill. AgentSession events are NEVER
54
+ // backfilled (the session's pi-coding-agent subscribe API delivers
55
+ // future events only).
56
+ sinceMs?: number
57
+ }
58
+ // Steady-state liveness probe echoed back as a pong. A live tail is
59
+ // legitimately quiet for long stretches, so absence of inbound frames cannot
60
+ // distinguish "idle" from "dead"; a missed pong can. Guards a wedged
61
+ // WebSocket that stays ESTABLISHED yet never fires 'close'/'error'.
62
+ | { type: 'ping'; id: number }
57
63
 
58
64
  export type InspectFramePayload =
59
65
  | { kind: 'text_delta'; sessionId: string; delta: string }
@@ -123,9 +129,14 @@ export type InspectFramePayload =
123
129
  }
124
130
 
125
131
  export type InspectServerMessage =
126
- | { type: 'subscribed'; sessionId: string; sessionLive: boolean }
132
+ // supportsPing is the heartbeat capability flag. A pre-heartbeat server omits
133
+ // it; the client must treat its absence as "no ping support" and never send a
134
+ // ping (an old server answers an unknown ping with an error + close, killing
135
+ // the tail). Strict opt-in: only an explicit true arms round-trip probing.
136
+ | { type: 'subscribed'; sessionId: string; sessionLive: boolean; supportsPing?: true }
127
137
  | { type: 'frame'; ts: number; payload: InspectFramePayload }
128
138
  | { type: 'error'; message: string }
139
+ | { type: 'pong'; id: number }
129
140
 
130
141
  export type ClientMessage =
131
142
  | { type: 'prompt'; text: string; delivery?: PromptDelivery }