@qodo/sdk 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,12 @@ export var ConnectionState;
28
28
  export var ReadyState;
29
29
  (function (ReadyState) {
30
30
  ReadyState["IDLE"] = "IDLE";
31
+ /**
32
+ * Post-connect, waiting for ResumeAck from the server before sending any
33
+ * turn. Takes over from WAITING_INITIAL_READY as the post-connect state (that member remains defined) — the server's authoritative
34
+ * view is now the precondition for proceeding, not a bare "Ready" signal.
35
+ */
36
+ ReadyState["AWAITING_RESUME_ACK"] = "AWAITING_RESUME_ACK";
31
37
  ReadyState["WAITING_INITIAL_READY"] = "WAITING_INITIAL_READY";
32
38
  ReadyState["READY"] = "READY";
33
39
  ReadyState["MESSAGE_SENT"] = "MESSAGE_SENT";
@@ -73,8 +79,60 @@ export class WebSocketClient extends EventEmitter {
73
79
  latestCheckpointId;
74
80
  lastSentMessage;
75
81
  readyTimer;
76
- pendingOutbox = [];
77
82
  receivedResponsesSinceMessage = false; // Track if any responses received after sending message
83
+ /**
84
+ * Protocol-v2 counters. Public read-only via `getMetrics()` — wired into
85
+ * SDK telemetry so observability dashboards can alert on e.g. rising
86
+ * `reconcile_failures_total` without digging through debug logs.
87
+ */
88
+ metrics = {
89
+ reconnects_total: 0,
90
+ turns_sent_total: 0,
91
+ turns_acked_total: 0,
92
+ turns_replayed_total: 0,
93
+ resume_acks_total: 0,
94
+ reconcile_failures_total: 0,
95
+ stalls_detected_total: 0,
96
+ idempotent_replays_total: 0,
97
+ };
98
+ /** Last time the socket transitioned from CONNECTED to not-CONNECTED; used for downtime metrics. */
99
+ lastDisconnectAt;
100
+ getMetrics() {
101
+ return { ...this.metrics };
102
+ }
103
+ /**
104
+ * Outbox of turns the client has sent (or is about to send) but has not
105
+ * received a TurnAck for. Keyed by turn_id so idempotent replay across
106
+ * reconnect is an O(1) lookup. Only pruned on TurnAck or ResumeAck.
107
+ */
108
+ outbox = new Map();
109
+ /**
110
+ * FIFO order of turn_ids in the outbox. We send one turn at a time — the
111
+ * Ready/MESSAGE_SENT ping-pong enforces this — but the order matters for
112
+ * replay after reconcile, so we track it explicitly rather than relying on
113
+ * Map iteration order, even though ECMAScript guarantees it is insertion order.
114
+ */
115
+ outboxOrder = [];
116
+ /** SLA timer: Resume must be ack'd within this window or the socket is dead. */
117
+ resumeAckTimer;
118
+ /** SLA timer: post-ResumeAck, the server must produce forward progress or we surface a stall. */
119
+ postResumeProgressTimer;
120
+ /**
121
+ * Short grace period after a bare Ready arrives while we're awaiting a
122
+ * ResumeAck. If no ResumeAck shows up in that window, we assume the
123
+ * backend is a pre-v2 deployment that only emits Ready and fall back —
124
+ * instead of hanging for 15s until the ResumeAck SLA closes the socket.
125
+ */
126
+ legacyFallbackTimer;
127
+ /**
128
+ * Flips true on the first transition to READY and stays true until the
129
+ * cleanup path resets it to false. Represents "initial handshake has completed at least
130
+ * once." Used by ``handleClose`` to distinguish startup failures (fail
131
+ * fast with a real error) from mid-session drops (enter reconnect loop).
132
+ * Decoupling this from specific ReadyState values means new handshake
133
+ * states can be added without silently changing close behavior.
134
+ */
135
+ hasCompletedHandshake = false;
78
136
  /** Expose current ready state so callers can detect post-await state changes. */
79
137
  getReadyState() {
80
138
  return this.readyState;
@@ -101,6 +159,18 @@ export class WebSocketClient extends EventEmitter {
101
159
  if (this.readyState !== newState) {
102
160
  const oldState = this.readyState;
103
161
  this.readyState = newState;
162
+ // First entry into READY marks the handshake as complete. Any close
163
+ // before this point is a startup failure; any close after is a
164
+ // mid-session drop that should trigger the reconnect loop.
165
+ if (newState === ReadyState.READY && !this.hasCompletedHandshake) {
166
+ this.hasCompletedHandshake = true;
167
+ }
168
+ // Leaving AWAITING_RESUME_ACK — either we got what we wanted
169
+ // (ResumeAck) or we gave up. Either way the legacy-fallback grace
170
+ // timer is no longer relevant.
171
+ if (oldState === ReadyState.AWAITING_RESUME_ACK) {
172
+ this.clearLegacyFallbackTimer();
173
+ }
104
174
  this.debug(`[WebSocketClient] Ready state transition: ${oldState} → ${newState}` +
105
175
  (context ? ` | Context: ${context}` : '') +
106
176
  ` | Session: ${this.sessionId?.substring(0, 8)}...` +
@@ -108,6 +178,298 @@ export class WebSocketClient extends EventEmitter {
108
178
  this.emit('readyStateChanged', { oldState, newState, context });
109
179
  }
110
180
  }
181
+ /**
182
+ * SLA for the Resume→ResumeAck round trip. Tuned to cover a cold graph load
183
+ * from postgres with some headroom, but tight enough that a silently dead
184
+ * socket reveals itself before the outer timeout layer even notices.
185
+ */
186
+ static RESUME_ACK_TIMEOUT_MS = 15000;
187
+ /**
188
+ * SLA for "something must happen after ResumeAck." If the server says it's
189
+ * awaiting nothing and we also have nothing to send, we stay in READY (not
190
+ * a stall). Otherwise a quiet socket for this long is a stall.
191
+ */
192
+ static POST_RESUME_PROGRESS_TIMEOUT_MS = 30000;
193
+ /**
194
+ * Grace period after a bare Ready arrives in AWAITING_RESUME_ACK. Long
195
+ * enough to cover the backend's ResumeAck round-trip latency, short
196
+ * enough that a true legacy server doesn't hang the SDK startup.
197
+ */
198
+ static LEGACY_READY_FALLBACK_MS = 2000;
199
+ addToOutbox(turn) {
200
+ if (this.outbox.has(turn.turn_id))
201
+ return;
202
+ this.outbox.set(turn.turn_id, turn);
203
+ this.outboxOrder.push(turn.turn_id);
204
+ this.metrics.turns_sent_total++;
205
+ }
206
+ removeFromOutbox(turnId) {
207
+ if (!this.outbox.delete(turnId))
208
+ return false;
209
+ const idx = this.outboxOrder.indexOf(turnId);
210
+ if (idx >= 0)
211
+ this.outboxOrder.splice(idx, 1);
212
+ return true;
213
+ }
214
+ outboxSnapshot() {
215
+ return this.outboxOrder
216
+ .map(id => this.outbox.get(id))
217
+ .filter((t) => t !== undefined);
218
+ }
219
+ /** Build the client half of a PendingTurn for the Resume envelope. */
220
+ pendingTurnDescriptor(turn) {
221
+ if (turn.type === 'UserQuery') {
222
+ return { turn_id: turn.turn_id, type: 'prompt' };
223
+ }
224
+ return {
225
+ turn_id: turn.turn_id,
226
+ type: 'tool_result',
227
+ ...(turn.tool_call_id ? { tool_call_id: turn.tool_call_id } : {}),
228
+ };
229
+ }
230
+ clearResumeAckTimer() {
231
+ if (this.resumeAckTimer) {
232
+ clearTimeout(this.resumeAckTimer);
233
+ this.resumeAckTimer = undefined;
234
+ }
235
+ }
236
+ clearPostResumeProgressTimer() {
237
+ if (this.postResumeProgressTimer) {
238
+ clearTimeout(this.postResumeProgressTimer);
239
+ this.postResumeProgressTimer = undefined;
240
+ }
241
+ }
242
+ startResumeAckTimer() {
243
+ this.clearResumeAckTimer();
244
+ this.resumeAckTimer = setTimeout(() => {
245
+ this.debug(`[WebSocketClient] ResumeAck timeout expired | ` +
246
+ `State: ${this.readyState} | ` +
247
+ `Session: ${this.sessionId?.substring(0, 8)}...`);
248
+ // The server didn't reply to Resume — treat the socket as dead and cycle it.
249
+ try {
250
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
251
+ this.ws.close(4001, 'ResumeAck timeout');
252
+ }
253
+ }
254
+ catch { /* cleanup: forcing close on an already-closing socket is expected to no-op */ }
255
+ this.emit('resumeAckTimeout');
256
+ }, WebSocketClient.RESUME_ACK_TIMEOUT_MS);
257
+ this.resumeAckTimer.unref?.();
258
+ }
259
+ clearLegacyFallbackTimer() {
260
+ if (this.legacyFallbackTimer) {
261
+ clearTimeout(this.legacyFallbackTimer);
262
+ this.legacyFallbackTimer = undefined;
263
+ }
264
+ }
265
+ /**
266
+ * Start the legacy-server fallback grace period.
267
+ *
268
+ * Called when we receive a bare ``Ready`` while still in
269
+ * ``AWAITING_RESUME_ACK``. In the normal v2 flow the server sends Ready
270
+ * first and ResumeAck right after — we want to wait for ResumeAck. But if
271
+ * the server is a pre-v2 deployment that never emits ResumeAck, waiting
272
+ * for the 15s SLA would stall every startup. Instead, we wait a short
273
+ * grace period; if no ResumeAck arrives, we treat the Ready we already
274
+ * saw as a legacy-protocol signal and transition to READY.
275
+ */
276
+ startLegacyFallbackTimer() {
277
+ // Idempotent — multiple Ready messages shouldn't pile up timers.
278
+ if (this.legacyFallbackTimer)
279
+ return;
280
+ this.legacyFallbackTimer = setTimeout(() => {
281
+ this.legacyFallbackTimer = undefined;
282
+ if (this.readyState !== ReadyState.AWAITING_RESUME_ACK)
283
+ return;
284
+ this.debug(`[WebSocketClient] Legacy fallback: Ready seen without ResumeAck, ` +
285
+ `assuming pre-v2 backend | outbox: ${this.outbox.size}`);
286
+ // Cancel the longer ResumeAck SLA — we've decided the server doesn't speak it.
287
+ this.clearResumeAckTimer();
288
+ this.setReadyState(ReadyState.READY, 'Legacy fallback after Ready without ResumeAck');
289
+ // Drain anything queued. Legacy flow has no TurnAck, so outbox
290
+ // entries will linger; a later reconnect (against a v2 backend)
291
+ // would reconcile them, but that's out of scope for this path.
292
+ void this.processOutbox();
293
+ }, WebSocketClient.LEGACY_READY_FALLBACK_MS);
294
+ this.legacyFallbackTimer.unref?.();
295
+ }
296
+ startPostResumeProgressTimer() {
297
+ this.clearPostResumeProgressTimer();
298
+ this.postResumeProgressTimer = setTimeout(() => {
299
+ this.debug(`[WebSocketClient] Post-ResumeAck stall | ` +
300
+ `State: ${this.readyState} | ` +
301
+ `Outbox size: ${this.outbox.size} | ` +
302
+ `Session: ${this.sessionId?.substring(0, 8)}...`);
303
+ this.metrics.stalls_detected_total++;
304
+ this.emit('stallDetected', {
305
+ stage: 'post_resume',
306
+ outboxSize: this.outbox.size,
307
+ });
308
+ }, WebSocketClient.POST_RESUME_PROGRESS_TIMEOUT_MS);
309
+ this.postResumeProgressTimer.unref?.();
310
+ }
311
+ /**
312
+ * Send the protocol-v2 Resume envelope. Called on every (re)connect before
313
+ * any turn can flow. Gives the backend enough state to reply with an
314
+ * authoritative ResumeAck.
315
+ */
316
+ sendResume() {
317
+ if (!this.ws || this.state !== ConnectionState.CONNECTED) {
318
+ this.debug('[WebSocketClient] sendResume: socket not connected, skipping');
319
+ return;
320
+ }
321
+ const payload = {
322
+ last_seen_checkpoint: this.latestCheckpointId ?? null,
323
+ pending_turns: this.outboxSnapshot().map(t => this.pendingTurnDescriptor(t)),
324
+ };
325
+ const formatted = `Resume ${JSON.stringify(payload)}\n`;
326
+ try {
327
+ this.ws.send(formatted);
328
+ this.debug(`[WebSocketClient] Sent Resume | ` +
329
+ `pending_turns: ${payload.pending_turns.length} | ` +
330
+ `last_seen_checkpoint: ${(this.latestCheckpointId ?? 'none').toString().substring(0, 8)}`);
331
+ this.setReadyState(ReadyState.AWAITING_RESUME_ACK, 'Resume sent');
332
+ this.startResumeAckTimer();
333
+ }
334
+ catch (err) {
335
+ this.debug('[WebSocketClient] Error sending Resume:', err);
336
+ }
337
+ }
338
+ /**
339
+ * Apply the server's authoritative view. This is the anti-deadlock primitive:
340
+ * every reconnect path runs through here and produces an explicit decision
341
+ * instead of both sides waiting on each other.
342
+ *
343
+ * Reconcile only runs on the FIRST ResumeAck of a (re)connect — the one we
344
+ * were explicitly waiting for. Any subsequent ResumeAcks (e.g. the server
345
+ * re-emits one as a loop-top heartbeat, or replies to our Resume probe) are
346
+ * informational; acting on them would double-send turns we've already
347
+ * replayed.
348
+ */
349
+ async handleResumeAck(message) {
350
+ const args = message?.data?.tool_args ?? {};
351
+ const currentCheckpoint = args.current_checkpoint;
352
+ const lastConsumedTurnId = args.last_consumed_turn_id;
353
+ const awaiting = args.awaiting ?? { type: 'none' };
354
+ const awaitingType = awaiting.type ?? 'none';
355
+ const awaitingToolCallId = awaiting.tool_call_id;
356
+ this.clearResumeAckTimer();
357
+ if (currentCheckpoint && currentCheckpoint !== this.latestCheckpointId) {
358
+ this.latestCheckpointId = currentCheckpoint;
359
+ }
360
+ const isReconcilePhase = this.readyState === ReadyState.AWAITING_RESUME_ACK;
361
+ this.metrics.resume_acks_total++;
362
+ this.emit('resumeAck', {
363
+ current_checkpoint: currentCheckpoint,
364
+ last_consumed_turn_id: lastConsumedTurnId,
365
+ awaiting: { type: awaitingType, tool_call_id: awaitingToolCallId },
366
+ outbox_size: this.outbox.size,
367
+ });
368
+ if (!isReconcilePhase) {
369
+ this.debug(`[WebSocketClient] Informational ResumeAck (already reconciled) | ` +
370
+ `State: ${this.readyState}`);
371
+ return;
372
+ }
373
+ // 1) Drop the turn the server reports as last consumed (implicit ack via Resume).
374
+ if (lastConsumedTurnId && this.outbox.has(lastConsumedTurnId)) {
375
+ this.removeFromOutbox(lastConsumedTurnId);
376
+ this.debug(`[WebSocketClient] ResumeAck dropped acked turn from outbox | ` +
377
+ `turn_id: ${lastConsumedTurnId}`);
378
+ }
379
+ // 2) Reconcile what the server is waiting for against what we hold.
380
+ let reconcileFailedReason;
381
+ let resumeToSend;
382
+ if (awaitingType === 'tool_result') {
383
+ // Find a pending tool_result for the exact tool_call_id the server wants.
384
+ const candidate = this.outboxSnapshot().find(t => t.type === 'IDERetrievalAnswer' && t.tool_call_id === awaitingToolCallId);
385
+ if (candidate) {
386
+ resumeToSend = candidate;
387
+ }
388
+ else {
389
+ // Look for a MISMATCHING tool_result in the outbox — that's a true
390
+ // conflict (server awaits Y, client holds X). Emit reconcileFailed so
391
+ // the consumer can decide whether to abort or recompute.
392
+ const mismatchingToolResult = this.outboxSnapshot().find(t => t.type === 'IDERetrievalAnswer' && t.tool_call_id !== awaitingToolCallId);
393
+ if (mismatchingToolResult) {
394
+ reconcileFailedReason = awaitingToolCallId
395
+ ? `Server awaits tool_result for tool_call_id=${awaitingToolCallId}, but outbox holds tool_result for ${mismatchingToolResult.tool_call_id}`
396
+ : 'Server awaits tool_result but outbox holds tool_result for a different tool_call_id';
397
+ }
398
+ // Otherwise: outbox is empty (or holds only UserQuery). This is a
399
+ // legitimate in-flight state — the tool is executing in the MCP layer
400
+ // and will send the tool_result when done. No failure here.
401
+ }
402
+ }
403
+ else if (awaitingType === 'prompt' || awaitingType === 'none') {
404
+ // Server is idle — any leftover outbox turns are stale consumed or never-delivered
405
+ // probes. We keep prompt-style outbox entries (they will flow via normal send),
406
+ // but drop any stale tool_results since the graph is no longer at that step.
407
+ for (const turn of this.outboxSnapshot()) {
408
+ if (turn.type === 'IDERetrievalAnswer') {
409
+ this.removeFromOutbox(turn.turn_id);
410
+ this.debug(`[WebSocketClient] ResumeAck dropped stale tool_result from outbox | ` +
411
+ `turn_id: ${turn.turn_id}`);
412
+ }
413
+ }
414
+ // If we still have a UserQuery queued (client sent a prompt but never saw TurnAck)
415
+ // we should replay it.
416
+ const promptCandidate = this.outboxSnapshot().find(t => t.type === 'UserQuery');
417
+ if (promptCandidate)
418
+ resumeToSend = promptCandidate;
419
+ }
420
+ if (reconcileFailedReason) {
421
+ this.debug(`[WebSocketClient] Reconcile failed | ${reconcileFailedReason} | ` +
422
+ `outbox: ${this.outbox.size}`);
423
+ this.metrics.reconcile_failures_total++;
424
+ this.emit('reconcileFailed', { reason: reconcileFailedReason, awaiting });
425
+ // Don't lock up — transition to READY so the consumer's own error handling
426
+ // can decide what to do. The consumer can still send a new prompt.
427
+ this.setReadyState(ReadyState.READY, 'Reconcile failed');
428
+ return;
429
+ }
430
+ // 3) Happy path: transition to READY and replay anything still pending.
431
+ this.setReadyState(ReadyState.READY, 'ResumeAck reconciled');
432
+ if (resumeToSend) {
433
+ this.startPostResumeProgressTimer();
434
+ this.metrics.turns_replayed_total++;
435
+ await this.sendImmediately({
436
+ type: resumeToSend.type,
437
+ data: resumeToSend.data,
438
+ timestamp: resumeToSend.timestamp,
439
+ });
440
+ this.receivedResponsesSinceMessage = false;
441
+ this.setReadyState(ReadyState.MESSAGE_SENT, `Replayed ${resumeToSend.type} after reconcile`);
442
+ this.startReadyTimer();
443
+ }
444
+ else if (this.outbox.size > 0) {
445
+ // No specific replay target, but we have queued work — drain it.
446
+ await this.processOutbox();
447
+ }
448
+ }
449
+ handleTurnAck(message) {
450
+ const args = message?.data?.tool_args ?? {};
451
+ const turnId = args.turn_id;
452
+ const checkpointId = args.checkpoint_id;
453
+ if (!turnId) {
454
+ this.debug('[WebSocketClient] TurnAck received without turn_id, ignoring');
455
+ return;
456
+ }
457
+ const removed = this.removeFromOutbox(turnId);
458
+ if (checkpointId)
459
+ this.latestCheckpointId = checkpointId;
460
+ this.clearPostResumeProgressTimer();
461
+ this.metrics.turns_acked_total++;
462
+ // A TurnAck for a turn we no longer have in the outbox means the server
463
+ // re-ACK'd an already-dropped turn (idempotent replay confirmation).
464
+ if (!removed)
465
+ this.metrics.idempotent_replays_total++;
466
+ this.debug(`[WebSocketClient] TurnAck received | ` +
467
+ `turn_id: ${turnId} | ` +
468
+ `checkpoint: ${checkpointId?.substring(0, 8) || 'none'} | ` +
469
+ `outbox_removed: ${removed} | ` +
470
+ `outbox_remaining: ${this.outbox.size}`);
471
+ this.emit('turnAck', { turn_id: turnId, checkpoint_id: checkpointId });
472
+ }
111
473
  async handleReadyMessage(message) {
112
474
  const checkpointId = message.data?.tool_args?.checkpoint_id;
113
475
  const previousCheckpoint = this.latestCheckpointId;
@@ -124,6 +486,15 @@ export class WebSocketClient extends EventEmitter {
124
486
  }
125
487
  // Handle based on current state
126
488
  switch (this.readyState) {
489
+ case ReadyState.AWAITING_RESUME_ACK:
490
+ // In the normal v2 flow the server sends Ready then ResumeAck;
491
+ // Ready here is informational and we stay put until ResumeAck.
492
+ // Start a short legacy-fallback grace period so that a pre-v2
493
+ // backend (Ready only, no ResumeAck) doesn't stall the whole
494
+ // 15s ResumeAck SLA before we make progress.
495
+ this.debug('[WebSocketClient] Ready received while awaiting ResumeAck — starting legacy fallback grace');
496
+ this.startLegacyFallbackTimer();
497
+ break;
127
498
  case ReadyState.WAITING_INITIAL_READY:
128
499
  this.clearReadyTimer();
129
500
  this.setReadyState(ReadyState.READY, 'Initial Ready received after connection');
@@ -189,32 +560,48 @@ export class WebSocketClient extends EventEmitter {
189
560
  async sendMessage(type, data) {
190
561
  // Reset idle timer on any outgoing message
191
562
  this.resetIdleTimer();
192
- const messageToSend = { type, data, timestamp: Date.now() };
563
+ // Ensure every turn carries a turn_id — the backend idempotency key.
564
+ // Accepts a caller-supplied id (rare; only tests need that) and otherwise
565
+ // mints one here so existing call sites don't need updates.
566
+ const turn_id = (data && typeof data === 'object' && typeof data.turn_id === 'string' && data.turn_id)
567
+ ? data.turn_id
568
+ : uuid();
569
+ const dataWithTurnId = (data && typeof data === 'object')
570
+ ? { ...data, turn_id }
571
+ : { turn_id, payload: data };
572
+ const tool_call_id = type === 'IDERetrievalAnswer'
573
+ ? (dataWithTurnId.tool_id ?? dataWithTurnId.tool_call_id)
574
+ : undefined;
575
+ const turn = {
576
+ turn_id,
577
+ type,
578
+ data: dataWithTurnId,
579
+ tool_call_id,
580
+ timestamp: Date.now(),
581
+ };
582
+ this.addToOutbox(turn);
193
583
  this.debug(`[WebSocketClient] sendMessage called | ` +
194
584
  `Type: ${type} | ` +
585
+ `turn_id: ${turn_id} | ` +
195
586
  `ReadyState: ${this.readyState} | ` +
196
- `ConnectionState: ${this.state}`);
197
- // Save as last sent message for potential recovery
198
- this.lastSentMessage = messageToSend;
199
- this.debug(`[WebSocketClient] Last sent message cached | ` +
200
- `Type: ${type} | ` +
201
- `Timestamp: ${messageToSend.timestamp}`);
587
+ `ConnectionState: ${this.state} | ` +
588
+ `outbox_size: ${this.outbox.size}`);
589
+ // Save as last sent message for potential recovery (legacy path)
590
+ this.lastSentMessage = { type, data: dataWithTurnId, timestamp: turn.timestamp };
202
591
  if (this.readyState === ReadyState.READY && this.ws && this.state === ConnectionState.CONNECTED) {
203
- // Send immediately
204
- this.receivedResponsesSinceMessage = false; // Reset flag when sending new message
205
- await this.sendImmediately(messageToSend);
592
+ this.receivedResponsesSinceMessage = false;
593
+ await this.sendImmediately({ type, data: dataWithTurnId, timestamp: turn.timestamp });
206
594
  this.setReadyState(ReadyState.MESSAGE_SENT, `Sent ${type} message`);
207
- this.startReadyTimer(); // Expect Ready after server processes message
595
+ this.startReadyTimer();
208
596
  }
209
597
  else {
210
- // Queue until Ready
211
- this.pendingOutbox.push(messageToSend);
212
- this.debug(`[WebSocketClient] Message queued | ` +
598
+ this.debug(`[WebSocketClient] Turn parked in outbox | ` +
213
599
  `Type: ${type} | ` +
214
- `Queue size: ${this.pendingOutbox.length} | ` +
600
+ `turn_id: ${turn_id} | ` +
601
+ `outbox_size: ${this.outbox.size} | ` +
215
602
  `Reason: ReadyState=${this.readyState}, Connected=${this.state === ConnectionState.CONNECTED}`);
216
603
  if (this.state === ConnectionState.DISCONNECTED) {
217
- this.debug('[WebSocketClient] Initiating connection due to queued message');
604
+ this.debug('[WebSocketClient] Initiating connection due to queued turn');
218
605
  await this.connect();
219
606
  }
220
607
  }
@@ -226,29 +613,38 @@ export class WebSocketClient extends EventEmitter {
226
613
  `Type: ${msg.type} | ` +
227
614
  `Size: ${formatted.length} bytes`);
228
615
  }
616
+ /**
617
+ * Drain the outbox into the socket. Only one turn is in-flight at a time
618
+ * (MESSAGE_SENT gate enforces this). Called on ResumeAck (after reconcile)
619
+ * and on transitions back to READY.
620
+ */
229
621
  async processOutbox() {
230
622
  this.debug(`[WebSocketClient] Processing outbox | ` +
231
- `Queue size: ${this.pendingOutbox.length} | ` +
623
+ `outbox_size: ${this.outbox.size} | ` +
232
624
  `ReadyState: ${this.readyState}`);
233
- if (this.pendingOutbox.length > 0 && this.readyState === ReadyState.READY) {
234
- const msg = this.pendingOutbox.shift();
235
- this.debug(`[WebSocketClient] Sending queued message | ` +
236
- `Type: ${msg.type} | ` +
237
- `Queued at: ${msg.timestamp} | ` +
238
- `Wait time: ${Date.now() - msg.timestamp}ms | ` +
239
- `Remaining in queue: ${this.pendingOutbox.length}`);
240
- this.receivedResponsesSinceMessage = false; // Reset flag when sending queued message
241
- await this.sendImmediately(msg);
242
- this.setReadyState(ReadyState.MESSAGE_SENT, `Sent queued ${msg.type} message`);
243
- this.startReadyTimer(); // Expect Ready after server processes message
244
- }
245
- else if (this.pendingOutbox.length === 0) {
625
+ if (this.outbox.size === 0) {
246
626
  this.debug('[WebSocketClient] Outbox is empty, no messages to process');
627
+ return;
247
628
  }
248
- else {
629
+ if (this.readyState !== ReadyState.READY) {
249
630
  this.debug(`[WebSocketClient] Cannot process outbox | ` +
250
631
  `ReadyState: ${this.readyState} (expected READY)`);
632
+ return;
251
633
  }
634
+ const nextId = this.outboxOrder[0];
635
+ const turn = nextId ? this.outbox.get(nextId) : undefined;
636
+ if (!turn)
637
+ return;
638
+ this.debug(`[WebSocketClient] Sending queued turn | ` +
639
+ `Type: ${turn.type} | ` +
640
+ `turn_id: ${turn.turn_id} | ` +
641
+ `Queued at: ${turn.timestamp} | ` +
642
+ `Wait: ${Date.now() - turn.timestamp}ms | ` +
643
+ `outbox_remaining: ${this.outbox.size}`);
644
+ this.receivedResponsesSinceMessage = false;
645
+ await this.sendImmediately({ type: turn.type, data: turn.data, timestamp: turn.timestamp });
646
+ this.setReadyState(ReadyState.MESSAGE_SENT, `Sent queued ${turn.type} message`);
647
+ this.startReadyTimer();
252
648
  }
253
649
  startReadyTimer() {
254
650
  this.clearReadyTimer();
@@ -268,44 +664,32 @@ export class WebSocketClient extends EventEmitter {
268
664
  this.readyTimer = undefined;
269
665
  }
270
666
  }
667
+ /**
668
+ * Legacy recovery entry point.
669
+ *
670
+ * Under protocol v2 the outbox holds every un-ACK'd turn by turn_id and the
671
+ * Resume/ResumeAck handshake replays it authoritatively. All we need to do
672
+ * here is cycle the socket — the open handler will send Resume, the server
673
+ * will reply with ResumeAck, and reconciliation will happen automatically.
674
+ */
271
675
  async initiateCheckpointRecovery(reason) {
272
- this.debug(`[WebSocketClient] ===== CHECKPOINT RECOVERY INITIATED ===== | ` +
676
+ this.debug(`[WebSocketClient] Forcing reconnect for recovery | ` +
273
677
  `Reason: ${reason} | ` +
274
678
  `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
275
- `Last message type: ${this.lastSentMessage?.type} | ` +
276
- `Current state: ${this.readyState}`);
277
- this.setReadyState(ReadyState.CHECKPOINT_RECOVERY, reason);
278
- // Disconnect current connection
279
- this.debug('[WebSocketClient] Disconnecting for checkpoint recovery');
280
- this.disconnect();
281
- // Reconnect with checkpoint (URL will include checkpoint_id)
282
- this.debug(`[WebSocketClient] Reconnecting with checkpoint | ` +
283
- `Checkpoint ID: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
284
- `Session ID: ${this.sessionId?.substring(0, 8)}...`);
679
+ `outbox_size: ${this.outbox.size} | ` +
680
+ `State: ${this.readyState}`);
681
+ this.emit('checkpointRecovery', { reason, checkpoint: this.latestCheckpointId });
285
682
  try {
286
- await this.establishConnection();
287
- this.debug('[WebSocketClient] Reconnection successful during recovery');
288
- // Wait for Ready state is set in handleReadyMessage when initial Ready is received
289
- // Resend last message once Ready received
290
- if (this.lastSentMessage) {
291
- this.pendingOutbox.unshift(this.lastSentMessage);
292
- this.debug(`[WebSocketClient] Last message re-queued for recovery | ` +
293
- `Type: ${this.lastSentMessage.type} | ` +
294
- `Original timestamp: ${this.lastSentMessage.timestamp}`);
295
- }
296
- else {
297
- this.debug('[WebSocketClient] No last message to resend during recovery');
683
+ // Close the socket hard. handleClose will schedule reconnection via
684
+ // attemptReconnection, and the open handler will send a fresh Resume.
685
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
686
+ this.ws.close(4002, `Recovery: ${reason}`);
298
687
  }
299
- // Emit event for monitoring
300
- this.emit('checkpointRecovery', { reason, checkpoint: this.latestCheckpointId });
301
688
  }
302
689
  catch (error) {
303
- this.debug(`[WebSocketClient] Checkpoint recovery failed | ` +
690
+ this.debug(`[WebSocketClient] Recovery reconnect failed | ` +
304
691
  `Error: ${error instanceof Error ? error.message : error}`);
305
- this.setReadyState(ReadyState.IDLE, 'Recovery failed');
306
- throw error;
307
692
  }
308
- this.debug('[WebSocketClient] ===== CHECKPOINT RECOVERY COMPLETED =====');
309
693
  }
310
694
  disconnect() {
311
695
  // Hold a reference to the current socket to close it safely
@@ -388,17 +772,18 @@ export class WebSocketClient extends EventEmitter {
388
772
  this.emit('connected');
389
773
  this.debug(`[WebSocketClient] WebSocket connected | ` +
390
774
  `Session: ${this.sessionId?.substring(0, 8)}... | ` +
391
- `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'}`);
392
- // Transition to waiting for initial Ready
393
- this.setReadyState(ReadyState.WAITING_INITIAL_READY, 'Connection opened, awaiting initial Ready');
394
- this.startReadyTimer(); // Wait for initial Ready
395
- // Start idle timer immediately upon connection
775
+ `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
776
+ `outbox: ${this.outbox.size}`);
777
+ // Start idle timer and heartbeat immediately so dead sockets surface.
396
778
  this.startIdleTimer();
397
- // Start heartbeat
398
779
  this.startHeartbeat();
399
- // Process any queued messages (old mechanism, will be replaced by pendingOutbox)
400
- this.processQueuedMessages();
401
- this.debug('[WebSocketClient] Connection established, waiting for initial Ready signal');
780
+ // Protocol v2: every (re)connect opens with a Resume envelope. The
781
+ // server's ResumeAck is the authoritative signal — the old "wait for
782
+ // Ready" handshake is left as a fallback for servers that don't yet
783
+ // understand Resume (they emit Ready first, which we treat as info).
784
+ this.setReadyState(ReadyState.AWAITING_RESUME_ACK, 'Connection opened, sending Resume');
785
+ this.sendResume();
786
+ this.debug('[WebSocketClient] Connection established, waiting for ResumeAck');
402
787
  resolve();
403
788
  });
404
789
  ws.on('message', (data) => {
@@ -407,22 +792,33 @@ export class WebSocketClient extends EventEmitter {
407
792
  this.resetIdleTimer();
408
793
  this.lastPongReceived = Date.now(); // any message is also a ping
409
794
  const message = data.toString();
410
- // Try to detect Ready messages
795
+ // Intercept protocol-v2 control messages (ResumeAck, TurnAck, Ready)
796
+ // before forwarding to the consumer. These are transport concerns.
411
797
  try {
412
798
  const parsed = JSON.parse(message);
413
- if (parsed && parsed.data && parsed.data.tool === 'Ready') {
414
- // Handle Ready message separately
799
+ const tool = parsed?.data?.tool;
800
+ if (tool === 'Ready') {
415
801
  this.handleReadyMessage(parsed);
416
802
  return;
417
803
  }
804
+ if (tool === 'ResumeAck') {
805
+ this.handleResumeAck(parsed);
806
+ return;
807
+ }
808
+ if (tool === 'TurnAck') {
809
+ this.handleTurnAck(parsed);
810
+ return;
811
+ }
418
812
  }
419
813
  catch (e) {
420
- // Not JSON or not a Ready message, continue with normal flow
814
+ // Not JSON or not a control message — fall through to consumer.
421
815
  }
422
816
  // Track that we received a non-control message (response)
423
817
  if (this.readyState === ReadyState.MESSAGE_SENT) {
424
818
  this.receivedResponsesSinceMessage = true;
425
819
  }
820
+ // A domain event after ResumeAck counts as forward progress — clear the stall timer.
821
+ this.clearPostResumeProgressTimer();
426
822
  // Emit non-control messages to AgentAPI
427
823
  this.emit('message', message);
428
824
  });
@@ -533,8 +929,11 @@ export class WebSocketClient extends EventEmitter {
533
929
  this.setState(ConnectionState.FAILED);
534
930
  this.emit('error', new Error('Authentication failed'));
535
931
  }
536
- else if (this.readyState === ReadyState.WAITING_INITIAL_READY) {
537
- // Connection closed during initial handshake - fail fast with clear error
932
+ else if (!this.hasCompletedHandshake) {
933
+ // No successful handshake has completed on this instance. Treat any
934
+ // close here as a startup failure — auth issues, wrong URL, server
935
+ // rejecting the protocol, etc. should surface as a real error rather
936
+ // than enter an indefinite reconnect loop that hides the root cause.
538
937
  this.setState(ConnectionState.FAILED);
539
938
  const errorMessage = this.buildConnectionErrorMessage(code, reason);
540
939
  this.emit('error', new Error(errorMessage));
@@ -573,13 +972,22 @@ export class WebSocketClient extends EventEmitter {
573
972
  this.isReconnecting = true;
574
973
  this.setState(ConnectionState.RECONNECTING);
575
974
  this.reconnectAttempts++;
975
+ this.metrics.reconnects_total++;
976
+ // Stamp the disconnect time on the first reconnect attempt of a cycle so
977
+ // the eventual `reconnected` event can carry an accurate downtime_ms.
978
+ if (this.reconnectAttempts === 1 && !this.lastDisconnectAt) {
979
+ this.lastDisconnectAt = Date.now();
980
+ }
981
+ this.emit('reconnecting', { attempt: this.reconnectAttempts });
576
982
  const delay = this.calculateBackoffDelay(this.reconnectAttempts);
577
983
  this.debug(`Attempting reconnection ${this.reconnectAttempts} in ${delay}ms`);
578
984
  this.reconnectTimer = setTimeout(async () => {
579
985
  try {
580
986
  await this.establishConnection();
581
987
  this.isReconnecting = false;
582
- this.emit('reconnected');
988
+ const downtime_ms = this.lastDisconnectAt ? Date.now() - this.lastDisconnectAt : undefined;
989
+ this.lastDisconnectAt = undefined;
990
+ this.emit('reconnected', { downtime_ms });
583
991
  }
584
992
  catch (error) {
585
993
  this.debug('Reconnection failed:', error instanceof Error ? error.message : error);
@@ -690,6 +1098,9 @@ export class WebSocketClient extends EventEmitter {
690
1098
  this.clearIdleTimer();
691
1099
  this.stopHeartbeat();
692
1100
  this.clearReadyTimer();
1101
+ this.clearResumeAckTimer();
1102
+ this.clearPostResumeProgressTimer();
1103
+ this.clearLegacyFallbackTimer();
693
1104
  if (this.reconnectTimer) {
694
1105
  clearTimeout(this.reconnectTimer);
695
1106
  this.reconnectTimer = undefined;
@@ -704,17 +1115,35 @@ export class WebSocketClient extends EventEmitter {
704
1115
  `State: ${this.state} | ` +
705
1116
  `ReadyState: ${this.readyState} | ` +
706
1117
  `Queued messages: ${this.messageQueue.length} | ` +
707
- `Pending outbox: ${this.pendingOutbox.length}`);
1118
+ `Outbox: ${this.outbox.size}`);
708
1119
  this.clearTimers();
709
1120
  this.disconnect();
710
1121
  this.messageQueue = [];
711
- this.pendingOutbox = [];
1122
+ this.outbox.clear();
1123
+ this.outboxOrder.length = 0;
712
1124
  this.isReconnecting = false;
713
1125
  this.latestCheckpointId = undefined;
714
1126
  this.lastSentMessage = undefined;
1127
+ this.hasCompletedHandshake = false;
715
1128
  this.setReadyState(ReadyState.IDLE, 'Cleanup performed');
716
1129
  this.removeAllListeners();
717
1130
  this.debug('[WebSocketClient] Cleanup completed');
718
1131
  }
1132
+ // --- Test hooks -----------------------------------------------------------
1133
+ // These are intentionally tiny, read-only accessors so the chaos harness
1134
+ // can assert on internal state without resorting to `any` casts. They are
1135
+ // not part of the public SDK API.
1136
+ /** @internal */
1137
+ __getOutboxIds() {
1138
+ return [...this.outboxOrder];
1139
+ }
1140
+ /** @internal */
1141
+ __getOutboxSize() {
1142
+ return this.outbox.size;
1143
+ }
1144
+ /** @internal */
1145
+ __getLatestCheckpointId() {
1146
+ return this.latestCheckpointId;
1147
+ }
719
1148
  }
720
1149
  //# sourceMappingURL=websocket.js.map