@qodo/sdk 0.12.0 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,12 @@ export var ConnectionState;
28
28
  export var ReadyState;
29
29
  (function (ReadyState) {
30
30
  ReadyState["IDLE"] = "IDLE";
31
+ /**
32
+ * Post-connect, waiting for ResumeAck from the server before sending any
33
+ * turn. Replaces the old WAITING_INITIAL_READY — the server's authoritative
34
+ * view is now the precondition for proceeding, not a bare "Ready" signal.
35
+ */
36
+ ReadyState["AWAITING_RESUME_ACK"] = "AWAITING_RESUME_ACK";
31
37
  ReadyState["WAITING_INITIAL_READY"] = "WAITING_INITIAL_READY";
32
38
  ReadyState["READY"] = "READY";
33
39
  ReadyState["MESSAGE_SENT"] = "MESSAGE_SENT";
@@ -73,8 +79,60 @@ export class WebSocketClient extends EventEmitter {
73
79
  latestCheckpointId;
74
80
  lastSentMessage;
75
81
  readyTimer;
76
- pendingOutbox = [];
77
82
  receivedResponsesSinceMessage = false; // Track if any responses received after sending message
83
+ /**
84
+ * Protocol-v2 counters. Public read-only via `getMetrics()` — wired into
85
+ * SDK telemetry so observability dashboards can alert on e.g. rising
86
+ * `reconcile_failures_total` without digging through debug logs.
87
+ */
88
+ metrics = {
89
+ reconnects_total: 0,
90
+ turns_sent_total: 0,
91
+ turns_acked_total: 0,
92
+ turns_replayed_total: 0,
93
+ resume_acks_total: 0,
94
+ reconcile_failures_total: 0,
95
+ stalls_detected_total: 0,
96
+ idempotent_replays_total: 0,
97
+ };
98
+ /** Last time the socket transitioned from CONNECTED to not-CONNECTED; used for downtime metrics. */
99
+ lastDisconnectAt;
100
+ getMetrics() {
101
+ return { ...this.metrics };
102
+ }
103
+ /**
104
+ * Outbox of turns the client has sent (or is about to send) but has not
105
+ * received a TurnAck for. Keyed by turn_id so idempotent replay across
106
+ * reconnect is an O(1) lookup. Only pruned on TurnAck or ResumeAck.
107
+ */
108
+ outbox = new Map();
109
+ /**
110
+ * FIFO order of turn_ids in the outbox. We send one turn at a time — the
111
+ * Ready/MESSAGE_SENT ping-pong enforces this — but the order matters for
112
+ * replay after reconcile, so we track it explicitly rather than relying on
113
+ * Map iteration order across Node versions.
114
+ */
115
+ outboxOrder = [];
116
+ /** SLA timer: Resume must be ack'd within this window or the socket is dead. */
117
+ resumeAckTimer;
118
+ /** SLA timer: post-ResumeAck, the server must produce forward progress or we surface a stall. */
119
+ postResumeProgressTimer;
120
+ /**
121
+ * Short grace period after a bare Ready arrives while we're awaiting a
122
+ * ResumeAck. If no ResumeAck shows up in that window, we assume the
123
+ * backend is a pre-v2 deployment that only emits Ready and fall back —
124
+ * instead of hanging for 15s until the ResumeAck SLA closes the socket.
125
+ */
126
+ legacyFallbackTimer;
127
+ /**
128
+ * Flips true on the first transition to READY and stays true for the life
129
+ * of the instance. Represents "initial handshake has completed at least
130
+ * once." Used by ``handleClose`` to distinguish startup failures (fail
131
+ * fast with a real error) from mid-session drops (enter reconnect loop).
132
+ * Decoupling this from specific ReadyState values means new handshake
133
+ * states can be added without silently changing close behavior.
134
+ */
135
+ hasCompletedHandshake = false;
78
136
  /** Expose current ready state so callers can detect post-await state changes. */
79
137
  getReadyState() {
80
138
  return this.readyState;
@@ -101,6 +159,18 @@ export class WebSocketClient extends EventEmitter {
101
159
  if (this.readyState !== newState) {
102
160
  const oldState = this.readyState;
103
161
  this.readyState = newState;
162
+ // First entry into READY marks the handshake as complete. Any close
163
+ // before this point is a startup failure; any close after is a
164
+ // mid-session drop that should trigger the reconnect loop.
165
+ if (newState === ReadyState.READY && !this.hasCompletedHandshake) {
166
+ this.hasCompletedHandshake = true;
167
+ }
168
+ // Leaving AWAITING_RESUME_ACK — either we got what we wanted
169
+ // (ResumeAck) or we gave up. Either way the legacy-fallback grace
170
+ // timer is no longer relevant.
171
+ if (oldState === ReadyState.AWAITING_RESUME_ACK) {
172
+ this.clearLegacyFallbackTimer();
173
+ }
104
174
  this.debug(`[WebSocketClient] Ready state transition: ${oldState} → ${newState}` +
105
175
  (context ? ` | Context: ${context}` : '') +
106
176
  ` | Session: ${this.sessionId?.substring(0, 8)}...` +
@@ -108,6 +178,308 @@ export class WebSocketClient extends EventEmitter {
108
178
  this.emit('readyStateChanged', { oldState, newState, context });
109
179
  }
110
180
  }
181
+ /**
182
+ * SLA for the Resume→ResumeAck round trip. Tuned to cover a cold graph load
183
+ * from postgres with some headroom, but tight enough that a silently dead
184
+ * socket reveals itself before the outer timeout layer even notices.
185
+ */
186
+ static RESUME_ACK_TIMEOUT_MS = 15000;
187
+ /**
188
+ * SLA for "something must happen after ResumeAck." If the server says it's
189
+ * awaiting nothing and we also have nothing to send, we stay in READY (not
190
+ * a stall). Otherwise a quiet socket for this long is a stall.
191
+ */
192
+ static POST_RESUME_PROGRESS_TIMEOUT_MS = 30000;
193
+ /**
194
+ * Grace period after a bare Ready arrives in AWAITING_RESUME_ACK. Long
195
+ * enough to cover the backend's ResumeAck round-trip latency, short
196
+ * enough that a true legacy server doesn't hang the SDK startup.
197
+ */
198
+ static LEGACY_READY_FALLBACK_MS = 2000;
199
+ addToOutbox(turn) {
200
+ if (this.outbox.has(turn.turn_id))
201
+ return;
202
+ this.outbox.set(turn.turn_id, turn);
203
+ this.outboxOrder.push(turn.turn_id);
204
+ this.metrics.turns_sent_total++;
205
+ }
206
+ removeFromOutbox(turnId) {
207
+ if (!this.outbox.delete(turnId))
208
+ return false;
209
+ const idx = this.outboxOrder.indexOf(turnId);
210
+ if (idx >= 0)
211
+ this.outboxOrder.splice(idx, 1);
212
+ return true;
213
+ }
214
+ outboxSnapshot() {
215
+ return this.outboxOrder
216
+ .map(id => this.outbox.get(id))
217
+ .filter((t) => t !== undefined);
218
+ }
219
+ /** Build the client half of a PendingTurn for the Resume envelope. */
220
+ pendingTurnDescriptor(turn) {
221
+ if (turn.type === 'UserQuery') {
222
+ return { turn_id: turn.turn_id, type: 'prompt' };
223
+ }
224
+ return {
225
+ turn_id: turn.turn_id,
226
+ type: 'tool_result',
227
+ ...(turn.tool_call_id ? { tool_call_id: turn.tool_call_id } : {}),
228
+ };
229
+ }
230
+ clearResumeAckTimer() {
231
+ if (this.resumeAckTimer) {
232
+ clearTimeout(this.resumeAckTimer);
233
+ this.resumeAckTimer = undefined;
234
+ }
235
+ }
236
+ clearPostResumeProgressTimer() {
237
+ if (this.postResumeProgressTimer) {
238
+ clearTimeout(this.postResumeProgressTimer);
239
+ this.postResumeProgressTimer = undefined;
240
+ }
241
+ }
242
+ startResumeAckTimer() {
243
+ this.clearResumeAckTimer();
244
+ this.resumeAckTimer = setTimeout(() => {
245
+ this.debug(`[WebSocketClient] ResumeAck timeout expired | ` +
246
+ `State: ${this.readyState} | ` +
247
+ `Session: ${this.sessionId?.substring(0, 8)}...`);
248
+ // The server didn't reply to Resume — treat the socket as dead and cycle it.
249
+ try {
250
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
251
+ this.ws.close(4001, 'ResumeAck timeout');
252
+ }
253
+ }
254
+ catch { /* cleanup: forcing close on an already-closing socket is expected to no-op */ }
255
+ this.emit('resumeAckTimeout');
256
+ }, WebSocketClient.RESUME_ACK_TIMEOUT_MS);
257
+ this.resumeAckTimer.unref?.();
258
+ }
259
+ clearLegacyFallbackTimer() {
260
+ if (this.legacyFallbackTimer) {
261
+ clearTimeout(this.legacyFallbackTimer);
262
+ this.legacyFallbackTimer = undefined;
263
+ }
264
+ }
265
+ /**
266
+ * Start the legacy-server fallback grace period.
267
+ *
268
+ * Called when we receive a bare ``Ready`` while still in
269
+ * ``AWAITING_RESUME_ACK``. In the normal v2 flow the server sends Ready
270
+ * first and ResumeAck right after — we want to wait for ResumeAck. But if
271
+ * the server is a pre-v2 deployment that never emits ResumeAck, waiting
272
+ * for the 15s SLA would stall every startup. Instead, we wait a short
273
+ * grace period; if no ResumeAck arrives, we treat the Ready we already
274
+ * saw as a legacy-protocol signal and transition to READY.
275
+ */
276
+ startLegacyFallbackTimer() {
277
+ // Idempotent — multiple Ready messages shouldn't pile up timers.
278
+ if (this.legacyFallbackTimer)
279
+ return;
280
+ this.legacyFallbackTimer = setTimeout(() => {
281
+ this.legacyFallbackTimer = undefined;
282
+ if (this.readyState !== ReadyState.AWAITING_RESUME_ACK)
283
+ return;
284
+ this.debug(`[WebSocketClient] Legacy fallback: Ready seen without ResumeAck, ` +
285
+ `assuming pre-v2 backend | outbox: ${this.outbox.size}`);
286
+ // Cancel the longer ResumeAck SLA — we've decided the server doesn't speak it.
287
+ this.clearResumeAckTimer();
288
+ this.setReadyState(ReadyState.READY, 'Legacy fallback after Ready without ResumeAck');
289
+ // Drain anything queued. Legacy flow has no TurnAck, so outbox
290
+ // entries will linger; a later reconnect (against a v2 backend)
291
+ // would reconcile them, but that's out of scope for this path.
292
+ void this.processOutbox();
293
+ }, WebSocketClient.LEGACY_READY_FALLBACK_MS);
294
+ this.legacyFallbackTimer.unref?.();
295
+ }
296
+ startPostResumeProgressTimer() {
297
+ this.clearPostResumeProgressTimer();
298
+ this.postResumeProgressTimer = setTimeout(() => {
299
+ this.debug(`[WebSocketClient] Post-ResumeAck stall | ` +
300
+ `State: ${this.readyState} | ` +
301
+ `Outbox size: ${this.outbox.size} | ` +
302
+ `Session: ${this.sessionId?.substring(0, 8)}...`);
303
+ this.metrics.stalls_detected_total++;
304
+ this.emit('stallDetected', {
305
+ stage: 'post_resume',
306
+ outboxSize: this.outbox.size,
307
+ });
308
+ }, WebSocketClient.POST_RESUME_PROGRESS_TIMEOUT_MS);
309
+ this.postResumeProgressTimer.unref?.();
310
+ }
311
+ /**
312
+ * Send the protocol-v2 Resume envelope. Called on every (re)connect before
313
+ * any turn can flow. Gives the backend enough state to reply with an
314
+ * authoritative ResumeAck.
315
+ */
316
+ sendResume() {
317
+ if (!this.ws || this.state !== ConnectionState.CONNECTED) {
318
+ this.debug('[WebSocketClient] sendResume: socket not connected, skipping');
319
+ return;
320
+ }
321
+ const payload = {
322
+ last_seen_checkpoint: this.latestCheckpointId ?? null,
323
+ pending_turns: this.outboxSnapshot().map(t => this.pendingTurnDescriptor(t)),
324
+ };
325
+ const formatted = `Resume ${JSON.stringify(payload)}\n`;
326
+ try {
327
+ this.ws.send(formatted);
328
+ this.debug(`[WebSocketClient] Sent Resume | ` +
329
+ `pending_turns: ${payload.pending_turns.length} | ` +
330
+ `last_seen_checkpoint: ${(this.latestCheckpointId ?? 'none').toString().substring(0, 8)}`);
331
+ this.setReadyState(ReadyState.AWAITING_RESUME_ACK, 'Resume sent');
332
+ this.startResumeAckTimer();
333
+ }
334
+ catch (err) {
335
+ this.debug('[WebSocketClient] Error sending Resume:', err);
336
+ }
337
+ }
338
+ /**
339
+ * Apply the server's authoritative view. This is the anti-deadlock primitive:
340
+ * every reconnect path runs through here and produces an explicit decision
341
+ * instead of both sides waiting on each other.
342
+ *
343
+ * Reconcile only runs on the FIRST ResumeAck of a (re)connect — the one we
344
+ * were explicitly waiting for. Any subsequent ResumeAcks (e.g. the server
345
+ * re-emits one as a loop-top heartbeat, or replies to our Resume probe) are
346
+ * informational; acting on them would double-send turns we've already
347
+ * replayed.
348
+ */
349
+ async handleResumeAck(message) {
350
+ const args = message?.data?.tool_args ?? {};
351
+ const currentCheckpoint = args.current_checkpoint;
352
+ const lastConsumedTurnId = args.last_consumed_turn_id;
353
+ const awaiting = args.awaiting ?? { type: 'none' };
354
+ const awaitingType = awaiting.type ?? 'none';
355
+ const awaitingToolCallId = awaiting.tool_call_id;
356
+ this.clearResumeAckTimer();
357
+ if (currentCheckpoint && currentCheckpoint !== this.latestCheckpointId) {
358
+ this.latestCheckpointId = currentCheckpoint;
359
+ }
360
+ const isReconcilePhase = this.readyState === ReadyState.AWAITING_RESUME_ACK;
361
+ this.metrics.resume_acks_total++;
362
+ this.emit('resumeAck', {
363
+ current_checkpoint: currentCheckpoint,
364
+ last_consumed_turn_id: lastConsumedTurnId,
365
+ awaiting: { type: awaitingType, tool_call_id: awaitingToolCallId },
366
+ outbox_size: this.outbox.size,
367
+ });
368
+ if (!isReconcilePhase) {
369
+ this.debug(`[WebSocketClient] Informational ResumeAck (already reconciled) | ` +
370
+ `State: ${this.readyState}`);
371
+ return;
372
+ }
373
+ // 1) Drop turns the server has already consumed (implicit ack via Resume).
374
+ if (lastConsumedTurnId && this.outbox.has(lastConsumedTurnId)) {
375
+ this.removeFromOutbox(lastConsumedTurnId);
376
+ this.debug(`[WebSocketClient] ResumeAck dropped acked turn from outbox | ` +
377
+ `turn_id: ${lastConsumedTurnId}`);
378
+ }
379
+ // 2) Reconcile what the server is waiting for against what we hold.
380
+ let reconcileFailedReason;
381
+ let resumeToSend;
382
+ if (awaitingType === 'tool_result') {
383
+ // Find a pending tool_result for the exact tool_call_id the server wants.
384
+ const candidate = this.outboxSnapshot().find(t => t.type === 'IDERetrievalAnswer' && t.tool_call_id === awaitingToolCallId);
385
+ if (candidate) {
386
+ resumeToSend = candidate;
387
+ }
388
+ else {
389
+ // Look for a MISMATCHING tool_result in the outbox — that's a true
390
+ // conflict (server awaits Y, client holds X). Emit reconcileFailed so
391
+ // the consumer can decide whether to abort or recompute.
392
+ const mismatchingToolResult = this.outboxSnapshot().find(t => t.type === 'IDERetrievalAnswer' && t.tool_call_id !== awaitingToolCallId);
393
+ if (mismatchingToolResult) {
394
+ reconcileFailedReason = awaitingToolCallId
395
+ ? `Server awaits tool_result for tool_call_id=${awaitingToolCallId}, but outbox holds tool_result for ${mismatchingToolResult.tool_call_id}`
396
+ : 'Server awaits tool_result but outbox holds tool_result for a different tool_call_id';
397
+ }
398
+ // Otherwise: outbox is empty (or holds only UserQuery). This is a
399
+ // legitimate in-flight state — the tool is executing in the MCP layer
400
+ // and will send the tool_result when done. No failure here.
401
+ }
402
+ }
403
+ else if (awaitingType === 'prompt' || awaitingType === 'none') {
404
+ // Server is idle — any leftover outbox turns are stale consumed or never-delivered
405
+ // probes. We keep prompt-style outbox entries (they will flow via normal send),
406
+ // but drop any stale tool_results since the graph is no longer at that step.
407
+ for (const turn of this.outboxSnapshot()) {
408
+ if (turn.type === 'IDERetrievalAnswer') {
409
+ this.removeFromOutbox(turn.turn_id);
410
+ this.debug(`[WebSocketClient] ResumeAck dropped stale tool_result from outbox | ` +
411
+ `turn_id: ${turn.turn_id}`);
412
+ }
413
+ }
414
+ // If we still have a UserQuery queued (client sent a prompt but never saw TurnAck)
415
+ // we should replay it.
416
+ const promptCandidate = this.outboxSnapshot().find(t => t.type === 'UserQuery');
417
+ if (promptCandidate)
418
+ resumeToSend = promptCandidate;
419
+ }
420
+ if (reconcileFailedReason) {
421
+ this.debug(`[WebSocketClient] Reconcile failed | ${reconcileFailedReason} | ` +
422
+ `outbox: ${this.outbox.size}`);
423
+ this.metrics.reconcile_failures_total++;
424
+ this.emit('reconcileFailed', { reason: reconcileFailedReason, awaiting });
425
+ // Don't lock up — transition to READY so the consumer's own error handling
426
+ // can decide what to do. The consumer can still send a new prompt.
427
+ this.setReadyState(ReadyState.READY, 'Reconcile failed');
428
+ return;
429
+ }
430
+ // 3) Happy path: transition to READY and replay anything still pending.
431
+ this.setReadyState(ReadyState.READY, 'ResumeAck reconciled');
432
+ if (resumeToSend) {
433
+ this.startPostResumeProgressTimer();
434
+ this.metrics.turns_replayed_total++;
435
+ await this.sendImmediately({
436
+ type: resumeToSend.type,
437
+ data: resumeToSend.data,
438
+ timestamp: resumeToSend.timestamp,
439
+ });
440
+ this.receivedResponsesSinceMessage = false;
441
+ this.setReadyState(ReadyState.MESSAGE_SENT, `Replayed ${resumeToSend.type} after reconcile`);
442
+ this.startReadyTimer();
443
+ }
444
+ else if (this.outbox.size > 0) {
445
+ // No specific replay target, but we have queued work — drain it.
446
+ await this.processOutbox();
447
+ }
448
+ }
449
+ handleTurnAck(message) {
450
+ const args = message?.data?.tool_args ?? {};
451
+ const turnId = args.turn_id;
452
+ const checkpointId = args.checkpoint_id;
453
+ if (!turnId) {
454
+ this.debug('[WebSocketClient] TurnAck received without turn_id, ignoring');
455
+ return;
456
+ }
457
+ const removed = this.removeFromOutbox(turnId);
458
+ if (checkpointId)
459
+ this.latestCheckpointId = checkpointId;
460
+ this.clearPostResumeProgressTimer();
461
+ // TurnAck is the v2 completion signal — the in-flight turn is consumed,
462
+ // so the TurnAck/Ready SLA started on send is no longer needed.
463
+ this.clearReadyTimer();
464
+ // Transition out of MESSAGE_SENT so the next queued turn can drain and
465
+ // a subsequent legacy-Ready (informational) no longer looks mid-send.
466
+ if (this.readyState === ReadyState.MESSAGE_SENT) {
467
+ this.setReadyState(ReadyState.READY, 'TurnAck received — turn durably consumed');
468
+ // If more turns piled up while we were waiting, drain one now.
469
+ void this.processOutbox();
470
+ }
471
+ this.metrics.turns_acked_total++;
472
+ // A TurnAck for a turn we no longer have in the outbox means the server
473
+ // re-ACK'd an already-dropped turn (idempotent replay confirmation).
474
+ if (!removed)
475
+ this.metrics.idempotent_replays_total++;
476
+ this.debug(`[WebSocketClient] TurnAck received | ` +
477
+ `turn_id: ${turnId} | ` +
478
+ `checkpoint: ${checkpointId?.substring(0, 8) || 'none'} | ` +
479
+ `outbox_removed: ${removed} | ` +
480
+ `outbox_remaining: ${this.outbox.size}`);
481
+ this.emit('turnAck', { turn_id: turnId, checkpoint_id: checkpointId });
482
+ }
111
483
  async handleReadyMessage(message) {
112
484
  const checkpointId = message.data?.tool_args?.checkpoint_id;
113
485
  const previousCheckpoint = this.latestCheckpointId;
@@ -124,6 +496,15 @@ export class WebSocketClient extends EventEmitter {
124
496
  }
125
497
  // Handle based on current state
126
498
  switch (this.readyState) {
499
+ case ReadyState.AWAITING_RESUME_ACK:
500
+ // In the normal v2 flow the server sends Ready then ResumeAck;
501
+ // Ready here is informational and we stay put until ResumeAck.
502
+ // Start a short legacy-fallback grace period so that a pre-v2
503
+ // backend (Ready only, no ResumeAck) doesn't stall the whole
504
+ // 15s ResumeAck SLA before we make progress.
505
+ this.debug('[WebSocketClient] Ready received while awaiting ResumeAck — starting legacy fallback grace');
506
+ this.startLegacyFallbackTimer();
507
+ break;
127
508
  case ReadyState.WAITING_INITIAL_READY:
128
509
  this.clearReadyTimer();
129
510
  this.setReadyState(ReadyState.READY, 'Initial Ready received after connection');
@@ -135,21 +516,20 @@ export class WebSocketClient extends EventEmitter {
135
516
  await this.processOutbox();
136
517
  break;
137
518
  case ReadyState.MESSAGE_SENT:
138
- // Ready in MESSAGE_SENT should only be accepted if responses were received
139
- if (!this.receivedResponsesSinceMessage) {
140
- this.debug(`[WebSocketClient] Unexpected Ready without responses | ` +
141
- `Checkpoint: ${checkpointId?.substring(0, 8) || 'none'} | ` +
142
- `This indicates server failed to process the message`);
143
- // Trigger checkpoint recovery - server acknowledged but didn't process
144
- this.clearReadyTimer();
145
- await this.initiateCheckpointRecovery('Ready received without any responses (server processing failure)');
146
- return;
147
- }
148
- // Normal case: Ready after responses signals server finished processing
149
- this.debug('[WebSocketClient] Ready received, responses complete');
150
- this.clearReadyTimer();
151
- this.setReadyState(ReadyState.READY, 'Ready received, server finished processing');
152
- await this.processOutbox();
519
+ // Under protocol-v2, ``TurnAck`` is the authoritative signal that our
520
+ // message was consumed — not ``Ready``. The backend sends Ready at
521
+ // the top of every loop iteration (including between turns and after
522
+ // handling a Resume probe), so Ready can legitimately arrive while
523
+ // we're still waiting on TurnAck — sometimes even before the graph's
524
+ // own output lands, purely due to network ordering. Acting on it
525
+ // (the old "server failed to process" path that force-reconnected)
526
+ // created a tight infinite loop where every UserQuery triggered a
527
+ // spurious reconnect.
528
+ //
529
+ // Treat Ready here as informational. The timer started in
530
+ // ``sendMessage`` / ``processOutbox`` still bounds how long we wait
531
+ // for TurnAck, and ``handleTurnAck`` is the real state transition.
532
+ this.debug('[WebSocketClient] Ready received in MESSAGE_SENT — ignoring (v2 relies on TurnAck)');
153
533
  break;
154
534
  case ReadyState.READY:
155
535
  this.debug('[WebSocketClient] Ready received while already in READY state (duplicate)');
@@ -189,32 +569,48 @@ export class WebSocketClient extends EventEmitter {
189
569
  async sendMessage(type, data) {
190
570
  // Reset idle timer on any outgoing message
191
571
  this.resetIdleTimer();
192
- const messageToSend = { type, data, timestamp: Date.now() };
572
+ // Ensure every turn carries a turn_id — the backend idempotency key.
573
+ // Accepts a caller-supplied id (rare; only tests need that) and otherwise
574
+ // mints one here so existing call sites don't need updates.
575
+ const turn_id = (data && typeof data === 'object' && typeof data.turn_id === 'string' && data.turn_id)
576
+ ? data.turn_id
577
+ : uuid();
578
+ const dataWithTurnId = (data && typeof data === 'object')
579
+ ? { ...data, turn_id }
580
+ : { turn_id, payload: data };
581
+ const tool_call_id = type === 'IDERetrievalAnswer'
582
+ ? (dataWithTurnId.tool_id ?? dataWithTurnId.tool_call_id)
583
+ : undefined;
584
+ const turn = {
585
+ turn_id,
586
+ type,
587
+ data: dataWithTurnId,
588
+ tool_call_id,
589
+ timestamp: Date.now(),
590
+ };
591
+ this.addToOutbox(turn);
193
592
  this.debug(`[WebSocketClient] sendMessage called | ` +
194
593
  `Type: ${type} | ` +
594
+ `turn_id: ${turn_id} | ` +
195
595
  `ReadyState: ${this.readyState} | ` +
196
- `ConnectionState: ${this.state}`);
197
- // Save as last sent message for potential recovery
198
- this.lastSentMessage = messageToSend;
199
- this.debug(`[WebSocketClient] Last sent message cached | ` +
200
- `Type: ${type} | ` +
201
- `Timestamp: ${messageToSend.timestamp}`);
596
+ `ConnectionState: ${this.state} | ` +
597
+ `outbox_size: ${this.outbox.size}`);
598
+ // Save as last sent message for potential recovery (legacy path)
599
+ this.lastSentMessage = { type, data: dataWithTurnId, timestamp: turn.timestamp };
202
600
  if (this.readyState === ReadyState.READY && this.ws && this.state === ConnectionState.CONNECTED) {
203
- // Send immediately
204
- this.receivedResponsesSinceMessage = false; // Reset flag when sending new message
205
- await this.sendImmediately(messageToSend);
601
+ this.receivedResponsesSinceMessage = false;
602
+ await this.sendImmediately({ type, data: dataWithTurnId, timestamp: turn.timestamp });
206
603
  this.setReadyState(ReadyState.MESSAGE_SENT, `Sent ${type} message`);
207
- this.startReadyTimer(); // Expect Ready after server processes message
604
+ this.startReadyTimer();
208
605
  }
209
606
  else {
210
- // Queue until Ready
211
- this.pendingOutbox.push(messageToSend);
212
- this.debug(`[WebSocketClient] Message queued | ` +
607
+ this.debug(`[WebSocketClient] Turn parked in outbox | ` +
213
608
  `Type: ${type} | ` +
214
- `Queue size: ${this.pendingOutbox.length} | ` +
609
+ `turn_id: ${turn_id} | ` +
610
+ `outbox_size: ${this.outbox.size} | ` +
215
611
  `Reason: ReadyState=${this.readyState}, Connected=${this.state === ConnectionState.CONNECTED}`);
216
612
  if (this.state === ConnectionState.DISCONNECTED) {
217
- this.debug('[WebSocketClient] Initiating connection due to queued message');
613
+ this.debug('[WebSocketClient] Initiating connection due to queued turn');
218
614
  await this.connect();
219
615
  }
220
616
  }
@@ -226,29 +622,38 @@ export class WebSocketClient extends EventEmitter {
226
622
  `Type: ${msg.type} | ` +
227
623
  `Size: ${formatted.length} bytes`);
228
624
  }
625
+ /**
626
+ * Drain the outbox into the socket. Only one turn is in-flight at a time
627
+ * (MESSAGE_SENT gate enforces this). Called on ResumeAck (after reconcile)
628
+ * and on transitions back to READY.
629
+ */
229
630
  async processOutbox() {
230
631
  this.debug(`[WebSocketClient] Processing outbox | ` +
231
- `Queue size: ${this.pendingOutbox.length} | ` +
632
+ `outbox_size: ${this.outbox.size} | ` +
232
633
  `ReadyState: ${this.readyState}`);
233
- if (this.pendingOutbox.length > 0 && this.readyState === ReadyState.READY) {
234
- const msg = this.pendingOutbox.shift();
235
- this.debug(`[WebSocketClient] Sending queued message | ` +
236
- `Type: ${msg.type} | ` +
237
- `Queued at: ${msg.timestamp} | ` +
238
- `Wait time: ${Date.now() - msg.timestamp}ms | ` +
239
- `Remaining in queue: ${this.pendingOutbox.length}`);
240
- this.receivedResponsesSinceMessage = false; // Reset flag when sending queued message
241
- await this.sendImmediately(msg);
242
- this.setReadyState(ReadyState.MESSAGE_SENT, `Sent queued ${msg.type} message`);
243
- this.startReadyTimer(); // Expect Ready after server processes message
244
- }
245
- else if (this.pendingOutbox.length === 0) {
634
+ if (this.outbox.size === 0) {
246
635
  this.debug('[WebSocketClient] Outbox is empty, no messages to process');
636
+ return;
247
637
  }
248
- else {
638
+ if (this.readyState !== ReadyState.READY) {
249
639
  this.debug(`[WebSocketClient] Cannot process outbox | ` +
250
640
  `ReadyState: ${this.readyState} (expected READY)`);
641
+ return;
251
642
  }
643
+ const nextId = this.outboxOrder[0];
644
+ const turn = nextId ? this.outbox.get(nextId) : undefined;
645
+ if (!turn)
646
+ return;
647
+ this.debug(`[WebSocketClient] Sending queued turn | ` +
648
+ `Type: ${turn.type} | ` +
649
+ `turn_id: ${turn.turn_id} | ` +
650
+ `Queued at: ${turn.timestamp} | ` +
651
+ `Wait: ${Date.now() - turn.timestamp}ms | ` +
652
+ `outbox_remaining: ${this.outbox.size}`);
653
+ this.receivedResponsesSinceMessage = false;
654
+ await this.sendImmediately({ type: turn.type, data: turn.data, timestamp: turn.timestamp });
655
+ this.setReadyState(ReadyState.MESSAGE_SENT, `Sent queued ${turn.type} message`);
656
+ this.startReadyTimer();
252
657
  }
253
658
  startReadyTimer() {
254
659
  this.clearReadyTimer();
@@ -268,44 +673,32 @@ export class WebSocketClient extends EventEmitter {
268
673
  this.readyTimer = undefined;
269
674
  }
270
675
  }
676
+ /**
677
+ * Legacy recovery entry point.
678
+ *
679
+ * Under protocol v2 the outbox holds every un-ACK'd turn by turn_id and the
680
+ * Resume/ResumeAck handshake replays it authoritatively. All we need to do
681
+ * here is cycle the socket — the open handler will send Resume, the server
682
+ * will reply with ResumeAck, and reconciliation will happen automatically.
683
+ */
271
684
  async initiateCheckpointRecovery(reason) {
272
- this.debug(`[WebSocketClient] ===== CHECKPOINT RECOVERY INITIATED ===== | ` +
685
+ this.debug(`[WebSocketClient] Forcing reconnect for recovery | ` +
273
686
  `Reason: ${reason} | ` +
274
687
  `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
275
- `Last message type: ${this.lastSentMessage?.type} | ` +
276
- `Current state: ${this.readyState}`);
277
- this.setReadyState(ReadyState.CHECKPOINT_RECOVERY, reason);
278
- // Disconnect current connection
279
- this.debug('[WebSocketClient] Disconnecting for checkpoint recovery');
280
- this.disconnect();
281
- // Reconnect with checkpoint (URL will include checkpoint_id)
282
- this.debug(`[WebSocketClient] Reconnecting with checkpoint | ` +
283
- `Checkpoint ID: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
284
- `Session ID: ${this.sessionId?.substring(0, 8)}...`);
688
+ `outbox_size: ${this.outbox.size} | ` +
689
+ `State: ${this.readyState}`);
690
+ this.emit('checkpointRecovery', { reason, checkpoint: this.latestCheckpointId });
285
691
  try {
286
- await this.establishConnection();
287
- this.debug('[WebSocketClient] Reconnection successful during recovery');
288
- // Wait for Ready state is set in handleReadyMessage when initial Ready is received
289
- // Resend last message once Ready received
290
- if (this.lastSentMessage) {
291
- this.pendingOutbox.unshift(this.lastSentMessage);
292
- this.debug(`[WebSocketClient] Last message re-queued for recovery | ` +
293
- `Type: ${this.lastSentMessage.type} | ` +
294
- `Original timestamp: ${this.lastSentMessage.timestamp}`);
295
- }
296
- else {
297
- this.debug('[WebSocketClient] No last message to resend during recovery');
692
+ // Close the socket hard. handleClose will schedule reconnection via
693
+ // attemptReconnection, and the open handler will send a fresh Resume.
694
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
695
+ this.ws.close(4002, `Recovery: ${reason}`);
298
696
  }
299
- // Emit event for monitoring
300
- this.emit('checkpointRecovery', { reason, checkpoint: this.latestCheckpointId });
301
697
  }
302
698
  catch (error) {
303
- this.debug(`[WebSocketClient] Checkpoint recovery failed | ` +
699
+ this.debug(`[WebSocketClient] Recovery reconnect failed | ` +
304
700
  `Error: ${error instanceof Error ? error.message : error}`);
305
- this.setReadyState(ReadyState.IDLE, 'Recovery failed');
306
- throw error;
307
701
  }
308
- this.debug('[WebSocketClient] ===== CHECKPOINT RECOVERY COMPLETED =====');
309
702
  }
310
703
  disconnect() {
311
704
  // Hold a reference to the current socket to close it safely
@@ -388,17 +781,18 @@ export class WebSocketClient extends EventEmitter {
388
781
  this.emit('connected');
389
782
  this.debug(`[WebSocketClient] WebSocket connected | ` +
390
783
  `Session: ${this.sessionId?.substring(0, 8)}... | ` +
391
- `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'}`);
392
- // Transition to waiting for initial Ready
393
- this.setReadyState(ReadyState.WAITING_INITIAL_READY, 'Connection opened, awaiting initial Ready');
394
- this.startReadyTimer(); // Wait for initial Ready
395
- // Start idle timer immediately upon connection
784
+ `Checkpoint: ${this.latestCheckpointId?.substring(0, 8) || 'none'} | ` +
785
+ `outbox: ${this.outbox.size}`);
786
+ // Start idle timer and heartbeat immediately so dead sockets surface.
396
787
  this.startIdleTimer();
397
- // Start heartbeat
398
788
  this.startHeartbeat();
399
- // Process any queued messages (old mechanism, will be replaced by pendingOutbox)
400
- this.processQueuedMessages();
401
- this.debug('[WebSocketClient] Connection established, waiting for initial Ready signal');
789
+ // Protocol v2: every (re)connect opens with a Resume envelope. The
790
+ // server's ResumeAck is the authoritative signal — the old "wait for
791
+ // Ready" handshake is left as a fallback for servers that don't yet
792
+ // understand Resume (they emit Ready first, which we treat as info).
793
+ this.setReadyState(ReadyState.AWAITING_RESUME_ACK, 'Connection opened, sending Resume');
794
+ this.sendResume();
795
+ this.debug('[WebSocketClient] Connection established, waiting for ResumeAck');
402
796
  resolve();
403
797
  });
404
798
  ws.on('message', (data) => {
@@ -407,22 +801,33 @@ export class WebSocketClient extends EventEmitter {
407
801
  this.resetIdleTimer();
408
802
  this.lastPongReceived = Date.now(); // any message is also a ping
409
803
  const message = data.toString();
410
- // Try to detect Ready messages
804
+ // Intercept protocol-v2 control messages (ResumeAck, TurnAck, Ready)
805
+ // before forwarding to the consumer. These are transport concerns.
411
806
  try {
412
807
  const parsed = JSON.parse(message);
413
- if (parsed && parsed.data && parsed.data.tool === 'Ready') {
414
- // Handle Ready message separately
808
+ const tool = parsed?.data?.tool;
809
+ if (tool === 'Ready') {
415
810
  this.handleReadyMessage(parsed);
416
811
  return;
417
812
  }
813
+ if (tool === 'ResumeAck') {
814
+ this.handleResumeAck(parsed);
815
+ return;
816
+ }
817
+ if (tool === 'TurnAck') {
818
+ this.handleTurnAck(parsed);
819
+ return;
820
+ }
418
821
  }
419
822
  catch (e) {
420
- // Not JSON or not a Ready message, continue with normal flow
823
+ // Not JSON or not a control message — fall through to consumer.
421
824
  }
422
825
  // Track that we received a non-control message (i.e. a response; Ready, ResumeAck and TurnAck returned above)
423
826
  if (this.readyState === ReadyState.MESSAGE_SENT) {
424
827
  this.receivedResponsesSinceMessage = true;
425
828
  }
829
+ // A domain event after ResumeAck counts as forward progress — clear the stall timer.
830
+ this.clearPostResumeProgressTimer();
426
831
  // Emit non-control messages (everything except Ready, ResumeAck and TurnAck) to AgentAPI
427
832
  this.emit('message', message);
428
833
  });
@@ -533,8 +938,11 @@ export class WebSocketClient extends EventEmitter {
533
938
  this.setState(ConnectionState.FAILED);
534
939
  this.emit('error', new Error('Authentication failed'));
535
940
  }
536
- else if (this.readyState === ReadyState.WAITING_INITIAL_READY) {
537
- // Connection closed during initial handshake - fail fast with clear error
941
+ else if (!this.hasCompletedHandshake) {
942
+ // No successful handshake has completed on this instance. Treat any
943
+ // close here as a startup failure — auth issues, wrong URL, server
944
+ // rejecting the protocol, etc. should surface as a real error rather
945
+ // than enter an indefinite reconnect loop that hides the root cause.
538
946
  this.setState(ConnectionState.FAILED);
539
947
  const errorMessage = this.buildConnectionErrorMessage(code, reason);
540
948
  this.emit('error', new Error(errorMessage));
@@ -573,13 +981,22 @@ export class WebSocketClient extends EventEmitter {
573
981
  this.isReconnecting = true;
574
982
  this.setState(ConnectionState.RECONNECTING);
575
983
  this.reconnectAttempts++;
984
+ this.metrics.reconnects_total++;
985
+ // Stamp the disconnect time on the first reconnect attempt of a cycle so
986
+ // the eventual `reconnected` event can carry an accurate downtime_ms.
987
+ if (this.reconnectAttempts === 1 && !this.lastDisconnectAt) {
988
+ this.lastDisconnectAt = Date.now();
989
+ }
990
+ this.emit('reconnecting', { attempt: this.reconnectAttempts });
576
991
  const delay = this.calculateBackoffDelay(this.reconnectAttempts);
577
992
  this.debug(`Attempting reconnection ${this.reconnectAttempts} in ${delay}ms`);
578
993
  this.reconnectTimer = setTimeout(async () => {
579
994
  try {
580
995
  await this.establishConnection();
581
996
  this.isReconnecting = false;
582
- this.emit('reconnected');
997
+ const downtime_ms = this.lastDisconnectAt ? Date.now() - this.lastDisconnectAt : undefined;
998
+ this.lastDisconnectAt = undefined;
999
+ this.emit('reconnected', { downtime_ms });
583
1000
  }
584
1001
  catch (error) {
585
1002
  this.debug('Reconnection failed:', error instanceof Error ? error.message : error);
@@ -690,6 +1107,9 @@ export class WebSocketClient extends EventEmitter {
690
1107
  this.clearIdleTimer();
691
1108
  this.stopHeartbeat();
692
1109
  this.clearReadyTimer();
1110
+ this.clearResumeAckTimer();
1111
+ this.clearPostResumeProgressTimer();
1112
+ this.clearLegacyFallbackTimer();
693
1113
  if (this.reconnectTimer) {
694
1114
  clearTimeout(this.reconnectTimer);
695
1115
  this.reconnectTimer = undefined;
@@ -704,17 +1124,35 @@ export class WebSocketClient extends EventEmitter {
704
1124
  `State: ${this.state} | ` +
705
1125
  `ReadyState: ${this.readyState} | ` +
706
1126
  `Queued messages: ${this.messageQueue.length} | ` +
707
- `Pending outbox: ${this.pendingOutbox.length}`);
1127
+ `Outbox: ${this.outbox.size}`);
708
1128
  this.clearTimers();
709
1129
  this.disconnect();
710
1130
  this.messageQueue = [];
711
- this.pendingOutbox = [];
1131
+ this.outbox.clear();
1132
+ this.outboxOrder.length = 0;
712
1133
  this.isReconnecting = false;
713
1134
  this.latestCheckpointId = undefined;
714
1135
  this.lastSentMessage = undefined;
1136
+ this.hasCompletedHandshake = false;
715
1137
  this.setReadyState(ReadyState.IDLE, 'Cleanup performed');
716
1138
  this.removeAllListeners();
717
1139
  this.debug('[WebSocketClient] Cleanup completed');
718
1140
  }
1141
+ // --- Test hooks -----------------------------------------------------------
1142
+ // These are intentionally tiny, read-only accessors so the chaos harness
1143
+ // can assert on internal state without resorting to `any` casts. They are
1144
+ // not part of the public SDK API.
1145
+ /** @internal */
1146
+ __getOutboxIds() {
1147
+ return [...this.outboxOrder];
1148
+ }
1149
+ /** @internal */
1150
+ __getOutboxSize() {
1151
+ return this.outbox.size;
1152
+ }
1153
+ /** @internal */
1154
+ __getLatestCheckpointId() {
1155
+ return this.latestCheckpointId;
1156
+ }
719
1157
  }
720
1158
  //# sourceMappingURL=websocket.js.map