@rivetkit/engine-runner 2.0.24-rc.1 → 2.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/mod.ts CHANGED
@@ -1,39 +1,28 @@
1
1
  import * as protocol from "@rivetkit/engine-runner-protocol";
2
2
  import type { Logger } from "pino";
3
3
  import type WebSocket from "ws";
4
+ import { type ActorConfig, RunnerActor } from "./actor";
4
5
  import { logger, setLogger } from "./log.js";
5
- import { stringifyCommandWrapper, stringifyEvent } from "./stringify";
6
- import { Tunnel } from "./tunnel";
6
+ import { stringifyToClient, stringifyToServer } from "./stringify";
7
+ import { type HibernatingWebSocketMetadata, Tunnel } from "./tunnel";
7
8
  import {
8
9
  calculateBackoff,
9
10
  parseWebSocketCloseReason,
10
11
  unreachable,
11
12
  } from "./utils";
12
13
  import { importWebSocket } from "./websocket.js";
13
- import type { WebSocketTunnelAdapter } from "./websocket-tunnel-adapter";
14
+
15
+ export type { HibernatingWebSocketMetadata };
16
+ export { RunnerActor, type ActorConfig };
17
+ export { idToStr } from "./utils";
14
18
 
15
19
  const KV_EXPIRE: number = 30_000;
16
- const PROTOCOL_VERSION: number = 2;
20
+ const PROTOCOL_VERSION: number = 3;
17
21
  const RUNNER_PING_INTERVAL = 3_000;
18
22
 
19
23
  /** Warn once the backlog significantly exceeds the server's ack batch size. */
20
24
  const EVENT_BACKLOG_WARN_THRESHOLD = 10_000;
21
- const SIGNAL_HANDLERS: (() => void)[] = [];
22
-
23
- export interface ActorInstance {
24
- actorId: string;
25
- generation: number;
26
- config: ActorConfig;
27
- requests: Set<string>; // Track active request IDs
28
- webSockets: Set<string>; // Track active WebSocket IDs
29
- }
30
-
31
- export interface ActorConfig {
32
- name: string;
33
- key: string | null;
34
- createTs: bigint;
35
- input: Uint8Array | null;
36
- }
25
+ const SIGNAL_HANDLERS: (() => void | Promise<void>)[] = [];
37
26
 
38
27
  export interface RunnerConfig {
39
28
  logger?: Logger;
@@ -51,38 +40,137 @@ export interface RunnerConfig {
51
40
  onConnected: () => void;
52
41
  onDisconnected: (code: number, reason: string) => void;
53
42
  onShutdown: () => void;
43
+
44
+ /** Called when receiving a network request. */
54
45
  fetch: (
55
46
  runner: Runner,
56
47
  actorId: string,
48
+ gatewayId: protocol.GatewayId,
57
49
  requestId: protocol.RequestId,
58
50
  request: Request,
59
51
  ) => Promise<Response>;
60
- websocket?: (
52
+
53
+ /**
54
+ * Called when receiving a WebSocket connection.
55
+ *
56
+ * All event listeners must be added synchronously inside this function or
57
+ * else events may be missed. The open event will fire immediately after
58
+ * this function finishes.
59
+ *
60
+ * Any errors thrown here will disconnect the WebSocket immediately.
61
+ *
62
+ * While `path` and `headers` are partially redundant to the data in the
63
+ * `Request`, they may vary slightly from the actual content of `Request`.
64
+ * Prefer to persist the `path` and `headers` properties instead of the
65
+ * `Request` itself.
66
+ *
67
+ * ## Hibernating WebSockets
68
+ *
69
+ * ### Implementation Requirements
70
+ *
71
+ * **Requirement 1: Persist HWS Immediately**
72
+ *
73
+ * This is responsible for persisting hibernatable WebSockets immediately
74
+ * (do not wait for open event). It is not time sensitive to flush the
75
+ * connection state. If this fails to persist the HWS, the client's
76
+ * WebSocket will be disconnected on next wake in the call to
77
+ * `Tunnel::restoreHibernatingRequests` since the connection entry will not
78
+ * exist.
79
+ *
80
+ * **Requirement 2: Persist Message Index On `message`**
81
+ *
82
+ * In the `message` event listener, this handler must persist the message
83
+ * index from the event. The request ID is available at
84
+ * `event.rivetRequestId` and message index at `event.rivetMessageIndex`.
85
+ *
86
+ * The message index should not be flushed immediately. Instead, this
87
+ * should:
88
+ *
89
+ * - Debounce calls to persist the message index
90
+ * - After each persist, call
91
+ * `Runner::sendHibernatableWebSocketMessageAck` to acknowledge the
92
+ * message
93
+ *
94
+ * This mechanism allows us to buffer messages on the gateway so we can
95
+ * batch-persist events on our end on a given interval.
96
+ *
97
+ * If this fails to persist, then the gateway will replay unacked
98
+ * messages when the actor starts again.
99
+ *
100
+ * **Requirement 3: Remove HWS From Storage On `close`**
101
+ *
102
+ * This handler should add an event listener for `close` to remove the
103
+ * connection from storage.
104
+ *
105
+ * If removing the connection fails to persist, the close event will be
106
+ * called again on the next actor start in
107
+ * `Tunnel::restoreHibernatingRequests` since there will be no request for
108
+ * the given connection.
109
+ *
110
+ * ### Restoring Connections
111
+ *
112
+ * The user of this library is responsible for:
113
+ * 1. Loading all persisted hibernatable WebSocket metadata for an actor
114
+ * 2. Calling `Runner::restoreHibernatingRequests` with this metadata at
115
+ * the end of `onActorStart`
116
+ *
117
+ * `restoreHibernatingRequests` will restore all connections and attach
118
+ * the appropriate event listeners.
119
+ *
120
+ * ### No Open Event On Restoration
121
+ *
122
+ * When restoring a HWS, the open event will not be called again. It will
123
+ * go straight to the message or close event.
124
+ */
125
+ websocket: (
61
126
  runner: Runner,
62
127
  actorId: string,
63
128
  ws: any,
129
+ gatewayId: protocol.GatewayId,
64
130
  requestId: protocol.RequestId,
65
131
  request: Request,
132
+ path: string,
133
+ headers: Record<string, string>,
134
+ isHibernatable: boolean,
135
+ isRestoringHibernatable: boolean,
66
136
  ) => Promise<void>;
137
+
138
+ hibernatableWebSocket: {
139
+ /**
140
+ * Determines if a WebSocket can continue to live while an actor goes to
141
+ * sleep.
142
+ */
143
+ canHibernate: (
144
+ actorId: string,
145
+ gatewayId: ArrayBuffer,
146
+ requestId: ArrayBuffer,
147
+ request: Request,
148
+ ) => boolean;
149
+ };
150
+
151
+ /**
152
+ * Called when an actor starts.
153
+ *
154
+ * This callback is responsible for:
155
+ * 1. Initializing the actor instance
156
+ * 2. Loading all persisted hibernatable WebSocket metadata for this actor
157
+ * 3. Calling `Runner::restoreHibernatingRequests` with the loaded metadata
158
+ * to restore hibernatable WebSocket connections
159
+ *
160
+ * The actor should not be marked as "ready" until after
161
+ * `restoreHibernatingRequests` completes to ensure all hibernatable
162
+ * connections are fully restored before the actor processes new requests.
163
+ */
67
164
  onActorStart: (
68
165
  actorId: string,
69
166
  generation: number,
70
167
  config: ActorConfig,
71
168
  ) => Promise<void>;
169
+
72
170
  onActorStop: (actorId: string, generation: number) => Promise<void>;
73
- getActorHibernationConfig: (
74
- actorId: string,
75
- requestId: ArrayBuffer,
76
- request: Request,
77
- ) => HibernationConfig;
78
171
  noAutoShutdown?: boolean;
79
172
  }
80
173
 
81
- export interface HibernationConfig {
82
- enabled: boolean;
83
- lastMsgIndex: number | undefined;
84
- }
85
-
86
174
  export interface KvListOptions {
87
175
  reverse?: boolean;
88
176
  limit?: number;
@@ -104,17 +192,17 @@ export class Runner {
104
192
  return this.#config;
105
193
  }
106
194
 
107
- #actors: Map<string, ActorInstance> = new Map();
108
- #actorWebSockets: Map<string, Set<WebSocketTunnelAdapter>> = new Map();
195
+ #actors: Map<string, RunnerActor> = new Map();
109
196
 
110
197
  // WebSocket
111
- #pegboardWebSocket?: WebSocket;
198
+ __pegboardWebSocket?: WebSocket;
112
199
  runnerId?: string;
113
200
  #lastCommandIdx: number = -1;
114
201
  #pingLoop?: NodeJS.Timeout;
115
202
  #nextEventIdx: bigint = 0n;
116
203
  #started: boolean = false;
117
204
  #shutdown: boolean = false;
205
+ #shuttingDown: boolean = false;
118
206
  #reconnectAttempt: number = 0;
119
207
  #reconnectTimeout?: NodeJS.Timeout;
120
208
 
@@ -130,7 +218,7 @@ export class Runner {
130
218
  #ackInterval?: NodeJS.Timeout;
131
219
 
132
220
  // KV operations
133
- #nextRequestId: number = 0;
221
+ #nextKvRequestId: number = 0;
134
222
  #kvRequests: Map<number, KvRequestEntry> = new Map();
135
223
  #kvCleanupInterval?: NodeJS.Timeout;
136
224
 
@@ -173,13 +261,6 @@ export class Runner {
173
261
 
174
262
  // MARK: Manage actors
175
263
  sleepActor(actorId: string, generation?: number) {
176
- if (this.#shutdown) {
177
- this.log?.warn({
178
- msg: "runner is shut down, cannot sleep actor",
179
- });
180
- return;
181
- }
182
-
183
264
  const actor = this.getActor(actorId, generation);
184
265
  if (!actor) return;
185
266
 
@@ -201,7 +282,7 @@ export class Runner {
201
282
  }
202
283
 
203
284
  async forceStopActor(actorId: string, generation?: number) {
204
- const actor = this.#removeActor(actorId, generation);
285
+ const actor = this.getActor(actorId, generation);
205
286
  if (!actor) return;
206
287
 
207
288
  // If onActorStop times out, Pegboard will handle this timeout with ACTOR_STOP_THRESHOLD_DURATION_MS
@@ -218,6 +299,11 @@ export class Runner {
218
299
  // Close requests after onActorStop so you can send messages over the tunnel
219
300
  this.#tunnel?.closeActiveRequests(actor);
220
301
 
302
+ // Remove actor after stopping in order to ensure that we can still
303
+ // call actions on the runner. Do this before sending stopped update in
304
+ // order to ensure we don't have duplicate actors.
305
+ this.#removeActor(actorId, generation);
306
+
221
307
  this.#sendActorStateUpdate(actorId, actor.generation, "stopped");
222
308
  }
223
309
 
@@ -232,17 +318,17 @@ export class Runner {
232
318
  }
233
319
  }
234
320
 
235
- getActor(actorId: string, generation?: number): ActorInstance | undefined {
321
+ getActor(actorId: string, generation?: number): RunnerActor | undefined {
236
322
  const actor = this.#actors.get(actorId);
237
323
  if (!actor) {
238
- this.log?.error({
324
+ this.log?.warn({
239
325
  msg: "actor not found",
240
326
  actorId,
241
327
  });
242
328
  return undefined;
243
329
  }
244
330
  if (generation !== undefined && actor.generation !== generation) {
245
- this.log?.error({
331
+ this.log?.warn({
246
332
  msg: "actor generation mismatch",
247
333
  actorId,
248
334
  generation,
@@ -253,6 +339,16 @@ export class Runner {
253
339
  return actor;
254
340
  }
255
341
 
342
+ async getAndWaitForActor(
343
+ actorId: string,
344
+ generation?: number,
345
+ ): Promise<RunnerActor | undefined> {
346
+ const actor = this.getActor(actorId, generation);
347
+ if (!actor) return;
348
+ await actor.actorStartPromise.promise;
349
+ return actor;
350
+ }
351
+
256
352
  hasActor(actorId: string, generation?: number): boolean {
257
353
  const actor = this.#actors.get(actorId);
258
354
 
@@ -262,11 +358,15 @@ export class Runner {
262
358
  );
263
359
  }
264
360
 
361
+ get actors() {
362
+ return this.#actors;
363
+ }
364
+
265
365
  // IMPORTANT: Make sure to call stopActiveRequests if calling #removeActor
266
366
  #removeActor(
267
367
  actorId: string,
268
368
  generation?: number,
269
- ): ActorInstance | undefined {
369
+ ): RunnerActor | undefined {
270
370
  const actor = this.#actors.get(actorId);
271
371
  if (!actor) {
272
372
  this.log?.error({
@@ -286,6 +386,12 @@ export class Runner {
286
386
 
287
387
  this.#actors.delete(actorId);
288
388
 
389
+ this.log?.info({
390
+ msg: "removed actor",
391
+ actorId,
392
+ actors: this.#actors.size,
393
+ });
394
+
289
395
  return actor;
290
396
  }
291
397
 
@@ -308,23 +414,25 @@ export class Runner {
308
414
 
309
415
  if (!this.#config.noAutoShutdown) {
310
416
  if (!SIGNAL_HANDLERS.length) {
311
- process.on("SIGTERM", () => {
417
+ process.on("SIGTERM", async () => {
312
418
  this.log?.debug("received SIGTERM");
313
419
 
314
420
  for (const handler of SIGNAL_HANDLERS) {
315
- handler();
421
+ await handler();
316
422
  }
317
423
 
318
- process.exit(0);
424
+ // TODO: Add back
425
+ // process.exit(0);
319
426
  });
320
- process.on("SIGINT", () => {
427
+ process.on("SIGINT", async () => {
321
428
  this.log?.debug("received SIGINT");
322
429
 
323
430
  for (const handler of SIGNAL_HANDLERS) {
324
- handler();
431
+ await handler();
325
432
  }
326
433
 
327
- process.exit(0);
434
+ // TODO: Add back
435
+ // process.exit(0);
328
436
  });
329
437
 
330
438
  this.log?.debug({
@@ -332,15 +440,24 @@ export class Runner {
332
440
  });
333
441
  }
334
442
 
335
- SIGNAL_HANDLERS.push(() => {
443
+ SIGNAL_HANDLERS.push(async () => {
336
444
  const weak = new WeakRef(this);
337
- weak.deref()?.shutdown(false, false);
445
+ await weak.deref()?.shutdown(false, false);
338
446
  });
339
447
  }
340
448
  }
341
449
 
342
450
  // MARK: Shutdown
343
451
  async shutdown(immediate: boolean, exit: boolean = false) {
452
+ // Prevent concurrent shutdowns
453
+ if (this.#shuttingDown) {
454
+ this.log?.debug({
455
+ msg: "shutdown already in progress, ignoring",
456
+ });
457
+ return;
458
+ }
459
+ this.#shuttingDown = true;
460
+
344
461
  this.log?.info({
345
462
  msg: "starting shutdown",
346
463
  immediate,
@@ -387,11 +504,8 @@ export class Runner {
387
504
  this.#kvRequests.clear();
388
505
 
389
506
  // Close WebSocket
390
- if (
391
- this.#pegboardWebSocket &&
392
- this.#pegboardWebSocket.readyState === 1
393
- ) {
394
- const pegboardWebSocket = this.#pegboardWebSocket;
507
+ if (this.__webSocketReady()) {
508
+ const pegboardWebSocket = this.__pegboardWebSocket;
395
509
  if (immediate) {
396
510
  // Stop immediately
397
511
  pegboardWebSocket.close(1000, "pegboard.runner_shutdown");
@@ -403,22 +517,14 @@ export class Runner {
403
517
  readyState: pegboardWebSocket.readyState,
404
518
  });
405
519
 
406
- // NOTE: We don't use #sendToServer here because that function checks if the runner is
407
- // shut down
408
- const encoded = protocol.encodeToServer({
520
+ // Start stopping
521
+ //
522
+ // The runner workflow will send StopActor commands for all
523
+ // actors
524
+ this.__sendToServer({
409
525
  tag: "ToServerStopping",
410
526
  val: null,
411
527
  });
412
- if (
413
- this.#pegboardWebSocket &&
414
- this.#pegboardWebSocket.readyState === 1
415
- ) {
416
- this.#pegboardWebSocket.send(encoded);
417
- } else {
418
- this.log?.error(
419
- "WebSocket not available or not open for sending data",
420
- );
421
- }
422
528
 
423
529
  const closePromise = new Promise<void>((resolve) => {
424
530
  if (!pegboardWebSocket)
@@ -434,7 +540,8 @@ export class Runner {
434
540
  });
435
541
  });
436
542
 
437
- // TODO: Wait for all actors to stop before closing ws
543
+ // Wait for all actors to stop before closing ws
544
+ await this.#waitForActorsToStop(pegboardWebSocket);
438
545
 
439
546
  this.log?.info({
440
547
  msg: "closing WebSocket",
@@ -459,7 +566,7 @@ export class Runner {
459
566
  // the runner has already shut down
460
567
  this.log?.debug({
461
568
  msg: "no runner WebSocket to shutdown or already closed",
462
- readyState: this.#pegboardWebSocket?.readyState,
569
+ readyState: this.__pegboardWebSocket?.readyState,
463
570
  });
464
571
  }
465
572
 
@@ -469,9 +576,96 @@ export class Runner {
469
576
  this.#tunnel = undefined;
470
577
  }
471
578
 
579
+ this.#config.onShutdown();
580
+
472
581
  if (exit) process.exit(0);
582
+ }
473
583
 
474
- this.#config.onShutdown();
584
+ /**
585
+ * Wait for all actors to stop before proceeding with shutdown.
586
+ *
587
+ * This method polls every 100ms to check if all actors have been stopped.
588
+ *
589
+ * It resolves as soon as any of the following is true:
590
+ * - All actors are stopped
591
+ * - The WebSocket connection is closed
592
+ * - The shutdown timeout is reached (120 seconds)
593
+ */
594
+ async #waitForActorsToStop(ws: WebSocket): Promise<void> {
595
+ const shutdownTimeout = 120_000; // 120 seconds
596
+ const shutdownCheckInterval = 100; // Check every 100ms
597
+ const progressLogInterval = 5_000; // Log progress every 5 seconds
598
+ const shutdownStartTs = Date.now();
599
+ let lastProgressLogTs = 0; // Ensure first log happens immediately
600
+
601
+ return new Promise<void>((resolve) => {
602
+ const checkActors = () => {
603
+ const now = Date.now();
604
+ const elapsed = now - shutdownStartTs;
605
+ const wsIsClosed = ws.readyState === 2 || ws.readyState === 3;
606
+
607
+ if (this.#actors.size === 0) {
608
+ this.log?.info({
609
+ msg: "all actors stopped",
610
+ elapsed,
611
+ });
612
+ return true;
613
+ } else if (wsIsClosed) {
614
+ this.log?.warn({
615
+ msg: "websocket closed before all actors stopped",
616
+ remainingActors: this.#actors.size,
617
+ elapsed,
618
+ });
619
+ return true;
620
+ } else if (elapsed >= shutdownTimeout) {
621
+ this.log?.warn({
622
+ msg: "shutdown timeout reached, forcing close",
623
+ remainingActors: this.#actors.size,
624
+ elapsed,
625
+ });
626
+ return true;
627
+ } else {
628
+ // Log progress every 5 seconds
629
+ if (now - lastProgressLogTs >= progressLogInterval) {
630
+ this.log?.info({
631
+ msg: "waiting for actors to stop",
632
+ remainingActors: this.#actors.size,
633
+ elapsed,
634
+ });
635
+ lastProgressLogTs = now;
636
+ }
637
+ return false;
638
+ }
639
+ };
640
+
641
+ // Check immediately first
642
+ if (checkActors()) {
643
+ this.log?.debug({
644
+ msg: "actors check completed immediately",
645
+ });
646
+ resolve();
647
+ return;
648
+ }
649
+
650
+ this.log?.debug({
651
+ msg: "starting actor wait interval",
652
+ checkInterval: shutdownCheckInterval,
653
+ });
654
+
655
+ const interval = setInterval(() => {
656
+ this.log?.debug({
657
+ msg: "actor wait interval tick",
658
+ actorCount: this.#actors.size,
659
+ });
660
+ if (checkActors()) {
661
+ this.log?.debug({
662
+ msg: "actors check completed, clearing interval",
663
+ });
664
+ clearInterval(interval);
665
+ resolve();
666
+ }
667
+ }, shutdownCheckInterval);
668
+ });
475
669
  }
476
670
 
477
671
  // MARK: Networking
@@ -498,7 +692,7 @@ export class Runner {
498
692
 
499
693
  const WS = await importWebSocket();
500
694
  const ws = new WS(this.pegboardUrl, protocols) as any as WebSocket;
501
- this.#pegboardWebSocket = ws;
695
+ this.__pegboardWebSocket = ws;
502
696
 
503
697
  this.log?.info({
504
698
  msg: "connecting",
@@ -564,9 +758,6 @@ export class Runner {
564
758
  val: init,
565
759
  });
566
760
 
567
- // Process unsent KV requests
568
- this.#processUnsentKvRequests();
569
-
570
761
  // Start ping interval
571
762
  const pingLoop = setInterval(() => {
572
763
  if (ws.readyState === 1) {
@@ -612,6 +803,10 @@ export class Runner {
612
803
 
613
804
  // Parse message
614
805
  const message = protocol.decodeToClient(buf);
806
+ this.log?.debug({
807
+ msg: "received runner message",
808
+ data: stringifyToClient(message),
809
+ });
615
810
 
616
811
  // Handle message
617
812
  if (message.tag === "ToClientInit") {
@@ -635,8 +830,10 @@ export class Runner {
635
830
  runnerLostThreshold: this.#runnerLostThreshold,
636
831
  });
637
832
 
638
- // Resend events that haven't been acknowledged
833
+ // Resend pending events
834
+ this.#processUnsentKvRequests();
639
835
  this.#resendUnacknowledgedEvents(init.lastEventIdx);
836
+ this.#tunnel?.resendBufferedEvents();
640
837
 
641
838
  this.#config.onConnected();
642
839
  } else if (message.tag === "ToClientCommands") {
@@ -753,13 +950,11 @@ export class Runner {
753
950
  });
754
951
 
755
952
  for (const commandWrapper of commands) {
756
- this.log?.info({
757
- msg: "received command",
758
- command: stringifyCommandWrapper(commandWrapper),
759
- });
760
953
  if (commandWrapper.inner.tag === "CommandStartActor") {
954
+ // Spawn background promise
761
955
  this.#handleCommandStartActor(commandWrapper);
762
956
  } else if (commandWrapper.inner.tag === "CommandStopActor") {
957
+ // Spawn background promise
763
958
  this.#handleCommandStopActor(commandWrapper);
764
959
  } else {
765
960
  unreachable(commandWrapper.inner);
@@ -808,7 +1003,13 @@ export class Runner {
808
1003
  }
809
1004
  }
810
1005
 
811
- #handleCommandStartActor(commandWrapper: protocol.CommandWrapper) {
1006
+ async #handleCommandStartActor(commandWrapper: protocol.CommandWrapper) {
1007
+ // IMPORTANT: Make sure no async code runs before inserting #actors and
1008
+ // calling addRequestToActor in order to prevent race conditions with
1009
+ // subsequent commands
1010
+
1011
+ if (!this.#tunnel) throw new Error("missing tunnel on actor start");
1012
+
812
1013
  const startCommand = commandWrapper.inner
813
1014
  .val as protocol.CommandStartActor;
814
1015
 
@@ -823,43 +1024,80 @@ export class Runner {
823
1024
  input: config.input ? new Uint8Array(config.input) : null,
824
1025
  };
825
1026
 
826
- const instance: ActorInstance = {
1027
+ const instance = new RunnerActor(
827
1028
  actorId,
828
1029
  generation,
829
- config: actorConfig,
830
- requests: new Set(),
831
- webSockets: new Set(),
832
- };
1030
+ actorConfig,
1031
+ startCommand.hibernatingRequests,
1032
+ );
1033
+
1034
+ const existingActor = this.#actors.get(actorId);
1035
+ if (existingActor) {
1036
+ this.log?.warn({
1037
+ msg: "replacing existing actor in actors map",
1038
+ actorId,
1039
+ existingGeneration: existingActor.generation,
1040
+ newGeneration: generation,
1041
+ existingPendingRequests: existingActor.pendingRequests.length,
1042
+ });
1043
+ }
833
1044
 
834
1045
  this.#actors.set(actorId, instance);
835
1046
 
1047
+ // NOTE: We have to populate the requestToActor map BEFORE running any
1048
+ // async code in order for incoming tunnel messages to wait for
1049
+ // instance.actorStartPromise before processing messages
1050
+ // TODO: Where is this GC'd if something fails?
1051
+ for (const hr of startCommand.hibernatingRequests) {
1052
+ this.#tunnel.addRequestToActor(hr.gatewayId, hr.requestId, actorId);
1053
+ }
1054
+
1055
+ this.log?.info({
1056
+ msg: "created actor",
1057
+ actors: this.#actors.size,
1058
+ actorId,
1059
+ name: config.name,
1060
+ key: config.key,
1061
+ generation,
1062
+ hibernatingRequests: startCommand.hibernatingRequests.length,
1063
+ });
1064
+
836
1065
  this.#sendActorStateUpdate(actorId, generation, "running");
837
1066
 
838
- // TODO: Add timeout to onActorStart
839
- // Call onActorStart asynchronously and handle errors
840
- this.#config
841
- .onActorStart(actorId, generation, actorConfig)
842
- .catch((err) => {
843
- this.log?.error({
844
- msg: "error in onactorstart for actor",
845
- actorId,
846
- err,
847
- });
1067
+ try {
1068
+ // TODO: Add timeout to onActorStart
1069
+ // Call onActorStart asynchronously and handle errors
1070
+ this.log?.debug({
1071
+ msg: "calling onActorStart",
1072
+ actorId,
1073
+ generation,
1074
+ });
1075
+ await this.#config.onActorStart(actorId, generation, actorConfig);
848
1076
 
849
- // TODO: Mark as crashed
850
- // Send stopped state update if start failed
851
- this.forceStopActor(actorId, generation);
1077
+ instance.actorStartPromise.resolve();
1078
+ } catch (err) {
1079
+ this.log?.error({
1080
+ msg: "error starting runner actor",
1081
+ actorId,
1082
+ err,
852
1083
  });
1084
+
1085
+ instance.actorStartPromise.reject(err);
1086
+
1087
+ // TODO: Mark as crashed
1088
+ // Send stopped state update if start failed
1089
+ await this.forceStopActor(actorId, generation);
1090
+ }
853
1091
  }
854
1092
 
855
- #handleCommandStopActor(commandWrapper: protocol.CommandWrapper) {
1093
+ async #handleCommandStopActor(commandWrapper: protocol.CommandWrapper) {
856
1094
  const stopCommand = commandWrapper.inner
857
1095
  .val as protocol.CommandStopActor;
858
1096
 
859
1097
  const actorId = stopCommand.actorId;
860
1098
  const generation = stopCommand.generation;
861
1099
 
862
- this.forceStopActor(actorId, generation);
1100
+ await this.forceStopActor(actorId, generation);
863
1101
  }
864
1102
 
865
1103
  #sendActorIntent(
@@ -867,13 +1105,6 @@ export class Runner {
867
1105
  generation: number,
868
1106
  intentType: "sleep" | "stop",
869
1107
  ) {
870
- if (this.#shutdown) {
871
- console.trace("send actor intent", actorId, intentType);
872
- this.log?.warn({
873
- msg: "Runner is shut down, cannot send actor intent",
874
- });
875
- return;
876
- }
877
1108
  let actorIntent: protocol.ActorIntent;
878
1109
 
879
1110
  if (intentType === "sleep") {
@@ -904,12 +1135,6 @@ export class Runner {
904
1135
 
905
1136
  this.#recordEvent(eventWrapper);
906
1137
 
907
- this.log?.info({
908
- msg: "sending event to server",
909
- event: stringifyEvent(eventWrapper.inner),
910
- index: eventWrapper.index.toString(),
911
- });
912
-
913
1138
  this.__sendToServer({
914
1139
  tag: "ToServerEvents",
915
1140
  val: [eventWrapper],
@@ -921,12 +1146,6 @@ export class Runner {
921
1146
  generation: number,
922
1147
  stateType: "running" | "stopped",
923
1148
  ) {
924
- if (this.#shutdown) {
925
- this.log?.warn({
926
- msg: "Runner is shut down, cannot send actor state update",
927
- });
928
- return;
929
- }
930
1149
  let actorState: protocol.ActorState;
931
1150
 
932
1151
  if (stateType === "running") {
@@ -960,12 +1179,6 @@ export class Runner {
960
1179
 
961
1180
  this.#recordEvent(eventWrapper);
962
1181
 
963
- this.log?.info({
964
- msg: "sending event to server",
965
- event: stringifyEvent(eventWrapper.inner),
966
- index: eventWrapper.index.toString(),
967
- });
968
-
969
1182
  this.__sendToServer({
970
1183
  tag: "ToServerEvents",
971
1184
  val: [eventWrapper],
@@ -973,13 +1186,6 @@ export class Runner {
973
1186
  }
974
1187
 
975
1188
  #sendCommandAcknowledgment() {
976
- if (this.#shutdown) {
977
- this.log?.warn({
978
- msg: "Runner is shut down, cannot send command acknowledgment",
979
- });
980
- return;
981
- }
982
-
983
1189
  if (this.#lastCommandIdx < 0) {
984
1190
  // No commands received yet, nothing to acknowledge
985
1191
  return;
@@ -1288,11 +1494,6 @@ export class Runner {
1288
1494
  const actor = this.getActor(actorId, generation);
1289
1495
  if (!actor) return;
1290
1496
 
1291
- if (this.#shutdown) {
1292
- console.warn("Runner is shut down, cannot set alarm");
1293
- return;
1294
- }
1295
-
1296
1497
  const alarmEvent: protocol.EventActorSetAlarm = {
1297
1498
  actorId,
1298
1499
  generation: actor.generation,
@@ -1325,15 +1526,7 @@ export class Runner {
1325
1526
  requestData: protocol.KvRequestData,
1326
1527
  ): Promise<any> {
1327
1528
  return new Promise((resolve, reject) => {
1328
- if (this.#shutdown) {
1329
- reject(new Error("Runner is shut down"));
1330
- return;
1331
- }
1332
-
1333
- const requestId = this.#nextRequestId++;
1334
- const isConnected =
1335
- this.#pegboardWebSocket &&
1336
- this.#pegboardWebSocket.readyState === 1;
1529
+ const requestId = this.#nextKvRequestId++;
1337
1530
 
1338
1531
  // Store the request
1339
1532
  const requestEntry = {
@@ -1347,7 +1540,7 @@ export class Runner {
1347
1540
 
1348
1541
  this.#kvRequests.set(requestId, requestEntry);
1349
1542
 
1350
- if (isConnected) {
1543
+ if (this.__webSocketReady()) {
1351
1544
  // Send immediately
1352
1545
  this.#sendSingleKvRequest(requestId);
1353
1546
  }
@@ -1380,10 +1573,7 @@ export class Runner {
1380
1573
  }
1381
1574
 
1382
1575
  #processUnsentKvRequests() {
1383
- if (
1384
- !this.#pegboardWebSocket ||
1385
- this.#pegboardWebSocket.readyState !== 1
1386
- ) {
1576
+ if (!this.__webSocketReady()) {
1387
1577
  return;
1388
1578
  }
1389
1579
 
@@ -1400,26 +1590,25 @@ export class Runner {
1400
1590
  }
1401
1591
  }
1402
1592
 
1403
- __webSocketReady(): boolean {
1404
- return this.#pegboardWebSocket
1405
- ? this.#pegboardWebSocket.readyState === 1
1406
- : false;
1593
+ /** Type guard: returns true when the WebSocket exists and is ready (readyState === 1). */
1594
+ __webSocketReady(): this is this & {
1595
+ __pegboardWebSocket: NonNullable<Runner["__pegboardWebSocket"]>;
1596
+ } {
1597
+ return (
1598
+ !!this.__pegboardWebSocket &&
1599
+ this.__pegboardWebSocket.readyState === 1
1600
+ );
1407
1601
  }
1408
1602
 
1409
1603
  __sendToServer(message: protocol.ToServer) {
1410
- if (this.#shutdown) {
1411
- this.log?.warn({
1412
- msg: "Runner is shut down, cannot send message to server",
1413
- });
1414
- return;
1415
- }
1604
+ this.log?.debug({
1605
+ msg: "sending runner message",
1606
+ data: stringifyToServer(message),
1607
+ });
1416
1608
 
1417
1609
  const encoded = protocol.encodeToServer(message);
1418
- if (
1419
- this.#pegboardWebSocket &&
1420
- this.#pegboardWebSocket.readyState === 1
1421
- ) {
1422
- this.#pegboardWebSocket.send(encoded);
1610
+ if (this.__webSocketReady()) {
1611
+ this.__pegboardWebSocket.send(encoded);
1423
1612
  } else {
1424
1613
  this.log?.error({
1425
1614
  msg: "WebSocket not available or not open for sending data",
@@ -1427,8 +1616,50 @@ export class Runner {
1427
1616
  }
1428
1617
  }
1429
1618
 
1430
- sendWebsocketMessageAck(requestId: ArrayBuffer, index: number) {
1431
- this.#tunnel?.__ackWebsocketMessage(requestId, index);
1619
+ sendHibernatableWebSocketMessageAck(
1620
+ gatewayId: ArrayBuffer,
1621
+ requestId: ArrayBuffer,
1622
+ index: number,
1623
+ ) {
1624
+ if (!this.#tunnel)
1625
+ throw new Error("missing tunnel to send message ack");
1626
+ this.#tunnel.sendHibernatableWebSocketMessageAck(
1627
+ gatewayId,
1628
+ requestId,
1629
+ index,
1630
+ );
1631
+ }
1632
+
1633
+ /**
1634
+ * Restores hibernatable WebSocket connections for an actor.
1635
+ *
1636
+ * This method should be called at the end of `onActorStart` after the
1637
+ * actor instance is fully initialized.
1638
+ *
1639
+ * This method will:
1640
+ * - Restore all provided hibernatable WebSocket connections
1641
+ * - Attach event listeners to the restored WebSockets
1642
+ * - Close any WebSocket connections that failed to restore
1643
+ *
1644
+ * The provided metadata list should include all hibernatable WebSockets
1645
+ * that were persisted for this actor. The gateway will automatically
1646
+ * close any connections that are not restored (i.e., not included in
1647
+ * this list).
1648
+ *
1649
+ * **Important:** This method must be called before `onActorStart` completes
1650
+ * and before marking the actor as "ready" to ensure all hibernatable
1651
+ * connections are fully restored.
1652
+ *
1653
+ * @param actorId - The ID of the actor to restore connections for
1654
+ * @param metaEntries - Array of hibernatable WebSocket metadata to restore
1655
+ */
1656
+ async restoreHibernatingRequests(
1657
+ actorId: string,
1658
+ metaEntries: HibernatingWebSocketMetadata[],
1659
+ ) {
1660
+ if (!this.#tunnel)
1661
+ throw new Error("missing tunnel to restore hibernating requests");
1662
+ await this.#tunnel.restoreHibernatingRequests(actorId, metaEntries);
1432
1663
  }
1433
1664
 
1434
1665
  getServerlessInitPacket(): string | undefined {
@@ -1486,9 +1717,10 @@ export class Runner {
1486
1717
 
1487
1718
  if (eventsToResend.length === 0) return;
1488
1719
 
1489
- //this.#log?.log(
1490
- // `Resending ${eventsToResend.length} unacknowledged events from index ${Number(lastEventIdx) + 1}`,
1491
- //);
1720
+ this.log?.info({
1721
+ msg: "resending unacknowledged events",
1722
+ fromIndex: lastEventIdx + 1n,
1723
+ });
1492
1724
 
1493
1725
  // Resend events in batches
1494
1726
  this.__sendToServer({