@rivetkit/engine-runner 2.0.4-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/mod.ts ADDED
@@ -0,0 +1,1862 @@
1
+ import * as protocol from "@rivetkit/engine-runner-protocol";
2
+ import type { Logger } from "pino";
3
+ import type WebSocket from "ws";
4
+ import { type ActorConfig, RunnerActor } from "./actor";
5
+ import { logger, setLogger } from "./log.js";
6
+ import { stringifyToClient, stringifyToServer } from "./stringify";
7
+ import { type HibernatingWebSocketMetadata, Tunnel } from "./tunnel";
8
+ import {
9
+ calculateBackoff,
10
+ parseWebSocketCloseReason,
11
+ stringifyError,
12
+ unreachable,
13
+ } from "./utils";
14
+ import { importWebSocket } from "./websocket.js";
15
+
16
+ export type { HibernatingWebSocketMetadata };
17
+ export { RunnerActor, type ActorConfig };
18
+ export { idToStr } from "./utils";
19
+
20
/**
 * KV expiry window in milliseconds.
 *
 * NOTE(review): presumably the age after which pending KV requests are
 * dropped by the periodic cleanup; the usage is outside this section, confirm.
 */
const KV_EXPIRE: number = 30 * 1_000;

/** Runner protocol version, sent as the `protocol_version` query parameter. */
const PROTOCOL_VERSION: number = 5;

/**
 * Warn once the backlog of unacknowledged events significantly exceeds the
 * server's ack batch size.
 */
const EVENT_BACKLOG_WARN_THRESHOLD = 10_000;

/**
 * Shutdown callbacks invoked by the process-wide SIGTERM/SIGINT listeners.
 * One entry is pushed per runner started without `noAutoShutdown`.
 */
const SIGNAL_HANDLERS: Array<() => void | Promise<void>> = [];
26
+
27
/**
 * Error used to reject work that is still pending when the runner goes away,
 * e.g. outstanding KV requests once the runner-lost threshold fires.
 */
export class RunnerShutdownError extends Error {
	constructor() {
		super("Runner shut down");
		// Give the error a distinguishable name so logs and `error.name`
		// checks do not report a generic "Error".
		this.name = "RunnerShutdownError";
	}
}
32
+
33
/** Options and callbacks supplied by the embedder when constructing a `Runner`. */
export interface RunnerConfig {
	// --- Logging -----------------------------------------------------------

	/** Optional logger; when set it is installed via `setLogger` in the constructor. */
	logger?: Logger;

	// --- Connection --------------------------------------------------------

	/**
	 * Base endpoint. `http://`/`https://` is rewritten to `ws://`/`wss://` and
	 * `/runners/connect` is appended to build the runner WebSocket URL.
	 */
	endpoint: string;
	/** When set, used instead of `endpoint` to build the runner WebSocket URL. */
	pegboardEndpoint?: string;
	/** NOTE(review): not referenced in this part of the file; purpose to be confirmed. */
	pegboardRelayEndpoint?: string;
	/** When set, sent as the `rivet_token.<token>` WebSocket subprotocol. */
	token?: string;
	/** Sent URL-encoded as the `namespace` query parameter. */
	namespace: string;
	/** Sent URL-encoded as the `runner_key` query parameter. */
	runnerKey: string;

	// --- Init message payload ----------------------------------------------

	/** Sent as `name` in the init message. */
	runnerName: string;
	/** Sent as `version` in the init message. */
	version: number;
	/** Sent as `totalSlots` in the init message. */
	totalSlots: number;
	/** Sent in the init message; each entry's `metadata` is JSON-stringified. */
	prepopulateActorNames: Record<string, { metadata: Record<string, any> }>;
	/** JSON-stringified and sent as `metadata` in the init message. */
	metadata?: Record<string, any>;

	// --- Runner lifecycle callbacks ----------------------------------------

	/** Called after the server's init message has been processed. */
	onConnected: () => void;
	/** Called with the close code and reason when the runner WebSocket closes. */
	onDisconnected: (code: number, reason: string) => void;
	/** Called at the end of `Runner.shutdown`, after the tunnel has been closed. */
	onShutdown: () => void;
	/**
	 * When falsy, `Runner.start` registers SIGTERM/SIGINT handling that calls
	 * `shutdown(false, false)` on this runner.
	 */
	noAutoShutdown?: boolean;

	// --- Actor lifecycle callbacks -----------------------------------------

	/**
	 * Called when an actor starts.
	 *
	 * This callback is responsible for:
	 * 1. Initializing the actor instance
	 * 2. Loading all persisted hibernatable WebSocket metadata for this actor
	 * 3. Calling `Runner::restoreHibernatingRequests` with the loaded metadata
	 *    to restore hibernatable WebSocket connections
	 *
	 * The actor should not be marked as "ready" until after
	 * `restoreHibernatingRequests` completes, so that all hibernatable
	 * connections are fully restored before the actor processes new requests.
	 */
	onActorStart: (
		actorId: string,
		generation: number,
		config: ActorConfig,
	) => Promise<void>;

	/**
	 * Awaited by `Runner.forceStopActor` before the actor's active requests
	 * are closed and the actor is removed. Errors thrown here are caught and
	 * reported; they do not prevent the actor from being removed.
	 */
	onActorStop: (actorId: string, generation: number) => Promise<void>;

	// --- Network handlers --------------------------------------------------

	/** Called when receiving a network request. */
	fetch: (
		runner: Runner,
		actorId: string,
		gatewayId: protocol.GatewayId,
		requestId: protocol.RequestId,
		request: Request,
	) => Promise<Response>;

	/**
	 * Called when receiving a WebSocket connection.
	 *
	 * All event listeners must be added synchronously inside this function,
	 * otherwise events may be missed. The open event fires immediately after
	 * this function finishes.
	 *
	 * Any error thrown here disconnects the WebSocket immediately.
	 *
	 * `path` and `headers` are partially redundant with the data in the
	 * `Request`, but they may vary slightly from the actual content of the
	 * `Request`. Prefer to persist the `path` and `headers` properties instead
	 * of the `Request` itself.
	 *
	 * ## Hibernating WebSockets
	 *
	 * ### Implementation Requirements
	 *
	 * **Requirement 1: Persist HWS Immediately**
	 *
	 * This handler is responsible for persisting hibernatable WebSockets
	 * immediately (do not wait for the open event). Flushing the connection
	 * state is not time sensitive. If this fails to persist the HWS, the
	 * client's WebSocket will be disconnected on the next wake in the call to
	 * `Tunnel::restoreHibernatingRequests`, since the connection entry will
	 * not exist.
	 *
	 * **Requirement 2: Persist Message Index On `message`**
	 *
	 * In the `message` event listener, this handler must persist the message
	 * index from the event. The request ID is available at
	 * `event.rivetRequestId` and the message index at
	 * `event.rivetMessageIndex`.
	 *
	 * The message index should not be flushed immediately. Instead, this
	 * should:
	 *
	 * - Debounce calls to persist the message index
	 * - After each persist, call
	 *   `Runner::sendHibernatableWebSocketMessageAck` to acknowledge the
	 *   message
	 *
	 * This mechanism allows messages to be buffered on the gateway so events
	 * can be batch-persisted on our end on a given interval.
	 *
	 * If this fails to persist, the gateway will replay unacked messages when
	 * the actor starts again.
	 *
	 * **Requirement 3: Remove HWS From Storage On `close`**
	 *
	 * This handler should add an event listener for `close` to remove the
	 * connection from storage.
	 *
	 * If the connection removal fails to persist, the close event will be
	 * called again on the next actor start in
	 * `Tunnel::restoreHibernatingRequests`, since there will be no request for
	 * the given connection.
	 *
	 * ### Restoring Connections
	 *
	 * The user of this library is responsible for:
	 * 1. Loading all persisted hibernatable WebSocket metadata for an actor
	 * 2. Calling `Runner::restoreHibernatingRequests` with this metadata at
	 *    the end of `onActorStart`
	 *
	 * `restoreHibernatingRequests` will restore all connections and attach
	 * the appropriate event listeners.
	 *
	 * ### No Open Event On Restoration
	 *
	 * When restoring a HWS, the open event will not be called again. It will
	 * go straight to the message or close event.
	 */
	websocket: (
		runner: Runner,
		actorId: string,
		ws: any,
		gatewayId: protocol.GatewayId,
		requestId: protocol.RequestId,
		request: Request,
		path: string,
		headers: Record<string, string>,
		isHibernatable: boolean,
		isRestoringHibernatable: boolean,
	) => Promise<void>;

	hibernatableWebSocket: {
		/**
		 * Determines if a WebSocket can continue to live while an actor goes
		 * to sleep.
		 */
		canHibernate: (
			actorId: string,
			gatewayId: ArrayBuffer,
			requestId: ArrayBuffer,
			request: Request,
		) => boolean;
	};
}
179
+
180
/** Optional modifiers for KV list operations. */
export interface KvListOptions {
	/** Maximum number of entries to return (presumably unlimited when omitted; confirm). */
	limit?: number;
	/** Request the results in reverse order. */
	reverse?: boolean;
}
184
+
185
/** Bookkeeping record for one in-flight KV request, keyed by request id in `Runner`. */
interface KvRequestEntry {
	/** Actor the request was issued for. */
	actorId: string;
	/** Protocol payload of the request. */
	data: protocol.KvRequestData;
	/** Settles the caller's promise with the response value. */
	resolve: (value: any) => void;
	/** Fails the caller's promise, e.g. on shutdown or when the runner is lost. */
	reject: (error: unknown) => void;
	/** NOTE(review): presumably whether the request has been written to the socket yet; confirm. */
	sent: boolean;
	/** NOTE(review): presumably the creation time in epoch milliseconds, used for expiry; confirm. */
	timestamp: number;
}
193
+
194
+ export class Runner {
195
/** Configuration passed to the constructor. */
#config: RunnerConfig;

/** Read-only accessor for the configuration this runner was created with. */
get config(): RunnerConfig {
	const current = this.#config;
	return current;
}
200
+
201
/** Actors currently owned by this runner, keyed by actor id. */
#actors: Map<string, RunnerActor> = new Map<string, RunnerActor>();

// --- Runner WebSocket state ---

/** Most recently created connection to the runner endpoint. */
#pegboardWebSocket?: WebSocket;
/** Runner id assigned by the server in its init message. */
runnerId?: string;
/** True once `start()` has been called; reset if the initial connect throws. */
#started: boolean = false;
/** True once `shutdown()` has begun. */
#shutdown: boolean = false;
/** Reset to 0 whenever the WebSocket opens. */
#reconnectAttempt: number = 0;
/** Pending reconnect timer, if any. */
#reconnectTimeout?: NodeJS.Timeout;

// --- Runner lost threshold management ---

/** Delay (ms) taken from the server's init metadata before a disconnected runner stops its actors. */
#runnerLostThreshold?: number;
/** Timer armed on socket error/close that triggers `#handleLost`. */
#runnerLostTimeout?: NodeJS.Timeout;

// --- Event storage for resending ---

/** Ensures the backlog warning is logged only once until the backlog shrinks again. */
#eventBacklogWarned: boolean = false;

// --- Command acknowledgment ---

/** Periodic command acknowledgment timer, started when the socket opens. */
#ackInterval?: NodeJS.Timeout;

// --- KV operations ---

#nextKvRequestId: number = 0;
/** In-flight KV requests keyed by request id. */
#kvRequests: Map<number, KvRequestEntry> = new Map<number, KvRequestEntry>();
/** Periodic KV request cleanup timer, started in the constructor. */
#kvCleanupInterval?: NodeJS.Timeout;

// --- Tunnel for HTTP/WebSocket forwarding ---

/** Created in `start()` and cleared in `shutdown()`. */
#tunnel: Tunnel | undefined;

// --- Cached child logger with runner-specific attributes ---

#logCached?: Logger;
231
+
232
/**
 * Logger for this runner.
 *
 * Returns the shared logger until a runner id is known. After that a child
 * logger tagged with `runnerId` is created once and cached. Returns
 * `undefined` when no logger is configured.
 */
get log(): Logger | undefined {
	const cached = this.#logCached;
	if (cached) return cached;

	const base = logger();
	if (!base) return undefined;

	// Not connected yet: nothing to tag the logs with, so do not cache.
	if (!this.runnerId) return base;

	const child = base.child({
		runnerId: this.runnerId,
	});
	this.#logCached = child;
	return child;
}
252
+
253
/**
 * Stores the configuration, installs the configured logger (if any), and
 * starts the periodic cleanup of old KV requests.
 */
constructor(config: RunnerConfig) {
	this.#config = config;
	if (config.logger) setLogger(config.logger);

	// Clean up old unsent KV requests every 15 seconds. Failures are logged
	// so that a single bad pass does not throw out of the timer callback.
	const cleanupEveryMs = 15000;
	const runCleanup = () => {
		try {
			this.#cleanupOldKvRequests();
		} catch (err) {
			this.log?.error({
				msg: "error cleaning up kv requests",
				error: stringifyError(err),
			});
		}
	};
	this.#kvCleanupInterval = setInterval(runCleanup, cleanupEveryMs);
}
269
+
270
+ // MARK: Manage actors
271
/**
 * Sends a "sleep" intent for the actor. Does nothing when the actor is
 * unknown or its generation does not match.
 *
 * The actor instance is deliberately kept in the actors map: the server
 * sends a StopActor command if it wants the actor fully stopped.
 */
sleepActor(actorId: string, generation?: number) {
	const target = this.getActor(actorId, generation);
	if (target) {
		this.#sendActorIntent(actorId, target.generation, "sleep");
	}
}
281
+
282
/**
 * Sends a "stop" intent for the actor. Does nothing when the actor is
 * unknown or its generation does not match.
 *
 * The actor instance is deliberately kept in the actors map: the server
 * sends a StopActor command if it wants the actor fully stopped.
 */
async stopActor(actorId: string, generation?: number) {
	const target = this.getActor(actorId, generation);
	if (target) {
		this.#sendActorIntent(actorId, target.generation, "stop");
	}
}
291
+
292
/**
 * Stops an actor locally: runs `onActorStop`, closes its active tunnel
 * requests, reports the "stopped" state and removes it from the actors map.
 *
 * Does nothing when the actor is unknown or `generation` does not match.
 * Errors thrown by `onActorStop` are reported and do not abort the stop.
 *
 * @param actorId Id of the actor to stop.
 * @param generation When given, only an actor of this generation is stopped.
 */
async forceStopActor(actorId: string, generation?: number) {
	this.log?.debug({
		msg: "force stopping actor",
		actorId,
	});

	const actor = this.getActor(actorId, generation);
	if (!actor) return;

	// If onActorStop times out, Pegboard will handle this timeout with ACTOR_STOP_THRESHOLD_DURATION_MS
	//
	// If we receive a request while onActorStop is running, a Service
	// Unavailable error will be returned to Guard and the request will be
	// retried
	try {
		await this.#config.onActorStop(actorId, actor.generation);
	} catch (err) {
		const log = this.log;
		if (log) {
			log.error({
				msg: "error in onActorStop",
				actorId,
				generation: actor.generation,
				error: stringifyError(err),
			});
		} else {
			// No logger configured: keep the failure visible.
			console.error(`Error in onActorStop for actor ${actorId}:`, err);
		}
	}

	// Close requests after onActorStop so you can send messages over the tunnel
	this.#tunnel?.closeActiveRequests(actor);

	this.#sendActorStateUpdate(actorId, actor.generation, "stopped");

	// Remove the actor after stopping so that actions can still be called on
	// the runner while it stops.
	//
	// A start command may have registered a new instance under the same id
	// while onActorStop was awaited. Only remove the instance that was
	// actually stopped; removing by id alone would drop the replacement.
	if (this.#actors.get(actorId) === actor) {
		this.#removeActor(actorId, actor.generation);
	} else {
		this.log?.warn({
			msg: "actor changed while stopping, skipping removal",
			actorId,
			generation: actor.generation,
		});
	}
}
321
+
322
/**
 * Runs when the runner-lost timeout fires: fails every outstanding KV
 * request with `RunnerShutdownError` and force-stops all actors.
 */
#handleLost() {
	this.log?.info({
		msg: "stopping all actors due to runner lost threshold",
	});

	// Remove all remaining kv requests
	for (const pending of this.#kvRequests.values()) {
		pending.reject(new RunnerShutdownError());
	}
	this.#kvRequests.clear();

	this.#stopAllActors();
}
336
+
337
/**
 * Starts a force-stop for every registered actor without awaiting them.
 * Failures are logged per actor.
 */
#stopAllActors() {
	// Snapshot the ids first: stopping actors mutates the map.
	const ids = [...this.#actors.keys()];
	ids.forEach((actorId) => {
		this.forceStopActor(actorId).catch((err) => {
			this.log?.error({
				msg: "error stopping actor",
				actorId,
				error: stringifyError(err),
			});
		});
	});
}
349
+
350
/**
 * Looks up a registered actor.
 *
 * @param actorId Id of the actor.
 * @param generation When given, the actor must be of exactly this generation.
 * @returns The actor, or `undefined` (after logging a warning) when it is
 * not registered or the generation differs.
 */
getActor(actorId: string, generation?: number): RunnerActor | undefined {
	const found = this.#actors.get(actorId);
	if (!found) {
		this.log?.warn({
			msg: "actor not found",
			actorId,
		});
		return undefined;
	}

	const generationMatches =
		generation === undefined || found.generation === generation;
	if (!generationMatches) {
		this.log?.warn({
			msg: "actor generation mismatch",
			actorId,
			generation,
		});
		return undefined;
	}

	return found;
}
370
+
371
/**
 * Like `getActor`, but additionally waits for the actor's start promise
 * before returning it. Resolves to `undefined` when the actor is not found.
 */
async getAndWaitForActor(
	actorId: string,
	generation?: number,
): Promise<RunnerActor | undefined> {
	const actor = this.getActor(actorId, generation);
	if (actor) {
		await actor.actorStartPromise.promise;
	}
	return actor;
}
380
+
381
/**
 * Reports whether an actor is registered, optionally requiring a specific
 * generation. Unlike `getActor`, this does not log.
 */
hasActor(actorId: string, generation?: number): boolean {
	const actor = this.#actors.get(actorId);
	if (!actor) return false;
	if (generation === undefined) return true;
	return actor.generation === generation;
}
389
+
390
/** The live map of registered actors keyed by actor id (not a copy). */
get actors() {
	const registry = this.#actors;
	return registry;
}
393
+
394
/**
 * Removes an actor from the actors map.
 *
 * IMPORTANT: Make sure the actor's active tunnel requests are closed
 * (`Tunnel.closeActiveRequests`) when calling this.
 *
 * @param generation When given, the actor is only removed if its generation matches.
 * @returns The removed actor, or `undefined` (after logging an error) when
 * it is not registered or the generation differs.
 */
#removeActor(
	actorId: string,
	generation?: number,
): RunnerActor | undefined {
	const existing = this.#actors.get(actorId);
	if (!existing) {
		this.log?.error({
			msg: "actor not found for removal",
			actorId,
		});
		return undefined;
	}

	const wrongGeneration =
		generation !== undefined && existing.generation !== generation;
	if (wrongGeneration) {
		this.log?.error({
			msg: "actor generation mismatch",
			actorId,
			generation,
		});
		return undefined;
	}

	this.#actors.delete(actorId);

	this.log?.info({
		msg: "removed actor",
		actorId,
		actors: this.#actors.size,
	});

	return existing;
}
426
+
427
+ // MARK: Start
428
/**
 * Starts the runner: creates the tunnel, opens the runner WebSocket and,
 * unless `noAutoShutdown` is set, hooks this runner into SIGTERM/SIGINT
 * handling so it shuts down gracefully.
 *
 * @throws If called twice, or if opening the WebSocket throws. In the latter
 * case the runner is rolled back so `start()` can be called again.
 */
async start() {
	if (this.#started) throw new Error("Cannot call runner.start twice");
	this.#started = true;

	this.log?.info({ msg: "starting runner" });

	const tunnel = new Tunnel(this);
	this.#tunnel = tunnel;
	tunnel.start();

	try {
		await this.#openPegboardWebSocket();
	} catch (error) {
		// Roll back so that a retry does not leak the tunnel created above.
		if (this.#tunnel === tunnel) {
			try {
				tunnel.shutdown();
			} catch (shutdownErr) {
				this.log?.error({
					msg: "error shutting down tunnel after failed start",
					error: stringifyError(shutdownErr),
				});
			}
			this.#tunnel = undefined;
		}
		this.#started = false;
		throw error;
	}

	if (this.#config.noAutoShutdown) return;

	if (!SIGNAL_HANDLERS.length) {
		// The process-wide listeners are installed once and live for the whole
		// process. They must not reference `this`, otherwise the first runner
		// could never be garbage collected, so they use the module logger.
		const runSignalHandlers = async (signal: string) => {
			logger()?.debug(`received ${signal}`);

			for (const handler of SIGNAL_HANDLERS) {
				// One failing runner must not prevent the others from
				// shutting down.
				try {
					await handler();
				} catch (err) {
					logger()?.error({
						msg: "error in signal handler",
						signal,
						error: stringifyError(err),
					});
				}
			}

			// TODO: Add back
			// process.exit(0);
		};

		process.on("SIGTERM", () => {
			void runSignalHandlers("SIGTERM");
		});
		process.on("SIGINT", () => {
			void runSignalHandlers("SIGINT");
		});

		this.log?.debug({
			msg: "added SIGTERM listeners",
		});
	}

	// Create the WeakRef outside the closure. Creating it inside would make
	// the closure capture `this` strongly and defeat the weak reference.
	const weak = new WeakRef(this);
	SIGNAL_HANDLERS.push(async () => {
		await weak.deref()?.shutdown(false, false);
	});
}
478
+
479
+ // MARK: Shutdown
480
/**
 * Shuts the runner down. Subsequent calls are ignored.
 *
 * Clears all timers, rejects outstanding KV requests, closes the runner
 * WebSocket, shuts down the tunnel and finally calls `onShutdown`.
 *
 * @param immediate When true the WebSocket is closed right away. Otherwise a
 * stopping message is sent and the socket is only closed once all actors
 * have stopped (or the wait gives up).
 * @param exit When true, `process.exit(0)` is called at the end.
 */
async shutdown(immediate: boolean, exit: boolean = false) {
	// Prevent concurrent shutdowns
	if (this.#shutdown) {
		this.log?.debug({
			msg: "shutdown already in progress, ignoring",
		});
		return;
	}
	this.#shutdown = true;

	this.log?.info({
		msg: "starting shutdown",
		immediate,
		exit,
	});

	// Clear reconnect timeout
	if (this.#reconnectTimeout) {
		clearTimeout(this.#reconnectTimeout);
		this.#reconnectTimeout = undefined;
	}

	// Clear runner lost timeout
	if (this.#runnerLostTimeout) {
		clearTimeout(this.#runnerLostTimeout);
		this.#runnerLostTimeout = undefined;
	}

	// Clear ack interval
	if (this.#ackInterval) {
		clearInterval(this.#ackInterval);
		this.#ackInterval = undefined;
	}

	// Clear KV cleanup interval
	if (this.#kvCleanupInterval) {
		clearInterval(this.#kvCleanupInterval);
		this.#kvCleanupInterval = undefined;
	}

	// Reject all KV requests
	for (const request of this.#kvRequests.values()) {
		request.reject(
			new Error("WebSocket connection closed during shutdown"),
		);
	}
	this.#kvRequests.clear();

	// Close WebSocket
	const pegboardWebSocket = this.getPegboardWebSocketIfReady();
	if (pegboardWebSocket) {
		if (immediate) {
			// Stop immediately
			pegboardWebSocket.close(1000, "pegboard.runner_shutdown");
		} else {
			// Wait for actors to shut down before stopping
			try {
				this.log?.info({
					msg: "sending stopping message",
					readyState: pegboardWebSocket.readyState,
				});

				// Start stopping
				//
				// The runner workflow will send StopActor commands for all
				// actors
				this.__sendToServer({
					tag: "ToServerStopping",
					val: null,
				});

				// Register the close listener before waiting so a close that
				// happens during the wait is not missed.
				const closePromise = new Promise<void>((resolve) => {
					pegboardWebSocket.addEventListener("close", (ev) => {
						this.log?.info({
							msg: "connection closed",
							code: ev.code,
							reason: ev.reason.toString(),
						});
						resolve();
					});
				});

				// Wait for all actors to stop before closing ws
				await this.#waitForActorsToStop(pegboardWebSocket);

				this.log?.info({
					msg: "closing WebSocket",
				});
				pegboardWebSocket.close(1000, "pegboard.runner_shutdown");

				await closePromise;

				this.log?.info({
					msg: "websocket shutdown completed",
				});
			} catch (error) {
				// Stringify the error like the rest of the file does. A raw
				// Error has no enumerable properties, so its message would be
				// lost when the log entry is serialized as JSON.
				this.log?.error({
					msg: "error during websocket shutdown:",
					error: stringifyError(error),
				});
				pegboardWebSocket.close();
			}
		}
	} else {
		// This is often logged when the serverless SSE stream closes after
		// the runner has already shut down
		this.log?.debug({
			msg: "no runner WebSocket to shutdown or already closed",
			readyState: this.#pegboardWebSocket?.readyState,
		});
	}

	// Close tunnel
	if (this.#tunnel) {
		this.#tunnel.shutdown();
		this.#tunnel = undefined;
	}

	this.#config.onShutdown();

	if (exit) process.exit(0);
}
605
+
606
/**
 * Waits until shutdown may proceed with closing the WebSocket.
 *
 * The state is checked once immediately and then every 100ms. Waiting ends
 * as soon as one of the following holds:
 * - no actors remain registered
 * - the WebSocket is closing or closed
 * - 120 seconds have elapsed
 *
 * While waiting, progress is logged at most every 5 seconds.
 */
async #waitForActorsToStop(ws: WebSocket): Promise<void> {
	const timeoutMs = 120_000;
	const pollEveryMs = 100;
	const progressEveryMs = 5_000;
	const startedAt = Date.now();
	// Zero makes the first progress log happen on the first unfinished check.
	let lastProgressAt = 0;

	/** Returns true when waiting should end; logs the reason. */
	const shouldStopWaiting = (): boolean => {
		const now = Date.now();
		const elapsed = now - startedAt;
		// readyState 2 = CLOSING, 3 = CLOSED
		const socketGone = ws.readyState === 2 || ws.readyState === 3;

		if (this.#actors.size === 0) {
			this.log?.info({
				msg: "all actors stopped",
				elapsed,
			});
			return true;
		}

		if (socketGone) {
			this.log?.warn({
				msg: "websocket closed before all actors stopped",
				remainingActors: this.#actors.size,
				elapsed,
			});
			return true;
		}

		if (elapsed >= timeoutMs) {
			this.log?.warn({
				msg: "shutdown timeout reached, forcing close",
				remainingActors: this.#actors.size,
				elapsed,
			});
			return true;
		}

		if (now - lastProgressAt >= progressEveryMs) {
			this.log?.info({
				msg: "waiting for actors to stop",
				remainingActors: this.#actors.size,
				elapsed,
			});
			lastProgressAt = now;
		}
		return false;
	};

	// Check immediately first
	if (shouldStopWaiting()) {
		this.log?.debug({
			msg: "actors check completed immediately",
		});
		return;
	}

	this.log?.debug({
		msg: "starting actor wait interval",
		checkInterval: pollEveryMs,
	});

	await new Promise<void>((resolve) => {
		const timer = setInterval(() => {
			this.log?.debug({
				msg: "actor wait interval tick",
				actorCount: this.#actors.size,
			});
			if (!shouldStopWaiting()) return;

			this.log?.debug({
				msg: "actors check completed, clearing interval",
			});
			clearInterval(timer);
			resolve();
		}, pollEveryMs);
	});
}
692
+
693
+ // MARK: Networking
694
/** Endpoint used for the runner connection: `pegboardEndpoint` when set (non-empty), otherwise `endpoint`. */
get pegboardEndpoint() {
	const { pegboardEndpoint, endpoint } = this.#config;
	return pegboardEndpoint || endpoint;
}
697
/**
 * Full WebSocket URL for connecting the runner: the endpoint with its
 * scheme switched to `ws`/`wss`, followed by `/runners/connect` and the
 * protocol version, namespace and runner key as query parameters.
 */
get pegboardUrl() {
	let base = this.pegboardEndpoint
		.replace("http://", "ws://")
		.replace("https://", "wss://");

	// Drop a single trailing slash so the path is not doubled up.
	if (base.endsWith("/")) {
		base = base.slice(0, -1);
	}

	const query = [
		`protocol_version=${PROTOCOL_VERSION}`,
		`namespace=${encodeURIComponent(this.#config.namespace)}`,
		`runner_key=${encodeURIComponent(this.#config.runnerKey)}`,
	].join("&");

	return `${base}/runners/connect?${query}`;
}
708
+
709
+ // MARK: Runner protocol
710
/**
 * Opens the runner WebSocket and wires up its `open`, `message`, `error`
 * and `close` handlers.
 *
 * - `open`: resets reconnect state, sends the init message and starts the
 *   periodic command acknowledgment.
 * - `message`: decodes and dispatches server messages. Failures are logged
 *   instead of escaping the async listener as unhandled rejections.
 * - `error` / `close`: unless shutting down, arm the runner-lost timeout and
 *   schedule a reconnect. An eviction close shuts the runner down.
 */
async #openPegboardWebSocket() {
	const protocols = ["rivet"];
	if (this.config.token)
		protocols.push(`rivet_token.${this.config.token}`);

	const WS = await importWebSocket();

	// Assertion to clear previous WebSocket
	if (
		this.#pegboardWebSocket &&
		(this.#pegboardWebSocket.readyState === WS.CONNECTING ||
			this.#pegboardWebSocket.readyState === WS.OPEN)
	) {
		this.log?.error(
			"found duplicate pegboardWebSocket, closing previous",
		);
		this.#pegboardWebSocket.close(1000, "duplicate_websocket");
	}

	const ws = new WS(this.pegboardUrl, protocols) as any as WebSocket;
	this.#pegboardWebSocket = ws;

	this.log?.info({
		msg: "connecting",
		endpoint: this.pegboardEndpoint,
		namespace: this.#config.namespace,
		runnerKey: this.#config.runnerKey,
		hasToken: !!this.config.token,
	});

	// Arms the runner-lost timeout if a positive threshold is known and no
	// timeout is pending yet. Shared by the error and close handlers.
	const startRunnerLostTimeout = () => {
		const threshold = this.#runnerLostThreshold;
		if (this.#runnerLostTimeout || !threshold || threshold <= 0) return;

		this.log?.info({
			msg: "starting runner lost timeout",
			seconds: threshold / 1000,
		});
		this.#runnerLostTimeout = setTimeout(() => {
			try {
				this.#handleLost();
			} catch (err) {
				this.log?.error({
					msg: "error handling runner lost",
					error: stringifyError(err),
				});
			}
		}, threshold);
	};

	ws.addEventListener("open", () => {
		if (this.#reconnectAttempt > 0) {
			this.log?.info({
				msg: "runner reconnected",
				namespace: this.#config.namespace,
				runnerName: this.#config.runnerName,
				reconnectAttempt: this.#reconnectAttempt,
			});
		} else {
			this.log?.debug({
				msg: "runner connected",
				namespace: this.#config.namespace,
				runnerName: this.#config.runnerName,
			});
		}

		// Reset reconnect attempt counter on successful connection
		this.#reconnectAttempt = 0;

		// Clear any pending reconnect timeout
		if (this.#reconnectTimeout) {
			clearTimeout(this.#reconnectTimeout);
			this.#reconnectTimeout = undefined;
		}

		// Clear any pending runner lost timeout since we're reconnecting
		if (this.#runnerLostTimeout) {
			clearTimeout(this.#runnerLostTimeout);
			this.#runnerLostTimeout = undefined;
		}

		// Send init message
		const init: protocol.ToServerInit = {
			name: this.#config.runnerName,
			version: this.#config.version,
			totalSlots: this.#config.totalSlots,
			prepopulateActorNames: new Map(
				Object.entries(this.#config.prepopulateActorNames).map(
					([name, data]) => [
						name,
						{ metadata: JSON.stringify(data.metadata) },
					],
				),
			),
			metadata: JSON.stringify(this.#config.metadata),
		};

		this.__sendToServer({
			tag: "ToServerInit",
			val: init,
		});

		// Never leave a previous ack loop running when installing a new one.
		if (this.#ackInterval) {
			clearInterval(this.#ackInterval);
			this.#ackInterval = undefined;
		}

		// Start command acknowledgment interval (5 minutes)
		const ackInterval = 5 * 60 * 1000; // 5 minutes in milliseconds
		const ackLoop = setInterval(() => {
			try {
				// readyState 1 = OPEN
				if (ws.readyState === 1) {
					this.#sendCommandAcknowledgment();
				} else {
					clearInterval(ackLoop);
					// Only reset the field if it still refers to this loop.
					if (this.#ackInterval === ackLoop) {
						this.#ackInterval = undefined;
					}
					this.log?.info({
						msg: "WebSocket not open, stopping ack loop",
					});
				}
			} catch (err) {
				this.log?.error({
					msg: "error in command acknowledgment loop",
					error: stringifyError(err),
				});
			}
		}, ackInterval);
		this.#ackInterval = ackLoop;
	});

	ws.addEventListener("message", async (ev) => {
		// The promise returned by an async listener is ignored by the event
		// emitter, so anything thrown here would become an unhandled
		// rejection. Catch and log instead.
		try {
			let buf: Uint8Array;
			if (ev.data instanceof Blob) {
				buf = new Uint8Array(await ev.data.arrayBuffer());
			} else if (ev.data instanceof ArrayBuffer) {
				buf = new Uint8Array(ev.data);
			} else if (Buffer.isBuffer(ev.data)) {
				buf = new Uint8Array(ev.data);
			} else {
				throw new Error(`expected binary data, got ${typeof ev.data}`);
			}

			// Parse message
			const message = protocol.decodeToClient(buf);
			this.log?.debug({
				msg: "received runner message",
				data: stringifyToClient(message),
			});

			// Handle message
			if (message.tag === "ToClientInit") {
				const init = message.val;

				if (this.runnerId !== init.runnerId) {
					this.runnerId = init.runnerId;

					// Clear actors if runner id changed
					this.#stopAllActors();
				}

				// Store the runner lost threshold from metadata
				this.#runnerLostThreshold = init.metadata?.runnerLostThreshold
					? Number(init.metadata.runnerLostThreshold)
					: undefined;

				this.log?.info({
					msg: "received init",
					runnerLostThreshold: this.#runnerLostThreshold,
				});

				// Resend pending events
				this.#processUnsentKvRequests();
				this.#resendUnacknowledgedEvents();
				this.#tunnel?.resendBufferedEvents();

				this.#config.onConnected();
			} else if (message.tag === "ToClientCommands") {
				const commands = message.val;
				this.#handleCommands(commands);
			} else if (message.tag === "ToClientAckEvents") {
				this.#handleAckEvents(message.val);
			} else if (message.tag === "ToClientKvResponse") {
				const kvResponse = message.val;
				this.#handleKvResponse(kvResponse);
			} else if (message.tag === "ToClientTunnelMessage") {
				this.#tunnel?.handleTunnelMessage(message.val).catch((err) => {
					this.log?.error({
						msg: "error handling tunnel message",
						error: stringifyError(err),
					});
				});
			} else if (message.tag === "ToClientPing") {
				this.__sendToServer({
					tag: "ToServerPong",
					val: {
						ts: message.val.ts,
					},
				});
			} else {
				unreachable(message);
			}
		} catch (err) {
			this.log?.error({
				msg: "error handling runner message",
				error: stringifyError(err),
			});
		}
	});

	ws.addEventListener("error", (ev) => {
		this.log?.error({
			msg: `WebSocket error: ${stringifyError(ev.error)}`,
		});

		if (!this.#shutdown) {
			// Start runner lost timeout if we have a threshold and are not shutting down
			startRunnerLostTimeout();

			// Attempt to reconnect if not stopped
			this.#scheduleReconnect();
		}
	});

	ws.addEventListener("close", async (ev) => {
		if (!this.#shutdown) {
			const closeError = parseWebSocketCloseReason(ev.reason);
			if (
				closeError?.group === "ws" &&
				closeError?.error === "eviction"
			) {
				this.log?.info("runner websocket evicted");

				this.#config.onDisconnected(ev.code, ev.reason);

				// NOTE(review): after this shutdown the code below still arms
				// the runner-lost timeout and asks for a reconnect, as the
				// original did. Confirm that this is intended.
				await this.shutdown(true);
			} else {
				this.log?.warn({
					msg: "runner disconnected",
					code: ev.code,
					reason: ev.reason.toString(),
					closeError,
				});

				this.#config.onDisconnected(ev.code, ev.reason);
			}

			// Clear ack interval on close
			if (this.#ackInterval) {
				clearInterval(this.#ackInterval);
				this.#ackInterval = undefined;
			}

			// Start runner lost timeout if we have a threshold and are not shutting down
			startRunnerLostTimeout();

			// Attempt to reconnect if not stopped
			this.#scheduleReconnect();
		} else {
			this.log?.info("websocket closed");

			this.#config.onDisconnected(ev.code, ev.reason);
		}
	});
}
978
+
979
/**
 * Dispatches a batch of server commands. Each command is handled in the
 * background; failures are logged per command.
 */
#handleCommands(commands: protocol.ToClientCommands) {
	this.log?.info({
		msg: "received commands",
		commandCount: commands.length,
	});

	for (const commandWrapper of commands) {
		const { checkpoint, inner } = commandWrapper;

		switch (inner.tag) {
			case "CommandStartActor": {
				// Spawn background promise
				this.#handleCommandStartActor(commandWrapper).catch((err) => {
					this.log?.error({
						msg: "error handling start actor command",
						actorId: checkpoint.actorId,
						error: stringifyError(err),
					});
				});

				// NOTE: We don't do this for CommandStopActor because the actor
				// will be removed by that call, so we can't update the checkpoint
				const started = this.getActor(
					checkpoint.actorId,
					checkpoint.generation,
				);
				if (started) {
					started.lastCommandIdx = checkpoint.index;
				}
				break;
			}
			case "CommandStopActor": {
				// Spawn background promise
				this.#handleCommandStopActor(commandWrapper).catch((err) => {
					this.log?.error({
						msg: "error handling stop actor command",
						actorId: checkpoint.actorId,
						error: stringifyError(err),
					});
				});
				break;
			}
			default:
				unreachable(inner);
		}
	}
}
1018
+
1019
/**
 * Applies the server's event acknowledgments: each actor with a matching
 * checkpoint is told the acknowledged index. Logs how many buffered events
 * disappeared and re-enables the backlog warning once the backlog is at or
 * below the threshold.
 */
#handleAckEvents(ack: protocol.ToClientAckEvents) {
	/** Total number of events still buffered across all actors. */
	const countBufferedEvents = (): number => {
		let total = 0;
		for (const actor of this.#actors.values()) {
			total += actor.eventHistory.length;
		}
		return total;
	};

	const beforeCount = countBufferedEvents();

	for (const actor of this.#actors.values()) {
		const match = ack.lastEventCheckpoints.find(
			(candidate) => candidate.actorId === actor.actorId,
		);
		if (match) {
			actor.handleAckEvents(match.index);
		}
	}

	const afterCount = countBufferedEvents();
	const prunedCount = beforeCount - afterCount;

	if (prunedCount > 0) {
		this.log?.info({
			msg: "pruned acknowledged events",
			prunedCount,
		});
	}

	if (afterCount <= EVENT_BACKLOG_WARN_THRESHOLD) {
		this.#eventBacklogWarned = false;
	}
}
1050
+
1051
/**
 * Records an event on its actor so it can be resent after a disconnect.
 * Events for unknown actors are dropped. Warns once when the total number
 * of buffered events across all actors exceeds the backlog threshold.
 */
#recordEvent(eventWrapper: protocol.EventWrapper) {
	const owner = this.getActor(eventWrapper.checkpoint.actorId);
	if (!owner) return;

	owner.recordEvent(eventWrapper);

	let backlogSize = 0;
	for (const actor of this.#actors.values()) {
		backlogSize += actor.eventHistory.length;
	}

	const overThreshold = backlogSize > EVENT_BACKLOG_WARN_THRESHOLD;
	if (!overThreshold || this.#eventBacklogWarned) return;

	this.#eventBacklogWarned = true;
	this.log?.warn({
		msg: "unacknowledged event backlog exceeds threshold",
		backlogSize,
		threshold: EVENT_BACKLOG_WARN_THRESHOLD,
	});
}
1075
+
1076
/**
 * Handles a start-actor command: registers a new RunnerActor, reports it as
 * running, then awaits the configured `onActorStart` hook. If the hook
 * throws, the actor's start promise is rejected and the actor is
 * force-stopped.
 *
 * @throws Error when no tunnel is available.
 */
async #handleCommandStartActor(commandWrapper: protocol.CommandWrapper) {
	// IMPORTANT: nothing may await before the actor is inserted into #actors
	// and addRequestToActor has been called, otherwise subsequent commands
	// can race with this start.
	const tunnel = this.#tunnel;
	if (!tunnel) throw new Error("missing tunnel on actor start");

	const startCommand = commandWrapper.inner.val as protocol.CommandStartActor;
	const { actorId, generation } = commandWrapper.checkpoint;
	const { config, hibernatingRequests } = startCommand;

	const actorConfig: ActorConfig = {
		name: config.name,
		key: config.key,
		createTs: config.createTs,
		input: config.input ? new Uint8Array(config.input) : null,
	};

	const startingActor = new RunnerActor(
		actorId,
		generation,
		actorConfig,
		hibernatingRequests,
	);

	const previous = this.#actors.get(actorId);
	if (previous) {
		this.log?.warn({
			msg: "replacing existing actor in actors map",
			actorId,
			existingGeneration: previous.generation,
			newGeneration: generation,
			existingPendingRequests: previous.pendingRequests.length,
		});
	}

	this.#actors.set(actorId, startingActor);

	// NOTE: the request-to-actor mapping must be populated BEFORE any async
	// code runs, so that incoming tunnel messages wait on the actor's
	// actorStartPromise before being processed.
	// TODO: Where is this GC'd if something fails?
	for (const { gatewayId, requestId } of hibernatingRequests) {
		tunnel.addRequestToActor(gatewayId, requestId, actorId);
	}

	this.log?.info({
		msg: "created actor",
		actors: this.#actors.size,
		actorId,
		name: config.name,
		key: config.key,
		generation,
		hibernatingRequests: hibernatingRequests.length,
	});

	this.#sendActorStateUpdate(actorId, generation, "running");

	try {
		// TODO: Add timeout to onActorStart
		this.log?.debug({
			msg: "calling onActorStart",
			actorId,
			generation,
		});
		await this.#config.onActorStart(actorId, generation, actorConfig);

		startingActor.actorStartPromise.resolve();
	} catch (err) {
		this.log?.error({
			msg: "error starting runner actor",
			actorId,
			err,
		});

		startingActor.actorStartPromise.reject(err);

		// TODO: Mark as crashed
		// The start failed, so tear the actor down again.
		await this.forceStopActor(actorId, generation);
	}
}
1162
+
1163
/**
 * Handles a stop-actor command by force-stopping the actor generation named
 * in the command's checkpoint.
 *
 * @param commandWrapper - Command whose checkpoint identifies the actor and
 * generation to stop.
 */
async #handleCommandStopActor(commandWrapper: protocol.CommandWrapper) {
	// The command payload itself is not read; the checkpoint carries
	// everything used here.
	const { actorId, generation } = commandWrapper.checkpoint;

	await this.forceStopActor(actorId, generation);
}
1172
+
1173
/**
 * Records and sends an EventActorIntent ("sleep" or "stop") for the given
 * actor generation. Does nothing when that actor is not found.
 */
#sendActorIntent(
	actorId: string,
	generation: number,
	intentType: "sleep" | "stop",
) {
	const actor = this.getActor(actorId, generation);
	if (!actor) return;

	let intent: protocol.ActorIntent;
	switch (intentType) {
		case "sleep":
			intent = { tag: "ActorIntentSleep", val: null };
			break;
		case "stop":
			intent = { tag: "ActorIntentStop", val: null };
			break;
		default:
			unreachable(intentType);
	}

	// Each event takes the actor's next event index and advances it.
	const eventWrapper: protocol.EventWrapper = {
		checkpoint: { actorId, generation, index: actor.nextEventIdx++ },
		inner: { tag: "EventActorIntent", val: { intent } },
	};

	// Record before sending so the event is retained for resending.
	this.#recordEvent(eventWrapper);
	this.__sendToServer({ tag: "ToServerEvents", val: [eventWrapper] });
}
1217
+
1218
/**
 * Records and sends an EventActorStateUpdate ("running" or "stopped") for
 * the given actor generation. Does nothing when that actor is not found.
 * A "stopped" update always carries StopCode.Ok and a null message.
 */
#sendActorStateUpdate(
	actorId: string,
	generation: number,
	stateType: "running" | "stopped",
) {
	const actor = this.getActor(actorId, generation);
	if (!actor) return;

	let state: protocol.ActorState;
	switch (stateType) {
		case "running":
			state = { tag: "ActorStateRunning", val: null };
			break;
		case "stopped":
			state = {
				tag: "ActorStateStopped",
				val: { code: protocol.StopCode.Ok, message: null },
			};
			break;
		default:
			unreachable(stateType);
	}

	// Each event takes the actor's next event index and advances it.
	const eventWrapper: protocol.EventWrapper = {
		checkpoint: { actorId, generation, index: actor.nextEventIdx++ },
		inner: { tag: "EventActorStateUpdate", val: { state } },
	};

	// Record before sending so the event is retained for resending.
	this.#recordEvent(eventWrapper);
	this.__sendToServer({ tag: "ToServerEvents", val: [eventWrapper] });
}
1265
+
1266
/**
 * Sends a ToServerAckCommands message listing, for every actor, the index
 * of the last command it received. Actors whose `lastCommandIdx` is
 * negative have not received a command yet and are left out.
 */
#sendCommandAcknowledgment() {
	const lastCommandCheckpoints = Array.from(this.#actors.values())
		.filter((actor) => !(actor.lastCommandIdx < 0))
		.map((actor) => ({
			actorId: actor.actorId,
			generation: actor.generation,
			index: actor.lastCommandIdx,
		}));

	this.__sendToServer({
		tag: "ToServerAckCommands",
		val: { lastCommandCheckpoints },
	});
}
1291
+
1292
/**
 * Settles the pending KV request that matches a server response.
 *
 * The request is removed from `#kvRequests`, then rejected when the
 * response is a KvErrorResponse or resolved with the response value
 * otherwise. A response for an unknown request id is logged and ignored.
 */
#handleKvResponse(response: protocol.ToClientKvResponse) {
	const { requestId, data } = response;
	const pending = this.#kvRequests.get(requestId);

	if (!pending) {
		this.log?.error({
			msg: "received kv response for unknown request id",
			requestId,
		});
		return;
	}

	this.#kvRequests.delete(requestId);

	if (data.tag !== "KvErrorResponse") {
		pending.resolve(data.val);
		return;
	}

	pending.reject(new Error(data.val.message || "Unknown KV error"));
}
1314
+
1315
/**
 * Maps a KV get response back onto the order of the requested keys.
 *
 * @param response - Response whose `keys` and `values` are matched up by
 * position.
 * @param requestedKeys - Keys in the order the caller asked for them.
 * @returns One entry per requested key: the value of the first response key
 * with identical bytes, or `null` when no response key matches or the
 * response has no value at the matching position.
 */
#parseGetResponseSimple(
	response: protocol.KvGetResponse,
	requestedKeys: Uint8Array[],
): (Uint8Array | null)[] {
	const responseKeys = response.keys.map((key) => new Uint8Array(key));
	const responseValues = response.values.map(
		(value) => new Uint8Array(value),
	);

	return requestedKeys.map((requestedKey) => {
		const matchIdx = responseKeys.findIndex((responseKey) =>
			this.#keysEqual(requestedKey, responseKey),
		);
		if (matchIdx === -1) return null;

		// If the response carries fewer values than keys, report the key as
		// missing instead of leaking `undefined` into the result.
		return responseValues[matchIdx] ?? null;
	});
}
1349
+
1350
/** Returns true when both keys have the same length and identical bytes. */
#keysEqual(key1: Uint8Array, key2: Uint8Array): boolean {
	return (
		key1.length === key2.length &&
		key1.every((byte, idx) => byte === key2[idx])
	);
}
1357
+
1358
+ //#parseGetResponse(response: protocol.KvGetResponse) {
1359
+ // const keys: string[] = [];
1360
+ // const values: Uint8Array[] = [];
1361
+ // const metadata: { version: Uint8Array; createTs: bigint }[] = [];
1362
+ //
1363
+ // for (const key of response.keys) {
1364
+ // keys.push(new TextDecoder().decode(key));
1365
+ // }
1366
+ //
1367
+ // for (const value of response.values) {
1368
+ // values.push(new Uint8Array(value));
1369
+ // }
1370
+ //
1371
+ // for (const meta of response.metadata) {
1372
+ // metadata.push({
1373
+ // version: new Uint8Array(meta.version),
1374
+ // createTs: meta.createTs,
1375
+ // });
1376
+ // }
1377
+ //
1378
+ // return { keys, values, metadata };
1379
+ //}
1380
+
1381
/**
 * Converts a KV list response into `[key, value]` byte-array pairs, pairing
 * `keys[i]` with `values[i]`. Positions where either side is missing are
 * skipped.
 */
#parseListResponseSimple(
	response: protocol.KvListResponse,
): [Uint8Array, Uint8Array][] {
	const pairs: [Uint8Array, Uint8Array][] = [];

	response.keys.forEach((rawKey, idx) => {
		const rawValue = response.values[idx];
		if (!rawKey || !rawValue) return;

		pairs.push([new Uint8Array(rawKey), new Uint8Array(rawValue)]);
	});

	return pairs;
}
1399
+
1400
+ //#parseListResponse(response: protocol.KvListResponse) {
1401
+ // const keys: string[] = [];
1402
+ // const values: Uint8Array[] = [];
1403
+ // const metadata: { version: Uint8Array; createTs: bigint }[] = [];
1404
+ //
1405
+ // for (const key of response.keys) {
1406
+ // keys.push(new TextDecoder().decode(key));
1407
+ // }
1408
+ //
1409
+ // for (const value of response.values) {
1410
+ // values.push(new Uint8Array(value));
1411
+ // }
1412
+ //
1413
+ // for (const meta of response.metadata) {
1414
+ // metadata.push({
1415
+ // version: new Uint8Array(meta.version),
1416
+ // createTs: meta.createTs,
1417
+ // });
1418
+ // }
1419
+ //
1420
+ // return { keys, values, metadata };
1421
+ //}
1422
+
1423
+ // MARK: KV Operations
1424
/**
 * Reads the values stored under `keys` for an actor.
 *
 * @returns One entry per requested key, in request order; `null` where the
 * response contains no matching key.
 */
async kvGet(
	actorId: string,
	keys: Uint8Array[],
): Promise<(Uint8Array | null)[]> {
	// Copy only the bytes each view covers, since a view may span just part
	// of its underlying buffer.
	const copyBytes = (view: Uint8Array): protocol.KvKey =>
		view.buffer.slice(
			view.byteOffset,
			view.byteOffset + view.byteLength,
		) as ArrayBuffer;

	const requestData: protocol.KvRequestData = {
		tag: "KvGetRequest",
		val: { keys: keys.map((key) => copyBytes(key)) },
	};

	const response = await this.#sendKvRequest(actorId, requestData);
	return this.#parseGetResponseSimple(response, keys);
}
1444
+
1445
/**
 * Lists all key/value pairs of an actor.
 *
 * A falsy `options.reverse` is sent as `null`; `options.limit` is sent as a
 * bigint, or `null` when it is not provided.
 */
async kvListAll(
	actorId: string,
	options?: KvListOptions,
): Promise<[Uint8Array, Uint8Array][]> {
	const reverse = options?.reverse || null;
	const limit = options?.limit === undefined ? null : BigInt(options.limit);

	const requestData: protocol.KvRequestData = {
		tag: "KvListRequest",
		val: {
			query: { tag: "KvListAllQuery", val: null },
			reverse,
			limit,
		},
	};

	const response = await this.#sendKvRequest(actorId, requestData);
	return this.#parseListResponseSimple(response);
}
1462
+
1463
/**
 * Lists the key/value pairs of an actor between `start` and `end`.
 *
 * `exclusive` defaults to `false`. A falsy `options.reverse` is sent as
 * `null`; `options.limit` is sent as a bigint, or `null` when it is not
 * provided.
 */
async kvListRange(
	actorId: string,
	start: Uint8Array,
	end: Uint8Array,
	exclusive?: boolean,
	options?: KvListOptions,
): Promise<[Uint8Array, Uint8Array][]> {
	// Copy only the bytes each view covers, since a view may span just part
	// of its underlying buffer.
	const copyBytes = (view: Uint8Array): protocol.KvKey =>
		view.buffer.slice(
			view.byteOffset,
			view.byteOffset + view.byteLength,
		) as ArrayBuffer;

	const startKey = copyBytes(start);
	const endKey = copyBytes(end);
	const reverse = options?.reverse || null;
	const limit = options?.limit === undefined ? null : BigInt(options.limit);

	const requestData: protocol.KvRequestData = {
		tag: "KvListRequest",
		val: {
			query: {
				tag: "KvListRangeQuery",
				val: {
					start: startKey,
					end: endKey,
					exclusive: exclusive || false,
				},
			},
			reverse,
			limit,
		},
	};

	const response = await this.#sendKvRequest(actorId, requestData);
	return this.#parseListResponseSimple(response);
}
1499
+
1500
/**
 * Lists the key/value pairs of an actor using a prefix query on `prefix`.
 *
 * A falsy `options.reverse` is sent as `null`; `options.limit` is sent as a
 * bigint, or `null` when it is not provided.
 */
async kvListPrefix(
	actorId: string,
	prefix: Uint8Array,
	options?: KvListOptions,
): Promise<[Uint8Array, Uint8Array][]> {
	// Copy only the bytes the view covers, since it may span just part of
	// its underlying buffer.
	const prefixEnd = prefix.byteOffset + prefix.byteLength;
	const prefixKey: protocol.KvKey = prefix.buffer.slice(
		prefix.byteOffset,
		prefixEnd,
	) as ArrayBuffer;

	const reverse = options?.reverse || null;
	const limit = options?.limit === undefined ? null : BigInt(options.limit);

	const requestData: protocol.KvRequestData = {
		tag: "KvListRequest",
		val: {
			query: { tag: "KvListPrefixQuery", val: { key: prefixKey } },
			reverse,
			limit,
		},
	};

	const response = await this.#sendKvRequest(actorId, requestData);
	return this.#parseListResponseSimple(response);
}
1526
+
1527
/**
 * Writes `[key, value]` entries for an actor. Keys and values are sent as
 * two parallel lists in entry order.
 */
async kvPut(
	actorId: string,
	entries: [Uint8Array, Uint8Array][],
): Promise<void> {
	// Copy only the bytes each view covers, since a view may span just part
	// of its underlying buffer.
	const copyBytes = (view: Uint8Array): ArrayBuffer =>
		view.buffer.slice(
			view.byteOffset,
			view.byteOffset + view.byteLength,
		) as ArrayBuffer;

	const keys: protocol.KvKey[] = entries.map(([key]) => copyBytes(key));
	const values: protocol.KvValue[] = entries.map(([, value]) =>
		copyBytes(value),
	);

	const requestData: protocol.KvRequestData = {
		tag: "KvPutRequest",
		val: { keys, values },
	};

	await this.#sendKvRequest(actorId, requestData);
}
1553
+
1554
/** Sends a KvDeleteRequest for the given keys of an actor. */
async kvDelete(actorId: string, keys: Uint8Array[]): Promise<void> {
	// Copy only the bytes each view covers, since a view may span just part
	// of its underlying buffer.
	const copyBytes = (view: Uint8Array): protocol.KvKey =>
		view.buffer.slice(
			view.byteOffset,
			view.byteOffset + view.byteLength,
		) as ArrayBuffer;

	const requestData: protocol.KvRequestData = {
		tag: "KvDeleteRequest",
		val: { keys: keys.map((key) => copyBytes(key)) },
	};

	await this.#sendKvRequest(actorId, requestData);
}
1570
+
1571
/** Sends a KvDropRequest for an actor and waits for the response. */
async kvDrop(actorId: string): Promise<void> {
	const dropRequest: protocol.KvRequestData = {
		tag: "KvDropRequest",
		val: null,
	};
	await this.#sendKvRequest(actorId, dropRequest);
}
1579
+
1580
+ // MARK: Alarm Operations
1581
/**
 * Records and sends an EventActorSetAlarm for an actor. Does nothing when
 * the actor is not found.
 *
 * @param actorId - Actor to set the alarm on.
 * @param alarmTs - Alarm timestamp, sent as a bigint; `null` is sent as-is.
 * @param generation - Optional generation passed to the actor lookup. The
 * event itself always carries the generation of the actor that was found.
 */
setAlarm(actorId: string, alarmTs: number | null, generation?: number) {
	const actor = this.getActor(actorId, generation);
	if (!actor) return;

	const encodedAlarmTs = alarmTs === null ? null : BigInt(alarmTs);

	// Each event takes the actor's next event index and advances it.
	const eventWrapper: protocol.EventWrapper = {
		checkpoint: {
			actorId,
			generation: actor.generation,
			index: actor.nextEventIdx++,
		},
		inner: {
			tag: "EventActorSetAlarm",
			val: { alarmTs: encodedAlarmTs },
		},
	};

	// Record before sending so the event is retained for resending.
	this.#recordEvent(eventWrapper);
	this.__sendToServer({ tag: "ToServerEvents", val: [eventWrapper] });
}
1608
+
1609
/** Clears an actor's alarm by calling `setAlarm` with a `null` timestamp. */
clearAlarm(actorId: string, generation?: number) {
	const noAlarm = null;
	this.setAlarm(actorId, noAlarm, generation);
}
1612
+
1613
/**
 * Registers a KV request under a fresh request id and returns a promise
 * that settles when the entry's `resolve` or `reject` is called.
 *
 * The request is sent immediately when the pegboard WebSocket is ready;
 * otherwise it stays in `#kvRequests` marked as unsent.
 */
#sendKvRequest(
	actorId: string,
	requestData: protocol.KvRequestData,
): Promise<any> {
	return new Promise((resolve, reject) => {
		const requestId = this.#nextKvRequestId++;

		this.#kvRequests.set(requestId, {
			actorId,
			data: requestData,
			resolve,
			reject,
			sent: false,
			timestamp: Date.now(),
		});

		// Not connected: leave the entry queued as unsent.
		if (!this.getPegboardWebSocketIfReady()) return;

		this.#sendSingleKvRequest(requestId);
	});
}
1638
+
1639
/**
 * Sends one pending KV request to the server, then marks it as sent and
 * refreshes its timestamp. Unknown or already-sent requests are ignored.
 * If sending throws, the request is removed and rejected with that error.
 */
#sendSingleKvRequest(requestId: number) {
	const pending = this.#kvRequests.get(requestId);
	if (!pending) return;
	if (pending.sent) return;

	try {
		const kvRequest: protocol.ToServerKvRequest = {
			actorId: pending.actorId,
			requestId,
			data: pending.data,
		};
		this.__sendToServer({ tag: "ToServerKvRequest", val: kvRequest });

		pending.sent = true;
		pending.timestamp = Date.now();
	} catch (error) {
		this.#kvRequests.delete(requestId);
		pending.reject(error);
	}
}
1663
+
1664
/**
 * Sends every queued KV request that has not been sent yet. Does nothing
 * while the pegboard WebSocket is not ready.
 */
#processUnsentKvRequests() {
	if (!this.getPegboardWebSocketIfReady()) return;

	for (const [requestId, request] of this.#kvRequests) {
		if (!request.sent) this.#sendSingleKvRequest(requestId);
	}
}
1681
+
1682
/**
 * Returns the pegboard WebSocket when one exists and its `readyState` is 1
 * (the WebSocket OPEN state); otherwise returns `undefined`. Never throws.
 */
getPegboardWebSocketIfReady(): WebSocket | undefined {
	const socket = this.#pegboardWebSocket;
	if (socket?.readyState !== 1) return undefined;
	return socket;
}
1693
+
1694
/**
 * Encodes a message and sends it over the pegboard WebSocket. When the
 * socket is missing or not open, the message is dropped and an error is
 * logged; nothing is thrown for that case.
 */
__sendToServer(message: protocol.ToServer) {
	this.log?.debug({
		msg: "sending runner message",
		data: stringifyToServer(message),
	});

	const encoded = protocol.encodeToServer(message);

	const socket = this.getPegboardWebSocketIfReady();
	if (!socket) {
		this.log?.error({
			msg: "WebSocket not available or not open for sending data",
		});
		return;
	}

	socket.send(encoded);
}
1710
+
1711
/**
 * Forwards a hibernatable WebSocket message acknowledgement to the tunnel.
 *
 * @throws Error when no tunnel is available.
 */
sendHibernatableWebSocketMessageAck(
	gatewayId: ArrayBuffer,
	requestId: ArrayBuffer,
	index: number,
) {
	const tunnel = this.#tunnel;
	if (!tunnel) {
		throw new Error("missing tunnel to send message ack");
	}

	tunnel.sendHibernatableWebSocketMessageAck(gatewayId, requestId, index);
}
1724
+
1725
+ /**
1726
+ * Restores hibernatable WebSocket connections for an actor.
1727
+ *
1728
+ * This method should be called at the end of `onActorStart` after the
1729
+ * actor instance is fully initialized.
1730
+ *
1731
+ * This method will:
1732
+ * - Restore all provided hibernatable WebSocket connections
1733
+ * - Attach event listeners to the restored WebSockets
1734
+ * - Close any WebSocket connections that failed to restore
1735
+ *
1736
+ * The provided metadata list should include all hibernatable WebSockets
1737
+ * that were persisted for this actor. The gateway will automatically
1738
+ * close any connections that are not restored (i.e., not included in
1739
+ * this list).
1740
+ *
1741
+ * **Important:** This method must be called after `onActorStart` completes
1742
+ * and before marking the actor as "ready" to ensure all hibernatable
1743
+ * connections are fully restored.
1744
+ *
1745
+ * @param actorId - The ID of the actor to restore connections for
1746
+ * @param metaEntries - Array of hibernatable WebSocket metadata to restore
1747
+ */
1748
async restoreHibernatingRequests(
	actorId: string,
	metaEntries: HibernatingWebSocketMetadata[],
) {
	const tunnel = this.#tunnel;
	if (!tunnel) {
		throw new Error("missing tunnel to restore hibernating requests");
	}

	// The restoration itself is delegated entirely to the tunnel.
	await tunnel.restoreHibernatingRequests(actorId, metaEntries);
}
1756
+
1757
/**
 * Builds the serverless init packet: a 2-byte little-endian
 * PROTOCOL_VERSION prefix followed by the encoded ToServerlessServerInit
 * message, returned as a base64 string.
 *
 * @returns The packet, or `undefined` when `runnerId` is not set.
 */
getServerlessInitPacket(): string | undefined {
	const runnerId = this.runnerId;
	if (!runnerId) return undefined;

	const payload = protocol.encodeToServerlessServer({
		tag: "ToServerlessServerInit",
		val: {
			runnerId,
			runnerProtocolVersion: PROTOCOL_VERSION,
		},
	});

	// Layout: [uint16 LE version][payload bytes]
	const versionPrefixBytes = 2;
	const packet = Buffer.alloc(payload.length + versionPrefixBytes);
	packet.writeUInt16LE(PROTOCOL_VERSION, 0);
	Buffer.from(payload).copy(packet, versionPrefixBytes);

	return packet.toString("base64");
}
1775
+
1776
/**
 * Schedules a reconnect of the pegboard WebSocket after a backoff delay
 * computed from the current attempt count. Any previously scheduled
 * reconnect is cancelled first. Does nothing once the runner is shut down,
 * both at scheduling time and when the timer fires.
 */
#scheduleReconnect() {
	if (this.#shutdown) {
		this.log?.debug({
			msg: "Runner is shut down, not attempting reconnect",
		});
		return;
	}

	const backoffOptions = {
		initialDelay: 1000,
		maxDelay: 30000,
		multiplier: 2,
		jitter: true,
	};
	const delay = calculateBackoff(this.#reconnectAttempt, backoffOptions);

	this.log?.debug({
		msg: `Scheduling reconnect attempt ${this.#reconnectAttempt + 1} in ${delay}ms`,
	});

	if (this.#reconnectTimeout) {
		this.log?.info(
			"clearing previous reconnect timeout in schedule reconnect",
		);
		clearTimeout(this.#reconnectTimeout);
	}

	const attemptReconnect = () => {
		// The runner may have been shut down while the timer was pending.
		if (this.#shutdown) return;

		this.#reconnectAttempt++;
		this.log?.debug({
			msg: `Attempting to reconnect (attempt ${this.#reconnectAttempt})...`,
		});
		this.#openPegboardWebSocket().catch((err) => {
			this.log?.error({
				msg: "error during websocket reconnection",
				error: stringifyError(err),
			});
		});
	};

	this.#reconnectTimeout = setTimeout(attemptReconnect, delay);
}
1817
+
1818
/**
 * Resends every event still retained in the actors' `eventHistory` in a
 * single ToServerEvents message. Does nothing when no events are retained.
 */
#resendUnacknowledgedEvents() {
	const eventsToResend: protocol.EventWrapper[] = [];

	for (const actor of this.#actors.values()) {
		// Append one event at a time. `push(...history)` passes every element
		// as a separate call argument and can throw a RangeError once a
		// history grows large.
		for (const event of actor.eventHistory) {
			eventsToResend.push(event);
		}
	}

	if (eventsToResend.length === 0) return;

	this.log?.info({
		msg: "resending unacknowledged events",
		count: eventsToResend.length,
	});

	// All retained events go out in one message.
	this.__sendToServer({
		tag: "ToServerEvents",
		val: eventsToResend,
	});
}
1838
+
1839
/**
 * Rejects and removes every KV request whose timestamp is older than
 * KV_EXPIRE milliseconds. The timestamp is set when a request is created
 * and refreshed when it is sent.
 */
#cleanupOldKvRequests() {
	const expiryCutoff = Date.now() - KV_EXPIRE;

	// Removing the current entry while iterating a Map is safe, so no
	// separate collect-then-delete pass is needed.
	for (const [requestId, request] of this.#kvRequests) {
		if (request.timestamp < expiryCutoff) {
			this.#kvRequests.delete(requestId);
			request.reject(
				new Error(
					"KV request timed out waiting for WebSocket connection",
				),
			);
		}
	}
}
1862
+ }