@rivetkit/engine-runner 2.0.25-rc.2 → 2.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rivetkit/engine-runner",
3
- "version": "2.0.25-rc.2",
3
+ "version": "2.0.26",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  "import": {
@@ -16,7 +16,7 @@
16
16
  "uuid": "^12.0.0",
17
17
  "pino": "^9.9.5",
18
18
  "ws": "^8.18.3",
19
- "@rivetkit/engine-runner-protocol": "2.0.25-rc.2"
19
+ "@rivetkit/engine-runner-protocol": "2.0.26"
20
20
  },
21
21
  "devDependencies": {
22
22
  "@types/node": "^22.18.1",
package/src/actor.ts CHANGED
@@ -27,6 +27,10 @@ export class RunnerActor {
27
27
  }> = [];
28
28
  actorStartPromise: ReturnType<typeof promiseWithResolvers<void>>;
29
29
 
30
+ lastCommandIdx: bigint = -1n;
31
+ nextEventIdx: bigint = 0n;
32
+ eventHistory: protocol.EventWrapper[] = [];
33
+
30
34
  /**
31
35
  * If restoreHibernatingRequests has been called. This is used to assert
32
36
  * that the caller is implemented correctly.
@@ -81,8 +85,8 @@ export class RunnerActor {
81
85
  gatewayId,
82
86
  requestId,
83
87
  request: {
84
- resolve: () => {},
85
- reject: () => {},
88
+ resolve: () => { },
89
+ reject: () => { },
86
90
  actorId: this.actorId,
87
91
  gatewayId: gatewayId,
88
92
  requestId: requestId,
@@ -118,8 +122,8 @@ export class RunnerActor {
118
122
  gatewayId,
119
123
  requestId,
120
124
  request: {
121
- resolve: () => {},
122
- reject: () => {},
125
+ resolve: () => { },
126
+ reject: () => { },
123
127
  actorId: this.actorId,
124
128
  gatewayId: gatewayId,
125
129
  requestId: requestId,
@@ -193,4 +197,14 @@ export class RunnerActor {
193
197
  this.webSockets.splice(index, 1);
194
198
  }
195
199
  }
200
+
201
+ handleAckEvents(lastEventIdx: bigint) {
202
+ this.eventHistory = this.eventHistory.filter(
203
+ (event) => event.checkpoint.index > lastEventIdx,
204
+ );
205
+ }
206
+
207
+ recordEvent(eventWrapper: protocol.EventWrapper) {
208
+ this.eventHistory.push(eventWrapper);
209
+ }
196
210
  }
package/src/mod.ts CHANGED
@@ -8,6 +8,7 @@ import { type HibernatingWebSocketMetadata, Tunnel } from "./tunnel";
8
8
  import {
9
9
  calculateBackoff,
10
10
  parseWebSocketCloseReason,
11
+ stringifyError,
11
12
  unreachable,
12
13
  } from "./utils";
13
14
  import { importWebSocket } from "./websocket.js";
@@ -17,13 +18,18 @@ export { RunnerActor, type ActorConfig };
17
18
  export { idToStr } from "./utils";
18
19
 
19
20
  const KV_EXPIRE: number = 30_000;
20
- const PROTOCOL_VERSION: number = 3;
21
- const RUNNER_PING_INTERVAL = 3_000;
21
+ const PROTOCOL_VERSION: number = 4;
22
22
 
23
23
  /** Warn once the backlog significantly exceeds the server's ack batch size. */
24
24
  const EVENT_BACKLOG_WARN_THRESHOLD = 10_000;
25
25
  const SIGNAL_HANDLERS: (() => void | Promise<void>)[] = [];
26
26
 
27
+ export class RunnerShutdownError extends Error {
28
+ constructor() {
29
+ super("Runner shut down");
30
+ }
31
+ }
32
+
27
33
  export interface RunnerConfig {
28
34
  logger?: Logger;
29
35
  version: number;
@@ -197,9 +203,6 @@ export class Runner {
197
203
  // WebSocket
198
204
  __pegboardWebSocket?: WebSocket;
199
205
  runnerId?: string;
200
- #lastCommandIdx: number = -1;
201
- #pingLoop?: NodeJS.Timeout;
202
- #nextEventIdx: bigint = 0n;
203
206
  #started: boolean = false;
204
207
  #shutdown: boolean = false;
205
208
  #shuttingDown: boolean = false;
@@ -211,7 +214,6 @@ export class Runner {
211
214
  #runnerLostTimeout?: NodeJS.Timeout;
212
215
 
213
216
  // Event storage for resending
214
- #eventHistory: protocol.EventWrapper[] = [];
215
217
  #eventBacklogWarned: boolean = false;
216
218
 
217
219
  // Command acknowledgment
@@ -255,7 +257,14 @@ export class Runner {
255
257
 
256
258
  // Start cleaning up old unsent KV requests every 15 seconds
257
259
  this.#kvCleanupInterval = setInterval(() => {
258
- this.#cleanupOldKvRequests();
260
+ try {
261
+ this.#cleanupOldKvRequests();
262
+ } catch (err) {
263
+ this.log?.error({
264
+ msg: "error cleaning up kv requests",
265
+ error: stringifyError(err),
266
+ });
267
+ }
259
268
  }, 15000); // Run every 15 seconds
260
269
  }
261
270
 
@@ -307,14 +316,31 @@ export class Runner {
307
316
  this.#sendActorStateUpdate(actorId, actor.generation, "stopped");
308
317
  }
309
318
 
310
- #stopAllActors() {
319
+ #handleLost() {
311
320
  this.log?.info({
312
- msg: "stopping all actors due to runner lost threshold exceeded",
321
+ msg: "stopping all actors due to runner lost threshold",
313
322
  });
314
323
 
324
+ // Remove all remaining kv requests
325
+ for (const [_, request] of this.#kvRequests.entries()) {
326
+ request.reject(new RunnerShutdownError());
327
+ }
328
+
329
+ this.#kvRequests.clear();
330
+
331
+ this.#stopAllActors();
332
+ }
333
+
334
+ #stopAllActors() {
315
335
  const actorIds = Array.from(this.#actors.keys());
316
336
  for (const actorId of actorIds) {
317
- this.forceStopActor(actorId);
337
+ this.forceStopActor(actorId).catch((err) => {
338
+ this.log?.error({
339
+ msg: "error stopping actor",
340
+ actorId,
341
+ error: stringifyError(err),
342
+ });
343
+ });
318
344
  }
319
345
  }
320
346
 
@@ -477,12 +503,6 @@ export class Runner {
477
503
  this.#runnerLostTimeout = undefined;
478
504
  }
479
505
 
480
- // Clear ping loop
481
- if (this.#pingLoop) {
482
- clearInterval(this.#pingLoop);
483
- this.#pingLoop = undefined;
484
- }
485
-
486
506
  // Clear ack interval
487
507
  if (this.#ackInterval) {
488
508
  clearInterval(this.#ackInterval);
@@ -738,10 +758,6 @@ export class Runner {
738
758
  name: this.#config.runnerName,
739
759
  version: this.#config.version,
740
760
  totalSlots: this.#config.totalSlots,
741
- lastCommandIdx:
742
- this.#lastCommandIdx >= 0
743
- ? BigInt(this.#lastCommandIdx)
744
- : null,
745
761
  prepopulateActorNames: new Map(
746
762
  Object.entries(this.#config.prepopulateActorNames).map(
747
763
  ([name, data]) => [
@@ -758,33 +774,22 @@ export class Runner {
758
774
  val: init,
759
775
  });
760
776
 
761
- // Start ping interval
762
- const pingLoop = setInterval(() => {
763
- if (ws.readyState === 1) {
764
- this.__sendToServer({
765
- tag: "ToServerPing",
766
- val: {
767
- ts: BigInt(Date.now()),
768
- },
769
- });
770
- } else {
771
- clearInterval(pingLoop);
772
- this.log?.info({
773
- msg: "WebSocket not open, stopping ping loop",
774
- });
775
- }
776
- }, RUNNER_PING_INTERVAL);
777
- this.#pingLoop = pingLoop;
778
-
779
777
  // Start command acknowledgment interval (5 minutes)
780
778
  const ackInterval = 5 * 60 * 1000; // 5 minutes in milliseconds
781
779
  const ackLoop = setInterval(() => {
782
- if (ws.readyState === 1) {
783
- this.#sendCommandAcknowledgment();
784
- } else {
785
- clearInterval(ackLoop);
786
- this.log?.info({
787
- msg: "WebSocket not open, stopping ack loop",
780
+ try {
781
+ if (ws.readyState === 1) {
782
+ this.#sendCommandAcknowledgment();
783
+ } else {
784
+ clearInterval(ackLoop);
785
+ this.log?.info({
786
+ msg: "WebSocket not open, stopping ack loop",
787
+ });
788
+ }
789
+ } catch (err) {
790
+ this.log?.error({
791
+ msg: "error in command acknowledgment loop",
792
+ error: stringifyError(err),
788
793
  });
789
794
  }
790
795
  }, ackInterval);
@@ -815,8 +820,8 @@ export class Runner {
815
820
  if (this.runnerId !== init.runnerId) {
816
821
  this.runnerId = init.runnerId;
817
822
 
818
- // Clear history if runner id changed
819
- this.#eventHistory.length = 0;
823
+ // Clear actors if runner id changed
824
+ this.#stopAllActors();
820
825
  }
821
826
 
822
827
  // Store the runner lost threshold from metadata
@@ -826,13 +831,12 @@ export class Runner {
826
831
 
827
832
  this.log?.info({
828
833
  msg: "received init",
829
- lastEventIdx: init.lastEventIdx,
830
834
  runnerLostThreshold: this.#runnerLostThreshold,
831
835
  });
832
836
 
833
837
  // Resend pending events
834
838
  this.#processUnsentKvRequests();
835
- this.#resendUnacknowledgedEvents(init.lastEventIdx);
839
+ this.#resendUnacknowledgedEvents();
836
840
  this.#tunnel?.resendBufferedEvents();
837
841
 
838
842
  this.#config.onConnected();
@@ -845,10 +849,19 @@ export class Runner {
845
849
  const kvResponse = message.val;
846
850
  this.#handleKvResponse(kvResponse);
847
851
  } else if (message.tag === "ToClientTunnelMessage") {
848
- this.#tunnel?.handleTunnelMessage(message.val);
849
- } else if (message.tag === "ToClientClose") {
850
- this.#tunnel?.shutdown();
851
- ws.close(1000, "manual closure");
852
+ this.#tunnel?.handleTunnelMessage(message.val).catch((err) => {
853
+ this.log?.error({
854
+ msg: "error handling tunnel message",
855
+ error: stringifyError(err),
856
+ });
857
+ });
858
+ } else if (message.tag === "ToClientPing") {
859
+ this.__sendToServer({
860
+ tag: "ToServerPong",
861
+ val: {
862
+ ts: message.val.ts,
863
+ },
864
+ });
852
865
  } else {
853
866
  unreachable(message);
854
867
  }
@@ -871,7 +884,14 @@ export class Runner {
871
884
  seconds: this.#runnerLostThreshold / 1000,
872
885
  });
873
886
  this.#runnerLostTimeout = setTimeout(() => {
874
- this.#stopAllActors();
887
+ try {
888
+ this.#handleLost();
889
+ } catch (err) {
890
+ this.log?.error({
891
+ msg: "error handling runner lost",
892
+ error: stringifyError(err),
893
+ });
894
+ }
875
895
  }, this.#runnerLostThreshold);
876
896
  }
877
897
 
@@ -909,12 +929,6 @@ export class Runner {
909
929
  this.#config.onDisconnected(ev.code, ev.reason);
910
930
  }
911
931
 
912
- // Clear ping loop on close
913
- if (this.#pingLoop) {
914
- clearInterval(this.#pingLoop);
915
- this.#pingLoop = undefined;
916
- }
917
-
918
932
  // Clear ack interval on close
919
933
  if (this.#ackInterval) {
920
934
  clearInterval(this.#ackInterval);
@@ -933,7 +947,14 @@ export class Runner {
933
947
  seconds: this.#runnerLostThreshold / 1000,
934
948
  });
935
949
  this.#runnerLostTimeout = setTimeout(() => {
936
- this.#stopAllActors();
950
+ try {
951
+ this.#handleLost();
952
+ } catch (err) {
953
+ this.log?.error({
954
+ msg: "error handling runner lost",
955
+ error: stringifyError(err),
956
+ });
957
+ }
937
958
  }, this.#runnerLostThreshold);
938
959
  }
939
960
 
@@ -952,52 +973,86 @@ export class Runner {
952
973
  for (const commandWrapper of commands) {
953
974
  if (commandWrapper.inner.tag === "CommandStartActor") {
954
975
  // Spawn background promise
955
- this.#handleCommandStartActor(commandWrapper);
976
+ this.#handleCommandStartActor(commandWrapper).catch((err) => {
977
+ this.log?.error({
978
+ msg: "error handling start actor command",
979
+ actorId: commandWrapper.checkpoint.actorId,
980
+ error: stringifyError(err),
981
+ });
982
+ });
956
983
  } else if (commandWrapper.inner.tag === "CommandStopActor") {
957
984
  // Spawn background promise
958
- this.#handleCommandStopActor(commandWrapper);
985
+ this.#handleCommandStopActor(commandWrapper).catch((err) => {
986
+ this.log?.error({
987
+ msg: "error handling stop actor command",
988
+ actorId: commandWrapper.checkpoint.actorId,
989
+ error: stringifyError(err),
990
+ });
991
+ });
959
992
  } else {
960
993
  unreachable(commandWrapper.inner);
961
994
  }
962
995
 
963
- this.#lastCommandIdx = Number(commandWrapper.index);
996
+ const actor = this.getActor(
997
+ commandWrapper.checkpoint.actorId,
998
+ commandWrapper.inner.val.generation,
999
+ );
1000
+ if (actor) actor.lastCommandIdx = commandWrapper.checkpoint.index;
964
1001
  }
965
1002
  }
966
1003
 
967
1004
  #handleAckEvents(ack: protocol.ToClientAckEvents) {
968
- const lastAckedIdx = ack.lastEventIdx;
1005
+ let originalTotalEvents = Array.from(this.#actors).reduce(
1006
+ (s, [_, actor]) => s + actor.eventHistory.length,
1007
+ 0,
1008
+ );
1009
+
1010
+ for (const [_, actor] of this.#actors) {
1011
+ let checkpoint = ack.lastEventCheckpoints.find(
1012
+ (x) => x.actorId == actor.actorId,
1013
+ );
1014
+
1015
+ if (checkpoint) actor.handleAckEvents(checkpoint.index);
1016
+ }
969
1017
 
970
- const originalLength = this.#eventHistory.length;
971
- this.#eventHistory = this.#eventHistory.filter(
972
- (event) => event.index > lastAckedIdx,
1018
+ const totalEvents = Array.from(this.#actors).reduce(
1019
+ (s, [_, actor]) => s + actor.eventHistory.length,
1020
+ 0,
973
1021
  );
1022
+ const prunedCount = originalTotalEvents - totalEvents;
974
1023
 
975
- const prunedCount = originalLength - this.#eventHistory.length;
976
1024
  if (prunedCount > 0) {
977
1025
  this.log?.info({
978
1026
  msg: "pruned acknowledged events",
979
- lastAckedIdx: lastAckedIdx.toString(),
980
1027
  prunedCount,
981
1028
  });
982
1029
  }
983
1030
 
984
- if (this.#eventHistory.length <= EVENT_BACKLOG_WARN_THRESHOLD) {
1031
+ if (totalEvents <= EVENT_BACKLOG_WARN_THRESHOLD) {
985
1032
  this.#eventBacklogWarned = false;
986
1033
  }
987
1034
  }
988
1035
 
989
1036
  /** Track events to send to the server in case we need to resend it on disconnect. */
990
1037
  #recordEvent(eventWrapper: protocol.EventWrapper) {
991
- this.#eventHistory.push(eventWrapper);
1038
+ const actor = this.getActor(eventWrapper.checkpoint.actorId);
1039
+ if (!actor) return;
1040
+
1041
+ actor.recordEvent(eventWrapper);
1042
+
1043
+ let totalEvents = Array.from(this.#actors).reduce(
1044
+ (s, [_, actor]) => s + actor.eventHistory.length,
1045
+ 0,
1046
+ );
992
1047
 
993
1048
  if (
994
- this.#eventHistory.length > EVENT_BACKLOG_WARN_THRESHOLD &&
1049
+ totalEvents > EVENT_BACKLOG_WARN_THRESHOLD &&
995
1050
  !this.#eventBacklogWarned
996
1051
  ) {
997
1052
  this.#eventBacklogWarned = true;
998
1053
  this.log?.warn({
999
1054
  msg: "unacknowledged event backlog exceeds threshold",
1000
- backlogSize: this.#eventHistory.length,
1055
+ backlogSize: totalEvents,
1001
1056
  threshold: EVENT_BACKLOG_WARN_THRESHOLD,
1002
1057
  });
1003
1058
  }
@@ -1013,7 +1068,7 @@ export class Runner {
1013
1068
  const startCommand = commandWrapper.inner
1014
1069
  .val as protocol.CommandStartActor;
1015
1070
 
1016
- const actorId = startCommand.actorId;
1071
+ const actorId = commandWrapper.checkpoint.actorId;
1017
1072
  const generation = startCommand.generation;
1018
1073
  const config = startCommand.config;
1019
1074
 
@@ -1094,7 +1149,7 @@ export class Runner {
1094
1149
  const stopCommand = commandWrapper.inner
1095
1150
  .val as protocol.CommandStopActor;
1096
1151
 
1097
- const actorId = stopCommand.actorId;
1152
+ const actorId = commandWrapper.checkpoint.actorId;
1098
1153
  const generation = stopCommand.generation;
1099
1154
 
1100
1155
  await this.forceStopActor(actorId, generation);
@@ -1105,6 +1160,9 @@ export class Runner {
1105
1160
  generation: number,
1106
1161
  intentType: "sleep" | "stop",
1107
1162
  ) {
1163
+ const actor = this.getActor(actorId, generation);
1164
+ if (!actor) return;
1165
+
1108
1166
  let actorIntent: protocol.ActorIntent;
1109
1167
 
1110
1168
  if (intentType === "sleep") {
@@ -1124,9 +1182,11 @@ export class Runner {
1124
1182
  intent: actorIntent,
1125
1183
  };
1126
1184
 
1127
- const eventIndex = this.#nextEventIdx++;
1128
1185
  const eventWrapper: protocol.EventWrapper = {
1129
- index: eventIndex,
1186
+ checkpoint: {
1187
+ actorId,
1188
+ index: actor.nextEventIdx++,
1189
+ },
1130
1190
  inner: {
1131
1191
  tag: "EventActorIntent",
1132
1192
  val: intentEvent,
@@ -1146,6 +1206,9 @@ export class Runner {
1146
1206
  generation: number,
1147
1207
  stateType: "running" | "stopped",
1148
1208
  ) {
1209
+ const actor = this.getActor(actorId, generation);
1210
+ if (!actor) return;
1211
+
1149
1212
  let actorState: protocol.ActorState;
1150
1213
 
1151
1214
  if (stateType === "running") {
@@ -1168,9 +1231,11 @@ export class Runner {
1168
1231
  state: actorState,
1169
1232
  };
1170
1233
 
1171
- const eventIndex = this.#nextEventIdx++;
1172
1234
  const eventWrapper: protocol.EventWrapper = {
1173
- index: eventIndex,
1235
+ checkpoint: {
1236
+ actorId,
1237
+ index: actor.nextEventIdx++,
1238
+ },
1174
1239
  inner: {
1175
1240
  tag: "EventActorStateUpdate",
1176
1241
  val: stateUpdateEvent,
@@ -1186,9 +1251,18 @@ export class Runner {
1186
1251
  }
1187
1252
 
1188
1253
  #sendCommandAcknowledgment() {
1189
- if (this.#lastCommandIdx < 0) {
1190
- // No commands received yet, nothing to acknowledge
1191
- return;
1254
+ const lastCommandCheckpoints = [];
1255
+
1256
+ for (const [_, actor] of this.#actors) {
1257
+ if (actor.lastCommandIdx < 0) {
1258
+ // No commands received yet, nothing to acknowledge
1259
+ continue;
1260
+ }
1261
+
1262
+ lastCommandCheckpoints.push({
1263
+ actorId: actor.actorId,
1264
+ index: actor.lastCommandIdx,
1265
+ });
1192
1266
  }
1193
1267
 
1194
1268
  //this.#log?.log("Sending command acknowledgment", this.#lastCommandIdx);
@@ -1196,7 +1270,7 @@ export class Runner {
1196
1270
  this.__sendToServer({
1197
1271
  tag: "ToServerAckCommands",
1198
1272
  val: {
1199
- lastCommandIdx: BigInt(this.#lastCommandIdx),
1273
+ lastCommandCheckpoints,
1200
1274
  },
1201
1275
  });
1202
1276
  }
@@ -1500,9 +1574,11 @@ export class Runner {
1500
1574
  alarmTs: alarmTs !== null ? BigInt(alarmTs) : null,
1501
1575
  };
1502
1576
 
1503
- const eventIndex = this.#nextEventIdx++;
1504
1577
  const eventWrapper: protocol.EventWrapper = {
1505
- index: eventIndex,
1578
+ checkpoint: {
1579
+ actorId,
1580
+ index: actor.nextEventIdx++,
1581
+ },
1506
1582
  inner: {
1507
1583
  tag: "EventActorSetAlarm",
1508
1584
  val: alarmEvent,
@@ -1669,6 +1745,7 @@ export class Runner {
1669
1745
  tag: "ToServerlessServerInit",
1670
1746
  val: {
1671
1747
  runnerId: this.runnerId,
1748
+ runnerProtocolVersion: PROTOCOL_VERSION,
1672
1749
  },
1673
1750
  });
1674
1751
 
@@ -1699,27 +1776,34 @@ export class Runner {
1699
1776
  msg: `Scheduling reconnect attempt ${this.#reconnectAttempt + 1} in ${delay}ms`,
1700
1777
  });
1701
1778
 
1702
- this.#reconnectTimeout = setTimeout(async () => {
1779
+ this.#reconnectTimeout = setTimeout(() => {
1703
1780
  if (!this.#shutdown) {
1704
1781
  this.#reconnectAttempt++;
1705
1782
  this.log?.debug({
1706
1783
  msg: `Attempting to reconnect (attempt ${this.#reconnectAttempt})...`,
1707
1784
  });
1708
- await this.#openPegboardWebSocket();
1785
+ this.#openPegboardWebSocket().catch((err) => {
1786
+ this.log?.error({
1787
+ msg: "error during websocket reconnection",
1788
+ error: stringifyError(err),
1789
+ });
1790
+ });
1709
1791
  }
1710
1792
  }, delay);
1711
1793
  }
1712
1794
 
1713
- #resendUnacknowledgedEvents(lastEventIdx: bigint) {
1714
- const eventsToResend = this.#eventHistory.filter(
1715
- (event) => event.index > lastEventIdx,
1716
- );
1795
+ #resendUnacknowledgedEvents() {
1796
+ const eventsToResend = [];
1797
+
1798
+ for (const [_, actor] of this.#actors) {
1799
+ eventsToResend.push(...actor.eventHistory);
1800
+ }
1717
1801
 
1718
1802
  if (eventsToResend.length === 0) return;
1719
1803
 
1720
1804
  this.log?.info({
1721
1805
  msg: "resending unacknowledged events",
1722
- fromIndex: lastEventIdx + 1n,
1806
+ count: eventsToResend.length,
1723
1807
  });
1724
1808
 
1725
1809
  // Resend events in batches