@openfn/ws-worker 1.14.2 → 1.14.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # ws-worker
2
2
 
3
+ ## 1.14.4
4
+
5
+ ### Patch Changes
6
+
7
+ - 5a613e7: Adjust logging
8
+ - 3a33557: Count outstanding claim requests as capacity. This fixes an issue where `work-available` messages can cause a worker to over-claim, particularly during periods of high load on the Lightning database.
9
+
10
+ ## 1.14.3
11
+
12
+ ### Patch Changes
13
+
14
+ - 032430f: Increase timeout on claim events
15
+ - 0957412: Stop trying to claim after the queue has been closed
16
+
3
17
  ## 1.14.2
4
18
 
5
19
  ### Patch Changes
package/dist/index.d.ts CHANGED
@@ -78,6 +78,7 @@ type ServerOptions = {
78
78
  sentryEnv?: string;
79
79
  socketTimeoutSeconds?: number;
80
80
  messageTimeoutSeconds?: number;
81
+ claimTimeoutSeconds?: number;
81
82
  payloadLimitMb?: number;
82
83
  collectionsVersion?: string;
83
84
  collectionsUrl?: string;
@@ -88,6 +89,7 @@ interface ServerApp extends Koa {
88
89
  socket?: any;
89
90
  queueChannel?: Channel;
90
91
  workflows: Record<string, true | Context>;
92
+ openClaims: Record<string, number>;
91
93
  destroyed: boolean;
92
94
  events: EventEmitter;
93
95
  server: Server;
package/dist/index.js CHANGED
@@ -140,14 +140,25 @@ var ClaimError = class extends Error {
140
140
  this.abort = true;
141
141
  }
142
142
  };
143
+ var claimIdGen = 0;
143
144
  var claim = (app, logger = mockLogger, options = {}) => {
144
145
  return new Promise((resolve, reject) => {
145
- const { maxWorkers = 5 } = options;
146
+ app.openClaims ??= {};
147
+ const { maxWorkers = 5, demand = 1 } = options;
146
148
  const podName = NAME ? `[${NAME}] ` : "";
147
149
  const activeWorkers = Object.keys(app.workflows).length;
150
+ const pendingClaims = Object.values(app.openClaims).reduce(
151
+ (a, b) => a + b,
152
+ 0
153
+ );
148
154
  if (activeWorkers >= maxWorkers) {
149
155
  app.workloop?.stop(`server at capacity (${activeWorkers}/${maxWorkers})`);
150
156
  return reject(new ClaimError("Server at capacity"));
157
+ } else if (activeWorkers + pendingClaims >= maxWorkers) {
158
+ app.workloop?.stop(
159
+ `server at capacity (${activeWorkers}/${maxWorkers}, ${pendingClaims} pending)`
160
+ );
161
+ return reject(new ClaimError("Server at capacity"));
151
162
  }
152
163
  if (!app.queueChannel) {
153
164
  logger.warn("skipping claim attempt: websocket unavailable");
@@ -159,12 +170,15 @@ var claim = (app, logger = mockLogger, options = {}) => {
159
170
  logger.warn("skipping claim attempt: channel closed");
160
171
  return reject(e);
161
172
  }
173
+ const claimId = ++claimIdGen;
174
+ app.openClaims[claimId] = demand;
162
175
  logger.debug(`requesting run (capacity ${activeWorkers}/${maxWorkers})`);
163
176
  const start = Date.now();
164
177
  app.queueChannel.push(CLAIM, {
165
- demand: 1,
178
+ demand,
166
179
  worker_name: NAME || null
167
180
  }).receive("ok", ({ runs }) => {
181
+ delete app.openClaims[claimId];
168
182
  const duration = Date.now() - start;
169
183
  logger.debug(
170
184
  `${podName}claimed ${runs.length} runs in ${duration}ms (${runs.length ? runs.map((r) => r.id).join(",") : "-"})`
@@ -192,8 +206,11 @@ var claim = (app, logger = mockLogger, options = {}) => {
192
206
  resolve();
193
207
  });
194
208
  }).receive("error", (e) => {
209
+ delete app.openClaims[claimId];
195
210
  logger.error("Error on claim", e);
211
+ reject(new Error("claim error"));
196
212
  }).receive("timeout", () => {
213
+ delete app.openClaims[claimId];
197
214
  logger.error("TIMEOUT on claim. Runs may be lost.");
198
215
  reject(new Error("timeout"));
199
216
  });
@@ -1015,9 +1032,8 @@ var joinRunChannel = (socket, token, runId, logger, timeout = 30) => {
1015
1032
  channel.onClose(() => {
1016
1033
  logger.debug(`Leaving ${channelName}`);
1017
1034
  });
1018
- channel.onError((e) => {
1019
- logger.debug(`Error in ${channelName}`);
1020
- logger.debug(e);
1035
+ channel.onError((...args) => {
1036
+ logger.debug(`Critical error in channel ${channelName}`, args);
1021
1037
  });
1022
1038
  });
1023
1039
  };
@@ -1052,7 +1068,16 @@ var generateWorkerToken = async (secret, workerId, logger) => {
1052
1068
  var worker_token_default = generateWorkerToken;
1053
1069
 
1054
1070
  // src/channels/worker-queue.ts
1055
- var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger, SocketConstructor = PhxSocket) => {
1071
+ var DEFAULT_MESSAGE_TIMEOUT_SECONDS = 30;
1072
+ var DEFAULT_CLAIM_TIMEOUT_SECONDS = 60 * 60;
1073
+ var connectToWorkerQueue = (endpoint, serverId, secret, logger, options) => {
1074
+ const {
1075
+ // Sets the DEFAULT timeout for all messages on the websocket
1076
+ // This can be overridden by different channels (although we tend not to)
1077
+ messageTimeout = DEFAULT_MESSAGE_TIMEOUT_SECONDS,
1078
+ claimTimeout = DEFAULT_CLAIM_TIMEOUT_SECONDS,
1079
+ SocketConstructor = PhxSocket
1080
+ } = options;
1056
1081
  const events = new EventEmitter();
1057
1082
  Sentry5.addBreadcrumb({
1058
1083
  category: "lifecycle",
@@ -1073,7 +1098,7 @@ var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger, So
1073
1098
  const socket = new SocketConstructor(endpoint, {
1074
1099
  params,
1075
1100
  transport: WebSocket,
1076
- timeout: timeout * 1e3,
1101
+ timeout: messageTimeout * 1e3,
1077
1102
  reconnectAfterMs: (tries) => Math.max(tries * 1e3)
1078
1103
  });
1079
1104
  let didOpen = false;
@@ -1091,7 +1116,7 @@ var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger, So
1091
1116
  events.emit("message", ev, load);
1092
1117
  return load;
1093
1118
  };
1094
- channel.join().receive("ok", () => {
1119
+ channel.join(claimTimeout * 1e3).receive("ok", () => {
1095
1120
  logger.debug("Connected to worker queue socket");
1096
1121
  events.emit("connect", { socket, channel });
1097
1122
  }).receive("error", (e) => {
@@ -1178,13 +1203,12 @@ function connect(app, logger, options = {}) {
1178
1203
  });
1179
1204
  }
1180
1205
  };
1181
- worker_queue_default(
1182
- options.lightning,
1183
- app.id,
1184
- options.secret,
1185
- options.socketTimeoutSeconds,
1186
- logger
1187
- ).on("connect", onConnect).on("disconnect", onDisconnect).on("error", onError).on("message", onMessage);
1206
+ worker_queue_default(options.lightning, app.id, options.secret, logger, {
1207
+ // TODO: options.socketTimeoutSeconds wins because this is what USED to be used
1208
+ // But it's deprecated and should be removed soon
1209
+ messageTimeout: options.socketTimeoutSeconds ?? options.messageTimeoutSeconds,
1210
+ claimTimeout: options.claimTimeoutSeconds
1211
+ }).on("connect", onConnect).on("disconnect", onDisconnect).on("error", onError).on("message", onMessage);
1188
1212
  }
1189
1213
  async function setupCollections(options, logger) {
1190
1214
  if (options.collectionsUrl) {
@@ -1232,6 +1256,7 @@ function createServer(engine, options = {}) {
1232
1256
  logger.debug(str);
1233
1257
  })
1234
1258
  );
1259
+ app.openClaims = {};
1235
1260
  app.workflows = {};
1236
1261
  app.destroyed = false;
1237
1262
  app.server = app.listen(port);
@@ -1271,7 +1296,6 @@ function createServer(engine, options = {}) {
1271
1296
  collectionsVersion: app.options.collectionsVersion,
1272
1297
  monorepoPath: app.options.monorepoDir
1273
1298
  });
1274
- logger.debug("converted run body into execution plan:", plan);
1275
1299
  if (plan.workflow.credentials?.collections_token) {
1276
1300
  plan.workflow.credentials.collections_token = token;
1277
1301
  }
package/dist/start.js CHANGED
@@ -280,14 +280,25 @@ var ClaimError = class extends Error {
280
280
  this.abort = true;
281
281
  }
282
282
  };
283
+ var claimIdGen = 0;
283
284
  var claim = (app, logger2 = mockLogger, options = {}) => {
284
285
  return new Promise((resolve5, reject) => {
285
- const { maxWorkers = 5 } = options;
286
+ app.openClaims ??= {};
287
+ const { maxWorkers = 5, demand = 1 } = options;
286
288
  const podName = NAME ? `[${NAME}] ` : "";
287
289
  const activeWorkers = Object.keys(app.workflows).length;
290
+ const pendingClaims = Object.values(app.openClaims).reduce(
291
+ (a, b) => a + b,
292
+ 0
293
+ );
288
294
  if (activeWorkers >= maxWorkers) {
289
295
  app.workloop?.stop(`server at capacity (${activeWorkers}/${maxWorkers})`);
290
296
  return reject(new ClaimError("Server at capacity"));
297
+ } else if (activeWorkers + pendingClaims >= maxWorkers) {
298
+ app.workloop?.stop(
299
+ `server at capacity (${activeWorkers}/${maxWorkers}, ${pendingClaims} pending)`
300
+ );
301
+ return reject(new ClaimError("Server at capacity"));
291
302
  }
292
303
  if (!app.queueChannel) {
293
304
  logger2.warn("skipping claim attempt: websocket unavailable");
@@ -299,12 +310,15 @@ var claim = (app, logger2 = mockLogger, options = {}) => {
299
310
  logger2.warn("skipping claim attempt: channel closed");
300
311
  return reject(e);
301
312
  }
313
+ const claimId = ++claimIdGen;
314
+ app.openClaims[claimId] = demand;
302
315
  logger2.debug(`requesting run (capacity ${activeWorkers}/${maxWorkers})`);
303
316
  const start = Date.now();
304
317
  app.queueChannel.push(CLAIM, {
305
- demand: 1,
318
+ demand,
306
319
  worker_name: NAME || null
307
320
  }).receive("ok", ({ runs }) => {
321
+ delete app.openClaims[claimId];
308
322
  const duration = Date.now() - start;
309
323
  logger2.debug(
310
324
  `${podName}claimed ${runs.length} runs in ${duration}ms (${runs.length ? runs.map((r) => r.id).join(",") : "-"})`
@@ -332,8 +346,11 @@ var claim = (app, logger2 = mockLogger, options = {}) => {
332
346
  resolve5();
333
347
  });
334
348
  }).receive("error", (e) => {
349
+ delete app.openClaims[claimId];
335
350
  logger2.error("Error on claim", e);
351
+ reject(new Error("claim error"));
336
352
  }).receive("timeout", () => {
353
+ delete app.openClaims[claimId];
337
354
  logger2.error("TIMEOUT on claim. Runs may be lost.");
338
355
  reject(new Error("timeout"));
339
356
  });
@@ -1155,9 +1172,8 @@ var joinRunChannel = (socket, token, runId, logger2, timeout = 30) => {
1155
1172
  channel.onClose(() => {
1156
1173
  logger2.debug(`Leaving ${channelName}`);
1157
1174
  });
1158
- channel.onError((e) => {
1159
- logger2.debug(`Error in ${channelName}`);
1160
- logger2.debug(e);
1175
+ channel.onError((...args2) => {
1176
+ logger2.debug(`Critical error in channel ${channelName}`, args2);
1161
1177
  });
1162
1178
  });
1163
1179
  };
@@ -1192,7 +1208,16 @@ var generateWorkerToken = async (secret, workerId, logger2) => {
1192
1208
  var worker_token_default = generateWorkerToken;
1193
1209
 
1194
1210
  // src/channels/worker-queue.ts
1195
- var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger2, SocketConstructor = PhxSocket) => {
1211
+ var DEFAULT_MESSAGE_TIMEOUT_SECONDS = 30;
1212
+ var DEFAULT_CLAIM_TIMEOUT_SECONDS = 60 * 60;
1213
+ var connectToWorkerQueue = (endpoint, serverId, secret, logger2, options) => {
1214
+ const {
1215
+ // Sets the DEFAULT timeout for all messages on the websocket
1216
+ // This can be overridden by different channels (although we tend not to)
1217
+ messageTimeout = DEFAULT_MESSAGE_TIMEOUT_SECONDS,
1218
+ claimTimeout = DEFAULT_CLAIM_TIMEOUT_SECONDS,
1219
+ SocketConstructor = PhxSocket
1220
+ } = options;
1196
1221
  const events = new EventEmitter2();
1197
1222
  Sentry5.addBreadcrumb({
1198
1223
  category: "lifecycle",
@@ -1213,7 +1238,7 @@ var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger2, S
1213
1238
  const socket = new SocketConstructor(endpoint, {
1214
1239
  params,
1215
1240
  transport: WebSocket,
1216
- timeout: timeout * 1e3,
1241
+ timeout: messageTimeout * 1e3,
1217
1242
  reconnectAfterMs: (tries) => Math.max(tries * 1e3)
1218
1243
  });
1219
1244
  let didOpen = false;
@@ -1231,7 +1256,7 @@ var connectToWorkerQueue = (endpoint, serverId, secret, timeout = 10, logger2, S
1231
1256
  events.emit("message", ev, load);
1232
1257
  return load;
1233
1258
  };
1234
- channel.join().receive("ok", () => {
1259
+ channel.join(claimTimeout * 1e3).receive("ok", () => {
1235
1260
  logger2.debug("Connected to worker queue socket");
1236
1261
  events.emit("connect", { socket, channel });
1237
1262
  }).receive("error", (e) => {
@@ -1318,13 +1343,12 @@ function connect(app, logger2, options = {}) {
1318
1343
  });
1319
1344
  }
1320
1345
  };
1321
- worker_queue_default(
1322
- options.lightning,
1323
- app.id,
1324
- options.secret,
1325
- options.socketTimeoutSeconds,
1326
- logger2
1327
- ).on("connect", onConnect).on("disconnect", onDisconnect).on("error", onError).on("message", onMessage);
1346
+ worker_queue_default(options.lightning, app.id, options.secret, logger2, {
1347
+ // TODO: options.socketTimeoutSeconds wins because this is what USED to be used
1348
+ // But it's deprecated and should be removed soon
1349
+ messageTimeout: options.socketTimeoutSeconds ?? options.messageTimeoutSeconds,
1350
+ claimTimeout: options.claimTimeoutSeconds
1351
+ }).on("connect", onConnect).on("disconnect", onDisconnect).on("error", onError).on("message", onMessage);
1328
1352
  }
1329
1353
  async function setupCollections(options, logger2) {
1330
1354
  if (options.collectionsUrl) {
@@ -1372,6 +1396,7 @@ function createServer(engine, options = {}) {
1372
1396
  logger2.debug(str);
1373
1397
  })
1374
1398
  );
1399
+ app.openClaims = {};
1375
1400
  app.workflows = {};
1376
1401
  app.destroyed = false;
1377
1402
  app.server = app.listen(port);
@@ -1411,7 +1436,6 @@ function createServer(engine, options = {}) {
1411
1436
  collectionsVersion: app.options.collectionsVersion,
1412
1437
  monorepoPath: app.options.monorepoDir
1413
1438
  });
1414
- logger2.debug("converted run body into execution plan:", plan);
1415
1439
  if (plan.workflow.credentials?.collections_token) {
1416
1440
  plan.workflow.credentials.collections_token = token;
1417
1441
  }
@@ -6352,8 +6376,6 @@ var yargs_default = Yargs;
6352
6376
  // src/util/cli.ts
6353
6377
  var DEFAULT_PORT2 = 2222;
6354
6378
  var DEFAULT_WORKER_CAPACITY = 5;
6355
- var DEFAULT_SOCKET_TIMEOUT_SECONDS = 10;
6356
- var DEFAULT_MESSAGE_TIMEOUT_SECONDS = 30;
6357
6379
  function setArg(argValue, envValue, defaultValue) {
6358
6380
  if (Array.isArray(defaultValue) && !argValue && typeof envValue === "string") {
6359
6381
  return envValue.split(",");
@@ -6368,6 +6390,7 @@ function parseArgs(argv) {
6368
6390
  OPENFN_ADAPTORS_REPO,
6369
6391
  WORKER_BACKOFF,
6370
6392
  WORKER_CAPACITY,
6393
+ WORKER_CLAIM_TIMEOUT_SECONDS,
6371
6394
  WORKER_COLLECTIONS_URL,
6372
6395
  WORKER_COLLECTIONS_VERSION,
6373
6396
  WORKER_LIGHTNING_PUBLIC_KEY,
@@ -6410,9 +6433,12 @@ function parseArgs(argv) {
6410
6433
  }).option("sentry-env", {
6411
6434
  description: "Sentry environment. Defaults to 'dev'. Env: WORKER_SENTRY_ENV"
6412
6435
  }).option("socket-timeout", {
6413
- description: `Timeout for websockets to Lightning, in seconds. Defaults to 10.Env: WORKER_SOCKET_TIMEOUT_SECONDS`
6436
+ description: `[deprecated] Timeout for websockets to Lightning, in seconds. Defaults to 10.Env: WORKER_SOCKET_TIMEOUT_SECONDS`,
6437
+ hidden: true
6414
6438
  }).option("message-timeout", {
6415
- description: `Timeout for messages in the run channel in seconds. Defaults to 1. Env: WORKER_MESSAGE_TIMEOUT_SECONDS`
6439
+ description: `Timeout for all messages send to lightning via websocket. Defaults to 30. Env: WORKER_MESSAGE_TIMEOUT_SECONDS`
6440
+ }).option("claim-timeout", {
6441
+ description: `Timeout for claim requests for new Runs. This should be set to a high value or else runs may be lost. Defaults to 3600 (1 hour). Env: WORKER_CLAIM_TIMEOUT_SECONDS`
6416
6442
  }).option("lightning-public-key", {
6417
6443
  description: "Base64-encoded public key. Used to verify run tokens. Env: WORKER_LIGHTNING_PUBLIC_KEY"
6418
6444
  }).option("log", {
@@ -6483,10 +6509,14 @@ function parseArgs(argv) {
6483
6509
  WORKER_MAX_RUN_DURATION_SECONDS,
6484
6510
  300
6485
6511
  ),
6512
+ claimTimeoutSeconds: setArg(
6513
+ args2.claimTimeoutSeconds,
6514
+ WORKER_CLAIM_TIMEOUT_SECONDS,
6515
+ DEFAULT_CLAIM_TIMEOUT_SECONDS
6516
+ ),
6486
6517
  socketTimeoutSeconds: setArg(
6487
6518
  args2.socketTimeoutSeconds,
6488
- WORKER_SOCKET_TIMEOUT_SECONDS,
6489
- DEFAULT_SOCKET_TIMEOUT_SECONDS
6519
+ WORKER_SOCKET_TIMEOUT_SECONDS
6490
6520
  ),
6491
6521
  messageTimeoutSeconds: setArg(
6492
6522
  args2.messageTimeoutSeconds,
@@ -6535,8 +6565,15 @@ function engineReady(engine) {
6535
6565
  collectionsUrl: args.collectionsUrl,
6536
6566
  monorepoDir: args.monorepoDir,
6537
6567
  messageTimeoutSeconds: args.messageTimeoutSeconds,
6568
+ claimTimeoutSeconds: args.claimTimeoutSeconds,
6569
+ // deprecated!
6538
6570
  socketTimeoutSeconds: args.socketTimeoutSeconds
6539
6571
  };
6572
+ if ("socketTimeoutSeconds" in args) {
6573
+ logger.warn(
6574
+ "WARNING: deprecated socketTimeoutSeconds value passed.\n\nThis will be respected as the default socket timeout value, but will be removed from future versions of the worker."
6575
+ );
6576
+ }
6540
6577
  if (args.lightningPublicKey) {
6541
6578
  logger.info(
6542
6579
  "Lightning public key found: run tokens from Lightning will be verified by this worker"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openfn/ws-worker",
3
- "version": "1.14.2",
3
+ "version": "1.14.4",
4
4
  "description": "A Websocket Worker to connect Lightning to a Runtime Engine",
5
5
  "main": "dist/index.js",
6
6
  "type": "module",
@@ -24,8 +24,8 @@
24
24
  "phoenix": "1.7.10",
25
25
  "ws": "^8.18.0",
26
26
  "@openfn/engine-multi": "1.6.8",
27
- "@openfn/lexicon": "^1.2.2",
28
27
  "@openfn/logger": "1.0.5",
28
+ "@openfn/lexicon": "^1.2.2",
29
29
  "@openfn/runtime": "1.7.1"
30
30
  },
31
31
  "devDependencies": {
@@ -43,7 +43,7 @@
43
43
  "tsup": "^6.2.3",
44
44
  "typescript": "^4.6.4",
45
45
  "yargs": "^17.6.2",
46
- "@openfn/lightning-mock": "2.2.2"
46
+ "@openfn/lightning-mock": "2.2.3"
47
47
  },
48
48
  "files": [
49
49
  "dist",