trigger.dev 3.0.0-beta.45 → 3.0.0-beta.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,9 +70,276 @@ var SimpleLogger = class {
 var EXIT_CODE_ALREADY_HANDLED = 111;
 var EXIT_CODE_CHILD_NONZERO = 112;
 
-// src/workers/prod/entry-point.ts
-import { readFile } from "node:fs/promises";
-import { createServer } from "node:http";
+// ../core-apps/src/backoff.ts
+import { setTimeout as timeout } from "node:timers/promises";
+var StopRetrying = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "StopRetrying";
+  }
+};
+var AttemptTimeout = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "AttemptTimeout";
+  }
+};
+var RetryLimitExceeded = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "RetryLimitExceeded";
+  }
+};
+var ExponentialBackoff = class _ExponentialBackoff {
+  #retries = 0;
+  #type;
+  #base;
+  #factor;
+  #min;
+  #max;
+  #maxRetries;
+  #maxElapsed;
+  constructor(type, opts = {}) {
+    this.#type = type ?? "NoJitter";
+    this.#base = opts.base ?? 2;
+    this.#factor = opts.factor ?? 1;
+    this.#min = opts.min ?? -Infinity;
+    this.#max = opts.max ?? Infinity;
+    this.#maxRetries = opts.maxRetries ?? Infinity;
+    this.#maxElapsed = opts.maxElapsed ?? Infinity;
+  }
+  #clone(type, opts = {}) {
+    return new _ExponentialBackoff(type ?? this.#type, {
+      base: opts.base ?? this.#base,
+      factor: opts.factor ?? this.#factor,
+      min: opts.min ?? this.#min,
+      max: opts.max ?? this.#max,
+      maxRetries: opts.maxRetries ?? this.#maxRetries,
+      maxElapsed: opts.maxElapsed ?? this.#maxElapsed
+    });
+  }
+  type(type) {
+    return this.#clone(type);
+  }
+  base(base) {
+    return this.#clone(void 0, { base });
+  }
+  factor(factor) {
+    return this.#clone(void 0, { factor });
+  }
+  min(min) {
+    return this.#clone(void 0, { min });
+  }
+  max(max) {
+    return this.#clone(void 0, { max });
+  }
+  maxRetries(maxRetries) {
+    return this.#clone(void 0, { maxRetries });
+  }
+  // TODO: With .execute(), should this also include the time it takes to execute the callback?
+  maxElapsed(maxElapsed) {
+    return this.#clone(void 0, { maxElapsed });
+  }
+  retries(retries) {
+    if (typeof retries !== "undefined") {
+      if (retries > this.#maxRetries) {
+        console.error(
+          `Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+        );
+        this.#retries = this.#maxRetries;
+      } else {
+        this.#retries = retries;
+      }
+    }
+    return this.#clone();
+  }
+  async *retryAsync(maxRetries = this.#maxRetries ?? Infinity) {
+    let elapsed = 0;
+    let retry = 0;
+    while (retry <= maxRetries) {
+      const delay = this.delay(retry);
+      elapsed += delay;
+      if (elapsed > this.#maxElapsed) {
+        break;
+      }
+      yield {
+        delay: {
+          seconds: delay,
+          milliseconds: delay * 1e3
+        },
+        retry
+      };
+      retry++;
+    }
+  }
+  async *[Symbol.asyncIterator]() {
+    yield* this.retryAsync();
+  }
+  /** Returns the delay for the current retry in seconds. */
+  delay(retries = this.#retries, jitter = true) {
+    if (retries > this.#maxRetries) {
+      console.error(
+        `Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+      );
+      retries = this.#maxRetries;
+    }
+    let delay = this.#factor * this.#base ** retries;
+    switch (this.#type) {
+      case "NoJitter": {
+        break;
+      }
+      case "FullJitter": {
+        if (!jitter) {
+          delay = 0;
+          break;
+        }
+        delay *= Math.random();
+        break;
+      }
+      case "EqualJitter": {
+        if (!jitter) {
+          delay *= 0.5;
+          break;
+        }
+        delay *= 0.5 * (1 + Math.random());
+        break;
+      }
+      default: {
+        throw new Error(`Unknown backoff type: ${this.#type}`);
+      }
+    }
+    if (delay < this.#min) {
+      delay = this.#min + Math.random() * (this.#min * 0.2);
+    }
+    if (delay > this.#max) {
+      delay = this.#max - Math.random() * (this.#max * 0.2);
+    }
+    delay = Math.round(delay);
+    return delay;
+  }
+  /** Waits with the appropriate delay for the current retry. */
+  async wait(retries = this.#retries, jitter = true) {
+    if (retries > this.#maxRetries) {
+      console.error(`Retry limit exceeded: ${retries} > ${this.#maxRetries}`);
+      throw new RetryLimitExceeded();
+    }
+    const delay = this.delay(retries, jitter);
+    return await timeout(delay * 1e3);
+  }
+  elapsed(retries = this.#retries, jitter = true) {
+    let elapsed = 0;
+    for (let i = 0; i <= retries; i++) {
+      elapsed += this.delay(i, jitter);
+    }
+    const total = elapsed;
+    let days = 0;
+    if (elapsed > 3600 * 24) {
+      days = Math.floor(elapsed / 3600 / 24);
+      elapsed -= days * 3600 * 24;
+    }
+    let hours = 0;
+    if (elapsed > 3600) {
+      hours = Math.floor(elapsed / 3600);
+      elapsed -= hours * 3600;
+    }
+    let minutes = 0;
+    if (elapsed > 60) {
+      minutes = Math.floor(elapsed / 60);
+      elapsed -= minutes * 60;
+    }
+    const seconds = elapsed;
+    return {
+      seconds,
+      minutes,
+      hours,
+      days,
+      total
+    };
+  }
+  reset() {
+    this.#retries = 0;
+    return this;
+  }
+  next() {
+    this.#retries++;
+    return this.delay();
+  }
+  stop() {
+    throw new StopRetrying();
+  }
+  get state() {
+    return {
+      retries: this.#retries,
+      type: this.#type,
+      base: this.#base,
+      factor: this.#factor,
+      min: this.#min,
+      max: this.#max,
+      maxRetries: this.#maxRetries,
+      maxElapsed: this.#maxElapsed
+    };
+  }
+  async execute(callback, { attemptTimeoutMs = 0 } = {}) {
+    let elapsedMs = 0;
+    let finalError = void 0;
+    for await (const { delay, retry } of this) {
+      const start = Date.now();
+      if (retry > 0) {
+        console.log(`Retrying in ${delay.milliseconds}ms`);
+        await timeout(delay.milliseconds);
+      }
+      let attemptTimeout = void 0;
+      try {
+        const result = await new Promise(async (resolve, reject) => {
+          if (attemptTimeoutMs > 0) {
+            attemptTimeout = setTimeout(() => {
+              reject(new AttemptTimeout());
+            }, attemptTimeoutMs);
+          }
+          try {
+            const callbackResult = await callback({ delay, retry, elapsedMs });
+            resolve(callbackResult);
+          } catch (error) {
+            reject(error);
+          }
+        });
+        return {
+          success: true,
+          result
+        };
+      } catch (error) {
+        finalError = error;
+        if (error instanceof StopRetrying) {
+          return {
+            success: false,
+            cause: "StopRetrying",
+            error: error.message
+          };
+        }
+        if (error instanceof AttemptTimeout) {
+          continue;
+        }
+      } finally {
+        elapsedMs += Date.now() - start;
+        clearTimeout(attemptTimeout);
+      }
+    }
+    if (finalError instanceof AttemptTimeout) {
+      return {
+        success: false,
+        cause: "Timeout"
+      };
+    } else {
+      return {
+        success: false,
+        cause: "MaxRetries",
+        error: finalError
+      };
+    }
+  }
+  static RetryLimitExceeded = RetryLimitExceeded;
+  static StopRetrying = StopRetrying;
+};
 
 // src/workers/prod/backgroundWorker.ts
 import {
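
For orientation, here is a minimal usage sketch of the `ExponentialBackoff` helper added above (not part of the diff; `fetchSomething` is a hypothetical callback). `execute()` resolves to `{ success: true, result }` or `{ success: false, cause, error }`, and throwing `StopRetrying` from the callback aborts the loop:

```js
const backoff = new ExponentialBackoff("FullJitter", { maxRetries: 5 });

const outcome = await backoff.execute(
  async ({ retry, elapsedMs }) => {
    // Anything thrown here is retried with backoff, except StopRetrying.
    return await fetchSomething(retry); // hypothetical callback
  },
  { attemptTimeoutMs: 10_000 } // per-attempt timeout, surfaced as AttemptTimeout
);

if (!outcome.success) {
  console.error(`gave up (${outcome.cause})`, outcome.error);
}
```
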
@@ -175,13 +442,9 @@ var ProdBackgroundWorker = class {
    */
   onTaskHeartbeat = new Evt();
   onTaskRunHeartbeat = new Evt();
-  onWaitForBatch = new Evt();
   onWaitForDuration = new Evt();
   onWaitForTask = new Evt();
-  preCheckpointNotification = Evt.create();
-  checkpointCanceledNotification = Evt.create();
-  onReadyForCheckpoint = Evt.create();
-  onCancelCheckpoint = Evt.create();
+  onWaitForBatch = new Evt();
   onCreateTaskRunAttempt = Evt.create();
   attemptCreatedNotification = Evt.create();
   _onClose = new Evt();
@@ -219,7 +482,10 @@ var ProdBackgroundWorker = class {
     this._closed = true;
   }
   async flushTelemetry() {
+    console.log("Flushing telemetry");
+    const start = performance.now();
     await this._taskRunProcess?.cleanup(false);
+    console.log("Flushed telemetry", { duration: performance.now() - start });
   }
   async initialize(options) {
     if (this._initialized) {
@@ -242,7 +508,7 @@ var ProdBackgroundWorker = class {
         ...options?.env
       }
     });
-    const timeout = setTimeout(() => {
+    const timeout3 = setTimeout(() => {
      if (resolved) {
        return;
      }
@@ -259,7 +525,7 @@ var ProdBackgroundWorker = class {
    });
    child.on("exit", (code) => {
      if (!resolved) {
-        clearTimeout(timeout);
+        clearTimeout(timeout3);
        resolved = true;
        reject(new Error(`Worker exited with code ${code}`));
      }
@@ -271,7 +537,7 @@ var ProdBackgroundWorker = class {
      handlers: {
        TASKS_READY: async (message) => {
          if (!resolved) {
-            clearTimeout(timeout);
+            clearTimeout(timeout3);
            resolved = true;
            resolve(message.tasks);
            child.kill();
@@ -279,7 +545,7 @@ var ProdBackgroundWorker = class {
        },
        UNCAUGHT_EXCEPTION: async (message) => {
          if (!resolved) {
-            clearTimeout(timeout);
+            clearTimeout(timeout3);
            resolved = true;
            reject(new UncaughtExceptionError(message.error, message.origin));
            child.kill();
@@ -287,7 +553,7 @@ var ProdBackgroundWorker = class {
        },
        TASKS_FAILED_TO_PARSE: async (message) => {
          if (!resolved) {
-            clearTimeout(timeout);
+            clearTimeout(timeout3);
            resolved = true;
            reject(new TaskMetadataParseError(message.zodIssues, message.tasks));
            child.kill();
@@ -364,18 +630,6 @@ var ProdBackgroundWorker = class {
    taskRunProcess.onWaitForTask.attach((message) => {
      this.onWaitForTask.post(message);
    });
-    taskRunProcess.onReadyForCheckpoint.attach((message) => {
-      this.onReadyForCheckpoint.post(message);
-    });
-    taskRunProcess.onCancelCheckpoint.attach((message) => {
-      this.onCancelCheckpoint.post(message);
-    });
-    this.preCheckpointNotification.attach((message) => {
-      taskRunProcess.preCheckpointNotification.post(message);
-    });
-    this.checkpointCanceledNotification.attach((message) => {
-      taskRunProcess.checkpointCanceledNotification.post(message);
-    });
    await taskRunProcess.initialize();
    this._taskRunProcess = taskRunProcess;
    return this._taskRunProcess;
@@ -416,6 +670,7 @@ var ProdBackgroundWorker = class {
    }
  }
  async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
+    console.log("Trying graceful exit", { kill, initialSignal });
    try {
      const initialExit = taskRunProcess.onExit.waitFor(5e3);
      if (kill) {
@@ -428,6 +683,7 @@ var ProdBackgroundWorker = class {
    }
  }
  async #tryForcefulExit(taskRunProcess) {
+    console.log("Trying forceful exit");
    try {
      const forcedKill = taskRunProcess.onExit.waitFor(5e3);
      taskRunProcess.kill("SIGKILL");
@@ -541,16 +797,19 @@ var ProdBackgroundWorker = class {
    this.onCreateTaskRunAttempt.post({ runId: payload.runId });
    let execution;
    try {
-      const attemptCreated = await this.attemptCreatedNotification.waitFor(3e4);
+      const start = performance.now();
+      const attemptCreated = await this.attemptCreatedNotification.waitFor(12e4);
      if (!attemptCreated.success) {
-        throw new Error(
-          `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
-        );
+        throw new Error(`${attemptCreated.reason ?? "Unknown error"}`);
      }
+      console.log("Attempt created", {
+        number: attemptCreated.execution.attempt.number,
+        duration: performance.now() - start
+      });
      execution = attemptCreated.execution;
    } catch (error) {
      console.error("Error while creating attempt", error);
-      throw new Error(`Failed to create task run attempt: ${error}`);
+      throw new Error(`Failed to create attempt: ${error}`);
    }
    const completion = await this.executeTaskRun(
      {
@@ -596,13 +855,10 @@ var TaskRunProcess = class {
  onTaskRunHeartbeat = new Evt();
  onExit = new Evt();
  onIsBeingKilled = new Evt();
-  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
+  onWaitForBatch = new Evt();
  preCheckpointNotification = Evt.create();
-  checkpointCanceledNotification = Evt.create();
-  onReadyForCheckpoint = Evt.create();
-  onCancelCheckpoint = Evt.create();
  async initialize() {
    this._child = fork(this.path, {
      stdio: [
@@ -650,6 +906,10 @@ var TaskRunProcess = class {
        if (this.messageId) {
          this.onTaskRunHeartbeat.post(this.messageId);
        } else {
+          console.error(
+            "No message id for task heartbeat, falling back to (deprecated) attempt heartbeat",
+            { id: message.id }
+          );
          this.onTaskHeartbeat.post(message.id);
        }
      },
@@ -663,41 +923,6 @@ var TaskRunProcess = class {
      },
      WAIT_FOR_DURATION: async (message) => {
        this.onWaitForDuration.post(message);
-        try {
-          const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor(
-            3e4
-          );
-          return {
-            willCheckpointAndRestore
-          };
-        } catch (error) {
-          console.error("Error while waiting for pre-checkpoint notification", error);
-          return {
-            willCheckpointAndRestore: false
-          };
-        }
-      },
-      READY_FOR_CHECKPOINT: async (message) => {
-        this.onReadyForCheckpoint.post(message);
-      },
-      CANCEL_CHECKPOINT: async (message) => {
-        const version = "v2";
-        this.onCancelCheckpoint.post(message);
-        try {
-          const { checkpointCanceled } = await this.checkpointCanceledNotification.waitFor(
-            3e4
-          );
-          return {
-            version,
-            checkpointCanceled
-          };
-        } catch (error) {
-          console.error("Error while waiting for checkpoint cancellation", error);
-          return {
-            version,
-            checkpointCanceled: true
-          };
-        }
      }
    }
  });
@@ -727,14 +952,21 @@ var TaskRunProcess = class {
      childPid: this._childPid,
      realChildPid: this._child?.pid
    });
-    await this._ipc?.sendWithAck(
-      "CLEANUP",
-      {
-        flush: true,
-        kill: killParentProcess
-      },
-      3e4
-    );
+    try {
+      await this._ipc?.sendWithAck(
+        "CLEANUP",
+        {
+          flush: true,
+          kill: killParentProcess
+        },
+        3e4
+      );
+    } catch (error) {
+      console.error("Error while cleaning up task run process", error);
+      if (killParentProcess) {
+        process.exit(0);
+      }
+    }
    if (killChildProcess) {
      this._gracefulExitTimeoutElapsed = true;
      await this.kill("SIGKILL");
@@ -764,19 +996,30 @@ var TaskRunProcess = class {
  }
  taskRunCompletedNotification(completion) {
    if (!completion.ok && typeof completion.retry !== "undefined") {
+      console.error(
+        "Task run completed with error and wants to retry, won't send task run completed notification"
+      );
      return;
    }
-    if (this._child?.connected && !this._isBeingKilled && !this._child.killed) {
-      this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
-        version: "v2",
-        completion
-      });
+    if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+      console.error(
+        "Child process not connected or being killed, can't send task run completed notification"
+      );
+      return;
    }
+    this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
+      version: "v2",
+      completion
+    });
  }
  waitCompletedNotification() {
-    if (this._child?.connected && !this._isBeingKilled && !this._child.killed) {
-      this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
+    if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+      console.error(
+        "Child process not connected or being killed, can't send wait completed notification"
+      );
+      return;
    }
+    this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
  }
  async #handleExit(code, signal) {
    console.log("handling child exit", { code, signal });
@@ -837,7 +1080,11 @@ var TaskRunProcess = class {
 };
 
 // src/workers/prod/entry-point.ts
-import { setTimeout as setTimeout2 } from "node:timers/promises";
+import { checkpointSafeTimeout, unboundedTimeout } from "@trigger.dev/core/v3/utils/timers";
+import { randomUUID } from "node:crypto";
+import { readFile } from "node:fs/promises";
+import { createServer } from "node:http";
+import { setTimeout as timeout2 } from "node:timers/promises";
 var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
 var COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
 var COORDINATOR_PORT = Number(process.env.COORDINATOR_PORT || 50080);
@@ -845,6 +1092,9 @@ var MACHINE_NAME = process.env.MACHINE_NAME || "local";
 var POD_NAME = process.env.POD_NAME || "some-pod";
 var SHORT_HASH = process.env.TRIGGER_CONTENT_HASH.slice(0, 9);
 var logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
+var defaultBackoff = new ExponentialBackoff("FullJitter", {
+  maxRetries: 5
+});
 var ProdWorker = class {
   constructor(port, host = "0.0.0.0") {
     this.host = host;
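
As a worked example of the schedule this `defaultBackoff` produces (a sketch, not part of the diff): with the class defaults `base: 2`, `factor: 1` and type `FullJitter`, `delay(n)` is uniform in `[0, 2 ** n]` seconds, so retries 1 through 5 wait at most 2, 4, 8, 16 and 32 seconds, and the first attempt runs immediately:

```js
const b = new ExponentialBackoff("FullJitter", { maxRetries: 5 });
for (let n = 0; n <= 5; n++) {
  // delay() returns whole seconds; FullJitter scales 2 ** n by Math.random()
  console.log(`retry ${n}: ${b.delay(n)}s (max ${2 ** n}s)`);
}
```
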
@@ -869,6 +1119,12 @@ var ProdWorker = class {
  attemptFriendlyId;
  nextResumeAfter;
  waitForPostStart = false;
+  connectionCount = 0;
+  waitForTaskReplay;
+  waitForBatchReplay;
+  readyForLazyAttemptReplay;
+  submitAttemptCompletionReplay;
+  durationResumeFallback;
  #httpPort;
  #backgroundWorker;
  #httpServer;
@@ -882,7 +1138,7 @@ var ProdWorker = class {
      logger.log("Waiting for attempt to complete before exiting", {
        terminationGracePeriodSeconds
      });
-      await setTimeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
+      await timeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
      gracefulExitTimeoutElapsed = true;
      logger.log("Termination timeout reached, exiting gracefully.");
    } else {
@@ -899,14 +1155,10 @@ var ProdWorker = class {
      process.exit(exitCode);
    }
  }
-  async #reconnect(isPostStart = false, reconnectImmediately = false) {
-    if (isPostStart) {
-      this.waitForPostStart = false;
-    }
+  async #reconnectAfterPostStart() {
+    this.waitForPostStart = false;
    this.#coordinatorSocket.close();
-    if (!reconnectImmediately) {
-      await setTimeout2(1e3);
-    }
+    this.connectionCount = 0;
    let coordinatorHost = COORDINATOR_HOST;
    try {
      if (this.runningInKubernetes) {
@@ -930,6 +1182,98 @@ var ProdWorker = class {
      this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
    }
  }
+  // MARK: TASK WAIT
+  async #waitForTaskHandler(message, replayIdempotencyKey) {
+    const waitForTask = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Wait for task with backoff", { retry });
+      if (!this.attemptFriendlyId) {
+        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+        throw new ExponentialBackoff.StopRetrying("No attempt ID");
+      }
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_TASK", {
+        version: "v2",
+        friendlyId: message.friendlyId,
+        attemptFriendlyId: this.attemptFriendlyId
+      });
+    });
+    if (!waitForTask.success) {
+      logger.error("Failed to wait for task with backoff", {
+        cause: waitForTask.cause,
+        error: waitForTask.error
+      });
+      this.#emitUnrecoverableError(
+        "WaitForTaskFailed",
+        `${waitForTask.cause}: ${waitForTask.error}`
+      );
+      return;
+    }
+    const { willCheckpointAndRestore } = waitForTask.result;
+    await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
+    if (willCheckpointAndRestore) {
+      if (!this.waitForTaskReplay) {
+        this.waitForTaskReplay = {
+          message,
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+          logger.error(
+            "wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.waitForTaskReplay.attempt++;
+      }
+    }
+  }
+  // MARK: BATCH WAIT
+  async #waitForBatchHandler(message, replayIdempotencyKey) {
+    const waitForBatch = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Wait for batch with backoff", { retry });
+      if (!this.attemptFriendlyId) {
+        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+        throw new ExponentialBackoff.StopRetrying("No attempt ID");
+      }
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_BATCH", {
+        version: "v2",
+        batchFriendlyId: message.batchFriendlyId,
+        runFriendlyIds: message.runFriendlyIds,
+        attemptFriendlyId: this.attemptFriendlyId
+      });
+    });
+    if (!waitForBatch.success) {
+      logger.error("Failed to wait for batch with backoff", {
+        cause: waitForBatch.cause,
+        error: waitForBatch.error
+      });
+      this.#emitUnrecoverableError(
+        "WaitForBatchFailed",
+        `${waitForBatch.cause}: ${waitForBatch.error}`
+      );
+      return;
+    }
+    const { willCheckpointAndRestore } = waitForBatch.result;
+    await this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
+    if (willCheckpointAndRestore) {
+      if (!this.waitForBatchReplay) {
+        this.waitForBatchReplay = {
+          message,
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+          logger.error(
+            "wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.waitForBatchReplay.attempt++;
+      }
+    }
+  }
+  // MARK: WORKER CREATION
  #createBackgroundWorker() {
    const backgroundWorker = new ProdBackgroundWorker("worker.js", {
      projectConfig: __PROJECT_CONFIG__,
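
The two handlers above share a replay-idempotency pattern that recurs throughout this release: before a checkpoint, the worker records `{ message, attempt, idempotencyKey }`, and after a restore the replay is only honored if the stored key still matches. A distilled sketch of that guard (illustrative only; `slot` is a hypothetical stand-in for fields like `waitForTaskReplay`):

```js
import { randomUUID } from "node:crypto";

function recordReplay(slot, message, replayIdempotencyKey) {
  if (!slot.current) {
    // First checkpointing wait: claim the slot with a fresh key.
    slot.current = { message, attempt: 1, idempotencyKey: randomUUID() };
  } else if (replayIdempotencyKey && replayIdempotencyKey !== slot.current.idempotencyKey) {
    // A newer wait owns the slot; drop the stale replay instead of overwriting it.
    return undefined;
  } else {
    slot.current.attempt++;
  }
  return slot.current;
}
```
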
@@ -942,128 +1286,152 @@ var ProdWorker = class {
      contentHash: this.contentHash
    });
    backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
-      this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId });
+      logger.log("onTaskHeartbeat", { attemptFriendlyId });
+      this.#coordinatorSocket.socket.volatile.emit("TASK_HEARTBEAT", {
+        version: "v1",
+        attemptFriendlyId
+      });
    });
    backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
-      this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
-    });
-    backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
-      await this.#prepareForCheckpoint();
-      this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
-    });
-    backgroundWorker.onCancelCheckpoint.attach(async (message) => {
-      logger.log("onCancelCheckpoint", { message });
-      const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
-        "CANCEL_CHECKPOINT",
-        {
-          version: "v2",
-          reason: message.reason
-        }
-      );
-      logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
-      if (checkpointCanceled) {
-        if (message.reason === "WAIT_FOR_DURATION") {
-          this.paused = false;
-          this.nextResumeAfter = void 0;
-          this.waitForPostStart = false;
-        }
-      }
-      backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+      logger.log("onTaskRunHeartbeat", { runId });
+      this.#coordinatorSocket.socket.volatile.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
    });
    backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
      logger.log("onCreateTaskRunAttempt()", { message });
-      const createAttempt = await this.#coordinatorSocket.socket.emitWithAck(
-        "CREATE_TASK_RUN_ATTEMPT",
-        {
+      const createAttempt = await defaultBackoff.execute(async ({ retry }) => {
+        logger.log("Create task run attempt with backoff", { retry });
+        return await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CREATE_TASK_RUN_ATTEMPT", {
          version: "v1",
          runId: message.runId
-        }
-      );
+        });
+      });
      if (!createAttempt.success) {
        backgroundWorker.attemptCreatedNotification.post({
          success: false,
-          reason: createAttempt.reason
+          reason: `Failed to create attempt with backoff due to ${createAttempt.cause}. ${createAttempt.error}`
+        });
+        return;
+      }
+      if (!createAttempt.result.success) {
+        backgroundWorker.attemptCreatedNotification.post({
+          success: false,
+          reason: createAttempt.result.reason
        });
        return;
      }
      backgroundWorker.attemptCreatedNotification.post({
        success: true,
-        execution: createAttempt.executionPayload.execution
+        execution: createAttempt.result.executionPayload.execution
      });
    });
    backgroundWorker.attemptCreatedNotification.attach((message) => {
+      logger.log("attemptCreatedNotification", {
+        success: message.success,
+        ...message.success ? {
+          attempt: message.execution.attempt,
+          queue: message.execution.queue,
+          worker: message.execution.worker,
+          machine: message.execution.machine
+        } : {
+          reason: message.reason
+        }
+      });
      if (!message.success) {
        return;
      }
      this.attemptFriendlyId = message.execution.attempt.id;
    });
    backgroundWorker.onWaitForDuration.attach(async (message) => {
-      if (!this.attemptFriendlyId) {
-        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
-        this.#emitUnrecoverableError(
-          "NoAttemptId",
-          "Attempt ID not set before waiting for duration"
-        );
-        return;
-      }
-      const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-        "WAIT_FOR_DURATION",
-        {
-          ...message,
-          attemptFriendlyId: this.attemptFriendlyId
+      logger.log("onWaitForDuration", { ...message, drift: Date.now() - message.now });
+      noResume: {
+        const { ms, waitThresholdInMs } = message;
+        const internalTimeout = unboundedTimeout(ms, "internal");
+        const checkpointSafeInternalTimeout = checkpointSafeTimeout(ms);
+        if (ms < waitThresholdInMs) {
+          await internalTimeout;
+          break noResume;
        }
-      );
-      this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
-    });
-    backgroundWorker.onWaitForTask.attach(async (message) => {
-      if (!this.attemptFriendlyId) {
-        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
-        this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task");
-        return;
-      }
-      const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-        "WAIT_FOR_TASK",
-        {
-          ...message,
-          attemptFriendlyId: this.attemptFriendlyId
+        const waitForDuration = await defaultBackoff.execute(async ({ retry }) => {
+          logger.log("Wait for duration with backoff", { retry });
+          if (!this.attemptFriendlyId) {
+            logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+            throw new ExponentialBackoff.StopRetrying("No attempt ID");
+          }
+          return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_DURATION", {
+            ...message,
+            attemptFriendlyId: this.attemptFriendlyId
+          });
+        });
+        if (!waitForDuration.success) {
+          logger.error("Failed to wait for duration with backoff", {
+            cause: waitForDuration.cause,
+            error: waitForDuration.error
+          });
+          this.#emitUnrecoverableError(
+            "WaitForDurationFailed",
+            `${waitForDuration.cause}: ${waitForDuration.error}`
+          );
+          return;
+        }
+        const { willCheckpointAndRestore } = waitForDuration.result;
+        if (!willCheckpointAndRestore) {
+          await internalTimeout;
+          break noResume;
+        }
+        await this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
+        await Promise.race([internalTimeout, checkpointSafeInternalTimeout]);
+        try {
+          const { checkpointCanceled } = await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CANCEL_CHECKPOINT", {
+            version: "v2",
+            reason: "WAIT_FOR_DURATION"
+          });
+          logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
+          if (checkpointCanceled) {
+            break noResume;
+          }
+          logger.log("Waiting for external duration resume as we may have been restored");
+          const idempotencyKey = randomUUID();
+          this.durationResumeFallback = { idempotencyKey };
+          setTimeout(() => {
+            if (!this.durationResumeFallback) {
+              logger.error("Already resumed after duration, skipping fallback");
+              return;
+            }
+            if (this.durationResumeFallback.idempotencyKey !== idempotencyKey) {
+              logger.error("Duration resume idempotency key mismatch, skipping fallback");
+              return;
+            }
+            logger.log("Resuming after duration with fallback");
+            this.#resumeAfterDuration();
+          }, 15e3);
+        } catch (error) {
+          logger.debug("Checkpoint cancellation timed out", { error });
+          break noResume;
        }
-      );
-      this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
-    });
-    backgroundWorker.onWaitForBatch.attach(async (message) => {
-      if (!this.attemptFriendlyId) {
-        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
-        this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
        return;
      }
-      const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-        "WAIT_FOR_BATCH",
-        {
-          ...message,
-          attemptFriendlyId: this.attemptFriendlyId
-        }
-      );
-      this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
+      this.#resumeAfterDuration();
    });
+    backgroundWorker.onWaitForTask.attach(this.#waitForTaskHandler.bind(this));
+    backgroundWorker.onWaitForBatch.attach(this.#waitForBatchHandler.bind(this));
    return backgroundWorker;
  }
  async #prepareForWait(reason, willCheckpointAndRestore) {
    logger.log(`prepare for ${reason}`, { willCheckpointAndRestore });
-    this.#backgroundWorker.preCheckpointNotification.post({ willCheckpointAndRestore });
-    if (willCheckpointAndRestore) {
-      this.paused = true;
-      this.nextResumeAfter = reason;
-      this.waitForPostStart = true;
-      if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
-        await this.#prepareForCheckpoint();
-      }
+    if (!willCheckpointAndRestore) {
+      return;
    }
+    this.paused = true;
+    this.nextResumeAfter = reason;
+    this.waitForPostStart = true;
+    await this.#prepareForCheckpoint();
  }
+  // MARK: RETRY PREP
  async #prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode) {
-    logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit });
+    logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit, exitCode });
    if (shouldExit) {
      if (willCheckpointAndRestore) {
-        logger.log("WARNING: Will checkpoint but also requested exit. This won't end well.");
+        logger.error("WARNING: Will checkpoint but also requested exit. This won't end well.");
      }
      await this.#exitGracefully(false, exitCode);
      return;
@@ -1072,18 +1440,33 @@ var ProdWorker = class {
    this.waitForPostStart = false;
    this.executing = false;
    this.attemptFriendlyId = void 0;
-    if (willCheckpointAndRestore) {
-      this.waitForPostStart = true;
-      this.#prepareForCheckpoint(false);
-      this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
+    if (!willCheckpointAndRestore) {
      return;
    }
+    this.waitForPostStart = true;
+    await this.#prepareForCheckpoint(false);
  }
+  // MARK: CHECKPOINT PREP
  async #prepareForCheckpoint(flush = true) {
    if (flush) {
-      await this.#backgroundWorker.flushTelemetry();
+      try {
+        await this.#backgroundWorker.flushTelemetry();
+      } catch (error) {
+        logger.error(
+          "Failed to flush telemetry while preparing for checkpoint, will proceed anyway",
+          { error }
+        );
+      }
+    }
+    try {
+      await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+    } catch (error) {
+      logger.error(
+        "Failed to kill previous worker while preparing for checkpoint, will proceed anyway",
+        { error }
+      );
    }
-    await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+    this.#readyForCheckpoint();
  }
  #resumeAfterDuration() {
    this.paused = false;
@@ -1091,6 +1474,106 @@ var ProdWorker = class {
    this.waitForPostStart = false;
    this.#backgroundWorker.waitCompletedNotification();
  }
+  async #readyForLazyAttempt() {
+    const idempotencyKey = randomUUID();
+    this.readyForLazyAttemptReplay = {
+      idempotencyKey
+    };
+    for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(3)) {
+      if (retry > 0) {
+        logger.log("retrying ready for lazy attempt", { retry });
+      }
+      this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
+        version: "v1",
+        runId: this.runId,
+        totalCompletions: this.completed.size
+      });
+      await timeout2(delay.milliseconds);
+      if (!this.readyForLazyAttemptReplay) {
+        logger.error("replay ready for lazy attempt cancelled, discarding", {
+          idempotencyKey
+        });
+        return;
+      }
+      if (idempotencyKey !== this.readyForLazyAttemptReplay.idempotencyKey) {
+        logger.error("replay ready for lazy attempt idempotency key mismatch, discarding", {
+          idempotencyKey,
+          newIdempotencyKey: this.readyForLazyAttemptReplay.idempotencyKey
+        });
+        return;
+      }
+    }
+    this.#failRun(this.runId, "Failed to receive execute request in a reasonable time");
+  }
+  #readyForCheckpoint() {
+    this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
+  }
+  #failRun(anyRunId, error) {
+    logger.error("Failing run", { anyRunId, error });
+    const completion = {
+      ok: false,
+      id: anyRunId,
+      retry: void 0,
+      error: error instanceof Error ? {
+        type: "BUILT_IN_ERROR",
+        name: error.name,
+        message: error.message,
+        stackTrace: error.stack ?? ""
+      } : {
+        type: "BUILT_IN_ERROR",
+        name: "UnknownError",
+        message: String(error),
+        stackTrace: ""
+      }
+    };
+    this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
+      version: "v1",
+      completion
+    });
+  }
+  // MARK: ATTEMPT COMPLETION
+  async #submitAttemptCompletion(execution, completion, replayIdempotencyKey) {
+    const taskRunCompleted = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Submit attempt completion with backoff", { retry });
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("TASK_RUN_COMPLETED", {
+        version: "v1",
+        execution,
+        completion
+      });
+    });
+    if (!taskRunCompleted.success) {
+      logger.error("Failed to complete lazy attempt with backoff", {
+        cause: taskRunCompleted.cause,
+        error: taskRunCompleted.error
+      });
+      this.#failRun(execution.run.id, taskRunCompleted.error);
+      return;
+    }
+    const { willCheckpointAndRestore, shouldExit } = taskRunCompleted.result;
+    logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+    const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
+    await this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
+    if (willCheckpointAndRestore) {
+      if (!this.submitAttemptCompletionReplay) {
+        this.submitAttemptCompletionReplay = {
+          message: {
+            execution,
+            completion
+          },
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
+          logger.error(
+            "attempt completion handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.submitAttemptCompletionReplay.attempt++;
+      }
+    }
+  }
  #returnValidatedExtraHeaders(headers) {
    for (const [key, value] of Object.entries(headers)) {
      if (value === void 0) {
@@ -1099,7 +1582,7 @@ var ProdWorker = class {
    }
    return headers;
  }
-  // FIXME: If the the worker can't connect for a while, this runs MANY times - it should only run once
+  // MARK: COORDINATOR SOCKET
  #createCoordinatorSocket(host) {
    const extraHeaders = this.#returnValidatedExtraHeaders({
      "x-machine-name": MACHINE_NAME,
@@ -1123,6 +1606,10 @@ var ProdWorker = class {
      clientMessages: ProdWorkerToCoordinatorMessages,
      serverMessages: CoordinatorToProdWorkerMessages,
      extraHeaders,
+      ioOptions: {
+        reconnectionDelay: 1e3,
+        reconnectionDelayMax: 3e3
+      },
      handlers: {
        RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
          if (!this.paused) {
@@ -1148,6 +1635,16 @@ var ProdWorker = class {
            );
            return;
          }
+          switch (this.nextResumeAfter) {
+            case "WAIT_FOR_TASK": {
+              this.waitForTaskReplay = void 0;
+              break;
+            }
+            case "WAIT_FOR_BATCH": {
+              this.waitForBatchReplay = void 0;
+              break;
+            }
+          }
          this.paused = false;
          this.nextResumeAfter = void 0;
          this.waitForPostStart = false;
@@ -1171,8 +1668,10 @@ var ProdWorker = class {
          });
          return;
        }
+          this.durationResumeFallback = void 0;
          this.#resumeAfterDuration();
        },
+        // Deprecated: This will never get called as this worker supports lazy attempts. It's only here for a quick view of the flow old workers use.
        EXECUTE_TASK_RUN: async ({ executionPayload }) => {
          if (this.executing) {
            logger.error("dropping execute request, already executing");
@@ -1193,47 +1692,31 @@ var ProdWorker = class {
            completion
          });
          logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
-          this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+          await this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
        },
        EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+          this.readyForLazyAttemptReplay = void 0;
          if (this.executing) {
            logger.error("dropping execute request, already executing");
            return;
          }
+          const attemptCount = message.lazyPayload.attemptCount ?? 0;
+          logger.log("execute attempt counts", { attemptCount, completed: this.completed.size });
+          if (this.completed.size > 0 && this.completed.size >= attemptCount + 1) {
+            logger.error("dropping execute request, already completed");
+            return;
+          }
          this.executing = true;
          try {
            const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
            logger.log("completed", completion);
            this.completed.add(execution.attempt.id);
-            const { willCheckpointAndRestore, shouldExit } = await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
-              version: "v1",
-              execution,
-              completion
-            });
-            logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
-            const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
-            this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
+            await this.#submitAttemptCompletion(execution, completion);
          } catch (error) {
-            const completion = {
-              ok: false,
-              id: message.lazyPayload.runId,
-              retry: void 0,
-              error: error instanceof Error ? {
-                type: "BUILT_IN_ERROR",
-                name: error.name,
-                message: error.message,
-                stackTrace: error.stack ?? ""
-              } : {
-                type: "BUILT_IN_ERROR",
-                name: "UnknownError",
-                message: String(error),
-                stackTrace: ""
-              }
-            };
-            this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
-              version: "v1",
-              completion
+            logger.error("Failed to complete lazy attempt", {
+              error
            });
+            this.#failRun(message.lazyPayload.runId, error);
          }
        },
        REQUEST_ATTEMPT_CANCELLATION: async (message) => {
@@ -1247,133 +1730,153 @@ var ProdWorker = class {
        REQUEST_EXIT: async (message) => {
          if (message.version === "v2" && message.delayInMs) {
            logger.log("exit requested with delay", { delayInMs: message.delayInMs });
-            await setTimeout2(message.delayInMs);
+            await timeout2(message.delayInMs);
          }
          this.#coordinatorSocket.close();
          process.exit(0);
        },
        READY_FOR_RETRY: async (message) => {
          if (this.completed.size < 1) {
+            logger.error("Received READY_FOR_RETRY but no completions yet. This is a bug.");
            return;
          }
-          this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
-            version: "v1",
-            runId: this.runId,
-            totalCompletions: this.completed.size
-          });
+          this.submitAttemptCompletionReplay = void 0;
+          await this.#readyForLazyAttempt();
        }
      },
+      // MARK: ON CONNECTION
      onConnection: async (socket, handler, sender, logger2) => {
-        logger2.log("connected to coordinator", { status: this.#status });
-        if (this.waitForPostStart) {
-          logger2.log("skip connection handler, waiting for post start hook");
-          return;
-        }
-        if (this.paused) {
-          if (!this.nextResumeAfter) {
-            logger2.error("Missing next resume reason", { status: this.#status });
-            this.#emitUnrecoverableError(
-              "NoNextResume",
-              "Next resume reason not set while resuming from paused state"
-            );
-            return;
-          }
-          if (!this.attemptFriendlyId) {
-            logger2.error("Missing friendly ID", { status: this.#status });
-            this.#emitUnrecoverableError(
-              "NoAttemptId",
-              "Attempt ID not set while resuming from paused state"
-            );
+        logger2.log("connected to coordinator", {
+          status: this.#status,
+          connectionCount: ++this.connectionCount
+        });
+        socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId });
+        try {
+          if (this.waitForPostStart) {
+            logger2.log("skip connection handler, waiting for post start hook");
            return;
          }
-          socket.emit("READY_FOR_RESUME", {
-            version: "v1",
-            attemptFriendlyId: this.attemptFriendlyId,
-            type: this.nextResumeAfter
-          });
-          return;
-        }
-        if (process.env.INDEX_TASKS === "true") {
-          const failIndex = (error) => {
-            socket.emit("INDEXING_FAILED", {
+          if (this.paused) {
+            if (!this.nextResumeAfter) {
+              logger2.error("Missing next resume reason", { status: this.#status });
+              this.#emitUnrecoverableError(
+                "NoNextResume",
+                "Next resume reason not set while resuming from paused state"
+              );
+              return;
+            }
+            if (!this.attemptFriendlyId) {
+              logger2.error("Missing friendly ID", { status: this.#status });
+              this.#emitUnrecoverableError(
+                "NoAttemptId",
+                "Attempt ID not set while resuming from paused state"
+              );
+              return;
+            }
+            socket.emit("READY_FOR_RESUME", {
              version: "v1",
-              deploymentId: this.deploymentId,
-              error
-            });
-          };
-          try {
-            const taskResources = await this.#initializeWorker();
-            const { success } = await socket.emitWithAck("INDEX_TASKS", {
-              version: "v2",
-              deploymentId: this.deploymentId,
-              ...taskResources,
-              supportsLazyAttempts: true
+              attemptFriendlyId: this.attemptFriendlyId,
+              type: this.nextResumeAfter
            });
-            if (success) {
-              logger2.info("indexing done, shutting down..");
-              process.exit(0);
-            } else {
-              logger2.info("indexing failure, shutting down..");
-              process.exit(1);
-            }
-          } catch (e) {
-            const stderr = this.#backgroundWorker.stderr.join("\n");
-            if (e instanceof TaskMetadataParseError) {
-              logger2.error("tasks metadata parse error", {
-                zodIssues: e.zodIssues,
-                tasks: e.tasks
-              });
-              failIndex({
-                name: "TaskMetadataParseError",
-                message: "There was an error parsing the task metadata",
-                stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
-                stderr
+            return;
+          }
+          if (process.env.INDEX_TASKS === "true") {
+            const failIndex = (error) => {
+              socket.emit("INDEXING_FAILED", {
+                version: "v1",
+                deploymentId: this.deploymentId,
+                error
              });
-            } else if (e instanceof UncaughtExceptionError) {
-              const error = {
-                name: e.originalError.name,
-                message: e.originalError.message,
-                stack: e.originalError.stack,
-                stderr
-              };
-              logger2.error("uncaught exception", { originalError: error });
-              failIndex(error);
-            } else if (e instanceof Error) {
-              const error = {
-                name: e.name,
-                message: e.message,
-                stack: e.stack,
-                stderr
-              };
-              logger2.error("error", { error });
+            };
+            process.removeAllListeners("uncaughtException");
+            process.on("uncaughtException", (error) => {
+              console.error("Uncaught exception while indexing", error);
              failIndex(error);
-            } else if (typeof e === "string") {
-              logger2.error("string error", { error: { message: e } });
-              failIndex({
-                name: "Error",
-                message: e,
-                stderr
-              });
-            } else {
-              logger2.error("unknown error", { error: e });
-              failIndex({
-                name: "Error",
-                message: "Unknown error",
-                stderr
+            });
+            try {
+              const taskResources = await this.#initializeWorker();
+              const indexTasks = await defaultBackoff.maxRetries(3).execute(async () => {
+                return await socket.timeout(2e4).emitWithAck("INDEX_TASKS", {
+                  version: "v2",
+                  deploymentId: this.deploymentId,
+                  ...taskResources,
+                  supportsLazyAttempts: true
+                });
              });
+              if (!indexTasks.success || !indexTasks.result.success) {
+                logger2.error("indexing failure, shutting down..", { indexTasks });
+                process.exit(1);
+              } else {
+                logger2.info("indexing done, shutting down..");
+                process.exit(0);
+              }
+            } catch (e) {
+              const stderr = this.#backgroundWorker.stderr.join("\n");
+              if (e instanceof TaskMetadataParseError) {
+                logger2.error("tasks metadata parse error", {
+                  zodIssues: e.zodIssues,
+                  tasks: e.tasks
+                });
+                failIndex({
+                  name: "TaskMetadataParseError",
+                  message: "There was an error parsing the task metadata",
+                  stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
+                  stderr
+                });
+              } else if (e instanceof UncaughtExceptionError) {
+                const error = {
+                  name: e.originalError.name,
+                  message: e.originalError.message,
+                  stack: e.originalError.stack,
+                  stderr
+                };
+                logger2.error("uncaught exception", { originalError: error });
+                failIndex(error);
+              } else if (e instanceof Error) {
+                const error = {
+                  name: e.name,
+                  message: e.message,
+                  stack: e.stack,
+                  stderr
+                };
+                logger2.error("error", { error });
+                failIndex(error);
+              } else if (typeof e === "string") {
+                logger2.error("string error", { error: { message: e } });
+                failIndex({
+                  name: "Error",
+                  message: e,
+                  stderr
+                });
+              } else {
+                logger2.error("unknown error", { error: e });
+                failIndex({
+                  name: "Error",
+                  message: "Unknown error",
+                  stderr
+                });
+              }
+              await timeout2(1e3);
+              process.exit(EXIT_CODE_ALREADY_HANDLED);
            }
-            await setTimeout2(200);
-            process.exit(EXIT_CODE_ALREADY_HANDLED);
          }
+          if (this.executing) {
+            return;
+          }
+          process.removeAllListeners("uncaughtException");
+          process.on("uncaughtException", (error) => {
+            console.error("Uncaught exception during run", error);
+            this.#failRun(this.runId, error);
+          });
+          await this.#readyForLazyAttempt();
+        } catch (error) {
+          logger2.error("connection handler error", { error });
+        } finally {
+          if (this.connectionCount === 1) {
+            return;
+          }
+          this.#handleReplays();
        }
-        if (this.executing) {
-          return;
-        }
-        socket.emit("READY_FOR_LAZY_ATTEMPT", {
-          version: "v1",
-          runId: this.runId,
-          totalCompletions: this.completed.size
-        });
      },
      onError: async (socket, err, logger2) => {
        logger2.error("onError", {
@@ -1382,13 +1885,109 @@ var ProdWorker = class {
            message: err.message
          }
        });
-        await this.#reconnect();
-      },
-      onDisconnect: async (socket, reason, description, logger2) => {
      }
    });
    return coordinatorConnection;
  }
+  // MARK: REPLAYS
+  async #handleReplays() {
+    const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
+    const replayCancellationDelay = 2e4;
+    if (this.waitForTaskReplay) {
+      logger.log("replaying wait for task", { ...this.waitForTaskReplay });
+      const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.waitForTaskReplay) {
+        logger.error("wait for task replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+        logger.error("wait for task replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.waitForTaskReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#waitForTaskHandler(message);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("wait for task replay retry limit exceeded", { error });
+        } else {
+          logger.error("wait for task replay error", { error });
+        }
+      }
+      return;
+    }
+    if (this.waitForBatchReplay) {
+      logger.log("replaying wait for batch", {
+        ...this.waitForBatchReplay,
+        cancellationDelay: replayCancellationDelay
+      });
+      const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.waitForBatchReplay) {
+        logger.error("wait for batch replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+        logger.error("wait for batch replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.waitForBatchReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#waitForBatchHandler(message);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("wait for batch replay retry limit exceeded", { error });
+        } else {
+          logger.error("wait for batch replay error", { error });
+        }
+      }
+      return;
+    }
+    if (this.submitAttemptCompletionReplay) {
+      logger.log("replaying attempt completion", {
+        ...this.submitAttemptCompletionReplay,
+        cancellationDelay: replayCancellationDelay
+      });
+      const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.submitAttemptCompletionReplay) {
+        logger.error("attempt completion replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
+        logger.error("attempt completion replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.submitAttemptCompletionReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#submitAttemptCompletion(message.execution, message.completion, idempotencyKey);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("attempt completion replay retry limit exceeded", { error });
+        } else {
+          logger.error("attempt completion replay error", { error });
+        }
+      }
+      return;
+    }
+  }
+  // MARK: HTTP SERVER
  #createHttpServer() {
    const httpServer = createServer(async (req, res) => {
      logger.log(`[${req.method}]`, req.url);
@@ -1407,17 +2006,13 @@ var ProdWorker = class {
          return reply.text("Connected to coordinator");
        }
        case "/close": {
-          await this.#coordinatorSocket.sendWithAck("LOG", {
-            version: "v1",
-            text: `[${req.method}] ${req.url}`
-          });
          this.#coordinatorSocket.close();
+          this.connectionCount = 0;
          return reply.text("Disconnected from coordinator");
        }
        case "/test": {
-          await this.#coordinatorSocket.sendWithAck("LOG", {
-            version: "v1",
-            text: `[${req.method}] ${req.url}`
+          await this.#coordinatorSocket.socket.timeout(1e4).emitWithAck("TEST", {
+            version: "v1"
          });
          return reply.text("Received ACK from coordinator");
        }
@@ -1452,7 +2047,7 @@ var ProdWorker = class {
          break;
        }
        case "restore": {
-          await this.#reconnect(true, true);
+          await this.#reconnectAfterPostStart();
          break;
        }
        default: {
@@ -1483,7 +2078,7 @@ var ProdWorker = class {
      }
      logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
      this.#httpPort = getRandomPortNumber();
-      await setTimeout2(100);
+      await timeout2(100);
      this.start();
    });
    return httpServer;
@@ -1493,8 +2088,12 @@ var ProdWorker = class {
    await this.#backgroundWorker.initialize({ env: envVars });
    let packageVersion;
    const taskResources = [];
-    if (!this.#backgroundWorker.tasks) {
-      throw new Error(`Background Worker started without tasks`);
+    if (!this.#backgroundWorker.tasks || this.#backgroundWorker.tasks.length === 0) {
+      throw new Error(
+        `Background Worker started without tasks. Searched in: ${__PROJECT_CONFIG__.triggerDirectories?.join(
+          ", "
+        )}`
+      );
    }
    for (const task of this.#backgroundWorker.tasks) {
      taskResources.push(task);
@@ -1528,7 +2127,9 @@ var ProdWorker = class {
      completed: this.completed.size,
      nextResumeAfter: this.nextResumeAfter,
      waitForPostStart: this.waitForPostStart,
-      attemptFriendlyId: this.attemptFriendlyId
+      attemptFriendlyId: this.attemptFriendlyId,
+      waitForTaskReplay: this.waitForTaskReplay,
+      waitForBatchReplay: this.waitForBatchReplay
    };
  }
  #emitUnrecoverableError(name, message) {