trigger.dev 3.0.0-beta.45 → 3.0.0-beta.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -70,9 +70,276 @@ var SimpleLogger = class {
|
|
|
70
70
|
var EXIT_CODE_ALREADY_HANDLED = 111;
|
|
71
71
|
var EXIT_CODE_CHILD_NONZERO = 112;
|
|
72
72
|
|
|
73
|
-
// src/
|
|
74
|
-
import {
|
|
75
|
-
|
|
73
|
+
// ../core-apps/src/backoff.ts
|
|
74
|
+
import { setTimeout as timeout } from "node:timers/promises";
|
|
75
|
+
var StopRetrying = class extends Error {
|
|
76
|
+
constructor(message) {
|
|
77
|
+
super(message);
|
|
78
|
+
this.name = "StopRetrying";
|
|
79
|
+
}
|
|
80
|
+
};
|
|
81
|
+
var AttemptTimeout = class extends Error {
|
|
82
|
+
constructor(message) {
|
|
83
|
+
super(message);
|
|
84
|
+
this.name = "AttemptTimeout";
|
|
85
|
+
}
|
|
86
|
+
};
|
|
87
|
+
var RetryLimitExceeded = class extends Error {
|
|
88
|
+
constructor(message) {
|
|
89
|
+
super(message);
|
|
90
|
+
this.name = "RetryLimitExceeded";
|
|
91
|
+
}
|
|
92
|
+
};
|
|
93
|
+
var ExponentialBackoff = class _ExponentialBackoff {
|
|
94
|
+
#retries = 0;
|
|
95
|
+
#type;
|
|
96
|
+
#base;
|
|
97
|
+
#factor;
|
|
98
|
+
#min;
|
|
99
|
+
#max;
|
|
100
|
+
#maxRetries;
|
|
101
|
+
#maxElapsed;
|
|
102
|
+
constructor(type, opts = {}) {
|
|
103
|
+
this.#type = type ?? "NoJitter";
|
|
104
|
+
this.#base = opts.base ?? 2;
|
|
105
|
+
this.#factor = opts.factor ?? 1;
|
|
106
|
+
this.#min = opts.min ?? -Infinity;
|
|
107
|
+
this.#max = opts.max ?? Infinity;
|
|
108
|
+
this.#maxRetries = opts.maxRetries ?? Infinity;
|
|
109
|
+
this.#maxElapsed = opts.maxElapsed ?? Infinity;
|
|
110
|
+
}
|
|
111
|
+
#clone(type, opts = {}) {
|
|
112
|
+
return new _ExponentialBackoff(type ?? this.#type, {
|
|
113
|
+
base: opts.base ?? this.#base,
|
|
114
|
+
factor: opts.factor ?? this.#factor,
|
|
115
|
+
min: opts.min ?? this.#min,
|
|
116
|
+
max: opts.max ?? this.#max,
|
|
117
|
+
maxRetries: opts.maxRetries ?? this.#maxRetries,
|
|
118
|
+
maxElapsed: opts.maxElapsed ?? this.#maxElapsed
|
|
119
|
+
});
|
|
120
|
+
}
|
|
121
|
+
type(type) {
|
|
122
|
+
return this.#clone(type);
|
|
123
|
+
}
|
|
124
|
+
base(base) {
|
|
125
|
+
return this.#clone(void 0, { base });
|
|
126
|
+
}
|
|
127
|
+
factor(factor) {
|
|
128
|
+
return this.#clone(void 0, { factor });
|
|
129
|
+
}
|
|
130
|
+
min(min) {
|
|
131
|
+
return this.#clone(void 0, { min });
|
|
132
|
+
}
|
|
133
|
+
max(max) {
|
|
134
|
+
return this.#clone(void 0, { max });
|
|
135
|
+
}
|
|
136
|
+
maxRetries(maxRetries) {
|
|
137
|
+
return this.#clone(void 0, { maxRetries });
|
|
138
|
+
}
|
|
139
|
+
// TODO: With .execute(), should this also include the time it takes to execute the callback?
|
|
140
|
+
maxElapsed(maxElapsed) {
|
|
141
|
+
return this.#clone(void 0, { maxElapsed });
|
|
142
|
+
}
|
|
143
|
+
retries(retries) {
|
|
144
|
+
if (typeof retries !== "undefined") {
|
|
145
|
+
if (retries > this.#maxRetries) {
|
|
146
|
+
console.error(
|
|
147
|
+
`Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
|
|
148
|
+
);
|
|
149
|
+
this.#retries = this.#maxRetries;
|
|
150
|
+
} else {
|
|
151
|
+
this.#retries = retries;
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
return this.#clone();
|
|
155
|
+
}
|
|
156
|
+
async *retryAsync(maxRetries = this.#maxRetries ?? Infinity) {
|
|
157
|
+
let elapsed = 0;
|
|
158
|
+
let retry = 0;
|
|
159
|
+
while (retry <= maxRetries) {
|
|
160
|
+
const delay = this.delay(retry);
|
|
161
|
+
elapsed += delay;
|
|
162
|
+
if (elapsed > this.#maxElapsed) {
|
|
163
|
+
break;
|
|
164
|
+
}
|
|
165
|
+
yield {
|
|
166
|
+
delay: {
|
|
167
|
+
seconds: delay,
|
|
168
|
+
milliseconds: delay * 1e3
|
|
169
|
+
},
|
|
170
|
+
retry
|
|
171
|
+
};
|
|
172
|
+
retry++;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
async *[Symbol.asyncIterator]() {
|
|
176
|
+
yield* this.retryAsync();
|
|
177
|
+
}
|
|
178
|
+
/** Returns the delay for the current retry in seconds. */
|
|
179
|
+
delay(retries = this.#retries, jitter = true) {
|
|
180
|
+
if (retries > this.#maxRetries) {
|
|
181
|
+
console.error(
|
|
182
|
+
`Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
|
|
183
|
+
);
|
|
184
|
+
retries = this.#maxRetries;
|
|
185
|
+
}
|
|
186
|
+
let delay = this.#factor * this.#base ** retries;
|
|
187
|
+
switch (this.#type) {
|
|
188
|
+
case "NoJitter": {
|
|
189
|
+
break;
|
|
190
|
+
}
|
|
191
|
+
case "FullJitter": {
|
|
192
|
+
if (!jitter) {
|
|
193
|
+
delay = 0;
|
|
194
|
+
break;
|
|
195
|
+
}
|
|
196
|
+
delay *= Math.random();
|
|
197
|
+
break;
|
|
198
|
+
}
|
|
199
|
+
case "EqualJitter": {
|
|
200
|
+
if (!jitter) {
|
|
201
|
+
delay *= 0.5;
|
|
202
|
+
break;
|
|
203
|
+
}
|
|
204
|
+
delay *= 0.5 * (1 + Math.random());
|
|
205
|
+
break;
|
|
206
|
+
}
|
|
207
|
+
default: {
|
|
208
|
+
throw new Error(`Unknown backoff type: ${this.#type}`);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
if (delay < this.#min) {
|
|
212
|
+
delay = this.#min + Math.random() * (this.#min * 0.2);
|
|
213
|
+
}
|
|
214
|
+
if (delay > this.#max) {
|
|
215
|
+
delay = this.#max - Math.random() * (this.#max * 0.2);
|
|
216
|
+
}
|
|
217
|
+
delay = Math.round(delay);
|
|
218
|
+
return delay;
|
|
219
|
+
}
|
|
220
|
+
/** Waits with the appropriate delay for the current retry. */
|
|
221
|
+
async wait(retries = this.#retries, jitter = true) {
|
|
222
|
+
if (retries > this.#maxRetries) {
|
|
223
|
+
console.error(`Retry limit exceeded: ${retries} > ${this.#maxRetries}`);
|
|
224
|
+
throw new RetryLimitExceeded();
|
|
225
|
+
}
|
|
226
|
+
const delay = this.delay(retries, jitter);
|
|
227
|
+
return await timeout(delay * 1e3);
|
|
228
|
+
}
|
|
229
|
+
elapsed(retries = this.#retries, jitter = true) {
|
|
230
|
+
let elapsed = 0;
|
|
231
|
+
for (let i = 0; i <= retries; i++) {
|
|
232
|
+
elapsed += this.delay(i, jitter);
|
|
233
|
+
}
|
|
234
|
+
const total = elapsed;
|
|
235
|
+
let days = 0;
|
|
236
|
+
if (elapsed > 3600 * 24) {
|
|
237
|
+
days = Math.floor(elapsed / 3600 / 24);
|
|
238
|
+
elapsed -= days * 3600 * 24;
|
|
239
|
+
}
|
|
240
|
+
let hours = 0;
|
|
241
|
+
if (elapsed > 3600) {
|
|
242
|
+
hours = Math.floor(elapsed / 3600);
|
|
243
|
+
elapsed -= hours * 3600;
|
|
244
|
+
}
|
|
245
|
+
let minutes = 0;
|
|
246
|
+
if (elapsed > 60) {
|
|
247
|
+
minutes = Math.floor(elapsed / 60);
|
|
248
|
+
elapsed -= minutes * 60;
|
|
249
|
+
}
|
|
250
|
+
const seconds = elapsed;
|
|
251
|
+
return {
|
|
252
|
+
seconds,
|
|
253
|
+
minutes,
|
|
254
|
+
hours,
|
|
255
|
+
days,
|
|
256
|
+
total
|
|
257
|
+
};
|
|
258
|
+
}
|
|
259
|
+
reset() {
|
|
260
|
+
this.#retries = 0;
|
|
261
|
+
return this;
|
|
262
|
+
}
|
|
263
|
+
next() {
|
|
264
|
+
this.#retries++;
|
|
265
|
+
return this.delay();
|
|
266
|
+
}
|
|
267
|
+
stop() {
|
|
268
|
+
throw new StopRetrying();
|
|
269
|
+
}
|
|
270
|
+
get state() {
|
|
271
|
+
return {
|
|
272
|
+
retries: this.#retries,
|
|
273
|
+
type: this.#type,
|
|
274
|
+
base: this.#base,
|
|
275
|
+
factor: this.#factor,
|
|
276
|
+
min: this.#min,
|
|
277
|
+
max: this.#max,
|
|
278
|
+
maxRetries: this.#maxRetries,
|
|
279
|
+
maxElapsed: this.#maxElapsed
|
|
280
|
+
};
|
|
281
|
+
}
|
|
282
|
+
async execute(callback, { attemptTimeoutMs = 0 } = {}) {
|
|
283
|
+
let elapsedMs = 0;
|
|
284
|
+
let finalError = void 0;
|
|
285
|
+
for await (const { delay, retry } of this) {
|
|
286
|
+
const start = Date.now();
|
|
287
|
+
if (retry > 0) {
|
|
288
|
+
console.log(`Retrying in ${delay.milliseconds}ms`);
|
|
289
|
+
await timeout(delay.milliseconds);
|
|
290
|
+
}
|
|
291
|
+
let attemptTimeout = void 0;
|
|
292
|
+
try {
|
|
293
|
+
const result = await new Promise(async (resolve, reject) => {
|
|
294
|
+
if (attemptTimeoutMs > 0) {
|
|
295
|
+
attemptTimeout = setTimeout(() => {
|
|
296
|
+
reject(new AttemptTimeout());
|
|
297
|
+
}, attemptTimeoutMs);
|
|
298
|
+
}
|
|
299
|
+
try {
|
|
300
|
+
const callbackResult = await callback({ delay, retry, elapsedMs });
|
|
301
|
+
resolve(callbackResult);
|
|
302
|
+
} catch (error) {
|
|
303
|
+
reject(error);
|
|
304
|
+
}
|
|
305
|
+
});
|
|
306
|
+
return {
|
|
307
|
+
success: true,
|
|
308
|
+
result
|
|
309
|
+
};
|
|
310
|
+
} catch (error) {
|
|
311
|
+
finalError = error;
|
|
312
|
+
if (error instanceof StopRetrying) {
|
|
313
|
+
return {
|
|
314
|
+
success: false,
|
|
315
|
+
cause: "StopRetrying",
|
|
316
|
+
error: error.message
|
|
317
|
+
};
|
|
318
|
+
}
|
|
319
|
+
if (error instanceof AttemptTimeout) {
|
|
320
|
+
continue;
|
|
321
|
+
}
|
|
322
|
+
} finally {
|
|
323
|
+
elapsedMs += Date.now() - start;
|
|
324
|
+
clearTimeout(attemptTimeout);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
if (finalError instanceof AttemptTimeout) {
|
|
328
|
+
return {
|
|
329
|
+
success: false,
|
|
330
|
+
cause: "Timeout"
|
|
331
|
+
};
|
|
332
|
+
} else {
|
|
333
|
+
return {
|
|
334
|
+
success: false,
|
|
335
|
+
cause: "MaxRetries",
|
|
336
|
+
error: finalError
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
static RetryLimitExceeded = RetryLimitExceeded;
|
|
341
|
+
static StopRetrying = StopRetrying;
|
|
342
|
+
};
|
|
76
343
|
|
|
77
344
|
// src/workers/prod/backgroundWorker.ts
|
|
78
345
|
import {
|
|
@@ -175,13 +442,9 @@ var ProdBackgroundWorker = class {
|
|
|
175
442
|
*/
|
|
176
443
|
onTaskHeartbeat = new Evt();
|
|
177
444
|
onTaskRunHeartbeat = new Evt();
|
|
178
|
-
onWaitForBatch = new Evt();
|
|
179
445
|
onWaitForDuration = new Evt();
|
|
180
446
|
onWaitForTask = new Evt();
|
|
181
|
-
|
|
182
|
-
checkpointCanceledNotification = Evt.create();
|
|
183
|
-
onReadyForCheckpoint = Evt.create();
|
|
184
|
-
onCancelCheckpoint = Evt.create();
|
|
447
|
+
onWaitForBatch = new Evt();
|
|
185
448
|
onCreateTaskRunAttempt = Evt.create();
|
|
186
449
|
attemptCreatedNotification = Evt.create();
|
|
187
450
|
_onClose = new Evt();
|
|
@@ -219,7 +482,10 @@ var ProdBackgroundWorker = class {
|
|
|
219
482
|
this._closed = true;
|
|
220
483
|
}
|
|
221
484
|
async flushTelemetry() {
|
|
485
|
+
console.log("Flushing telemetry");
|
|
486
|
+
const start = performance.now();
|
|
222
487
|
await this._taskRunProcess?.cleanup(false);
|
|
488
|
+
console.log("Flushed telemetry", { duration: performance.now() - start });
|
|
223
489
|
}
|
|
224
490
|
async initialize(options) {
|
|
225
491
|
if (this._initialized) {
|
|
@@ -242,7 +508,7 @@ var ProdBackgroundWorker = class {
|
|
|
242
508
|
...options?.env
|
|
243
509
|
}
|
|
244
510
|
});
|
|
245
|
-
const
|
|
511
|
+
const timeout3 = setTimeout(() => {
|
|
246
512
|
if (resolved) {
|
|
247
513
|
return;
|
|
248
514
|
}
|
|
@@ -259,7 +525,7 @@ var ProdBackgroundWorker = class {
|
|
|
259
525
|
});
|
|
260
526
|
child.on("exit", (code) => {
|
|
261
527
|
if (!resolved) {
|
|
262
|
-
clearTimeout(
|
|
528
|
+
clearTimeout(timeout3);
|
|
263
529
|
resolved = true;
|
|
264
530
|
reject(new Error(`Worker exited with code ${code}`));
|
|
265
531
|
}
|
|
@@ -271,7 +537,7 @@ var ProdBackgroundWorker = class {
|
|
|
271
537
|
handlers: {
|
|
272
538
|
TASKS_READY: async (message) => {
|
|
273
539
|
if (!resolved) {
|
|
274
|
-
clearTimeout(
|
|
540
|
+
clearTimeout(timeout3);
|
|
275
541
|
resolved = true;
|
|
276
542
|
resolve(message.tasks);
|
|
277
543
|
child.kill();
|
|
@@ -279,7 +545,7 @@ var ProdBackgroundWorker = class {
|
|
|
279
545
|
},
|
|
280
546
|
UNCAUGHT_EXCEPTION: async (message) => {
|
|
281
547
|
if (!resolved) {
|
|
282
|
-
clearTimeout(
|
|
548
|
+
clearTimeout(timeout3);
|
|
283
549
|
resolved = true;
|
|
284
550
|
reject(new UncaughtExceptionError(message.error, message.origin));
|
|
285
551
|
child.kill();
|
|
@@ -287,7 +553,7 @@ var ProdBackgroundWorker = class {
|
|
|
287
553
|
},
|
|
288
554
|
TASKS_FAILED_TO_PARSE: async (message) => {
|
|
289
555
|
if (!resolved) {
|
|
290
|
-
clearTimeout(
|
|
556
|
+
clearTimeout(timeout3);
|
|
291
557
|
resolved = true;
|
|
292
558
|
reject(new TaskMetadataParseError(message.zodIssues, message.tasks));
|
|
293
559
|
child.kill();
|
|
@@ -364,18 +630,6 @@ var ProdBackgroundWorker = class {
|
|
|
364
630
|
taskRunProcess.onWaitForTask.attach((message) => {
|
|
365
631
|
this.onWaitForTask.post(message);
|
|
366
632
|
});
|
|
367
|
-
taskRunProcess.onReadyForCheckpoint.attach((message) => {
|
|
368
|
-
this.onReadyForCheckpoint.post(message);
|
|
369
|
-
});
|
|
370
|
-
taskRunProcess.onCancelCheckpoint.attach((message) => {
|
|
371
|
-
this.onCancelCheckpoint.post(message);
|
|
372
|
-
});
|
|
373
|
-
this.preCheckpointNotification.attach((message) => {
|
|
374
|
-
taskRunProcess.preCheckpointNotification.post(message);
|
|
375
|
-
});
|
|
376
|
-
this.checkpointCanceledNotification.attach((message) => {
|
|
377
|
-
taskRunProcess.checkpointCanceledNotification.post(message);
|
|
378
|
-
});
|
|
379
633
|
await taskRunProcess.initialize();
|
|
380
634
|
this._taskRunProcess = taskRunProcess;
|
|
381
635
|
return this._taskRunProcess;
|
|
@@ -416,6 +670,7 @@ var ProdBackgroundWorker = class {
|
|
|
416
670
|
}
|
|
417
671
|
}
|
|
418
672
|
async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
|
|
673
|
+
console.log("Trying graceful exit", { kill, initialSignal });
|
|
419
674
|
try {
|
|
420
675
|
const initialExit = taskRunProcess.onExit.waitFor(5e3);
|
|
421
676
|
if (kill) {
|
|
@@ -428,6 +683,7 @@ var ProdBackgroundWorker = class {
|
|
|
428
683
|
}
|
|
429
684
|
}
|
|
430
685
|
async #tryForcefulExit(taskRunProcess) {
|
|
686
|
+
console.log("Trying forceful exit");
|
|
431
687
|
try {
|
|
432
688
|
const forcedKill = taskRunProcess.onExit.waitFor(5e3);
|
|
433
689
|
taskRunProcess.kill("SIGKILL");
|
|
@@ -541,16 +797,19 @@ var ProdBackgroundWorker = class {
|
|
|
541
797
|
this.onCreateTaskRunAttempt.post({ runId: payload.runId });
|
|
542
798
|
let execution;
|
|
543
799
|
try {
|
|
544
|
-
const
|
|
800
|
+
const start = performance.now();
|
|
801
|
+
const attemptCreated = await this.attemptCreatedNotification.waitFor(12e4);
|
|
545
802
|
if (!attemptCreated.success) {
|
|
546
|
-
throw new Error(
|
|
547
|
-
`Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
|
|
548
|
-
);
|
|
803
|
+
throw new Error(`${attemptCreated.reason ?? "Unknown error"}`);
|
|
549
804
|
}
|
|
805
|
+
console.log("Attempt created", {
|
|
806
|
+
number: attemptCreated.execution.attempt.number,
|
|
807
|
+
duration: performance.now() - start
|
|
808
|
+
});
|
|
550
809
|
execution = attemptCreated.execution;
|
|
551
810
|
} catch (error) {
|
|
552
811
|
console.error("Error while creating attempt", error);
|
|
553
|
-
throw new Error(`Failed to create
|
|
812
|
+
throw new Error(`Failed to create attempt: ${error}`);
|
|
554
813
|
}
|
|
555
814
|
const completion = await this.executeTaskRun(
|
|
556
815
|
{
|
|
@@ -596,13 +855,10 @@ var TaskRunProcess = class {
|
|
|
596
855
|
onTaskRunHeartbeat = new Evt();
|
|
597
856
|
onExit = new Evt();
|
|
598
857
|
onIsBeingKilled = new Evt();
|
|
599
|
-
onWaitForBatch = new Evt();
|
|
600
858
|
onWaitForDuration = new Evt();
|
|
601
859
|
onWaitForTask = new Evt();
|
|
860
|
+
onWaitForBatch = new Evt();
|
|
602
861
|
preCheckpointNotification = Evt.create();
|
|
603
|
-
checkpointCanceledNotification = Evt.create();
|
|
604
|
-
onReadyForCheckpoint = Evt.create();
|
|
605
|
-
onCancelCheckpoint = Evt.create();
|
|
606
862
|
async initialize() {
|
|
607
863
|
this._child = fork(this.path, {
|
|
608
864
|
stdio: [
|
|
@@ -650,6 +906,10 @@ var TaskRunProcess = class {
|
|
|
650
906
|
if (this.messageId) {
|
|
651
907
|
this.onTaskRunHeartbeat.post(this.messageId);
|
|
652
908
|
} else {
|
|
909
|
+
console.error(
|
|
910
|
+
"No message id for task heartbeat, falling back to (deprecated) attempt heartbeat",
|
|
911
|
+
{ id: message.id }
|
|
912
|
+
);
|
|
653
913
|
this.onTaskHeartbeat.post(message.id);
|
|
654
914
|
}
|
|
655
915
|
},
|
|
@@ -663,41 +923,6 @@ var TaskRunProcess = class {
|
|
|
663
923
|
},
|
|
664
924
|
WAIT_FOR_DURATION: async (message) => {
|
|
665
925
|
this.onWaitForDuration.post(message);
|
|
666
|
-
try {
|
|
667
|
-
const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor(
|
|
668
|
-
3e4
|
|
669
|
-
);
|
|
670
|
-
return {
|
|
671
|
-
willCheckpointAndRestore
|
|
672
|
-
};
|
|
673
|
-
} catch (error) {
|
|
674
|
-
console.error("Error while waiting for pre-checkpoint notification", error);
|
|
675
|
-
return {
|
|
676
|
-
willCheckpointAndRestore: false
|
|
677
|
-
};
|
|
678
|
-
}
|
|
679
|
-
},
|
|
680
|
-
READY_FOR_CHECKPOINT: async (message) => {
|
|
681
|
-
this.onReadyForCheckpoint.post(message);
|
|
682
|
-
},
|
|
683
|
-
CANCEL_CHECKPOINT: async (message) => {
|
|
684
|
-
const version = "v2";
|
|
685
|
-
this.onCancelCheckpoint.post(message);
|
|
686
|
-
try {
|
|
687
|
-
const { checkpointCanceled } = await this.checkpointCanceledNotification.waitFor(
|
|
688
|
-
3e4
|
|
689
|
-
);
|
|
690
|
-
return {
|
|
691
|
-
version,
|
|
692
|
-
checkpointCanceled
|
|
693
|
-
};
|
|
694
|
-
} catch (error) {
|
|
695
|
-
console.error("Error while waiting for checkpoint cancellation", error);
|
|
696
|
-
return {
|
|
697
|
-
version,
|
|
698
|
-
checkpointCanceled: true
|
|
699
|
-
};
|
|
700
|
-
}
|
|
701
926
|
}
|
|
702
927
|
}
|
|
703
928
|
});
|
|
@@ -727,14 +952,21 @@ var TaskRunProcess = class {
|
|
|
727
952
|
childPid: this._childPid,
|
|
728
953
|
realChildPid: this._child?.pid
|
|
729
954
|
});
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
955
|
+
try {
|
|
956
|
+
await this._ipc?.sendWithAck(
|
|
957
|
+
"CLEANUP",
|
|
958
|
+
{
|
|
959
|
+
flush: true,
|
|
960
|
+
kill: killParentProcess
|
|
961
|
+
},
|
|
962
|
+
3e4
|
|
963
|
+
);
|
|
964
|
+
} catch (error) {
|
|
965
|
+
console.error("Error while cleaning up task run process", error);
|
|
966
|
+
if (killParentProcess) {
|
|
967
|
+
process.exit(0);
|
|
968
|
+
}
|
|
969
|
+
}
|
|
738
970
|
if (killChildProcess) {
|
|
739
971
|
this._gracefulExitTimeoutElapsed = true;
|
|
740
972
|
await this.kill("SIGKILL");
|
|
@@ -764,19 +996,30 @@ var TaskRunProcess = class {
|
|
|
764
996
|
}
|
|
765
997
|
taskRunCompletedNotification(completion) {
|
|
766
998
|
if (!completion.ok && typeof completion.retry !== "undefined") {
|
|
999
|
+
console.error(
|
|
1000
|
+
"Task run completed with error and wants to retry, won't send task run completed notification"
|
|
1001
|
+
);
|
|
767
1002
|
return;
|
|
768
1003
|
}
|
|
769
|
-
if (this._child?.connected
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
1004
|
+
if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
|
|
1005
|
+
console.error(
|
|
1006
|
+
"Child process not connected or being killed, can't send task run completed notification"
|
|
1007
|
+
);
|
|
1008
|
+
return;
|
|
774
1009
|
}
|
|
1010
|
+
this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
|
|
1011
|
+
version: "v2",
|
|
1012
|
+
completion
|
|
1013
|
+
});
|
|
775
1014
|
}
|
|
776
1015
|
waitCompletedNotification() {
|
|
777
|
-
if (this._child?.connected
|
|
778
|
-
|
|
1016
|
+
if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
|
|
1017
|
+
console.error(
|
|
1018
|
+
"Child process not connected or being killed, can't send wait completed notification"
|
|
1019
|
+
);
|
|
1020
|
+
return;
|
|
779
1021
|
}
|
|
1022
|
+
this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
|
|
780
1023
|
}
|
|
781
1024
|
async #handleExit(code, signal) {
|
|
782
1025
|
console.log("handling child exit", { code, signal });
|
|
@@ -837,7 +1080,11 @@ var TaskRunProcess = class {
|
|
|
837
1080
|
};
|
|
838
1081
|
|
|
839
1082
|
// src/workers/prod/entry-point.ts
|
|
840
|
-
import {
|
|
1083
|
+
import { checkpointSafeTimeout, unboundedTimeout } from "@trigger.dev/core/v3/utils/timers";
|
|
1084
|
+
import { randomUUID } from "node:crypto";
|
|
1085
|
+
import { readFile } from "node:fs/promises";
|
|
1086
|
+
import { createServer } from "node:http";
|
|
1087
|
+
import { setTimeout as timeout2 } from "node:timers/promises";
|
|
841
1088
|
var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
|
|
842
1089
|
var COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
|
|
843
1090
|
var COORDINATOR_PORT = Number(process.env.COORDINATOR_PORT || 50080);
|
|
@@ -845,6 +1092,9 @@ var MACHINE_NAME = process.env.MACHINE_NAME || "local";
|
|
|
845
1092
|
var POD_NAME = process.env.POD_NAME || "some-pod";
|
|
846
1093
|
var SHORT_HASH = process.env.TRIGGER_CONTENT_HASH.slice(0, 9);
|
|
847
1094
|
var logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
|
|
1095
|
+
var defaultBackoff = new ExponentialBackoff("FullJitter", {
|
|
1096
|
+
maxRetries: 5
|
|
1097
|
+
});
|
|
848
1098
|
var ProdWorker = class {
|
|
849
1099
|
constructor(port, host = "0.0.0.0") {
|
|
850
1100
|
this.host = host;
|
|
@@ -869,6 +1119,12 @@ var ProdWorker = class {
|
|
|
869
1119
|
attemptFriendlyId;
|
|
870
1120
|
nextResumeAfter;
|
|
871
1121
|
waitForPostStart = false;
|
|
1122
|
+
connectionCount = 0;
|
|
1123
|
+
waitForTaskReplay;
|
|
1124
|
+
waitForBatchReplay;
|
|
1125
|
+
readyForLazyAttemptReplay;
|
|
1126
|
+
submitAttemptCompletionReplay;
|
|
1127
|
+
durationResumeFallback;
|
|
872
1128
|
#httpPort;
|
|
873
1129
|
#backgroundWorker;
|
|
874
1130
|
#httpServer;
|
|
@@ -882,7 +1138,7 @@ var ProdWorker = class {
|
|
|
882
1138
|
logger.log("Waiting for attempt to complete before exiting", {
|
|
883
1139
|
terminationGracePeriodSeconds
|
|
884
1140
|
});
|
|
885
|
-
await
|
|
1141
|
+
await timeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
|
|
886
1142
|
gracefulExitTimeoutElapsed = true;
|
|
887
1143
|
logger.log("Termination timeout reached, exiting gracefully.");
|
|
888
1144
|
} else {
|
|
@@ -899,14 +1155,10 @@ var ProdWorker = class {
|
|
|
899
1155
|
process.exit(exitCode);
|
|
900
1156
|
}
|
|
901
1157
|
}
|
|
902
|
-
async #
|
|
903
|
-
|
|
904
|
-
this.waitForPostStart = false;
|
|
905
|
-
}
|
|
1158
|
+
async #reconnectAfterPostStart() {
|
|
1159
|
+
this.waitForPostStart = false;
|
|
906
1160
|
this.#coordinatorSocket.close();
|
|
907
|
-
|
|
908
|
-
await setTimeout2(1e3);
|
|
909
|
-
}
|
|
1161
|
+
this.connectionCount = 0;
|
|
910
1162
|
let coordinatorHost = COORDINATOR_HOST;
|
|
911
1163
|
try {
|
|
912
1164
|
if (this.runningInKubernetes) {
|
|
@@ -930,6 +1182,98 @@ var ProdWorker = class {
|
|
|
930
1182
|
this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
|
|
931
1183
|
}
|
|
932
1184
|
}
|
|
1185
|
+
// MARK: TASK WAIT
|
|
1186
|
+
async #waitForTaskHandler(message, replayIdempotencyKey) {
|
|
1187
|
+
const waitForTask = await defaultBackoff.execute(async ({ retry }) => {
|
|
1188
|
+
logger.log("Wait for task with backoff", { retry });
|
|
1189
|
+
if (!this.attemptFriendlyId) {
|
|
1190
|
+
logger.error("Failed to send wait message, attempt friendly ID not set", { message });
|
|
1191
|
+
throw new ExponentialBackoff.StopRetrying("No attempt ID");
|
|
1192
|
+
}
|
|
1193
|
+
return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_TASK", {
|
|
1194
|
+
version: "v2",
|
|
1195
|
+
friendlyId: message.friendlyId,
|
|
1196
|
+
attemptFriendlyId: this.attemptFriendlyId
|
|
1197
|
+
});
|
|
1198
|
+
});
|
|
1199
|
+
if (!waitForTask.success) {
|
|
1200
|
+
logger.error("Failed to wait for task with backoff", {
|
|
1201
|
+
cause: waitForTask.cause,
|
|
1202
|
+
error: waitForTask.error
|
|
1203
|
+
});
|
|
1204
|
+
this.#emitUnrecoverableError(
|
|
1205
|
+
"WaitForTaskFailed",
|
|
1206
|
+
`${waitForTask.cause}: ${waitForTask.error}`
|
|
1207
|
+
);
|
|
1208
|
+
return;
|
|
1209
|
+
}
|
|
1210
|
+
const { willCheckpointAndRestore } = waitForTask.result;
|
|
1211
|
+
await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
|
|
1212
|
+
if (willCheckpointAndRestore) {
|
|
1213
|
+
if (!this.waitForTaskReplay) {
|
|
1214
|
+
this.waitForTaskReplay = {
|
|
1215
|
+
message,
|
|
1216
|
+
attempt: 1,
|
|
1217
|
+
idempotencyKey: randomUUID()
|
|
1218
|
+
};
|
|
1219
|
+
} else {
|
|
1220
|
+
if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
|
|
1221
|
+
logger.error(
|
|
1222
|
+
"wait for task handler called with mismatched idempotency key, won't overwrite replay request"
|
|
1223
|
+
);
|
|
1224
|
+
return;
|
|
1225
|
+
}
|
|
1226
|
+
this.waitForTaskReplay.attempt++;
|
|
1227
|
+
}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
// MARK: BATCH WAIT
|
|
1231
|
+
async #waitForBatchHandler(message, replayIdempotencyKey) {
|
|
1232
|
+
const waitForBatch = await defaultBackoff.execute(async ({ retry }) => {
|
|
1233
|
+
logger.log("Wait for batch with backoff", { retry });
|
|
1234
|
+
if (!this.attemptFriendlyId) {
|
|
1235
|
+
logger.error("Failed to send wait message, attempt friendly ID not set", { message });
|
|
1236
|
+
throw new ExponentialBackoff.StopRetrying("No attempt ID");
|
|
1237
|
+
}
|
|
1238
|
+
return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_BATCH", {
|
|
1239
|
+
version: "v2",
|
|
1240
|
+
batchFriendlyId: message.batchFriendlyId,
|
|
1241
|
+
runFriendlyIds: message.runFriendlyIds,
|
|
1242
|
+
attemptFriendlyId: this.attemptFriendlyId
|
|
1243
|
+
});
|
|
1244
|
+
});
|
|
1245
|
+
if (!waitForBatch.success) {
|
|
1246
|
+
logger.error("Failed to wait for batch with backoff", {
|
|
1247
|
+
cause: waitForBatch.cause,
|
|
1248
|
+
error: waitForBatch.error
|
|
1249
|
+
});
|
|
1250
|
+
this.#emitUnrecoverableError(
|
|
1251
|
+
"WaitForBatchFailed",
|
|
1252
|
+
`${waitForBatch.cause}: ${waitForBatch.error}`
|
|
1253
|
+
);
|
|
1254
|
+
return;
|
|
1255
|
+
}
|
|
1256
|
+
const { willCheckpointAndRestore } = waitForBatch.result;
|
|
1257
|
+
await this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
|
|
1258
|
+
if (willCheckpointAndRestore) {
|
|
1259
|
+
if (!this.waitForBatchReplay) {
|
|
1260
|
+
this.waitForBatchReplay = {
|
|
1261
|
+
message,
|
|
1262
|
+
attempt: 1,
|
|
1263
|
+
idempotencyKey: randomUUID()
|
|
1264
|
+
};
|
|
1265
|
+
} else {
|
|
1266
|
+
if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
|
|
1267
|
+
logger.error(
|
|
1268
|
+
"wait for task handler called with mismatched idempotency key, won't overwrite replay request"
|
|
1269
|
+
);
|
|
1270
|
+
return;
|
|
1271
|
+
}
|
|
1272
|
+
this.waitForBatchReplay.attempt++;
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
1275
|
+
}
|
|
1276
|
+
// MARK: WORKER CREATION
|
|
933
1277
|
#createBackgroundWorker() {
|
|
934
1278
|
const backgroundWorker = new ProdBackgroundWorker("worker.js", {
|
|
935
1279
|
projectConfig: __PROJECT_CONFIG__,
|
|
@@ -942,128 +1286,152 @@ var ProdWorker = class {
|
|
|
942
1286
|
contentHash: this.contentHash
|
|
943
1287
|
});
|
|
944
1288
|
backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
|
|
945
|
-
|
|
1289
|
+
logger.log("onTaskHeartbeat", { attemptFriendlyId });
|
|
1290
|
+
this.#coordinatorSocket.socket.volatile.emit("TASK_HEARTBEAT", {
|
|
1291
|
+
version: "v1",
|
|
1292
|
+
attemptFriendlyId
|
|
1293
|
+
});
|
|
946
1294
|
});
|
|
947
1295
|
backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
|
|
951
|
-
await this.#prepareForCheckpoint();
|
|
952
|
-
this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
|
|
953
|
-
});
|
|
954
|
-
backgroundWorker.onCancelCheckpoint.attach(async (message) => {
|
|
955
|
-
logger.log("onCancelCheckpoint", { message });
|
|
956
|
-
const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
|
|
957
|
-
"CANCEL_CHECKPOINT",
|
|
958
|
-
{
|
|
959
|
-
version: "v2",
|
|
960
|
-
reason: message.reason
|
|
961
|
-
}
|
|
962
|
-
);
|
|
963
|
-
logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
|
|
964
|
-
if (checkpointCanceled) {
|
|
965
|
-
if (message.reason === "WAIT_FOR_DURATION") {
|
|
966
|
-
this.paused = false;
|
|
967
|
-
this.nextResumeAfter = void 0;
|
|
968
|
-
this.waitForPostStart = false;
|
|
969
|
-
}
|
|
970
|
-
}
|
|
971
|
-
backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
|
|
1296
|
+
logger.log("onTaskRunHeartbeat", { runId });
|
|
1297
|
+
this.#coordinatorSocket.socket.volatile.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
|
|
972
1298
|
});
|
|
973
1299
|
backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
|
|
974
1300
|
logger.log("onCreateTaskRunAttempt()", { message });
|
|
975
|
-
const createAttempt = await
|
|
976
|
-
"
|
|
977
|
-
{
|
|
1301
|
+
const createAttempt = await defaultBackoff.execute(async ({ retry }) => {
|
|
1302
|
+
logger.log("Create task run attempt with backoff", { retry });
|
|
1303
|
+
return await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CREATE_TASK_RUN_ATTEMPT", {
|
|
978
1304
|
version: "v1",
|
|
979
1305
|
runId: message.runId
|
|
980
|
-
}
|
|
981
|
-
);
|
|
1306
|
+
});
|
|
1307
|
+
});
|
|
982
1308
|
if (!createAttempt.success) {
|
|
983
1309
|
backgroundWorker.attemptCreatedNotification.post({
|
|
984
1310
|
success: false,
|
|
985
|
-
reason: createAttempt.
|
|
1311
|
+
reason: `Failed to create attempt with backoff due to ${createAttempt.cause}. ${createAttempt.error}`
|
|
1312
|
+
});
|
|
1313
|
+
return;
|
|
1314
|
+
}
|
|
1315
|
+
if (!createAttempt.result.success) {
|
|
1316
|
+
backgroundWorker.attemptCreatedNotification.post({
|
|
1317
|
+
success: false,
|
|
1318
|
+
reason: createAttempt.result.reason
|
|
986
1319
|
});
|
|
987
1320
|
return;
|
|
988
1321
|
}
|
|
989
1322
|
backgroundWorker.attemptCreatedNotification.post({
|
|
990
1323
|
success: true,
|
|
991
|
-
execution: createAttempt.executionPayload.execution
|
|
1324
|
+
execution: createAttempt.result.executionPayload.execution
|
|
992
1325
|
});
|
|
993
1326
|
});
|
|
994
1327
|
backgroundWorker.attemptCreatedNotification.attach((message) => {
|
|
1328
|
+
logger.log("attemptCreatedNotification", {
|
|
1329
|
+
success: message.success,
|
|
1330
|
+
...message.success ? {
|
|
1331
|
+
attempt: message.execution.attempt,
|
|
1332
|
+
queue: message.execution.queue,
|
|
1333
|
+
worker: message.execution.worker,
|
|
1334
|
+
machine: message.execution.machine
|
|
1335
|
+
} : {
|
|
1336
|
+
reason: message.reason
|
|
1337
|
+
}
|
|
1338
|
+
});
|
|
995
1339
|
if (!message.success) {
|
|
996
1340
|
return;
|
|
997
1341
|
}
|
|
998
1342
|
this.attemptFriendlyId = message.execution.attempt.id;
|
|
999
1343
|
});
|
|
1000
1344
|
backgroundWorker.onWaitForDuration.attach(async (message) => {
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
)
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
|
|
1010
|
-
"WAIT_FOR_DURATION",
|
|
1011
|
-
{
|
|
1012
|
-
...message,
|
|
1013
|
-
attemptFriendlyId: this.attemptFriendlyId
|
|
1345
|
+
logger.log("onWaitForDuration", { ...message, drift: Date.now() - message.now });
|
|
1346
|
+
noResume: {
|
|
1347
|
+
const { ms, waitThresholdInMs } = message;
|
|
1348
|
+
const internalTimeout = unboundedTimeout(ms, "internal");
|
|
1349
|
+
const checkpointSafeInternalTimeout = checkpointSafeTimeout(ms);
|
|
1350
|
+
if (ms < waitThresholdInMs) {
|
|
1351
|
+
await internalTimeout;
|
|
1352
|
+
break noResume;
|
|
1014
1353
|
}
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
{
|
|
1027
|
-
|
|
1028
|
-
|
|
1354
|
+
const waitForDuration = await defaultBackoff.execute(async ({ retry }) => {
|
|
1355
|
+
logger.log("Wait for duration with backoff", { retry });
|
|
1356
|
+
if (!this.attemptFriendlyId) {
|
|
1357
|
+
logger.error("Failed to send wait message, attempt friendly ID not set", { message });
|
|
1358
|
+
throw new ExponentialBackoff.StopRetrying("No attempt ID");
|
|
1359
|
+
}
|
|
1360
|
+
return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_DURATION", {
|
|
1361
|
+
...message,
|
|
1362
|
+
attemptFriendlyId: this.attemptFriendlyId
|
|
1363
|
+
});
|
|
1364
|
+
});
|
|
1365
|
+
if (!waitForDuration.success) {
|
|
1366
|
+
logger.error("Failed to wait for duration with backoff", {
|
|
1367
|
+
cause: waitForDuration.cause,
|
|
1368
|
+
error: waitForDuration.error
|
|
1369
|
+
});
|
|
1370
|
+
this.#emitUnrecoverableError(
|
|
1371
|
+
"WaitForDurationFailed",
|
|
1372
|
+
`${waitForDuration.cause}: ${waitForDuration.error}`
|
|
1373
|
+
);
|
|
1374
|
+
return;
|
|
1375
|
+
}
|
|
1376
|
+
const { willCheckpointAndRestore } = waitForDuration.result;
|
|
1377
|
+
if (!willCheckpointAndRestore) {
|
|
1378
|
+
await internalTimeout;
|
|
1379
|
+
break noResume;
|
|
1380
|
+
}
|
|
1381
|
+
await this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
|
|
1382
|
+
await Promise.race([internalTimeout, checkpointSafeInternalTimeout]);
|
|
1383
|
+
try {
|
|
1384
|
+
const { checkpointCanceled } = await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CANCEL_CHECKPOINT", {
|
|
1385
|
+
version: "v2",
|
|
1386
|
+
reason: "WAIT_FOR_DURATION"
|
|
1387
|
+
});
|
|
1388
|
+
logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
|
|
1389
|
+
if (checkpointCanceled) {
|
|
1390
|
+
break noResume;
|
|
1391
|
+
}
|
|
1392
|
+
logger.log("Waiting for external duration resume as we may have been restored");
|
|
1393
|
+
const idempotencyKey = randomUUID();
|
|
1394
|
+
this.durationResumeFallback = { idempotencyKey };
|
|
1395
|
+
setTimeout(() => {
|
|
1396
|
+
if (!this.durationResumeFallback) {
|
|
1397
|
+
logger.error("Already resumed after duration, skipping fallback");
|
|
1398
|
+
return;
|
|
1399
|
+
}
|
|
1400
|
+
if (this.durationResumeFallback.idempotencyKey !== idempotencyKey) {
|
|
1401
|
+
logger.error("Duration resume idempotency key mismatch, skipping fallback");
|
|
1402
|
+
return;
|
|
1403
|
+
}
|
|
1404
|
+
logger.log("Resuming after duration with fallback");
|
|
1405
|
+
this.#resumeAfterDuration();
|
|
1406
|
+
}, 15e3);
|
|
1407
|
+
} catch (error) {
|
|
1408
|
+
logger.debug("Checkpoint cancellation timed out", { error });
|
|
1409
|
+
break noResume;
|
|
1029
1410
|
}
|
|
1030
|
-
);
|
|
1031
|
-
this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
|
|
1032
|
-
});
|
|
1033
|
-
backgroundWorker.onWaitForBatch.attach(async (message) => {
|
|
1034
|
-
if (!this.attemptFriendlyId) {
|
|
1035
|
-
logger.error("Failed to send wait message, attempt friendly ID not set", { message });
|
|
1036
|
-
this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
|
|
1037
1411
|
return;
|
|
1038
1412
|
}
|
|
1039
|
-
|
|
1040
|
-
"WAIT_FOR_BATCH",
|
|
1041
|
-
{
|
|
1042
|
-
...message,
|
|
1043
|
-
attemptFriendlyId: this.attemptFriendlyId
|
|
1044
|
-
}
|
|
1045
|
-
);
|
|
1046
|
-
this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
|
|
1413
|
+
this.#resumeAfterDuration();
|
|
1047
1414
|
});
|
|
1415
|
+
backgroundWorker.onWaitForTask.attach(this.#waitForTaskHandler.bind(this));
|
|
1416
|
+
backgroundWorker.onWaitForBatch.attach(this.#waitForBatchHandler.bind(this));
|
|
1048
1417
|
return backgroundWorker;
|
|
1049
1418
|
}
|
|
1050
1419
|
async #prepareForWait(reason, willCheckpointAndRestore) {
|
|
1051
1420
|
logger.log(`prepare for ${reason}`, { willCheckpointAndRestore });
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
this.paused = true;
|
|
1055
|
-
this.nextResumeAfter = reason;
|
|
1056
|
-
this.waitForPostStart = true;
|
|
1057
|
-
if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
|
|
1058
|
-
await this.#prepareForCheckpoint();
|
|
1059
|
-
}
|
|
1421
|
+
if (!willCheckpointAndRestore) {
|
|
1422
|
+
return;
|
|
1060
1423
|
}
|
|
1424
|
+
this.paused = true;
|
|
1425
|
+
this.nextResumeAfter = reason;
|
|
1426
|
+
this.waitForPostStart = true;
|
|
1427
|
+
await this.#prepareForCheckpoint();
|
|
1061
1428
|
}
|
|
1429
|
+
// MARK: RETRY PREP
|
|
1062
1430
|
async #prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode) {
|
|
1063
|
-
logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit });
|
|
1431
|
+
logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit, exitCode });
|
|
1064
1432
|
if (shouldExit) {
|
|
1065
1433
|
if (willCheckpointAndRestore) {
|
|
1066
|
-
logger.
|
|
1434
|
+
logger.error("WARNING: Will checkpoint but also requested exit. This won't end well.");
|
|
1067
1435
|
}
|
|
1068
1436
|
await this.#exitGracefully(false, exitCode);
|
|
1069
1437
|
return;
|
|
@@ -1072,18 +1440,33 @@ var ProdWorker = class {
|
|
|
1072
1440
|
this.waitForPostStart = false;
|
|
1073
1441
|
this.executing = false;
|
|
1074
1442
|
this.attemptFriendlyId = void 0;
|
|
1075
|
-
if (willCheckpointAndRestore) {
|
|
1076
|
-
this.waitForPostStart = true;
|
|
1077
|
-
this.#prepareForCheckpoint(false);
|
|
1078
|
-
this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
|
|
1443
|
+
if (!willCheckpointAndRestore) {
|
|
1079
1444
|
return;
|
|
1080
1445
|
}
|
|
1446
|
+
this.waitForPostStart = true;
|
|
1447
|
+
await this.#prepareForCheckpoint(false);
|
|
1081
1448
|
}
|
|
1449
|
+
// MARK: CHECKPOINT PREP
|
|
1082
1450
|
async #prepareForCheckpoint(flush = true) {
|
|
1083
1451
|
if (flush) {
|
|
1084
|
-
|
|
1452
|
+
try {
|
|
1453
|
+
await this.#backgroundWorker.flushTelemetry();
|
|
1454
|
+
} catch (error) {
|
|
1455
|
+
logger.error(
|
|
1456
|
+
"Failed to flush telemetry while preparing for checkpoint, will proceed anyway",
|
|
1457
|
+
{ error }
|
|
1458
|
+
);
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
try {
|
|
1462
|
+
await this.#backgroundWorker.forceKillOldTaskRunProcesses();
|
|
1463
|
+
} catch (error) {
|
|
1464
|
+
logger.error(
|
|
1465
|
+
"Failed to kill previous worker while preparing for checkpoint, will proceed anyway",
|
|
1466
|
+
{ error }
|
|
1467
|
+
);
|
|
1085
1468
|
}
|
|
1086
|
-
|
|
1469
|
+
this.#readyForCheckpoint();
|
|
1087
1470
|
}
|
|
1088
1471
|
#resumeAfterDuration() {
|
|
1089
1472
|
this.paused = false;
|
|
@@ -1091,6 +1474,106 @@ var ProdWorker = class {
|
|
|
1091
1474
|
this.waitForPostStart = false;
|
|
1092
1475
|
this.#backgroundWorker.waitCompletedNotification();
|
|
1093
1476
|
}
|
|
1477
|
+
async #readyForLazyAttempt() {
|
|
1478
|
+
const idempotencyKey = randomUUID();
|
|
1479
|
+
this.readyForLazyAttemptReplay = {
|
|
1480
|
+
idempotencyKey
|
|
1481
|
+
};
|
|
1482
|
+
for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(3)) {
|
|
1483
|
+
if (retry > 0) {
|
|
1484
|
+
logger.log("retrying ready for lazy attempt", { retry });
|
|
1485
|
+
}
|
|
1486
|
+
this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
|
|
1487
|
+
version: "v1",
|
|
1488
|
+
runId: this.runId,
|
|
1489
|
+
totalCompletions: this.completed.size
|
|
1490
|
+
});
|
|
1491
|
+
await timeout2(delay.milliseconds);
|
|
1492
|
+
if (!this.readyForLazyAttemptReplay) {
|
|
1493
|
+
logger.error("replay ready for lazy attempt cancelled, discarding", {
|
|
1494
|
+
idempotencyKey
|
|
1495
|
+
});
|
|
1496
|
+
return;
|
|
1497
|
+
}
|
|
1498
|
+
if (idempotencyKey !== this.readyForLazyAttemptReplay.idempotencyKey) {
|
|
1499
|
+
logger.error("replay ready for lazy attempt idempotency key mismatch, discarding", {
|
|
1500
|
+
idempotencyKey,
|
|
1501
|
+
newIdempotencyKey: this.readyForLazyAttemptReplay.idempotencyKey
|
|
1502
|
+
});
|
|
1503
|
+
return;
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
this.#failRun(this.runId, "Failed to receive execute request in a reasonable time");
|
|
1507
|
+
}
|
|
1508
|
+
#readyForCheckpoint() {
|
|
1509
|
+
this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
|
|
1510
|
+
}
|
|
1511
|
+
#failRun(anyRunId, error) {
|
|
1512
|
+
logger.error("Failing run", { anyRunId, error });
|
|
1513
|
+
const completion = {
|
|
1514
|
+
ok: false,
|
|
1515
|
+
id: anyRunId,
|
|
1516
|
+
retry: void 0,
|
|
1517
|
+
error: error instanceof Error ? {
|
|
1518
|
+
type: "BUILT_IN_ERROR",
|
|
1519
|
+
name: error.name,
|
|
1520
|
+
message: error.message,
|
|
1521
|
+
stackTrace: error.stack ?? ""
|
|
1522
|
+
} : {
|
|
1523
|
+
type: "BUILT_IN_ERROR",
|
|
1524
|
+
name: "UnknownError",
|
|
1525
|
+
message: String(error),
|
|
1526
|
+
stackTrace: ""
|
|
1527
|
+
}
|
|
1528
|
+
};
|
|
1529
|
+
this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
|
|
1530
|
+
version: "v1",
|
|
1531
|
+
completion
|
|
1532
|
+
});
|
|
1533
|
+
}
|
|
1534
|
+
// MARK: ATTEMPT COMPLETION
|
|
1535
|
+
async #submitAttemptCompletion(execution, completion, replayIdempotencyKey) {
|
|
1536
|
+
const taskRunCompleted = await defaultBackoff.execute(async ({ retry }) => {
|
|
1537
|
+
logger.log("Submit attempt completion with backoff", { retry });
|
|
1538
|
+
return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("TASK_RUN_COMPLETED", {
|
|
1539
|
+
version: "v1",
|
|
1540
|
+
execution,
|
|
1541
|
+
completion
|
|
1542
|
+
});
|
|
1543
|
+
});
|
|
1544
|
+
if (!taskRunCompleted.success) {
|
|
1545
|
+
logger.error("Failed to complete lazy attempt with backoff", {
|
|
1546
|
+
cause: taskRunCompleted.cause,
|
|
1547
|
+
error: taskRunCompleted.error
|
|
1548
|
+
});
|
|
1549
|
+
this.#failRun(execution.run.id, taskRunCompleted.error);
|
|
1550
|
+
return;
|
|
1551
|
+
}
|
|
1552
|
+
const { willCheckpointAndRestore, shouldExit } = taskRunCompleted.result;
|
|
1553
|
+
logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
|
|
1554
|
+
const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
|
|
1555
|
+
await this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
|
|
1556
|
+
if (willCheckpointAndRestore) {
|
|
1557
|
+
if (!this.submitAttemptCompletionReplay) {
|
|
1558
|
+
this.submitAttemptCompletionReplay = {
|
|
1559
|
+
message: {
|
|
1560
|
+
execution,
|
|
1561
|
+
completion
|
|
1562
|
+
},
|
|
1563
|
+
attempt: 1,
|
|
1564
|
+
idempotencyKey: randomUUID()
|
|
1565
|
+
};
|
|
1566
|
+
} else {
|
|
1567
|
+
if (replayIdempotencyKey && replayIdempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
|
|
1568
|
+
logger.error(
|
|
1569
|
+
"attempt completion handler called with mismatched idempotency key, won't overwrite replay request"
|
|
1570
|
+
);
|
|
1571
|
+
return;
|
|
1572
|
+
}
|
|
1573
|
+
this.submitAttemptCompletionReplay.attempt++;
|
|
1574
|
+
}
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
1094
1577
|
#returnValidatedExtraHeaders(headers) {
|
|
1095
1578
|
for (const [key, value] of Object.entries(headers)) {
|
|
1096
1579
|
if (value === void 0) {
|
|
@@ -1099,7 +1582,7 @@ var ProdWorker = class {
|
|
|
1099
1582
|
}
|
|
1100
1583
|
return headers;
|
|
1101
1584
|
}
|
|
1102
|
-
//
|
|
1585
|
+
// MARK: COORDINATOR SOCKET
|
|
1103
1586
|
#createCoordinatorSocket(host) {
|
|
1104
1587
|
const extraHeaders = this.#returnValidatedExtraHeaders({
|
|
1105
1588
|
"x-machine-name": MACHINE_NAME,
|
|
@@ -1123,6 +1606,10 @@ var ProdWorker = class {
|
|
|
1123
1606
|
clientMessages: ProdWorkerToCoordinatorMessages,
|
|
1124
1607
|
serverMessages: CoordinatorToProdWorkerMessages,
|
|
1125
1608
|
extraHeaders,
|
|
1609
|
+
ioOptions: {
|
|
1610
|
+
reconnectionDelay: 1e3,
|
|
1611
|
+
reconnectionDelayMax: 3e3
|
|
1612
|
+
},
|
|
1126
1613
|
handlers: {
|
|
1127
1614
|
RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
|
|
1128
1615
|
if (!this.paused) {
|
|
@@ -1148,6 +1635,16 @@ var ProdWorker = class {
|
|
|
1148
1635
|
);
|
|
1149
1636
|
return;
|
|
1150
1637
|
}
|
|
1638
|
+
switch (this.nextResumeAfter) {
|
|
1639
|
+
case "WAIT_FOR_TASK": {
|
|
1640
|
+
this.waitForTaskReplay = void 0;
|
|
1641
|
+
break;
|
|
1642
|
+
}
|
|
1643
|
+
case "WAIT_FOR_BATCH": {
|
|
1644
|
+
this.waitForBatchReplay = void 0;
|
|
1645
|
+
break;
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1151
1648
|
this.paused = false;
|
|
1152
1649
|
this.nextResumeAfter = void 0;
|
|
1153
1650
|
this.waitForPostStart = false;
|
|
@@ -1171,8 +1668,10 @@ var ProdWorker = class {
|
|
|
1171
1668
|
});
|
|
1172
1669
|
return;
|
|
1173
1670
|
}
|
|
1671
|
+
this.durationResumeFallback = void 0;
|
|
1174
1672
|
this.#resumeAfterDuration();
|
|
1175
1673
|
},
|
|
1674
|
+
// Deprecated: This will never get called as this worker supports lazy attempts. It's only here for a quick view of the flow old workers use.
|
|
1176
1675
|
EXECUTE_TASK_RUN: async ({ executionPayload }) => {
|
|
1177
1676
|
if (this.executing) {
|
|
1178
1677
|
logger.error("dropping execute request, already executing");
|
|
@@ -1193,47 +1692,31 @@ var ProdWorker = class {
|
|
|
1193
1692
|
completion
|
|
1194
1693
|
});
|
|
1195
1694
|
logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
|
|
1196
|
-
this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
|
|
1695
|
+
await this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
|
|
1197
1696
|
},
|
|
1198
1697
|
EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
|
|
1698
|
+
this.readyForLazyAttemptReplay = void 0;
|
|
1199
1699
|
if (this.executing) {
|
|
1200
1700
|
logger.error("dropping execute request, already executing");
|
|
1201
1701
|
return;
|
|
1202
1702
|
}
|
|
1703
|
+
const attemptCount = message.lazyPayload.attemptCount ?? 0;
|
|
1704
|
+
logger.log("execute attempt counts", { attemptCount, completed: this.completed.size });
|
|
1705
|
+
if (this.completed.size > 0 && this.completed.size >= attemptCount + 1) {
|
|
1706
|
+
logger.error("dropping execute request, already completed");
|
|
1707
|
+
return;
|
|
1708
|
+
}
|
|
1203
1709
|
this.executing = true;
|
|
1204
1710
|
try {
|
|
1205
1711
|
const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
|
|
1206
1712
|
logger.log("completed", completion);
|
|
1207
1713
|
this.completed.add(execution.attempt.id);
|
|
1208
|
-
|
|
1209
|
-
version: "v1",
|
|
1210
|
-
execution,
|
|
1211
|
-
completion
|
|
1212
|
-
});
|
|
1213
|
-
logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
|
|
1214
|
-
const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
|
|
1215
|
-
this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
|
|
1714
|
+
await this.#submitAttemptCompletion(execution, completion);
|
|
1216
1715
|
} catch (error) {
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
id: message.lazyPayload.runId,
|
|
1220
|
-
retry: void 0,
|
|
1221
|
-
error: error instanceof Error ? {
|
|
1222
|
-
type: "BUILT_IN_ERROR",
|
|
1223
|
-
name: error.name,
|
|
1224
|
-
message: error.message,
|
|
1225
|
-
stackTrace: error.stack ?? ""
|
|
1226
|
-
} : {
|
|
1227
|
-
type: "BUILT_IN_ERROR",
|
|
1228
|
-
name: "UnknownError",
|
|
1229
|
-
message: String(error),
|
|
1230
|
-
stackTrace: ""
|
|
1231
|
-
}
|
|
1232
|
-
};
|
|
1233
|
-
this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
|
|
1234
|
-
version: "v1",
|
|
1235
|
-
completion
|
|
1716
|
+
logger.error("Failed to complete lazy attempt", {
|
|
1717
|
+
error
|
|
1236
1718
|
});
|
|
1719
|
+
this.#failRun(message.lazyPayload.runId, error);
|
|
1237
1720
|
}
|
|
1238
1721
|
},
|
|
1239
1722
|
REQUEST_ATTEMPT_CANCELLATION: async (message) => {
|
|
@@ -1247,133 +1730,153 @@ var ProdWorker = class {
|
|
|
1247
1730
|
REQUEST_EXIT: async (message) => {
|
|
1248
1731
|
if (message.version === "v2" && message.delayInMs) {
|
|
1249
1732
|
logger.log("exit requested with delay", { delayInMs: message.delayInMs });
|
|
1250
|
-
await
|
|
1733
|
+
await timeout2(message.delayInMs);
|
|
1251
1734
|
}
|
|
1252
1735
|
this.#coordinatorSocket.close();
|
|
1253
1736
|
process.exit(0);
|
|
1254
1737
|
},
|
|
1255
1738
|
READY_FOR_RETRY: async (message) => {
|
|
1256
1739
|
if (this.completed.size < 1) {
|
|
1740
|
+
logger.error("Received READY_FOR_RETRY but no completions yet. This is a bug.");
|
|
1257
1741
|
return;
|
|
1258
1742
|
}
|
|
1259
|
-
this
|
|
1260
|
-
|
|
1261
|
-
runId: this.runId,
|
|
1262
|
-
totalCompletions: this.completed.size
|
|
1263
|
-
});
|
|
1743
|
+
this.submitAttemptCompletionReplay = void 0;
|
|
1744
|
+
await this.#readyForLazyAttempt();
|
|
1264
1745
|
}
|
|
1265
1746
|
},
|
|
1747
|
+
// MARK: ON CONNECTION
|
|
1266
1748
|
onConnection: async (socket, handler, sender, logger2) => {
|
|
1267
|
-
logger2.log("connected to coordinator", {
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
if (
|
|
1274
|
-
logger2.
|
|
1275
|
-
this.#emitUnrecoverableError(
|
|
1276
|
-
"NoNextResume",
|
|
1277
|
-
"Next resume reason not set while resuming from paused state"
|
|
1278
|
-
);
|
|
1279
|
-
return;
|
|
1280
|
-
}
|
|
1281
|
-
if (!this.attemptFriendlyId) {
|
|
1282
|
-
logger2.error("Missing friendly ID", { status: this.#status });
|
|
1283
|
-
this.#emitUnrecoverableError(
|
|
1284
|
-
"NoAttemptId",
|
|
1285
|
-
"Attempt ID not set while resuming from paused state"
|
|
1286
|
-
);
|
|
1749
|
+
logger2.log("connected to coordinator", {
|
|
1750
|
+
status: this.#status,
|
|
1751
|
+
connectionCount: ++this.connectionCount
|
|
1752
|
+
});
|
|
1753
|
+
socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId });
|
|
1754
|
+
try {
|
|
1755
|
+
if (this.waitForPostStart) {
|
|
1756
|
+
logger2.log("skip connection handler, waiting for post start hook");
|
|
1287
1757
|
return;
|
|
1288
1758
|
}
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1759
|
+
if (this.paused) {
|
|
1760
|
+
if (!this.nextResumeAfter) {
|
|
1761
|
+
logger2.error("Missing next resume reason", { status: this.#status });
|
|
1762
|
+
this.#emitUnrecoverableError(
|
|
1763
|
+
"NoNextResume",
|
|
1764
|
+
"Next resume reason not set while resuming from paused state"
|
|
1765
|
+
);
|
|
1766
|
+
return;
|
|
1767
|
+
}
|
|
1768
|
+
if (!this.attemptFriendlyId) {
|
|
1769
|
+
logger2.error("Missing friendly ID", { status: this.#status });
|
|
1770
|
+
this.#emitUnrecoverableError(
|
|
1771
|
+
"NoAttemptId",
|
|
1772
|
+
"Attempt ID not set while resuming from paused state"
|
|
1773
|
+
);
|
|
1774
|
+
return;
|
|
1775
|
+
}
|
|
1776
|
+
socket.emit("READY_FOR_RESUME", {
|
|
1299
1777
|
version: "v1",
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
});
|
|
1303
|
-
};
|
|
1304
|
-
try {
|
|
1305
|
-
const taskResources = await this.#initializeWorker();
|
|
1306
|
-
const { success } = await socket.emitWithAck("INDEX_TASKS", {
|
|
1307
|
-
version: "v2",
|
|
1308
|
-
deploymentId: this.deploymentId,
|
|
1309
|
-
...taskResources,
|
|
1310
|
-
supportsLazyAttempts: true
|
|
1778
|
+
attemptFriendlyId: this.attemptFriendlyId,
|
|
1779
|
+
type: this.nextResumeAfter
|
|
1311
1780
|
});
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
const stderr = this.#backgroundWorker.stderr.join("\n");
|
|
1321
|
-
if (e instanceof TaskMetadataParseError) {
|
|
1322
|
-
logger2.error("tasks metadata parse error", {
|
|
1323
|
-
zodIssues: e.zodIssues,
|
|
1324
|
-
tasks: e.tasks
|
|
1325
|
-
});
|
|
1326
|
-
failIndex({
|
|
1327
|
-
name: "TaskMetadataParseError",
|
|
1328
|
-
message: "There was an error parsing the task metadata",
|
|
1329
|
-
stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
|
|
1330
|
-
stderr
|
|
1781
|
+
return;
|
|
1782
|
+
}
|
|
1783
|
+
if (process.env.INDEX_TASKS === "true") {
|
|
1784
|
+
const failIndex = (error) => {
|
|
1785
|
+
socket.emit("INDEXING_FAILED", {
|
|
1786
|
+
version: "v1",
|
|
1787
|
+
deploymentId: this.deploymentId,
|
|
1788
|
+
error
|
|
1331
1789
|
});
|
|
1332
|
-
}
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
stack: e.originalError.stack,
|
|
1337
|
-
stderr
|
|
1338
|
-
};
|
|
1339
|
-
logger2.error("uncaught exception", { originalError: error });
|
|
1340
|
-
failIndex(error);
|
|
1341
|
-
} else if (e instanceof Error) {
|
|
1342
|
-
const error = {
|
|
1343
|
-
name: e.name,
|
|
1344
|
-
message: e.message,
|
|
1345
|
-
stack: e.stack,
|
|
1346
|
-
stderr
|
|
1347
|
-
};
|
|
1348
|
-
logger2.error("error", { error });
|
|
1790
|
+
};
|
|
1791
|
+
process.removeAllListeners("uncaughtException");
|
|
1792
|
+
process.on("uncaughtException", (error) => {
|
|
1793
|
+
console.error("Uncaught exception while indexing", error);
|
|
1349
1794
|
failIndex(error);
|
|
1350
|
-
}
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
name: "Error",
|
|
1361
|
-
message: "Unknown error",
|
|
1362
|
-
stderr
|
|
1795
|
+
});
|
|
1796
|
+
try {
|
|
1797
|
+
const taskResources = await this.#initializeWorker();
+              const indexTasks = await defaultBackoff.maxRetries(3).execute(async () => {
+                return await socket.timeout(2e4).emitWithAck("INDEX_TASKS", {
+                  version: "v2",
+                  deploymentId: this.deploymentId,
+                  ...taskResources,
+                  supportsLazyAttempts: true
+                });
               });
+              if (!indexTasks.success || !indexTasks.result.success) {
+                logger2.error("indexing failure, shutting down..", { indexTasks });
+                process.exit(1);
+              } else {
+                logger2.info("indexing done, shutting down..");
+                process.exit(0);
+              }
+            } catch (e) {
+              const stderr = this.#backgroundWorker.stderr.join("\n");
+              if (e instanceof TaskMetadataParseError) {
+                logger2.error("tasks metadata parse error", {
+                  zodIssues: e.zodIssues,
+                  tasks: e.tasks
+                });
+                failIndex({
+                  name: "TaskMetadataParseError",
+                  message: "There was an error parsing the task metadata",
+                  stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
+                  stderr
+                });
+              } else if (e instanceof UncaughtExceptionError) {
+                const error = {
+                  name: e.originalError.name,
+                  message: e.originalError.message,
+                  stack: e.originalError.stack,
+                  stderr
+                };
+                logger2.error("uncaught exception", { originalError: error });
+                failIndex(error);
+              } else if (e instanceof Error) {
+                const error = {
+                  name: e.name,
+                  message: e.message,
+                  stack: e.stack,
+                  stderr
+                };
+                logger2.error("error", { error });
+                failIndex(error);
+              } else if (typeof e === "string") {
+                logger2.error("string error", { error: { message: e } });
+                failIndex({
+                  name: "Error",
+                  message: e,
+                  stderr
+                });
+              } else {
+                logger2.error("unknown error", { error: e });
+                failIndex({
+                  name: "Error",
+                  message: "Unknown error",
+                  stderr
+                });
+              }
+              await timeout2(1e3);
+              process.exit(EXIT_CODE_ALREADY_HANDLED);
             }
-            await setTimeout2(200);
-            process.exit(EXIT_CODE_ALREADY_HANDLED);
           }
+          if (this.executing) {
+            return;
+          }
+          process.removeAllListeners("uncaughtException");
+          process.on("uncaughtException", (error) => {
+            console.error("Uncaught exception during run", error);
+            this.#failRun(this.runId, error);
+          });
+          await this.#readyForLazyAttempt();
+        } catch (error) {
+          logger2.error("connection handler error", { error });
+        } finally {
+          if (this.connectionCount === 1) {
+            return;
+          }
+          this.#handleReplays();
         }
-        if (this.executing) {
-          return;
-        }
-        socket.emit("READY_FOR_LAZY_ATTEMPT", {
-          version: "v1",
-          runId: this.runId,
-          totalCompletions: this.completed.size
-        });
       },
       onError: async (socket, err, logger2) => {
         logger2.error("onError", {
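The indexing handshake above wraps the INDEX_TASKS acknowledgement (20 s socket timeout) in the backoff helper with a three-retry budget. A minimal sketch of that retry-with-timeout pattern, using only Node built-ins and illustrative names (retryWithBackoff, op) rather than the package's own defaultBackoff:

import { setTimeout as sleep } from "node:timers/promises";

// Retry an ack-style async call a bounded number of times, sleeping
// 1 s, 2 s, 4 s, ... between attempts; rethrows the last error once the
// retry budget is exhausted.
async function retryWithBackoff<T>(
  op: () => Promise<T>,
  maxRetries = 3,
  baseMs = 1_000
): Promise<T> {
  let lastError: unknown;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await op();
    } catch (error) {
      lastError = error;
      await sleep(baseMs * 2 ** attempt);
    }
  }
  throw lastError;
}

The ExponentialBackoff class added earlier in this diff layers jitter types, min/max clamps, and a RetryLimitExceeded error on top of the same idea.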
@@ -1382,13 +1885,109 @@ var ProdWorker = class {
             message: err.message
           }
         });
-        await this.#reconnect();
-      },
-      onDisconnect: async (socket, reason, description, logger2) => {
       }
     });
     return coordinatorConnection;
   }
+  // MARK: REPLAYS
+  async #handleReplays() {
+    const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
+    const replayCancellationDelay = 2e4;
+    if (this.waitForTaskReplay) {
+      logger.log("replaying wait for task", { ...this.waitForTaskReplay });
+      const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.waitForTaskReplay) {
+        logger.error("wait for task replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+        logger.error("wait for task replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.waitForTaskReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#waitForTaskHandler(message);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("wait for task replay retry limit exceeded", { error });
+        } else {
+          logger.error("wait for task replay error", { error });
+        }
+      }
+      return;
+    }
+    if (this.waitForBatchReplay) {
+      logger.log("replaying wait for batch", {
+        ...this.waitForBatchReplay,
+        cancellationDelay: replayCancellationDelay
+      });
+      const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.waitForBatchReplay) {
+        logger.error("wait for batch replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+        logger.error("wait for batch replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.waitForBatchReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#waitForBatchHandler(message);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("wait for batch replay retry limit exceeded", { error });
+        } else {
+          logger.error("wait for batch replay error", { error });
+        }
+      }
+      return;
+    }
+    if (this.submitAttemptCompletionReplay) {
+      logger.log("replaying attempt completion", {
+        ...this.submitAttemptCompletionReplay,
+        cancellationDelay: replayCancellationDelay
+      });
+      const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
+      await timeout2(replayCancellationDelay);
+      if (!this.submitAttemptCompletionReplay) {
+        logger.error("attempt completion replay cancelled, discarding", {
+          originalMessage: { idempotencyKey, message, attempt }
+        });
+        return;
+      }
+      if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
+        logger.error("attempt completion replay idempotency key mismatch, discarding", {
+          originalMessage: { idempotencyKey, message, attempt },
+          newMessage: this.submitAttemptCompletionReplay
+        });
+        return;
+      }
+      try {
+        await backoff.wait(attempt + 1);
+        await this.#submitAttemptCompletion(message.execution, message.completion, idempotencyKey);
+      } catch (error) {
+        if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+          logger.error("attempt completion replay retry limit exceeded", { error });
+        } else {
+          logger.error("attempt completion replay error", { error });
+        }
+      }
+      return;
+    }
+  }
+  // MARK: HTTP SERVER
   #createHttpServer() {
     const httpServer = createServer(async (req, res) => {
       logger.log(`[${req.method}]`, req.url);
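Each replay branch in #handleReplays above follows the same guard: snapshot the pending message and its idempotency key, wait out the 20 s cancellation window, and only replay when the same message is still pending afterwards. A stripped-down sketch of that guard, with hypothetical names (PendingReplay, replayIfStillPending) standing in for the worker's fields and handlers:

import { setTimeout as sleep } from "node:timers/promises";

type PendingReplay = { idempotencyKey: string; message: unknown; attempt: number };

// Replays a pending message only if it survives the cancellation window
// unchanged; a cleared or superseded replay is silently discarded.
async function replayIfStillPending(
  getPending: () => PendingReplay | undefined,
  handler: (message: unknown) => Promise<void>,
  cancellationDelayMs = 20_000
): Promise<void> {
  const snapshot = getPending();
  if (!snapshot) return;

  await sleep(cancellationDelayMs);

  const current = getPending();
  if (!current || current.idempotencyKey !== snapshot.idempotencyKey) {
    // Cancelled or replaced while we waited.
    return;
  }

  await handler(snapshot.message);
}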
@@ -1407,17 +2006,13 @@ var ProdWorker = class {
           return reply.text("Connected to coordinator");
         }
         case "/close": {
-          await this.#coordinatorSocket.sendWithAck("LOG", {
-            version: "v1",
-            text: `[${req.method}] ${req.url}`
-          });
           this.#coordinatorSocket.close();
+          this.connectionCount = 0;
           return reply.text("Disconnected from coordinator");
         }
         case "/test": {
-          await this.#coordinatorSocket.
-            version: "v1"
-            text: `[${req.method}] ${req.url}`
+          await this.#coordinatorSocket.socket.timeout(1e4).emitWithAck("TEST", {
+            version: "v1"
           });
           return reply.text("Received ACK from coordinator");
         }
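The /test route now uses socket.io's acknowledgement API: socket.timeout(ms).emitWithAck(event, payload) returns a promise that resolves with the ack payload or rejects when no ack arrives within the timeout. A small client-side sketch of that call, with a placeholder URL standing in for the coordinator endpoint:

import { io } from "socket.io-client";

// Placeholder endpoint; the worker connects to its coordinator instead.
const socket = io("http://localhost:8080");

async function pingCoordinator(): Promise<void> {
  try {
    // Rejects if the server does not acknowledge within 10 seconds.
    const ack = await socket.timeout(10_000).emitWithAck("TEST", { version: "v1" });
    console.log("received ack", ack);
  } catch (error) {
    console.error("no ack within 10s", error);
  }
}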
@@ -1452,7 +2047,7 @@ var ProdWorker = class {
           break;
         }
         case "restore": {
-          await this.#
+          await this.#reconnectAfterPostStart();
           break;
         }
         default: {
@@ -1483,7 +2078,7 @@ var ProdWorker = class {
       }
       logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
       this.#httpPort = getRandomPortNumber();
-      await
+      await timeout2(100);
       this.start();
     });
     return httpServer;
@@ -1493,8 +2088,12 @@ var ProdWorker = class {
     await this.#backgroundWorker.initialize({ env: envVars });
     let packageVersion;
     const taskResources = [];
-    if (!this.#backgroundWorker.tasks) {
-      throw new Error(
+    if (!this.#backgroundWorker.tasks || this.#backgroundWorker.tasks.length === 0) {
+      throw new Error(
+        `Background Worker started without tasks. Searched in: ${__PROJECT_CONFIG__.triggerDirectories?.join(
+          ", "
+        )}`
+      );
     }
     for (const task of this.#backgroundWorker.tasks) {
       taskResources.push(task);
@@ -1528,7 +2127,9 @@ var ProdWorker = class {
       completed: this.completed.size,
       nextResumeAfter: this.nextResumeAfter,
       waitForPostStart: this.waitForPostStart,
-      attemptFriendlyId: this.attemptFriendlyId
+      attemptFriendlyId: this.attemptFriendlyId,
+      waitForTaskReplay: this.waitForTaskReplay,
+      waitForBatchReplay: this.waitForBatchReplay
     };
   }
   #emitUnrecoverableError(name, message) {