trigger.dev 3.0.0-beta.44 → 3.0.0-beta.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Containerfile.prod +22 -9
- package/dist/index.js +292 -114
- package/dist/index.js.map +1 -1
- package/dist/workers/dev/worker-setup.js +1 -1
- package/dist/workers/prod/entry-point.js +980 -344
- package/dist/workers/prod/worker-setup.js +1 -1
- package/package.json +6 -7
@@ -3,7 +3,8 @@ import {
   CoordinatorToProdWorkerMessages,
   PostStartCauses,
   PreStopCauses,
-  ProdWorkerToCoordinatorMessages
+  ProdWorkerToCoordinatorMessages,
+  TaskRunErrorCodes as TaskRunErrorCodes2
 } from "@trigger.dev/core/v3";
 import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
 
@@ -65,9 +66,280 @@ var SimpleLogger = class {
   }
 };
 
-// src/
-
-
+// ../core-apps/src/process.ts
+var EXIT_CODE_ALREADY_HANDLED = 111;
+var EXIT_CODE_CHILD_NONZERO = 112;
+
+// ../core-apps/src/backoff.ts
+import { setTimeout as timeout } from "node:timers/promises";
+var StopRetrying = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "StopRetrying";
+  }
+};
+var AttemptTimeout = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "AttemptTimeout";
+  }
+};
+var RetryLimitExceeded = class extends Error {
+  constructor(message) {
+    super(message);
+    this.name = "RetryLimitExceeded";
+  }
+};
+var ExponentialBackoff = class _ExponentialBackoff {
+  #retries = 0;
+  #type;
+  #base;
+  #factor;
+  #min;
+  #max;
+  #maxRetries;
+  #maxElapsed;
+  constructor(type, opts = {}) {
+    this.#type = type ?? "NoJitter";
+    this.#base = opts.base ?? 2;
+    this.#factor = opts.factor ?? 1;
+    this.#min = opts.min ?? -Infinity;
+    this.#max = opts.max ?? Infinity;
+    this.#maxRetries = opts.maxRetries ?? Infinity;
+    this.#maxElapsed = opts.maxElapsed ?? Infinity;
+  }
+  #clone(type, opts = {}) {
+    return new _ExponentialBackoff(type ?? this.#type, {
+      base: opts.base ?? this.#base,
+      factor: opts.factor ?? this.#factor,
+      min: opts.min ?? this.#min,
+      max: opts.max ?? this.#max,
+      maxRetries: opts.maxRetries ?? this.#maxRetries,
+      maxElapsed: opts.maxElapsed ?? this.#maxElapsed
+    });
+  }
+  type(type) {
+    return this.#clone(type);
+  }
+  base(base) {
+    return this.#clone(void 0, { base });
+  }
+  factor(factor) {
+    return this.#clone(void 0, { factor });
+  }
+  min(min) {
+    return this.#clone(void 0, { min });
+  }
+  max(max) {
+    return this.#clone(void 0, { max });
+  }
+  maxRetries(maxRetries) {
+    return this.#clone(void 0, { maxRetries });
+  }
+  // TODO: With .execute(), should this also include the time it takes to execute the callback?
+  maxElapsed(maxElapsed) {
+    return this.#clone(void 0, { maxElapsed });
+  }
+  retries(retries) {
+    if (typeof retries !== "undefined") {
+      if (retries > this.#maxRetries) {
+        console.error(
+          `Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+        );
+        this.#retries = this.#maxRetries;
+      } else {
+        this.#retries = retries;
+      }
+    }
+    return this.#clone();
+  }
+  async *retryAsync(maxRetries = this.#maxRetries ?? Infinity) {
+    let elapsed = 0;
+    let retry = 0;
+    while (retry <= maxRetries) {
+      const delay = this.delay(retry);
+      elapsed += delay;
+      if (elapsed > this.#maxElapsed) {
+        break;
+      }
+      yield {
+        delay: {
+          seconds: delay,
+          milliseconds: delay * 1e3
+        },
+        retry
+      };
+      retry++;
+    }
+  }
+  async *[Symbol.asyncIterator]() {
+    yield* this.retryAsync();
+  }
+  /** Returns the delay for the current retry in seconds. */
+  delay(retries = this.#retries, jitter = true) {
+    if (retries > this.#maxRetries) {
+      console.error(
+        `Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+      );
+      retries = this.#maxRetries;
+    }
+    let delay = this.#factor * this.#base ** retries;
+    switch (this.#type) {
+      case "NoJitter": {
+        break;
+      }
+      case "FullJitter": {
+        if (!jitter) {
+          delay = 0;
+          break;
+        }
+        delay *= Math.random();
+        break;
+      }
+      case "EqualJitter": {
+        if (!jitter) {
+          delay *= 0.5;
+          break;
+        }
+        delay *= 0.5 * (1 + Math.random());
+        break;
+      }
+      default: {
+        throw new Error(`Unknown backoff type: ${this.#type}`);
+      }
+    }
+    if (delay < this.#min) {
+      delay = this.#min + Math.random() * (this.#min * 0.2);
+    }
+    if (delay > this.#max) {
+      delay = this.#max - Math.random() * (this.#max * 0.2);
+    }
+    delay = Math.round(delay);
+    return delay;
+  }
+  /** Waits with the appropriate delay for the current retry. */
+  async wait(retries = this.#retries, jitter = true) {
+    if (retries > this.#maxRetries) {
+      console.error(`Retry limit exceeded: ${retries} > ${this.#maxRetries}`);
+      throw new RetryLimitExceeded();
+    }
+    const delay = this.delay(retries, jitter);
+    return await timeout(delay * 1e3);
+  }
+  elapsed(retries = this.#retries, jitter = true) {
+    let elapsed = 0;
+    for (let i = 0; i <= retries; i++) {
+      elapsed += this.delay(i, jitter);
+    }
+    const total = elapsed;
+    let days = 0;
+    if (elapsed > 3600 * 24) {
+      days = Math.floor(elapsed / 3600 / 24);
+      elapsed -= days * 3600 * 24;
+    }
+    let hours = 0;
+    if (elapsed > 3600) {
+      hours = Math.floor(elapsed / 3600);
+      elapsed -= hours * 3600;
+    }
+    let minutes = 0;
+    if (elapsed > 60) {
+      minutes = Math.floor(elapsed / 60);
+      elapsed -= minutes * 60;
+    }
+    const seconds = elapsed;
+    return {
+      seconds,
+      minutes,
+      hours,
+      days,
+      total
+    };
+  }
+  reset() {
+    this.#retries = 0;
+    return this;
+  }
+  next() {
+    this.#retries++;
+    return this.delay();
+  }
+  stop() {
+    throw new StopRetrying();
+  }
+  get state() {
+    return {
+      retries: this.#retries,
+      type: this.#type,
+      base: this.#base,
+      factor: this.#factor,
+      min: this.#min,
+      max: this.#max,
+      maxRetries: this.#maxRetries,
+      maxElapsed: this.#maxElapsed
+    };
+  }
+  async execute(callback, { attemptTimeoutMs = 0 } = {}) {
+    let elapsedMs = 0;
+    let finalError = void 0;
+    for await (const { delay, retry } of this) {
+      const start = Date.now();
+      if (retry > 0) {
+        console.log(`Retrying in ${delay.milliseconds}ms`);
+        await timeout(delay.milliseconds);
+      }
+      let attemptTimeout = void 0;
+      try {
+        const result = await new Promise(async (resolve, reject) => {
+          if (attemptTimeoutMs > 0) {
+            attemptTimeout = setTimeout(() => {
+              reject(new AttemptTimeout());
+            }, attemptTimeoutMs);
+          }
+          try {
+            const callbackResult = await callback({ delay, retry, elapsedMs });
+            resolve(callbackResult);
+          } catch (error) {
+            reject(error);
+          }
+        });
+        return {
+          success: true,
+          result
+        };
+      } catch (error) {
+        finalError = error;
+        if (error instanceof StopRetrying) {
+          return {
+            success: false,
+            cause: "StopRetrying",
+            error: error.message
+          };
+        }
+        if (error instanceof AttemptTimeout) {
+          continue;
+        }
+      } finally {
+        elapsedMs += Date.now() - start;
+        clearTimeout(attemptTimeout);
+      }
+    }
+    if (finalError instanceof AttemptTimeout) {
+      return {
+        success: false,
+        cause: "Timeout"
+      };
+    } else {
+      return {
+        success: false,
+        cause: "MaxRetries",
+        error: finalError
+      };
+    }
+  }
+  static RetryLimitExceeded = RetryLimitExceeded;
+  static StopRetrying = StopRetrying;
+};
 
 // src/workers/prod/backgroundWorker.ts
 import {
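Note: the `ExponentialBackoff` helper added above underpins most of the changes that follow; coordinator acknowledgements are now wrapped in `defaultBackoff.execute(...)`. A minimal usage sketch, based only on the code in this hunk (`someFlakyCall` is a hypothetical placeholder, and the call must run inside an async function):

// Illustrative sketch of the builder-style API introduced above.
const backoff = new ExponentialBackoff("FullJitter", { maxRetries: 5 });

// execute() resolves to { success: true, result } on the first attempt that succeeds,
// or { success: false, cause: "StopRetrying" | "Timeout" | "MaxRetries", ... } otherwise.
const outcome = await backoff.execute(async ({ retry, elapsedMs }) => {
  if (retry > 0) console.log(`retry #${retry} after ${elapsedMs}ms`);
  return await someFlakyCall(); // hypothetical network call
});

if (!outcome.success) {
  console.error(`gave up: ${outcome.cause}`, outcome.error);
}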
@@ -99,9 +371,11 @@ var TaskMetadataParseError = class extends Error {
   }
 };
 var UnexpectedExitError = class extends Error {
-  constructor(code) {
+  constructor(code, signal, stderr) {
     super(`Unexpected exit with code ${code}`);
     this.code = code;
+    this.signal = signal;
+    this.stderr = stderr;
     this.name = "UnexpectedExitError";
   }
 };
@@ -129,6 +403,32 @@ var GracefulExitTimeoutError = class extends Error {
     this.name = "GracefulExitTimeoutError";
   }
 };
+function getFriendlyErrorMessage(code, signal, stderr, dockerMode = true) {
+  const message = (text) => {
+    if (signal) {
+      return `[${signal}] ${text}`;
+    } else {
+      return text;
+    }
+  };
+  if (code === 137) {
+    if (dockerMode) {
+      return message(
+        "Process ran out of memory! Try choosing a machine preset with more memory for this task."
+      );
+    } else {
+      return message(
+        "Process most likely ran out of memory, but we can't be certain. Try choosing a machine preset with more memory for this task."
+      );
+    }
+  }
+  if (stderr?.includes("OOMErrorHandler")) {
+    return message(
+      "Process ran out of memory! Try choosing a machine preset with more memory for this task."
+    );
+  }
+  return message(`Process exited with code ${code}.`);
+}
 
 // src/workers/prod/backgroundWorker.ts
 var ProdBackgroundWorker = class {
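For context on the `getFriendlyErrorMessage` helper: exit code 137 is 128 + 9, i.e. the child was killed with SIGKILL, which in containerised (docker-mode) deployments almost always means the OOM killer. Expected outputs, derived directly from the code above:

// Container OOM kill: SIGKILL => 128 + 9 = 137.
getFriendlyErrorMessage(137, "SIGKILL", undefined);
// => "[SIGKILL] Process ran out of memory! Try choosing a machine preset with more memory for this task."

// Outside docker mode the helper hedges instead of asserting an OOM.
getFriendlyErrorMessage(137, null, undefined, false);
// => "Process most likely ran out of memory, but we can't be certain. Try choosing a machine preset with more memory for this task."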
@@ -142,13 +442,9 @@ var ProdBackgroundWorker = class {
   */
   onTaskHeartbeat = new Evt();
   onTaskRunHeartbeat = new Evt();
-  onWaitForBatch = new Evt();
   onWaitForDuration = new Evt();
   onWaitForTask = new Evt();
-
-  checkpointCanceledNotification = Evt.create();
-  onReadyForCheckpoint = Evt.create();
-  onCancelCheckpoint = Evt.create();
+  onWaitForBatch = new Evt();
   onCreateTaskRunAttempt = Evt.create();
   attemptCreatedNotification = Evt.create();
   _onClose = new Evt();
@@ -186,7 +482,10 @@ var ProdBackgroundWorker = class {
     this._closed = true;
   }
   async flushTelemetry() {
+    console.log("Flushing telemetry");
+    const start = performance.now();
     await this._taskRunProcess?.cleanup(false);
+    console.log("Flushed telemetry", { duration: performance.now() - start });
   }
   async initialize(options) {
     if (this._initialized) {
@@ -209,7 +508,7 @@ var ProdBackgroundWorker = class {
         ...options?.env
       }
     });
-    const
+    const timeout3 = setTimeout(() => {
      if (resolved) {
        return;
      }
@@ -226,7 +525,7 @@ var ProdBackgroundWorker = class {
    });
    child.on("exit", (code) => {
      if (!resolved) {
-        clearTimeout(
+        clearTimeout(timeout3);
        resolved = true;
        reject(new Error(`Worker exited with code ${code}`));
      }
@@ -238,7 +537,7 @@ var ProdBackgroundWorker = class {
      handlers: {
        TASKS_READY: async (message) => {
          if (!resolved) {
-            clearTimeout(
+            clearTimeout(timeout3);
            resolved = true;
            resolve(message.tasks);
            child.kill();
@@ -246,7 +545,7 @@ var ProdBackgroundWorker = class {
        },
        UNCAUGHT_EXCEPTION: async (message) => {
          if (!resolved) {
-            clearTimeout(
+            clearTimeout(timeout3);
            resolved = true;
            reject(new UncaughtExceptionError(message.error, message.origin));
            child.kill();
@@ -254,7 +553,7 @@ var ProdBackgroundWorker = class {
        },
        TASKS_FAILED_TO_PARSE: async (message) => {
          if (!resolved) {
-            clearTimeout(
+            clearTimeout(timeout3);
            resolved = true;
            reject(new TaskMetadataParseError(message.zodIssues, message.tasks));
            child.kill();
@@ -331,18 +630,6 @@ var ProdBackgroundWorker = class {
    taskRunProcess.onWaitForTask.attach((message) => {
      this.onWaitForTask.post(message);
    });
-    taskRunProcess.onReadyForCheckpoint.attach((message) => {
-      this.onReadyForCheckpoint.post(message);
-    });
-    taskRunProcess.onCancelCheckpoint.attach((message) => {
-      this.onCancelCheckpoint.post(message);
-    });
-    this.preCheckpointNotification.attach((message) => {
-      taskRunProcess.preCheckpointNotification.post(message);
-    });
-    this.checkpointCanceledNotification.attach((message) => {
-      taskRunProcess.checkpointCanceledNotification.post(message);
-    });
    await taskRunProcess.initialize();
    this._taskRunProcess = taskRunProcess;
    return this._taskRunProcess;
@@ -383,6 +670,7 @@ var ProdBackgroundWorker = class {
    }
  }
  async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
+    console.log("Trying graceful exit", { kill, initialSignal });
    try {
      const initialExit = taskRunProcess.onExit.waitFor(5e3);
      if (kill) {
@@ -395,6 +683,7 @@ var ProdBackgroundWorker = class {
    }
  }
  async #tryForcefulExit(taskRunProcess) {
+    console.log("Trying forceful exit");
    try {
      const forcedKill = taskRunProcess.onExit.waitFor(5e3);
      taskRunProcess.kill("SIGKILL");
@@ -455,7 +744,9 @@ var ProdBackgroundWorker = class {
        retry: void 0,
        error: {
          type: "INTERNAL_ERROR",
-          code: TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE
+          code: TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE,
+          message: getFriendlyErrorMessage(e.code, e.signal, e.stderr),
+          stackTrace: e.stderr
        }
      };
    }
@@ -506,16 +797,19 @@ var ProdBackgroundWorker = class {
    this.onCreateTaskRunAttempt.post({ runId: payload.runId });
    let execution;
    try {
-      const
+      const start = performance.now();
+      const attemptCreated = await this.attemptCreatedNotification.waitFor(12e4);
      if (!attemptCreated.success) {
-        throw new Error(
-          `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
-        );
+        throw new Error(`${attemptCreated.reason ?? "Unknown error"}`);
      }
+      console.log("Attempt created", {
+        number: attemptCreated.execution.attempt.number,
+        duration: performance.now() - start
+      });
      execution = attemptCreated.execution;
    } catch (error) {
      console.error("Error while creating attempt", error);
-      throw new Error(`Failed to create
+      throw new Error(`Failed to create attempt: ${error}`);
    }
    const completion = await this.executeTaskRun(
      {
@@ -553,6 +847,7 @@ var TaskRunProcess = class {
  _isBeingKilled = false;
  _isBeingCancelled = false;
  _gracefulExitTimeoutElapsed = false;
+  _stderr = [];
  /**
   * @deprecated use onTaskRunHeartbeat instead
   */
@@ -560,13 +855,10 @@ var TaskRunProcess = class {
  onTaskRunHeartbeat = new Evt();
  onExit = new Evt();
  onIsBeingKilled = new Evt();
-  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
+  onWaitForBatch = new Evt();
  preCheckpointNotification = Evt.create();
-  checkpointCanceledNotification = Evt.create();
-  onReadyForCheckpoint = Evt.create();
-  onCancelCheckpoint = Evt.create();
  async initialize() {
    this._child = fork(this.path, {
      stdio: [
@@ -614,6 +906,10 @@ var TaskRunProcess = class {
        if (this.messageId) {
          this.onTaskRunHeartbeat.post(this.messageId);
        } else {
+          console.error(
+            "No message id for task heartbeat, falling back to (deprecated) attempt heartbeat",
+            { id: message.id }
+          );
          this.onTaskHeartbeat.post(message.id);
        }
      },
@@ -627,41 +923,6 @@ var TaskRunProcess = class {
      },
      WAIT_FOR_DURATION: async (message) => {
        this.onWaitForDuration.post(message);
-        try {
-          const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor(
-            3e4
-          );
-          return {
-            willCheckpointAndRestore
-          };
-        } catch (error) {
-          console.error("Error while waiting for pre-checkpoint notification", error);
-          return {
-            willCheckpointAndRestore: false
-          };
-        }
-      },
-      READY_FOR_CHECKPOINT: async (message) => {
-        this.onReadyForCheckpoint.post(message);
-      },
-      CANCEL_CHECKPOINT: async (message) => {
-        const version = "v2";
-        this.onCancelCheckpoint.post(message);
-        try {
-          const { checkpointCanceled } = await this.checkpointCanceledNotification.waitFor(
-            3e4
-          );
-          return {
-            version,
-            checkpointCanceled
-          };
-        } catch (error) {
-          console.error("Error while waiting for checkpoint cancellation", error);
-          return {
-            version,
-            checkpointCanceled: true
-          };
-        }
      }
    }
  });
@@ -691,14 +952,21 @@ var TaskRunProcess = class {
      childPid: this._childPid,
      realChildPid: this._child?.pid
    });
-
-
-
-
-
-
-
-
+    try {
+      await this._ipc?.sendWithAck(
+        "CLEANUP",
+        {
+          flush: true,
+          kill: killParentProcess
+        },
+        3e4
+      );
+    } catch (error) {
+      console.error("Error while cleaning up task run process", error);
+      if (killParentProcess) {
+        process.exit(0);
+      }
+    }
    if (killChildProcess) {
      this._gracefulExitTimeoutElapsed = true;
      await this.kill("SIGKILL");
@@ -728,19 +996,30 @@ var TaskRunProcess = class {
  }
  taskRunCompletedNotification(completion) {
    if (!completion.ok && typeof completion.retry !== "undefined") {
+      console.error(
+        "Task run completed with error and wants to retry, won't send task run completed notification"
+      );
      return;
    }
-    if (this._child?.connected
-
-
-
-
+    if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+      console.error(
+        "Child process not connected or being killed, can't send task run completed notification"
+      );
+      return;
    }
+    this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
+      version: "v2",
+      completion
+    });
  }
  waitCompletedNotification() {
-    if (this._child?.connected
-
+    if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+      console.error(
+        "Child process not connected or being killed, can't send wait completed notification"
+      );
+      return;
    }
+    this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
  }
  async #handleExit(code, signal) {
    console.log("handling child exit", { code, signal });
@@ -760,7 +1039,13 @@ var TaskRunProcess = class {
      } else if (this._isBeingKilled) {
        rejecter(new CleanupProcessError());
      } else {
-        rejecter(
+        rejecter(
+          new UnexpectedExitError(
+            code ?? -1,
+            signal,
+            this._stderr.length ? this._stderr.join("\n") : void 0
+          )
+        );
      }
    }
  }
@@ -770,7 +1055,12 @@ var TaskRunProcess = class {
    console.log(data.toString());
  }
  #handleStdErr(data) {
-
+    const text = data.toString();
+    console.error(text);
+    if (this._stderr.length > 100) {
+      this._stderr.shift();
+    }
+    this._stderr.push(text);
  }
  async kill(signal, timeoutInMs) {
    this._isBeingKilled = true;
@@ -790,7 +1080,11 @@ var TaskRunProcess = class {
 };
 
 // src/workers/prod/entry-point.ts
-import {
+import { checkpointSafeTimeout, unboundedTimeout } from "@trigger.dev/core/v3/utils/timers";
+import { randomUUID } from "node:crypto";
+import { readFile } from "node:fs/promises";
+import { createServer } from "node:http";
+import { setTimeout as timeout2 } from "node:timers/promises";
 var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
 var COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
 var COORDINATOR_PORT = Number(process.env.COORDINATOR_PORT || 50080);
@@ -798,6 +1092,9 @@ var MACHINE_NAME = process.env.MACHINE_NAME || "local";
 var POD_NAME = process.env.POD_NAME || "some-pod";
 var SHORT_HASH = process.env.TRIGGER_CONTENT_HASH.slice(0, 9);
 var logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
+var defaultBackoff = new ExponentialBackoff("FullJitter", {
+  maxRetries: 5
+});
 var ProdWorker = class {
   constructor(port, host = "0.0.0.0") {
     this.host = host;
@@ -822,6 +1119,12 @@ var ProdWorker = class {
  attemptFriendlyId;
  nextResumeAfter;
  waitForPostStart = false;
+  connectionCount = 0;
+  waitForTaskReplay;
+  waitForBatchReplay;
+  readyForLazyAttemptReplay;
+  submitAttemptCompletionReplay;
+  durationResumeFallback;
  #httpPort;
  #backgroundWorker;
  #httpServer;
@@ -835,7 +1138,7 @@ var ProdWorker = class {
      logger.log("Waiting for attempt to complete before exiting", {
        terminationGracePeriodSeconds
      });
-      await
+      await timeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
      gracefulExitTimeoutElapsed = true;
      logger.log("Termination timeout reached, exiting gracefully.");
    } else {
@@ -846,20 +1149,16 @@ var ProdWorker = class {
    }
    logger.log("Unhandled signal", { signal });
  }
-  async #exitGracefully(gracefulExitTimeoutElapsed = false) {
+  async #exitGracefully(gracefulExitTimeoutElapsed = false, exitCode = 0) {
    await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
    if (!gracefulExitTimeoutElapsed) {
-      process.exit(
+      process.exit(exitCode);
    }
  }
-  async #
-
-    this.waitForPostStart = false;
-  }
+  async #reconnectAfterPostStart() {
+    this.waitForPostStart = false;
    this.#coordinatorSocket.close();
-
-    await setTimeout2(1e3);
-  }
+    this.connectionCount = 0;
    let coordinatorHost = COORDINATOR_HOST;
    try {
      if (this.runningInKubernetes) {
@@ -883,6 +1182,98 @@ var ProdWorker = class {
    this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
  }
  }
+  // MARK: TASK WAIT
+  async #waitForTaskHandler(message, replayIdempotencyKey) {
+    const waitForTask = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Wait for task with backoff", { retry });
+      if (!this.attemptFriendlyId) {
+        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+        throw new ExponentialBackoff.StopRetrying("No attempt ID");
+      }
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_TASK", {
+        version: "v2",
+        friendlyId: message.friendlyId,
+        attemptFriendlyId: this.attemptFriendlyId
+      });
+    });
+    if (!waitForTask.success) {
+      logger.error("Failed to wait for task with backoff", {
+        cause: waitForTask.cause,
+        error: waitForTask.error
+      });
+      this.#emitUnrecoverableError(
+        "WaitForTaskFailed",
+        `${waitForTask.cause}: ${waitForTask.error}`
+      );
+      return;
+    }
+    const { willCheckpointAndRestore } = waitForTask.result;
+    await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
+    if (willCheckpointAndRestore) {
+      if (!this.waitForTaskReplay) {
+        this.waitForTaskReplay = {
+          message,
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+          logger.error(
+            "wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.waitForTaskReplay.attempt++;
+      }
+    }
+  }
+  // MARK: BATCH WAIT
+  async #waitForBatchHandler(message, replayIdempotencyKey) {
+    const waitForBatch = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Wait for batch with backoff", { retry });
+      if (!this.attemptFriendlyId) {
+        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+        throw new ExponentialBackoff.StopRetrying("No attempt ID");
+      }
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_BATCH", {
+        version: "v2",
+        batchFriendlyId: message.batchFriendlyId,
+        runFriendlyIds: message.runFriendlyIds,
+        attemptFriendlyId: this.attemptFriendlyId
+      });
+    });
+    if (!waitForBatch.success) {
+      logger.error("Failed to wait for batch with backoff", {
+        cause: waitForBatch.cause,
+        error: waitForBatch.error
+      });
+      this.#emitUnrecoverableError(
+        "WaitForBatchFailed",
+        `${waitForBatch.cause}: ${waitForBatch.error}`
+      );
+      return;
+    }
+    const { willCheckpointAndRestore } = waitForBatch.result;
+    await this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
+    if (willCheckpointAndRestore) {
+      if (!this.waitForBatchReplay) {
+        this.waitForBatchReplay = {
+          message,
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+          logger.error(
+            "wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.waitForBatchReplay.attempt++;
+      }
+    }
+  }
+  // MARK: WORKER CREATION
  #createBackgroundWorker() {
    const backgroundWorker = new ProdBackgroundWorker("worker.js", {
      projectConfig: __PROJECT_CONFIG__,
@@ -895,148 +1286,187 @@ var ProdWorker = class {
      contentHash: this.contentHash
    });
    backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
-
+      logger.log("onTaskHeartbeat", { attemptFriendlyId });
+      this.#coordinatorSocket.socket.volatile.emit("TASK_HEARTBEAT", {
+        version: "v1",
+        attemptFriendlyId
+      });
    });
    backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
-
-
-    backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
-      await this.#prepareForCheckpoint();
-      this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
-    });
-    backgroundWorker.onCancelCheckpoint.attach(async (message) => {
-      logger.log("onCancelCheckpoint", { message });
-      const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
-        "CANCEL_CHECKPOINT",
-        {
-          version: "v2",
-          reason: message.reason
-        }
-      );
-      logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
-      if (checkpointCanceled) {
-        if (message.reason === "WAIT_FOR_DURATION") {
-          this.paused = false;
-          this.nextResumeAfter = void 0;
-          this.waitForPostStart = false;
-        }
-      }
-      backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+      logger.log("onTaskRunHeartbeat", { runId });
+      this.#coordinatorSocket.socket.volatile.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
    });
    backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
      logger.log("onCreateTaskRunAttempt()", { message });
-      const createAttempt = await
-        "
-        {
+      const createAttempt = await defaultBackoff.execute(async ({ retry }) => {
+        logger.log("Create task run attempt with backoff", { retry });
+        return await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CREATE_TASK_RUN_ATTEMPT", {
          version: "v1",
          runId: message.runId
-        }
-      );
+        });
+      });
      if (!createAttempt.success) {
        backgroundWorker.attemptCreatedNotification.post({
          success: false,
-          reason: createAttempt.
+          reason: `Failed to create attempt with backoff due to ${createAttempt.cause}. ${createAttempt.error}`
+        });
+        return;
+      }
+      if (!createAttempt.result.success) {
+        backgroundWorker.attemptCreatedNotification.post({
+          success: false,
+          reason: createAttempt.result.reason
        });
        return;
      }
      backgroundWorker.attemptCreatedNotification.post({
        success: true,
-        execution: createAttempt.executionPayload.execution
+        execution: createAttempt.result.executionPayload.execution
      });
    });
    backgroundWorker.attemptCreatedNotification.attach((message) => {
+      logger.log("attemptCreatedNotification", {
+        success: message.success,
+        ...message.success ? {
+          attempt: message.execution.attempt,
+          queue: message.execution.queue,
+          worker: message.execution.worker,
+          machine: message.execution.machine
+        } : {
+          reason: message.reason
+        }
+      });
      if (!message.success) {
        return;
      }
      this.attemptFriendlyId = message.execution.attempt.id;
    });
    backgroundWorker.onWaitForDuration.attach(async (message) => {
-
-
-
-
-
-      )
-
-
-      const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-        "WAIT_FOR_DURATION",
-        {
-          ...message,
-          attemptFriendlyId: this.attemptFriendlyId
+      logger.log("onWaitForDuration", { ...message, drift: Date.now() - message.now });
+      noResume: {
+        const { ms, waitThresholdInMs } = message;
+        const internalTimeout = unboundedTimeout(ms, "internal");
+        const checkpointSafeInternalTimeout = checkpointSafeTimeout(ms);
+        if (ms < waitThresholdInMs) {
+          await internalTimeout;
+          break noResume;
        }
-
-
-
-
-
-
-
-
-
-
-
-        {
-
-
+        const waitForDuration = await defaultBackoff.execute(async ({ retry }) => {
+          logger.log("Wait for duration with backoff", { retry });
+          if (!this.attemptFriendlyId) {
+            logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+            throw new ExponentialBackoff.StopRetrying("No attempt ID");
+          }
+          return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_DURATION", {
+            ...message,
+            attemptFriendlyId: this.attemptFriendlyId
+          });
+        });
+        if (!waitForDuration.success) {
+          logger.error("Failed to wait for duration with backoff", {
+            cause: waitForDuration.cause,
+            error: waitForDuration.error
+          });
+          this.#emitUnrecoverableError(
+            "WaitForDurationFailed",
+            `${waitForDuration.cause}: ${waitForDuration.error}`
+          );
+          return;
+        }
+        const { willCheckpointAndRestore } = waitForDuration.result;
+        if (!willCheckpointAndRestore) {
+          await internalTimeout;
+          break noResume;
+        }
+        await this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
+        await Promise.race([internalTimeout, checkpointSafeInternalTimeout]);
+        try {
+          const { checkpointCanceled } = await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CANCEL_CHECKPOINT", {
+            version: "v2",
+            reason: "WAIT_FOR_DURATION"
+          });
+          logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
+          if (checkpointCanceled) {
+            break noResume;
+          }
+          logger.log("Waiting for external duration resume as we may have been restored");
+          const idempotencyKey = randomUUID();
+          this.durationResumeFallback = { idempotencyKey };
+          setTimeout(() => {
+            if (!this.durationResumeFallback) {
+              logger.error("Already resumed after duration, skipping fallback");
+              return;
+            }
+            if (this.durationResumeFallback.idempotencyKey !== idempotencyKey) {
+              logger.error("Duration resume idempotency key mismatch, skipping fallback");
+              return;
+            }
+            logger.log("Resuming after duration with fallback");
+            this.#resumeAfterDuration();
+          }, 15e3);
+        } catch (error) {
+          logger.debug("Checkpoint cancellation timed out", { error });
+          break noResume;
        }
-      );
-      this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
-    });
-    backgroundWorker.onWaitForBatch.attach(async (message) => {
-      if (!this.attemptFriendlyId) {
-        logger.error("Failed to send wait message, attempt friendly ID not set", { message });
-        this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
        return;
      }
-
-        "WAIT_FOR_BATCH",
-        {
-          ...message,
-          attemptFriendlyId: this.attemptFriendlyId
-        }
-      );
-      this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
+      this.#resumeAfterDuration();
    });
+    backgroundWorker.onWaitForTask.attach(this.#waitForTaskHandler.bind(this));
+    backgroundWorker.onWaitForBatch.attach(this.#waitForBatchHandler.bind(this));
    return backgroundWorker;
  }
  async #prepareForWait(reason, willCheckpointAndRestore) {
    logger.log(`prepare for ${reason}`, { willCheckpointAndRestore });
-
-
-    this.paused = true;
-    this.nextResumeAfter = reason;
-    this.waitForPostStart = true;
-    if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
-      await this.#prepareForCheckpoint();
-    }
+    if (!willCheckpointAndRestore) {
+      return;
    }
+    this.paused = true;
+    this.nextResumeAfter = reason;
+    this.waitForPostStart = true;
+    await this.#prepareForCheckpoint();
  }
-
-
+  // MARK: RETRY PREP
+  async #prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode) {
+    logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit, exitCode });
    if (shouldExit) {
      if (willCheckpointAndRestore) {
-        logger.
+        logger.error("WARNING: Will checkpoint but also requested exit. This won't end well.");
      }
-      await this.#exitGracefully();
+      await this.#exitGracefully(false, exitCode);
      return;
    }
    this.paused = false;
    this.waitForPostStart = false;
    this.executing = false;
    this.attemptFriendlyId = void 0;
-    if (willCheckpointAndRestore) {
-      this.waitForPostStart = true;
-      this.#prepareForCheckpoint(false);
-      this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
+    if (!willCheckpointAndRestore) {
      return;
    }
+    this.waitForPostStart = true;
+    await this.#prepareForCheckpoint(false);
  }
+  // MARK: CHECKPOINT PREP
  async #prepareForCheckpoint(flush = true) {
    if (flush) {
-
+      try {
+        await this.#backgroundWorker.flushTelemetry();
+      } catch (error) {
+        logger.error(
+          "Failed to flush telemetry while preparing for checkpoint, will proceed anyway",
+          { error }
+        );
+      }
+    }
+    try {
+      await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+    } catch (error) {
+      logger.error(
+        "Failed to kill previous worker while preparing for checkpoint, will proceed anyway",
+        { error }
+      );
    }
-
+    this.#readyForCheckpoint();
  }
  #resumeAfterDuration() {
    this.paused = false;
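The rewritten WAIT_FOR_DURATION handler above leans on a JavaScript labelled block: every early-out path (short waits, no checkpoint, cancelled checkpoint, cancellation timeout) does `break noResume;` to fall through to `this.#resumeAfterDuration()`, while the restored-from-checkpoint path returns and leaves resumption to the coordinator or the fallback timer. A stripped-down sketch of that control flow (names are simplified placeholders, not the actual handler):

// Control-flow sketch of the labelled-block pattern used above.
async function waitForDuration(ms, thresholdMs) {
  noResume: {
    if (ms < thresholdMs) {
      await sleep(ms);            // hypothetical timer helper
      break noResume;             // jump straight past the rest of the block
    }
    const { willCheckpointAndRestore } = await askCoordinator(); // hypothetical ack
    if (!willCheckpointAndRestore) {
      await sleep(ms);
      break noResume;
    }
    // Checkpoint path: if the checkpoint is cancelled we break out; otherwise we
    // return and let an external resume (or a fallback timer) take over.
    return;
  }
  resumeLocally(); // every `break noResume` lands here
}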
@@ -1044,6 +1474,106 @@ var ProdWorker = class {
    this.waitForPostStart = false;
    this.#backgroundWorker.waitCompletedNotification();
  }
+  async #readyForLazyAttempt() {
+    const idempotencyKey = randomUUID();
+    this.readyForLazyAttemptReplay = {
+      idempotencyKey
+    };
+    for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(3)) {
+      if (retry > 0) {
+        logger.log("retrying ready for lazy attempt", { retry });
+      }
+      this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
+        version: "v1",
+        runId: this.runId,
+        totalCompletions: this.completed.size
+      });
+      await timeout2(delay.milliseconds);
+      if (!this.readyForLazyAttemptReplay) {
+        logger.error("replay ready for lazy attempt cancelled, discarding", {
+          idempotencyKey
+        });
+        return;
+      }
+      if (idempotencyKey !== this.readyForLazyAttemptReplay.idempotencyKey) {
+        logger.error("replay ready for lazy attempt idempotency key mismatch, discarding", {
+          idempotencyKey,
+          newIdempotencyKey: this.readyForLazyAttemptReplay.idempotencyKey
+        });
+        return;
+      }
+    }
+    this.#failRun(this.runId, "Failed to receive execute request in a reasonable time");
+  }
+  #readyForCheckpoint() {
+    this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
+  }
+  #failRun(anyRunId, error) {
+    logger.error("Failing run", { anyRunId, error });
+    const completion = {
+      ok: false,
+      id: anyRunId,
+      retry: void 0,
+      error: error instanceof Error ? {
+        type: "BUILT_IN_ERROR",
+        name: error.name,
+        message: error.message,
+        stackTrace: error.stack ?? ""
+      } : {
+        type: "BUILT_IN_ERROR",
+        name: "UnknownError",
+        message: String(error),
+        stackTrace: ""
+      }
+    };
+    this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
+      version: "v1",
+      completion
+    });
+  }
+  // MARK: ATTEMPT COMPLETION
+  async #submitAttemptCompletion(execution, completion, replayIdempotencyKey) {
+    const taskRunCompleted = await defaultBackoff.execute(async ({ retry }) => {
+      logger.log("Submit attempt completion with backoff", { retry });
+      return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("TASK_RUN_COMPLETED", {
+        version: "v1",
+        execution,
+        completion
+      });
+    });
+    if (!taskRunCompleted.success) {
+      logger.error("Failed to complete lazy attempt with backoff", {
+        cause: taskRunCompleted.cause,
+        error: taskRunCompleted.error
+      });
+      this.#failRun(execution.run.id, taskRunCompleted.error);
+      return;
+    }
+    const { willCheckpointAndRestore, shouldExit } = taskRunCompleted.result;
+    logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+    const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
+    await this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
+    if (willCheckpointAndRestore) {
+      if (!this.submitAttemptCompletionReplay) {
+        this.submitAttemptCompletionReplay = {
+          message: {
+            execution,
+            completion
+          },
+          attempt: 1,
+          idempotencyKey: randomUUID()
+        };
+      } else {
+        if (replayIdempotencyKey && replayIdempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
+          logger.error(
+            "attempt completion handler called with mismatched idempotency key, won't overwrite replay request"
+          );
+          return;
+        }
+        this.submitAttemptCompletionReplay.attempt++;
+      }
+    }
+  }
  #returnValidatedExtraHeaders(headers) {
    for (const [key, value] of Object.entries(headers)) {
      if (value === void 0) {
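The handlers above (`#waitForTaskHandler`, `#waitForBatchHandler`, `#submitAttemptCompletion`, `#readyForLazyAttempt`) all share one guard: before a checkpoint/restore boundary they store a `randomUUID()` idempotency key, and a replayed call only proceeds if its key still matches, so a stale replay can never clobber a newer request. A reduced sketch of that guard (the class and method names here are illustrative, not part of the bundle):

// Illustrative reduction of the replay guard used by the handlers above.
import { randomUUID } from "node:crypto";

class ReplayGuard {
  pending; // { message, attempt, idempotencyKey } | undefined

  start(message) {
    this.pending = { message, attempt: 1, idempotencyKey: randomUUID() };
    return this.pending.idempotencyKey;
  }

  // Called when the same work is replayed after a restore.
  replay(replayIdempotencyKey) {
    if (!this.pending) return false; // already resolved elsewhere, drop the replay
    if (replayIdempotencyKey && replayIdempotencyKey !== this.pending.idempotencyKey) {
      return false; // stale replay, don't overwrite the newer request
    }
    this.pending.attempt++;
    return true;
  }

  clear() {
    this.pending = void 0; // e.g. when the corresponding resume message arrives
  }
}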
@@ -1052,7 +1582,7 @@ var ProdWorker = class {
    }
    return headers;
  }
-  //
+  // MARK: COORDINATOR SOCKET
  #createCoordinatorSocket(host) {
    const extraHeaders = this.#returnValidatedExtraHeaders({
      "x-machine-name": MACHINE_NAME,
@@ -1076,6 +1606,10 @@ var ProdWorker = class {
      clientMessages: ProdWorkerToCoordinatorMessages,
      serverMessages: CoordinatorToProdWorkerMessages,
      extraHeaders,
+      ioOptions: {
+        reconnectionDelay: 1e3,
+        reconnectionDelayMax: 3e3
+      },
      handlers: {
        RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
          if (!this.paused) {
@@ -1101,6 +1635,16 @@ var ProdWorker = class {
            );
            return;
          }
+          switch (this.nextResumeAfter) {
+            case "WAIT_FOR_TASK": {
+              this.waitForTaskReplay = void 0;
+              break;
+            }
+            case "WAIT_FOR_BATCH": {
+              this.waitForBatchReplay = void 0;
+              break;
+            }
+          }
          this.paused = false;
          this.nextResumeAfter = void 0;
          this.waitForPostStart = false;
@@ -1124,8 +1668,10 @@ var ProdWorker = class {
            });
            return;
          }
+          this.durationResumeFallback = void 0;
          this.#resumeAfterDuration();
        },
+        // Deprecated: This will never get called as this worker supports lazy attempts. It's only here for a quick view of the flow old workers use.
        EXECUTE_TASK_RUN: async ({ executionPayload }) => {
          if (this.executing) {
            logger.error("dropping execute request, already executing");
@@ -1146,46 +1692,31 @@ var ProdWorker = class {
            completion
          });
          logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
-          this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+          await this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
        },
        EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+          this.readyForLazyAttemptReplay = void 0;
          if (this.executing) {
            logger.error("dropping execute request, already executing");
            return;
          }
+          const attemptCount = message.lazyPayload.attemptCount ?? 0;
+          logger.log("execute attempt counts", { attemptCount, completed: this.completed.size });
+          if (this.completed.size > 0 && this.completed.size >= attemptCount + 1) {
+            logger.error("dropping execute request, already completed");
+            return;
+          }
          this.executing = true;
          try {
            const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
            logger.log("completed", completion);
            this.completed.add(execution.attempt.id);
-
-              version: "v1",
-              execution,
-              completion
-            });
-            logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
-            this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+            await this.#submitAttemptCompletion(execution, completion);
          } catch (error) {
-
-
-              id: message.lazyPayload.runId,
-              retry: void 0,
-              error: error instanceof Error ? {
-                type: "BUILT_IN_ERROR",
-                name: error.name,
-                message: error.message,
-                stackTrace: error.stack ?? ""
-              } : {
-                type: "BUILT_IN_ERROR",
-                name: "UnknownError",
-                message: String(error),
-                stackTrace: ""
-              }
-            };
-            this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
-              version: "v1",
-              completion
+            logger.error("Failed to complete lazy attempt", {
+              error
            });
+            this.#failRun(message.lazyPayload.runId, error);
          }
        },
        REQUEST_ATTEMPT_CANCELLATION: async (message) => {
@@ -1199,146 +1730,153 @@ var ProdWorker = class {
        REQUEST_EXIT: async (message) => {
          if (message.version === "v2" && message.delayInMs) {
            logger.log("exit requested with delay", { delayInMs: message.delayInMs });
-            await
+            await timeout2(message.delayInMs);
          }
          this.#coordinatorSocket.close();
          process.exit(0);
        },
        READY_FOR_RETRY: async (message) => {
          if (this.completed.size < 1) {
+            logger.error("Received READY_FOR_RETRY but no completions yet. This is a bug.");
            return;
          }
-          this
-
-            runId: this.runId,
-            totalCompletions: this.completed.size
-          });
+          this.submitAttemptCompletionReplay = void 0;
+          await this.#readyForLazyAttempt();
        }
      },
+      // MARK: ON CONNECTION
      onConnection: async (socket, handler, sender, logger2) => {
-        logger2.log("connected to coordinator", {
-
-
-
-        }
-
-        if (
-          logger2.
-          this.#emitUnrecoverableError(
-            "NoNextResume",
-            "Next resume reason not set while resuming from paused state"
-          );
+        logger2.log("connected to coordinator", {
+          status: this.#status,
+          connectionCount: ++this.connectionCount
+        });
+        socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId });
+        try {
+          if (this.waitForPostStart) {
+            logger2.log("skip connection handler, waiting for post start hook");
            return;
          }
-        if (
-
-
-
-
-
+          if (this.paused) {
+            if (!this.nextResumeAfter) {
+              logger2.error("Missing next resume reason", { status: this.#status });
+              this.#emitUnrecoverableError(
+                "NoNextResume",
+                "Next resume reason not set while resuming from paused state"
+              );
+              return;
+            }
+            if (!this.attemptFriendlyId) {
+              logger2.error("Missing friendly ID", { status: this.#status });
+              this.#emitUnrecoverableError(
+                "NoAttemptId",
+                "Attempt ID not set while resuming from paused state"
+              );
+              return;
+            }
+            socket.emit("READY_FOR_RESUME", {
+              version: "v1",
+              attemptFriendlyId: this.attemptFriendlyId,
+              type: this.nextResumeAfter
+            });
            return;
          }
-
-
-          attemptFriendlyId: this.attemptFriendlyId,
-          type: this.nextResumeAfter
-        });
-        return;
-      }
-        if (process.env.INDEX_TASKS === "true") {
-          try {
-            const taskResources = await this.#initializeWorker();
-            const { success } = await socket.emitWithAck("INDEX_TASKS", {
-              version: "v2",
-              deploymentId: this.deploymentId,
-              ...taskResources,
-              supportsLazyAttempts: true
-            });
-            if (success) {
-              logger2.info("indexing done, shutting down..");
-              process.exit(0);
-            } else {
-              logger2.info("indexing failure, shutting down..");
-              process.exit(1);
-            }
-          } catch (e) {
-            const stderr = this.#backgroundWorker.stderr.join("\n");
-            if (e instanceof TaskMetadataParseError) {
-              logger2.error("tasks metadata parse error", {
-                zodIssues: e.zodIssues,
-                tasks: e.tasks
-              });
+          if (process.env.INDEX_TASKS === "true") {
+            const failIndex = (error) => {
              socket.emit("INDEXING_FAILED", {
                version: "v1",
                deploymentId: this.deploymentId,
-                error
+                error
+              });
+            };
+            process.removeAllListeners("uncaughtException");
+            process.on("uncaughtException", (error) => {
+              console.error("Uncaught exception while indexing", error);
+              failIndex(error);
+            });
+            try {
+              const taskResources = await this.#initializeWorker();
+              const indexTasks = await defaultBackoff.maxRetries(3).execute(async () => {
+                return await socket.timeout(2e4).emitWithAck("INDEX_TASKS", {
+                  version: "v2",
+                  deploymentId: this.deploymentId,
+                  ...taskResources,
+                  supportsLazyAttempts: true
+                });
+              });
              if (!indexTasks.success || !indexTasks.result.success) {
                logger2.error("indexing failure, shutting down..", { indexTasks });
                process.exit(1);
              } else {
                logger2.info("indexing done, shutting down..");
                process.exit(0);
              }
            } catch (e) {
              const stderr = this.#backgroundWorker.stderr.join("\n");
              if (e instanceof TaskMetadataParseError) {
                logger2.error("tasks metadata parse error", {
                  zodIssues: e.zodIssues,
                  tasks: e.tasks
                });
                failIndex({
                  name: "TaskMetadataParseError",
                  message: "There was an error parsing the task metadata",
                  stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
                  stderr
-                }
-              })
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-              logger2.error("error", { error });
-              socket.emit("INDEXING_FAILED", {
-                version: "v1",
-                deploymentId: this.deploymentId,
-                error
-              });
-            } else if (typeof e === "string") {
-              logger2.error("string error", { error: { message: e } });
-              socket.emit("INDEXING_FAILED", {
-                version: "v1",
-                deploymentId: this.deploymentId,
-                error: {
+                });
              } else if (e instanceof UncaughtExceptionError) {
                const error = {
                  name: e.originalError.name,
                  message: e.originalError.message,
                  stack: e.originalError.stack,
                  stderr
                };
                logger2.error("uncaught exception", { originalError: error });
                failIndex(error);
              } else if (e instanceof Error) {
                const error = {
                  name: e.name,
                  message: e.message,
                  stack: e.stack,
                  stderr
                };
                logger2.error("error", { error });
                failIndex(error);
              } else if (typeof e === "string") {
                logger2.error("string error", { error: { message: e } });
                failIndex({
                  name: "Error",
                  message: e,
                  stderr
-                }
-              }
-
-
-              socket.emit("INDEXING_FAILED", {
-                version: "v1",
-                deploymentId: this.deploymentId,
-                error: {
+                });
              } else {
                logger2.error("unknown error", { error: e });
                failIndex({
                  name: "Error",
                  message: "Unknown error",
                  stderr
-                }
-              }
+                });
              }
              await timeout2(1e3);
              process.exit(EXIT_CODE_ALREADY_HANDLED);
            }
-            await setTimeout2(200);
-            process.exit(111);
          }
+          if (this.executing) {
+            return;
+          }
+          process.removeAllListeners("uncaughtException");
|
|
1867
|
+
process.on("uncaughtException", (error) => {
|
|
1868
|
+
console.error("Uncaught exception during run", error);
|
|
1869
|
+
this.#failRun(this.runId, error);
|
|
1870
|
+
});
|
|
1871
|
+
await this.#readyForLazyAttempt();
|
|
1872
|
+
} catch (error) {
|
|
1873
|
+
logger2.error("connection handler error", { error });
|
|
1874
|
+
} finally {
|
|
1875
|
+
if (this.connectionCount === 1) {
|
|
1876
|
+
return;
|
|
1877
|
+
}
|
|
1878
|
+
this.#handleReplays();
|
|
1333
1879
|
}
|
|
1334
|
-
if (this.executing) {
|
|
1335
|
-
return;
|
|
1336
|
-
}
|
|
1337
|
-
socket.emit("READY_FOR_LAZY_ATTEMPT", {
|
|
1338
|
-
version: "v1",
|
|
1339
|
-
runId: this.runId,
|
|
1340
|
-
totalCompletions: this.completed.size
|
|
1341
|
-
});
|
|
1342
1880
|
},
|
|
1343
1881
|
onError: async (socket, err, logger2) => {
|
|
1344
1882
|
logger2.error("onError", {
|
|
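The added connection-handler code above routes every indexing failure through a single local `failIndex` helper, installs a temporary `uncaughtException` listener for the indexing phase, and exits with `EXIT_CODE_ALREADY_HANDLED` after a short delay so the failure report can flush. A minimal sketch of that error-consolidation shape follows; `coordinator`, `runIndex`, `reportFailure`, and `stderrLines` are illustrative stand-ins, not names from the package.

```ts
import { setTimeout as sleep } from "node:timers/promises";

// Matches the constant introduced in core-apps/src/process.ts.
const EXIT_CODE_ALREADY_HANDLED = 111;

type IndexError = { name: string; message: string; stack?: string; stderr?: string };

async function indexAndReport(
  coordinator: { emit(event: string, payload: unknown): void },
  runIndex: () => Promise<{ success: boolean }>,
  deploymentId: string,
  stderrLines: string[]
) {
  // One place to report a failure, so every branch sends the same payload shape.
  const reportFailure = (error: IndexError) =>
    coordinator.emit("INDEXING_FAILED", { version: "v1", deploymentId, error });

  // Swap in an uncaughtException handler that reports instead of crashing silently.
  process.removeAllListeners("uncaughtException");
  process.on("uncaughtException", (error) => {
    console.error("Uncaught exception while indexing", error);
    reportFailure({ name: error.name, message: error.message, stack: error.stack });
  });

  try {
    const { success } = await runIndex();
    process.exit(success ? 0 : 1);
  } catch (e) {
    const stderr = stderrLines.join("\n");
    const error: IndexError =
      e instanceof Error
        ? { name: e.name, message: e.message, stack: e.stack, stderr }
        : { name: "Error", message: typeof e === "string" ? e : "Unknown error", stderr };
    reportFailure(error);
    await sleep(1_000); // give the failure report a moment to flush before exiting
    process.exit(EXIT_CODE_ALREADY_HANDLED);
  }
}
```

The distinct exit code lets the parent process tell "already reported" failures apart from unexpected crashes.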
@@ -1347,13 +1885,109 @@ var ProdWorker = class {
|
|
|
1347
1885
|
message: err.message
|
|
1348
1886
|
}
|
|
1349
1887
|
});
|
|
1350
|
-
await this.#reconnect();
|
|
1351
|
-
},
|
|
1352
|
-
onDisconnect: async (socket, reason, description, logger2) => {
|
|
1353
1888
|
}
|
|
1354
1889
|
});
|
|
1355
1890
|
return coordinatorConnection;
|
|
1356
1891
|
}
|
|
1892
|
+
// MARK: REPLAYS
|
|
1893
|
+
async #handleReplays() {
|
|
1894
|
+
const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
|
|
1895
|
+
const replayCancellationDelay = 2e4;
|
|
1896
|
+
if (this.waitForTaskReplay) {
|
|
1897
|
+
logger.log("replaying wait for task", { ...this.waitForTaskReplay });
|
|
1898
|
+
const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
|
|
1899
|
+
await timeout2(replayCancellationDelay);
|
|
1900
|
+
if (!this.waitForTaskReplay) {
|
|
1901
|
+
logger.error("wait for task replay cancelled, discarding", {
|
|
1902
|
+
originalMessage: { idempotencyKey, message, attempt }
|
|
1903
|
+
});
|
|
1904
|
+
return;
|
|
1905
|
+
}
|
|
1906
|
+
if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
|
|
1907
|
+
logger.error("wait for task replay idempotency key mismatch, discarding", {
|
|
1908
|
+
originalMessage: { idempotencyKey, message, attempt },
|
|
1909
|
+
newMessage: this.waitForTaskReplay
|
|
1910
|
+
});
|
|
1911
|
+
return;
|
|
1912
|
+
}
|
|
1913
|
+
try {
|
|
1914
|
+
await backoff.wait(attempt + 1);
|
|
1915
|
+
await this.#waitForTaskHandler(message);
|
|
1916
|
+
} catch (error) {
|
|
1917
|
+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
|
|
1918
|
+
logger.error("wait for task replay retry limit exceeded", { error });
|
|
1919
|
+
} else {
|
|
1920
|
+
logger.error("wait for task replay error", { error });
|
|
1921
|
+
}
|
|
1922
|
+
}
|
|
1923
|
+
return;
|
|
1924
|
+
}
|
|
1925
|
+
if (this.waitForBatchReplay) {
|
|
1926
|
+
logger.log("replaying wait for batch", {
|
|
1927
|
+
...this.waitForBatchReplay,
|
|
1928
|
+
cancellationDelay: replayCancellationDelay
|
|
1929
|
+
});
|
|
1930
|
+
const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
|
|
1931
|
+
await timeout2(replayCancellationDelay);
|
|
1932
|
+
if (!this.waitForBatchReplay) {
|
|
1933
|
+
logger.error("wait for batch replay cancelled, discarding", {
|
|
1934
|
+
originalMessage: { idempotencyKey, message, attempt }
|
|
1935
|
+
});
|
|
1936
|
+
return;
|
|
1937
|
+
}
|
|
1938
|
+
if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
|
|
1939
|
+
logger.error("wait for batch replay idempotency key mismatch, discarding", {
|
|
1940
|
+
originalMessage: { idempotencyKey, message, attempt },
|
|
1941
|
+
newMessage: this.waitForBatchReplay
|
|
1942
|
+
});
|
|
1943
|
+
return;
|
|
1944
|
+
}
|
|
1945
|
+
try {
|
|
1946
|
+
await backoff.wait(attempt + 1);
|
|
1947
|
+
await this.#waitForBatchHandler(message);
|
|
1948
|
+
} catch (error) {
|
|
1949
|
+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
|
|
1950
|
+
logger.error("wait for batch replay retry limit exceeded", { error });
|
|
1951
|
+
} else {
|
|
1952
|
+
logger.error("wait for batch replay error", { error });
|
|
1953
|
+
}
|
|
1954
|
+
}
|
|
1955
|
+
return;
|
|
1956
|
+
}
|
|
1957
|
+
if (this.submitAttemptCompletionReplay) {
|
|
1958
|
+
logger.log("replaying attempt completion", {
|
|
1959
|
+
...this.submitAttemptCompletionReplay,
|
|
1960
|
+
cancellationDelay: replayCancellationDelay
|
|
1961
|
+
});
|
|
1962
|
+
const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
|
|
1963
|
+
await timeout2(replayCancellationDelay);
|
|
1964
|
+
if (!this.submitAttemptCompletionReplay) {
|
|
1965
|
+
logger.error("attempt completion replay cancelled, discarding", {
|
|
1966
|
+
originalMessage: { idempotencyKey, message, attempt }
|
|
1967
|
+
});
|
|
1968
|
+
return;
|
|
1969
|
+
}
|
|
1970
|
+
if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
|
|
1971
|
+
logger.error("attempt completion replay idempotency key mismatch, discarding", {
|
|
1972
|
+
originalMessage: { idempotencyKey, message, attempt },
|
|
1973
|
+
newMessage: this.submitAttemptCompletionReplay
|
|
1974
|
+
});
|
|
1975
|
+
return;
|
|
1976
|
+
}
|
|
1977
|
+
try {
|
|
1978
|
+
await backoff.wait(attempt + 1);
|
|
1979
|
+
await this.#submitAttemptCompletion(message.execution, message.completion, idempotencyKey);
|
|
1980
|
+
} catch (error) {
|
|
1981
|
+
if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
|
|
1982
|
+
logger.error("attempt completion replay retry limit exceeded", { error });
|
|
1983
|
+
} else {
|
|
1984
|
+
logger.error("attempt completion replay error", { error });
|
|
1985
|
+
}
|
|
1986
|
+
}
|
|
1987
|
+
return;
|
|
1988
|
+
}
|
|
1989
|
+
}
|
|
1990
|
+
// MARK: HTTP SERVER
|
|
1357
1991
|
#createHttpServer() {
|
|
1358
1992
|
const httpServer = createServer(async (req, res) => {
|
|
1359
1993
|
logger.log(`[${req.method}]`, req.url);
|
|
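The new `#handleReplays` method added in the hunk above re-sends any pending wait-for-task, wait-for-batch, or attempt-completion message after a reconnect, but only after a cancellation delay and an idempotency-key check, with retries spaced by the jittered backoff. A condensed sketch of that guard sequence, using stand-in names (`Replay`, `getState`, `backoffWait`) rather than the package's own types:

```ts
type Replay<T> = { idempotencyKey: string; message: T; attempt: number };

async function replayIfStillPending<T>(
  getState: () => Replay<T> | undefined,            // e.g. reads a field like waitForTaskReplay
  handler: (message: T) => Promise<void>,           // the original message handler to re-run
  backoffWait: (attempt: number) => Promise<void>,  // jittered delay between attempts
  cancellationDelayMs = 20_000
): Promise<void> {
  const snapshot = getState();
  if (!snapshot) return;

  const { idempotencyKey, message, attempt } = snapshot;

  // Give an in-flight acknowledgement a chance to clear the pending state first.
  await new Promise((resolve) => setTimeout(resolve, cancellationDelayMs));

  const current = getState();
  if (!current) return;                                  // replay was cancelled meanwhile
  if (current.idempotencyKey !== idempotencyKey) return; // superseded by a newer message

  await backoffWait(attempt + 1);
  await handler(message);
}
```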
@@ -1372,17 +2006,13 @@ var ProdWorker = class {
|
|
|
1372
2006
|
return reply.text("Connected to coordinator");
|
|
1373
2007
|
}
|
|
1374
2008
|
case "/close": {
|
|
1375
|
-
await this.#coordinatorSocket.sendWithAck("LOG", {
|
|
1376
|
-
version: "v1",
|
|
1377
|
-
text: `[${req.method}] ${req.url}`
|
|
1378
|
-
});
|
|
1379
2009
|
this.#coordinatorSocket.close();
|
|
2010
|
+
this.connectionCount = 0;
|
|
1380
2011
|
return reply.text("Disconnected from coordinator");
|
|
1381
2012
|
}
|
|
1382
2013
|
case "/test": {
|
|
1383
|
-
await this.#coordinatorSocket.
|
|
1384
|
-
version: "v1"
|
|
1385
|
-
text: `[${req.method}] ${req.url}`
|
|
2014
|
+
await this.#coordinatorSocket.socket.timeout(1e4).emitWithAck("TEST", {
|
|
2015
|
+
version: "v1"
|
|
1386
2016
|
});
|
|
1387
2017
|
return reply.text("Received ACK from coordinator");
|
|
1388
2018
|
}
|
|
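The reworked `/test` route above now waits for an explicit acknowledgement from the coordinator and gives up after ten seconds. The same ask-and-wait shape looks roughly like this, assuming a socket.io v4.6+ client; the URL and connection options are placeholders, not taken from the package.

```ts
import { io } from "socket.io-client";

async function pingCoordinator(url = "http://localhost:8080") {
  const socket = io(url, { transports: ["websocket"] });

  try {
    // Rejects if no acknowledgement for "TEST" arrives within ten seconds.
    return await socket.timeout(10_000).emitWithAck("TEST", { version: "v1" });
  } finally {
    socket.close();
  }
}
```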
@@ -1417,7 +2047,7 @@ var ProdWorker = class {
|
|
|
1417
2047
|
break;
|
|
1418
2048
|
}
|
|
1419
2049
|
case "restore": {
|
|
1420
|
-
await this.#
|
|
2050
|
+
await this.#reconnectAfterPostStart();
|
|
1421
2051
|
break;
|
|
1422
2052
|
}
|
|
1423
2053
|
default: {
|
|
@@ -1448,7 +2078,7 @@ var ProdWorker = class {
|
|
|
1448
2078
|
}
|
|
1449
2079
|
logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
|
|
1450
2080
|
this.#httpPort = getRandomPortNumber();
|
|
1451
|
-
await
|
|
2081
|
+
await timeout2(100);
|
|
1452
2082
|
this.start();
|
|
1453
2083
|
});
|
|
1454
2084
|
return httpServer;
|
|
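The hunk above keeps the existing recovery path for `EADDRINUSE`: log the conflict, pick a random port, pause briefly, and call `start()` again. A self-contained sketch of that retry loop with plain Node APIs; `getRandomPort` and its port range are illustrative, not the package's `getRandomPortNumber`.

```ts
import { createServer } from "node:http";
import { setTimeout as sleep } from "node:timers/promises";

// Illustrative stand-in for the package's getRandomPortNumber().
function getRandomPort(min = 8_000, max = 9_999): number {
  return Math.floor(Math.random() * (max - min + 1)) + min;
}

function startServer(port: number) {
  const server = createServer((_req, res) => {
    res.end("ok");
  });

  server.on("error", async (err: NodeJS.ErrnoException) => {
    if (err.code !== "EADDRINUSE") {
      console.error("http server error", err);
      return;
    }
    console.error(`port ${port} already in use, retrying with random port..`);
    await sleep(100);              // brief pause before re-listening, as in the diff
    startServer(getRandomPort());  // try again on a fresh random port
  });

  server.listen(port, () => console.log(`listening on ${port}`));
  return server;
}
```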
@@ -1458,8 +2088,12 @@ var ProdWorker = class {
|
|
|
1458
2088
|
await this.#backgroundWorker.initialize({ env: envVars });
|
|
1459
2089
|
let packageVersion;
|
|
1460
2090
|
const taskResources = [];
|
|
1461
|
-
if (!this.#backgroundWorker.tasks) {
|
|
1462
|
-
throw new Error(
|
|
2091
|
+
if (!this.#backgroundWorker.tasks || this.#backgroundWorker.tasks.length === 0) {
|
|
2092
|
+
throw new Error(
|
|
2093
|
+
`Background Worker started without tasks. Searched in: ${__PROJECT_CONFIG__.triggerDirectories?.join(
|
|
2094
|
+
", "
|
|
2095
|
+
)}`
|
|
2096
|
+
);
|
|
1463
2097
|
}
|
|
1464
2098
|
for (const task of this.#backgroundWorker.tasks) {
|
|
1465
2099
|
taskResources.push(task);
|
|
@@ -1493,7 +2127,9 @@ var ProdWorker = class {
|
|
|
1493
2127
|
completed: this.completed.size,
|
|
1494
2128
|
nextResumeAfter: this.nextResumeAfter,
|
|
1495
2129
|
waitForPostStart: this.waitForPostStart,
|
|
1496
|
-
attemptFriendlyId: this.attemptFriendlyId
|
|
2130
|
+
attemptFriendlyId: this.attemptFriendlyId,
|
|
2131
|
+
waitForTaskReplay: this.waitForTaskReplay,
|
|
2132
|
+
waitForBatchReplay: this.waitForBatchReplay
|
|
1497
2133
|
};
|
|
1498
2134
|
}
|
|
1499
2135
|
#emitUnrecoverableError(name, message) {
|