trigger.dev 3.0.0-beta.5 → 3.0.0-beta.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Containerfile.prod +32 -4
- package/dist/index.js +2679 -909
- package/dist/index.js.map +1 -1
- package/dist/templates/trigger.config.ts.template +1 -1
- package/dist/workers/dev/worker-facade.js +71 -66
- package/dist/workers/dev/worker-setup.js +18 -23
- package/dist/workers/prod/entry-point.js +1439 -421
- package/dist/workers/prod/worker-facade.js +102 -72
- package/dist/workers/prod/worker-setup.js +14 -25
- package/package.json +15 -20
@@ -4,8 +4,9 @@ import {
 PostStartCauses,
 PreStopCauses,
 ProdWorkerToCoordinatorMessages,
-
+TaskRunErrorCodes as TaskRunErrorCodes2
 } from "@trigger.dev/core/v3";
+import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
 
 // ../core-apps/src/http.ts
 var HttpReply = class {
@@ -65,27 +66,280 @@ var SimpleLogger = class {
 }
 };
 
-// ../core-apps/src/
-
-
-clientWebsocketMessages,
-PlatformToProviderMessages,
-ProviderToPlatformMessages,
-SharedQueueToClientMessages,
-ZodMessageSender,
-ZodSocketConnection
-} from "@trigger.dev/core/v3";
-var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
-var MACHINE_NAME = process.env.MACHINE_NAME || "local";
-var PLATFORM_HOST = process.env.PLATFORM_HOST || "127.0.0.1";
-var PLATFORM_WS_PORT = process.env.PLATFORM_WS_PORT || 3030;
-var PLATFORM_SECRET = process.env.PLATFORM_SECRET || "provider-secret";
-var SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ?? "false");
-var logger = new SimpleLogger(`[${MACHINE_NAME}]`);
+// ../core-apps/src/process.ts
+var EXIT_CODE_ALREADY_HANDLED = 111;
+var EXIT_CODE_CHILD_NONZERO = 112;
 
-// src/
-import {
-
+// ../core-apps/src/backoff.ts
+import { setTimeout as timeout } from "node:timers/promises";
+var StopRetrying = class extends Error {
+constructor(message) {
+super(message);
+this.name = "StopRetrying";
+}
+};
+var AttemptTimeout = class extends Error {
+constructor(message) {
+super(message);
+this.name = "AttemptTimeout";
+}
+};
+var RetryLimitExceeded = class extends Error {
+constructor(message) {
+super(message);
+this.name = "RetryLimitExceeded";
+}
+};
+var ExponentialBackoff = class _ExponentialBackoff {
+#retries = 0;
+#type;
+#base;
+#factor;
+#min;
+#max;
+#maxRetries;
+#maxElapsed;
+constructor(type, opts = {}) {
+this.#type = type ?? "NoJitter";
+this.#base = opts.base ?? 2;
+this.#factor = opts.factor ?? 1;
+this.#min = opts.min ?? -Infinity;
+this.#max = opts.max ?? Infinity;
+this.#maxRetries = opts.maxRetries ?? Infinity;
+this.#maxElapsed = opts.maxElapsed ?? Infinity;
+}
+#clone(type, opts = {}) {
+return new _ExponentialBackoff(type ?? this.#type, {
+base: opts.base ?? this.#base,
+factor: opts.factor ?? this.#factor,
+min: opts.min ?? this.#min,
+max: opts.max ?? this.#max,
+maxRetries: opts.maxRetries ?? this.#maxRetries,
+maxElapsed: opts.maxElapsed ?? this.#maxElapsed
+});
+}
+type(type) {
+return this.#clone(type);
+}
+base(base) {
+return this.#clone(void 0, { base });
+}
+factor(factor) {
+return this.#clone(void 0, { factor });
+}
+min(min) {
+return this.#clone(void 0, { min });
+}
+max(max) {
+return this.#clone(void 0, { max });
+}
+maxRetries(maxRetries) {
+return this.#clone(void 0, { maxRetries });
+}
+// TODO: With .execute(), should this also include the time it takes to execute the callback?
+maxElapsed(maxElapsed) {
+return this.#clone(void 0, { maxElapsed });
+}
+retries(retries) {
+if (typeof retries !== "undefined") {
+if (retries > this.#maxRetries) {
+console.error(
+`Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+);
+this.#retries = this.#maxRetries;
+} else {
+this.#retries = retries;
+}
+}
+return this.#clone();
+}
+async *retryAsync(maxRetries = this.#maxRetries ?? Infinity) {
+let elapsed = 0;
+let retry = 0;
+while (retry <= maxRetries) {
+const delay = this.delay(retry);
+elapsed += delay;
+if (elapsed > this.#maxElapsed) {
+break;
+}
+yield {
+delay: {
+seconds: delay,
+milliseconds: delay * 1e3
+},
+retry
+};
+retry++;
+}
+}
+async *[Symbol.asyncIterator]() {
+yield* this.retryAsync();
+}
+/** Returns the delay for the current retry in seconds. */
+delay(retries = this.#retries, jitter = true) {
+if (retries > this.#maxRetries) {
+console.error(
+`Can't set retries ${retries} higher than maxRetries (${this.#maxRetries}), setting to maxRetries instead.`
+);
+retries = this.#maxRetries;
+}
+let delay = this.#factor * this.#base ** retries;
+switch (this.#type) {
+case "NoJitter": {
+break;
+}
+case "FullJitter": {
+if (!jitter) {
+delay = 0;
+break;
+}
+delay *= Math.random();
+break;
+}
+case "EqualJitter": {
+if (!jitter) {
+delay *= 0.5;
+break;
+}
+delay *= 0.5 * (1 + Math.random());
+break;
+}
+default: {
+throw new Error(`Unknown backoff type: ${this.#type}`);
+}
+}
+if (delay < this.#min) {
+delay = this.#min + Math.random() * (this.#min * 0.2);
+}
+if (delay > this.#max) {
+delay = this.#max - Math.random() * (this.#max * 0.2);
+}
+delay = Math.round(delay);
+return delay;
+}
+/** Waits with the appropriate delay for the current retry. */
+async wait(retries = this.#retries, jitter = true) {
+if (retries > this.#maxRetries) {
+console.error(`Retry limit exceeded: ${retries} > ${this.#maxRetries}`);
+throw new RetryLimitExceeded();
+}
+const delay = this.delay(retries, jitter);
+return await timeout(delay * 1e3);
+}
+elapsed(retries = this.#retries, jitter = true) {
+let elapsed = 0;
+for (let i = 0; i <= retries; i++) {
+elapsed += this.delay(i, jitter);
+}
+const total = elapsed;
+let days = 0;
+if (elapsed > 3600 * 24) {
+days = Math.floor(elapsed / 3600 / 24);
+elapsed -= days * 3600 * 24;
+}
+let hours = 0;
+if (elapsed > 3600) {
+hours = Math.floor(elapsed / 3600);
+elapsed -= hours * 3600;
+}
+let minutes = 0;
+if (elapsed > 60) {
+minutes = Math.floor(elapsed / 60);
+elapsed -= minutes * 60;
+}
+const seconds = elapsed;
+return {
+seconds,
+minutes,
+hours,
+days,
+total
+};
+}
+reset() {
+this.#retries = 0;
+return this;
+}
+next() {
+this.#retries++;
+return this.delay();
+}
+stop() {
+throw new StopRetrying();
+}
+get state() {
+return {
+retries: this.#retries,
+type: this.#type,
+base: this.#base,
+factor: this.#factor,
+min: this.#min,
+max: this.#max,
+maxRetries: this.#maxRetries,
+maxElapsed: this.#maxElapsed
+};
+}
+async execute(callback, { attemptTimeoutMs = 0 } = {}) {
+let elapsedMs = 0;
+let finalError = void 0;
+for await (const { delay, retry } of this) {
+const start = Date.now();
+if (retry > 0) {
+console.log(`Retrying in ${delay.milliseconds}ms`);
+await timeout(delay.milliseconds);
+}
+let attemptTimeout = void 0;
+try {
+const result = await new Promise(async (resolve, reject) => {
+if (attemptTimeoutMs > 0) {
+attemptTimeout = setTimeout(() => {
+reject(new AttemptTimeout());
+}, attemptTimeoutMs);
+}
+try {
+const callbackResult = await callback({ delay, retry, elapsedMs });
+resolve(callbackResult);
+} catch (error) {
+reject(error);
+}
+});
+return {
+success: true,
+result
+};
+} catch (error) {
+finalError = error;
+if (error instanceof StopRetrying) {
+return {
+success: false,
+cause: "StopRetrying",
+error: error.message
+};
+}
+if (error instanceof AttemptTimeout) {
+continue;
+}
+} finally {
+elapsedMs += Date.now() - start;
+clearTimeout(attemptTimeout);
+}
+}
+if (finalError instanceof AttemptTimeout) {
+return {
+success: false,
+cause: "Timeout"
+};
+} else {
+return {
+success: false,
+cause: "MaxRetries",
+error: finalError
+};
+}
+}
+static RetryLimitExceeded = RetryLimitExceeded;
+static StopRetrying = StopRetrying;
+};
 
 // src/workers/prod/backgroundWorker.ts
 import {
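The ExponentialBackoff helper added in the hunk above is self-contained, so its behaviour can be read directly off the diff. The following is a rough, hypothetical usage sketch only (it is not shipped code, and doSomethingFlaky is a made-up placeholder): execute() retries a callback and resolves to { success: true, result } or { success: false, cause, error }, while the instance itself is an async iterable of { delay, retry } pairs.

// Illustrative sketch, assuming it runs inside an async function.
const backoff = new ExponentialBackoff("FullJitter", { base: 2, maxRetries: 5 });

// execute() retries the callback with jittered delays between attempts.
const outcome = await backoff.execute(async ({ retry, elapsedMs }) => {
  console.log(`attempt #${retry}, elapsed ${elapsedMs}ms`);
  return await doSomethingFlaky(); // hypothetical flaky operation
});
if (!outcome.success) console.error(outcome.cause, outcome.error);

// The instance is also an async iterable of { delay, retry } pairs.
for await (const { delay, retry } of backoff.maxRetries(3)) {
  console.log(`retry ${retry} scheduled after ${delay.seconds}s`);
}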
@@ -93,9 +347,9 @@ import {
 ProdWorkerToChildMessages,
 SemanticInternalAttributes,
 TaskRunErrorCodes,
-ZodIpcConnection,
 correctErrorStackTrace
 } from "@trigger.dev/core/v3";
+import { ZodIpcConnection } from "@trigger.dev/core/v3/zodIpc";
 import { Evt } from "evt";
 import { fork } from "node:child_process";
 
@@ -116,12 +370,12 @@ var TaskMetadataParseError = class extends Error {
 this.name = "TaskMetadataParseError";
 }
 };
-
-// src/workers/prod/backgroundWorker.ts
 var UnexpectedExitError = class extends Error {
-constructor(code) {
+constructor(code, signal, stderr) {
 super(`Unexpected exit with code ${code}`);
 this.code = code;
+this.signal = signal;
+this.stderr = stderr;
 this.name = "UnexpectedExitError";
 }
 };
@@ -137,33 +391,101 @@ var CancelledProcessError = class extends Error {
 this.name = "CancelledProcessError";
 }
 };
+var SigKillTimeoutProcessError = class extends Error {
+constructor() {
+super("Process kill timeout");
+this.name = "SigKillTimeoutProcessError";
+}
+};
+var GracefulExitTimeoutError = class extends Error {
+constructor() {
+super("Graceful exit timeout");
+this.name = "GracefulExitTimeoutError";
+}
+};
+function getFriendlyErrorMessage(code, signal, stderr, dockerMode = true) {
+const message = (text) => {
+if (signal) {
+return `[${signal}] ${text}`;
+} else {
+return text;
+}
+};
+if (code === 137) {
+if (dockerMode) {
+return message(
+"Process ran out of memory! Try choosing a machine preset with more memory for this task."
+);
+} else {
+return message(
+"Process most likely ran out of memory, but we can't be certain. Try choosing a machine preset with more memory for this task."
+);
+}
+}
+if (stderr?.includes("OOMErrorHandler")) {
+return message(
+"Process ran out of memory! Try choosing a machine preset with more memory for this task."
+);
+}
+return message(`Process exited with code ${code}.`);
+}
+
+// src/workers/prod/backgroundWorker.ts
 var ProdBackgroundWorker = class {
 constructor(path, params) {
 this.path = path;
 this.params = params;
 }
 _initialized = false;
+/**
+* @deprecated use onTaskRunHeartbeat instead
+*/
 onTaskHeartbeat = new Evt();
-
+onTaskRunHeartbeat = new Evt();
 onWaitForDuration = new Evt();
 onWaitForTask = new Evt();
-
-
-
+onWaitForBatch = new Evt();
+onCreateTaskRunAttempt = Evt.create();
+attemptCreatedNotification = Evt.create();
 _onClose = new Evt();
 tasks = [];
+stderr = [];
 _taskRunProcess;
+_taskRunProcessesBeingKilled = /* @__PURE__ */ new Map();
 _closed = false;
-async close() {
+async close(gracefulExitTimeoutElapsed = false) {
+console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed });
 if (this._closed) {
 return;
 }
 this._closed = true;
 this.onTaskHeartbeat.detach();
-
+this.onTaskRunHeartbeat.detach();
+await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed);
+}
+async #killTaskRunProcess(flush = true, initialSignal = "SIGTERM") {
+console.log("Killing task run process", { flush, initialSignal, closed: this._closed });
+if (this._closed || !this._taskRunProcess) {
+return;
+}
+if (flush) {
+await this.flushTelemetry();
+}
+const currentTaskRunProcess = this._taskRunProcess;
+this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => {
+console.error("Error while trying graceful exit", error);
+});
+console.log("Killed task run process, setting closed to true", {
+closed: this._closed,
+pid: currentTaskRunProcess.pid
+});
+this._closed = true;
 }
 async flushTelemetry() {
+console.log("Flushing telemetry");
+const start = performance.now();
 await this._taskRunProcess?.cleanup(false);
+console.log("Flushed telemetry", { duration: performance.now() - start });
 }
 async initialize(options) {
 if (this._initialized) {
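The error classes and getFriendlyErrorMessage() added in the hunk above work together to translate a child-process exit into a user-facing message. A rough, hypothetical sketch of that mapping (the literal values below are made up for illustration and are not part of the package):

// Illustrative sketch only; exit code 137 in docker mode maps to the OOM message,
// and the signal, when present, is prefixed in square brackets.
try {
  throw new UnexpectedExitError(137, "SIGKILL", "…captured stderr…");
} catch (e) {
  if (e instanceof UnexpectedExitError) {
    // "[SIGKILL] Process ran out of memory! Try choosing a machine preset with more memory for this task."
    console.error(getFriendlyErrorMessage(e.code, e.signal, e.stderr));
  }
}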
@@ -186,7 +508,7 @@ var ProdBackgroundWorker = class {
 ...options?.env
 }
 });
-const
+const timeout3 = setTimeout(() => {
 if (resolved) {
 return;
 }
@@ -194,6 +516,20 @@ var ProdBackgroundWorker = class {
 child.kill();
 reject(new Error("Worker timed out"));
 }, 1e4);
+child.stdout?.on("data", (data) => {
+console.log(data.toString());
+});
+child.stderr?.on("data", (data) => {
+console.error(data.toString());
+this.stderr.push(data.toString());
+});
+child.on("exit", (code) => {
+if (!resolved) {
+clearTimeout(timeout3);
+resolved = true;
+reject(new Error(`Worker exited with code ${code}`));
+}
+});
 new ZodIpcConnection({
 listenSchema: ProdChildToWorkerMessages,
 emitSchema: ProdWorkerToChildMessages,
@@ -201,7 +537,7 @@ var ProdBackgroundWorker = class {
 handlers: {
 TASKS_READY: async (message) => {
 if (!resolved) {
-clearTimeout(
+clearTimeout(timeout3);
 resolved = true;
 resolve(message.tasks);
 child.kill();
@@ -209,7 +545,7 @@ var ProdBackgroundWorker = class {
 },
 UNCAUGHT_EXCEPTION: async (message) => {
 if (!resolved) {
-clearTimeout(
+clearTimeout(timeout3);
 resolved = true;
 reject(new UncaughtExceptionError(message.error, message.origin));
 child.kill();
@@ -217,7 +553,7 @@ var ProdBackgroundWorker = class {
 },
 TASKS_FAILED_TO_PARSE: async (message) => {
 if (!resolved) {
-clearTimeout(
+clearTimeout(timeout3);
 resolved = true;
 reject(new TaskMetadataParseError(message.zodIssues, message.tasks));
 child.kill();
@@ -225,19 +561,6 @@ var ProdBackgroundWorker = class {
 }
 }
 });
-child.stdout?.on("data", (data) => {
-console.log(data.toString());
-});
-child.stderr?.on("data", (data) => {
-console.error(data.toString());
-});
-child.on("exit", (code) => {
-if (!resolved) {
-clearTimeout(timeout);
-resolved = true;
-reject(new Error(`Worker exited with code ${code}`));
-}
-});
 });
 this._initialized = true;
 }
@@ -250,63 +573,135 @@ var ProdBackgroundWorker = class {
 }
 // We need to notify all the task run processes that a task run has completed,
 // in case they are waiting for it through triggerAndWait
-async taskRunCompletedNotification(completion
-this._taskRunProcess?.taskRunCompletedNotification(completion
+async taskRunCompletedNotification(completion) {
+this._taskRunProcess?.taskRunCompletedNotification(completion);
 }
 async waitCompletedNotification() {
 this._taskRunProcess?.waitCompletedNotification();
 }
-async #
+async #getFreshTaskRunProcess(payload, messageId) {
 const metadata = this.getMetadata(
 payload.execution.worker.id,
 payload.execution.worker.version
 );
-
-
-
-
-
-
-
-
-
-
-
-
+console.log("Getting fresh task run process, setting closed to false", {
+closed: this._closed
+});
+this._closed = false;
+await this.#killCurrentTaskRunProcessBeforeAttempt();
+const taskRunProcess = new TaskRunProcess(
+payload.execution.run.id,
+payload.execution.run.isTest,
+this.path,
+{
+...this.params.env,
+...payload.environment ?? {}
+},
+metadata,
+this.params,
+messageId
+);
+taskRunProcess.onExit.attach(({ pid }) => {
+console.log("Task run process exited", { pid });
+if (this._taskRunProcess?.pid === pid) {
 this._taskRunProcess = void 0;
-}
-
-this.
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-this.
-}
+}
+if (pid) {
+this._taskRunProcessesBeingKilled.delete(pid);
+}
+});
+taskRunProcess.onIsBeingKilled.attach((taskRunProcess2) => {
+if (taskRunProcess2?.pid) {
+this._taskRunProcessesBeingKilled.set(taskRunProcess2.pid, taskRunProcess2);
+}
+});
+taskRunProcess.onTaskHeartbeat.attach((id) => {
+this.onTaskHeartbeat.post(id);
+});
+taskRunProcess.onTaskRunHeartbeat.attach((id) => {
+this.onTaskRunHeartbeat.post(id);
+});
+taskRunProcess.onWaitForBatch.attach((message) => {
+this.onWaitForBatch.post(message);
+});
+taskRunProcess.onWaitForDuration.attach((message) => {
+this.onWaitForDuration.post(message);
+});
+taskRunProcess.onWaitForTask.attach((message) => {
+this.onWaitForTask.post(message);
+});
+await taskRunProcess.initialize();
+this._taskRunProcess = taskRunProcess;
 return this._taskRunProcess;
 }
-
-
+async forceKillOldTaskRunProcesses() {
+for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) {
+try {
+await taskRunProcess.kill("SIGKILL");
+} catch (error) {
+console.error("Error while force killing old task run processes", error);
+}
+}
+}
+async #killCurrentTaskRunProcessBeforeAttempt() {
+console.log("killCurrentTaskRunProcessBeforeAttempt()", {
+hasTaskRunProcess: !!this._taskRunProcess
+});
+if (!this._taskRunProcess) {
+return;
+}
+const currentTaskRunProcess = this._taskRunProcess;
+console.log("Killing current task run process", {
+isBeingKilled: currentTaskRunProcess?.isBeingKilled,
+totalBeingKilled: this._taskRunProcessesBeingKilled.size
+});
+if (currentTaskRunProcess.isBeingKilled) {
+if (this._taskRunProcessesBeingKilled.size > 1) {
+await this.#tryGracefulExit(currentTaskRunProcess);
+} else {
+}
+} else {
+if (this._taskRunProcessesBeingKilled.size > 0) {
+await this.#tryGracefulExit(currentTaskRunProcess);
+} else {
+currentTaskRunProcess.kill("SIGTERM", 5e3).catch(() => {
+});
+}
+}
+}
+async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
+console.log("Trying graceful exit", { kill, initialSignal });
 try {
-const
+const initialExit = taskRunProcess.onExit.waitFor(5e3);
+if (kill) {
+taskRunProcess.kill(initialSignal);
+}
+await initialExit;
+} catch (error) {
+console.error("TaskRunProcess graceful kill timeout exceeded", error);
+this.#tryForcefulExit(taskRunProcess);
+}
+}
+async #tryForcefulExit(taskRunProcess) {
+console.log("Trying forceful exit");
+try {
+const forcedKill = taskRunProcess.onExit.waitFor(5e3);
+taskRunProcess.kill("SIGKILL");
+await forcedKill;
+} catch (error) {
+console.error("TaskRunProcess forced kill timeout exceeded", error);
+throw new SigKillTimeoutProcessError();
+}
+}
+// We need to fork the process before we can execute any tasks, use a fresh process for each execution
+async executeTaskRun(payload, messageId) {
+try {
+const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId);
+console.log("executing task run", {
+attempt: payload.execution.attempt.id,
+taskRunPid: taskRunProcess.pid
+});
 const result = await taskRunProcess.executeTaskRun(payload);
-await taskRunProcess.cleanup(result.ok || result.retry === void 0);
 if (result.ok) {
 return result;
 }
@@ -349,7 +744,32 @@ var ProdBackgroundWorker = class {
 retry: void 0,
 error: {
 type: "INTERNAL_ERROR",
-code: TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE
+code: TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE,
+message: getFriendlyErrorMessage(e.code, e.signal, e.stderr),
+stackTrace: e.stderr
+}
+};
+}
+if (e instanceof SigKillTimeoutProcessError) {
+return {
+id: payload.execution.attempt.id,
+ok: false,
+retry: void 0,
+error: {
+type: "INTERNAL_ERROR",
+code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT
+}
+};
+}
+if (e instanceof GracefulExitTimeoutError) {
+return {
+id: payload.execution.attempt.id,
+ok: false,
+retry: void 0,
+error: {
+type: "INTERNAL_ERROR",
+code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT,
+message: "Worker process killed while attempt in progress."
 }
 };
 }
@@ -362,10 +782,44 @@ var ProdBackgroundWorker = class {
 code: TaskRunErrorCodes.TASK_EXECUTION_FAILED
 }
 };
+} finally {
+await this.#killTaskRunProcess();
 }
 }
 async cancelAttempt(attemptId) {
-
+if (!this._taskRunProcess) {
+console.error("No task run process to cancel attempt", { attemptId });
+return;
+}
+await this._taskRunProcess.cancel();
+}
+async executeTaskRunLazyAttempt(payload) {
+this.onCreateTaskRunAttempt.post({ runId: payload.runId });
+let execution;
+try {
+const start = performance.now();
+const attemptCreated = await this.attemptCreatedNotification.waitFor(12e4);
+if (!attemptCreated.success) {
+throw new Error(`${attemptCreated.reason ?? "Unknown error"}`);
+}
+console.log("Attempt created", {
+number: attemptCreated.execution.attempt.number,
+duration: performance.now() - start
+});
+execution = attemptCreated.execution;
+} catch (error) {
+console.error("Error while creating attempt", error);
+throw new Error(`Failed to create attempt: ${error}`);
+}
+const completion = await this.executeTaskRun(
+{
+execution,
+traceContext: payload.traceContext,
+environment: payload.environment
+},
+payload.messageId
+);
+return { execution, completion };
 }
 async #correctError(error, execution) {
 return {
@@ -375,28 +829,36 @@ var ProdBackgroundWorker = class {
 }
 };
 var TaskRunProcess = class {
-constructor(
-this.
+constructor(runId, isTest, path, env, metadata, worker, messageId) {
+this.runId = runId;
+this.isTest = isTest;
 this.path = path;
 this.env = env;
 this.metadata = metadata;
 this.worker = worker;
+this.messageId = messageId;
 }
 _ipc;
 _child;
+_childPid;
 _attemptPromises = /* @__PURE__ */ new Map();
 _attemptStatuses = /* @__PURE__ */ new Map();
 _currentExecution;
 _isBeingKilled = false;
 _isBeingCancelled = false;
+_gracefulExitTimeoutElapsed = false;
+_stderr = [];
+/**
+* @deprecated use onTaskRunHeartbeat instead
+*/
 onTaskHeartbeat = new Evt();
+onTaskRunHeartbeat = new Evt();
 onExit = new Evt();
-
+onIsBeingKilled = new Evt();
 onWaitForDuration = new Evt();
 onWaitForTask = new Evt();
+onWaitForBatch = new Evt();
 preCheckpointNotification = Evt.create();
-onReadyForCheckpoint = Evt.create();
-onCancelCheckpoint = Evt.create();
 async initialize() {
 this._child = fork(this.path, {
 stdio: [
@@ -409,7 +871,7 @@ var TaskRunProcess = class {
 "ipc"
 ],
 env: {
-...this.
+...this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {},
 ...this.env,
 OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({
 [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir
@@ -417,6 +879,7 @@ var TaskRunProcess = class {
 ...this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}
 }
 });
+this._childPid = this._child?.pid;
 this._ipc = new ZodIpcConnection({
 listenSchema: ProdChildToWorkerMessages,
 emitSchema: ProdWorkerToChildMessages,
@@ -437,28 +900,29 @@ var TaskRunProcess = class {
 resolver(result);
 },
 READY_TO_DISPOSE: async (message) => {
+process.exit(0);
 },
 TASK_HEARTBEAT: async (message) => {
-this.
+if (this.messageId) {
+this.onTaskRunHeartbeat.post(this.messageId);
+} else {
+console.error(
+"No message id for task heartbeat, falling back to (deprecated) attempt heartbeat",
+{ id: message.id }
+);
+this.onTaskHeartbeat.post(message.id);
+}
 },
 TASKS_READY: async (message) => {
 },
+WAIT_FOR_TASK: async (message) => {
+this.onWaitForTask.post(message);
+},
 WAIT_FOR_BATCH: async (message) => {
 this.onWaitForBatch.post(message);
 },
 WAIT_FOR_DURATION: async (message) => {
 this.onWaitForDuration.post(message);
-const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor();
-return { willCheckpointAndRestore };
-},
-WAIT_FOR_TASK: async (message) => {
-this.onWaitForTask.post(message);
-},
-READY_FOR_CHECKPOINT: async (message) => {
-this.onReadyForCheckpoint.post(message);
-},
-CANCEL_CHECKPOINT: async (message) => {
-this.onCancelCheckpoint.post(message);
 }
 }
 });
@@ -470,15 +934,43 @@ var TaskRunProcess = class {
 this._isBeingCancelled = true;
 await this.cleanup(true);
 }
-async cleanup(kill = false) {
+async cleanup(kill = false, gracefulExitTimeoutElapsed = false) {
+console.log("cleanup()", { kill, gracefulExitTimeoutElapsed });
 if (kill && this._isBeingKilled) {
 return;
 }
-
-
-
-
+if (kill) {
+this._isBeingKilled = true;
+this.onIsBeingKilled.post(this);
+}
+const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution;
+const killParentProcess = kill && !killChildProcess;
+console.log("Cleaning up task run process", {
+killChildProcess,
+killParentProcess,
+ipc: this._ipc,
+childPid: this._childPid,
+realChildPid: this._child?.pid
 });
+try {
+await this._ipc?.sendWithAck(
+"CLEANUP",
+{
+flush: true,
+kill: killParentProcess
+},
+3e4
+);
+} catch (error) {
+console.error("Error while cleaning up task run process", error);
+if (killParentProcess) {
+process.exit(0);
+}
+}
+if (killChildProcess) {
+this._gracefulExitTimeoutElapsed = true;
+await this.kill("SIGKILL");
+}
 }
 async executeTaskRun(payload) {
 let resolver;
@@ -502,25 +994,38 @@ var TaskRunProcess = class {
 this._currentExecution = void 0;
 return result;
 }
-taskRunCompletedNotification(completion
+taskRunCompletedNotification(completion) {
 if (!completion.ok && typeof completion.retry !== "undefined") {
+console.error(
+"Task run completed with error and wants to retry, won't send task run completed notification"
+);
 return;
 }
-if (this._child?.connected
-
-
-
-
+if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+console.error(
+"Child process not connected or being killed, can't send task run completed notification"
+);
+return;
 }
+this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
+version: "v2",
+completion
+});
 }
 waitCompletedNotification() {
-if (this._child?.connected
-
+if (!this._child?.connected || this._isBeingKilled || this._child.killed) {
+console.error(
+"Child process not connected or being killed, can't send wait completed notification"
+);
+return;
 }
+this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
 }
-async #handleExit(code) {
+async #handleExit(code, signal) {
+console.log("handling child exit", { code, signal });
 for (const [id, status] of this._attemptStatuses.entries()) {
 if (status === "PENDING") {
+console.log("found pending attempt", { id });
 this._attemptStatuses.set(id, "REJECTED");
 const attemptPromise = this._attemptPromises.get(id);
 if (!attemptPromise) {
@@ -529,124 +1034,73 @@ var TaskRunProcess = class {
 const { rejecter } = attemptPromise;
 if (this._isBeingCancelled) {
 rejecter(new CancelledProcessError());
+} else if (this._gracefulExitTimeoutElapsed) {
+rejecter(new GracefulExitTimeoutError());
 } else if (this._isBeingKilled) {
 rejecter(new CleanupProcessError());
 } else {
-rejecter(
+rejecter(
+new UnexpectedExitError(
+code ?? -1,
+signal,
+this._stderr.length ? this._stderr.join("\n") : void 0
+)
+);
 }
 }
 }
-this.onExit.post(code);
+this.onExit.post({ code, signal, pid: this.pid });
 }
 #handleLog(data) {
-
-return;
-}
-console.log(
-`[${this.metadata.version}][${this._currentExecution.run.id}.${this._currentExecution.attempt.number}] ${data.toString()}`
-);
+console.log(data.toString());
 }
 #handleStdErr(data) {
-
-
-
-
-console.error(`[${this.metadata.version}] ${data.toString()}`);
-return;
+const text = data.toString();
+console.error(text);
+if (this._stderr.length > 100) {
+this._stderr.shift();
 }
-
-`[${this.metadata.version}][${this._currentExecution.run.id}.${this._currentExecution.attempt.number}] ${data.toString()}`
-);
+this._stderr.push(text);
 }
-
-
-
+async kill(signal, timeoutInMs) {
+this._isBeingKilled = true;
+const killTimeout = this.onExit.waitFor(timeoutInMs);
+this.onIsBeingKilled.post(this);
+this._child?.kill(signal);
+if (timeoutInMs) {
+await killTimeout;
 }
 }
+get isBeingKilled() {
+return this._isBeingKilled || this._child?.killed;
+}
+get pid() {
+return this._childPid;
+}
 };
 
 // src/workers/prod/entry-point.ts
-import {
-
-
+import { checkpointSafeTimeout, unboundedTimeout } from "@trigger.dev/core/v3/utils/timers";
+import { randomUUID } from "node:crypto";
+import { readFile } from "node:fs/promises";
+import { createServer } from "node:http";
+import { setTimeout as timeout2 } from "node:timers/promises";
+var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
+var COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
 var COORDINATOR_PORT = Number(process.env.COORDINATOR_PORT || 50080);
-var
+var MACHINE_NAME = process.env.MACHINE_NAME || "local";
 var POD_NAME = process.env.POD_NAME || "some-pod";
 var SHORT_HASH = process.env.TRIGGER_CONTENT_HASH.slice(0, 9);
-var
+var logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
+var defaultBackoff = new ExponentialBackoff("FullJitter", {
+maxRetries: 5
+});
 var ProdWorker = class {
 constructor(port, host = "0.0.0.0") {
 this.host = host;
+process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM"));
 this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST);
-this.#backgroundWorker =
-projectConfig: __PROJECT_CONFIG__,
-env: {
-...gatherProcessEnv(),
-TRIGGER_API_URL: this.apiUrl,
-TRIGGER_SECRET_KEY: this.apiKey,
-OTEL_EXPORTER_OTLP_ENDPOINT: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318"
-},
-contentHash: this.contentHash
-});
-this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
-this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId });
-});
-this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
-this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
-});
-this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => {
-logger2.log("onCancelCheckpoint() clearing paused state, don't wait for post start hook", {
-paused: this.paused,
-nextResumeAfter: this.nextResumeAfter,
-waitForPostStart: this.waitForPostStart
-});
-this.paused = false;
-this.nextResumeAfter = void 0;
-this.waitForPostStart = false;
-this.#coordinatorSocket.socket.emit("CANCEL_CHECKPOINT", { version: "v1" });
-});
-this.#backgroundWorker.onWaitForDuration.attach(async (message) => {
-if (!this.attemptFriendlyId) {
-logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
-return;
-}
-const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-"WAIT_FOR_DURATION",
-{
-...message,
-attemptFriendlyId: this.attemptFriendlyId
-}
-);
-this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
-});
-this.#backgroundWorker.onWaitForTask.attach(async (message) => {
-if (!this.attemptFriendlyId) {
-logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
-return;
-}
-const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-"WAIT_FOR_TASK",
-{
-...message,
-attemptFriendlyId: this.attemptFriendlyId
-}
-);
-this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
-});
-this.#backgroundWorker.onWaitForBatch.attach(async (message) => {
-if (!this.attemptFriendlyId) {
-logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
-return;
-}
-const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
-"WAIT_FOR_BATCH",
-{
-...message,
-attemptFriendlyId: this.attemptFriendlyId
-}
-);
-this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
-});
+this.#backgroundWorker = this.#createBackgroundWorker();
 this.#httpPort = port;
 this.#httpServer = this.#createHttpServer();
 }
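The TaskRunProcess.kill() and #tryGracefulExit()/#tryForcefulExit() additions above all follow the same escalation pattern: wait on the Evt-based onExit event with a timeout, and only escalate to SIGKILL if the child has not exited. A rough, hypothetical sketch of that pattern using the same evt API the diff relies on (the child variable and 5-second timeouts are assumptions for the example, not shipped code):

// Illustrative sketch only: SIGTERM first, SIGKILL if the exit event doesn't fire in time.
import { Evt } from "evt";

const onExit = Evt.create();
child.on("exit", (code, signal) => onExit.post({ code, signal }));

async function killGracefully(child) {
  try {
    const exited = onExit.waitFor(5e3); // rejects if nothing is posted within 5s
    child.kill("SIGTERM");
    await exited;
  } catch {
    const forced = onExit.waitFor(5e3);
    child.kill("SIGKILL");
    await forced; // if this also times out, the caller gives up (cf. SigKillTimeoutProcessError)
  }
}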
@@ -665,68 +1119,461 @@ var ProdWorker = class {
 attemptFriendlyId;
 nextResumeAfter;
 waitForPostStart = false;
+connectionCount = 0;
+waitForTaskReplay;
+waitForBatchReplay;
+readyForLazyAttemptReplay;
+submitAttemptCompletionReplay;
+durationResumeFallback;
 #httpPort;
 #backgroundWorker;
 #httpServer;
 #coordinatorSocket;
-async #
-
-
-
-
-
-
+async #handleSignal(signal) {
+logger.log("Received signal", { signal });
+if (signal === "SIGTERM") {
+let gracefulExitTimeoutElapsed = false;
+if (this.executing) {
+const terminationGracePeriodSeconds = 60 * 60;
+logger.log("Waiting for attempt to complete before exiting", {
+terminationGracePeriodSeconds
+});
+await timeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
+gracefulExitTimeoutElapsed = true;
+logger.log("Termination timeout reached, exiting gracefully.");
+} else {
+logger.log("Not executing, exiting immediately.");
+}
+await this.#exitGracefully(gracefulExitTimeoutElapsed);
 return;
 }
+logger.log("Unhandled signal", { signal });
+}
+async #exitGracefully(gracefulExitTimeoutElapsed = false, exitCode = 0) {
+await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
+if (!gracefulExitTimeoutElapsed) {
+process.exit(exitCode);
+}
+}
+async #reconnectAfterPostStart() {
+this.waitForPostStart = false;
+this.#coordinatorSocket.close();
+this.connectionCount = 0;
+let coordinatorHost = COORDINATOR_HOST;
 try {
-
-"
-
-
-
-
-
-
-
-
+if (this.runningInKubernetes) {
+coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
+"\n",
+""
+);
+logger.log("reconnecting", {
+coordinatorHost: {
+fromEnv: COORDINATOR_HOST,
+fromVolume: coordinatorHost,
+current: this.#coordinatorSocket.socket.io.opts.hostname
+}
+});
+}
+} catch (error) {
+logger.error("taskinfo read error during reconnect", {
+error: error instanceof Error ? error.message : error
 });
+} finally {
 this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
-} catch (error) {
-logger2.error("taskinfo read error during reconnect", { error });
-this.#coordinatorSocket.connect();
 }
 }
-
-
-
+// MARK: TASK WAIT
+async #waitForTaskHandler(message, replayIdempotencyKey) {
+const waitForTask = await defaultBackoff.execute(async ({ retry }) => {
+logger.log("Wait for task with backoff", { retry });
+if (!this.attemptFriendlyId) {
+logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+throw new ExponentialBackoff.StopRetrying("No attempt ID");
+}
+return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_TASK", {
+version: "v2",
+friendlyId: message.friendlyId,
+attemptFriendlyId: this.attemptFriendlyId
+});
+});
+if (!waitForTask.success) {
+logger.error("Failed to wait for task with backoff", {
+cause: waitForTask.cause,
+error: waitForTask.error
+});
+this.#emitUnrecoverableError(
+"WaitForTaskFailed",
+`${waitForTask.cause}: ${waitForTask.error}`
+);
+return;
+}
+const { willCheckpointAndRestore } = waitForTask.result;
+await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
+if (willCheckpointAndRestore) {
+if (!this.waitForTaskReplay) {
+this.waitForTaskReplay = {
+message,
+attempt: 1,
+idempotencyKey: randomUUID()
+};
+} else {
+if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+logger.error(
+"wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+);
+return;
+}
+this.waitForTaskReplay.attempt++;
+}
+}
+}
+// MARK: BATCH WAIT
+async #waitForBatchHandler(message, replayIdempotencyKey) {
+const waitForBatch = await defaultBackoff.execute(async ({ retry }) => {
+logger.log("Wait for batch with backoff", { retry });
+if (!this.attemptFriendlyId) {
+logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+throw new ExponentialBackoff.StopRetrying("No attempt ID");
+}
+return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_BATCH", {
+version: "v2",
+batchFriendlyId: message.batchFriendlyId,
+runFriendlyIds: message.runFriendlyIds,
+attemptFriendlyId: this.attemptFriendlyId
+});
+});
+if (!waitForBatch.success) {
+logger.error("Failed to wait for batch with backoff", {
+cause: waitForBatch.cause,
+error: waitForBatch.error
+});
+this.#emitUnrecoverableError(
+"WaitForBatchFailed",
+`${waitForBatch.cause}: ${waitForBatch.error}`
+);
+return;
+}
+const { willCheckpointAndRestore } = waitForBatch.result;
+await this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
 if (willCheckpointAndRestore) {
-this.
-
-
+if (!this.waitForBatchReplay) {
+this.waitForBatchReplay = {
+message,
+attempt: 1,
+idempotencyKey: randomUUID()
+};
+} else {
+if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+logger.error(
+"wait for task handler called with mismatched idempotency key, won't overwrite replay request"
+);
+return;
+}
+this.waitForBatchReplay.attempt++;
+}
+}
+}
+// MARK: WORKER CREATION
+#createBackgroundWorker() {
+const backgroundWorker = new ProdBackgroundWorker("worker.js", {
+projectConfig: __PROJECT_CONFIG__,
+env: {
+...gatherProcessEnv(),
+TRIGGER_API_URL: this.apiUrl,
+TRIGGER_SECRET_KEY: this.apiKey,
+OTEL_EXPORTER_OTLP_ENDPOINT: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318"
+},
+contentHash: this.contentHash
+});
+backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
+logger.log("onTaskHeartbeat", { attemptFriendlyId });
+this.#coordinatorSocket.socket.volatile.emit("TASK_HEARTBEAT", {
+version: "v1",
+attemptFriendlyId
+});
+});
+backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
+logger.log("onTaskRunHeartbeat", { runId });
+this.#coordinatorSocket.socket.volatile.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
+});
+backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
+logger.log("onCreateTaskRunAttempt()", { message });
+const createAttempt = await defaultBackoff.execute(async ({ retry }) => {
+logger.log("Create task run attempt with backoff", { retry });
+return await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CREATE_TASK_RUN_ATTEMPT", {
+version: "v1",
+runId: message.runId
+});
+});
+if (!createAttempt.success) {
+backgroundWorker.attemptCreatedNotification.post({
+success: false,
+reason: `Failed to create attempt with backoff due to ${createAttempt.cause}. ${createAttempt.error}`
+});
+return;
+}
+if (!createAttempt.result.success) {
+backgroundWorker.attemptCreatedNotification.post({
+success: false,
+reason: createAttempt.result.reason
+});
+return;
+}
+backgroundWorker.attemptCreatedNotification.post({
+success: true,
+execution: createAttempt.result.executionPayload.execution
+});
+});
+backgroundWorker.attemptCreatedNotification.attach((message) => {
+logger.log("attemptCreatedNotification", {
+success: message.success,
+...message.success ? {
+attempt: message.execution.attempt,
+queue: message.execution.queue,
+worker: message.execution.worker,
+machine: message.execution.machine
+} : {
+reason: message.reason
+}
+});
+if (!message.success) {
+return;
+}
+this.attemptFriendlyId = message.execution.attempt.id;
+});
+backgroundWorker.onWaitForDuration.attach(async (message) => {
+logger.log("onWaitForDuration", { ...message, drift: Date.now() - message.now });
+noResume: {
+const { ms, waitThresholdInMs } = message;
+const internalTimeout = unboundedTimeout(ms, "internal");
+const checkpointSafeInternalTimeout = checkpointSafeTimeout(ms);
+if (ms < waitThresholdInMs) {
+await internalTimeout;
+break noResume;
+}
+const waitForDuration = await defaultBackoff.execute(async ({ retry }) => {
+logger.log("Wait for duration with backoff", { retry });
+if (!this.attemptFriendlyId) {
+logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+throw new ExponentialBackoff.StopRetrying("No attempt ID");
+}
+return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("WAIT_FOR_DURATION", {
+...message,
+attemptFriendlyId: this.attemptFriendlyId
+});
+});
+if (!waitForDuration.success) {
+logger.error("Failed to wait for duration with backoff", {
+cause: waitForDuration.cause,
+error: waitForDuration.error
+});
+this.#emitUnrecoverableError(
+"WaitForDurationFailed",
+`${waitForDuration.cause}: ${waitForDuration.error}`
+);
+return;
+}
+const { willCheckpointAndRestore } = waitForDuration.result;
+if (!willCheckpointAndRestore) {
+await internalTimeout;
+break noResume;
+}
+await this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
+await Promise.race([internalTimeout, checkpointSafeInternalTimeout]);
+try {
+const { checkpointCanceled } = await this.#coordinatorSocket.socket.timeout(15e3).emitWithAck("CANCEL_CHECKPOINT", {
+version: "v2",
+reason: "WAIT_FOR_DURATION"
+});
+logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
+if (checkpointCanceled) {
+break noResume;
+}
+logger.log("Waiting for external duration resume as we may have been restored");
+const idempotencyKey = randomUUID();
+this.durationResumeFallback = { idempotencyKey };
+setTimeout(() => {
+if (!this.durationResumeFallback) {
+logger.error("Already resumed after duration, skipping fallback");
+return;
+}
+if (this.durationResumeFallback.idempotencyKey !== idempotencyKey) {
+logger.error("Duration resume idempotency key mismatch, skipping fallback");
+return;
+}
+logger.log("Resuming after duration with fallback");
+this.#resumeAfterDuration();
+}, 15e3);
+} catch (error) {
+logger.debug("Checkpoint cancellation timed out", { error });
+break noResume;
+}
+return;
+}
+this.#resumeAfterDuration();
+});
+backgroundWorker.onWaitForTask.attach(this.#waitForTaskHandler.bind(this));
+backgroundWorker.onWaitForBatch.attach(this.#waitForBatchHandler.bind(this));
+return backgroundWorker;
+}
+async #prepareForWait(reason, willCheckpointAndRestore) {
+logger.log(`prepare for ${reason}`, { willCheckpointAndRestore });
+if (!willCheckpointAndRestore) {
 return;
+}
+this.paused = true;
+this.nextResumeAfter = reason;
+this.waitForPostStart = true;
+await this.#prepareForCheckpoint();
 }
-
-
+// MARK: RETRY PREP
+async #prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode) {
+logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit, exitCode });
 if (shouldExit) {
 if (willCheckpointAndRestore) {
-
+logger.error("WARNING: Will checkpoint but also requested exit. This won't end well.");
 }
-await this.#
-
+await this.#exitGracefully(false, exitCode);
+return;
 }
+this.paused = false;
+this.waitForPostStart = false;
 this.executing = false;
 this.attemptFriendlyId = void 0;
-if (willCheckpointAndRestore) {
-this.waitForPostStart = true;
-this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
+if (!willCheckpointAndRestore) {
 return;
 }
+this.waitForPostStart = true;
+await this.#prepareForCheckpoint(false);
+}
+// MARK: CHECKPOINT PREP
+async #prepareForCheckpoint(flush = true) {
+if (flush) {
+try {
+await this.#backgroundWorker.flushTelemetry();
+} catch (error) {
+logger.error(
+"Failed to flush telemetry while preparing for checkpoint, will proceed anyway",
+{ error }
+);
+}
+}
+try {
+await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+} catch (error) {
+logger.error(
+"Failed to kill previous worker while preparing for checkpoint, will proceed anyway",
+{ error }
+);
+}
+this.#readyForCheckpoint();
 }
 #resumeAfterDuration() {
 this.paused = false;
 this.nextResumeAfter = void 0;
+this.waitForPostStart = false;
 this.#backgroundWorker.waitCompletedNotification();
 }
+async #readyForLazyAttempt() {
+const idempotencyKey = randomUUID();
+this.readyForLazyAttemptReplay = {
+idempotencyKey
+};
+for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(3)) {
+if (retry > 0) {
+logger.log("retrying ready for lazy attempt", { retry });
+}
+this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
+version: "v1",
+runId: this.runId,
+totalCompletions: this.completed.size
+});
+await timeout2(delay.milliseconds);
+if (!this.readyForLazyAttemptReplay) {
+logger.error("replay ready for lazy attempt cancelled, discarding", {
logger.error("replay ready for lazy attempt cancelled, discarding", {
|
|
1494
|
+
idempotencyKey
|
|
1495
|
+
});
|
|
1496
|
+
return;
|
|
1497
|
+
}
|
|
1498
|
+
if (idempotencyKey !== this.readyForLazyAttemptReplay.idempotencyKey) {
|
|
1499
|
+
logger.error("replay ready for lazy attempt idempotency key mismatch, discarding", {
|
|
1500
|
+
idempotencyKey,
|
|
1501
|
+
newIdempotencyKey: this.readyForLazyAttemptReplay.idempotencyKey
|
|
1502
|
+
});
|
|
1503
|
+
return;
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
this.#failRun(this.runId, "Failed to receive execute request in a reasonable time");
|
|
1507
|
+
}
|
|
1508
|
+
#readyForCheckpoint() {
|
|
1509
|
+
this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
|
|
1510
|
+
}
|
|
1511
|
+
#failRun(anyRunId, error) {
|
|
1512
|
+
logger.error("Failing run", { anyRunId, error });
|
|
1513
|
+
const completion = {
|
|
1514
|
+
ok: false,
|
|
1515
|
+
id: anyRunId,
|
|
1516
|
+
retry: void 0,
|
|
1517
|
+
error: error instanceof Error ? {
|
|
1518
|
+
type: "BUILT_IN_ERROR",
|
|
1519
|
+
name: error.name,
|
|
1520
|
+
message: error.message,
|
|
1521
|
+
stackTrace: error.stack ?? ""
|
|
1522
|
+
} : {
|
|
1523
|
+
type: "BUILT_IN_ERROR",
|
|
1524
|
+
name: "UnknownError",
|
|
1525
|
+
message: String(error),
|
|
1526
|
+
stackTrace: ""
|
|
1527
|
+
}
|
|
1528
|
+
};
|
|
1529
|
+
this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
|
|
1530
|
+
version: "v1",
|
|
1531
|
+
completion
|
|
1532
|
+
});
|
|
1533
|
+
}
|
|
1534
|
+
// MARK: ATTEMPT COMPLETION
|
|
1535
|
+
async #submitAttemptCompletion(execution, completion, replayIdempotencyKey) {
|
|
1536
|
+
const taskRunCompleted = await defaultBackoff.execute(async ({ retry }) => {
|
|
1537
|
+
logger.log("Submit attempt completion with backoff", { retry });
|
|
1538
|
+
return await this.#coordinatorSocket.socket.timeout(2e4).emitWithAck("TASK_RUN_COMPLETED", {
|
|
1539
|
+
version: "v1",
|
|
1540
|
+
execution,
|
|
1541
|
+
completion
|
|
1542
|
+
});
|
|
1543
|
+
});
|
|
1544
|
+
if (!taskRunCompleted.success) {
|
|
1545
|
+
logger.error("Failed to complete lazy attempt with backoff", {
|
|
1546
|
+
cause: taskRunCompleted.cause,
|
|
1547
|
+
error: taskRunCompleted.error
|
|
1548
|
+
});
|
|
1549
|
+
this.#failRun(execution.run.id, taskRunCompleted.error);
|
|
1550
|
+
return;
|
|
1551
|
+
}
|
|
1552
|
+
const { willCheckpointAndRestore, shouldExit } = taskRunCompleted.result;
|
|
1553
|
+
logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
|
|
1554
|
+
const exitCode = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes2.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE ? EXIT_CODE_CHILD_NONZERO : 0;
|
|
1555
|
+
await this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode);
|
|
1556
|
+
if (willCheckpointAndRestore) {
|
|
1557
|
+
if (!this.submitAttemptCompletionReplay) {
|
|
1558
|
+
this.submitAttemptCompletionReplay = {
|
|
1559
|
+
message: {
|
|
1560
|
+
execution,
|
|
1561
|
+
completion
|
|
1562
|
+
},
|
|
1563
|
+
attempt: 1,
|
|
1564
|
+
idempotencyKey: randomUUID()
|
|
1565
|
+
};
|
|
1566
|
+
} else {
|
|
1567
|
+
if (replayIdempotencyKey && replayIdempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
|
|
1568
|
+
logger.error(
|
|
1569
|
+
"attempt completion handler called with mismatched idempotency key, won't overwrite replay request"
|
|
1570
|
+
);
|
|
1571
|
+
return;
|
|
1572
|
+
}
|
|
1573
|
+
this.submitAttemptCompletionReplay.attempt++;
|
|
1574
|
+
}
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
730
1577
|
#returnValidatedExtraHeaders(headers) {
|
|
731
1578
|
for (const [key, value] of Object.entries(headers)) {
|
|
732
1579
|
if (value === void 0) {
|
|
@@ -735,9 +1582,10 @@ var ProdWorker = class {
}
return headers;
}
+ // MARK: COORDINATOR SOCKET
#createCoordinatorSocket(host) {
const extraHeaders = this.#returnValidatedExtraHeaders({
- "x-machine-name":
+ "x-machine-name": MACHINE_NAME,
"x-pod-name": POD_NAME,
"x-trigger-content-hash": this.contentHash,
"x-trigger-project-ref": this.projectRef,
@@ -749,247 +1597,400 @@ var ProdWorker = class {
if (this.attemptFriendlyId) {
extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId;
}
-
-
-
- extraHeaders
- });
- const coordinatorConnection = new ZodSocketConnection2({
+ logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`);
+ logger.debug(`connecting with extra headers`, { extraHeaders });
+ const coordinatorConnection = new ZodSocketConnection({
namespace: "prod-worker",
host,
port: COORDINATOR_PORT,
clientMessages: ProdWorkerToCoordinatorMessages,
serverMessages: CoordinatorToProdWorkerMessages,
extraHeaders,
+ ioOptions: {
+ reconnectionDelay: 1e3,
+ reconnectionDelayMax: 3e3
+ },
handlers: {
- RESUME_AFTER_DEPENDENCY: async (
+ RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
if (!this.paused) {
-
- completions: message.completions,
- executions: message.executions
- });
+ logger.error("Failed to resume after dependency: Worker not paused");
return;
}
- if (
-
- completions: message.completions,
- executions: message.executions
- });
- return;
- }
- if (message.completions.length === 0 || message.executions.length === 0) {
- logger2.error("no completions or executions", {
- completions: message.completions,
- executions: message.executions
- });
+ if (completions.length === 0) {
+ logger.error("Failed to resume after dependency: No completions");
return;
}
if (this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH") {
-
+ logger.error("Failed to resume after dependency: Invalid next resume", {
nextResumeAfter: this.nextResumeAfter
});
return;
}
- if (this.nextResumeAfter === "WAIT_FOR_TASK" &&
-
-
-
-
+ if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) {
+ logger.error(
+ "Failed to resume after dependency: Waiting for single task but got multiple completions",
+ {
+ completions
+ }
+ );
return;
}
+ switch (this.nextResumeAfter) {
+ case "WAIT_FOR_TASK": {
+ this.waitForTaskReplay = void 0;
+ break;
+ }
+ case "WAIT_FOR_BATCH": {
+ this.waitForBatchReplay = void 0;
+ break;
+ }
+ }
this.paused = false;
this.nextResumeAfter = void 0;
-
-
- const
- if (!completion
+ this.waitForPostStart = false;
+ for (let i = 0; i < completions.length; i++) {
+ const completion = completions[i];
+ if (!completion)
continue;
- this.#backgroundWorker.taskRunCompletedNotification(completion
+ this.#backgroundWorker.taskRunCompletedNotification(completion);
}
},
RESUME_AFTER_DURATION: async (message) => {
if (!this.paused) {
-
+ logger.error("worker not paused", {
attemptId: message.attemptId
});
return;
}
if (this.nextResumeAfter !== "WAIT_FOR_DURATION") {
-
+ logger.error("not waiting to resume after duration", {
nextResumeAfter: this.nextResumeAfter
});
return;
}
+ this.durationResumeFallback = void 0;
this.#resumeAfterDuration();
},
+ // Deprecated: This will never get called as this worker supports lazy attempts. It's only here for a quick view of the flow old workers use.
EXECUTE_TASK_RUN: async ({ executionPayload }) => {
if (this.executing) {
-
+ logger.error("dropping execute request, already executing");
return;
}
if (this.completed.has(executionPayload.execution.attempt.id)) {
-
+ logger.error("dropping execute request, already completed");
return;
}
this.executing = true;
this.attemptFriendlyId = executionPayload.execution.attempt.id;
const completion = await this.#backgroundWorker.executeTaskRun(executionPayload);
-
+ logger.log("completed", completion);
this.completed.add(executionPayload.execution.attempt.id);
- await this.#backgroundWorker.flushTelemetry();
const { willCheckpointAndRestore, shouldExit } = await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
version: "v1",
execution: executionPayload.execution,
completion
});
-
- this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+ logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+ await this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+ },
+ EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+ this.readyForLazyAttemptReplay = void 0;
+ if (this.executing) {
+ logger.error("dropping execute request, already executing");
+ return;
+ }
+ const attemptCount = message.lazyPayload.attemptCount ?? 0;
+ logger.log("execute attempt counts", { attemptCount, completed: this.completed.size });
+ if (this.completed.size > 0 && this.completed.size >= attemptCount + 1) {
+ logger.error("dropping execute request, already completed");
+ return;
+ }
+ this.executing = true;
+ try {
+ const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
+ logger.log("completed", completion);
+ this.completed.add(execution.attempt.id);
+ await this.#submitAttemptCompletion(execution, completion);
+ } catch (error) {
+ logger.error("Failed to complete lazy attempt", {
+ error
+ });
+ this.#failRun(message.lazyPayload.runId, error);
+ }
},
REQUEST_ATTEMPT_CANCELLATION: async (message) => {
if (!this.executing) {
+ logger.log("dropping cancel request, not executing", { status: this.#status });
return;
}
+ logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status });
await this.#backgroundWorker.cancelAttempt(message.attemptId);
},
- REQUEST_EXIT: async () => {
+ REQUEST_EXIT: async (message) => {
+ if (message.version === "v2" && message.delayInMs) {
+ logger.log("exit requested with delay", { delayInMs: message.delayInMs });
+ await timeout2(message.delayInMs);
+ }
this.#coordinatorSocket.close();
process.exit(0);
},
READY_FOR_RETRY: async (message) => {
if (this.completed.size < 1) {
+ logger.error("Received READY_FOR_RETRY but no completions yet. This is a bug.");
return;
}
- this
-
- runId: this.runId,
- totalCompletions: this.completed.size
- });
+ this.submitAttemptCompletionReplay = void 0;
+ await this.#readyForLazyAttempt();
}
},
-
-
-
-
-
-
-
-
-
+ // MARK: ON CONNECTION
+ onConnection: async (socket, handler, sender, logger2) => {
+ logger2.log("connected to coordinator", {
+ status: this.#status,
+ connectionCount: ++this.connectionCount
+ });
+ socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId });
+ try {
+ if (this.waitForPostStart) {
+ logger2.log("skip connection handler, waiting for post start hook");
+ return;
+ }
+ if (this.paused) {
+ if (!this.nextResumeAfter) {
+ logger2.error("Missing next resume reason", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoNextResume",
+ "Next resume reason not set while resuming from paused state"
+ );
+ return;
+ }
+ if (!this.attemptFriendlyId) {
+ logger2.error("Missing friendly ID", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoAttemptId",
+ "Attempt ID not set while resuming from paused state"
+ );
+ return;
+ }
+ socket.emit("READY_FOR_RESUME", {
version: "v1",
-
-
+ attemptFriendlyId: this.attemptFriendlyId,
+ type: this.nextResumeAfter
});
-
-
-
-
- logger3.info("indexing failure, shutting down..");
- process.exit(1);
- }
- } catch (e) {
- if (e instanceof TaskMetadataParseError) {
- logger3.error("tasks metadata parse error", { message: e.zodIssues, tasks: e.tasks });
+ return;
+ }
+ if (process.env.INDEX_TASKS === "true") {
+ const failIndex = (error) => {
socket.emit("INDEXING_FAILED", {
version: "v1",
deploymentId: this.deploymentId,
- error
+ error
+ });
+ };
+ process.removeAllListeners("uncaughtException");
+ process.on("uncaughtException", (error) => {
+ console.error("Uncaught exception while indexing", error);
+ failIndex(error);
+ });
+ try {
+ const taskResources = await this.#initializeWorker();
+ const indexTasks = await defaultBackoff.maxRetries(3).execute(async () => {
+ return await socket.timeout(2e4).emitWithAck("INDEX_TASKS", {
+ version: "v2",
+ deploymentId: this.deploymentId,
+ ...taskResources,
+ supportsLazyAttempts: true
+ });
+ });
+ if (!indexTasks.success || !indexTasks.result.success) {
+ logger2.error("indexing failure, shutting down..", { indexTasks });
+ process.exit(1);
+ } else {
+ logger2.info("indexing done, shutting down..");
+ process.exit(0);
+ }
+ } catch (e) {
+ const stderr = this.#backgroundWorker.stderr.join("\n");
+ if (e instanceof TaskMetadataParseError) {
+ logger2.error("tasks metadata parse error", {
+ zodIssues: e.zodIssues,
+ tasks: e.tasks
+ });
+ failIndex({
name: "TaskMetadataParseError",
message: "There was an error parsing the task metadata",
- stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks })
-
-
-
-
- socket.emit("INDEXING_FAILED", {
- version: "v1",
- deploymentId: this.deploymentId,
- error: {
+ stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
+ stderr
+ });
+ } else if (e instanceof UncaughtExceptionError) {
+ const error = {
name: e.originalError.name,
message: e.originalError.message,
- stack: e.originalError.stack
-
-
-
-
-
-
- deploymentId: this.deploymentId,
- error: {
+ stack: e.originalError.stack,
+ stderr
+ };
+ logger2.error("uncaught exception", { originalError: error });
+ failIndex(error);
+ } else if (e instanceof Error) {
+ const error = {
name: e.name,
message: e.message,
- stack: e.stack
-
-
-
-
-
-
-
- error: {
+ stack: e.stack,
+ stderr
+ };
+ logger2.error("error", { error });
+ failIndex(error);
+ } else if (typeof e === "string") {
+ logger2.error("string error", { error: { message: e } });
+ failIndex({
name: "Error",
- message: e
-
-
-
-
-
- version: "v1",
- deploymentId: this.deploymentId,
- error: {
+ message: e,
+ stderr
+ });
+ } else {
+ logger2.error("unknown error", { error: e });
+ failIndex({
name: "Error",
- message: "Unknown error"
-
-
+ message: "Unknown error",
+ stderr
+ });
+ }
+ await timeout2(1e3);
+ process.exit(EXIT_CODE_ALREADY_HANDLED);
}
- await setTimeout2(200);
- process.exit(1);
}
-
- if (this.paused) {
- if (!this.nextResumeAfter) {
- return;
- }
- if (!this.attemptFriendlyId) {
- logger3.error("Missing friendly ID");
+ if (this.executing) {
return;
}
-
-
+ process.removeAllListeners("uncaughtException");
+ process.on("uncaughtException", (error) => {
+ console.error("Uncaught exception during run", error);
+ this.#failRun(this.runId, error);
+ });
+ await this.#readyForLazyAttempt();
+ } catch (error) {
+ logger2.error("connection handler error", { error });
+ } finally {
+ if (this.connectionCount === 1) {
return;
}
-
- version: "v1",
- attemptFriendlyId: this.attemptFriendlyId,
- type: this.nextResumeAfter
- });
- return;
- }
- if (this.executing) {
- return;
+ this.#handleReplays();
}
- socket.emit("READY_FOR_EXECUTION", {
- version: "v1",
- runId: this.runId,
- totalCompletions: this.completed.size
- });
},
- onError: async (socket, err,
-
+ onError: async (socket, err, logger2) => {
+ logger2.error("onError", {
error: {
name: err.name,
message: err.message
}
});
- await this.#reconnect();
- },
- onDisconnect: async (socket, reason, description, logger3) => {
}
});
return coordinatorConnection;
}
+ // MARK: REPLAYS
+ async #handleReplays() {
+ const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
+ const replayCancellationDelay = 2e4;
+ if (this.waitForTaskReplay) {
+ logger.log("replaying wait for task", { ...this.waitForTaskReplay });
+ const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
+ await timeout2(replayCancellationDelay);
+ if (!this.waitForTaskReplay) {
+ logger.error("wait for task replay cancelled, discarding", {
+ originalMessage: { idempotencyKey, message, attempt }
+ });
+ return;
+ }
+ if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
+ logger.error("wait for task replay idempotency key mismatch, discarding", {
+ originalMessage: { idempotencyKey, message, attempt },
+ newMessage: this.waitForTaskReplay
+ });
+ return;
+ }
+ try {
+ await backoff.wait(attempt + 1);
+ await this.#waitForTaskHandler(message);
+ } catch (error) {
+ if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+ logger.error("wait for task replay retry limit exceeded", { error });
+ } else {
+ logger.error("wait for task replay error", { error });
+ }
+ }
+ return;
+ }
+ if (this.waitForBatchReplay) {
+ logger.log("replaying wait for batch", {
+ ...this.waitForBatchReplay,
+ cancellationDelay: replayCancellationDelay
+ });
+ const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
+ await timeout2(replayCancellationDelay);
+ if (!this.waitForBatchReplay) {
+ logger.error("wait for batch replay cancelled, discarding", {
+ originalMessage: { idempotencyKey, message, attempt }
+ });
+ return;
+ }
+ if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
+ logger.error("wait for batch replay idempotency key mismatch, discarding", {
+ originalMessage: { idempotencyKey, message, attempt },
+ newMessage: this.waitForBatchReplay
+ });
+ return;
+ }
+ try {
+ await backoff.wait(attempt + 1);
+ await this.#waitForBatchHandler(message);
+ } catch (error) {
+ if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+ logger.error("wait for batch replay retry limit exceeded", { error });
+ } else {
+ logger.error("wait for batch replay error", { error });
+ }
+ }
+ return;
+ }
+ if (this.submitAttemptCompletionReplay) {
+ logger.log("replaying attempt completion", {
+ ...this.submitAttemptCompletionReplay,
+ cancellationDelay: replayCancellationDelay
+ });
+ const { idempotencyKey, message, attempt } = this.submitAttemptCompletionReplay;
+ await timeout2(replayCancellationDelay);
+ if (!this.submitAttemptCompletionReplay) {
+ logger.error("attempt completion replay cancelled, discarding", {
+ originalMessage: { idempotencyKey, message, attempt }
+ });
+ return;
+ }
+ if (idempotencyKey !== this.submitAttemptCompletionReplay.idempotencyKey) {
+ logger.error("attempt completion replay idempotency key mismatch, discarding", {
+ originalMessage: { idempotencyKey, message, attempt },
+ newMessage: this.submitAttemptCompletionReplay
+ });
+ return;
+ }
+ try {
+ await backoff.wait(attempt + 1);
+ await this.#submitAttemptCompletion(message.execution, message.completion, idempotencyKey);
+ } catch (error) {
+ if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
+ logger.error("attempt completion replay retry limit exceeded", { error });
+ } else {
+ logger.error("attempt completion replay error", { error });
+ }
+ }
+ return;
+ }
+ }
+ // MARK: HTTP SERVER
#createHttpServer() {
const httpServer = createServer(async (req, res) => {
-
+ logger.log(`[${req.method}]`, req.url);
const reply = new HttpReply(res);
try {
const url = new URL(req.url ?? "", `http://${req.headers.host}`);
@@ -998,35 +1999,27 @@ var ProdWorker = class {
return reply.text("ok");
}
case "/status": {
- return reply.json(
- executing: this.executing,
- pause: this.paused,
- nextResumeAfter: this.nextResumeAfter
- });
+ return reply.json(this.#status);
}
case "/connect": {
this.#coordinatorSocket.connect();
return reply.text("Connected to coordinator");
}
case "/close": {
- await this.#coordinatorSocket.sendWithAck("LOG", {
- version: "v1",
- text: `[${req.method}] ${req.url}`
- });
this.#coordinatorSocket.close();
+ this.connectionCount = 0;
return reply.text("Disconnected from coordinator");
}
case "/test": {
- await this.#coordinatorSocket.
- version: "v1"
- text: `[${req.method}] ${req.url}`
+ await this.#coordinatorSocket.socket.timeout(1e4).emitWithAck("TEST", {
+ version: "v1"
});
return reply.text("Received ACK from coordinator");
}
case "/preStop": {
const cause = PreStopCauses.safeParse(url.searchParams.get("cause"));
if (!cause.success) {
-
+ logger.error("Failed to parse cause", { cause });
return reply.text("Failed to parse cause", 400);
}
switch (cause.data) {
@@ -1034,17 +2027,16 @@ var ProdWorker = class {
break;
}
default: {
-
+ logger.error("Unhandled cause", { cause: cause.data });
break;
}
}
- logger2.log("preStop", { url: req.url });
return reply.text("preStop ok");
}
case "/postStart": {
const cause = PostStartCauses.safeParse(url.searchParams.get("cause"));
if (!cause.success) {
-
+ logger.error("Failed to parse cause", { cause });
return reply.text("Failed to parse cause", 400);
}
switch (cause.data) {
@@ -1055,11 +2047,11 @@ var ProdWorker = class {
break;
}
case "restore": {
- await this.#
+ await this.#reconnectAfterPostStart();
break;
}
default: {
-
+ logger.error("Unhandled cause", { cause: cause.data });
break;
}
}
@@ -1070,7 +2062,7 @@ var ProdWorker = class {
}
}
} catch (error) {
-
+ logger.error("HTTP server error", { error });
reply.empty(500);
}
});
@@ -1078,15 +2070,15 @@ var ProdWorker = class {
socket.end("HTTP/1.1 400 Bad Request\r\n\r\n");
});
httpServer.on("listening", () => {
-
+ logger.log("http server listening on port", this.#httpPort);
});
httpServer.on("error", async (error) => {
if (error.code != "EADDRINUSE") {
return;
}
-
+ logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
this.#httpPort = getRandomPortNumber();
- await
+ await timeout2(100);
this.start();
});
return httpServer;
@@ -1096,8 +2088,12 @@ var ProdWorker = class {
await this.#backgroundWorker.initialize({ env: envVars });
let packageVersion;
const taskResources = [];
- if (!this.#backgroundWorker.tasks) {
- throw new Error(
+ if (!this.#backgroundWorker.tasks || this.#backgroundWorker.tasks.length === 0) {
+ throw new Error(
+ `Background Worker started without tasks. Searched in: ${__PROJECT_CONFIG__.triggerDirectories?.join(
+ ", "
+ )}`
+ );
}
for (const task of this.#backgroundWorker.tasks) {
taskResources.push(task);
@@ -1124,11 +2120,32 @@ var ProdWorker = class {
const data = await response.json();
return data?.variables ?? {};
}
+ get #status() {
+ return {
+ executing: this.executing,
+ paused: this.paused,
+ completed: this.completed.size,
+ nextResumeAfter: this.nextResumeAfter,
+ waitForPostStart: this.waitForPostStart,
+ attemptFriendlyId: this.attemptFriendlyId,
+ waitForTaskReplay: this.waitForTaskReplay,
+ waitForBatchReplay: this.waitForBatchReplay
+ };
+ }
+ #emitUnrecoverableError(name, message) {
+ this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", {
+ version: "v1",
+ error: {
+ name,
+ message
+ }
+ });
+ }
start() {
this.#httpServer.listen(this.#httpPort, this.host);
}
};
- var prodWorker = new ProdWorker(
+ var prodWorker = new ProdWorker(HTTP_SERVER_PORT);
prodWorker.start();
function gatherProcessEnv() {
const env = {
@@ -1139,7 +2156,8 @@ function gatherProcessEnv() {
LANG: process.env.LANG,
TERM: process.env.TERM,
NODE_PATH: process.env.NODE_PATH,
- HOME: process.env.HOME
+ HOME: process.env.HOME,
+ NODE_EXTRA_CA_CERTS: process.env.NODE_EXTRA_CA_CERTS
};
return Object.fromEntries(Object.entries(env).filter(([key, value]) => value !== void 0));
}