trigger.dev 3.0.0-beta.4 → 3.0.0-beta.40

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
@@ -3,9 +3,9 @@ import {
  CoordinatorToProdWorkerMessages,
  PostStartCauses,
  PreStopCauses,
- ProdWorkerToCoordinatorMessages,
- ZodSocketConnection as ZodSocketConnection2
+ ProdWorkerToCoordinatorMessages
  } from "@trigger.dev/core/v3";
+ import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";

  // ../core-apps/src/http.ts
  var HttpReply = class {
@@ -65,24 +65,6 @@ var SimpleLogger = class {
  }
  };

- // ../core-apps/src/provider.ts
- import {
- ClientToSharedQueueMessages,
- clientWebsocketMessages,
- PlatformToProviderMessages,
- ProviderToPlatformMessages,
- SharedQueueToClientMessages,
- ZodMessageSender,
- ZodSocketConnection
- } from "@trigger.dev/core/v3";
- var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
- var MACHINE_NAME = process.env.MACHINE_NAME || "local";
- var PLATFORM_HOST = process.env.PLATFORM_HOST || "127.0.0.1";
- var PLATFORM_WS_PORT = process.env.PLATFORM_WS_PORT || 3030;
- var PLATFORM_SECRET = process.env.PLATFORM_SECRET || "provider-secret";
- var SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ?? "false");
- var logger = new SimpleLogger(`[${MACHINE_NAME}]`);
-
  // src/workers/prod/entry-point.ts
  import { readFile } from "node:fs/promises";
  import { createServer } from "node:http";
@@ -93,9 +75,9 @@ import {
  ProdWorkerToChildMessages,
  SemanticInternalAttributes,
  TaskRunErrorCodes,
- ZodIpcConnection,
  correctErrorStackTrace
  } from "@trigger.dev/core/v3";
+ import { ZodIpcConnection } from "@trigger.dev/core/v3/zodIpc";
  import { Evt } from "evt";
  import { fork } from "node:child_process";

@@ -116,8 +98,6 @@ var TaskMetadataParseError = class extends Error {
  this.name = "TaskMetadataParseError";
  }
  };
-
- // src/workers/prod/backgroundWorker.ts
  var UnexpectedExitError = class extends Error {
  constructor(code) {
  super(`Unexpected exit with code ${code}`);
@@ -137,30 +117,73 @@ var CancelledProcessError = class extends Error {
  this.name = "CancelledProcessError";
  }
  };
+ var SigKillTimeoutProcessError = class extends Error {
+ constructor() {
+ super("Process kill timeout");
+ this.name = "SigKillTimeoutProcessError";
+ }
+ };
+ var GracefulExitTimeoutError = class extends Error {
+ constructor() {
+ super("Graceful exit timeout");
+ this.name = "GracefulExitTimeoutError";
+ }
+ };
+
+ // src/workers/prod/backgroundWorker.ts
  var ProdBackgroundWorker = class {
  constructor(path, params) {
  this.path = path;
  this.params = params;
  }
  _initialized = false;
+ /**
+ * @deprecated use onTaskRunHeartbeat instead
+ */
  onTaskHeartbeat = new Evt();
+ onTaskRunHeartbeat = new Evt();
  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
  preCheckpointNotification = Evt.create();
+ checkpointCanceledNotification = Evt.create();
  onReadyForCheckpoint = Evt.create();
  onCancelCheckpoint = Evt.create();
+ onCreateTaskRunAttempt = Evt.create();
+ attemptCreatedNotification = Evt.create();
  _onClose = new Evt();
  tasks = [];
+ stderr = [];
  _taskRunProcess;
+ _taskRunProcessesBeingKilled = /* @__PURE__ */ new Map();
  _closed = false;
- async close() {
+ async close(gracefulExitTimeoutElapsed = false) {
+ console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed });
  if (this._closed) {
  return;
  }
  this._closed = true;
  this.onTaskHeartbeat.detach();
- await this._taskRunProcess?.cleanup(true);
+ this.onTaskRunHeartbeat.detach();
+ await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed);
+ }
+ async #killTaskRunProcess(flush = true, initialSignal = "SIGTERM") {
+ console.log("Killing task run process", { flush, initialSignal, closed: this._closed });
+ if (this._closed || !this._taskRunProcess) {
+ return;
+ }
+ if (flush) {
+ await this.flushTelemetry();
+ }
+ const currentTaskRunProcess = this._taskRunProcess;
+ this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => {
+ console.error("Error while trying graceful exit", error);
+ });
+ console.log("Killed task run process, setting closed to true", {
+ closed: this._closed,
+ pid: currentTaskRunProcess.pid
+ });
+ this._closed = true;
  }
  async flushTelemetry() {
  await this._taskRunProcess?.cleanup(false);
@@ -194,6 +217,20 @@ var ProdBackgroundWorker = class {
  child.kill();
  reject(new Error("Worker timed out"));
  }, 1e4);
+ child.stdout?.on("data", (data) => {
+ console.log(data.toString());
+ });
+ child.stderr?.on("data", (data) => {
+ console.error(data.toString());
+ this.stderr.push(data.toString());
+ });
+ child.on("exit", (code) => {
+ if (!resolved) {
+ clearTimeout(timeout);
+ resolved = true;
+ reject(new Error(`Worker exited with code ${code}`));
+ }
+ });
  new ZodIpcConnection({
  listenSchema: ProdChildToWorkerMessages,
  emitSchema: ProdWorkerToChildMessages,
@@ -225,19 +262,6 @@ var ProdBackgroundWorker = class {
  }
  }
  });
- child.stdout?.on("data", (data) => {
- console.log(data.toString());
- });
- child.stderr?.on("data", (data) => {
- console.error(data.toString());
- });
- child.on("exit", (code) => {
- if (!resolved) {
- clearTimeout(timeout);
- resolved = true;
- reject(new Error(`Worker exited with code ${code}`));
- }
- });
  });
  this._initialized = true;
  }
@@ -250,63 +274,145 @@ var ProdBackgroundWorker = class {
  }
  // We need to notify all the task run processes that a task run has completed,
  // in case they are waiting for it through triggerAndWait
- async taskRunCompletedNotification(completion, execution) {
- this._taskRunProcess?.taskRunCompletedNotification(completion, execution);
+ async taskRunCompletedNotification(completion) {
+ this._taskRunProcess?.taskRunCompletedNotification(completion);
  }
  async waitCompletedNotification() {
  this._taskRunProcess?.waitCompletedNotification();
  }
- async #initializeTaskRunProcess(payload) {
+ async #getFreshTaskRunProcess(payload, messageId) {
  const metadata = this.getMetadata(
  payload.execution.worker.id,
  payload.execution.worker.version
  );
- if (!this._taskRunProcess) {
- const taskRunProcess = new TaskRunProcess(
- payload.execution,
- this.path,
- {
- ...this.params.env,
- ...payload.environment ?? {}
- },
- metadata,
- this.params
- );
- taskRunProcess.onExit.attach(() => {
+ console.log("Getting fresh task run process, setting closed to false", {
+ closed: this._closed
+ });
+ this._closed = false;
+ await this.#killCurrentTaskRunProcessBeforeAttempt();
+ const taskRunProcess = new TaskRunProcess(
+ payload.execution.run.id,
+ payload.execution.run.isTest,
+ this.path,
+ {
+ ...this.params.env,
+ ...payload.environment ?? {}
+ },
+ metadata,
+ this.params,
+ messageId
+ );
+ taskRunProcess.onExit.attach(({ pid }) => {
+ console.log("Task run process exited", { pid });
+ if (this._taskRunProcess?.pid === pid) {
  this._taskRunProcess = void 0;
- });
- taskRunProcess.onTaskHeartbeat.attach((id) => {
- this.onTaskHeartbeat.post(id);
- });
- taskRunProcess.onWaitForBatch.attach((message) => {
- this.onWaitForBatch.post(message);
- });
- taskRunProcess.onWaitForDuration.attach((message) => {
- this.onWaitForDuration.post(message);
- });
- taskRunProcess.onWaitForTask.attach((message) => {
- this.onWaitForTask.post(message);
- });
- taskRunProcess.onReadyForCheckpoint.attach((message) => {
- this.onReadyForCheckpoint.post(message);
- });
- taskRunProcess.onCancelCheckpoint.attach((message) => {
- this.onCancelCheckpoint.post(message);
- });
- this.preCheckpointNotification.attach((message) => {
- taskRunProcess.preCheckpointNotification.post(message);
- });
- await taskRunProcess.initialize();
- this._taskRunProcess = taskRunProcess;
- }
+ }
+ if (pid) {
+ this._taskRunProcessesBeingKilled.delete(pid);
+ }
+ });
+ taskRunProcess.onIsBeingKilled.attach((taskRunProcess2) => {
+ if (taskRunProcess2?.pid) {
+ this._taskRunProcessesBeingKilled.set(taskRunProcess2.pid, taskRunProcess2);
+ }
+ });
+ taskRunProcess.onTaskHeartbeat.attach((id) => {
+ this.onTaskHeartbeat.post(id);
+ });
+ taskRunProcess.onTaskRunHeartbeat.attach((id) => {
+ this.onTaskRunHeartbeat.post(id);
+ });
+ taskRunProcess.onWaitForBatch.attach((message) => {
+ this.onWaitForBatch.post(message);
+ });
+ taskRunProcess.onWaitForDuration.attach((message) => {
+ this.onWaitForDuration.post(message);
+ });
+ taskRunProcess.onWaitForTask.attach((message) => {
+ this.onWaitForTask.post(message);
+ });
+ taskRunProcess.onReadyForCheckpoint.attach((message) => {
+ this.onReadyForCheckpoint.post(message);
+ });
+ taskRunProcess.onCancelCheckpoint.attach((message) => {
+ this.onCancelCheckpoint.post(message);
+ });
+ this.preCheckpointNotification.attach((message) => {
+ taskRunProcess.preCheckpointNotification.post(message);
+ });
+ this.checkpointCanceledNotification.attach((message) => {
+ taskRunProcess.checkpointCanceledNotification.post(message);
+ });
+ await taskRunProcess.initialize();
+ this._taskRunProcess = taskRunProcess;
  return this._taskRunProcess;
  }
- // We need to fork the process before we can execute any tasks
- async executeTaskRun(payload) {
+ async forceKillOldTaskRunProcesses() {
+ for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) {
+ try {
+ await taskRunProcess.kill("SIGKILL");
+ } catch (error) {
+ console.error("Error while force killing old task run processes", error);
+ }
+ }
+ }
+ async #killCurrentTaskRunProcessBeforeAttempt() {
+ console.log("killCurrentTaskRunProcessBeforeAttempt()", {
+ hasTaskRunProcess: !!this._taskRunProcess
+ });
+ if (!this._taskRunProcess) {
+ return;
+ }
+ const currentTaskRunProcess = this._taskRunProcess;
+ console.log("Killing current task run process", {
+ isBeingKilled: currentTaskRunProcess?.isBeingKilled,
+ totalBeingKilled: this._taskRunProcessesBeingKilled.size
+ });
+ if (currentTaskRunProcess.isBeingKilled) {
+ if (this._taskRunProcessesBeingKilled.size > 1) {
+ await this.#tryGracefulExit(currentTaskRunProcess);
+ } else {
+ }
+ } else {
+ if (this._taskRunProcessesBeingKilled.size > 0) {
+ await this.#tryGracefulExit(currentTaskRunProcess);
+ } else {
+ currentTaskRunProcess.kill("SIGTERM", 5e3).catch(() => {
+ });
+ }
+ }
+ }
+ async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
  try {
- const taskRunProcess = await this.#initializeTaskRunProcess(payload);
+ const initialExit = taskRunProcess.onExit.waitFor(5e3);
+ if (kill) {
+ taskRunProcess.kill(initialSignal);
+ }
+ await initialExit;
+ } catch (error) {
+ console.error("TaskRunProcess graceful kill timeout exceeded", error);
+ this.#tryForcefulExit(taskRunProcess);
+ }
+ }
+ async #tryForcefulExit(taskRunProcess) {
+ try {
+ const forcedKill = taskRunProcess.onExit.waitFor(5e3);
+ taskRunProcess.kill("SIGKILL");
+ await forcedKill;
+ } catch (error) {
+ console.error("TaskRunProcess forced kill timeout exceeded", error);
+ throw new SigKillTimeoutProcessError();
+ }
+ }
+ // We need to fork the process before we can execute any tasks, use a fresh process for each execution
+ async executeTaskRun(payload, messageId) {
+ try {
+ const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId);
+ console.log("executing task run", {
+ attempt: payload.execution.attempt.id,
+ taskRunPid: taskRunProcess.pid
+ });
  const result = await taskRunProcess.executeTaskRun(payload);
- await taskRunProcess.cleanup(result.ok || result.retry === void 0);
  if (result.ok) {
  return result;
  }
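
Aside: the new #tryGracefulExit/#tryForcefulExit pair above implements a bounded SIGTERM-then-SIGKILL escalation. A minimal standalone sketch of that pattern in plain Node.js, hedged: the helper names waitForExit and killGracefully are illustrative only, not part of this package.

import { spawn } from "node:child_process";

// Wait for the child's "exit" event, rejecting after timeoutInMs.
function waitForExit(child, timeoutInMs) {
  return new Promise((resolve, reject) => {
    const timeout = setTimeout(() => reject(new Error("exit timeout")), timeoutInMs);
    child.once("exit", (code, signal) => {
      clearTimeout(timeout);
      resolve({ code, signal });
    });
  });
}

// SIGTERM first, then escalate to SIGKILL, with the same 5s budget
// (5e3) the diff uses at each step.
async function killGracefully(child) {
  child.kill("SIGTERM");
  try {
    return await waitForExit(child, 5000);
  } catch {
    child.kill("SIGKILL");
    // If even SIGKILL produces no exit event in time, surface the
    // failure, mirroring SigKillTimeoutProcessError above.
    return await waitForExit(child, 5000);
  }
}

const child = spawn("node", ["-e", "setInterval(() => {}, 1000)"]);
killGracefully(child).then((exit) => console.log("exited", exit));
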
@@ -353,6 +459,29 @@ var ProdBackgroundWorker = class {
  }
  };
  }
+ if (e instanceof SigKillTimeoutProcessError) {
+ return {
+ id: payload.execution.attempt.id,
+ ok: false,
+ retry: void 0,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT
+ }
+ };
+ }
+ if (e instanceof GracefulExitTimeoutError) {
+ return {
+ id: payload.execution.attempt.id,
+ ok: false,
+ retry: void 0,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT,
+ message: "Worker process killed while attempt in progress."
+ }
+ };
+ }
  return {
  id: payload.execution.attempt.id,
  ok: false,
@@ -362,10 +491,41 @@ var ProdBackgroundWorker = class {
  code: TaskRunErrorCodes.TASK_EXECUTION_FAILED
  }
  };
+ } finally {
+ await this.#killTaskRunProcess();
  }
  }
  async cancelAttempt(attemptId) {
- await this._taskRunProcess?.cancel();
+ if (!this._taskRunProcess) {
+ console.error("No task run process to cancel attempt", { attemptId });
+ return;
+ }
+ await this._taskRunProcess.cancel();
+ }
+ async executeTaskRunLazyAttempt(payload) {
+ this.onCreateTaskRunAttempt.post({ runId: payload.runId });
+ let execution;
+ try {
+ const attemptCreated = await this.attemptCreatedNotification.waitFor(3e4);
+ if (!attemptCreated.success) {
+ throw new Error(
+ `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
+ );
+ }
+ execution = attemptCreated.execution;
+ } catch (error) {
+ console.error("Error while creating attempt", error);
+ throw new Error(`Failed to create task run attempt: ${error}`);
+ }
+ const completion = await this.executeTaskRun(
+ {
+ execution,
+ traceContext: payload.traceContext,
+ environment: payload.environment
+ },
+ payload.messageId
+ );
+ return { execution, completion };
  }
  async #correctError(error, execution) {
  return {
@@ -375,26 +535,36 @@ var ProdBackgroundWorker = class {
  }
  };
  var TaskRunProcess = class {
- constructor(execution, path, env, metadata, worker) {
- this.execution = execution;
+ constructor(runId, isTest, path, env, metadata, worker, messageId) {
+ this.runId = runId;
+ this.isTest = isTest;
  this.path = path;
  this.env = env;
  this.metadata = metadata;
  this.worker = worker;
+ this.messageId = messageId;
  }
  _ipc;
  _child;
+ _childPid;
  _attemptPromises = /* @__PURE__ */ new Map();
  _attemptStatuses = /* @__PURE__ */ new Map();
  _currentExecution;
  _isBeingKilled = false;
  _isBeingCancelled = false;
+ _gracefulExitTimeoutElapsed = false;
+ /**
+ * @deprecated use onTaskRunHeartbeat instead
+ */
  onTaskHeartbeat = new Evt();
+ onTaskRunHeartbeat = new Evt();
  onExit = new Evt();
+ onIsBeingKilled = new Evt();
  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
  preCheckpointNotification = Evt.create();
+ checkpointCanceledNotification = Evt.create();
  onReadyForCheckpoint = Evt.create();
  onCancelCheckpoint = Evt.create();
  async initialize() {
@@ -409,7 +579,7 @@ var TaskRunProcess = class {
  "ipc"
  ],
  env: {
- ...this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {},
+ ...this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {},
  ...this.env,
  OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({
  [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir
@@ -417,6 +587,7 @@ var TaskRunProcess = class {
  ...this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}
  }
  });
+ this._childPid = this._child?.pid;
  this._ipc = new ZodIpcConnection({
  listenSchema: ProdChildToWorkerMessages,
  emitSchema: ProdWorkerToChildMessages,
@@ -437,28 +608,60 @@ var TaskRunProcess = class {
  resolver(result);
  },
  READY_TO_DISPOSE: async (message) => {
+ process.exit(0);
  },
  TASK_HEARTBEAT: async (message) => {
- this.onTaskHeartbeat.post(message.id);
+ if (this.messageId) {
+ this.onTaskRunHeartbeat.post(this.messageId);
+ } else {
+ this.onTaskHeartbeat.post(message.id);
+ }
  },
  TASKS_READY: async (message) => {
  },
+ WAIT_FOR_TASK: async (message) => {
+ this.onWaitForTask.post(message);
+ },
  WAIT_FOR_BATCH: async (message) => {
  this.onWaitForBatch.post(message);
  },
  WAIT_FOR_DURATION: async (message) => {
  this.onWaitForDuration.post(message);
- const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor();
- return { willCheckpointAndRestore };
- },
- WAIT_FOR_TASK: async (message) => {
- this.onWaitForTask.post(message);
+ try {
+ const { willCheckpointAndRestore } = await this.preCheckpointNotification.waitFor(
+ 3e4
+ );
+ return {
+ willCheckpointAndRestore
+ };
+ } catch (error) {
+ console.error("Error while waiting for pre-checkpoint notification", error);
+ return {
+ willCheckpointAndRestore: false
+ };
+ }
  },
  READY_FOR_CHECKPOINT: async (message) => {
  this.onReadyForCheckpoint.post(message);
  },
  CANCEL_CHECKPOINT: async (message) => {
+ const version = "v2";
  this.onCancelCheckpoint.post(message);
+ try {
+ const { checkpointCanceled } = await this.checkpointCanceledNotification.waitFor(
+ 3e4
+ );
+ return {
+ version,
+ checkpointCanceled
+ };
+ } catch (error) {
+ console.error("Error while waiting for checkpoint cancellation", error);
+ return {
+ version,
+ checkpointCanceled: true
+ };
+ }
  }
  }
  });
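
Aside: the reworked WAIT_FOR_DURATION and CANCEL_CHECKPOINT handlers above share one pattern: wait on an Evt channel with a 30s timeout (3e4) and fall back to a safe default if nothing is posted in time. A minimal sketch of that pattern, assuming only the evt package the diff already imports:

import { Evt } from "evt";

const checkpointCanceledNotification = Evt.create();

async function handleCancelCheckpoint() {
  try {
    // waitFor(ms) resolves with the next posted value, or rejects on timeout.
    const { checkpointCanceled } = await checkpointCanceledNotification.waitFor(30_000);
    return { version: "v2", checkpointCanceled };
  } catch {
    // No answer in time: assume the checkpoint was canceled, the
    // conservative default the handler above chooses.
    return { version: "v2", checkpointCanceled: true };
  }
}

// The other side of the channel resolves the pending wait:
handleCancelCheckpoint().then(console.log);
checkpointCanceledNotification.post({ checkpointCanceled: false });
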
@@ -470,15 +673,36 @@ var TaskRunProcess = class {
  this._isBeingCancelled = true;
  await this.cleanup(true);
  }
- async cleanup(kill = false) {
+ async cleanup(kill = false, gracefulExitTimeoutElapsed = false) {
+ console.log("cleanup()", { kill, gracefulExitTimeoutElapsed });
  if (kill && this._isBeingKilled) {
  return;
  }
- this._isBeingKilled = kill;
- await this._ipc?.sendWithAck("CLEANUP", {
- flush: true,
- kill
+ if (kill) {
+ this._isBeingKilled = true;
+ this.onIsBeingKilled.post(this);
+ }
+ const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution;
+ const killParentProcess = kill && !killChildProcess;
+ console.log("Cleaning up task run process", {
+ killChildProcess,
+ killParentProcess,
+ ipc: this._ipc,
+ childPid: this._childPid,
+ realChildPid: this._child?.pid
  });
+ await this._ipc?.sendWithAck(
+ "CLEANUP",
+ {
+ flush: true,
+ kill: killParentProcess
+ },
+ 3e4
+ );
+ if (killChildProcess) {
+ this._gracefulExitTimeoutElapsed = true;
+ await this.kill("SIGKILL");
+ }
  }
  async executeTaskRun(payload) {
  let resolver;
@@ -502,14 +726,14 @@ var TaskRunProcess = class {
  this._currentExecution = void 0;
  return result;
  }
- taskRunCompletedNotification(completion, execution) {
+ taskRunCompletedNotification(completion) {
  if (!completion.ok && typeof completion.retry !== "undefined") {
  return;
  }
  if (this._child?.connected && !this._isBeingKilled && !this._child.killed) {
  this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
- completion,
- execution
+ version: "v2",
+ completion
  });
  }
  }
@@ -518,9 +742,11 @@ var TaskRunProcess = class {
  this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
  }
  }
- async #handleExit(code) {
+ async #handleExit(code, signal) {
+ console.log("handling child exit", { code, signal });
  for (const [id, status] of this._attemptStatuses.entries()) {
  if (status === "PENDING") {
+ console.log("found pending attempt", { id });
  this._attemptStatuses.set(id, "REJECTED");
  const attemptPromise = this._attemptPromises.get(id);
  if (!attemptPromise) {
@@ -529,56 +755,136 @@ var TaskRunProcess = class {
  }
  const { rejecter } = attemptPromise;
  if (this._isBeingCancelled) {
  rejecter(new CancelledProcessError());
+ } else if (this._gracefulExitTimeoutElapsed) {
+ rejecter(new GracefulExitTimeoutError());
  } else if (this._isBeingKilled) {
  rejecter(new CleanupProcessError());
  } else {
- rejecter(new UnexpectedExitError(code));
+ rejecter(new UnexpectedExitError(code ?? -1));
  }
  }
- this.onExit.post(code);
+ this.onExit.post({ code, signal, pid: this.pid });
  }
  #handleLog(data) {
- if (!this._currentExecution) {
- return;
- }
- console.log(
- `[${this.metadata.version}][${this._currentExecution.run.id}.${this._currentExecution.attempt.number}] ${data.toString()}`
- );
+ console.log(data.toString());
  }
  #handleStdErr(data) {
- if (this._isBeingKilled) {
- return;
- }
- if (!this._currentExecution) {
- console.error(`[${this.metadata.version}] ${data.toString()}`);
- return;
- }
- console.error(
- `[${this.metadata.version}][${this._currentExecution.run.id}.${this._currentExecution.attempt.number}] ${data.toString()}`
- );
+ console.error(data.toString());
  }
- #kill() {
- if (this._child && !this._child.killed) {
- this._child?.kill();
+ async kill(signal, timeoutInMs) {
+ this._isBeingKilled = true;
+ const killTimeout = this.onExit.waitFor(timeoutInMs);
+ this.onIsBeingKilled.post(this);
+ this._child?.kill(signal);
+ if (timeoutInMs) {
+ await killTimeout;
  }
  }
+ get isBeingKilled() {
+ return this._isBeingKilled || this._child?.killed;
+ }
+ get pid() {
+ return this._childPid;
+ }
  };

  // src/workers/prod/entry-point.ts
  import { setTimeout as setTimeout2 } from "node:timers/promises";
- var HTTP_SERVER_PORT2 = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
+ var HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber());
  var COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
  var COORDINATOR_PORT = Number(process.env.COORDINATOR_PORT || 50080);
- var MACHINE_NAME2 = process.env.MACHINE_NAME || "local";
+ var MACHINE_NAME = process.env.MACHINE_NAME || "local";
  var POD_NAME = process.env.POD_NAME || "some-pod";
  var SHORT_HASH = process.env.TRIGGER_CONTENT_HASH.slice(0, 9);
- var logger2 = new SimpleLogger(`[${MACHINE_NAME2}][${SHORT_HASH}]`);
+ var logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
  var ProdWorker = class {
  constructor(port, host = "0.0.0.0") {
  this.host = host;
+ process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM"));
  this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST);
- this.#backgroundWorker = new ProdBackgroundWorker("worker.js", {
+ this.#backgroundWorker = this.#createBackgroundWorker();
+ this.#httpPort = port;
+ this.#httpServer = this.#createHttpServer();
+ }
+ apiUrl = process.env.TRIGGER_API_URL;
+ apiKey = process.env.TRIGGER_SECRET_KEY;
+ contentHash = process.env.TRIGGER_CONTENT_HASH;
+ projectRef = process.env.TRIGGER_PROJECT_REF;
+ envId = process.env.TRIGGER_ENV_ID;
+ runId = process.env.TRIGGER_RUN_ID || "index-only";
+ deploymentId = process.env.TRIGGER_DEPLOYMENT_ID;
+ deploymentVersion = process.env.TRIGGER_DEPLOYMENT_VERSION;
+ runningInKubernetes = !!process.env.KUBERNETES_PORT;
+ executing = false;
+ completed = /* @__PURE__ */ new Set();
+ paused = false;
+ attemptFriendlyId;
+ nextResumeAfter;
+ waitForPostStart = false;
+ #httpPort;
+ #backgroundWorker;
+ #httpServer;
+ #coordinatorSocket;
+ async #handleSignal(signal) {
+ logger.log("Received signal", { signal });
+ if (signal === "SIGTERM") {
+ let gracefulExitTimeoutElapsed = false;
+ if (this.executing) {
+ const terminationGracePeriodSeconds = 60 * 60;
+ logger.log("Waiting for attempt to complete before exiting", {
+ terminationGracePeriodSeconds
+ });
+ await setTimeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
+ gracefulExitTimeoutElapsed = true;
+ logger.log("Termination timeout reached, exiting gracefully.");
+ } else {
+ logger.log("Not executing, exiting immediately.");
+ }
+ await this.#exitGracefully(gracefulExitTimeoutElapsed);
+ return;
+ }
+ logger.log("Unhandled signal", { signal });
+ }
+ async #exitGracefully(gracefulExitTimeoutElapsed = false) {
+ await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
+ if (!gracefulExitTimeoutElapsed) {
+ process.exit(0);
+ }
+ }
+ async #reconnect(isPostStart = false, reconnectImmediately = false) {
+ if (isPostStart) {
+ this.waitForPostStart = false;
+ }
+ this.#coordinatorSocket.close();
+ if (!reconnectImmediately) {
+ await setTimeout2(1e3);
+ }
+ let coordinatorHost = COORDINATOR_HOST;
+ try {
+ if (this.runningInKubernetes) {
+ coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
+ "\n",
+ ""
+ );
+ logger.log("reconnecting", {
+ coordinatorHost: {
+ fromEnv: COORDINATOR_HOST,
+ fromVolume: coordinatorHost,
+ current: this.#coordinatorSocket.socket.io.opts.hostname
+ }
+ });
+ }
+ } catch (error) {
+ logger.error("taskinfo read error during reconnect", {
+ error: error instanceof Error ? error.message : error
+ });
+ } finally {
+ this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
+ }
+ }
+ #createBackgroundWorker() {
+ const backgroundWorker = new ProdBackgroundWorker("worker.js", {
  projectConfig: __PROJECT_CONFIG__,
  env: {
  ...gatherProcessEnv(),
@@ -588,26 +894,69 @@ var ProdWorker = class {
  },
  contentHash: this.contentHash
  });
- this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
+ backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
  this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId });
  });
- this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
+ backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
+ this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
+ });
+ backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
+ await this.#prepareForCheckpoint();
  this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
  });
- this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => {
- logger2.log("onCancelCheckpoint() clearing paused state, don't wait for post start hook", {
- paused: this.paused,
- nextResumeAfter: this.nextResumeAfter,
- waitForPostStart: this.waitForPostStart
+ backgroundWorker.onCancelCheckpoint.attach(async (message) => {
+ logger.log("onCancelCheckpoint", { message });
+ const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
+ "CANCEL_CHECKPOINT",
+ {
+ version: "v2",
+ reason: message.reason
+ }
+ );
+ logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
+ if (checkpointCanceled) {
+ if (message.reason === "WAIT_FOR_DURATION") {
+ this.paused = false;
+ this.nextResumeAfter = void 0;
+ this.waitForPostStart = false;
+ }
+ }
+ backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+ });
+ backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
+ logger.log("onCreateTaskRunAttempt()", { message });
+ const createAttempt = await this.#coordinatorSocket.socket.emitWithAck(
+ "CREATE_TASK_RUN_ATTEMPT",
+ {
+ version: "v1",
+ runId: message.runId
+ }
+ );
+ if (!createAttempt.success) {
+ backgroundWorker.attemptCreatedNotification.post({
+ success: false,
+ reason: createAttempt.reason
+ });
+ return;
+ }
+ backgroundWorker.attemptCreatedNotification.post({
+ success: true,
+ execution: createAttempt.executionPayload.execution
  });
- this.paused = false;
- this.nextResumeAfter = void 0;
- this.waitForPostStart = false;
- this.#coordinatorSocket.socket.emit("CANCEL_CHECKPOINT", { version: "v1" });
  });
- this.#backgroundWorker.onWaitForDuration.attach(async (message) => {
+ backgroundWorker.attemptCreatedNotification.attach((message) => {
+ if (!message.success) {
+ return;
+ }
+ this.attemptFriendlyId = message.execution.attempt.id;
+ });
+ backgroundWorker.onWaitForDuration.attach(async (message) => {
  if (!this.attemptFriendlyId) {
- logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError(
+ "NoAttemptId",
+ "Attempt ID not set before waiting for duration"
+ );
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
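
Aside: executeTaskRunLazyAttempt (earlier in this diff) and the onCreateTaskRunAttempt wiring above form a request/response handshake over two Evt channels, bounded at 30s. A reduced in-process sketch of that shape; createAttemptSomehow is a hypothetical stand-in for the coordinator's CREATE_TASK_RUN_ATTEMPT ack, not part of this package.

import { Evt } from "evt";

const onCreateTaskRunAttempt = Evt.create();
const attemptCreatedNotification = Evt.create();

// Requester: fire the request, then wait (bounded) for the reply.
// The waiter is registered before posting, so an in-process responder
// cannot answer before anyone is listening.
async function createAttempt(runId) {
  const reply = attemptCreatedNotification.waitFor(30_000); // 3e4 in the diff
  onCreateTaskRunAttempt.post({ runId });
  const attemptCreated = await reply;
  if (!attemptCreated.success) {
    throw new Error(`Failed to create attempt: ${attemptCreated.reason}`);
  }
  return attemptCreated.execution;
}

// Hypothetical stand-in for the coordinator round-trip.
async function createAttemptSomehow(runId) {
  return { success: true, execution: { attempt: { id: `attempt_for_${runId}` } } };
}

// Responder: answer every request on the other channel.
onCreateTaskRunAttempt.attach(async ({ runId }) => {
  attemptCreatedNotification.post(await createAttemptSomehow(runId));
});

createAttempt("run_123").then((execution) => console.log(execution.attempt.id));
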
@@ -619,9 +968,10 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
  });
- this.#backgroundWorker.onWaitForTask.attach(async (message) => {
+ backgroundWorker.onWaitForTask.attach(async (message) => {
  if (!this.attemptFriendlyId) {
- logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task");
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -633,9 +983,10 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
  });
- this.#backgroundWorker.onWaitForBatch.attach(async (message) => {
+ backgroundWorker.onWaitForBatch.attach(async (message) => {
  if (!this.attemptFriendlyId) {
- logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -647,84 +998,50 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
  });
- this.#httpPort = port;
- this.#httpServer = this.#createHttpServer();
+ return backgroundWorker;
  }
- apiUrl = process.env.TRIGGER_API_URL;
- apiKey = process.env.TRIGGER_SECRET_KEY;
- contentHash = process.env.TRIGGER_CONTENT_HASH;
- projectRef = process.env.TRIGGER_PROJECT_REF;
- envId = process.env.TRIGGER_ENV_ID;
- runId = process.env.TRIGGER_RUN_ID || "index-only";
- deploymentId = process.env.TRIGGER_DEPLOYMENT_ID;
- deploymentVersion = process.env.TRIGGER_DEPLOYMENT_VERSION;
- runningInKubernetes = !!process.env.KUBERNETES_PORT;
- executing = false;
- completed = /* @__PURE__ */ new Set();
- paused = false;
- attemptFriendlyId;
- nextResumeAfter;
- waitForPostStart = false;
- #httpPort;
- #backgroundWorker;
- #httpServer;
- #coordinatorSocket;
- async #reconnect(isPostStart = false) {
- if (isPostStart) {
- this.waitForPostStart = false;
- }
- this.#coordinatorSocket.close();
- if (!this.runningInKubernetes) {
- this.#coordinatorSocket.connect();
- return;
- }
- try {
- const coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
- "\n",
- ""
- );
- logger2.log("reconnecting", {
- coordinatorHost: {
- fromEnv: COORDINATOR_HOST,
- fromVolume: coordinatorHost,
- current: this.#coordinatorSocket.socket.io.opts.hostname
- }
- });
- this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
- } catch (error) {
- logger2.error("taskinfo read error during reconnect", { error });
- this.#coordinatorSocket.connect();
- }
- }
- #prepareForWait(reason, willCheckpointAndRestore) {
- logger2.log(`prepare for ${reason}`, { willCheckpointAndRestore });
+ async #prepareForWait(reason, willCheckpointAndRestore) {
+ logger.log(`prepare for ${reason}`, { willCheckpointAndRestore });
  this.#backgroundWorker.preCheckpointNotification.post({ willCheckpointAndRestore });
  if (willCheckpointAndRestore) {
  this.paused = true;
  this.nextResumeAfter = reason;
  this.waitForPostStart = true;
+ if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
+ await this.#prepareForCheckpoint();
+ }
  }
  }
  async #prepareForRetry(willCheckpointAndRestore, shouldExit) {
- logger2.log("prepare for retry", { willCheckpointAndRestore, shouldExit });
+ logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit });
  if (shouldExit) {
  if (willCheckpointAndRestore) {
- logger2.log("WARNING: Will checkpoint but also requested exit. This won't end well.");
+ logger.log("WARNING: Will checkpoint but also requested exit. This won't end well.");
  }
- await this.#backgroundWorker.close();
- process.exit(0);
+ await this.#exitGracefully();
+ return;
  }
+ this.paused = false;
+ this.waitForPostStart = false;
  this.executing = false;
  this.attemptFriendlyId = void 0;
  if (willCheckpointAndRestore) {
  this.waitForPostStart = true;
+ this.#prepareForCheckpoint(false);
  this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
  return;
  }
  }
+ async #prepareForCheckpoint(flush = true) {
+ if (flush) {
+ await this.#backgroundWorker.flushTelemetry();
+ }
+ await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+ }
  #resumeAfterDuration() {
  this.paused = false;
  this.nextResumeAfter = void 0;
+ this.waitForPostStart = false;
  this.#backgroundWorker.waitCompletedNotification();
  }
  #returnValidatedExtraHeaders(headers) {
@@ -735,9 +1052,10 @@ var ProdWorker = class {
  }
  return headers;
  }
+ // FIXME: If the the worker can't connect for a while, this runs MANY times - it should only run once
  #createCoordinatorSocket(host) {
  const extraHeaders = this.#returnValidatedExtraHeaders({
- "x-machine-name": MACHINE_NAME2,
+ "x-machine-name": MACHINE_NAME,
  "x-pod-name": POD_NAME,
  "x-trigger-content-hash": this.contentHash,
  "x-trigger-project-ref": this.projectRef,
@@ -749,12 +1067,9 @@ var ProdWorker = class {
  if (this.attemptFriendlyId) {
  extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId;
  }
- logger2.log("connecting to coordinator", {
- host,
- port: COORDINATOR_PORT,
- extraHeaders
- });
- const coordinatorConnection = new ZodSocketConnection2({
+ logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`);
+ logger.debug(`connecting with extra headers`, { extraHeaders });
+ const coordinatorConnection = new ZodSocketConnection({
  namespace: "prod-worker",
  host,
  port: COORDINATOR_PORT,
@@ -762,60 +1077,49 @@ var ProdWorker = class {
  serverMessages: CoordinatorToProdWorkerMessages,
  extraHeaders,
  handlers: {
- RESUME_AFTER_DEPENDENCY: async (message) => {
+ RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
  if (!this.paused) {
- logger2.error("worker not paused", {
- completions: message.completions,
- executions: message.executions
- });
- return;
- }
- if (message.completions.length !== message.executions.length) {
- logger2.error("did not receive the same number of completions and executions", {
- completions: message.completions,
- executions: message.executions
- });
+ logger.error("Failed to resume after dependency: Worker not paused");
  return;
  }
- if (message.completions.length === 0 || message.executions.length === 0) {
- logger2.error("no completions or executions", {
- completions: message.completions,
- executions: message.executions
- });
+ if (completions.length === 0) {
+ logger.error("Failed to resume after dependency: No completions");
  return;
  }
  if (this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH") {
- logger2.error("not waiting to resume after dependency", {
+ logger.error("Failed to resume after dependency: Invalid next resume", {
  nextResumeAfter: this.nextResumeAfter
  });
  return;
  }
- if (this.nextResumeAfter === "WAIT_FOR_TASK" && message.completions.length > 1) {
- logger2.error("waiting for single task but got multiple completions", {
- completions: message.completions,
- executions: message.executions
- });
+ if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) {
+ logger.error(
+ "Failed to resume after dependency: Waiting for single task but got multiple completions",
+ {
+ completions
+ }
+ );
  return;
  }
  this.paused = false;
  this.nextResumeAfter = void 0;
- for (let i = 0; i < message.completions.length; i++) {
- const completion = message.completions[i];
- const execution = message.executions[i];
- if (!completion || !execution)
+ this.waitForPostStart = false;
+ for (let i = 0; i < completions.length; i++) {
+ const completion = completions[i];
+ if (!completion)
  continue;
- this.#backgroundWorker.taskRunCompletedNotification(completion, execution);
+ this.#backgroundWorker.taskRunCompletedNotification(completion);
  }
  },
  RESUME_AFTER_DURATION: async (message) => {
  if (!this.paused) {
- logger2.error("worker not paused", {
+ logger.error("worker not paused", {
  attemptId: message.attemptId
  });
  return;
  }
  if (this.nextResumeAfter !== "WAIT_FOR_DURATION") {
- logger2.error("not waiting to resume after duration", {
+ logger.error("not waiting to resume after duration", {
  nextResumeAfter: this.nextResumeAfter
  });
  return;
@@ -824,34 +1128,79 @@ var ProdWorker = class {
  },
  EXECUTE_TASK_RUN: async ({ executionPayload }) => {
  if (this.executing) {
- logger2.error("dropping execute request, already executing");
+ logger.error("dropping execute request, already executing");
  return;
  }
  if (this.completed.has(executionPayload.execution.attempt.id)) {
- logger2.error("dropping execute request, already completed");
+ logger.error("dropping execute request, already completed");
  return;
  }
  this.executing = true;
  this.attemptFriendlyId = executionPayload.execution.attempt.id;
  const completion = await this.#backgroundWorker.executeTaskRun(executionPayload);
- logger2.log("completed", completion);
+ logger.log("completed", completion);
  this.completed.add(executionPayload.execution.attempt.id);
- await this.#backgroundWorker.flushTelemetry();
  const { willCheckpointAndRestore, shouldExit } = await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
  version: "v1",
  execution: executionPayload.execution,
  completion
  });
- logger2.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+ logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
  this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
  },
+ EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+ if (this.executing) {
+ logger.error("dropping execute request, already executing");
+ return;
+ }
+ this.executing = true;
+ try {
+ const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
+ logger.log("completed", completion);
+ this.completed.add(execution.attempt.id);
+ const { willCheckpointAndRestore, shouldExit } = await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
+ version: "v1",
+ execution,
+ completion
+ });
+ logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+ this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+ } catch (error) {
+ const completion = {
+ ok: false,
+ id: message.lazyPayload.runId,
+ retry: void 0,
+ error: error instanceof Error ? {
+ type: "BUILT_IN_ERROR",
+ name: error.name,
+ message: error.message,
+ stackTrace: error.stack ?? ""
+ } : {
+ type: "BUILT_IN_ERROR",
+ name: "UnknownError",
+ message: String(error),
+ stackTrace: ""
+ }
+ };
+ this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
+ version: "v1",
+ completion
+ });
+ }
+ },
  REQUEST_ATTEMPT_CANCELLATION: async (message) => {
  if (!this.executing) {
+ logger.log("dropping cancel request, not executing", { status: this.#status });
  return;
  }
+ logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status });
  await this.#backgroundWorker.cancelAttempt(message.attemptId);
  },
- REQUEST_EXIT: async () => {
+ REQUEST_EXIT: async (message) => {
+ if (message.version === "v2" && message.delayInMs) {
+ logger.log("exit requested with delay", { delayInMs: message.delayInMs });
+ await setTimeout2(message.delayInMs);
+ }
  this.#coordinatorSocket.close();
  process.exit(0);
  },
@@ -859,122 +1208,140 @@ var ProdWorker = class {
  if (this.completed.size < 1) {
  return;
  }
- this.#coordinatorSocket.socket.emit("READY_FOR_EXECUTION", {
+ this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
  version: "v1",
  runId: this.runId,
  totalCompletions: this.completed.size
  });
  }
  },
- onConnection: async (socket, handler, sender, logger3) => {
+ onConnection: async (socket, handler, sender, logger2) => {
+ logger2.log("connected to coordinator", { status: this.#status });
  if (this.waitForPostStart) {
- logger3.log("skip connection handler, waiting for post start hook");
+ logger2.log("skip connection handler, waiting for post start hook");
+ return;
+ }
+ if (this.paused) {
+ if (!this.nextResumeAfter) {
+ logger2.error("Missing next resume reason", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoNextResume",
+ "Next resume reason not set while resuming from paused state"
+ );
+ return;
+ }
+ if (!this.attemptFriendlyId) {
+ logger2.error("Missing friendly ID", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoAttemptId",
+ "Attempt ID not set while resuming from paused state"
+ );
+ return;
+ }
+ socket.emit("READY_FOR_RESUME", {
+ version: "v1",
+ attemptFriendlyId: this.attemptFriendlyId,
+ type: this.nextResumeAfter
+ });
  return;
  }
  if (process.env.INDEX_TASKS === "true") {
  try {
  const taskResources = await this.#initializeWorker();
  const { success } = await socket.emitWithAck("INDEX_TASKS", {
- version: "v1",
+ version: "v2",
  deploymentId: this.deploymentId,
- ...taskResources
+ ...taskResources,
+ supportsLazyAttempts: true
  });
  if (success) {
- logger3.info("indexing done, shutting down..");
+ logger2.info("indexing done, shutting down..");
  process.exit(0);
  } else {
- logger3.info("indexing failure, shutting down..");
+ logger2.info("indexing failure, shutting down..");
  process.exit(1);
  }
  } catch (e) {
+ const stderr = this.#backgroundWorker.stderr.join("\n");
  if (e instanceof TaskMetadataParseError) {
- logger3.error("tasks metadata parse error", { message: e.zodIssues, tasks: e.tasks });
+ logger2.error("tasks metadata parse error", {
+ zodIssues: e.zodIssues,
+ tasks: e.tasks
+ });
  socket.emit("INDEXING_FAILED", {
  version: "v1",
  deploymentId: this.deploymentId,
  error: {
  name: "TaskMetadataParseError",
  message: "There was an error parsing the task metadata",
- stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks })
+ stack: JSON.stringify({ zodIssues: e.zodIssues, tasks: e.tasks }),
+ stderr
  }
  });
  } else if (e instanceof UncaughtExceptionError) {
- logger3.error("uncaught exception", { message: e.originalError.message });
+ const error = {
+ name: e.originalError.name,
+ message: e.originalError.message,
+ stack: e.originalError.stack,
+ stderr
+ };
+ logger2.error("uncaught exception", { originalError: error });
  socket.emit("INDEXING_FAILED", {
  version: "v1",
  deploymentId: this.deploymentId,
- error: {
- name: e.originalError.name,
- message: e.originalError.message,
- stack: e.originalError.stack
- }
+ error
  });
  } else if (e instanceof Error) {
- logger3.error("error", { message: e.message });
+ const error = {
+ name: e.name,
+ message: e.message,
+ stack: e.stack,
+ stderr
+ };
+ logger2.error("error", { error });
  socket.emit("INDEXING_FAILED", {
  version: "v1",
  deploymentId: this.deploymentId,
- error: {
- name: e.name,
- message: e.message,
- stack: e.stack
- }
+ error
  });
  } else if (typeof e === "string") {
- logger3.error("string error", { message: e });
+ logger2.error("string error", { error: { message: e } });
  socket.emit("INDEXING_FAILED", {
  version: "v1",
  deploymentId: this.deploymentId,
  error: {
  name: "Error",
- message: e
+ message: e,
+ stderr
  }
  });
  } else {
- logger3.error("unknown error", { error: e });
+ logger2.error("unknown error", { error: e });
  socket.emit("INDEXING_FAILED", {
  version: "v1",
  deploymentId: this.deploymentId,
  error: {
  name: "Error",
- message: "Unknown error"
+ message: "Unknown error",
+ stderr
  }
  });
  }
  await setTimeout2(200);
- process.exit(1);
+ process.exit(111);
  }
  }
- if (this.paused) {
- if (!this.nextResumeAfter) {
- return;
- }
- if (!this.attemptFriendlyId) {
- logger3.error("Missing friendly ID");
- return;
- }
- if (this.nextResumeAfter === "WAIT_FOR_DURATION") {
- this.#resumeAfterDuration();
- return;
- }
- socket.emit("READY_FOR_RESUME", {
- version: "v1",
- attemptFriendlyId: this.attemptFriendlyId,
- type: this.nextResumeAfter
- });
- return;
- }
  if (this.executing) {
  return;
  }
- socket.emit("READY_FOR_EXECUTION", {
+ socket.emit("READY_FOR_LAZY_ATTEMPT", {
  version: "v1",
  runId: this.runId,
  totalCompletions: this.completed.size
  });
  },
- onError: async (socket, err, logger3) => {
- logger3.error("onError", {
+ onError: async (socket, err, logger2) => {
+ logger2.error("onError", {
  error: {
  name: err.name,
  message: err.message
@@ -982,14 +1349,14 @@ var ProdWorker = class {
  });
  await this.#reconnect();
  },
- onDisconnect: async (socket, reason, description, logger3) => {
+ onDisconnect: async (socket, reason, description, logger2) => {
  }
  });
  return coordinatorConnection;
  }
  #createHttpServer() {
  const httpServer = createServer(async (req, res) => {
- logger2.log(`[${req.method}]`, req.url);
+ logger.log(`[${req.method}]`, req.url);
  const reply = new HttpReply(res);
  try {
  const url = new URL(req.url ?? "", `http://${req.headers.host}`);
@@ -998,11 +1365,7 @@ var ProdWorker = class {
  return reply.text("ok");
  }
  case "/status": {
- return reply.json({
- executing: this.executing,
- pause: this.paused,
- nextResumeAfter: this.nextResumeAfter
- });
+ return reply.json(this.#status);
  }
  case "/connect": {
  this.#coordinatorSocket.connect();
@@ -1026,7 +1389,7 @@ var ProdWorker = class {
  case "/preStop": {
  const cause = PreStopCauses.safeParse(url.searchParams.get("cause"));
  if (!cause.success) {
- logger2.error("Failed to parse cause", { cause });
+ logger.error("Failed to parse cause", { cause });
  return reply.text("Failed to parse cause", 400);
  }
  switch (cause.data) {
@@ -1034,17 +1397,16 @@ var ProdWorker = class {
  break;
  }
  default: {
- logger2.error("Unhandled cause", { cause: cause.data });
+ logger.error("Unhandled cause", { cause: cause.data });
  break;
  }
  }
- logger2.log("preStop", { url: req.url });
  return reply.text("preStop ok");
  }
  case "/postStart": {
  const cause = PostStartCauses.safeParse(url.searchParams.get("cause"));
  if (!cause.success) {
- logger2.error("Failed to parse cause", { cause });
+ logger.error("Failed to parse cause", { cause });
  return reply.text("Failed to parse cause", 400);
  }
  switch (cause.data) {
@@ -1055,11 +1417,11 @@ var ProdWorker = class {
  break;
  }
  case "restore": {
- await this.#reconnect(true);
+ await this.#reconnect(true, true);
  break;
  }
  default: {
- logger2.error("Unhandled cause", { cause: cause.data });
+ logger.error("Unhandled cause", { cause: cause.data });
  break;
  }
  }
@@ -1070,7 +1432,7 @@ var ProdWorker = class {
  }
  }
  } catch (error) {
- logger2.error("HTTP server error", { error });
+ logger.error("HTTP server error", { error });
  reply.empty(500);
  }
  });
@@ -1078,13 +1440,13 @@ var ProdWorker = class {
  socket.end("HTTP/1.1 400 Bad Request\r\n\r\n");
  });
  httpServer.on("listening", () => {
- logger2.log("http server listening on port", this.#httpPort);
+ logger.log("http server listening on port", this.#httpPort);
  });
  httpServer.on("error", async (error) => {
  if (error.code != "EADDRINUSE") {
  return;
  }
- logger2.error(`port ${this.#httpPort} already in use, retrying with random port..`);
+ logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
  this.#httpPort = getRandomPortNumber();
  await setTimeout2(100);
  this.start();
@@ -1124,11 +1486,30 @@ var ProdWorker = class {
  const data = await response.json();
  return data?.variables ?? {};
  }
+ get #status() {
+ return {
+ executing: this.executing,
+ paused: this.paused,
+ completed: this.completed.size,
+ nextResumeAfter: this.nextResumeAfter,
+ waitForPostStart: this.waitForPostStart,
+ attemptFriendlyId: this.attemptFriendlyId
+ };
+ }
+ #emitUnrecoverableError(name, message) {
+ this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", {
+ version: "v1",
+ error: {
+ name,
+ message
+ }
+ });
+ }
  start() {
  this.#httpServer.listen(this.#httpPort, this.host);
  }
  };
- var prodWorker = new ProdWorker(HTTP_SERVER_PORT2);
+ var prodWorker = new ProdWorker(HTTP_SERVER_PORT);
  prodWorker.start();
  function gatherProcessEnv() {
  const env = {