trigger.dev 3.0.0-beta.34 → 3.0.0-beta.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -116,8 +116,6 @@ var TaskMetadataParseError = class extends Error {
  this.name = "TaskMetadataParseError";
  }
  };
-
- // src/workers/prod/backgroundWorker.ts
  var UnexpectedExitError = class extends Error {
  constructor(code) {
  super(`Unexpected exit with code ${code}`);
@@ -137,13 +135,31 @@ var CancelledProcessError = class extends Error {
  this.name = "CancelledProcessError";
  }
  };
+ var SigKillTimeoutProcessError = class extends Error {
+ constructor() {
+ super("Process kill timeout");
+ this.name = "SigKillTimeoutProcessError";
+ }
+ };
+ var GracefulExitTimeoutError = class extends Error {
+ constructor() {
+ super("Graceful exit timeout");
+ this.name = "GracefulExitTimeoutError";
+ }
+ };
+
+ // src/workers/prod/backgroundWorker.ts
  var ProdBackgroundWorker = class {
  constructor(path, params) {
  this.path = path;
  this.params = params;
  }
  _initialized = false;
+ /**
+ * @deprecated use onTaskRunHeartbeat instead
+ */
  onTaskHeartbeat = new Evt();
+ onTaskRunHeartbeat = new Evt();
  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
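These events are instances of Evt from the evt library: handlers register with attach(), values are fired with post(), and a pending value can be awaited with waitFor(timeoutMs), which rejects if nothing is posted in time. The new onTaskRunHeartbeat sits alongside the now-deprecated onTaskHeartbeat so older coordinators keep working. A minimal sketch of the pair; the identifiers come from the diff, the payload types are assumptions:

    import { Evt } from "evt";

    /** @deprecated use onTaskRunHeartbeat instead -- carries attempt friendly IDs */
    const onTaskHeartbeat = new Evt<string>();
    /** carries run IDs (the coordinator message IDs) */
    const onTaskRunHeartbeat = new Evt<string>();

    onTaskRunHeartbeat.attach((runId) => {
      console.log("heartbeat for run", runId);
    });

    onTaskRunHeartbeat.post("run_1234"); // logs: heartbeat for run run_1234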
@@ -151,17 +167,40 @@ var ProdBackgroundWorker = class {
  checkpointCanceledNotification = Evt.create();
  onReadyForCheckpoint = Evt.create();
  onCancelCheckpoint = Evt.create();
+ onCreateTaskRunAttempt = Evt.create();
+ attemptCreatedNotification = Evt.create();
  _onClose = new Evt();
  tasks = [];
  _taskRunProcess;
+ _taskRunProcessesBeingKilled = /* @__PURE__ */ new Map();
  _closed = false;
- async close() {
+ async close(gracefulExitTimeoutElapsed = false) {
+ console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed });
  if (this._closed) {
  return;
  }
  this._closed = true;
  this.onTaskHeartbeat.detach();
- await this._taskRunProcess?.cleanup(true);
+ this.onTaskRunHeartbeat.detach();
+ await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed);
+ }
+ async #killTaskRunProcess(flush = true, initialSignal = "SIGTERM") {
+ console.log("Killing task run process", { flush, initialSignal, closed: this._closed });
+ if (this._closed || !this._taskRunProcess) {
+ return;
+ }
+ if (flush) {
+ await this.flushTelemetry();
+ }
+ const currentTaskRunProcess = this._taskRunProcess;
+ this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => {
+ console.error("Error while trying graceful exit", error);
+ });
+ console.log("Killed task run process, setting closed to true", {
+ closed: this._closed,
+ pid: currentTaskRunProcess.pid
+ });
+ this._closed = true;
  }
  async flushTelemetry() {
  await this._taskRunProcess?.cleanup(false);
@@ -251,64 +290,144 @@ var ProdBackgroundWorker = class {
  }
  // We need to notify all the task run processes that a task run has completed,
  // in case they are waiting for it through triggerAndWait
- async taskRunCompletedNotification(completion, execution) {
- this._taskRunProcess?.taskRunCompletedNotification(completion, execution);
+ async taskRunCompletedNotification(completion) {
+ this._taskRunProcess?.taskRunCompletedNotification(completion);
  }
  async waitCompletedNotification() {
  this._taskRunProcess?.waitCompletedNotification();
  }
- async #initializeTaskRunProcess(payload) {
+ async #getFreshTaskRunProcess(payload, messageId) {
  const metadata = this.getMetadata(
  payload.execution.worker.id,
  payload.execution.worker.version
  );
- if (!this._taskRunProcess) {
- const taskRunProcess = new TaskRunProcess(
- payload.execution,
- this.path,
- {
- ...this.params.env,
- ...payload.environment ?? {}
- },
- metadata,
- this.params
- );
- taskRunProcess.onExit.attach(() => {
+ console.log("Getting fresh task run process, setting closed to false", {
+ closed: this._closed
+ });
+ this._closed = false;
+ await this.#killCurrentTaskRunProcessBeforeAttempt();
+ const taskRunProcess = new TaskRunProcess(
+ payload.execution.run.id,
+ payload.execution.run.isTest,
+ this.path,
+ {
+ ...this.params.env,
+ ...payload.environment ?? {}
+ },
+ metadata,
+ this.params,
+ messageId
+ );
+ taskRunProcess.onExit.attach(({ pid }) => {
+ console.log("Task run process exited", { pid });
+ if (this._taskRunProcess?.pid === pid) {
  this._taskRunProcess = void 0;
- });
- taskRunProcess.onTaskHeartbeat.attach((id) => {
- this.onTaskHeartbeat.post(id);
- });
- taskRunProcess.onWaitForBatch.attach((message) => {
- this.onWaitForBatch.post(message);
- });
- taskRunProcess.onWaitForDuration.attach((message) => {
- this.onWaitForDuration.post(message);
- });
- taskRunProcess.onWaitForTask.attach((message) => {
- this.onWaitForTask.post(message);
- });
- taskRunProcess.onReadyForCheckpoint.attach((message) => {
- this.onReadyForCheckpoint.post(message);
- });
- taskRunProcess.onCancelCheckpoint.attach((message) => {
- this.onCancelCheckpoint.post(message);
- });
- this.preCheckpointNotification.attach((message) => {
- taskRunProcess.preCheckpointNotification.post(message);
- });
- this.checkpointCanceledNotification.attach((message) => {
- taskRunProcess.checkpointCanceledNotification.post(message);
- });
- await taskRunProcess.initialize();
- this._taskRunProcess = taskRunProcess;
- }
+ }
+ if (pid) {
+ this._taskRunProcessesBeingKilled.delete(pid);
+ }
+ });
+ taskRunProcess.onIsBeingKilled.attach((taskRunProcess2) => {
+ if (taskRunProcess2?.pid) {
+ this._taskRunProcessesBeingKilled.set(taskRunProcess2.pid, taskRunProcess2);
+ }
+ });
+ taskRunProcess.onTaskHeartbeat.attach((id) => {
+ this.onTaskHeartbeat.post(id);
+ });
+ taskRunProcess.onTaskRunHeartbeat.attach((id) => {
+ this.onTaskRunHeartbeat.post(id);
+ });
+ taskRunProcess.onWaitForBatch.attach((message) => {
+ this.onWaitForBatch.post(message);
+ });
+ taskRunProcess.onWaitForDuration.attach((message) => {
+ this.onWaitForDuration.post(message);
+ });
+ taskRunProcess.onWaitForTask.attach((message) => {
+ this.onWaitForTask.post(message);
+ });
+ taskRunProcess.onReadyForCheckpoint.attach((message) => {
+ this.onReadyForCheckpoint.post(message);
+ });
+ taskRunProcess.onCancelCheckpoint.attach((message) => {
+ this.onCancelCheckpoint.post(message);
+ });
+ this.preCheckpointNotification.attach((message) => {
+ taskRunProcess.preCheckpointNotification.post(message);
+ });
+ this.checkpointCanceledNotification.attach((message) => {
+ taskRunProcess.checkpointCanceledNotification.post(message);
+ });
+ await taskRunProcess.initialize();
+ this._taskRunProcess = taskRunProcess;
  return this._taskRunProcess;
  }
- // We need to fork the process before we can execute any tasks
- async executeTaskRun(payload) {
+ async forceKillOldTaskRunProcesses() {
+ for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) {
+ try {
+ await taskRunProcess.kill("SIGKILL");
+ } catch (error) {
+ console.error("Error while force killing old task run processes", error);
+ }
+ }
+ }
+ async #killCurrentTaskRunProcessBeforeAttempt() {
+ console.log("killCurrentTaskRunProcessBeforeAttempt()", {
+ hasTaskRunProcess: !!this._taskRunProcess
+ });
+ if (!this._taskRunProcess) {
+ return;
+ }
+ const currentTaskRunProcess = this._taskRunProcess;
+ console.log("Killing current task run process", {
+ isBeingKilled: currentTaskRunProcess?.isBeingKilled,
+ totalBeingKilled: this._taskRunProcessesBeingKilled.size
+ });
+ if (currentTaskRunProcess.isBeingKilled) {
+ if (this._taskRunProcessesBeingKilled.size > 1) {
+ await this.#tryGracefulExit(currentTaskRunProcess);
+ } else {
+ }
+ } else {
+ if (this._taskRunProcessesBeingKilled.size > 0) {
+ await this.#tryGracefulExit(currentTaskRunProcess);
+ } else {
+ currentTaskRunProcess.kill("SIGTERM", 5e3).catch(() => {
+ });
+ }
+ }
+ }
+ async #tryGracefulExit(taskRunProcess, kill = false, initialSignal = "SIGTERM") {
  try {
- const taskRunProcess = await this.#initializeTaskRunProcess(payload);
+ const initialExit = taskRunProcess.onExit.waitFor(5e3);
+ if (kill) {
+ taskRunProcess.kill(initialSignal);
+ }
+ await initialExit;
+ } catch (error) {
+ console.error("TaskRunProcess graceful kill timeout exceeded", error);
+ this.#tryForcefulExit(taskRunProcess);
+ }
+ }
+ async #tryForcefulExit(taskRunProcess) {
+ try {
+ const forcedKill = taskRunProcess.onExit.waitFor(5e3);
+ taskRunProcess.kill("SIGKILL");
+ await forcedKill;
+ } catch (error) {
+ console.error("TaskRunProcess forced kill timeout exceeded", error);
+ throw new SigKillTimeoutProcessError();
+ }
+ }
+ // We need to fork the process before we can execute any tasks, use a fresh process for each execution
+ async executeTaskRun(payload, messageId) {
+ try {
+ const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId);
+ console.log("executing task run", {
+ attempt: payload.execution.attempt.id,
+ taskRunPid: taskRunProcess.pid
+ });
  const result = await taskRunProcess.executeTaskRun(payload);
  if (result.ok) {
  return result;
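The #tryGracefulExit/#tryForcefulExit pair added above implements the usual SIGTERM-then-SIGKILL escalation, with one detail worth copying: the onExit.waitFor(5e3) subscription is created before the signal is sent, so a fast exit cannot slip between the kill and the wait. A standalone sketch of the same shape, assuming evt's waitFor(timeoutMs) rejects on timeout; the ChildProcess wiring here is illustrative:

    import { Evt } from "evt";
    import type { ChildProcess } from "node:child_process";

    async function killWithEscalation(child: ChildProcess, onExit: Evt<number | null>) {
      try {
        const exited = onExit.waitFor(5_000); // subscribe first, then signal
        child.kill("SIGTERM");
        await exited; // resolves when the exit handler posts the event
      } catch {
        // Graceful window elapsed: escalate to SIGKILL.
        try {
          const forced = onExit.waitFor(5_000);
          child.kill("SIGKILL");
          await forced;
        } catch {
          throw new Error("Process kill timeout"); // mirrors SigKillTimeoutProcessError
        }
      }
    }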
@@ -356,6 +475,29 @@ var ProdBackgroundWorker = class {
  }
  };
  }
+ if (e instanceof SigKillTimeoutProcessError) {
+ return {
+ id: payload.execution.attempt.id,
+ ok: false,
+ retry: void 0,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT
+ }
+ };
+ }
+ if (e instanceof GracefulExitTimeoutError) {
+ return {
+ id: payload.execution.attempt.id,
+ ok: false,
+ retry: void 0,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT,
+ message: "Worker process killed while attempt in progress."
+ }
+ };
+ }
  return {
  id: payload.execution.attempt.id,
  ok: false,
@@ -365,10 +507,41 @@ var ProdBackgroundWorker = class {
  code: TaskRunErrorCodes.TASK_EXECUTION_FAILED
  }
  };
+ } finally {
+ await this.#killTaskRunProcess();
  }
  }
  async cancelAttempt(attemptId) {
- await this._taskRunProcess?.cancel();
+ if (!this._taskRunProcess) {
+ console.error("No task run process to cancel attempt", { attemptId });
+ return;
+ }
+ await this._taskRunProcess.cancel();
+ }
+ async executeTaskRunLazyAttempt(payload) {
+ this.onCreateTaskRunAttempt.post({ runId: payload.runId });
+ let execution;
+ try {
+ const attemptCreated = await this.attemptCreatedNotification.waitFor(3e4);
+ if (!attemptCreated.success) {
+ throw new Error(
+ `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
+ );
+ }
+ execution = attemptCreated.execution;
+ } catch (error) {
+ console.error("Error while creating attempt", error);
+ throw new Error(`Failed to create task run attempt: ${error}`);
+ }
+ const completion = await this.executeTaskRun(
+ {
+ execution,
+ traceContext: payload.traceContext,
+ environment: payload.environment
+ },
+ payload.messageId
+ );
+ return { execution, completion };
  }
  async #correctError(error, execution) {
  return {
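The executeTaskRunLazyAttempt added above turns two Evt instances into a request/response pair: it posts the run ID on onCreateTaskRunAttempt, and the coordinator listener registered further down in this diff answers by posting on attemptCreatedNotification, which is awaited with a 30-second timeout (3e4). A reduced sketch of the handshake; the AttemptCreated shape is inferred from the diff, and the wait is registered before posting only because this toy responder answers synchronously (the real reply arrives asynchronously over the coordinator socket):

    import { Evt } from "evt";

    type AttemptCreated =
      | { success: true; execution: { attempt: { id: string } } }
      | { success: false; reason?: string };

    const onCreateTaskRunAttempt = new Evt<{ runId: string }>();
    const attemptCreatedNotification = new Evt<AttemptCreated>();

    // Responder side: answer every request (canned success for the sketch).
    onCreateTaskRunAttempt.attach(({ runId }) => {
      attemptCreatedNotification.post({
        success: true,
        execution: { attempt: { id: `attempt_for_${runId}` } },
      });
    });

    // Requester side: register the wait, fire the request, await the reply.
    async function createAttempt(runId: string): Promise<AttemptCreated> {
      const reply = attemptCreatedNotification.waitFor(30_000);
      onCreateTaskRunAttempt.post({ runId });
      return reply;
    }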
@@ -378,22 +551,31 @@ var ProdBackgroundWorker = class {
  }
  };
  var TaskRunProcess = class {
- constructor(execution, path, env, metadata, worker) {
- this.execution = execution;
+ constructor(runId, isTest, path, env, metadata, worker, messageId) {
+ this.runId = runId;
+ this.isTest = isTest;
  this.path = path;
  this.env = env;
  this.metadata = metadata;
  this.worker = worker;
+ this.messageId = messageId;
  }
  _ipc;
  _child;
+ _childPid;
  _attemptPromises = /* @__PURE__ */ new Map();
  _attemptStatuses = /* @__PURE__ */ new Map();
  _currentExecution;
  _isBeingKilled = false;
  _isBeingCancelled = false;
+ _gracefulExitTimeoutElapsed = false;
+ /**
+ * @deprecated use onTaskRunHeartbeat instead
+ */
  onTaskHeartbeat = new Evt();
+ onTaskRunHeartbeat = new Evt();
  onExit = new Evt();
+ onIsBeingKilled = new Evt();
  onWaitForBatch = new Evt();
  onWaitForDuration = new Evt();
  onWaitForTask = new Evt();
@@ -413,7 +595,7 @@ var TaskRunProcess = class {
  "ipc"
  ],
  env: {
- ...this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {},
+ ...this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {},
  ...this.env,
  OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({
  [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir
@@ -421,6 +603,7 @@ var TaskRunProcess = class {
  ...this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}
  }
  });
+ this._childPid = this._child?.pid;
  this._ipc = new ZodIpcConnection({
  listenSchema: ProdChildToWorkerMessages,
  emitSchema: ProdWorkerToChildMessages,
@@ -444,7 +627,11 @@ var TaskRunProcess = class {
  process.exit(0);
  },
  TASK_HEARTBEAT: async (message) => {
- this.onTaskHeartbeat.post(message.id);
+ if (this.messageId) {
+ this.onTaskRunHeartbeat.post(this.messageId);
+ } else {
+ this.onTaskHeartbeat.post(message.id);
+ }
  },
  TASKS_READY: async (message) => {
  },
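The TASK_HEARTBEAT handler now forks on whether the process was constructed with a messageId: lazy-attempt runs heartbeat by run/message ID on the new event, while everything else stays on the deprecated attempt-ID channel. The routing in isolation, as a sketch with simplified types:

    function routeHeartbeat(
      message: { id: string },
      messageId: string | undefined,
      post: { runHeartbeat(id: string): void; attemptHeartbeat(id: string): void }
    ) {
      if (messageId) {
        post.runHeartbeat(messageId); // new path, keyed by run/message ID
      } else {
        post.attemptHeartbeat(message.id); // legacy path, keyed by attempt friendly ID
      }
    }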
@@ -502,15 +689,29 @@ var TaskRunProcess = class {
  this._isBeingCancelled = true;
  await this.cleanup(true);
  }
- async cleanup(kill = false) {
+ async cleanup(kill = false, gracefulExitTimeoutElapsed = false) {
+ console.log("cleanup()", { kill, gracefulExitTimeoutElapsed });
  if (kill && this._isBeingKilled) {
  return;
  }
- this._isBeingKilled = kill;
+ if (kill) {
+ this._isBeingKilled = true;
+ this.onIsBeingKilled.post(this);
+ }
+ const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution;
+ const killParentProcess = kill && !killChildProcess;
+ console.log("Cleaning up task run process", {
+ killChildProcess,
+ killParentProcess
+ });
  await this._ipc?.sendWithAck("CLEANUP", {
  flush: true,
- kill
+ kill: killParentProcess
  });
+ if (killChildProcess) {
+ this._gracefulExitTimeoutElapsed = true;
+ await this.kill("SIGKILL");
+ }
  }
  async executeTaskRun(payload) {
  let resolver;
@@ -534,14 +735,14 @@ var TaskRunProcess = class {
  this._currentExecution = void 0;
  return result;
  }
- taskRunCompletedNotification(completion, execution) {
+ taskRunCompletedNotification(completion) {
  if (!completion.ok && typeof completion.retry !== "undefined") {
  return;
  }
  if (this._child?.connected && !this._isBeingKilled && !this._child.killed) {
  this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
- completion,
- execution
+ version: "v2",
+ completion
  });
  }
  }
@@ -550,9 +751,11 @@ var TaskRunProcess = class {
  this._ipc?.send("WAIT_COMPLETED_NOTIFICATION", {});
  }
  }
- async #handleExit(code) {
+ async #handleExit(code, signal) {
+ console.log("handling child exit", { code, signal });
  for (const [id, status] of this._attemptStatuses.entries()) {
  if (status === "PENDING") {
+ console.log("found pending attempt", { id });
  this._attemptStatuses.set(id, "REJECTED");
  const attemptPromise = this._attemptPromises.get(id);
  if (!attemptPromise) {
@@ -561,14 +764,16 @@ var TaskRunProcess = class {
  const { rejecter } = attemptPromise;
  if (this._isBeingCancelled) {
  rejecter(new CancelledProcessError());
+ } else if (this._gracefulExitTimeoutElapsed) {
+ rejecter(new GracefulExitTimeoutError());
  } else if (this._isBeingKilled) {
  rejecter(new CleanupProcessError());
  } else {
- rejecter(new UnexpectedExitError(code));
+ rejecter(new UnexpectedExitError(code ?? -1));
  }
  }
  }
- this.onExit.post(code);
+ this.onExit.post({ code, signal, pid: this.pid });
  }
  #handleLog(data) {
  if (!this._currentExecution) {
@@ -590,11 +795,21 @@ var TaskRunProcess = class {
  `[${this.metadata.version}][${this._currentExecution.run.id}.${this._currentExecution.attempt.number}] ${data.toString()}`
  );
  }
- #kill() {
- if (this._child && !this._child.killed) {
- this._child?.kill();
+ async kill(signal, timeoutInMs) {
+ this._isBeingKilled = true;
+ const killTimeout = this.onExit.waitFor(timeoutInMs);
+ this.onIsBeingKilled.post(this);
+ this._child?.kill(signal);
+ if (timeoutInMs) {
+ await killTimeout;
  }
  }
+ get isBeingKilled() {
+ return this._isBeingKilled || this._child?.killed;
+ }
+ get pid() {
+ return this._childPid;
+ }
  };

  // src/workers/prod/entry-point.ts
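The private #kill() above has been replaced by a public async kill(signal, timeoutInMs) that marks the process as being killed, posts onIsBeingKilled (which is how the worker populates _taskRunProcessesBeingKilled), forwards the signal, and awaits the onExit event only when a timeout is given. A usage sketch matching the call sites in this diff; the declared type is an assumption:

    declare const proc: {
      kill(signal: NodeJS.Signals, timeoutInMs?: number): Promise<void>;
    };

    async function examples() {
      // Graceful kill with a 5s cap, swallowing the timeout rejection
      // (as in #killCurrentTaskRunProcessBeforeAttempt):
      proc.kill("SIGTERM", 5_000).catch(() => {});

      // Hard kill that returns without waiting for the exit event,
      // since no timeout is passed (as in forceKillOldTaskRunProcesses):
      await proc.kill("SIGKILL");
    }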
@@ -611,7 +826,88 @@ var ProdWorker = class {
  this.host = host;
  process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM"));
  this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST);
- this.#backgroundWorker = new ProdBackgroundWorker("worker.js", {
+ this.#backgroundWorker = this.#createBackgroundWorker();
+ this.#httpPort = port;
+ this.#httpServer = this.#createHttpServer();
+ }
+ apiUrl = process.env.TRIGGER_API_URL;
+ apiKey = process.env.TRIGGER_SECRET_KEY;
+ contentHash = process.env.TRIGGER_CONTENT_HASH;
+ projectRef = process.env.TRIGGER_PROJECT_REF;
+ envId = process.env.TRIGGER_ENV_ID;
+ runId = process.env.TRIGGER_RUN_ID || "index-only";
+ deploymentId = process.env.TRIGGER_DEPLOYMENT_ID;
+ deploymentVersion = process.env.TRIGGER_DEPLOYMENT_VERSION;
+ runningInKubernetes = !!process.env.KUBERNETES_PORT;
+ executing = false;
+ completed = /* @__PURE__ */ new Set();
+ paused = false;
+ attemptFriendlyId;
+ nextResumeAfter;
+ waitForPostStart = false;
+ #httpPort;
+ #backgroundWorker;
+ #httpServer;
+ #coordinatorSocket;
+ async #handleSignal(signal) {
+ logger2.log("Received signal", { signal });
+ if (signal === "SIGTERM") {
+ let gracefulExitTimeoutElapsed = false;
+ if (this.executing) {
+ const terminationGracePeriodSeconds = 60 * 60;
+ logger2.log("Waiting for attempt to complete before exiting", {
+ terminationGracePeriodSeconds
+ });
+ await setTimeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
+ gracefulExitTimeoutElapsed = true;
+ logger2.log("Termination timeout reached, exiting gracefully.");
+ } else {
+ logger2.log("Not executing, exiting immediately.");
+ }
+ await this.#exitGracefully(gracefulExitTimeoutElapsed);
+ return;
+ }
+ logger2.log("Unhandled signal", { signal });
+ }
+ async #exitGracefully(gracefulExitTimeoutElapsed = false) {
+ await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
+ if (!gracefulExitTimeoutElapsed) {
+ process.exit(0);
+ }
+ }
+ async #reconnect(isPostStart = false, reconnectImmediately = false) {
+ if (isPostStart) {
+ this.waitForPostStart = false;
+ }
+ this.#coordinatorSocket.close();
+ if (!reconnectImmediately) {
+ await setTimeout2(1e3);
+ }
+ let coordinatorHost = COORDINATOR_HOST;
+ try {
+ if (this.runningInKubernetes) {
+ coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
+ "\n",
+ ""
+ );
+ logger2.log("reconnecting", {
+ coordinatorHost: {
+ fromEnv: COORDINATOR_HOST,
+ fromVolume: coordinatorHost,
+ current: this.#coordinatorSocket.socket.io.opts.hostname
+ }
+ });
+ }
+ } catch (error) {
+ logger2.error("taskinfo read error during reconnect", {
+ error: error instanceof Error ? error.message : error
+ });
+ } finally {
+ this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
+ }
+ }
+ #createBackgroundWorker() {
+ const backgroundWorker = new ProdBackgroundWorker("worker.js", {
  projectConfig: __PROJECT_CONFIG__,
  env: {
  ...gatherProcessEnv(),
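#handleSignal now threads a gracefulExitTimeoutElapsed flag through to close(): when an attempt is executing, the worker sleeps for just under the hard-coded one-hour grace period before closing with the flag set, which is what eventually lets cleanup() SIGKILL the child and reject the pending attempt with GracefulExitTimeoutError. A condensed sketch, assuming setTimeout2 is the promisified setTimeout from node:timers/promises:

    import { setTimeout as sleep } from "node:timers/promises";

    async function handleSigterm(
      executing: boolean,
      close: (gracefulExitTimeoutElapsed: boolean) => Promise<void>
    ) {
      let gracefulExitTimeoutElapsed = false;
      if (executing) {
        const terminationGracePeriodSeconds = 60 * 60;
        // Wake up 5s before the orchestrator's hard deadline.
        await sleep(terminationGracePeriodSeconds * 1000 - 5000);
        gracefulExitTimeoutElapsed = true;
      }
      await close(gracefulExitTimeoutElapsed);
      if (!gracefulExitTimeoutElapsed) {
        process.exit(0); // only exit here when nothing was cut short
      }
    }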
@@ -621,14 +917,17 @@ var ProdWorker = class {
  },
  contentHash: this.contentHash
  });
- this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
+ backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
  this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId });
  });
- this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
- await this.#backgroundWorker.flushTelemetry();
+ backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
+ this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
+ });
+ backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
+ await this.#prepareForCheckpoint();
  this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
  });
- this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => {
+ backgroundWorker.onCancelCheckpoint.attach(async (message) => {
  logger2.log("onCancelCheckpoint", { message });
  const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
  "CANCEL_CHECKPOINT",
@@ -637,6 +936,7 @@ var ProdWorker = class {
  reason: message.reason
  }
  );
+ logger2.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
  if (checkpointCanceled) {
  if (message.reason === "WAIT_FOR_DURATION") {
  this.paused = false;
@@ -644,11 +944,42 @@ var ProdWorker = class {
  this.waitForPostStart = false;
  }
  }
- this.#backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+ backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+ });
+ backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
+ logger2.log("onCreateTaskRunAttempt()", { message });
+ const createAttempt = await this.#coordinatorSocket.socket.emitWithAck(
+ "CREATE_TASK_RUN_ATTEMPT",
+ {
+ version: "v1",
+ runId: message.runId
+ }
+ );
+ if (!createAttempt.success) {
+ backgroundWorker.attemptCreatedNotification.post({
+ success: false,
+ reason: createAttempt.reason
+ });
+ return;
+ }
+ backgroundWorker.attemptCreatedNotification.post({
+ success: true,
+ execution: createAttempt.executionPayload.execution
+ });
+ });
+ backgroundWorker.attemptCreatedNotification.attach((message) => {
+ if (!message.success) {
+ return;
+ }
+ this.attemptFriendlyId = message.execution.attempt.id;
  });
- this.#backgroundWorker.onWaitForDuration.attach(async (message) => {
+ backgroundWorker.onWaitForDuration.attach(async (message) => {
  if (!this.attemptFriendlyId) {
  logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError(
+ "NoAttemptId",
+ "Attempt ID not set before waiting for duration"
+ );
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -660,9 +991,10 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
  });
- this.#backgroundWorker.onWaitForTask.attach(async (message) => {
+ backgroundWorker.onWaitForTask.attach(async (message) => {
  if (!this.attemptFriendlyId) {
  logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task");
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -674,9 +1006,10 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
  });
- this.#backgroundWorker.onWaitForBatch.attach(async (message) => {
+ backgroundWorker.onWaitForBatch.attach(async (message) => {
  if (!this.attemptFriendlyId) {
  logger2.error("Failed to send wait message, attempt friendly ID not set", { message });
+ this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
  return;
  }
  const { willCheckpointAndRestore } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -688,77 +1021,7 @@ var ProdWorker = class {
  );
  this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
  });
- this.#httpPort = port;
- this.#httpServer = this.#createHttpServer();
- }
- apiUrl = process.env.TRIGGER_API_URL;
- apiKey = process.env.TRIGGER_SECRET_KEY;
- contentHash = process.env.TRIGGER_CONTENT_HASH;
- projectRef = process.env.TRIGGER_PROJECT_REF;
- envId = process.env.TRIGGER_ENV_ID;
- runId = process.env.TRIGGER_RUN_ID || "index-only";
- deploymentId = process.env.TRIGGER_DEPLOYMENT_ID;
- deploymentVersion = process.env.TRIGGER_DEPLOYMENT_VERSION;
- runningInKubernetes = !!process.env.KUBERNETES_PORT;
- executing = false;
- completed = /* @__PURE__ */ new Set();
- paused = false;
- attemptFriendlyId;
- nextResumeAfter;
- waitForPostStart = false;
- #httpPort;
- #backgroundWorker;
- #httpServer;
- #coordinatorSocket;
- async #handleSignal(signal) {
- logger2.log("Received signal", { signal });
- if (signal === "SIGTERM") {
- if (this.executing) {
- const terminationGracePeriodSeconds = 60 * 60;
- logger2.log("Waiting for attempt to complete before exiting", {
- terminationGracePeriodSeconds
- });
- await setTimeout2(terminationGracePeriodSeconds * 1e3 - 5e3);
- logger2.log("Termination timeout reached, exiting gracefully.");
- } else {
- logger2.log("Not executing, exiting immediately.");
- }
- await this.#exitGracefully();
- }
- logger2.log("Unhandled signal", { signal });
- }
- async #exitGracefully() {
- await this.#backgroundWorker.close();
- process.exit(0);
- }
- async #reconnect(isPostStart = false, reconnectImmediately = false) {
- if (isPostStart) {
- this.waitForPostStart = false;
- }
- this.#coordinatorSocket.close();
- if (!reconnectImmediately) {
- await setTimeout2(1e3);
- }
- let coordinatorHost = COORDINATOR_HOST;
- try {
- if (this.runningInKubernetes) {
- coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
- "\n",
- ""
- );
- logger2.log("reconnecting", {
- coordinatorHost: {
- fromEnv: COORDINATOR_HOST,
- fromVolume: coordinatorHost,
- current: this.#coordinatorSocket.socket.io.opts.hostname
- }
- });
- }
- } catch (error) {
- logger2.error("taskinfo read error during reconnect", { error });
- } finally {
- this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
- }
+ return backgroundWorker;
  }
  async #prepareForWait(reason, willCheckpointAndRestore) {
  logger2.log(`prepare for ${reason}`, { willCheckpointAndRestore });
@@ -768,7 +1031,7 @@ var ProdWorker = class {
  this.nextResumeAfter = reason;
  this.waitForPostStart = true;
  if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
- await this.#backgroundWorker.flushTelemetry();
+ await this.#prepareForCheckpoint();
  }
  }
  }
@@ -779,15 +1042,25 @@ var ProdWorker = class {
  logger2.log("WARNING: Will checkpoint but also requested exit. This won't end well.");
  }
  await this.#exitGracefully();
+ return;
  }
+ this.paused = false;
+ this.waitForPostStart = false;
  this.executing = false;
  this.attemptFriendlyId = void 0;
  if (willCheckpointAndRestore) {
  this.waitForPostStart = true;
+ this.#prepareForCheckpoint(false);
  this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
  return;
  }
  }
+ async #prepareForCheckpoint(flush = true) {
+ if (flush) {
+ await this.#backgroundWorker.flushTelemetry();
+ }
+ await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+ }
  #resumeAfterDuration() {
  this.paused = false;
  this.nextResumeAfter = void 0;
@@ -817,11 +1090,8 @@ var ProdWorker = class {
  if (this.attemptFriendlyId) {
  extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId;
  }
- logger2.log("connecting to coordinator", {
- host,
- port: COORDINATOR_PORT,
- extraHeaders
- });
+ logger2.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`);
+ logger2.debug(`connecting with extra headers`, { extraHeaders });
  const coordinatorConnection = new ZodSocketConnection2({
  namespace: "prod-worker",
  host,
@@ -830,50 +1100,38 @@ var ProdWorker = class {
  serverMessages: CoordinatorToProdWorkerMessages,
  extraHeaders,
  handlers: {
- RESUME_AFTER_DEPENDENCY: async (message) => {
+ RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
  if (!this.paused) {
- logger2.error("worker not paused", {
- completions: message.completions,
- executions: message.executions
- });
- return;
- }
- if (message.completions.length !== message.executions.length) {
- logger2.error("did not receive the same number of completions and executions", {
- completions: message.completions,
- executions: message.executions
- });
+ logger2.error("Failed to resume after dependency: Worker not paused");
  return;
  }
- if (message.completions.length === 0 || message.executions.length === 0) {
- logger2.error("no completions or executions", {
- completions: message.completions,
- executions: message.executions
- });
+ if (completions.length === 0) {
+ logger2.error("Failed to resume after dependency: No completions");
  return;
  }
  if (this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH") {
- logger2.error("not waiting to resume after dependency", {
+ logger2.error("Failed to resume after dependency: Invalid next resume", {
  nextResumeAfter: this.nextResumeAfter
  });
  return;
  }
- if (this.nextResumeAfter === "WAIT_FOR_TASK" && message.completions.length > 1) {
- logger2.error("waiting for single task but got multiple completions", {
- completions: message.completions,
- executions: message.executions
- });
+ if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) {
+ logger2.error(
+ "Failed to resume after dependency: Waiting for single task but got multiple completions",
+ {
+ completions
+ }
+ );
  return;
  }
  this.paused = false;
  this.nextResumeAfter = void 0;
  this.waitForPostStart = false;
- for (let i = 0; i < message.completions.length; i++) {
- const completion = message.completions[i];
- const execution = message.executions[i];
- if (!completion || !execution)
+ for (let i = 0; i < completions.length; i++) {
+ const completion = completions[i];
+ if (!completion)
  continue;
- this.#backgroundWorker.taskRunCompletedNotification(completion, execution);
+ this.#backgroundWorker.taskRunCompletedNotification(completion);
  }
  },
  RESUME_AFTER_DURATION: async (message) => {
@@ -913,13 +1171,59 @@ var ProdWorker = class {
  logger2.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
  this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
  },
+ EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+ if (this.executing) {
+ logger2.error("dropping execute request, already executing");
+ return;
+ }
+ this.executing = true;
+ try {
+ const { completion, execution } = await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
+ logger2.log("completed", completion);
+ this.completed.add(execution.attempt.id);
+ const { willCheckpointAndRestore, shouldExit } = await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
+ version: "v1",
+ execution,
+ completion
+ });
+ logger2.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+ this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+ } catch (error) {
+ const completion = {
+ ok: false,
+ id: message.lazyPayload.runId,
+ retry: void 0,
+ error: error instanceof Error ? {
+ type: "BUILT_IN_ERROR",
+ name: error.name,
+ message: error.message,
+ stackTrace: error.stack ?? ""
+ } : {
+ type: "BUILT_IN_ERROR",
+ name: "UnknownError",
+ message: String(error),
+ stackTrace: ""
+ }
+ };
+ this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
+ version: "v1",
+ completion
+ });
+ }
+ },
  REQUEST_ATTEMPT_CANCELLATION: async (message) => {
  if (!this.executing) {
+ logger2.log("dropping cancel request, not executing", { status: this.#status });
  return;
  }
+ logger2.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status });
  await this.#backgroundWorker.cancelAttempt(message.attemptId);
  },
- REQUEST_EXIT: async () => {
+ REQUEST_EXIT: async (message) => {
+ if (message.version === "v2" && message.delayInMs) {
+ logger2.log("exit requested with delay", { delayInMs: message.delayInMs });
+ await setTimeout2(message.delayInMs);
+ }
  this.#coordinatorSocket.close();
  process.exit(0);
  },
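When the lazy-attempt path throws before any completion exists, the handler above synthesizes a failed completion from whatever was thrown and reports it via TASK_RUN_FAILED_TO_RUN instead of TASK_RUN_COMPLETED. The error normalization is the reusable piece; extracted as a sketch, with the helper name and type alias as illustrative additions:

    type BuiltInError = {
      type: "BUILT_IN_ERROR";
      name: string;
      message: string;
      stackTrace: string;
    };

    // Normalize anything thrown into the wire shape used by the failure path.
    function toBuiltInError(error: unknown): BuiltInError {
      return error instanceof Error
        ? {
            type: "BUILT_IN_ERROR",
            name: error.name,
            message: error.message,
            stackTrace: error.stack ?? "",
          }
        : {
            type: "BUILT_IN_ERROR",
            name: "UnknownError",
            message: String(error),
            stackTrace: "",
          };
    }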
@@ -927,7 +1231,7 @@ var ProdWorker = class {
  if (this.completed.size < 1) {
  return;
  }
- this.#coordinatorSocket.socket.emit("READY_FOR_EXECUTION", {
+ this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
  version: "v1",
  runId: this.runId,
  totalCompletions: this.completed.size
@@ -935,16 +1239,26 @@ var ProdWorker = class {
  }
  },
  onConnection: async (socket, handler, sender, logger3) => {
+ logger3.log("connected to coordinator", { status: this.#status });
  if (this.waitForPostStart) {
  logger3.log("skip connection handler, waiting for post start hook");
  return;
  }
  if (this.paused) {
  if (!this.nextResumeAfter) {
+ logger3.error("Missing next resume reason", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoNextResume",
+ "Next resume reason not set while resuming from paused state"
+ );
  return;
  }
  if (!this.attemptFriendlyId) {
- logger3.error("Missing friendly ID");
+ logger3.error("Missing friendly ID", { status: this.#status });
+ this.#emitUnrecoverableError(
+ "NoAttemptId",
+ "Attempt ID not set while resuming from paused state"
+ );
  return;
  }
  socket.emit("READY_FOR_RESUME", {
@@ -958,9 +1272,10 @@ var ProdWorker = class {
  try {
  const taskResources = await this.#initializeWorker();
  const { success } = await socket.emitWithAck("INDEX_TASKS", {
- version: "v1",
+ version: "v2",
  deploymentId: this.deploymentId,
- ...taskResources
+ ...taskResources,
+ supportsLazyAttempts: true
  });
  if (success) {
  logger3.info("indexing done, shutting down..");
@@ -1036,7 +1351,7 @@ var ProdWorker = class {
  if (this.executing) {
  return;
  }
- socket.emit("READY_FOR_EXECUTION", {
+ socket.emit("READY_FOR_LAZY_ATTEMPT", {
  version: "v1",
  runId: this.runId,
  totalCompletions: this.completed.size
@@ -1067,12 +1382,7 @@ var ProdWorker = class {
  return reply.text("ok");
  }
  case "/status": {
- return reply.json({
- executing: this.executing,
- paused: this.paused,
- completed: this.completed.size,
- nextResumeAfter: this.nextResumeAfter
- });
+ return reply.json(this.#status);
  }
  case "/connect": {
  this.#coordinatorSocket.connect();
@@ -1193,6 +1503,25 @@ var ProdWorker = class {
  const data = await response.json();
  return data?.variables ?? {};
  }
+ get #status() {
+ return {
+ executing: this.executing,
+ paused: this.paused,
+ completed: this.completed.size,
+ nextResumeAfter: this.nextResumeAfter,
+ waitForPostStart: this.waitForPostStart,
+ attemptFriendlyId: this.attemptFriendlyId
+ };
+ }
+ #emitUnrecoverableError(name, message) {
+ this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", {
+ version: "v1",
+ error: {
+ name,
+ message
+ }
+ });
+ }
  start() {
  this.#httpServer.listen(this.#httpPort, this.host);
  }
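The /status endpoint now returns the shared #status getter, so HTTP responses and the status objects attached to log lines report the same snapshot. An illustrative response body for a worker mid-execution; note that fields whose value is undefined, such as nextResumeAfter here, would be dropped by JSON serialization:

    {
      "executing": true,
      "paused": false,
      "completed": 2,
      "waitForPostStart": false,
      "attemptFriendlyId": "attempt_abcd1234"
    }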