trigger.dev 0.0.0-re2-20250408120954 → 0.0.0-re2-20250409105131
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/entryPoints/managed-run-controller.js +532 -218
- package/dist/esm/entryPoints/managed-run-controller.js.map +1 -1
- package/dist/esm/executions/taskRunProcess.d.ts +2 -0
- package/dist/esm/executions/taskRunProcess.js +12 -2
- package/dist/esm/executions/taskRunProcess.js.map +1 -1
- package/dist/esm/version.js +1 -1
- package/package.json +3 -3
|
@@ -4,7 +4,7 @@ import { env as stdEnv } from "std-env";
|
|
|
4
4
|
import { z } from "zod";
|
|
5
5
|
import { randomUUID } from "crypto";
|
|
6
6
|
import { readJSONFile } from "../utilities/fileSystem.js";
|
|
7
|
-
import { HeartbeatService, WorkerManifest, } from "@trigger.dev/core/v3";
|
|
7
|
+
import { HeartbeatService, SuspendedProcessError, WorkerManifest, } from "@trigger.dev/core/v3";
|
|
8
8
|
import { WarmStartClient, WORKLOAD_HEADERS, WorkloadHttpClient, } from "@trigger.dev/core/v3/workers";
|
|
9
9
|
import { assertExhaustive } from "../utilities/assertExhaustive.js";
|
|
10
10
|
import { setTimeout as sleep } from "timers/promises";
|
|
@@ -36,6 +36,7 @@ const Env = z.object({
|
|
|
36
36
|
TRIGGER_MACHINE_MEMORY: z.string().default("0"),
|
|
37
37
|
TRIGGER_RUNNER_ID: z.string(),
|
|
38
38
|
TRIGGER_METADATA_URL: z.string().optional(),
|
|
39
|
+
TRIGGER_PRE_SUSPEND_WAIT_MS: z.coerce.number().default(200),
|
|
39
40
|
// Timeline metrics
|
|
40
41
|
TRIGGER_POD_SCHEDULED_AT_MS: DateEnv,
|
|
41
42
|
TRIGGER_DEQUEUED_AT_MS: DateEnv,
|
|
@@ -85,14 +86,8 @@ class ManagedRunController {
|
|
|
85
86
|
failureExitCode = env.TRIGGER_FAILURE_EXIT_CODE;
|
|
86
87
|
state = { phase: "IDLE" };
|
|
87
88
|
constructor(opts) {
|
|
88
|
-
logger.debug("[ManagedRunController] Creating controller", { env });
|
|
89
89
|
this.workerManifest = opts.workerManifest;
|
|
90
90
|
this.runnerId = env.TRIGGER_RUNNER_ID;
|
|
91
|
-
this.heartbeatIntervalSeconds = env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS;
|
|
92
|
-
this.snapshotPollIntervalSeconds = env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS;
|
|
93
|
-
if (env.TRIGGER_METADATA_URL) {
|
|
94
|
-
this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL);
|
|
95
|
-
}
|
|
96
91
|
this.workerApiUrl = `${env.TRIGGER_SUPERVISOR_API_PROTOCOL}://${env.TRIGGER_SUPERVISOR_API_DOMAIN}:${env.TRIGGER_SUPERVISOR_API_PORT}`;
|
|
97
92
|
this.workerInstanceName = env.TRIGGER_WORKER_INSTANCE_NAME;
|
|
98
93
|
this.httpClient = new WorkloadHttpClient({
|
|
@@ -102,6 +97,21 @@ class ManagedRunController {
|
|
|
102
97
|
deploymentVersion: env.TRIGGER_DEPLOYMENT_VERSION,
|
|
103
98
|
projectRef: env.TRIGGER_PROJECT_REF,
|
|
104
99
|
});
|
|
100
|
+
const properties = {
|
|
101
|
+
...env,
|
|
102
|
+
TRIGGER_POD_SCHEDULED_AT_MS: env.TRIGGER_POD_SCHEDULED_AT_MS.toISOString(),
|
|
103
|
+
TRIGGER_DEQUEUED_AT_MS: env.TRIGGER_DEQUEUED_AT_MS.toISOString(),
|
|
104
|
+
};
|
|
105
|
+
this.sendDebugLog({
|
|
106
|
+
runId: env.TRIGGER_RUN_ID,
|
|
107
|
+
message: "Creating run controller",
|
|
108
|
+
properties,
|
|
109
|
+
});
|
|
110
|
+
this.heartbeatIntervalSeconds = env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS;
|
|
111
|
+
this.snapshotPollIntervalSeconds = env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS;
|
|
112
|
+
if (env.TRIGGER_METADATA_URL) {
|
|
113
|
+
this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL);
|
|
114
|
+
}
|
|
105
115
|
if (env.TRIGGER_WARM_START_URL) {
|
|
106
116
|
this.warmStartClient = new WarmStartClient({
|
|
107
117
|
apiUrl: new URL(env.TRIGGER_WARM_START_URL),
|
|
@@ -115,10 +125,16 @@ class ManagedRunController {
|
|
|
115
125
|
this.snapshotPoller = new HeartbeatService({
|
|
116
126
|
heartbeat: async () => {
|
|
117
127
|
if (!this.runFriendlyId) {
|
|
118
|
-
|
|
128
|
+
this.sendDebugLog({
|
|
129
|
+
runId: env.TRIGGER_RUN_ID,
|
|
130
|
+
message: "Skipping snapshot poll, no run ID",
|
|
131
|
+
});
|
|
119
132
|
return;
|
|
120
133
|
}
|
|
121
|
-
|
|
134
|
+
this.sendDebugLog({
|
|
135
|
+
runId: env.TRIGGER_RUN_ID,
|
|
136
|
+
message: "Polling for latest snapshot",
|
|
137
|
+
});
|
|
122
138
|
this.sendDebugLog({
|
|
123
139
|
runId: this.runFriendlyId,
|
|
124
140
|
message: `snapshot poll: started`,
|
|
@@ -128,7 +144,13 @@ class ManagedRunController {
|
|
|
128
144
|
});
|
|
129
145
|
const response = await this.httpClient.getRunExecutionData(this.runFriendlyId);
|
|
130
146
|
if (!response.success) {
|
|
131
|
-
|
|
147
|
+
this.sendDebugLog({
|
|
148
|
+
runId: this.runFriendlyId,
|
|
149
|
+
message: "Snapshot poll failed",
|
|
150
|
+
properties: {
|
|
151
|
+
error: response.error,
|
|
152
|
+
},
|
|
153
|
+
});
|
|
132
154
|
this.sendDebugLog({
|
|
133
155
|
runId: this.runFriendlyId,
|
|
134
156
|
message: `snapshot poll: failed`,
|
|
@@ -144,19 +166,28 @@ class ManagedRunController {
|
|
|
144
166
|
intervalMs: this.snapshotPollIntervalSeconds * 1000,
|
|
145
167
|
leadingEdge: false,
|
|
146
168
|
onError: async (error) => {
|
|
147
|
-
|
|
169
|
+
this.sendDebugLog({
|
|
170
|
+
runId: this.runFriendlyId,
|
|
171
|
+
message: "Failed to poll for snapshot",
|
|
172
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
173
|
+
});
|
|
148
174
|
},
|
|
149
175
|
});
|
|
150
176
|
this.runHeartbeat = new HeartbeatService({
|
|
151
177
|
heartbeat: async () => {
|
|
152
178
|
if (!this.runFriendlyId || !this.snapshotFriendlyId) {
|
|
153
|
-
|
|
179
|
+
this.sendDebugLog({
|
|
180
|
+
runId: this.runFriendlyId,
|
|
181
|
+
message: "Skipping heartbeat, no run ID or snapshot ID",
|
|
182
|
+
});
|
|
154
183
|
return;
|
|
155
184
|
}
|
|
156
|
-
|
|
185
|
+
this.sendDebugLog({
|
|
186
|
+
runId: this.runFriendlyId,
|
|
187
|
+
message: "heartbeat: started",
|
|
188
|
+
});
|
|
157
189
|
const response = await this.httpClient.heartbeatRun(this.runFriendlyId, this.snapshotFriendlyId);
|
|
158
190
|
if (!response.success) {
|
|
159
|
-
console.error("[ManagedRunController] Heartbeat failed", { error: response.error });
|
|
160
191
|
this.sendDebugLog({
|
|
161
192
|
runId: this.runFriendlyId,
|
|
162
193
|
message: "heartbeat: failed",
|
|
@@ -169,11 +200,18 @@ class ManagedRunController {
|
|
|
169
200
|
intervalMs: this.heartbeatIntervalSeconds * 1000,
|
|
170
201
|
leadingEdge: false,
|
|
171
202
|
onError: async (error) => {
|
|
172
|
-
|
|
203
|
+
this.sendDebugLog({
|
|
204
|
+
runId: this.runFriendlyId,
|
|
205
|
+
message: "Failed to send heartbeat",
|
|
206
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
207
|
+
});
|
|
173
208
|
},
|
|
174
209
|
});
|
|
175
210
|
process.on("SIGTERM", async () => {
|
|
176
|
-
|
|
211
|
+
this.sendDebugLog({
|
|
212
|
+
runId: this.runFriendlyId,
|
|
213
|
+
message: "Received SIGTERM, stopping worker",
|
|
214
|
+
});
|
|
177
215
|
await this.stop();
|
|
178
216
|
});
|
|
179
217
|
}
|
|
@@ -214,7 +252,11 @@ class ManagedRunController {
|
|
|
214
252
|
throw new Error("Mismatched run IDs");
|
|
215
253
|
}
|
|
216
254
|
if (this.state.snapshot.friendlyId === snapshot.friendlyId) {
|
|
217
|
-
|
|
255
|
+
this.sendDebugLog({
|
|
256
|
+
runId: run.friendlyId,
|
|
257
|
+
message: "updateRunPhase: Snapshot not changed",
|
|
258
|
+
properties: { run: run.friendlyId, snapshot: snapshot.friendlyId },
|
|
259
|
+
});
|
|
218
260
|
this.sendDebugLog({
|
|
219
261
|
runId: run.friendlyId,
|
|
220
262
|
message: `updateRunPhase: Snapshot not changed`,
|
|
@@ -249,15 +291,27 @@ class ManagedRunController {
|
|
|
249
291
|
onExitRunPhase(newRun = undefined) {
|
|
250
292
|
// We're not in a run phase, nothing to do
|
|
251
293
|
if (this.state.phase !== "RUN") {
|
|
252
|
-
|
|
294
|
+
this.sendDebugLog({
|
|
295
|
+
runId: this.runFriendlyId,
|
|
296
|
+
message: "onExitRunPhase: Not in run phase, skipping",
|
|
297
|
+
properties: { phase: this.state.phase },
|
|
298
|
+
});
|
|
253
299
|
return;
|
|
254
300
|
}
|
|
255
301
|
// This is still the same run, so we're not exiting the phase
|
|
256
302
|
if (newRun?.friendlyId === this.state.run.friendlyId) {
|
|
257
|
-
|
|
303
|
+
this.sendDebugLog({
|
|
304
|
+
runId: this.runFriendlyId,
|
|
305
|
+
message: "onExitRunPhase: Same run, skipping",
|
|
306
|
+
properties: { newRun: newRun?.friendlyId },
|
|
307
|
+
});
|
|
258
308
|
return;
|
|
259
309
|
}
|
|
260
|
-
|
|
310
|
+
this.sendDebugLog({
|
|
311
|
+
runId: this.runFriendlyId,
|
|
312
|
+
message: "onExitRunPhase: Exiting run phase",
|
|
313
|
+
properties: { newRun: newRun?.friendlyId },
|
|
314
|
+
});
|
|
261
315
|
this.runHeartbeat.stop();
|
|
262
316
|
this.snapshotPoller.stop();
|
|
263
317
|
const { run, snapshot } = this.state;
|
|
@@ -300,15 +354,22 @@ class ManagedRunController {
|
|
|
300
354
|
handleSnapshotChangeLock = false;
|
|
301
355
|
async handleSnapshotChange({ run, snapshot, completedWaitpoints, }) {
|
|
302
356
|
if (this.handleSnapshotChangeLock) {
|
|
303
|
-
|
|
357
|
+
this.sendDebugLog({
|
|
358
|
+
runId: run.friendlyId,
|
|
359
|
+
message: "handleSnapshotChange: already in progress",
|
|
360
|
+
});
|
|
304
361
|
return;
|
|
305
362
|
}
|
|
306
363
|
this.handleSnapshotChangeLock = true;
|
|
307
364
|
try {
|
|
308
365
|
if (!this.snapshotFriendlyId) {
|
|
309
|
-
|
|
366
|
+
this.sendDebugLog({
|
|
310
367
|
runId: run.friendlyId,
|
|
311
|
-
|
|
368
|
+
message: "handleSnapshotChange: Missing snapshot ID",
|
|
369
|
+
properties: {
|
|
370
|
+
newSnapshotId: snapshot.friendlyId,
|
|
371
|
+
newSnapshotStatus: snapshot.executionStatus,
|
|
372
|
+
},
|
|
312
373
|
});
|
|
313
374
|
this.sendDebugLog({
|
|
314
375
|
runId: run.friendlyId,
|
|
@@ -321,7 +382,11 @@ class ManagedRunController {
|
|
|
321
382
|
return;
|
|
322
383
|
}
|
|
323
384
|
if (this.snapshotFriendlyId === snapshot.friendlyId) {
|
|
324
|
-
|
|
385
|
+
this.sendDebugLog({
|
|
386
|
+
runId: run.friendlyId,
|
|
387
|
+
message: "handleSnapshotChange: snapshot not changed, skipping",
|
|
388
|
+
properties: { snapshot: snapshot.friendlyId },
|
|
389
|
+
});
|
|
325
390
|
this.sendDebugLog({
|
|
326
391
|
runId: run.friendlyId,
|
|
327
392
|
message: "snapshot change: skipping, no change",
|
|
@@ -332,12 +397,6 @@ class ManagedRunController {
|
|
|
332
397
|
});
|
|
333
398
|
return;
|
|
334
399
|
}
|
|
335
|
-
console.log(`handleSnapshotChange: ${snapshot.executionStatus}`, {
|
|
336
|
-
run,
|
|
337
|
-
oldSnapshotId: this.snapshotFriendlyId,
|
|
338
|
-
newSnapshot: snapshot,
|
|
339
|
-
completedWaitpoints: completedWaitpoints.length,
|
|
340
|
-
});
|
|
341
400
|
this.sendDebugLog({
|
|
342
401
|
runId: run.friendlyId,
|
|
343
402
|
message: `snapshot change: ${snapshot.executionStatus}`,
|
|
@@ -351,11 +410,6 @@ class ManagedRunController {
|
|
|
351
410
|
this.updateRunPhase(run, snapshot);
|
|
352
411
|
}
|
|
353
412
|
catch (error) {
|
|
354
|
-
console.error("handleSnapshotChange: failed to update run phase", {
|
|
355
|
-
run,
|
|
356
|
-
snapshot,
|
|
357
|
-
error,
|
|
358
|
-
});
|
|
359
413
|
this.sendDebugLog({
|
|
360
414
|
runId: run.friendlyId,
|
|
361
415
|
message: "snapshot change: failed to update run phase",
|
|
@@ -373,8 +427,12 @@ class ManagedRunController {
|
|
|
373
427
|
await this.cancelAttempt(run.friendlyId);
|
|
374
428
|
}
|
|
375
429
|
catch (error) {
|
|
376
|
-
|
|
377
|
-
|
|
430
|
+
this.sendDebugLog({
|
|
431
|
+
runId: run.friendlyId,
|
|
432
|
+
message: "snapshot change: failed to cancel attempt",
|
|
433
|
+
properties: {
|
|
434
|
+
error: error instanceof Error ? error.message : String(error),
|
|
435
|
+
},
|
|
378
436
|
});
|
|
379
437
|
this.waitForNextRun();
|
|
380
438
|
return;
|
|
@@ -382,46 +440,73 @@ class ManagedRunController {
|
|
|
382
440
|
return;
|
|
383
441
|
}
|
|
384
442
|
case "FINISHED": {
|
|
385
|
-
|
|
443
|
+
this.sendDebugLog({
|
|
444
|
+
runId: run.friendlyId,
|
|
445
|
+
message: "Run is finished, will wait for next run",
|
|
446
|
+
});
|
|
386
447
|
this.waitForNextRun();
|
|
387
448
|
return;
|
|
388
449
|
}
|
|
389
450
|
case "QUEUED_EXECUTING":
|
|
390
451
|
case "EXECUTING_WITH_WAITPOINTS": {
|
|
391
|
-
|
|
452
|
+
this.sendDebugLog({
|
|
453
|
+
runId: run.friendlyId,
|
|
454
|
+
message: "Run is executing with waitpoints",
|
|
455
|
+
properties: { snapshot: snapshot.friendlyId },
|
|
456
|
+
});
|
|
392
457
|
try {
|
|
458
|
+
// This should never throw. It should also never fail the run.
|
|
393
459
|
await this.taskRunProcess?.cleanup(false);
|
|
394
460
|
}
|
|
395
461
|
catch (error) {
|
|
396
|
-
|
|
462
|
+
this.sendDebugLog({
|
|
463
|
+
runId: run.friendlyId,
|
|
464
|
+
message: "Failed to cleanup task run process",
|
|
465
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
466
|
+
});
|
|
397
467
|
}
|
|
398
468
|
if (snapshot.friendlyId !== this.snapshotFriendlyId) {
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
469
|
+
this.sendDebugLog({
|
|
470
|
+
runId: run.friendlyId,
|
|
471
|
+
message: "Snapshot changed after cleanup, abort",
|
|
472
|
+
properties: {
|
|
473
|
+
oldSnapshotId: snapshot.friendlyId,
|
|
474
|
+
newSnapshotId: this.snapshotFriendlyId,
|
|
475
|
+
},
|
|
402
476
|
});
|
|
403
477
|
return;
|
|
404
478
|
}
|
|
405
|
-
|
|
406
|
-
await sleep(200);
|
|
479
|
+
await sleep(env.TRIGGER_PRE_SUSPEND_WAIT_MS);
|
|
407
480
|
if (snapshot.friendlyId !== this.snapshotFriendlyId) {
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
481
|
+
this.sendDebugLog({
|
|
482
|
+
runId: run.friendlyId,
|
|
483
|
+
message: "Snapshot changed after suspend threshold, abort",
|
|
484
|
+
properties: {
|
|
485
|
+
oldSnapshotId: snapshot.friendlyId,
|
|
486
|
+
newSnapshotId: this.snapshotFriendlyId,
|
|
487
|
+
},
|
|
411
488
|
});
|
|
412
489
|
return;
|
|
413
490
|
}
|
|
414
491
|
if (!this.runFriendlyId || !this.snapshotFriendlyId) {
|
|
415
|
-
|
|
416
|
-
runId:
|
|
417
|
-
|
|
492
|
+
this.sendDebugLog({
|
|
493
|
+
runId: run.friendlyId,
|
|
494
|
+
message: "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort",
|
|
495
|
+
properties: {
|
|
496
|
+
runId: this.runFriendlyId,
|
|
497
|
+
snapshotId: this.snapshotFriendlyId,
|
|
498
|
+
},
|
|
418
499
|
});
|
|
419
500
|
return;
|
|
420
501
|
}
|
|
421
502
|
const suspendResult = await this.httpClient.suspendRun(this.runFriendlyId, this.snapshotFriendlyId);
|
|
422
503
|
if (!suspendResult.success) {
|
|
423
|
-
|
|
424
|
-
|
|
504
|
+
this.sendDebugLog({
|
|
505
|
+
runId: run.friendlyId,
|
|
506
|
+
message: "Failed to suspend run, staying alive 🎶",
|
|
507
|
+
properties: {
|
|
508
|
+
error: suspendResult.error,
|
|
509
|
+
},
|
|
425
510
|
});
|
|
426
511
|
this.sendDebugLog({
|
|
427
512
|
runId: run.friendlyId,
|
|
@@ -434,9 +519,6 @@ class ManagedRunController {
|
|
|
434
519
|
return;
|
|
435
520
|
}
|
|
436
521
|
if (!suspendResult.data.ok) {
|
|
437
|
-
console.error("Failed to suspend run, staying alive 🎶🎶", {
|
|
438
|
-
suspendResult: suspendResult.data,
|
|
439
|
-
});
|
|
440
522
|
this.sendDebugLog({
|
|
441
523
|
runId: run.friendlyId,
|
|
442
524
|
message: "checkpoint: failed to suspend run",
|
|
@@ -447,21 +529,34 @@ class ManagedRunController {
|
|
|
447
529
|
});
|
|
448
530
|
return;
|
|
449
531
|
}
|
|
450
|
-
|
|
532
|
+
this.sendDebugLog({
|
|
533
|
+
runId: run.friendlyId,
|
|
534
|
+
message: "Suspending, any day now 🚬",
|
|
535
|
+
properties: { ok: suspendResult.data.ok },
|
|
536
|
+
});
|
|
451
537
|
return;
|
|
452
538
|
}
|
|
453
539
|
case "SUSPENDED": {
|
|
454
|
-
|
|
455
|
-
run,
|
|
456
|
-
|
|
540
|
+
this.sendDebugLog({
|
|
541
|
+
runId: run.friendlyId,
|
|
542
|
+
message: "Run was suspended, kill the process and wait for more runs",
|
|
543
|
+
properties: { run: run.friendlyId, snapshot: snapshot.friendlyId },
|
|
457
544
|
});
|
|
458
|
-
|
|
545
|
+
// This will kill the process and fail the execution with a SuspendedProcessError
|
|
546
|
+
await this.taskRunProcess?.suspend();
|
|
459
547
|
return;
|
|
460
548
|
}
|
|
461
549
|
case "PENDING_EXECUTING": {
|
|
462
|
-
|
|
550
|
+
this.sendDebugLog({
|
|
551
|
+
runId: run.friendlyId,
|
|
552
|
+
message: "Run is pending execution",
|
|
553
|
+
properties: { run: run.friendlyId, snapshot: snapshot.friendlyId },
|
|
554
|
+
});
|
|
463
555
|
if (completedWaitpoints.length === 0) {
|
|
464
|
-
|
|
556
|
+
this.sendDebugLog({
|
|
557
|
+
runId: run.friendlyId,
|
|
558
|
+
message: "No waitpoints to complete, nothing to do",
|
|
559
|
+
});
|
|
465
560
|
return;
|
|
466
561
|
}
|
|
467
562
|
// There are waitpoints to complete so we've been restored after being suspended
|
|
@@ -472,7 +567,6 @@ class ManagedRunController {
|
|
|
472
567
|
// We need to let the platform know we're ready to continue
|
|
473
568
|
const continuationResult = await this.httpClient.continueRunExecution(run.friendlyId, snapshot.friendlyId);
|
|
474
569
|
if (!continuationResult.success) {
|
|
475
|
-
console.error("Failed to continue execution", { error: continuationResult.error });
|
|
476
570
|
this.sendDebugLog({
|
|
477
571
|
runId: run.friendlyId,
|
|
478
572
|
message: "failed to continue execution",
|
|
@@ -486,14 +580,24 @@ class ManagedRunController {
|
|
|
486
580
|
return;
|
|
487
581
|
}
|
|
488
582
|
case "EXECUTING": {
|
|
489
|
-
|
|
583
|
+
this.sendDebugLog({
|
|
584
|
+
runId: run.friendlyId,
|
|
585
|
+
message: "Run is now executing",
|
|
586
|
+
properties: { run: run.friendlyId, snapshot: snapshot.friendlyId },
|
|
587
|
+
});
|
|
490
588
|
if (completedWaitpoints.length === 0) {
|
|
491
589
|
return;
|
|
492
590
|
}
|
|
493
|
-
|
|
591
|
+
this.sendDebugLog({
|
|
592
|
+
runId: run.friendlyId,
|
|
593
|
+
message: "Processing completed waitpoints",
|
|
594
|
+
properties: { completedWaitpoints: completedWaitpoints.length },
|
|
595
|
+
});
|
|
494
596
|
if (!this.taskRunProcess) {
|
|
495
|
-
|
|
496
|
-
|
|
597
|
+
this.sendDebugLog({
|
|
598
|
+
runId: run.friendlyId,
|
|
599
|
+
message: "No task run process, ignoring completed waitpoints",
|
|
600
|
+
properties: { completedWaitpoints: completedWaitpoints.length },
|
|
497
601
|
});
|
|
498
602
|
return;
|
|
499
603
|
}
|
|
@@ -504,7 +608,11 @@ class ManagedRunController {
|
|
|
504
608
|
}
|
|
505
609
|
case "RUN_CREATED":
|
|
506
610
|
case "QUEUED": {
|
|
507
|
-
|
|
611
|
+
this.sendDebugLog({
|
|
612
|
+
runId: run.friendlyId,
|
|
613
|
+
message: "Status change not handled",
|
|
614
|
+
properties: { status: snapshot.executionStatus },
|
|
615
|
+
});
|
|
508
616
|
return;
|
|
509
617
|
}
|
|
510
618
|
default: {
|
|
@@ -513,7 +621,6 @@ class ManagedRunController {
|
|
|
513
621
|
}
|
|
514
622
|
}
|
|
515
623
|
catch (error) {
|
|
516
|
-
console.error("handleSnapshotChange: unexpected error", { error });
|
|
517
624
|
this.sendDebugLog({
|
|
518
625
|
runId: run.friendlyId,
|
|
519
626
|
message: "snapshot change: unexpected error",
|
|
@@ -529,15 +636,25 @@ class ManagedRunController {
|
|
|
529
636
|
}
|
|
530
637
|
async processEnvOverrides() {
|
|
531
638
|
if (!this.metadataClient) {
|
|
532
|
-
|
|
639
|
+
this.sendDebugLog({
|
|
640
|
+
runId: this.runFriendlyId,
|
|
641
|
+
message: "No metadata client, skipping env overrides",
|
|
642
|
+
});
|
|
533
643
|
return;
|
|
534
644
|
}
|
|
535
645
|
const overrides = await this.metadataClient.getEnvOverrides();
|
|
536
646
|
if (!overrides) {
|
|
537
|
-
|
|
647
|
+
this.sendDebugLog({
|
|
648
|
+
runId: this.runFriendlyId,
|
|
649
|
+
message: "No env overrides, skipping",
|
|
650
|
+
});
|
|
538
651
|
return;
|
|
539
652
|
}
|
|
540
|
-
|
|
653
|
+
this.sendDebugLog({
|
|
654
|
+
runId: this.runFriendlyId,
|
|
655
|
+
message: "Processing env overrides",
|
|
656
|
+
properties: { ...overrides },
|
|
657
|
+
});
|
|
541
658
|
if (overrides.TRIGGER_SUCCESS_EXIT_CODE) {
|
|
542
659
|
this.successExitCode = overrides.TRIGGER_SUCCESS_EXIT_CODE;
|
|
543
660
|
}
|
|
@@ -569,111 +686,174 @@ class ManagedRunController {
|
|
|
569
686
|
this.httpClient.updateRunnerId(this.runnerId);
|
|
570
687
|
}
|
|
571
688
|
}
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
}
|
|
576
|
-
this.subscribeToRunNotifications({
|
|
577
|
-
run: { friendlyId: runFriendlyId },
|
|
578
|
-
snapshot: { friendlyId: snapshotFriendlyId },
|
|
579
|
-
});
|
|
580
|
-
const attemptStartedAt = Date.now();
|
|
581
|
-
const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, {
|
|
582
|
-
isWarmStart,
|
|
583
|
-
});
|
|
584
|
-
if (!start.success) {
|
|
585
|
-
console.error("[ManagedRunController] Failed to start run", { error: start.error });
|
|
689
|
+
activeRunExecution = null;
|
|
690
|
+
async startAndExecuteRunAttempt({ runFriendlyId, snapshotFriendlyId, dequeuedAt, podScheduledAt, isWarmStart, skipLockCheckForImmediateRetry: skipLockCheck, }) {
|
|
691
|
+
if (!skipLockCheck && this.activeRunExecution) {
|
|
586
692
|
this.sendDebugLog({
|
|
587
693
|
runId: runFriendlyId,
|
|
588
|
-
message: "
|
|
589
|
-
properties: {
|
|
590
|
-
error: start.error,
|
|
591
|
-
},
|
|
694
|
+
message: "startAndExecuteRunAttempt: already in progress",
|
|
592
695
|
});
|
|
593
|
-
this.waitForNextRun();
|
|
594
696
|
return;
|
|
595
697
|
}
|
|
596
|
-
const
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
event: "create_attempt",
|
|
607
|
-
timestamp: attemptStartedAt,
|
|
608
|
-
duration: attemptDuration,
|
|
609
|
-
},
|
|
610
|
-
]
|
|
611
|
-
.concat(dequeuedAt
|
|
612
|
-
? [
|
|
613
|
-
{
|
|
614
|
-
name: "start",
|
|
615
|
-
event: "dequeue",
|
|
616
|
-
timestamp: dequeuedAt.getTime(),
|
|
617
|
-
duration: 0,
|
|
618
|
-
},
|
|
619
|
-
]
|
|
620
|
-
: [])
|
|
621
|
-
.concat(podScheduledAt
|
|
622
|
-
? [
|
|
623
|
-
{
|
|
624
|
-
name: "start",
|
|
625
|
-
event: "pod_scheduled",
|
|
626
|
-
timestamp: podScheduledAt.getTime(),
|
|
627
|
-
duration: 0,
|
|
628
|
-
},
|
|
629
|
-
]
|
|
630
|
-
: []);
|
|
631
|
-
const taskRunEnv = {
|
|
632
|
-
...gatherProcessEnv(),
|
|
633
|
-
...envVars,
|
|
634
|
-
};
|
|
635
|
-
try {
|
|
636
|
-
return await this.executeRun({ run, snapshot, envVars: taskRunEnv, execution, metrics });
|
|
637
|
-
}
|
|
638
|
-
catch (error) {
|
|
639
|
-
console.error("Error while executing attempt", {
|
|
640
|
-
error,
|
|
698
|
+
const execution = async () => {
|
|
699
|
+
if (!this.socket) {
|
|
700
|
+
this.sendDebugLog({
|
|
701
|
+
runId: runFriendlyId,
|
|
702
|
+
message: "Starting run without socket connection",
|
|
703
|
+
});
|
|
704
|
+
}
|
|
705
|
+
this.subscribeToRunNotifications({
|
|
706
|
+
run: { friendlyId: runFriendlyId },
|
|
707
|
+
snapshot: { friendlyId: snapshotFriendlyId },
|
|
641
708
|
});
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
error: TaskRunProcess.parseExecuteError(error),
|
|
652
|
-
};
|
|
653
|
-
const completionResult = await this.httpClient.completeRunAttempt(run.friendlyId, this.snapshotFriendlyId ?? snapshot.friendlyId, { completion });
|
|
654
|
-
if (!completionResult.success) {
|
|
655
|
-
console.error("Failed to submit completion after error", {
|
|
656
|
-
error: completionResult.error,
|
|
709
|
+
const attemptStartedAt = Date.now();
|
|
710
|
+
const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, {
|
|
711
|
+
isWarmStart,
|
|
712
|
+
});
|
|
713
|
+
if (!start.success) {
|
|
714
|
+
this.sendDebugLog({
|
|
715
|
+
runId: runFriendlyId,
|
|
716
|
+
message: "Failed to start run",
|
|
717
|
+
properties: { error: start.error },
|
|
657
718
|
});
|
|
658
719
|
this.sendDebugLog({
|
|
659
|
-
runId:
|
|
660
|
-
message: "
|
|
720
|
+
runId: runFriendlyId,
|
|
721
|
+
message: "failed to start run attempt",
|
|
661
722
|
properties: {
|
|
662
|
-
error:
|
|
723
|
+
error: start.error,
|
|
663
724
|
},
|
|
664
725
|
});
|
|
665
726
|
this.waitForNextRun();
|
|
666
727
|
return;
|
|
667
728
|
}
|
|
668
|
-
|
|
729
|
+
const attemptDuration = Date.now() - attemptStartedAt;
|
|
730
|
+
const { run, snapshot, execution, envVars } = start.data;
|
|
731
|
+
this.sendDebugLog({
|
|
732
|
+
runId: run.friendlyId,
|
|
733
|
+
message: "Started run",
|
|
734
|
+
properties: { snapshot: snapshot.friendlyId },
|
|
735
|
+
});
|
|
736
|
+
this.enterRunPhase(run, snapshot);
|
|
737
|
+
const metrics = [
|
|
738
|
+
{
|
|
739
|
+
name: "start",
|
|
740
|
+
event: "create_attempt",
|
|
741
|
+
timestamp: attemptStartedAt,
|
|
742
|
+
duration: attemptDuration,
|
|
743
|
+
},
|
|
744
|
+
]
|
|
745
|
+
.concat(dequeuedAt
|
|
746
|
+
? [
|
|
747
|
+
{
|
|
748
|
+
name: "start",
|
|
749
|
+
event: "dequeue",
|
|
750
|
+
timestamp: dequeuedAt.getTime(),
|
|
751
|
+
duration: 0,
|
|
752
|
+
},
|
|
753
|
+
]
|
|
754
|
+
: [])
|
|
755
|
+
.concat(podScheduledAt
|
|
756
|
+
? [
|
|
757
|
+
{
|
|
758
|
+
name: "start",
|
|
759
|
+
event: "pod_scheduled",
|
|
760
|
+
timestamp: podScheduledAt.getTime(),
|
|
761
|
+
duration: 0,
|
|
762
|
+
},
|
|
763
|
+
]
|
|
764
|
+
: []);
|
|
765
|
+
const taskRunEnv = {
|
|
766
|
+
...gatherProcessEnv(),
|
|
767
|
+
...envVars,
|
|
768
|
+
};
|
|
669
769
|
try {
|
|
670
|
-
await this.
|
|
770
|
+
return await this.executeRun({ run, snapshot, envVars: taskRunEnv, execution, metrics });
|
|
671
771
|
}
|
|
672
772
|
catch (error) {
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
773
|
+
if (error instanceof SuspendedProcessError) {
|
|
774
|
+
this.sendDebugLog({
|
|
775
|
+
runId: run.friendlyId,
|
|
776
|
+
message: "Run was suspended and task run process was killed, waiting for next run",
|
|
777
|
+
properties: { run: run.friendlyId, snapshot: snapshot.friendlyId },
|
|
778
|
+
});
|
|
779
|
+
this.waitForNextRun();
|
|
780
|
+
return;
|
|
781
|
+
}
|
|
782
|
+
this.sendDebugLog({
|
|
783
|
+
runId: run.friendlyId,
|
|
784
|
+
message: "Error while executing attempt",
|
|
785
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
786
|
+
});
|
|
787
|
+
this.sendDebugLog({
|
|
788
|
+
runId: run.friendlyId,
|
|
789
|
+
message: "Submitting attempt completion",
|
|
790
|
+
properties: {
|
|
791
|
+
snapshotId: snapshot.friendlyId,
|
|
792
|
+
updatedSnapshotId: this.snapshotFriendlyId,
|
|
793
|
+
},
|
|
794
|
+
});
|
|
795
|
+
const completion = {
|
|
796
|
+
id: execution.run.id,
|
|
797
|
+
ok: false,
|
|
798
|
+
retry: undefined,
|
|
799
|
+
error: TaskRunProcess.parseExecuteError(error),
|
|
800
|
+
};
|
|
801
|
+
const completionResult = await this.httpClient.completeRunAttempt(run.friendlyId,
|
|
802
|
+
// FIXME: if the snapshot has changed since starting the run, this won't be accurate
|
|
803
|
+
// ..but we probably shouldn't fetch the latest snapshot either because we may be in an "unhealthy" state while the next runner has already taken over
|
|
804
|
+
this.snapshotFriendlyId ?? snapshot.friendlyId, { completion });
|
|
805
|
+
if (!completionResult.success) {
|
|
806
|
+
this.sendDebugLog({
|
|
807
|
+
runId: run.friendlyId,
|
|
808
|
+
message: "Failed to submit completion after error",
|
|
809
|
+
properties: { error: completionResult.error },
|
|
810
|
+
});
|
|
811
|
+
this.sendDebugLog({
|
|
812
|
+
runId: run.friendlyId,
|
|
813
|
+
message: "completion: failed to submit after error",
|
|
814
|
+
properties: {
|
|
815
|
+
error: completionResult.error,
|
|
816
|
+
},
|
|
817
|
+
});
|
|
818
|
+
this.waitForNextRun();
|
|
819
|
+
return;
|
|
820
|
+
}
|
|
821
|
+
this.sendDebugLog({
|
|
822
|
+
runId: run.friendlyId,
|
|
823
|
+
message: "Attempt completion submitted after error",
|
|
824
|
+
properties: {
|
|
825
|
+
attemptStatus: completionResult.data.result.attemptStatus,
|
|
826
|
+
runId: completionResult.data.result.run.friendlyId,
|
|
827
|
+
snapshotId: completionResult.data.result.snapshot.friendlyId,
|
|
828
|
+
},
|
|
829
|
+
});
|
|
830
|
+
try {
|
|
831
|
+
await this.handleCompletionResult(completion, completionResult.data.result);
|
|
832
|
+
}
|
|
833
|
+
catch (error) {
|
|
834
|
+
this.sendDebugLog({
|
|
835
|
+
runId: run.friendlyId,
|
|
836
|
+
message: "Failed to handle completion result after error",
|
|
837
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
838
|
+
});
|
|
839
|
+
this.waitForNextRun();
|
|
840
|
+
return;
|
|
841
|
+
}
|
|
676
842
|
}
|
|
843
|
+
};
|
|
844
|
+
this.activeRunExecution = execution();
|
|
845
|
+
try {
|
|
846
|
+
await this.activeRunExecution;
|
|
847
|
+
}
|
|
848
|
+
catch (error) {
|
|
849
|
+
this.sendDebugLog({
|
|
850
|
+
runId: runFriendlyId,
|
|
851
|
+
message: "startAndExecuteRunAttempt: unexpected error",
|
|
852
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
853
|
+
});
|
|
854
|
+
}
|
|
855
|
+
finally {
|
|
856
|
+
this.activeRunExecution = null;
|
|
677
857
|
}
|
|
678
858
|
}
|
|
679
859
|
waitForNextRunLock = false;
|
|
@@ -682,44 +862,75 @@ class ManagedRunController {
|
|
|
682
862
|
* configured duration. */
|
|
683
863
|
async waitForNextRun() {
|
|
684
864
|
if (this.waitForNextRunLock) {
|
|
685
|
-
|
|
865
|
+
this.sendDebugLog({
|
|
866
|
+
runId: this.runFriendlyId,
|
|
867
|
+
message: "waitForNextRun: already in progress",
|
|
868
|
+
});
|
|
686
869
|
return;
|
|
687
870
|
}
|
|
688
871
|
this.waitForNextRunLock = true;
|
|
689
872
|
const previousRunId = this.runFriendlyId;
|
|
690
873
|
try {
|
|
691
|
-
|
|
692
|
-
this.
|
|
693
|
-
|
|
874
|
+
// If there's a run execution in progress, we need to kill it and wait for it to finish
|
|
875
|
+
if (this.activeRunExecution) {
|
|
876
|
+
this.sendDebugLog({
|
|
877
|
+
runId: this.runFriendlyId,
|
|
878
|
+
message: "waitForNextRun: waiting for existing run execution to finish",
|
|
879
|
+
});
|
|
880
|
+
await this.activeRunExecution;
|
|
881
|
+
}
|
|
882
|
+
// Just for good measure
|
|
694
883
|
await this.taskRunProcess?.kill("SIGKILL");
|
|
884
|
+
this.sendDebugLog({
|
|
885
|
+
runId: this.runFriendlyId,
|
|
886
|
+
message: "waitForNextRun: waiting for next run",
|
|
887
|
+
});
|
|
888
|
+
this.enterWarmStartPhase();
|
|
695
889
|
if (!this.warmStartClient) {
|
|
696
|
-
|
|
890
|
+
this.sendDebugLog({
|
|
891
|
+
runId: this.runFriendlyId,
|
|
892
|
+
message: "waitForNextRun: warm starts disabled, shutting down",
|
|
893
|
+
});
|
|
697
894
|
this.exitProcess(this.successExitCode);
|
|
698
895
|
}
|
|
699
896
|
if (this.taskRunProcess) {
|
|
700
|
-
|
|
897
|
+
this.sendDebugLog({
|
|
898
|
+
runId: this.runFriendlyId,
|
|
899
|
+
message: "waitForNextRun: eagerly recreating task run process with options",
|
|
900
|
+
});
|
|
701
901
|
this.taskRunProcess = new TaskRunProcess({
|
|
702
902
|
...this.taskRunProcess.options,
|
|
703
903
|
isWarmStart: true,
|
|
704
904
|
}).initialize();
|
|
705
905
|
}
|
|
706
906
|
else {
|
|
707
|
-
|
|
907
|
+
this.sendDebugLog({
|
|
908
|
+
runId: this.runFriendlyId,
|
|
909
|
+
message: "waitForNextRun: no existing task run process, so we can't eagerly recreate it",
|
|
910
|
+
});
|
|
708
911
|
}
|
|
709
912
|
// Check the service is up and get additional warm start config
|
|
710
913
|
const connect = await this.warmStartClient.connect();
|
|
711
914
|
if (!connect.success) {
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
915
|
+
this.sendDebugLog({
|
|
916
|
+
runId: this.runFriendlyId,
|
|
917
|
+
message: "waitForNextRun: failed to connect to warm start service",
|
|
918
|
+
properties: {
|
|
919
|
+
warmStartUrl: env.TRIGGER_WARM_START_URL,
|
|
920
|
+
error: connect.error,
|
|
921
|
+
},
|
|
715
922
|
});
|
|
716
923
|
this.exitProcess(this.successExitCode);
|
|
717
924
|
}
|
|
718
925
|
const connectionTimeoutMs = connect.data.connectionTimeoutMs ?? env.TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS;
|
|
719
926
|
const keepaliveMs = connect.data.keepaliveMs ?? env.TRIGGER_WARM_START_KEEPALIVE_MS;
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
927
|
+
this.sendDebugLog({
|
|
928
|
+
runId: this.runFriendlyId,
|
|
929
|
+
message: "waitForNextRun: connected to warm start service",
|
|
930
|
+
properties: {
|
|
931
|
+
connectionTimeoutMs,
|
|
932
|
+
keepaliveMs,
|
|
933
|
+
},
|
|
723
934
|
});
|
|
724
935
|
if (previousRunId) {
|
|
725
936
|
this.sendDebugLog({
|
|
@@ -732,9 +943,13 @@ class ManagedRunController {
|
|
|
732
943
|
});
|
|
733
944
|
}
|
|
734
945
|
if (!connectionTimeoutMs || !keepaliveMs) {
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
946
|
+
this.sendDebugLog({
|
|
947
|
+
runId: this.runFriendlyId,
|
|
948
|
+
message: "waitForNextRun: warm starts disabled after connect",
|
|
949
|
+
properties: {
|
|
950
|
+
connectionTimeoutMs,
|
|
951
|
+
keepaliveMs,
|
|
952
|
+
},
|
|
738
953
|
});
|
|
739
954
|
this.exitProcess(this.successExitCode);
|
|
740
955
|
}
|
|
@@ -744,10 +959,17 @@ class ManagedRunController {
|
|
|
744
959
|
keepaliveMs,
|
|
745
960
|
});
|
|
746
961
|
if (!nextRun) {
|
|
747
|
-
|
|
962
|
+
this.sendDebugLog({
|
|
963
|
+
runId: this.runFriendlyId,
|
|
964
|
+
message: "waitForNextRun: warm start failed, shutting down",
|
|
965
|
+
});
|
|
748
966
|
this.exitProcess(this.successExitCode);
|
|
749
967
|
}
|
|
750
|
-
|
|
968
|
+
this.sendDebugLog({
|
|
969
|
+
runId: this.runFriendlyId,
|
|
970
|
+
message: "waitForNextRun: got next run",
|
|
971
|
+
properties: { nextRun: nextRun.run.friendlyId },
|
|
972
|
+
});
|
|
751
973
|
this.startAndExecuteRunAttempt({
|
|
752
974
|
runFriendlyId: nextRun.run.friendlyId,
|
|
753
975
|
snapshotFriendlyId: nextRun.snapshot.friendlyId,
|
|
@@ -757,7 +979,11 @@ class ManagedRunController {
|
|
|
757
979
|
return;
|
|
758
980
|
}
|
|
759
981
|
catch (error) {
|
|
760
|
-
|
|
982
|
+
this.sendDebugLog({
|
|
983
|
+
runId: this.runFriendlyId,
|
|
984
|
+
message: "waitForNextRun: unexpected error",
|
|
985
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
986
|
+
});
|
|
761
987
|
this.exitProcess(this.failureExitCode);
|
|
762
988
|
}
|
|
763
989
|
finally {
|
|
@@ -765,7 +991,11 @@ class ManagedRunController {
|
|
|
765
991
|
}
|
|
766
992
|
}
|
|
767
993
|
exitProcess(code) {
|
|
768
|
-
|
|
994
|
+
this.sendDebugLog({
|
|
995
|
+
runId: this.runFriendlyId,
|
|
996
|
+
message: "Exiting process",
|
|
997
|
+
properties: { code },
|
|
998
|
+
});
|
|
769
999
|
if (this.taskRunProcess?.isPreparedForNextRun) {
|
|
770
1000
|
this.taskRunProcess.forceExit();
|
|
771
1001
|
}
|
|
@@ -781,25 +1011,23 @@ class ManagedRunController {
|
|
|
781
1011
|
},
|
|
782
1012
|
});
|
|
783
1013
|
this.socket.on("run:notify", async ({ version, run }) => {
|
|
784
|
-
console.log("[ManagedRunController] Received run notification", { version, run });
|
|
785
1014
|
this.sendDebugLog({
|
|
786
1015
|
runId: run.friendlyId,
|
|
787
1016
|
message: "run:notify received by runner",
|
|
1017
|
+
properties: { version, runId: run.friendlyId },
|
|
788
1018
|
});
|
|
789
1019
|
if (!this.runFriendlyId) {
|
|
790
|
-
|
|
1020
|
+
this.sendDebugLog({
|
|
791
1021
|
runId: run.friendlyId,
|
|
792
|
-
|
|
793
|
-
|
|
1022
|
+
message: "run:notify: ignoring notification, no local run ID",
|
|
1023
|
+
properties: {
|
|
1024
|
+
currentRunId: this.runFriendlyId,
|
|
1025
|
+
currentSnapshotId: this.snapshotFriendlyId,
|
|
1026
|
+
},
|
|
794
1027
|
});
|
|
795
1028
|
return;
|
|
796
1029
|
}
|
|
797
1030
|
if (run.friendlyId !== this.runFriendlyId) {
|
|
798
|
-
console.log("[ManagedRunController] Ignoring notification for different run", {
|
|
799
|
-
runId: run.friendlyId,
|
|
800
|
-
currentRunId: this.runFriendlyId,
|
|
801
|
-
currentSnapshotId: this.snapshotFriendlyId,
|
|
802
|
-
});
|
|
803
1031
|
this.sendDebugLog({
|
|
804
1032
|
runId: run.friendlyId,
|
|
805
1033
|
message: "run:notify: ignoring notification for different run",
|
|
@@ -815,7 +1043,6 @@ class ManagedRunController {
|
|
|
815
1043
|
this.snapshotPoller.resetCurrentInterval();
|
|
816
1044
|
const latestSnapshot = await this.httpClient.getRunExecutionData(this.runFriendlyId);
|
|
817
1045
|
if (!latestSnapshot.success) {
|
|
818
|
-
console.error("Failed to get latest snapshot data", latestSnapshot.error);
|
|
819
1046
|
this.sendDebugLog({
|
|
820
1047
|
runId: this.runFriendlyId,
|
|
821
1048
|
message: "run:notify: failed to get latest snapshot data",
|
|
@@ -830,7 +1057,10 @@ class ManagedRunController {
|
|
|
830
1057
|
await this.handleSnapshotChange(latestSnapshot.data.execution);
|
|
831
1058
|
});
|
|
832
1059
|
this.socket.on("connect", () => {
|
|
833
|
-
|
|
1060
|
+
this.sendDebugLog({
|
|
1061
|
+
runId: this.runFriendlyId,
|
|
1062
|
+
message: "Connected to supervisor",
|
|
1063
|
+
});
|
|
834
1064
|
// This should handle the case where we reconnect after being restored
|
|
835
1065
|
if (this.state.phase === "RUN") {
|
|
836
1066
|
const { run, snapshot } = this.state;
|
|
@@ -838,10 +1068,18 @@ class ManagedRunController {
|
|
|
838
1068
|
}
|
|
839
1069
|
});
|
|
840
1070
|
this.socket.on("connect_error", (error) => {
|
|
841
|
-
|
|
1071
|
+
this.sendDebugLog({
|
|
1072
|
+
runId: this.runFriendlyId,
|
|
1073
|
+
message: "Connection error",
|
|
1074
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
1075
|
+
});
|
|
842
1076
|
});
|
|
843
1077
|
this.socket.on("disconnect", (reason, description) => {
|
|
844
|
-
|
|
1078
|
+
this.sendDebugLog({
|
|
1079
|
+
runId: this.runFriendlyId,
|
|
1080
|
+
message: "Disconnected from supervisor",
|
|
1081
|
+
properties: { reason, description: description?.toString() },
|
|
1082
|
+
});
|
|
845
1083
|
});
|
|
846
1084
|
}
|
|
847
1085
|
async executeRun({ run, snapshot, envVars, execution, metrics, }) {
|
|
@@ -859,9 +1097,13 @@ class ManagedRunController {
|
|
|
859
1097
|
machine: execution.machine,
|
|
860
1098
|
}).initialize();
|
|
861
1099
|
}
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
1100
|
+
this.sendDebugLog({
|
|
1101
|
+
runId: this.runFriendlyId,
|
|
1102
|
+
message: "executing task run process",
|
|
1103
|
+
properties: {
|
|
1104
|
+
attemptId: execution.attempt.id,
|
|
1105
|
+
runId: execution.run.id,
|
|
1106
|
+
},
|
|
865
1107
|
});
|
|
866
1108
|
const completion = await this.taskRunProcess.execute({
|
|
867
1109
|
payload: {
|
|
@@ -872,19 +1114,30 @@ class ManagedRunController {
|
|
|
872
1114
|
messageId: run.friendlyId,
|
|
873
1115
|
env: envVars,
|
|
874
1116
|
});
|
|
875
|
-
|
|
1117
|
+
this.sendDebugLog({
|
|
1118
|
+
runId: this.runFriendlyId,
|
|
1119
|
+
message: "Completed run",
|
|
1120
|
+
properties: { completion: completion.ok },
|
|
1121
|
+
});
|
|
876
1122
|
try {
|
|
1123
|
+
// The execution has finished, so we can cleanup the task run process. Killing it should be safe.
|
|
877
1124
|
await this.taskRunProcess.cleanup(true);
|
|
878
1125
|
}
|
|
879
1126
|
catch (error) {
|
|
880
|
-
|
|
881
|
-
|
|
1127
|
+
this.sendDebugLog({
|
|
1128
|
+
runId: this.runFriendlyId,
|
|
1129
|
+
message: "Failed to cleanup task run process, submitting completion anyway",
|
|
1130
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
882
1131
|
});
|
|
883
1132
|
}
|
|
884
1133
|
if (!this.runFriendlyId || !this.snapshotFriendlyId) {
|
|
885
|
-
|
|
1134
|
+
this.sendDebugLog({
|
|
886
1135
|
runId: this.runFriendlyId,
|
|
887
|
-
|
|
1136
|
+
message: "executeRun: Missing run ID or snapshot ID after execution",
|
|
1137
|
+
properties: {
|
|
1138
|
+
runId: this.runFriendlyId,
|
|
1139
|
+
snapshotId: this.snapshotFriendlyId,
|
|
1140
|
+
},
|
|
888
1141
|
});
|
|
889
1142
|
this.waitForNextRun();
|
|
890
1143
|
return;
|
|
@@ -893,8 +1146,12 @@ class ManagedRunController {
|
|
|
893
1146
|
completion,
|
|
894
1147
|
});
|
|
895
1148
|
if (!completionResult.success) {
|
|
896
|
-
|
|
897
|
-
|
|
1149
|
+
this.sendDebugLog({
|
|
1150
|
+
runId: run.friendlyId,
|
|
1151
|
+
message: "completion: failed to submit",
|
|
1152
|
+
properties: {
|
|
1153
|
+
error: completionResult.error,
|
|
1154
|
+
},
|
|
898
1155
|
});
|
|
899
1156
|
this.sendDebugLog({
|
|
900
1157
|
runId: run.friendlyId,
|
|
@@ -906,38 +1163,72 @@ class ManagedRunController {
|
|
|
906
1163
|
this.waitForNextRun();
|
|
907
1164
|
return;
|
|
908
1165
|
}
|
|
909
|
-
|
|
1166
|
+
this.sendDebugLog({
|
|
1167
|
+
runId: run.friendlyId,
|
|
1168
|
+
message: "Attempt completion submitted",
|
|
1169
|
+
properties: {
|
|
1170
|
+
attemptStatus: completionResult.data.result.attemptStatus,
|
|
1171
|
+
runId: completionResult.data.result.run.friendlyId,
|
|
1172
|
+
snapshotId: completionResult.data.result.snapshot.friendlyId,
|
|
1173
|
+
},
|
|
1174
|
+
});
|
|
910
1175
|
try {
|
|
911
1176
|
await this.handleCompletionResult(completion, completionResult.data.result);
|
|
912
1177
|
}
|
|
913
1178
|
catch (error) {
|
|
914
|
-
|
|
1179
|
+
this.sendDebugLog({
|
|
1180
|
+
runId: run.friendlyId,
|
|
1181
|
+
message: "Failed to handle completion result",
|
|
1182
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
1183
|
+
});
|
|
915
1184
|
this.waitForNextRun();
|
|
916
1185
|
return;
|
|
917
1186
|
}
|
|
918
1187
|
}
|
|
919
1188
|
async handleCompletionResult(completion, result) {
|
|
920
|
-
|
|
1189
|
+
this.sendDebugLog({
|
|
1190
|
+
runId: this.runFriendlyId,
|
|
1191
|
+
message: "Handling completion result",
|
|
1192
|
+
properties: {
|
|
1193
|
+
completion: completion.ok,
|
|
1194
|
+
attemptStatus: result.attemptStatus,
|
|
1195
|
+
snapshotId: result.snapshot.friendlyId,
|
|
1196
|
+
runId: result.run.friendlyId,
|
|
1197
|
+
},
|
|
1198
|
+
});
|
|
921
1199
|
const { attemptStatus, snapshot: completionSnapshot, run } = result;
|
|
922
1200
|
try {
|
|
923
1201
|
this.updateRunPhase(run, completionSnapshot);
|
|
924
1202
|
}
|
|
925
1203
|
catch (error) {
|
|
926
|
-
|
|
1204
|
+
this.sendDebugLog({
|
|
1205
|
+
runId: run.friendlyId,
|
|
1206
|
+
message: "Failed to update run phase after completion",
|
|
1207
|
+
properties: { error: error instanceof Error ? error.message : String(error) },
|
|
1208
|
+
});
|
|
927
1209
|
this.waitForNextRun();
|
|
928
1210
|
return;
|
|
929
1211
|
}
|
|
930
1212
|
if (attemptStatus === "RUN_FINISHED") {
|
|
931
|
-
|
|
1213
|
+
this.sendDebugLog({
|
|
1214
|
+
runId: run.friendlyId,
|
|
1215
|
+
message: "Run finished",
|
|
1216
|
+
});
|
|
932
1217
|
this.waitForNextRun();
|
|
933
1218
|
return;
|
|
934
1219
|
}
|
|
935
1220
|
if (attemptStatus === "RUN_PENDING_CANCEL") {
|
|
936
|
-
|
|
1221
|
+
this.sendDebugLog({
|
|
1222
|
+
runId: run.friendlyId,
|
|
1223
|
+
message: "Run pending cancel",
|
|
1224
|
+
});
|
|
937
1225
|
return;
|
|
938
1226
|
}
|
|
939
1227
|
if (attemptStatus === "RETRY_QUEUED") {
|
|
940
|
-
|
|
1228
|
+
this.sendDebugLog({
|
|
1229
|
+
runId: run.friendlyId,
|
|
1230
|
+
message: "Retry queued",
|
|
1231
|
+
});
|
|
941
1232
|
this.waitForNextRun();
|
|
942
1233
|
return;
|
|
943
1234
|
}
|
|
@@ -955,28 +1246,48 @@ class ManagedRunController {
|
|
|
955
1246
|
this.startAndExecuteRunAttempt({
|
|
956
1247
|
runFriendlyId: run.friendlyId,
|
|
957
1248
|
snapshotFriendlyId: this.snapshotFriendlyId,
|
|
1249
|
+
skipLockCheckForImmediateRetry: true,
|
|
958
1250
|
}).finally(() => { });
|
|
959
1251
|
return;
|
|
960
1252
|
}
|
|
961
1253
|
assertExhaustive(attemptStatus);
|
|
962
1254
|
}
|
|
963
1255
|
sendDebugLog({ runId, message, date, properties, }) {
|
|
1256
|
+
if (!runId) {
|
|
1257
|
+
runId = this.runFriendlyId;
|
|
1258
|
+
}
|
|
1259
|
+
if (!runId) {
|
|
1260
|
+
runId = env.TRIGGER_RUN_ID;
|
|
1261
|
+
}
|
|
1262
|
+
if (!runId) {
|
|
1263
|
+
return;
|
|
1264
|
+
}
|
|
1265
|
+
const mergedProperties = {
|
|
1266
|
+
...properties,
|
|
1267
|
+
runId,
|
|
1268
|
+
runnerId: this.runnerId,
|
|
1269
|
+
workerName: this.workerInstanceName,
|
|
1270
|
+
};
|
|
1271
|
+
console.log(message, mergedProperties);
|
|
964
1272
|
this.httpClient.sendDebugLog(runId, {
|
|
965
1273
|
message,
|
|
966
1274
|
time: date ?? new Date(),
|
|
967
|
-
properties:
|
|
968
|
-
...properties,
|
|
969
|
-
runnerId: this.runnerId,
|
|
970
|
-
workerName: this.workerInstanceName,
|
|
971
|
-
},
|
|
1275
|
+
properties: mergedProperties,
|
|
972
1276
|
});
|
|
973
1277
|
}
|
|
974
1278
|
async cancelAttempt(runId) {
|
|
975
|
-
|
|
1279
|
+
this.sendDebugLog({
|
|
1280
|
+
runId,
|
|
1281
|
+
message: "cancelling attempt",
|
|
1282
|
+
properties: { runId },
|
|
1283
|
+
});
|
|
976
1284
|
await this.taskRunProcess?.cancel();
|
|
977
1285
|
}
|
|
978
1286
|
async start() {
|
|
979
|
-
|
|
1287
|
+
this.sendDebugLog({
|
|
1288
|
+
runId: this.runFriendlyId,
|
|
1289
|
+
message: "Starting up",
|
|
1290
|
+
});
|
|
980
1291
|
// Websocket notifications are only an optimisation so we don't need to wait for a successful connection
|
|
981
1292
|
this.createSocket();
|
|
982
1293
|
// If we have run and snapshot IDs, we can start an attempt immediately
|
|
@@ -994,7 +1305,10 @@ class ManagedRunController {
|
|
|
994
1305
|
return;
|
|
995
1306
|
}
|
|
996
1307
|
async stop() {
|
|
997
|
-
|
|
1308
|
+
this.sendDebugLog({
|
|
1309
|
+
runId: this.runFriendlyId,
|
|
1310
|
+
message: "Shutting down",
|
|
1311
|
+
});
|
|
998
1312
|
if (this.taskRunProcess) {
|
|
999
1313
|
await this.taskRunProcess.cleanup(true);
|
|
1000
1314
|
}
|