qdone 2.2.4 → 2.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commonjs/src/scheduler/jobExecutor.js +202 -17
- package/package.json +1 -1
- package/src/scheduler/jobExecutor.js +210 -19
|
@@ -11,10 +11,13 @@ exports.JobExecutor = void 0;
|
|
|
11
11
|
const client_sqs_1 = require("@aws-sdk/client-sqs");
|
|
12
12
|
const chalk_1 = __importDefault(require("chalk"));
|
|
13
13
|
const debug_1 = __importDefault(require("debug"));
|
|
14
|
+
const tree_kill_1 = __importDefault(require("tree-kill"));
|
|
14
15
|
const dedup_js_1 = require("../dedup.js");
|
|
15
16
|
const sqs_js_1 = require("../sqs.js");
|
|
16
17
|
const debug = (0, debug_1.default)('qdone:jobExecutor');
|
|
17
18
|
const maxJobSeconds = 12 * 60 * 60;
|
|
19
|
+
const defaultVisibilityTimeout = 120;
|
|
20
|
+
const SIGKILL_DELAY_MS = 5000;
|
|
18
21
|
class JobExecutor {
|
|
19
22
|
constructor(opt) {
|
|
20
23
|
this.opt = opt;
|
|
@@ -29,7 +32,8 @@ class JobExecutor {
|
|
|
29
32
|
timeoutsExtended: 0,
|
|
30
33
|
jobsSucceeded: 0,
|
|
31
34
|
jobsFailed: 0,
|
|
32
|
-
jobsDeleted: 0
|
|
35
|
+
jobsDeleted: 0,
|
|
36
|
+
jobsKilled: 0
|
|
33
37
|
};
|
|
34
38
|
this.maintainPromise = this.maintainVisibility();
|
|
35
39
|
debug({ this: this });
|
|
@@ -60,6 +64,149 @@ class JobExecutor {
|
|
|
60
64
|
runningCount += job.status === 'running';
|
|
61
65
|
return runningCount;
|
|
62
66
|
}
|
|
67
|
+
clearJobTimers(job) {
|
|
68
|
+
clearTimeout(job.killTimer);
|
|
69
|
+
clearTimeout(job.killSignalTimer);
|
|
70
|
+
}
|
|
71
|
+
getExecutionTimeMs(job, start = new Date()) {
|
|
72
|
+
return start - job.executionStart;
|
|
73
|
+
}
|
|
74
|
+
shouldEnforceKillAfter(job) {
|
|
75
|
+
return !!(this.opt.killAfter && job.executionMode !== 'inline');
|
|
76
|
+
}
|
|
77
|
+
scheduleKillAfter(job) {
|
|
78
|
+
if (!this.opt.killAfter)
|
|
79
|
+
return;
|
|
80
|
+
clearTimeout(job.killTimer);
|
|
81
|
+
job.killTimer = setTimeout(() => {
|
|
82
|
+
job.killDue = true;
|
|
83
|
+
this.killJob(job, new Date());
|
|
84
|
+
}, this.opt.killAfter * 1000);
|
|
85
|
+
job.killTimer.unref?.();
|
|
86
|
+
}
|
|
87
|
+
killJob(job, start = new Date()) {
|
|
88
|
+
if (!job.executionStart || job.status !== 'running')
|
|
89
|
+
return;
|
|
90
|
+
if (job.killed)
|
|
91
|
+
return;
|
|
92
|
+
if (!this.shouldEnforceKillAfter(job))
|
|
93
|
+
return;
|
|
94
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start);
|
|
95
|
+
if (executionTimeMs < this.opt.killAfter * 1000)
|
|
96
|
+
return;
|
|
97
|
+
const executionTime = Math.floor(executionTimeMs / 1000);
|
|
98
|
+
job.killDue = true;
|
|
99
|
+
if (!job.pid) {
|
|
100
|
+
debug('killAfter reached before PID registration', { messageId: job.message?.MessageId, executionTime });
|
|
101
|
+
return;
|
|
102
|
+
}
|
|
103
|
+
job.killed = true;
|
|
104
|
+
this.stats.jobsKilled++;
|
|
105
|
+
const pid = job.pid;
|
|
106
|
+
const killTree = this.opt.killTree || tree_kill_1.default;
|
|
107
|
+
if (this.opt.verbose) {
|
|
108
|
+
console.error(chalk_1.default.red('KILLING'), job.prettyQname, chalk_1.default.red('pid'), pid, chalk_1.default.red('after'), executionTime, chalk_1.default.red('seconds (limit:'), this.opt.killAfter + ')');
|
|
109
|
+
}
|
|
110
|
+
else if (!this.opt.disableLog) {
|
|
111
|
+
console.log(JSON.stringify({
|
|
112
|
+
event: 'JOB_KILL_AFTER',
|
|
113
|
+
timestamp: start,
|
|
114
|
+
queue: job.qname,
|
|
115
|
+
messageId: job.message.MessageId,
|
|
116
|
+
pid,
|
|
117
|
+
executionTime,
|
|
118
|
+
killAfter: this.opt.killAfter,
|
|
119
|
+
payload: job.payload
|
|
120
|
+
}));
|
|
121
|
+
}
|
|
122
|
+
killTree(pid, 'SIGTERM', (err) => {
|
|
123
|
+
if (err)
|
|
124
|
+
debug('treeKill SIGTERM error', err.message);
|
|
125
|
+
});
|
|
126
|
+
clearTimeout(job.killSignalTimer);
|
|
127
|
+
job.killSignalTimer = setTimeout(() => {
|
|
128
|
+
try {
|
|
129
|
+
process.kill(pid, 0);
|
|
130
|
+
}
|
|
131
|
+
catch (e) {
|
|
132
|
+
if (e.code === 'ESRCH')
|
|
133
|
+
return;
|
|
134
|
+
}
|
|
135
|
+
killTree(pid, 'SIGKILL', (err) => {
|
|
136
|
+
if (err)
|
|
137
|
+
debug('treeKill SIGKILL error', err.message);
|
|
138
|
+
});
|
|
139
|
+
}, SIGKILL_DELAY_MS);
|
|
140
|
+
job.killSignalTimer.unref?.();
|
|
141
|
+
}
|
|
142
|
+
async setJobVisibilityTimeout(job, visibilityTimeout, start = new Date()) {
|
|
143
|
+
job.visibilityTimeout = visibilityTimeout;
|
|
144
|
+
const jobRunTime = Math.round((start - job.start) / 1000);
|
|
145
|
+
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2);
|
|
146
|
+
const input = {
|
|
147
|
+
QueueUrl: job.qrl,
|
|
148
|
+
ReceiptHandle: job.message.ReceiptHandle,
|
|
149
|
+
VisibilityTimeout: job.visibilityTimeout
|
|
150
|
+
};
|
|
151
|
+
debug({ ChangeMessageVisibility: input });
|
|
152
|
+
try {
|
|
153
|
+
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.ChangeMessageVisibilityCommand(input));
|
|
154
|
+
debug('ChangeMessageVisibility returned', result);
|
|
155
|
+
this.stats.sqsCalls++;
|
|
156
|
+
this.stats.timeoutsExtended++;
|
|
157
|
+
}
|
|
158
|
+
catch (err) {
|
|
159
|
+
debug('ChangeMessageVisibility error', err);
|
|
160
|
+
if (this.opt.verbose) {
|
|
161
|
+
console.error(chalk_1.default.red('FAILED_TO_SET_VISIBILITY_TIMEOUT'), { err, input });
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
async setRunningVisibilityTimeout(job) {
|
|
166
|
+
if (!this.shouldEnforceKillAfter(job))
|
|
167
|
+
return;
|
|
168
|
+
const visibilityTimeout = Math.max(1, Math.min(job.visibilityTimeout, this.opt.killAfter));
|
|
169
|
+
if (visibilityTimeout >= job.visibilityTimeout)
|
|
170
|
+
return;
|
|
171
|
+
await this.setJobVisibilityTimeout(job, visibilityTimeout);
|
|
172
|
+
}
|
|
173
|
+
async registerInlineExecution(job) {
|
|
174
|
+
if (job.executionMode === 'inline')
|
|
175
|
+
return;
|
|
176
|
+
if (job.executionMode === 'child_process') {
|
|
177
|
+
debug('registerInlineExecution ignored after registerPid', { messageId: job.message?.MessageId });
|
|
178
|
+
return;
|
|
179
|
+
}
|
|
180
|
+
job.executionMode = 'inline';
|
|
181
|
+
job.killDue = false;
|
|
182
|
+
this.clearJobTimers(job);
|
|
183
|
+
if (job.status === 'running' && job.visibilityTimeout < defaultVisibilityTimeout) {
|
|
184
|
+
await this.setJobVisibilityTimeout(job, defaultVisibilityTimeout);
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
logInlineKillAfterOverrun(job, start = new Date()) {
|
|
188
|
+
if (!this.opt.killAfter || !job.executionStart || job.inlineKillAfterLogged)
|
|
189
|
+
return;
|
|
190
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start);
|
|
191
|
+
if (executionTimeMs < this.opt.killAfter * 1000)
|
|
192
|
+
return;
|
|
193
|
+
job.inlineKillAfterLogged = true;
|
|
194
|
+
const executionTime = Math.floor(executionTimeMs / 1000);
|
|
195
|
+
if (this.opt.verbose) {
|
|
196
|
+
console.error(chalk_1.default.yellow('INLINE_JOB_EXCEEDED_KILL_AFTER'), job.prettyQname, chalk_1.default.yellow('after'), executionTime, chalk_1.default.yellow('seconds (limit:'), this.opt.killAfter + ')');
|
|
197
|
+
}
|
|
198
|
+
else if (!this.opt.disableLog) {
|
|
199
|
+
console.log(JSON.stringify({
|
|
200
|
+
event: 'INLINE_JOB_EXCEEDED_KILL_AFTER',
|
|
201
|
+
timestamp: start,
|
|
202
|
+
queue: job.qname,
|
|
203
|
+
messageId: job.message.MessageId,
|
|
204
|
+
executionTime,
|
|
205
|
+
killAfter: this.opt.killAfter,
|
|
206
|
+
payload: job.payload
|
|
207
|
+
}));
|
|
208
|
+
}
|
|
209
|
+
}
|
|
63
210
|
/**
|
|
64
211
|
* Changes message visibility on all running jobs using as few calls as possible.
|
|
65
212
|
*/
|
|
@@ -77,7 +224,6 @@ class JobExecutor {
|
|
|
77
224
|
this.maintainVisibilityTimeout = setTimeout(() => {
|
|
78
225
|
this.maintainPromise = this.maintainVisibility();
|
|
79
226
|
}, nextCheckInMs);
|
|
80
|
-
// debug('maintainVisibility', this.jobs)
|
|
81
227
|
const start = new Date();
|
|
82
228
|
const jobsToExtendByQrl = {};
|
|
83
229
|
const jobsToDeleteByQrl = {};
|
|
@@ -88,7 +234,6 @@ class JobExecutor {
|
|
|
88
234
|
const job = this.jobs[i];
|
|
89
235
|
const jobRunTime = Math.round((start - job.start) / 1000);
|
|
90
236
|
jobStatuses[job.status] = (jobStatuses[job.status] || 0) + 1;
|
|
91
|
-
// debug('considering job', job)
|
|
92
237
|
if (job.status === 'complete') {
|
|
93
238
|
const jobsToDelete = jobsToDeleteByQrl[job.qrl] || [];
|
|
94
239
|
job.status = 'deleting';
|
|
@@ -101,18 +246,36 @@ class JobExecutor {
|
|
|
101
246
|
else if (job.status !== 'deleting') {
|
|
102
247
|
// Any other job state gets visibility accounting
|
|
103
248
|
debug('processing', { job, jobRunTime });
|
|
249
|
+
// Kill-after enforcement: terminate child process if it exceeds the deadline.
|
|
250
|
+
// Uses executionStart (when runJob began) so FIFO serial jobs aren't
|
|
251
|
+
// penalized for queue wait time.
|
|
252
|
+
if (this.shouldEnforceKillAfter(job) && job.executionStart && !job.killed) {
|
|
253
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start);
|
|
254
|
+
if (executionTimeMs >= this.opt.killAfter * 1000) {
|
|
255
|
+
job.killDue = true;
|
|
256
|
+
this.killJob(job, start);
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
else if (job.executionMode === 'inline') {
|
|
260
|
+
this.logInlineKillAfterOverrun(job, start);
|
|
261
|
+
}
|
|
104
262
|
if (jobRunTime >= job.extendAtSecond) {
|
|
105
263
|
// Add it to our organized list of jobs
|
|
106
264
|
const jobsToExtend = jobsToExtendByQrl[job.qrl] || [];
|
|
107
265
|
jobsToExtend.push(job);
|
|
108
266
|
jobsToExtendByQrl[job.qrl] = jobsToExtend;
|
|
109
|
-
// Update the visibility timeout, double every time, up to max
|
|
267
|
+
// Update the visibility timeout, double every time, up to max.
|
|
268
|
+
// Only cap at killAfter once execution has started — waiting FIFO
|
|
269
|
+
// jobs should not have their visibility reduced prematurely.
|
|
110
270
|
const doubled = job.visibilityTimeout * 2;
|
|
111
271
|
const secondsUntilMax = Math.max(1, maxJobSeconds - jobRunTime);
|
|
112
|
-
|
|
113
|
-
|
|
272
|
+
const executionTimeMs = job.executionStart ? this.getExecutionTimeMs(job, start) : 0;
|
|
273
|
+
const secondsUntilKill = (this.shouldEnforceKillAfter(job) && job.executionStart)
|
|
274
|
+
? Math.max(1, Math.ceil((this.opt.killAfter * 1000 - executionTimeMs) / 1000))
|
|
275
|
+
: Infinity;
|
|
276
|
+
job.visibilityTimeout = Math.min(doubled, secondsUntilMax, secondsUntilKill);
|
|
114
277
|
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2); // this is what we use next time
|
|
115
|
-
debug({ doubled, secondsUntilMax, job });
|
|
278
|
+
debug({ doubled, secondsUntilMax, secondsUntilKill, job });
|
|
116
279
|
}
|
|
117
280
|
}
|
|
118
281
|
}
|
|
@@ -160,7 +323,7 @@ class JobExecutor {
|
|
|
160
323
|
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.ChangeMessageVisibilityBatchCommand(input));
|
|
161
324
|
debug('ChangeMessageVisibilityBatch returned', result);
|
|
162
325
|
this.stats.sqsCalls++;
|
|
163
|
-
if (result.Failed) {
|
|
326
|
+
if (result.Failed?.length) {
|
|
164
327
|
console.error('FAILED_MESSAGES', result.Failed);
|
|
165
328
|
for (const failed of result.Failed) {
|
|
166
329
|
console.error('FAILED_TO_EXTEND_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] });
|
|
@@ -169,7 +332,7 @@ class JobExecutor {
|
|
|
169
332
|
this.jobsByMessageId[failed.Id].status = 'failed';
|
|
170
333
|
}
|
|
171
334
|
}
|
|
172
|
-
if (result.Successful) {
|
|
335
|
+
if (result.Successful?.length) {
|
|
173
336
|
const count = result.Successful.length || 0;
|
|
174
337
|
this.stats.timeoutsExtended += count;
|
|
175
338
|
if (this.opt.verbose) {
|
|
@@ -204,7 +367,7 @@ class JobExecutor {
|
|
|
204
367
|
debug({ DeleteMessageBatch: input });
|
|
205
368
|
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.DeleteMessageBatchCommand(input));
|
|
206
369
|
this.stats.sqsCalls++;
|
|
207
|
-
if (result.Failed) {
|
|
370
|
+
if (result.Failed?.length) {
|
|
208
371
|
console.error('FAILED_MESSAGES', result.Failed);
|
|
209
372
|
for (const failed of result.Failed) {
|
|
210
373
|
console.error('FAILED_TO_DELETE_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] });
|
|
@@ -213,7 +376,7 @@ class JobExecutor {
|
|
|
213
376
|
this.jobsByMessageId[failed.Id].status = 'failed';
|
|
214
377
|
}
|
|
215
378
|
}
|
|
216
|
-
if (result.Successful) {
|
|
379
|
+
if (result.Successful?.length) {
|
|
217
380
|
const count = result.Successful.length || 0;
|
|
218
381
|
this.stats.jobsDeleted += count;
|
|
219
382
|
if (this.opt.verbose) {
|
|
@@ -245,7 +408,6 @@ class JobExecutor {
|
|
|
245
408
|
}
|
|
246
409
|
addJob(message, callback, qname, qrl) {
|
|
247
410
|
// Create job entry and track it
|
|
248
|
-
const defaultVisibilityTimeout = 120;
|
|
249
411
|
const job = {
|
|
250
412
|
status: 'waiting',
|
|
251
413
|
start: new Date(),
|
|
@@ -308,8 +470,11 @@ class JobExecutor {
|
|
|
308
470
|
}));
|
|
309
471
|
}
|
|
310
472
|
job.status = 'running';
|
|
473
|
+
job.executionStart = new Date();
|
|
311
474
|
this.stats.runningJobs++;
|
|
312
475
|
this.stats.waitingJobs--;
|
|
476
|
+
this.scheduleKillAfter(job);
|
|
477
|
+
await this.setRunningVisibilityTimeout(job);
|
|
313
478
|
const queue = job.qname.slice(this.opt.prefix.length);
|
|
314
479
|
const attributes = {
|
|
315
480
|
queueName: job.qname,
|
|
@@ -317,7 +482,26 @@ class JobExecutor {
|
|
|
317
482
|
receiveCount: job.message.Attributes?.ApproximateReceiveCount || '1',
|
|
318
483
|
sentTimestamp: job.message.Attributes?.SentTimestamp || '',
|
|
319
484
|
firstReceiveTimestamp: job.message.Attributes?.ApproximateFirstReceiveTimestamp || '',
|
|
320
|
-
messageGroupId: job.message.Attributes?.MessageGroupId || ''
|
|
485
|
+
messageGroupId: job.message.Attributes?.MessageGroupId || '',
|
|
486
|
+
/** Call with a child process PID to enable kill-after process termination. */
|
|
487
|
+
registerPid: (pid) => {
|
|
488
|
+
if (job.executionMode === 'inline') {
|
|
489
|
+
debug('registerPid ignored after registerInlineExecution', { messageId: job.message?.MessageId });
|
|
490
|
+
return;
|
|
491
|
+
}
|
|
492
|
+
if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 1 || pid === process.pid) {
|
|
493
|
+
debug('registerPid: rejected invalid PID', pid);
|
|
494
|
+
return;
|
|
495
|
+
}
|
|
496
|
+
job.executionMode = 'child_process';
|
|
497
|
+
job.pid = pid;
|
|
498
|
+
if (job.killDue && !job.killed)
|
|
499
|
+
this.killJob(job, new Date());
|
|
500
|
+
},
|
|
501
|
+
/** Call before inline work starts to opt out of kill-after visibility expiry. */
|
|
502
|
+
registerInlineExecution: async () => {
|
|
503
|
+
await this.registerInlineExecution(job);
|
|
504
|
+
}
|
|
321
505
|
};
|
|
322
506
|
const result = await job.callback(queue, job.payload, attributes);
|
|
323
507
|
debug('executeJob callback finished', { payload: job.payload, result });
|
|
@@ -362,8 +546,11 @@ class JobExecutor {
|
|
|
362
546
|
}));
|
|
363
547
|
}
|
|
364
548
|
}
|
|
365
|
-
|
|
366
|
-
|
|
549
|
+
finally {
|
|
550
|
+
this.clearJobTimers(job);
|
|
551
|
+
this.stats.activeJobs--;
|
|
552
|
+
this.stats.runningJobs--;
|
|
553
|
+
}
|
|
367
554
|
}
|
|
368
555
|
async executeJobs(messages, callback, qname, qrl) {
|
|
369
556
|
if (this.shutdownRequested)
|
|
@@ -372,13 +559,11 @@ class JobExecutor {
|
|
|
372
559
|
const jobs = messages.map(message => this.addJob(message, callback, qname, qrl));
|
|
373
560
|
const isFifo = qrl.endsWith('.fifo');
|
|
374
561
|
const runningJobs = [];
|
|
375
|
-
// console.log(jobs)
|
|
376
562
|
// Begin executing
|
|
377
563
|
for (const [job, i] of jobs.map((job, i) => [job, i])) {
|
|
378
564
|
// Figure out if the next job needs to happen in serial, otherwise we can parallel execute
|
|
379
565
|
const nextJob = jobs[i + 1];
|
|
380
566
|
const nextJobIsSerial = isFifo && nextJob && job.message?.Attributes?.MessageGroupId === nextJob.message?.Attributes?.MessageGroupId;
|
|
381
|
-
// console.log({ i, nextJobAtt: nextJob?.message?.Attributes, nextJobIsSerial })
|
|
382
567
|
// Execute serial or parallel
|
|
383
568
|
if (nextJobIsSerial)
|
|
384
569
|
await this.runJob(job);
|
package/package.json
CHANGED
|
@@ -3,10 +3,15 @@
|
|
|
3
3
|
* their visibility timeouts and deleting them when they are successful.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
ChangeMessageVisibilityBatchCommand,
|
|
8
|
+
ChangeMessageVisibilityCommand,
|
|
9
|
+
DeleteMessageBatchCommand
|
|
10
|
+
} from '@aws-sdk/client-sqs'
|
|
7
11
|
|
|
8
12
|
import chalk from 'chalk'
|
|
9
13
|
import Debug from 'debug'
|
|
14
|
+
import treeKill from 'tree-kill'
|
|
10
15
|
|
|
11
16
|
import { dedupSuccessfullyProcessed } from '../dedup.js'
|
|
12
17
|
import { getSQSClient } from '../sqs.js'
|
|
@@ -14,6 +19,8 @@ import { getSQSClient } from '../sqs.js'
|
|
|
14
19
|
const debug = Debug('qdone:jobExecutor')
|
|
15
20
|
|
|
16
21
|
const maxJobSeconds = 12 * 60 * 60
|
|
22
|
+
const defaultVisibilityTimeout = 120
|
|
23
|
+
const SIGKILL_DELAY_MS = 5000
|
|
17
24
|
|
|
18
25
|
export class JobExecutor {
|
|
19
26
|
constructor (opt) {
|
|
@@ -29,7 +36,8 @@ export class JobExecutor {
|
|
|
29
36
|
timeoutsExtended: 0,
|
|
30
37
|
jobsSucceeded: 0,
|
|
31
38
|
jobsFailed: 0,
|
|
32
|
-
jobsDeleted: 0
|
|
39
|
+
jobsDeleted: 0,
|
|
40
|
+
jobsKilled: 0
|
|
33
41
|
}
|
|
34
42
|
this.maintainPromise = this.maintainVisibility()
|
|
35
43
|
debug({ this: this })
|
|
@@ -64,6 +72,153 @@ export class JobExecutor {
|
|
|
64
72
|
return runningCount
|
|
65
73
|
}
|
|
66
74
|
|
|
75
|
+
clearJobTimers (job) {
|
|
76
|
+
clearTimeout(job.killTimer)
|
|
77
|
+
clearTimeout(job.killSignalTimer)
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
getExecutionTimeMs (job, start = new Date()) {
|
|
81
|
+
return start - job.executionStart
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
shouldEnforceKillAfter (job) {
|
|
85
|
+
return !!(this.opt.killAfter && job.executionMode !== 'inline')
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
scheduleKillAfter (job) {
|
|
89
|
+
if (!this.opt.killAfter) return
|
|
90
|
+
clearTimeout(job.killTimer)
|
|
91
|
+
job.killTimer = setTimeout(() => {
|
|
92
|
+
job.killDue = true
|
|
93
|
+
this.killJob(job, new Date())
|
|
94
|
+
}, this.opt.killAfter * 1000)
|
|
95
|
+
job.killTimer.unref?.()
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
killJob (job, start = new Date()) {
|
|
99
|
+
if (!job.executionStart || job.status !== 'running') return
|
|
100
|
+
if (job.killed) return
|
|
101
|
+
if (!this.shouldEnforceKillAfter(job)) return
|
|
102
|
+
|
|
103
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start)
|
|
104
|
+
if (executionTimeMs < this.opt.killAfter * 1000) return
|
|
105
|
+
const executionTime = Math.floor(executionTimeMs / 1000)
|
|
106
|
+
|
|
107
|
+
job.killDue = true
|
|
108
|
+
if (!job.pid) {
|
|
109
|
+
debug('killAfter reached before PID registration', { messageId: job.message?.MessageId, executionTime })
|
|
110
|
+
return
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
job.killed = true
|
|
114
|
+
this.stats.jobsKilled++
|
|
115
|
+
const pid = job.pid
|
|
116
|
+
const killTree = this.opt.killTree || treeKill
|
|
117
|
+
|
|
118
|
+
if (this.opt.verbose) {
|
|
119
|
+
console.error(chalk.red('KILLING'), job.prettyQname, chalk.red('pid'), pid,
|
|
120
|
+
chalk.red('after'), executionTime, chalk.red('seconds (limit:'), this.opt.killAfter + ')')
|
|
121
|
+
} else if (!this.opt.disableLog) {
|
|
122
|
+
console.log(JSON.stringify({
|
|
123
|
+
event: 'JOB_KILL_AFTER',
|
|
124
|
+
timestamp: start,
|
|
125
|
+
queue: job.qname,
|
|
126
|
+
messageId: job.message.MessageId,
|
|
127
|
+
pid,
|
|
128
|
+
executionTime,
|
|
129
|
+
killAfter: this.opt.killAfter,
|
|
130
|
+
payload: job.payload
|
|
131
|
+
}))
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
killTree(pid, 'SIGTERM', (err) => {
|
|
135
|
+
if (err) debug('treeKill SIGTERM error', err.message)
|
|
136
|
+
})
|
|
137
|
+
|
|
138
|
+
clearTimeout(job.killSignalTimer)
|
|
139
|
+
job.killSignalTimer = setTimeout(() => {
|
|
140
|
+
try { process.kill(pid, 0) } catch (e) { if (e.code === 'ESRCH') return }
|
|
141
|
+
killTree(pid, 'SIGKILL', (err) => {
|
|
142
|
+
if (err) debug('treeKill SIGKILL error', err.message)
|
|
143
|
+
})
|
|
144
|
+
}, SIGKILL_DELAY_MS)
|
|
145
|
+
job.killSignalTimer.unref?.()
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
async setJobVisibilityTimeout (job, visibilityTimeout, start = new Date()) {
|
|
149
|
+
job.visibilityTimeout = visibilityTimeout
|
|
150
|
+
const jobRunTime = Math.round((start - job.start) / 1000)
|
|
151
|
+
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2)
|
|
152
|
+
|
|
153
|
+
const input = {
|
|
154
|
+
QueueUrl: job.qrl,
|
|
155
|
+
ReceiptHandle: job.message.ReceiptHandle,
|
|
156
|
+
VisibilityTimeout: job.visibilityTimeout
|
|
157
|
+
}
|
|
158
|
+
debug({ ChangeMessageVisibility: input })
|
|
159
|
+
|
|
160
|
+
try {
|
|
161
|
+
const result = await getSQSClient().send(new ChangeMessageVisibilityCommand(input))
|
|
162
|
+
debug('ChangeMessageVisibility returned', result)
|
|
163
|
+
this.stats.sqsCalls++
|
|
164
|
+
this.stats.timeoutsExtended++
|
|
165
|
+
} catch (err) {
|
|
166
|
+
debug('ChangeMessageVisibility error', err)
|
|
167
|
+
if (this.opt.verbose) {
|
|
168
|
+
console.error(chalk.red('FAILED_TO_SET_VISIBILITY_TIMEOUT'), { err, input })
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
async setRunningVisibilityTimeout (job) {
|
|
174
|
+
if (!this.shouldEnforceKillAfter(job)) return
|
|
175
|
+
|
|
176
|
+
const visibilityTimeout = Math.max(1, Math.min(job.visibilityTimeout, this.opt.killAfter))
|
|
177
|
+
if (visibilityTimeout >= job.visibilityTimeout) return
|
|
178
|
+
|
|
179
|
+
await this.setJobVisibilityTimeout(job, visibilityTimeout)
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
async registerInlineExecution (job) {
|
|
183
|
+
if (job.executionMode === 'inline') return
|
|
184
|
+
if (job.executionMode === 'child_process') {
|
|
185
|
+
debug('registerInlineExecution ignored after registerPid', { messageId: job.message?.MessageId })
|
|
186
|
+
return
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
job.executionMode = 'inline'
|
|
190
|
+
job.killDue = false
|
|
191
|
+
this.clearJobTimers(job)
|
|
192
|
+
|
|
193
|
+
if (job.status === 'running' && job.visibilityTimeout < defaultVisibilityTimeout) {
|
|
194
|
+
await this.setJobVisibilityTimeout(job, defaultVisibilityTimeout)
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
logInlineKillAfterOverrun (job, start = new Date()) {
|
|
199
|
+
if (!this.opt.killAfter || !job.executionStart || job.inlineKillAfterLogged) return
|
|
200
|
+
|
|
201
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start)
|
|
202
|
+
if (executionTimeMs < this.opt.killAfter * 1000) return
|
|
203
|
+
|
|
204
|
+
job.inlineKillAfterLogged = true
|
|
205
|
+
const executionTime = Math.floor(executionTimeMs / 1000)
|
|
206
|
+
if (this.opt.verbose) {
|
|
207
|
+
console.error(chalk.yellow('INLINE_JOB_EXCEEDED_KILL_AFTER'), job.prettyQname,
|
|
208
|
+
chalk.yellow('after'), executionTime, chalk.yellow('seconds (limit:'), this.opt.killAfter + ')')
|
|
209
|
+
} else if (!this.opt.disableLog) {
|
|
210
|
+
console.log(JSON.stringify({
|
|
211
|
+
event: 'INLINE_JOB_EXCEEDED_KILL_AFTER',
|
|
212
|
+
timestamp: start,
|
|
213
|
+
queue: job.qname,
|
|
214
|
+
messageId: job.message.MessageId,
|
|
215
|
+
executionTime,
|
|
216
|
+
killAfter: this.opt.killAfter,
|
|
217
|
+
payload: job.payload
|
|
218
|
+
}))
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
|
|
67
222
|
/**
|
|
68
223
|
* Changes message visibility on all running jobs using as few calls as possible.
|
|
69
224
|
*/
|
|
@@ -83,7 +238,6 @@ export class JobExecutor {
|
|
|
83
238
|
this.maintainPromise = this.maintainVisibility()
|
|
84
239
|
}, nextCheckInMs)
|
|
85
240
|
|
|
86
|
-
// debug('maintainVisibility', this.jobs)
|
|
87
241
|
const start = new Date()
|
|
88
242
|
const jobsToExtendByQrl = {}
|
|
89
243
|
const jobsToDeleteByQrl = {}
|
|
@@ -95,7 +249,6 @@ export class JobExecutor {
|
|
|
95
249
|
const job = this.jobs[i]
|
|
96
250
|
const jobRunTime = Math.round((start - job.start) / 1000)
|
|
97
251
|
jobStatuses[job.status] = (jobStatuses[job.status] || 0) + 1
|
|
98
|
-
// debug('considering job', job)
|
|
99
252
|
if (job.status === 'complete') {
|
|
100
253
|
const jobsToDelete = jobsToDeleteByQrl[job.qrl] || []
|
|
101
254
|
job.status = 'deleting'
|
|
@@ -106,19 +259,38 @@ export class JobExecutor {
|
|
|
106
259
|
} else if (job.status !== 'deleting') {
|
|
107
260
|
// Any other job state gets visibility accounting
|
|
108
261
|
debug('processing', { job, jobRunTime })
|
|
262
|
+
|
|
263
|
+
// Kill-after enforcement: terminate child process if it exceeds the deadline.
|
|
264
|
+
// Uses executionStart (when runJob began) so FIFO serial jobs aren't
|
|
265
|
+
// penalized for queue wait time.
|
|
266
|
+
if (this.shouldEnforceKillAfter(job) && job.executionStart && !job.killed) {
|
|
267
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start)
|
|
268
|
+
if (executionTimeMs >= this.opt.killAfter * 1000) {
|
|
269
|
+
job.killDue = true
|
|
270
|
+
this.killJob(job, start)
|
|
271
|
+
}
|
|
272
|
+
} else if (job.executionMode === 'inline') {
|
|
273
|
+
this.logInlineKillAfterOverrun(job, start)
|
|
274
|
+
}
|
|
275
|
+
|
|
109
276
|
if (jobRunTime >= job.extendAtSecond) {
|
|
110
277
|
// Add it to our organized list of jobs
|
|
111
278
|
const jobsToExtend = jobsToExtendByQrl[job.qrl] || []
|
|
112
279
|
jobsToExtend.push(job)
|
|
113
280
|
jobsToExtendByQrl[job.qrl] = jobsToExtend
|
|
114
281
|
|
|
115
|
-
// Update the visibility timeout, double every time, up to max
|
|
282
|
+
// Update the visibility timeout, double every time, up to max.
|
|
283
|
+
// Only cap at killAfter once execution has started — waiting FIFO
|
|
284
|
+
// jobs should not have their visibility reduced prematurely.
|
|
116
285
|
const doubled = job.visibilityTimeout * 2
|
|
117
286
|
const secondsUntilMax = Math.max(1, maxJobSeconds - jobRunTime)
|
|
118
|
-
|
|
119
|
-
|
|
287
|
+
const executionTimeMs = job.executionStart ? this.getExecutionTimeMs(job, start) : 0
|
|
288
|
+
const secondsUntilKill = (this.shouldEnforceKillAfter(job) && job.executionStart)
|
|
289
|
+
? Math.max(1, Math.ceil((this.opt.killAfter * 1000 - executionTimeMs) / 1000))
|
|
290
|
+
: Infinity
|
|
291
|
+
job.visibilityTimeout = Math.min(doubled, secondsUntilMax, secondsUntilKill)
|
|
120
292
|
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2) // this is what we use next time
|
|
121
|
-
debug({ doubled, secondsUntilMax, job })
|
|
293
|
+
debug({ doubled, secondsUntilMax, secondsUntilKill, job })
|
|
122
294
|
}
|
|
123
295
|
}
|
|
124
296
|
}
|
|
@@ -164,7 +336,7 @@ export class JobExecutor {
|
|
|
164
336
|
const result = await getSQSClient().send(new ChangeMessageVisibilityBatchCommand(input))
|
|
165
337
|
debug('ChangeMessageVisibilityBatch returned', result)
|
|
166
338
|
this.stats.sqsCalls++
|
|
167
|
-
if (result.Failed) {
|
|
339
|
+
if (result.Failed?.length) {
|
|
168
340
|
console.error('FAILED_MESSAGES', result.Failed)
|
|
169
341
|
for (const failed of result.Failed) {
|
|
170
342
|
console.error('FAILED_TO_EXTEND_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] })
|
|
@@ -172,7 +344,7 @@ export class JobExecutor {
|
|
|
172
344
|
if (this.jobsByMessageId[failed.Id]) this.jobsByMessageId[failed.Id].status = 'failed'
|
|
173
345
|
}
|
|
174
346
|
}
|
|
175
|
-
if (result.Successful) {
|
|
347
|
+
if (result.Successful?.length) {
|
|
176
348
|
const count = result.Successful.length || 0
|
|
177
349
|
this.stats.timeoutsExtended += count
|
|
178
350
|
if (this.opt.verbose) {
|
|
@@ -208,7 +380,7 @@ export class JobExecutor {
|
|
|
208
380
|
debug({ DeleteMessageBatch: input })
|
|
209
381
|
const result = await getSQSClient().send(new DeleteMessageBatchCommand(input))
|
|
210
382
|
this.stats.sqsCalls++
|
|
211
|
-
if (result.Failed) {
|
|
383
|
+
if (result.Failed?.length) {
|
|
212
384
|
console.error('FAILED_MESSAGES', result.Failed)
|
|
213
385
|
for (const failed of result.Failed) {
|
|
214
386
|
console.error('FAILED_TO_DELETE_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] })
|
|
@@ -216,7 +388,7 @@ export class JobExecutor {
|
|
|
216
388
|
if (this.jobsByMessageId[failed.Id]) this.jobsByMessageId[failed.Id].status = 'failed'
|
|
217
389
|
}
|
|
218
390
|
}
|
|
219
|
-
if (result.Successful) {
|
|
391
|
+
if (result.Successful?.length) {
|
|
220
392
|
const count = result.Successful.length || 0
|
|
221
393
|
this.stats.jobsDeleted += count
|
|
222
394
|
if (this.opt.verbose) {
|
|
@@ -254,7 +426,6 @@ export class JobExecutor {
|
|
|
254
426
|
|
|
255
427
|
addJob (message, callback, qname, qrl) {
|
|
256
428
|
// Create job entry and track it
|
|
257
|
-
const defaultVisibilityTimeout = 120
|
|
258
429
|
const job = {
|
|
259
430
|
status: 'waiting',
|
|
260
431
|
start: new Date(),
|
|
@@ -319,8 +490,11 @@ export class JobExecutor {
|
|
|
319
490
|
}))
|
|
320
491
|
}
|
|
321
492
|
job.status = 'running'
|
|
493
|
+
job.executionStart = new Date()
|
|
322
494
|
this.stats.runningJobs++
|
|
323
495
|
this.stats.waitingJobs--
|
|
496
|
+
this.scheduleKillAfter(job)
|
|
497
|
+
await this.setRunningVisibilityTimeout(job)
|
|
324
498
|
const queue = job.qname.slice(this.opt.prefix.length)
|
|
325
499
|
const attributes = {
|
|
326
500
|
queueName: job.qname,
|
|
@@ -328,7 +502,25 @@ export class JobExecutor {
|
|
|
328
502
|
receiveCount: job.message.Attributes?.ApproximateReceiveCount || '1',
|
|
329
503
|
sentTimestamp: job.message.Attributes?.SentTimestamp || '',
|
|
330
504
|
firstReceiveTimestamp: job.message.Attributes?.ApproximateFirstReceiveTimestamp || '',
|
|
331
|
-
messageGroupId: job.message.Attributes?.MessageGroupId || ''
|
|
505
|
+
messageGroupId: job.message.Attributes?.MessageGroupId || '',
|
|
506
|
+
/** Call with a child process PID to enable kill-after process termination. */
|
|
507
|
+
registerPid: (pid) => {
|
|
508
|
+
if (job.executionMode === 'inline') {
|
|
509
|
+
debug('registerPid ignored after registerInlineExecution', { messageId: job.message?.MessageId })
|
|
510
|
+
return
|
|
511
|
+
}
|
|
512
|
+
if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 1 || pid === process.pid) {
|
|
513
|
+
debug('registerPid: rejected invalid PID', pid)
|
|
514
|
+
return
|
|
515
|
+
}
|
|
516
|
+
job.executionMode = 'child_process'
|
|
517
|
+
job.pid = pid
|
|
518
|
+
if (job.killDue && !job.killed) this.killJob(job, new Date())
|
|
519
|
+
},
|
|
520
|
+
/** Call before inline work starts to opt out of kill-after visibility expiry. */
|
|
521
|
+
registerInlineExecution: async () => {
|
|
522
|
+
await this.registerInlineExecution(job)
|
|
523
|
+
}
|
|
332
524
|
}
|
|
333
525
|
const result = await job.callback(queue, job.payload, attributes)
|
|
334
526
|
debug('executeJob callback finished', { payload: job.payload, result })
|
|
@@ -370,9 +562,11 @@ export class JobExecutor {
|
|
|
370
562
|
err
|
|
371
563
|
}))
|
|
372
564
|
}
|
|
565
|
+
} finally {
|
|
566
|
+
this.clearJobTimers(job)
|
|
567
|
+
this.stats.activeJobs--
|
|
568
|
+
this.stats.runningJobs--
|
|
373
569
|
}
|
|
374
|
-
this.stats.activeJobs--
|
|
375
|
-
this.stats.runningJobs--
|
|
376
570
|
}
|
|
377
571
|
|
|
378
572
|
async executeJobs (messages, callback, qname, qrl) {
|
|
@@ -383,15 +577,12 @@ export class JobExecutor {
|
|
|
383
577
|
const isFifo = qrl.endsWith('.fifo')
|
|
384
578
|
const runningJobs = []
|
|
385
579
|
|
|
386
|
-
// console.log(jobs)
|
|
387
|
-
|
|
388
580
|
// Begin executing
|
|
389
581
|
for (const [job, i] of jobs.map((job, i) => [job, i])) {
|
|
390
582
|
// Figure out if the next job needs to happen in serial, otherwise we can parallel execute
|
|
391
583
|
const nextJob = jobs[i + 1]
|
|
392
584
|
const nextJobIsSerial = isFifo && nextJob && job.message?.Attributes?.MessageGroupId === nextJob.message?.Attributes?.MessageGroupId
|
|
393
585
|
|
|
394
|
-
// console.log({ i, nextJobAtt: nextJob?.message?.Attributes, nextJobIsSerial })
|
|
395
586
|
// Execute serial or parallel
|
|
396
587
|
if (nextJobIsSerial) await this.runJob(job)
|
|
397
588
|
else runningJobs.push(this.runJob(job))
|