qdone 2.2.3 → 2.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commonjs/src/monitor.js +15 -3
- package/commonjs/src/scheduler/jobExecutor.js +144 -13
- package/package.json +1 -1
- package/src/cli.js +5 -3
- package/src/monitor.js +17 -4
- package/src/scheduler/jobExecutor.js +149 -14
package/commonjs/src/monitor.js
CHANGED
|
@@ -26,7 +26,7 @@ async function monitor(queue, save, options) {
|
|
|
26
26
|
const opt = (0, defaults_js_1.getOptionsWithDefaults)(options);
|
|
27
27
|
const queueName = (0, qrlCache_js_1.normalizeQueueName)(queue, opt);
|
|
28
28
|
debug({ options, opt, queue, queueName });
|
|
29
|
-
const data = await getAggregateData(queueName);
|
|
29
|
+
const data = await getAggregateData(queueName, opt);
|
|
30
30
|
console.log(data);
|
|
31
31
|
if (save) {
|
|
32
32
|
if (opt.verbose)
|
|
@@ -85,7 +85,7 @@ async function getQueueAge(queueName) {
|
|
|
85
85
|
* Metrics (from CloudWatch):
|
|
86
86
|
* - ApproximateAgeOfOldestMessage: Max
|
|
87
87
|
*/
|
|
88
|
-
async function getAggregateData(queueName) {
|
|
88
|
+
async function getAggregateData(queueName, opt) {
|
|
89
89
|
const { prefix, suffixRegex } = interpretWildcard(queueName);
|
|
90
90
|
const qrls = await (0, sqs_js_1.getMatchingQueues)(prefix, suffixRegex);
|
|
91
91
|
// debug({ qrls })
|
|
@@ -105,7 +105,19 @@ async function getAggregateData(queueName) {
|
|
|
105
105
|
}
|
|
106
106
|
// Fetch ApproximateAgeOfOldestMessage from CloudWatch (not available via SQS API)
|
|
107
107
|
// Only query queues with messages to minimize CloudWatch API costs
|
|
108
|
-
|
|
108
|
+
// Filter out dead and failed queues for age calculation only — their messages
|
|
109
|
+
// age indefinitely by design, polluting the active age metric.
|
|
110
|
+
// But if the pattern itself targets dead/failed queues, don't filter them out.
|
|
111
|
+
const failSuffix = (opt && opt.failSuffix) || defaults_js_1.defaults.failSuffix;
|
|
112
|
+
const dlqSuffix = (opt && opt.dlqSuffix) || defaults_js_1.defaults.dlqSuffix;
|
|
113
|
+
const strippedPattern = queueName.replace(/\.fifo$/, '');
|
|
114
|
+
const patternTargetsDeadFailed = strippedPattern.endsWith(failSuffix) || strippedPattern.endsWith(dlqSuffix);
|
|
115
|
+
const esc = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
116
|
+
const deadFailedRegex = new RegExp(`(${esc(failSuffix)}|${esc(dlqSuffix)})(\\.fifo)?$`);
|
|
117
|
+
const activeQueueNames = patternTargetsDeadFailed
|
|
118
|
+
? [...total.contributingQueueNames]
|
|
119
|
+
: [...total.contributingQueueNames].filter(q => !deadFailedRegex.test(q));
|
|
120
|
+
const ageResults = await Promise.all(activeQueueNames.map(queue => getQueueAge(queue)));
|
|
109
121
|
total.ApproximateAgeOfOldestMessage = Math.max(0, ...ageResults);
|
|
110
122
|
// debug({ total })
|
|
111
123
|
// convert set to array
|
|
@@ -11,10 +11,13 @@ exports.JobExecutor = void 0;
|
|
|
11
11
|
const client_sqs_1 = require("@aws-sdk/client-sqs");
|
|
12
12
|
const chalk_1 = __importDefault(require("chalk"));
|
|
13
13
|
const debug_1 = __importDefault(require("debug"));
|
|
14
|
+
const tree_kill_1 = __importDefault(require("tree-kill"));
|
|
14
15
|
const dedup_js_1 = require("../dedup.js");
|
|
15
16
|
const sqs_js_1 = require("../sqs.js");
|
|
16
17
|
const debug = (0, debug_1.default)('qdone:jobExecutor');
|
|
17
18
|
const maxJobSeconds = 12 * 60 * 60;
|
|
19
|
+
const defaultVisibilityTimeout = 120;
|
|
20
|
+
const SIGKILL_DELAY_MS = 5000;
|
|
18
21
|
class JobExecutor {
|
|
19
22
|
constructor(opt) {
|
|
20
23
|
this.opt = opt;
|
|
@@ -29,7 +32,8 @@ class JobExecutor {
|
|
|
29
32
|
timeoutsExtended: 0,
|
|
30
33
|
jobsSucceeded: 0,
|
|
31
34
|
jobsFailed: 0,
|
|
32
|
-
jobsDeleted: 0
|
|
35
|
+
jobsDeleted: 0,
|
|
36
|
+
jobsKilled: 0
|
|
33
37
|
};
|
|
34
38
|
this.maintainPromise = this.maintainVisibility();
|
|
35
39
|
debug({ this: this });
|
|
@@ -60,6 +64,103 @@ class JobExecutor {
|
|
|
60
64
|
runningCount += job.status === 'running';
|
|
61
65
|
return runningCount;
|
|
62
66
|
}
|
|
67
|
+
/**
 * Cancels both kill-related timers stored on a job record.
 *
 * job.killTimer is armed by scheduleKillAfter and job.killSignalTimer by
 * killJob; either may be unset, in which case clearTimeout has nothing to do.
 *
 * @param {object} job - Job record whose killTimer / killSignalTimer are cleared.
 */
clearJobTimers(job) {
|
|
68
|
+
clearTimeout(job.killTimer);
|
|
69
|
+
clearTimeout(job.killSignalTimer);
|
|
70
|
+
}
|
|
71
|
+
/**
 * Returns how long a job has been executing, in milliseconds.
 *
 * The value is the difference between `start` and job.executionStart. Callers
 * in this class check that job.executionStart is set before relying on the
 * result.
 *
 * @param {object} job - Job record; reads job.executionStart.
 * @param {Date} [start] - Timestamp treated as "now"; defaults to the current time.
 * @returns {number} Elapsed milliseconds (start - job.executionStart).
 */
getExecutionTimeMs(job, start = new Date()) {
|
|
72
|
+
return start - job.executionStart;
|
|
73
|
+
}
|
|
74
|
+
/**
 * Arms a one-shot timer that enforces the kill-after limit for a job.
 *
 * Does nothing when opt.killAfter is falsy. Any previously armed
 * job.killTimer is cleared first. When the timer fires it marks the job as
 * killDue and calls killJob with the current time.
 *
 * @param {object} job - Job record; writes job.killTimer (and job.killDue when the timer fires).
 */
scheduleKillAfter(job) {
|
|
75
|
+
if (!this.opt.killAfter)
|
|
76
|
+
return;
|
|
77
|
+
clearTimeout(job.killTimer);
|
|
78
|
+
job.killTimer = setTimeout(() => {
|
|
79
|
+
job.killDue = true;
|
|
80
|
+
this.killJob(job, new Date());
|
|
81
|
+
// opt.killAfter is in seconds; setTimeout takes milliseconds.
}, this.opt.killAfter * 1000);
|
|
82
|
+
// unref (when available) so this pending timer alone does not keep the process running.
job.killTimer.unref?.();
|
|
83
|
+
}
|
|
84
|
+
/**
 * Enforces the kill-after limit for one job by terminating its registered
 * child process once the job has executed for at least opt.killAfter seconds.
 *
 * Returns without acting unless the job has an executionStart, is in status
 * 'running', has not already been killed, and the elapsed time has reached
 * the limit. If no PID has been registered yet, only job.killDue is set;
 * registerPid calls killJob again once a PID arrives.
 *
 * When the kill proceeds it sets job.killed, increments stats.jobsKilled,
 * logs the event, sends SIGTERM through killTree, and arms
 * job.killSignalTimer to follow up with SIGKILL after SIGKILL_DELAY_MS.
 *
 * @param {object} job - Job record (reads executionStart, status, pid, message, qname, prettyQname, payload).
 * @param {Date} [start] - Timestamp treated as "now"; defaults to the current time.
 */
killJob(job, start = new Date()) {
|
|
85
|
+
if (!job.executionStart || job.status !== 'running')
|
|
86
|
+
return;
|
|
87
|
+
if (job.killed)
|
|
88
|
+
return;
|
|
89
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start);
|
|
90
|
+
if (executionTimeMs < this.opt.killAfter * 1000)
|
|
91
|
+
return;
|
|
92
|
+
const executionTime = Math.floor(executionTimeMs / 1000);
|
|
93
|
+
// Record that the deadline has passed even if there is no PID to kill yet.
job.killDue = true;
|
|
94
|
+
if (!job.pid) {
|
|
95
|
+
debug('killAfter reached before PID registration', { messageId: job.message?.MessageId, executionTime });
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
job.killed = true;
|
|
99
|
+
this.stats.jobsKilled++;
|
|
100
|
+
const pid = job.pid;
|
|
101
|
+
// opt.killTree, when provided, replaces the default tree-kill implementation.
const killTree = this.opt.killTree || tree_kill_1.default;
|
|
102
|
+
// Verbose mode prints a colored line via console.error; otherwise a JSON
// event is written via console.log unless opt.disableLog is set.
if (this.opt.verbose) {
|
|
103
|
+
console.error(chalk_1.default.red('KILLING'), job.prettyQname, chalk_1.default.red('pid'), pid, chalk_1.default.red('after'), executionTime, chalk_1.default.red('seconds (limit:'), this.opt.killAfter + ')');
|
|
104
|
+
}
|
|
105
|
+
else if (!this.opt.disableLog) {
|
|
106
|
+
console.log(JSON.stringify({
|
|
107
|
+
event: 'JOB_KILL_AFTER',
|
|
108
|
+
timestamp: start,
|
|
109
|
+
queue: job.qname,
|
|
110
|
+
messageId: job.message.MessageId,
|
|
111
|
+
pid,
|
|
112
|
+
executionTime,
|
|
113
|
+
killAfter: this.opt.killAfter,
|
|
114
|
+
payload: job.payload
|
|
115
|
+
}));
|
|
116
|
+
}
|
|
117
|
+
// First attempt: SIGTERM. Failures are only debug-logged.
killTree(pid, 'SIGTERM', (err) => {
|
|
118
|
+
if (err)
|
|
119
|
+
debug('treeKill SIGTERM error', err.message);
|
|
120
|
+
});
|
|
121
|
+
clearTimeout(job.killSignalTimer);
|
|
122
|
+
// Escalation: after SIGKILL_DELAY_MS, send SIGKILL unless the pid is gone.
job.killSignalTimer = setTimeout(() => {
|
|
123
|
+
try {
|
|
124
|
+
// Signal 0 delivers nothing; it only probes whether the pid exists.
process.kill(pid, 0);
|
|
125
|
+
}
|
|
126
|
+
catch (e) {
|
|
127
|
+
// ESRCH: no such process, so there is nothing left to kill.
// Any other probe error falls through to the SIGKILL attempt.
if (e.code === 'ESRCH')
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
killTree(pid, 'SIGKILL', (err) => {
|
|
131
|
+
if (err)
|
|
132
|
+
debug('treeKill SIGKILL error', err.message);
|
|
133
|
+
});
|
|
134
|
+
}, SIGKILL_DELAY_MS);
|
|
135
|
+
// unref (when available) so this pending timer alone does not keep the process running.
job.killSignalTimer.unref?.();
|
|
136
|
+
}
|
|
137
|
+
/**
 * Lowers a job's SQS visibility timeout to opt.killAfter when that limit is
 * shorter than the job's current visibilityTimeout.
 *
 * No-op when opt.killAfter is falsy or when the capped value (never below 1)
 * would not be lower than the current timeout. Otherwise updates
 * job.visibilityTimeout and job.extendAtSecond locally, then sends a
 * ChangeMessageVisibility request for the job's message.
 *
 * Request failures are not rethrown: they are debug-logged and, in verbose
 * mode, printed via console.error. The local job fields are not rolled back.
 *
 * @param {object} job - Job record (reads/writes visibilityTimeout and extendAtSecond; reads qrl and message.ReceiptHandle).
 * @returns {Promise<void>}
 */
async setRunningVisibilityTimeout(job) {
|
|
138
|
+
if (!this.opt.killAfter)
|
|
139
|
+
return;
|
|
140
|
+
const visibilityTimeout = Math.max(1, Math.min(job.visibilityTimeout, this.opt.killAfter));
|
|
141
|
+
// Only ever shorten the timeout here; never extend it.
if (visibilityTimeout >= job.visibilityTimeout)
|
|
142
|
+
return;
|
|
143
|
+
job.visibilityTimeout = visibilityTimeout;
|
|
144
|
+
// Next extension is due at half of the new timeout.
job.extendAtSecond = Math.round(job.visibilityTimeout / 2);
|
|
145
|
+
const input = {
|
|
146
|
+
QueueUrl: job.qrl,
|
|
147
|
+
ReceiptHandle: job.message.ReceiptHandle,
|
|
148
|
+
VisibilityTimeout: job.visibilityTimeout
|
|
149
|
+
};
|
|
150
|
+
debug({ ChangeMessageVisibility: input });
|
|
151
|
+
try {
|
|
152
|
+
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.ChangeMessageVisibilityCommand(input));
|
|
153
|
+
debug('ChangeMessageVisibility returned', result);
|
|
154
|
+
this.stats.sqsCalls++;
|
|
155
|
+
// Counted under timeoutsExtended even though this call lowers the timeout.
this.stats.timeoutsExtended++;
|
|
156
|
+
}
|
|
157
|
+
catch (err) {
|
|
158
|
+
debug('ChangeMessageVisibility error', err);
|
|
159
|
+
if (this.opt.verbose) {
|
|
160
|
+
console.error(chalk_1.default.red('FAILED_TO_SET_VISIBILITY_TIMEOUT'), { err, input });
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
63
164
|
/**
|
|
64
165
|
* Changes message visibility on all running jobs using as few calls as possible.
|
|
65
166
|
*/
|
|
@@ -101,18 +202,33 @@ class JobExecutor {
|
|
|
101
202
|
else if (job.status !== 'deleting') {
|
|
102
203
|
// Any other job state gets visibility accounting
|
|
103
204
|
debug('processing', { job, jobRunTime });
|
|
205
|
+
// Kill-after enforcement: terminate child process if it exceeds the deadline.
|
|
206
|
+
// Uses executionStart (when runJob began) so FIFO serial jobs aren't
|
|
207
|
+
// penalized for queue wait time.
|
|
208
|
+
if (this.opt.killAfter && job.executionStart && !job.killed) {
|
|
209
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start);
|
|
210
|
+
if (executionTimeMs >= this.opt.killAfter * 1000) {
|
|
211
|
+
job.killDue = true;
|
|
212
|
+
this.killJob(job, start);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
104
215
|
if (jobRunTime >= job.extendAtSecond) {
|
|
105
216
|
// Add it to our organized list of jobs
|
|
106
217
|
const jobsToExtend = jobsToExtendByQrl[job.qrl] || [];
|
|
107
218
|
jobsToExtend.push(job);
|
|
108
219
|
jobsToExtendByQrl[job.qrl] = jobsToExtend;
|
|
109
|
-
// Update the visibility timeout, double every time, up to max
|
|
220
|
+
// Update the visibility timeout, double every time, up to max.
|
|
221
|
+
// Only cap at killAfter once execution has started — waiting FIFO
|
|
222
|
+
// jobs should not have their visibility reduced prematurely.
|
|
110
223
|
const doubled = job.visibilityTimeout * 2;
|
|
111
224
|
const secondsUntilMax = Math.max(1, maxJobSeconds - jobRunTime);
|
|
112
|
-
|
|
113
|
-
|
|
225
|
+
const executionTimeMs = job.executionStart ? this.getExecutionTimeMs(job, start) : 0;
|
|
226
|
+
const secondsUntilKill = (this.opt.killAfter && job.executionStart)
|
|
227
|
+
? Math.max(1, Math.ceil((this.opt.killAfter * 1000 - executionTimeMs) / 1000))
|
|
228
|
+
: Infinity;
|
|
229
|
+
job.visibilityTimeout = Math.min(doubled, secondsUntilMax, secondsUntilKill);
|
|
114
230
|
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2); // this is what we use next time
|
|
115
|
-
debug({ doubled, secondsUntilMax, job });
|
|
231
|
+
debug({ doubled, secondsUntilMax, secondsUntilKill, job });
|
|
116
232
|
}
|
|
117
233
|
}
|
|
118
234
|
}
|
|
@@ -160,7 +276,7 @@ class JobExecutor {
|
|
|
160
276
|
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.ChangeMessageVisibilityBatchCommand(input));
|
|
161
277
|
debug('ChangeMessageVisibilityBatch returned', result);
|
|
162
278
|
this.stats.sqsCalls++;
|
|
163
|
-
if (result.Failed) {
|
|
279
|
+
if (result.Failed?.length) {
|
|
164
280
|
console.error('FAILED_MESSAGES', result.Failed);
|
|
165
281
|
for (const failed of result.Failed) {
|
|
166
282
|
console.error('FAILED_TO_EXTEND_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] });
|
|
@@ -169,7 +285,7 @@ class JobExecutor {
|
|
|
169
285
|
this.jobsByMessageId[failed.Id].status = 'failed';
|
|
170
286
|
}
|
|
171
287
|
}
|
|
172
|
-
if (result.Successful) {
|
|
288
|
+
if (result.Successful?.length) {
|
|
173
289
|
const count = result.Successful.length || 0;
|
|
174
290
|
this.stats.timeoutsExtended += count;
|
|
175
291
|
if (this.opt.verbose) {
|
|
@@ -204,7 +320,7 @@ class JobExecutor {
|
|
|
204
320
|
debug({ DeleteMessageBatch: input });
|
|
205
321
|
const result = await (0, sqs_js_1.getSQSClient)().send(new client_sqs_1.DeleteMessageBatchCommand(input));
|
|
206
322
|
this.stats.sqsCalls++;
|
|
207
|
-
if (result.Failed) {
|
|
323
|
+
if (result.Failed?.length) {
|
|
208
324
|
console.error('FAILED_MESSAGES', result.Failed);
|
|
209
325
|
for (const failed of result.Failed) {
|
|
210
326
|
console.error('FAILED_TO_DELETE_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] });
|
|
@@ -213,7 +329,7 @@ class JobExecutor {
|
|
|
213
329
|
this.jobsByMessageId[failed.Id].status = 'failed';
|
|
214
330
|
}
|
|
215
331
|
}
|
|
216
|
-
if (result.Successful) {
|
|
332
|
+
if (result.Successful?.length) {
|
|
217
333
|
const count = result.Successful.length || 0;
|
|
218
334
|
this.stats.jobsDeleted += count;
|
|
219
335
|
if (this.opt.verbose) {
|
|
@@ -245,7 +361,6 @@ class JobExecutor {
|
|
|
245
361
|
}
|
|
246
362
|
addJob(message, callback, qname, qrl) {
|
|
247
363
|
// Create job entry and track it
|
|
248
|
-
const defaultVisibilityTimeout = 120;
|
|
249
364
|
const job = {
|
|
250
365
|
status: 'waiting',
|
|
251
366
|
start: new Date(),
|
|
@@ -308,8 +423,11 @@ class JobExecutor {
|
|
|
308
423
|
}));
|
|
309
424
|
}
|
|
310
425
|
job.status = 'running';
|
|
426
|
+
job.executionStart = new Date();
|
|
311
427
|
this.stats.runningJobs++;
|
|
312
428
|
this.stats.waitingJobs--;
|
|
429
|
+
this.scheduleKillAfter(job);
|
|
430
|
+
await this.setRunningVisibilityTimeout(job);
|
|
313
431
|
const queue = job.qname.slice(this.opt.prefix.length);
|
|
314
432
|
const attributes = {
|
|
315
433
|
queueName: job.qname,
|
|
@@ -317,7 +435,17 @@ class JobExecutor {
|
|
|
317
435
|
receiveCount: job.message.Attributes?.ApproximateReceiveCount || '1',
|
|
318
436
|
sentTimestamp: job.message.Attributes?.SentTimestamp || '',
|
|
319
437
|
firstReceiveTimestamp: job.message.Attributes?.ApproximateFirstReceiveTimestamp || '',
|
|
320
|
-
messageGroupId: job.message.Attributes?.MessageGroupId || ''
|
|
438
|
+
messageGroupId: job.message.Attributes?.MessageGroupId || '',
|
|
439
|
+
/** Call with a child process PID to enable kill-after process termination. */
|
|
440
|
+
registerPid: (pid) => {
|
|
441
|
+
if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 1 || pid === process.pid) {
|
|
442
|
+
debug('registerPid: rejected invalid PID', pid);
|
|
443
|
+
return;
|
|
444
|
+
}
|
|
445
|
+
job.pid = pid;
|
|
446
|
+
if (job.killDue && !job.killed)
|
|
447
|
+
this.killJob(job, new Date());
|
|
448
|
+
}
|
|
321
449
|
};
|
|
322
450
|
const result = await job.callback(queue, job.payload, attributes);
|
|
323
451
|
debug('executeJob callback finished', { payload: job.payload, result });
|
|
@@ -362,8 +490,11 @@ class JobExecutor {
|
|
|
362
490
|
}));
|
|
363
491
|
}
|
|
364
492
|
}
|
|
365
|
-
|
|
366
|
-
|
|
493
|
+
finally {
|
|
494
|
+
this.clearJobTimers(job);
|
|
495
|
+
this.stats.activeJobs--;
|
|
496
|
+
this.stats.runningJobs--;
|
|
497
|
+
}
|
|
367
498
|
}
|
|
368
499
|
async executeJobs(messages, callback, qname, qrl) {
|
|
369
500
|
if (this.shutdownRequested)
|
package/package.json
CHANGED
package/src/cli.js
CHANGED
|
@@ -197,10 +197,11 @@ export async function check (argv, testHook) {
|
|
|
197
197
|
}
|
|
198
198
|
|
|
199
199
|
const monitorOptionDefinitions = [
|
|
200
|
-
{ name: 'save', alias: 's', type: Boolean, description: 'Saves data to CloudWatch' }
|
|
200
|
+
{ name: 'save', alias: 's', type: Boolean, description: 'Saves data to CloudWatch' },
|
|
201
|
+
{ name: 'dlq-suffix', type: String, description: `Suffix to append to each queue to generate DLQ name [default: ${defaults.dlqSuffix}]` }
|
|
201
202
|
]
|
|
202
203
|
|
|
203
|
-
export async function monitor (argv) {
|
|
204
|
+
export async function monitor (argv, testHook) {
|
|
204
205
|
const optionDefinitions = [].concat(monitorOptionDefinitions, globalOptionDefinitions)
|
|
205
206
|
const usageSections = [
|
|
206
207
|
{ content: 'usage: qdone monitor <queuePattern> ', raw: true },
|
|
@@ -234,7 +235,8 @@ export async function monitor (argv) {
|
|
|
234
235
|
|
|
235
236
|
// Load module after AWS global load
|
|
236
237
|
setupAWS(options)
|
|
237
|
-
const { monitor } = await import('./monitor.js')
|
|
238
|
+
const { monitor: monitorOriginal } = await import('./monitor.js')
|
|
239
|
+
const monitor = testHook || monitorOriginal
|
|
238
240
|
return monitor(queue, options.save, options)
|
|
239
241
|
}
|
|
240
242
|
|
package/src/monitor.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { getMatchingQueues, getQueueAttributes } from './sqs.js'
|
|
6
6
|
import { putAggregateData, getCloudWatchClient } from './cloudWatch.js'
|
|
7
7
|
import { GetMetricStatisticsCommand } from '@aws-sdk/client-cloudwatch'
|
|
8
|
-
import { getOptionsWithDefaults } from './defaults.js'
|
|
8
|
+
import { getOptionsWithDefaults, defaults } from './defaults.js'
|
|
9
9
|
import { normalizeQueueName } from './qrlCache.js'
|
|
10
10
|
import Debug from 'debug'
|
|
11
11
|
const debug = Debug('qdone:monitor')
|
|
@@ -18,7 +18,7 @@ export async function monitor (queue, save, options) {
|
|
|
18
18
|
const opt = getOptionsWithDefaults(options)
|
|
19
19
|
const queueName = normalizeQueueName(queue, opt)
|
|
20
20
|
debug({ options, opt, queue, queueName })
|
|
21
|
-
const data = await getAggregateData(queueName)
|
|
21
|
+
const data = await getAggregateData(queueName, opt)
|
|
22
22
|
console.log(data)
|
|
23
23
|
if (save) {
|
|
24
24
|
if (opt.verbose) process.stderr.write('Saving to CloudWatch...')
|
|
@@ -76,7 +76,7 @@ export async function getQueueAge (queueName) {
|
|
|
76
76
|
* Metrics (from CloudWatch):
|
|
77
77
|
* - ApproximateAgeOfOldestMessage: Max
|
|
78
78
|
*/
|
|
79
|
-
export async function getAggregateData (queueName) {
|
|
79
|
+
export async function getAggregateData (queueName, opt) {
|
|
80
80
|
const { prefix, suffixRegex } = interpretWildcard(queueName)
|
|
81
81
|
const qrls = await getMatchingQueues(prefix, suffixRegex)
|
|
82
82
|
// debug({ qrls })
|
|
@@ -97,8 +97,21 @@ export async function getAggregateData (queueName) {
|
|
|
97
97
|
|
|
98
98
|
// Fetch ApproximateAgeOfOldestMessage from CloudWatch (not available via SQS API)
|
|
99
99
|
// Only query queues with messages to minimize CloudWatch API costs
|
|
100
|
+
// Filter out dead and failed queues for age calculation only — their messages
|
|
101
|
+
// age indefinitely by design, polluting the active age metric.
|
|
102
|
+
// But if the pattern itself targets dead/failed queues, don't filter them out.
|
|
103
|
+
const failSuffix = (opt && opt.failSuffix) || defaults.failSuffix
|
|
104
|
+
const dlqSuffix = (opt && opt.dlqSuffix) || defaults.dlqSuffix
|
|
105
|
+
const strippedPattern = queueName.replace(/\.fifo$/, '')
|
|
106
|
+
const patternTargetsDeadFailed = strippedPattern.endsWith(failSuffix) || strippedPattern.endsWith(dlqSuffix)
|
|
107
|
+
const esc = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
|
|
108
|
+
const deadFailedRegex = new RegExp(`(${esc(failSuffix)}|${esc(dlqSuffix)})(\\.fifo)?$`)
|
|
109
|
+
const activeQueueNames = patternTargetsDeadFailed
|
|
110
|
+
? [...total.contributingQueueNames]
|
|
111
|
+
: [...total.contributingQueueNames].filter(q => !deadFailedRegex.test(q))
|
|
112
|
+
|
|
100
113
|
const ageResults = await Promise.all(
|
|
101
|
-
|
|
114
|
+
activeQueueNames.map(queue => getQueueAge(queue))
|
|
102
115
|
)
|
|
103
116
|
total.ApproximateAgeOfOldestMessage = Math.max(0, ...ageResults)
|
|
104
117
|
|
|
@@ -3,10 +3,15 @@
|
|
|
3
3
|
* their visibility timeouts and deleting them when they are successful.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
ChangeMessageVisibilityBatchCommand,
|
|
8
|
+
ChangeMessageVisibilityCommand,
|
|
9
|
+
DeleteMessageBatchCommand
|
|
10
|
+
} from '@aws-sdk/client-sqs'
|
|
7
11
|
|
|
8
12
|
import chalk from 'chalk'
|
|
9
13
|
import Debug from 'debug'
|
|
14
|
+
import treeKill from 'tree-kill'
|
|
10
15
|
|
|
11
16
|
import { dedupSuccessfullyProcessed } from '../dedup.js'
|
|
12
17
|
import { getSQSClient } from '../sqs.js'
|
|
@@ -14,6 +19,8 @@ import { getSQSClient } from '../sqs.js'
|
|
|
14
19
|
const debug = Debug('qdone:jobExecutor')
|
|
15
20
|
|
|
16
21
|
const maxJobSeconds = 12 * 60 * 60
|
|
22
|
+
const defaultVisibilityTimeout = 120
|
|
23
|
+
const SIGKILL_DELAY_MS = 5000
|
|
17
24
|
|
|
18
25
|
export class JobExecutor {
|
|
19
26
|
constructor (opt) {
|
|
@@ -29,7 +36,8 @@ export class JobExecutor {
|
|
|
29
36
|
timeoutsExtended: 0,
|
|
30
37
|
jobsSucceeded: 0,
|
|
31
38
|
jobsFailed: 0,
|
|
32
|
-
jobsDeleted: 0
|
|
39
|
+
jobsDeleted: 0,
|
|
40
|
+
jobsKilled: 0
|
|
33
41
|
}
|
|
34
42
|
this.maintainPromise = this.maintainVisibility()
|
|
35
43
|
debug({ this: this })
|
|
@@ -64,6 +72,103 @@ export class JobExecutor {
|
|
|
64
72
|
return runningCount
|
|
65
73
|
}
|
|
66
74
|
|
|
75
|
+
/**
 * Cancels both kill-related timers stored on a job record.
 *
 * job.killTimer is armed by scheduleKillAfter and job.killSignalTimer by
 * killJob; either may be unset, in which case clearTimeout has nothing to do.
 *
 * @param {object} job - Job record whose killTimer / killSignalTimer are cleared.
 */
clearJobTimers (job) {
|
|
76
|
+
clearTimeout(job.killTimer)
|
|
77
|
+
clearTimeout(job.killSignalTimer)
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
 * Returns how long a job has been executing, in milliseconds.
 *
 * The value is the difference between `start` and job.executionStart. Callers
 * in this class check that job.executionStart is set before relying on the
 * result.
 *
 * @param {object} job - Job record; reads job.executionStart.
 * @param {Date} [start] - Timestamp treated as "now"; defaults to the current time.
 * @returns {number} Elapsed milliseconds (start - job.executionStart).
 */
getExecutionTimeMs (job, start = new Date()) {
|
|
81
|
+
return start - job.executionStart
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
 * Arms a one-shot timer that enforces the kill-after limit for a job.
 *
 * Does nothing when opt.killAfter is falsy. Any previously armed
 * job.killTimer is cleared first. When the timer fires it marks the job as
 * killDue and calls killJob with the current time.
 *
 * @param {object} job - Job record; writes job.killTimer (and job.killDue when the timer fires).
 */
scheduleKillAfter (job) {
|
|
85
|
+
if (!this.opt.killAfter) return
|
|
86
|
+
clearTimeout(job.killTimer)
|
|
87
|
+
job.killTimer = setTimeout(() => {
|
|
88
|
+
job.killDue = true
|
|
89
|
+
this.killJob(job, new Date())
|
|
90
|
+
// opt.killAfter is in seconds; setTimeout takes milliseconds.
}, this.opt.killAfter * 1000)
|
|
91
|
+
// unref (when available) so this pending timer alone does not keep the process running.
job.killTimer.unref?.()
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
 * Enforces the kill-after limit for one job by terminating its registered
 * child process once the job has executed for at least opt.killAfter seconds.
 *
 * Returns without acting unless the job has an executionStart, is in status
 * 'running', has not already been killed, and the elapsed time has reached
 * the limit. If no PID has been registered yet, only job.killDue is set;
 * registerPid calls killJob again once a PID arrives.
 *
 * When the kill proceeds it sets job.killed, increments stats.jobsKilled,
 * logs the event, sends SIGTERM through killTree, and arms
 * job.killSignalTimer to follow up with SIGKILL after SIGKILL_DELAY_MS.
 *
 * @param {object} job - Job record (reads executionStart, status, pid, message, qname, prettyQname, payload).
 * @param {Date} [start] - Timestamp treated as "now"; defaults to the current time.
 */
killJob (job, start = new Date()) {
|
|
95
|
+
if (!job.executionStart || job.status !== 'running') return
|
|
96
|
+
if (job.killed) return
|
|
97
|
+
|
|
98
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start)
|
|
99
|
+
if (executionTimeMs < this.opt.killAfter * 1000) return
|
|
100
|
+
const executionTime = Math.floor(executionTimeMs / 1000)
|
|
101
|
+
|
|
102
|
+
// Record that the deadline has passed even if there is no PID to kill yet.
job.killDue = true
|
|
103
|
+
if (!job.pid) {
|
|
104
|
+
debug('killAfter reached before PID registration', { messageId: job.message?.MessageId, executionTime })
|
|
105
|
+
return
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
job.killed = true
|
|
109
|
+
this.stats.jobsKilled++
|
|
110
|
+
const pid = job.pid
|
|
111
|
+
// opt.killTree, when provided, replaces the default tree-kill implementation.
const killTree = this.opt.killTree || treeKill
|
|
112
|
+
|
|
113
|
+
// Verbose mode prints a colored line via console.error; otherwise a JSON
// event is written via console.log unless opt.disableLog is set.
if (this.opt.verbose) {
|
|
114
|
+
console.error(chalk.red('KILLING'), job.prettyQname, chalk.red('pid'), pid,
|
|
115
|
+
chalk.red('after'), executionTime, chalk.red('seconds (limit:'), this.opt.killAfter + ')')
|
|
116
|
+
} else if (!this.opt.disableLog) {
|
|
117
|
+
console.log(JSON.stringify({
|
|
118
|
+
event: 'JOB_KILL_AFTER',
|
|
119
|
+
timestamp: start,
|
|
120
|
+
queue: job.qname,
|
|
121
|
+
messageId: job.message.MessageId,
|
|
122
|
+
pid,
|
|
123
|
+
executionTime,
|
|
124
|
+
killAfter: this.opt.killAfter,
|
|
125
|
+
payload: job.payload
|
|
126
|
+
}))
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// First attempt: SIGTERM. Failures are only debug-logged.
killTree(pid, 'SIGTERM', (err) => {
|
|
130
|
+
if (err) debug('treeKill SIGTERM error', err.message)
|
|
131
|
+
})
|
|
132
|
+
|
|
133
|
+
clearTimeout(job.killSignalTimer)
|
|
134
|
+
// Escalation: after SIGKILL_DELAY_MS, send SIGKILL unless the pid is gone.
job.killSignalTimer = setTimeout(() => {
|
|
135
|
+
// Signal 0 delivers nothing; it only probes whether the pid exists.
// ESRCH means no such process, so skip SIGKILL; any other probe error
// falls through to the SIGKILL attempt.
try { process.kill(pid, 0) } catch (e) { if (e.code === 'ESRCH') return }
|
|
136
|
+
killTree(pid, 'SIGKILL', (err) => {
|
|
137
|
+
if (err) debug('treeKill SIGKILL error', err.message)
|
|
138
|
+
})
|
|
139
|
+
}, SIGKILL_DELAY_MS)
|
|
140
|
+
// unref (when available) so this pending timer alone does not keep the process running.
job.killSignalTimer.unref?.()
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/**
 * Lowers a job's SQS visibility timeout to opt.killAfter when that limit is
 * shorter than the job's current visibilityTimeout.
 *
 * No-op when opt.killAfter is falsy or when the capped value (never below 1)
 * would not be lower than the current timeout. Otherwise updates
 * job.visibilityTimeout and job.extendAtSecond locally, then sends a
 * ChangeMessageVisibility request for the job's message.
 *
 * Request failures are not rethrown: they are debug-logged and, in verbose
 * mode, printed via console.error. The local job fields are not rolled back.
 *
 * @param {object} job - Job record (reads/writes visibilityTimeout and extendAtSecond; reads qrl and message.ReceiptHandle).
 * @returns {Promise<void>}
 */
async setRunningVisibilityTimeout (job) {
|
|
144
|
+
if (!this.opt.killAfter) return
|
|
145
|
+
|
|
146
|
+
const visibilityTimeout = Math.max(1, Math.min(job.visibilityTimeout, this.opt.killAfter))
|
|
147
|
+
// Only ever shorten the timeout here; never extend it.
if (visibilityTimeout >= job.visibilityTimeout) return
|
|
148
|
+
|
|
149
|
+
job.visibilityTimeout = visibilityTimeout
|
|
150
|
+
// Next extension is due at half of the new timeout.
job.extendAtSecond = Math.round(job.visibilityTimeout / 2)
|
|
151
|
+
|
|
152
|
+
const input = {
|
|
153
|
+
QueueUrl: job.qrl,
|
|
154
|
+
ReceiptHandle: job.message.ReceiptHandle,
|
|
155
|
+
VisibilityTimeout: job.visibilityTimeout
|
|
156
|
+
}
|
|
157
|
+
debug({ ChangeMessageVisibility: input })
|
|
158
|
+
|
|
159
|
+
try {
|
|
160
|
+
const result = await getSQSClient().send(new ChangeMessageVisibilityCommand(input))
|
|
161
|
+
debug('ChangeMessageVisibility returned', result)
|
|
162
|
+
this.stats.sqsCalls++
|
|
163
|
+
// Counted under timeoutsExtended even though this call lowers the timeout.
this.stats.timeoutsExtended++
|
|
164
|
+
} catch (err) {
|
|
165
|
+
debug('ChangeMessageVisibility error', err)
|
|
166
|
+
if (this.opt.verbose) {
|
|
167
|
+
console.error(chalk.red('FAILED_TO_SET_VISIBILITY_TIMEOUT'), { err, input })
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
67
172
|
/**
|
|
68
173
|
* Changes message visibility on all running jobs using as few calls as possible.
|
|
69
174
|
*/
|
|
@@ -106,19 +211,36 @@ export class JobExecutor {
|
|
|
106
211
|
} else if (job.status !== 'deleting') {
|
|
107
212
|
// Any other job state gets visibility accounting
|
|
108
213
|
debug('processing', { job, jobRunTime })
|
|
214
|
+
|
|
215
|
+
// Kill-after enforcement: terminate child process if it exceeds the deadline.
|
|
216
|
+
// Uses executionStart (when runJob began) so FIFO serial jobs aren't
|
|
217
|
+
// penalized for queue wait time.
|
|
218
|
+
if (this.opt.killAfter && job.executionStart && !job.killed) {
|
|
219
|
+
const executionTimeMs = this.getExecutionTimeMs(job, start)
|
|
220
|
+
if (executionTimeMs >= this.opt.killAfter * 1000) {
|
|
221
|
+
job.killDue = true
|
|
222
|
+
this.killJob(job, start)
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
109
226
|
if (jobRunTime >= job.extendAtSecond) {
|
|
110
227
|
// Add it to our organized list of jobs
|
|
111
228
|
const jobsToExtend = jobsToExtendByQrl[job.qrl] || []
|
|
112
229
|
jobsToExtend.push(job)
|
|
113
230
|
jobsToExtendByQrl[job.qrl] = jobsToExtend
|
|
114
231
|
|
|
115
|
-
// Update the visibility timeout, double every time, up to max
|
|
232
|
+
// Update the visibility timeout, double every time, up to max.
|
|
233
|
+
// Only cap at killAfter once execution has started — waiting FIFO
|
|
234
|
+
// jobs should not have their visibility reduced prematurely.
|
|
116
235
|
const doubled = job.visibilityTimeout * 2
|
|
117
236
|
const secondsUntilMax = Math.max(1, maxJobSeconds - jobRunTime)
|
|
118
|
-
|
|
119
|
-
|
|
237
|
+
const executionTimeMs = job.executionStart ? this.getExecutionTimeMs(job, start) : 0
|
|
238
|
+
const secondsUntilKill = (this.opt.killAfter && job.executionStart)
|
|
239
|
+
? Math.max(1, Math.ceil((this.opt.killAfter * 1000 - executionTimeMs) / 1000))
|
|
240
|
+
: Infinity
|
|
241
|
+
job.visibilityTimeout = Math.min(doubled, secondsUntilMax, secondsUntilKill)
|
|
120
242
|
job.extendAtSecond = Math.round(jobRunTime + job.visibilityTimeout / 2) // this is what we use next time
|
|
121
|
-
debug({ doubled, secondsUntilMax, job })
|
|
243
|
+
debug({ doubled, secondsUntilMax, secondsUntilKill, job })
|
|
122
244
|
}
|
|
123
245
|
}
|
|
124
246
|
}
|
|
@@ -164,7 +286,7 @@ export class JobExecutor {
|
|
|
164
286
|
const result = await getSQSClient().send(new ChangeMessageVisibilityBatchCommand(input))
|
|
165
287
|
debug('ChangeMessageVisibilityBatch returned', result)
|
|
166
288
|
this.stats.sqsCalls++
|
|
167
|
-
if (result.Failed) {
|
|
289
|
+
if (result.Failed?.length) {
|
|
168
290
|
console.error('FAILED_MESSAGES', result.Failed)
|
|
169
291
|
for (const failed of result.Failed) {
|
|
170
292
|
console.error('FAILED_TO_EXTEND_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] })
|
|
@@ -172,7 +294,7 @@ export class JobExecutor {
|
|
|
172
294
|
if (this.jobsByMessageId[failed.Id]) this.jobsByMessageId[failed.Id].status = 'failed'
|
|
173
295
|
}
|
|
174
296
|
}
|
|
175
|
-
if (result.Successful) {
|
|
297
|
+
if (result.Successful?.length) {
|
|
176
298
|
const count = result.Successful.length || 0
|
|
177
299
|
this.stats.timeoutsExtended += count
|
|
178
300
|
if (this.opt.verbose) {
|
|
@@ -208,7 +330,7 @@ export class JobExecutor {
|
|
|
208
330
|
debug({ DeleteMessageBatch: input })
|
|
209
331
|
const result = await getSQSClient().send(new DeleteMessageBatchCommand(input))
|
|
210
332
|
this.stats.sqsCalls++
|
|
211
|
-
if (result.Failed) {
|
|
333
|
+
if (result.Failed?.length) {
|
|
212
334
|
console.error('FAILED_MESSAGES', result.Failed)
|
|
213
335
|
for (const failed of result.Failed) {
|
|
214
336
|
console.error('FAILED_TO_DELETE_JOB', { failedEntry: failed, job: this.jobsByMessageId[failed.Id] })
|
|
@@ -216,7 +338,7 @@ export class JobExecutor {
|
|
|
216
338
|
if (this.jobsByMessageId[failed.Id]) this.jobsByMessageId[failed.Id].status = 'failed'
|
|
217
339
|
}
|
|
218
340
|
}
|
|
219
|
-
if (result.Successful) {
|
|
341
|
+
if (result.Successful?.length) {
|
|
220
342
|
const count = result.Successful.length || 0
|
|
221
343
|
this.stats.jobsDeleted += count
|
|
222
344
|
if (this.opt.verbose) {
|
|
@@ -254,7 +376,6 @@ export class JobExecutor {
|
|
|
254
376
|
|
|
255
377
|
addJob (message, callback, qname, qrl) {
|
|
256
378
|
// Create job entry and track it
|
|
257
|
-
const defaultVisibilityTimeout = 120
|
|
258
379
|
const job = {
|
|
259
380
|
status: 'waiting',
|
|
260
381
|
start: new Date(),
|
|
@@ -319,8 +440,11 @@ export class JobExecutor {
|
|
|
319
440
|
}))
|
|
320
441
|
}
|
|
321
442
|
job.status = 'running'
|
|
443
|
+
job.executionStart = new Date()
|
|
322
444
|
this.stats.runningJobs++
|
|
323
445
|
this.stats.waitingJobs--
|
|
446
|
+
this.scheduleKillAfter(job)
|
|
447
|
+
await this.setRunningVisibilityTimeout(job)
|
|
324
448
|
const queue = job.qname.slice(this.opt.prefix.length)
|
|
325
449
|
const attributes = {
|
|
326
450
|
queueName: job.qname,
|
|
@@ -328,7 +452,16 @@ export class JobExecutor {
|
|
|
328
452
|
receiveCount: job.message.Attributes?.ApproximateReceiveCount || '1',
|
|
329
453
|
sentTimestamp: job.message.Attributes?.SentTimestamp || '',
|
|
330
454
|
firstReceiveTimestamp: job.message.Attributes?.ApproximateFirstReceiveTimestamp || '',
|
|
331
|
-
messageGroupId: job.message.Attributes?.MessageGroupId || ''
|
|
455
|
+
messageGroupId: job.message.Attributes?.MessageGroupId || '',
|
|
456
|
+
/** Call with a child process PID to enable kill-after process termination. */
|
|
457
|
+
registerPid: (pid) => {
|
|
458
|
+
if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 1 || pid === process.pid) {
|
|
459
|
+
debug('registerPid: rejected invalid PID', pid)
|
|
460
|
+
return
|
|
461
|
+
}
|
|
462
|
+
job.pid = pid
|
|
463
|
+
if (job.killDue && !job.killed) this.killJob(job, new Date())
|
|
464
|
+
}
|
|
332
465
|
}
|
|
333
466
|
const result = await job.callback(queue, job.payload, attributes)
|
|
334
467
|
debug('executeJob callback finished', { payload: job.payload, result })
|
|
@@ -370,9 +503,11 @@ export class JobExecutor {
|
|
|
370
503
|
err
|
|
371
504
|
}))
|
|
372
505
|
}
|
|
506
|
+
} finally {
|
|
507
|
+
this.clearJobTimers(job)
|
|
508
|
+
this.stats.activeJobs--
|
|
509
|
+
this.stats.runningJobs--
|
|
373
510
|
}
|
|
374
|
-
this.stats.activeJobs--
|
|
375
|
-
this.stats.runningJobs--
|
|
376
511
|
}
|
|
377
512
|
|
|
378
513
|
async executeJobs (messages, callback, qname, qrl) {
|