@monque/core 1.3.0 → 1.5.0
This diff compares the published contents of two versions of a package as released to one of the supported public registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective registries.
- package/dist/CHANGELOG.md +31 -0
- package/dist/index.cjs +589 -325
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +109 -34
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +109 -34
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +590 -327
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
- package/src/events/types.ts +2 -2
- package/src/index.ts +1 -0
- package/src/jobs/document-to-persisted-job.ts +52 -0
- package/src/jobs/index.ts +2 -0
- package/src/scheduler/monque.ts +124 -179
- package/src/scheduler/services/change-stream-handler.ts +2 -1
- package/src/scheduler/services/index.ts +1 -0
- package/src/scheduler/services/job-manager.ts +112 -140
- package/src/scheduler/services/job-processor.ts +94 -62
- package/src/scheduler/services/job-query.ts +81 -36
- package/src/scheduler/services/job-scheduler.ts +42 -2
- package/src/scheduler/services/lifecycle-manager.ts +154 -0
- package/src/scheduler/services/types.ts +5 -1
- package/src/scheduler/types.ts +34 -0
- package/src/shared/errors.ts +31 -0
- package/src/shared/index.ts +2 -0
- package/src/shared/utils/error.ts +33 -0
- package/src/shared/utils/index.ts +1 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type { ChangeStream, ChangeStreamDocument, Document } from 'mongodb';
|
|
2
2
|
|
|
3
3
|
import { JobStatus } from '@/jobs';
|
|
4
|
+
import { toError } from '@/shared';
|
|
4
5
|
|
|
5
6
|
import type { SchedulerContext } from './types.js';
|
|
6
7
|
|
|
@@ -133,7 +134,7 @@ export class ChangeStreamHandler {
|
|
|
133
134
|
this.debounceTimer = setTimeout(() => {
|
|
134
135
|
this.debounceTimer = null;
|
|
135
136
|
this.onPoll().catch((error: unknown) => {
|
|
136
|
-
this.ctx.emit('job:error', { error: error
|
|
137
|
+
this.ctx.emit('job:error', { error: toError(error) });
|
|
137
138
|
});
|
|
138
139
|
}, 100);
|
|
139
140
|
}
|
|
@@ -4,5 +4,6 @@ export { JobManager } from './job-manager.js';
|
|
|
4
4
|
export { JobProcessor } from './job-processor.js';
|
|
5
5
|
export { JobQueryService } from './job-query.js';
|
|
6
6
|
export { JobScheduler } from './job-scheduler.js';
|
|
7
|
+
export { LifecycleManager } from './lifecycle-manager.js';
|
|
7
8
|
// Types
|
|
8
9
|
export type { ResolvedMonqueOptions, SchedulerContext } from './types.js';
|
|
@@ -1,14 +1,8 @@
|
|
|
1
|
-
import { ObjectId
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
type BulkOperationResult,
|
|
5
|
-
type Job,
|
|
6
|
-
type JobSelector,
|
|
7
|
-
JobStatus,
|
|
8
|
-
type PersistedJob,
|
|
9
|
-
} from '@/jobs';
|
|
1
|
+
import { ObjectId } from 'mongodb';
|
|
2
|
+
|
|
3
|
+
import { type BulkOperationResult, type JobSelector, JobStatus, type PersistedJob } from '@/jobs';
|
|
10
4
|
import { buildSelectorQuery } from '@/scheduler';
|
|
11
|
-
import { JobStateError } from '@/shared';
|
|
5
|
+
import { ConnectionError, JobStateError, MonqueError } from '@/shared';
|
|
12
6
|
|
|
13
7
|
import type { SchedulerContext } from './types.js';
|
|
14
8
|
|
|
@@ -49,17 +43,15 @@ export class JobManager {
|
|
|
49
43
|
const jobDoc = await this.ctx.collection.findOne({ _id });
|
|
50
44
|
if (!jobDoc) return null;
|
|
51
45
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
if (currentJob.status === JobStatus.CANCELLED) {
|
|
55
|
-
return this.ctx.documentToPersistedJob(currentJob);
|
|
46
|
+
if (jobDoc['status'] === JobStatus.CANCELLED) {
|
|
47
|
+
return this.ctx.documentToPersistedJob(jobDoc);
|
|
56
48
|
}
|
|
57
49
|
|
|
58
|
-
if (
|
|
50
|
+
if (jobDoc['status'] !== JobStatus.PENDING) {
|
|
59
51
|
throw new JobStateError(
|
|
60
|
-
`Cannot cancel job in status '${
|
|
52
|
+
`Cannot cancel job in status '${jobDoc['status']}'`,
|
|
61
53
|
jobId,
|
|
62
|
-
|
|
54
|
+
jobDoc['status'],
|
|
63
55
|
'cancel',
|
|
64
56
|
);
|
|
65
57
|
}
|
|
@@ -183,13 +175,11 @@ export class JobManager {
|
|
|
183
175
|
|
|
184
176
|
if (!currentJobDoc) return null;
|
|
185
177
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
if (currentJob.status !== JobStatus.PENDING) {
|
|
178
|
+
if (currentJobDoc['status'] !== JobStatus.PENDING) {
|
|
189
179
|
throw new JobStateError(
|
|
190
|
-
`Cannot reschedule job in status '${
|
|
180
|
+
`Cannot reschedule job in status '${currentJobDoc['status']}'`,
|
|
191
181
|
jobId,
|
|
192
|
-
|
|
182
|
+
currentJobDoc['status'],
|
|
193
183
|
'reschedule',
|
|
194
184
|
);
|
|
195
185
|
}
|
|
@@ -254,14 +244,15 @@ export class JobManager {
|
|
|
254
244
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
255
245
|
|
|
256
246
|
/**
|
|
257
|
-
* Cancel multiple jobs matching the given filter.
|
|
247
|
+
* Cancel multiple jobs matching the given filter via a single updateMany call.
|
|
258
248
|
*
|
|
259
|
-
* Only cancels jobs in 'pending' status
|
|
260
|
-
*
|
|
249
|
+
* Only cancels jobs in 'pending' status — the status guard is applied regardless
|
|
250
|
+
* of what the filter specifies. Jobs in other states are silently skipped (not
|
|
251
|
+
* matched by the query). Emits a 'jobs:cancelled' event with the count of
|
|
261
252
|
* successfully cancelled jobs.
|
|
262
253
|
*
|
|
263
254
|
* @param filter - Selector for which jobs to cancel (name, status, date range)
|
|
264
|
-
* @returns Result with count of cancelled jobs
|
|
255
|
+
* @returns Result with count of cancelled jobs (errors array always empty for bulk ops)
|
|
265
256
|
*
|
|
266
257
|
* @example Cancel all pending jobs for a queue
|
|
267
258
|
* ```typescript
|
|
@@ -273,76 +264,54 @@ export class JobManager {
|
|
|
273
264
|
* ```
|
|
274
265
|
*/
|
|
275
266
|
async cancelJobs(filter: JobSelector): Promise<BulkOperationResult> {
|
|
276
|
-
const
|
|
277
|
-
const errors: Array<{ jobId: string; error: string }> = [];
|
|
278
|
-
const cancelledIds: string[] = [];
|
|
279
|
-
|
|
280
|
-
// Find all matching jobs and stream them to avoid memory pressure
|
|
281
|
-
const cursor = this.ctx.collection.find(baseQuery);
|
|
282
|
-
|
|
283
|
-
for await (const doc of cursor) {
|
|
284
|
-
const job = doc as unknown as WithId<Job>;
|
|
285
|
-
const jobId = job._id.toString();
|
|
286
|
-
|
|
287
|
-
if (job.status !== JobStatus.PENDING && job.status !== JobStatus.CANCELLED) {
|
|
288
|
-
errors.push({
|
|
289
|
-
jobId,
|
|
290
|
-
error: `Cannot cancel job in status '${job.status}'`,
|
|
291
|
-
});
|
|
292
|
-
continue;
|
|
293
|
-
}
|
|
267
|
+
const query = buildSelectorQuery(filter);
|
|
294
268
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
269
|
+
// Enforce allowed status, but respect explicit status filters
|
|
270
|
+
if (filter.status !== undefined) {
|
|
271
|
+
const requested = Array.isArray(filter.status) ? filter.status : [filter.status];
|
|
272
|
+
if (!requested.includes(JobStatus.PENDING)) {
|
|
273
|
+
return { count: 0, errors: [] };
|
|
299
274
|
}
|
|
275
|
+
}
|
|
276
|
+
query['status'] = JobStatus.PENDING;
|
|
300
277
|
|
|
301
|
-
|
|
302
|
-
const result = await this.ctx.collection.
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
status: JobStatus.CANCELLED,
|
|
307
|
-
updatedAt: new Date(),
|
|
308
|
-
},
|
|
278
|
+
try {
|
|
279
|
+
const result = await this.ctx.collection.updateMany(query, {
|
|
280
|
+
$set: {
|
|
281
|
+
status: JobStatus.CANCELLED,
|
|
282
|
+
updatedAt: new Date(),
|
|
309
283
|
},
|
|
310
|
-
|
|
311
|
-
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
const count = result.modifiedCount;
|
|
312
287
|
|
|
313
|
-
if (
|
|
314
|
-
|
|
315
|
-
} else {
|
|
316
|
-
// Race condition: status changed
|
|
317
|
-
errors.push({
|
|
318
|
-
jobId,
|
|
319
|
-
error: 'Job status changed during cancellation',
|
|
320
|
-
});
|
|
288
|
+
if (count > 0) {
|
|
289
|
+
this.ctx.emit('jobs:cancelled', { count });
|
|
321
290
|
}
|
|
322
|
-
}
|
|
323
291
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
}
|
|
292
|
+
return { count, errors: [] };
|
|
293
|
+
} catch (error) {
|
|
294
|
+
if (error instanceof MonqueError) {
|
|
295
|
+
throw error;
|
|
296
|
+
}
|
|
297
|
+
const message = error instanceof Error ? error.message : 'Unknown error during cancelJobs';
|
|
298
|
+
throw new ConnectionError(
|
|
299
|
+
`Failed to cancel jobs: ${message}`,
|
|
300
|
+
error instanceof Error ? { cause: error } : undefined,
|
|
301
|
+
);
|
|
329
302
|
}
|
|
330
|
-
|
|
331
|
-
return {
|
|
332
|
-
count: cancelledIds.length,
|
|
333
|
-
errors,
|
|
334
|
-
};
|
|
335
303
|
}
|
|
336
304
|
|
|
337
305
|
/**
|
|
338
|
-
* Retry multiple jobs matching the given filter.
|
|
306
|
+
* Retry multiple jobs matching the given filter via a single pipeline-style updateMany call.
|
|
339
307
|
*
|
|
340
|
-
* Only retries jobs in 'failed' or 'cancelled' status
|
|
341
|
-
*
|
|
342
|
-
*
|
|
308
|
+
* Only retries jobs in 'failed' or 'cancelled' status — the status guard is applied
|
|
309
|
+
* regardless of what the filter specifies. Jobs in other states are silently skipped.
|
|
310
|
+
* Uses `$rand` for per-document staggered `nextRunAt` to avoid thundering herd on retry.
|
|
311
|
+
* Emits a 'jobs:retried' event with the count of successfully retried jobs.
|
|
343
312
|
*
|
|
344
313
|
* @param filter - Selector for which jobs to retry (name, status, date range)
|
|
345
|
-
* @returns Result with count of retried jobs
|
|
314
|
+
* @returns Result with count of retried jobs (errors array always empty for bulk ops)
|
|
346
315
|
*
|
|
347
316
|
* @example Retry all failed jobs
|
|
348
317
|
* ```typescript
|
|
@@ -353,68 +322,60 @@ export class JobManager {
|
|
|
353
322
|
* ```
|
|
354
323
|
*/
|
|
355
324
|
async retryJobs(filter: JobSelector): Promise<BulkOperationResult> {
|
|
356
|
-
const
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
if (
|
|
367
|
-
errors
|
|
368
|
-
jobId,
|
|
369
|
-
error: `Cannot retry job in status '${job.status}'`,
|
|
370
|
-
});
|
|
371
|
-
continue;
|
|
325
|
+
const query = buildSelectorQuery(filter);
|
|
326
|
+
|
|
327
|
+
// Enforce allowed statuses, but respect explicit status filters
|
|
328
|
+
const retryable = [JobStatus.FAILED, JobStatus.CANCELLED] as const;
|
|
329
|
+
if (filter.status !== undefined) {
|
|
330
|
+
const requested = Array.isArray(filter.status) ? filter.status : [filter.status];
|
|
331
|
+
const allowed = requested.filter(
|
|
332
|
+
(status): status is (typeof retryable)[number] =>
|
|
333
|
+
status === JobStatus.FAILED || status === JobStatus.CANCELLED,
|
|
334
|
+
);
|
|
335
|
+
if (allowed.length === 0) {
|
|
336
|
+
return { count: 0, errors: [] };
|
|
372
337
|
}
|
|
338
|
+
query['status'] = allowed.length === 1 ? allowed[0] : { $in: allowed };
|
|
339
|
+
} else {
|
|
340
|
+
query['status'] = { $in: retryable };
|
|
341
|
+
}
|
|
373
342
|
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
},
|
|
343
|
+
const spreadWindowMs = 30_000; // 30s max spread for staggered retry
|
|
344
|
+
|
|
345
|
+
try {
|
|
346
|
+
const result = await this.ctx.collection.updateMany(query, [
|
|
379
347
|
{
|
|
380
348
|
$set: {
|
|
381
349
|
status: JobStatus.PENDING,
|
|
382
350
|
failCount: 0,
|
|
383
|
-
nextRunAt:
|
|
351
|
+
nextRunAt: {
|
|
352
|
+
$add: [new Date(), { $multiply: [{ $rand: {} }, spreadWindowMs] }],
|
|
353
|
+
},
|
|
384
354
|
updatedAt: new Date(),
|
|
385
355
|
},
|
|
386
|
-
$unset: {
|
|
387
|
-
failReason: '',
|
|
388
|
-
lockedAt: '',
|
|
389
|
-
claimedBy: '',
|
|
390
|
-
lastHeartbeat: '',
|
|
391
|
-
heartbeatInterval: '',
|
|
392
|
-
},
|
|
393
356
|
},
|
|
394
|
-
{
|
|
395
|
-
|
|
357
|
+
{
|
|
358
|
+
$unset: ['failReason', 'lockedAt', 'claimedBy', 'lastHeartbeat', 'heartbeatInterval'],
|
|
359
|
+
},
|
|
360
|
+
]);
|
|
361
|
+
|
|
362
|
+
const count = result.modifiedCount;
|
|
396
363
|
|
|
397
|
-
if (
|
|
398
|
-
|
|
399
|
-
} else {
|
|
400
|
-
errors.push({
|
|
401
|
-
jobId,
|
|
402
|
-
error: 'Job status changed during retry attempt',
|
|
403
|
-
});
|
|
364
|
+
if (count > 0) {
|
|
365
|
+
this.ctx.emit('jobs:retried', { count });
|
|
404
366
|
}
|
|
405
|
-
}
|
|
406
367
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
}
|
|
368
|
+
return { count, errors: [] };
|
|
369
|
+
} catch (error) {
|
|
370
|
+
if (error instanceof MonqueError) {
|
|
371
|
+
throw error;
|
|
372
|
+
}
|
|
373
|
+
const message = error instanceof Error ? error.message : 'Unknown error during retryJobs';
|
|
374
|
+
throw new ConnectionError(
|
|
375
|
+
`Failed to retry jobs: ${message}`,
|
|
376
|
+
error instanceof Error ? { cause: error } : undefined,
|
|
377
|
+
);
|
|
412
378
|
}
|
|
413
|
-
|
|
414
|
-
return {
|
|
415
|
-
count: retriedIds.length,
|
|
416
|
-
errors,
|
|
417
|
-
};
|
|
418
379
|
}
|
|
419
380
|
|
|
420
381
|
/**
|
|
@@ -440,16 +401,27 @@ export class JobManager {
|
|
|
440
401
|
async deleteJobs(filter: JobSelector): Promise<BulkOperationResult> {
|
|
441
402
|
const query = buildSelectorQuery(filter);
|
|
442
403
|
|
|
443
|
-
|
|
444
|
-
|
|
404
|
+
try {
|
|
405
|
+
// Use deleteMany for efficiency
|
|
406
|
+
const result = await this.ctx.collection.deleteMany(query);
|
|
445
407
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
408
|
+
if (result.deletedCount > 0) {
|
|
409
|
+
this.ctx.emit('jobs:deleted', { count: result.deletedCount });
|
|
410
|
+
}
|
|
449
411
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
412
|
+
return {
|
|
413
|
+
count: result.deletedCount,
|
|
414
|
+
errors: [],
|
|
415
|
+
};
|
|
416
|
+
} catch (error) {
|
|
417
|
+
if (error instanceof MonqueError) {
|
|
418
|
+
throw error;
|
|
419
|
+
}
|
|
420
|
+
const message = error instanceof Error ? error.message : 'Unknown error during deleteJobs';
|
|
421
|
+
throw new ConnectionError(
|
|
422
|
+
`Failed to delete jobs: ${message}`,
|
|
423
|
+
error instanceof Error ? { cause: error } : undefined,
|
|
424
|
+
);
|
|
425
|
+
}
|
|
454
426
|
}
|
|
455
427
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { isPersistedJob, type Job, JobStatus, type PersistedJob } from '@/jobs';
|
|
2
|
-
import { calculateBackoff, getNextCronDate } from '@/shared';
|
|
2
|
+
import { calculateBackoff, getNextCronDate, toError } from '@/shared';
|
|
3
3
|
import type { WorkerRegistration } from '@/workers';
|
|
4
4
|
|
|
5
5
|
import type { SchedulerContext } from './types.js';
|
|
@@ -117,7 +117,7 @@ export class JobProcessor {
|
|
|
117
117
|
worker.activeJobs.set(job._id.toString(), job);
|
|
118
118
|
|
|
119
119
|
this.processJob(job, worker).catch((error: unknown) => {
|
|
120
|
-
this.ctx.emit('job:error', { error: error
|
|
120
|
+
this.ctx.emit('job:error', { error: toError(error), job });
|
|
121
121
|
});
|
|
122
122
|
} else {
|
|
123
123
|
// No more jobs available for this worker
|
|
@@ -189,6 +189,10 @@ export class JobProcessor {
|
|
|
189
189
|
* both success and failure cases. On success, calls `completeJob()`. On failure,
|
|
190
190
|
* calls `failJob()` which implements exponential backoff retry logic.
|
|
191
191
|
*
|
|
192
|
+
* Events are only emitted when the underlying atomic status transition succeeds,
|
|
193
|
+
* ensuring event consumers receive reliable, consistent data backed by the actual
|
|
194
|
+
* database state.
|
|
195
|
+
*
|
|
192
196
|
* @param job - The job to process
|
|
193
197
|
* @param worker - The worker registration containing the handler and active job tracking
|
|
194
198
|
*/
|
|
@@ -202,39 +206,50 @@ export class JobProcessor {
|
|
|
202
206
|
|
|
203
207
|
// Job completed successfully
|
|
204
208
|
const duration = Date.now() - startTime;
|
|
205
|
-
await this.completeJob(job);
|
|
206
|
-
|
|
209
|
+
const updatedJob = await this.completeJob(job);
|
|
210
|
+
|
|
211
|
+
if (updatedJob) {
|
|
212
|
+
this.ctx.emit('job:complete', { job: updatedJob, duration });
|
|
213
|
+
}
|
|
207
214
|
} catch (error) {
|
|
208
215
|
// Job failed
|
|
209
216
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
210
|
-
await this.failJob(job, err);
|
|
217
|
+
const updatedJob = await this.failJob(job, err);
|
|
211
218
|
|
|
212
|
-
|
|
213
|
-
|
|
219
|
+
if (updatedJob) {
|
|
220
|
+
const willRetry = updatedJob.status === JobStatus.PENDING;
|
|
221
|
+
this.ctx.emit('job:fail', { job: updatedJob, error: err, willRetry });
|
|
222
|
+
}
|
|
214
223
|
} finally {
|
|
215
224
|
worker.activeJobs.delete(jobId);
|
|
216
225
|
}
|
|
217
226
|
}
|
|
218
227
|
|
|
219
228
|
/**
|
|
220
|
-
* Mark a job as completed successfully.
|
|
229
|
+
* Mark a job as completed successfully using an atomic status transition.
|
|
230
|
+
*
|
|
231
|
+
* Uses `findOneAndUpdate` with `status: processing` and `claimedBy: instanceId`
|
|
232
|
+
* preconditions to ensure the transition only occurs if the job is still owned by this
|
|
233
|
+
* scheduler instance. Returns `null` if the job was concurrently modified (e.g., reclaimed
|
|
234
|
+
* by another instance after stale recovery).
|
|
221
235
|
*
|
|
222
236
|
* For recurring jobs (with `repeatInterval`), schedules the next run based on the cron
|
|
223
237
|
* expression and resets `failCount` to 0. For one-time jobs, sets status to `completed`.
|
|
224
238
|
* Clears `lockedAt` and `failReason` fields in both cases.
|
|
225
239
|
*
|
|
226
240
|
* @param job - The job that completed successfully
|
|
241
|
+
* @returns The updated job document, or `null` if the transition could not be applied
|
|
227
242
|
*/
|
|
228
|
-
async completeJob(job: Job): Promise<
|
|
243
|
+
async completeJob(job: Job): Promise<PersistedJob | null> {
|
|
229
244
|
if (!isPersistedJob(job)) {
|
|
230
|
-
return;
|
|
245
|
+
return null;
|
|
231
246
|
}
|
|
232
247
|
|
|
233
248
|
if (job.repeatInterval) {
|
|
234
249
|
// Recurring job - schedule next run
|
|
235
250
|
const nextRunAt = getNextCronDate(job.repeatInterval);
|
|
236
|
-
await this.ctx.collection.
|
|
237
|
-
{ _id: job._id },
|
|
251
|
+
const result = await this.ctx.collection.findOneAndUpdate(
|
|
252
|
+
{ _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
|
|
238
253
|
{
|
|
239
254
|
$set: {
|
|
240
255
|
status: JobStatus.PENDING,
|
|
@@ -250,52 +265,63 @@ export class JobProcessor {
|
|
|
250
265
|
failReason: '',
|
|
251
266
|
},
|
|
252
267
|
},
|
|
268
|
+
{ returnDocument: 'after' },
|
|
253
269
|
);
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
await this.ctx.collection.updateOne(
|
|
257
|
-
{ _id: job._id },
|
|
258
|
-
{
|
|
259
|
-
$set: {
|
|
260
|
-
status: JobStatus.COMPLETED,
|
|
261
|
-
updatedAt: new Date(),
|
|
262
|
-
},
|
|
263
|
-
$unset: {
|
|
264
|
-
lockedAt: '',
|
|
265
|
-
claimedBy: '',
|
|
266
|
-
lastHeartbeat: '',
|
|
267
|
-
heartbeatInterval: '',
|
|
268
|
-
failReason: '',
|
|
269
|
-
},
|
|
270
|
-
},
|
|
271
|
-
);
|
|
272
|
-
job.status = JobStatus.COMPLETED;
|
|
270
|
+
|
|
271
|
+
return result ? this.ctx.documentToPersistedJob(result) : null;
|
|
273
272
|
}
|
|
273
|
+
|
|
274
|
+
// One-time job - mark as completed
|
|
275
|
+
const result = await this.ctx.collection.findOneAndUpdate(
|
|
276
|
+
{ _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
|
|
277
|
+
{
|
|
278
|
+
$set: {
|
|
279
|
+
status: JobStatus.COMPLETED,
|
|
280
|
+
updatedAt: new Date(),
|
|
281
|
+
},
|
|
282
|
+
$unset: {
|
|
283
|
+
lockedAt: '',
|
|
284
|
+
claimedBy: '',
|
|
285
|
+
lastHeartbeat: '',
|
|
286
|
+
heartbeatInterval: '',
|
|
287
|
+
failReason: '',
|
|
288
|
+
},
|
|
289
|
+
},
|
|
290
|
+
{ returnDocument: 'after' },
|
|
291
|
+
);
|
|
292
|
+
|
|
293
|
+
return result ? this.ctx.documentToPersistedJob(result) : null;
|
|
274
294
|
}
|
|
275
295
|
|
|
276
296
|
/**
|
|
277
|
-
* Handle job failure with exponential backoff retry logic.
|
|
297
|
+
* Handle job failure with exponential backoff retry logic using an atomic status transition.
|
|
298
|
+
*
|
|
299
|
+
* Uses `findOneAndUpdate` with `status: processing` and `claimedBy: instanceId`
|
|
300
|
+
* preconditions to ensure the transition only occurs if the job is still owned by this
|
|
301
|
+
* scheduler instance. Returns `null` if the job was concurrently modified (e.g., reclaimed
|
|
302
|
+
* by another instance after stale recovery).
|
|
278
303
|
*
|
|
279
304
|
* Increments `failCount` and calculates next retry time using exponential backoff:
|
|
280
|
-
* `nextRunAt = 2^failCount
|
|
305
|
+
* `nextRunAt = 2^failCount * baseRetryInterval` (capped by optional `maxBackoffDelay`).
|
|
281
306
|
*
|
|
282
307
|
* If `failCount >= maxRetries`, marks job as permanently `failed`. Otherwise, resets
|
|
283
308
|
* to `pending` status for retry. Stores error message in `failReason` field.
|
|
284
309
|
*
|
|
285
310
|
* @param job - The job that failed
|
|
286
311
|
* @param error - The error that caused the failure
|
|
312
|
+
* @returns The updated job document, or `null` if the transition could not be applied
|
|
287
313
|
*/
|
|
288
|
-
async failJob(job: Job, error: Error): Promise<
|
|
314
|
+
async failJob(job: Job, error: Error): Promise<PersistedJob | null> {
|
|
289
315
|
if (!isPersistedJob(job)) {
|
|
290
|
-
return;
|
|
316
|
+
return null;
|
|
291
317
|
}
|
|
292
318
|
|
|
293
319
|
const newFailCount = job.failCount + 1;
|
|
294
320
|
|
|
295
321
|
if (newFailCount >= this.ctx.options.maxRetries) {
|
|
296
322
|
// Permanent failure
|
|
297
|
-
await this.ctx.collection.
|
|
298
|
-
{ _id: job._id },
|
|
323
|
+
const result = await this.ctx.collection.findOneAndUpdate(
|
|
324
|
+
{ _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
|
|
299
325
|
{
|
|
300
326
|
$set: {
|
|
301
327
|
status: JobStatus.FAILED,
|
|
@@ -310,34 +336,40 @@ export class JobProcessor {
|
|
|
310
336
|
heartbeatInterval: '',
|
|
311
337
|
},
|
|
312
338
|
},
|
|
313
|
-
|
|
314
|
-
} else {
|
|
315
|
-
// Schedule retry with exponential backoff
|
|
316
|
-
const nextRunAt = calculateBackoff(
|
|
317
|
-
newFailCount,
|
|
318
|
-
this.ctx.options.baseRetryInterval,
|
|
319
|
-
this.ctx.options.maxBackoffDelay,
|
|
339
|
+
{ returnDocument: 'after' },
|
|
320
340
|
);
|
|
321
341
|
|
|
322
|
-
|
|
323
|
-
{ _id: job._id },
|
|
324
|
-
{
|
|
325
|
-
$set: {
|
|
326
|
-
status: JobStatus.PENDING,
|
|
327
|
-
failCount: newFailCount,
|
|
328
|
-
failReason: error.message,
|
|
329
|
-
nextRunAt,
|
|
330
|
-
updatedAt: new Date(),
|
|
331
|
-
},
|
|
332
|
-
$unset: {
|
|
333
|
-
lockedAt: '',
|
|
334
|
-
claimedBy: '',
|
|
335
|
-
lastHeartbeat: '',
|
|
336
|
-
heartbeatInterval: '',
|
|
337
|
-
},
|
|
338
|
-
},
|
|
339
|
-
);
|
|
342
|
+
return result ? this.ctx.documentToPersistedJob(result) : null;
|
|
340
343
|
}
|
|
344
|
+
|
|
345
|
+
// Schedule retry with exponential backoff
|
|
346
|
+
const nextRunAt = calculateBackoff(
|
|
347
|
+
newFailCount,
|
|
348
|
+
this.ctx.options.baseRetryInterval,
|
|
349
|
+
this.ctx.options.maxBackoffDelay,
|
|
350
|
+
);
|
|
351
|
+
|
|
352
|
+
const result = await this.ctx.collection.findOneAndUpdate(
|
|
353
|
+
{ _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
|
|
354
|
+
{
|
|
355
|
+
$set: {
|
|
356
|
+
status: JobStatus.PENDING,
|
|
357
|
+
failCount: newFailCount,
|
|
358
|
+
failReason: error.message,
|
|
359
|
+
nextRunAt,
|
|
360
|
+
updatedAt: new Date(),
|
|
361
|
+
},
|
|
362
|
+
$unset: {
|
|
363
|
+
lockedAt: '',
|
|
364
|
+
claimedBy: '',
|
|
365
|
+
lastHeartbeat: '',
|
|
366
|
+
heartbeatInterval: '',
|
|
367
|
+
},
|
|
368
|
+
},
|
|
369
|
+
{ returnDocument: 'after' },
|
|
370
|
+
);
|
|
371
|
+
|
|
372
|
+
return result ? this.ctx.documentToPersistedJob(result) : null;
|
|
341
373
|
}
|
|
342
374
|
|
|
343
375
|
/**
|