@monque/core 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import type { ChangeStream, ChangeStreamDocument, Document } from 'mongodb';
2
2
 
3
3
  import { JobStatus } from '@/jobs';
4
+ import { toError } from '@/shared';
4
5
 
5
6
  import type { SchedulerContext } from './types.js';
6
7
 
@@ -133,7 +134,7 @@ export class ChangeStreamHandler {
133
134
  this.debounceTimer = setTimeout(() => {
134
135
  this.debounceTimer = null;
135
136
  this.onPoll().catch((error: unknown) => {
136
- this.ctx.emit('job:error', { error: error as Error });
137
+ this.ctx.emit('job:error', { error: toError(error) });
137
138
  });
138
139
  }, 100);
139
140
  }
@@ -4,5 +4,6 @@ export { JobManager } from './job-manager.js';
4
4
  export { JobProcessor } from './job-processor.js';
5
5
  export { JobQueryService } from './job-query.js';
6
6
  export { JobScheduler } from './job-scheduler.js';
7
+ export { LifecycleManager } from './lifecycle-manager.js';
7
8
  // Types
8
9
  export type { ResolvedMonqueOptions, SchedulerContext } from './types.js';
@@ -1,14 +1,8 @@
1
- import { ObjectId, type WithId } from 'mongodb';
2
-
3
- import {
4
- type BulkOperationResult,
5
- type Job,
6
- type JobSelector,
7
- JobStatus,
8
- type PersistedJob,
9
- } from '@/jobs';
1
+ import { ObjectId } from 'mongodb';
2
+
3
+ import { type BulkOperationResult, type JobSelector, JobStatus, type PersistedJob } from '@/jobs';
10
4
  import { buildSelectorQuery } from '@/scheduler';
11
- import { JobStateError } from '@/shared';
5
+ import { ConnectionError, JobStateError, MonqueError } from '@/shared';
12
6
 
13
7
  import type { SchedulerContext } from './types.js';
14
8
 
@@ -49,17 +43,15 @@ export class JobManager {
49
43
  const jobDoc = await this.ctx.collection.findOne({ _id });
50
44
  if (!jobDoc) return null;
51
45
 
52
- const currentJob = jobDoc as unknown as WithId<Job>;
53
-
54
- if (currentJob.status === JobStatus.CANCELLED) {
55
- return this.ctx.documentToPersistedJob(currentJob);
46
+ if (jobDoc['status'] === JobStatus.CANCELLED) {
47
+ return this.ctx.documentToPersistedJob(jobDoc);
56
48
  }
57
49
 
58
- if (currentJob.status !== JobStatus.PENDING) {
50
+ if (jobDoc['status'] !== JobStatus.PENDING) {
59
51
  throw new JobStateError(
60
- `Cannot cancel job in status '${currentJob.status}'`,
52
+ `Cannot cancel job in status '${jobDoc['status']}'`,
61
53
  jobId,
62
- currentJob.status,
54
+ jobDoc['status'],
63
55
  'cancel',
64
56
  );
65
57
  }
@@ -183,13 +175,11 @@ export class JobManager {
183
175
 
184
176
  if (!currentJobDoc) return null;
185
177
 
186
- const currentJob = currentJobDoc as unknown as WithId<Job>;
187
-
188
- if (currentJob.status !== JobStatus.PENDING) {
178
+ if (currentJobDoc['status'] !== JobStatus.PENDING) {
189
179
  throw new JobStateError(
190
- `Cannot reschedule job in status '${currentJob.status}'`,
180
+ `Cannot reschedule job in status '${currentJobDoc['status']}'`,
191
181
  jobId,
192
- currentJob.status,
182
+ currentJobDoc['status'],
193
183
  'reschedule',
194
184
  );
195
185
  }
@@ -254,14 +244,15 @@ export class JobManager {
254
244
  // ─────────────────────────────────────────────────────────────────────────────
255
245
 
256
246
  /**
257
- * Cancel multiple jobs matching the given filter.
247
+ * Cancel multiple jobs matching the given filter via a single updateMany call.
258
248
  *
259
- * Only cancels jobs in 'pending' status. Jobs in other states are collected
260
- * as errors in the result. Emits a 'jobs:cancelled' event with the IDs of
249
+ * Only cancels jobs in 'pending' status — the status guard is applied regardless
250
+ * of what the filter specifies. Jobs in other states are silently skipped (not
251
+ * matched by the query). Emits a 'jobs:cancelled' event with the count of
261
252
  * successfully cancelled jobs.
262
253
  *
263
254
  * @param filter - Selector for which jobs to cancel (name, status, date range)
264
- * @returns Result with count of cancelled jobs and any errors encountered
255
+ * @returns Result with count of cancelled jobs (errors array always empty for bulk ops)
265
256
  *
266
257
  * @example Cancel all pending jobs for a queue
267
258
  * ```typescript
@@ -273,76 +264,54 @@ export class JobManager {
273
264
  * ```
274
265
  */
275
266
  async cancelJobs(filter: JobSelector): Promise<BulkOperationResult> {
276
- const baseQuery = buildSelectorQuery(filter);
277
- const errors: Array<{ jobId: string; error: string }> = [];
278
- const cancelledIds: string[] = [];
279
-
280
- // Find all matching jobs and stream them to avoid memory pressure
281
- const cursor = this.ctx.collection.find(baseQuery);
282
-
283
- for await (const doc of cursor) {
284
- const job = doc as unknown as WithId<Job>;
285
- const jobId = job._id.toString();
286
-
287
- if (job.status !== JobStatus.PENDING && job.status !== JobStatus.CANCELLED) {
288
- errors.push({
289
- jobId,
290
- error: `Cannot cancel job in status '${job.status}'`,
291
- });
292
- continue;
293
- }
267
+ const query = buildSelectorQuery(filter);
294
268
 
295
- // Skip already cancelled jobs (idempotent)
296
- if (job.status === JobStatus.CANCELLED) {
297
- cancelledIds.push(jobId);
298
- continue;
269
+ // Enforce allowed status, but respect explicit status filters
270
+ if (filter.status !== undefined) {
271
+ const requested = Array.isArray(filter.status) ? filter.status : [filter.status];
272
+ if (!requested.includes(JobStatus.PENDING)) {
273
+ return { count: 0, errors: [] };
299
274
  }
275
+ }
276
+ query['status'] = JobStatus.PENDING;
300
277
 
301
- // Atomically update to cancelled
302
- const result = await this.ctx.collection.findOneAndUpdate(
303
- { _id: job._id, status: JobStatus.PENDING },
304
- {
305
- $set: {
306
- status: JobStatus.CANCELLED,
307
- updatedAt: new Date(),
308
- },
278
+ try {
279
+ const result = await this.ctx.collection.updateMany(query, {
280
+ $set: {
281
+ status: JobStatus.CANCELLED,
282
+ updatedAt: new Date(),
309
283
  },
310
- { returnDocument: 'after' },
311
- );
284
+ });
285
+
286
+ const count = result.modifiedCount;
312
287
 
313
- if (result) {
314
- cancelledIds.push(jobId);
315
- } else {
316
- // Race condition: status changed
317
- errors.push({
318
- jobId,
319
- error: 'Job status changed during cancellation',
320
- });
288
+ if (count > 0) {
289
+ this.ctx.emit('jobs:cancelled', { count });
321
290
  }
322
- }
323
291
 
324
- if (cancelledIds.length > 0) {
325
- this.ctx.emit('jobs:cancelled', {
326
- jobIds: cancelledIds,
327
- count: cancelledIds.length,
328
- });
292
+ return { count, errors: [] };
293
+ } catch (error) {
294
+ if (error instanceof MonqueError) {
295
+ throw error;
296
+ }
297
+ const message = error instanceof Error ? error.message : 'Unknown error during cancelJobs';
298
+ throw new ConnectionError(
299
+ `Failed to cancel jobs: ${message}`,
300
+ error instanceof Error ? { cause: error } : undefined,
301
+ );
329
302
  }
330
-
331
- return {
332
- count: cancelledIds.length,
333
- errors,
334
- };
335
303
  }
336
304
 
337
305
  /**
338
- * Retry multiple jobs matching the given filter.
306
+ * Retry multiple jobs matching the given filter via a single pipeline-style updateMany call.
339
307
  *
340
- * Only retries jobs in 'failed' or 'cancelled' status. Jobs in other states
341
- * are collected as errors in the result. Emits a 'jobs:retried' event with
342
- * the IDs of successfully retried jobs.
308
+ * Only retries jobs in 'failed' or 'cancelled' status — the status guard is applied
309
+ * regardless of what the filter specifies. Jobs in other states are silently skipped.
310
+ * Uses `$rand` for per-document staggered `nextRunAt` to avoid thundering herd on retry.
311
+ * Emits a 'jobs:retried' event with the count of successfully retried jobs.
343
312
  *
344
313
  * @param filter - Selector for which jobs to retry (name, status, date range)
345
- * @returns Result with count of retried jobs and any errors encountered
314
+ * @returns Result with count of retried jobs (errors array always empty for bulk ops)
346
315
  *
347
316
  * @example Retry all failed jobs
348
317
  * ```typescript
@@ -353,68 +322,60 @@ export class JobManager {
353
322
  * ```
354
323
  */
355
324
  async retryJobs(filter: JobSelector): Promise<BulkOperationResult> {
356
- const baseQuery = buildSelectorQuery(filter);
357
- const errors: Array<{ jobId: string; error: string }> = [];
358
- const retriedIds: string[] = [];
359
-
360
- const cursor = this.ctx.collection.find(baseQuery);
361
-
362
- for await (const doc of cursor) {
363
- const job = doc as unknown as WithId<Job>;
364
- const jobId = job._id.toString();
365
-
366
- if (job.status !== JobStatus.FAILED && job.status !== JobStatus.CANCELLED) {
367
- errors.push({
368
- jobId,
369
- error: `Cannot retry job in status '${job.status}'`,
370
- });
371
- continue;
325
+ const query = buildSelectorQuery(filter);
326
+
327
+ // Enforce allowed statuses, but respect explicit status filters
328
+ const retryable = [JobStatus.FAILED, JobStatus.CANCELLED] as const;
329
+ if (filter.status !== undefined) {
330
+ const requested = Array.isArray(filter.status) ? filter.status : [filter.status];
331
+ const allowed = requested.filter(
332
+ (status): status is (typeof retryable)[number] =>
333
+ status === JobStatus.FAILED || status === JobStatus.CANCELLED,
334
+ );
335
+ if (allowed.length === 0) {
336
+ return { count: 0, errors: [] };
372
337
  }
338
+ query['status'] = allowed.length === 1 ? allowed[0] : { $in: allowed };
339
+ } else {
340
+ query['status'] = { $in: retryable };
341
+ }
373
342
 
374
- const result = await this.ctx.collection.findOneAndUpdate(
375
- {
376
- _id: job._id,
377
- status: { $in: [JobStatus.FAILED, JobStatus.CANCELLED] },
378
- },
343
+ const spreadWindowMs = 30_000; // 30s max spread for staggered retry
344
+
345
+ try {
346
+ const result = await this.ctx.collection.updateMany(query, [
379
347
  {
380
348
  $set: {
381
349
  status: JobStatus.PENDING,
382
350
  failCount: 0,
383
- nextRunAt: new Date(),
351
+ nextRunAt: {
352
+ $add: [new Date(), { $multiply: [{ $rand: {} }, spreadWindowMs] }],
353
+ },
384
354
  updatedAt: new Date(),
385
355
  },
386
- $unset: {
387
- failReason: '',
388
- lockedAt: '',
389
- claimedBy: '',
390
- lastHeartbeat: '',
391
- heartbeatInterval: '',
392
- },
393
356
  },
394
- { returnDocument: 'after' },
395
- );
357
+ {
358
+ $unset: ['failReason', 'lockedAt', 'claimedBy', 'lastHeartbeat', 'heartbeatInterval'],
359
+ },
360
+ ]);
361
+
362
+ const count = result.modifiedCount;
396
363
 
397
- if (result) {
398
- retriedIds.push(jobId);
399
- } else {
400
- errors.push({
401
- jobId,
402
- error: 'Job status changed during retry attempt',
403
- });
364
+ if (count > 0) {
365
+ this.ctx.emit('jobs:retried', { count });
404
366
  }
405
- }
406
367
 
407
- if (retriedIds.length > 0) {
408
- this.ctx.emit('jobs:retried', {
409
- jobIds: retriedIds,
410
- count: retriedIds.length,
411
- });
368
+ return { count, errors: [] };
369
+ } catch (error) {
370
+ if (error instanceof MonqueError) {
371
+ throw error;
372
+ }
373
+ const message = error instanceof Error ? error.message : 'Unknown error during retryJobs';
374
+ throw new ConnectionError(
375
+ `Failed to retry jobs: ${message}`,
376
+ error instanceof Error ? { cause: error } : undefined,
377
+ );
412
378
  }
413
-
414
- return {
415
- count: retriedIds.length,
416
- errors,
417
- };
418
379
  }
419
380
 
420
381
  /**
@@ -440,16 +401,27 @@ export class JobManager {
440
401
  async deleteJobs(filter: JobSelector): Promise<BulkOperationResult> {
441
402
  const query = buildSelectorQuery(filter);
442
403
 
443
- // Use deleteMany for efficiency
444
- const result = await this.ctx.collection.deleteMany(query);
404
+ try {
405
+ // Use deleteMany for efficiency
406
+ const result = await this.ctx.collection.deleteMany(query);
445
407
 
446
- if (result.deletedCount > 0) {
447
- this.ctx.emit('jobs:deleted', { count: result.deletedCount });
448
- }
408
+ if (result.deletedCount > 0) {
409
+ this.ctx.emit('jobs:deleted', { count: result.deletedCount });
410
+ }
449
411
 
450
- return {
451
- count: result.deletedCount,
452
- errors: [],
453
- };
412
+ return {
413
+ count: result.deletedCount,
414
+ errors: [],
415
+ };
416
+ } catch (error) {
417
+ if (error instanceof MonqueError) {
418
+ throw error;
419
+ }
420
+ const message = error instanceof Error ? error.message : 'Unknown error during deleteJobs';
421
+ throw new ConnectionError(
422
+ `Failed to delete jobs: ${message}`,
423
+ error instanceof Error ? { cause: error } : undefined,
424
+ );
425
+ }
454
426
  }
455
427
  }
@@ -1,5 +1,5 @@
1
1
  import { isPersistedJob, type Job, JobStatus, type PersistedJob } from '@/jobs';
2
- import { calculateBackoff, getNextCronDate } from '@/shared';
2
+ import { calculateBackoff, getNextCronDate, toError } from '@/shared';
3
3
  import type { WorkerRegistration } from '@/workers';
4
4
 
5
5
  import type { SchedulerContext } from './types.js';
@@ -117,7 +117,7 @@ export class JobProcessor {
117
117
  worker.activeJobs.set(job._id.toString(), job);
118
118
 
119
119
  this.processJob(job, worker).catch((error: unknown) => {
120
- this.ctx.emit('job:error', { error: error as Error, job });
120
+ this.ctx.emit('job:error', { error: toError(error), job });
121
121
  });
122
122
  } else {
123
123
  // No more jobs available for this worker
@@ -189,6 +189,10 @@ export class JobProcessor {
189
189
  * both success and failure cases. On success, calls `completeJob()`. On failure,
190
190
  * calls `failJob()` which implements exponential backoff retry logic.
191
191
  *
192
+ * Events are only emitted when the underlying atomic status transition succeeds,
193
+ * ensuring event consumers receive reliable, consistent data backed by the actual
194
+ * database state.
195
+ *
192
196
  * @param job - The job to process
193
197
  * @param worker - The worker registration containing the handler and active job tracking
194
198
  */
@@ -202,39 +206,50 @@ export class JobProcessor {
202
206
 
203
207
  // Job completed successfully
204
208
  const duration = Date.now() - startTime;
205
- await this.completeJob(job);
206
- this.ctx.emit('job:complete', { job, duration });
209
+ const updatedJob = await this.completeJob(job);
210
+
211
+ if (updatedJob) {
212
+ this.ctx.emit('job:complete', { job: updatedJob, duration });
213
+ }
207
214
  } catch (error) {
208
215
  // Job failed
209
216
  const err = error instanceof Error ? error : new Error(String(error));
210
- await this.failJob(job, err);
217
+ const updatedJob = await this.failJob(job, err);
211
218
 
212
- const willRetry = job.failCount + 1 < this.ctx.options.maxRetries;
213
- this.ctx.emit('job:fail', { job, error: err, willRetry });
219
+ if (updatedJob) {
220
+ const willRetry = updatedJob.status === JobStatus.PENDING;
221
+ this.ctx.emit('job:fail', { job: updatedJob, error: err, willRetry });
222
+ }
214
223
  } finally {
215
224
  worker.activeJobs.delete(jobId);
216
225
  }
217
226
  }
218
227
 
219
228
  /**
220
- * Mark a job as completed successfully.
229
+ * Mark a job as completed successfully using an atomic status transition.
230
+ *
231
+ * Uses `findOneAndUpdate` with `status: processing` and `claimedBy: instanceId`
232
+ * preconditions to ensure the transition only occurs if the job is still owned by this
233
+ * scheduler instance. Returns `null` if the job was concurrently modified (e.g., reclaimed
234
+ * by another instance after stale recovery).
221
235
  *
222
236
  * For recurring jobs (with `repeatInterval`), schedules the next run based on the cron
223
237
  * expression and resets `failCount` to 0. For one-time jobs, sets status to `completed`.
224
238
  * Clears `lockedAt` and `failReason` fields in both cases.
225
239
  *
226
240
  * @param job - The job that completed successfully
241
+ * @returns The updated job document, or `null` if the transition could not be applied
227
242
  */
228
- async completeJob(job: Job): Promise<void> {
243
+ async completeJob(job: Job): Promise<PersistedJob | null> {
229
244
  if (!isPersistedJob(job)) {
230
- return;
245
+ return null;
231
246
  }
232
247
 
233
248
  if (job.repeatInterval) {
234
249
  // Recurring job - schedule next run
235
250
  const nextRunAt = getNextCronDate(job.repeatInterval);
236
- await this.ctx.collection.updateOne(
237
- { _id: job._id },
251
+ const result = await this.ctx.collection.findOneAndUpdate(
252
+ { _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
238
253
  {
239
254
  $set: {
240
255
  status: JobStatus.PENDING,
@@ -250,52 +265,63 @@ export class JobProcessor {
250
265
  failReason: '',
251
266
  },
252
267
  },
268
+ { returnDocument: 'after' },
253
269
  );
254
- } else {
255
- // One-time job - mark as completed
256
- await this.ctx.collection.updateOne(
257
- { _id: job._id },
258
- {
259
- $set: {
260
- status: JobStatus.COMPLETED,
261
- updatedAt: new Date(),
262
- },
263
- $unset: {
264
- lockedAt: '',
265
- claimedBy: '',
266
- lastHeartbeat: '',
267
- heartbeatInterval: '',
268
- failReason: '',
269
- },
270
- },
271
- );
272
- job.status = JobStatus.COMPLETED;
270
+
271
+ return result ? this.ctx.documentToPersistedJob(result) : null;
273
272
  }
273
+
274
+ // One-time job - mark as completed
275
+ const result = await this.ctx.collection.findOneAndUpdate(
276
+ { _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
277
+ {
278
+ $set: {
279
+ status: JobStatus.COMPLETED,
280
+ updatedAt: new Date(),
281
+ },
282
+ $unset: {
283
+ lockedAt: '',
284
+ claimedBy: '',
285
+ lastHeartbeat: '',
286
+ heartbeatInterval: '',
287
+ failReason: '',
288
+ },
289
+ },
290
+ { returnDocument: 'after' },
291
+ );
292
+
293
+ return result ? this.ctx.documentToPersistedJob(result) : null;
274
294
  }
275
295
 
276
296
  /**
277
- * Handle job failure with exponential backoff retry logic.
297
+ * Handle job failure with exponential backoff retry logic using an atomic status transition.
298
+ *
299
+ * Uses `findOneAndUpdate` with `status: processing` and `claimedBy: instanceId`
300
+ * preconditions to ensure the transition only occurs if the job is still owned by this
301
+ * scheduler instance. Returns `null` if the job was concurrently modified (e.g., reclaimed
302
+ * by another instance after stale recovery).
278
303
  *
279
304
  * Increments `failCount` and calculates next retry time using exponential backoff:
280
- * `nextRunAt = 2^failCount × baseRetryInterval` (capped by optional `maxBackoffDelay`).
305
+ * `nextRunAt = 2^failCount * baseRetryInterval` (capped by optional `maxBackoffDelay`).
281
306
  *
282
307
  * If `failCount >= maxRetries`, marks job as permanently `failed`. Otherwise, resets
283
308
  * to `pending` status for retry. Stores error message in `failReason` field.
284
309
  *
285
310
  * @param job - The job that failed
286
311
  * @param error - The error that caused the failure
312
+ * @returns The updated job document, or `null` if the transition could not be applied
287
313
  */
288
- async failJob(job: Job, error: Error): Promise<void> {
314
+ async failJob(job: Job, error: Error): Promise<PersistedJob | null> {
289
315
  if (!isPersistedJob(job)) {
290
- return;
316
+ return null;
291
317
  }
292
318
 
293
319
  const newFailCount = job.failCount + 1;
294
320
 
295
321
  if (newFailCount >= this.ctx.options.maxRetries) {
296
322
  // Permanent failure
297
- await this.ctx.collection.updateOne(
298
- { _id: job._id },
323
+ const result = await this.ctx.collection.findOneAndUpdate(
324
+ { _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
299
325
  {
300
326
  $set: {
301
327
  status: JobStatus.FAILED,
@@ -310,34 +336,40 @@ export class JobProcessor {
310
336
  heartbeatInterval: '',
311
337
  },
312
338
  },
313
- );
314
- } else {
315
- // Schedule retry with exponential backoff
316
- const nextRunAt = calculateBackoff(
317
- newFailCount,
318
- this.ctx.options.baseRetryInterval,
319
- this.ctx.options.maxBackoffDelay,
339
+ { returnDocument: 'after' },
320
340
  );
321
341
 
322
- await this.ctx.collection.updateOne(
323
- { _id: job._id },
324
- {
325
- $set: {
326
- status: JobStatus.PENDING,
327
- failCount: newFailCount,
328
- failReason: error.message,
329
- nextRunAt,
330
- updatedAt: new Date(),
331
- },
332
- $unset: {
333
- lockedAt: '',
334
- claimedBy: '',
335
- lastHeartbeat: '',
336
- heartbeatInterval: '',
337
- },
338
- },
339
- );
342
+ return result ? this.ctx.documentToPersistedJob(result) : null;
340
343
  }
344
+
345
+ // Schedule retry with exponential backoff
346
+ const nextRunAt = calculateBackoff(
347
+ newFailCount,
348
+ this.ctx.options.baseRetryInterval,
349
+ this.ctx.options.maxBackoffDelay,
350
+ );
351
+
352
+ const result = await this.ctx.collection.findOneAndUpdate(
353
+ { _id: job._id, status: JobStatus.PROCESSING, claimedBy: this.ctx.instanceId },
354
+ {
355
+ $set: {
356
+ status: JobStatus.PENDING,
357
+ failCount: newFailCount,
358
+ failReason: error.message,
359
+ nextRunAt,
360
+ updatedAt: new Date(),
361
+ },
362
+ $unset: {
363
+ lockedAt: '',
364
+ claimedBy: '',
365
+ lastHeartbeat: '',
366
+ heartbeatInterval: '',
367
+ },
368
+ },
369
+ { returnDocument: 'after' },
370
+ );
371
+
372
+ return result ? this.ctx.documentToPersistedJob(result) : null;
341
373
  }
342
374
 
343
375
  /**