@nest-batch/bullmq 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +333 -0
  3. package/dist/src/adapters/bullmq.adapter.d.ts +157 -0
  4. package/dist/src/adapters/bullmq.adapter.d.ts.map +1 -0
  5. package/dist/src/adapters/bullmq.adapter.js +252 -0
  6. package/dist/src/adapters/bullmq.adapter.js.map +1 -0
  7. package/dist/src/adapters/index.d.ts +12 -0
  8. package/dist/src/adapters/index.d.ts.map +1 -0
  9. package/dist/src/adapters/index.js +29 -0
  10. package/dist/src/adapters/index.js.map +1 -0
  11. package/dist/src/bullmq-execution-strategy.d.ts +59 -0
  12. package/dist/src/bullmq-execution-strategy.d.ts.map +1 -0
  13. package/dist/src/bullmq-execution-strategy.js +60 -0
  14. package/dist/src/bullmq-execution-strategy.js.map +1 -0
  15. package/dist/src/bullmq-runtime.service.d.ts +237 -0
  16. package/dist/src/bullmq-runtime.service.d.ts.map +1 -0
  17. package/dist/src/bullmq-runtime.service.js +441 -0
  18. package/dist/src/bullmq-runtime.service.js.map +1 -0
  19. package/dist/src/bullmq-schedule.service.d.ts +121 -0
  20. package/dist/src/bullmq-schedule.service.d.ts.map +1 -0
  21. package/dist/src/bullmq-schedule.service.js +232 -0
  22. package/dist/src/bullmq-schedule.service.js.map +1 -0
  23. package/dist/src/connection.d.ts +83 -0
  24. package/dist/src/connection.d.ts.map +1 -0
  25. package/dist/src/connection.js +72 -0
  26. package/dist/src/connection.js.map +1 -0
  27. package/dist/src/index.d.ts +29 -0
  28. package/dist/src/index.d.ts.map +1 -0
  29. package/dist/src/index.js +46 -0
  30. package/dist/src/index.js.map +1 -0
  31. package/dist/src/module-options.d.ts +68 -0
  32. package/dist/src/module-options.d.ts.map +1 -0
  33. package/dist/src/module-options.js +13 -0
  34. package/dist/src/module-options.js.map +1 -0
  35. package/package.json +71 -0
  36. package/src/adapters/bullmq.adapter.ts +346 -0
  37. package/src/adapters/index.ts +11 -0
  38. package/src/bullmq-execution-strategy.ts +81 -0
  39. package/src/bullmq-runtime.service.ts +540 -0
  40. package/src/bullmq-schedule.service.ts +271 -0
  41. package/src/connection.ts +97 -0
  42. package/src/index.ts +28 -0
  43. package/src/module-options.ts +74 -0
@@ -0,0 +1,81 @@
1
+ import { Injectable, Logger } from '@nestjs/common';
2
+
3
+ import {
4
+ type ExecutionStrategyContext,
5
+ type IExecutionStrategy,
6
+ type JobDefinition,
7
+ type JobParameters,
8
+ type LaunchResult,
9
+ } from '@nest-batch/core';
10
+
11
+ import {
12
+ BullmqRuntimeService,
13
+ BULLMQ_STRATEGY_NAME,
14
+ } from './bullmq-runtime.service';
15
+
16
/**
 * `IExecutionStrategy` implementation that `@nest-batch/core` consumers
 * see for the BullMQ transport.
 *
 * The class owns no BullMQ resources. Its single collaborator is the
 * injected `BullmqRuntimeService`, and `launch()` forwards every call to
 * that service unchanged. Keeping the strategy this thin means the
 * runtime service can be instantiated and exercised on its own, while
 * this class remains the stable public surface.
 *
 * `name` is taken from `BULLMQ_STRATEGY_NAME`, the constant exported by
 * the runtime service module, so both classes report the same value.
 */
@Injectable()
export class BullMqExecutionStrategy implements IExecutionStrategy {
  /** Strategy identifier; the same constant the runtime service uses. */
  readonly name = BULLMQ_STRATEGY_NAME;

  /** Class-scoped logger. Not referenced by any method of this class today. */
  private readonly logger = new Logger(BullMqExecutionStrategy.name);

  constructor(private readonly runtime: BullmqRuntimeService) {}

  /**
   * Hand the launch request to the runtime service and resolve with
   * whatever it returns.
   *
   * This method performs no work of its own: it does not create or
   * modify any execution row and does not talk to BullMQ directly.
   * Any error raised by `BullmqRuntimeService.launch` rejects the
   * returned promise as-is.
   *
   * @param job Definition of the job being launched.
   * @param params Launch parameters, forwarded untouched.
   * @param ctx Strategy context supplied by the caller, forwarded untouched.
   * @returns The launch result produced by the runtime service.
   */
  async launch(
    job: JobDefinition,
    params: JobParameters,
    ctx: ExecutionStrategyContext,
  ): Promise<LaunchResult> {
    const outcome = await this.runtime.launch(job, params, ctx);
    return outcome;
  }
}
76
+
77
+ /**
78
+ * Re-export the canonical `name` for tests that want to assert
79
+ * on it without importing the runtime service directly.
80
+ */
81
+ export { BULLMQ_STRATEGY_NAME };
@@ -0,0 +1,540 @@
1
+ import {
2
+ Inject,
3
+ Injectable,
4
+ Logger,
5
+ OnApplicationBootstrap,
6
+ OnApplicationShutdown,
7
+ Optional,
8
+ } from '@nestjs/common';
9
+ import { Queue, QueueEvents, Worker, type JobsOptions } from 'bullmq';
10
+
11
+ import {
12
+ type IExecutionStrategy,
13
+ type JobDefinition,
14
+ type BatchObserver,
15
+ type JsonValue,
16
+ type JobRepository,
17
+ JOB_REPOSITORY_TOKEN,
18
+ enforcePartitionIndex,
19
+ validatePartitions,
20
+ } from '@nest-batch/core';
21
+ import { JobExecutor, JobRegistry, NoopBatchObserver, BATCH_EVENT } from '@nest-batch/core';
22
+
23
+ import {
24
+ BULLMQ_MODULE_OPTIONS,
25
+ type ResolvedBullMqModuleOptions,
26
+ } from './module-options';
27
+
28
/**
 * Shape of the `data` object attached to every BullMQ job that
 * `BullmqRuntimeService.launch` enqueues.
 *
 * The payload carries lookup keys only. The worker uses `executionId`
 * to load the execution from the repository and `jobId` to fetch the
 * definition from the registry, which keeps the Redis payload small and
 * avoids serialising the job definition itself.
 */
export interface BullmqJobPayload {
  /** Id used by the worker to load the `JobExecution` from the repository. */
  readonly executionId: string;
  /** Copied from the launch context's `jobExecutionId`; carried alongside `executionId`. */
  readonly jobExecutionId: string;
  /** Id used by the worker to look the `JobDefinition` up in the registry. */
  readonly jobId: string;
  /** Id of the job's start step; the same value is used as the BullMQ job name. */
  readonly stepId: string;
  /**
   * Zero-based partition ordinal.
   *
   * Present only when `launch()` fans a chunk start step out into
   * several BullMQ jobs (`partitions.count >= 2`); omitted when a
   * single job is enqueued for the step.
   */
  readonly partitionIndex?: number;
}
64
+
65
/**
 * Name of the one BullMQ queue shared by the producer, the worker and
 * the queue-events listener in this package.
 *
 * All steps go through this single queue; the BullMQ job `name` (set to
 * the step id at enqueue time) tells the work items apart. Per-step
 * queues are avoided on purpose because steps are discovered at
 * runtime and could not be declared up front.
 *
 * The name is hyphen-separated because BullMQ does not accept a colon
 * (`:`) in queue names — the colon is the separator in its Redis keys.
 */
export const BULLMQ_QUEUE_NAME = 'nest-batch-work';
79
+
80
/**
 * Identifier reported as the `name` of the BullMQ execution strategy.
 *
 * Exposed as a constant so that log output and tests can tell this
 * implementation apart from the earlier `'bullmq-stub'` skeleton.
 */
export const BULLMQ_STRATEGY_NAME = 'bullmq';
86
+
87
+ /**
88
+ * Bridge between the BullMQ `Queue` / `Worker` / `QueueEvents` and
89
+ * the `@nest-batch/core` execution pipeline.
90
+ *
91
+ * Responsibilities (T18 contract):
92
+ * 1. Own the producer / worker connection clients with the
93
+ * role-specific tuning (fail-fast producer, blocking worker).
94
+ * 2. Implement the `IExecutionStrategy` contract: `launch()`
95
+ * enqueues one BullMQ job per step (or one per partition when
96
+ * the start step declares `partitions.count >= 2`) and returns
97
+ * `{ kind: 'enqueued', queueJobId }`. The launch is
98
+ * fire-and-forget — the strategy does NOT block on the worker.
99
+ * 3. Drive the worker lifecycle (`OnApplicationBootstrap` /
100
+ * `OnApplicationShutdown`).
101
+ * 4. Bridge `QueueEvents` `completed` / `failed` / `stalled`
102
+ * into the `BatchObserver` (defaulting to `NoopBatchObserver`).
103
+ * 5. Hand off to `JobExecutor.execute(execution, jobDef)` from
104
+ * inside the worker — Batch Core remains the source of truth
105
+ * for state transitions, skip/retry, checkpoint, restart.
106
+ *
107
+ * Why a single class (not separate `Queue` / `Worker` providers)?
108
+ * - The producer and worker share a `connection` record but
109
+ * carry *different* `ConnectionOptions` (different
110
+ * `maxRetriesPerRequest`, `enableReadyCheck`, ...). Splitting
111
+ * them across providers would force the connection-tuning
112
+ * logic into two places and risk the worker accidentally
113
+ * inheriting the producer's fail-fast config (or vice versa).
114
+ * - Lifecycle is a unit: open producer + worker + events
115
+ * together, close them together in the documented order
116
+ * (workers first, then events, then queues). Centralising
117
+ * this in one class makes the close-order a single source
118
+ * of truth and a single method (`close()`).
119
+ */
120
+ @Injectable()
121
+ export class BullmqRuntimeService
122
+ implements IExecutionStrategy, OnApplicationBootstrap, OnApplicationShutdown
123
+ {
124
+ /**
125
+ * Strategy name. Distinct from the T17 stub's `'bullmq-stub'`
126
+ * so log lines and boundary reports can tell them apart.
127
+ */
128
+ readonly name = BULLMQ_STRATEGY_NAME;
129
+
130
+ private readonly logger = new Logger(BullmqRuntimeService.name);
131
+
132
+ /** BullMQ queue (producer side). */
133
+ private queue: Queue | null = null;
134
+ /** BullMQ worker (consumer side). */
135
+ private worker: Worker<BullmqJobPayload> | null = null;
136
+ /** BullMQ QueueEvents stream listener. */
137
+ private queueEvents: QueueEvents | null = null;
138
+ /**
139
+ * Promise-chain lock for the close path. We capture the first
140
+ * `close()` invocation and short-circuit subsequent ones so a
141
+ * stray double-shutdown (Nest calls `OnApplicationShutdown`
142
+ * once, but tests sometimes do their own) does not race the
143
+ * in-flight close.
144
+ */
145
+ private closePromise: Promise<void> | null = null;
146
+
147
// Dependencies: resolved module options, the job repository (by token),
// the job registry, the job executor, and an optional observer that
// defaults to `NoopBatchObserver` when nothing is supplied.
+ constructor(
148
+ @Inject(BULLMQ_MODULE_OPTIONS)
149
+ private readonly options: ResolvedBullMqModuleOptions,
150
+ @Inject(JOB_REPOSITORY_TOKEN)
151
+ private readonly repository: JobRepository,
152
+ private readonly registry: JobRegistry,
153
+ private readonly jobExecutor: JobExecutor,
// NOTE(review): `observer` has `@Optional()` but no `@Inject(token)`, and
// `BatchObserver` is imported as a type only, so there is presumably no
// runtime token Nest could use to resolve a custom observer here. Confirm
// how the module supplies one (e.g. a factory provider); otherwise the
// `NoopBatchObserver` default is always used.
154
+ @Optional()
155
+ private readonly observer: BatchObserver = new NoopBatchObserver() as BatchObserver,
156
+ ) {}
157
+
158
/**
 * Nest lifecycle hook: create the BullMQ resources this service owns.
 *
 * The queue and the queue-events listener are always built, and the
 * event bridge is attached to the listener. The worker is built only
 * when `options.autoStartWorker` is truthy. With the flag off the
 * service can still enqueue through `launch()`, but nothing in this
 * process consumes the queue — useful for enqueue-only deployments and
 * for producer-side tests.
 *
 * The hook runs at application bootstrap rather than module init so
 * that all other providers have been instantiated first.
 */
onApplicationBootstrap(): void {
  this.queue = this.buildQueue();
  this.queueEvents = this.buildQueueEvents();
  this.attachQueueEventsBridge();

  const banner = `BullmqRuntimeService started: queue="${BULLMQ_QUEUE_NAME}" `;

  if (!this.options.autoStartWorker) {
    // Producer-only mode: no consumer is started in this process.
    this.logger.log(banner + `worker=manual (autoStartWorker=false)`);
    return;
  }

  this.worker = this.buildWorker();
  this.logger.log(banner + `worker=auto, keyPrefix="${this.options.connection.keyPrefix}"`);
}
191
+
192
/**
 * Nest lifecycle hook: release every BullMQ resource via `close()`.
 *
 * The first invocation starts the close sequence and remembers its
 * promise in `closePromise`. Every later invocation returns that same
 * promise instead of starting a second, competing close — this guards
 * against double shutdown (for example a test that calls the hook
 * itself in addition to Nest).
 *
 * @returns The promise of the one and only close sequence.
 */
async onApplicationShutdown(): Promise<void> {
  if (this.closePromise === null) {
    // First caller wins; everyone else awaits the same sequence.
    this.closePromise = this.close();
  }
  return this.closePromise;
}
211
+
212
+ // -----------------------------------------------------------------------
213
+ // IExecutionStrategy
214
+ // -----------------------------------------------------------------------
215
+
216
/**
 * Enqueue the BullMQ work for the job's start step.
 *
 * Two shapes are possible:
 *   - The start step is not a chunk step, or it declares no partitions
 *     (or a `count` below 2): exactly one BullMQ job is added with
 *     `queue.add`, and its payload carries no `partitionIndex`.
 *   - The start step is a chunk step with `partitions.count >= 2`: one
 *     BullMQ job per partition (indices `0 .. count - 1`) is submitted
 *     through a single `queue.addBulk` call. BullMQ documents `addBulk`
 *     as all-or-nothing, so a producer failure cannot leave only some
 *     of the partitions enqueued (which sequential `add` calls could).
 *
 * `validatePartitions` runs before anything is enqueued, so a partition
 * misconfiguration surfaces to the caller without touching Redis.
 *
 * The method only enqueues: it does not wait for a worker and never
 * reads or writes the repository.
 *
 * @param job Definition whose `startStepId` names the step to enqueue.
 * @param _params Launch parameters. Unused here; they are not copied
 *   into the BullMQ payload.
 * @param ctx Ids copied verbatim into every payload.
 * @returns `{ kind: 'enqueued', queueJobId }`, where `queueJobId` is the
 *   id of the last BullMQ job enqueued (the only one when the step is
 *   not partitioned).
 * @throws Error when the queue has not been built yet (or has already
 *   been closed), or when BullMQ hands back a job without an id. Errors
 *   from `validatePartitions` and from the producer propagate unchanged.
 */
async launch(
  job: JobDefinition,
  _params: Record<string, unknown>,
  ctx: { executionId: string; jobExecutionId: string },
): Promise<{ kind: 'enqueued'; queueJobId: string }> {
  const queue = this.queue;
  if (queue === null) {
    throw new Error(
      `[BullmqRuntimeService] launch() called before onApplicationBootstrap — ` +
        'module is not initialized. Did you forget to import BullmqBatchModule?',
    );
  }

  // Only chunk steps can declare partitions; everything else is a
  // single unit of work.
  const stepId = job.startStepId;
  const startStep = job.steps[stepId];
  const partitions = startStep?.kind === 'chunk' ? startStep.partitions : undefined;
  validatePartitions(partitions);
  const partitionCount = partitions?.count ?? 1;
  const isPartitioned = partitionCount >= 2;

  const jobOpts: JobsOptions = {
    attempts: 3,
    backoff: { type: 'exponential', delay: 100, jitter: 0.5 },
    removeOnComplete: { count: 100, age: 3600 },
    removeOnFail: { count: 1000 },
  };

  // `partitionIndex` is added only for partitioned steps so the key is
  // absent (not `undefined`) on plain step payloads.
  const payloadFor = (partitionIndex: number | undefined): BullmqJobPayload => ({
    executionId: ctx.executionId,
    jobExecutionId: ctx.jobExecutionId,
    jobId: job.id,
    stepId,
    ...(partitionIndex !== undefined ? { partitionIndex } : {}),
  });

  // Partition fan-out goes through one atomic bulk call; the
  // single-job path keeps using `add` exactly as before.
  const enqueuedJobs = isPartitioned
    ? await queue.addBulk(
        Array.from({ length: partitionCount }, (_, partitionIndex) => ({
          name: stepId,
          data: payloadFor(partitionIndex),
          opts: jobOpts,
        })),
      )
    : [await queue.add(stepId, payloadFor(undefined), jobOpts)];

  let lastQueueJobId: string | null = null;
  for (const [position, enqueued] of enqueuedJobs.entries()) {
    if (enqueued.id === undefined) {
      // A job without an id means the producer did not really accept
      // it; fail loudly so the caller sees the launch as failed.
      throw new Error(
        `[BullmqRuntimeService] enqueue returned undefined job id (Redis down?)`,
      );
    }
    const qid = String(enqueued.id);
    lastQueueJobId = qid;
    this.logger.debug(
      `Enqueued step "${stepId}" for execution ${ctx.executionId}` +
        (isPartitioned ? ` (partition ${position}/${partitionCount})` : '') +
        ` as BullMQ job ${qid}`,
    );
  }

  if (lastQueueJobId === null) {
    // Defensive: both branches above yield at least one job, so this
    // only fires if a future change breaks that invariant.
    throw new Error(`[BullmqRuntimeService] enqueued zero jobs for execution ${ctx.executionId}`);
  }
  return { kind: 'enqueued', queueJobId: lastQueueJobId };
}
308
+
309
+ // -----------------------------------------------------------------------
310
+ // Construction
311
+ // -----------------------------------------------------------------------
312
+
313
/**
 * Create the producer-side `Queue`.
 *
 * The queue uses the producer connection options, the configured key
 * prefix, and default job options that mirror the per-call options
 * `launch()` passes (so a `queue.add` without explicit options gets the
 * same retry / retention policy).
 *
 * An `error` listener is attached before the queue is returned so that
 * connection-level errors are reported through the Nest logger rather
 * than being left without a handler.
 *
 * @returns The newly constructed queue.
 */
private buildQueue(): Queue {
  const queue = new Queue(BULLMQ_QUEUE_NAME, {
    connection: this.producerConnectionOptions(),
    // Fallback policy for any `queue.add` that omits its own options.
    defaultJobOptions: {
      attempts: 3,
      backoff: { type: 'exponential', delay: 100, jitter: 0.5 },
      removeOnComplete: { count: 100, age: 3600 },
      removeOnFail: { count: 1000 },
    },
    prefix: this.options.connection.keyPrefix,
    // Do not wait for the connection to become ready inside `add`;
    // combined with the fail-fast producer options, an unreachable
    // Redis is reported by the first `add` instead of hanging.
    skipWaitingForReady: true,
    // Skip the server version probe at construction time; with the
    // offline queue disabled it would fail before the client is ready.
    skipVersionCheck: true,
  });

  queue.on('error', (err: Error) => {
    this.logger.warn(`Queue error: ${err.message}`);
  });

  return queue;
}
345
+
346
/**
 * Create the consumer-side `Worker` for the shared queue.
 *
 * Each BullMQ job is processed by passing its `data` payload to
 * `processJob`. The worker uses the worker connection options, the
 * configured key prefix, and a concurrency of 1.
 *
 * An `error` listener is attached before the worker is returned.
 * BullMQ's guidance is to always register one on workers; without it
 * an emitted error has no handler and the worker may stop processing.
 *
 * @returns The newly constructed worker.
 */
private buildWorker(): Worker<BullmqJobPayload> {
  const worker = new Worker<BullmqJobPayload>(
    BULLMQ_QUEUE_NAME,
    async (job) => this.processJob(job.data),
    {
      connection: this.workerConnectionOptions(),
      prefix: this.options.connection.keyPrefix,
      concurrency: 1,
    },
  );

  worker.on('error', (err: Error) => {
    this.logger.error(`Worker error: ${err.message}`, err.stack);
  });

  return worker;
}
357
+
358
/**
 * Create the `QueueEvents` listener for the shared queue.
 *
 * It reuses the worker connection options and the configured key
 * prefix. An `error` listener is attached before the instance is
 * returned so that connection-level errors on the events stream are
 * reported through the Nest logger.
 *
 * @returns The newly constructed queue-events listener.
 */
private buildQueueEvents(): QueueEvents {
  const queueEvents = new QueueEvents(BULLMQ_QUEUE_NAME, {
    connection: this.workerConnectionOptions(),
    prefix: this.options.connection.keyPrefix,
  });

  queueEvents.on('error', (err: Error) => {
    this.logger.warn(`QueueEvents error: ${err.message}`);
  });

  return queueEvents;
}
364
+
365
/**
 * Subscribe to the `QueueEvents` stream and forward three events to
 * the observer through `bridgeEvent`:
 *
 *   - `completed` → `BATCH_EVENT.JOB_COMPLETED`
 *   - `failed`    → `BATCH_EVENT.JOB_FAILED` (with the failure reason)
 *   - `stalled`   → `BATCH_EVENT.JOB_FAILED` (tagged `kind: 'stalled'`)
 *
 * Forwarding is fire-and-forget (`void`); `bridgeEvent` handles its own
 * errors, so a failing observer cannot disturb the event stream.
 * Does nothing when the queue-events listener has not been built.
 */
private attachQueueEventsBridge(): void {
  const events = this.queueEvents;
  if (events === null) {
    return;
  }

  events.on('completed', (args) => {
    void this.bridgeEvent(BATCH_EVENT.JOB_COMPLETED, {
      queueJobId: args.jobId,
      kind: 'completed',
    });
  });

  events.on('failed', (args) => {
    void this.bridgeEvent(BATCH_EVENT.JOB_FAILED, {
      queueJobId: args.jobId,
      kind: 'failed',
      reason: args.failedReason,
    });
  });

  events.on('stalled', (args) => {
    void this.bridgeEvent(BATCH_EVENT.JOB_FAILED, {
      queueJobId: args.jobId,
      kind: 'stalled',
    });
  });
}
387
+
388
/**
 * Deliver one event to the configured observer, never letting an
 * observer failure escape.
 *
 * The event's `jobExecutionId` is filled with the BullMQ queue job id
 * taken from `data.queueJobId` (or the literal `'<unknown>'` when that
 * key is missing) — note that this is the queue job id, not a
 * repository execution id. The whole `data` record is passed along as
 * the event's `data`.
 *
 * @param type One of the `BATCH_EVENT` values.
 * @param data Event details; expected to contain `queueJobId`.
 */
private async bridgeEvent(
  type: (typeof BATCH_EVENT)[keyof typeof BATCH_EVENT],
  data: Record<string, unknown>,
): Promise<void> {
  try {
    const queueJobId = data['queueJobId'] ?? '<unknown>';
    await this.observer.onEvent({
      type,
      timestamp: new Date(),
      jobExecutionId: String(queueJobId),
      data: data as unknown as JsonValue,
    });
  } catch (err) {
    // Observer problems are logged and dropped on purpose.
    const detail = err instanceof Error ? err.message : String(err);
    this.logger.warn(`BatchObserver threw on event ${type}: ${detail}`);
  }
}
405
+
406
+ // -----------------------------------------------------------------------
407
+ // Worker processor — delegated to JobExecutor
408
+ // -----------------------------------------------------------------------
409
+
410
/**
 * Worker-side handler for one BullMQ job.
 *
 * Steps, in order:
 *   1. Load the `JobExecution` for `payload.executionId` from the
 *      repository; a `null` result is turned into an error so BullMQ
 *      records the job as failed.
 *   2. Fetch the `JobDefinition` for `payload.jobId` from the registry.
 *      Any error the registry raises for an unknown id is left to
 *      propagate, again failing the BullMQ job.
 *   3. Hand both to `JobExecutor.execute`, which owns the actual batch
 *      semantics.
 *
 * NOTE(review): only `executionId` and `jobId` are read here;
 * `payload.partitionIndex` is not passed to the executor. Confirm how
 * partition jobs enqueued by `launch()` are meant to be told apart.
 *
 * @param payload The `data` of the BullMQ job being processed.
 * @throws Error when the execution row cannot be found.
 */
private async processJob(payload: BullmqJobPayload): Promise<void> {
  const { executionId, jobId } = payload;

  const execution = await this.repository.getJobExecution(executionId);
  if (execution === null) {
    throw new Error(
      `[BullmqRuntimeService] JobExecution ${executionId} not found in repository`,
    );
  }

  const definition = this.registry.get(jobId);
  await this.jobExecutor.execute(execution, definition);
}
437
+
438
+ // -----------------------------------------------------------------------
439
+ // Connection options
440
+ // -----------------------------------------------------------------------
441
+
442
/**
 * Build the Redis connection options for the producer (`Queue`).
 *
 * Host, port, credentials and db index are copied from the module's
 * connection settings; `tls: true` is added only when the configured
 * `tls` value is truthy. Two fail-fast settings are then applied:
 *
 *   - `enableOfflineQueue: false` — commands are not buffered while
 *     Redis is unreachable, so `Queue.add()` fails instead of
 *     appearing to succeed.
 *   - `maxRetriesPerRequest: 1` — a request is retried at most once,
 *     keeping the launcher call short when Redis is down.
 *
 * @returns A plain options record for the producer connection.
 */
private producerConnectionOptions(): Record<string, unknown> {
  const { host, port, password, username, db, tls } = this.options.connection;
  const tlsPart = tls ? { tls: true } : {};
  return {
    host,
    port,
    password,
    username,
    db,
    ...tlsPart,
    enableOfflineQueue: false,
    maxRetriesPerRequest: 1,
  };
}
472
+
473
/**
 * Build the Redis connection options used by the `Worker` and by the
 * `QueueEvents` listener.
 *
 * Host, port, credentials and db index are copied from the module's
 * connection settings; `tls: true` is added only when the configured
 * `tls` value is truthy. Two settings differ from the producer:
 *
 *   - `maxRetriesPerRequest: null` — no per-request retry limit, which
 *     is what BullMQ expects for connections that issue blocking
 *     commands.
 *   - `enableReadyCheck: false` — the client does not wait for a ready
 *     check before it is considered usable.
 *
 * @returns A plain options record for the consumer-side connections.
 */
private workerConnectionOptions(): Record<string, unknown> {
  const { host, port, password, username, db, tls } = this.options.connection;
  const tlsPart = tls ? { tls: true } : {};
  return {
    host,
    port,
    password,
    username,
    db,
    ...tlsPart,
    maxRetriesPerRequest: null,
    enableReadyCheck: false,
  };
}
497
+
498
+ // -----------------------------------------------------------------------
499
+ // Close
500
+ // -----------------------------------------------------------------------
501
+
502
/**
 * Release the BullMQ resources in a fixed order: worker first, then
 * the queue-events listener, then the queue.
 *
 * Every step is best-effort. A failure while closing one resource is
 * logged as a warning and does not stop the remaining resources from
 * being closed. Each field is reset to `null` once its close attempt
 * has finished, whether it succeeded or not.
 */
private async close(): Promise<void> {
  // Close one resource, downgrading any failure to a warning.
  const closeQuietly = async (
    resource: { close(): Promise<void> },
    failurePrefix: string,
  ): Promise<void> => {
    try {
      await resource.close();
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      this.logger.warn(`${failurePrefix}: ${detail}`);
    }
  };

  const worker = this.worker;
  if (worker !== null) {
    await closeQuietly(worker, 'Worker close failed');
    this.worker = null;
  }

  const queueEvents = this.queueEvents;
  if (queueEvents !== null) {
    await closeQuietly(queueEvents, 'QueueEvents close failed');
    this.queueEvents = null;
  }

  const queue = this.queue;
  if (queue !== null) {
    await closeQuietly(queue, 'Queue close failed');
    this.queue = null;
  }
}
540
+ }