@nest-batch/bullmq 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +333 -0
- package/dist/src/adapters/bullmq.adapter.d.ts +157 -0
- package/dist/src/adapters/bullmq.adapter.d.ts.map +1 -0
- package/dist/src/adapters/bullmq.adapter.js +252 -0
- package/dist/src/adapters/bullmq.adapter.js.map +1 -0
- package/dist/src/adapters/index.d.ts +12 -0
- package/dist/src/adapters/index.d.ts.map +1 -0
- package/dist/src/adapters/index.js +29 -0
- package/dist/src/adapters/index.js.map +1 -0
- package/dist/src/bullmq-execution-strategy.d.ts +59 -0
- package/dist/src/bullmq-execution-strategy.d.ts.map +1 -0
- package/dist/src/bullmq-execution-strategy.js +60 -0
- package/dist/src/bullmq-execution-strategy.js.map +1 -0
- package/dist/src/bullmq-runtime.service.d.ts +237 -0
- package/dist/src/bullmq-runtime.service.d.ts.map +1 -0
- package/dist/src/bullmq-runtime.service.js +441 -0
- package/dist/src/bullmq-runtime.service.js.map +1 -0
- package/dist/src/bullmq-schedule.service.d.ts +121 -0
- package/dist/src/bullmq-schedule.service.d.ts.map +1 -0
- package/dist/src/bullmq-schedule.service.js +232 -0
- package/dist/src/bullmq-schedule.service.js.map +1 -0
- package/dist/src/connection.d.ts +83 -0
- package/dist/src/connection.d.ts.map +1 -0
- package/dist/src/connection.js +72 -0
- package/dist/src/connection.js.map +1 -0
- package/dist/src/index.d.ts +29 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +46 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/module-options.d.ts +68 -0
- package/dist/src/module-options.d.ts.map +1 -0
- package/dist/src/module-options.js +13 -0
- package/dist/src/module-options.js.map +1 -0
- package/package.json +71 -0
- package/src/adapters/bullmq.adapter.ts +346 -0
- package/src/adapters/index.ts +11 -0
- package/src/bullmq-execution-strategy.ts +81 -0
- package/src/bullmq-runtime.service.ts +540 -0
- package/src/bullmq-schedule.service.ts +271 -0
- package/src/connection.ts +97 -0
- package/src/index.ts +28 -0
- package/src/module-options.ts +74 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { Injectable, Logger } from '@nestjs/common';
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
type ExecutionStrategyContext,
|
|
5
|
+
type IExecutionStrategy,
|
|
6
|
+
type JobDefinition,
|
|
7
|
+
type JobParameters,
|
|
8
|
+
type LaunchResult,
|
|
9
|
+
} from '@nest-batch/core';
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
BullmqRuntimeService,
|
|
13
|
+
BULLMQ_STRATEGY_NAME,
|
|
14
|
+
} from './bullmq-runtime.service';
|
|
15
|
+
|
|
16
|
+
/**
 * `@nest-batch/core`-facing execution strategy backed by BullMQ.
 *
 * This class is deliberately thin: it satisfies the
 * `IExecutionStrategy` contract and forwards every launch to
 * `BullmqRuntimeService`, which owns the BullMQ resources (queue,
 * worker, queue-events) and decides how the work is enqueued.
 *
 * Keeping the two apart means:
 *   - the runtime service can be constructed and exercised on its
 *     own, without going through a launcher;
 *   - this class remains the stable public surface for
 *     `EXECUTION_STRATEGY` consumers;
 *   - the strategy name has exactly one definition
 *     (`BULLMQ_STRATEGY_NAME`, declared next to the runtime service).
 */
@Injectable()
export class BullMqExecutionStrategy implements IExecutionStrategy {
  /** Strategy name; the same constant the runtime service exposes. */
  readonly name = BULLMQ_STRATEGY_NAME;

  private readonly logger = new Logger(BullMqExecutionStrategy.name);

  constructor(private readonly runtime: BullmqRuntimeService) {}

  /**
   * Hand the launch over to the runtime service and return whatever
   * it reports back.
   *
   * Per the original design notes, the launcher has already created
   * the `JobExecution` row before calling this method, so nothing is
   * persisted here. Any error raised by the runtime service (for
   * example a producer failure) is propagated unchanged to the caller.
   *
   * @param job    definition of the job being launched
   * @param params launch parameters, forwarded as-is
   * @param ctx    launcher-supplied context, forwarded as-is
   * @returns the runtime service's launch result
   */
  async launch(
    job: JobDefinition,
    params: JobParameters,
    ctx: ExecutionStrategyContext,
  ): Promise<LaunchResult> {
    const outcome = await this.runtime.launch(job, params, ctx);
    return outcome;
  }
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Re-export the canonical `name` for tests that want to assert
|
|
79
|
+
* on it without importing the runtime service directly.
|
|
80
|
+
*/
|
|
81
|
+
export { BULLMQ_STRATEGY_NAME };
|
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
import {
|
|
2
|
+
Inject,
|
|
3
|
+
Injectable,
|
|
4
|
+
Logger,
|
|
5
|
+
OnApplicationBootstrap,
|
|
6
|
+
OnApplicationShutdown,
|
|
7
|
+
Optional,
|
|
8
|
+
} from '@nestjs/common';
|
|
9
|
+
import { Queue, QueueEvents, Worker, type JobsOptions } from 'bullmq';
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
type IExecutionStrategy,
|
|
13
|
+
type JobDefinition,
|
|
14
|
+
type BatchObserver,
|
|
15
|
+
type JsonValue,
|
|
16
|
+
type JobRepository,
|
|
17
|
+
JOB_REPOSITORY_TOKEN,
|
|
18
|
+
enforcePartitionIndex,
|
|
19
|
+
validatePartitions,
|
|
20
|
+
} from '@nest-batch/core';
|
|
21
|
+
import { JobExecutor, JobRegistry, NoopBatchObserver, BATCH_EVENT } from '@nest-batch/core';
|
|
22
|
+
|
|
23
|
+
import {
|
|
24
|
+
BULLMQ_MODULE_OPTIONS,
|
|
25
|
+
type ResolvedBullMqModuleOptions,
|
|
26
|
+
} from './module-options';
|
|
27
|
+
|
|
28
|
+
/**
 * Payload shape stored in a BullMQ job's `data` field.
 *
 * The runtime service enqueues one BullMQ job for a job's start
 * step, or one job per partition when that step is a chunk step
 * declaring `partitions.count >= 2`. The worker reloads the
 * `JobExecution` from the repository via `executionId` and the
 * `JobDefinition` from the registry via `jobId`.
 *
 * Why not store the full `JobDefinition` in the payload?
 *   - IR is mutable across the host process (decorators / builders
 *     may swap providers in tests, hot-reload, etc.). The
 *     repository + registry are the canonical sources; the
 *     payload carries only the keys needed to look them up.
 *   - Storage size — IRs can be large (listeners, resolvers).
 *     Redis is transport, not cache; small payloads are cheaper.
 */
export interface BullmqJobPayload {
  /** JobExecution id, used to load the canonical execution row. */
  readonly executionId: string;
  /** Mirrors `executionId` today; kept distinct for forward compat. */
  readonly jobExecutionId: string;
  /** JobDefinition id, used to look up the IR from the registry. */
  readonly jobId: string;
  /** Step id (also used as the `name` of the BullMQ job). */
  readonly stepId: string;
  /**
   * Zero-based partition index.
   *
   * Present only on jobs enqueued for a partitioned chunk step
   * (`partitions.count >= 2`), where each of the N jobs carries a
   * distinct index in `0..N-1`. Absent for the ordinary
   * one-job-per-step case, so its presence doubles as the
   * "this is a partition" discriminator.
   */
  readonly partitionIndex?: number;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Name of the one BullMQ queue shared by the producer, the worker
 * and the queue-events listener.
 *
 * A single queue is used on purpose. Per-step queues would require
 * the host to declare every step name up front, which conflicts with
 * the decorator / builder APIs that discover steps at runtime.
 * Instead, each enqueued job carries the step id in its BullMQ
 * `name` field, and that field discriminates the work.
 *
 * The name is hyphen-separated because BullMQ 5 rejects queue names
 * containing a colon (`:`), which it uses as the separator in its
 * Redis key layout.
 */
export const BULLMQ_QUEUE_NAME = 'nest-batch-work';
|
|
79
|
+
|
|
80
|
+
/**
 * Identifier of the BullMQ execution strategy.
 *
 * Exposed as the `name` of the strategy so diagnostics and tests
 * can tell the real implementation apart from the earlier T17 stub
 * (which reported `'bullmq-stub'`).
 */
export const BULLMQ_STRATEGY_NAME = 'bullmq';
|
|
86
|
+
|
|
87
|
+
/**
 * Bridge between the BullMQ `Queue` / `Worker` / `QueueEvents` and
 * the `@nest-batch/core` execution pipeline.
 *
 * Responsibilities:
 *   1. Build the producer / worker connection options with
 *      role-specific tuning (fail-fast producer, blocking worker).
 *   2. Implement the `IExecutionStrategy` contract: `launch()`
 *      enqueues one BullMQ job for the start step — or one job per
 *      partition when the start step is a chunk step declaring
 *      `partitions.count >= 2` — and returns
 *      `{ kind: 'enqueued', queueJobId }` without waiting for the
 *      worker.
 *   3. Drive the resource lifecycle (`OnApplicationBootstrap` /
 *      `OnApplicationShutdown`).
 *   4. Bridge `QueueEvents` `completed` / `failed` / `stalled`
 *      into the `BatchObserver` (defaulting to `NoopBatchObserver`).
 *   5. Hand off to `JobExecutor.execute(execution, jobDef)` from
 *      inside the worker — Batch Core remains the source of truth
 *      for state transitions, skip/retry, checkpoint, restart.
 *
 * Why a single class (not separate `Queue` / `Worker` providers)?
 *   - The producer and worker share a `connection` record but
 *     carry *different* connection flags. Keeping both builders
 *     here avoids the worker accidentally inheriting the
 *     producer's fail-fast config (or vice versa).
 *   - Lifecycle is a unit: resources are opened together and
 *     closed together in a fixed order (worker, then events, then
 *     queue), implemented once in `close()`.
 */
@Injectable()
export class BullmqRuntimeService
  implements IExecutionStrategy, OnApplicationBootstrap, OnApplicationShutdown
{
  /**
   * Strategy name. Distinct from the T17 stub's `'bullmq-stub'`
   * so log lines and boundary reports can tell them apart.
   */
  readonly name = BULLMQ_STRATEGY_NAME;

  private readonly logger = new Logger(BullmqRuntimeService.name);

  /** BullMQ queue (producer side); `null` until bootstrap and after close. */
  private queue: Queue | null = null;
  /** BullMQ worker (consumer side); stays `null` when `autoStartWorker` is off. */
  private worker: Worker<BullmqJobPayload> | null = null;
  /** BullMQ QueueEvents stream listener. */
  private queueEvents: QueueEvents | null = null;
  /**
   * Promise of the first shutdown. Later `onApplicationShutdown`
   * calls return this same promise instead of starting a second,
   * racing close.
   */
  private closePromise: Promise<void> | null = null;

  // NOTE(review): `BatchObserver` is a type-only import and the
  // `observer` parameter has no `@Inject(...)` token, so it is unclear
  // how Nest could ever resolve a host-provided observer here; it
  // looks like the `NoopBatchObserver` default always wins. Confirm
  // against the module wiring / the token exported by core.
  constructor(
    @Inject(BULLMQ_MODULE_OPTIONS)
    private readonly options: ResolvedBullMqModuleOptions,
    @Inject(JOB_REPOSITORY_TOKEN)
    private readonly repository: JobRepository,
    private readonly registry: JobRegistry,
    private readonly jobExecutor: JobExecutor,
    @Optional()
    private readonly observer: BatchObserver = new NoopBatchObserver() as BatchObserver,
  ) {}

  /**
   * Nest lifecycle: create the queue and the queue-events listener,
   * wire the observer bridge, and — when `options.autoStartWorker`
   * is set — create the worker.
   *
   * With `autoStartWorker` off the queue still exists, so
   * `launch()` can enqueue, but this process consumes nothing
   * (launcher-only deployments, producer-side tests).
   */
  onApplicationBootstrap(): void {
    this.queue = this.buildQueue();
    this.queueEvents = this.buildQueueEvents();
    this.attachQueueEventsBridge();

    if (this.options.autoStartWorker) {
      this.worker = this.buildWorker();
      this.logger.log(
        `BullmqRuntimeService started: queue="${BULLMQ_QUEUE_NAME}" ` +
          `worker=auto, keyPrefix="${this.options.connection.keyPrefix}"`,
      );
    } else {
      this.logger.log(
        `BullmqRuntimeService started: queue="${BULLMQ_QUEUE_NAME}" ` +
          `worker=manual (autoStartWorker=false)`,
      );
    }
  }

  /**
   * Nest lifecycle: close every BullMQ resource (see `close()` for
   * the order).
   *
   * Idempotent: the first call starts the close and stores its
   * promise; any later call returns that stored promise.
   */
  async onApplicationShutdown(): Promise<void> {
    if (this.closePromise !== null) {
      return this.closePromise;
    }
    this.closePromise = this.close();
    return this.closePromise;
  }

  // -----------------------------------------------------------------------
  // IExecutionStrategy
  // -----------------------------------------------------------------------

  /**
   * Enqueue the job's start step on the BullMQ queue.
   *
   * One BullMQ job is added for the step, unless the start step is
   * a chunk step with `partitions.count >= 2`, in which case one
   * job per partition is added, each carrying its own
   * `partitionIndex`. The method returns once every `add()` has
   * resolved; it does not wait for the worker.
   *
   * Per the original design notes, the launcher has already created
   * the `JobExecution` row (its id arrives in `ctx`), so nothing is
   * persisted here.
   *
   * @param job     job definition; `startStepId` and `steps` are read
   * @param _params launch parameters (unused by this strategy)
   * @param ctx     ids of the execution created by the launcher
   * @returns `{ kind: 'enqueued', queueJobId }`, where `queueJobId`
   *          is the id of the LAST job added (the only one when the
   *          step is not partitioned)
   * @throws Error if called while the queue does not exist (before
   *         bootstrap or after shutdown), if `queue.add` rejects, or
   *         if an added job comes back without an id; also whatever
   *         `validatePartitions` throws for an invalid partition
   *         config
   */
  async launch(
    job: JobDefinition,
    _params: Record<string, unknown>,
    ctx: { executionId: string; jobExecutionId: string },
  ): Promise<{ kind: 'enqueued'; queueJobId: string }> {
    const queue = this.queue;
    if (queue === null) {
      throw new Error(
        `[BullmqRuntimeService] launch() called before onApplicationBootstrap — ` +
          'module is not initialized. Did you forget to import BullmqBatchModule?',
      );
    }

    // Validate the partition config up front so a misconfiguration
    // fails the launch call instead of surfacing later in the worker.
    const stepId = job.startStepId;
    const startStep = job.steps[stepId];
    const partitions = startStep?.kind === 'chunk' ? startStep.partitions : undefined;
    validatePartitions(partitions);
    const partitionCount = partitions?.count ?? 1;

    // `[undefined]` stands for the single, non-partitioned enqueue.
    const partitionOrdinals: Array<number | undefined> =
      partitionCount >= 2 ? Array.from({ length: partitionCount }, (_, i) => i) : [undefined];

    const jobOpts = this.jobOptions();

    // NOTE(review): partitions are added one at a time, so a failure
    // part-way through leaves the earlier partition jobs enqueued even
    // though the launch call rejects. Consider an all-or-nothing enqueue.
    let lastQueueJobId: string | null = null;
    for (const partitionIndex of partitionOrdinals) {
      const payload: BullmqJobPayload = {
        executionId: ctx.executionId,
        jobExecutionId: ctx.jobExecutionId,
        jobId: job.id,
        stepId,
        ...(partitionIndex !== undefined ? { partitionIndex } : {}),
      };
      const enqueued = await queue.add(stepId, payload, jobOpts);
      if (enqueued.id === undefined) {
        // Treat a job without an id as a failed enqueue so the
        // launcher propagates the failure to its caller.
        throw new Error(
          `[BullmqRuntimeService] enqueue returned undefined job id (Redis down?)`,
        );
      }
      const qid = String(enqueued.id);
      lastQueueJobId = qid;
      this.logger.debug(
        `Enqueued step "${stepId}" for execution ${ctx.executionId}` +
          (partitionIndex !== undefined ? ` (partition ${partitionIndex}/${partitionCount})` : '') +
          ` as BullMQ job ${qid}`,
      );
    }
    if (lastQueueJobId === null) {
      // Defensive: `partitionOrdinals` always has at least one entry,
      // so this cannot trigger today. It guards against a future
      // refactor silently enqueueing nothing.
      throw new Error(`[BullmqRuntimeService] enqueued zero jobs for execution ${ctx.executionId}`);
    }
    return { kind: 'enqueued', queueJobId: lastQueueJobId };
  }

  // -----------------------------------------------------------------------
  // Construction
  // -----------------------------------------------------------------------

  /**
   * Retry / retention policy applied to every enqueued job. Returned
   * as a fresh object on each call so the queue defaults and the
   * per-call options never share a mutable reference.
   */
  private jobOptions(): JobsOptions {
    return {
      attempts: 3,
      backoff: { type: 'exponential', delay: 100, jitter: 0.5 },
      removeOnComplete: { count: 100, age: 3600 },
      removeOnFail: { count: 1000 },
    };
  }

  /** Create the producer-side queue and route its `error` events to the logger. */
  private buildQueue(): Queue {
    const queue = new Queue(BULLMQ_QUEUE_NAME, {
      connection: this.producerConnectionOptions(),
      // Fallback for any `queue.add` call made without explicit
      // options; `launch()` always passes the same policy per call.
      defaultJobOptions: this.jobOptions(),
      prefix: this.options.connection.keyPrefix,
      // Per the original design notes: do not wait for the producer
      // connection to become ready, so a dead Redis fails the first
      // `add()` instead of blocking the launcher.
      skipWaitingForReady: true,
      // Per the original design notes: skip BullMQ's server-version
      // probe, which would otherwise fail against a not-yet-ready
      // connection that has the offline queue disabled.
      skipVersionCheck: true,
    });
    queue.on('error', (err) => this.logResourceError('Queue', err));
    return queue;
  }

  /** Create the worker (concurrency 1) and route its `error` events to the logger. */
  private buildWorker(): Worker<BullmqJobPayload> {
    const worker = new Worker<BullmqJobPayload>(
      BULLMQ_QUEUE_NAME,
      async (job) => this.processJob(job.data),
      {
        connection: this.workerConnectionOptions(),
        prefix: this.options.connection.keyPrefix,
        concurrency: 1,
      },
    );
    worker.on('error', (err) => this.logResourceError('Worker', err));
    return worker;
  }

  /** Create the queue-events listener and route its `error` events to the logger. */
  private buildQueueEvents(): QueueEvents {
    const events = new QueueEvents(BULLMQ_QUEUE_NAME, {
      connection: this.workerConnectionOptions(),
      prefix: this.options.connection.keyPrefix,
    });
    events.on('error', (err) => this.logResourceError('QueueEvents', err));
    return events;
  }

  /**
   * Forward `completed` / `failed` / `stalled` queue events to the
   * `BatchObserver`. A stalled job is reported with the
   * `JOB_FAILED` event type and `kind: 'stalled'`. Observer errors
   * are caught inside `bridgeEvent`, so a failing observer cannot
   * break the event stream.
   */
  private attachQueueEventsBridge(): void {
    if (this.queueEvents === null) return;
    this.queueEvents.on('completed', ({ jobId }) => {
      void this.bridgeEvent(BATCH_EVENT.JOB_COMPLETED, { queueJobId: jobId, kind: 'completed' });
    });
    this.queueEvents.on('failed', ({ jobId, failedReason }) => {
      void this.bridgeEvent(BATCH_EVENT.JOB_FAILED, {
        queueJobId: jobId,
        kind: 'failed',
        reason: failedReason,
      });
    });
    this.queueEvents.on('stalled', ({ jobId }) => {
      void this.bridgeEvent(BATCH_EVENT.JOB_FAILED, { queueJobId: jobId, kind: 'stalled' });
    });
  }

  /**
   * Deliver one event to the observer; never rejects. Any error the
   * observer throws is logged as a warning and swallowed.
   */
  private async bridgeEvent(
    type: (typeof BATCH_EVENT)[keyof typeof BATCH_EVENT],
    data: Record<string, unknown>,
  ): Promise<void> {
    try {
      await this.observer.onEvent({
        type,
        timestamp: new Date(),
        // NOTE(review): this is the BullMQ queue job id, not the
        // `JobExecution` id the field name suggests — confirm
        // observers expect that.
        jobExecutionId: String(data['queueJobId'] ?? '<unknown>'),
        data: data as unknown as JsonValue,
      });
    } catch (err) {
      this.logger.warn(`BatchObserver threw on event ${type}: ${this.describeError(err)}`);
    }
  }

  // -----------------------------------------------------------------------
  // Worker processor — delegated to JobExecutor
  // -----------------------------------------------------------------------

  /**
   * Worker entry point. Loads the `JobExecution` from the repository
   * and the `JobDefinition` from the registry, then hands both to
   * `JobExecutor.execute`. All batch semantics live in the executor.
   *
   * @throws Error when the execution row no longer exists; errors
   *         from the registry lookup and from the executor
   *         propagate, so BullMQ records the job as failed.
   */
  private async processJob(payload: BullmqJobPayload): Promise<void> {
    const execution = await this.repository.getJobExecution(payload.executionId);
    if (execution === null) {
      // The row the launcher created is gone (deleted, or the DB was
      // restored without it). Fail the BullMQ job so its retry /
      // failure handling takes over.
      throw new Error(
        `[BullmqRuntimeService] JobExecution ${payload.executionId} not found in repository`,
      );
    }
    // Per the original notes, `JobRegistry.get` throws when the
    // definition is missing; that is left to propagate because a
    // missing definition is a misconfiguration that should be loud.
    const jobDef = this.registry.get(payload.jobId);
    // NOTE(review): `payload.partitionIndex` is not consulted here, so
    // every partition job runs the same `execute(execution, jobDef)`
    // call. `enforcePartitionIndex` is imported by this file but never
    // used — confirm whether it was meant to be applied at this point.
    await this.jobExecutor.execute(execution, jobDef);
  }

  // -----------------------------------------------------------------------
  // Connection options
  // -----------------------------------------------------------------------

  /** Connection fields shared by the producer and the worker. */
  private baseConnectionOptions(): Record<string, unknown> {
    const { host, port, password, username, db, tls } = this.options.connection;
    return {
      host,
      port,
      password,
      username,
      db,
      ...(tls ? { tls: true } : {}),
    };
  }

  /**
   * Producer-side connection tuning (fail fast):
   *
   *   - `enableOfflineQueue: false` — commands are not buffered
   *     while Redis is unreachable, so `Queue.add()` fails instead
   *     of appearing to succeed.
   *   - `maxRetriesPerRequest: 1` — keeps a failing `add()` quick
   *     rather than blocking the launcher on retries.
   */
  private producerConnectionOptions(): Record<string, unknown> {
    return {
      ...this.baseConnectionOptions(),
      enableOfflineQueue: false,
      maxRetriesPerRequest: 1,
    };
  }

  /**
   * Worker-side connection tuning (also used for `QueueEvents`):
   *
   *   - `maxRetriesPerRequest: null` — per the original notes,
   *     required by BullMQ for connections that issue blocking
   *     commands.
   *   - `enableReadyCheck: false` — do not refuse to start while
   *     Redis is mid-failover; ioredis reconnects on its own.
   */
  private workerConnectionOptions(): Record<string, unknown> {
    return {
      ...this.baseConnectionOptions(),
      maxRetriesPerRequest: null,
      enableReadyCheck: false,
    };
  }

  // -----------------------------------------------------------------------
  // Close
  // -----------------------------------------------------------------------

  /**
   * Close all BullMQ resources in order: worker → events → queue.
   * Each step is best-effort — a failure is logged and the
   * remaining resources are still closed. Every field is reset to
   * `null` afterwards.
   */
  private async close(): Promise<void> {
    await this.closeResource('Worker', this.worker);
    this.worker = null;

    await this.closeResource('QueueEvents', this.queueEvents);
    this.queueEvents = null;

    await this.closeResource('Queue', this.queue);
    this.queue = null;
  }

  /** Close one resource if it exists, logging (not throwing) on failure. */
  private async closeResource(
    label: string,
    resource: { close(): Promise<unknown> } | null,
  ): Promise<void> {
    if (resource === null) return;
    try {
      await resource.close();
    } catch (err) {
      this.logger.warn(`${label} close failed: ${this.describeError(err)}`);
    }
  }

  // -----------------------------------------------------------------------
  // Diagnostics
  // -----------------------------------------------------------------------

  /** Log an `error` event emitted by one of the BullMQ resources. */
  private logResourceError(resource: string, err: unknown): void {
    this.logger.error(`${resource} emitted an error: ${this.describeError(err)}`);
  }

  /** Render an unknown thrown value as a log-friendly string. */
  private describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }
}
|