@strav/queue 0.4.31 → 1.0.0-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/worker.ts ADDED
@@ -0,0 +1,351 @@
1
+ /**
2
+ * `Worker` — consumer side of `DatabaseQueue`.
3
+ *
4
+ * The poll loop:
5
+ * 1. Inside one transaction: `SELECT … FOR UPDATE SKIP LOCKED` claims
6
+ * one available row (`available_at <= now() AND reserved_at IS
7
+ * NULL`), then `UPDATE` increments `attempts` + sets
8
+ * `reserved_at = now()`. SKIP LOCKED lets multiple Worker
9
+ * instances poll the same queue concurrently without picking
10
+ * the same row.
11
+ * 2. The transaction COMMITs — the claim is durable. The row stays
12
+ * reserved until the result handling clears it.
13
+ * 3. The Worker constructs the Job via the container and runs
14
+ * `handle(ctx)` with a per-attempt timeout (driven by
15
+ * `AbortSignal.timeout(...)` — handlers that loop should check
16
+ * `ctx.signal.aborted`).
17
+ * 4. On success: DELETE the row.
18
+ * 5. On failure: if `attempts < max_attempts`, schedule a retry —
19
+ * `UPDATE` sets `available_at = now() + backoff` + clears
20
+ * `reserved_at`. Otherwise terminal — INSERT into
21
+ * `strav_failed_jobs` + DELETE from `strav_jobs`, both in a
22
+ * single transaction so the move is atomic. The `queue:retry` /
23
+ * `queue:flush` console commands that act on the failed-jobs
24
+ * table land with `@strav/cli` in M4.
25
+ *
26
+ * Backoff: default exponential (base capped at 300s) with ±25% jitter. Per-
27
+ * job override via `static backoff(attempt)`; per-Worker override
28
+ * via `defaultBackoff`.
29
+ *
30
+ * Graceful shutdown: callers pass an `AbortSignal` to `run()`. The
31
+ * loop checks `signal.aborted` between iterations + before the next
32
+ * sleep, so the worker exits cleanly within one poll-interval window
33
+ * of the abort.
34
+ */
35
+
36
+ import type { Database } from '@strav/database'
37
+ import { type Container, type Logger, ulid } from '@strav/kernel'
38
+ import type { JobContext, JobFailedContext } from './job.ts'
39
+ import type { JobRegistry } from './job_registry.ts'
40
+
41
/**
 * Construction-time settings for a `Worker`.
 *
 * `db`, `registry` and `container` are required; every other field is
 * optional and falls back to the default noted on the field.
 */
export interface WorkerOptions {
  /** Postgres pool the Worker uses both to claim rows and to record outcomes. */
  db: Database

  /** Lookup table that maps a row's `job_name` to its `JobClass`. */
  registry: JobRegistry

  /** Container that builds Job instances (and resolves their `@inject()` deps). */
  container: Container

  /** Sink for control-plane events (claim, retry, fail). Default: a no-op logger. */
  logger?: Logger

  /** Names of the queues to poll. Default: `['default']`. */
  queues?: readonly string[]

  /** How long to sleep, in milliseconds, after a poll that claimed nothing. Default: 1000. */
  pollInterval?: number

  /** Per-attempt timeout in seconds, used when the JobClass sets none. Default: 60. */
  timeoutSeconds?: number

  /** `max_attempts` to assume when neither the JobClass nor the row provides one. Default: 3. */
  defaultAttempts?: number

  /**
   * Delay (seconds) before the next attempt, used when the JobClass has no
   * `backoff` of its own. Default: exponential with jitter.
   */
  defaultBackoff?: (attempt: number) => number
}
61
+
62
/**
 * What happened to the job that `processOne()` claimed. Discriminated on
 * `status`; handy for tests and one-shot runs.
 */
export type JobResult =
  /** The handler finished and the row was deleted. */
  | {
      status: 'completed'
      jobId: string
      jobName: string
      attempts: number
    }
  /** The handler threw and another attempt is scheduled for `nextAt`. */
  | {
      status: 'retried'
      jobId: string
      jobName: string
      attempts: number
      nextAt: Date
    }
  /** The job will not be attempted again; `error` is what ended it. */
  | {
      status: 'failed'
      jobId: string
      jobName: string
      attempts: number
      error: unknown
    }
67
+
68
/**
 * Columns read from `strav_jobs` when a row is claimed. Field names mirror
 * the column names selected by the claim query.
 */
interface JobRow {
  /** Primary key of the job row. */
  id: string
  /** Name of the queue the row was enqueued on. */
  queue: string
  /** Registry key used to look up the JobClass. */
  job_name: string
  /** Job payload, handed to the handler as-is. */
  payload: unknown
  /** Number of attempts started so far. */
  attempts: number
  /** Attempt budget stored on the row. */
  max_attempts: number
}
77
+
78
/**
 * Consumer for the `strav_jobs` table: claims one row at a time, runs the
 * matching Job, then deletes the row, schedules a retry, or moves it to
 * `strav_failed_jobs`.
 */
export class Worker {
  private readonly db: Database
  private readonly registry: JobRegistry
  private readonly container: Container
  private readonly logger: Logger
  private readonly queues: readonly string[]
  private readonly pollInterval: number
  private readonly timeoutSeconds: number
  private readonly defaultAttempts: number
  private readonly defaultBackoff: (attempt: number) => number

  /**
   * @param opts Worker settings; optional fields fall back to a no-op
   *   logger, the `default` queue, a 1000 ms poll interval, a 60 s timeout,
   *   3 attempts and exponential backoff.
   */
  constructor(opts: WorkerOptions) {
    this.db = opts.db
    this.registry = opts.registry
    this.container = opts.container
    this.logger = opts.logger ?? createNoopLogger()
    this.queues = opts.queues ?? ['default']
    this.pollInterval = opts.pollInterval ?? 1000
    this.timeoutSeconds = opts.timeoutSeconds ?? 60
    this.defaultAttempts = opts.defaultAttempts ?? 3
    this.defaultBackoff = opts.defaultBackoff ?? exponentialBackoff
  }

  /**
   * Process one available job.
   *
   * @returns `null` when there is nothing to claim, otherwise a `JobResult`
   *   describing the outcome. Tests and one-shot CLI invocations call this
   *   directly; `run()` calls it in a loop.
   */
  async processOne(): Promise<JobResult | null> {
    const row = await this.claim()
    if (!row) return null

    const jobClass = this.registry.get(row.job_name)
    if (!jobClass) {
      // Unknown job_name → can't deserialize. Delete the row + log —
      // leaving it would block the queue forever (every poll would
      // re-claim + fail). Apps that need to recover unknown rows
      // should snapshot the queue before changing job_names.
      this.logger.error('Worker: unknown job_name, deleting row', {
        jobId: row.id,
        jobName: row.job_name,
      })
      await this.deleteRow(row.id)
      return {
        status: 'failed',
        jobId: row.id,
        jobName: row.job_name,
        attempts: row.attempts,
        error: new Error(`unknown job_name "${row.job_name}"`),
      }
    }

    const job = this.container.make(jobClass)
    // The timeout is cooperative: the signal aborts after `timeoutMs`, and
    // it is up to the handler to observe `ctx.signal`.
    const timeoutMs = (jobClass.timeout ?? this.timeoutSeconds) * 1000
    const signal = AbortSignal.timeout(timeoutMs)

    const ctx: JobContext = {
      jobId: row.id,
      attempt: row.attempts,
      payload: row.payload,
      signal,
      log: this.logger,
    }

    try {
      await job.handle(ctx)
      await this.deleteRow(row.id)
      return {
        status: 'completed',
        jobId: row.id,
        jobName: row.job_name,
        attempts: row.attempts,
      }
    } catch (error) {
      // Best-effort failed() hook — runs on every failed attempt
      // (intermediate + terminal). A throw here is logged but doesn't
      // change the retry decision; the hook is a notification, not a
      // control point.
      if (job.failed) {
        const failedCtx: JobFailedContext = { ...ctx, error }
        try {
          await job.failed(failedCtx)
        } catch (hookError) {
          this.logger.error('Worker: failed() hook threw', {
            jobId: row.id,
            jobName: row.job_name,
            error: hookError,
          })
        }
      }

      const maxAttempts = jobClass.maxAttempts ?? row.max_attempts ?? this.defaultAttempts
      if (row.attempts >= maxAttempts) {
        // Terminal — atomically move the row to `strav_failed_jobs`
        // so apps can triage what blew up.
        this.logger.error('Worker: job terminal failure', {
          jobId: row.id,
          jobName: row.job_name,
          attempts: row.attempts,
        })
        await this.moveToFailed(row, error)
        return {
          status: 'failed',
          jobId: row.id,
          jobName: row.job_name,
          attempts: row.attempts,
          error,
        }
      }

      const backoff = jobClass.backoff ?? this.defaultBackoff
      const delaySeconds = this.resolveDelaySeconds(backoff, row.attempts)
      await this.scheduleRetry(row.id, delaySeconds)
      this.logger.warn('Worker: job retry scheduled', {
        jobId: row.id,
        jobName: row.job_name,
        attempts: row.attempts,
        delaySeconds,
      })
      return {
        status: 'retried',
        jobId: row.id,
        jobName: row.job_name,
        attempts: row.attempts,
        nextAt: new Date(Date.now() + delaySeconds * 1000),
      }
    }
  }

  /**
   * Run the poll loop until `signal` aborts. Each iteration calls
   * `processOne()`; an empty poll, or a poll that throws, is followed by an
   * abort-aware sleep of `pollInterval` ms.
   *
   * @param signal Aborting it stops the loop after the current iteration.
   */
  async run(signal: AbortSignal): Promise<void> {
    while (!signal.aborted) {
      try {
        const result = await this.processOne()
        if (result === null) {
          await sleep(this.pollInterval, signal)
        }
      } catch (loopError) {
        // Polling itself failed (network blip, DB restart). Log + sleep
        // before retrying — without the sleep, a persistent failure
        // would burn CPU.
        this.logger.error('Worker: poll iteration failed', { error: loopError })
        await sleep(this.pollInterval, signal)
      }
    }
  }

  /**
   * Claim one row inside a single transaction: `SELECT … FOR UPDATE SKIP
   * LOCKED` picks an available, unreserved row, then `UPDATE` marks it
   * reserved and increments `attempts`.
   *
   * @returns The claimed row with `attempts` already incremented, or `null`
   *   when no row matched.
   */
  private async claim(): Promise<JobRow | null> {
    return this.db.transaction(async (tx) => {
      const row = await tx.queryOne<JobRow>(
        `SELECT id, queue, job_name, payload, attempts, max_attempts
           FROM "strav_jobs"
          WHERE queue = ANY($1::text[])
            AND available_at <= now()
            AND reserved_at IS NULL
          ORDER BY id
          LIMIT 1
          FOR UPDATE SKIP LOCKED`,
        [this.queues],
      )
      if (!row) return null
      await tx.execute(
        `UPDATE "strav_jobs"
            SET reserved_at = now(), attempts = attempts + 1, updated_at = now()
          WHERE id = $1`,
        [row.id],
      )
      // Reflect the increment in the returned row so the caller's
      // attempt counter matches what's in the DB.
      return { ...row, attempts: Number(row.attempts) + 1 }
    })
  }

  /** Delete the job row with the given id from `strav_jobs`. */
  private async deleteRow(id: string): Promise<void> {
    await this.db.execute(`DELETE FROM "strav_jobs" WHERE id = $1`, [id])
  }

  /**
   * Move a terminal-failure row to `strav_failed_jobs`. INSERT + DELETE run
   * in one transaction so the row cannot end up in both tables or neither.
   * The `exception` column stores the stack when `error` is an `Error`
   * (falling back to `name: message`), otherwise `String(error)`.
   */
  private async moveToFailed(row: JobRow, error: unknown): Promise<void> {
    const exception =
      error instanceof Error ? (error.stack ?? `${error.name}: ${error.message}`) : String(error)
    await this.db.transaction(async (tx) => {
      await tx.execute(
        `INSERT INTO "strav_failed_jobs"
           (id, queue, job_name, payload, exception, attempts, failed_at, created_at, updated_at)
         VALUES ($1, $2, $3, $4::jsonb, $5, $6, now(), now(), now())`,
        [ulid(), row.queue, row.job_name, JSON.stringify(row.payload), exception, row.attempts],
      )
      await tx.execute(`DELETE FROM "strav_jobs" WHERE id = $1`, [row.id])
    })
  }

  /**
   * Turn a backoff function's result into a usable, non-negative delay.
   *
   * A non-finite result (`NaN`, `±Infinity`, or a non-number) cannot be
   * stored as an interval; if the retry UPDATE failed on it, the row would
   * stay reserved. In that case the Worker's `defaultBackoff` is used, and
   * if that is also non-finite the delay is 0.
   */
  private resolveDelaySeconds(backoff: (attempt: number) => number, attempt: number): number {
    const requested = backoff(attempt)
    if (Number.isFinite(requested)) return Math.max(0, requested)

    this.logger.warn('Worker: backoff returned a non-finite delay, using fallback', {
      attempt,
      requested,
    })
    const fallback = this.defaultBackoff(attempt)
    return Number.isFinite(fallback) ? Math.max(0, fallback) : 0
  }

  /**
   * Release the reservation and make the row available again after
   * `delaySeconds`. The delay is sent as a bound parameter rather than
   * spliced into the SQL text.
   */
  private async scheduleRetry(id: string, delaySeconds: number): Promise<void> {
    await this.db.execute(
      `UPDATE "strav_jobs"
          SET available_at = now() + ($2::double precision * interval '1 second'),
              reserved_at = NULL,
              updated_at = now()
        WHERE id = $1`,
      [id, delaySeconds],
    )
  }
}
301
+
302
/**
 * Default retry delay, in whole seconds: an exponential base of
 * `2 ** attempt` capped at 300, plus uniform jitter of up to ±25% of
 * that base, rounded and floored at 1.
 *
 *   attempt=1 → ~2s
 *   attempt=2 → ~4s
 *   attempt=3 → ~8s
 *   attempt=4 → ~16s
 *   attempt=5 → ~32s
 *   …
 *   attempt=9+ → ~300s (base clamped)
 *
 * The cap is applied to the base before the jitter is added, so once the
 * base saturates the returned value ranges over roughly 225–375 seconds.
 *
 * The jitter spreads out retries of jobs that failed together (e.g.
 * during a downstream service blip) so they don't all come back at once.
 */
function exponentialBackoff(attempt: number): number {
  const cappedBase = Math.min(300, Math.pow(2, attempt))
  // Uniform factor in [-1, 1), scaled to a quarter of the base.
  const spread = (Math.random() * 2 - 1) * cappedBase * 0.25
  const rounded = Math.round(cappedBase + spread)
  return rounded < 1 ? 1 : rounded
}
321
+
322
/**
 * Abort-aware sleep.
 *
 * Resolves (never rejects) as soon as either `ms` milliseconds have passed
 * or `signal` aborts, whichever comes first. If `signal` is already aborted
 * the promise resolves immediately and no timer is started.
 *
 * The `abort` listener is detached on both exit paths. Without that, every
 * sleep that ends by timer would leave a listener behind on the signal.
 *
 * @param ms Maximum time to wait, in milliseconds.
 * @param signal Signal whose abort cuts the wait short.
 */
function sleep(ms: number, signal: AbortSignal): Promise<void> {
  return new Promise<void>((resolve) => {
    if (signal.aborted) {
      resolve()
      return
    }
    const timer = setTimeout(() => {
      // Timer won the race: drop the abort listener so it isn't leaked.
      signal.removeEventListener('abort', onAbort)
      resolve()
    }, ms)
    const onAbort = () => {
      clearTimeout(timer)
      signal.removeEventListener('abort', onAbort)
      resolve()
    }
    signal.addEventListener('abort', onAbort, { once: true })
  })
}
338
+
339
/**
 * Build a Logger whose level methods all do nothing and return
 * `undefined`. `child()` hands back another, independent no-op logger.
 */
function createNoopLogger(): Logger {
  const swallow = () => undefined
  const silent = {
    debug: swallow,
    info: swallow,
    warn: swallow,
    error: swallow,
    fatal: swallow,
    trace: swallow,
    child: () => createNoopLogger(),
  }
  // Only the members above are provided, hence the cast to the full Logger type.
  return silent as unknown as Logger
}
@@ -1,3 +0,0 @@
1
- export { default as QueueProvider } from './queue_provider.ts'
2
-
3
- export type { QueueProviderOptions } from './queue_provider.ts'
@@ -1,29 +0,0 @@
1
- import ServiceProvider from '@strav/kernel/core/service_provider'
2
- import type Application from '@strav/kernel/core/application'
3
- import Queue from '../queue/queue.ts'
4
-
5
- export interface QueueProviderOptions {
6
- /** Whether to auto-create the jobs tables. Default: `true` */
7
- ensureTables?: boolean
8
- }
9
-
10
- export default class QueueProvider extends ServiceProvider {
11
- readonly name = 'queue'
12
- override readonly dependencies = ['database']
13
-
14
- constructor(private options?: QueueProviderOptions) {
15
- super()
16
- }
17
-
18
- override register(app: Application): void {
19
- app.singleton(Queue)
20
- }
21
-
22
- override async boot(app: Application): Promise<void> {
23
- app.resolve(Queue)
24
-
25
- if (this.options?.ensureTables !== false) {
26
- await Queue.ensureTables()
27
- }
28
- }
29
- }
@@ -1,135 +0,0 @@
1
- /**
2
- * Per-handler circuit breaker. Tracks recent failure timestamps in
3
- * memory; trips the circuit when the failure count within the window
4
- * exceeds the threshold and pauses dispatch of that handler for
5
- * `cooldownMs`. Dispatches are auto-resumed once the cooldown expires.
6
- *
7
- * Intended defense against retry storms — a handler that consistently
8
- * fails (stale schema, downed dependency) shouldn't keep eating worker
9
- * cycles and DB connections. Tripping pushes failed jobs back to the
10
- * queue with a delay so they retry AFTER the cooldown.
11
- *
12
- * State is per-process (in-memory). Multi-worker deployments will each
13
- * track independently — that's fine; each worker self-pauses without
14
- * cross-talk, and a handler that's failing for a global reason will
15
- * trip every worker quickly.
16
- */
17
-
18
- import Emitter from '@strav/kernel/events/emitter'
19
-
20
- export interface CircuitBreakerOptions {
21
- /** Number of failures within the window that trips the breaker. Default: 10. */
22
- threshold?: number
23
- /** Window in ms over which failures are counted. Default: 60_000 (1 min). */
24
- windowMs?: number
25
- /** Cooldown in ms after tripping before retry resumes. Default: 30_000 (30 s). */
26
- cooldownMs?: number
27
- }
28
-
29
- export interface ResolvedBreakerOptions {
30
- threshold: number
31
- windowMs: number
32
- cooldownMs: number
33
- }
34
-
35
- interface BreakerState {
36
- options: ResolvedBreakerOptions
37
- failures: number[] // unix-ms timestamps, recent-first not enforced
38
- trippedUntil: number | null // unix-ms; null when closed
39
- }
40
-
41
- const DEFAULTS: ResolvedBreakerOptions = {
42
- threshold: 10,
43
- windowMs: 60_000,
44
- cooldownMs: 30_000,
45
- }
46
-
47
- const breakers = new Map<string, BreakerState>()
48
-
49
- /** Register / update a breaker for a handler. */
50
- export function configureBreaker(handlerName: string, options: CircuitBreakerOptions): void {
51
- breakers.set(handlerName, {
52
- options: { ...DEFAULTS, ...options },
53
- failures: [],
54
- trippedUntil: null,
55
- })
56
- }
57
-
58
- /** Forget all breaker state. Test-only. */
59
- export function resetBreakers(): void {
60
- breakers.clear()
61
- }
62
-
63
- /**
64
- * Check if a handler is currently tripped. Returns the remaining
65
- * cooldown in ms (>= 0) when tripped, or `null` when the circuit is
66
- * closed (handler is dispatchable). Auto-resets state when the
67
- * cooldown has elapsed and emits `queue:circuit_reset` once on
68
- * transition.
69
- */
70
- export function checkBreaker(handlerName: string, now: number = Date.now()): number | null {
71
- const state = breakers.get(handlerName)
72
- if (!state) return null
73
- if (state.trippedUntil === null) return null
74
-
75
- if (now >= state.trippedUntil) {
76
- // Cooldown expired — close the circuit. Reset failure history so
77
- // the next set of failures starts a fresh window.
78
- state.trippedUntil = null
79
- state.failures = []
80
- if (Emitter.listenerCount('queue:circuit_reset') > 0) {
81
- void Emitter.emit('queue:circuit_reset', { handler: handlerName }).catch(() => {})
82
- }
83
- return null
84
- }
85
-
86
- return state.trippedUntil - now
87
- }
88
-
89
- /**
90
- * Record a failure for a handler. Trips the circuit when the failure
91
- * count within `windowMs` reaches `threshold`. Returns the new cooldown
92
- * (ms) when tripping, or `null` when the threshold is not yet reached.
93
- */
94
- export function recordFailure(handlerName: string, now: number = Date.now()): number | null {
95
- const state = breakers.get(handlerName)
96
- if (!state) return null
97
-
98
- // Drop failures outside the window then push the new one.
99
- const cutoff = now - state.options.windowMs
100
- state.failures = state.failures.filter(t => t > cutoff)
101
- state.failures.push(now)
102
-
103
- if (state.trippedUntil !== null) {
104
- // Already tripped — do nothing.
105
- return state.trippedUntil - now
106
- }
107
-
108
- if (state.failures.length >= state.options.threshold) {
109
- state.trippedUntil = now + state.options.cooldownMs
110
- if (Emitter.listenerCount('queue:circuit_tripped') > 0) {
111
- void Emitter.emit('queue:circuit_tripped', {
112
- handler: handlerName,
113
- threshold: state.options.threshold,
114
- windowMs: state.options.windowMs,
115
- cooldownMs: state.options.cooldownMs,
116
- trippedUntil: state.trippedUntil,
117
- }).catch(() => {})
118
- }
119
- return state.options.cooldownMs
120
- }
121
-
122
- return null
123
- }
124
-
125
- /**
126
- * Record a success — clears the failure history for this handler so
127
- * intermittent errors don't accumulate. Does NOT close a tripped
128
- * circuit (only the cooldown expiry does).
129
- */
130
- export function recordSuccess(handlerName: string): void {
131
- const state = breakers.get(handlerName)
132
- if (!state) return
133
- if (state.trippedUntil !== null) return
134
- state.failures = []
135
- }
@@ -1,22 +0,0 @@
1
- export { default as Queue } from './queue.ts'
2
- export { default as Worker } from './worker.ts'
3
- export type {
4
- JobOptions,
5
- QueueConfig,
6
- JobMeta,
7
- JobRecord,
8
- FailedJobRecord,
9
- JobHandler,
10
- JobHandlerOptions,
11
- JobHandlerRegistration,
12
- JobPayloadSchema,
13
- } from './queue.ts'
14
- export type { WorkerOptions } from './worker.ts'
15
- export {
16
- configureBreaker,
17
- checkBreaker,
18
- recordFailure,
19
- recordSuccess,
20
- resetBreakers,
21
- } from './circuit_breaker.ts'
22
- export type { CircuitBreakerOptions, ResolvedBreakerOptions } from './circuit_breaker.ts'