@strav/queue 0.3.30 → 0.3.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@strav/queue",
3
- "version": "0.3.30",
3
+ "version": "0.3.33",
4
4
  "type": "module",
5
5
  "description": "Background job processing and task scheduling for the Strav framework",
6
6
  "license": "MIT",
@@ -28,8 +28,8 @@
28
28
  "./providers/*": "./src/providers/*.ts"
29
29
  },
30
30
  "peerDependencies": {
31
- "@strav/kernel": "0.3.30",
32
- "@strav/database": "0.3.30"
31
+ "@strav/kernel": "0.3.33",
32
+ "@strav/database": "0.3.33"
33
33
  },
34
34
  "scripts": {
35
35
  "test": "bun test tests/",
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Per-handler circuit breaker. Tracks recent failure timestamps in
3
+ * memory; trips the circuit when the failure count within the window
4
+ * exceeds the threshold and pauses dispatch of that handler for
5
+ * `cooldownMs`. Dispatches are auto-resumed once the cooldown expires.
6
+ *
7
+ * Intended defense against retry storms — a handler that consistently
8
+ * fails (stale schema, downed dependency) shouldn't keep eating worker
9
+ * cycles and DB connections. Tripping pushes failed jobs back to the
10
+ * queue with a delay so they retry AFTER the cooldown.
11
+ *
12
+ * State is per-process (in-memory). Multi-worker deployments will each
13
+ * track independently — that's fine; each worker self-pauses without
14
+ * cross-talk, and a handler that's failing for a global reason will
15
+ * trip every worker quickly.
16
+ */
17
+
18
+ import Emitter from '@strav/kernel/events/emitter'
19
+
20
/**
 * Caller-supplied tuning for a handler's circuit breaker. Every field is
 * optional; the stated default applies to any field that is left out.
 */
export interface CircuitBreakerOptions {
  /** Failure count inside the window at which the breaker trips. Default: 10. */
  threshold?: number
  /** Length in ms of the window over which failures are counted. Default: 60_000 (1 min). */
  windowMs?: number
  /** Time in ms dispatch stays paused after a trip. Default: 30_000 (30 s). */
  cooldownMs?: number
}
28
+
29
/** Breaker options with every field present, i.e. after defaults were filled in. */
export interface ResolvedBreakerOptions {
  /** Failure count that trips the breaker. */
  threshold: number
  /** Counting window in ms. */
  windowMs: number
  /** Pause duration in ms after a trip. */
  cooldownMs: number
}
34
+
35
/** In-memory bookkeeping kept for each configured handler. */
interface BreakerState {
  /** Fully resolved settings for this handler. */
  options: ResolvedBreakerOptions
  /** Unix-ms timestamps of recent failures; no particular ordering is enforced. */
  failures: number[]
  /** Unix-ms instant at which the cooldown ends; `null` while the circuit is closed. */
  trippedUntil: number | null
}
40
+
41
/** Fallback settings used for any option the caller does not supply. */
const DEFAULTS: ResolvedBreakerOptions = {
  threshold: 10,
  windowMs: 60_000,
  cooldownMs: 30_000,
}

/** Per-process registry of breaker state, keyed by handler name. */
const breakers: Map<string, BreakerState> = new Map()
48
+
49
/**
 * Register a breaker for a handler, or replace the one already registered.
 *
 * Replacing a breaker discards its recorded failures and any active trip.
 * Options that are omitted, or passed as `undefined`/`null`, fall back to
 * the defaults (threshold 10, windowMs 60_000, cooldownMs 30_000).
 *
 * @param handlerName - Name of the job handler the breaker guards.
 * @param options - Partial tuning; missing fields take their defaults.
 */
export function configureBreaker(handlerName: string, options: CircuitBreakerOptions): void {
  breakers.set(handlerName, {
    // Resolve field by field. Spreading `options` over the defaults would
    // copy an explicit `undefined` (e.g. `{ threshold: cfg.threshold }`)
    // on top of the default, and a breaker with an undefined threshold
    // can never trip because `length >= undefined` is always false.
    options: {
      threshold: options.threshold ?? DEFAULTS.threshold,
      windowMs: options.windowMs ?? DEFAULTS.windowMs,
      cooldownMs: options.cooldownMs ?? DEFAULTS.cooldownMs,
    },
    failures: [],
    trippedUntil: null,
  })
}
57
+
58
/**
 * Drop every registered breaker together with its failure history and
 * trip status. Meant for tests that need a clean slate between cases.
 */
export function resetBreakers(): void {
  breakers.clear()
}
62
+
63
/**
 * Report whether dispatch of a handler is currently paused.
 *
 * When the handler's cooldown is over at `now`, the circuit is closed as
 * a side effect: the trip marker and the failure history are cleared and
 * `queue:circuit_reset` is emitted (only if someone is listening).
 *
 * @param handlerName - Handler to look up.
 * @param now - Current unix-ms time; defaults to `Date.now()`.
 * @returns The cooldown still remaining in ms while the circuit is open,
 *   or `null` when the handler has no breaker or its circuit is closed.
 */
export function checkBreaker(handlerName: string, now: number = Date.now()): number | null {
  const breaker = breakers.get(handlerName)
  const deadline = breaker?.trippedUntil ?? null
  if (!breaker || deadline === null) return null

  if (now >= deadline) {
    // The pause is over: close the circuit and forget old failures so
    // that later failures are counted in a fresh window.
    breaker.trippedUntil = null
    breaker.failures = []

    const hasListeners = Emitter.listenerCount('queue:circuit_reset') > 0
    if (hasListeners) {
      void Emitter.emit('queue:circuit_reset', { handler: handlerName }).catch(() => {})
    }
    return null
  }

  return deadline - now
}
88
+
89
/**
 * Record a failure for a handler and trip its circuit once the number of
 * failures inside `windowMs` reaches `threshold`.
 *
 * The circuit is settled through `checkBreaker` first: if a previous trip's
 * cooldown has already elapsed at `now`, the circuit is closed (emitting
 * `queue:circuit_reset` as usual) and this failure opens a fresh window.
 * Without that step a late failure would report a zero or negative
 * "remaining cooldown" and be appended to a history that the next
 * `checkBreaker` call throws away.
 *
 * @param handlerName - Handler the failure belongs to.
 * @param now - Current unix-ms time; defaults to `Date.now()`.
 * @returns `null` when the handler has no breaker or the threshold has not
 *   been reached; `cooldownMs` when this failure trips the circuit; the
 *   remaining cooldown in ms when the circuit is already open.
 */
export function recordFailure(handlerName: string, now: number = Date.now()): number | null {
  const state = breakers.get(handlerName)
  if (!state) return null

  // Non-null only while the circuit is still open at `now`. An expired
  // trip is cleared here, which also resets `state.failures`.
  const remaining = checkBreaker(handlerName, now)

  // Drop failures that fell out of the window, then count this one.
  const cutoff = now - state.options.windowMs
  state.failures = state.failures.filter(t => t > cutoff)
  state.failures.push(now)

  if (remaining !== null) {
    // Already open: keep the existing deadline rather than extending it.
    return remaining
  }

  if (state.failures.length >= state.options.threshold) {
    state.trippedUntil = now + state.options.cooldownMs
    if (Emitter.listenerCount('queue:circuit_tripped') > 0) {
      // Fire-and-forget: a failing listener must not affect the worker.
      void Emitter.emit('queue:circuit_tripped', {
        handler: handlerName,
        threshold: state.options.threshold,
        windowMs: state.options.windowMs,
        cooldownMs: state.options.cooldownMs,
        trippedUntil: state.trippedUntil,
      }).catch(() => {})
    }
    return state.options.cooldownMs
  }

  return null
}
124
+
125
/**
 * Note a successful run of a handler by wiping its failure history, so
 * that sporadic errors separated by successes never add up to a trip.
 *
 * An open circuit is left untouched: it closes only when its cooldown
 * runs out. Handlers without a breaker are ignored.
 */
export function recordSuccess(handlerName: string): void {
  const breaker = breakers.get(handlerName)
  if (breaker && breaker.trippedUntil === null) {
    breaker.failures = []
  }
}
@@ -7,5 +7,16 @@ export type {
7
7
  JobRecord,
8
8
  FailedJobRecord,
9
9
  JobHandler,
10
+ JobHandlerOptions,
11
+ JobHandlerRegistration,
12
+ JobPayloadSchema,
10
13
  } from './queue.ts'
11
14
  export type { WorkerOptions } from './worker.ts'
15
+ export {
16
+ configureBreaker,
17
+ checkBreaker,
18
+ recordFailure,
19
+ recordSuccess,
20
+ resetBreakers,
21
+ } from './circuit_breaker.ts'
22
+ export type { CircuitBreakerOptions, ResolvedBreakerOptions } from './circuit_breaker.ts'
@@ -3,6 +3,7 @@ import Configuration from '@strav/kernel/config/configuration'
3
3
  import Database from '@strav/database/database/database'
4
4
  import Emitter from '@strav/kernel/events/emitter'
5
5
  import { ConfigurationError } from '@strav/kernel/exceptions/errors'
6
+ import { configureBreaker, type CircuitBreakerOptions } from './circuit_breaker.ts'
6
7
 
7
8
  export interface JobOptions {
8
9
  queue?: string
@@ -26,6 +27,29 @@ export interface JobMeta {
26
27
  job: string
27
28
  attempts: number
28
29
  maxAttempts: number
30
+ /**
31
+ * Report progress for a long-running job. `value` is `0..1`. The reported
32
+ * value is persisted to the job row so external consumers can poll via
33
+ * {@link Queue.progressOf}, and a `queue:progress` event is emitted for
34
+ * live consumers (e.g. SSE).
35
+ *
36
+ * Returns immediately after persisting; safe to call from a tight loop
37
+ * but throttle to avoid hammering the database (e.g. every N rows or
38
+ * every 1 s).
39
+ */
40
+ progress: (value: number, message?: string) => Promise<void>
41
+ }
42
+
43
/** Point-in-time view of a job's progress, as returned by {@link Queue.progressOf}. */
export interface JobProgress {
  /** Identifier of the job. */
  id: number
  /** Most recent value reported by the handler, in the range 0..1. */
  value: number
  /** Human-readable note attached to the most recent update, if any. */
  message: string | null
  /** Number of attempts recorded on the job row. */
  attempts: number
}
30
54
 
31
55
  /** A raw job row from the _strav_jobs table. */
@@ -54,6 +78,46 @@ export interface FailedJobRecord {
54
78
 
55
79
  export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<void>
56
80
 
81
/**
 * The smallest contract a payload validator has to meet: a `parse(input)`
 * method that returns the validated value. Zod, ArkType, Valibot and
 * hand-written validators all fit.
 *
 * The worker runs `parse` when a job is dequeued, before the handler is
 * invoked, so a row edited in the database or a payload written by an
 * older code revision is rejected up front instead of reaching the
 * handler in a half-formed shape.
 */
export interface JobPayloadSchema<T = unknown> {
  parse(input: unknown): T
}
91
+
92
/** Options accepted when registering a handler for a job name. */
export interface JobHandlerOptions<T = any> {
  /**
   * Payload schema, optional. If present, the worker calls
   * `schema.parse(payload)` ahead of the handler, and a parse failure
   * sends the job to `_strav_failed_jobs` carrying the validation message.
   *
   * Worth setting for handlers fed from external sources (HTTP webhook,
   * customer upload) and for handlers whose code changed after older
   * jobs were enqueued: the parse fails fast on drifted payloads before
   * the handler can corrupt state.
   */
  schema?: JobPayloadSchema<T>
  /**
   * Circuit breaker settings for this handler. The breaker trips once
   * the number of failures within `windowMs` reaches `threshold`, and
   * dispatch is then paused for `cooldownMs`. This guards against retry
   * storms, where a handler broken by a stale schema or a downed
   * dependency would otherwise keep consuming worker cycles.
   * Defaults: threshold 10, windowMs 60_000, cooldownMs 30_000.
   */
  circuitBreaker?: CircuitBreakerOptions
}
114
+
115
/** Record kept in `Queue._handlers` for each registered job name. */
export interface JobHandlerRegistration<T = any> {
  /** Function invoked to process the job. */
  handler: JobHandler<T>
  /** Schema the payload is parsed with before the handler runs, if any. */
  schema?: JobPayloadSchema<T>
}
120
+
57
121
  /**
58
122
  * PostgreSQL-backed job queue.
59
123
  *
@@ -72,7 +136,7 @@ export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<
72
136
  export default class Queue {
73
137
  private static _db: Database
74
138
  private static _config: QueueConfig
75
- private static _handlers = new Map<string, JobHandler>()
139
+ private static _handlers = new Map<string, JobHandlerRegistration>()
76
140
 
77
141
  constructor(db: Database, config: Configuration) {
78
142
  Queue._db = db
@@ -97,7 +161,7 @@ export default class Queue {
97
161
  return Queue._config
98
162
  }
99
163
 
100
- static get handlers(): Map<string, JobHandler> {
164
+ static get handlers(): Map<string, JobHandlerRegistration> {
101
165
  return Queue._handlers
102
166
  }
103
167
 
@@ -116,10 +180,23 @@ export default class Queue {
116
180
  "timeout" INT NOT NULL DEFAULT 60000,
117
181
  "available_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(),
118
182
  "reserved_at" TIMESTAMPTZ,
119
- "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW()
183
+ "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(),
184
+ "progress" NUMERIC NOT NULL DEFAULT 0,
185
+ "progress_message" TEXT
120
186
  )
121
187
  `
122
188
 
189
+ // Additive migrations for progress columns — for tables that existed
190
+ // before progress reporting was introduced.
191
+ await sql`
192
+ ALTER TABLE "_strav_jobs"
193
+ ADD COLUMN IF NOT EXISTS "progress" NUMERIC NOT NULL DEFAULT 0
194
+ `
195
+ await sql`
196
+ ALTER TABLE "_strav_jobs"
197
+ ADD COLUMN IF NOT EXISTS "progress_message" TEXT
198
+ `
199
+
123
200
  await sql`
124
201
  CREATE INDEX IF NOT EXISTS "idx_strav_jobs_queue_available"
125
202
  ON "_strav_jobs" ("queue", "available_at")
@@ -138,9 +215,27 @@ export default class Queue {
138
215
  `
139
216
  }
140
217
 
141
- /** Register a handler for a named job. */
142
- static handle<T = any>(name: string, handler: JobHandler<T>): void {
143
- Queue._handlers.set(name, handler)
218
/**
 * Register the handler for a named job.
 *
 * Supplying `options.schema` (Zod / ArkType / etc.) makes the worker
 * validate the payload before the handler is invoked; a payload that
 * fails to parse is routed to `_strav_failed_jobs` rather than being
 * handed to the handler. Supplying `options.circuitBreaker` configures
 * a circuit breaker under the same name.
 *
 * @example
 * import { z } from 'zod'
 * Queue.handle('send-email', async (payload) => { ... }, {
 *   schema: z.object({ to: z.string().email(), subject: z.string() }),
 * })
 */
static handle<T = any>(
  name: string,
  handler: JobHandler<T>,
  options?: JobHandlerOptions<T>
): void {
  const schema = options?.schema
  const breakerOptions = options?.circuitBreaker

  Queue._handlers.set(name, { handler, schema })

  if (breakerOptions) {
    configureBreaker(name, breakerOptions)
  }
}
145
240
 
146
241
  /**
@@ -168,6 +263,57 @@ export default class Queue {
168
263
  return id
169
264
  }
170
265
 
266
/**
 * Persist progress for a job and emit a `queue:progress` event.
 *
 * This backs the `JobMeta.progress` callback that workers hand to
 * handlers, and is exposed statically so other code (e.g. retry replay
 * tools) can update progress directly.
 *
 * @param id - Id of the job row to update.
 * @param value - Progress, clamped to `[0, 1]`. `NaN` (e.g. from `0 / 0`)
 *   is stored as `0`, since it cannot be clamped.
 * @param message - Optional note; stored as `NULL` when omitted.
 * @returns A promise that settles once the row update has finished. The
 *   event is emitted without being awaited.
 */
static async reportProgress(
  id: number,
  value: number,
  message?: string
): Promise<void> {
  const sql = Queue.db.sql
  // Math.min/Math.max propagate NaN, so it has to be handled explicitly
  // or it would be persisted and emitted as the progress value.
  const clamped = Number.isNaN(value) ? 0 : Math.max(0, Math.min(1, value))
  const msg = message ?? null
  await sql`
    UPDATE "_strav_jobs"
    SET "progress" = ${clamped}, "progress_message" = ${msg}
    WHERE "id" = ${id}
  `
  if (Emitter.listenerCount('queue:progress') > 0) {
    // Fire-and-forget: listener errors must not fail the reporting job.
    void Emitter.emit('queue:progress', {
      id,
      value: clamped,
      message: msg,
    }).catch(() => {})
  }
}
293
+
294
/**
 * Look up the most recent progress snapshot stored for a job.
 *
 * @param id - Id of the job to look up.
 * @returns The snapshot, or `null` when no row with that id exists —
 *   either the id is unknown or the job has completed (the row is
 *   deleted on success).
 */
static async progressOf(id: number): Promise<JobProgress | null> {
  const sql = Queue.db.sql
  const matches = await sql`
    SELECT "id", "progress", "progress_message", "attempts"
    FROM "_strav_jobs"
    WHERE "id" = ${id}
    LIMIT 1
  `
  if (matches.length === 0) return null

  // Column values are coerced explicitly; absent numbers fall back to 0.
  const record = matches[0] as Record<string, unknown>
  const snapshot: JobProgress = {
    id: Number(record.id),
    value: Number(record.progress ?? 0),
    message: (record.progress_message as string | null) ?? null,
    attempts: Number(record.attempts ?? 0),
  }
  return snapshot
}
316
+
171
317
  /**
172
318
  * Create a listener function suitable for Emitter.on().
173
319
  * When the event fires, the payload is pushed onto the queue.
@@ -1,5 +1,6 @@
1
1
  import Queue, { hydrateJob } from './queue.ts'
2
2
  import Emitter from '@strav/kernel/events/emitter'
3
+ import { checkBreaker, recordFailure, recordSuccess } from './circuit_breaker.ts'
3
4
  import type { JobRecord, JobMeta } from './queue.ts'
4
5
 
5
6
  export interface WorkerOptions {
@@ -109,26 +110,55 @@ export default class Worker {
109
110
 
110
111
  /** Process a single job: run handler, handle success/failure. */
111
112
  private async process(job: JobRecord): Promise<void> {
112
- const handler = Queue.handlers.get(job.job)
113
+ const registration = Queue.handlers.get(job.job)
113
114
 
114
- if (!handler) {
115
+ if (!registration) {
115
116
  await this.fail(job, new Error(`No handler registered for job "${job.job}"`))
116
117
  return
117
118
  }
118
119
 
120
+ // Q-1: per-handler circuit breaker. If the handler has tripped its
121
+ // breaker (too many failures in the configured window), defer this
122
+ // job rather than running it. Push it back to the queue with
123
+ // `available_at = now + cooldown` so it retries AFTER the breaker
124
+ // resets — this clears the worker to drain unrelated jobs from the
125
+ // queue instead of compounding the failure storm.
126
+ const cooldownRemaining = checkBreaker(job.job)
127
+ if (cooldownRemaining !== null) {
128
+ await this.deferForCooldown(job, cooldownRemaining)
129
+ return
130
+ }
131
+
119
132
  const meta: JobMeta = {
120
133
  id: job.id,
121
134
  queue: job.queue,
122
135
  job: job.job,
123
136
  attempts: job.attempts,
124
137
  maxAttempts: job.maxAttempts,
138
+ progress: (value: number, message?: string) => Queue.reportProgress(job.id, value, message),
139
+ }
140
+
141
+ // Re-parse the payload through the registered schema (CC-5). Catches
142
+ // payloads that drifted from the handler's expected shape — older
143
+ // enqueues, manual DB edits, malicious tampering. A parse failure
144
+ // is fatal: the job goes straight to failed_jobs without retry,
145
+ // because retrying with the same bad payload won't help.
146
+ let payload = job.payload
147
+ if (registration.schema) {
148
+ try {
149
+ payload = registration.schema.parse(job.payload)
150
+ } catch (err) {
151
+ const detail = err instanceof Error ? err.message : String(err)
152
+ await this.fail(job, new Error(`Job "${job.job}" payload failed validation: ${detail}`))
153
+ return
154
+ }
125
155
  }
126
156
 
127
157
  const start = performance.now()
128
158
 
129
159
  try {
130
160
  await Promise.race([
131
- Promise.resolve(handler(job.payload, meta)),
161
+ Promise.resolve(registration.handler(payload, meta)),
132
162
  new Promise<never>((_, reject) =>
133
163
  setTimeout(
134
164
  () => reject(new Error(`Job "${job.job}" timed out after ${job.timeout}ms`)),
@@ -137,6 +167,7 @@ export default class Worker {
137
167
  ),
138
168
  ])
139
169
  await this.complete(job)
170
+ recordSuccess(job.job)
140
171
 
141
172
  if (Emitter.listenerCount('queue:processed') > 0) {
142
173
  const duration = performance.now() - start
@@ -149,6 +180,9 @@ export default class Worker {
149
180
  }
150
181
  } catch (error) {
151
182
  const err = error instanceof Error ? error : new Error(String(error))
183
+ // Update breaker state regardless of retry decision so a job that
184
+ // exhausts its retries still counts toward the trip threshold.
185
+ recordFailure(job.job)
152
186
  if (job.attempts >= job.maxAttempts) {
153
187
  await this.fail(job, err)
154
188
 
@@ -197,6 +231,24 @@ export default class Worker {
197
231
  `
198
232
  }
199
233
 
234
/**
 * Hand a job back to the queue because its handler's circuit is open.
 *
 * The row is released (`reserved_at` cleared) and made available again
 * after the cooldown, with a minimum delay of one second. `attempts` is
 * decremented, never below zero, so that a deferral does not use up
 * retry budget for a job that never actually ran.
 *
 * @param job - The job to put back.
 * @param cooldownMs - Remaining cooldown of the handler's breaker in ms.
 */
private async deferForCooldown(job: JobRecord, cooldownMs: number): Promise<void> {
  const delayMs = Math.max(cooldownMs, 1_000)
  const availableAt = new Date(Date.now() + delayMs)

  await Queue.db.sql`
    UPDATE "_strav_jobs"
    SET "reserved_at" = NULL,
        "available_at" = ${availableAt},
        "attempts" = GREATEST("attempts" - 1, 0)
    WHERE "id" = ${job.id}
  `
}
251
+
200
252
  /** Calculate backoff delay in ms based on attempt number. */
201
253
  backoffDelay(attempts: number): number {
202
254
  if (Queue.config.retryBackoff === 'linear') {