@strav/queue 0.3.32 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@strav/queue",
3
- "version": "0.3.32",
3
+ "version": "0.4.0",
4
4
  "type": "module",
5
5
  "description": "Background job processing and task scheduling for the Strav framework",
6
6
  "license": "MIT",
@@ -28,8 +28,8 @@
28
28
  "./providers/*": "./src/providers/*.ts"
29
29
  },
30
30
  "peerDependencies": {
31
- "@strav/kernel": "0.3.32",
32
- "@strav/database": "0.3.32"
31
+ "@strav/kernel": "0.4.0",
32
+ "@strav/database": "0.4.0"
33
33
  },
34
34
  "scripts": {
35
35
  "test": "bun test tests/",
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Per-handler circuit breaker. Tracks recent failure timestamps in
3
+ * memory; trips the circuit when the failure count within the window
4
+ * reaches the threshold and pauses dispatch of that handler for
5
+ * `cooldownMs`. Dispatches are auto-resumed once the cooldown expires.
6
+ *
7
+ * Intended defense against retry storms — a handler that consistently
8
+ * fails (stale schema, downed dependency) shouldn't keep eating worker
9
+ * cycles and DB connections. Tripping pushes failed jobs back to the
10
+ * queue with a delay so they retry AFTER the cooldown.
11
+ *
12
+ * State is per-process (in-memory). Multi-worker deployments will each
13
+ * track independently — that's fine; each worker self-pauses without
14
+ * cross-talk, and a handler that's failing for a global reason will
15
+ * trip every worker quickly.
16
+ */
17
+
18
+ import Emitter from '@strav/kernel/events/emitter'
19
+
20
/**
 * Caller-facing tuning knobs for a handler's circuit breaker. Every field
 * is optional; a field left out is filled in from the module defaults
 * when the breaker is configured.
 */
export interface CircuitBreakerOptions {
  /**
   * How many failures inside the window trip the breaker. The breaker
   * trips as soon as the count reaches this value. Default: 10.
   */
  threshold?: number
  /**
   * Length, in milliseconds, of the sliding window over which failures
   * are counted. Default: 60_000 (one minute).
   */
  windowMs?: number
  /**
   * How long, in milliseconds, the breaker stays open after tripping
   * before dispatch resumes. Default: 30_000 (thirty seconds).
   */
  cooldownMs?: number
}
28
+
29
/**
 * Fully populated breaker settings: the same three fields as
 * `CircuitBreakerOptions`, but none of them optional. This is the shape
 * stored alongside each breaker's runtime state.
 */
export interface ResolvedBreakerOptions {
  /** Failure count at which the breaker trips. */
  threshold: number
  /** Size of the failure-counting window, in milliseconds. */
  windowMs: number
  /** Time the breaker stays open once tripped, in milliseconds. */
  cooldownMs: number
}
34
+
35
/** Mutable, in-memory bookkeeping kept for one handler's breaker. */
interface BreakerState {
  /** Settings this breaker was configured with. */
  options: ResolvedBreakerOptions
  /**
   * Unix-ms timestamps of recorded failures. Entries are appended as
   * they are recorded; no particular ordering is enforced.
   */
  failures: number[]
  /**
   * Unix-ms instant until which the circuit is open, or `null` while
   * the circuit is closed.
   */
  trippedUntil: number | null
}
40
+
41
/** Fallback settings applied to any option the caller leaves out. */
const DEFAULTS: ResolvedBreakerOptions = { threshold: 10, windowMs: 60_000, cooldownMs: 30_000 }

/**
 * Per-process registry of breaker state, keyed by handler name. Handlers
 * without an entry here are never paused by the breaker functions.
 */
const breakers = new Map<string, BreakerState>()
48
+
49
/**
 * Register a circuit breaker for a handler, or replace the one already
 * registered under that name. Replacing a breaker discards its recorded
 * failures and closes the circuit.
 *
 * Each option falls back to its default when it is missing OR explicitly
 * `undefined`/`null`. A plain object spread would copy an explicit
 * `undefined` over the default, which silently breaks the breaker: an
 * undefined threshold never trips, and an undefined cooldown produces a
 * `NaN` deadline that never expires.
 *
 * @param handlerName - Job name the breaker applies to.
 * @param options - Partial settings; omitted fields use the defaults
 *   (threshold 10, windowMs 60_000, cooldownMs 30_000).
 */
export function configureBreaker(handlerName: string, options: CircuitBreakerOptions): void {
  breakers.set(handlerName, {
    options: {
      // `??` (not `||`) so that an explicit 0 is still honoured.
      threshold: options.threshold ?? DEFAULTS.threshold,
      windowMs: options.windowMs ?? DEFAULTS.windowMs,
      cooldownMs: options.cooldownMs ?? DEFAULTS.cooldownMs,
    },
    failures: [],
    trippedUntil: null,
  })
}
57
+
58
/**
 * Drop every registered breaker together with its failure history and
 * trip state. Meant for tests that need a clean slate between cases.
 */
export function resetBreakers(): void {
  breakers.clear()
}
62
+
63
/**
 * Report whether a handler's circuit is currently open.
 *
 * When the cooldown deadline has been reached, the circuit is closed as
 * a side effect: the trip marker and the failure history are cleared,
 * and `queue:circuit_reset` is emitted (only if someone is listening).
 * Because the marker is cleared, that event fires once per trip.
 *
 * @param handlerName - Job name to look up.
 * @param now - Current time in unix ms; injectable for tests.
 * @returns The time left on the cooldown in ms while the circuit is
 *   open, or `null` when no breaker is registered for the handler or
 *   the circuit is closed (the handler may be dispatched).
 */
export function checkBreaker(handlerName: string, now: number = Date.now()): number | null {
  const breaker = breakers.get(handlerName)
  if (breaker === undefined || breaker.trippedUntil === null) return null

  const deadline = breaker.trippedUntil
  const cooldownElapsed = now >= deadline
  if (!cooldownElapsed) return deadline - now

  // Deadline reached: close the circuit and start from an empty failure
  // history so the next run of failures is counted in a fresh window.
  breaker.trippedUntil = null
  breaker.failures = []

  const hasListeners = Emitter.listenerCount('queue:circuit_reset') > 0
  if (hasListeners) {
    // Fire-and-forget; a failing listener must not affect dispatch.
    void Emitter.emit('queue:circuit_reset', { handler: handlerName }).catch(() => {})
  }
  return null
}
88
+
89
/**
 * Record a failure for a handler and trip its circuit once the number of
 * failures inside `windowMs` reaches `threshold`.
 *
 * If the handler is still marked as tripped but its cooldown has already
 * elapsed (no `checkBreaker` call happened in between), the circuit is
 * closed here first, emitting `queue:circuit_reset` when there are
 * listeners. Without this, the call would return a negative "remaining
 * cooldown", and the failure would be wiped by the next `checkBreaker`
 * reset instead of counting toward a new trip.
 *
 * @param handlerName - Job name the failure belongs to.
 * @param now - Current time in unix ms; injectable for tests.
 * @returns The full cooldown (ms) when this failure trips the circuit;
 *   the remaining cooldown (ms, greater than 0) when the circuit is
 *   already open; or `null` when no breaker is registered for the
 *   handler or the threshold has not been reached yet.
 */
export function recordFailure(handlerName: string, now: number = Date.now()): number | null {
  const state = breakers.get(handlerName)
  if (!state) return null

  // Stale trip: the cooldown is over but nobody has closed the circuit yet.
  // Close it now so this failure starts a fresh window.
  if (state.trippedUntil !== null && now >= state.trippedUntil) {
    state.trippedUntil = null
    state.failures = []
    if (Emitter.listenerCount('queue:circuit_reset') > 0) {
      void Emitter.emit('queue:circuit_reset', { handler: handlerName }).catch(() => {})
    }
  }

  // Drop failures that fell out of the window, then add this one.
  const cutoff = now - state.options.windowMs
  state.failures = state.failures.filter(t => t > cutoff)
  state.failures.push(now)

  if (state.trippedUntil !== null) {
    // Circuit is open and still cooling down: the failure is recorded,
    // but the existing deadline is left untouched.
    return state.trippedUntil - now
  }

  if (state.failures.length >= state.options.threshold) {
    state.trippedUntil = now + state.options.cooldownMs
    if (Emitter.listenerCount('queue:circuit_tripped') > 0) {
      // Fire-and-forget; a failing listener must not affect the worker.
      void Emitter.emit('queue:circuit_tripped', {
        handler: handlerName,
        threshold: state.options.threshold,
        windowMs: state.options.windowMs,
        cooldownMs: state.options.cooldownMs,
        trippedUntil: state.trippedUntil,
      }).catch(() => {})
    }
    return state.options.cooldownMs
  }

  return null
}
124
+
125
/**
 * Record a successful run for a handler. While the circuit is closed
 * this wipes the failure history, so scattered errors separated by
 * successes do not add up to a trip. An open circuit is left untouched:
 * this function neither closes it nor clears its history.
 *
 * @param handlerName - Job name that just succeeded. Unknown names are
 *   ignored.
 */
export function recordSuccess(handlerName: string): void {
  const breaker = breakers.get(handlerName)
  if (breaker && breaker.trippedUntil === null) {
    breaker.failures = []
  }
}
@@ -7,5 +7,16 @@ export type {
7
7
  JobRecord,
8
8
  FailedJobRecord,
9
9
  JobHandler,
10
+ JobHandlerOptions,
11
+ JobHandlerRegistration,
12
+ JobPayloadSchema,
10
13
  } from './queue.ts'
11
14
  export type { WorkerOptions } from './worker.ts'
15
+ export {
16
+ configureBreaker,
17
+ checkBreaker,
18
+ recordFailure,
19
+ recordSuccess,
20
+ resetBreakers,
21
+ } from './circuit_breaker.ts'
22
+ export type { CircuitBreakerOptions, ResolvedBreakerOptions } from './circuit_breaker.ts'
@@ -3,6 +3,7 @@ import Configuration from '@strav/kernel/config/configuration'
3
3
  import Database from '@strav/database/database/database'
4
4
  import Emitter from '@strav/kernel/events/emitter'
5
5
  import { ConfigurationError } from '@strav/kernel/exceptions/errors'
6
+ import { configureBreaker, type CircuitBreakerOptions } from './circuit_breaker.ts'
6
7
 
7
8
  export interface JobOptions {
8
9
  queue?: string
@@ -77,6 +78,46 @@ export interface FailedJobRecord {
77
78
 
78
79
  export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<void>
79
80
 
81
/**
 * Structural contract for a payload validator: any object with a
 * `parse(input)` method qualifies, which covers Zod, ArkType, Valibot
 * and hand-written validators alike.
 *
 * The worker runs the schema when a job is dequeued, before the handler
 * is invoked, so a row edited in the database or a payload written by an
 * older code revision is rejected up front rather than handed to the
 * handler in a half-formed shape.
 */
export interface JobPayloadSchema<T = unknown> {
  /** Validate `input` and return it as `T`; throw to reject it. */
  parse(input: unknown): T
}
91
+
92
/** Optional settings supplied when a job handler is registered. */
export interface JobHandlerOptions<T = any> {
  /**
   * Payload validator. When present, the worker passes the stored
   * payload through `schema.parse(...)` before calling the handler and
   * hands the parsed value to the handler. If parsing throws, the job is
   * failed with the validation error message (routed to
   * `_strav_failed_jobs`) and the handler is not run.
   *
   * Worth setting for handlers fed from external sources (HTTP
   * webhooks, customer uploads) or whose payload shape has changed
   * since older jobs were enqueued: the parse acts as a fail-fast check
   * that catches drift before the handler can corrupt state.
   */
  schema?: JobPayloadSchema<T>
  /**
   * Circuit breaker for this handler. It trips once the number of
   * failures within `windowMs` reaches `threshold` and then pauses
   * dispatch for `cooldownMs`, so a handler that keeps failing (stale
   * schema, downed dependency) stops consuming worker cycles. Defaults:
   * threshold 10, windowMs 60_000, cooldownMs 30_000.
   */
  circuitBreaker?: CircuitBreakerOptions
}
114
+
115
/**
 * What the queue stores per job name in its handler map: the handler
 * function plus the optional schema the worker validates payloads with.
 */
export interface JobHandlerRegistration<T = any> {
  /** Function invoked with the job payload and its metadata. */
  handler: JobHandler<T>
  /** Payload validator run before `handler`, when one was registered. */
  schema?: JobPayloadSchema<T>
}
120
+
80
121
  /**
81
122
  * PostgreSQL-backed job queue.
82
123
  *
@@ -95,7 +136,7 @@ export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<
95
136
  export default class Queue {
96
137
  private static _db: Database
97
138
  private static _config: QueueConfig
98
- private static _handlers = new Map<string, JobHandler>()
139
+ private static _handlers = new Map<string, JobHandlerRegistration>()
99
140
 
100
141
  constructor(db: Database, config: Configuration) {
101
142
  Queue._db = db
@@ -120,7 +161,7 @@ export default class Queue {
120
161
  return Queue._config
121
162
  }
122
163
 
123
- static get handlers(): Map<string, JobHandler> {
164
+ static get handlers(): Map<string, JobHandlerRegistration> {
124
165
  return Queue._handlers
125
166
  }
126
167
 
@@ -174,9 +215,27 @@ export default class Queue {
174
215
  `
175
216
  }
176
217
 
177
- /** Register a handler for a named job. */
178
- static handle<T = any>(name: string, handler: JobHandler<T>): void {
179
- Queue._handlers.set(name, handler)
218
/**
 * Register the handler for a named job, replacing any earlier
 * registration under the same name.
 *
 * With `options.schema`, the worker validates each payload (Zod /
 * ArkType / etc.) before calling the handler; a parse failure fails the
 * job (routed to `_strav_failed_jobs`) instead of running the handler
 * with bad data. With `options.circuitBreaker`, a breaker is configured
 * for this job name. When that option is absent, no breaker call is
 * made, so a breaker configured by an earlier registration is left as is.
 *
 * @example
 * import { z } from 'zod'
 * Queue.handle('send-email', async (payload) => { ... }, {
 *   schema: z.object({ to: z.string().email(), subject: z.string() }),
 * })
 */
static handle<T = any>(
  name: string,
  handler: JobHandler<T>,
  options?: JobHandlerOptions<T>
): void {
  const registration = { handler, schema: options?.schema }
  Queue._handlers.set(name, registration)

  const breakerOptions = options?.circuitBreaker
  if (breakerOptions) configureBreaker(name, breakerOptions)
}
181
240
 
182
241
  /**
@@ -1,5 +1,6 @@
1
1
  import Queue, { hydrateJob } from './queue.ts'
2
2
  import Emitter from '@strav/kernel/events/emitter'
3
+ import { checkBreaker, recordFailure, recordSuccess } from './circuit_breaker.ts'
3
4
  import type { JobRecord, JobMeta } from './queue.ts'
4
5
 
5
6
  export interface WorkerOptions {
@@ -109,13 +110,25 @@ export default class Worker {
109
110
 
110
111
  /** Process a single job: run handler, handle success/failure. */
111
112
  private async process(job: JobRecord): Promise<void> {
112
- const handler = Queue.handlers.get(job.job)
113
+ const registration = Queue.handlers.get(job.job)
113
114
 
114
- if (!handler) {
115
+ if (!registration) {
115
116
  await this.fail(job, new Error(`No handler registered for job "${job.job}"`))
116
117
  return
117
118
  }
118
119
 
120
+ // Q-1: per-handler circuit breaker. If the handler has tripped its
121
+ // breaker (too many failures in the configured window), defer this
122
+ // job rather than running it. Push it back to the queue with
123
+ // `available_at = now + cooldown` so it retries AFTER the breaker
124
+ // resets — this clears the worker to drain unrelated jobs from the
125
+ // queue instead of compounding the failure storm.
126
+ const cooldownRemaining = checkBreaker(job.job)
127
+ if (cooldownRemaining !== null) {
128
+ await this.deferForCooldown(job, cooldownRemaining)
129
+ return
130
+ }
131
+
119
132
  const meta: JobMeta = {
120
133
  id: job.id,
121
134
  queue: job.queue,
@@ -125,11 +138,27 @@ export default class Worker {
125
138
  progress: (value: number, message?: string) => Queue.reportProgress(job.id, value, message),
126
139
  }
127
140
 
141
+ // Re-parse the payload through the registered schema (CC-5). Catches
142
+ // payloads that drifted from the handler's expected shape — older
143
+ // enqueues, manual DB edits, malicious tampering. A parse failure
144
+ // is fatal: the job goes straight to failed_jobs without retry,
145
+ // because retrying with the same bad payload won't help.
146
+ let payload = job.payload
147
+ if (registration.schema) {
148
+ try {
149
+ payload = registration.schema.parse(job.payload)
150
+ } catch (err) {
151
+ const detail = err instanceof Error ? err.message : String(err)
152
+ await this.fail(job, new Error(`Job "${job.job}" payload failed validation: ${detail}`))
153
+ return
154
+ }
155
+ }
156
+
128
157
  const start = performance.now()
129
158
 
130
159
  try {
131
160
  await Promise.race([
132
- Promise.resolve(handler(job.payload, meta)),
161
+ Promise.resolve(registration.handler(payload, meta)),
133
162
  new Promise<never>((_, reject) =>
134
163
  setTimeout(
135
164
  () => reject(new Error(`Job "${job.job}" timed out after ${job.timeout}ms`)),
@@ -138,6 +167,7 @@ export default class Worker {
138
167
  ),
139
168
  ])
140
169
  await this.complete(job)
170
+ recordSuccess(job.job)
141
171
 
142
172
  if (Emitter.listenerCount('queue:processed') > 0) {
143
173
  const duration = performance.now() - start
@@ -150,6 +180,9 @@ export default class Worker {
150
180
  }
151
181
  } catch (error) {
152
182
  const err = error instanceof Error ? error : new Error(String(error))
183
+ // Update breaker state regardless of retry decision so a job that
184
+ // exhausts its retries still counts toward the trip threshold.
185
+ recordFailure(job.job)
153
186
  if (job.attempts >= job.maxAttempts) {
154
187
  await this.fail(job, err)
155
188
 
@@ -198,6 +231,24 @@ export default class Worker {
198
231
  `
199
232
  }
200
233
 
234
/**
 * Return a job to the queue because its handler's circuit is open.
 *
 * The row is released (`reserved_at` cleared) and rescheduled for after
 * the cooldown, with a floor of one second. The attempts counter is
 * decremented, never below zero, so that a deferral does not consume
 * retry budget: the job did not actually run. (Presumably the counter
 * was bumped when the job was fetched; that code is not visible here.)
 *
 * @param job - The reserved job to put back.
 * @param cooldownMs - Remaining breaker cooldown in milliseconds.
 */
private async deferForCooldown(job: JobRecord, cooldownMs: number): Promise<void> {
  const delayMs = Math.max(cooldownMs, 1_000)
  const availableAt = new Date(Date.now() + delayMs)

  await Queue.db.sql`
    UPDATE "_strav_jobs"
    SET "reserved_at" = NULL,
        "available_at" = ${availableAt},
        "attempts" = GREATEST("attempts" - 1, 0)
    WHERE "id" = ${job.id}
  `
}
251
+
201
252
  /** Calculate backoff delay in ms based on attempt number. */
202
253
  backoffDelay(attempts: number): number {
203
254
  if (Queue.config.retryBackoff === 'linear') {