@strav/queue 0.3.32 → 0.3.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/src/queue/circuit_breaker.ts +135 -0
- package/src/queue/index.ts +11 -0
- package/src/queue/queue.ts +64 -5
- package/src/queue/worker.ts +54 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@strav/queue",
|
|
3
|
-
"version": "0.3.32",
|
|
3
|
+
"version": "0.3.33",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Background job processing and task scheduling for the Strav framework",
|
|
6
6
|
"license": "MIT",
|
|
@@ -28,8 +28,8 @@
|
|
|
28
28
|
"./providers/*": "./src/providers/*.ts"
|
|
29
29
|
},
|
|
30
30
|
"peerDependencies": {
|
|
31
|
-
"@strav/kernel": "0.3.
|
|
32
|
-
"@strav/database": "0.3.
|
|
31
|
+
"@strav/kernel": "0.3.33",
|
|
32
|
+
"@strav/database": "0.3.33"
|
|
33
33
|
},
|
|
34
34
|
"scripts": {
|
|
35
35
|
"test": "bun test tests/",
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-handler circuit breaker. Tracks recent failure timestamps in
|
|
3
|
+
* memory; trips the circuit when the failure count within the window
|
|
4
|
+
* exceeds the threshold and pauses dispatch of that handler for
|
|
5
|
+
* `cooldownMs`. Dispatches are auto-resumed once the cooldown expires.
|
|
6
|
+
*
|
|
7
|
+
* Intended defense against retry storms — a handler that consistently
|
|
8
|
+
* fails (stale schema, downed dependency) shouldn't keep eating worker
|
|
9
|
+
* cycles and DB connections. Tripping pushes failed jobs back to the
|
|
10
|
+
* queue with a delay so they retry AFTER the cooldown.
|
|
11
|
+
*
|
|
12
|
+
* State is per-process (in-memory). Multi-worker deployments will each
|
|
13
|
+
* track independently — that's fine; each worker self-pauses without
|
|
14
|
+
* cross-talk, and a handler that's failing for a global reason will
|
|
15
|
+
* trip every worker quickly.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import Emitter from '@strav/kernel/events/emitter'
|
|
19
|
+
|
|
20
|
+
/** Caller-facing tuning knobs for one handler's breaker. Every field is optional. */
export interface CircuitBreakerOptions {
  /** Number of failures within the window that trips the breaker. Default: 10. */
  threshold?: number
  /** Window in ms over which failures are counted. Default: 60_000 (1 min). */
  windowMs?: number
  /** Cooldown in ms after tripping before retry resumes. Default: 30_000 (30 s). */
  cooldownMs?: number
}

/** Breaker options after defaults have been applied — every field is present. */
export interface ResolvedBreakerOptions {
  threshold: number
  windowMs: number
  cooldownMs: number
}

/** Mutable per-handler bookkeeping kept in the module-level registry. */
interface BreakerState {
  options: ResolvedBreakerOptions
  failures: number[] // unix-ms timestamps; ordering is not enforced
  trippedUntil: number | null // unix-ms; null when closed
}

const DEFAULTS: ResolvedBreakerOptions = {
  threshold: 10,
  windowMs: 60_000,
  cooldownMs: 30_000,
}

/** Registry of breakers, keyed by handler name. */
const breakers = new Map<string, BreakerState>()

/**
 * Register / update a breaker for a handler.
 *
 * Calling this again for the same handler replaces its state: the failure
 * history is emptied and the circuit is closed.
 *
 * @param handlerName Key under which the breaker is stored.
 * @param options Overrides for the defaults. A field that is omitted OR
 *   explicitly `undefined` falls back to its default.
 */
export function configureBreaker(handlerName: string, options: CircuitBreakerOptions = {}): void {
  breakers.set(handlerName, {
    // Resolve field by field with `??` rather than object spread: a spread
    // copies an explicit `undefined` over the default, and an undefined
    // threshold makes `failures.length >= threshold` permanently false, so
    // the breaker would silently never trip.
    options: {
      threshold: options.threshold ?? DEFAULTS.threshold,
      windowMs: options.windowMs ?? DEFAULTS.windowMs,
      cooldownMs: options.cooldownMs ?? DEFAULTS.cooldownMs,
    },
    failures: [],
    trippedUntil: null,
  })
}

/** Forget all breaker state. Test-only. */
export function resetBreakers(): void {
  breakers.clear()
}
|
|
62
|
+
|
|
63
|
+
/**
 * Report whether dispatch of `handlerName` is currently paused.
 *
 * Returns `null` when the handler has no breaker or its circuit is closed.
 * While the circuit is open, returns the time left until `trippedUntil`,
 * in ms. The first call made at or after `trippedUntil` closes the circuit,
 * empties the failure history, emits `queue:circuit_reset` (only if that
 * event has listeners) and returns `null`.
 *
 * @param handlerName Handler whose breaker is inspected.
 * @param now Current time in unix ms; injectable for tests.
 */
export function checkBreaker(handlerName: string, now: number = Date.now()): number | null {
  const breaker = breakers.get(handlerName)
  if (breaker === undefined) return null

  const deadline = breaker.trippedUntil
  if (deadline === null) return null

  const cooldownOver = now >= deadline
  if (!cooldownOver) return deadline - now

  // Open -> closed transition: wipe the history so the next failures are
  // counted in a fresh window.
  breaker.trippedUntil = null
  breaker.failures = []

  const hasListeners = Emitter.listenerCount('queue:circuit_reset') > 0
  if (hasListeners) {
    // Fire-and-forget: a rejected emit is deliberately ignored.
    void Emitter.emit('queue:circuit_reset', { handler: handlerName }).catch(() => {})
  }
  return null
}
|
|
88
|
+
|
|
89
|
+
/**
 * Record a failure for a handler and trip the circuit when the failure
 * count within `windowMs` reaches `threshold`.
 *
 * @param handlerName Handler whose breaker is updated; unknown names are ignored.
 * @param now Current time in unix ms; injectable for tests.
 * @returns
 *   - `null` when the handler has no breaker or the threshold is not yet reached;
 *   - the full `cooldownMs` when this failure trips the circuit;
 *   - the remaining cooldown (ms) when the circuit is already open.
 */
export function recordFailure(handlerName: string, now: number = Date.now()): number | null {
  const state = breakers.get(handlerName)
  if (!state) return null

  // Let checkBreaker decide whether the circuit is still open. It also closes
  // a circuit whose cooldown has already elapsed (emitting the reset event),
  // so a failure arriving after expiry is counted in a fresh window instead
  // of yielding a zero/negative "remaining" value and leaving the circuit
  // stuck in the tripped state.
  const remaining = checkBreaker(handlerName, now)
  if (remaining !== null) {
    // Still open. Nothing is recorded: the history is wiped when the circuit
    // closes, so timestamps collected while open would never be counted.
    return remaining
  }

  // Drop failures outside the window, then add the new one.
  const cutoff = now - state.options.windowMs
  state.failures = state.failures.filter(t => t > cutoff)
  state.failures.push(now)

  if (state.failures.length >= state.options.threshold) {
    state.trippedUntil = now + state.options.cooldownMs
    if (Emitter.listenerCount('queue:circuit_tripped') > 0) {
      // Fire-and-forget: a rejected emit is deliberately ignored.
      void Emitter.emit('queue:circuit_tripped', {
        handler: handlerName,
        threshold: state.options.threshold,
        windowMs: state.options.windowMs,
        cooldownMs: state.options.cooldownMs,
        trippedUntil: state.trippedUntil,
      }).catch(() => {})
    }
    return state.options.cooldownMs
  }

  return null
}
|
|
124
|
+
|
|
125
|
+
/**
 * Note a successful run of `handlerName`.
 *
 * While the circuit is closed, this empties the failure history so that
 * occasional errors separated by successes never add up to a trip. It has
 * no effect on an open circuit — that is closed only by cooldown expiry —
 * and no effect on a handler without a breaker.
 */
export function recordSuccess(handlerName: string): void {
  const breaker = breakers.get(handlerName)
  if (breaker && breaker.trippedUntil === null) {
    breaker.failures = []
  }
}
|
package/src/queue/index.ts
CHANGED
|
@@ -7,5 +7,16 @@ export type {
|
|
|
7
7
|
JobRecord,
|
|
8
8
|
FailedJobRecord,
|
|
9
9
|
JobHandler,
|
|
10
|
+
JobHandlerOptions,
|
|
11
|
+
JobHandlerRegistration,
|
|
12
|
+
JobPayloadSchema,
|
|
10
13
|
} from './queue.ts'
|
|
11
14
|
export type { WorkerOptions } from './worker.ts'
|
|
15
|
+
export {
|
|
16
|
+
configureBreaker,
|
|
17
|
+
checkBreaker,
|
|
18
|
+
recordFailure,
|
|
19
|
+
recordSuccess,
|
|
20
|
+
resetBreakers,
|
|
21
|
+
} from './circuit_breaker.ts'
|
|
22
|
+
export type { CircuitBreakerOptions, ResolvedBreakerOptions } from './circuit_breaker.ts'
|
package/src/queue/queue.ts
CHANGED
|
@@ -3,6 +3,7 @@ import Configuration from '@strav/kernel/config/configuration'
|
|
|
3
3
|
import Database from '@strav/database/database/database'
|
|
4
4
|
import Emitter from '@strav/kernel/events/emitter'
|
|
5
5
|
import { ConfigurationError } from '@strav/kernel/exceptions/errors'
|
|
6
|
+
import { configureBreaker, type CircuitBreakerOptions } from './circuit_breaker.ts'
|
|
6
7
|
|
|
7
8
|
export interface JobOptions {
|
|
8
9
|
queue?: string
|
|
@@ -77,6 +78,46 @@ export interface FailedJobRecord {
|
|
|
77
78
|
|
|
78
79
|
export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<void>
|
|
79
80
|
|
|
81
|
+
/**
 * Structural contract for payload validators: any object with a
 * `parse(input)` method qualifies (Zod, ArkType, Valibot, or a
 * hand-written validator).
 *
 * The worker runs the schema at dequeue time, before the handler, so a
 * row that was altered in the DB or enqueued by an older code revision
 * is rejected up front rather than executed with a malformed payload.
 */
export interface JobPayloadSchema<T = unknown> {
  /** Validate `input` and return it typed as `T`; throw to reject it. */
  parse(input: unknown): T
}
|
|
91
|
+
|
|
92
|
+
/** Options accepted when registering a job handler. */
export interface JobHandlerOptions<T = any> {
  /**
   * Payload schema (optional). When present, the worker runs
   * `schema.parse(payload)` before calling the handler, and a parse
   * failure sends the job to `_strav_failed_jobs` with the validation
   * error message.
   *
   * Worth setting for handlers fed from external sources (HTTP webhook,
   * customer upload) and for handlers whose code has changed since older
   * jobs were enqueued: the parse fails fast on drifted payloads before
   * the handler can corrupt state.
   */
  schema?: JobPayloadSchema<T>
  /**
   * Circuit breaker for this handler (optional). Once the number of
   * failures inside `windowMs` reaches `threshold`, dispatch is paused
   * for `cooldownMs`, which keeps a handler with a stale schema or a
   * downed dependency from consuming worker cycles in a retry storm.
   * Defaults: threshold 10, windowMs 60_000, cooldownMs 30_000.
   */
  circuitBreaker?: CircuitBreakerOptions
}
|
|
114
|
+
|
|
115
|
+
/**
 * What `Queue._handlers` stores for each job name: the handler function
 * and, when one was supplied at registration, its payload schema.
 */
export interface JobHandlerRegistration<T = any> {
  /** Function invoked with the job payload and metadata. */
  handler: JobHandler<T>
  /** Validator applied to the payload before `handler` runs, if any. */
  schema?: JobPayloadSchema<T>
}
|
|
120
|
+
|
|
80
121
|
/**
|
|
81
122
|
* PostgreSQL-backed job queue.
|
|
82
123
|
*
|
|
@@ -95,7 +136,7 @@ export type JobHandler<T = any> = (payload: T, meta: JobMeta) => void | Promise<
|
|
|
95
136
|
export default class Queue {
|
|
96
137
|
private static _db: Database
|
|
97
138
|
private static _config: QueueConfig
|
|
98
|
-
private static _handlers = new Map<string,
|
|
139
|
+
private static _handlers = new Map<string, JobHandlerRegistration>()
|
|
99
140
|
|
|
100
141
|
constructor(db: Database, config: Configuration) {
|
|
101
142
|
Queue._db = db
|
|
@@ -120,7 +161,7 @@ export default class Queue {
|
|
|
120
161
|
return Queue._config
|
|
121
162
|
}
|
|
122
163
|
|
|
123
|
-
static get handlers(): Map<string,
|
|
164
|
+
static get handlers(): Map<string, JobHandlerRegistration> {
|
|
124
165
|
return Queue._handlers
|
|
125
166
|
}
|
|
126
167
|
|
|
@@ -174,9 +215,27 @@ export default class Queue {
|
|
|
174
215
|
`
|
|
175
216
|
}
|
|
176
217
|
|
|
177
|
-
/**
|
|
178
|
-
|
|
179
|
-
|
|
218
|
+
/**
 * Register the handler for a named job, replacing any earlier
 * registration under the same name.
 *
 * When `options.schema` is given (Zod / ArkType / etc.), the worker
 * validates the payload before invoking the handler; a parse failure
 * routes the job to `_strav_failed_jobs` instead of running the handler
 * with bad data. When `options.circuitBreaker` is given, a breaker is
 * configured for this job name.
 *
 * NOTE(review): registering the same name again WITHOUT `circuitBreaker`
 * does not remove a breaker configured by an earlier call.
 *
 * @example
 * import { z } from 'zod'
 * Queue.handle('send-email', async (payload) => { ... }, {
 *   schema: z.object({ to: z.string().email(), subject: z.string() }),
 * })
 */
static handle<T = any>(
  name: string,
  handler: JobHandler<T>,
  options?: JobHandlerOptions<T>
): void {
  const registration: JobHandlerRegistration<T> = { handler, schema: options?.schema }
  Queue._handlers.set(name, registration)

  const breakerOptions = options?.circuitBreaker
  if (breakerOptions) configureBreaker(name, breakerOptions)
}
|
|
181
240
|
|
|
182
241
|
/**
|
package/src/queue/worker.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import Queue, { hydrateJob } from './queue.ts'
|
|
2
2
|
import Emitter from '@strav/kernel/events/emitter'
|
|
3
|
+
import { checkBreaker, recordFailure, recordSuccess } from './circuit_breaker.ts'
|
|
3
4
|
import type { JobRecord, JobMeta } from './queue.ts'
|
|
4
5
|
|
|
5
6
|
export interface WorkerOptions {
|
|
@@ -109,13 +110,25 @@ export default class Worker {
|
|
|
109
110
|
|
|
110
111
|
/** Process a single job: run handler, handle success/failure. */
|
|
111
112
|
private async process(job: JobRecord): Promise<void> {
|
|
112
|
-
const
|
|
113
|
+
const registration = Queue.handlers.get(job.job)
|
|
113
114
|
|
|
114
|
-
if (!
|
|
115
|
+
if (!registration) {
|
|
115
116
|
await this.fail(job, new Error(`No handler registered for job "${job.job}"`))
|
|
116
117
|
return
|
|
117
118
|
}
|
|
118
119
|
|
|
120
|
+
// Q-1: per-handler circuit breaker. If the handler has tripped its
|
|
121
|
+
// breaker (too many failures in the configured window), defer this
|
|
122
|
+
// job rather than running it. Push it back to the queue with
|
|
123
|
+
// `available_at = now + cooldown` so it retries AFTER the breaker
|
|
124
|
+
// resets — this clears the worker to drain unrelated jobs from the
|
|
125
|
+
// queue instead of compounding the failure storm.
|
|
126
|
+
const cooldownRemaining = checkBreaker(job.job)
|
|
127
|
+
if (cooldownRemaining !== null) {
|
|
128
|
+
await this.deferForCooldown(job, cooldownRemaining)
|
|
129
|
+
return
|
|
130
|
+
}
|
|
131
|
+
|
|
119
132
|
const meta: JobMeta = {
|
|
120
133
|
id: job.id,
|
|
121
134
|
queue: job.queue,
|
|
@@ -125,11 +138,27 @@ export default class Worker {
|
|
|
125
138
|
progress: (value: number, message?: string) => Queue.reportProgress(job.id, value, message),
|
|
126
139
|
}
|
|
127
140
|
|
|
141
|
+
// Re-parse the payload through the registered schema (CC-5). Catches
|
|
142
|
+
// payloads that drifted from the handler's expected shape — older
|
|
143
|
+
// enqueues, manual DB edits, malicious tampering. A parse failure
|
|
144
|
+
// is fatal: the job goes straight to failed_jobs without retry,
|
|
145
|
+
// because retrying with the same bad payload won't help.
|
|
146
|
+
let payload = job.payload
|
|
147
|
+
if (registration.schema) {
|
|
148
|
+
try {
|
|
149
|
+
payload = registration.schema.parse(job.payload)
|
|
150
|
+
} catch (err) {
|
|
151
|
+
const detail = err instanceof Error ? err.message : String(err)
|
|
152
|
+
await this.fail(job, new Error(`Job "${job.job}" payload failed validation: ${detail}`))
|
|
153
|
+
return
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
128
157
|
const start = performance.now()
|
|
129
158
|
|
|
130
159
|
try {
|
|
131
160
|
await Promise.race([
|
|
132
|
-
Promise.resolve(handler(
|
|
161
|
+
Promise.resolve(registration.handler(payload, meta)),
|
|
133
162
|
new Promise<never>((_, reject) =>
|
|
134
163
|
setTimeout(
|
|
135
164
|
() => reject(new Error(`Job "${job.job}" timed out after ${job.timeout}ms`)),
|
|
@@ -138,6 +167,7 @@ export default class Worker {
|
|
|
138
167
|
),
|
|
139
168
|
])
|
|
140
169
|
await this.complete(job)
|
|
170
|
+
recordSuccess(job.job)
|
|
141
171
|
|
|
142
172
|
if (Emitter.listenerCount('queue:processed') > 0) {
|
|
143
173
|
const duration = performance.now() - start
|
|
@@ -150,6 +180,9 @@ export default class Worker {
|
|
|
150
180
|
}
|
|
151
181
|
} catch (error) {
|
|
152
182
|
const err = error instanceof Error ? error : new Error(String(error))
|
|
183
|
+
// Update breaker state regardless of retry decision so a job that
|
|
184
|
+
// exhausts its retries still counts toward the trip threshold.
|
|
185
|
+
recordFailure(job.job)
|
|
153
186
|
if (job.attempts >= job.maxAttempts) {
|
|
154
187
|
await this.fail(job, err)
|
|
155
188
|
|
|
@@ -198,6 +231,24 @@ export default class Worker {
|
|
|
198
231
|
`
|
|
199
232
|
}
|
|
200
233
|
|
|
234
|
+
/**
 * Hand a job back to the queue because its handler's circuit is open.
 *
 * The row is released (`reserved_at` cleared) and `available_at` is moved
 * to now + the remaining cooldown, with a floor of one second. `attempts`
 * is decremented, never below zero, so that a deferral does not consume
 * retry budget for a job that never actually ran.
 *
 * @param job Job to release.
 * @param cooldownMs Remaining breaker cooldown in ms.
 */
private async deferForCooldown(job: JobRecord, cooldownMs: number): Promise<void> {
  const delayMs = Math.max(cooldownMs, 1_000)
  const availableAt = new Date(Date.now() + delayMs)

  await Queue.db.sql`
    UPDATE "_strav_jobs"
    SET "reserved_at" = NULL,
        "available_at" = ${availableAt},
        "attempts" = GREATEST("attempts" - 1, 0)
    WHERE "id" = ${job.id}
  `
}
|
|
251
|
+
|
|
201
252
|
/** Calculate backoff delay in ms based on attempt number. */
|
|
202
253
|
backoffDelay(attempts: number): number {
|
|
203
254
|
if (Queue.config.retryBackoff === 'linear') {
|