bunqueue 2.8.5 → 2.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32):
  1. package/dist/application/operations/ack.d.ts +1 -1
  2. package/dist/application/operations/ack.js +2 -2
  3. package/dist/application/queueManager.d.ts +1 -1
  4. package/dist/application/queueManager.js +2 -2
  5. package/dist/application/statsManager.js +18 -8
  6. package/dist/client/queue/dlq.js +1 -1
  7. package/dist/client/queue/operations/management.js +4 -2
  8. package/dist/client/queue/queue.js +2 -0
  9. package/dist/client/queue/scheduler.js +5 -0
  10. package/dist/client/tcp/client.d.ts +9 -0
  11. package/dist/client/tcp/client.js +38 -3
  12. package/dist/client/tcp/connection.js +11 -0
  13. package/dist/client/tcp/health.d.ts +14 -0
  14. package/dist/client/tcp/health.js +24 -0
  15. package/dist/client/tcp/types.d.ts +10 -0
  16. package/dist/client/tcp/types.js +1 -0
  17. package/dist/client/tcpPool.js +2 -0
  18. package/dist/client/types.d.ts +6 -0
  19. package/dist/client/worker/worker.js +7 -2
  20. package/dist/client/worker/workerPull.d.ts +2 -0
  21. package/dist/client/worker/workerPull.js +12 -5
  22. package/dist/domain/types/command.d.ts +4 -0
  23. package/dist/infrastructure/server/handlers/advanced.js +60 -8
  24. package/dist/infrastructure/server/handlers/core.js +1 -1
  25. package/dist/infrastructure/server/handlers/cron.js +1 -0
  26. package/dist/infrastructure/server/handlers/monitoring.js +7 -2
  27. package/dist/infrastructure/server/http.js +20 -6
  28. package/dist/infrastructure/server/httpRouteJobs.js +14 -2
  29. package/dist/infrastructure/server/httpRouteQueueConfig.js +19 -3
  30. package/dist/infrastructure/server/httpRouteQueues.js +13 -1
  31. package/dist/infrastructure/server/httpRouteResources.js +4 -0
  32. package/package.json +1 -1
@@ -57,7 +57,7 @@ export declare function ackJob(jobId: JobId, result: unknown, ctx: AckContext):
57
57
  /**
58
58
  * Mark job as failed
59
59
  */
60
- export declare function failJob(jobId: JobId, error: string | undefined, ctx: AckContext): Promise<void>;
60
+ export declare function failJob(jobId: JobId, error: string | undefined, ctx: AckContext, unrecoverable?: boolean): Promise<void>;
61
61
  /**
62
62
  * Acknowledge multiple jobs - optimized batch processing
63
63
  * Groups jobs by shard to minimize lock acquisitions: O(shards) instead of O(n)
@@ -112,7 +112,7 @@ function moveFailedJobToDlq(job, jobId, error, shard, ctx) {
112
112
  /**
113
113
  * Mark job as failed
114
114
  */
115
- export async function failJob(jobId, error, ctx) {
115
+ export async function failJob(jobId, error, ctx, unrecoverable = false) {
116
116
  const procIdx = processingShardIndex(jobId);
117
117
  const job = await withWriteLock(ctx.processingLocks[procIdx], () => {
118
118
  const job = ctx.processingShards[procIdx].get(jobId);
@@ -134,7 +134,7 @@ export async function failJob(jobId, error, ctx) {
134
134
  await withWriteLock(ctx.shardLocks[idx], () => {
135
135
  const shard = ctx.shards[idx];
136
136
  shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
137
- if (canRetry(job)) {
137
+ if (!unrecoverable && canRetry(job)) {
138
138
  const now = Date.now();
139
139
  job.runAt = now + calculateBackoff(job);
140
140
  shard.getQueue(job.queue).push(job);
@@ -79,7 +79,7 @@ export declare class QueueManager {
79
79
  result: unknown;
80
80
  token?: string;
81
81
  }>): Promise<void>;
82
- fail(jobId: JobId, error?: string, token?: string): Promise<void>;
82
+ fail(jobId: JobId, error?: string, token?: string, unrecoverable?: boolean): Promise<void>;
83
83
  /**
84
84
  * Check if a failed lock verification is a genuine ownership conflict.
85
85
  * If the job is still in processing with a different lock, throw.
@@ -358,7 +358,7 @@ export class QueueManager {
358
358
  lockMgr.releaseLock(item.id, lockCtx, item.token);
359
359
  }
360
360
  }
361
- async fail(jobId, error, token) {
361
+ async fail(jobId, error, token, unrecoverable = false) {
362
362
  const lockCtx = this.contextFactory.getLockContext();
363
363
  if (token && !lockMgr.verifyLock(jobId, token, lockCtx)) {
364
364
  this.throwIfOwnershipConflict(jobId, lockCtx);
@@ -367,7 +367,7 @@ export class QueueManager {
367
367
  return;
368
368
  }
369
369
  try {
370
- await failJob(jobId, error, this.contextFactory.getAckContext());
370
+ await failJob(jobId, error, this.contextFactory.getAckContext(), unrecoverable);
371
371
  }
372
372
  catch (err) {
373
373
  // Job removed from processing by stall detection. The stall retry
@@ -3,6 +3,16 @@
3
3
  * Provides system metrics and memory compaction utilities
4
4
  */
5
5
  import { SHARD_COUNT, shardIndex } from '../shared/hash';
6
+ /** Count jobs belonging to `queueName` across one or more job iterables. */
7
+ function countByQueue(sources, queueName) {
8
+ let count = 0;
9
+ for (const src of sources) {
10
+ for (const job of src)
11
+ if (job.queue === queueName)
12
+ count++;
13
+ }
14
+ return count;
15
+ }
6
16
  /**
7
17
  * Get queue statistics - uses running counters + priority scan
8
18
  */
@@ -14,7 +24,10 @@ export function getStats(ctx, cronScheduler) {
14
24
  delayed += shardStats.delayedJobs;
15
25
  dlq += shardStats.dlqJobs;
16
26
  active += ctx.processingShards[i].size;
17
- waitingChildren += ctx.shards[i].waitingChildren.size;
27
+ // getJobState reports BOTH waitingChildren (flow parents) and waitingDeps
28
+ // (jobs blocked on dependsOn) as state 'waiting-children', and getJobs lists
29
+ // both — so the count must include both or it undercounts vs state/list (#95 class).
30
+ waitingChildren += ctx.shards[i].waitingChildren.size + ctx.shards[i].waitingDeps.size;
18
31
  // Scan queues to split waiting vs prioritized (BullMQ v5 compat)
19
32
  for (const queue of ctx.shards[i].queues.values()) {
20
33
  for (const job of queue.values()) {
@@ -174,13 +187,10 @@ export function getQueueJobCounts(queueName, ctx) {
174
187
  }
175
188
  // Count failed (DLQ) jobs for this queue
176
189
  const failed = shard.getDlq(queueName).length;
177
- // Count waiting-children jobs (parents waiting for child completion)
178
- let waitingChildrenCount = 0;
179
- for (const job of shard.waitingChildren.values()) {
180
- if (job.queue === queueName) {
181
- waitingChildrenCount++;
182
- }
183
- }
190
+ // Count waiting-children jobs. getJobState/getJobs treat BOTH waitingChildren
191
+ // (flow parents) and waitingDeps (jobs blocked on dependsOn) as 'waiting-children',
192
+ // so count both to stay consistent with state/list (#95 class).
193
+ const waitingChildrenCount = countByQueue([shard.waitingChildren.values(), shard.waitingDeps.values()], queueName);
184
194
  // Per-queue cumulative counters
185
195
  const perQueue = ctx.perQueueMetrics?.get(queueName);
186
196
  const totalCompleted = Number(perQueue?.totalCompleted ?? 0n);
@@ -64,7 +64,7 @@ export function retryDlq(ctx, id) {
64
64
  if (ctx.embedded)
65
65
  return dlqOps.retryDlqEmbedded(ctx.name, id);
66
66
  if (ctx.tcp)
67
- void ctx.tcp.send({ cmd: 'RetryDlq', queue: ctx.name, id });
67
+ void ctx.tcp.send({ cmd: 'RetryDlq', queue: ctx.name, jobId: id });
68
68
  return 0;
69
69
  }
70
70
  /** Retry DLQ entries by filter */
@@ -77,7 +77,8 @@ export async function cleanAsync(ctx, grace, limit, type) {
77
77
  queue: ctx.name,
78
78
  grace,
79
79
  limit,
80
- type,
80
+ // Handler reads `state`; sending `type` made the state filter a no-op.
81
+ state: type,
81
82
  });
82
83
  if (!response.ok)
83
84
  return [];
@@ -107,7 +108,8 @@ export async function promoteJobs(ctx, opts) {
107
108
  });
108
109
  if (!response.ok)
109
110
  return 0;
110
- return (response.promoted ?? 0);
111
+ // Handler returns `count`; reading `promoted` always yielded 0.
112
+ return (response.count ?? 0);
111
113
  }
112
114
  /** Promote a single job */
113
115
  export async function promoteJob(ctx, id) {
@@ -63,6 +63,7 @@ export class Queue {
63
63
  poolSize,
64
64
  pingInterval: connOpts.pingInterval,
65
65
  commandTimeout: connOpts.commandTimeout,
66
+ maxCommandTimeouts: connOpts.maxCommandTimeouts,
66
67
  pipelining: connOpts.pipelining,
67
68
  maxInFlight: connOpts.maxInFlight,
68
69
  });
@@ -76,6 +77,7 @@ export class Queue {
76
77
  poolSize,
77
78
  pingInterval: connOpts.pingInterval,
78
79
  commandTimeout: connOpts.commandTimeout,
80
+ maxCommandTimeouts: connOpts.maxCommandTimeouts,
79
81
  pipelining: connOpts.pipelining,
80
82
  maxInFlight: connOpts.maxInFlight,
81
83
  });
@@ -65,6 +65,9 @@ export async function upsertJobScheduler(ctx, schedulerId, repeatOpts, jobTempla
65
65
  const dedupFields = buildCronDedup(jobTemplate);
66
66
  const jobOptions = buildCronJobOptions(ctx.defaultJobOptions, jobTemplate);
67
67
  const cronName = toCronName(ctx, schedulerId);
68
+ // Priority of spawned jobs: carried on the top-level Cron field (the handler
69
+ // reads cmd.priority), which buildCronJobOptions does not cover.
70
+ const priority = jobTemplate?.opts?.priority ?? ctx.defaultJobOptions?.priority;
68
71
  if (ctx.embedded) {
69
72
  const manager = getSharedManager();
70
73
  manager.addCron({
@@ -73,6 +76,7 @@ export async function upsertJobScheduler(ctx, schedulerId, repeatOpts, jobTempla
73
76
  data,
74
77
  schedule: cronPattern,
75
78
  repeatEvery,
79
+ priority,
76
80
  timezone: repeatOpts.timezone ?? 'UTC',
77
81
  skipMissedOnRestart: repeatOpts.skipMissedOnRestart,
78
82
  immediately: repeatOpts.immediately,
@@ -94,6 +98,7 @@ export async function upsertJobScheduler(ctx, schedulerId, repeatOpts, jobTempla
94
98
  data,
95
99
  schedule: cronPattern,
96
100
  repeatEvery,
101
+ priority,
97
102
  timezone: repeatOpts.timezone,
98
103
  skipMissedOnRestart: repeatOpts.skipMissedOnRestart,
99
104
  immediately: repeatOpts.immediately,
@@ -73,6 +73,15 @@ export declare class TcpClient extends EventEmitter {
73
73
  /** Send ping to check connection health */
74
74
  ping(): Promise<boolean>;
75
75
  private handlePingFailure;
76
+ /**
77
+ * A command timed out. On a half-open socket (peer gone, no FIN/RST) writes
78
+ * keep succeeding but no response ever returns — every command times out
79
+ * while the socket still looks "connected". The health-check ping is one way
80
+ * to notice, but it can be disabled or slower than real traffic, leaving a
81
+ * worker's PULL loop to time out forever without ever reconnecting (#94).
82
+ * Treat a sustained run of timeouts as a dead link and force a reconnect.
83
+ */
84
+ private handleCommandTimeout;
76
85
  private forceReconnect;
77
86
  /** Get connection health metrics */
78
87
  getHealth(): ConnectionHealth;
@@ -79,6 +79,7 @@ export class TcpClient extends EventEmitter {
79
79
  this.health = new HealthTracker({
80
80
  pingInterval: this.options.pingInterval,
81
81
  maxPingFailures: this.options.maxPingFailures,
82
+ maxCommandTimeouts: this.options.maxCommandTimeouts,
82
83
  });
83
84
  this.reconnect = new ReconnectManager({
84
85
  maxReconnectAttempts: this.options.maxReconnectAttempts,
@@ -258,15 +259,44 @@ export class TcpClient extends EventEmitter {
258
259
  this.emit('health', { type: 'ping_failed' });
259
260
  }
260
261
  }
262
+ /**
263
+ * A command timed out. On a half-open socket (peer gone, no FIN/RST) writes
264
+ * keep succeeding but no response ever returns — every command times out
265
+ * while the socket still looks "connected". The health-check ping is one way
266
+ * to notice, but it can be disabled or slower than real traffic, leaving a
267
+ * worker's PULL loop to time out forever without ever reconnecting (#94).
268
+ * Treat a sustained run of timeouts as a dead link and force a reconnect.
269
+ */
270
+ handleCommandTimeout() {
271
+ if (this.health.recordCommandTimeout()) {
272
+ this.emit('health', { type: 'unhealthy', reason: 'max_command_timeouts' });
273
+ this.forceReconnect();
274
+ }
275
+ }
261
276
  forceReconnect() {
262
277
  if (this.reconnect.isClosed())
263
278
  return;
264
279
  if (this.socket) {
265
- this.socket.end();
280
+ // end() can throw on an already-errored/half-dead socket. Swallow it:
281
+ // failing to close the corpse must NOT abort the reconnect path below,
282
+ // or the connection would stay wedged forever (the #94 failure mode).
283
+ try {
284
+ this.socket.end();
285
+ }
286
+ catch {
287
+ /* socket already torn down */
288
+ }
266
289
  this.socket = null;
267
290
  }
268
291
  this.connected = false;
269
292
  this.health.stopPing();
293
+ // Settle every in-flight/queued command NOW. Otherwise their per-command
294
+ // timeouts keep ticking and fire AFTER the fresh socket is up — each stale
295
+ // timeout bumps the new connection's dead-link counter and can re-trigger
296
+ // forceReconnect in a loop (a reconnect storm that never stabilises). It
297
+ // also unblocks awaiting callers (e.g. a Worker's PULL) immediately instead
298
+ // of making them wait out the full commandTimeout on a corpse.
299
+ this.commands.rejectAll(new Error('Connection lost'));
270
300
  if (this.reconnect.canReconnect())
271
301
  this.reconnect.scheduleReconnect(() => this.connect());
272
302
  }
@@ -343,6 +373,8 @@ export class TcpClient extends EventEmitter {
343
373
  if (removed) {
344
374
  this.health.recordError();
345
375
  next.reject(new Error('Command timeout'));
376
+ // In-flight command got no response: count it toward dead-link detection.
377
+ this.handleCommandTimeout();
346
378
  }
347
379
  }, this.options.commandTimeout);
348
380
  next.timeout = newTimeout;
@@ -365,17 +397,20 @@ export class TcpClient extends EventEmitter {
365
397
  let pendingRef;
366
398
  const promise = new Promise((resolve, reject) => {
367
399
  const timeout = setTimeout(() => {
368
- // Try to remove from queue first
400
+ // Try to remove from queue first. A still-queued command never reached
401
+ // the socket (e.g. waiting on connect), so it is NOT evidence of a dead
402
+ // link — reject it but don't count it toward dead-link detection.
369
403
  if (this.commands.remove(id)) {
370
404
  this.health.recordError();
371
405
  reject(new Error('Command timeout'));
372
406
  return;
373
407
  }
374
- // Try to remove from in-flight
408
+ // Try to remove from in-flight: this one WAS sent and got no response.
375
409
  const removed = this.commands.removeByReqId(reqId);
376
410
  if (removed) {
377
411
  this.health.recordError();
378
412
  reject(new Error('Command timeout'));
413
+ this.handleCommandTimeout();
379
414
  }
380
415
  }, this.options.commandTimeout);
381
416
  pendingRef = {
@@ -42,6 +42,17 @@ export async function createConnection(target, connectTimeout, events) {
42
42
  },
43
43
  open(sock) {
44
44
  cleanup();
45
+ // Enable TCP keepalive so the OS probes idle connections and surfaces a
46
+ // dead peer (suspended host, NAT/LB drop) via an error/close event,
47
+ // instead of a half-open socket lingering until tcp_retries2 (~15 min).
48
+ // Best-effort: not all platforms honor the delay, and older Bun builds
49
+ // may lack the method — never let it abort connection setup. See #94.
50
+ try {
51
+ sock.setKeepAlive?.(true, 15000);
52
+ }
53
+ catch {
54
+ /* keepalive unsupported on this platform/runtime */
55
+ }
45
56
  socketData.write = (d) => sock.write(d);
46
57
  socketData.end = () => sock.end();
47
58
  connectionResolved = true;
@@ -7,6 +7,11 @@ import type { ConnectionHealth } from './types';
7
7
  export interface HealthConfig {
8
8
  pingInterval: number;
9
9
  maxPingFailures: number;
10
+ /**
11
+ * Consecutive command timeouts before the link is concluded dead (0 = off).
12
+ * Optional for backward compatibility; defaults to 3 when omitted.
13
+ */
14
+ maxCommandTimeouts?: number;
10
15
  }
11
16
  /**
12
17
  * Tracks connection health metrics
@@ -14,6 +19,7 @@ export interface HealthConfig {
14
19
  export declare class HealthTracker {
15
20
  private readonly config;
16
21
  private consecutivePingFailures;
22
+ private consecutiveCommandTimeouts;
17
23
  private lastSuccessAt;
18
24
  private lastErrorAt;
19
25
  private connectedAt;
@@ -35,6 +41,14 @@ export declare class HealthTracker {
35
41
  recordPingSuccess(latencyMs: number): void;
36
42
  /** Record ping failure, returns true if max failures reached */
37
43
  recordPingFailure(): boolean;
44
+ /**
45
+ * Record a command timeout. Returns true when the configured consecutive
46
+ * threshold is reached (and the feature is enabled), signalling the caller to
47
+ * force a reconnect. Any intervening success resets the counter, so this only
48
+ * fires on a sustained run of timeouts — the signature of a dead/half-open
49
+ * socket where writes succeed but no response ever comes back.
50
+ */
51
+ recordCommandTimeout(): boolean;
38
52
  /** Get current health metrics */
39
53
  getHealth(state: 'connected' | 'connecting' | 'disconnected' | 'closed'): ConnectionHealth;
40
54
  /** Start ping timer */
@@ -2,12 +2,15 @@
2
2
  * TCP Health Tracker
3
3
  * Monitors connection health with ping and latency tracking
4
4
  */
5
+ /** Default consecutive command-timeout threshold when not configured. */
6
+ const DEFAULT_MAX_COMMAND_TIMEOUTS = 3;
5
7
  /**
6
8
  * Tracks connection health metrics
7
9
  */
8
10
  export class HealthTracker {
9
11
  config;
10
12
  consecutivePingFailures = 0;
13
+ consecutiveCommandTimeouts = 0;
11
14
  lastSuccessAt = null;
12
15
  lastErrorAt = null;
13
16
  connectedAt = null;
@@ -23,6 +26,9 @@ export class HealthTracker {
23
26
  recordSuccess(latencyMs) {
24
27
  this.lastSuccessAt = Date.now();
25
28
  this.totalCommands++;
29
+ // A real response proves the link is alive: the prior timeouts were not a
30
+ // sustained run, so reset the dead-link counter ("consecutive" must mean it).
31
+ this.consecutiveCommandTimeouts = 0;
26
32
  this.recordLatency(latencyMs);
27
33
  }
28
34
  /** Record command error */
@@ -38,10 +44,13 @@ export class HealthTracker {
38
44
  recordConnected() {
39
45
  this.connectedAt = Date.now();
40
46
  this.consecutivePingFailures = 0;
47
+ this.consecutiveCommandTimeouts = 0;
41
48
  }
42
49
  /** Record ping success */
43
50
  recordPingSuccess(latencyMs) {
51
+ // A successful ping is also proof the link is alive — clear both suspicions.
44
52
  this.consecutivePingFailures = 0;
53
+ this.consecutiveCommandTimeouts = 0;
45
54
  this.recordLatency(latencyMs);
46
55
  }
47
56
  /** Record ping failure, returns true if max failures reached */
@@ -51,6 +60,20 @@ export class HealthTracker {
51
60
  this.totalErrors++;
52
61
  return this.consecutivePingFailures >= this.config.maxPingFailures;
53
62
  }
63
+ /**
64
+ * Record a command timeout. Returns true when the configured consecutive
65
+ * threshold is reached (and the feature is enabled), signalling the caller to
66
+ * force a reconnect. Any intervening success resets the counter, so this only
67
+ * fires on a sustained run of timeouts — the signature of a dead/half-open
68
+ * socket where writes succeed but no response ever comes back.
69
+ */
70
+ recordCommandTimeout() {
71
+ const max = this.config.maxCommandTimeouts ?? DEFAULT_MAX_COMMAND_TIMEOUTS;
72
+ if (max <= 0)
73
+ return false;
74
+ this.consecutiveCommandTimeouts++;
75
+ return this.consecutiveCommandTimeouts >= max;
76
+ }
54
77
  /** Get current health metrics */
55
78
  getHealth(state) {
56
79
  const avgLatency = this.latencyHistory.length > 0
@@ -63,6 +86,7 @@ export class HealthTracker {
63
86
  lastErrorAt: this.lastErrorAt,
64
87
  avgLatencyMs: Math.round(avgLatency * 100) / 100,
65
88
  consecutivePingFailures: this.consecutivePingFailures,
89
+ consecutiveCommandTimeouts: this.consecutiveCommandTimeouts,
66
90
  totalCommands: this.totalCommands,
67
91
  totalErrors: this.totalErrors,
68
92
  uptimeMs: this.connectedAt ? Date.now() - this.connectedAt : 0,
@@ -26,6 +26,14 @@ export interface ConnectionOptions {
26
26
  pingInterval?: number;
27
27
  /** Max consecutive ping failures before forcing reconnect (default: 3) */
28
28
  maxPingFailures?: number;
29
+ /**
30
+ * Max consecutive command timeouts (with no intervening success) before the
31
+ * connection is concluded dead and reconnect is forced (default: 3, 0 to
32
+ * disable). This is the recovery path for a half-open socket when the
33
+ * health-check ping is disabled or slower than real traffic — a worker whose
34
+ * PULLs keep timing out no longer stalls forever waiting on the ping. See #94.
35
+ */
36
+ maxCommandTimeouts?: number;
29
37
  /** Enable pipelining - multiple commands in flight (default: true) */
30
38
  pipelining?: boolean;
31
39
  /** Max commands in flight when pipelining (default: 100) */
@@ -45,6 +53,8 @@ export interface ConnectionHealth {
45
53
  avgLatencyMs: number;
46
54
  /** Consecutive ping failures */
47
55
  consecutivePingFailures: number;
56
+ /** Consecutive command timeouts with no intervening success */
57
+ consecutiveCommandTimeouts: number;
48
58
  /** Total commands sent */
49
59
  totalCommands: number;
50
60
  /** Total errors */
@@ -15,6 +15,7 @@ export const DEFAULT_CONNECTION = {
15
15
  autoReconnect: true,
16
16
  pingInterval: 30000,
17
17
  maxPingFailures: 3,
18
+ maxCommandTimeouts: 3,
18
19
  pipelining: true,
19
20
  maxInFlight: 100,
20
21
  };
@@ -29,6 +29,7 @@ export class TcpConnectionPool {
29
29
  autoReconnect: options.autoReconnect ?? true,
30
30
  pingInterval: options.pingInterval ?? 30000,
31
31
  maxPingFailures: options.maxPingFailures ?? 3,
32
+ maxCommandTimeouts: options.maxCommandTimeouts ?? 3,
32
33
  pipelining: options.pipelining ?? true,
33
34
  maxInFlight: options.maxInFlight ?? 100,
34
35
  };
@@ -46,6 +47,7 @@ export class TcpConnectionPool {
46
47
  autoReconnect: this.options.autoReconnect,
47
48
  pingInterval: this.options.pingInterval,
48
49
  maxPingFailures: this.options.maxPingFailures,
50
+ maxCommandTimeouts: this.options.maxCommandTimeouts,
49
51
  });
50
52
  this.clients.push(client);
51
53
  }
@@ -376,6 +376,12 @@ export interface ConnectionOptions {
376
376
  pingInterval?: number;
377
377
  /** Command timeout in ms (default: 30000) */
378
378
  commandTimeout?: number;
379
+ /**
380
+ * Consecutive command timeouts (no intervening success) before the connection
381
+ * is concluded dead and a reconnect is forced (default: 3, 0 to disable).
382
+ * Recovery path for a half-open socket independent of the health-check ping. See #94.
383
+ */
384
+ maxCommandTimeouts?: number;
379
385
  /** Enable TCP pipelining (default: true) */
380
386
  pipelining?: boolean;
381
387
  /** Max commands in flight per connection (default: 100) */
@@ -47,6 +47,7 @@ function createTcpPool(opts, concurrency) {
47
47
  poolSize,
48
48
  pingInterval: connOpts.pingInterval,
49
49
  commandTimeout: connOpts.commandTimeout,
50
+ maxCommandTimeouts: connOpts.maxCommandTimeouts,
50
51
  pipelining: connOpts.pipelining,
51
52
  maxInFlight: connOpts.maxInFlight,
52
53
  });
@@ -397,9 +398,12 @@ export class Worker extends EventEmitter {
397
398
  cmd: 'ExtendLocks',
398
399
  ids: jobIds,
399
400
  tokens,
400
- duration,
401
+ // Protocol expects a per-id `durations` array, and the handler returns
402
+ // `count` (not `extended`). Sending `duration`/reading `extended` made
403
+ // batch lock renewal silently keep the old TTL.
404
+ durations: jobIds.map(() => duration),
401
405
  });
402
- const extended = response.extended;
406
+ const extended = response.count;
403
407
  return extended ?? 0;
404
408
  }
405
409
  // ============ Lifecycle ============
@@ -734,6 +738,7 @@ export class Worker extends EventEmitter {
734
738
  workerId: this.workerId,
735
739
  useLocks: this.opts.useLocks,
736
740
  pollTimeout: this.opts.pollTimeout,
741
+ lockDuration: this.opts.lockDuration,
737
742
  };
738
743
  }
739
744
  /** Apply worker-level removeOnComplete/removeOnFail defaults to a job */
@@ -9,6 +9,8 @@ export interface PullConfig {
9
9
  readonly workerId: string;
10
10
  readonly useLocks: boolean;
11
11
  readonly pollTimeout: number;
12
+ /** Lock TTL in ms to request from the server on a lock-based pull. */
13
+ readonly lockDuration?: number;
12
14
  }
13
15
  export declare function pullEmbedded(config: PullConfig, count: number): Promise<Array<{
14
16
  job: InternalJob;
@@ -6,13 +6,14 @@ import { getSharedManager } from '../manager';
6
6
  import { parseJobFromResponse } from './jobParser';
7
7
  export async function pullEmbedded(config, count) {
8
8
  const manager = getSharedManager();
9
- // Use lock-based pull only when useLocks is enabled
9
+ // Use lock-based pull only when useLocks is enabled. Pass lockDuration so the
10
+ // configured lock TTL is honored in embedded mode too (undefined → server default).
10
11
  if (config.useLocks) {
11
12
  if (count === 1) {
12
- const { job, token } = await manager.pullWithLock(config.name, config.workerId, 0);
13
+ const { job, token } = await manager.pullWithLock(config.name, config.workerId, 0, config.lockDuration);
13
14
  return job ? [{ job, token }] : [];
14
15
  }
15
- const { jobs, tokens } = await manager.pullBatchWithLock(config.name, count, config.workerId, 0);
16
+ const { jobs, tokens } = await manager.pullBatchWithLock(config.name, count, config.workerId, 0, config.lockDuration);
16
17
  return jobs.map((job, i) => ({ job, token: tokens[i] || null }));
17
18
  }
18
19
  // No locks - use regular pull
@@ -26,16 +27,22 @@ export async function pullEmbedded(config, count) {
26
27
  export async function pullTcp(config, tcp, count, closing) {
27
28
  if (closing)
28
29
  return [];
29
- // Build pull command - only request locks if useLocks is enabled
30
+ // Build pull command - only request locks if useLocks is enabled.
31
+ // `count` belongs to the batch PULLB; a single PULL doesn't need it.
30
32
  const cmd = {
31
33
  cmd: count === 1 ? 'PULL' : 'PULLB',
32
34
  queue: config.name,
33
35
  timeout: config.pollTimeout,
34
- count,
35
36
  };
37
+ if (count > 1)
38
+ cmd.count = count;
36
39
  // Only request lock ownership when useLocks is enabled
37
40
  if (config.useLocks) {
38
41
  cmd.owner = config.workerId;
42
+ // Propagate the configured lock TTL so the server doesn't always fall back
43
+ // to its 30s default (WorkerOptions.lockDuration was previously ignored).
44
+ if (config.lockDuration !== undefined)
45
+ cmd.lockTtl = config.lockDuration;
39
46
  }
40
47
  const response = await tcp.send(cmd);
41
48
  if (!response.ok)
@@ -98,6 +98,8 @@ export interface FailCommand extends BaseCommand {
98
98
  readonly id: string;
99
99
  readonly error?: string;
100
100
  readonly token?: string;
101
+ /** Skip all remaining retries and fail terminally (UnrecoverableError over TCP). */
102
+ readonly unrecoverable?: boolean;
101
103
  }
102
104
  export interface GetJobCommand extends BaseCommand {
103
105
  readonly cmd: 'GetJob';
@@ -309,6 +311,8 @@ export interface AddLogCommand extends BaseCommand {
309
311
  export interface GetLogsCommand extends BaseCommand {
310
312
  readonly cmd: 'GetLogs';
311
313
  readonly id: string;
314
+ readonly start?: number;
315
+ readonly end?: number;
312
316
  }
313
317
  export interface HeartbeatCommand extends BaseCommand {
314
318
  readonly cmd: 'Heartbeat';
@@ -4,6 +4,45 @@
4
4
  */
5
5
  import * as resp from '../../../domain/types/response';
6
6
  import { jobId } from '../../../domain/types/job';
7
+ /**
8
+ * Coerce a value to a finite number, or return undefined if it can't be.
9
+ * Guards config endpoints against non-numeric input (e.g. `"abc"`) that would
10
+ * otherwise reach numeric comparisons as NaN and silently break behaviour
11
+ * (a string `stallInterval` disabled stall detection entirely).
12
+ */
13
+ function toFiniteNumber(value) {
14
+ if (value === undefined || value === null)
15
+ return undefined;
16
+ const n = typeof value === 'number' ? value : Number(value);
17
+ return Number.isFinite(n) ? n : undefined;
18
+ }
19
+ /**
20
+ * Sanitize the numeric fields of a config object: coerce numeric strings, drop
21
+ * non-numeric garbage (so the manager's merge keeps the existing/default value
22
+ * instead of storing NaN). Booleans and unknown keys pass through untouched.
23
+ */
24
+ function sanitizeConfigNumbers(config, numericKeys) {
25
+ if (!config || typeof config !== 'object')
26
+ return config;
27
+ const numeric = new Set(numericKeys);
28
+ const out = {};
29
+ for (const [key, value] of Object.entries(config)) {
30
+ if (!numeric.has(key)) {
31
+ out[key] = value; // booleans / unknown keys pass through untouched
32
+ continue;
33
+ }
34
+ if (value === null) {
35
+ out[key] = null; // valid for nullable fields (e.g. dlq maxAge)
36
+ continue;
37
+ }
38
+ const n = toFiniteNumber(value);
39
+ // coerce numeric strings; omit non-numeric garbage so the manager's merge
40
+ // keeps the existing/default value instead of storing NaN
41
+ if (n !== undefined)
42
+ out[key] = n;
43
+ }
44
+ return out;
45
+ }
7
46
  // ============ Job Management ============
8
47
  /** Handle Update command - update job data */
9
48
  export async function handleUpdate(cmd, ctx, reqId) {
@@ -122,8 +161,11 @@ export function handleCount(cmd, ctx, reqId) {
122
161
  // ============ Rate Limiting ============
123
162
  /** Handle RateLimit command */
124
163
  export function handleRateLimit(cmd, ctx, reqId) {
125
- ctx.queueManager.setRateLimit(cmd.queue, cmd.limit);
126
- ctx.queueManager.emitDashboardEvent('ratelimit:set', { queue: cmd.queue, max: cmd.limit });
164
+ const limit = toFiniteNumber(cmd.limit);
165
+ if (limit === undefined)
166
+ return resp.error('limit must be a finite number', reqId);
167
+ ctx.queueManager.setRateLimit(cmd.queue, limit);
168
+ ctx.queueManager.emitDashboardEvent('ratelimit:set', { queue: cmd.queue, max: limit });
127
169
  return resp.ok(undefined, reqId);
128
170
  }
129
171
  /** Handle RateLimitClear command */
@@ -134,10 +176,13 @@ export function handleRateLimitClear(cmd, ctx, reqId) {
134
176
  }
135
177
  /** Handle SetConcurrency command */
136
178
  export function handleSetConcurrency(cmd, ctx, reqId) {
137
- ctx.queueManager.setConcurrency(cmd.queue, cmd.limit);
179
+ const limit = toFiniteNumber(cmd.limit);
180
+ if (limit === undefined)
181
+ return resp.error('limit must be a finite number', reqId);
182
+ ctx.queueManager.setConcurrency(cmd.queue, limit);
138
183
  ctx.queueManager.emitDashboardEvent('concurrency:set', {
139
184
  queue: cmd.queue,
140
- concurrency: cmd.limit,
185
+ concurrency: limit,
141
186
  });
142
187
  return resp.ok(undefined, reqId);
143
188
  }
@@ -150,10 +195,11 @@ export function handleClearConcurrency(cmd, ctx, reqId) {
150
195
  // ============ Config Commands ============
151
196
  /** Handle SetStallConfig command */
152
197
  export function handleSetStallConfig(cmd, ctx, reqId) {
153
- ctx.queueManager.setStallConfig(cmd.queue, cmd.config);
198
+ const config = sanitizeConfigNumbers(cmd.config, ['stallInterval', 'maxStalls', 'gracePeriod']);
199
+ ctx.queueManager.setStallConfig(cmd.queue, config);
154
200
  ctx.queueManager.emitDashboardEvent('config:stall-changed', {
155
201
  queue: cmd.queue,
156
- config: cmd.config,
202
+ config,
157
203
  });
158
204
  return resp.ok(undefined, reqId);
159
205
  }
@@ -164,10 +210,16 @@ export function handleGetStallConfig(cmd, ctx, reqId) {
164
210
  }
165
211
  /** Handle SetDlqConfig command */
166
212
  export function handleSetDlqConfig(cmd, ctx, reqId) {
167
- ctx.queueManager.setDlqConfig(cmd.queue, cmd.config);
213
+ const config = sanitizeConfigNumbers(cmd.config, [
214
+ 'autoRetryInterval',
215
+ 'maxAutoRetries',
216
+ 'maxAge',
217
+ 'maxEntries',
218
+ ]);
219
+ ctx.queueManager.setDlqConfig(cmd.queue, config);
168
220
  ctx.queueManager.emitDashboardEvent('config:dlq-changed', {
169
221
  queue: cmd.queue,
170
- config: cmd.config,
222
+ config,
171
223
  });
172
224
  return resp.ok(undefined, reqId);
173
225
  }
@@ -189,7 +189,7 @@ export async function handleAckBatch(cmd, ctx, reqId) {
189
189
  export async function handleFail(cmd, ctx, reqId) {
190
190
  try {
191
191
  const jid = jobId(cmd.id);
192
- await ctx.queueManager.fail(jid, cmd.error, cmd.token);
192
+ await ctx.queueManager.fail(jid, cmd.error, cmd.token, cmd.unrecoverable);
193
193
  // Unregister job from client tracking
194
194
  ctx.queueManager.unregisterClientJob(ctx.clientId, jid);
195
195
  return resp.ok(undefined, reqId);
@@ -40,6 +40,7 @@ export function handleCron(cmd, ctx, reqId) {
40
40
  repeatEvery: cron.repeatEvery,
41
41
  nextRun: cron.nextRun,
42
42
  timezone: cron.timezone,
43
+ priority: cron.priority,
43
44
  },
44
45
  reqId,
45
46
  };
@@ -18,8 +18,13 @@ export function handleAddLog(cmd, ctx, reqId) {
18
18
  }
19
19
  export function handleGetLogs(cmd, ctx, reqId) {
20
20
  const jid = jobId(cmd.id);
21
- const logs = ctx.queueManager.getLogs(jid);
22
- return resp.data({ logs }, reqId);
21
+ const all = ctx.queueManager.getLogs(jid);
22
+ // Honor optional pagination (start/end inclusive) the client already sends.
23
+ const total = all.length;
24
+ const logs = cmd.start === undefined && cmd.end === undefined
25
+ ? all
26
+ : all.slice(cmd.start ?? 0, (cmd.end ?? total - 1) + 1);
27
+ return resp.data({ logs, count: total }, reqId);
23
28
  }
24
29
  // ============ Worker Heartbeat ============
25
30
  export function handleHeartbeat(cmd, ctx, reqId) {
@@ -59,6 +59,17 @@ export function createHttpServer(queueManager, config) {
59
59
  });
60
60
  // Helper to get CORS origin string
61
61
  const getCorsOrigin = () => (corsOrigins.has('*') ? '*' : Array.from(corsOrigins).join(', '));
62
+ // Attach CORS to responses built outside the routeRequest pipeline (health,
63
+ // ready, prometheus, debug) so browser dashboards can read them cross-origin
64
+ // (audit #16-20). Response headers are mutable for normally-constructed
65
+ // Responses; this never overwrites an existing value set by the endpoint.
66
+ const withCors = async (r) => {
67
+ const res = await r;
68
+ if (!res.headers.has('Access-Control-Allow-Origin')) {
69
+ res.headers.set('Access-Control-Allow-Origin', getCorsOrigin());
70
+ }
71
+ return res;
72
+ };
62
73
  // Fetch handler
63
74
  const fetch = async (req, server) => {
64
75
  const url = new URL(req.url);
@@ -69,26 +80,26 @@ export function createHttpServer(queueManager, config) {
69
80
  }
70
81
  // Health endpoints (no auth, no rate limit)
71
82
  if (path === '/health') {
72
- return healthEndpoint(queueManager, wsHandler.size, sseHandler.size);
83
+ return withCors(healthEndpoint(queueManager, wsHandler.size, sseHandler.size));
73
84
  }
74
85
  if (path === '/healthz' || path === '/live') {
75
- return new Response('OK', { status: 200 });
86
+ return withCors(new Response('OK', { status: 200 }));
76
87
  }
77
88
  if (path === '/ready') {
78
- return jsonResponse({ ok: true, ready: true });
89
+ return jsonResponse({ ok: true, ready: true }, 200, corsOrigins);
79
90
  }
80
91
  // Debug endpoints (require auth)
81
92
  if (path === '/gc' && req.method === 'POST') {
82
93
  const denied = checkAuth(req, authTokens);
83
94
  if (denied)
84
95
  return denied;
85
- return gcEndpoint(queueManager);
96
+ return withCors(gcEndpoint(queueManager));
86
97
  }
87
98
  if (path === '/heapstats' && req.method === 'GET') {
88
99
  const denied = checkAuth(req, authTokens);
89
100
  if (denied)
90
101
  return denied;
91
- return heapStatsEndpoint(queueManager);
102
+ return withCors(heapStatsEndpoint(queueManager));
92
103
  }
93
104
  // Rate limiting
94
105
  const clientIp = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ??
@@ -131,7 +142,10 @@ export function createHttpServer(queueManager, config) {
131
142
  return denied;
132
143
  }
133
144
  return new Response(queueManager.getPrometheusMetrics(), {
134
- headers: { 'Content-Type': 'text/plain; version=0.0.4; charset=utf-8' },
145
+ headers: {
146
+ 'Content-Type': 'text/plain; version=0.0.4; charset=utf-8',
147
+ 'Access-Control-Allow-Origin': getCorsOrigin(),
148
+ },
135
149
  });
136
150
  }
137
151
  // Check authentication for other endpoints
@@ -79,6 +79,7 @@ async function routeJobManagement(req, path, method, ctx, cors) {
79
79
  cmd: 'ChangePriority',
80
80
  id: priorityMatch[1],
81
81
  priority: body['priority'],
82
+ lifo: body['lifo'],
82
83
  }, ctx);
83
84
  return jsonResponse(r, r.ok ? 200 : 400, cors);
84
85
  }
@@ -250,7 +251,12 @@ export async function routeJobRoutes(req, path, method, ctx, cors) {
250
251
  const body = await parseJsonBody(req, cors);
251
252
  if (body instanceof Response)
252
253
  return body;
253
- const r = await handleCommand({ cmd: 'ACK', id: ackMatch[1], result: body['result'] }, ctx);
254
+ const r = await handleCommand({
255
+ cmd: 'ACK',
256
+ id: ackMatch[1],
257
+ result: body['result'],
258
+ token: body['token'],
259
+ }, ctx);
254
260
  return jsonResponse(r, r.ok ? 200 : 400, cors);
255
261
  }
256
262
  // POST /jobs/:id/fail
@@ -259,7 +265,13 @@ export async function routeJobRoutes(req, path, method, ctx, cors) {
259
265
  const body = await parseJsonBody(req, cors);
260
266
  if (body instanceof Response)
261
267
  return body;
262
- const r = await handleCommand({ cmd: 'FAIL', id: failMatch[1], error: body['error'] }, ctx);
268
+ const r = await handleCommand({
269
+ cmd: 'FAIL',
270
+ id: failMatch[1],
271
+ error: body['error'],
272
+ token: body['token'],
273
+ unrecoverable: body['unrecoverable'],
274
+ }, ctx);
263
275
  return jsonResponse(r, r.ok ? 200 : 400, cors);
264
276
  }
265
277
  // Delegate to sub-routers
@@ -26,8 +26,23 @@ export async function routeQueueConfigRoutes(req, path, method, ctx, cors) {
26
26
  const dlqMatch = path.match(RE_QUEUE_DLQ);
27
27
  if (dlqMatch && method === 'GET') {
28
28
  const queue = decodeURIComponent(dlqMatch[1]);
29
- const entries = ctx.queueManager.getDlqEntries(queue);
30
- return jsonResponse({ ok: true, entries }, 200, cors);
29
+ const all = ctx.queueManager.getDlqEntries(queue);
30
+ // Optional pagination so a dashboard can page large DLQs. Non-numeric params
31
+ // are ignored (treated as absent) rather than producing an empty/garbage slice.
32
+ const params = new URL(req.url).searchParams;
33
+ const toInt = (v) => {
34
+ if (v === null)
35
+ return undefined;
36
+ const n = Number(v);
37
+ return Number.isFinite(n) ? Math.trunc(n) : undefined;
38
+ };
39
+ const limit = toInt(params.get('limit'));
40
+ const offset = toInt(params.get('offset'));
41
+ const start = Math.max(0, offset ?? 0);
42
+ const entries = limit === undefined && offset === undefined
43
+ ? all
44
+ : all.slice(start, start + (limit !== undefined ? Math.max(0, limit) : all.length));
45
+ return jsonResponse({ ok: true, entries, total: all.length }, 200, cors);
31
46
  }
32
47
  // POST /queues/:queue/dlq/retry
33
48
  const dlqRetryMatch = path.match(RE_QUEUE_DLQ_RETRY);
@@ -79,7 +94,8 @@ export async function routeQueueConfigRoutes(req, path, method, ctx, cors) {
79
94
  const r = await handleCommand({
80
95
  cmd: 'SetConcurrency',
81
96
  queue,
82
- limit: body['limit'],
97
+ // Accept the natural `concurrency` field for this endpoint as well as `limit`.
98
+ limit: (body['concurrency'] ?? body['limit']),
83
99
  }, ctx);
84
100
  return jsonResponse(r, 200, cors);
85
101
  }
@@ -105,7 +105,19 @@ async function routeJobOps(req, path, method, ctx, cors) {
105
105
  if (listMatch && method === 'GET') {
106
106
  const queue = decodeURIComponent(listMatch[1]);
107
107
  const url = new URL(req.url);
108
- const stateValues = url.searchParams.getAll('state');
108
+ // Accept `state`, `status` (dashboard/REST convention), and `states` as
109
+ // aliases, each repeatable and comma-separated. Previously only `state` was
110
+ // read, so `?status=failed` silently fell through to an unfiltered list and
111
+ // returned the whole queue (#95). A state name never contains a comma, so
112
+ // splitting is safe.
113
+ const stateValues = [
114
+ ...url.searchParams.getAll('state'),
115
+ ...url.searchParams.getAll('status'),
116
+ ...url.searchParams.getAll('states'),
117
+ ]
118
+ .flatMap((v) => v.split(','))
119
+ .map((s) => s.trim())
120
+ .filter(Boolean);
109
121
  const state = stateValues.length === 0
110
122
  ? undefined
111
123
  : stateValues.length === 1
@@ -40,6 +40,10 @@ export async function routeResourceRoutes(req, path, method, ctx, cors) {
40
40
  uniqueKey: body['uniqueKey'],
41
41
  dedup: body['dedup'],
42
42
  skipMissedOnRestart: body['skipMissedOnRestart'],
43
+ immediately: body['immediately'],
44
+ skipIfNoWorker: body['skipIfNoWorker'],
45
+ preventOverlap: body['preventOverlap'],
46
+ jobOptions: body['jobOptions'],
43
47
  }, ctx);
44
48
  return jsonResponse(r, r.ok ? 200 : 400, cors);
45
49
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bunqueue",
3
- "version": "2.8.5",
3
+ "version": "2.8.7",
4
4
  "description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
5
5
  "type": "module",
6
6
  "main": "dist/main.js",