bunqueue 2.8.12 → 2.8.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/application/backgroundTasks.d.ts +1 -0
  2. package/dist/application/backgroundTasks.js +6 -1
  3. package/dist/application/cleanupTasks.js +5 -0
  4. package/dist/application/clientTracking.js +11 -0
  5. package/dist/application/contextFactory.d.ts +4 -2
  6. package/dist/application/contextFactory.js +5 -0
  7. package/dist/application/dependencyProcessor.js +9 -2
  8. package/dist/application/lockManager.js +16 -1
  9. package/dist/application/operations/ack.d.ts +3 -1
  10. package/dist/application/operations/ack.js +5 -0
  11. package/dist/application/operations/ackHelpers.d.ts +3 -1
  12. package/dist/application/operations/ackHelpers.js +5 -0
  13. package/dist/application/operations/jobManagement.js +10 -0
  14. package/dist/application/operations/push.d.ts +4 -0
  15. package/dist/application/operations/push.js +26 -2
  16. package/dist/application/operations/queryOperations.js +8 -2
  17. package/dist/application/queueManager.d.ts +3 -0
  18. package/dist/application/queueManager.js +59 -3
  19. package/dist/application/types.d.ts +5 -3
  20. package/dist/client/bunqueue.d.ts +1 -1
  21. package/dist/client/queue/operations/counts.d.ts +7 -2
  22. package/dist/client/queue/operations/counts.js +7 -10
  23. package/dist/client/queue/queue.d.ts +1 -1
  24. package/dist/client/tcpPool.d.ts +7 -0
  25. package/dist/client/tcpPool.js +11 -0
  26. package/dist/client/worker/processor.js +12 -11
  27. package/dist/client/worker/processorHandlers.d.ts +11 -0
  28. package/dist/client/worker/processorHandlers.js +23 -1
  29. package/dist/client/worker/worker.d.ts +11 -0
  30. package/dist/client/worker/worker.js +88 -4
  31. package/dist/infrastructure/persistence/sqlite.d.ts +9 -2
  32. package/dist/infrastructure/persistence/sqlite.js +37 -6
  33. package/dist/infrastructure/server/handlers/core.js +6 -1
  34. package/package.json +1 -1
@@ -26,5 +26,6 @@ export declare function startBackgroundTasks(ctx: BackgroundContext, cronSchedul
26
26
  * Stop all background tasks
27
27
  */
28
28
  export declare function stopBackgroundTasks(handles: BackgroundTaskHandles): void;
29
+ export declare function checkJobTimeouts(ctx: BackgroundContext): void;
29
30
  export declare function recover(ctx: BackgroundContext): void;
30
31
  export { processPendingDependencies };
@@ -108,7 +108,7 @@ function getLockContext(ctx) {
108
108
  };
109
109
  }
110
110
  // ============ Job Timeouts ============
111
- function checkJobTimeouts(ctx) {
111
+ export function checkJobTimeouts(ctx) {
112
112
  const now = Date.now();
113
113
  for (const procShard of ctx.processingShards) {
114
114
  for (const [jobId, job] of procShard) {
@@ -118,6 +118,11 @@ function checkJobTimeouts(ctx) {
118
118
  queue: job.queue,
119
119
  timeout: job.timeout,
120
120
  });
121
+ // Mark as timed-out BEFORE requeuing for retry, so a late ACK from the
122
+ // (still-hung) worker that exceeded the deadline is discarded instead of
123
+ // phantom-completing the job and skipping the retry. See ack-recovery in
124
+ // queueManager.ack (timedOutJobs guard).
125
+ ctx.timedOutJobs?.add(jobId);
121
126
  ctx.fail(jobId, 'Job timeout exceeded').catch((err) => {
122
127
  queueLog.error('Failed to mark timed out job as failed', {
123
128
  jobId: String(jobId),
@@ -221,6 +221,11 @@ function cleanEmptyQueues(ctx) {
221
221
  shard.clearQueueLimiters(queueName);
222
222
  shard.stallConfig.delete(queueName);
223
223
  shard.dlqConfig.delete(queueName);
224
+ // NOTE: perQueueMetrics is intentionally NOT pruned here — it is an
225
+ // LRU-bounded map and these counters are cumulative, so they must survive
226
+ // a transient drain (a busy queue momentarily empty must not reset to 0).
227
+ // obliterate() reclaims it explicitly; the LRU cap bounds growth for
228
+ // ephemeral/dynamically-named queues.
224
229
  ctx.dashboardEmit?.('queue:removed', { queue: queueName });
225
230
  ctx.unregisterQueueName(queueName);
226
231
  }
@@ -47,6 +47,17 @@ export async function releaseClientJobs(clientId, ctx) {
47
47
  const loc = ctx.jobIndex.get(jobId);
48
48
  if (loc?.type !== 'processing')
49
49
  continue;
50
+ // A job whose lock has been renewed since pull (renewalCount > 0) is being
51
+ // actively heartbeated by a live worker. With a pooled client, heartbeats
52
+ // travel on a DIFFERENT connection than the one that pulled, so THIS socket
53
+ // closing does not mean the worker died — re-queuing here would re-dispatch
54
+ // (double-execute) a job the worker still holds. Leave it; lock expiry /
55
+ // stall detection reclaims it if the worker truly stops heartbeating.
56
+ // A never-renewed lock (renewalCount === 0) keeps the original fast-recovery
57
+ // behavior: requeue immediately on disconnect.
58
+ const lock = ctx.jobLocks.get(jobId);
59
+ if (lock && lock.renewalCount > 0)
60
+ continue;
50
61
  const procIdx = loc.shardIdx;
51
62
  const job = ctx.processingShards[procIdx].get(jobId);
52
63
  if (!job)
@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
7
7
  import type { Shard } from '../domain/queue/shard';
8
8
  import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
9
9
  import type { RWLock } from '../shared/lock';
10
- import type { LRUMap, BoundedSet, BoundedMap } from '../shared/lru';
10
+ import type { LRUMap, BoundedSet, BoundedMap, MapLike } from '../shared/lru';
11
11
  import type { WebhookManager } from './webhookManager';
12
12
  import type { WorkerManager } from './workerManager';
13
13
  import type { EventsManager } from './eventsManager';
@@ -33,6 +33,8 @@ export interface ContextDependencies {
33
33
  jobIndex: Map<JobId, JobLocation>;
34
34
  completedJobs: BoundedSet<JobId>;
35
35
  completedJobsData: BoundedMap<JobId, Job>;
36
+ depCompletions?: BoundedSet<JobId>;
37
+ timedOutJobs?: BoundedSet<JobId>;
36
38
  jobResults: LRUMap<JobId, unknown>;
37
39
  customIdMap: LRUMap<string, JobId>;
38
40
  jobLogs: LRUMap<JobId, JobLogEntry[]>;
@@ -62,7 +64,7 @@ export interface ContextDependencies {
62
64
  };
63
65
  startTime: number;
64
66
  maxLogsPerJob: number;
65
- perQueueMetrics: Map<string, {
67
+ perQueueMetrics: MapLike<string, {
66
68
  totalCompleted: bigint;
67
69
  totalFailed: bigint;
68
70
  }>;
@@ -34,6 +34,8 @@ export class ContextFactory {
34
34
  processingLocks: this.deps.processingLocks,
35
35
  jobIndex: this.deps.jobIndex,
36
36
  completedJobs: this.deps.completedJobs,
37
+ depCompletions: this.deps.depCompletions,
38
+ timedOutJobs: this.deps.timedOutJobs,
37
39
  jobResults: this.deps.jobResults,
38
40
  customIdMap: this.deps.customIdMap,
39
41
  jobLogs: this.deps.jobLogs,
@@ -81,6 +83,8 @@ export class ContextFactory {
81
83
  shardLocks: this.deps.shardLocks,
82
84
  completedJobs: this.deps.completedJobs,
83
85
  completedJobsData: this.deps.completedJobsData,
86
+ depCompletions: this.deps.depCompletions,
87
+ timedOutJobs: this.deps.timedOutJobs,
84
88
  jobResults: this.deps.jobResults,
85
89
  customIdMap: this.deps.customIdMap,
86
90
  jobIndex: this.deps.jobIndex,
@@ -111,6 +115,7 @@ export class ContextFactory {
111
115
  processingLocks: this.deps.processingLocks,
112
116
  completedJobs: this.deps.completedJobs,
113
117
  completedJobsData: this.deps.completedJobsData,
118
+ depCompletions: this.deps.depCompletions,
114
119
  jobResults: this.deps.jobResults,
115
120
  jobIndex: this.deps.jobIndex,
116
121
  customIdMap: this.deps.customIdMap,
@@ -37,10 +37,12 @@ export async function processPendingDependencies(ctx) {
37
37
  await withWriteLock(ctx.shardLocks[i], () => {
38
38
  const shard = ctx.shards[i];
39
39
  const jobsToPromote = [];
40
- // Check which jobs have all dependencies satisfied (inside lock)
40
+ // Check which jobs have all dependencies satisfied (inside lock).
41
+ // A removeOnComplete parent is not in completedJobs (its full record was
42
+ // dropped to bound memory), so also honor its bare-id depCompletions entry.
41
43
  for (const jobId of jobIdsToCheck) {
42
44
  const job = shard.waitingDeps.get(jobId);
43
- if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep))) {
45
+ if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep) || (ctx.depCompletions?.has(dep) ?? false))) {
44
46
  jobsToPromote.push(job);
45
47
  }
46
48
  }
@@ -50,6 +52,11 @@ export async function processPendingDependencies(ctx) {
50
52
  }
51
53
  });
52
54
  }));
55
+ // NOTE: depCompletions is intentionally NOT pruned here. It is a FIFO
56
+ // BoundedSet (same cap as completedJobs), so it self-bounds. Pruning eagerly
57
+ // once "no waiters remain" would orphan a dependent pushed AFTER a
58
+ // removeOnComplete parent completed — exactly the symmetry completedJobs
59
+ // provides for normal parents (readiness holds for the whole bounded window).
53
60
  }
54
61
  /** Move jobs from waitingDeps to the active queue */
55
62
  function promoteJobsToQueue(jobsToPromote, shard, ctx, shardIdx) {
@@ -111,8 +111,19 @@ function processExpiredLockInner(jobId, lock, job, shardIdx, procIdx, ctx, now)
111
111
  /** Move job to DLQ when max stalls exceeded */
112
112
  function handleMaxStallsExceeded(opts) {
113
113
  const { jobId, job, lock, shard, ctx, now } = opts;
114
- shard.addToDlq(job, "stalled" /* FailureReason.Stalled */, `Lock expired after ${lock.renewalCount} renewals`);
114
+ // Release the concurrency slot (+group+uniqueKey) acquired at pull before
115
+ // moving to DLQ — otherwise the slot leaks (mirrors
116
+ // stallDetection.moveStalliedJobToDlq).
117
+ shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
118
+ const entry = shard.addToDlq(job, "stalled" /* FailureReason.Stalled */, `Lock expired after ${lock.renewalCount} renewals`);
115
119
  ctx.jobIndex.set(jobId, { type: 'dlq', queueName: job.queue });
120
+ // Persist the DLQ move like the sibling paths (ack.moveFailedJobToDlq,
121
+ // stallDetection.moveStalliedJobToDlq, backgroundTasks startup-recovery).
122
+ // Without these two writes the jobs row survives in SQLite as an orphan and
123
+ // the DLQ entry lives only in memory — a later retry then re-INSERTs the
124
+ // surviving row and throws `UNIQUE constraint failed: jobs.id` (#97).
125
+ ctx.storage?.saveDlqEntry(entry);
126
+ ctx.storage?.deleteJob(jobId);
116
127
  ctx.eventsManager.broadcast({
117
128
  eventType: "failed" /* EventType.Failed */,
118
129
  jobId,
@@ -125,6 +136,10 @@ function handleMaxStallsExceeded(opts) {
125
136
  function requeueExpiredJob(opts) {
126
137
  const { jobId, job, queue, idx, ctx, now } = opts;
127
138
  const shard = ctx.shards[idx];
139
+ // Release the concurrency slot (+group+uniqueKey) acquired at pull before
140
+ // re-pushing — otherwise the slot leaks and the queue wedges (mirrors
141
+ // stallDetection.retryStalliedJob).
142
+ shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
128
143
  queue.push(job);
129
144
  const isDelayed = job.runAt > now;
130
145
  shard.incrementQueued(jobId, isDelayed, job.createdAt, job.queue, job.runAt);
@@ -17,6 +17,8 @@ export interface AckContext {
17
17
  processingLocks: RWLock[];
18
18
  completedJobs: SetLike<JobId>;
19
19
  completedJobsData: MapLike<JobId, Job>;
20
+ /** Bare completion ids for removeOnComplete jobs so dependents can unblock */
21
+ depCompletions?: SetLike<JobId>;
20
22
  jobResults: MapLike<JobId, unknown>;
21
23
  jobIndex: Map<JobId, JobLocation>;
22
24
  customIdMap?: MapLike<string, JobId>;
@@ -26,7 +28,7 @@ export interface AckContext {
26
28
  totalFailed: {
27
29
  value: bigint;
28
30
  };
29
- perQueueMetrics?: Map<string, {
31
+ perQueueMetrics?: MapLike<string, {
30
32
  totalCompleted: bigint;
31
33
  totalFailed: bigint;
32
34
  }>;
@@ -50,6 +50,11 @@ export async function ackJob(jobId, result, ctx) {
50
50
  else {
51
51
  ctx.jobIndex.delete(jobId);
52
52
  ctx.storage?.deleteJob(jobId);
53
+ // removeOnComplete drops the full job (index + data + persisted row) to bound
54
+ // memory, but dependent jobs gate readiness on completedJobs.has(parentId).
55
+ // Record the bare completion id (no payload) so dependents still unblock,
56
+ // without making the job appear in state/stats queries.
57
+ ctx.depCompletions?.add(jobId);
53
58
  }
54
59
  ctx.totalCompleted.value++;
55
60
  if (ctx.perQueueMetrics) {
@@ -55,13 +55,15 @@ export interface FinalizeContext {
55
55
  storage: SqliteStorage | null;
56
56
  completedJobs: SetLike<JobId>;
57
57
  completedJobsData: MapLike<JobId, Job>;
58
+ /** Bare completion ids for removeOnComplete jobs so dependents can unblock */
59
+ depCompletions?: SetLike<JobId>;
58
60
  jobResults: MapLike<JobId, unknown>;
59
61
  jobIndex: Map<JobId, JobLocation>;
60
62
  customIdMap?: MapLike<string, JobId>;
61
63
  totalCompleted: {
62
64
  value: bigint;
63
65
  };
64
- perQueueMetrics?: Map<string, {
66
+ perQueueMetrics?: MapLike<string, {
65
67
  totalCompleted: bigint;
66
68
  totalFailed: bigint;
67
69
  }>;
@@ -164,6 +164,11 @@ export function finalizeBatchAck(extractedJobs, ctx, includeResults) {
164
164
  ctx.jobIndex.delete(jobId);
165
165
  if (hasStorage)
166
166
  storage.deleteJob(jobId);
167
+ // removeOnComplete drops the full job to bound memory, but dependents gate
168
+ // readiness on completedJobs.has(parentId). Record the bare completion id
169
+ // (no payload) so dependent jobs still unblock, without surfacing the job
170
+ // in state/stats queries.
171
+ ctx.depCompletions?.add(jobId);
167
172
  }
168
173
  }
169
174
  // Broadcast events
@@ -23,6 +23,16 @@ export async function cancelJob(jobId, ctx) {
23
23
  ctx.storage?.deleteJob(jobId);
24
24
  return { success: true, queueName: location.queueName };
25
25
  }
26
+ // Not in the run queue — it may be parked in waitingChildren (moved via
27
+ // moveToWaitingChildren, which already released its resources and does not
28
+ // track it in the queued counter, so do NOT decrement/release here).
29
+ const parked = shard.waitingChildren.get(jobId);
30
+ if (parked) {
31
+ shard.waitingChildren.delete(jobId);
32
+ ctx.jobIndex.delete(jobId);
33
+ ctx.storage?.deleteJob(jobId);
34
+ return { success: true, queueName: location.queueName };
35
+ }
26
36
  return { success: false, queueName: location.queueName };
27
37
  });
28
38
  if (result.success) {
@@ -15,6 +15,10 @@ export interface PushContext {
15
15
  shardLocks: RWLock[];
16
16
  completedJobs: SetLike<JobId>;
17
17
  completedJobsData: MapLike<JobId, Job>;
18
+ /** Bare completion ids for removeOnComplete jobs so dependents start ready */
19
+ depCompletions?: SetLike<JobId>;
20
+ /** Timeout markers — cleared on custom-id reuse so a recycled id starts clean */
21
+ timedOutJobs?: SetLike<JobId>;
18
22
  jobResults: MapLike<JobId, unknown>;
19
23
  customIdMap: MapLike<string, JobId>;
20
24
  jobIndex: Map<JobId, JobLocation>;
@@ -40,6 +40,12 @@ function handleCustomId(input, shard, ctx) {
40
40
  ctx.jobIndex.delete(id);
41
41
  ctx.storage?.deleteJob(id); // removes the surviving row + result + any buffered insert
42
42
  }
43
+ // A recycled custom id may carry a stale timeout marker from a prior job (which
44
+ // may have DLQ'd, so it is NOT in completedJobs above). Clear it so the new
45
+ // job's stall-retry recovery is not wrongly discarded — otherwise the
46
+ // timeout-resurrection guard would reintroduce #33/#75 duplicate execution for
47
+ // reused ids.
48
+ ctx.timedOutJobs?.delete(id);
43
49
  ctx.customIdMap.set(input.customId, id);
44
50
  return { skip: false, id };
45
51
  }
@@ -115,7 +121,8 @@ function handleDeduplication(job, input, queue, shard, ctx) {
115
121
  */
116
122
  function insertJobToShard(job, queue, shard, shardIdx, ctx) {
117
123
  const hasDeps = job.dependsOn.length > 0;
118
- const needsWaiting = hasDeps && !job.dependsOn.every((depId) => ctx.completedJobs.has(depId));
124
+ const needsWaiting = hasDeps &&
125
+ !job.dependsOn.every((depId) => ctx.completedJobs.has(depId) || (ctx.depCompletions?.has(depId) ?? false));
119
126
  const now = Date.now();
120
127
  if (needsWaiting) {
121
128
  shard.waitingDeps.set(job.id, job);
@@ -201,6 +208,10 @@ export async function pushJobBatch(queue, inputs, ctx) {
201
208
  const idx = shardIndex(queue);
202
209
  const resultIds = [];
203
210
  const jobsToInsert = [];
211
+ // Jobs flagged durable must bypass the 10ms write buffer (immediate fsync
212
+ // path), exactly like a single durable push — otherwise addBulk silently
213
+ // downgrades the documented "durable" guarantee.
214
+ const durableJobs = [];
204
215
  await withWriteLock(ctx.shardLocks[idx], () => {
205
216
  const shard = ctx.shards[idx];
206
217
  for (const input of inputs) {
@@ -220,6 +231,8 @@ export async function pushJobBatch(queue, inputs, ctx) {
220
231
  // Insert to shard
221
232
  insertJobToShard(job, queue, shard, idx, ctx);
222
233
  jobsToInsert.push(job);
234
+ if (input.durable)
235
+ durableJobs.push(job);
223
236
  resultIds.push(job.id);
224
237
  }
225
238
  if (jobsToInsert.length > 0) {
@@ -227,7 +240,18 @@ export async function pushJobBatch(queue, inputs, ctx) {
227
240
  }
228
241
  });
229
242
  if (jobsToInsert.length > 0) {
230
- ctx.storage?.insertJobsBatch(jobsToInsert);
243
+ if (durableJobs.length === 0) {
244
+ ctx.storage?.insertJobsBatch(jobsToInsert);
245
+ }
246
+ else {
247
+ // Durable jobs bypass the write buffer (immediate disk write); the rest
248
+ // still go through the batched buffer for throughput.
249
+ const durableSet = new Set(durableJobs);
250
+ const buffered = jobsToInsert.filter((j) => !durableSet.has(j));
251
+ if (buffered.length > 0)
252
+ ctx.storage?.insertJobsBatch(buffered);
253
+ ctx.storage?.insertJobsBatch(durableJobs, true);
254
+ }
231
255
  ctx.totalPushed.value += BigInt(jobsToInsert.length);
232
256
  throughputTracker.pushRate.increment(jobsToInsert.length);
233
257
  for (const job of jobsToInsert) {
@@ -24,7 +24,10 @@ export async function getJob(jobId, ctx) {
24
24
  case 'queue': {
25
25
  return await withReadLock(ctx.shardLocks[location.shardIdx], () => {
26
26
  const shard = ctx.shards[location.shardIdx];
27
- return (shard.getQueue(location.queueName).find(jobId) ?? shard.waitingDeps.get(jobId) ?? null);
27
+ return (shard.getQueue(location.queueName).find(jobId) ??
28
+ shard.waitingDeps.get(jobId) ??
29
+ shard.waitingChildren.get(jobId) ??
30
+ null);
28
31
  });
29
32
  }
30
33
  case 'processing': {
@@ -63,7 +66,10 @@ export function getJobByCustomId(customId, ctx) {
63
66
  return null;
64
67
  if (location.type === 'queue') {
65
68
  const shard = ctx.shards[location.shardIdx];
66
- return shard.getQueue(location.queueName).find(jobId) ?? shard.waitingDeps.get(jobId) ?? null;
69
+ return (shard.getQueue(location.queueName).find(jobId) ??
70
+ shard.waitingDeps.get(jobId) ??
71
+ shard.waitingChildren.get(jobId) ??
72
+ null);
67
73
  }
68
74
  if (location.type === 'processing') {
69
75
  return ctx.processingShards[location.shardIdx].get(jobId) ?? null;
@@ -28,6 +28,8 @@ export declare class QueueManager {
28
28
  private readonly jobIndex;
29
29
  private readonly completedJobs;
30
30
  private readonly completedJobsData;
31
+ private readonly depCompletions;
32
+ private readonly timedOutJobs;
31
33
  private readonly jobResults;
32
34
  private readonly customIdMap;
33
35
  private readonly jobLogs;
@@ -227,6 +229,7 @@ export declare class QueueManager {
227
229
  unregisterWorkersByClientId(clientId: string): number;
228
230
  getJobIndex(): Map<JobId, JobLocation>;
229
231
  getCompletedJobs(): SetLike<JobId>;
232
+ getDepCompletions(): SetLike<JobId>;
230
233
  getShards(): Shard[];
231
234
  private onJobCompleted;
232
235
  /** Check if completing this job completes an entire flow */
@@ -45,6 +45,16 @@ export class QueueManager {
45
45
  jobIndex = new Map();
46
46
  completedJobs;
47
47
  completedJobsData;
48
+ // Bare completion ids of removeOnComplete jobs — kept ONLY so dependent jobs
49
+ // can unblock (no payload, not surfaced in state/stats). Bounded like
50
+ // completedJobs; entries are NOT pruned eagerly — FIFO eviction bounds the set.
51
+ depCompletions;
52
+ // Ids of jobs failed by the timeout sweep. A late ACK whose lock token no
53
+ // longer matches (the job was requeued for retry) is discarded for these,
54
+ // instead of phantom-completing the job and skipping the retry. Bounded;
55
+ // explicit clearing is needed only on custom-id reuse: a legit retry ACK carries a valid
56
+ // current token and bypasses the stale-token recovery path entirely.
57
+ timedOutJobs;
48
58
  jobResults;
49
59
  customIdMap;
50
60
  jobLogs;
@@ -83,7 +93,13 @@ export class QueueManager {
83
93
  totalCompleted: { value: 0n },
84
94
  totalFailed: { value: 0n },
85
95
  };
86
- perQueueMetrics = new Map();
96
+ // LRU-bounded so high-cardinality / dynamically-named queues cannot grow it
97
+ // without bound. Live queues stay resident (recently accessed on every
98
+ // ack/fail); only long-idle ephemeral names are evicted. obliterate() also
99
+ // deletes the entry explicitly. Reclaiming on a transient drain is avoided
100
+ // on purpose so cumulative per-queue counters survive idle periods.
101
+ // Assigned in the constructor (needs this.config).
102
+ perQueueMetrics;
87
103
  startTime = Date.now();
88
104
  // Background task handles
89
105
  backgroundTaskHandles;
@@ -102,6 +118,9 @@ export class QueueManager {
102
118
  this.jobIndex.delete(jobId);
103
119
  this.completedJobsData.delete(jobId);
104
120
  });
121
+ this.depCompletions = new BoundedSet(this.config.maxCompletedJobs);
122
+ this.timedOutJobs = new BoundedSet(this.config.maxCompletedJobs);
123
+ this.perQueueMetrics = new LRUMap(this.config.maxCustomIds);
105
124
  this.jobResults = new LRUMap(this.config.maxJobResults);
106
125
  this.customIdMap = new LRUMap(this.config.maxCustomIds);
107
126
  this.jobLogs = new LRUMap(this.config.maxJobLogs);
@@ -154,6 +173,8 @@ export class QueueManager {
154
173
  jobIndex: this.jobIndex,
155
174
  completedJobs: this.completedJobs,
156
175
  completedJobsData: this.completedJobsData,
176
+ depCompletions: this.depCompletions,
177
+ timedOutJobs: this.timedOutJobs,
157
178
  jobResults: this.jobResults,
158
179
  customIdMap: this.customIdMap,
159
180
  jobLogs: this.jobLogs,
@@ -265,6 +286,13 @@ export class QueueManager {
265
286
  // Job may have been stall-retried to queue while we processed it.
266
287
  // Complete it from queue to prevent duplicate execution (Issue #33).
267
288
  if (loc?.type === 'queue') {
289
+ // BUT a job failed by the timeout sweep is requeued for RETRY — a late
290
+ // ACK from the timed-out worker must not complete it (that would skip
291
+ // the retry and silently override the timeout). Discard it gracefully.
292
+ if (this.timedOutJobs.has(jobId)) {
293
+ lockMgr.releaseLock(jobId, lockCtx, token);
294
+ return;
295
+ }
268
296
  await this.completeStallRetriedJob(jobId, result);
269
297
  lockMgr.releaseLock(jobId, lockCtx, token);
270
298
  }
@@ -281,6 +309,13 @@ export class QueueManager {
281
309
  // Without token: only if job was stall-retried (attempts > 0), to avoid
282
310
  // completing freshly-pushed jobs that were never pulled.
283
311
  if (err instanceof Error && err.message.includes('not found')) {
312
+ // A timeout-failed job requeued for retry must not be completed by a
313
+ // stale ACK from the timed-out worker — discard it so the retry wins.
314
+ if (this.timedOutJobs.has(jobId)) {
315
+ if (token)
316
+ lockMgr.releaseLock(jobId, lockCtx, token);
317
+ return;
318
+ }
284
319
  const shouldRecover = token ?? this.isStallRetried(jobId);
285
320
  if (shouldRecover && (await this.completeStallRetriedJob(jobId, result))) {
286
321
  if (token)
@@ -306,7 +341,11 @@ export class QueueManager {
306
341
  // from the queue to prevent duplicate execution.
307
342
  const loc = this.jobIndex.get(jobIds[i]);
308
343
  if (loc?.type === 'queue') {
309
- await this.completeStallRetriedJob(jobIds[i], undefined);
344
+ // Skip completion for a timeout-requeued job (retry must win); else
345
+ // recover the stall-retried job to prevent duplicate execution (#75).
346
+ if (!this.timedOutJobs.has(jobIds[i])) {
347
+ await this.completeStallRetriedJob(jobIds[i], undefined);
348
+ }
310
349
  lockMgr.releaseLock(jobIds[i], lockCtx, t);
311
350
  }
312
351
  continue;
@@ -344,7 +383,11 @@ export class QueueManager {
344
383
  // from the queue to prevent duplicate execution.
345
384
  const loc = this.jobIndex.get(item.id);
346
385
  if (loc?.type === 'queue') {
347
- await this.completeStallRetriedJob(item.id, item.result);
386
+ // Skip completion for a timeout-requeued job (retry must win); else
387
+ // recover the stall-retried job to prevent duplicate execution (#75).
388
+ if (!this.timedOutJobs.has(item.id)) {
389
+ await this.completeStallRetriedJob(item.id, item.result);
390
+ }
348
391
  lockMgr.releaseLock(item.id, lockCtx, item.token);
349
392
  }
350
393
  continue;
@@ -688,6 +731,11 @@ export class QueueManager {
688
731
  }
689
732
  for (const cid of customIdsToDelete)
690
733
  this.customIdMap.delete(cid);
734
+ // Per-queue cumulative counters are keyed by queue name and never expire on
735
+ // their own; obliterate is the documented way to reclaim ALL state for a
736
+ // queue, so drop its metrics entry too (prevents unbounded growth for
737
+ // ephemeral/dynamically-named queues).
738
+ this.perQueueMetrics.delete(queue);
691
739
  this.unregisterQueueName(queue);
692
740
  this.dashboardEmit?.('queue:obliterated', { queue });
693
741
  this.dashboardEmit?.('queue:removed', { queue });
@@ -993,6 +1041,12 @@ export class QueueManager {
993
1041
  getCompletedJobs() {
994
1042
  return this.completedJobs;
995
1043
  }
1044
+ // Bare completion ids of removeOnComplete jobs. The PUSH gate consults this so a
1045
+ // late dependent on an evicted removeOnComplete parent is admitted (same window
1046
+ // the readiness path / dependency processor already honor).
1047
+ getDepCompletions() {
1048
+ return this.depCompletions;
1049
+ }
996
1050
  getShards() {
997
1051
  return this.shards;
998
1052
  }
@@ -1421,6 +1475,8 @@ export class QueueManager {
1421
1475
  this.jobIndex.clear();
1422
1476
  this.completedJobs.clear();
1423
1477
  this.completedJobsData.clear();
1478
+ this.depCompletions.clear();
1479
+ this.timedOutJobs.clear();
1424
1480
  this.jobResults.clear();
1425
1481
  this.jobLogs.clear();
1426
1482
  this.customIdMap.clear();
@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
7
7
  import type { Shard } from '../domain/queue/shard';
8
8
  import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
9
9
  import type { RWLock } from '../shared/lock';
10
- import type { LRUMap, BoundedSet, BoundedMap, SetLike } from '../shared/lru';
10
+ import type { LRUMap, BoundedSet, BoundedMap, SetLike, MapLike } from '../shared/lru';
11
11
  import type { EventsManager } from './eventsManager';
12
12
  import type { WebhookManager } from './webhookManager';
13
13
  import type { WorkerManager } from './workerManager';
@@ -76,7 +76,7 @@ export interface QueueManagerState {
76
76
  };
77
77
  };
78
78
  readonly startTime: number;
79
- readonly perQueueMetrics: Map<string, {
79
+ readonly perQueueMetrics: MapLike<string, {
80
80
  totalCompleted: bigint;
81
81
  totalFailed: bigint;
82
82
  }>;
@@ -103,6 +103,8 @@ export interface BackgroundContext extends QueueManagerState {
103
103
  workerManager: WorkerManager;
104
104
  monitoringState: MonitoringState;
105
105
  completedJobsData: BoundedMap<JobId, Job>;
106
+ depCompletions?: BoundedSet<JobId>;
107
+ timedOutJobs?: BoundedSet<JobId>;
106
108
  }
107
109
  /** Context for stats operations */
108
110
  export interface StatsContext {
@@ -132,7 +134,7 @@ export interface StatsContext {
132
134
  };
133
135
  };
134
136
  startTime: number;
135
- perQueueMetrics?: Map<string, {
137
+ perQueueMetrics?: MapLike<string, {
136
138
  totalCompleted: bigint;
137
139
  totalFailed: bigint;
138
140
  }>;
@@ -37,7 +37,7 @@ export declare class Bunqueue<T = unknown, R = unknown> {
37
37
  opts?: JobOptions;
38
38
  }>): Promise<Job<T>[]>;
39
39
  getJob(id: string): Promise<Job<T> | null>;
40
- getJobCounts(): import("./queue/operations").JobCounts;
40
+ getJobCounts(): import("./queue/operations").JobCounts | Promise<import("./queue/operations").JobCounts>;
41
41
  getJobCountsAsync(): Promise<import("./queue/operations").JobCounts>;
42
42
  count(): number;
43
43
  countAsync(): Promise<number>;
@@ -17,8 +17,13 @@ export interface JobCounts {
17
17
  delayed: number;
18
18
  paused: number;
19
19
  }
20
- /** Get job counts (sync, embedded only) */
21
- export declare function getJobCounts(ctx: CountsContext): JobCounts;
20
+ /**
21
+ * Get job counts.
22
+ * Embedded mode returns synchronously. TCP mode delegates to the async path so
23
+ * callers receive the REAL server-side counts (awaitable Promise) instead of
24
+ * hardcoded zeros — defect: getjobcounts-tcp-zero.
25
+ */
26
+ export declare function getJobCounts(ctx: CountsContext): JobCounts | Promise<JobCounts>;
22
27
  /** Get job counts (async, works with TCP) */
23
28
  export declare function getJobCountsAsync(ctx: CountsContext): Promise<JobCounts>;
24
29
  /** Get waiting job count */
@@ -5,18 +5,15 @@
5
5
  */
6
6
  import { getSharedManager } from '../../manager';
7
7
  import { pausedView } from '../../../shared/pausedView';
8
- /** Get job counts (sync, embedded only) */
8
+ /**
9
+ * Get job counts.
10
+ * Embedded mode returns synchronously. TCP mode delegates to the async path so
11
+ * callers receive the REAL server-side counts (awaitable Promise) instead of
12
+ * hardcoded zeros — defect: getjobcounts-tcp-zero.
13
+ */
9
14
  export function getJobCounts(ctx) {
10
15
  if (!ctx.embedded) {
11
- return {
12
- waiting: 0,
13
- prioritized: 0,
14
- active: 0,
15
- completed: 0,
16
- failed: 0,
17
- delayed: 0,
18
- paused: 0,
19
- };
16
+ return getJobCountsAsync(ctx);
20
17
  }
21
18
  const manager = getSharedManager();
22
19
  // Use queue-specific counts
@@ -62,7 +62,7 @@ export declare class Queue<T = unknown> {
62
62
  getCompletedAsync(start?: number, end?: number): Promise<Job<T>[]>;
63
63
  getFailed(start?: number, end?: number): Job<T>[];
64
64
  getFailedAsync(start?: number, end?: number): Promise<Job<T>[]>;
65
- getJobCounts(): countsOps.JobCounts;
65
+ getJobCounts(): countsOps.JobCounts | Promise<countsOps.JobCounts>;
66
66
  getJobCountsAsync(): Promise<countsOps.JobCounts>;
67
67
  getWaitingCount(): Promise<number>;
68
68
  getActiveCount(): Promise<number>;
@@ -27,6 +27,13 @@ export declare class TcpConnectionPool {
27
27
  send(command: Record<string, unknown>): Promise<Record<string, unknown>>;
28
28
  /** Send multiple commands in parallel across pool */
29
29
  sendParallel(commands: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>>;
30
+ /**
31
+ * Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
32
+ * 'connected' on every successful connect, including reconnects. Used by
33
+ * Worker to re-register after a reconnect (the server drops registration when
34
+ * the registering connection closes and each reconnect gets a fresh clientId).
35
+ */
36
+ onReconnect(cb: () => void): void;
30
37
  /** Check if any connection is ready */
31
38
  isConnected(): boolean;
32
39
  /** Get number of connected clients */
@@ -99,6 +99,17 @@ export class TcpConnectionPool {
99
99
  });
100
100
  return Promise.all(promises);
101
101
  }
102
+ /**
103
+ * Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
104
+ * 'connected' on every successful connect, including reconnects. Used by
105
+ * Worker to re-register after a reconnect (the server drops registration when
106
+ * the registering connection closes and each reconnect gets a fresh clientId).
107
+ */
108
+ onReconnect(cb) {
109
+ for (const client of this.clients) {
110
+ client.on('connected', cb);
111
+ }
112
+ }
102
113
  /** Check if any connection is ready */
103
114
  isConnected() {
104
115
  return this.clients.some((c) => c.isConnected());
@@ -6,7 +6,7 @@ import { createPublicJob } from '../types';
6
6
  import { getSharedManager } from '../manager';
7
7
  import { UnrecoverableError } from '../errors';
8
8
  import { DelayedError } from '../errors';
9
- import { createProgressHandler, createLogHandler, createGetStateHandler, createGetChildrenValuesHandler, createGetFailedChildrenValuesHandler, createGetIgnoredChildrenFailuresHandler, createRemoveChildDependencyHandler, createRemoveUnprocessedChildrenHandler, createMoveToFailedHandler, createMoveToCompletedHandler, createRemoveHandler, createRetryHandler, createUpdateDataHandler, createPromoteHandler, createChangeDelayHandler, createChangePriorityHandler, createExtendLockHandler, createClearLogsHandler, createMoveToWaitHandler, createMoveToDelayedHandler, createMoveToWaitingChildrenHandler, createWaitUntilFinishedHandler, createDiscardHandler, createGetDependenciesHandler, createGetDependenciesCountHandler, createRemoveDeduplicationKeyHandler, } from './processorHandlers';
9
+ import { createProgressHandler, createLogHandler, createGetStateHandler, createGetChildrenValuesHandler, createGetFailedChildrenValuesHandler, createGetIgnoredChildrenFailuresHandler, createRemoveChildDependencyHandler, createRemoveUnprocessedChildrenHandler, createMoveToFailedHandler, createMoveToCompletedHandler, createRemoveHandler, createRetryHandler, createUpdateDataHandler, createPromoteHandler, createChangeDelayHandler, createChangePriorityHandler, createExtendLockHandler, createClearLogsHandler, createMoveToWaitHandler, createMoveToDelayedHandler, createMoveToWaitingChildrenHandler, createWaitUntilFinishedHandler, createDiscardHandler, createGetDependenciesHandler, createGetDependenciesCountHandler, createRemoveDeduplicationKeyHandler, computeStackLines, } from './processorHandlers';
10
10
  /**
11
11
  * Process a single job
12
12
  */
@@ -63,7 +63,7 @@ export async function processJob(internalJob, config) {
63
63
  try {
64
64
  const result = await processor(job);
65
65
  // Issue #82: If moveToFailed/moveToCompleted was called, skip auto-ACK
66
- if (handleManualMove(manualMove, job, config))
66
+ if (handleManualMove(manualMove, job, config, internalJob))
67
67
  return;
68
68
  // Normal path: auto-ACK
69
69
  try {
@@ -94,16 +94,23 @@ export async function processJob(internalJob, config) {
94
94
  }
95
95
  catch (error) {
96
96
  // Issue #82: If moveToFailed was already called, skip normal failure handling
97
- if (handleManualMove(manualMove, job, config))
97
+ if (handleManualMove(manualMove, job, config, internalJob))
98
98
  return;
99
99
  await handleJobFailure(internalJob, error, config, { job, jobIdStr, token });
100
100
  }
101
101
  }
102
102
  /** Issue #82: Handle explicit moveToFailed/moveToCompleted called inside processor */
103
- function handleManualMove(manualMove, job, config) {
103
+ function handleManualMove(manualMove, job, config, internalJob) {
104
104
  if (manualMove.result?.type === 'failed') {
105
105
  const err = manualMove.result.error ?? new Error('Job manually moved to failed');
106
106
  job.failedReason = err.message;
107
+ // Bug #74 follow-up: populate stacktrace on the local `failed` event for the
108
+ // explicit moveToFailed() path too, mirroring handleJobFailure. The stack is
109
+ // persisted server-side by createMoveToFailedHandler's FAIL/manager.fail call.
110
+ if (err.stack) {
111
+ const { stackLines } = computeStackLines(err);
112
+ job.stacktrace = stackLines.slice(0, internalJob.stackTraceLimit);
113
+ }
107
114
  config.onOutcome?.(false);
108
115
  config.emitter.emit('failed', job, err);
109
116
  return true;
@@ -160,13 +167,7 @@ async function handleJobFailure(internalJob, error, config, context) {
160
167
  // Bug #74: stack lines computed BEFORE the send so the server can persist
161
168
  // them. The wire copy is capped at 50 lines as a bandwidth guard; the
162
169
  // authoritative cap (job.stackTraceLimit) is applied server-side in failJob.
163
- const stackLines = err.stack
164
- ? err.stack
165
- .split('\n')
166
- .map((l) => l.trim())
167
- .filter(Boolean)
168
- : [];
169
- const wireStack = stackLines.length > 0 ? stackLines.slice(0, 50) : undefined;
170
+ const { stackLines, wireStack } = computeStackLines(err);
170
171
  try {
171
172
  if (embedded) {
172
173
  const manager = getSharedManager();
@@ -19,6 +19,17 @@ export declare function createGetFailedChildrenValuesHandler(embedded: boolean,
19
19
  export declare function createGetIgnoredChildrenFailuresHandler(embedded: boolean, tcp: TcpConnection | null): (id: string) => Promise<Record<string, string>>;
20
20
  export declare function createRemoveChildDependencyHandler(embedded: boolean, tcp: TcpConnection | null): (id: string) => Promise<boolean>;
21
21
  export declare function createRemoveUnprocessedChildrenHandler(embedded: boolean, tcp: TcpConnection | null): (id: string) => Promise<void>;
22
+ /**
23
+ * Bug #74: split an Error's stack into trimmed, non-empty lines.
24
+ * `wireStack` is the bandwidth-capped copy sent to the server (50 lines); the
25
+ * authoritative cap (job.stackTraceLimit) is applied server-side in failJob.
26
+ * Shared by the natural-throw path (handleJobFailure) and the explicit
27
+ * moveToFailed() path so both persist the stack identically.
28
+ */
29
+ export declare function computeStackLines(err: Error): {
30
+ stackLines: string[];
31
+ wireStack: string[] | undefined;
32
+ };
22
33
  /** Issue #82: Create moveToFailed handler for use inside processor */
23
34
  export declare function createMoveToFailedHandler(embedded: boolean, tcp: TcpConnection | null, internalJob: InternalJob, token: string | null | undefined, onCalled: (error: Error) => void): (id: string, error: Error, _lockToken?: string) => Promise<void>;
24
35
  /** Issue #82: Create moveToCompleted handler for use inside processor */
@@ -102,18 +102,40 @@ export function createRemoveUnprocessedChildrenHandler(embedded, tcp) {
102
102
  await tcp.send({ cmd: 'RemoveUnprocessedChildren', id });
103
103
  };
104
104
  }
105
+ /**
106
+ * Bug #74: split an Error's stack into trimmed, non-empty lines.
107
+ * `wireStack` is the bandwidth-capped copy sent to the server (50 lines); the
108
+ * authoritative cap (job.stackTraceLimit) is applied server-side in failJob.
109
+ * Shared by the natural-throw path (handleJobFailure) and the explicit
110
+ * moveToFailed() path so both persist the stack identically.
111
+ */
112
+ export function computeStackLines(err) {
113
+ const stackLines = err.stack
114
+ ? err.stack
115
+ .split('\n')
116
+ .map((l) => l.trim())
117
+ .filter(Boolean)
118
+ : [];
119
+ const wireStack = stackLines.length > 0 ? stackLines.slice(0, 50) : undefined;
120
+ return { stackLines, wireStack };
121
+ }
105
122
  /** Issue #82: Create moveToFailed handler for use inside processor */
106
123
  export function createMoveToFailedHandler(embedded, tcp, internalJob, token, onCalled) {
107
124
  return async (_id, error, _lockToken) => {
125
+ // Bug #74 follow-up: carry the stack on explicit moveToFailed() too. The
126
+ // natural-throw path already does; @arthurvanl's repro showed the manual
127
+ // path lost it. Compute before the send so the server can persist it.
128
+ const { wireStack } = computeStackLines(error);
108
129
  if (embedded) {
109
130
  const manager = getSharedManager();
110
- await manager.fail(internalJob.id, error.message, token ?? undefined);
131
+ await manager.fail(internalJob.id, error.message, token ?? undefined, undefined, wireStack);
111
132
  }
112
133
  else if (tcp) {
113
134
  await tcp.send({
114
135
  cmd: 'FAIL',
115
136
  id: internalJob.id,
116
137
  error: error.message,
138
+ ...(wireStack ? { stack: wireStack } : {}),
117
139
  ...(token ? { token } : {}),
118
140
  });
119
141
  }
@@ -28,6 +28,7 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
28
28
  private running;
29
29
  private paused;
30
30
  private _closing;
31
+ private _forceClose;
31
32
  private _closingPromise;
32
33
  private closed;
33
34
  private activeJobs;
@@ -123,6 +124,16 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
123
124
  extendJobLocks(jobIds: string[], tokens: string[], duration: number): Promise<number>;
124
125
  close(force?: boolean): Promise<void>;
125
126
  private _doClose;
127
+ /**
128
+ * Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
129
+ * These jobs are `active` server-side holding a lock; moving them back to
130
+ * `waiting` makes them re-pullable so nothing is lost, and removes them from
131
+ * the close drain (which would otherwise hang on a buffer that can never be
132
+ * advanced while `_closing` is set). Best-effort: a job that can't be
133
+ * released (e.g. already completed/stall-retried) is simply dropped from the
134
+ * local buffer — its server-side lock will expire and requeue it.
135
+ */
136
+ private releaseBufferedJobs;
126
137
  private poll;
127
138
  private tryProcess;
128
139
  private registerPulledJobs;
@@ -13,7 +13,7 @@ import { parseJobFromResponse } from './jobParser';
13
13
  import { processJob } from './processor';
14
14
  import { WorkerRateLimiter } from './workerRateLimiter';
15
15
  import { GroupConcurrencyLimiter } from './groupConcurrency';
16
- import { startHeartbeat } from './workerHeartbeat';
16
+ import { startHeartbeat, sendHeartbeat } from './workerHeartbeat';
17
17
  import { pullEmbedded, pullTcp } from './workerPull';
18
18
  import { resolveToken } from '../resolveToken';
19
19
  /** Resolve WorkerOptions into ExtendedWorkerOptions with defaults */
@@ -75,6 +75,9 @@ export class Worker extends EventEmitter {
75
75
  running = false;
76
76
  paused = false;
77
77
  _closing = false;
78
+ // Set when a force close is requested — allows close(true) to pre-empt an
79
+ // in-progress graceful close(false) by breaking out of its drain loop.
80
+ _forceClose = false;
78
81
  _closingPromise = null;
79
82
  closed = false;
80
83
  activeJobs = 0;
@@ -140,6 +143,17 @@ export class Worker extends EventEmitter {
140
143
  this.tcpPool = createTcpPool(opts, this.opts.concurrency);
141
144
  this.tcp = this.tcpPool;
142
145
  this.ackBatcher.setTcp(this.tcp);
146
+ // The server drops worker registration when the registering connection
147
+ // closes, and each pooled reconnect gets a fresh server clientId. Re-send
148
+ // RegisterWorker on reconnect so the worker stays visible in
149
+ // WorkerManager (ListWorkers / getForQueue / skipIfNoWorker) while it
150
+ // keeps consuming jobs. Only acts once we were actually registered.
151
+ this.tcpPool.onReconnect(() => {
152
+ if (this.closed || this._closing || !this.registered)
153
+ return;
154
+ this.registered = false;
155
+ this.registerWithServer();
156
+ });
143
157
  }
144
158
  if (this.opts.autorun)
145
159
  this.run();
@@ -151,6 +165,7 @@ export class Worker extends EventEmitter {
151
165
  this.running = true;
152
166
  this.paused = false;
153
167
  this._closing = false;
168
+ this._forceClose = false;
154
169
  this._closingPromise = null;
155
170
  // Defer the 'ready' emit so listeners attached synchronously after
156
171
  // construction (e.g. `new Worker(...).on('ready', ...)`) still receive it.
@@ -411,8 +426,16 @@ export class Worker extends EventEmitter {
411
426
  async close(force = false) {
412
427
  if (this.closed)
413
428
  return;
414
- if (this._closingPromise)
429
+ // A force close must be able to pre-empt an in-progress graceful close:
430
+ // the graceful drain only waits on genuinely in-flight jobs, but a caller
431
+ // asking to force-close wants to stop waiting now. Flip the force flag so
432
+ // the running _doClose's drain loop exits, and return the same promise.
433
+ if (this._closingPromise) {
434
+ if (force)
435
+ this._forceClose = true;
415
436
  return this._closingPromise;
437
+ }
438
+ this._forceClose = force;
416
439
  this._closingPromise = this._doClose(force);
417
440
  return this._closingPromise;
418
441
  }
@@ -432,9 +455,19 @@ export class Worker extends EventEmitter {
432
455
  clearInterval(this.workerHeartbeatTimer);
433
456
  this.workerHeartbeatTimer = null;
434
457
  }
458
+ // Release buffered pulled-but-unstarted jobs back to the queue. During
459
+ // _closing every code path that would advance the buffer (poll/tryProcess/
460
+ // the startJob().finally re-poll) early-returns, so a buffered job is never
461
+ // started — yet it holds a server-side lock and sits in `active` state.
462
+ // Without this, a graceful drain that waited on the buffer would hang
463
+ // forever (e.g. group-limited jobs buffered behind a max:1 limiter). We
464
+ // requeue them (Active -> Waiting) so they are re-pullable and not lost.
465
+ await this.releaseBufferedJobs();
435
466
  if (!force) {
436
- const bufferSize = () => this.pendingJobs.length - this.pendingJobsHead;
437
- while (this.activeJobs > 0 || bufferSize() > 0) {
467
+ // Wait only on genuinely in-flight jobs. The buffer was released above,
468
+ // so it must NOT be part of the drain condition. A concurrent force
469
+ // close (this._forceClose) breaks out immediately.
470
+ while (this.activeJobs > 0 && !this._forceClose) {
438
471
  await Bun.sleep(50);
439
472
  }
440
473
  }
@@ -470,6 +503,45 @@ export class Worker extends EventEmitter {
470
503
  this._closing = false;
471
504
  this.emit('closed');
472
505
  }
506
+ /**
507
+ * Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
508
+ * These jobs are `active` server-side holding a lock; moving them back to
509
+ * `waiting` makes them re-pullable so nothing is lost, and removes them from
510
+ * the close drain (which would otherwise hang on a buffer that can never be
511
+ * advanced while `_closing` is set). Best-effort: a job that can't be
512
+ * released (e.g. already completed/stall-retried) is simply dropped from the
513
+ * local buffer — its server-side lock will expire and requeue it.
514
+ */
515
+ async releaseBufferedJobs() {
516
+ const buffered = this.pendingJobs.slice(this.pendingJobsHead);
517
+ this.pendingJobs = [];
518
+ this.pendingJobsHead = 0;
519
+ if (buffered.length === 0)
520
+ return;
521
+ for (const { job, token } of buffered) {
522
+ const id = String(job.id);
523
+ try {
524
+ if (this.embedded) {
525
+ const manager = getSharedManager();
526
+ await manager.moveActiveToWait(jobId(id));
527
+ // moveActiveToWait re-queues the job (active -> waiting); release the
528
+ // lock token we still hold so it is fully owner-free and re-pullable.
529
+ if (this.opts.useLocks)
530
+ manager.releaseLock(jobId(id), token ?? undefined);
531
+ }
532
+ else if (this.tcp) {
533
+ await this.tcp.send({ cmd: 'MoveToWait', id });
534
+ }
535
+ }
536
+ catch {
537
+ // Best-effort: lock expiration will requeue anything we couldn't release.
538
+ }
539
+ finally {
540
+ this.pulledJobIds.delete(id);
541
+ this.jobTokens.delete(id);
542
+ }
543
+ }
544
+ }
473
545
  // ============ Processing Pipeline ============
474
546
  poll() {
475
547
  if (!this.running || this._closing)
@@ -568,6 +640,18 @@ export class Worker extends EventEmitter {
568
640
  this.jobTokens.set(jobIdStr, pulledItem.token);
569
641
  }
570
642
  }
643
+ // Renew the just-pulled locks immediately (fire-and-forget). With a pooled
644
+ // client (poolSize > 1), the PULL and subsequent heartbeats travel on
645
+ // different connections; the server only releases an in-flight job on
646
+ // connection drop if its lock was never renewed (renewalCount === 0). The
647
+ // periodic heartbeat timer does not fire until one interval later, leaving a
648
+ // window where a dropped pulling-socket would re-dispatch a job the worker
649
+ // is actively running. An immediate renew closes that window. Only needed
650
+ // for multi-connection lock-based workers (poolSize 1 keeps pull+heartbeat
651
+ // on the same socket, so there is nothing to protect against).
652
+ if (this.opts.useLocks && items.length > 0 && this.tcpPool && this.tcpPool.getPoolSize() > 1) {
653
+ void sendHeartbeat(this.getHeartbeatDeps());
654
+ }
571
655
  }
572
656
  getBufferedJob() {
573
657
  if (this.pendingJobsHead >= this.pendingJobs.length)
@@ -82,6 +82,8 @@ export declare class SqliteStorage {
82
82
  insertJob(job: Job, durable?: boolean): void;
83
83
  /** Insert job immediately (bypass buffer) */
84
84
  insertJobImmediate(job: Job): void;
85
+ /** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
86
+ private runInsertJobStmt;
85
87
  /**
86
88
  * Ensure a job's buffered INSERT has been written to disk before issuing a
87
89
  * state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
@@ -122,8 +124,13 @@ export declare class SqliteStorage {
122
124
  getJobStateRaw(jobId: JobId): string | null;
123
125
  /** Load all completed job IDs (for dependency recovery) */
124
126
  loadCompletedJobIds(): Set<JobId>;
125
- /** Insert batch of jobs (adds to buffer) */
126
- insertJobsBatch(jobs: Job[]): void;
127
+ /**
128
+ * Insert batch of jobs. By default the jobs go through the write buffer.
129
+ * When `durable` is true they bypass the buffer and are written to disk
130
+ * immediately (matching single-push durable semantics) — used for addBulk
131
+ * jobs flagged `durable: true`.
132
+ */
133
+ insertJobsBatch(jobs: Job[], durable?: boolean): void;
127
134
  /**
128
135
  * Query jobs by queue with optional state filter and pagination.
129
136
  * Uses idx_jobs_queue_state index for O(log n) lookups.
@@ -244,11 +244,15 @@ export class SqliteStorage {
244
244
  /** Insert job immediately (bypass buffer) */
245
245
  insertJobImmediate(job) {
246
246
  this.safeWrite(() => {
247
- this.statements
248
- .get('insertJob')
249
- .run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
247
+ this.runInsertJobStmt(job);
250
248
  });
251
249
  }
250
+ /** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
251
+ runInsertJobStmt(job) {
252
+ this.statements
253
+ .get('insertJob')
254
+ .run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
255
+ }
252
256
  /**
253
257
  * Ensure a job's buffered INSERT has been written to disk before issuing a
254
258
  * state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
@@ -420,11 +424,38 @@ export class SqliteStorage {
420
424
  /** Load all completed job IDs (for dependency recovery) */
421
425
  loadCompletedJobIds() {
422
426
  const rows = this.db.query('SELECT job_id FROM job_results').all();
423
- return new Set(rows.map((r) => r.job_id));
427
+ const ids = new Set(rows.map((r) => r.job_id));
428
+ // A job acked with no/undefined result has state='completed' but NO job_results
429
+ // row. Include state='completed' ids so dependency recovery still sees it as
430
+ // done and unblocks dependents (instead of parking them forever).
431
+ const stateRows = this.db
432
+ .query("SELECT id FROM jobs WHERE state = 'completed'")
433
+ .all();
434
+ for (const r of stateRows)
435
+ ids.add(r.id);
436
+ return ids;
424
437
  }
425
438
  // ============ Bulk Operations ============
426
- /** Insert batch of jobs (adds to buffer) */
427
- insertJobsBatch(jobs) {
439
+ /**
440
+ * Insert batch of jobs. By default the jobs go through the write buffer.
441
+ * When `durable` is true they bypass the buffer and are written to disk
442
+ * immediately (matching single-push durable semantics) — used for addBulk
443
+ * jobs flagged `durable: true`.
444
+ */
445
+ insertJobsBatch(jobs, durable) {
446
+ if (durable) {
447
+ // Atomic immediate write: every row hits disk in ONE transaction, so a
448
+ // mid-batch failure rolls back the whole batch (no partial on-disk state)
449
+ // — bypassing the write buffer to honor the durable contract.
450
+ this.safeWrite(() => {
451
+ const tx = this.db.transaction((batch) => {
452
+ for (const job of batch)
453
+ this.runInsertJobStmt(job);
454
+ });
455
+ tx(jobs);
456
+ });
457
+ return;
458
+ }
428
459
  this.writeBuffer.addBatch(jobs);
429
460
  }
430
461
  // ============ Query Operations ============
@@ -29,7 +29,12 @@ export async function handlePush(cmd, ctx, reqId) {
29
29
  for (const depId of cmd.dependsOn) {
30
30
  const depJobId = jobId(depId);
31
31
  const exists = ctx.queueManager.getJobIndex().has(depJobId) ||
32
- ctx.queueManager.getCompletedJobs().has(depJobId);
32
+ ctx.queueManager.getCompletedJobs().has(depJobId) ||
33
+ // A removeOnComplete parent that completed is recorded only here (its row
34
+ // is deleted and it leaves jobIndex/completedJobs). The readiness path and
35
+ // dependency processor already honor depCompletions; the gate must too, or
36
+ // a late dependent is wrongly rejected with "Dependency job not found".
37
+ ctx.queueManager.getDepCompletions().has(depJobId);
33
38
  if (!exists) {
34
39
  return resp.error(`Dependency job not found: ${depId}`, reqId);
35
40
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bunqueue",
3
- "version": "2.8.12",
3
+ "version": "2.8.17",
4
4
  "description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
5
5
  "type": "module",
6
6
  "main": "dist/main.js",