npm - bunqueue - Versions diffs - 2.8.13 → 2.8.18 - Mend

bunqueue 2.8.13 → 2.8.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/dist/application/backgroundTasks.d.ts +1 -0
package/dist/application/backgroundTasks.js +6 -1
package/dist/application/cleanupTasks.js +5 -0
package/dist/application/clientTracking.js +11 -0
package/dist/application/contextFactory.d.ts +4 -2
package/dist/application/contextFactory.js +5 -0
package/dist/application/dependencyProcessor.js +9 -2
package/dist/application/lockManager.js +16 -1
package/dist/application/operations/ack.d.ts +3 -1
package/dist/application/operations/ack.js +5 -0
package/dist/application/operations/ackHelpers.d.ts +3 -1
package/dist/application/operations/ackHelpers.js +5 -0
package/dist/application/operations/jobManagement.js +10 -0
package/dist/application/operations/push.d.ts +4 -0
package/dist/application/operations/push.js +26 -2
package/dist/application/operations/queryOperations.js +8 -2
package/dist/application/queueManager.d.ts +3 -0
package/dist/application/queueManager.js +59 -3
package/dist/application/types.d.ts +5 -3
package/dist/client/bunqueue.d.ts +1 -1
package/dist/client/queue/operations/counts.d.ts +7 -2
package/dist/client/queue/operations/counts.js +7 -10
package/dist/client/queue/queue.d.ts +1 -1
package/dist/client/tcpPool.d.ts +7 -0
package/dist/client/tcpPool.js +11 -0
package/dist/client/worker/worker.d.ts +12 -0
package/dist/client/worker/worker.js +127 -8
package/dist/infrastructure/persistence/sqlite.d.ts +9 -2
package/dist/infrastructure/persistence/sqlite.js +37 -6
package/dist/infrastructure/server/handlers/core.js +6 -1
package/package.json +1 -1

package/dist/application/backgroundTasks.d.ts CHANGED Viewed

@@ -26,5 +26,6 @@ export declare function startBackgroundTasks(ctx: BackgroundContext, cronSchedul
  * Stop all background tasks
  */
 export declare function stopBackgroundTasks(handles: BackgroundTaskHandles): void;
+export declare function checkJobTimeouts(ctx: BackgroundContext): void;
 export declare function recover(ctx: BackgroundContext): void;
 export { processPendingDependencies };

package/dist/application/backgroundTasks.js CHANGED Viewed

@@ -108,7 +108,7 @@ function getLockContext(ctx) {
     };
 }
 // ============ Job Timeouts ============
-function checkJobTimeouts(ctx) {
+export function checkJobTimeouts(ctx) {
     const now = Date.now();
     for (const procShard of ctx.processingShards) {
         for (const [jobId, job] of procShard) {
@@ -118,6 +118,11 @@ function checkJobTimeouts(ctx) {
                     queue: job.queue,
                     timeout: job.timeout,
                 });
+                // Mark as timed-out BEFORE requeuing for retry, so a late ACK from the
+                // (still-hung) worker that exceeded the deadline is discarded instead of
+                // phantom-completing the job and skipping the retry. See ack-recovery in
+                // queueManager.ack (timedOutJobs guard).
+                ctx.timedOutJobs?.add(jobId);
                 ctx.fail(jobId, 'Job timeout exceeded').catch((err) => {
                     queueLog.error('Failed to mark timed out job as failed', {
                         jobId: String(jobId),

package/dist/application/cleanupTasks.js CHANGED Viewed

@@ -221,6 +221,11 @@ function cleanEmptyQueues(ctx) {
             shard.clearQueueLimiters(queueName);
             shard.stallConfig.delete(queueName);
             shard.dlqConfig.delete(queueName);
+            // NOTE: perQueueMetrics is intentionally NOT pruned here — it is an
+            // LRU-bounded map and these counters are cumulative, so they must survive
+            // a transient drain (a busy queue momentarily empty must not reset to 0).
+            // obliterate() reclaims it explicitly; the LRU cap bounds growth for
+            // ephemeral/dynamically-named queues.
             ctx.dashboardEmit?.('queue:removed', { queue: queueName });
             ctx.unregisterQueueName(queueName);
         }

package/dist/application/clientTracking.js CHANGED Viewed

@@ -47,6 +47,17 @@ export async function releaseClientJobs(clientId, ctx) {
         const loc = ctx.jobIndex.get(jobId);
         if (loc?.type !== 'processing')
             continue;
+        // A job whose lock has been renewed since pull (renewalCount > 0) is being
+        // actively heartbeated by a live worker. With a pooled client, heartbeats
+        // travel on a DIFFERENT connection than the one that pulled, so THIS socket
+        // closing does not mean the worker died — re-queuing here would re-dispatch
+        // (double-execute) a job the worker still holds. Leave it; lock expiry /
+        // stall detection reclaims it if the worker truly stops heartbeating.
+        // A never-renewed lock (renewalCount === 0) keeps the original fast-recovery
+        // behavior: requeue immediately on disconnect.
+        const lock = ctx.jobLocks.get(jobId);
+        if (lock && lock.renewalCount > 0)
+            continue;
         const procIdx = loc.shardIdx;
         const job = ctx.processingShards[procIdx].get(jobId);
         if (!job)

package/dist/application/contextFactory.d.ts CHANGED Viewed

@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
 import type { Shard } from '../domain/queue/shard';
 import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
 import type { RWLock } from '../shared/lock';
-import type { LRUMap, BoundedSet, BoundedMap } from '../shared/lru';
+import type { LRUMap, BoundedSet, BoundedMap, MapLike } from '../shared/lru';
 import type { WebhookManager } from './webhookManager';
 import type { WorkerManager } from './workerManager';
 import type { EventsManager } from './eventsManager';
@@ -33,6 +33,8 @@ export interface ContextDependencies {
     jobIndex: Map<JobId, JobLocation>;
     completedJobs: BoundedSet<JobId>;
     completedJobsData: BoundedMap<JobId, Job>;
+    depCompletions?: BoundedSet<JobId>;
+    timedOutJobs?: BoundedSet<JobId>;
     jobResults: LRUMap<JobId, unknown>;
     customIdMap: LRUMap<string, JobId>;
     jobLogs: LRUMap<JobId, JobLogEntry[]>;
@@ -62,7 +64,7 @@ export interface ContextDependencies {
     };
     startTime: number;
     maxLogsPerJob: number;
-    perQueueMetrics: Map<string, {
+    perQueueMetrics: MapLike<string, {
         totalCompleted: bigint;
         totalFailed: bigint;
     }>;

package/dist/application/contextFactory.js CHANGED Viewed

@@ -34,6 +34,8 @@ export class ContextFactory {
             processingLocks: this.deps.processingLocks,
             jobIndex: this.deps.jobIndex,
             completedJobs: this.deps.completedJobs,
+            depCompletions: this.deps.depCompletions,
+            timedOutJobs: this.deps.timedOutJobs,
             jobResults: this.deps.jobResults,
             customIdMap: this.deps.customIdMap,
             jobLogs: this.deps.jobLogs,
@@ -81,6 +83,8 @@ export class ContextFactory {
             shardLocks: this.deps.shardLocks,
             completedJobs: this.deps.completedJobs,
             completedJobsData: this.deps.completedJobsData,
+            depCompletions: this.deps.depCompletions,
+            timedOutJobs: this.deps.timedOutJobs,
             jobResults: this.deps.jobResults,
             customIdMap: this.deps.customIdMap,
             jobIndex: this.deps.jobIndex,
@@ -111,6 +115,7 @@ export class ContextFactory {
             processingLocks: this.deps.processingLocks,
             completedJobs: this.deps.completedJobs,
             completedJobsData: this.deps.completedJobsData,
+            depCompletions: this.deps.depCompletions,
             jobResults: this.deps.jobResults,
             jobIndex: this.deps.jobIndex,
             customIdMap: this.deps.customIdMap,

package/dist/application/dependencyProcessor.js CHANGED Viewed

@@ -37,10 +37,12 @@ export async function processPendingDependencies(ctx) {
         await withWriteLock(ctx.shardLocks[i], () => {
             const shard = ctx.shards[i];
             const jobsToPromote = [];
-            // Check which jobs have all dependencies satisfied (inside lock)
+            // Check which jobs have all dependencies satisfied (inside lock).
+            // A removeOnComplete parent is not in completedJobs (its full record was
+            // dropped to bound memory), so also honor its bare-id depCompletions entry.
             for (const jobId of jobIdsToCheck) {
                 const job = shard.waitingDeps.get(jobId);
-                if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep))) {
+                if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep) || (ctx.depCompletions?.has(dep) ?? false))) {
                     jobsToPromote.push(job);
                 }
             }
@@ -50,6 +52,11 @@ export async function processPendingDependencies(ctx) {
             }
         });
     }));
+    // NOTE: depCompletions is intentionally NOT pruned here. It is a FIFO
+    // BoundedSet (same cap as completedJobs), so it self-bounds. Pruning eagerly
+    // once "no waiters remain" would orphan a dependent pushed AFTER a
+    // removeOnComplete parent completed — exactly the symmetry completedJobs
+    // provides for normal parents (readiness holds for the whole bounded window).
 }
 /** Move jobs from waitingDeps to the active queue */
 function promoteJobsToQueue(jobsToPromote, shard, ctx, shardIdx) {

package/dist/application/lockManager.js CHANGED Viewed

@@ -111,8 +111,19 @@ function processExpiredLockInner(jobId, lock, job, shardIdx, procIdx, ctx, now)
 /** Move job to DLQ when max stalls exceeded */
 function handleMaxStallsExceeded(opts) {
     const { jobId, job, lock, shard, ctx, now } = opts;
-    shard.addToDlq(job, "stalled" /* FailureReason.Stalled */, `Lock expired after ${lock.renewalCount} renewals`);
+    // Release the concurrency slot (+group+uniqueKey) acquired at pull before
+    // moving to DLQ — otherwise the slot leaks (mirrors
+    // stallDetection.moveStalliedJobToDlq).
+    shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
+    const entry = shard.addToDlq(job, "stalled" /* FailureReason.Stalled */, `Lock expired after ${lock.renewalCount} renewals`);
     ctx.jobIndex.set(jobId, { type: 'dlq', queueName: job.queue });
+    // Persist the DLQ move like the sibling paths (ack.moveFailedJobToDlq,
+    // stallDetection.moveStalliedJobToDlq, backgroundTasks startup-recovery).
+    // Without these two writes the jobs row survives in SQLite as an orphan and
+    // the DLQ entry lives only in memory — a later retry then re-INSERTs the
+    // surviving row and throws `UNIQUE constraint failed: jobs.id` (#97).
+    ctx.storage?.saveDlqEntry(entry);
+    ctx.storage?.deleteJob(jobId);
     ctx.eventsManager.broadcast({
         eventType: "failed" /* EventType.Failed */,
         jobId,
@@ -125,6 +136,10 @@ function handleMaxStallsExceeded(opts) {
 function requeueExpiredJob(opts) {
     const { jobId, job, queue, idx, ctx, now } = opts;
     const shard = ctx.shards[idx];
+    // Release the concurrency slot (+group+uniqueKey) acquired at pull before
+    // re-pushing — otherwise the slot leaks and the queue wedges (mirrors
+    // stallDetection.retryStalliedJob).
+    shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
     queue.push(job);
     const isDelayed = job.runAt > now;
     shard.incrementQueued(jobId, isDelayed, job.createdAt, job.queue, job.runAt);

package/dist/application/operations/ack.d.ts CHANGED Viewed

@@ -17,6 +17,8 @@ export interface AckContext {
     processingLocks: RWLock[];
     completedJobs: SetLike<JobId>;
     completedJobsData: MapLike<JobId, Job>;
+    /** Bare completion ids for removeOnComplete jobs so dependents can unblock */
+    depCompletions?: SetLike<JobId>;
     jobResults: MapLike<JobId, unknown>;
     jobIndex: Map<JobId, JobLocation>;
     customIdMap?: MapLike<string, JobId>;
@@ -26,7 +28,7 @@ export interface AckContext {
     totalFailed: {
         value: bigint;
     };
-    perQueueMetrics?: Map<string, {
+    perQueueMetrics?: MapLike<string, {
         totalCompleted: bigint;
         totalFailed: bigint;
     }>;

package/dist/application/operations/ack.js CHANGED Viewed

@@ -50,6 +50,11 @@ export async function ackJob(jobId, result, ctx) {
     else {
         ctx.jobIndex.delete(jobId);
         ctx.storage?.deleteJob(jobId);
+        // removeOnComplete drops the full job (index + data + persisted row) to bound
+        // memory, but dependent jobs gate readiness on completedJobs.has(parentId).
+        // Record the bare completion id (no payload) so dependents still unblock,
+        // without making the job appear in state/stats queries.
+        ctx.depCompletions?.add(jobId);
     }
     ctx.totalCompleted.value++;
     if (ctx.perQueueMetrics) {

package/dist/application/operations/ackHelpers.d.ts CHANGED Viewed

@@ -55,13 +55,15 @@ export interface FinalizeContext {
     storage: SqliteStorage | null;
     completedJobs: SetLike<JobId>;
     completedJobsData: MapLike<JobId, Job>;
+    /** Bare completion ids for removeOnComplete jobs so dependents can unblock */
+    depCompletions?: SetLike<JobId>;
     jobResults: MapLike<JobId, unknown>;
     jobIndex: Map<JobId, JobLocation>;
     customIdMap?: MapLike<string, JobId>;
     totalCompleted: {
         value: bigint;
     };
-    perQueueMetrics?: Map<string, {
+    perQueueMetrics?: MapLike<string, {
         totalCompleted: bigint;
         totalFailed: bigint;
     }>;

package/dist/application/operations/ackHelpers.js CHANGED Viewed

@@ -164,6 +164,11 @@ export function finalizeBatchAck(extractedJobs, ctx, includeResults) {
             ctx.jobIndex.delete(jobId);
             if (hasStorage)
                 storage.deleteJob(jobId);
+            // removeOnComplete drops the full job to bound memory, but dependents gate
+            // readiness on completedJobs.has(parentId). Record the bare completion id
+            // (no payload) so dependent jobs still unblock, without surfacing the job
+            // in state/stats queries.
+            ctx.depCompletions?.add(jobId);
         }
     }
     // Broadcast events

package/dist/application/operations/jobManagement.js CHANGED Viewed

@@ -23,6 +23,16 @@ export async function cancelJob(jobId, ctx) {
                 ctx.storage?.deleteJob(jobId);
                 return { success: true, queueName: location.queueName };
             }
+            // Not in the run queue — it may be parked in waitingChildren (moved via
+            // moveToWaitingChildren, which already released its resources and does not
+            // track it in the queued counter, so do NOT decrement/release here).
+            const parked = shard.waitingChildren.get(jobId);
+            if (parked) {
+                shard.waitingChildren.delete(jobId);
+                ctx.jobIndex.delete(jobId);
+                ctx.storage?.deleteJob(jobId);
+                return { success: true, queueName: location.queueName };
+            }
             return { success: false, queueName: location.queueName };
         });
         if (result.success) {

package/dist/application/operations/push.d.ts CHANGED Viewed

@@ -15,6 +15,10 @@ export interface PushContext {
     shardLocks: RWLock[];
     completedJobs: SetLike<JobId>;
     completedJobsData: MapLike<JobId, Job>;
+    /** Bare completion ids for removeOnComplete jobs so dependents start ready */
+    depCompletions?: SetLike<JobId>;
+    /** Timeout markers — cleared on custom-id reuse so a recycled id starts clean */
+    timedOutJobs?: SetLike<JobId>;
     jobResults: MapLike<JobId, unknown>;
     customIdMap: MapLike<string, JobId>;
     jobIndex: Map<JobId, JobLocation>;

package/dist/application/operations/push.js CHANGED Viewed

@@ -40,6 +40,12 @@ function handleCustomId(input, shard, ctx) {
         ctx.jobIndex.delete(id);
         ctx.storage?.deleteJob(id); // removes the surviving row + result + any buffered insert
     }
+    // A recycled custom id may carry a stale timeout marker from a prior job (which
+    // may have DLQ'd, so it is NOT in completedJobs above). Clear it so the new
+    // job's stall-retry recovery is not wrongly discarded — otherwise the
+    // timeout-resurrection guard would reintroduce #33/#75 duplicate execution for
+    // reused ids.
+    ctx.timedOutJobs?.delete(id);
     ctx.customIdMap.set(input.customId, id);
     return { skip: false, id };
 }
@@ -115,7 +121,8 @@ function handleDeduplication(job, input, queue, shard, ctx) {
  */
 function insertJobToShard(job, queue, shard, shardIdx, ctx) {
     const hasDeps = job.dependsOn.length > 0;
-    const needsWaiting = hasDeps && !job.dependsOn.every((depId) => ctx.completedJobs.has(depId));
+    const needsWaiting = hasDeps &&
+        !job.dependsOn.every((depId) => ctx.completedJobs.has(depId) || (ctx.depCompletions?.has(depId) ?? false));
     const now = Date.now();
     if (needsWaiting) {
         shard.waitingDeps.set(job.id, job);
@@ -201,6 +208,10 @@ export async function pushJobBatch(queue, inputs, ctx) {
     const idx = shardIndex(queue);
     const resultIds = [];
     const jobsToInsert = [];
+    // Jobs flagged durable must bypass the 10ms write buffer (immediate fsync
+    // path), exactly like a single durable push — otherwise addBulk silently
+    // downgrades the documented "durable" guarantee.
+    const durableJobs = [];
     await withWriteLock(ctx.shardLocks[idx], () => {
         const shard = ctx.shards[idx];
         for (const input of inputs) {
@@ -220,6 +231,8 @@ export async function pushJobBatch(queue, inputs, ctx) {
             // Insert to shard
             insertJobToShard(job, queue, shard, idx, ctx);
             jobsToInsert.push(job);
+            if (input.durable)
+                durableJobs.push(job);
             resultIds.push(job.id);
         }
         if (jobsToInsert.length > 0) {
@@ -227,7 +240,18 @@ export async function pushJobBatch(queue, inputs, ctx) {
         }
     });
     if (jobsToInsert.length > 0) {
-        ctx.storage?.insertJobsBatch(jobsToInsert);
+        if (durableJobs.length === 0) {
+            ctx.storage?.insertJobsBatch(jobsToInsert);
+        }
+        else {
+            // Durable jobs bypass the write buffer (immediate disk write); the rest
+            // still go through the batched buffer for throughput.
+            const durableSet = new Set(durableJobs);
+            const buffered = jobsToInsert.filter((j) => !durableSet.has(j));
+            if (buffered.length > 0)
+                ctx.storage?.insertJobsBatch(buffered);
+            ctx.storage?.insertJobsBatch(durableJobs, true);
+        }
         ctx.totalPushed.value += BigInt(jobsToInsert.length);
         throughputTracker.pushRate.increment(jobsToInsert.length);
         for (const job of jobsToInsert) {

package/dist/application/operations/queryOperations.js CHANGED Viewed

@@ -24,7 +24,10 @@ export async function getJob(jobId, ctx) {
         case 'queue': {
             return await withReadLock(ctx.shardLocks[location.shardIdx], () => {
                 const shard = ctx.shards[location.shardIdx];
-                return (shard.getQueue(location.queueName).find(jobId) ?? shard.waitingDeps.get(jobId) ?? null);
+                return (shard.getQueue(location.queueName).find(jobId) ??
+                    shard.waitingDeps.get(jobId) ??
+                    shard.waitingChildren.get(jobId) ??
+                    null);
             });
         }
         case 'processing': {
@@ -63,7 +66,10 @@ export function getJobByCustomId(customId, ctx) {
         return null;
     if (location.type === 'queue') {
         const shard = ctx.shards[location.shardIdx];
-        return shard.getQueue(location.queueName).find(jobId) ?? shard.waitingDeps.get(jobId) ?? null;
+        return (shard.getQueue(location.queueName).find(jobId) ??
+            shard.waitingDeps.get(jobId) ??
+            shard.waitingChildren.get(jobId) ??
+            null);
     }
     if (location.type === 'processing') {
         return ctx.processingShards[location.shardIdx].get(jobId) ?? null;

package/dist/application/queueManager.d.ts CHANGED Viewed

@@ -28,6 +28,8 @@ export declare class QueueManager {
     private readonly jobIndex;
     private readonly completedJobs;
     private readonly completedJobsData;
+    private readonly depCompletions;
+    private readonly timedOutJobs;
     private readonly jobResults;
     private readonly customIdMap;
     private readonly jobLogs;
@@ -227,6 +229,7 @@ export declare class QueueManager {
     unregisterWorkersByClientId(clientId: string): number;
     getJobIndex(): Map<JobId, JobLocation>;
     getCompletedJobs(): SetLike<JobId>;
+    getDepCompletions(): SetLike<JobId>;
     getShards(): Shard[];
     private onJobCompleted;
     /** Check if completing this job completes an entire flow */

package/dist/application/queueManager.js CHANGED Viewed

@@ -45,6 +45,16 @@ export class QueueManager {
     jobIndex = new Map();
     completedJobs;
     completedJobsData;
+    // Bare completion ids of removeOnComplete jobs — kept ONLY so dependent jobs
+    // can unblock (no payload, not surfaced in state/stats). Bounded like
+    // completedJobs; entries are not pruned eagerly — they age out FIFO at the cap.
+    depCompletions;
+    // Ids of jobs failed by the timeout sweep. A late ACK whose lock token no
+    // longer matches (the job was requeued for retry) is discarded for these,
+    // instead of phantom-completing the job and skipping the retry. Bounded;
+    // never needs explicit clearing because a legit retry ACK carries a valid
+    // current token and bypasses the stale-token recovery path entirely.
+    timedOutJobs;
     jobResults;
     customIdMap;
     jobLogs;
@@ -83,7 +93,13 @@ export class QueueManager {
         totalCompleted: { value: 0n },
         totalFailed: { value: 0n },
     };
-    perQueueMetrics = new Map();
+    // LRU-bounded so high-cardinality / dynamically-named queues cannot grow it
+    // without bound. Live queues stay resident (recently accessed on every
+    // ack/fail); only long-idle ephemeral names are evicted. obliterate() also
+    // deletes the entry explicitly. Reclaiming on a transient drain is avoided
+    // on purpose so cumulative per-queue counters survive idle periods.
+    // Assigned in the constructor (needs this.config).
+    perQueueMetrics;
     startTime = Date.now();
     // Background task handles
     backgroundTaskHandles;
@@ -102,6 +118,9 @@ export class QueueManager {
             this.jobIndex.delete(jobId);
             this.completedJobsData.delete(jobId);
         });
+        this.depCompletions = new BoundedSet(this.config.maxCompletedJobs);
+        this.timedOutJobs = new BoundedSet(this.config.maxCompletedJobs);
+        this.perQueueMetrics = new LRUMap(this.config.maxCustomIds);
         this.jobResults = new LRUMap(this.config.maxJobResults);
         this.customIdMap = new LRUMap(this.config.maxCustomIds);
         this.jobLogs = new LRUMap(this.config.maxJobLogs);
@@ -154,6 +173,8 @@ export class QueueManager {
             jobIndex: this.jobIndex,
             completedJobs: this.completedJobs,
             completedJobsData: this.completedJobsData,
+            depCompletions: this.depCompletions,
+            timedOutJobs: this.timedOutJobs,
             jobResults: this.jobResults,
             customIdMap: this.customIdMap,
             jobLogs: this.jobLogs,
@@ -265,6 +286,13 @@ export class QueueManager {
                 // Job may have been stall-retried to queue while we processed it.
                 // Complete it from queue to prevent duplicate execution (Issue #33).
                 if (loc?.type === 'queue') {
+                    // BUT a job failed by the timeout sweep is requeued for RETRY — a late
+                    // ACK from the timed-out worker must not complete it (that would skip
+                    // the retry and silently override the timeout). Discard it gracefully.
+                    if (this.timedOutJobs.has(jobId)) {
+                        lockMgr.releaseLock(jobId, lockCtx, token);
+                        return;
+                    }
                     await this.completeStallRetriedJob(jobId, result);
                     lockMgr.releaseLock(jobId, lockCtx, token);
                 }
@@ -281,6 +309,13 @@ export class QueueManager {
             // Without token: only if job was stall-retried (attempts > 0), to avoid
             // completing freshly-pushed jobs that were never pulled.
             if (err instanceof Error && err.message.includes('not found')) {
+                // A timeout-failed job requeued for retry must not be completed by a
+                // stale ACK from the timed-out worker — discard it so the retry wins.
+                if (this.timedOutJobs.has(jobId)) {
+                    if (token)
+                        lockMgr.releaseLock(jobId, lockCtx, token);
+                    return;
+                }
                 const shouldRecover = token ?? this.isStallRetried(jobId);
                 if (shouldRecover && (await this.completeStallRetriedJob(jobId, result))) {
                     if (token)
@@ -306,7 +341,11 @@ export class QueueManager {
                     // from the queue to prevent duplicate execution.
                     const loc = this.jobIndex.get(jobIds[i]);
                     if (loc?.type === 'queue') {
-                        await this.completeStallRetriedJob(jobIds[i], undefined);
+                        // Skip completion for a timeout-requeued job (retry must win); else
+                        // recover the stall-retried job to prevent duplicate execution (#75).
+                        if (!this.timedOutJobs.has(jobIds[i])) {
+                            await this.completeStallRetriedJob(jobIds[i], undefined);
+                        }
                         lockMgr.releaseLock(jobIds[i], lockCtx, t);
                     }
                     continue;
@@ -344,7 +383,11 @@ export class QueueManager {
                 // from the queue to prevent duplicate execution.
                 const loc = this.jobIndex.get(item.id);
                 if (loc?.type === 'queue') {
-                    await this.completeStallRetriedJob(item.id, item.result);
+                    // Skip completion for a timeout-requeued job (retry must win); else
+                    // recover the stall-retried job to prevent duplicate execution (#75).
+                    if (!this.timedOutJobs.has(item.id)) {
+                        await this.completeStallRetriedJob(item.id, item.result);
+                    }
                     lockMgr.releaseLock(item.id, lockCtx, item.token);
                 }
                 continue;
@@ -688,6 +731,11 @@ export class QueueManager {
         }
         for (const cid of customIdsToDelete)
             this.customIdMap.delete(cid);
+        // Per-queue cumulative counters are keyed by queue name and never expire on
+        // their own; obliterate is the documented way to reclaim ALL state for a
+        // queue, so drop its metrics entry too (prevents unbounded growth for
+        // ephemeral/dynamically-named queues).
+        this.perQueueMetrics.delete(queue);
         this.unregisterQueueName(queue);
         this.dashboardEmit?.('queue:obliterated', { queue });
         this.dashboardEmit?.('queue:removed', { queue });
@@ -993,6 +1041,12 @@ export class QueueManager {
     getCompletedJobs() {
         return this.completedJobs;
     }
+    // Bare completion ids of removeOnComplete jobs. The PUSH gate consults this so a
+    // late dependent on an evicted removeOnComplete parent is admitted (same window
+    // the readiness path / dependency processor already honor).
+    getDepCompletions() {
+        return this.depCompletions;
+    }
     getShards() {
         return this.shards;
     }
@@ -1421,6 +1475,8 @@ export class QueueManager {
         this.jobIndex.clear();
         this.completedJobs.clear();
         this.completedJobsData.clear();
+        this.depCompletions.clear();
+        this.timedOutJobs.clear();
         this.jobResults.clear();
         this.jobLogs.clear();
         this.customIdMap.clear();

package/dist/application/types.d.ts CHANGED Viewed

@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
 import type { Shard } from '../domain/queue/shard';
 import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
 import type { RWLock } from '../shared/lock';
-import type { LRUMap, BoundedSet, BoundedMap, SetLike } from '../shared/lru';
+import type { LRUMap, BoundedSet, BoundedMap, SetLike, MapLike } from '../shared/lru';
 import type { EventsManager } from './eventsManager';
 import type { WebhookManager } from './webhookManager';
 import type { WorkerManager } from './workerManager';
@@ -76,7 +76,7 @@ export interface QueueManagerState {
         };
     };
     readonly startTime: number;
-    readonly perQueueMetrics: Map<string, {
+    readonly perQueueMetrics: MapLike<string, {
         totalCompleted: bigint;
         totalFailed: bigint;
     }>;
@@ -103,6 +103,8 @@ export interface BackgroundContext extends QueueManagerState {
     workerManager: WorkerManager;
     monitoringState: MonitoringState;
     completedJobsData: BoundedMap<JobId, Job>;
+    depCompletions?: BoundedSet<JobId>;
+    timedOutJobs?: BoundedSet<JobId>;
 }
 /** Context for stats operations */
 export interface StatsContext {
@@ -132,7 +134,7 @@ export interface StatsContext {
         };
     };
     startTime: number;
-    perQueueMetrics?: Map<string, {
+    perQueueMetrics?: MapLike<string, {
         totalCompleted: bigint;
         totalFailed: bigint;
     }>;

package/dist/client/bunqueue.d.ts CHANGED Viewed

@@ -37,7 +37,7 @@ export declare class Bunqueue<T = unknown, R = unknown> {
         opts?: JobOptions;
     }>): Promise<Job<T>[]>;
     getJob(id: string): Promise<Job<T> | null>;
-    getJobCounts(): import("./queue/operations").JobCounts;
+    getJobCounts(): import("./queue/operations").JobCounts | Promise<import("./queue/operations").JobCounts>;
     getJobCountsAsync(): Promise<import("./queue/operations").JobCounts>;
     count(): number;
     countAsync(): Promise<number>;

package/dist/client/queue/operations/counts.d.ts CHANGED Viewed

@@ -17,8 +17,13 @@ export interface JobCounts {
     delayed: number;
     paused: number;
 }
-/** Get job counts (sync, embedded only) */
-export declare function getJobCounts(ctx: CountsContext): JobCounts;
+/**
+ * Get job counts.
+ * Embedded mode returns synchronously. TCP mode delegates to the async path so
+ * callers receive the REAL server-side counts (awaitable Promise) instead of
+ * hardcoded zeros — defect: getjobcounts-tcp-zero.
+ */
+export declare function getJobCounts(ctx: CountsContext): JobCounts | Promise<JobCounts>;
 /** Get job counts (async, works with TCP) */
 export declare function getJobCountsAsync(ctx: CountsContext): Promise<JobCounts>;
 /** Get waiting job count */

package/dist/client/queue/operations/counts.js CHANGED Viewed

@@ -5,18 +5,15 @@
  */
 import { getSharedManager } from '../../manager';
 import { pausedView } from '../../../shared/pausedView';
-/** Get job counts (sync, embedded only) */
+/**
+ * Get job counts.
+ * Embedded mode returns synchronously. TCP mode delegates to the async path so
+ * callers receive the REAL server-side counts (awaitable Promise) instead of
+ * hardcoded zeros — defect: getjobcounts-tcp-zero.
+ */
 export function getJobCounts(ctx) {
     if (!ctx.embedded) {
-        return {
-            waiting: 0,
-            prioritized: 0,
-            active: 0,
-            completed: 0,
-            failed: 0,
-            delayed: 0,
-            paused: 0,
-        };
+        return getJobCountsAsync(ctx);
     }
     const manager = getSharedManager();
     // Use queue-specific counts

package/dist/client/queue/queue.d.ts CHANGED Viewed

@@ -62,7 +62,7 @@ export declare class Queue<T = unknown> {
     getCompletedAsync(start?: number, end?: number): Promise<Job<T>[]>;
     getFailed(start?: number, end?: number): Job<T>[];
     getFailedAsync(start?: number, end?: number): Promise<Job<T>[]>;
-    getJobCounts(): countsOps.JobCounts;
+    getJobCounts(): countsOps.JobCounts | Promise<countsOps.JobCounts>;
     getJobCountsAsync(): Promise<countsOps.JobCounts>;
     getWaitingCount(): Promise<number>;
     getActiveCount(): Promise<number>;

package/dist/client/tcpPool.d.ts CHANGED Viewed

@@ -27,6 +27,13 @@ export declare class TcpConnectionPool {
     send(command: Record<string, unknown>): Promise<Record<string, unknown>>;
     /** Send multiple commands in parallel across pool */
     sendParallel(commands: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>>;
+    /**
+     * Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
+     * 'connected' on every successful connect, including reconnects. Used by
+     * Worker to re-register after a reconnect (the server drops registration when
+     * the registering connection closes and each reconnect gets a fresh clientId).
+     */
+    onReconnect(cb: () => void): void;
     /** Check if any connection is ready */
     isConnected(): boolean;
     /** Get number of connected clients */

package/dist/client/tcpPool.js CHANGED Viewed

@@ -99,6 +99,17 @@ export class TcpConnectionPool {
         });
         return Promise.all(promises);
     }
+    /**
+     * Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
+     * 'connected' on every successful connect, including reconnects. Used by
+     * Worker to re-register after a reconnect (the server drops registration when
+     * the registering connection closes and each reconnect gets a fresh clientId).
+     */
+    onReconnect(cb) {
+        for (const client of this.clients) {
+            client.on('connected', cb);
+        }
+    }
     /** Check if any connection is ready */
     isConnected() {
         return this.clients.some((c) => c.isConnected());

package/dist/client/worker/worker.d.ts CHANGED Viewed

@@ -28,6 +28,7 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
     private running;
     private paused;
     private _closing;
+    private _forceClose;
     private _closingPromise;
     private closed;
     private activeJobs;
@@ -47,6 +48,7 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
     private pendingJobs;
     private pendingJobsHead;
     private processingScheduled;
+    private pendingPull;
     private lastDrainedEmit;
     private stalledUnsubscribe;
     on(event: 'ready' | 'drained' | 'closed', listener: () => void): this;
@@ -123,6 +125,16 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
     extendJobLocks(jobIds: string[], tokens: string[], duration: number): Promise<number>;
     close(force?: boolean): Promise<void>;
     private _doClose;
+    /**
+     * Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
+     * These jobs are `active` server-side holding a lock; moving them back to
+     * `waiting` makes them re-pullable so nothing is lost, and removes them from
+     * the close drain (which would otherwise hang on a buffer that can never be
+     * advanced while `_closing` is set). Best-effort: a job that can't be
+     * released (e.g. already completed/stall-retried) is simply dropped from the
+     * local buffer — its server-side lock will expire and requeue it.
+     */
+    private releaseBufferedJobs;
     private poll;
     private tryProcess;
     private registerPulledJobs;

package/dist/client/worker/worker.js CHANGED Viewed

@@ -13,7 +13,7 @@ import { parseJobFromResponse } from './jobParser';
 import { processJob } from './processor';
 import { WorkerRateLimiter } from './workerRateLimiter';
 import { GroupConcurrencyLimiter } from './groupConcurrency';
-import { startHeartbeat } from './workerHeartbeat';
+import { startHeartbeat, sendHeartbeat } from './workerHeartbeat';
 import { pullEmbedded, pullTcp } from './workerPull';
 import { resolveToken } from '../resolveToken';
 /** Resolve WorkerOptions into ExtendedWorkerOptions with defaults */
@@ -75,6 +75,9 @@ export class Worker extends EventEmitter {
     running = false;
     paused = false;
     _closing = false;
+    // Set when a force close is requested — allows close(true) to pre-empt an
+    // in-progress graceful close(false) by breaking out of its drain loop.
+    _forceClose = false;
     _closingPromise = null;
     closed = false;
     activeJobs = 0;
@@ -99,6 +102,9 @@ export class Worker extends EventEmitter {
     pendingJobs = [];
     pendingJobsHead = 0;
     processingScheduled = false; // Prevent multiple setImmediate calls
+    // Slots reserved by in-flight doPullBatch() calls (Issue #98). Subtracted from
+    // free slots so overlapping pulls see each other and do not over-lease.
+    pendingPull = 0;
     // Drained event tracking
     lastDrainedEmit = 0;
     // Stalled event subscription (BullMQ v5 compatible)
@@ -140,6 +146,17 @@ export class Worker extends EventEmitter {
             this.tcpPool = createTcpPool(opts, this.opts.concurrency);
             this.tcp = this.tcpPool;
             this.ackBatcher.setTcp(this.tcp);
+            // The server drops worker registration when the registering connection
+            // closes, and each pooled reconnect gets a fresh server clientId. Re-send
+            // RegisterWorker on reconnect so the worker stays visible in
+            // WorkerManager (ListWorkers / getForQueue / skipIfNoWorker) while it
+            // keeps consuming jobs. Only acts once we were actually registered.
+            this.tcpPool.onReconnect(() => {
+                if (this.closed || this._closing || !this.registered)
+                    return;
+                this.registered = false;
+                this.registerWithServer();
+            });
         }
         if (this.opts.autorun)
             this.run();
@@ -151,6 +168,7 @@ export class Worker extends EventEmitter {
         this.running = true;
         this.paused = false;
         this._closing = false;
+        this._forceClose = false;
         this._closingPromise = null;
         // Defer the 'ready' emit so listeners attached synchronously after
         // construction (e.g. `new Worker(...).on('ready', ...)`) still receive it.
@@ -411,8 +429,16 @@ export class Worker extends EventEmitter {
     async close(force = false) {
         if (this.closed)
             return;
-        if (this._closingPromise)
+        // A force close must be able to pre-empt an in-progress graceful close:
+        // the graceful drain only waits on genuinely in-flight jobs, but a caller
+        // asking to force-close wants to stop waiting now. Flip the force flag so
+        // the running _doClose's drain loop exits, and return the same promise.
+        if (this._closingPromise) {
+            if (force)
+                this._forceClose = true;
             return this._closingPromise;
+        }
+        this._forceClose = force;
         this._closingPromise = this._doClose(force);
         return this._closingPromise;
     }
@@ -432,9 +458,19 @@ export class Worker extends EventEmitter {
             clearInterval(this.workerHeartbeatTimer);
             this.workerHeartbeatTimer = null;
         }
+        // Release buffered pulled-but-unstarted jobs back to the queue. During
+        // _closing every code path that would advance the buffer (poll/tryProcess/
+        // the startJob().finally re-poll) early-returns, so a buffered job is never
+        // started — yet it holds a server-side lock and sits in `active` state.
+        // Without this, a graceful drain that waited on the buffer would hang
+        // forever (e.g. group-limited jobs buffered behind a max:1 limiter). We
+        // requeue them (Active -> Waiting) so they are re-pullable and not lost.
+        await this.releaseBufferedJobs();
         if (!force) {
-            const bufferSize = () => this.pendingJobs.length - this.pendingJobsHead;
-            while (this.activeJobs > 0 || bufferSize() > 0) {
+            // Wait only on genuinely in-flight jobs. The buffer was released above,
+            // so it must NOT be part of the drain condition. A concurrent force
+            // close (this._forceClose) breaks out immediately.
+            while (this.activeJobs > 0 && !this._forceClose) {
                 await Bun.sleep(50);
             }
         }
@@ -470,6 +506,45 @@ export class Worker extends EventEmitter {
         this._closing = false;
         this.emit('closed');
     }
+    /**
+     * Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
+     * These jobs are `active` server-side holding a lock; moving them back to
+     * `waiting` makes them re-pullable so nothing is lost, and removes them from
+     * the close drain (which would otherwise hang on a buffer that can never be
+     * advanced while `_closing` is set). Best-effort: a job that can't be
+     * released (e.g. already completed/stall-retried) is simply dropped from the
+     * local buffer — its server-side lock will expire and requeue it.
+     */
+    async releaseBufferedJobs() {
+        const buffered = this.pendingJobs.slice(this.pendingJobsHead);
+        this.pendingJobs = [];
+        this.pendingJobsHead = 0;
+        if (buffered.length === 0)
+            return;
+        for (const { job, token } of buffered) {
+            const id = String(job.id);
+            try {
+                if (this.embedded) {
+                    const manager = getSharedManager();
+                    await manager.moveActiveToWait(jobId(id));
+                    // moveActiveToWait re-queues the job (active -> waiting); release the
+                    // lock token we still hold so it is fully owner-free and re-pullable.
+                    if (this.opts.useLocks)
+                        manager.releaseLock(jobId(id), token ?? undefined);
+                }
+                else if (this.tcp) {
+                    await this.tcp.send({ cmd: 'MoveToWait', id });
+                }
+            }
+            catch {
+                // Best-effort: lock expiration will requeue anything we couldn't release.
+            }
+            finally {
+                this.pulledJobIds.delete(id);
+                this.jobTokens.delete(id);
+            }
+        }
+    }
     // ============ Processing Pipeline ============
     poll() {
         if (!this.running || this._closing)
@@ -568,6 +643,18 @@ export class Worker extends EventEmitter {
                 this.jobTokens.set(jobIdStr, pulledItem.token);
             }
         }
+        // Renew the just-pulled locks immediately (fire-and-forget). With a pooled
+        // client (poolSize > 1), the PULL and subsequent heartbeats travel on
+        // different connections; the server only releases an in-flight job on
+        // connection drop if its lock was never renewed (renewalCount === 0). The
+        // periodic heartbeat timer does not fire until one interval later, leaving a
+        // window where a dropped pulling-socket would re-dispatch a job the worker
+        // is actively running. An immediate renew closes that window. Only needed
+        // for multi-connection lock-based workers (poolSize 1 keeps pull+heartbeat
+        // on the same socket, so there is nothing to protect against).
+        if (this.opts.useLocks && items.length > 0 && this.tcpPool && this.tcpPool.getPoolSize() > 1) {
+            void sendHeartbeat(this.getHeartbeatDeps());
+        }
     }
     getBufferedJob() {
         if (this.pendingJobsHead >= this.pendingJobs.length)
@@ -619,14 +706,46 @@ export class Worker extends EventEmitter {
         return null;
     }
     async doPullBatch() {
-        const slots = this.opts.concurrency - this.activeJobs;
+        // Issue #98: cap the LEASED count (running + buffered + in-flight pulls) at
+        // `concurrency`, not just the running count. The old `concurrency - activeJobs`
+        // was read once and the pull leases jobs on the broker across an await, so:
+        //   1. several concurrent finally->poll->tryProcess runs each read the same
+        //      stale count and each pull a full batch, and
+        //   2. a job just pulled by one run sits in `pendingJobs` (leased, counted by
+        //      the heartbeat) but not yet in `activeJobs`, so an overlapping pull does
+        //      not see it.
+        // Both leak: with concurrency=3 the worker ends up holding 5-6 jobs leased.
+        // `pulledJobIds.size` is the true leased count (active + buffered; a job is
+        // removed only on completion), and `pendingPull` reserves slots for pulls
+        // still in flight whose jobs are not yet registered.
+        //
+        // Exception — group pull-ahead: when a group limiter is set AND the buffer is
+        // non-empty here (this branch is reached only after getNextEligibleJob() found
+        // nothing runnable, so those buffered jobs are group-blocked), the worker must
+        // pull ahead to discover jobs from other, runnable groups — otherwise it would
+        // wedge on a buffer full of one blocked group. In that case the blocked
+        // buffered jobs are not counted (only the running ones are). This preserves the
+        // existing group behavior; the reported over-pull (no group limiter) always
+        // uses the strict leased cap.
+        const groupBlockedBuffer = this.groupLimiter !== null && this.pendingJobsHead < this.pendingJobs.length;
+        const leased = groupBlockedBuffer ? this.activeJobs : this.pulledJobIds.size;
+        const slots = this.opts.concurrency - leased - this.pendingPull;
         const batchSize = Math.min(this.opts.batchSize, slots, 1000);
         if (batchSize <= 0)
             return [];
         const config = this.getPullConfig();
-        return this.embedded
-            ? pullEmbedded(config, batchSize)
-            : pullTcp(config, this.tcp, batchSize, this._closing);
+        this.pendingPull += batchSize;
+        try {
+            return this.embedded
+                ? await pullEmbedded(config, batchSize)
+                : await pullTcp(config, this.tcp, batchSize, this._closing);
+        }
+        finally {
+            // Release the reservation. The pulled jobs are now registered/buffered (or
+            // the pull failed); either way the reservation has served its purpose for
+            // the duration of the in-flight pull.
+            this.pendingPull -= batchSize;
+        }
     }
     startJob(job, token) {
         const jobIdStr = String(job.id);

package/dist/infrastructure/persistence/sqlite.d.ts CHANGED Viewed

@@ -82,6 +82,8 @@ export declare class SqliteStorage {
     insertJob(job: Job, durable?: boolean): void;
     /** Insert job immediately (bypass buffer) */
     insertJobImmediate(job: Job): void;
+    /** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
+    private runInsertJobStmt;
     /**
      * Ensure a job's buffered INSERT has been written to disk before issuing a
      * state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
@@ -122,8 +124,13 @@ export declare class SqliteStorage {
     getJobStateRaw(jobId: JobId): string | null;
     /** Load all completed job IDs (for dependency recovery) */
     loadCompletedJobIds(): Set<JobId>;
-    /** Insert batch of jobs (adds to buffer) */
-    insertJobsBatch(jobs: Job[]): void;
+    /**
+     * Insert batch of jobs. By default the jobs go through the write buffer.
+     * When `durable` is true they bypass the buffer and are written to disk
+     * immediately (matching single-push durable semantics) — used for addBulk
+     * jobs flagged `durable: true`.
+     */
+    insertJobsBatch(jobs: Job[], durable?: boolean): void;
     /**
      * Query jobs by queue with optional state filter and pagination.
      * Uses idx_jobs_queue_state index for O(log n) lookups.

package/dist/infrastructure/persistence/sqlite.js CHANGED Viewed

@@ -244,11 +244,15 @@ export class SqliteStorage {
     /** Insert job immediately (bypass buffer) */
     insertJobImmediate(job) {
         this.safeWrite(() => {
-            this.statements
-                .get('insertJob')
-                .run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
+            this.runInsertJobStmt(job);
         });
     }
+    /** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
+    runInsertJobStmt(job) {
+        this.statements
+            .get('insertJob')
+            .run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
+    }
     /**
      * Ensure a job's buffered INSERT has been written to disk before issuing a
      * state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
@@ -420,11 +424,38 @@ export class SqliteStorage {
     /** Load all completed job IDs (for dependency recovery) */
     loadCompletedJobIds() {
         const rows = this.db.query('SELECT job_id FROM job_results').all();
-        return new Set(rows.map((r) => r.job_id));
+        const ids = new Set(rows.map((r) => r.job_id));
+        // A job acked with no/undefined result has state='completed' but NO job_results
+        // row. Include state='completed' ids so dependency recovery still sees it as
+        // done and unblocks dependents (instead of parking them forever).
+        const stateRows = this.db
+            .query("SELECT id FROM jobs WHERE state = 'completed'")
+            .all();
+        for (const r of stateRows)
+            ids.add(r.id);
+        return ids;
     }
     // ============ Bulk Operations ============
-    /** Insert batch of jobs (adds to buffer) */
-    insertJobsBatch(jobs) {
+    /**
+     * Insert batch of jobs. By default the jobs go through the write buffer.
+     * When `durable` is true they bypass the buffer and are written to disk
+     * immediately (matching single-push durable semantics) — used for addBulk
+     * jobs flagged `durable: true`.
+     */
+    insertJobsBatch(jobs, durable) {
+        if (durable) {
+            // Atomic immediate write: every row hits disk in ONE transaction, so a
+            // mid-batch failure rolls back the whole batch (no partial on-disk state)
+            // — bypassing the write buffer to honor the durable contract.
+            this.safeWrite(() => {
+                const tx = this.db.transaction((batch) => {
+                    for (const job of batch)
+                        this.runInsertJobStmt(job);
+                });
+                tx(jobs);
+            });
+            return;
+        }
         this.writeBuffer.addBatch(jobs);
     }
     // ============ Query Operations ============

package/dist/infrastructure/server/handlers/core.js CHANGED Viewed

@@ -29,7 +29,12 @@ export async function handlePush(cmd, ctx, reqId) {
         for (const depId of cmd.dependsOn) {
             const depJobId = jobId(depId);
             const exists = ctx.queueManager.getJobIndex().has(depJobId) ||
-                ctx.queueManager.getCompletedJobs().has(depJobId);
+                ctx.queueManager.getCompletedJobs().has(depJobId) ||
+                // A removeOnComplete parent that completed is recorded only here (its row
+                // is deleted and it leaves jobIndex/completedJobs). The readiness path and
+                // dependency processor already honor depCompletions; the gate must too, or
+                // a late dependent is wrongly rejected with "Dependency job not found".
+                ctx.queueManager.getDepCompletions().has(depJobId);
             if (!exists) {
                 return resp.error(`Dependency job not found: ${depId}`, reqId);
             }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "bunqueue",
-  "version": "2.8.13",
+  "version": "2.8.18",
   "description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
   "type": "module",
   "main": "dist/main.js",