bunqueue 2.8.13 → 2.8.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/application/backgroundTasks.d.ts +1 -0
- package/dist/application/backgroundTasks.js +6 -1
- package/dist/application/cleanupTasks.js +5 -0
- package/dist/application/clientTracking.js +11 -0
- package/dist/application/contextFactory.d.ts +4 -2
- package/dist/application/contextFactory.js +5 -0
- package/dist/application/dependencyProcessor.js +9 -2
- package/dist/application/lockManager.js +16 -1
- package/dist/application/operations/ack.d.ts +3 -1
- package/dist/application/operations/ack.js +5 -0
- package/dist/application/operations/ackHelpers.d.ts +3 -1
- package/dist/application/operations/ackHelpers.js +5 -0
- package/dist/application/operations/jobManagement.js +10 -0
- package/dist/application/operations/push.d.ts +4 -0
- package/dist/application/operations/push.js +26 -2
- package/dist/application/operations/queryOperations.js +8 -2
- package/dist/application/queueManager.d.ts +3 -0
- package/dist/application/queueManager.js +59 -3
- package/dist/application/types.d.ts +5 -3
- package/dist/client/bunqueue.d.ts +1 -1
- package/dist/client/queue/operations/counts.d.ts +7 -2
- package/dist/client/queue/operations/counts.js +7 -10
- package/dist/client/queue/queue.d.ts +1 -1
- package/dist/client/tcpPool.d.ts +7 -0
- package/dist/client/tcpPool.js +11 -0
- package/dist/client/worker/worker.d.ts +12 -0
- package/dist/client/worker/worker.js +127 -8
- package/dist/infrastructure/persistence/sqlite.d.ts +9 -2
- package/dist/infrastructure/persistence/sqlite.js +37 -6
- package/dist/infrastructure/server/handlers/core.js +6 -1
- package/package.json +1 -1
|
@@ -26,5 +26,6 @@ export declare function startBackgroundTasks(ctx: BackgroundContext, cronSchedul
|
|
|
26
26
|
* Stop all background tasks
|
|
27
27
|
*/
|
|
28
28
|
export declare function stopBackgroundTasks(handles: BackgroundTaskHandles): void;
|
|
29
|
+
export declare function checkJobTimeouts(ctx: BackgroundContext): void;
|
|
29
30
|
export declare function recover(ctx: BackgroundContext): void;
|
|
30
31
|
export { processPendingDependencies };
|
|
@@ -108,7 +108,7 @@ function getLockContext(ctx) {
|
|
|
108
108
|
};
|
|
109
109
|
}
|
|
110
110
|
// ============ Job Timeouts ============
|
|
111
|
-
function checkJobTimeouts(ctx) {
|
|
111
|
+
export function checkJobTimeouts(ctx) {
|
|
112
112
|
const now = Date.now();
|
|
113
113
|
for (const procShard of ctx.processingShards) {
|
|
114
114
|
for (const [jobId, job] of procShard) {
|
|
@@ -118,6 +118,11 @@ function checkJobTimeouts(ctx) {
|
|
|
118
118
|
queue: job.queue,
|
|
119
119
|
timeout: job.timeout,
|
|
120
120
|
});
|
|
121
|
+
// Mark as timed-out BEFORE requeuing for retry, so a late ACK from the
|
|
122
|
+
// (still-hung) worker that exceeded the deadline is discarded instead of
|
|
123
|
+
// phantom-completing the job and skipping the retry. See ack-recovery in
|
|
124
|
+
// queueManager.ack (timedOutJobs guard).
|
|
125
|
+
ctx.timedOutJobs?.add(jobId);
|
|
121
126
|
ctx.fail(jobId, 'Job timeout exceeded').catch((err) => {
|
|
122
127
|
queueLog.error('Failed to mark timed out job as failed', {
|
|
123
128
|
jobId: String(jobId),
|
|
@@ -221,6 +221,11 @@ function cleanEmptyQueues(ctx) {
|
|
|
221
221
|
shard.clearQueueLimiters(queueName);
|
|
222
222
|
shard.stallConfig.delete(queueName);
|
|
223
223
|
shard.dlqConfig.delete(queueName);
|
|
224
|
+
// NOTE: perQueueMetrics is intentionally NOT pruned here — it is an
|
|
225
|
+
// LRU-bounded map and these counters are cumulative, so they must survive
|
|
226
|
+
// a transient drain (a busy queue momentarily empty must not reset to 0).
|
|
227
|
+
// obliterate() reclaims it explicitly; the LRU cap bounds growth for
|
|
228
|
+
// ephemeral/dynamically-named queues.
|
|
224
229
|
ctx.dashboardEmit?.('queue:removed', { queue: queueName });
|
|
225
230
|
ctx.unregisterQueueName(queueName);
|
|
226
231
|
}
|
|
@@ -47,6 +47,17 @@ export async function releaseClientJobs(clientId, ctx) {
|
|
|
47
47
|
const loc = ctx.jobIndex.get(jobId);
|
|
48
48
|
if (loc?.type !== 'processing')
|
|
49
49
|
continue;
|
|
50
|
+
// A job whose lock has been renewed since pull (renewalCount > 0) is being
|
|
51
|
+
// actively heartbeated by a live worker. With a pooled client, heartbeats
|
|
52
|
+
// travel on a DIFFERENT connection than the one that pulled, so THIS socket
|
|
53
|
+
// closing does not mean the worker died — re-queuing here would re-dispatch
|
|
54
|
+
// (double-execute) a job the worker still holds. Leave it; lock expiry /
|
|
55
|
+
// stall detection reclaims it if the worker truly stops heartbeating.
|
|
56
|
+
// A never-renewed lock (renewalCount === 0) keeps the original fast-recovery
|
|
57
|
+
// behavior: requeue immediately on disconnect.
|
|
58
|
+
const lock = ctx.jobLocks.get(jobId);
|
|
59
|
+
if (lock && lock.renewalCount > 0)
|
|
60
|
+
continue;
|
|
50
61
|
const procIdx = loc.shardIdx;
|
|
51
62
|
const job = ctx.processingShards[procIdx].get(jobId);
|
|
52
63
|
if (!job)
|
|
@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
|
|
|
7
7
|
import type { Shard } from '../domain/queue/shard';
|
|
8
8
|
import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
|
|
9
9
|
import type { RWLock } from '../shared/lock';
|
|
10
|
-
import type { LRUMap, BoundedSet, BoundedMap } from '../shared/lru';
|
|
10
|
+
import type { LRUMap, BoundedSet, BoundedMap, MapLike } from '../shared/lru';
|
|
11
11
|
import type { WebhookManager } from './webhookManager';
|
|
12
12
|
import type { WorkerManager } from './workerManager';
|
|
13
13
|
import type { EventsManager } from './eventsManager';
|
|
@@ -33,6 +33,8 @@ export interface ContextDependencies {
|
|
|
33
33
|
jobIndex: Map<JobId, JobLocation>;
|
|
34
34
|
completedJobs: BoundedSet<JobId>;
|
|
35
35
|
completedJobsData: BoundedMap<JobId, Job>;
|
|
36
|
+
depCompletions?: BoundedSet<JobId>;
|
|
37
|
+
timedOutJobs?: BoundedSet<JobId>;
|
|
36
38
|
jobResults: LRUMap<JobId, unknown>;
|
|
37
39
|
customIdMap: LRUMap<string, JobId>;
|
|
38
40
|
jobLogs: LRUMap<JobId, JobLogEntry[]>;
|
|
@@ -62,7 +64,7 @@ export interface ContextDependencies {
|
|
|
62
64
|
};
|
|
63
65
|
startTime: number;
|
|
64
66
|
maxLogsPerJob: number;
|
|
65
|
-
perQueueMetrics:
|
|
67
|
+
perQueueMetrics: MapLike<string, {
|
|
66
68
|
totalCompleted: bigint;
|
|
67
69
|
totalFailed: bigint;
|
|
68
70
|
}>;
|
|
@@ -34,6 +34,8 @@ export class ContextFactory {
|
|
|
34
34
|
processingLocks: this.deps.processingLocks,
|
|
35
35
|
jobIndex: this.deps.jobIndex,
|
|
36
36
|
completedJobs: this.deps.completedJobs,
|
|
37
|
+
depCompletions: this.deps.depCompletions,
|
|
38
|
+
timedOutJobs: this.deps.timedOutJobs,
|
|
37
39
|
jobResults: this.deps.jobResults,
|
|
38
40
|
customIdMap: this.deps.customIdMap,
|
|
39
41
|
jobLogs: this.deps.jobLogs,
|
|
@@ -81,6 +83,8 @@ export class ContextFactory {
|
|
|
81
83
|
shardLocks: this.deps.shardLocks,
|
|
82
84
|
completedJobs: this.deps.completedJobs,
|
|
83
85
|
completedJobsData: this.deps.completedJobsData,
|
|
86
|
+
depCompletions: this.deps.depCompletions,
|
|
87
|
+
timedOutJobs: this.deps.timedOutJobs,
|
|
84
88
|
jobResults: this.deps.jobResults,
|
|
85
89
|
customIdMap: this.deps.customIdMap,
|
|
86
90
|
jobIndex: this.deps.jobIndex,
|
|
@@ -111,6 +115,7 @@ export class ContextFactory {
|
|
|
111
115
|
processingLocks: this.deps.processingLocks,
|
|
112
116
|
completedJobs: this.deps.completedJobs,
|
|
113
117
|
completedJobsData: this.deps.completedJobsData,
|
|
118
|
+
depCompletions: this.deps.depCompletions,
|
|
114
119
|
jobResults: this.deps.jobResults,
|
|
115
120
|
jobIndex: this.deps.jobIndex,
|
|
116
121
|
customIdMap: this.deps.customIdMap,
|
|
@@ -37,10 +37,12 @@ export async function processPendingDependencies(ctx) {
|
|
|
37
37
|
await withWriteLock(ctx.shardLocks[i], () => {
|
|
38
38
|
const shard = ctx.shards[i];
|
|
39
39
|
const jobsToPromote = [];
|
|
40
|
-
// Check which jobs have all dependencies satisfied (inside lock)
|
|
40
|
+
// Check which jobs have all dependencies satisfied (inside lock).
|
|
41
|
+
// A removeOnComplete parent is not in completedJobs (its full record was
|
|
42
|
+
// dropped to bound memory), so also honor its bare-id depCompletions entry.
|
|
41
43
|
for (const jobId of jobIdsToCheck) {
|
|
42
44
|
const job = shard.waitingDeps.get(jobId);
|
|
43
|
-
if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep))) {
|
|
45
|
+
if (job?.dependsOn.every((dep) => ctx.completedJobs.has(dep) || (ctx.depCompletions?.has(dep) ?? false))) {
|
|
44
46
|
jobsToPromote.push(job);
|
|
45
47
|
}
|
|
46
48
|
}
|
|
@@ -50,6 +52,11 @@ export async function processPendingDependencies(ctx) {
|
|
|
50
52
|
}
|
|
51
53
|
});
|
|
52
54
|
}));
|
|
55
|
+
// NOTE: depCompletions is intentionally NOT pruned here. It is a FIFO
|
|
56
|
+
// BoundedSet (same cap as completedJobs), so it self-bounds. Pruning eagerly
|
|
57
|
+
// once "no waiters remain" would orphan a dependent pushed AFTER a
|
|
58
|
+
// removeOnComplete parent completed — exactly the symmetry completedJobs
|
|
59
|
+
// provides for normal parents (readiness holds for the whole bounded window).
|
|
53
60
|
}
|
|
54
61
|
/** Move jobs from waitingDeps to the active queue */
|
|
55
62
|
function promoteJobsToQueue(jobsToPromote, shard, ctx, shardIdx) {
|
|
@@ -111,8 +111,19 @@ function processExpiredLockInner(jobId, lock, job, shardIdx, procIdx, ctx, now)
|
|
|
111
111
|
/** Move job to DLQ when max stalls exceeded */
|
|
112
112
|
function handleMaxStallsExceeded(opts) {
|
|
113
113
|
const { jobId, job, lock, shard, ctx, now } = opts;
|
|
114
|
-
|
|
114
|
+
// Release the concurrency slot (+group+uniqueKey) acquired at pull before
|
|
115
|
+
// moving to DLQ — otherwise the slot leaks (mirrors
|
|
116
|
+
// stallDetection.moveStalliedJobToDlq).
|
|
117
|
+
shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
|
|
118
|
+
const entry = shard.addToDlq(job, "stalled" /* FailureReason.Stalled */, `Lock expired after ${lock.renewalCount} renewals`);
|
|
115
119
|
ctx.jobIndex.set(jobId, { type: 'dlq', queueName: job.queue });
|
|
120
|
+
// Persist the DLQ move like the sibling paths (ack.moveFailedJobToDlq,
|
|
121
|
+
// stallDetection.moveStalliedJobToDlq, backgroundTasks startup-recovery).
|
|
122
|
+
// Without these two writes the jobs row survives in SQLite as an orphan and
|
|
123
|
+
// the DLQ entry lives only in memory — a later retry then re-INSERTs the
|
|
124
|
+
// surviving row and throws `UNIQUE constraint failed: jobs.id` (#97).
|
|
125
|
+
ctx.storage?.saveDlqEntry(entry);
|
|
126
|
+
ctx.storage?.deleteJob(jobId);
|
|
116
127
|
ctx.eventsManager.broadcast({
|
|
117
128
|
eventType: "failed" /* EventType.Failed */,
|
|
118
129
|
jobId,
|
|
@@ -125,6 +136,10 @@ function handleMaxStallsExceeded(opts) {
|
|
|
125
136
|
function requeueExpiredJob(opts) {
|
|
126
137
|
const { jobId, job, queue, idx, ctx, now } = opts;
|
|
127
138
|
const shard = ctx.shards[idx];
|
|
139
|
+
// Release the concurrency slot (+group+uniqueKey) acquired at pull before
|
|
140
|
+
// re-pushing — otherwise the slot leaks and the queue wedges (mirrors
|
|
141
|
+
// stallDetection.retryStalliedJob).
|
|
142
|
+
shard.releaseJobResources(job.queue, job.uniqueKey, job.groupId);
|
|
128
143
|
queue.push(job);
|
|
129
144
|
const isDelayed = job.runAt > now;
|
|
130
145
|
shard.incrementQueued(jobId, isDelayed, job.createdAt, job.queue, job.runAt);
|
|
@@ -17,6 +17,8 @@ export interface AckContext {
|
|
|
17
17
|
processingLocks: RWLock[];
|
|
18
18
|
completedJobs: SetLike<JobId>;
|
|
19
19
|
completedJobsData: MapLike<JobId, Job>;
|
|
20
|
+
/** Bare completion ids for removeOnComplete jobs so dependents can unblock */
|
|
21
|
+
depCompletions?: SetLike<JobId>;
|
|
20
22
|
jobResults: MapLike<JobId, unknown>;
|
|
21
23
|
jobIndex: Map<JobId, JobLocation>;
|
|
22
24
|
customIdMap?: MapLike<string, JobId>;
|
|
@@ -26,7 +28,7 @@ export interface AckContext {
|
|
|
26
28
|
totalFailed: {
|
|
27
29
|
value: bigint;
|
|
28
30
|
};
|
|
29
|
-
perQueueMetrics?:
|
|
31
|
+
perQueueMetrics?: MapLike<string, {
|
|
30
32
|
totalCompleted: bigint;
|
|
31
33
|
totalFailed: bigint;
|
|
32
34
|
}>;
|
|
@@ -50,6 +50,11 @@ export async function ackJob(jobId, result, ctx) {
|
|
|
50
50
|
else {
|
|
51
51
|
ctx.jobIndex.delete(jobId);
|
|
52
52
|
ctx.storage?.deleteJob(jobId);
|
|
53
|
+
// removeOnComplete drops the full job (index + data + persisted row) to bound
|
|
54
|
+
// memory, but dependent jobs gate readiness on completedJobs.has(parentId).
|
|
55
|
+
// Record the bare completion id (no payload) so dependents still unblock,
|
|
56
|
+
// without making the job appear in state/stats queries.
|
|
57
|
+
ctx.depCompletions?.add(jobId);
|
|
53
58
|
}
|
|
54
59
|
ctx.totalCompleted.value++;
|
|
55
60
|
if (ctx.perQueueMetrics) {
|
|
@@ -55,13 +55,15 @@ export interface FinalizeContext {
|
|
|
55
55
|
storage: SqliteStorage | null;
|
|
56
56
|
completedJobs: SetLike<JobId>;
|
|
57
57
|
completedJobsData: MapLike<JobId, Job>;
|
|
58
|
+
/** Bare completion ids for removeOnComplete jobs so dependents can unblock */
|
|
59
|
+
depCompletions?: SetLike<JobId>;
|
|
58
60
|
jobResults: MapLike<JobId, unknown>;
|
|
59
61
|
jobIndex: Map<JobId, JobLocation>;
|
|
60
62
|
customIdMap?: MapLike<string, JobId>;
|
|
61
63
|
totalCompleted: {
|
|
62
64
|
value: bigint;
|
|
63
65
|
};
|
|
64
|
-
perQueueMetrics?:
|
|
66
|
+
perQueueMetrics?: MapLike<string, {
|
|
65
67
|
totalCompleted: bigint;
|
|
66
68
|
totalFailed: bigint;
|
|
67
69
|
}>;
|
|
@@ -164,6 +164,11 @@ export function finalizeBatchAck(extractedJobs, ctx, includeResults) {
|
|
|
164
164
|
ctx.jobIndex.delete(jobId);
|
|
165
165
|
if (hasStorage)
|
|
166
166
|
storage.deleteJob(jobId);
|
|
167
|
+
// removeOnComplete drops the full job to bound memory, but dependents gate
|
|
168
|
+
// readiness on completedJobs.has(parentId). Record the bare completion id
|
|
169
|
+
// (no payload) so dependent jobs still unblock, without surfacing the job
|
|
170
|
+
// in state/stats queries.
|
|
171
|
+
ctx.depCompletions?.add(jobId);
|
|
167
172
|
}
|
|
168
173
|
}
|
|
169
174
|
// Broadcast events
|
|
@@ -23,6 +23,16 @@ export async function cancelJob(jobId, ctx) {
|
|
|
23
23
|
ctx.storage?.deleteJob(jobId);
|
|
24
24
|
return { success: true, queueName: location.queueName };
|
|
25
25
|
}
|
|
26
|
+
// Not in the run queue — it may be parked in waitingChildren (moved via
|
|
27
|
+
// moveToWaitingChildren, which already released its resources and does not
|
|
28
|
+
// track it in the queued counter, so do NOT decrement/release here).
|
|
29
|
+
const parked = shard.waitingChildren.get(jobId);
|
|
30
|
+
if (parked) {
|
|
31
|
+
shard.waitingChildren.delete(jobId);
|
|
32
|
+
ctx.jobIndex.delete(jobId);
|
|
33
|
+
ctx.storage?.deleteJob(jobId);
|
|
34
|
+
return { success: true, queueName: location.queueName };
|
|
35
|
+
}
|
|
26
36
|
return { success: false, queueName: location.queueName };
|
|
27
37
|
});
|
|
28
38
|
if (result.success) {
|
|
@@ -15,6 +15,10 @@ export interface PushContext {
|
|
|
15
15
|
shardLocks: RWLock[];
|
|
16
16
|
completedJobs: SetLike<JobId>;
|
|
17
17
|
completedJobsData: MapLike<JobId, Job>;
|
|
18
|
+
/** Bare completion ids for removeOnComplete jobs so dependents start ready */
|
|
19
|
+
depCompletions?: SetLike<JobId>;
|
|
20
|
+
/** Timeout markers — cleared on custom-id reuse so a recycled id starts clean */
|
|
21
|
+
timedOutJobs?: SetLike<JobId>;
|
|
18
22
|
jobResults: MapLike<JobId, unknown>;
|
|
19
23
|
customIdMap: MapLike<string, JobId>;
|
|
20
24
|
jobIndex: Map<JobId, JobLocation>;
|
|
@@ -40,6 +40,12 @@ function handleCustomId(input, shard, ctx) {
|
|
|
40
40
|
ctx.jobIndex.delete(id);
|
|
41
41
|
ctx.storage?.deleteJob(id); // removes the surviving row + result + any buffered insert
|
|
42
42
|
}
|
|
43
|
+
// A recycled custom id may carry a stale timeout marker from a prior job (which
|
|
44
|
+
// may have DLQ'd, so it is NOT in completedJobs above). Clear it so the new
|
|
45
|
+
// job's stall-retry recovery is not wrongly discarded — otherwise the
|
|
46
|
+
// timeout-resurrection guard would reintroduce #33/#75 duplicate execution for
|
|
47
|
+
// reused ids.
|
|
48
|
+
ctx.timedOutJobs?.delete(id);
|
|
43
49
|
ctx.customIdMap.set(input.customId, id);
|
|
44
50
|
return { skip: false, id };
|
|
45
51
|
}
|
|
@@ -115,7 +121,8 @@ function handleDeduplication(job, input, queue, shard, ctx) {
|
|
|
115
121
|
*/
|
|
116
122
|
function insertJobToShard(job, queue, shard, shardIdx, ctx) {
|
|
117
123
|
const hasDeps = job.dependsOn.length > 0;
|
|
118
|
-
const needsWaiting = hasDeps &&
|
|
124
|
+
const needsWaiting = hasDeps &&
|
|
125
|
+
!job.dependsOn.every((depId) => ctx.completedJobs.has(depId) || (ctx.depCompletions?.has(depId) ?? false));
|
|
119
126
|
const now = Date.now();
|
|
120
127
|
if (needsWaiting) {
|
|
121
128
|
shard.waitingDeps.set(job.id, job);
|
|
@@ -201,6 +208,10 @@ export async function pushJobBatch(queue, inputs, ctx) {
|
|
|
201
208
|
const idx = shardIndex(queue);
|
|
202
209
|
const resultIds = [];
|
|
203
210
|
const jobsToInsert = [];
|
|
211
|
+
// Jobs flagged durable must bypass the 10ms write buffer (immediate fsync
|
|
212
|
+
// path), exactly like a single durable push — otherwise addBulk silently
|
|
213
|
+
// downgrades the documented "durable" guarantee.
|
|
214
|
+
const durableJobs = [];
|
|
204
215
|
await withWriteLock(ctx.shardLocks[idx], () => {
|
|
205
216
|
const shard = ctx.shards[idx];
|
|
206
217
|
for (const input of inputs) {
|
|
@@ -220,6 +231,8 @@ export async function pushJobBatch(queue, inputs, ctx) {
|
|
|
220
231
|
// Insert to shard
|
|
221
232
|
insertJobToShard(job, queue, shard, idx, ctx);
|
|
222
233
|
jobsToInsert.push(job);
|
|
234
|
+
if (input.durable)
|
|
235
|
+
durableJobs.push(job);
|
|
223
236
|
resultIds.push(job.id);
|
|
224
237
|
}
|
|
225
238
|
if (jobsToInsert.length > 0) {
|
|
@@ -227,7 +240,18 @@ export async function pushJobBatch(queue, inputs, ctx) {
|
|
|
227
240
|
}
|
|
228
241
|
});
|
|
229
242
|
if (jobsToInsert.length > 0) {
|
|
230
|
-
|
|
243
|
+
if (durableJobs.length === 0) {
|
|
244
|
+
ctx.storage?.insertJobsBatch(jobsToInsert);
|
|
245
|
+
}
|
|
246
|
+
else {
|
|
247
|
+
// Durable jobs bypass the write buffer (immediate disk write); the rest
|
|
248
|
+
// still go through the batched buffer for throughput.
|
|
249
|
+
const durableSet = new Set(durableJobs);
|
|
250
|
+
const buffered = jobsToInsert.filter((j) => !durableSet.has(j));
|
|
251
|
+
if (buffered.length > 0)
|
|
252
|
+
ctx.storage?.insertJobsBatch(buffered);
|
|
253
|
+
ctx.storage?.insertJobsBatch(durableJobs, true);
|
|
254
|
+
}
|
|
231
255
|
ctx.totalPushed.value += BigInt(jobsToInsert.length);
|
|
232
256
|
throughputTracker.pushRate.increment(jobsToInsert.length);
|
|
233
257
|
for (const job of jobsToInsert) {
|
|
@@ -24,7 +24,10 @@ export async function getJob(jobId, ctx) {
|
|
|
24
24
|
case 'queue': {
|
|
25
25
|
return await withReadLock(ctx.shardLocks[location.shardIdx], () => {
|
|
26
26
|
const shard = ctx.shards[location.shardIdx];
|
|
27
|
-
return (shard.getQueue(location.queueName).find(jobId) ??
|
|
27
|
+
return (shard.getQueue(location.queueName).find(jobId) ??
|
|
28
|
+
shard.waitingDeps.get(jobId) ??
|
|
29
|
+
shard.waitingChildren.get(jobId) ??
|
|
30
|
+
null);
|
|
28
31
|
});
|
|
29
32
|
}
|
|
30
33
|
case 'processing': {
|
|
@@ -63,7 +66,10 @@ export function getJobByCustomId(customId, ctx) {
|
|
|
63
66
|
return null;
|
|
64
67
|
if (location.type === 'queue') {
|
|
65
68
|
const shard = ctx.shards[location.shardIdx];
|
|
66
|
-
return shard.getQueue(location.queueName).find(jobId) ??
|
|
69
|
+
return (shard.getQueue(location.queueName).find(jobId) ??
|
|
70
|
+
shard.waitingDeps.get(jobId) ??
|
|
71
|
+
shard.waitingChildren.get(jobId) ??
|
|
72
|
+
null);
|
|
67
73
|
}
|
|
68
74
|
if (location.type === 'processing') {
|
|
69
75
|
return ctx.processingShards[location.shardIdx].get(jobId) ?? null;
|
|
@@ -28,6 +28,8 @@ export declare class QueueManager {
|
|
|
28
28
|
private readonly jobIndex;
|
|
29
29
|
private readonly completedJobs;
|
|
30
30
|
private readonly completedJobsData;
|
|
31
|
+
private readonly depCompletions;
|
|
32
|
+
private readonly timedOutJobs;
|
|
31
33
|
private readonly jobResults;
|
|
32
34
|
private readonly customIdMap;
|
|
33
35
|
private readonly jobLogs;
|
|
@@ -227,6 +229,7 @@ export declare class QueueManager {
|
|
|
227
229
|
unregisterWorkersByClientId(clientId: string): number;
|
|
228
230
|
getJobIndex(): Map<JobId, JobLocation>;
|
|
229
231
|
getCompletedJobs(): SetLike<JobId>;
|
|
232
|
+
getDepCompletions(): SetLike<JobId>;
|
|
230
233
|
getShards(): Shard[];
|
|
231
234
|
private onJobCompleted;
|
|
232
235
|
/** Check if completing this job completes an entire flow */
|
|
@@ -45,6 +45,16 @@ export class QueueManager {
|
|
|
45
45
|
jobIndex = new Map();
|
|
46
46
|
completedJobs;
|
|
47
47
|
completedJobsData;
|
|
48
|
+
// Bare completion ids of removeOnComplete jobs — kept ONLY so dependent jobs
|
|
49
|
+
// can unblock (no payload, not surfaced in state/stats). Bounded like
|
|
50
|
+
// completedJobs; entries are pruned by the dependency processor once consumed.
|
|
51
|
+
depCompletions;
|
|
52
|
+
// Ids of jobs failed by the timeout sweep. A late ACK whose lock token no
|
|
53
|
+
// longer matches (the job was requeued for retry) is discarded for these,
|
|
54
|
+
// instead of phantom-completing the job and skipping the retry. Bounded;
|
|
55
|
+
// never needs explicit clearing because a legit retry ACK carries a valid
|
|
56
|
+
// current token and bypasses the stale-token recovery path entirely.
|
|
57
|
+
timedOutJobs;
|
|
48
58
|
jobResults;
|
|
49
59
|
customIdMap;
|
|
50
60
|
jobLogs;
|
|
@@ -83,7 +93,13 @@ export class QueueManager {
|
|
|
83
93
|
totalCompleted: { value: 0n },
|
|
84
94
|
totalFailed: { value: 0n },
|
|
85
95
|
};
|
|
86
|
-
|
|
96
|
+
// LRU-bounded so high-cardinality / dynamically-named queues cannot grow it
|
|
97
|
+
// without bound. Live queues stay resident (recently accessed on every
|
|
98
|
+
// ack/fail); only long-idle ephemeral names are evicted. obliterate() also
|
|
99
|
+
// deletes the entry explicitly. Reclaiming on a transient drain is avoided
|
|
100
|
+
// on purpose so cumulative per-queue counters survive idle periods.
|
|
101
|
+
// Assigned in the constructor (needs this.config).
|
|
102
|
+
perQueueMetrics;
|
|
87
103
|
startTime = Date.now();
|
|
88
104
|
// Background task handles
|
|
89
105
|
backgroundTaskHandles;
|
|
@@ -102,6 +118,9 @@ export class QueueManager {
|
|
|
102
118
|
this.jobIndex.delete(jobId);
|
|
103
119
|
this.completedJobsData.delete(jobId);
|
|
104
120
|
});
|
|
121
|
+
this.depCompletions = new BoundedSet(this.config.maxCompletedJobs);
|
|
122
|
+
this.timedOutJobs = new BoundedSet(this.config.maxCompletedJobs);
|
|
123
|
+
this.perQueueMetrics = new LRUMap(this.config.maxCustomIds);
|
|
105
124
|
this.jobResults = new LRUMap(this.config.maxJobResults);
|
|
106
125
|
this.customIdMap = new LRUMap(this.config.maxCustomIds);
|
|
107
126
|
this.jobLogs = new LRUMap(this.config.maxJobLogs);
|
|
@@ -154,6 +173,8 @@ export class QueueManager {
|
|
|
154
173
|
jobIndex: this.jobIndex,
|
|
155
174
|
completedJobs: this.completedJobs,
|
|
156
175
|
completedJobsData: this.completedJobsData,
|
|
176
|
+
depCompletions: this.depCompletions,
|
|
177
|
+
timedOutJobs: this.timedOutJobs,
|
|
157
178
|
jobResults: this.jobResults,
|
|
158
179
|
customIdMap: this.customIdMap,
|
|
159
180
|
jobLogs: this.jobLogs,
|
|
@@ -265,6 +286,13 @@ export class QueueManager {
|
|
|
265
286
|
// Job may have been stall-retried to queue while we processed it.
|
|
266
287
|
// Complete it from queue to prevent duplicate execution (Issue #33).
|
|
267
288
|
if (loc?.type === 'queue') {
|
|
289
|
+
// BUT a job failed by the timeout sweep is requeued for RETRY — a late
|
|
290
|
+
// ACK from the timed-out worker must not complete it (that would skip
|
|
291
|
+
// the retry and silently override the timeout). Discard it gracefully.
|
|
292
|
+
if (this.timedOutJobs.has(jobId)) {
|
|
293
|
+
lockMgr.releaseLock(jobId, lockCtx, token);
|
|
294
|
+
return;
|
|
295
|
+
}
|
|
268
296
|
await this.completeStallRetriedJob(jobId, result);
|
|
269
297
|
lockMgr.releaseLock(jobId, lockCtx, token);
|
|
270
298
|
}
|
|
@@ -281,6 +309,13 @@ export class QueueManager {
|
|
|
281
309
|
// Without token: only if job was stall-retried (attempts > 0), to avoid
|
|
282
310
|
// completing freshly-pushed jobs that were never pulled.
|
|
283
311
|
if (err instanceof Error && err.message.includes('not found')) {
|
|
312
|
+
// A timeout-failed job requeued for retry must not be completed by a
|
|
313
|
+
// stale ACK from the timed-out worker — discard it so the retry wins.
|
|
314
|
+
if (this.timedOutJobs.has(jobId)) {
|
|
315
|
+
if (token)
|
|
316
|
+
lockMgr.releaseLock(jobId, lockCtx, token);
|
|
317
|
+
return;
|
|
318
|
+
}
|
|
284
319
|
const shouldRecover = token ?? this.isStallRetried(jobId);
|
|
285
320
|
if (shouldRecover && (await this.completeStallRetriedJob(jobId, result))) {
|
|
286
321
|
if (token)
|
|
@@ -306,7 +341,11 @@ export class QueueManager {
|
|
|
306
341
|
// from the queue to prevent duplicate execution.
|
|
307
342
|
const loc = this.jobIndex.get(jobIds[i]);
|
|
308
343
|
if (loc?.type === 'queue') {
|
|
309
|
-
|
|
344
|
+
// Skip completion for a timeout-requeued job (retry must win); else
|
|
345
|
+
// recover the stall-retried job to prevent duplicate execution (#75).
|
|
346
|
+
if (!this.timedOutJobs.has(jobIds[i])) {
|
|
347
|
+
await this.completeStallRetriedJob(jobIds[i], undefined);
|
|
348
|
+
}
|
|
310
349
|
lockMgr.releaseLock(jobIds[i], lockCtx, t);
|
|
311
350
|
}
|
|
312
351
|
continue;
|
|
@@ -344,7 +383,11 @@ export class QueueManager {
|
|
|
344
383
|
// from the queue to prevent duplicate execution.
|
|
345
384
|
const loc = this.jobIndex.get(item.id);
|
|
346
385
|
if (loc?.type === 'queue') {
|
|
347
|
-
|
|
386
|
+
// Skip completion for a timeout-requeued job (retry must win); else
|
|
387
|
+
// recover the stall-retried job to prevent duplicate execution (#75).
|
|
388
|
+
if (!this.timedOutJobs.has(item.id)) {
|
|
389
|
+
await this.completeStallRetriedJob(item.id, item.result);
|
|
390
|
+
}
|
|
348
391
|
lockMgr.releaseLock(item.id, lockCtx, item.token);
|
|
349
392
|
}
|
|
350
393
|
continue;
|
|
@@ -688,6 +731,11 @@ export class QueueManager {
|
|
|
688
731
|
}
|
|
689
732
|
for (const cid of customIdsToDelete)
|
|
690
733
|
this.customIdMap.delete(cid);
|
|
734
|
+
// Per-queue cumulative counters are keyed by queue name and never expire on
|
|
735
|
+
// their own; obliterate is the documented way to reclaim ALL state for a
|
|
736
|
+
// queue, so drop its metrics entry too (prevents unbounded growth for
|
|
737
|
+
// ephemeral/dynamically-named queues).
|
|
738
|
+
this.perQueueMetrics.delete(queue);
|
|
691
739
|
this.unregisterQueueName(queue);
|
|
692
740
|
this.dashboardEmit?.('queue:obliterated', { queue });
|
|
693
741
|
this.dashboardEmit?.('queue:removed', { queue });
|
|
@@ -993,6 +1041,12 @@ export class QueueManager {
|
|
|
993
1041
|
getCompletedJobs() {
|
|
994
1042
|
return this.completedJobs;
|
|
995
1043
|
}
|
|
1044
|
+
// Bare completion ids of removeOnComplete jobs. The PUSH gate consults this so a
|
|
1045
|
+
// late dependent on an evicted removeOnComplete parent is admitted (same window
|
|
1046
|
+
// the readiness path / dependency processor already honor).
|
|
1047
|
+
getDepCompletions() {
|
|
1048
|
+
return this.depCompletions;
|
|
1049
|
+
}
|
|
996
1050
|
getShards() {
|
|
997
1051
|
return this.shards;
|
|
998
1052
|
}
|
|
@@ -1421,6 +1475,8 @@ export class QueueManager {
|
|
|
1421
1475
|
this.jobIndex.clear();
|
|
1422
1476
|
this.completedJobs.clear();
|
|
1423
1477
|
this.completedJobsData.clear();
|
|
1478
|
+
this.depCompletions.clear();
|
|
1479
|
+
this.timedOutJobs.clear();
|
|
1424
1480
|
this.jobResults.clear();
|
|
1425
1481
|
this.jobLogs.clear();
|
|
1426
1482
|
this.customIdMap.clear();
|
|
@@ -7,7 +7,7 @@ import type { JobLogEntry } from '../domain/types/worker';
|
|
|
7
7
|
import type { Shard } from '../domain/queue/shard';
|
|
8
8
|
import type { SqliteStorage } from '../infrastructure/persistence/sqlite';
|
|
9
9
|
import type { RWLock } from '../shared/lock';
|
|
10
|
-
import type { LRUMap, BoundedSet, BoundedMap, SetLike } from '../shared/lru';
|
|
10
|
+
import type { LRUMap, BoundedSet, BoundedMap, SetLike, MapLike } from '../shared/lru';
|
|
11
11
|
import type { EventsManager } from './eventsManager';
|
|
12
12
|
import type { WebhookManager } from './webhookManager';
|
|
13
13
|
import type { WorkerManager } from './workerManager';
|
|
@@ -76,7 +76,7 @@ export interface QueueManagerState {
|
|
|
76
76
|
};
|
|
77
77
|
};
|
|
78
78
|
readonly startTime: number;
|
|
79
|
-
readonly perQueueMetrics:
|
|
79
|
+
readonly perQueueMetrics: MapLike<string, {
|
|
80
80
|
totalCompleted: bigint;
|
|
81
81
|
totalFailed: bigint;
|
|
82
82
|
}>;
|
|
@@ -103,6 +103,8 @@ export interface BackgroundContext extends QueueManagerState {
|
|
|
103
103
|
workerManager: WorkerManager;
|
|
104
104
|
monitoringState: MonitoringState;
|
|
105
105
|
completedJobsData: BoundedMap<JobId, Job>;
|
|
106
|
+
depCompletions?: BoundedSet<JobId>;
|
|
107
|
+
timedOutJobs?: BoundedSet<JobId>;
|
|
106
108
|
}
|
|
107
109
|
/** Context for stats operations */
|
|
108
110
|
export interface StatsContext {
|
|
@@ -132,7 +134,7 @@ export interface StatsContext {
|
|
|
132
134
|
};
|
|
133
135
|
};
|
|
134
136
|
startTime: number;
|
|
135
|
-
perQueueMetrics?:
|
|
137
|
+
perQueueMetrics?: MapLike<string, {
|
|
136
138
|
totalCompleted: bigint;
|
|
137
139
|
totalFailed: bigint;
|
|
138
140
|
}>;
|
|
@@ -37,7 +37,7 @@ export declare class Bunqueue<T = unknown, R = unknown> {
|
|
|
37
37
|
opts?: JobOptions;
|
|
38
38
|
}>): Promise<Job<T>[]>;
|
|
39
39
|
getJob(id: string): Promise<Job<T> | null>;
|
|
40
|
-
getJobCounts(): import("./queue/operations").JobCounts
|
|
40
|
+
getJobCounts(): import("./queue/operations").JobCounts | Promise<import("./queue/operations").JobCounts>;
|
|
41
41
|
getJobCountsAsync(): Promise<import("./queue/operations").JobCounts>;
|
|
42
42
|
count(): number;
|
|
43
43
|
countAsync(): Promise<number>;
|
|
@@ -17,8 +17,13 @@ export interface JobCounts {
|
|
|
17
17
|
delayed: number;
|
|
18
18
|
paused: number;
|
|
19
19
|
}
|
|
20
|
-
/**
|
|
21
|
-
|
|
20
|
+
/**
|
|
21
|
+
* Get job counts.
|
|
22
|
+
* Embedded mode returns synchronously. TCP mode delegates to the async path so
|
|
23
|
+
* callers receive the REAL server-side counts (awaitable Promise) instead of
|
|
24
|
+
* hardcoded zeros — defect: getjobcounts-tcp-zero.
|
|
25
|
+
*/
|
|
26
|
+
export declare function getJobCounts(ctx: CountsContext): JobCounts | Promise<JobCounts>;
|
|
22
27
|
/** Get job counts (async, works with TCP) */
|
|
23
28
|
export declare function getJobCountsAsync(ctx: CountsContext): Promise<JobCounts>;
|
|
24
29
|
/** Get waiting job count */
|
|
@@ -5,18 +5,15 @@
|
|
|
5
5
|
*/
|
|
6
6
|
import { getSharedManager } from '../../manager';
|
|
7
7
|
import { pausedView } from '../../../shared/pausedView';
|
|
8
|
-
/**
|
|
8
|
+
/**
|
|
9
|
+
* Get job counts.
|
|
10
|
+
* Embedded mode returns synchronously. TCP mode delegates to the async path so
|
|
11
|
+
* callers receive the REAL server-side counts (awaitable Promise) instead of
|
|
12
|
+
* hardcoded zeros — defect: getjobcounts-tcp-zero.
|
|
13
|
+
*/
|
|
9
14
|
export function getJobCounts(ctx) {
|
|
10
15
|
if (!ctx.embedded) {
|
|
11
|
-
return {
|
|
12
|
-
waiting: 0,
|
|
13
|
-
prioritized: 0,
|
|
14
|
-
active: 0,
|
|
15
|
-
completed: 0,
|
|
16
|
-
failed: 0,
|
|
17
|
-
delayed: 0,
|
|
18
|
-
paused: 0,
|
|
19
|
-
};
|
|
16
|
+
return getJobCountsAsync(ctx);
|
|
20
17
|
}
|
|
21
18
|
const manager = getSharedManager();
|
|
22
19
|
// Use queue-specific counts
|
|
@@ -62,7 +62,7 @@ export declare class Queue<T = unknown> {
|
|
|
62
62
|
getCompletedAsync(start?: number, end?: number): Promise<Job<T>[]>;
|
|
63
63
|
getFailed(start?: number, end?: number): Job<T>[];
|
|
64
64
|
getFailedAsync(start?: number, end?: number): Promise<Job<T>[]>;
|
|
65
|
-
getJobCounts(): countsOps.JobCounts;
|
|
65
|
+
getJobCounts(): countsOps.JobCounts | Promise<countsOps.JobCounts>;
|
|
66
66
|
getJobCountsAsync(): Promise<countsOps.JobCounts>;
|
|
67
67
|
getWaitingCount(): Promise<number>;
|
|
68
68
|
getActiveCount(): Promise<number>;
|
package/dist/client/tcpPool.d.ts
CHANGED
|
@@ -27,6 +27,13 @@ export declare class TcpConnectionPool {
|
|
|
27
27
|
send(command: Record<string, unknown>): Promise<Record<string, unknown>>;
|
|
28
28
|
/** Send multiple commands in parallel across pool */
|
|
29
29
|
sendParallel(commands: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>>;
|
|
30
|
+
/**
|
|
31
|
+
* Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
|
|
32
|
+
* 'connected' on every successful connect, including reconnects. Used by
|
|
33
|
+
* Worker to re-register after a reconnect (the server drops registration when
|
|
34
|
+
* the registering connection closes and each reconnect gets a fresh clientId).
|
|
35
|
+
*/
|
|
36
|
+
onReconnect(cb: () => void): void;
|
|
30
37
|
/** Check if any connection is ready */
|
|
31
38
|
isConnected(): boolean;
|
|
32
39
|
/** Get number of connected clients */
|
package/dist/client/tcpPool.js
CHANGED
|
@@ -99,6 +99,17 @@ export class TcpConnectionPool {
|
|
|
99
99
|
});
|
|
100
100
|
return Promise.all(promises);
|
|
101
101
|
}
|
|
102
|
+
/**
|
|
103
|
+
* Invoke `cb` whenever a pooled connection (re)establishes. TcpClient emits
|
|
104
|
+
* 'connected' on every successful connect, including reconnects. Used by
|
|
105
|
+
* Worker to re-register after a reconnect (the server drops registration when
|
|
106
|
+
* the registering connection closes and each reconnect gets a fresh clientId).
|
|
107
|
+
*/
|
|
108
|
+
onReconnect(cb) {
|
|
109
|
+
for (const client of this.clients) {
|
|
110
|
+
client.on('connected', cb);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
102
113
|
/** Check if any connection is ready */
|
|
103
114
|
isConnected() {
|
|
104
115
|
return this.clients.some((c) => c.isConnected());
|
|
@@ -28,6 +28,7 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
|
|
|
28
28
|
private running;
|
|
29
29
|
private paused;
|
|
30
30
|
private _closing;
|
|
31
|
+
private _forceClose;
|
|
31
32
|
private _closingPromise;
|
|
32
33
|
private closed;
|
|
33
34
|
private activeJobs;
|
|
@@ -47,6 +48,7 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
|
|
|
47
48
|
private pendingJobs;
|
|
48
49
|
private pendingJobsHead;
|
|
49
50
|
private processingScheduled;
|
|
51
|
+
private pendingPull;
|
|
50
52
|
private lastDrainedEmit;
|
|
51
53
|
private stalledUnsubscribe;
|
|
52
54
|
on(event: 'ready' | 'drained' | 'closed', listener: () => void): this;
|
|
@@ -123,6 +125,16 @@ export declare class Worker<T = unknown, R = unknown> extends EventEmitter {
|
|
|
123
125
|
extendJobLocks(jobIds: string[], tokens: string[], duration: number): Promise<number>;
|
|
124
126
|
close(force?: boolean): Promise<void>;
|
|
125
127
|
private _doClose;
|
|
128
|
+
/**
|
|
129
|
+
* Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
|
|
130
|
+
* These jobs are `active` server-side holding a lock; moving them back to
|
|
131
|
+
* `waiting` makes them re-pullable so nothing is lost, and removes them from
|
|
132
|
+
* the close drain (which would otherwise hang on a buffer that can never be
|
|
133
|
+
* advanced while `_closing` is set). Best-effort: a job that can't be
|
|
134
|
+
* released (e.g. already completed/stall-retried) is simply dropped from the
|
|
135
|
+
* local buffer — its server-side lock will expire and requeue it.
|
|
136
|
+
*/
|
|
137
|
+
private releaseBufferedJobs;
|
|
126
138
|
private poll;
|
|
127
139
|
private tryProcess;
|
|
128
140
|
private registerPulledJobs;
|
|
@@ -13,7 +13,7 @@ import { parseJobFromResponse } from './jobParser';
|
|
|
13
13
|
import { processJob } from './processor';
|
|
14
14
|
import { WorkerRateLimiter } from './workerRateLimiter';
|
|
15
15
|
import { GroupConcurrencyLimiter } from './groupConcurrency';
|
|
16
|
-
import { startHeartbeat } from './workerHeartbeat';
|
|
16
|
+
import { startHeartbeat, sendHeartbeat } from './workerHeartbeat';
|
|
17
17
|
import { pullEmbedded, pullTcp } from './workerPull';
|
|
18
18
|
import { resolveToken } from '../resolveToken';
|
|
19
19
|
/** Resolve WorkerOptions into ExtendedWorkerOptions with defaults */
|
|
@@ -75,6 +75,9 @@ export class Worker extends EventEmitter {
|
|
|
75
75
|
running = false;
|
|
76
76
|
paused = false;
|
|
77
77
|
_closing = false;
|
|
78
|
+
// Set when a force close is requested — allows close(true) to pre-empt an
|
|
79
|
+
// in-progress graceful close(false) by breaking out of its drain loop.
|
|
80
|
+
_forceClose = false;
|
|
78
81
|
_closingPromise = null;
|
|
79
82
|
closed = false;
|
|
80
83
|
activeJobs = 0;
|
|
@@ -99,6 +102,9 @@ export class Worker extends EventEmitter {
|
|
|
99
102
|
pendingJobs = [];
|
|
100
103
|
pendingJobsHead = 0;
|
|
101
104
|
processingScheduled = false; // Prevent multiple setImmediate calls
|
|
105
|
+
// Slots reserved by in-flight doPullBatch() calls (Issue #98). Subtracted from
|
|
106
|
+
// free slots so overlapping pulls see each other and do not over-lease.
|
|
107
|
+
pendingPull = 0;
|
|
102
108
|
// Drained event tracking
|
|
103
109
|
lastDrainedEmit = 0;
|
|
104
110
|
// Stalled event subscription (BullMQ v5 compatible)
|
|
@@ -140,6 +146,17 @@ export class Worker extends EventEmitter {
|
|
|
140
146
|
this.tcpPool = createTcpPool(opts, this.opts.concurrency);
|
|
141
147
|
this.tcp = this.tcpPool;
|
|
142
148
|
this.ackBatcher.setTcp(this.tcp);
|
|
149
|
+
// The server drops worker registration when the registering connection
|
|
150
|
+
// closes, and each pooled reconnect gets a fresh server clientId. Re-send
|
|
151
|
+
// RegisterWorker on reconnect so the worker stays visible in
|
|
152
|
+
// WorkerManager (ListWorkers / getForQueue / skipIfNoWorker) while it
|
|
153
|
+
// keeps consuming jobs. Only acts once we were actually registered.
|
|
154
|
+
this.tcpPool.onReconnect(() => {
|
|
155
|
+
if (this.closed || this._closing || !this.registered)
|
|
156
|
+
return;
|
|
157
|
+
this.registered = false;
|
|
158
|
+
this.registerWithServer();
|
|
159
|
+
});
|
|
143
160
|
}
|
|
144
161
|
if (this.opts.autorun)
|
|
145
162
|
this.run();
|
|
@@ -151,6 +168,7 @@ export class Worker extends EventEmitter {
|
|
|
151
168
|
this.running = true;
|
|
152
169
|
this.paused = false;
|
|
153
170
|
this._closing = false;
|
|
171
|
+
this._forceClose = false;
|
|
154
172
|
this._closingPromise = null;
|
|
155
173
|
// Defer the 'ready' emit so listeners attached synchronously after
|
|
156
174
|
// construction (e.g. `new Worker(...).on('ready', ...)`) still receive it.
|
|
@@ -411,8 +429,16 @@ export class Worker extends EventEmitter {
|
|
|
411
429
|
async close(force = false) {
|
|
412
430
|
if (this.closed)
|
|
413
431
|
return;
|
|
414
|
-
|
|
432
|
+
// A force close must be able to pre-empt an in-progress graceful close:
|
|
433
|
+
// the graceful drain only waits on genuinely in-flight jobs, but a caller
|
|
434
|
+
// asking to force-close wants to stop waiting now. Flip the force flag so
|
|
435
|
+
// the running _doClose's drain loop exits, and return the same promise.
|
|
436
|
+
if (this._closingPromise) {
|
|
437
|
+
if (force)
|
|
438
|
+
this._forceClose = true;
|
|
415
439
|
return this._closingPromise;
|
|
440
|
+
}
|
|
441
|
+
this._forceClose = force;
|
|
416
442
|
this._closingPromise = this._doClose(force);
|
|
417
443
|
return this._closingPromise;
|
|
418
444
|
}
|
|
@@ -432,9 +458,19 @@ export class Worker extends EventEmitter {
|
|
|
432
458
|
clearInterval(this.workerHeartbeatTimer);
|
|
433
459
|
this.workerHeartbeatTimer = null;
|
|
434
460
|
}
|
|
461
|
+
// Release buffered pulled-but-unstarted jobs back to the queue. During
|
|
462
|
+
// _closing every code path that would advance the buffer (poll/tryProcess/
|
|
463
|
+
// the startJob().finally re-poll) early-returns, so a buffered job is never
|
|
464
|
+
// started — yet it holds a server-side lock and sits in `active` state.
|
|
465
|
+
// Without this, a graceful drain that waited on the buffer would hang
|
|
466
|
+
// forever (e.g. group-limited jobs buffered behind a max:1 limiter). We
|
|
467
|
+
// requeue them (Active -> Waiting) so they are re-pullable and not lost.
|
|
468
|
+
await this.releaseBufferedJobs();
|
|
435
469
|
if (!force) {
|
|
436
|
-
|
|
437
|
-
|
|
470
|
+
// Wait only on genuinely in-flight jobs. The buffer was released above,
|
|
471
|
+
// so it must NOT be part of the drain condition. A concurrent force
|
|
472
|
+
// close (this._forceClose) breaks out immediately.
|
|
473
|
+
while (this.activeJobs > 0 && !this._forceClose) {
|
|
438
474
|
await Bun.sleep(50);
|
|
439
475
|
}
|
|
440
476
|
}
|
|
@@ -470,6 +506,45 @@ export class Worker extends EventEmitter {
|
|
|
470
506
|
this._closing = false;
|
|
471
507
|
this.emit('closed');
|
|
472
508
|
}
|
|
509
|
+
/**
|
|
510
|
+
* Release all buffered (pulled-but-unstarted) jobs back to the queue on close.
|
|
511
|
+
* These jobs are `active` server-side holding a lock; moving them back to
|
|
512
|
+
* `waiting` makes them re-pullable so nothing is lost, and removes them from
|
|
513
|
+
* the close drain (which would otherwise hang on a buffer that can never be
|
|
514
|
+
* advanced while `_closing` is set). Best-effort: a job that can't be
|
|
515
|
+
* released (e.g. already completed/stall-retried) is simply dropped from the
|
|
516
|
+
* local buffer — its server-side lock will expire and requeue it.
|
|
517
|
+
*/
|
|
518
|
+
async releaseBufferedJobs() {
|
|
519
|
+
const buffered = this.pendingJobs.slice(this.pendingJobsHead);
|
|
520
|
+
this.pendingJobs = [];
|
|
521
|
+
this.pendingJobsHead = 0;
|
|
522
|
+
if (buffered.length === 0)
|
|
523
|
+
return;
|
|
524
|
+
for (const { job, token } of buffered) {
|
|
525
|
+
const id = String(job.id);
|
|
526
|
+
try {
|
|
527
|
+
if (this.embedded) {
|
|
528
|
+
const manager = getSharedManager();
|
|
529
|
+
await manager.moveActiveToWait(jobId(id));
|
|
530
|
+
// moveActiveToWait re-queues the job (active -> waiting); release the
|
|
531
|
+
// lock token we still hold so it is fully owner-free and re-pullable.
|
|
532
|
+
if (this.opts.useLocks)
|
|
533
|
+
manager.releaseLock(jobId(id), token ?? undefined);
|
|
534
|
+
}
|
|
535
|
+
else if (this.tcp) {
|
|
536
|
+
await this.tcp.send({ cmd: 'MoveToWait', id });
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
catch {
|
|
540
|
+
// Best-effort: lock expiration will requeue anything we couldn't release.
|
|
541
|
+
}
|
|
542
|
+
finally {
|
|
543
|
+
this.pulledJobIds.delete(id);
|
|
544
|
+
this.jobTokens.delete(id);
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
}
|
|
473
548
|
// ============ Processing Pipeline ============
|
|
474
549
|
poll() {
|
|
475
550
|
if (!this.running || this._closing)
|
|
@@ -568,6 +643,18 @@ export class Worker extends EventEmitter {
|
|
|
568
643
|
this.jobTokens.set(jobIdStr, pulledItem.token);
|
|
569
644
|
}
|
|
570
645
|
}
|
|
646
|
+
// Renew the just-pulled locks immediately (fire-and-forget). With a pooled
|
|
647
|
+
// client (poolSize > 1), the PULL and subsequent heartbeats travel on
|
|
648
|
+
// different connections; the server only releases an in-flight job on
|
|
649
|
+
// connection drop if its lock was never renewed (renewalCount === 0). The
|
|
650
|
+
// periodic heartbeat timer does not fire until one interval later, leaving a
|
|
651
|
+
// window where a dropped pulling-socket would re-dispatch a job the worker
|
|
652
|
+
// is actively running. An immediate renew closes that window. Only needed
|
|
653
|
+
// for multi-connection lock-based workers (poolSize 1 keeps pull+heartbeat
|
|
654
|
+
// on the same socket, so there is nothing to protect against).
|
|
655
|
+
if (this.opts.useLocks && items.length > 0 && this.tcpPool && this.tcpPool.getPoolSize() > 1) {
|
|
656
|
+
void sendHeartbeat(this.getHeartbeatDeps());
|
|
657
|
+
}
|
|
571
658
|
}
|
|
572
659
|
getBufferedJob() {
|
|
573
660
|
if (this.pendingJobsHead >= this.pendingJobs.length)
|
|
@@ -619,14 +706,46 @@ export class Worker extends EventEmitter {
|
|
|
619
706
|
return null;
|
|
620
707
|
}
|
|
621
708
|
async doPullBatch() {
|
|
622
|
-
|
|
709
|
+
// Issue #98: cap the LEASED count (running + buffered + in-flight pulls) at
|
|
710
|
+
// `concurrency`, not just the running count. The old `concurrency - activeJobs`
|
|
711
|
+
// was read once and the pull leases jobs on the broker across an await, so:
|
|
712
|
+
// 1. several concurrent finally->poll->tryProcess runs each read the same
|
|
713
|
+
// stale count and each pull a full batch, and
|
|
714
|
+
// 2. a job just pulled by one run sits in `pendingJobs` (leased, counted by
|
|
715
|
+
// the heartbeat) but not yet in `activeJobs`, so an overlapping pull does
|
|
716
|
+
// not see it.
|
|
717
|
+
// Both leak: with concurrency=3 the worker ends up holding 5-6 jobs leased.
|
|
718
|
+
// `pulledJobIds.size` is the true leased count (active + buffered; a job is
|
|
719
|
+
// removed only on completion), and `pendingPull` reserves slots for pulls
|
|
720
|
+
// still in flight whose jobs are not yet registered.
|
|
721
|
+
//
|
|
722
|
+
// Exception — group pull-ahead: when a group limiter is set AND the buffer is
|
|
723
|
+
// non-empty here (this branch is reached only after getNextEligibleJob() found
|
|
724
|
+
// nothing runnable, so those buffered jobs are group-blocked), the worker must
|
|
725
|
+
// pull ahead to discover jobs from other, runnable groups — otherwise it would
|
|
726
|
+
// wedge on a buffer full of one blocked group. In that case the blocked
|
|
727
|
+
// buffered jobs are not counted (only the running ones are). This preserves the
|
|
728
|
+
// existing group behavior; the reported over-pull (no group limiter) always
|
|
729
|
+
// uses the strict leased cap.
|
|
730
|
+
const groupBlockedBuffer = this.groupLimiter !== null && this.pendingJobsHead < this.pendingJobs.length;
|
|
731
|
+
const leased = groupBlockedBuffer ? this.activeJobs : this.pulledJobIds.size;
|
|
732
|
+
const slots = this.opts.concurrency - leased - this.pendingPull;
|
|
623
733
|
const batchSize = Math.min(this.opts.batchSize, slots, 1000);
|
|
624
734
|
if (batchSize <= 0)
|
|
625
735
|
return [];
|
|
626
736
|
const config = this.getPullConfig();
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
737
|
+
this.pendingPull += batchSize;
|
|
738
|
+
try {
|
|
739
|
+
return this.embedded
|
|
740
|
+
? await pullEmbedded(config, batchSize)
|
|
741
|
+
: await pullTcp(config, this.tcp, batchSize, this._closing);
|
|
742
|
+
}
|
|
743
|
+
finally {
|
|
744
|
+
// Release the reservation. The pulled jobs are now registered/buffered (or
|
|
745
|
+
// the pull failed); either way the reservation has served its purpose for
|
|
746
|
+
// the duration of the in-flight pull.
|
|
747
|
+
this.pendingPull -= batchSize;
|
|
748
|
+
}
|
|
630
749
|
}
|
|
631
750
|
startJob(job, token) {
|
|
632
751
|
const jobIdStr = String(job.id);
|
|
@@ -82,6 +82,8 @@ export declare class SqliteStorage {
|
|
|
82
82
|
insertJob(job: Job, durable?: boolean): void;
|
|
83
83
|
/** Insert job immediately (bypass buffer) */
|
|
84
84
|
insertJobImmediate(job: Job): void;
|
|
85
|
+
/** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
|
|
86
|
+
private runInsertJobStmt;
|
|
85
87
|
/**
|
|
86
88
|
* Ensure a job's buffered INSERT has been written to disk before issuing a
|
|
87
89
|
* state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
|
|
@@ -122,8 +124,13 @@ export declare class SqliteStorage {
|
|
|
122
124
|
getJobStateRaw(jobId: JobId): string | null;
|
|
123
125
|
/** Load all completed job IDs (for dependency recovery) */
|
|
124
126
|
loadCompletedJobIds(): Set<JobId>;
|
|
125
|
-
/**
|
|
126
|
-
|
|
127
|
+
/**
|
|
128
|
+
* Insert batch of jobs. By default the jobs go through the write buffer.
|
|
129
|
+
* When `durable` is true they bypass the buffer and are written to disk
|
|
130
|
+
* immediately (matching single-push durable semantics) — used for addBulk
|
|
131
|
+
* jobs flagged `durable: true`.
|
|
132
|
+
*/
|
|
133
|
+
insertJobsBatch(jobs: Job[], durable?: boolean): void;
|
|
127
134
|
/**
|
|
128
135
|
* Query jobs by queue with optional state filter and pagination.
|
|
129
136
|
* Uses idx_jobs_queue_state index for O(log n) lookups.
|
|
@@ -244,11 +244,15 @@ export class SqliteStorage {
|
|
|
244
244
|
/** Insert job immediately (bypass buffer) */
|
|
245
245
|
insertJobImmediate(job) {
|
|
246
246
|
this.safeWrite(() => {
|
|
247
|
-
this.statements
|
|
248
|
-
.get('insertJob')
|
|
249
|
-
.run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
|
|
247
|
+
this.runInsertJobStmt(job);
|
|
250
248
|
});
|
|
251
249
|
}
|
|
250
|
+
/** Run the insertJob statement for one job (no safeWrite/transaction wrapper). */
|
|
251
|
+
runInsertJobStmt(job) {
|
|
252
|
+
this.statements
|
|
253
|
+
.get('insertJob')
|
|
254
|
+
.run(job.id, job.queue, pack(job.data), job.priority, job.createdAt, job.runAt, job.attempts, job.maxAttempts, job.backoff, job.ttl, job.timeout, job.uniqueKey, job.customId, job.dependsOn.length > 0 ? pack(job.dependsOn) : null, job.parentId, job.childrenIds.length > 0 ? pack(job.childrenIds) : null, job.tags.length > 0 ? pack(job.tags) : null, job.runAt > Date.now() ? 'delayed' : 'waiting', job.lifo ? 1 : 0, job.groupId, job.removeOnComplete ? 1 : 0, job.removeOnFail ? 1 : 0, job.stallTimeout, job.timeline.length > 0 ? pack(job.timeline) : null);
|
|
255
|
+
}
|
|
252
256
|
/**
|
|
253
257
|
* Ensure a job's buffered INSERT has been written to disk before issuing a
|
|
254
258
|
* state-mutating UPDATE. Without this, markActive/markCompleted's UPDATE
|
|
@@ -420,11 +424,38 @@ export class SqliteStorage {
|
|
|
420
424
|
/** Load all completed job IDs (for dependency recovery) */
|
|
421
425
|
loadCompletedJobIds() {
|
|
422
426
|
const rows = this.db.query('SELECT job_id FROM job_results').all();
|
|
423
|
-
|
|
427
|
+
const ids = new Set(rows.map((r) => r.job_id));
|
|
428
|
+
// A job acked with no/undefined result has state='completed' but NO job_results
|
|
429
|
+
// row. Include state='completed' ids so dependency recovery still sees it as
|
|
430
|
+
// done and unblocks dependents (instead of parking them forever).
|
|
431
|
+
const stateRows = this.db
|
|
432
|
+
.query("SELECT id FROM jobs WHERE state = 'completed'")
|
|
433
|
+
.all();
|
|
434
|
+
for (const r of stateRows)
|
|
435
|
+
ids.add(r.id);
|
|
436
|
+
return ids;
|
|
424
437
|
}
|
|
425
438
|
// ============ Bulk Operations ============
|
|
426
|
-
/**
|
|
427
|
-
|
|
439
|
+
/**
|
|
440
|
+
* Insert batch of jobs. By default the jobs go through the write buffer.
|
|
441
|
+
* When `durable` is true they bypass the buffer and are written to disk
|
|
442
|
+
* immediately (matching single-push durable semantics) — used for addBulk
|
|
443
|
+
* jobs flagged `durable: true`.
|
|
444
|
+
*/
|
|
445
|
+
insertJobsBatch(jobs, durable) {
|
|
446
|
+
if (durable) {
|
|
447
|
+
// Atomic immediate write: every row hits disk in ONE transaction, so a
|
|
448
|
+
// mid-batch failure rolls back the whole batch (no partial on-disk state)
|
|
449
|
+
// — bypassing the write buffer to honor the durable contract.
|
|
450
|
+
this.safeWrite(() => {
|
|
451
|
+
const tx = this.db.transaction((batch) => {
|
|
452
|
+
for (const job of batch)
|
|
453
|
+
this.runInsertJobStmt(job);
|
|
454
|
+
});
|
|
455
|
+
tx(jobs);
|
|
456
|
+
});
|
|
457
|
+
return;
|
|
458
|
+
}
|
|
428
459
|
this.writeBuffer.addBatch(jobs);
|
|
429
460
|
}
|
|
430
461
|
// ============ Query Operations ============
|
|
@@ -29,7 +29,12 @@ export async function handlePush(cmd, ctx, reqId) {
|
|
|
29
29
|
for (const depId of cmd.dependsOn) {
|
|
30
30
|
const depJobId = jobId(depId);
|
|
31
31
|
const exists = ctx.queueManager.getJobIndex().has(depJobId) ||
|
|
32
|
-
ctx.queueManager.getCompletedJobs().has(depJobId);
|
|
32
|
+
ctx.queueManager.getCompletedJobs().has(depJobId) ||
|
|
33
|
+
// A removeOnComplete parent that completed is recorded only here (its row
|
|
34
|
+
// is deleted and it leaves jobIndex/completedJobs). The readiness path and
|
|
35
|
+
// dependency processor already honor depCompletions; the gate must too, or
|
|
36
|
+
// a late dependent is wrongly rejected with "Dependency job not found".
|
|
37
|
+
ctx.queueManager.getDepCompletions().has(depJobId);
|
|
33
38
|
if (!exists) {
|
|
34
39
|
return resp.error(`Dependency job not found: ${depId}`, reqId);
|
|
35
40
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bunqueue",
|
|
3
|
-
"version": "2.8.13",
|
|
3
|
+
"version": "2.8.18",
|
|
4
4
|
"description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/main.js",
|