bunqueue 2.8.18 → 2.8.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -319,6 +319,21 @@ export function recover(ctx) {
319
319
  }
320
320
  ctx.registerQueueName(queue);
321
321
  }
322
+ // === Restore queue control-state (#100) ===
323
+ // paused / rate-limit / concurrency live only in LimiterManager's in-memory
324
+ // Map; without this load every queue silently un-pauses and loses its limits
325
+ // on restart. Applied directly to the owning shard (in-memory only — these
326
+ // setters do not re-persist, so there is no write-back loop).
327
+ for (const qs of ctx.storage.loadQueueState()) {
328
+ const shard = ctx.shards[shardIndex(qs.name)];
329
+ if (qs.paused)
330
+ shard.pause(qs.name);
331
+ if (qs.rateLimit !== null)
332
+ shard.setRateLimit(qs.name, qs.rateLimit);
333
+ if (qs.concurrencyLimit !== null)
334
+ shard.setConcurrency(qs.name, qs.concurrencyLimit);
335
+ ctx.registerQueueName(qs.name);
336
+ }
322
337
  // === PHASE 3: Recover completed jobs ===
323
338
  // Required for clean('completed'), stats.completed, and in-memory lookups
324
339
  // on jobs that completed before a server restart (issue #84).
@@ -88,6 +88,33 @@ export declare class QueueManager {
88
88
  * If the job was already requeued by the background lock expiration task, return silently.
89
89
  */
90
90
  private throwIfOwnershipConflict;
91
+ /**
92
+ * Issue #101 grace window: decide whether an ACK whose lock failed
93
+ * verification (because the TTL expired) should still be honored.
94
+ *
95
+ * Returns true ONLY when ALL hold:
96
+ * 1. the job is still in `processing`,
97
+ * 2. the lock entry's token still matches the presenting worker, and
98
+ * 3. the lock belongs to the CURRENT processing instance — its `createdAt`
99
+ * is not older than the job's `startedAt` (this check is skipped when the processing record is missing or its `startedAt` is null).
100
+ *
101
+ * Condition 3 is the re-lease guard. A lock-expiry re-lease (checkExpiredLocks)
102
+ * deletes the stale lock, so a new lease installs a NEW token and condition 2
103
+ * already fails. But the STALL path (stallDetection retry/moveToDlq) requeues
104
+ * the job WITHOUT deleting the lock — the original (now-expired) lock lingers
105
+ * with the original token. If another worker then re-pulls the job, its
106
+ * `startedAt` is reset to a newer time than the lingering lock's `createdAt`,
107
+ * so condition 3 fails and the timed-out worker's late ack is rejected
108
+ * (preventing the job from being completed twice). In the genuine #101
109
+ * case — the same worker finishing just after its lock expired, no re-pull —
110
+ * `startedAt` is unchanged and `createdAt >= startedAt`, so the grace is granted
111
+ * and the successful completion is recorded instead of being lost to a stall.
112
+ *
113
+ * Without this, a successful completion arriving just after lock expiry is
114
+ * rejected as "Invalid or expired lock token", the client drops it, and the
115
+ * job stalls to `failed` despite having been processed correctly.
116
+ */
117
+ private isExpiredButOwned;
91
118
  /** Check if a queued job was stall-retried (has been processed before). */
92
119
  private isStallRetried;
93
120
  /**
@@ -144,6 +171,13 @@ export declare class QueueManager {
144
171
  isPaused(queue: string): boolean;
145
172
  drain(queue: string): number;
146
173
  obliterate(queue: string): void;
174
+ /**
175
+ * Drop per-queue metadata that obliterate is responsible for reclaiming:
176
+ * cumulative metrics (keyed by name, never self-expiring) and the persisted
177
+ * control-state row (#100 — so a stale pause/limit can't resurrect on the
178
+ * next restart). The row delete is skipped when no storage is attached.
179
+ */
180
+ private purgeQueueMetadata;
147
181
  listQueues(): string[];
148
182
  private registerQueueName;
149
183
  private unregisterQueueName;
@@ -166,6 +200,12 @@ export declare class QueueManager {
166
200
  clearRateLimit(queue: string): void;
167
201
  setConcurrency(queue: string, limit: number): void;
168
202
  clearConcurrency(queue: string): void;
203
+ /**
204
+ * Issue #100: write-through the current control-state (paused / rate-limit /
205
+ * concurrency) to the `queue_state` table so it survives a server restart.
206
+ * Reads the post-mutation state from the owning shard and UPSERTs the row, or deletes the row when the state is back to all defaults (not paused, no limits). No-op when no storage is attached.
207
+ */
208
+ private persistQueueState;
169
209
  /** Get rate limit and concurrency limit for a queue */
170
210
  getQueueLimits(queue: string): {
171
211
  rateLimit: number | null;
@@ -276,7 +276,12 @@ export class QueueManager {
276
276
  }
277
277
  async ack(jobId, result, token) {
278
278
  const lockCtx = this.contextFactory.getLockContext();
279
- if (token && !lockMgr.verifyLock(jobId, token, lockCtx)) {
279
+ if (token &&
280
+ !lockMgr.verifyLock(jobId, token, lockCtx) &&
281
+ !this.isExpiredButOwned(jobId, token, lockCtx)) {
282
+ // #101: if the lock is expired but still OURS and the job is still in
283
+ // `processing`, isExpiredButOwned() short-circuits this block so the
284
+ // completion falls through to ackJob() rather than being lost.
280
285
  this.throwIfOwnershipConflict(jobId, lockCtx);
281
286
  // No ownership conflict. If job is still in processing (dedup case
282
287
  // from Issue #33: lock removed but job still there), proceed with ACK.
@@ -334,7 +339,9 @@ export class QueueManager {
334
339
  if (tokens?.length === jobIds.length) {
335
340
  for (let i = 0; i < jobIds.length; i++) {
336
341
  const t = tokens[i];
337
- if (t && !lockMgr.verifyLock(jobIds[i], t, lockCtx)) {
342
+ if (t &&
343
+ !lockMgr.verifyLock(jobIds[i], t, lockCtx) &&
344
+ !this.isExpiredButOwned(jobIds[i], t, lockCtx)) {
338
345
  this.throwIfOwnershipConflict(jobIds[i], lockCtx);
339
346
  // Recover stall-retried job (#75): lock expired and job was
340
347
  // re-queued by lock expiration or stall detection. Complete it
@@ -350,6 +357,8 @@ export class QueueManager {
350
357
  }
351
358
  continue;
352
359
  }
360
+ // #101 grace window: an expired-but-still-ours lock on a still-processing
361
+ // job is accepted (isExpiredButOwned), not lost.
353
362
  validJobIds.push(jobIds[i]);
354
363
  if (validTokens)
355
364
  validTokens.push(t);
@@ -376,7 +385,9 @@ export class QueueManager {
376
385
  const lockCtx = this.contextFactory.getLockContext();
377
386
  const validItems = [];
378
387
  for (const item of items) {
379
- if (item.token && !lockMgr.verifyLock(item.id, item.token, lockCtx)) {
388
+ if (item.token &&
389
+ !lockMgr.verifyLock(item.id, item.token, lockCtx) &&
390
+ !this.isExpiredButOwned(item.id, item.token, lockCtx)) {
380
391
  this.throwIfOwnershipConflict(item.id, lockCtx);
381
392
  // Recover stall-retried job (#75): lock expired and job was
382
393
  // re-queued by lock expiration or stall detection. Complete it
@@ -392,6 +403,9 @@ export class QueueManager {
392
403
  }
393
404
  continue;
394
405
  }
406
+ // #101 grace window (isExpiredButOwned true): the lock TTL elapsed while
407
+ // the handler ran, but the lock is still OURS and the job is still in
408
+ // `processing` — accept the completion instead of losing the work.
395
409
  validItems.push(item);
396
410
  }
397
411
  if (validItems.length > 0) {
@@ -436,6 +450,46 @@ export class QueueManager {
436
450
  throw new Error(`Invalid or expired lock token for job ${jobId}`);
437
451
  }
438
452
  }
453
+ /**
454
+ * Issue #101 grace window: decide whether an ACK whose lock failed
455
+ * verification (because the TTL expired) should still be honored.
456
+ *
457
+ * Returns true ONLY when ALL hold:
458
+ * 1. the job is still in `processing`,
459
+ * 2. the lock entry's token still matches the presenting worker, and
460
+ * 3. the lock belongs to the CURRENT processing instance — its `createdAt`
461
+ * is not older than the job's `startedAt` (this check is skipped when the processing record is missing or its `startedAt` is null).
462
+ *
463
+ * Condition 3 is the re-lease guard. A lock-expiry re-lease (checkExpiredLocks)
464
+ * deletes the stale lock, so a new lease installs a NEW token and condition 2
465
+ * already fails. But the STALL path (stallDetection retry/moveToDlq) requeues
466
+ * the job WITHOUT deleting the lock — the original (now-expired) lock lingers
467
+ * with the original token. If another worker then re-pulls the job, its
468
+ * `startedAt` is reset to a newer time than the lingering lock's `createdAt`,
469
+ * so condition 3 fails and the timed-out worker's late ack is rejected
470
+ * (preventing the job from being completed twice). In the genuine #101
471
+ * case — the same worker finishing just after its lock expired, no re-pull —
472
+ * `startedAt` is unchanged and `createdAt >= startedAt`, so the grace is granted
473
+ * and the successful completion is recorded instead of being lost to a stall.
474
+ *
475
+ * Without this, a successful completion arriving just after lock expiry is
476
+ * rejected as "Invalid or expired lock token", the client drops it, and the
477
+ * job stalls to `failed` despite having been processed correctly.
478
+ */
479
+ isExpiredButOwned(jobId, token, lockCtx) {
480
+ const loc = this.jobIndex.get(jobId);
481
+ if (loc?.type !== 'processing') // condition 1: job must still be indexed as `processing`
482
+ return false;
483
+ const lock = lockCtx.jobLocks.get(jobId);
484
+ if (lock?.token !== token) // condition 2; the visible ack paths only call this with a truthy token, so a missing lock entry also fails here
485
+ return false;
486
+ // Re-lease guard (condition 3): a re-pulled job has a startedAt newer than the lingering
487
+ // lock's createdAt → the lock no longer owns the current processing instance. Only evaluated when the processing record exists and startedAt is non-null.
488
+ const job = this.processingShards[loc.shardIdx].get(jobId);
489
+ if (job && job.startedAt !== null && job.startedAt > lock.createdAt)
490
+ return false;
491
+ return true; // conditions 1 and 2 hold, and condition 3 either held or could not be evaluated
492
+ }
439
493
  /** Check if a queued job was stall-retried (has been processed before). */
440
494
  isStallRetried(jobId) {
441
495
  const loc = this.jobIndex.get(jobId);
@@ -653,6 +707,7 @@ export class QueueManager {
653
707
  // ============ Queue Control ============
654
708
  pause(queue) {
655
709
  queueControl.pauseQueue(queue, this.contextFactory.getQueueControlContext());
710
+ this.persistQueueState(queue);
656
711
  this.dashboardEmit?.('queue:paused', { queue });
657
712
  this.eventsManager.broadcast({
658
713
  eventType: "paused" /* EventType.Paused */,
@@ -663,6 +718,7 @@ export class QueueManager {
663
718
  }
664
719
  resume(queue) {
665
720
  queueControl.resumeQueue(queue, this.contextFactory.getQueueControlContext());
721
+ this.persistQueueState(queue);
666
722
  this.dashboardEmit?.('queue:resumed', { queue });
667
723
  this.eventsManager.broadcast({
668
724
  eventType: "resumed" /* EventType.Resumed */,
@@ -735,11 +791,21 @@ export class QueueManager {
735
791
  // their own; obliterate is the documented way to reclaim ALL state for a
736
792
  // queue, so drop its metrics entry too (prevents unbounded growth for
737
793
  // ephemeral/dynamically-named queues).
738
- this.perQueueMetrics.delete(queue);
794
+ this.purgeQueueMetadata(queue);
739
795
  this.unregisterQueueName(queue);
740
796
  this.dashboardEmit?.('queue:obliterated', { queue });
741
797
  this.dashboardEmit?.('queue:removed', { queue });
742
798
  }
799
+ /**
800
+ * Drop per-queue metadata that obliterate is responsible for reclaiming:
801
+ * cumulative metrics (keyed by name, never self-expiring) and the persisted
802
+ * control-state row (#100 — so a stale pause/limit can't resurrect on the
803
+ * next restart). The row delete is skipped when no storage is attached.
804
+ */
805
+ purgeQueueMetadata(queue) {
806
+ this.perQueueMetrics.delete(queue);
807
+ this.storage?.deleteQueueState(queue); // optional chaining: nothing to delete when storage is absent
808
+ }
743
809
  listQueues() {
744
810
  return Array.from(this.queueNamesCache);
745
811
  }
@@ -792,15 +858,38 @@ export class QueueManager {
792
858
  // ============ Rate Limiting ============
793
859
  setRateLimit(queue, limit) {
794
860
  this.shards[shardIndex(queue)].setRateLimit(queue, limit);
861
+ this.persistQueueState(queue); // #100: persist after the shard mutation so the stored state is the post-change state
795
862
  }
796
863
  clearRateLimit(queue) {
797
864
  this.shards[shardIndex(queue)].clearRateLimit(queue);
865
+ this.persistQueueState(queue); // #100: persistQueueState drops the stored row if the queue is now at all-default state
798
866
  }
799
867
  setConcurrency(queue, limit) {
800
868
  this.shards[shardIndex(queue)].setConcurrency(queue, limit);
869
+ this.persistQueueState(queue); // #100: persist after the shard mutation so the stored state is the post-change state
801
870
  }
802
871
  clearConcurrency(queue) {
803
872
  this.shards[shardIndex(queue)].clearConcurrency(queue);
873
+ this.persistQueueState(queue); // #100: persistQueueState drops the stored row if the queue is now at all-default state
874
+ }
875
+ /**
876
+ * Issue #100: write-through the current control-state (paused / rate-limit /
877
+ * concurrency) to the `queue_state` table so it survives a server restart.
878
+ * Reads the post-mutation state from the owning shard and UPSERTs the row, or deletes the row when the state is back to all defaults. No-op when no storage is attached.
879
+ */
880
+ persistQueueState(queue) {
881
+ if (!this.storage) // no storage attached: nothing to persist
882
+ return;
883
+ const state = this.shards[shardIndex(queue)].getState(queue);
884
+ // When control-state returns fully to default (not paused, no limits), drop
885
+ // the row instead of persisting an all-default placeholder. Keeps the table
886
+ // free of noise rows for ephemeral queues that only ever call resume/clear*,
887
+ // and recovers identically (absent row → default state).
888
+ if (!state.paused && state.rateLimit === null && state.concurrencyLimit === null) {
889
+ this.storage.deleteQueueState(queue);
890
+ return;
891
+ }
892
+ this.storage.saveQueueState(queue, state.paused, state.rateLimit, state.concurrencyLimit); // at least one field is non-default: upsert the row
804
893
  }
805
894
  /** Get rate limit and concurrency limit for a queue */
806
895
  getQueueLimits(queue) {
@@ -175,6 +175,20 @@ export declare class SqliteStorage {
175
175
  deleteCron(name: string): void;
176
176
  /** Update cron job execution state (executions count and next run time) */
177
177
  updateCron(name: string, executions: number, nextRun: number): void;
178
+ /**
179
+ * Persist a queue's control-state (paused / rate-limit / concurrency) so it
180
+ * survives a server restart. Written through by QueueManager.persistQueueState on pause/resume/limit changes that leave a non-default state.
181
+ */
182
+ saveQueueState(name: string, paused: boolean, rateLimit: number | null, concurrencyLimit: number | null): void;
183
+ /** Load all persisted queue control-state rows (used by recover() on boot). A `null` limit means no limit was persisted for that queue. */
184
+ loadQueueState(): Array<{
185
+ name: string;
186
+ paused: boolean;
187
+ rateLimit: number | null;
188
+ concurrencyLimit: number | null;
189
+ }>;
190
+ /** Drop a queue's persisted control-state (on obliterate, or when the state returns to all defaults). */
191
+ deleteQueueState(name: string): void;
178
192
  close(): void;
179
193
  getSize(): number;
180
194
  }
@@ -6,7 +6,7 @@
6
6
  import { Database } from 'bun:sqlite';
7
7
  import { createDlqEntry } from '../../domain/types/dlq';
8
8
  import { PRAGMA_SETTINGS, SCHEMA, MIGRATION_TABLE, SCHEMA_VERSION, MIGRATIONS } from './schema';
9
- import { prepareStatements } from './statements';
9
+ import { prepareStatements, } from './statements';
10
10
  import { pack, unpack, rowToJob, reconstructDlqEntry } from './sqliteSerializer';
11
11
  import { BatchInsertManager, WriteBuffer } from './sqliteBatch';
12
12
  import { storageLog } from '../../shared/logger';
@@ -581,6 +581,34 @@ export class SqliteStorage {
581
581
  this.statements.get('updateCron').run(executions, nextRun, name);
582
582
  });
583
583
  }
584
+ // ============ Queue Control-State (#100) ============
585
+ /**
586
+ * Persist a queue's control-state (paused / rate-limit / concurrency) so it
587
+ * survives a server restart. Written through by QueueManager.persistQueueState on pause/resume/limit changes that leave a non-default state.
588
+ */
589
+ saveQueueState(name, paused, rateLimit, concurrencyLimit) {
590
+ this.safeWrite(() => {
591
+ this.statements
592
+ .get('upsertQueueState')
593
+ .run(name, paused ? 1 : 0, rateLimit, concurrencyLimit); // boolean is stored as 0/1; parameter order matches the upsert's column list
594
+ });
595
+ }
596
+ /** Load all persisted queue control-state rows (used by recover() on boot). Maps snake_case columns to camelCase fields. */
597
+ loadQueueState() {
598
+ const rows = this.statements.get('loadQueueState').all();
599
+ return rows.map((row) => ({
600
+ name: row.name,
601
+ paused: row.paused === 1, // 0/1 column back to boolean
602
+ rateLimit: row.rate_limit,
603
+ concurrencyLimit: row.concurrency_limit,
604
+ }));
605
+ }
606
+ /** Drop a queue's persisted control-state (on obliterate, or when the state returns to all defaults). */
607
+ deleteQueueState(name) {
608
+ this.safeWrite(() => {
609
+ this.statements.get('deleteQueueState').run(name);
610
+ });
611
+ }
584
612
  // ============ Utilities ============
585
613
  close() {
586
614
  this.writeBuffer.stop();
@@ -4,7 +4,7 @@
4
4
  */
5
5
  import type { Database } from 'bun:sqlite';
6
6
  /** Statement names */
7
- export type StatementName = 'insertJob' | 'updateJobState' | 'completeJob' | 'deleteJob' | 'deleteJobResult' | 'getJob' | 'insertResult' | 'getResult' | 'insertDlq' | 'loadDlq' | 'deleteDlqEntry' | 'clearDlqQueue' | 'insertCron' | 'updateCron';
7
+ export type StatementName = 'insertJob' | 'updateJobState' | 'completeJob' | 'deleteJob' | 'deleteJobResult' | 'getJob' | 'insertResult' | 'getResult' | 'insertDlq' | 'loadDlq' | 'deleteDlqEntry' | 'clearDlqQueue' | 'insertCron' | 'updateCron' | 'upsertQueueState' | 'loadQueueState' | 'deleteQueueState';
8
8
  /** SQL statements */
9
9
  export declare const SQL_STATEMENTS: Record<StatementName, string>;
10
10
  /** Prepare all statements */
@@ -61,3 +61,10 @@ export interface DbCron {
61
61
  prevent_overlap: number;
62
62
  job_options: Uint8Array | null;
63
63
  }
64
+ /** Database row type for queue control-state (#100); column names are snake_case as selected from `queue_state`. */
65
+ export interface DbQueueState {
66
+ name: string;
67
+ paused: number; // 0 or 1 (written as `paused ? 1 : 0`, read back with `=== 1`)
68
+ rate_limit: number | null; // null when no rate limit is persisted
69
+ concurrency_limit: number | null; // null when no concurrency limit is persisted
70
+ }
@@ -34,6 +34,10 @@ export const SQL_STATEMENTS = {
34
34
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
35
35
  `,
36
36
  updateCron: 'UPDATE cron_jobs SET executions = ?, next_run = ? WHERE name = ?',
37
+ // Queue control-state persistence (#100): paused / rate-limit / concurrency.
38
+ upsertQueueState: 'INSERT OR REPLACE INTO queue_state (name, paused, rate_limit, concurrency_limit) VALUES (?, ?, ?, ?)',
39
+ loadQueueState: 'SELECT name, paused, rate_limit, concurrency_limit FROM queue_state',
40
+ deleteQueueState: 'DELETE FROM queue_state WHERE name = ?',
37
41
  };
38
42
  /** Prepare all statements */
39
43
  export function prepareStatements(db) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bunqueue",
3
- "version": "2.8.18",
3
+ "version": "2.8.19",
4
4
  "description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
5
5
  "type": "module",
6
6
  "main": "dist/main.js",