npm - bunqueue - Versions diffs - 2.8.4 → 2.8.6 - Mend

bunqueue 2.8.4 → 2.8.6

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/application/contextFactory.js +2 -0
package/dist/application/operations/push.d.ts +2 -0
package/dist/application/operations/push.js +21 -11
package/dist/client/queue/queue.js +2 -0
package/dist/client/tcp/client.d.ts +9 -0
package/dist/client/tcp/client.js +38 -3
package/dist/client/tcp/connection.js +11 -0
package/dist/client/tcp/health.d.ts +14 -0
package/dist/client/tcp/health.js +24 -0
package/dist/client/tcp/types.d.ts +10 -0
package/dist/client/tcp/types.js +1 -0
package/dist/client/tcpPool.js +2 -0
package/dist/client/types.d.ts +6 -0
package/dist/client/worker/worker.js +1 -0
package/dist/infrastructure/persistence/sqlite.js +18 -5
package/dist/infrastructure/persistence/sqliteBatch.d.ts +32 -3
package/dist/infrastructure/persistence/sqliteBatch.js +129 -55
package/package.json +1 -1

package/dist/application/contextFactory.js CHANGED Viewed

@@ -80,6 +80,8 @@ export class ContextFactory {
             shards: this.deps.shards,
             shardLocks: this.deps.shardLocks,
             completedJobs: this.deps.completedJobs,
+            completedJobsData: this.deps.completedJobsData,
+            jobResults: this.deps.jobResults,
             customIdMap: this.deps.customIdMap,
             jobIndex: this.deps.jobIndex,
             totalPushed: this.deps.metrics.totalPushed,

package/dist/application/operations/push.d.ts CHANGED Viewed

@@ -14,6 +14,8 @@ export interface PushContext {
     shards: Shard[];
     shardLocks: RWLock[];
     completedJobs: SetLike<JobId>;
+    completedJobsData: MapLike<JobId, Job>;
+    jobResults: MapLike<JobId, unknown>;
     customIdMap: MapLike<string, JobId>;
     jobIndex: Map<JobId, JobLocation>;
     totalPushed: {

package/dist/application/operations/push.js CHANGED Viewed

@@ -17,19 +17,29 @@ function handleCustomId(input, shard, ctx) {
     }
     const id = jobId(input.customId);
     const existing = ctx.customIdMap.get(input.customId);
-    // No existing mapping - register and proceed
-    if (!existing) {
-        ctx.customIdMap.set(input.customId, id);
-        return { skip: false, id };
+    // If the existing job is still queued, the add is idempotent — return it.
+    if (existing) {
+        const location = ctx.jobIndex.get(existing);
+        const existingJob = location?.type === 'queue' ? shard.getQueue(location.queueName).find(existing) : null;
+        if (existingJob) {
+            return { skip: true, existingJob };
+        }
     }
-    // Check if existing job is still in queue
-    const location = ctx.jobIndex.get(existing);
-    const existingJob = location?.type === 'queue' ? shard.getQueue(location.queueName).find(existing) : null;
-    if (existingJob) {
-        return { skip: true, existingJob };
+    // Reuse path: the id is free in the queue (no mapping, or the prior job is
+    // processing/completed). If the prior job COMPLETED, its row survives on disk
+    // (markCompleted does an UPDATE, not a DELETE) and it is still in completedJobs.
+    // Reusing the same deterministic id would then (a) make getJobState return
+    // 'completed' for the brand-new job and (b) collide on the `jobs.id` PRIMARY KEY
+    // at flush time. Evict the stale completed job so the reused id starts fresh as
+    // 'waiting' (#92). Checked regardless of customIdMap state — the mapping may have
+    // been cleared on completion, which would otherwise skip this path entirely.
+    if (ctx.completedJobs.has(id)) {
+        ctx.completedJobs.delete(id);
+        ctx.completedJobsData.delete(id);
+        ctx.jobResults.delete(id);
+        ctx.jobIndex.delete(id);
+        ctx.storage?.deleteJob(id); // removes the surviving row + result + any buffered insert
     }
-    // Job gone (processing/completed) - allow reuse of customId
-    ctx.customIdMap.delete(input.customId);
     ctx.customIdMap.set(input.customId, id);
     return { skip: false, id };
 }

package/dist/client/queue/queue.js CHANGED Viewed

@@ -63,6 +63,7 @@ export class Queue {
                     poolSize,
                     pingInterval: connOpts.pingInterval,
                     commandTimeout: connOpts.commandTimeout,
+                    maxCommandTimeouts: connOpts.maxCommandTimeouts,
                     pipelining: connOpts.pipelining,
                     maxInFlight: connOpts.maxInFlight,
                 });
@@ -76,6 +77,7 @@ export class Queue {
                     poolSize,
                     pingInterval: connOpts.pingInterval,
                     commandTimeout: connOpts.commandTimeout,
+                    maxCommandTimeouts: connOpts.maxCommandTimeouts,
                     pipelining: connOpts.pipelining,
                     maxInFlight: connOpts.maxInFlight,
                 });

package/dist/client/tcp/client.d.ts CHANGED Viewed

@@ -73,6 +73,15 @@ export declare class TcpClient extends EventEmitter {
     /** Send ping to check connection health */
     ping(): Promise<boolean>;
     private handlePingFailure;
+    /**
+     * A command timed out. On a half-open socket (peer gone, no FIN/RST) writes
+     * keep succeeding but no response ever returns — every command times out
+     * while the socket still looks "connected". The health-check ping is one way
+     * to notice, but it can be disabled or slower than real traffic, leaving a
+     * worker's PULL loop to time out forever without ever reconnecting (#94).
+     * Treat a sustained run of timeouts as a dead link and force a reconnect.
+     */
+    private handleCommandTimeout;
     private forceReconnect;
     /** Get connection health metrics */
     getHealth(): ConnectionHealth;

package/dist/client/tcp/client.js CHANGED Viewed

@@ -79,6 +79,7 @@ export class TcpClient extends EventEmitter {
         this.health = new HealthTracker({
             pingInterval: this.options.pingInterval,
             maxPingFailures: this.options.maxPingFailures,
+            maxCommandTimeouts: this.options.maxCommandTimeouts,
         });
         this.reconnect = new ReconnectManager({
             maxReconnectAttempts: this.options.maxReconnectAttempts,
@@ -258,15 +259,44 @@ export class TcpClient extends EventEmitter {
             this.emit('health', { type: 'ping_failed' });
         }
     }
+    /**
+     * A command timed out. On a half-open socket (peer gone, no FIN/RST) writes
+     * keep succeeding but no response ever returns — every command times out
+     * while the socket still looks "connected". The health-check ping is one way
+     * to notice, but it can be disabled or slower than real traffic, leaving a
+     * worker's PULL loop to time out forever without ever reconnecting (#94).
+     * Treat a sustained run of timeouts as a dead link and force a reconnect.
+     */
+    handleCommandTimeout() {
+        if (this.health.recordCommandTimeout()) {
+            this.emit('health', { type: 'unhealthy', reason: 'max_command_timeouts' });
+            this.forceReconnect();
+        }
+    }
     forceReconnect() {
         if (this.reconnect.isClosed())
             return;
         if (this.socket) {
-            this.socket.end();
+            // end() can throw on an already-errored/half-dead socket. Swallow it:
+            // failing to close the corpse must NOT abort the reconnect path below,
+            // or the connection would stay wedged forever (the #94 failure mode).
+            try {
+                this.socket.end();
+            }
+            catch {
+                /* socket already torn down */
+            }
             this.socket = null;
         }
         this.connected = false;
         this.health.stopPing();
+        // Settle every in-flight/queued command NOW. Otherwise their per-command
+        // timeouts keep ticking and fire AFTER the fresh socket is up — each stale
+        // timeout bumps the new connection's dead-link counter and can re-trigger
+        // forceReconnect in a loop (a reconnect storm that never stabilises). It
+        // also unblocks awaiting callers (e.g. a Worker's PULL) immediately instead
+        // of making them wait out the full commandTimeout on a corpse.
+        this.commands.rejectAll(new Error('Connection lost'));
         if (this.reconnect.canReconnect())
             this.reconnect.scheduleReconnect(() => this.connect());
     }
@@ -343,6 +373,8 @@ export class TcpClient extends EventEmitter {
                 if (removed) {
                     this.health.recordError();
                     next.reject(new Error('Command timeout'));
+                    // In-flight command got no response: count it toward dead-link detection.
+                    this.handleCommandTimeout();
                 }
             }, this.options.commandTimeout);
             next.timeout = newTimeout;
@@ -365,17 +397,20 @@ export class TcpClient extends EventEmitter {
         let pendingRef;
         const promise = new Promise((resolve, reject) => {
             const timeout = setTimeout(() => {
-                // Try to remove from queue first
+                // Try to remove from queue first. A still-queued command never reached
+                // the socket (e.g. waiting on connect), so it is NOT evidence of a dead
+                // link — reject it but don't count it toward dead-link detection.
                 if (this.commands.remove(id)) {
                     this.health.recordError();
                     reject(new Error('Command timeout'));
                     return;
                 }
-                // Try to remove from in-flight
+                // Try to remove from in-flight: this one WAS sent and got no response.
                 const removed = this.commands.removeByReqId(reqId);
                 if (removed) {
                     this.health.recordError();
                     reject(new Error('Command timeout'));
+                    this.handleCommandTimeout();
                 }
             }, this.options.commandTimeout);
             pendingRef = {

package/dist/client/tcp/connection.js CHANGED Viewed

@@ -42,6 +42,17 @@ export async function createConnection(target, connectTimeout, events) {
             },
             open(sock) {
                 cleanup();
+                // Enable TCP keepalive so the OS probes idle connections and surfaces a
+                // dead peer (suspended host, NAT/LB drop) via an error/close event,
+                // instead of a half-open socket lingering until tcp_retries2 (~15 min).
+                // Best-effort: not all platforms honor the delay, and older Bun builds
+                // may lack the method — never let it abort connection setup. See #94.
+                try {
+                    sock.setKeepAlive?.(true, 15000);
+                }
+                catch {
+                    /* keepalive unsupported on this platform/runtime */
+                }
                 socketData.write = (d) => sock.write(d);
                 socketData.end = () => sock.end();
                 connectionResolved = true;

package/dist/client/tcp/health.d.ts CHANGED Viewed

@@ -7,6 +7,11 @@ import type { ConnectionHealth } from './types';
 export interface HealthConfig {
     pingInterval: number;
     maxPingFailures: number;
+    /**
+     * Consecutive command timeouts before the link is concluded dead (0 = off).
+     * Optional for backward compatibility; defaults to 3 when omitted.
+     */
+    maxCommandTimeouts?: number;
 }
 /**
  * Tracks connection health metrics
@@ -14,6 +19,7 @@ export interface HealthConfig {
 export declare class HealthTracker {
     private readonly config;
     private consecutivePingFailures;
+    private consecutiveCommandTimeouts;
     private lastSuccessAt;
     private lastErrorAt;
     private connectedAt;
@@ -35,6 +41,14 @@ export declare class HealthTracker {
     recordPingSuccess(latencyMs: number): void;
     /** Record ping failure, returns true if max failures reached */
     recordPingFailure(): boolean;
+    /**
+     * Record a command timeout. Returns true when the configured consecutive
+     * threshold is reached (and the feature is enabled), signalling the caller to
+     * force a reconnect. Any intervening success resets the counter, so this only
+     * fires on a sustained run of timeouts — the signature of a dead/half-open
+     * socket where writes succeed but no response ever comes back.
+     */
+    recordCommandTimeout(): boolean;
     /** Get current health metrics */
     getHealth(state: 'connected' | 'connecting' | 'disconnected' | 'closed'): ConnectionHealth;
     /** Start ping timer */

package/dist/client/tcp/health.js CHANGED Viewed

@@ -2,12 +2,15 @@
  * TCP Health Tracker
  * Monitors connection health with ping and latency tracking
  */
+/** Default consecutive command-timeout threshold when not configured. */
+const DEFAULT_MAX_COMMAND_TIMEOUTS = 3;
 /**
  * Tracks connection health metrics
  */
 export class HealthTracker {
     config;
     consecutivePingFailures = 0;
+    consecutiveCommandTimeouts = 0;
     lastSuccessAt = null;
     lastErrorAt = null;
     connectedAt = null;
@@ -23,6 +26,9 @@ export class HealthTracker {
     recordSuccess(latencyMs) {
         this.lastSuccessAt = Date.now();
         this.totalCommands++;
+        // A real response proves the link is alive: the prior timeouts were not a
+        // sustained run, so reset the dead-link counter ("consecutive" must mean it).
+        this.consecutiveCommandTimeouts = 0;
         this.recordLatency(latencyMs);
     }
     /** Record command error */
@@ -38,10 +44,13 @@ export class HealthTracker {
     recordConnected() {
         this.connectedAt = Date.now();
         this.consecutivePingFailures = 0;
+        this.consecutiveCommandTimeouts = 0;
     }
     /** Record ping success */
     recordPingSuccess(latencyMs) {
+        // A successful ping is also proof the link is alive — clear both suspicions.
         this.consecutivePingFailures = 0;
+        this.consecutiveCommandTimeouts = 0;
         this.recordLatency(latencyMs);
     }
     /** Record ping failure, returns true if max failures reached */
@@ -51,6 +60,20 @@ export class HealthTracker {
         this.totalErrors++;
         return this.consecutivePingFailures >= this.config.maxPingFailures;
     }
+    /**
+     * Record a command timeout. Returns true when the configured consecutive
+     * threshold is reached (and the feature is enabled), signalling the caller to
+     * force a reconnect. Any intervening success resets the counter, so this only
+     * fires on a sustained run of timeouts — the signature of a dead/half-open
+     * socket where writes succeed but no response ever comes back.
+     */
+    recordCommandTimeout() {
+        const max = this.config.maxCommandTimeouts ?? DEFAULT_MAX_COMMAND_TIMEOUTS;
+        if (max <= 0)
+            return false;
+        this.consecutiveCommandTimeouts++;
+        return this.consecutiveCommandTimeouts >= max;
+    }
     /** Get current health metrics */
     getHealth(state) {
         const avgLatency = this.latencyHistory.length > 0
@@ -63,6 +86,7 @@ export class HealthTracker {
             lastErrorAt: this.lastErrorAt,
             avgLatencyMs: Math.round(avgLatency * 100) / 100,
             consecutivePingFailures: this.consecutivePingFailures,
+            consecutiveCommandTimeouts: this.consecutiveCommandTimeouts,
             totalCommands: this.totalCommands,
             totalErrors: this.totalErrors,
             uptimeMs: this.connectedAt ? Date.now() - this.connectedAt : 0,

package/dist/client/tcp/types.d.ts CHANGED Viewed

@@ -26,6 +26,14 @@ export interface ConnectionOptions {
     pingInterval?: number;
     /** Max consecutive ping failures before forcing reconnect (default: 3) */
     maxPingFailures?: number;
+    /**
+     * Max consecutive command timeouts (with no intervening success) before the
+     * connection is concluded dead and reconnect is forced (default: 3, 0 to
+     * disable). This is the recovery path for a half-open socket when the
+     * health-check ping is disabled or slower than real traffic — a worker whose
+     * PULLs keep timing out no longer stalls forever waiting on the ping. See #94.
+     */
+    maxCommandTimeouts?: number;
     /** Enable pipelining - multiple commands in flight (default: true) */
     pipelining?: boolean;
     /** Max commands in flight when pipelining (default: 100) */
@@ -45,6 +53,8 @@ export interface ConnectionHealth {
     avgLatencyMs: number;
     /** Consecutive ping failures */
     consecutivePingFailures: number;
+    /** Consecutive command timeouts with no intervening success */
+    consecutiveCommandTimeouts: number;
     /** Total commands sent */
     totalCommands: number;
     /** Total errors */

package/dist/client/tcp/types.js CHANGED Viewed

@@ -15,6 +15,7 @@ export const DEFAULT_CONNECTION = {
     autoReconnect: true,
     pingInterval: 30000,
     maxPingFailures: 3,
+    maxCommandTimeouts: 3,
     pipelining: true,
     maxInFlight: 100,
 };

package/dist/client/tcpPool.js CHANGED Viewed

@@ -29,6 +29,7 @@ export class TcpConnectionPool {
             autoReconnect: options.autoReconnect ?? true,
             pingInterval: options.pingInterval ?? 30000,
             maxPingFailures: options.maxPingFailures ?? 3,
+            maxCommandTimeouts: options.maxCommandTimeouts ?? 3,
             pipelining: options.pipelining ?? true,
             maxInFlight: options.maxInFlight ?? 100,
         };
@@ -46,6 +47,7 @@ export class TcpConnectionPool {
                 autoReconnect: this.options.autoReconnect,
                 pingInterval: this.options.pingInterval,
                 maxPingFailures: this.options.maxPingFailures,
+                maxCommandTimeouts: this.options.maxCommandTimeouts,
             });
             this.clients.push(client);
         }

package/dist/client/types.d.ts CHANGED Viewed

@@ -376,6 +376,12 @@ export interface ConnectionOptions {
     pingInterval?: number;
     /** Command timeout in ms (default: 30000) */
     commandTimeout?: number;
+    /**
+     * Consecutive command timeouts (no intervening success) before the connection
+     * is concluded dead and a reconnect is forced (default: 3, 0 to disable).
+     * Recovery path for a half-open socket independent of the health-check ping. See #94.
+     */
+    maxCommandTimeouts?: number;
     /** Enable TCP pipelining (default: true) */
     pipelining?: boolean;
     /** Max commands in flight per connection (default: 100) */

package/dist/client/worker/worker.js CHANGED Viewed

@@ -47,6 +47,7 @@ function createTcpPool(opts, concurrency) {
         poolSize,
         pingInterval: connOpts.pingInterval,
         commandTimeout: connOpts.commandTimeout,
+        maxCommandTimeouts: connOpts.maxCommandTimeouts,
         pipelining: connOpts.pipelining,
         maxInFlight: connOpts.maxInFlight,
     });

package/dist/infrastructure/persistence/sqlite.js CHANGED Viewed

@@ -56,11 +56,24 @@ export class SqliteStorage {
             if (isSqliteFullError(err)) {
                 this.setDiskFull(err.message);
             }
-            storageLog.error('Write buffer flush failed', {
-                jobCount,
-                error: err.message,
-                diskFull: this._diskFull,
-            });
+            // A constraint violation (e.g. duplicate jobs.id) is a PERMANENT per-row
+            // rejection that the WriteBuffer isolated and dropped — sibling valid jobs
+            // in the same flush were still persisted. Log it distinctly from a
+            // transient flush failure (and never route it to the DLQ, which would
+            // resurrect a duplicate).
+            if (/constraint failed/i.test(err.message)) {
+                storageLog.error('Write buffer rejected jobs (constraint violation, dropped)', {
+                    rejectedJobCount: jobCount,
+                    error: err.message,
+                });
+            }
+            else {
+                storageLog.error('Write buffer flush failed', {
+                    jobCount,
+                    error: err.message,
+                    diskFull: this._diskFull,
+                });
+            }
         }, (jobs, lastError, attempts) => {
             this.handleCriticalLoss(jobs, lastError, attempts);
         });

package/dist/infrastructure/persistence/sqliteBatch.d.ts CHANGED Viewed

@@ -4,13 +4,33 @@
  */
 import type { Database } from 'bun:sqlite';
 import type { Job } from '../../domain/types/job';
+/** Outcome of a batch insert after isolating per-row failures. */
+export interface BatchInsertResult {
+    /** Jobs that hit a transient (non-constraint) error — caller should retry. */
+    transient: Job[];
+    /** Jobs rejected by a permanent constraint (e.g. duplicate id) — drop, never retry. */
+    conflicts: Job[];
+    /** The originating error from the fast-path failure, for logging. */
+    error?: Error;
+}
 /** Batch insert manager with prepared statement caching */
 export declare class BatchInsertManager {
     private readonly db;
     private readonly cache;
     constructor(db: Database);
-    /** Insert batch of jobs using multi-row INSERT for 50-100x speedup */
-    insertJobsBatch(jobs: Job[]): void;
+    /**
+     * Insert a batch of jobs using a single multi-row INSERT (50-100x speedup).
+     * On the common path every row succeeds and an empty result is returned.
+     *
+     * If the atomic batch fails (e.g. a single duplicate `jobs.id`), the rows are
+     * re-inserted ONE AT A TIME so a single bad row can no longer drop the rest:
+     * valid jobs persist, constraint violations are isolated as `conflicts` (drop,
+     * never retry — they would poison every future flush), and any transient
+     * failures are returned so the caller can retry just those. Never throws.
+     */
+    insertJobsBatch(jobs: Job[]): BatchInsertResult;
+    /** Fallback path: insert each job independently, isolating per-row failures. */
+    private insertRowByRow;
     /** Get or create cached prepared statement for batch insert */
     private getBatchInsertStmt;
     /** Insert a chunk of jobs with single multi-row INSERT */
@@ -52,7 +72,16 @@ export declare class WriteBuffer {
     add(job: Job): void;
     /** Add multiple jobs to buffer */
     addBatch(jobs: Job[]): void;
-    /** Flush buffer to disk using double-buffering. Returns number of jobs flushed. */
+    /**
+     * Flush buffer to disk using double-buffering. Returns number of jobs persisted.
+     *
+     * Per-row isolation (see BatchInsertManager.insertJobsBatch): valid jobs are
+     * persisted even if a sibling row violates a constraint. Constraint conflicts
+     * (e.g. duplicate id) are dropped+reported and NEVER re-buffered — re-buffering
+     * them would poison every future flush and silently drop unrelated valid jobs
+     * (the #92-class data-loss bug). Transient failures are re-buffered and retried
+     * with exponential backoff, exactly as before.
+     */
     flush(): number;
     /** Schedule a retry with exponential backoff */
     private scheduleBackoffRetry;

package/dist/infrastructure/persistence/sqliteBatch.js CHANGED Viewed

@@ -3,6 +3,20 @@
  * High-performance batch insert with prepared statement caching
  */
 import { pack } from './sqliteSerializer';
+const COLS_PER_ROW = 24;
+// SQLite has a limit of ~999 variables, so batch in chunks
+const MAX_ROWS_PER_INSERT = Math.floor(999 / COLS_PER_ROW);
+/**
+ * A constraint violation (e.g. a duplicate `jobs.id` PRIMARY KEY) is PERMANENT
+ * for that row — retrying never succeeds. It must be isolated from the rest of
+ * the batch, otherwise one bad row poisons the whole atomic flush and drops
+ * unrelated valid jobs (data loss, #92-class). Everything else (disk I/O, busy,
+ * full) is treated as transient and retried.
+ */
+function isConstraintError(err) {
+    const code = err.code ?? '';
+    return code.startsWith('SQLITE_CONSTRAINT') || /constraint failed/i.test(err.message);
+}
 /** Batch insert manager with prepared statement caching */
 export class BatchInsertManager {
     db;
@@ -10,20 +24,52 @@ export class BatchInsertManager {
     constructor(db) {
         this.db = db;
     }
-    /** Insert batch of jobs using multi-row INSERT for 50-100x speedup */
+    /**
+     * Insert a batch of jobs using a single multi-row INSERT (50-100x speedup).
+     * On the common path every row succeeds and an empty result is returned.
+     *
+     * If the atomic batch fails (e.g. a single duplicate `jobs.id`), the rows are
+     * re-inserted ONE AT A TIME so a single bad row can no longer drop the rest:
+     * valid jobs persist, constraint violations are isolated as `conflicts` (drop,
+     * never retry — they would poison every future flush), and any transient
+     * failures are returned so the caller can retry just those. Never throws.
+     */
     insertJobsBatch(jobs) {
         if (jobs.length === 0)
-            return;
+            return { transient: [], conflicts: [] };
         const now = Date.now();
-        const COLS_PER_ROW = 24;
-        // SQLite has a limit of ~999 variables, so batch in chunks
-        const MAX_ROWS_PER_INSERT = Math.floor(999 / COLS_PER_ROW);
-        this.db.transaction(() => {
-            for (let offset = 0; offset < jobs.length; offset += MAX_ROWS_PER_INSERT) {
-                const chunk = jobs.slice(offset, offset + MAX_ROWS_PER_INSERT);
-                this.insertJobsChunk(chunk, now);
+        try {
+            this.db.transaction(() => {
+                for (let offset = 0; offset < jobs.length; offset += MAX_ROWS_PER_INSERT) {
+                    const chunk = jobs.slice(offset, offset + MAX_ROWS_PER_INSERT);
+                    this.insertJobsChunk(chunk, now);
+                }
+            })();
+            return { transient: [], conflicts: [] };
+        }
+        catch (err) {
+            const batchError = err instanceof Error ? err : new Error(String(err));
+            return this.insertRowByRow(jobs, now, batchError);
+        }
+    }
+    /** Fallback path: insert each job independently, isolating per-row failures. */
+    insertRowByRow(jobs, now, batchError) {
+        const transient = [];
+        const conflicts = [];
+        for (const job of jobs) {
+            try {
+                // Single-row INSERT, auto-committed: succeeds/fails independently.
+                this.insertJobsChunk([job], now);
+            }
+            catch (e) {
+                const err = e instanceof Error ? e : new Error(String(e));
+                if (isConstraintError(err))
+                    conflicts.push(job);
+                else
+                    transient.push(job);
             }
-        })();
+        }
+        return { transient, conflicts, error: batchError };
     }
     /** Get or create cached prepared statement for batch insert */
     getBatchInsertStmt(size) {
@@ -112,7 +158,16 @@ export class WriteBuffer {
             this.flush();
         }
     }
-    /** Flush buffer to disk using double-buffering. Returns number of jobs flushed. */
+    /**
+     * Flush buffer to disk using double-buffering. Returns number of jobs persisted.
+     *
+     * Per-row isolation (see BatchInsertManager.insertJobsBatch): valid jobs are
+     * persisted even if a sibling row violates a constraint. Constraint conflicts
+     * (e.g. duplicate id) are dropped+reported and NEVER re-buffered — re-buffering
+     * them would poison every future flush and silently drop unrelated valid jobs
+     * (the #92-class data-loss bug). Transient failures are re-buffered and retried
+     * with exponential backoff, exactly as before.
+     */
     flush() {
         // Prevent flush after stop or concurrent flushes
         if (this.stopped || this.flushing)
@@ -125,53 +180,68 @@ export class WriteBuffer {
         this.activeBuffer = [];
         const jobCount = this.flushBuffer.length;
         try {
-            this.batchManager.insertJobsBatch(this.flushBuffer);
-            this.flushBuffer = []; // Clear after successful write
-            // Reset retry state on success
-            this.retryCount = 0;
-            this.currentBackoffMs = this.initialBackoffMs;
-            this.lastError = null;
-            return jobCount;
-        }
-        catch (err) {
-            const error = err instanceof Error ? err : new Error(String(err));
-            this.lastError = error;
-            this.retryCount++;
-            // On failure, prepend failed jobs back to active buffer
-            // This preserves order: failed jobs first, then new jobs
-            this.activeBuffer = this.flushBuffer.concat(this.activeBuffer);
+            // BatchInsertManager.insertJobsBatch isolates per-row failures and never
+            // throws. Stay defensive: a manager that throws (or returns nothing) is
+            // treated as a transient failure of the whole batch, preserving the
+            // re-buffer/retry/critical-loss semantics.
+            let result;
+            try {
+                // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition -- defensive against a non-conforming batch manager (e.g. a test double returning undefined)
+                result = this.batchManager.insertJobsBatch(this.flushBuffer) ?? {
+                    transient: [],
+                    conflicts: [],
+                };
+            }
+            catch (err) {
+                result = {
+                    transient: this.flushBuffer,
+                    conflicts: [],
+                    error: err instanceof Error ? err : new Error(String(err)),
+                };
+            }
+            const { transient, conflicts, error } = result;
             this.flushBuffer = [];
-            // Check if we've exceeded max retries
-            if (this.retryCount >= this.maxRetries) {
-                // Move jobs to dead letter / emit critical error
-                const lostJobs = [...this.activeBuffer];
-                this.activeBuffer = [];
-                if (this.onCriticalError) {
-                    this.onCriticalError(lostJobs, error, this.retryCount);
+            // Permanent constraint conflicts: drop + report once, never retry.
+            if (conflicts.length > 0) {
+                this.onError(error ?? new Error('Constraint violation'), conflicts.length);
+            }
+            // Transient failures: re-buffer just those and retry with backoff.
+            if (transient.length > 0) {
+                const error2 = error ?? new Error('Write buffer flush failed');
+                this.lastError = error2;
+                this.retryCount++;
+                // Prepend failed jobs back to active buffer (failed first, then new).
+                this.activeBuffer = transient.concat(this.activeBuffer);
+                if (this.retryCount >= this.maxRetries) {
+                    const lostJobs = [...this.activeBuffer];
+                    this.activeBuffer = [];
+                    if (this.onCriticalError)
+                        this.onCriticalError(lostJobs, error2, this.retryCount);
+                    this.onError(error2, lostJobs.length, {
+                        retryCount: this.retryCount,
+                        nextBackoffMs: 0,
+                        maxRetries: this.maxRetries,
+                    });
+                    this.retryCount = 0;
+                    this.currentBackoffMs = this.initialBackoffMs;
+                    this.lastError = null;
+                }
+                else {
+                    const nextBackoffMs = Math.min(this.currentBackoffMs * 2, this.maxBackoffMs);
+                    this.onError(error2, transient.length, {
+                        retryCount: this.retryCount,
+                        nextBackoffMs,
+                        maxRetries: this.maxRetries,
+                    });
+                    this.scheduleBackoffRetry();
                 }
-                // Also call onError with retry info for logging
-                this.onError(error, lostJobs.length, {
-                    retryCount: this.retryCount,
-                    nextBackoffMs: 0, // No more retries
-                    maxRetries: this.maxRetries,
-                });
-                // Reset retry state
-                this.retryCount = 0;
-                this.currentBackoffMs = this.initialBackoffMs;
-                this.lastError = null;
-                throw err;
+                return jobCount - transient.length - conflicts.length;
             }
-            // Calculate next backoff with exponential increase
-            const nextBackoffMs = Math.min(this.currentBackoffMs * 2, this.maxBackoffMs);
-            // Call error callback with retry information
-            this.onError(error, jobCount, {
-                retryCount: this.retryCount,
-                nextBackoffMs: nextBackoffMs,
-                maxRetries: this.maxRetries,
-            });
-            // Schedule backoff retry
-            this.scheduleBackoffRetry();
-            throw err;
+            // Success (or success-modulo-dropped-conflicts): reset retry state.
+            this.retryCount = 0;
+            this.currentBackoffMs = this.initialBackoffMs;
+            this.lastError = null;
+            return jobCount - conflicts.length;
         }
         finally {
             this.flushing = false;
@@ -287,6 +357,10 @@ export class WriteBuffer {
                 const flushed = this.flush();
                 clearTimeout(timeout);
                 this.stopped = true;
+                // flush() no longer throws on transient failure (it re-buffers); report
+                // anything that could not be persisted so it isn't silently lost.
+                if (this.pendingCount > 0)
+                    this.reportLostJobs();
                 resolve(flushed);
             }
             catch {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "bunqueue",
-  "version": "2.8.4",
+  "version": "2.8.6",
   "description": "High-performance job queue for Bun & AI agents. SQLite persistence, cron scheduling, priorities, retries, DLQ, webhooks, native MCP server. Zero external dependencies.",
   "type": "module",
   "main": "dist/main.js",