npm - @trigger.dev/redis-worker - Versions diffs - 4.5.0-rc.3 → 4.5.0-rc.4 - Mend

@trigger.dev/redis-worker 4.5.0-rc.3 → 4.5.0-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -16382,6 +16382,7 @@ var stringToDate = zod.z.string().transform((v, ctx) => {
   }
   return d;
 });
+var stringToBool = zod.z.union([zod.z.literal("true"), zod.z.literal("false")]).transform((v) => v === "true");
 var stringToError = zod.z.string().transform((v, ctx) => {
   try {
     return BufferEntryError.parse(JSON.parse(v));
@@ -16398,6 +16399,27 @@ var BufferEntrySchema = zod.z.object({
   status: BufferEntryStatus,
   attempts: stringToInt,
   createdAt: stringToDate,
+  // Microsecond epoch of accept time, kept as a hash field for dwell
+  // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted
+  // so an entry written by an accept Lua predating this field — or one
+  // surviving across the deploy that introduced it — still parses instead
+  // of being silently dropped on pop.
+  createdAtMicros: stringToInt.default("0"),
+  // Drainer-ack flag: `true` once the drainer has materialised this run
+  // into PG. The hash persists for a short grace TTL after ack so direct
+  // reads (retrieve, trace, etc.) still resolve while PG replica lag
+  // settles. Absent on pre-ack entries.
+  materialised: stringToBool.default("false"),
+  // Denormalised pointer to the Redis idempotency lookup key (set when
+  // the run was accepted with an idempotency key, empty otherwise). The
+  // ack Lua reads this to DEL the lookup atomically with marking the
+  // entry materialised.
+  idempotencyLookupKey: zod.z.string().optional().default(""),
+  // Optimistic-lock counter for the snapshot's `metadata` field.
+  // Incremented atomically by the CAS metadata Lua. Matches the
+  // semantics of `TaskRun.metadataVersion` on the PG side (which the
+  // UpdateMetadataService uses for the same retry-on-conflict pattern).
+  metadataVersion: stringToInt.default("0"),
   lastError: stringToError.optional()
 });
 function serialiseSnapshot(snapshot) {
@@ -16408,19 +16430,32 @@ function deserialiseSnapshot(serialised) {
 }
 // src/mollifier/buffer.ts
+var ACK_GRACE_TTL_SECONDS = 30;
+function mollifierReconnectDelayMs(times, random = Math.random) {
+  const base = Math.min(times * 50, 1e3);
+  const half = Math.floor(base / 2);
+  return half + Math.round(random() * (base - half));
+}
+function encodeKeyPart(value) {
+  return Buffer.from(value, "utf8").toString("base64url");
+}
+function idempotencyLookupKeyFor(input) {
+  return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
+}
+var PENDING_PREFIX = "pending:";
+function makeIdempotencyClaimKey(input) {
+  return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
+}
 var MollifierBuffer = class {
   redis;
-  entryTtlSeconds;
   logger;
   constructor(options) {
-    this.entryTtlSeconds = options.entryTtlSeconds;
     this.logger = options.logger ?? new logger$1.Logger("MollifierBuffer", "debug");
     this.redis = createRedisClient(
       {
         ...options.redisOptions,
         retryStrategy(times) {
-          const delay = Math.min(times * 50, 1e3);
-          return delay;
+          return mollifierReconnectDelayMs(times);
         },
         maxRetriesPerRequest: 20
       },
@@ -16432,14 +16467,26 @@ var MollifierBuffer = class {
     );
     this.#registerCommands();
   }
-  // Returns true if the entry was newly written; false if a duplicate runId
-  // was already buffered (idempotent no-op). Callers can use the boolean to
-  // record a duplicate-accept metric without affecting buffer state.
+  // Three outcomes:
+  //   - { kind: "accepted" } — entry was newly written.
+  //   - { kind: "duplicate_run_id" } — runId was already buffered (idempotent
+  //     no-op, same semantics as the previous boolean-false return).
+  //   - { kind: "duplicate_idempotency", existingRunId } — the (env, task,
+  //     idempotencyKey) tuple was already bound to another buffered run.
+  //     The accept Lua's atomic GET-then-SET picks the race winner; the
+  //     second caller gets the winner's runId to return as its trigger response.
   async accept(input) {
     const entryKey = `mollifier:entries:${input.runId}`;
     const queueKey = `mollifier:queue:${input.envId}`;
     const orgsKey = "mollifier:orgs";
-    const createdAt = (/* @__PURE__ */ new Date()).toISOString();
+    const nowMs = Date.now();
+    const createdAt = new Date(nowMs).toISOString();
+    const createdAtMicros = nowMs * 1e3;
+    const idempotencyLookupKey = input.idempotencyKey && input.taskIdentifier ? idempotencyLookupKeyFor({
+      envId: input.envId,
+      taskIdentifier: input.taskIdentifier,
+      idempotencyKey: input.idempotencyKey
+    }) : "";
     const result = await this.redis.acceptMollifierEntry(
       entryKey,
       queueKey,
@@ -16449,10 +16496,16 @@ var MollifierBuffer = class {
       input.orgId,
       input.payload,
       createdAt,
-      String(this.entryTtlSeconds),
-      "mollifier:org-envs:"
+      String(createdAtMicros),
+      "mollifier:org-envs:",
+      idempotencyLookupKey,
+      "mollifier:entries:"
     );
-    return result === 1;
+    if (typeof result === "string" && result.length > 0) {
+      return { kind: "duplicate_idempotency", existingRunId: result };
+    }
+    if (result === 1) return { kind: "accepted" };
+    return { kind: "duplicate_run_id" };
   }
   async pop(envId) {
     const queueKey = `mollifier:queue:${envId}`;
@@ -16506,8 +16559,220 @@ var MollifierBuffer = class {
   async listEnvsForOrg(orgId) {
     return this.redis.smembers(`mollifier:org-envs:${orgId}`);
   }
+  // Read-only enumeration of currently-queued entries for a single env.
+  // Used by the stale-sweep to compute per-entry dwell time, so order is
+  // immaterial — LRANGE returns them newest-first (LPUSH head) but the
+  // caller scans the whole window. Non-destructive: the drainer still
+  // RPOPs these entries in FIFO order.
+  //
+  // The entry HGETALLs are issued in a single pipelined batch (one
+  // network round-trip instead of N) — at the stale-sweep's default
+  // maxCount=1000 the serial implementation cost ~1000 RTTs per env,
+  // which dominated sweep wall-time at any meaningful backlog.
+  //
+  // A missing entry (empty hash) is skipped: the drainer's RPOP+DEL of
+  // the entry hash can race our LRANGE→HGETALL window, so a runId on
+  // the queue with no backing hash is an expected concurrency outcome,
+  // not an error.
+  async listEntriesForEnv(envId, maxCount) {
+    if (maxCount <= 0) return [];
+    const runIds = await this.redis.lrange(
+      `mollifier:queue:${envId}`,
+      0,
+      maxCount - 1
+    );
+    if (runIds.length === 0) return [];
+    const pipeline = this.redis.pipeline();
+    for (const runId of runIds) {
+      pipeline.hgetall(`mollifier:entries:${runId}`);
+    }
+    const results = await pipeline.exec();
+    if (!results) return [];
+    const entries = [];
+    for (let i = 0; i < results.length; i++) {
+      const [err, raw] = results[i];
+      if (err) {
+        this.logger.error("MollifierBuffer.listEntriesForEnv: hgetall failed", {
+          runId: runIds[i],
+          err: err.message
+        });
+        continue;
+      }
+      if (!raw || Object.keys(raw).length === 0) continue;
+      const parsed = BufferEntrySchema.safeParse(raw);
+      if (!parsed.success) {
+        this.logger.error("MollifierBuffer.listEntriesForEnv: invalid entry shape", {
+          runId: runIds[i],
+          errors: parsed.error.flatten()
+        });
+        continue;
+      }
+      entries.push(parsed.data);
+    }
+    return entries;
+  }
+  // Atomic snapshot mutation. Used by customer-mutation API endpoints
+  // (tags, metadata-put, reschedule, cancel) when the run is still in
+  // the buffer. Four outcomes:
+  //   - "applied_to_snapshot": entry was QUEUED + not materialised; the
+  //     drainer will read the patched payload on its next pop.
+  //   - "not_found": no entry hash exists for this runId — including a
+  //     FAILED entry, whose hash the drainer-terminal `fail` path DELs.
+  //   - "busy": entry is DRAINING or materialised. The API
+  //     wait-and-bounces through PG.
+  //   - "limit_exceeded": an `append_tags` patch carrying `maxTags` would
+  //     push the deduped tag count over the cap; nothing is written.
+  async mutateSnapshot(runId, patch) {
+    const result = await this.redis.mutateMollifierSnapshot(
+      `mollifier:entries:${runId}`,
+      JSON.stringify(patch)
+    );
+    if (result === "applied_to_snapshot" || result === "not_found" || result === "busy" || result === "limit_exceeded") {
+      return result;
+    }
+    throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`);
+  }
+  // Optimistic compare-and-swap on the snapshot's metadata. Caller reads
+  // the current metadataVersion via getEntry, applies operations in JS via
+  // `applyMetadataOperations`, then calls this with the new metadata + the
+  // expected version. Lua refuses if the version has moved (caller retries
+  // up to N times). Mirrors the PG-side `UpdateMetadataService` retry
+  // loop so concurrent increment/append operations don't lose deltas.
+  async casSetMetadata(input) {
+    const entryKey = `mollifier:entries:${input.runId}`;
+    const raw = await this.redis.casSetMollifierMetadata(
+      entryKey,
+      String(input.expectedVersion),
+      input.newMetadata,
+      input.newMetadataType
+    );
+    if (raw === "not_found") return { kind: "not_found" };
+    if (raw === "busy") return { kind: "busy" };
+    if (raw.startsWith("conflict:")) {
+      return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) };
+    }
+    if (raw.startsWith("applied:")) {
+      return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) };
+    }
+    throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`);
+  }
+  // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple.
+  // Callers on both the PG and buffer paths serialise through this
+  // claim, which closes the race the buffer-side SETNX leaves open
+  // during the gate-transition burst window.
+  //
+  // The caller supplies an opaque `token` (UUID) on claim. The same token
+  // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and-
+  // act so a late release from a previous claimant whose TTL expired
+  // cannot erase a new owner's claim.
+  //
+  // - "claimed": we now own the claim, the caller proceeds with the
+  //   trigger pipeline and must `publishClaim` on success or
+  //   `releaseClaim` on failure.
+  // - "pending": another trigger owns the claim and hasn't published
+  //   yet; the caller should poll.
+  // - "resolved": the claim already holds a runId; the caller can
+  //   return that runId as a cached hit.
+  async claimIdempotency(input) {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const raw = await this.redis.claimMollifierIdempotency(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+      PENDING_PREFIX,
+      String(input.ttlSeconds)
+    );
+    if (raw === "claimed") return { kind: "claimed" };
+    if (raw === "pending") return { kind: "pending" };
+    if (raw.startsWith("resolved:")) {
+      return { kind: "resolved", runId: raw.slice("resolved:".length) };
+    }
+    throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`);
+  }
+  // Publish the winning runId to the claim so subsequent claimants /
+  // waiters see "resolved". TTL bounded by the customer's
+  // `idempotencyKeyExpiresAt` minus now; caller computes.
+  //
+  // Compare-and-set on the caller's token: if the current value isn't
+  // our pending marker (TTL expired and another claimant moved in, or
+  // someone else already published), the publish is a no-op. The caller
+  // can treat any such case as "we lost the claim" and re-read.
+  // Returns true if we published; false if the claim slot was no longer
+  // ours.
+  async publishClaim(input) {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const result = await this.redis.publishMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+      input.runId,
+      String(input.ttlSeconds)
+    );
+    return result === 1;
+  }
+  // Release the claim on pipeline error so waiters can re-claim and
+  // retry. Idempotent.
+  //
+  // Compare-and-delete on the caller's token: only deletes if the
+  // current value is exactly our pending marker. A late release from a
+  // claimant whose TTL expired is a no-op, so a new owner's claim is
+  // never wiped by a slow predecessor.
+  async releaseClaim(input) {
+    const claimKey = makeIdempotencyClaimKey(input);
+    await this.redis.releaseMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`
+    );
+  }
+  // Read the current claim value, used by the wait/poll loop on losers
+  // to detect "pending" → "resolved" transitions and timeouts.
+  async readClaim(input) {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const value = await this.redis.get(claimKey);
+    if (value === null) return null;
+    if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" };
+    return { kind: "resolved", runId: value };
+  }
+  // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
+  // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
+  // misses — same key may belong to a buffered run waiting to drain. The
+  // lookup self-heals: if the lookup points at an entry hash that's gone,
+  // we clear the lookup and report a miss. The clear is a compare-and-
+  // delete (only if the key still holds the stale runId we observed) so a
+  // fresh accept that rebinds the key between our GET and DEL isn't wiped.
+  async lookupIdempotency(input) {
+    const lookupKey = idempotencyLookupKeyFor(input);
+    const runId = await this.redis.get(lookupKey);
+    if (!runId) return null;
+    const entry = await this.getEntry(runId);
+    if (!entry) {
+      await this.redis.delMollifierKeyIfEquals(lookupKey, runId);
+      return null;
+    }
+    return runId;
+  }
+  // Clear the idempotency binding from a buffered run. Used by
+  // `ResetIdempotencyKeyService` alongside the existing PG-side
+  // `updateMany`. Returns the runId that was cleared, or null if no
+  // buffered run held this key.
+  async resetIdempotency(input) {
+    const lookupKey = idempotencyLookupKeyFor(input);
+    const claimKey = makeIdempotencyClaimKey(input);
+    const clearedRunId = await this.redis.resetMollifierIdempotency(
+      lookupKey,
+      "mollifier:entries:",
+      claimKey
+    );
+    return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
+  }
+  // Marks the entry as materialised (PG row written) and resets its TTL to
+  // the grace window. Entry hash persists past ack as a read-fallback
+  // safety net for the brief PG replica-lag window between drainer-side
+  // write and reader-side visibility. Also clears the associated
+  // idempotency lookup if one was set on accept.
   async ack(runId) {
-    await this.redis.del(`mollifier:entries:${runId}`);
+    await this.redis.ackMollifierEntry(
+      `mollifier:entries:${runId}`,
+      String(ACK_GRACE_TTL_SECONDS)
+    );
   }
   async requeue(runId) {
     await this.redis.requeueMollifierEntry(
@@ -16518,9 +16783,12 @@ var MollifierBuffer = class {
       "mollifier:org-envs:"
     );
   }
-  // Returns true if the entry transitioned to FAILED; false if the entry no
-  // longer exists (TTL expired between pop and fail). Caller can use the
-  // boolean to skip downstream FAILED handling for ghost entries.
+  // Returns true if a live entry was torn down; false if the entry no
+  // longer existed (a concurrent ack or manual cleanup removed it between
+  // pop and fail — there is no accept-time TTL). Note FAILED is not an
+  // observable state: the Lua marks the hash FAILED then DELs it in the
+  // same atomic script, so a subsequent getEntry returns null. Caller can
+  // use the boolean to skip downstream FAILED handling for ghost entries.
   async fail(runId, error) {
     const result = await this.redis.failMollifierEntry(
       `mollifier:entries:${runId}`,
@@ -16528,6 +16796,11 @@ var MollifierBuffer = class {
     );
     return result === 1;
   }
+  // Returns Redis-side TTL on the entry hash. Returns -1 for entries
+  // with no TTL — the steady state under the current design, where
+  // entries persist until drainer ack/fail. The ack grace TTL (30s
+  // post-materialise) is the only context where this returns a
+  // positive value; tests around the grace TTL still rely on it.
   async getEntryTtlSeconds(runId) {
     return this.redis.ttl(`mollifier:entries:${runId}`);
   }
@@ -16558,8 +16831,10 @@ var MollifierBuffer = class {
         local orgId = ARGV[3]
         local payload = ARGV[4]
         local createdAt = ARGV[5]
-        local ttlSeconds = tonumber(ARGV[6])
+        local createdAtMicros = ARGV[6]
         local orgEnvsPrefix = ARGV[7]
+        local idempotencyLookupKey = ARGV[8] or ''
+        local entryPrefix = ARGV[9]
         -- Idempotent: refuse if an entry for this runId already exists in any
         -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -16568,6 +16843,27 @@ var MollifierBuffer = class {
           return 0
         end
+        -- Idempotency-key dedup. If the caller passed a lookup key
+        -- and it's already bound to another buffered run, return the
+        -- winner's runId so the loser's API response can echo it as a
+        -- cached hit. Otherwise SET the lookup (no TTL \u2014 lifecycle is
+        -- paired with the entry hash; drainer ack/fail clear it
+        -- explicitly).
+        if idempotencyLookupKey ~= '' then
+          local existing = redis.call('GET', idempotencyLookupKey)
+          if existing then
+            -- Self-heal: only honour the binding if its entry hash still
+            -- exists. If the entry was evicted (maxmemory) but the lookup
+            -- survived, the binding is stale \u2014 fall through and rebind to
+            -- this run rather than returning a dead runId that would block
+            -- the key indefinitely. Mirrors lookupIdempotency's self-heal.
+            if redis.call('EXISTS', entryPrefix .. existing) == 1 then
+              return existing
+            end
+          end
+          redis.call('SET', idempotencyLookupKey, runId)
+        end
         redis.call('HSET', entryKey,
           'runId', runId,
           'envId', envId,
@@ -16575,8 +16871,20 @@ var MollifierBuffer = class {
           'payload', payload,
           'status', 'QUEUED',
           'attempts', '0',
-          'createdAt', createdAt)
-        redis.call('EXPIRE', entryKey, ttlSeconds)
+          'createdAt', createdAt,
+          'createdAtMicros', createdAtMicros,
+          'idempotencyLookupKey', idempotencyLookupKey,
+          'metadataVersion', '0')
+        -- No EXPIRE on the entry hash. Buffer entries persist until the
+        -- drainer ACKs (post-materialise grace) or FAILs them \u2014 the
+        -- drainer is the only recovery mechanism, so silent TTL-based
+        -- eviction would lose runs with no customer-visible signal.
+        -- Memory pressure from an offline drainer is the alertable
+        -- failure mode instead; see _ops/mollifier-ops.md.
+        -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so
+        -- insertion order == drain order (FIFO). createdAtMicros is kept
+        -- as a hash field for dwell metrics only \u2014 it is no longer a sort
+        -- key now that the buffer has no list/pagination surface.
         redis.call('LPUSH', queueKey, runId)
         -- Org-level membership: maintained atomically with the per-env
         -- queue so the drainer can walk orgs \u2192 envs-for-org and
@@ -16606,7 +16914,12 @@ var MollifierBuffer = class {
         local nextAttempts = tonumber(currentAttempts or '0') + 1
         redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
-        redis.call('LPUSH', queuePrefix .. envId, runId)
+        -- Requeue RPUSHes to the tail (the RPOP end) so a transiently
+        -- failed entry pops next rather than going to the back of the
+        -- line behind a fresh backlog. createdAt is immutable across
+        -- retries; the drainer's maxAttempts caps the
+        -- retry loop so a poisoned entry doesn't head-of-line forever.
+        redis.call('RPUSH', queuePrefix .. envId, runId)
         -- Re-track the org/env: pop may have SREM'd them when the queue
         -- last emptied. SADDs are idempotent if the values are still
         -- present.
@@ -16640,11 +16953,13 @@ var MollifierBuffer = class {
           end
         end
-        -- Loop to skip orphan queue references \u2014 runIds whose entry hash has
-        -- expired (TTL hit). HSET on a missing key would CREATE a partial
-        -- hash without a TTL, leaking memory. The loop is bounded by queue
-        -- length; entire Lua script remains atomic.
+        -- Loop to skip orphan queue references \u2014 runIds whose entry hash is
+        -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry
+        -- no TTL of their own). HSET on a missing key would CREATE a
+        -- partial hash without a TTL, leaking memory. The loop is bounded
+        -- by queue length; entire Lua script remains atomic.
         while true do
+          -- RPOP returns the tail member (oldest, FIFO), or false when empty.
           local runId = redis.call('RPOP', queueKey)
           if not runId then
             -- Queue is empty AND we have no entry to read orgId from, so
@@ -16662,16 +16977,260 @@ var MollifierBuffer = class {
               result[raw[i]] = raw[i + 1]
             end
             -- Prune org-level membership if this pop drained the queue.
-            -- Atomic with the RPOP above \u2014 a concurrent accept AFTER this
-            -- script will SADD both back along with its LPUSH.
+            -- Atomic with the RPOP above \u2014 a concurrent accept AFTER
+            -- this script will SADD both back along with its LPUSH.
             if redis.call('LLEN', queueKey) == 0 then
               pruneOrgMembership(result['orgId'])
             end
             return cjson.encode(result)
           end
-          -- Orphan queue reference: entry TTL expired while runId was queued.
-          -- Discard the reference and loop to the next.
+          -- Orphan queue reference: entry hash gone (evicted) while runId
+          -- was queued. Discard the reference and loop to the next.
+        end
+      `
+    });
+    this.redis.defineCommand("casSetMollifierMetadata", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local expectedVersion = tonumber(ARGV[1])
+        local newMetadata = ARGV[2]
+        local newMetadataType = ARGV[3]
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+        local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0'
+        local currentVersion = tonumber(currentVersionStr) or 0
+        if currentVersion ~= expectedVersion then
+          return 'conflict:' .. tostring(currentVersion)
+        end
+        -- Write the new metadata onto the snapshot's payload JSON. We
+        -- keep the rest of the payload intact \u2014 only metadata/metadataType
+        -- change. metadataVersion is denormalised on the hash for cheap
+        -- CAS reads; it's intentionally NOT stored inside the payload
+        -- itself (PG-side metadataVersion is a column, not a JSON field).
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+        payload.metadata = newMetadata
+        payload.metadataType = newMetadataType
+        local newVersion = currentVersion + 1
+        redis.call('HSET', entryKey,
+          'payload', cjson.encode(payload),
+          'metadataVersion', tostring(newVersion))
+        return 'applied:' .. tostring(newVersion)
+      `
+    });
+    this.redis.defineCommand("claimMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local pendingMarker = ARGV[1]   -- "pending:<caller-token>"
+        local pendingPrefix = ARGV[2]   -- "pending:"
+        local ttl = tonumber(ARGV[3])
+        -- SETNX-with-TTL: atomic; only one caller can win.
+        local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl)
+        if won then
+          return 'claimed'
+        end
+        local existing = redis.call('GET', claimKey)
+        if not existing then
+          -- The slot expired in the race window between the SET NX
+          -- failing and this GET. It's free now \u2014 claim it so we don't
+          -- string.sub a nil and error out.
+          redis.call('SET', claimKey, pendingMarker, 'EX', ttl)
+          return 'claimed'
+        end
+        -- Any "pending:*" value is a live claim \u2014 the caller-supplied
+        -- token differentiates ownership but is opaque to losers.
+        if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
+          return 'pending'
+        end
+        return 'resolved:' .. existing
+      `
+    });
+    this.redis.defineCommand("publishMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+        local runId = ARGV[2]
+        local ttl = tonumber(ARGV[3])
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('SET', claimKey, runId, 'EX', ttl)
+          return 1
+        end
+        return 0
+      `
+    });
+    this.redis.defineCommand("releaseMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('DEL', claimKey)
+          return 1
+        end
+        return 0
+      `
+    });
+    this.redis.defineCommand("resetMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local lookupKey = KEYS[1]
+        local entryPrefix = ARGV[1]
+        local claimKey = ARGV[2]
+        -- Reset reopens the key across BOTH the buffer lookup and the
+        -- cross-store pre-gate claim pointer. Without clearing the claim,
+        -- a resolved/pending claim would keep deduping new triggers for
+        -- the rest of its TTL even though the binding was reset. DEL is
+        -- unconditional \u2014 the claim is gone regardless of whether a
+        -- buffered run currently holds the lookup.
+        redis.call('DEL', claimKey)
+        local runId = redis.call('GET', lookupKey)
+        if not runId then
+          return ''
+        end
+        local entryKey = entryPrefix .. runId
+        if redis.call('EXISTS', entryKey) == 0 then
+          -- Stale lookup. Lazy cleanup.
+          redis.call('DEL', lookupKey)
+          return ''
+        end
+        -- Clear the idempotency fields on the snapshot payload so the
+        -- drainer's eventual engine.trigger call inserts a PG row
+        -- without the key set.
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        if payloadJson then
+          local ok, payload = pcall(cjson.decode, payloadJson)
+          if ok then
+            payload.idempotencyKey = cjson.null
+            payload.idempotencyKeyExpiresAt = cjson.null
+            redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+          end
         end
+        -- Clear the denormalised lookup pointer on the hash so a later
+        -- ack doesn't try to DEL a key that's already gone.
+        redis.call('HSET', entryKey, 'idempotencyLookupKey', '')
+        redis.call('DEL', lookupKey)
+        return runId
+      `
+    });
+    this.redis.defineCommand("mutateMollifierSnapshot", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local patchJson = ARGV[1]
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+        local patch = cjson.decode(patchJson)
+        if patch.type == 'append_tags' then
+          -- cjson decode of an absent or empty-array field gives nil or
+          -- an empty table; we rebuild as a dense array. Existing tags
+          -- are preserved; new tags are appended only if not present.
+          local existing = payload.tags or {}
+          local seen = {}
+          local merged = {}
+          for _, t in ipairs(existing) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          for _, t in ipairs(patch.tags or {}) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          -- Cap the deduped count when the caller supplies a limit, so a
+          -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API.
+          -- Reject the whole patch (write nothing) rather than truncating.
+          if patch.maxTags ~= nil and #merged > patch.maxTags then
+            return 'limit_exceeded'
+          end
+          payload.tags = merged
+        elseif patch.type == 'set_metadata' then
+          payload.metadata = patch.metadata
+          payload.metadataType = patch.metadataType
+          -- Bump the denormalised metadataVersion so an in-flight
+          -- casSetMetadata (optimistic CAS keyed on this counter) sees
+          -- the concurrent write as a version conflict and retries,
+          -- instead of clobbering it under a now-stale expectedVersion.
+          local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0
+          redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1))
+        elseif patch.type == 'set_delay' then
+          payload.delayUntil = patch.delayUntil
+        elseif patch.type == 'mark_cancelled' then
+          payload.cancelledAt = patch.cancelledAt
+          payload.cancelReason = patch.cancelReason
+        else
+          return 'busy'
+        end
+        redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+        return 'applied_to_snapshot'
+      `
+    });
+    this.redis.defineCommand("ackMollifierEntry", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local graceTtlSeconds = tonumber(ARGV[1])
+        -- Guard: never create a partial entry. If the hash is gone between
+        -- pop and ack (concurrent fail or eviction \u2014 QUEUED entries carry
+        -- no TTL), the run is gone, nothing to mark materialised.
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 0
+        end
+        -- If the entry was accepted with an idempotency key, the lookup
+        -- string was stored on the hash at accept time. Clear it now \u2014
+        -- PG becomes canonical for the key post-materialisation.
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+        redis.call('HSET', entryKey, 'materialised', 'true')
+        redis.call('EXPIRE', entryKey, graceTtlSeconds)
+        return 1
       `
     });
     this.redis.defineCommand("failMollifierEntry", {
@@ -16680,16 +17239,43 @@ var MollifierBuffer = class {
         local entryKey = KEYS[1]
         local errorPayload = ARGV[1]
-        -- Guard: never create a partial entry. If the hash expired between
-        -- pop and fail, the run is gone \u2014 nothing to mark FAILED.
+        -- Guard: nothing to mark FAILED if the hash is gone (concurrent
+        -- ack/manual cleanup). Returning 0 lets the caller distinguish
+        -- "marked failed" from "no-op".
         if redis.call('EXISTS', entryKey) == 0 then
           return 0
         end
         redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload)
+        -- Terminal-failure contract: the drainer's onTerminalFailure
+        -- callback (see MollifierDrainer.processEntry) has been
+        -- invoked before this fail() and has either written a
+        -- SYSTEM_FAILURE PG row (for both non-retryable AND
+        -- max-attempts-exhausted retryable errors) or chosen to fall
+        -- through (genuinely bad snapshot the engine can't materialise
+        -- a row from). Either way the buffer entry is no longer
+        -- load-bearing here. Clear the idempotency lookup -- PG's
+        -- unique constraint is the canonical dedup mechanism
+        -- post-materialise -- and drop the entry hash so failed runs
+        -- don't accrete forever now that there's no accept-time TTL.
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+        redis.call('DEL', entryKey)
         return 1
       `
     });
+    this.redis.defineCommand("delMollifierKeyIfEquals", { // compare-and-delete script: DELs KEYS[1] only when GET KEYS[1] equals ARGV[1]
+      numberOfKeys: 1, // KEYS[1] is the only key touched; the script returns DEL's result on a match and 0 otherwise
+      lua: `
+        if redis.call('GET', KEYS[1]) == ARGV[1] then
+          return redis.call('DEL', KEYS[1])
+        end
+        return 0
+      `
+    });
     this.redis.defineCommand("mollifierEvaluateTrip", {
       numberOfKeys: 2,
       lua: `
@@ -16717,6 +17303,7 @@ var MollifierBuffer = class {
 var MollifierDrainer = class {
   buffer;
   handler;
+  onTerminalFailure;
   maxAttempts;
   isRetryable;
   pollIntervalMs;
@@ -16734,6 +17321,7 @@ var MollifierDrainer = class {
   constructor(options) {
     this.buffer = options.buffer;
     this.handler = options.handler;
+    this.onTerminalFailure = options.onTerminalFailure;
     this.maxAttempts = options.maxAttempts;
     this.isRetryable = options.isRetryable;
     this.pollIntervalMs = options.pollIntervalMs ?? 100;
@@ -16918,13 +17506,46 @@ var MollifierDrainer = class {
         });
         return "failed";
       }
+      const cause = this.isRetryable(err) ? "max-attempts-exhausted" : "non-retryable";
       const code = err instanceof Error ? err.name : "Unknown";
       const message = err instanceof Error ? err.message : String(err);
+      if (this.onTerminalFailure) {
+        try {
+          await this.onTerminalFailure({
+            runId: entry.runId,
+            envId: entry.envId,
+            orgId: entry.orgId,
+            payload: deserialiseSnapshot(entry.payload),
+            attempts: nextAttempts,
+            createdAt: entry.createdAt,
+            error: { code, message },
+            cause
+          });
+        } catch (writeErr) {
+          if (this.isRetryable(writeErr)) {
+            await this.buffer.requeue(entry.runId);
+            this.logger.warn(
+              "MollifierDrainer: terminal-failure callback retryable; requeued",
+              {
+                runId: entry.runId,
+                attempts: nextAttempts,
+                writeErr
+              }
+            );
+            return "failed";
+          }
+          this.logger.error("MollifierDrainer: terminal-failure callback failed", {
+            runId: entry.runId,
+            writeErr
+          });
+        }
+      }
       await this.buffer.fail(entry.runId, { code, message });
       this.logger.error("MollifierDrainer: terminal failure", {
         runId: entry.runId,
         code,
-        message
+        message,
+        cause
       });
       return "failed";
     }
@@ -16965,7 +17586,9 @@ exports.WorkerQueueManager = WorkerQueueManager;
 exports.createDefaultRetryStrategy = createDefaultRetryStrategy;
 exports.defaultRetryOptions = defaultRetryOptions;
 exports.deserialiseSnapshot = deserialiseSnapshot;
+exports.idempotencyLookupKeyFor = idempotencyLookupKeyFor;
 exports.isAbortError = isAbortError;
+exports.makeIdempotencyClaimKey = makeIdempotencyClaimKey;
 exports.noopTelemetry = noopTelemetry;
 exports.serialiseSnapshot = serialiseSnapshot;
 //# sourceMappingURL=index.cjs.map