@trigger.dev/redis-worker 4.5.0-rc.3 → 4.5.0-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -16382,6 +16382,7 @@ var stringToDate = zod.z.string().transform((v, ctx) => {
16382
16382
  }
16383
16383
  return d;
16384
16384
  });
16385
+ var stringToBool = zod.z.union([zod.z.literal("true"), zod.z.literal("false")]).transform((v) => v === "true");
16385
16386
  var stringToError = zod.z.string().transform((v, ctx) => {
16386
16387
  try {
16387
16388
  return BufferEntryError.parse(JSON.parse(v));
@@ -16398,6 +16399,27 @@ var BufferEntrySchema = zod.z.object({
16398
16399
  status: BufferEntryStatus,
16399
16400
  attempts: stringToInt,
16400
16401
  createdAt: stringToDate,
16402
+ // Microsecond epoch of accept time, kept as a hash field for dwell
16403
+ // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted
16404
+ // so an entry written by an accept Lua predating this field — or one
16405
+ // surviving across the deploy that introduced it — still parses instead
16406
+ // of being silently dropped on pop.
16407
+ createdAtMicros: stringToInt.default("0"),
16408
+ // Drainer-ack flag: `true` once the drainer has materialised this run
16409
+ // into PG. The hash persists for a short grace TTL after ack so direct
16410
+ // reads (retrieve, trace, etc.) still resolve while PG replica lag
16411
+ // settles. Absent on pre-ack entries.
16412
+ materialised: stringToBool.default("false"),
16413
+ // Denormalised pointer to the Redis idempotency lookup key (set when
16414
+ // the run was accepted with an idempotency key, empty otherwise). The
16415
+ // ack Lua reads this to DEL the lookup atomically with marking the
16416
+ // entry materialised.
16417
+ idempotencyLookupKey: zod.z.string().optional().default(""),
16418
+ // Optimistic-lock counter for the snapshot's `metadata` field.
16419
+ // Incremented atomically by the CAS metadata Lua. Matches the
16420
+ // semantic of `TaskRun.metadataVersion` on the PG side (which the
16421
+ // UpdateMetadataService uses for the same retry-on-conflict pattern).
16422
+ metadataVersion: stringToInt.default("0"),
16401
16423
  lastError: stringToError.optional()
16402
16424
  });
16403
16425
  function serialiseSnapshot(snapshot) {
@@ -16408,19 +16430,32 @@ function deserialiseSnapshot(serialised) {
16408
16430
  }
16409
16431
 
16410
16432
  // src/mollifier/buffer.ts
16433
+ var ACK_GRACE_TTL_SECONDS = 30;
16434
+ function mollifierReconnectDelayMs(times, random = Math.random) {
16435
+ const base = Math.min(times * 50, 1e3);
16436
+ const half = Math.floor(base / 2);
16437
+ return half + Math.round(random() * (base - half));
16438
+ }
16439
+ function encodeKeyPart(value) {
16440
+ return Buffer.from(value, "utf8").toString("base64url");
16441
+ }
16442
+ function idempotencyLookupKeyFor(input) {
16443
+ return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
16444
+ }
16445
+ var PENDING_PREFIX = "pending:";
16446
+ function makeIdempotencyClaimKey(input) {
16447
+ return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
16448
+ }
16411
16449
  var MollifierBuffer = class {
16412
16450
  redis;
16413
- entryTtlSeconds;
16414
16451
  logger;
16415
16452
  constructor(options) {
16416
- this.entryTtlSeconds = options.entryTtlSeconds;
16417
16453
  this.logger = options.logger ?? new logger$1.Logger("MollifierBuffer", "debug");
16418
16454
  this.redis = createRedisClient(
16419
16455
  {
16420
16456
  ...options.redisOptions,
16421
16457
  retryStrategy(times) {
16422
- const delay = Math.min(times * 50, 1e3);
16423
- return delay;
16458
+ return mollifierReconnectDelayMs(times);
16424
16459
  },
16425
16460
  maxRetriesPerRequest: 20
16426
16461
  },
@@ -16432,14 +16467,26 @@ var MollifierBuffer = class {
16432
16467
  );
16433
16468
  this.#registerCommands();
16434
16469
  }
16435
- // Returns true if the entry was newly written; false if a duplicate runId
16436
- // was already buffered (idempotent no-op). Callers can use the boolean to
16437
- // record a duplicate-accept metric without affecting buffer state.
16470
+ // Three outcomes:
16471
+ // - { kind: "accepted" } entry was newly written.
16472
+ // - { kind: "duplicate_run_id" } runId was already buffered (idempotent
16473
+ // no-op, same semantic as the previous boolean-false return).
16474
+ // - { kind: "duplicate_idempotency", existingRunId } — the (env, task,
16475
+ // idempotencyKey) tuple was already bound to another buffered run.
16476
+ // The Lua's atomic SETNX is the race-winner; the second caller gets
16477
+ // the winner's runId so it can return that as the trigger response.
16438
16478
  async accept(input) {
16439
16479
  const entryKey = `mollifier:entries:${input.runId}`;
16440
16480
  const queueKey = `mollifier:queue:${input.envId}`;
16441
16481
  const orgsKey = "mollifier:orgs";
16442
- const createdAt = (/* @__PURE__ */ new Date()).toISOString();
16482
+ const nowMs = Date.now();
16483
+ const createdAt = new Date(nowMs).toISOString();
16484
+ const createdAtMicros = nowMs * 1e3;
16485
+ const idempotencyLookupKey = input.idempotencyKey && input.taskIdentifier ? idempotencyLookupKeyFor({
16486
+ envId: input.envId,
16487
+ taskIdentifier: input.taskIdentifier,
16488
+ idempotencyKey: input.idempotencyKey
16489
+ }) : "";
16443
16490
  const result = await this.redis.acceptMollifierEntry(
16444
16491
  entryKey,
16445
16492
  queueKey,
@@ -16449,10 +16496,16 @@ var MollifierBuffer = class {
16449
16496
  input.orgId,
16450
16497
  input.payload,
16451
16498
  createdAt,
16452
- String(this.entryTtlSeconds),
16453
- "mollifier:org-envs:"
16499
+ String(createdAtMicros),
16500
+ "mollifier:org-envs:",
16501
+ idempotencyLookupKey,
16502
+ "mollifier:entries:"
16454
16503
  );
16455
- return result === 1;
16504
+ if (typeof result === "string" && result.length > 0) {
16505
+ return { kind: "duplicate_idempotency", existingRunId: result };
16506
+ }
16507
+ if (result === 1) return { kind: "accepted" };
16508
+ return { kind: "duplicate_run_id" };
16456
16509
  }
16457
16510
  async pop(envId) {
16458
16511
  const queueKey = `mollifier:queue:${envId}`;
@@ -16506,8 +16559,220 @@ var MollifierBuffer = class {
16506
16559
  async listEnvsForOrg(orgId) {
16507
16560
  return this.redis.smembers(`mollifier:org-envs:${orgId}`);
16508
16561
  }
16562
+ // Read-only enumeration of currently-queued entries for a single env.
16563
+ // Used by the stale-sweep to compute per-entry dwell time, so order is
16564
+ // immaterial — LRANGE returns them newest-first (LPUSH head) but the
16565
+ // caller scans the whole window. Non-destructive: the drainer still
16566
+ // RPOPs these entries in FIFO order.
16567
+ //
16568
+ // The entry HGETALLs are issued in a single pipelined batch (one
16569
+ // network round-trip instead of N) — at the stale-sweep's default
16570
+ // maxCount=1000 the serial implementation cost ~1000 RTTs per env,
16571
+ // which dominated sweep wall-time at any meaningful backlog.
16572
+ //
16573
+ // A missing entry (empty hash) is skipped: the drainer's RPOP+DEL of
16574
+ // the entry hash can race our LRANGE→HGETALL window, so a runId on
16575
+ // the queue with no backing hash is an expected concurrency outcome,
16576
+ // not an error.
16577
+ async listEntriesForEnv(envId, maxCount) {
16578
+ if (maxCount <= 0) return [];
16579
+ const runIds = await this.redis.lrange(
16580
+ `mollifier:queue:${envId}`,
16581
+ 0,
16582
+ maxCount - 1
16583
+ );
16584
+ if (runIds.length === 0) return [];
16585
+ const pipeline = this.redis.pipeline();
16586
+ for (const runId of runIds) {
16587
+ pipeline.hgetall(`mollifier:entries:${runId}`);
16588
+ }
16589
+ const results = await pipeline.exec();
16590
+ if (!results) return [];
16591
+ const entries = [];
16592
+ for (let i = 0; i < results.length; i++) {
16593
+ const [err, raw] = results[i];
16594
+ if (err) {
16595
+ this.logger.error("MollifierBuffer.listEntriesForEnv: hgetall failed", {
16596
+ runId: runIds[i],
16597
+ err: err.message
16598
+ });
16599
+ continue;
16600
+ }
16601
+ if (!raw || Object.keys(raw).length === 0) continue;
16602
+ const parsed = BufferEntrySchema.safeParse(raw);
16603
+ if (!parsed.success) {
16604
+ this.logger.error("MollifierBuffer.listEntriesForEnv: invalid entry shape", {
16605
+ runId: runIds[i],
16606
+ errors: parsed.error.flatten()
16607
+ });
16608
+ continue;
16609
+ }
16610
+ entries.push(parsed.data);
16611
+ }
16612
+ return entries;
16613
+ }
16614
+ // Atomic snapshot mutation. Used by customer-mutation API endpoints
16615
+ // (tags, metadata-put, reschedule, cancel) when the run is still in
16616
+ // the buffer. Four outcomes:
16617
+ // - "applied_to_snapshot": entry was QUEUED + not materialised; the
16618
+ // drainer will read the patched payload on its next pop.
16619
+ // - "not_found": no entry hash exists for this runId — including a
16620
+ // FAILED entry, whose hash the drainer-terminal `fail` path DELs.
16621
+ // - "busy": entry is DRAINING or materialised. The API
16622
+ // wait-and-bounces through PG.
16623
+ // - "limit_exceeded": an `append_tags` patch carrying `maxTags` would
16624
+ // push the deduped tag count over the cap; nothing is written.
16625
+ async mutateSnapshot(runId, patch) {
16626
+ const result = await this.redis.mutateMollifierSnapshot(
16627
+ `mollifier:entries:${runId}`,
16628
+ JSON.stringify(patch)
16629
+ );
16630
+ if (result === "applied_to_snapshot" || result === "not_found" || result === "busy" || result === "limit_exceeded") {
16631
+ return result;
16632
+ }
16633
+ throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`);
16634
+ }
16635
+ // Optimistic compare-and-swap on the snapshot's metadata. Caller reads
16636
+ // the current metadataVersion via getEntry, applies operations in JS via
16637
+ // `applyMetadataOperations`, then calls this with the new metadata + the
16638
+ // expected version. Lua refuses if the version has moved (caller retries
16639
+ // up to N times). Mirrors the PG-side `UpdateMetadataService` retry
16640
+ // loop so concurrent increment/append operations don't lose deltas.
16641
+ async casSetMetadata(input) {
16642
+ const entryKey = `mollifier:entries:${input.runId}`;
16643
+ const raw = await this.redis.casSetMollifierMetadata(
16644
+ entryKey,
16645
+ String(input.expectedVersion),
16646
+ input.newMetadata,
16647
+ input.newMetadataType
16648
+ );
16649
+ if (raw === "not_found") return { kind: "not_found" };
16650
+ if (raw === "busy") return { kind: "busy" };
16651
+ if (raw.startsWith("conflict:")) {
16652
+ return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) };
16653
+ }
16654
+ if (raw.startsWith("applied:")) {
16655
+ return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) };
16656
+ }
16657
+ throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`);
16658
+ }
16659
+ // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One
16660
+ // call across both PG and buffer paths serialises through this claim;
16661
+ // closes the race the buffer-side SETNX leaves open during the
16662
+ // gate-transition burst window.
16663
+ //
16664
+ // The caller supplies an opaque `token` (UUID) on claim. The same token
16665
+ // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and-
16666
+ // act so a late release from a previous claimant whose TTL expired
16667
+ // cannot erase a new owner's claim.
16668
+ //
16669
+ // - "claimed": we now own the claim, the caller proceeds with the
16670
+ // trigger pipeline and must `publishClaim` on success or
16671
+ // `releaseClaim` on failure.
16672
+ // - "pending": another trigger owns the claim and hasn't published
16673
+ // yet; the caller should poll.
16674
+ // - "resolved": the claim already holds a runId; the caller can
16675
+ // return that runId as a cached hit.
16676
+ async claimIdempotency(input) {
16677
+ const claimKey = makeIdempotencyClaimKey(input);
16678
+ const raw = await this.redis.claimMollifierIdempotency(
16679
+ claimKey,
16680
+ `${PENDING_PREFIX}${input.token}`,
16681
+ PENDING_PREFIX,
16682
+ String(input.ttlSeconds)
16683
+ );
16684
+ if (raw === "claimed") return { kind: "claimed" };
16685
+ if (raw === "pending") return { kind: "pending" };
16686
+ if (raw.startsWith("resolved:")) {
16687
+ return { kind: "resolved", runId: raw.slice("resolved:".length) };
16688
+ }
16689
+ throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`);
16690
+ }
16691
+ // Publish the winning runId to the claim so subsequent claimants /
16692
+ // waiters see "resolved". TTL bounded by the customer's
16693
+ // `idempotencyKeyExpiresAt` minus now; caller computes.
16694
+ //
16695
+ // Compare-and-set on the caller's token: if the current value isn't
16696
+ // our pending marker (TTL expired and another claimant moved in, or
16697
+ // someone else already published), the publish is a no-op. The caller
16698
+ // can treat any such case as "we lost the claim" and re-read.
16699
+ // Returns true if we published; false if the claim slot was no longer
16700
+ // ours.
16701
+ async publishClaim(input) {
16702
+ const claimKey = makeIdempotencyClaimKey(input);
16703
+ const result = await this.redis.publishMollifierClaim(
16704
+ claimKey,
16705
+ `${PENDING_PREFIX}${input.token}`,
16706
+ input.runId,
16707
+ String(input.ttlSeconds)
16708
+ );
16709
+ return result === 1;
16710
+ }
16711
+ // Release the claim on pipeline error so waiters can re-claim and
16712
+ // retry. Idempotent.
16713
+ //
16714
+ // Compare-and-delete on the caller's token: only deletes if the
16715
+ // current value is exactly our pending marker. A late release from a
16716
+ // claimant whose TTL expired is a no-op, so a new owner's claim is
16717
+ // never wiped by a slow predecessor.
16718
+ async releaseClaim(input) {
16719
+ const claimKey = makeIdempotencyClaimKey(input);
16720
+ await this.redis.releaseMollifierClaim(
16721
+ claimKey,
16722
+ `${PENDING_PREFIX}${input.token}`
16723
+ );
16724
+ }
16725
+ // Read the current claim value, used by the wait/poll loop on losers
16726
+ // to detect "pending" → "resolved" transitions and timeouts.
16727
+ async readClaim(input) {
16728
+ const claimKey = makeIdempotencyClaimKey(input);
16729
+ const value = await this.redis.get(claimKey);
16730
+ if (value === null) return null;
16731
+ if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" };
16732
+ return { kind: "resolved", runId: value };
16733
+ }
16734
+ // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
16735
+ // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
16736
+ // misses — same key may belong to a buffered run waiting to drain. The
16737
+ // lookup self-heals: if the lookup points at an entry hash that's gone,
16738
+ // we clear the lookup and report a miss. The clear is a compare-and-
16739
+ // delete (only if the key still holds the stale runId we observed) so a
16740
+ // fresh accept that rebinds the key between our GET and DEL isn't wiped.
16741
+ async lookupIdempotency(input) {
16742
+ const lookupKey = idempotencyLookupKeyFor(input);
16743
+ const runId = await this.redis.get(lookupKey);
16744
+ if (!runId) return null;
16745
+ const entry = await this.getEntry(runId);
16746
+ if (!entry) {
16747
+ await this.redis.delMollifierKeyIfEquals(lookupKey, runId);
16748
+ return null;
16749
+ }
16750
+ return runId;
16751
+ }
16752
+ // Clear the idempotency binding from a buffered run. Used by
16753
+ // `ResetIdempotencyKeyService` alongside the existing PG-side
16754
+ // `updateMany`. Returns the runId that was cleared, or null if no
16755
+ // buffered run held this key.
16756
+ async resetIdempotency(input) {
16757
+ const lookupKey = idempotencyLookupKeyFor(input);
16758
+ const claimKey = makeIdempotencyClaimKey(input);
16759
+ const clearedRunId = await this.redis.resetMollifierIdempotency(
16760
+ lookupKey,
16761
+ "mollifier:entries:",
16762
+ claimKey
16763
+ );
16764
+ return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
16765
+ }
16766
+ // Marks the entry as materialised (PG row written) and resets its TTL to
16767
+ // the grace window. Entry hash persists past ack as a read-fallback
16768
+ // safety net for the brief PG replica-lag window between drainer-side
16769
+ // write and reader-side visibility. Also clears the associated
16770
+ // idempotency lookup if one was set on accept.
16509
16771
  async ack(runId) {
16510
- await this.redis.del(`mollifier:entries:${runId}`);
16772
+ await this.redis.ackMollifierEntry(
16773
+ `mollifier:entries:${runId}`,
16774
+ String(ACK_GRACE_TTL_SECONDS)
16775
+ );
16511
16776
  }
16512
16777
  async requeue(runId) {
16513
16778
  await this.redis.requeueMollifierEntry(
@@ -16518,9 +16783,12 @@ var MollifierBuffer = class {
16518
16783
  "mollifier:org-envs:"
16519
16784
  );
16520
16785
  }
16521
- // Returns true if the entry transitioned to FAILED; false if the entry no
16522
- // longer exists (TTL expired between pop and fail). Caller can use the
16523
- // boolean to skip downstream FAILED handling for ghost entries.
16786
+ // Returns true if a live entry was torn down; false if the entry no
16787
+ // longer existed (a concurrent ack or manual cleanup removed it between
16788
+ // pop and fail — there is no accept-time TTL). Note FAILED is not an
16789
+ // observable state: the Lua marks the hash FAILED then DELs it in the
16790
+ // same atomic script, so a subsequent getEntry returns null. Caller can
16791
+ // use the boolean to skip downstream FAILED handling for ghost entries.
16524
16792
  async fail(runId, error) {
16525
16793
  const result = await this.redis.failMollifierEntry(
16526
16794
  `mollifier:entries:${runId}`,
@@ -16528,6 +16796,11 @@ var MollifierBuffer = class {
16528
16796
  );
16529
16797
  return result === 1;
16530
16798
  }
16799
+ // Returns Redis-side TTL on the entry hash. Returns -1 for entries
16800
+ // with no TTL — the steady state under the current design, where
16801
+ // entries persist until drainer ack/fail. The ack grace TTL (30s
16802
+ // post-materialise) is the only context where this returns a
16803
+ // positive value; tests around the grace TTL still rely on it.
16531
16804
  async getEntryTtlSeconds(runId) {
16532
16805
  return this.redis.ttl(`mollifier:entries:${runId}`);
16533
16806
  }
@@ -16558,8 +16831,10 @@ var MollifierBuffer = class {
16558
16831
  local orgId = ARGV[3]
16559
16832
  local payload = ARGV[4]
16560
16833
  local createdAt = ARGV[5]
16561
- local ttlSeconds = tonumber(ARGV[6])
16834
+ local createdAtMicros = ARGV[6]
16562
16835
  local orgEnvsPrefix = ARGV[7]
16836
+ local idempotencyLookupKey = ARGV[8] or ''
16837
+ local entryPrefix = ARGV[9]
16563
16838
 
16564
16839
  -- Idempotent: refuse if an entry for this runId already exists in any
16565
16840
  -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -16568,6 +16843,27 @@ var MollifierBuffer = class {
16568
16843
  return 0
16569
16844
  end
16570
16845
 
16846
+ -- Idempotency-key dedup. If the caller passed a lookup key
16847
+ -- and it's already bound to another buffered run, return the
16848
+ -- winner's runId so the loser's API response can echo it as a
16849
+ -- cached hit. Otherwise SET the lookup (no TTL \u2014 lifecycle is
16850
+ -- paired with the entry hash; drainer ack/fail clear it
16851
+ -- explicitly).
16852
+ if idempotencyLookupKey ~= '' then
16853
+ local existing = redis.call('GET', idempotencyLookupKey)
16854
+ if existing then
16855
+ -- Self-heal: only honour the binding if its entry hash still
16856
+ -- exists. If the entry was evicted (maxmemory) but the lookup
16857
+ -- survived, the binding is stale \u2014 fall through and rebind to
16858
+ -- this run rather than returning a dead runId that would block
16859
+ -- the key indefinitely. Mirrors lookupIdempotency's self-heal.
16860
+ if redis.call('EXISTS', entryPrefix .. existing) == 1 then
16861
+ return existing
16862
+ end
16863
+ end
16864
+ redis.call('SET', idempotencyLookupKey, runId)
16865
+ end
16866
+
16571
16867
  redis.call('HSET', entryKey,
16572
16868
  'runId', runId,
16573
16869
  'envId', envId,
@@ -16575,8 +16871,20 @@ var MollifierBuffer = class {
16575
16871
  'payload', payload,
16576
16872
  'status', 'QUEUED',
16577
16873
  'attempts', '0',
16578
- 'createdAt', createdAt)
16579
- redis.call('EXPIRE', entryKey, ttlSeconds)
16874
+ 'createdAt', createdAt,
16875
+ 'createdAtMicros', createdAtMicros,
16876
+ 'idempotencyLookupKey', idempotencyLookupKey,
16877
+ 'metadataVersion', '0')
16878
+ -- No EXPIRE on the entry hash. Buffer entries persist until the
16879
+ -- drainer ACKs (post-materialise grace) or FAILs them \u2014 the
16880
+ -- drainer is the only recovery mechanism, so silent TTL-based
16881
+ -- eviction would lose runs with no customer-visible signal.
16882
+ -- Memory pressure from an offline drainer is the alertable
16883
+ -- failure mode instead; see _ops/mollifier-ops.md.
16884
+ -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so
16885
+ -- insertion order == drain order (FIFO). createdAtMicros is kept
16886
+ -- as a hash field for dwell metrics only \u2014 it is no longer a sort
16887
+ -- key now that the buffer has no list/pagination surface.
16580
16888
  redis.call('LPUSH', queueKey, runId)
16581
16889
  -- Org-level membership: maintained atomically with the per-env
16582
16890
  -- queue so the drainer can walk orgs \u2192 envs-for-org and
@@ -16606,7 +16914,12 @@ var MollifierBuffer = class {
16606
16914
  local nextAttempts = tonumber(currentAttempts or '0') + 1
16607
16915
 
16608
16916
  redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
16609
- redis.call('LPUSH', queuePrefix .. envId, runId)
16917
+ -- Requeue RPUSHes to the tail (the RPOP end) so a transiently
16918
+ -- failed entry pops next rather than going to the back of the
16919
+ -- line behind a fresh backlog. createdAt is immutable across
16920
+ -- retries; the drainer's maxAttempts caps the
16921
+ -- retry loop so a poisoned entry doesn't head-of-line forever.
16922
+ redis.call('RPUSH', queuePrefix .. envId, runId)
16610
16923
  -- Re-track the org/env: pop may have SREM'd them when the queue
16611
16924
  -- last emptied. SADDs are idempotent if the values are still
16612
16925
  -- present.
@@ -16640,11 +16953,13 @@ var MollifierBuffer = class {
16640
16953
  end
16641
16954
  end
16642
16955
 
16643
- -- Loop to skip orphan queue references \u2014 runIds whose entry hash has
16644
- -- expired (TTL hit). HSET on a missing key would CREATE a partial
16645
- -- hash without a TTL, leaking memory. The loop is bounded by queue
16646
- -- length; entire Lua script remains atomic.
16956
+ -- Loop to skip orphan queue references \u2014 runIds whose entry hash is
16957
+ -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry
16958
+ -- no TTL of their own). HSET on a missing key would CREATE a
16959
+ -- partial hash without a TTL, leaking memory. The loop is bounded
16960
+ -- by queue length; entire Lua script remains atomic.
16647
16961
  while true do
16962
+ -- RPOP returns the tail member (oldest, FIFO), or false when empty.
16648
16963
  local runId = redis.call('RPOP', queueKey)
16649
16964
  if not runId then
16650
16965
  -- Queue is empty AND we have no entry to read orgId from, so
@@ -16662,16 +16977,260 @@ var MollifierBuffer = class {
16662
16977
  result[raw[i]] = raw[i + 1]
16663
16978
  end
16664
16979
  -- Prune org-level membership if this pop drained the queue.
16665
- -- Atomic with the RPOP above \u2014 a concurrent accept AFTER this
16666
- -- script will SADD both back along with its LPUSH.
16980
+ -- Atomic with the RPOP above \u2014 a concurrent accept AFTER
16981
+ -- this script will SADD both back along with its LPUSH.
16667
16982
  if redis.call('LLEN', queueKey) == 0 then
16668
16983
  pruneOrgMembership(result['orgId'])
16669
16984
  end
16670
16985
  return cjson.encode(result)
16671
16986
  end
16672
- -- Orphan queue reference: entry TTL expired while runId was queued.
16673
- -- Discard the reference and loop to the next.
16987
+ -- Orphan queue reference: entry hash gone (evicted) while runId
16988
+ -- was queued. Discard the reference and loop to the next.
16989
+ end
16990
+ `
16991
+ });
16992
+ this.redis.defineCommand("casSetMollifierMetadata", {
16993
+ numberOfKeys: 1,
16994
+ lua: `
16995
+ local entryKey = KEYS[1]
16996
+ local expectedVersion = tonumber(ARGV[1])
16997
+ local newMetadata = ARGV[2]
16998
+ local newMetadataType = ARGV[3]
16999
+
17000
+ if redis.call('EXISTS', entryKey) == 0 then
17001
+ return 'not_found'
17002
+ end
17003
+
17004
+ local status = redis.call('HGET', entryKey, 'status')
17005
+ local materialised = redis.call('HGET', entryKey, 'materialised')
17006
+ if status ~= 'QUEUED' or materialised == 'true' then
17007
+ return 'busy'
17008
+ end
17009
+
17010
+ local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0'
17011
+ local currentVersion = tonumber(currentVersionStr) or 0
17012
+ if currentVersion ~= expectedVersion then
17013
+ return 'conflict:' .. tostring(currentVersion)
17014
+ end
17015
+
17016
+ -- Write the new metadata onto the snapshot's payload JSON. We
17017
+ -- keep the rest of the payload intact \u2014 only metadata/metadataType
17018
+ -- change. metadataVersion is denormalised on the hash for cheap
17019
+ -- CAS reads; it's intentionally NOT stored inside the payload
17020
+ -- itself (PG-side metadataVersion is a column, not a JSON field).
17021
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17022
+ local ok, payload = pcall(cjson.decode, payloadJson)
17023
+ if not ok then return 'busy' end
17024
+ payload.metadata = newMetadata
17025
+ payload.metadataType = newMetadataType
17026
+
17027
+ local newVersion = currentVersion + 1
17028
+ redis.call('HSET', entryKey,
17029
+ 'payload', cjson.encode(payload),
17030
+ 'metadataVersion', tostring(newVersion))
17031
+ return 'applied:' .. tostring(newVersion)
17032
+ `
17033
+ });
17034
+ this.redis.defineCommand("claimMollifierIdempotency", {
17035
+ numberOfKeys: 1,
17036
+ lua: `
17037
+ local claimKey = KEYS[1]
17038
+ local pendingMarker = ARGV[1] -- "pending:<caller-token>"
17039
+ local pendingPrefix = ARGV[2] -- "pending:"
17040
+ local ttl = tonumber(ARGV[3])
17041
+
17042
+ -- SETNX-with-TTL: atomic; only one caller can win.
17043
+ local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl)
17044
+ if won then
17045
+ return 'claimed'
17046
+ end
17047
+
17048
+ local existing = redis.call('GET', claimKey)
17049
+ if not existing then
17050
+ -- The slot expired in the race window between the SET NX
17051
+ -- failing and this GET. It's free now \u2014 claim it so we don't
17052
+ -- string.sub a nil and error out.
17053
+ redis.call('SET', claimKey, pendingMarker, 'EX', ttl)
17054
+ return 'claimed'
17055
+ end
17056
+ -- Any "pending:*" value is a live claim \u2014 the caller-supplied
17057
+ -- token differentiates ownership but is opaque to losers.
17058
+ if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
17059
+ return 'pending'
17060
+ end
17061
+ return 'resolved:' .. existing
17062
+ `
17063
+ });
17064
+ this.redis.defineCommand("publishMollifierClaim", {
17065
+ numberOfKeys: 1,
17066
+ lua: `
17067
+ local claimKey = KEYS[1]
17068
+ local ownerMarker = ARGV[1] -- "pending:<our-token>"
17069
+ local runId = ARGV[2]
17070
+ local ttl = tonumber(ARGV[3])
17071
+
17072
+ local existing = redis.call('GET', claimKey)
17073
+ if existing == ownerMarker then
17074
+ redis.call('SET', claimKey, runId, 'EX', ttl)
17075
+ return 1
17076
+ end
17077
+ return 0
17078
+ `
17079
+ });
17080
+ this.redis.defineCommand("releaseMollifierClaim", {
17081
+ numberOfKeys: 1,
17082
+ lua: `
17083
+ local claimKey = KEYS[1]
17084
+ local ownerMarker = ARGV[1] -- "pending:<our-token>"
17085
+
17086
+ local existing = redis.call('GET', claimKey)
17087
+ if existing == ownerMarker then
17088
+ redis.call('DEL', claimKey)
17089
+ return 1
17090
+ end
17091
+ return 0
17092
+ `
17093
+ });
17094
+ this.redis.defineCommand("resetMollifierIdempotency", {
17095
+ numberOfKeys: 1,
17096
+ lua: `
17097
+ local lookupKey = KEYS[1]
17098
+ local entryPrefix = ARGV[1]
17099
+ local claimKey = ARGV[2]
17100
+
17101
+ -- Reset reopens the key across BOTH the buffer lookup and the
17102
+ -- cross-store pre-gate claim pointer. Without clearing the claim,
17103
+ -- a resolved/pending claim would keep deduping new triggers for
17104
+ -- the rest of its TTL even though the binding was reset. DEL is
17105
+ -- unconditional \u2014 the claim is gone regardless of whether a
17106
+ -- buffered run currently holds the lookup.
17107
+ redis.call('DEL', claimKey)
17108
+
17109
+ local runId = redis.call('GET', lookupKey)
17110
+ if not runId then
17111
+ return ''
17112
+ end
17113
+
17114
+ local entryKey = entryPrefix .. runId
17115
+ if redis.call('EXISTS', entryKey) == 0 then
17116
+ -- Stale lookup. Lazy cleanup.
17117
+ redis.call('DEL', lookupKey)
17118
+ return ''
17119
+ end
17120
+
17121
+ -- Clear the idempotency fields on the snapshot payload so the
17122
+ -- drainer's eventual engine.trigger call inserts a PG row
17123
+ -- without the key set.
17124
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17125
+ if payloadJson then
17126
+ local ok, payload = pcall(cjson.decode, payloadJson)
17127
+ if ok then
17128
+ payload.idempotencyKey = cjson.null
17129
+ payload.idempotencyKeyExpiresAt = cjson.null
17130
+ redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
17131
+ end
16674
17132
  end
17133
+ -- Clear the denormalised lookup pointer on the hash so a later
17134
+ -- ack doesn't try to DEL a key that's already gone.
17135
+ redis.call('HSET', entryKey, 'idempotencyLookupKey', '')
17136
+ redis.call('DEL', lookupKey)
17137
+ return runId
17138
+ `
17139
+ });
17140
+ this.redis.defineCommand("mutateMollifierSnapshot", {
17141
+ numberOfKeys: 1,
17142
+ lua: `
17143
+ local entryKey = KEYS[1]
17144
+ local patchJson = ARGV[1]
17145
+
17146
+ if redis.call('EXISTS', entryKey) == 0 then
17147
+ return 'not_found'
17148
+ end
17149
+
17150
+ local status = redis.call('HGET', entryKey, 'status')
17151
+ local materialised = redis.call('HGET', entryKey, 'materialised')
17152
+ if status ~= 'QUEUED' or materialised == 'true' then
17153
+ return 'busy'
17154
+ end
17155
+
17156
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17157
+ local ok, payload = pcall(cjson.decode, payloadJson)
17158
+ if not ok then return 'busy' end
17159
+
17160
+ local patch = cjson.decode(patchJson)
17161
+
17162
+ if patch.type == 'append_tags' then
17163
+ -- cjson decode of an absent or empty-array field gives nil or
17164
+ -- an empty table; we rebuild as a dense array. Existing tags
17165
+ -- are preserved; new tags are appended only if not present.
17166
+ local existing = payload.tags or {}
17167
+ local seen = {}
17168
+ local merged = {}
17169
+ for _, t in ipairs(existing) do
17170
+ if not seen[t] then
17171
+ seen[t] = true
17172
+ table.insert(merged, t)
17173
+ end
17174
+ end
17175
+ for _, t in ipairs(patch.tags or {}) do
17176
+ if not seen[t] then
17177
+ seen[t] = true
17178
+ table.insert(merged, t)
17179
+ end
17180
+ end
17181
+ -- Cap the deduped count when the caller supplies a limit, so a
17182
+ -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API.
17183
+ -- Reject the whole patch (write nothing) rather than truncating.
17184
+ if patch.maxTags ~= nil and #merged > patch.maxTags then
17185
+ return 'limit_exceeded'
17186
+ end
17187
+ payload.tags = merged
17188
+ elseif patch.type == 'set_metadata' then
17189
+ payload.metadata = patch.metadata
17190
+ payload.metadataType = patch.metadataType
17191
+ -- Bump the denormalised metadataVersion so an in-flight
17192
+ -- casSetMetadata (optimistic CAS keyed on this counter) sees
17193
+ -- the concurrent write as a version conflict and retries,
17194
+ -- instead of clobbering it under a now-stale expectedVersion.
17195
+ local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0
17196
+ redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1))
17197
+ elseif patch.type == 'set_delay' then
17198
+ payload.delayUntil = patch.delayUntil
17199
+ elseif patch.type == 'mark_cancelled' then
17200
+ payload.cancelledAt = patch.cancelledAt
17201
+ payload.cancelReason = patch.cancelReason
17202
+ else
17203
+ return 'busy'
17204
+ end
17205
+
17206
+ redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
17207
+ return 'applied_to_snapshot'
17208
+ `
17209
+ });
17210
+ this.redis.defineCommand("ackMollifierEntry", {
17211
+ numberOfKeys: 1,
17212
+ lua: `
17213
+ local entryKey = KEYS[1]
17214
+ local graceTtlSeconds = tonumber(ARGV[1])
17215
+
17216
+ -- Guard: never create a partial entry. If the hash is gone between
17217
+ -- pop and ack (concurrent fail or eviction \u2014 QUEUED entries carry
17218
+ -- no TTL), the run is gone, nothing to mark materialised.
17219
+ if redis.call('EXISTS', entryKey) == 0 then
17220
+ return 0
17221
+ end
17222
+
17223
+ -- If the entry was accepted with an idempotency key, the lookup
17224
+ -- string was stored on the hash at accept time. Clear it now \u2014
17225
+ -- PG becomes canonical for the key post-materialisation.
17226
+ local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
17227
+ if lookupKey and lookupKey ~= '' then
17228
+ redis.call('DEL', lookupKey)
17229
+ end
17230
+
17231
+ redis.call('HSET', entryKey, 'materialised', 'true')
17232
+ redis.call('EXPIRE', entryKey, graceTtlSeconds)
17233
+ return 1
16675
17234
  `
16676
17235
  });
16677
17236
  this.redis.defineCommand("failMollifierEntry", {
@@ -16680,16 +17239,43 @@ var MollifierBuffer = class {
16680
17239
  local entryKey = KEYS[1]
16681
17240
  local errorPayload = ARGV[1]
16682
17241
 
16683
- -- Guard: never create a partial entry. If the hash expired between
16684
- -- pop and fail, the run is gone \u2014 nothing to mark FAILED.
17242
+ -- Guard: nothing to mark FAILED if the hash is gone (concurrent
17243
+ -- ack/manual cleanup). Returning 0 lets the caller distinguish
17244
+ -- "marked failed" from "no-op".
16685
17245
  if redis.call('EXISTS', entryKey) == 0 then
16686
17246
  return 0
16687
17247
  end
16688
17248
 
16689
17249
  redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload)
17250
+
17251
+ -- Terminal-failure contract: the drainer's onTerminalFailure
17252
+ -- callback (see MollifierDrainer.processEntry) has been
17253
+ -- invoked before this fail() and has either written a
17254
+ -- SYSTEM_FAILURE PG row (for both non-retryable AND
17255
+ -- max-attempts-exhausted retryable errors) or chosen to fall
17256
+ -- through (genuinely bad snapshot the engine can't materialise
17257
+ -- a row from). Either way the buffer entry is no longer
17258
+ -- load-bearing here. Clear the idempotency lookup -- PG's
17259
+ -- unique constraint is the canonical dedup mechanism
17260
+ -- post-materialise -- and drop the entry hash so failed runs
17261
+ -- don't accrete forever now that there's no accept-time TTL.
17262
+ local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
17263
+ if lookupKey and lookupKey ~= '' then
17264
+ redis.call('DEL', lookupKey)
17265
+ end
17266
+ redis.call('DEL', entryKey)
16690
17267
  return 1
16691
17268
  `
16692
17269
  });
17270
+ this.redis.defineCommand("delMollifierKeyIfEquals", {
17271
+ numberOfKeys: 1,
17272
+ lua: `
17273
+ if redis.call('GET', KEYS[1]) == ARGV[1] then
17274
+ return redis.call('DEL', KEYS[1])
17275
+ end
17276
+ return 0
17277
+ `
17278
+ });
16693
17279
  this.redis.defineCommand("mollifierEvaluateTrip", {
16694
17280
  numberOfKeys: 2,
16695
17281
  lua: `
@@ -16717,6 +17303,7 @@ var MollifierBuffer = class {
16717
17303
  var MollifierDrainer = class {
16718
17304
  buffer;
16719
17305
  handler;
17306
+ onTerminalFailure;
16720
17307
  maxAttempts;
16721
17308
  isRetryable;
16722
17309
  pollIntervalMs;
@@ -16734,6 +17321,7 @@ var MollifierDrainer = class {
16734
17321
  constructor(options) {
16735
17322
  this.buffer = options.buffer;
16736
17323
  this.handler = options.handler;
17324
+ this.onTerminalFailure = options.onTerminalFailure;
16737
17325
  this.maxAttempts = options.maxAttempts;
16738
17326
  this.isRetryable = options.isRetryable;
16739
17327
  this.pollIntervalMs = options.pollIntervalMs ?? 100;
@@ -16918,13 +17506,46 @@ var MollifierDrainer = class {
16918
17506
  });
16919
17507
  return "failed";
16920
17508
  }
17509
+ const cause = this.isRetryable(err) ? "max-attempts-exhausted" : "non-retryable";
16921
17510
  const code = err instanceof Error ? err.name : "Unknown";
16922
17511
  const message = err instanceof Error ? err.message : String(err);
17512
+ if (this.onTerminalFailure) {
17513
+ try {
17514
+ await this.onTerminalFailure({
17515
+ runId: entry.runId,
17516
+ envId: entry.envId,
17517
+ orgId: entry.orgId,
17518
+ payload: deserialiseSnapshot(entry.payload),
17519
+ attempts: nextAttempts,
17520
+ createdAt: entry.createdAt,
17521
+ error: { code, message },
17522
+ cause
17523
+ });
17524
+ } catch (writeErr) {
17525
+ if (this.isRetryable(writeErr)) {
17526
+ await this.buffer.requeue(entry.runId);
17527
+ this.logger.warn(
17528
+ "MollifierDrainer: terminal-failure callback retryable; requeued",
17529
+ {
17530
+ runId: entry.runId,
17531
+ attempts: nextAttempts,
17532
+ writeErr
17533
+ }
17534
+ );
17535
+ return "failed";
17536
+ }
17537
+ this.logger.error("MollifierDrainer: terminal-failure callback failed", {
17538
+ runId: entry.runId,
17539
+ writeErr
17540
+ });
17541
+ }
17542
+ }
16923
17543
  await this.buffer.fail(entry.runId, { code, message });
16924
17544
  this.logger.error("MollifierDrainer: terminal failure", {
16925
17545
  runId: entry.runId,
16926
17546
  code,
16927
- message
17547
+ message,
17548
+ cause
16928
17549
  });
16929
17550
  return "failed";
16930
17551
  }
@@ -16965,7 +17586,9 @@ exports.WorkerQueueManager = WorkerQueueManager;
16965
17586
  exports.createDefaultRetryStrategy = createDefaultRetryStrategy;
16966
17587
  exports.defaultRetryOptions = defaultRetryOptions;
16967
17588
  exports.deserialiseSnapshot = deserialiseSnapshot;
17589
+ exports.idempotencyLookupKeyFor = idempotencyLookupKeyFor;
16968
17590
  exports.isAbortError = isAbortError;
17591
+ exports.makeIdempotencyClaimKey = makeIdempotencyClaimKey;
16969
17592
  exports.noopTelemetry = noopTelemetry;
16970
17593
  exports.serialiseSnapshot = serialiseSnapshot;
16971
17594
  //# sourceMappingURL=index.cjs.map