@trigger.dev/redis-worker 4.5.0-rc.3 → 4.5.0-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -16375,6 +16375,7 @@ var stringToDate = z.string().transform((v, ctx) => {
16375
16375
  }
16376
16376
  return d;
16377
16377
  });
16378
+ var stringToBool = z.union([z.literal("true"), z.literal("false")]).transform((v) => v === "true");
16378
16379
  var stringToError = z.string().transform((v, ctx) => {
16379
16380
  try {
16380
16381
  return BufferEntryError.parse(JSON.parse(v));
@@ -16391,6 +16392,27 @@ var BufferEntrySchema = z.object({
16391
16392
  status: BufferEntryStatus,
16392
16393
  attempts: stringToInt,
16393
16394
  createdAt: stringToDate,
16395
+ // Microsecond epoch of accept time, kept as a hash field for dwell
16396
+ // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted
16397
+ // so an entry written by an accept Lua predating this field — or one
16398
+ // surviving across the deploy that introduced it — still parses instead
16399
+ // of being silently dropped on pop.
16400
+ createdAtMicros: stringToInt.default("0"),
16401
+ // Drainer-ack flag: `true` once the drainer has materialised this run
16402
+ // into PG. The hash persists for a short grace TTL after ack so direct
16403
+ // reads (retrieve, trace, etc.) still resolve while PG replica lag
16404
+ // settles. Absent on pre-ack entries.
16405
+ materialised: stringToBool.default("false"),
16406
+ // Denormalised pointer to the Redis idempotency lookup key (set when
16407
+ // the run was accepted with an idempotency key, empty otherwise). The
16408
+ // ack Lua reads this to DEL the lookup atomically with marking the
16409
+ // entry materialised.
16410
+ idempotencyLookupKey: z.string().optional().default(""),
16411
+ // Optimistic-lock counter for the snapshot's `metadata` field.
16412
+ // Incremented atomically by the CAS metadata Lua. Matches the
16413
+ // semantic of `TaskRun.metadataVersion` on the PG side (which the
16414
+ // UpdateMetadataService uses for the same retry-on-conflict pattern).
16415
+ metadataVersion: stringToInt.default("0"),
16394
16416
  lastError: stringToError.optional()
16395
16417
  });
16396
16418
  function serialiseSnapshot(snapshot) {
@@ -16401,19 +16423,32 @@ function deserialiseSnapshot(serialised) {
16401
16423
  }
16402
16424
 
16403
16425
  // src/mollifier/buffer.ts
16426
+ var ACK_GRACE_TTL_SECONDS = 30;
16427
+ function mollifierReconnectDelayMs(times, random = Math.random) {
16428
+ const base = Math.min(times * 50, 1e3);
16429
+ const half = Math.floor(base / 2);
16430
+ return half + Math.round(random() * (base - half));
16431
+ }
16432
+ function encodeKeyPart(value) {
16433
+ return Buffer.from(value, "utf8").toString("base64url");
16434
+ }
16435
+ function idempotencyLookupKeyFor(input) {
16436
+ return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
16437
+ }
16438
+ var PENDING_PREFIX = "pending:";
16439
+ function makeIdempotencyClaimKey(input) {
16440
+ return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
16441
+ }
16404
16442
  var MollifierBuffer = class {
16405
16443
  redis;
16406
- entryTtlSeconds;
16407
16444
  logger;
16408
16445
  constructor(options) {
16409
- this.entryTtlSeconds = options.entryTtlSeconds;
16410
16446
  this.logger = options.logger ?? new Logger("MollifierBuffer", "debug");
16411
16447
  this.redis = createRedisClient(
16412
16448
  {
16413
16449
  ...options.redisOptions,
16414
16450
  retryStrategy(times) {
16415
- const delay = Math.min(times * 50, 1e3);
16416
- return delay;
16451
+ return mollifierReconnectDelayMs(times);
16417
16452
  },
16418
16453
  maxRetriesPerRequest: 20
16419
16454
  },
@@ -16425,14 +16460,26 @@ var MollifierBuffer = class {
16425
16460
  );
16426
16461
  this.#registerCommands();
16427
16462
  }
16428
- // Returns true if the entry was newly written; false if a duplicate runId
16429
- // was already buffered (idempotent no-op). Callers can use the boolean to
16430
- // record a duplicate-accept metric without affecting buffer state.
16463
+ // Three outcomes:
16464
+ // - { kind: "accepted" } — entry was newly written.
16465
+ // - { kind: "duplicate_run_id" } — runId was already buffered (idempotent
16466
+ // no-op, same semantic as the previous boolean-false return).
16467
+ // - { kind: "duplicate_idempotency", existingRunId } — the (env, task,
16468
+ // idempotencyKey) tuple was already bound to another buffered run.
16469
+ // The Lua's atomic SETNX is the race-winner; the second caller gets
16470
+ // the winner's runId so it can return that as the trigger response.
16431
16471
  async accept(input) {
16432
16472
  const entryKey = `mollifier:entries:${input.runId}`;
16433
16473
  const queueKey = `mollifier:queue:${input.envId}`;
16434
16474
  const orgsKey = "mollifier:orgs";
16435
- const createdAt = (/* @__PURE__ */ new Date()).toISOString();
16475
+ const nowMs = Date.now();
16476
+ const createdAt = new Date(nowMs).toISOString();
16477
+ const createdAtMicros = nowMs * 1e3;
16478
+ const idempotencyLookupKey = input.idempotencyKey && input.taskIdentifier ? idempotencyLookupKeyFor({
16479
+ envId: input.envId,
16480
+ taskIdentifier: input.taskIdentifier,
16481
+ idempotencyKey: input.idempotencyKey
16482
+ }) : "";
16436
16483
  const result = await this.redis.acceptMollifierEntry(
16437
16484
  entryKey,
16438
16485
  queueKey,
@@ -16442,10 +16489,16 @@ var MollifierBuffer = class {
16442
16489
  input.orgId,
16443
16490
  input.payload,
16444
16491
  createdAt,
16445
- String(this.entryTtlSeconds),
16446
- "mollifier:org-envs:"
16492
+ String(createdAtMicros),
16493
+ "mollifier:org-envs:",
16494
+ idempotencyLookupKey,
16495
+ "mollifier:entries:"
16447
16496
  );
16448
- return result === 1;
16497
+ if (typeof result === "string" && result.length > 0) {
16498
+ return { kind: "duplicate_idempotency", existingRunId: result };
16499
+ }
16500
+ if (result === 1) return { kind: "accepted" };
16501
+ return { kind: "duplicate_run_id" };
16449
16502
  }
16450
16503
  async pop(envId) {
16451
16504
  const queueKey = `mollifier:queue:${envId}`;
@@ -16499,8 +16552,220 @@ var MollifierBuffer = class {
16499
16552
  async listEnvsForOrg(orgId) {
16500
16553
  return this.redis.smembers(`mollifier:org-envs:${orgId}`);
16501
16554
  }
16555
+ // Read-only enumeration of currently-queued entries for a single env.
16556
+ // Used by the stale-sweep to compute per-entry dwell time, so order is
16557
+ // immaterial — LRANGE returns them newest-first (LPUSH head) but the
16558
+ // caller scans the whole window. Non-destructive: the drainer still
16559
+ // RPOPs these entries in FIFO order.
16560
+ //
16561
+ // The entry HGETALLs are issued in a single pipelined batch (one
16562
+ // network round-trip instead of N) — at the stale-sweep's default
16563
+ // maxCount=1000 the serial implementation cost ~1000 RTTs per env,
16564
+ // which dominated sweep wall-time at any meaningful backlog.
16565
+ //
16566
+ // A missing entry (empty hash) is skipped: the drainer's RPOP+DEL of
16567
+ // the entry hash can race our LRANGE→HGETALL window, so a runId on
16568
+ // the queue with no backing hash is an expected concurrency outcome,
16569
+ // not an error.
16570
+ async listEntriesForEnv(envId, maxCount) {
16571
+ if (maxCount <= 0) return [];
16572
+ const runIds = await this.redis.lrange(
16573
+ `mollifier:queue:${envId}`,
16574
+ 0,
16575
+ maxCount - 1
16576
+ );
16577
+ if (runIds.length === 0) return [];
16578
+ const pipeline = this.redis.pipeline();
16579
+ for (const runId of runIds) {
16580
+ pipeline.hgetall(`mollifier:entries:${runId}`);
16581
+ }
16582
+ const results = await pipeline.exec();
16583
+ if (!results) return [];
16584
+ const entries = [];
16585
+ for (let i = 0; i < results.length; i++) {
16586
+ const [err, raw] = results[i];
16587
+ if (err) {
16588
+ this.logger.error("MollifierBuffer.listEntriesForEnv: hgetall failed", {
16589
+ runId: runIds[i],
16590
+ err: err.message
16591
+ });
16592
+ continue;
16593
+ }
16594
+ if (!raw || Object.keys(raw).length === 0) continue;
16595
+ const parsed = BufferEntrySchema.safeParse(raw);
16596
+ if (!parsed.success) {
16597
+ this.logger.error("MollifierBuffer.listEntriesForEnv: invalid entry shape", {
16598
+ runId: runIds[i],
16599
+ errors: parsed.error.flatten()
16600
+ });
16601
+ continue;
16602
+ }
16603
+ entries.push(parsed.data);
16604
+ }
16605
+ return entries;
16606
+ }
16607
+ // Atomic snapshot mutation. Used by customer-mutation API endpoints
16608
+ // (tags, metadata-put, reschedule, cancel) when the run is still in
16609
+ // the buffer. Four outcomes:
16610
+ // - "applied_to_snapshot": entry was QUEUED + not materialised; the
16611
+ // drainer will read the patched payload on its next pop.
16612
+ // - "not_found": no entry hash exists for this runId — including a
16613
+ // FAILED entry, whose hash the drainer-terminal `fail` path DELs.
16614
+ // - "busy": entry is DRAINING or materialised. The API
16615
+ // wait-and-bounces through PG.
16616
+ // - "limit_exceeded": an `append_tags` patch carrying `maxTags` would
16617
+ // push the deduped tag count over the cap; nothing is written.
16618
+ async mutateSnapshot(runId, patch) {
16619
+ const result = await this.redis.mutateMollifierSnapshot(
16620
+ `mollifier:entries:${runId}`,
16621
+ JSON.stringify(patch)
16622
+ );
16623
+ if (result === "applied_to_snapshot" || result === "not_found" || result === "busy" || result === "limit_exceeded") {
16624
+ return result;
16625
+ }
16626
+ throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`);
16627
+ }
16628
+ // Optimistic compare-and-swap on the snapshot's metadata. Caller reads
16629
+ // the current metadataVersion via getEntry, applies operations in JS via
16630
+ // `applyMetadataOperations`, then calls this with the new metadata + the
16631
+ // expected version. Lua refuses if the version has moved (caller retries
16632
+ // up to N times). Mirrors the PG-side `UpdateMetadataService` retry
16633
+ // loop so concurrent increment/append operations don't lose deltas.
16634
+ async casSetMetadata(input) {
16635
+ const entryKey = `mollifier:entries:${input.runId}`;
16636
+ const raw = await this.redis.casSetMollifierMetadata(
16637
+ entryKey,
16638
+ String(input.expectedVersion),
16639
+ input.newMetadata,
16640
+ input.newMetadataType
16641
+ );
16642
+ if (raw === "not_found") return { kind: "not_found" };
16643
+ if (raw === "busy") return { kind: "busy" };
16644
+ if (raw.startsWith("conflict:")) {
16645
+ return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) };
16646
+ }
16647
+ if (raw.startsWith("applied:")) {
16648
+ return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) };
16649
+ }
16650
+ throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`);
16651
+ }
16652
+ // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One
16653
+ // call across both PG and buffer paths serialises through this claim;
16654
+ // closes the race the buffer-side SETNX leaves open during the
16655
+ // gate-transition burst window.
16656
+ //
16657
+ // The caller supplies an opaque `token` (UUID) on claim. The same token
16658
+ // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and-
16659
+ // act so a late release from a previous claimant whose TTL expired
16660
+ // cannot erase a new owner's claim.
16661
+ //
16662
+ // - "claimed": we now own the claim, the caller proceeds with the
16663
+ // trigger pipeline and must `publishClaim` on success or
16664
+ // `releaseClaim` on failure.
16665
+ // - "pending": another trigger owns the claim and hasn't published
16666
+ // yet; the caller should poll.
16667
+ // - "resolved": the claim already holds a runId; the caller can
16668
+ // return that runId as a cached hit.
16669
+ async claimIdempotency(input) {
16670
+ const claimKey = makeIdempotencyClaimKey(input);
16671
+ const raw = await this.redis.claimMollifierIdempotency(
16672
+ claimKey,
16673
+ `${PENDING_PREFIX}${input.token}`,
16674
+ PENDING_PREFIX,
16675
+ String(input.ttlSeconds)
16676
+ );
16677
+ if (raw === "claimed") return { kind: "claimed" };
16678
+ if (raw === "pending") return { kind: "pending" };
16679
+ if (raw.startsWith("resolved:")) {
16680
+ return { kind: "resolved", runId: raw.slice("resolved:".length) };
16681
+ }
16682
+ throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`);
16683
+ }
16684
+ // Publish the winning runId to the claim so subsequent claimants /
16685
+ // waiters see "resolved". TTL bounded by the customer's
16686
+ // `idempotencyKeyExpiresAt` minus now; caller computes.
16687
+ //
16688
+ // Compare-and-set on the caller's token: if the current value isn't
16689
+ // our pending marker (TTL expired and another claimant moved in, or
16690
+ // someone else already published), the publish is a no-op. The caller
16691
+ // can treat any such case as "we lost the claim" and re-read.
16692
+ // Returns true if we published; false if the claim slot was no longer
16693
+ // ours.
16694
+ async publishClaim(input) {
16695
+ const claimKey = makeIdempotencyClaimKey(input);
16696
+ const result = await this.redis.publishMollifierClaim(
16697
+ claimKey,
16698
+ `${PENDING_PREFIX}${input.token}`,
16699
+ input.runId,
16700
+ String(input.ttlSeconds)
16701
+ );
16702
+ return result === 1;
16703
+ }
16704
+ // Release the claim on pipeline error so waiters can re-claim and
16705
+ // retry. Idempotent.
16706
+ //
16707
+ // Compare-and-delete on the caller's token: only deletes if the
16708
+ // current value is exactly our pending marker. A late release from a
16709
+ // claimant whose TTL expired is a no-op, so a new owner's claim is
16710
+ // never wiped by a slow predecessor.
16711
+ async releaseClaim(input) {
16712
+ const claimKey = makeIdempotencyClaimKey(input);
16713
+ await this.redis.releaseMollifierClaim(
16714
+ claimKey,
16715
+ `${PENDING_PREFIX}${input.token}`
16716
+ );
16717
+ }
16718
+ // Read the current claim value, used by the wait/poll loop on losers
16719
+ // to detect "pending" → "resolved" transitions and timeouts.
16720
+ async readClaim(input) {
16721
+ const claimKey = makeIdempotencyClaimKey(input);
16722
+ const value = await this.redis.get(claimKey);
16723
+ if (value === null) return null;
16724
+ if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" };
16725
+ return { kind: "resolved", runId: value };
16726
+ }
16727
+ // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
16728
+ // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
16729
+ // misses — same key may belong to a buffered run waiting to drain. The
16730
+ // lookup self-heals: if the lookup points at an entry hash that's gone,
16731
+ // we clear the lookup and report a miss. The clear is a compare-and-
16732
+ // delete (only if the key still holds the stale runId we observed) so a
16733
+ // fresh accept that rebinds the key between our GET and DEL isn't wiped.
16734
+ async lookupIdempotency(input) {
16735
+ const lookupKey = idempotencyLookupKeyFor(input);
16736
+ const runId = await this.redis.get(lookupKey);
16737
+ if (!runId) return null;
16738
+ const entry = await this.getEntry(runId);
16739
+ if (!entry) {
16740
+ await this.redis.delMollifierKeyIfEquals(lookupKey, runId);
16741
+ return null;
16742
+ }
16743
+ return runId;
16744
+ }
16745
+ // Clear the idempotency binding from a buffered run. Used by
16746
+ // `ResetIdempotencyKeyService` alongside the existing PG-side
16747
+ // `updateMany`. Returns the runId that was cleared, or null if no
16748
+ // buffered run held this key.
16749
+ async resetIdempotency(input) {
16750
+ const lookupKey = idempotencyLookupKeyFor(input);
16751
+ const claimKey = makeIdempotencyClaimKey(input);
16752
+ const clearedRunId = await this.redis.resetMollifierIdempotency(
16753
+ lookupKey,
16754
+ "mollifier:entries:",
16755
+ claimKey
16756
+ );
16757
+ return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
16758
+ }
16759
+ // Marks the entry as materialised (PG row written) and resets its TTL to
16760
+ // the grace window. Entry hash persists past ack as a read-fallback
16761
+ // safety net for the brief PG replica-lag window between drainer-side
16762
+ // write and reader-side visibility. Also clears the associated
16763
+ // idempotency lookup if one was set on accept.
16502
16764
  async ack(runId) {
16503
- await this.redis.del(`mollifier:entries:${runId}`);
16765
+ await this.redis.ackMollifierEntry(
16766
+ `mollifier:entries:${runId}`,
16767
+ String(ACK_GRACE_TTL_SECONDS)
16768
+ );
16504
16769
  }
16505
16770
  async requeue(runId) {
16506
16771
  await this.redis.requeueMollifierEntry(
@@ -16511,9 +16776,12 @@ var MollifierBuffer = class {
16511
16776
  "mollifier:org-envs:"
16512
16777
  );
16513
16778
  }
16514
- // Returns true if the entry transitioned to FAILED; false if the entry no
16515
- // longer exists (TTL expired between pop and fail). Caller can use the
16516
- // boolean to skip downstream FAILED handling for ghost entries.
16779
+ // Returns true if a live entry was torn down; false if the entry no
16780
+ // longer existed (a concurrent ack or manual cleanup removed it between
16781
+ // pop and fail — there is no accept-time TTL). Note FAILED is not an
16782
+ // observable state: the Lua marks the hash FAILED then DELs it in the
16783
+ // same atomic script, so a subsequent getEntry returns null. Caller can
16784
+ // use the boolean to skip downstream FAILED handling for ghost entries.
16517
16785
  async fail(runId, error) {
16518
16786
  const result = await this.redis.failMollifierEntry(
16519
16787
  `mollifier:entries:${runId}`,
@@ -16521,6 +16789,11 @@ var MollifierBuffer = class {
16521
16789
  );
16522
16790
  return result === 1;
16523
16791
  }
16792
+ // Returns Redis-side TTL on the entry hash. Returns -1 for entries
16793
+ // with no TTL — the steady state under the current design, where
16794
+ // entries persist until drainer ack/fail. The ack grace TTL (30s
16795
+ // post-materialise) is the only context where this returns a
16796
+ // positive value; tests around the grace TTL still rely on it.
16524
16797
  async getEntryTtlSeconds(runId) {
16525
16798
  return this.redis.ttl(`mollifier:entries:${runId}`);
16526
16799
  }
@@ -16551,8 +16824,10 @@ var MollifierBuffer = class {
16551
16824
  local orgId = ARGV[3]
16552
16825
  local payload = ARGV[4]
16553
16826
  local createdAt = ARGV[5]
16554
- local ttlSeconds = tonumber(ARGV[6])
16827
+ local createdAtMicros = ARGV[6]
16555
16828
  local orgEnvsPrefix = ARGV[7]
16829
+ local idempotencyLookupKey = ARGV[8] or ''
16830
+ local entryPrefix = ARGV[9]
16556
16831
 
16557
16832
  -- Idempotent: refuse if an entry for this runId already exists in any
16558
16833
  -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -16561,6 +16836,27 @@ var MollifierBuffer = class {
16561
16836
  return 0
16562
16837
  end
16563
16838
 
16839
+ -- Idempotency-key dedup. If the caller passed a lookup key
16840
+ -- and it's already bound to another buffered run, return the
16841
+ -- winner's runId so the loser's API response can echo it as a
16842
+ -- cached hit. Otherwise SET the lookup (no TTL \u2014 lifecycle is
16843
+ -- paired with the entry hash; drainer ack/fail clear it
16844
+ -- explicitly).
16845
+ if idempotencyLookupKey ~= '' then
16846
+ local existing = redis.call('GET', idempotencyLookupKey)
16847
+ if existing then
16848
+ -- Self-heal: only honour the binding if its entry hash still
16849
+ -- exists. If the entry was evicted (maxmemory) but the lookup
16850
+ -- survived, the binding is stale \u2014 fall through and rebind to
16851
+ -- this run rather than returning a dead runId that would block
16852
+ -- the key indefinitely. Mirrors lookupIdempotency's self-heal.
16853
+ if redis.call('EXISTS', entryPrefix .. existing) == 1 then
16854
+ return existing
16855
+ end
16856
+ end
16857
+ redis.call('SET', idempotencyLookupKey, runId)
16858
+ end
16859
+
16564
16860
  redis.call('HSET', entryKey,
16565
16861
  'runId', runId,
16566
16862
  'envId', envId,
@@ -16568,8 +16864,20 @@ var MollifierBuffer = class {
16568
16864
  'payload', payload,
16569
16865
  'status', 'QUEUED',
16570
16866
  'attempts', '0',
16571
- 'createdAt', createdAt)
16572
- redis.call('EXPIRE', entryKey, ttlSeconds)
16867
+ 'createdAt', createdAt,
16868
+ 'createdAtMicros', createdAtMicros,
16869
+ 'idempotencyLookupKey', idempotencyLookupKey,
16870
+ 'metadataVersion', '0')
16871
+ -- No EXPIRE on the entry hash. Buffer entries persist until the
16872
+ -- drainer ACKs (post-materialise grace) or FAILs them \u2014 the
16873
+ -- drainer is the only recovery mechanism, so silent TTL-based
16874
+ -- eviction would lose runs with no customer-visible signal.
16875
+ -- Memory pressure from an offline drainer is the alertable
16876
+ -- failure mode instead; see _ops/mollifier-ops.md.
16877
+ -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so
16878
+ -- insertion order == drain order (FIFO). createdAtMicros is kept
16879
+ -- as a hash field for dwell metrics only \u2014 it is no longer a sort
16880
+ -- key now that the buffer has no list/pagination surface.
16573
16881
  redis.call('LPUSH', queueKey, runId)
16574
16882
  -- Org-level membership: maintained atomically with the per-env
16575
16883
  -- queue so the drainer can walk orgs \u2192 envs-for-org and
@@ -16599,7 +16907,12 @@ var MollifierBuffer = class {
16599
16907
  local nextAttempts = tonumber(currentAttempts or '0') + 1
16600
16908
 
16601
16909
  redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
16602
- redis.call('LPUSH', queuePrefix .. envId, runId)
16910
+ -- Requeue RPUSHes to the tail (the RPOP end) so a transiently
16911
+ -- failed entry pops next rather than going to the back of the
16912
+ -- line behind a fresh backlog. createdAt is immutable across
16913
+ -- retries; the drainer's maxAttempts caps the
16914
+ -- retry loop so a poisoned entry doesn't head-of-line forever.
16915
+ redis.call('RPUSH', queuePrefix .. envId, runId)
16603
16916
  -- Re-track the org/env: pop may have SREM'd them when the queue
16604
16917
  -- last emptied. SADDs are idempotent if the values are still
16605
16918
  -- present.
@@ -16633,11 +16946,13 @@ var MollifierBuffer = class {
16633
16946
  end
16634
16947
  end
16635
16948
 
16636
- -- Loop to skip orphan queue references \u2014 runIds whose entry hash has
16637
- -- expired (TTL hit). HSET on a missing key would CREATE a partial
16638
- -- hash without a TTL, leaking memory. The loop is bounded by queue
16639
- -- length; entire Lua script remains atomic.
16949
+ -- Loop to skip orphan queue references \u2014 runIds whose entry hash is
16950
+ -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry
16951
+ -- no TTL of their own). HSET on a missing key would CREATE a
16952
+ -- partial hash without a TTL, leaking memory. The loop is bounded
16953
+ -- by queue length; entire Lua script remains atomic.
16640
16954
  while true do
16955
+ -- RPOP returns the tail member (oldest, FIFO), or false when empty.
16641
16956
  local runId = redis.call('RPOP', queueKey)
16642
16957
  if not runId then
16643
16958
  -- Queue is empty AND we have no entry to read orgId from, so
@@ -16655,16 +16970,260 @@ var MollifierBuffer = class {
16655
16970
  result[raw[i]] = raw[i + 1]
16656
16971
  end
16657
16972
  -- Prune org-level membership if this pop drained the queue.
16658
- -- Atomic with the RPOP above \u2014 a concurrent accept AFTER this
16659
- -- script will SADD both back along with its LPUSH.
16973
+ -- Atomic with the RPOP above \u2014 a concurrent accept AFTER
16974
+ -- this script will SADD both back along with its LPUSH.
16660
16975
  if redis.call('LLEN', queueKey) == 0 then
16661
16976
  pruneOrgMembership(result['orgId'])
16662
16977
  end
16663
16978
  return cjson.encode(result)
16664
16979
  end
16665
- -- Orphan queue reference: entry TTL expired while runId was queued.
16666
- -- Discard the reference and loop to the next.
16980
+ -- Orphan queue reference: entry hash gone (evicted) while runId
16981
+ -- was queued. Discard the reference and loop to the next.
16982
+ end
16983
+ `
16984
+ });
16985
+ this.redis.defineCommand("casSetMollifierMetadata", {
16986
+ numberOfKeys: 1,
16987
+ lua: `
16988
+ local entryKey = KEYS[1]
16989
+ local expectedVersion = tonumber(ARGV[1])
16990
+ local newMetadata = ARGV[2]
16991
+ local newMetadataType = ARGV[3]
16992
+
16993
+ if redis.call('EXISTS', entryKey) == 0 then
16994
+ return 'not_found'
16995
+ end
16996
+
16997
+ local status = redis.call('HGET', entryKey, 'status')
16998
+ local materialised = redis.call('HGET', entryKey, 'materialised')
16999
+ if status ~= 'QUEUED' or materialised == 'true' then
17000
+ return 'busy'
17001
+ end
17002
+
17003
+ local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0'
17004
+ local currentVersion = tonumber(currentVersionStr) or 0
17005
+ if currentVersion ~= expectedVersion then
17006
+ return 'conflict:' .. tostring(currentVersion)
17007
+ end
17008
+
17009
+ -- Write the new metadata onto the snapshot's payload JSON. We
17010
+ -- keep the rest of the payload intact \u2014 only metadata/metadataType
17011
+ -- change. metadataVersion is denormalised on the hash for cheap
17012
+ -- CAS reads; it's intentionally NOT stored inside the payload
17013
+ -- itself (PG-side metadataVersion is a column, not a JSON field).
17014
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17015
+ local ok, payload = pcall(cjson.decode, payloadJson)
17016
+ if not ok then return 'busy' end
17017
+ payload.metadata = newMetadata
17018
+ payload.metadataType = newMetadataType
17019
+
17020
+ local newVersion = currentVersion + 1
17021
+ redis.call('HSET', entryKey,
17022
+ 'payload', cjson.encode(payload),
17023
+ 'metadataVersion', tostring(newVersion))
17024
+ return 'applied:' .. tostring(newVersion)
17025
+ `
17026
+ });
17027
+ this.redis.defineCommand("claimMollifierIdempotency", {
17028
+ numberOfKeys: 1,
17029
+ lua: `
17030
+ local claimKey = KEYS[1]
17031
+ local pendingMarker = ARGV[1] -- "pending:<caller-token>"
17032
+ local pendingPrefix = ARGV[2] -- "pending:"
17033
+ local ttl = tonumber(ARGV[3])
17034
+
17035
+ -- SETNX-with-TTL: atomic; only one caller can win.
17036
+ local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl)
17037
+ if won then
17038
+ return 'claimed'
17039
+ end
17040
+
17041
+ local existing = redis.call('GET', claimKey)
17042
+ if not existing then
17043
+ -- The slot expired in the race window between the SET NX
17044
+ -- failing and this GET. It's free now \u2014 claim it so we don't
17045
+ -- string.sub a nil and error out.
17046
+ redis.call('SET', claimKey, pendingMarker, 'EX', ttl)
17047
+ return 'claimed'
17048
+ end
17049
+ -- Any "pending:*" value is a live claim \u2014 the caller-supplied
17050
+ -- token differentiates ownership but is opaque to losers.
17051
+ if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
17052
+ return 'pending'
17053
+ end
17054
+ return 'resolved:' .. existing
17055
+ `
17056
+ });
17057
+ this.redis.defineCommand("publishMollifierClaim", {
17058
+ numberOfKeys: 1,
17059
+ lua: `
17060
+ local claimKey = KEYS[1]
17061
+ local ownerMarker = ARGV[1] -- "pending:<our-token>"
17062
+ local runId = ARGV[2]
17063
+ local ttl = tonumber(ARGV[3])
17064
+
17065
+ local existing = redis.call('GET', claimKey)
17066
+ if existing == ownerMarker then
17067
+ redis.call('SET', claimKey, runId, 'EX', ttl)
17068
+ return 1
17069
+ end
17070
+ return 0
17071
+ `
17072
+ });
17073
+ this.redis.defineCommand("releaseMollifierClaim", {
17074
+ numberOfKeys: 1,
17075
+ lua: `
17076
+ local claimKey = KEYS[1]
17077
+ local ownerMarker = ARGV[1] -- "pending:<our-token>"
17078
+
17079
+ local existing = redis.call('GET', claimKey)
17080
+ if existing == ownerMarker then
17081
+ redis.call('DEL', claimKey)
17082
+ return 1
17083
+ end
17084
+ return 0
17085
+ `
17086
+ });
17087
+ this.redis.defineCommand("resetMollifierIdempotency", {
17088
+ numberOfKeys: 1,
17089
+ lua: `
17090
+ local lookupKey = KEYS[1]
17091
+ local entryPrefix = ARGV[1]
17092
+ local claimKey = ARGV[2]
17093
+
17094
+ -- Reset reopens the key across BOTH the buffer lookup and the
17095
+ -- cross-store pre-gate claim pointer. Without clearing the claim,
17096
+ -- a resolved/pending claim would keep deduping new triggers for
17097
+ -- the rest of its TTL even though the binding was reset. DEL is
17098
+ -- unconditional \u2014 the claim is gone regardless of whether a
17099
+ -- buffered run currently holds the lookup.
17100
+ redis.call('DEL', claimKey)
17101
+
17102
+ local runId = redis.call('GET', lookupKey)
17103
+ if not runId then
17104
+ return ''
17105
+ end
17106
+
17107
+ local entryKey = entryPrefix .. runId
17108
+ if redis.call('EXISTS', entryKey) == 0 then
17109
+ -- Stale lookup. Lazy cleanup.
17110
+ redis.call('DEL', lookupKey)
17111
+ return ''
17112
+ end
17113
+
17114
+ -- Clear the idempotency fields on the snapshot payload so the
17115
+ -- drainer's eventual engine.trigger call inserts a PG row
17116
+ -- without the key set.
17117
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17118
+ if payloadJson then
17119
+ local ok, payload = pcall(cjson.decode, payloadJson)
17120
+ if ok then
17121
+ payload.idempotencyKey = cjson.null
17122
+ payload.idempotencyKeyExpiresAt = cjson.null
17123
+ redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
17124
+ end
16667
17125
  end
17126
+ -- Clear the denormalised lookup pointer on the hash so a later
17127
+ -- ack doesn't try to DEL a key that's already gone.
17128
+ redis.call('HSET', entryKey, 'idempotencyLookupKey', '')
17129
+ redis.call('DEL', lookupKey)
17130
+ return runId
17131
+ `
17132
+ });
17133
+ this.redis.defineCommand("mutateMollifierSnapshot", {
17134
+ numberOfKeys: 1,
17135
+ lua: `
17136
+ local entryKey = KEYS[1]
17137
+ local patchJson = ARGV[1]
17138
+
17139
+ if redis.call('EXISTS', entryKey) == 0 then
17140
+ return 'not_found'
17141
+ end
17142
+
17143
+ local status = redis.call('HGET', entryKey, 'status')
17144
+ local materialised = redis.call('HGET', entryKey, 'materialised')
17145
+ if status ~= 'QUEUED' or materialised == 'true' then
17146
+ return 'busy'
17147
+ end
17148
+
17149
+ local payloadJson = redis.call('HGET', entryKey, 'payload')
17150
+ local ok, payload = pcall(cjson.decode, payloadJson)
17151
+ if not ok then return 'busy' end
17152
+
17153
+ local patch = cjson.decode(patchJson)
17154
+
17155
+ if patch.type == 'append_tags' then
17156
+ -- cjson decode of an absent or empty-array field gives nil or
17157
+ -- an empty table; we rebuild as a dense array. Existing tags
17158
+ -- are preserved; new tags are appended only if not present.
17159
+ local existing = payload.tags or {}
17160
+ local seen = {}
17161
+ local merged = {}
17162
+ for _, t in ipairs(existing) do
17163
+ if not seen[t] then
17164
+ seen[t] = true
17165
+ table.insert(merged, t)
17166
+ end
17167
+ end
17168
+ for _, t in ipairs(patch.tags or {}) do
17169
+ if not seen[t] then
17170
+ seen[t] = true
17171
+ table.insert(merged, t)
17172
+ end
17173
+ end
17174
+ -- Cap the deduped count when the caller supplies a limit, so a
17175
+ -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API.
17176
+ -- Reject the whole patch (write nothing) rather than truncating.
17177
+ if patch.maxTags ~= nil and #merged > patch.maxTags then
17178
+ return 'limit_exceeded'
17179
+ end
17180
+ payload.tags = merged
17181
+ elseif patch.type == 'set_metadata' then
17182
+ payload.metadata = patch.metadata
17183
+ payload.metadataType = patch.metadataType
17184
+ -- Bump the denormalised metadataVersion so an in-flight
17185
+ -- casSetMetadata (optimistic CAS keyed on this counter) sees
17186
+ -- the concurrent write as a version conflict and retries,
17187
+ -- instead of clobbering it under a now-stale expectedVersion.
17188
+ local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0
17189
+ redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1))
17190
+ elseif patch.type == 'set_delay' then
17191
+ payload.delayUntil = patch.delayUntil
17192
+ elseif patch.type == 'mark_cancelled' then
17193
+ payload.cancelledAt = patch.cancelledAt
17194
+ payload.cancelReason = patch.cancelReason
17195
+ else
17196
+ return 'busy'
17197
+ end
17198
+
17199
+ redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
17200
+ return 'applied_to_snapshot'
17201
+ `
17202
+ });
17203
+ this.redis.defineCommand("ackMollifierEntry", {
17204
+ numberOfKeys: 1,
17205
+ lua: `
17206
+ local entryKey = KEYS[1]
17207
+ local graceTtlSeconds = tonumber(ARGV[1])
17208
+
17209
+ -- Guard: never create a partial entry. If the hash is gone between
17210
+ -- pop and ack (concurrent fail or eviction \u2014 QUEUED entries carry
17211
+ -- no TTL), the run is gone, nothing to mark materialised.
17212
+ if redis.call('EXISTS', entryKey) == 0 then
17213
+ return 0
17214
+ end
17215
+
17216
+ -- If the entry was accepted with an idempotency key, the lookup
17217
+ -- string was stored on the hash at accept time. Clear it now \u2014
17218
+ -- PG becomes canonical for the key post-materialisation.
17219
+ local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
17220
+ if lookupKey and lookupKey ~= '' then
17221
+ redis.call('DEL', lookupKey)
17222
+ end
17223
+
17224
+ redis.call('HSET', entryKey, 'materialised', 'true')
17225
+ redis.call('EXPIRE', entryKey, graceTtlSeconds)
17226
+ return 1
16668
17227
  `
16669
17228
  });
16670
17229
  this.redis.defineCommand("failMollifierEntry", {
@@ -16673,16 +17232,43 @@ var MollifierBuffer = class {
16673
17232
  local entryKey = KEYS[1]
16674
17233
  local errorPayload = ARGV[1]
16675
17234
 
16676
- -- Guard: never create a partial entry. If the hash expired between
16677
- -- pop and fail, the run is gone \u2014 nothing to mark FAILED.
17235
+ -- Guard: nothing to mark FAILED if the hash is gone (concurrent
17236
+ -- ack/manual cleanup). Returning 0 lets the caller distinguish
17237
+ -- "marked failed" from "no-op".
16678
17238
  if redis.call('EXISTS', entryKey) == 0 then
16679
17239
  return 0
16680
17240
  end
16681
17241
 
16682
17242
  redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload)
17243
+
17244
+ -- Terminal-failure contract: the drainer's onTerminalFailure
17245
+ -- callback (see MollifierDrainer.processEntry) has been
17246
+ -- invoked before this fail() and has either written a
17247
+ -- SYSTEM_FAILURE PG row (for both non-retryable AND
17248
+ -- max-attempts-exhausted retryable errors) or chosen to fall
17249
+ -- through (genuinely bad snapshot the engine can't materialise
17250
+ -- a row from). Either way the buffer entry is no longer
17251
+ -- load-bearing here. Clear the idempotency lookup -- PG's
17252
+ -- unique constraint is the canonical dedup mechanism
17253
+ -- post-materialise -- and drop the entry hash so failed runs
17254
+ -- don't accrete forever now that there's no accept-time TTL.
17255
+ local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
17256
+ if lookupKey and lookupKey ~= '' then
17257
+ redis.call('DEL', lookupKey)
17258
+ end
17259
+ redis.call('DEL', entryKey)
16683
17260
  return 1
16684
17261
  `
16685
17262
  });
17263
// Registers `delMollifierKeyIfEquals`, a compare-and-delete on one string key.
//   KEYS[1]  the key to delete.
//   ARGV[1]  the value the key is expected to hold.
// The script DELs the key only when GET returns exactly ARGV[1] and replies
// with DEL's result; when the stored value differs, or the key is missing,
// it replies 0 and leaves the key untouched.
+ this.redis.defineCommand("delMollifierKeyIfEquals", {
17264
+ numberOfKeys: 1,
17265
+ lua: `
17266
+ if redis.call('GET', KEYS[1]) == ARGV[1] then
17267
+ return redis.call('DEL', KEYS[1])
17268
+ end
17269
+ return 0
17270
+ `
17271
+ });
16686
17272
  this.redis.defineCommand("mollifierEvaluateTrip", {
16687
17273
  numberOfKeys: 2,
16688
17274
  lua: `
@@ -16710,6 +17296,7 @@ var MollifierBuffer = class {
16710
17296
  var MollifierDrainer = class {
16711
17297
  buffer;
16712
17298
  handler;
17299
+ onTerminalFailure;
16713
17300
  maxAttempts;
16714
17301
  isRetryable;
16715
17302
  pollIntervalMs;
@@ -16727,6 +17314,7 @@ var MollifierDrainer = class {
16727
17314
  constructor(options) {
16728
17315
  this.buffer = options.buffer;
16729
17316
  this.handler = options.handler;
17317
+ this.onTerminalFailure = options.onTerminalFailure;
16730
17318
  this.maxAttempts = options.maxAttempts;
16731
17319
  this.isRetryable = options.isRetryable;
16732
17320
  this.pollIntervalMs = options.pollIntervalMs ?? 100;
@@ -16911,19 +17499,52 @@ var MollifierDrainer = class {
16911
17499
  });
16912
17500
  return "failed";
16913
17501
  }
17502
+ const cause = this.isRetryable(err) ? "max-attempts-exhausted" : "non-retryable";
16914
17503
  const code = err instanceof Error ? err.name : "Unknown";
16915
17504
  const message = err instanceof Error ? err.message : String(err);
17505
+ if (this.onTerminalFailure) {
17506
+ try {
17507
+ await this.onTerminalFailure({
17508
+ runId: entry.runId,
17509
+ envId: entry.envId,
17510
+ orgId: entry.orgId,
17511
+ payload: deserialiseSnapshot(entry.payload),
17512
+ attempts: nextAttempts,
17513
+ createdAt: entry.createdAt,
17514
+ error: { code, message },
17515
+ cause
17516
+ });
17517
+ } catch (writeErr) {
17518
+ if (this.isRetryable(writeErr)) {
17519
+ await this.buffer.requeue(entry.runId);
17520
+ this.logger.warn(
17521
+ "MollifierDrainer: terminal-failure callback retryable; requeued",
17522
+ {
17523
+ runId: entry.runId,
17524
+ attempts: nextAttempts,
17525
+ writeErr
17526
+ }
17527
+ );
17528
+ return "failed";
17529
+ }
17530
+ this.logger.error("MollifierDrainer: terminal-failure callback failed", {
17531
+ runId: entry.runId,
17532
+ writeErr
17533
+ });
17534
+ }
17535
+ }
16916
17536
  await this.buffer.fail(entry.runId, { code, message });
16917
17537
  this.logger.error("MollifierDrainer: terminal failure", {
16918
17538
  runId: entry.runId,
16919
17539
  code,
16920
- message
17540
+ message,
17541
+ cause
16921
17542
  });
16922
17543
  return "failed";
16923
17544
  }
16924
17545
  }
16925
17546
  };
16926
17547
 
16927
- export { BaseScheduler, BatchedSpanManager, BufferEntryError, BufferEntrySchema, BufferEntryStatus, CallbackFairQueueKeyProducer, ConcurrencyManager, CronSchema, CustomRetry, DRRScheduler, DefaultFairQueueKeyProducer, ExponentialBackoffRetry, FairQueue, FairQueueAttributes, FairQueueTelemetry, FixedDelayRetry, ImmediateRetry, LinearBackoffRetry, MasterQueue, MessagingAttributes, MollifierBuffer, MollifierDrainer, NoRetry, NoopScheduler, RoundRobinScheduler, SimpleQueue, TenantDispatch, VisibilityManager, WeightedScheduler, Worker, WorkerQueueManager, createDefaultRetryStrategy, defaultRetryOptions, deserialiseSnapshot, isAbortError, noopTelemetry, serialiseSnapshot };
17548
+ export { BaseScheduler, BatchedSpanManager, BufferEntryError, BufferEntrySchema, BufferEntryStatus, CallbackFairQueueKeyProducer, ConcurrencyManager, CronSchema, CustomRetry, DRRScheduler, DefaultFairQueueKeyProducer, ExponentialBackoffRetry, FairQueue, FairQueueAttributes, FairQueueTelemetry, FixedDelayRetry, ImmediateRetry, LinearBackoffRetry, MasterQueue, MessagingAttributes, MollifierBuffer, MollifierDrainer, NoRetry, NoopScheduler, RoundRobinScheduler, SimpleQueue, TenantDispatch, VisibilityManager, WeightedScheduler, Worker, WorkerQueueManager, createDefaultRetryStrategy, defaultRetryOptions, deserialiseSnapshot, idempotencyLookupKeyFor, isAbortError, makeIdempotencyClaimKey, noopTelemetry, serialiseSnapshot };
16928
17549
  //# sourceMappingURL=index.js.map
16929
17550
  //# sourceMappingURL=index.js.map