@hogsend/engine 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,14 @@
1
- import type { HatchetClient } from "@hatchet-dev/typescript-sdk/v1/index.js";
1
+ import type {
2
+ Conditions,
3
+ HatchetClient,
4
+ } from "@hatchet-dev/typescript-sdk/v1/index.js";
5
+ import {
6
+ Or,
7
+ SleepCondition,
8
+ UserEventCondition,
9
+ } from "@hatchet-dev/typescript-sdk/v1/index.js";
2
10
  import type { DurationObject } from "@hogsend/core";
3
- import { evaluateEventCondition } from "@hogsend/core";
11
+ import { durationToMs, evaluateEventCondition } from "@hogsend/core";
4
12
  import type { JourneyRegistry } from "@hogsend/core/registry";
5
13
  import {
6
14
  isValidTimeZone,
@@ -19,10 +27,32 @@ import type {
19
27
  } from "@hogsend/core/types";
20
28
  import { type Database, emailSends, journeyStates } from "@hogsend/db";
21
29
  import type { PostHogService } from "@hogsend/plugin-posthog";
22
- import { and, count, eq, max } from "drizzle-orm";
30
+ import { and, count, eq, max, notInArray } from "drizzle-orm";
23
31
  import { checkEmailPreferences } from "../lib/enrollment-guards.js";
24
32
  import { ingestEvent } from "../lib/ingestion.js";
25
33
  import type { Logger } from "../lib/logger.js";
34
+ import {
35
+ JOURNEY_EXECUTION_TIMEOUT,
36
+ JOURNEY_EXECUTION_TIMEOUT_HOURS,
37
+ } from "./constants.js";
38
+ import { JourneyExitedError } from "./errors.js";
39
+
40
+ /** Journey statuses that are terminal — a journey in any of these must never be
41
+ * resurrected back to "active" by a wait resuming. Exported so the durable task
42
+ * runner can avoid clobbering a terminal row to "failed" on a cancel. */
43
+ export const TERMINAL_STATUSES = ["completed", "failed", "exited"] as const;
44
+
45
+ /** Upper bound for a `waitForEvent` timeout — the journey task's executionTimeout. */
46
+ const MAX_WAIT_MS = durationToMs({ hours: JOURNEY_EXECUTION_TIMEOUT_HOURS });
47
+
48
+ /**
49
+ * Quote a string as a CEL single-quoted string literal, escaping backslashes
50
+ * then single quotes. Used to embed an externally-supplied userId into a CEL
51
+ * filter expression without breaking it or allowing injection.
52
+ */
53
+ function celStringLiteral(value: string): string {
54
+ return `'${value.replace(/\\/g, "\\\\").replace(/'/g, "\\'")}'`;
55
+ }
26
56
 
27
57
  interface JourneyContextConfig {
28
58
  db: Database;
@@ -31,6 +61,12 @@ interface JourneyContextConfig {
31
61
  // Hatchet's real `sleepFor` accepts a number (milliseconds) in addition to
32
62
  // duration strings/objects; we use the number-ms form for `sleepUntil`.
33
63
  sleepFor: (duration: DurationObject | number) => Promise<unknown>;
64
+ // The forwarded object is the real Hatchet `DurableContext`, which also has
65
+ // `waitFor` (used by `waitForEvent`). Param mirrors the SDK signature so the
66
+ // real context is assignable; we read back the envelope as a plain record.
67
+ waitFor: (
68
+ conditions: Conditions | Conditions[],
69
+ ) => Promise<Record<string, unknown>>;
34
70
  };
35
71
  registry: JourneyRegistry;
36
72
  logger: Logger;
@@ -114,30 +150,103 @@ export function createJourneyContext(
114
150
  defaultSendWindow,
115
151
  } = config;
116
152
 
117
- // Shared wait lifecycle: mark the state "waiting", durably sleep, mark it
118
- // "active" again. `sleep` passes a DurationObject; `sleepUntil` passes a
119
- // precomputed ms delay — Hatchet's `sleepFor` accepts both.
153
+ // Enter a durable wait: flip "active" → "waiting", but ONLY if the journey
154
+ // hasn't already reached a terminal state (e.g. exitOn fired before we got
155
+ // here). A no-op update means the journey is already done — abort the run.
156
+ const enterWait = async (nodeId: string): Promise<void> => {
157
+ const entered = await db
158
+ .update(journeyStates)
159
+ .set({ status: "waiting", currentNodeId: nodeId, updatedAt: new Date() })
160
+ .where(
161
+ and(
162
+ eq(journeyStates.id, stateId),
163
+ notInArray(journeyStates.status, [...TERMINAL_STATUSES]),
164
+ ),
165
+ )
166
+ .returning({ id: journeyStates.id });
167
+
168
+ if (entered.length === 0) {
169
+ throw new JourneyExitedError(stateId);
170
+ }
171
+ };
172
+
173
+ // Resume from a durable wait: flip "waiting" → "active", but ONLY if the row
174
+ // is still "waiting". If an exit/cancel landed during the wait the row is no
175
+ // longer "waiting" — abort instead of reviving a terminated journey to active
176
+ // (which would let a post-wait side effect fire after the journey exited).
177
+ const resumeFromWait = async (): Promise<void> => {
178
+ const resumed = await db
179
+ .update(journeyStates)
180
+ .set({ status: "active", updatedAt: new Date() })
181
+ .where(
182
+ and(eq(journeyStates.id, stateId), eq(journeyStates.status, "waiting")),
183
+ )
184
+ .returning({ id: journeyStates.id });
185
+
186
+ if (resumed.length === 0) {
187
+ throw new JourneyExitedError(stateId);
188
+ }
189
+ };
190
+
191
+ // Durable sleep with the guarded waiting → active lifecycle. `sleep` passes a
192
+ // DurationObject; `sleepUntil` passes a precomputed ms delay — Hatchet's
193
+ // `sleepFor` accepts both.
120
194
  const performSleep = async (
121
195
  durationOrMs: DurationObject | number,
122
196
  nodeId: string,
123
197
  ): Promise<{ sleptAt: string; resumedAt: string }> => {
124
198
  const sleptAt = new Date().toISOString();
199
+ await enterWait(nodeId);
200
+ await hatchetCtx.sleepFor(durationOrMs);
201
+ const resumedAt = new Date().toISOString();
202
+ await resumeFromWait();
203
+ return { sleptAt, resumedAt };
204
+ };
125
205
 
126
- await db
127
- .update(journeyStates)
128
- .set({ status: "waiting", currentNodeId: nodeId, updatedAt: new Date() })
129
- .where(eq(journeyStates.id, stateId));
206
+ // Durably wait for THIS user's `event` OR `timeout`, whichever fires first,
207
+ // sharing the same guarded lifecycle as `performSleep`.
208
+ const performWaitForEvent = async (
209
+ event: string,
210
+ timeout: DurationObject,
211
+ nodeId: string,
212
+ ): Promise<{ timedOut: boolean }> => {
213
+ // Reject a timeout longer than the journey task's executionTimeout up front
214
+ // so it fails fast at authoring time. (Eviction-capable engines may allow
215
+ // longer wall-clock waits, but we cap to the configured ceiling — raise
216
+ // JOURNEY_EXECUTION_TIMEOUT to lift it.)
217
+ if (durationToMs(timeout) > MAX_WAIT_MS) {
218
+ throw new RangeError(
219
+ `waitForEvent timeout exceeds the journey execution limit (${JOURNEY_EXECUTION_TIMEOUT})`,
220
+ );
221
+ }
130
222
 
131
- await hatchetCtx.sleepFor(durationOrMs);
223
+ await enterWait(nodeId);
132
224
 
133
- const resumedAt = new Date().toISOString();
225
+ // Wait for the user-scoped event or the timeout. The event branch filters on
226
+ // the pushed payload's top-level `userId` (see `ingestEvent`); the SDK turns
227
+ // the ms number into a Go duration string at serialization time.
228
+ const result = await hatchetCtx.waitFor(
229
+ Or(
230
+ new UserEventCondition(
231
+ event,
232
+ `input.userId == ${celStringLiteral(userId)}`,
233
+ "event",
234
+ ),
235
+ new SleepCondition(durationToMs(timeout), "timeout"),
236
+ ),
237
+ );
134
238
 
135
- await db
136
- .update(journeyStates)
137
- .set({ status: "active", updatedAt: new Date() })
138
- .where(eq(journeyStates.id, stateId));
239
+ // Discriminate on which branch's readableDataKey ("event"/"timeout") is
240
+ // present. The eviction-capable path returns the `{ CREATE: { … } }`
241
+ // envelope; the pre-eviction path returns the inner object UN-wrapped — so
242
+ // strip an optional `CREATE` layer first to handle both shapes identically.
243
+ const fired = (("CREATE" in result ? result.CREATE : result) ??
244
+ {}) as Record<string, unknown>;
245
+ const timedOut = !("event" in fired);
139
246
 
140
- return { sleptAt, resumedAt };
247
+ await resumeFromWait();
248
+
249
+ return { timedOut };
141
250
  };
142
251
 
143
252
  return {
@@ -169,6 +278,14 @@ export function createJourneyContext(
169
278
  );
170
279
  },
171
280
 
281
+ async waitForEvent({ event, timeout, label }) {
282
+ return performWaitForEvent(
283
+ event,
284
+ timeout,
285
+ label ?? `wait-event:${event}`,
286
+ );
287
+ },
288
+
172
289
  async checkpoint(label) {
173
290
  await db
174
291
  .update(journeyStates)
@@ -76,7 +76,7 @@ export async function ingestEvent(opts: {
76
76
  userEmail: event.userEmail,
77
77
  properties: serializableProperties,
78
78
  }),
79
- checkExits(db, registry, {
79
+ checkExits(db, registry, hatchet, logger, {
80
80
  userId: event.userId,
81
81
  eventName: event.event,
82
82
  properties: event.properties,
@@ -130,6 +130,8 @@ export async function ingestEvent(opts: {
130
130
  async function checkExits(
131
131
  db: Database,
132
132
  registry: JourneyRegistry,
133
+ hatchet: HatchetClient,
134
+ logger: Logger,
133
135
  event: {
134
136
  userId: string;
135
137
  eventName: string;
@@ -147,6 +149,7 @@ async function checkExits(
147
149
  });
148
150
 
149
151
  const statesToExit: string[] = [];
152
+ const runIdsToCancel: string[] = [];
150
153
 
151
154
  for (const state of activeStates) {
152
155
  const journey = registry.get(state.journeyId);
@@ -163,6 +166,9 @@ async function checkExits(
163
166
 
164
167
  if (shouldExit) {
165
168
  statesToExit.push(state.id);
169
+ if (state.hatchetRunId) {
170
+ runIdsToCancel.push(state.hatchetRunId);
171
+ }
166
172
  }
167
173
 
168
174
  results.push({
@@ -181,6 +187,21 @@ async function checkExits(
181
187
  updatedAt: new Date(),
182
188
  })
183
189
  .where(inArray(journeyStates.id, statesToExit));
190
+
191
+ // Cancel the live durable runs so a journey suspended in a sleep or
192
+ // `waitForEvent` can't resume and fire after it has exited. Best-effort: a
193
+ // run may have already finished, and the in-run resume guard
194
+ // (JourneyExitedError) is the backstop if a cancel races a resume.
195
+ if (runIdsToCancel.length > 0) {
196
+ try {
197
+ await hatchet.runs.cancel({ ids: runIdsToCancel });
198
+ } catch (err) {
199
+ logger.warn("Failed to cancel exited journey runs", {
200
+ count: runIdsToCancel.length,
201
+ error: err instanceof Error ? err.message : String(err),
202
+ });
203
+ }
204
+ }
184
205
  }
185
206
 
186
207
  return results;
@@ -10,7 +10,7 @@ const bucketSchema = z.object({
10
10
  enabled: z.boolean(),
11
11
  kind: z.enum(["dynamic", "manual"]),
12
12
  timeBased: z.boolean(),
13
- reentry: z.enum(["once", "once_per_period", "unlimited"]),
13
+ entryLimit: z.enum(["once", "once_per_period", "unlimited"]),
14
14
  counts: z.object({
15
15
  active: z.number(),
16
16
  left: z.number(),
@@ -106,7 +106,7 @@ const getRoute = createRoute({
106
106
  schema: z.object({
107
107
  bucket: bucketSchema.extend({
108
108
  criteria: z.record(z.string(), z.unknown()).optional(),
109
- reentryPeriod: z
109
+ entryPeriod: z
110
110
  .record(z.string(), z.unknown())
111
111
  .nullable()
112
112
  .optional(),
@@ -267,7 +267,7 @@ export const bucketsRouter = new OpenAPIHono<AppEnv>()
267
267
  enabled: effectiveEnabled,
268
268
  kind: b.kind ?? "dynamic",
269
269
  timeBased: b.timeBased ?? false,
270
- reentry: b.reentry ?? "unlimited",
270
+ entryLimit: b.entryLimit ?? "unlimited",
271
271
  counts: countsMap.get(b.id) ?? { ...emptyCounts },
272
272
  };
273
273
  });
@@ -366,11 +366,9 @@ export const bucketsRouter = new OpenAPIHono<AppEnv>()
366
366
  enabled: effectiveEnabled,
367
367
  kind: meta.kind ?? "dynamic",
368
368
  timeBased: meta.timeBased ?? false,
369
- reentry: meta.reentry ?? "unlimited",
369
+ entryLimit: meta.entryLimit ?? "unlimited",
370
370
  criteria: meta.criteria as Record<string, unknown> | undefined,
371
- reentryPeriod: meta.reentryPeriod as
372
- | Record<string, unknown>
373
- | undefined,
371
+ entryPeriod: meta.entryPeriod as Record<string, unknown> | undefined,
374
372
  minDwell: meta.minDwell as Record<string, unknown> | undefined,
375
373
  maxDwell: meta.maxDwell as Record<string, unknown> | undefined,
376
374
  reconcileEvery: meta.reconcileEvery as
package/src/worker.ts CHANGED
@@ -79,12 +79,12 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
79
79
  `Hogsend worker started with ${journeyTasks.length} journey task(s)`,
80
80
  );
81
81
 
82
- await _worker.start();
83
-
84
82
  // Boot-time backfill / criteria-change re-eval (Section 6.6 B): diff each
85
- // enabled bucket's criteriaHash against bucket_configs and enqueue a
86
- // backfill/re-eval job where it differs. Best-effort — never block worker
87
- // start; the cron is the backstop for time-based leaves regardless.
83
+ // enabled bucket's criteriaHash against bucket_configs and trigger a
84
+ // backfill/re-eval run where it differs. Kicked off BEFORE the listener
85
+ // because `_worker.start()` below does NOT return until the worker stops —
86
+ // anything after it is dead code at runtime. The triggers are fire-and-forget
87
+ // (runNoWait) and execute once the listener is up; best-effort, never blocks.
88
88
  enqueueBucketBackfills({
89
89
  db: container.db,
90
90
  logger: container.logger,
@@ -93,6 +93,8 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
93
93
  error: err instanceof Error ? err.message : String(err),
94
94
  });
95
95
  });
96
+
97
+ await _worker.start();
96
98
  }
97
99
 
98
100
  return { start, stop };
@@ -3,7 +3,6 @@ import type { JsonObject } from "@hatchet-dev/typescript-sdk/v1/types.js";
3
3
  import {
4
4
  type BucketMeta,
5
5
  type ConditionEval,
6
- type DurationObject,
7
6
  durationToMs,
8
7
  evaluateCondition,
9
8
  } from "@hogsend/core";
@@ -16,7 +15,12 @@ import {
16
15
  importJobs,
17
16
  userEvents,
18
17
  } from "@hogsend/db";
19
- import { and, eq, gte, inArray, isNull, sql } from "drizzle-orm";
18
+ import { and, eq, gt, gte, inArray, isNull, sql } from "drizzle-orm";
19
+ import {
20
+ computeExpiresAt,
21
+ computeMaxDwellAt,
22
+ matchesEventCount,
23
+ } from "../buckets/membership-epoch.js";
20
24
  import { getBucketRegistrySingleton } from "../buckets/registry-singleton.js";
21
25
  import { getJourneyRegistrySingleton } from "../journeys/registry-singleton.js";
22
26
  import { emitBucketTransition } from "../lib/bucket-emit.js";
@@ -28,7 +32,7 @@ import { createLogger } from "../lib/logger.js";
28
32
  const BATCH_SIZE = 500;
29
33
 
30
34
  /** import_jobs.format discriminator for the reused status record (Section 6.6). */
31
- const FIRST_TIME_FORMAT = "bucket-backfill";
35
+ export const FIRST_TIME_FORMAT = "bucket-backfill";
32
36
  const REEVAL_FORMAT = "bucket-reeval";
33
37
 
34
38
  /**
@@ -199,6 +203,18 @@ async function backfillJoins(opts: {
199
203
  .set({ totalRows: matcherIds.length, updatedAt: new Date() })
200
204
  .where(eq(importJobs.id, jobId));
201
205
 
206
+ // Unconditional max-dwell TTL deadline, stamped once at insert (mirrors the
207
+ // live join, check-membership.ts). null when the bucket has no maxDwell; the
208
+ // TTL sweep (reconcileBucketTtlLeaves) filters isNotNull(maxDwellAt), so an
209
+ // unset value would never be force-left.
210
+ const maxDwellAt = computeMaxDwellAt(bucket);
211
+
212
+ // Fix C (DEFERRED): backfilled fastExpiry rows are NOT armed with a
213
+ // bucket:arm-expiry durable timer here — they are picked up by the next cron
214
+ // sweep instead (reconcileBucketLeaves / reconcileBucketTtlLeaves are the
215
+ // authoritative backstop). Conscious choice (cron cadence, default 5m), not an
216
+ // omission: arming at backfill would fan out one durable task per inserted row.
217
+
202
218
  let inserted = 0;
203
219
  for (let i = 0; i < matcherIds.length; i += BATCH_SIZE) {
204
220
  const chunk = matcherIds.slice(i, i + BATCH_SIZE);
@@ -214,14 +230,38 @@ async function backfillJoins(opts: {
214
230
  chunkContacts.map((c) => [c.externalId, c.email]),
215
231
  );
216
232
 
233
+ // Fix A: entryCount = 1 + prior memberships for each (user, bucket), the
234
+ // same monotonic ordinal the live join computes (check-membership.ts). On a
235
+ // FIRST-TIME backfill priorCount is 0 → entryCount 1 (unchanged); on a
236
+ // REEVAL re-join of a user with historical "left" rows it advances the
237
+ // epoch correctly. ONE batched GROUP BY per chunk (never per-user — the set-
238
+ // based path must not reintroduce the O(P) serial-query trap).
239
+ const priorCounts = await db
240
+ .select({
241
+ userId: bucketMemberships.userId,
242
+ cnt: sql<number>`count(*)::int`,
243
+ })
244
+ .from(bucketMemberships)
245
+ .where(
246
+ and(
247
+ eq(bucketMemberships.bucketId, bucket.id),
248
+ inArray(bucketMemberships.userId, chunk),
249
+ ),
250
+ )
251
+ .groupBy(bucketMemberships.userId);
252
+ const priorByUser = new Map(
253
+ priorCounts.map((r) => [r.userId, Number(r.cnt)]),
254
+ );
255
+
217
256
  const rows = chunk.map((userId) => ({
218
257
  userId,
219
258
  userEmail: emailByUser.get(userId) ?? null,
220
259
  bucketId: bucket.id,
221
260
  status: "active" as const,
222
261
  source: "backfill" as const,
223
- entryCount: 1,
224
- expiresAt: computeBackfillExpiresAt(bucket),
262
+ entryCount: 1 + (priorByUser.get(userId) ?? 0),
263
+ expiresAt: computeExpiresAt(bucket),
264
+ maxDwellAt,
225
265
  lastEvaluatedAt: new Date(),
226
266
  }));
227
267
 
@@ -343,8 +383,22 @@ async function selectEventMatchers(
343
383
  : null;
344
384
 
345
385
  // count gte N / exists → SELECT user_id ... GROUP BY HAVING. not_exists
346
- // (absence) → live contacts with NO such event in the window (anti-join).
386
+ // (absence) → live contacts who EVER fired the event but have NONE in the
387
+ // window (lapsed-only). A bare windowed `not_exists within W` is treated as
388
+ // LAPSED-ONLY (never-active EXCLUDED) in BOTH this backfill and the cron
389
+ // (bucket-reconcile.ts reconcileBucketJoins, the everFired floor), so the two
390
+ // writers agree: brand-new never-active signups are NOT materialized for an
391
+ // absence-within-window bucket — only users who once did X and then stopped.
347
392
  if (criteria.check === "not_exists") {
393
+ // everFired floor: contacts who fired the event AT LEAST ONCE (no window),
394
+ // mirroring the cron's `ever_fired` semi-join. Excludes never-active
395
+ // contacts so the two writers select the same lapsed-only cohort.
396
+ const everFired = db
397
+ .selectDistinct({ userId: userEvents.userId })
398
+ .from(userEvents)
399
+ .where(eq(userEvents.event, criteria.eventName))
400
+ .as("ever_fired");
401
+
348
402
  const present = db
349
403
  .select({ userId: userEvents.userId })
350
404
  .from(userEvents)
@@ -360,115 +414,94 @@ async function selectEventMatchers(
360
414
  const rows = await db
361
415
  .select({ userId: contacts.externalId })
362
416
  .from(contacts)
417
+ .innerJoin(everFired, eq(everFired.userId, contacts.externalId))
363
418
  .leftJoin(present, eq(present.userId, contacts.externalId))
364
419
  .where(and(isNull(contacts.deletedAt), isNull(present.userId)));
365
420
  return rows.map((r) => r.userId);
366
421
  }
367
422
 
368
- // exists / count: group counts then filter by the operator.
423
+ // exists / count: group counts then filter by the operator. Fix B: innerJoin
424
+ // live contacts (GDPR — only materialize memberships for non-deleted contacts
425
+ // that actually exist), mirroring selectEventLeavers in bucket-reconcile.ts.
426
+ // The not_exists branch above already filters contacts.deletedAt; without this
427
+ // join the positive-event path could materialize active rows for soft-deleted
428
+ // or orphan-event userIds, diverging from the live/reconcile paths.
369
429
  const rows = await db
370
430
  .select({
371
431
  userId: userEvents.userId,
372
432
  cnt: sql<number>`count(*)::int`,
373
433
  })
374
434
  .from(userEvents)
435
+ .innerJoin(contacts, eq(contacts.externalId, userEvents.userId))
375
436
  .where(
376
437
  and(
377
438
  eq(userEvents.event, criteria.eventName),
439
+ isNull(contacts.deletedAt),
378
440
  cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
379
441
  ),
380
442
  )
381
443
  .groupBy(userEvents.userId);
382
444
 
383
445
  return rows
384
- .filter((r) => matchesCount(criteria, Number(r.cnt)))
446
+ .filter((r) => matchesEventCount(criteria, Number(r.cnt)))
385
447
  .map((r) => r.userId);
386
448
  }
387
449
 
388
- /** True when a windowed count satisfies the (exists/count) criterion. */
389
- function matchesCount(
390
- criteria: Extract<ConditionEval, { type: "event" }>,
391
- count: number,
392
- ): boolean {
393
- switch (criteria.check) {
394
- case "exists":
395
- return count > 0;
396
- case "count": {
397
- if (!criteria.operator || criteria.value === undefined) return count > 0;
398
- switch (criteria.operator) {
399
- case "gt":
400
- return count > criteria.value;
401
- case "gte":
402
- return count >= criteria.value;
403
- case "lt":
404
- return count < criteria.value;
405
- case "lte":
406
- return count <= criteria.value;
407
- case "eq":
408
- return count === criteria.value;
409
- default:
410
- return false;
411
- }
412
- }
413
- default:
414
- return false;
415
- }
416
- }
417
-
418
450
  /**
419
451
  * Composite/multi-condition fallback (the documented O(P) exception, Section 6.6):
420
- * a chunked per-contact `evaluateCondition` loop over live contacts. Property
452
+ * a per-contact `evaluateCondition` loop over live contacts. Property
421
453
  * sub-conditions evaluate against the contact's merged properties.
454
+ *
455
+ * KEYSET PAGINATION by `contacts.externalId` in BATCH_SIZE pages (mirrors
456
+ * reconcileBucketJoins' `externalId asc` paging): each page selects
457
+ * `WHERE externalId > :cursor ORDER BY externalId ASC LIMIT BATCH_SIZE`,
458
+ * evaluates the criteria per contact, then advances the cursor to the last
459
+ * externalId of the page — repeating until a short page ends the scan. The whole
460
+ * contacts table is never held in memory at once.
422
461
  */
423
462
  async function selectCompositeMatchers(
424
463
  db: Database,
425
464
  criteria: ConditionEval,
426
465
  ): Promise<string[]> {
427
- const liveContacts = await db
428
- .select({
429
- externalId: contacts.externalId,
430
- properties: contacts.properties,
431
- })
432
- .from(contacts)
433
- .where(isNull(contacts.deletedAt));
434
-
435
466
  const matchers: string[] = [];
436
- for (const contact of liveContacts) {
437
- const isMember = await evaluateCondition({
438
- condition: criteria,
439
- ctx: {
440
- db,
441
- userId: contact.externalId,
442
- journeyContext:
443
- (contact.properties as Record<string, unknown> | null) ?? {},
444
- },
445
- });
446
- if (isMember) matchers.push(contact.externalId);
447
- }
448
- return matchers;
449
- }
467
+ let cursor: string | null = null;
450
468
 
451
- /** now + within for time-based / fastExpiry buckets; null otherwise. */
452
- function computeBackfillExpiresAt(bucket: BucketMeta): Date | null {
453
- if (!bucket.criteria) return null;
454
- if (!bucket.timeBased && !bucket.fastExpiry) return null;
455
- const within = firstWithin(bucket.criteria);
456
- if (!within) return null;
457
- return new Date(Date.now() + durationToMs(within));
458
- }
469
+ for (;;) {
470
+ const page = await db
471
+ .select({
472
+ externalId: contacts.externalId,
473
+ properties: contacts.properties,
474
+ })
475
+ .from(contacts)
476
+ .where(
477
+ and(
478
+ isNull(contacts.deletedAt),
479
+ cursor != null ? gt(contacts.externalId, cursor) : undefined,
480
+ ),
481
+ )
482
+ .orderBy(sql`${contacts.externalId} asc`)
483
+ .limit(BATCH_SIZE);
459
484
 
460
- /** Find the first EventCondition.within in a criteria tree (depth-first). */
461
- function firstWithin(criteria: ConditionEval): DurationObject | null {
462
- if (criteria.type === "event" && criteria.within) {
463
- return criteria.within;
464
- }
465
- if (criteria.type === "composite") {
466
- for (const child of criteria.conditions) {
467
- const found = firstWithin(child);
468
- if (found) return found;
485
+ for (const contact of page) {
486
+ const isMember = await evaluateCondition({
487
+ condition: criteria,
488
+ ctx: {
489
+ db,
490
+ userId: contact.externalId,
491
+ journeyContext:
492
+ (contact.properties as Record<string, unknown> | null) ?? {},
493
+ },
494
+ });
495
+ if (isMember) matchers.push(contact.externalId);
469
496
  }
497
+
498
+ // A short page (fewer than a full batch) means the scan is exhausted.
499
+ if (page.length < BATCH_SIZE) break;
500
+ cursor = page[page.length - 1]?.externalId ?? null;
501
+ if (cursor == null) break;
470
502
  }
471
- return null;
503
+
504
+ return matchers;
472
505
  }
473
506
 
474
507
  /**
@@ -535,7 +568,11 @@ export async function enqueueBucketBackfills(opts: {
535
568
 
536
569
  if (!job) continue;
537
570
 
538
- await bucketBackfillTask.run({
571
+ // runNoWait (fire-and-forget): this is called from worker boot BEFORE the
572
+ // listener starts, so awaiting the run would deadlock (the run needs the
573
+ // listener that `_worker.start()` brings up). The triggered run queues and
574
+ // executes once listening; the task itself persists the criteriaHash.
575
+ await bucketBackfillTask.runNoWait({
539
576
  jobId: job.id,
540
577
  bucketId: bucket.id,
541
578
  mode,