@hogsend/engine 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1010 @@
1
+ import { ConcurrencyLimitStrategy } from "@hatchet-dev/typescript-sdk/v1/index.js";
2
+ import type { JsonObject } from "@hatchet-dev/typescript-sdk/v1/types.js";
3
+ import {
4
+ type BucketMeta,
5
+ type ConditionEval,
6
+ collectPropertyNames,
7
+ durationToMs,
8
+ evaluateCondition,
9
+ } from "@hogsend/core";
10
+ import {
11
+ bucketConfigs,
12
+ bucketMemberships,
13
+ contacts,
14
+ createDatabase,
15
+ type Database,
16
+ importJobs,
17
+ userEvents,
18
+ } from "@hogsend/db";
19
+ import {
20
+ and,
21
+ eq,
22
+ gte,
23
+ inArray,
24
+ isNotNull,
25
+ isNull,
26
+ lte,
27
+ or,
28
+ sql,
29
+ } from "drizzle-orm";
30
+ import { shouldEmitJoin } from "../buckets/check-membership.js";
31
+ import {
32
+ BUCKET_EVENT_PREFIX,
33
+ computeExpiresAt,
34
+ computeMaxDwellAt,
35
+ countPriorMemberships,
36
+ firstWithin,
37
+ shouldLeaveByCount,
38
+ } from "../buckets/membership-epoch.js";
39
+ import { getBucketRegistrySingleton } from "../buckets/registry-singleton.js";
40
+ import { getJourneyRegistrySingleton } from "../journeys/registry-singleton.js";
41
+ import { emitBucketTransition } from "../lib/bucket-emit.js";
42
+ import { hatchet } from "../lib/hatchet.js";
43
+ import type { Logger } from "../lib/logger.js";
44
+ import { createLogger } from "../lib/logger.js";
45
+ import { FIRST_TIME_FORMAT } from "./bucket-backfill.js";
46
+
47
+ /** Per-tick row cap for the bounded reconcile paths: the composite per-member leave re-evaluation and the absence-join candidate page (Section 6.4). */
48
+ const BATCH_SIZE = 500;
49
+
/**
 * Cron-scheduled sweep that reconciles bucket memberships the clock can change
 * without any inbound event (Section 6.4).
 *
 * A rolling `EventCondition.within` window can move a user out of a bucket (or,
 * for absence shapes, into one) purely because time passed, which the real-time
 * ingest path cannot observe. On every tick this task visits each enabled,
 * non-manual bucket that has criteria and is either time-based or carries a
 * `maxDwell`, and runs, in this order:
 *
 *   1. criteria-driven leaves (`reconcileBucketLeaves`) — time-based buckets only;
 *   2. absence joins (`reconcileBucketJoins`) — time-based buckets only, and only
 *      when `shouldReconcileJoins(bucket)` says so;
 *   3. unconditional `maxDwell` TTL leaves (`reconcileBucketTtlLeaves`) — any
 *      bucket with a `maxDwell`, whether or not its criteria still match.
 *
 * `db` and `logger` are built from `process.env` on each run because a cron run
 * has no request container; both registries come from the process singletons.
 *
 * Leave emission is gated on the rows returned by the bulk compare-and-swap (see
 * `bulkLeave`), so the loser of a concurrent race flips nothing and emits
 * nothing. Per the design notes (Section 6.3) the emission reuses the real-time
 * path's deterministic idempotency key, so ingest and cron converge on one emit.
 *
 * Concurrency is deliberately NON-cancelling: one static group with `maxRuns: 1`
 * and `GROUP_ROUND_ROBIN`, so a tick that overlaps a slow sweep queues behind it
 * instead of cancelling it (a cancelled sweep would leave members stuck active).
 *
 * A failure inside one bucket is logged and the sweep continues with the next
 * bucket; counts accumulated before the failure are kept.
 *
 * @returns `{ reconciled, joined }` — leave transitions (criteria + TTL) and join
 *   transitions performed during this sweep.
 */
export const bucketReconcileTask = hatchet.task({
  name: "bucket-reconcile",
  onCrons: [process.env.BUCKET_RECONCILE_CRON ?? "*/5 * * * *"],
  retries: 1,
  executionTimeout: "120s",
  concurrency: {
    // One static concurrency group with a single slot: an overlapping tick waits
    // its turn (round robin) rather than cancelling the run already in flight.
    expression: "'bucket-reconcile'",
    maxRuns: 1,
    limitStrategy: ConcurrencyLimitStrategy.GROUP_ROUND_ROBIN,
  },
  fn: async () => {
    const { db } = createDatabase({ url: process.env.DATABASE_URL ?? "" });
    const logger = createLogger(process.env.LOG_LEVEL ?? "info");
    const registry = getBucketRegistrySingleton();
    const journeyRegistry = getJourneyRegistrySingleton();

    let leaveCount = 0;
    let joinCount = 0;

    for (const bucket of registry.getEnabled()) {
      // Manual buckets are never auto-recomputed; criteria-less buckets have
      // nothing to evaluate.
      if (bucket.kind === "manual") continue;
      if (!bucket.criteria) continue;

      // Only buckets a clock can flip are swept: a time-based criteria window
      // and/or an unconditional maxDwell TTL.
      const clockDriven = isTimeBased(bucket);
      const hasTtl = Boolean(bucket.maxDwell);
      if (!clockDriven && !hasTtl) continue;

      const sweep = { db, logger, journeyRegistry, bucket };
      try {
        if (clockDriven) {
          leaveCount += await reconcileBucketLeaves(sweep);

          // Absence joins (e.g. went-dormant) are invisible to the real-time
          // path; they run only when the bucket opts in or the shape is one the
          // engine infers as safe.
          if (shouldReconcileJoins(bucket)) {
            joinCount += await reconcileBucketJoins(sweep);
          }
        }

        // Force-leave members past their max-dwell deadline regardless of
        // whether the criteria still match.
        if (hasTtl) {
          leaveCount += await reconcileBucketTtlLeaves(sweep);
        }
      } catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        logger.error("Bucket reconcile failed", { bucketId: bucket.id, error });
      }
    }

    logger.info("Bucket reconcile sweep complete", {
      reconciled: leaveCount,
      joined: joinCount,
    });
    return { reconciled: leaveCount, joined: joinCount };
  },
});
160
+
/**
 * Event payload that arms the optional per-user fast-expiry durable timer
 * (`bucketExpiryTask`, Section 6.5).
 *
 * The timer is armed on JOIN for `meta.fastExpiry` buckets. It durably sleeps
 * until the membership's `expiresAt` deadline and then leaves through ONE atomic
 * compare-and-swap keyed on the ARMED `expiresAt` — it never reads then acts. If
 * a concurrent real-time event re-armed the window (a new `expiresAt`), the CAS
 * matches zero rows and the stale timer does nothing, so no spurious
 * `bucket:left` is emitted. The cron sweep remains the authoritative backstop
 * for timers lost to worker churn.
 *
 * There is a single SHARED durable task listening on `bucket:arm-expiry`; which
 * bucket/user a timer belongs to is carried in this payload rather than in
 * per-bucket task instances. It is registered once by `selectBucketTasks` when
 * ANY enabled bucket opts in (Section 9.4), and the `bucket:`-prefixed event
 * name is recursion-guarded by `checkBucketMembership`.
 */
export interface BucketArmExpiryInput extends JsonObject {
  /** Id of the membership row the timer was armed for; the leave CAS targets it. */
  rowId: string;
  /** Id of the bucket, used to look the bucket up in the registry on wake. */
  bucketId: string;
  /** User the membership belongs to. */
  userId: string;
  /** User e-mail forwarded to the `bucket:left` emission, when known. */
  userEmail: string | null;
  /** ISO timestamp of the armed deadline — the CAS epoch. */
  armedExpiresAt: string;
  /** Milliseconds from arming to the deadline (the durable sleep length). */
  msUntilExpiry: number;
}
185
+
/**
 * Fast-expiry durable timer for ONE membership row, triggered by the
 * `bucket:arm-expiry` event.
 *
 * Flow: durably sleep `msUntilExpiry`, then on wake
 *   1. look the bucket up again — an unregistered or criteria-less bucket is
 *      skipped;
 *   2. re-evaluate the criteria for the user (loading stored contact properties
 *      only when the criteria reference any) — a user who still qualifies is
 *      skipped;
 *   3. flip the row to `left` with a single compare-and-swap that requires the
 *      row to be `active`, not soft-deleted, and to still carry the ARMED
 *      `expiresAt`. A re-armed window, a concurrent leave, or a soft-deleted
 *      row matches nothing;
 *   4. emit `bucket:left` only when the CAS actually returned a row.
 *
 * @param input Payload supplied when the timer was armed.
 * @param ctx Hatchet durable context; only `sleepFor` is used.
 * @returns `{ status: "skipped", reason }` when nothing was changed, or
 *   `{ status: "left", rowId }` when the membership was left and emitted.
 */
export const bucketExpiryTask = hatchet.durableTask({
  name: "bucket-expiry",
  onEvents: [`${BUCKET_EVENT_PREFIX}arm-expiry`],
  retries: 0,
  fn: async (input: BucketArmExpiryInput, ctx) => {
    const { db } = createDatabase({ url: process.env.DATABASE_URL ?? "" });
    const logger = createLogger(process.env.LOG_LEVEL ?? "info");
    const registry = getBucketRegistrySingleton();
    const journeyRegistry = getJourneyRegistrySingleton();

    // Durable sleep to the deadline. Hatchet's sleepFor accepts a ms number.
    await ctx.sleepFor(input.msUntilExpiry);

    const bucket = registry.get(input.bucketId);
    if (!bucket?.criteria) {
      return { status: "skipped", reason: "bucket_unregistered" };
    }

    // Re-confirm on wake that the criteria really say "should leave". Stored
    // contact properties are loaded only when a property leg needs them, so
    // property predicates see the same merged state as the real-time path
    // instead of evaluating against undefined.
    const journeyContext =
      collectPropertyNames(bucket.criteria).length > 0
        ? await loadContactProperties(db, input.userId)
        : {};
    const stillMember = await evaluateCondition({
      condition: bucket.criteria,
      ctx: { db, userId: input.userId, journeyContext },
    });
    if (stillMember) {
      return { status: "skipped", reason: "still_member" };
    }

    // One timestamp for the whole transition so leftAt / lastEvaluatedAt /
    // updatedAt agree exactly.
    const now = new Date();

    // SINGLE atomic CAS keyed on the ARMED expiresAt — a re-armed window (new
    // expiresAt) makes this match zero rows → no spurious leave (Section 6.5).
    // Soft-deleted rows are excluded, matching the guard every cron leave path
    // in this module applies.
    const left = await db
      .update(bucketMemberships)
      .set({
        status: "left",
        leftAt: now,
        lastEvaluatedAt: now,
        updatedAt: now,
      })
      .where(
        and(
          eq(bucketMemberships.id, input.rowId),
          eq(bucketMemberships.status, "active"),
          isNull(bucketMemberships.deletedAt),
          eq(bucketMemberships.expiresAt, new Date(input.armedExpiresAt)),
        ),
      )
      .returning({
        id: bucketMemberships.id,
        entryCount: bucketMemberships.entryCount,
      });

    const flipped = left[0];
    if (!flipped) {
      return { status: "skipped", reason: "re_armed_or_already_left" };
    }

    await emitBucketTransition({
      db,
      registry: journeyRegistry,
      hatchet,
      logger,
      kind: "left",
      bucket,
      userId: input.userId,
      userEmail: input.userEmail,
      epoch: flipped.entryCount,
      source: "reconcile",
    });

    return { status: "left", rowId: flipped.id };
  },
});
263
+
/**
 * Criteria-driven leave reconciliation for one time-based bucket.
 *
 * Dispatches on the criteria shape:
 *   - a single `event` criterion is answered by the set-based SHOULD-LEAVE query
 *     (`selectEventLeavers`), which is the authoritative evaluation — no
 *     per-member `evaluateCondition` runs;
 *   - anything else (composite / multi-condition) goes through the chunked
 *     per-member fallback (`reconcileCompositeLeaves`, Section 6.4).
 *
 * Leavers found by the set-based path are flipped and emitted by `bulkLeave`.
 *
 * @returns number of memberships that were flipped to `left`.
 */
async function reconcileBucketLeaves(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;
  const criteria = bucket.criteria as ConditionEval;

  // Composite shapes cannot be expressed as one counting query; hand them to
  // the O(active members) per-member fallback.
  if (criteria.type !== "event") {
    return reconcileCompositeLeaves({ db, logger, journeyRegistry, bucket });
  }

  // Single-event / within / count criterion → one set-based query.
  const leaverIds = await selectEventLeavers(db, bucket, criteria);
  if (leaverIds.length === 0) {
    return 0;
  }

  return bulkLeave({ db, logger, journeyRegistry, bucket, userIds: leaverIds });
}
297
+
/**
 * Computes the SHOULD-LEAVE user ids for a single-event time-based criterion.
 *
 * Rather than a bare `NOT EXISTS` (which is wrong for count/exists shapes —
 * Section 6.4), it counts the criterion's event per active member inside the
 * rolling window (no window → all time) and lets `shouldLeaveByCount` decide per
 * member according to the criterion's own shape. Only members whose contact row
 * exists and is not soft-deleted are considered (GDPR — Section 8.6).
 *
 * @param db Database handle.
 * @param bucket Bucket whose active members are inspected.
 * @param criteria The bucket's single `event` criterion.
 * @returns user ids of active members that should leave.
 */
async function selectEventLeavers(
  db: Database,
  bucket: BucketMeta,
  criteria: Extract<ConditionEval, { type: "event" }>,
): Promise<string[]> {
  // Lower bound of the rolling window, or null when the criterion is unwindowed.
  let windowStart: Date | null = null;
  if (criteria.within) {
    windowStart = new Date(Date.now() - durationToMs(criteria.within));
  }

  // Subquery: this bucket's active, non-deleted membership rows.
  const activeRows = db
    .select({ userId: bucketMemberships.userId })
    .from(bucketMemberships)
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
      ),
    )
    .as("members");

  // Subquery: per-user count of the criterion's event inside the window.
  const inWindow =
    windowStart === null ? undefined : gte(userEvents.occurredAt, windowStart);
  const eventCounts = db
    .select({
      userId: userEvents.userId,
      cnt: sql<number>`count(*)::int`.as("cnt"),
    })
    .from(userEvents)
    .where(and(eq(userEvents.event, criteria.eventName), inWindow))
    .groupBy(userEvents.userId)
    .as("counted");

  // Members LEFT JOIN counts: a member with no matching events gets 0. The
  // inner join on contacts keeps only members with a live contact row.
  const rows = await db
    .select({
      userId: activeRows.userId,
      cnt: sql<number>`coalesce(${eventCounts.cnt}, 0)`,
    })
    .from(activeRows)
    .leftJoin(eventCounts, eq(activeRows.userId, eventCounts.userId))
    .innerJoin(contacts, eq(contacts.externalId, activeRows.userId))
    .where(isNull(contacts.deletedAt));

  const leavers: string[] = [];
  for (const row of rows) {
    if (shouldLeaveByCount(criteria, Number(row.cnt))) {
      leavers.push(row.userId);
    }
  }
  return leavers;
}
357
+
/**
 * Per-member leave fallback for composite / multi-condition time-based buckets.
 *
 * Each tick takes one page of at most `BATCH_SIZE` active members, oldest
 * `lastEvaluatedAt` first (never-evaluated rows lead), re-runs
 * `evaluateCondition` for each of them one after another, and collects those
 * that no longer qualify. The whole page — leavers and stable members alike —
 * then gets a fresh `lastEvaluatedAt`, which is what moves the cursor forward on
 * the next tick (Section 6.4). Leavers go through `bulkLeave`.
 *
 * Contact properties are fed to the evaluation only when the criteria reference
 * at least one property, so property legs see the same merged contact state the
 * real-time path reads.
 *
 * @returns number of memberships that were flipped to `left`.
 */
async function reconcileCompositeLeaves(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;
  const criteria = bucket.criteria as ConditionEval;
  const usesProperties = collectPropertyNames(criteria).length > 0;

  // One page of active members with a live contact, least recently evaluated first.
  const page = await db
    .select({
      userId: bucketMemberships.userId,
      properties: contacts.properties,
    })
    .from(bucketMemberships)
    .innerJoin(contacts, eq(contacts.externalId, bucketMemberships.userId))
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
        isNull(contacts.deletedAt),
      ),
    )
    .orderBy(sql`${bucketMemberships.lastEvaluatedAt} asc nulls first`)
    .limit(BATCH_SIZE);

  // Nothing to evaluate means nothing to stamp and nothing to leave.
  if (page.length === 0) {
    return 0;
  }

  const leaverIds: string[] = [];
  for (const { userId, properties } of page) {
    let journeyContext: Record<string, unknown> = {};
    if (usesProperties) {
      journeyContext = (properties as Record<string, unknown> | null) ?? {};
    }
    const stillQualifies = await evaluateCondition({
      condition: criteria,
      ctx: { db, userId, journeyContext },
    });
    if (!stillQualifies) {
      leaverIds.push(userId);
    }
  }

  // Stamp every evaluated member (stable ones included) so the next tick's
  // ordering starts after this page.
  const evaluatedIds = page.map((member) => member.userId);
  await db
    .update(bucketMemberships)
    .set({ lastEvaluatedAt: new Date() })
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        inArray(bucketMemberships.userId, evaluatedIds),
      ),
    );

  if (leaverIds.length === 0) {
    return 0;
  }
  return bulkLeave({ db, logger, journeyRegistry, bucket, userIds: leaverIds });
}
426
+
/**
 * Unconditional max-dwell TTL leave for one bucket (per-bucket `maxDwell`).
 *
 * Picks every active, non-deleted membership of the bucket whose `maxDwellAt`
 * is set and already in the past, restricted to contacts that are not
 * soft-deleted (GDPR), and force-leaves them through the shared `bulkLeave`
 * compare-and-swap. Unlike the criteria SHOULD-LEAVE paths, the criteria are NOT
 * re-evaluated here. `bucket:left` is emitted by `bulkLeave`; whether the user
 * may re-join afterwards is decided by the bucket's `entryLimit` policy on their
 * next qualifying event.
 *
 * @returns number of memberships that were flipped to `left`.
 */
async function reconcileBucketTtlLeaves(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;

  const overdue = await db
    .select({ userId: bucketMemberships.userId })
    .from(bucketMemberships)
    .innerJoin(contacts, eq(contacts.externalId, bucketMemberships.userId))
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
        // Only rows that actually carry a deadline, and whose deadline passed.
        isNotNull(bucketMemberships.maxDwellAt),
        lte(bucketMemberships.maxDwellAt, new Date()),
        isNull(contacts.deletedAt),
      ),
    );

  if (overdue.length === 0) {
    return 0;
  }

  const userIds = overdue.map((row) => row.userId);
  return bulkLeave({ db, logger, journeyRegistry, bucket, userIds });
}
467
+
/**
 * Maximum number of user ids bound into one leave UPDATE. The leaver set handed
 * to `bulkLeave` is not paged by its callers, and PostgreSQL caps a single
 * statement at 65,535 bind parameters, so the set is processed in slices.
 */
const BULK_LEAVE_CHUNK_SIZE = 1000;

/**
 * Bulk compare-and-swap a set of active members to `left`, then emit
 * `bucket:left` for each row the UPDATE actually flipped.
 *
 * - The CAS guard (`status = 'active'`, not soft-deleted) means a concurrent
 *   leave makes the UPDATE affect zero of those rows; emission is gated on
 *   RETURNING, so the loser of a race never emits (Section 6.3).
 * - minDwell defers: a row whose `enteredAt` is newer than `now - minDwell` is
 *   not flipped here and stays active for a later tick.
 * - `userIds` is processed in slices of `BULK_LEAVE_CHUNK_SIZE`; each slice is
 *   flipped and emitted before the next one is touched, and all columns of one
 *   slice share a single timestamp.
 * - A failed emission is logged and does NOT stop the remaining rows: those rows
 *   are already `left`, so skipping their emission would lose it for good.
 *
 * @param opts.userIds Candidate leavers; an empty list is a no-op.
 * @returns number of memberships that were flipped to `left`.
 */
async function bulkLeave(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
  userIds: string[];
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket, userIds } = opts;
  if (userIds.length === 0) return 0;

  const dwellMs = bucket.minDwell ? durationToMs(bucket.minDwell) : 0;
  const dwellCutoff = dwellMs > 0 ? new Date(Date.now() - dwellMs) : null;

  let leftCount = 0;
  for (
    let start = 0;
    start < userIds.length;
    start += BULK_LEAVE_CHUNK_SIZE
  ) {
    const chunk = userIds.slice(start, start + BULK_LEAVE_CHUNK_SIZE);
    const now = new Date();

    // Flip only active rows of this slice whose minDwell has elapsed. RETURNING
    // carries userEmail + entryCount for the emit.
    const flipped = await db
      .update(bucketMemberships)
      .set({
        status: "left",
        leftAt: now,
        lastEvaluatedAt: now,
        updatedAt: now,
      })
      .where(
        and(
          eq(bucketMemberships.bucketId, bucket.id),
          eq(bucketMemberships.status, "active"),
          isNull(bucketMemberships.deletedAt),
          inArray(bucketMemberships.userId, chunk),
          // minDwell: only leave rows that have existed at least minDwell.
          dwellCutoff
            ? lte(bucketMemberships.enteredAt, dwellCutoff)
            : undefined,
        ),
      )
      .returning({
        id: bucketMemberships.id,
        userId: bucketMemberships.userId,
        userEmail: bucketMemberships.userEmail,
        entryCount: bucketMemberships.entryCount,
      });

    for (const row of flipped) {
      try {
        await emitBucketTransition({
          db,
          registry: journeyRegistry,
          hatchet,
          logger,
          kind: "left",
          bucket,
          userId: row.userId,
          userEmail: row.userEmail,
          epoch: row.entryCount,
          source: "reconcile",
        });
      } catch (err) {
        // The row is already `left`; keep going so the other flipped rows still
        // get their emission instead of being silently dropped.
        logger.error("Bucket leave emit failed", {
          bucketId: bucket.id,
          userId: row.userId,
          error: err instanceof Error ? err.message : String(err),
        });
      }
    }

    leftCount += flipped.length;
  }

  return leftCount;
}
532
+
/**
 * Clock-driven JOIN reconciliation for absence buckets (`reconcileJoins`).
 *
 * A user who merely STOPS doing X fires no event, so the real-time path can
 * never enroll them; only the clock can. This function finds such users and
 * materializes their membership, handling at most `BATCH_SIZE` candidates per
 * tick.
 *
 * Candidate set — one SQL query, ordered by `externalId asc`:
 *   - live (not soft-deleted) contacts who fired ANY windowed `not_exists`
 *     leg's event at least once, ever. Taking the UNION over legs keeps an OR of
 *     absence legs from dropping a user who only ever fired one of them, and
 *     excludes brand-new never-active signups;
 *   - MINUS current active members of this bucket;
 *   - MINUS users present inside EVERY absence leg's window — they fail every
 *     `not_exists` leg, so removing them is always safe and lets the bounded
 *     page reach genuinely dormant users. This exclusion is applied only when
 *     every leg names a DISTINCT event; two legs on the same event with
 *     different windows would let the wider window over-exclude.
 *
 * Per-candidate handling depends on whether that set is exact:
 *   - EXACT, no confirm (Fix #3): a single-event criterion with exactly one
 *     absence leg, or the lapsed-active composite recognised by
 *     `isLapsedActiveComposite`. Every returned row is treated as a matcher.
 *     Each enrolled matcher becomes an active member and so drops out of the
 *     next tick's candidate set, which is how the page advances.
 *   - NON-EXACT, confirm: every other absence shape (OR of absence legs, absence
 *     mixed with property/count legs). The query is only a superset, so each
 *     candidate is confirmed with `evaluateCondition` first. There is no
 *     examined-cursor for this path, so a wide non-matching prefix can delay
 *     real matchers; when the page comes back full a warning is logged once for
 *     the sweep instead of starving silently.
 *
 * Returns 0 without scanning while the bucket's first-time backfill is still
 * incomplete (Fix #2 — avoids emitting `bucket:entered` for the historical
 * cohort the backfill claims silently), and when the criteria contain no
 * windowed `not_exists` leg (positive shapes are caught live on event arrival).
 *
 * @returns number of candidates for which `reconcileJoinOne` reported a
 *   transition.
 */
async function reconcileBucketJoins(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;
  const criteria = bucket.criteria as ConditionEval;

  // First-deploy guard: stay out of the way until the first-time backfill has
  // persisted its criteriaHash and no first-time job is in flight. Leave and
  // maxDwell paths are not affected by this guard.
  const backfillPending = await firstTimeBackfillIncomplete(db, bucket);
  if (backfillPending) {
    logger.info("Bucket join reconcile skipped (first-time backfill pending)", {
      bucketId: bucket.id,
    });
    return 0;
  }

  // Windowed not_exists legs are the only shapes a clock can turn into a JOIN.
  const absenceLegs = collectAbsenceLegs(criteria);
  if (absenceLegs.length === 0) {
    return 0;
  }

  // Distinct event names across the absence legs; used both for the
  // exists-ever floor and to decide whether the present-in-all prune is valid.
  const legEvents = new Set(absenceLegs.map((leg) => leg.event));

  // Exists-ever floor: anyone who fired one of the leg events at any time.
  const everFired = db
    .selectDistinct({ userId: userEvents.userId })
    .from(userEvents)
    .where(inArray(userEvents.event, Array.from(legEvents)))
    .as("ever_fired");

  // Users already holding an active membership in this bucket.
  const activeMembers = db
    .select({ userId: bucketMemberships.userId })
    .from(bucketMemberships)
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
      ),
    )
    .as("active_members");

  // The present-in-all prune counts distinct events, which only means "present
  // in each leg's window" when no two legs share an event.
  const everyLegDistinct = legEvents.size === absenceLegs.length;
  const presentInAll = everyLegDistinct
    ? selectPresentInAllWindows(db, absenceLegs)
    : null;

  const onceActiveContacts = db
    .select({
      userId: contacts.externalId,
      email: contacts.email,
    })
    .from(contacts)
    .innerJoin(everFired, eq(everFired.userId, contacts.externalId))
    .leftJoin(activeMembers, eq(activeMembers.userId, contacts.externalId));

  // Anti-joins are expressed as LEFT JOIN + IS NULL on the joined side.
  const filtered = presentInAll
    ? onceActiveContacts
        .leftJoin(presentInAll, eq(presentInAll.userId, contacts.externalId))
        .where(
          and(
            isNull(contacts.deletedAt),
            isNull(activeMembers.userId),
            isNull(presentInAll.userId),
          ),
        )
    : onceActiveContacts.where(
        and(isNull(contacts.deletedAt), isNull(activeMembers.userId)),
      );

  const candidates = await filtered
    .orderBy(sql`${contacts.externalId} asc`)
    .limit(BATCH_SIZE);

  // Exact shapes skip the per-member confirm; everything else is a superset
  // that needs `evaluateCondition` for correct AND/OR semantics.
  const isExact =
    (criteria.type === "event" && absenceLegs.length === 1) ||
    isLapsedActiveComposite(criteria) != null;
  const confirmEach = !isExact;

  // Stored contact properties are only needed when a confirm runs AND the
  // criteria reference at least one property.
  const needsProps = confirmEach && collectPropertyNames(criteria).length > 0;

  // A full page on the confirm path may hide matchers behind non-matchers;
  // surface that once per sweep.
  if (confirmEach && candidates.length >= BATCH_SIZE) {
    logger.warn(
      "Bucket composite-join confirm is bounded to BATCH_SIZE/tick (explicit reconcileJoins); matchers behind a wide non-matching prefix may take multiple ticks to enroll",
      { bucketId: bucket.id, batchSize: BATCH_SIZE },
    );
  }

  let joined = 0;
  for (const { userId, email } of candidates) {
    if (confirmEach) {
      const journeyContext = needsProps
        ? await loadContactProperties(db, userId)
        : {};
      const matches = await evaluateCondition({
        condition: criteria,
        ctx: { db, userId, journeyContext },
      });
      if (!matches) continue;
    }

    const transitioned = await reconcileJoinOne({
      db,
      logger,
      journeyRegistry,
      bucket,
      userId,
      userEmail: email ?? null,
    });
    if (transitioned) {
      joined += 1;
    }
  }
  return joined;
}
726
+
/**
 * Reports whether a bucket's first-time backfill is still outstanding — the gate
 * that keeps the cron JOIN path from emitting a historical blast on first deploy
 * (Fix #2). Either of two signals answers "not yet safe to join-reconcile":
 *
 *   1. The bucket has no `bucket_configs` row, or its `criteriaHash` is null.
 *      The first-time backfill persists that hash when it completes
 *      (`persistCriteriaHash` in bucket-backfill.ts), so a missing hash means
 *      the historical cohort has not been fully claimed yet.
 *   2. An `import_jobs` row with `fileName = bucket.id`,
 *      `format = FIRST_TIME_FORMAT` and status `pending` or `processing` exists.
 *      This covers a freshly enqueued first-time job that has not run yet even
 *      though an earlier run already stored a hash.
 *
 * Once the backfill finishes, the hash is present and its job has left the
 * in-flight statuses, so the following cron tick proceeds normally.
 *
 * @returns `true` while join reconciliation must be skipped for this bucket.
 */
async function firstTimeBackfillIncomplete(
  db: Database,
  bucket: BucketMeta,
): Promise<boolean> {
  // Signal 1: no config row, or the hash has not been persisted.
  const config = await db.query.bucketConfigs.findFirst({
    where: eq(bucketConfigs.bucketId, bucket.id),
  });
  if (config?.criteriaHash == null) {
    return true;
  }

  // Signal 2: a first-time backfill job for this bucket is still in flight.
  // One row is enough to answer the question.
  const pendingJobs = await db
    .select({ id: importJobs.id })
    .from(importJobs)
    .where(
      and(
        eq(importJobs.fileName, bucket.id),
        eq(importJobs.format, FIRST_TIME_FORMAT),
        inArray(importJobs.status, ["pending", "processing"]),
      ),
    )
    .limit(1);

  return pendingJobs.length > 0;
}
771
+
772
/**
 * Builds the `present_all` subquery: users who fired EVERY absence leg's event
 * inside that leg's rolling window (the intersection across legs). Such a user
 * fails every `not_exists` leg, so they can always be excluded from the
 * candidate set.
 *
 * PRECONDITION (enforced by the caller): every leg names a DISTINCT event. Under
 * that precondition a user can match at most `legs.length` distinct events, so
 * the `count(distinct event) >= legs.length` HAVING clause means "present in
 * each leg's window".
 *
 * @param db - Database handle used to build the subquery.
 * @param legs - Windowed absence legs; a leg with a null cutoff matches the
 *   event at any time.
 * @returns An aliased subquery exposing a single `userId` column.
 */
function selectPresentInAllWindows(db: Database, legs: AbsenceLeg[]) {
  // One predicate per leg: "this row is the leg's event, inside its window".
  const firedInsideWindow = legs.map((leg) => {
    const isLegEvent = eq(userEvents.event, leg.event);
    const isInsideWindow = leg.cutoff
      ? gte(userEvents.occurredAt, leg.cutoff)
      : undefined;
    return and(isLegEvent, isInsideWindow);
  });

  // Rows matching ANY leg are kept, then grouped per user; the HAVING clause
  // keeps only users whose surviving rows span every leg's event.
  const coversEveryLeg = sql`count(distinct ${userEvents.event}) >= ${legs.length}`;

  return db
    .select({ userId: userEvents.userId })
    .from(userEvents)
    .where(or(...firedInsideWindow))
    .groupBy(userEvents.userId)
    .having(coversEveryLeg)
    .as("present_all");
}
797
+
798
/**
 * Fetches the stored properties of the contact whose `externalId` equals
 * `userId`, for use when evaluating property legs.
 *
 * @param db - Database handle.
 * @param userId - External identifier of the contact to look up.
 * @returns The contact's properties, or an empty object when no contact row
 *   exists or its properties are null.
 */
async function loadContactProperties(
  db: Database,
  userId: string,
): Promise<Record<string, unknown>> {
  const rows = await db
    .select({ properties: contacts.properties })
    .from(contacts)
    .where(eq(contacts.externalId, userId))
    .limit(1);

  const stored = rows[0]?.properties as
    | Record<string, unknown>
    | null
    | undefined;
  return stored ?? {};
}
810
+
811
/**
 * Records ONE reconcile-discovered join for a user and, when policy allows,
 * emits the `entered` bucket transition.
 *
 * The insert uses `onConflictDoNothing()` with `RETURNING`, so a row is only
 * reported when this call actually created the active membership (the
 * partial-active unique index arbitrates between concurrent writers).
 *
 * @param opts.db - Database handle.
 * @param opts.logger - Logger for the suppressed-emit notice.
 * @param opts.journeyRegistry - Registry forwarded to the transition emitter.
 * @param opts.bucket - Bucket being joined.
 * @param opts.userId - User joining the bucket.
 * @param opts.userEmail - User's email, or null when unknown.
 * @returns `true` when this call inserted the membership row (whether or not
 *   the emission was suppressed); `false` when the insert produced no row.
 */
async function reconcileJoinOne(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
  userId: string;
  userEmail: string | null;
}): Promise<boolean> {
  const { db, logger, journeyRegistry, bucket, userId, userEmail } = opts;

  // The entry ordinal is 1 + ALL prior memberships (active + left). The helper
  // is shared with the real-time join path so both writers number entries the
  // same way.
  const priorMemberships = await countPriorMemberships(db, bucket.id, userId);
  const entryOrdinal = priorMemberships + 1;

  const createdRows = await db
    .insert(bucketMemberships)
    .values({
      userId,
      userEmail,
      bucketId: bucket.id,
      status: "active",
      source: "reconcile",
      entryCount: entryOrdinal,
      expiresAt: computeExpiresAt(bucket),
      maxDwellAt: computeMaxDwellAt(bucket),
      lastEvaluatedAt: new Date(),
    })
    .onConflictDoNothing()
    .returning({ id: bucketMemberships.id });

  // Anything other than exactly one returned row means this call did not
  // create the membership, so there is nothing to announce.
  const wonInsert = createdRows.length === 1;
  if (!wonInsert) {
    return false;
  }

  // The active row is always written (Studio size must reflect reality) and the
  // ordinal always advances; only the emission is subject to the entryLimit
  // policy, mirroring the real-time join path (Section 6.3).
  const emitAllowed = await shouldEmitJoin({
    db,
    bucket,
    userId,
    priorCount: priorMemberships,
  });
  if (!emitAllowed) {
    logger.info("Bucket join emit suppressed by entryLimit policy", {
      bucketId: bucket.id,
      userId,
      entryLimit: bucket.entryLimit ?? "unlimited",
    });
    return true;
  }

  await emitBucketTransition({
    db,
    registry: journeyRegistry,
    hatchet,
    logger,
    kind: "entered",
    bucket,
    userId,
    userEmail,
    epoch: entryOrdinal,
    source: "reconcile",
  });
  return true;
}
877
+
878
/**
 * Decides whether a bucket is time-based: either it is explicitly flagged via
 * `timeBased`, or its criteria contain a `within` window (as reported by
 * `firstWithin`).
 *
 * @param bucket - Bucket metadata to classify.
 * @returns `true` for flagged or windowed buckets; `false` otherwise, including
 *   buckets with no criteria.
 */
function isTimeBased(bucket: BucketMeta): boolean {
  if (bucket.timeBased) {
    return true;
  }
  const criteria = bucket.criteria;
  return criteria ? firstWithin(criteria) != null : false;
}
884
+
885
/**
 * Resolves whether the cron should run JOIN reconciliation for a bucket. The
 * `reconcileJoins` setting is tri-state:
 *
 *  - `false`: hard OFF. An explicit cost-bounding override; the absence join is
 *    skipped even for an absence-shaped bucket.
 *  - `true`: explicit ON (the unchanged 0.2.0 opt-in behavior).
 *  - unset: INFERRED, and only for the two SAFE shapes accepted by
 *    {@link isSafeAbsenceShape}:
 *      (a) a single-event windowed `not_exists` criterion, and
 *      (b) the lapsed-active composite
 *          `all(event(X).exists(), event(X).within(W).not_exists())`.
 *    For these the SQL candidate set is exact, so no per-member confirm is
 *    needed and the scan cannot starve (Fix #3).
 *
 * Any other absence-containing composite (an OR of absence legs, or absence
 * mixed with extra property/count legs) yields a non-exact superset that needs
 * a BATCH_SIZE-bounded per-member confirm, so it is not inferred and requires
 * `reconcileJoins: true`. Non-absence time-based buckets skip the join scan
 * (their joins are caught in real time).
 *
 * @param bucket - Bucket metadata carrying `reconcileJoins` and `criteria`.
 * @returns Whether join reconciliation should run for this bucket.
 */
function shouldReconcileJoins(bucket: BucketMeta): boolean {
  // An explicit boolean always wins, in either direction.
  const explicitChoice = bucket.reconcileJoins;
  if (typeof explicitChoice === "boolean") {
    return explicitChoice;
  }

  // Unset: infer only when criteria exist and have a safe absence shape.
  const criteria = bucket.criteria;
  return criteria ? isSafeAbsenceShape(criteria) : false;
}
910
+
911
/**
 * Tests whether criteria have one of the two SAFE absence shapes, the only
 * shapes on which the engine auto-infers `reconcileJoins` (Fix #3) because
 * their cron-JOIN candidate set is exact in SQL alone:
 *
 *  (a) a single event criterion with `check: "not_exists"` and a `within`
 *      window, or
 *  (b) the lapsed-active composite recognized by
 *      {@link isLapsedActiveComposite}.
 *
 * Every other absence-containing composite is a non-exact superset and must opt
 * in explicitly (its per-member confirm path is BATCH_SIZE-bounded per tick).
 *
 * @param criteria - Root of the criteria tree to classify.
 * @returns `true` for shape (a) or (b), otherwise `false`.
 */
function isSafeAbsenceShape(criteria: ConditionEval): boolean {
  const isSingleWindowedAbsence =
    criteria.type === "event" &&
    criteria.check === "not_exists" &&
    criteria.within != null;

  return isSingleWindowedAbsence || isLapsedActiveComposite(criteria) != null;
}
929
+
930
/** Result of recognizing the lapsed-active composite (shape (b)). */
interface LapsedActiveShape {
  /** Event name shared by both legs of the composite. */
  event: string;
  /** Start of the absence window: the current time minus the leg's `within`. */
  cutoff: Date;
}

/**
 * Recognizes shape (b), the flagship "went-dormant" composite, and extracts its
 * event and window cutoff.
 *
 * The shape is an AND composite of EXACTLY two legs on the SAME event X:
 *  - an `event(X).exists()` anchor with no window ("ever fired X"), and
 *  - an `event(X).within(W).not_exists()` windowed-absence leg.
 *
 * The candidate SQL for this shape (ever-fired X, MINUS present in X's window,
 * MINUS active members) satisfies both legs for every returned row, so the set
 * is exact: no per-member `evaluateCondition` is needed, and matched users
 * become active members that are excluded on the next tick, so the scan cannot
 * starve (Fix #3).
 *
 * @param criteria - Root of the criteria tree to inspect.
 * @returns The shared event and the window cutoff, or `null` when the criteria
 *   do not have this shape.
 */
function isLapsedActiveComposite(
  criteria: ConditionEval,
): LapsedActiveShape | null {
  if (criteria.type !== "composite") {
    return null;
  }
  if (criteria.operator !== "and" || criteria.conditions.length !== 2) {
    return null;
  }

  const legs = criteria.conditions;
  // The unwindowed "ever fired" anchor.
  const anchorLeg = legs.find(
    (leg) => leg.type === "event" && leg.check === "exists" && leg.within == null,
  );
  // The windowed "did not fire recently" leg.
  const absenceLeg = legs.find(
    (leg) =>
      leg.type === "event" && leg.check === "not_exists" && leg.within != null,
  );

  // Both legs must be present; the type checks also narrow them to event
  // conditions for the property reads below.
  if (anchorLeg?.type !== "event" || absenceLeg?.type !== "event") {
    return null;
  }
  if (
    anchorLeg.eventName !== absenceLeg.eventName ||
    absenceLeg.within == null
  ) {
    return null;
  }

  const nowMs = Date.now();
  const windowMs = durationToMs(absenceLeg.within);
  return {
    event: absenceLeg.eventName,
    cutoff: new Date(nowMs - windowMs),
  };
}
978
+
979
/** A single windowed `not_exists` leg: its event and the start of its window. */
interface AbsenceLeg {
  /** Name of the event that must be absent. */
  event: string;
  /** Current time minus the leg's `within`; null only if `within` is somehow unset. */
  cutoff: Date | null;
}

/**
 * Gathers every windowed `not_exists` leg in a criteria tree, depth-first.
 * These are the "stopped doing X in the last N" shapes, the only ones a clock
 * can materialize a JOIN for.
 *
 * A `not_exists` leg WITHOUT a window is skipped: it is degenerate and not
 * auto-joinable (the schema already rejects pure-unbounded-negation buckets).
 * ALL legs are returned rather than just the first, so an OR-of-absence
 * composite does not silently drop users who only ever fired one leg's event.
 *
 * @param criteria - Root of the criteria (sub)tree to walk.
 * @returns The windowed absence legs in depth-first order; empty when none.
 */
function collectAbsenceLegs(criteria: ConditionEval): AbsenceLeg[] {
  switch (criteria.type) {
    case "event": {
      // Only a windowed absence qualifies; every other event check yields none.
      if (criteria.check !== "not_exists" || criteria.within == null) {
        return [];
      }
      const nowMs = Date.now();
      const windowMs = durationToMs(criteria.within);
      const leg: AbsenceLeg = {
        event: criteria.eventName,
        cutoff: new Date(nowMs - windowMs),
      };
      return [leg];
    }
    case "composite":
      // Recurse into each child and concatenate, preserving child order.
      return criteria.conditions.flatMap((child) => collectAbsenceLegs(child));
    default:
      return [];
  }
}