@hogsend/engine 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,6 @@ import type { JsonObject } from "@hatchet-dev/typescript-sdk/v1/types.js";
3
3
  import {
4
4
  type BucketMeta,
5
5
  type ConditionEval,
6
- type DurationObject,
7
6
  durationToMs,
8
7
  evaluateCondition,
9
8
  } from "@hogsend/core";
@@ -16,7 +15,12 @@ import {
16
15
  importJobs,
17
16
  userEvents,
18
17
  } from "@hogsend/db";
19
- import { and, eq, gte, inArray, isNull, sql } from "drizzle-orm";
18
+ import { and, eq, gt, gte, inArray, isNull, sql } from "drizzle-orm";
19
+ import {
20
+ computeExpiresAt,
21
+ computeMaxDwellAt,
22
+ matchesEventCount,
23
+ } from "../buckets/membership-epoch.js";
20
24
  import { getBucketRegistrySingleton } from "../buckets/registry-singleton.js";
21
25
  import { getJourneyRegistrySingleton } from "../journeys/registry-singleton.js";
22
26
  import { emitBucketTransition } from "../lib/bucket-emit.js";
@@ -28,7 +32,7 @@ import { createLogger } from "../lib/logger.js";
28
32
  const BATCH_SIZE = 500;
29
33
 
30
34
  /** import_jobs.format discriminator for the reused status record (Section 6.6). */
31
- const FIRST_TIME_FORMAT = "bucket-backfill";
35
+ export const FIRST_TIME_FORMAT = "bucket-backfill";
32
36
  const REEVAL_FORMAT = "bucket-reeval";
33
37
 
34
38
  /**
@@ -199,6 +203,18 @@ async function backfillJoins(opts: {
199
203
  .set({ totalRows: matcherIds.length, updatedAt: new Date() })
200
204
  .where(eq(importJobs.id, jobId));
201
205
 
206
+ // Unconditional max-dwell TTL deadline, stamped once at insert (mirrors the
207
+ // live join, check-membership.ts). null when the bucket has no maxDwell; the
208
+ // TTL sweep (reconcileBucketTtlLeaves) filters isNotNull(maxDwellAt), so an
209
+ // unset value would never be force-left.
210
+ const maxDwellAt = computeMaxDwellAt(bucket);
211
+
212
+ // Fix C (DEFERRED): backfilled fastExpiry rows are NOT armed with a
213
+ // bucket:arm-expiry durable timer here — they are picked up by the next cron
214
+ // sweep instead (reconcileBucketLeaves / reconcileBucketTtlLeaves are the
215
+ // authoritative backstop). Conscious choice (cron cadence, default 5m), not an
216
+ // omission: arming at backfill would fan out one durable task per inserted row.
217
+
202
218
  let inserted = 0;
203
219
  for (let i = 0; i < matcherIds.length; i += BATCH_SIZE) {
204
220
  const chunk = matcherIds.slice(i, i + BATCH_SIZE);
@@ -214,14 +230,38 @@ async function backfillJoins(opts: {
214
230
  chunkContacts.map((c) => [c.externalId, c.email]),
215
231
  );
216
232
 
233
+ // Fix A: entryCount = 1 + prior memberships for each (user, bucket), the
234
+ // same monotonic ordinal the live join computes (check-membership.ts). On a
235
+ // FIRST-TIME backfill priorCount is 0 → entryCount 1 (unchanged); on a
236
+ // REEVAL re-join of a user with historical "left" rows it advances the
237
+ // epoch correctly. ONE batched GROUP BY per chunk (never per-user — the set-
238
+ // based path must not reintroduce the O(P) serial-query trap).
239
+ const priorCounts = await db
240
+ .select({
241
+ userId: bucketMemberships.userId,
242
+ cnt: sql<number>`count(*)::int`,
243
+ })
244
+ .from(bucketMemberships)
245
+ .where(
246
+ and(
247
+ eq(bucketMemberships.bucketId, bucket.id),
248
+ inArray(bucketMemberships.userId, chunk),
249
+ ),
250
+ )
251
+ .groupBy(bucketMemberships.userId);
252
+ const priorByUser = new Map(
253
+ priorCounts.map((r) => [r.userId, Number(r.cnt)]),
254
+ );
255
+
217
256
  const rows = chunk.map((userId) => ({
218
257
  userId,
219
258
  userEmail: emailByUser.get(userId) ?? null,
220
259
  bucketId: bucket.id,
221
260
  status: "active" as const,
222
261
  source: "backfill" as const,
223
- entryCount: 1,
224
- expiresAt: computeBackfillExpiresAt(bucket),
262
+ entryCount: 1 + (priorByUser.get(userId) ?? 0),
263
+ expiresAt: computeExpiresAt(bucket),
264
+ maxDwellAt,
225
265
  lastEvaluatedAt: new Date(),
226
266
  }));
227
267
 
@@ -343,8 +383,22 @@ async function selectEventMatchers(
343
383
  : null;
344
384
 
345
385
  // count gte N / exists → SELECT user_id, count(*) ... GROUP BY, then filter the counts in JS. not_exists
346
- // (absence) → live contacts with NO such event in the window (anti-join).
386
+ // (absence) → live contacts who EVER fired the event but have NONE in the
387
+ // window (lapsed-only). A bare windowed `not_exists within W` is treated as
388
+ // LAPSED-ONLY (never-active EXCLUDED) in BOTH this backfill and the cron
389
+ // (bucket-reconcile.ts reconcileBucketJoins, the everFired floor), so the two
390
+ // writers agree: brand-new never-active signups are NOT materialized for an
391
+ // absence-within-window bucket — only users who once did X and then stopped.
347
392
  if (criteria.check === "not_exists") {
393
+ // everFired floor: contacts who fired the event AT LEAST ONCE (no window),
394
+ // mirroring the cron's `ever_fired` semi-join. Excludes never-active
395
+ // contacts so the two writers select the same lapsed-only cohort.
396
+ const everFired = db
397
+ .selectDistinct({ userId: userEvents.userId })
398
+ .from(userEvents)
399
+ .where(eq(userEvents.event, criteria.eventName))
400
+ .as("ever_fired");
401
+
348
402
  const present = db
349
403
  .select({ userId: userEvents.userId })
350
404
  .from(userEvents)
@@ -360,115 +414,94 @@ async function selectEventMatchers(
360
414
  const rows = await db
361
415
  .select({ userId: contacts.externalId })
362
416
  .from(contacts)
417
+ .innerJoin(everFired, eq(everFired.userId, contacts.externalId))
363
418
  .leftJoin(present, eq(present.userId, contacts.externalId))
364
419
  .where(and(isNull(contacts.deletedAt), isNull(present.userId)));
365
420
  return rows.map((r) => r.userId);
366
421
  }
367
422
 
368
- // exists / count: group counts then filter by the operator.
423
+ // exists / count: group counts then filter by the operator. Fix B: innerJoin
424
+ // live contacts (GDPR — only materialize memberships for non-deleted contacts
425
+ // that actually exist), mirroring selectEventLeavers in bucket-reconcile.ts.
426
+ // The not_exists branch above already filters contacts.deletedAt; without this
427
+ // join the positive-event path could materialize active rows for soft-deleted
428
+ // or orphan-event userIds, diverging from the live/reconcile paths.
369
429
  const rows = await db
370
430
  .select({
371
431
  userId: userEvents.userId,
372
432
  cnt: sql<number>`count(*)::int`,
373
433
  })
374
434
  .from(userEvents)
435
+ .innerJoin(contacts, eq(contacts.externalId, userEvents.userId))
375
436
  .where(
376
437
  and(
377
438
  eq(userEvents.event, criteria.eventName),
439
+ isNull(contacts.deletedAt),
378
440
  cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
379
441
  ),
380
442
  )
381
443
  .groupBy(userEvents.userId);
382
444
 
383
445
  return rows
384
- .filter((r) => matchesCount(criteria, Number(r.cnt)))
446
+ .filter((r) => matchesEventCount(criteria, Number(r.cnt)))
385
447
  .map((r) => r.userId);
386
448
  }
387
449
 
388
- /** True when a windowed count satisfies the (exists/count) criterion. */
389
- function matchesCount(
390
- criteria: Extract<ConditionEval, { type: "event" }>,
391
- count: number,
392
- ): boolean {
393
- switch (criteria.check) {
394
- case "exists":
395
- return count > 0;
396
- case "count": {
397
- if (!criteria.operator || criteria.value === undefined) return count > 0;
398
- switch (criteria.operator) {
399
- case "gt":
400
- return count > criteria.value;
401
- case "gte":
402
- return count >= criteria.value;
403
- case "lt":
404
- return count < criteria.value;
405
- case "lte":
406
- return count <= criteria.value;
407
- case "eq":
408
- return count === criteria.value;
409
- default:
410
- return false;
411
- }
412
- }
413
- default:
414
- return false;
415
- }
416
- }
417
-
418
450
  /**
419
451
  * Composite/multi-condition fallback (the documented O(P) exception, Section 6.6):
420
- * a chunked per-contact `evaluateCondition` loop over live contacts. Property
452
+ * a per-contact `evaluateCondition` loop over live contacts. Property
421
453
  * sub-conditions evaluate against the contact's merged properties.
454
+ *
455
+ * KEYSET PAGINATION by `contacts.externalId` in BATCH_SIZE pages (mirrors
456
+ * reconcileBucketJoins' `externalId asc` paging): each page selects
457
+ * `WHERE externalId > :cursor ORDER BY externalId ASC LIMIT BATCH_SIZE`,
458
+ * evaluates the criteria per contact, then advances the cursor to the last
459
+ * externalId of the page — repeating until a short page ends the scan. The whole
460
+ * contacts table is never held in memory at once.
422
461
  */
423
462
  async function selectCompositeMatchers(
424
463
  db: Database,
425
464
  criteria: ConditionEval,
426
465
  ): Promise<string[]> {
427
- const liveContacts = await db
428
- .select({
429
- externalId: contacts.externalId,
430
- properties: contacts.properties,
431
- })
432
- .from(contacts)
433
- .where(isNull(contacts.deletedAt));
434
-
435
466
  const matchers: string[] = [];
436
- for (const contact of liveContacts) {
437
- const isMember = await evaluateCondition({
438
- condition: criteria,
439
- ctx: {
440
- db,
441
- userId: contact.externalId,
442
- journeyContext:
443
- (contact.properties as Record<string, unknown> | null) ?? {},
444
- },
445
- });
446
- if (isMember) matchers.push(contact.externalId);
447
- }
448
- return matchers;
449
- }
467
+ let cursor: string | null = null;
450
468
 
451
- /** now + within for time-based / fastExpiry buckets; null otherwise. */
452
- function computeBackfillExpiresAt(bucket: BucketMeta): Date | null {
453
- if (!bucket.criteria) return null;
454
- if (!bucket.timeBased && !bucket.fastExpiry) return null;
455
- const within = firstWithin(bucket.criteria);
456
- if (!within) return null;
457
- return new Date(Date.now() + durationToMs(within));
458
- }
469
+ for (;;) {
470
+ const page = await db
471
+ .select({
472
+ externalId: contacts.externalId,
473
+ properties: contacts.properties,
474
+ })
475
+ .from(contacts)
476
+ .where(
477
+ and(
478
+ isNull(contacts.deletedAt),
479
+ cursor != null ? gt(contacts.externalId, cursor) : undefined,
480
+ ),
481
+ )
482
+ .orderBy(sql`${contacts.externalId} asc`)
483
+ .limit(BATCH_SIZE);
459
484
 
460
- /** Find the first EventCondition.within in a criteria tree (depth-first). */
461
- function firstWithin(criteria: ConditionEval): DurationObject | null {
462
- if (criteria.type === "event" && criteria.within) {
463
- return criteria.within;
464
- }
465
- if (criteria.type === "composite") {
466
- for (const child of criteria.conditions) {
467
- const found = firstWithin(child);
468
- if (found) return found;
485
+ for (const contact of page) {
486
+ const isMember = await evaluateCondition({
487
+ condition: criteria,
488
+ ctx: {
489
+ db,
490
+ userId: contact.externalId,
491
+ journeyContext:
492
+ (contact.properties as Record<string, unknown> | null) ?? {},
493
+ },
494
+ });
495
+ if (isMember) matchers.push(contact.externalId);
469
496
  }
497
+
498
+ // A short page (fewer than a full batch) means the scan is exhausted.
499
+ if (page.length < BATCH_SIZE) break;
500
+ cursor = page[page.length - 1]?.externalId ?? null;
501
+ if (cursor == null) break;
470
502
  }
471
- return null;
503
+
504
+ return matchers;
472
505
  }
473
506
 
474
507
  /**
@@ -535,7 +568,11 @@ export async function enqueueBucketBackfills(opts: {
535
568
 
536
569
  if (!job) continue;
537
570
 
538
- await bucketBackfillTask.run({
571
+ // runNoWait (fire-and-forget): this is called from worker boot BEFORE the
572
+ // listener starts, so awaiting the run would deadlock (the run needs the
573
+ // listener that `_worker.start()` brings up). The triggered run queues and
574
+ // executes once listening; the task itself persists the criteriaHash.
575
+ await bucketBackfillTask.runNoWait({
539
576
  jobId: job.id,
540
577
  bucketId: bucket.id,
541
578
  mode,