@hogsend/engine 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
1
+ import { createHash } from "node:crypto";
2
+ import type { JsonObject } from "@hatchet-dev/typescript-sdk/v1/types.js";
3
+ import {
4
+ type BucketMeta,
5
+ type ConditionEval,
6
+ type DurationObject,
7
+ durationToMs,
8
+ evaluateCondition,
9
+ } from "@hogsend/core";
10
+ import {
11
+ bucketConfigs,
12
+ bucketMemberships,
13
+ contacts,
14
+ createDatabase,
15
+ type Database,
16
+ importJobs,
17
+ userEvents,
18
+ } from "@hogsend/db";
19
+ import { and, eq, gte, inArray, isNull, sql } from "drizzle-orm";
20
+ import { getBucketRegistrySingleton } from "../buckets/registry-singleton.js";
21
+ import { getJourneyRegistrySingleton } from "../journeys/registry-singleton.js";
22
+ import { emitBucketTransition } from "../lib/bucket-emit.js";
23
+ import { hatchet } from "../lib/hatchet.js";
24
+ import type { Logger } from "../lib/logger.js";
25
+ import { createLogger } from "../lib/logger.js";
26
+
/**
 * Number of membership rows written per INSERT statement. Reuses the chunk
 * size of the import-contacts precedent (Section 6.6).
 */
const BATCH_SIZE = 500;

/**
 * Values stored in `import_jobs.format` so the reused status record can be
 * told apart per backfill mode (Section 6.6): one for a first-time backfill,
 * one for a criteria-change re-evaluation.
 */
const FIRST_TIME_FORMAT = "bucket-backfill";
const REEVAL_FORMAT = "bucket-reeval";
33
+
/**
 * Fingerprint a bucket's criteria (Section 6.6 B).
 *
 * The `ConditionEval` tree is first rendered to a canonical string (object keys
 * sorted, so key order never changes the result) and then hashed with sha256.
 * The hex digest is what gets stored on `bucket_configs.criteriaHash` and
 * compared on the next boot to detect a criteria change.
 *
 * @param criteria - The bucket's criteria; `undefined` is hashed as `null`.
 * @returns The sha256 digest as a lowercase hex string.
 */
export function computeCriteriaHash(
  criteria: ConditionEval | undefined,
): string {
  const canonical = stableStringify(criteria ?? null);
  const hasher = createHash("sha256");
  hasher.update(canonical);
  return hasher.digest("hex");
}
47
+
/**
 * Serialize a value to canonical JSON text: object keys are emitted in sorted
 * (code-unit) order so two structurally equal values always produce the same
 * string regardless of key insertion order.
 *
 * Follows `JSON.stringify` semantics for the awkward cases so the output is
 * always valid JSON and distinct values stay distinct:
 * - objects exposing `toJSON` (e.g. `Date`) are serialized via `toJSON()`
 *   instead of collapsing to `{}`;
 * - object members whose value has no JSON form (`undefined`, functions,
 *   symbols) are omitted;
 * - array elements with no JSON form, including holes, become `null`.
 *
 * Output for plain JSON trees (objects, arrays, strings, numbers, booleans,
 * null) is unchanged, so previously stored hashes remain valid.
 *
 * @param value - Any JSON-like value.
 * @returns Canonical JSON text; `"null"` when the top-level value itself has no
 *   JSON form.
 */
function stableStringify(value: unknown): string {
  return encodeStable(value) ?? "null";
}

/** Canonical encoding of one node; `undefined` when the node has no JSON form. */
function encodeStable(value: unknown): string | undefined {
  // Resolve toJSON once at this level, as JSON.stringify does (the result is
  // not asked for toJSON again).
  const resolved = hasToJSON(value) ? value.toJSON() : value;

  if (resolved === null || typeof resolved !== "object") {
    // JSON.stringify yields undefined for undefined / functions / symbols.
    return JSON.stringify(resolved) as string | undefined;
  }

  if (Array.isArray(resolved)) {
    // Array.from (unlike map) also visits holes, so sparse arrays stay valid JSON.
    const items = Array.from(
      resolved,
      (item: unknown) => encodeStable(item) ?? "null",
    );
    return `[${items.join(",")}]`;
  }

  const record = resolved as Record<string, unknown>;
  const members: string[] = [];
  const keys = Object.keys(record).sort((a, b) => (a < b ? -1 : a > b ? 1 : 0));
  for (const key of keys) {
    const encoded = encodeStable(record[key]);
    if (encoded !== undefined) {
      members.push(`${JSON.stringify(key)}:${encoded}`);
    }
  }
  return `{${members.join(",")}}`;
}

/** True when `value` is an object carrying a callable `toJSON`. */
function hasToJSON(value: unknown): value is { toJSON: () => unknown } {
  return (
    value !== null &&
    typeof value === "object" &&
    typeof (value as { toJSON?: unknown }).toJSON === "function"
  );
}
61
+
/**
 * Input payload of the engine-owned backfill / criteria-change re-evaluation
 * task (Section 6.6). The task runs in one of two modes, selected by `mode`:
 *
 * - `"first-time"` — a NEW bucket id appeared. The full member set is
 *   materialized with a SET-BASED query per criteria shape and inserted as
 *   `active` rows (`source:"backfill"`, onConflictDoNothing on the
 *   partial-active unique index). Live join emission is SUPPRESSED: historical
 *   matches must not fire `bucket:entered` into live journeys (the Customer.io
 *   rule).
 * - `"reeval"` — an EXISTING bucket's criteria changed (detected via the
 *   `criteriaHash` diff at boot). A FULL diff is applied: `active` rows are
 *   inserted for new matchers (joins, NO emit) AND active members who no longer
 *   match are transitioned to `left` via CAS (leaves EMIT `bucket:left` so
 *   in-flight journeys exit).
 *
 * Progress lives in `import_jobs` (the precedent). Rows are discriminated by
 * `format` (`bucket-backfill` / `bucket-reeval`) and `fileName` carries the
 * bucketId, so the Studio "building / live" badge derives from a real status
 * record (Section 11.3). Set-based, chunked, idempotent, resumable — never run
 * in a migration.
 */
export interface BucketBackfillInput extends JsonObject {
  /** Id of the `import_jobs` row used as this run's status record. */
  jobId: string;
  /** Id of the bucket to backfill or re-evaluate. */
  bucketId: string;
  /** Which of the two modes described above to run. */
  mode: "first-time" | "reeval";
}
86
+
/**
 * Hatchet task that backfills (or re-evaluates) one bucket's memberships and
 * mirrors its progress onto the `import_jobs` row named by `input.jobId`.
 *
 * Flow: resolve the bucket from the registry (unregistered, manual or
 * criteria-less buckets fail the job immediately) → mark the job `processing`
 * → insert joins → in `reeval` mode transition leavers → persist the criteria
 * hash → mark the job `completed`. Any error thrown by those steps is caught,
 * recorded on the job as `failed`, logged, and reported in the return value
 * rather than rethrown.
 *
 * @returns `{ status: "completed", joined, left }` on success, otherwise
 *   `{ status: "failed", reason }`.
 */
export const bucketBackfillTask = hatchet.task({
  name: "bucket-backfill",
  retries: 0,
  executionTimeout: "600s",
  fn: async (input: BucketBackfillInput) => {
    const { jobId, bucketId, mode } = input;

    const { db } = createDatabase({ url: process.env.DATABASE_URL ?? "" });
    const logger = createLogger(process.env.LOG_LEVEL ?? "info");
    const registry = getBucketRegistrySingleton();
    const journeyRegistry = getJourneyRegistrySingleton();

    // Fresh predicate per statement; every status write targets this one job.
    const thisJob = () => eq(importJobs.id, jobId);

    // Record a failure on the status row and build the task's failure result.
    const failJob = async (reason: string) => {
      await db
        .update(importJobs)
        .set({
          status: "failed",
          errors: [{ row: 0, error: reason }],
          updatedAt: new Date(),
        })
        .where(thisJob());
      return { status: "failed", reason };
    };

    const bucket = registry.get(bucketId);
    if (!bucket || bucket.kind === "manual" || !bucket.criteria) {
      return failJob("bucket_unregistered_or_manual");
    }

    await db
      .update(importJobs)
      .set({ status: "processing", updatedAt: new Date() })
      .where(thisJob());

    try {
      // (A/B) JOINS — new matchers become active rows. Neither mode emits for
      // joins (Section 6.6).
      const joined = await backfillJoins({ db, logger, bucket, jobId });

      // (B only) LEAVES — active members who stopped matching are moved to
      // `left` and EMIT bucket:left so in-flight journeys exit.
      const leftCount =
        mode === "reeval"
          ? await reevalLeaves({ db, logger, journeyRegistry, bucket })
          : 0;

      // Store the current criteria hash so the next boot diff is a no-op until
      // the criteria change again (Section 6.6 B).
      await persistCriteriaHash(db, bucket);

      await db
        .update(importJobs)
        .set({
          status: "completed",
          processedRows: joined + leftCount,
          updatedAt: new Date(),
        })
        .where(thisJob());

      logger.info("Bucket backfill complete", {
        bucketId: bucket.id,
        mode,
        joined,
        left: leftCount,
      });
      return { status: "completed", joined, left: leftCount };
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      const outcome = await failJob(message);
      logger.error("Bucket backfill failed", { bucketId: bucket.id, message });
      return outcome;
    }
  },
});
172
+
/**
 * Insert `active` membership rows for every user currently matching the
 * bucket's criteria, BATCH_SIZE user ids at a time (`source:"backfill"`).
 * `onConflictDoNothing` leaves existing rows untouched, which makes re-runs
 * idempotent. Nothing is emitted for these joins (Section 6.6).
 *
 * Matchers come from the set-based event query when the criteria root is an
 * `event` condition; any other root goes through the per-contact
 * `evaluateCondition` fallback (the documented O(P) exception).
 *
 * Side effects on the job's `import_jobs` row: `totalRows` is set to the
 * matcher count up front, and `processedRows` is updated to the running insert
 * count after every chunk.
 *
 * @returns The number of rows actually inserted (conflicting rows excluded).
 */
async function backfillJoins(opts: {
  db: Database;
  logger: Logger;
  bucket: BucketMeta;
  jobId: string;
}): Promise<number> {
  const { db, bucket, jobId } = opts;
  const criteria = bucket.criteria as ConditionEval;

  let matcherIds: string[];
  if (criteria.type === "event") {
    matcherIds = await selectEventMatchers(db, criteria);
  } else {
    matcherIds = await selectCompositeMatchers(db, criteria);
  }

  await db
    .update(importJobs)
    .set({ totalRows: matcherIds.length, updatedAt: new Date() })
    .where(eq(importJobs.id, jobId));

  let insertedTotal = 0;
  let offset = 0;
  while (offset < matcherIds.length) {
    const userIds = matcherIds.slice(offset, offset + BATCH_SIZE);
    offset += BATCH_SIZE;

    // Look up emails for this chunk; users without a live contact row get a
    // null userEmail.
    const contactRows = await db
      .select({ externalId: contacts.externalId, email: contacts.email })
      .from(contacts)
      .where(
        and(inArray(contacts.externalId, userIds), isNull(contacts.deletedAt)),
      );
    const emailByUser = new Map(
      contactRows.map((contact) => [contact.externalId, contact.email]),
    );

    const membershipRows = userIds.map((userId) => ({
      userId,
      userEmail: emailByUser.get(userId) ?? null,
      bucketId: bucket.id,
      status: "active" as const,
      source: "backfill" as const,
      entryCount: 1,
      expiresAt: computeBackfillExpiresAt(bucket),
      lastEvaluatedAt: new Date(),
    }));

    const insertedRows = await db
      .insert(bucketMemberships)
      .values(membershipRows)
      .onConflictDoNothing()
      .returning({ id: bucketMemberships.id });

    insertedTotal += insertedRows.length;

    await db
      .update(importJobs)
      .set({ processedRows: insertedTotal, updatedAt: new Date() })
      .where(eq(importJobs.id, jobId));
  }

  return insertedTotal;
}
244
+
/**
 * Re-eval LEAVES (mode:"reeval" only). Active members of the bucket that are
 * not in the current matcher set are flipped to `left`, and `bucket:left` is
 * emitted for each flipped row (Section 6.6 B asymmetry: criteria-change
 * leaves emit, joins do not).
 *
 * Only memberships joined to a non-deleted contact are considered. The UPDATE
 * re-checks `status = "active"`, so a row that changed state since it was read
 * is skipped and produces no emission. Rows are processed BATCH_SIZE at a
 * time; emissions are awaited one by one.
 *
 * @returns The number of memberships transitioned to `left`.
 */
async function reevalLeaves(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;
  const criteria = bucket.criteria as ConditionEval;

  // Users who STILL match; every other active member is a leaver.
  const stillMatching = new Set(
    criteria.type === "event"
      ? await selectEventMatchers(db, criteria)
      : await selectCompositeMatchers(db, criteria),
  );

  const activeMembers = await db
    .select({
      id: bucketMemberships.id,
      userId: bucketMemberships.userId,
      userEmail: bucketMemberships.userEmail,
      entryCount: bucketMemberships.entryCount,
    })
    .from(bucketMemberships)
    .innerJoin(contacts, eq(contacts.externalId, bucketMemberships.userId))
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
        isNull(contacts.deletedAt),
      ),
    );

  const leavers = activeMembers.filter(
    (member) => !stillMatching.has(member.userId),
  );

  let leftCount = 0;
  for (let start = 0; start < leavers.length; start += BATCH_SIZE) {
    const membershipIds = leavers
      .slice(start, start + BATCH_SIZE)
      .map((member) => member.id);

    const flipped = await db
      .update(bucketMemberships)
      .set({
        status: "left",
        leftAt: new Date(),
        lastEvaluatedAt: new Date(),
        updatedAt: new Date(),
      })
      .where(
        and(
          eq(bucketMemberships.bucketId, bucket.id),
          eq(bucketMemberships.status, "active"),
          isNull(bucketMemberships.deletedAt),
          inArray(bucketMemberships.id, membershipIds),
        ),
      )
      .returning({
        userId: bucketMemberships.userId,
        userEmail: bucketMemberships.userEmail,
        entryCount: bucketMemberships.entryCount,
      });

    // Emit only for rows the UPDATE actually flipped.
    for (const { userId, userEmail, entryCount } of flipped) {
      await emitBucketTransition({
        db,
        registry: journeyRegistry,
        hatchet,
        logger,
        kind: "left",
        bucket,
        userId,
        userEmail,
        epoch: entryCount,
        source: "backfill",
      });
    }
    leftCount += flipped.length;
  }

  return leftCount;
}
335
+
/**
 * Set-based matcher user-ids for a single-event criterion (Section 6.6).
 *
 * - `not_exists`: live contacts with NO such event in the window (anti-join of
 *   contacts against the users that do have one).
 * - `exists` / `count`: events are grouped per user inside the window and the
 *   per-user count is tested with `matchesCount`.
 *
 * A GROUP BY over events only yields users with at least one matching event.
 * When the criterion is also satisfied by a count of zero (e.g. `count lt 3`,
 * `count lte 2`, `count eq 0`), live contacts with no matching event are added
 * as well. Otherwise they would be missed by a first-time backfill and wrongly
 * transitioned to `left` by a re-evaluation.
 * NOTE(review): this assumes live evaluation treats "no events" as count 0 —
 * confirm against `evaluateCondition`.
 *
 * @param db - Database handle.
 * @param criteria - The event condition; `within` (when set) bounds the window
 *   to `now - within`.
 * @returns User ids matching the criterion.
 */
async function selectEventMatchers(
  db: Database,
  criteria: Extract<ConditionEval, { type: "event" }>,
): Promise<string[]> {
  const cutoff = criteria.within
    ? new Date(Date.now() - durationToMs(criteria.within))
    : null;

  // Absence: live contacts left-joined to "users with the event", keeping only
  // the contacts that found no partner row.
  if (criteria.check === "not_exists") {
    const present = db
      .select({ userId: userEvents.userId })
      .from(userEvents)
      .where(
        and(
          eq(userEvents.event, criteria.eventName),
          cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
        ),
      )
      .groupBy(userEvents.userId)
      .as("present");

    const rows = await db
      .select({ userId: contacts.externalId })
      .from(contacts)
      .leftJoin(present, eq(present.userId, contacts.externalId))
      .where(and(isNull(contacts.deletedAt), isNull(present.userId)));
    return rows.map((r) => r.userId);
  }

  // exists / count: per-user counts inside the window, filtered in JS by the
  // criterion's operator.
  const counted = await db
    .select({
      userId: userEvents.userId,
      cnt: sql<number>`count(*)::int`,
    })
    .from(userEvents)
    .where(
      and(
        eq(userEvents.event, criteria.eventName),
        cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
      ),
    )
    .groupBy(userEvents.userId);

  const matchers = counted
    .filter((r) => matchesCount(criteria, Number(r.cnt)))
    .map((r) => r.userId);

  // Zero does not satisfy the criterion: the grouped rows are the whole answer.
  if (!matchesCount(criteria, 0)) return matchers;

  // Zero satisfies the criterion: add live contacts that have no counted row.
  const usersWithEvents = new Set(counted.map((r) => r.userId));
  const liveContacts = await db
    .select({ userId: contacts.externalId })
    .from(contacts)
    .where(isNull(contacts.deletedAt));
  for (const contact of liveContacts) {
    if (!usersWithEvents.has(contact.userId)) {
      matchers.push(contact.userId);
    }
  }
  return matchers;
}
387
+
/**
 * Decide whether an event count satisfies an `exists` / `count` criterion.
 *
 * - `exists`: true when the count is positive.
 * - `count`: compares the count against `criteria.value` using
 *   `criteria.operator` (`gt`, `gte`, `lt`, `lte`, `eq`). When either the
 *   operator or the value is missing it falls back to "count is positive".
 * - Any other check, or an unrecognized operator, yields false.
 */
function matchesCount(
  criteria: Extract<ConditionEval, { type: "event" }>,
  count: number,
): boolean {
  const hasAny = count > 0;

  if (criteria.check === "exists") return hasAny;
  if (criteria.check !== "count") return false;

  const { operator, value } = criteria;
  if (!operator || value === undefined) return hasAny;

  if (operator === "gt") return count > value;
  if (operator === "gte") return count >= value;
  if (operator === "lt") return count < value;
  if (operator === "lte") return count <= value;
  if (operator === "eq") return count === value;
  return false;
}
417
+
/**
 * Composite/multi-condition fallback (the documented O(P) exception, Section
 * 6.6). Reads every non-deleted contact in one query, then runs
 * `evaluateCondition` for each contact in turn, one awaited call at a time.
 * The contact's `properties` (or an empty object when null) are supplied as the
 * evaluation's `journeyContext`.
 *
 * @returns The `externalId` of every contact for which the condition held.
 */
async function selectCompositeMatchers(
  db: Database,
  criteria: ConditionEval,
): Promise<string[]> {
  const liveContacts = await db
    .select({
      externalId: contacts.externalId,
      properties: contacts.properties,
    })
    .from(contacts)
    .where(isNull(contacts.deletedAt));

  const matchers: string[] = [];
  for (const { externalId, properties } of liveContacts) {
    const journeyContext =
      (properties as Record<string, unknown> | null) ?? {};
    const isMember = await evaluateCondition({
      condition: criteria,
      ctx: { db, userId: externalId, journeyContext },
    });
    if (!isMember) continue;
    matchers.push(externalId);
  }
  return matchers;
}
450
+
/**
 * Expiry timestamp for a backfilled membership: `now + within`, where `within`
 * is the first window found in the bucket's criteria. Returns null when the
 * bucket has no criteria, is neither `timeBased` nor `fastExpiry`, or its
 * criteria carry no `within` at all.
 */
function computeBackfillExpiresAt(bucket: BucketMeta): Date | null {
  const { criteria } = bucket;
  if (!criteria) return null;

  const expires = bucket.timeBased || bucket.fastExpiry;
  if (!expires) return null;

  const ttl = firstWithin(criteria);
  return ttl ? new Date(Date.now() + durationToMs(ttl)) : null;
}
459
+
/**
 * Locate the first `within` carried by an event condition in a criteria tree,
 * visiting nodes depth-first in declaration order (a node before its children,
 * children left to right). Returns null when no event condition in the tree
 * has a `within`.
 */
function firstWithin(criteria: ConditionEval): DurationObject | null {
  // Explicit stack; children are pushed reversed so they pop left to right.
  const pending: ConditionEval[] = [criteria];
  while (pending.length > 0) {
    const node = pending.pop() as ConditionEval;
    if (node.type === "event" && node.within) {
      return node.within;
    }
    if (node.type === "composite") {
      pending.push(...[...node.conditions].reverse());
    }
  }
  return null;
}
473
+
/**
 * Store the bucket's current criteria fingerprint on `bucket_configs` (Section
 * 6.6 B). Inserts a row for the bucket, or — when one already exists for that
 * `bucketId` — overwrites its `criteriaHash` and bumps `updatedAt`. Uses the
 * same onConflictDoUpdate target as the admin enable/disable path.
 */
async function persistCriteriaHash(
  db: Database,
  bucket: BucketMeta,
): Promise<void> {
  const criteriaHash = computeCriteriaHash(bucket.criteria);
  const newRow = { bucketId: bucket.id, criteriaHash };

  await db
    .insert(bucketConfigs)
    .values(newRow)
    .onConflictDoUpdate({
      target: bucketConfigs.bucketId,
      set: { criteriaHash, updatedAt: new Date() },
    });
}
491
+
/**
 * Detect first-time / criteria-changed buckets at worker boot and enqueue a
 * backfill / re-eval job per bucket (Section 6.6 B). For each enabled dynamic
 * bucket the stored `bucket_configs.criteriaHash` is compared with the hash of
 * the registered criteria:
 *
 * - no config row, or no stored hash → `first-time` backfill;
 * - stored hash differs → `reeval`;
 * - hashes equal → nothing to do.
 *
 * For every bucket that needs work, an `import_jobs` status record is created
 * (`fileName` = bucket id, `format` = mode discriminator, `status` = pending)
 * and the backfill task is triggered for it.
 *
 * The task is triggered with `runNoWait`, so this function only ENQUEUES the
 * run and returns without awaiting the backfill's result. Awaiting the result
 * here would hold up boot for the full duration of each backfill.
 *
 * Safe to call on every boot — equal hashes are skipped. Best-effort: a failure
 * for one bucket is logged as a warning and does not stop the remaining buckets
 * or propagate to the caller.
 */
export async function enqueueBucketBackfills(opts: {
  db: Database;
  logger: Logger;
}): Promise<void> {
  const { db, logger } = opts;
  const registry = getBucketRegistrySingleton();

  for (const bucket of registry.getEnabled()) {
    // Manual and criteria-less buckets have nothing to evaluate.
    if (bucket.kind === "manual" || !bucket.criteria) continue;

    try {
      const config = await db.query.bucketConfigs.findFirst({
        where: eq(bucketConfigs.bucketId, bucket.id),
      });
      const currentHash = computeCriteriaHash(bucket.criteria);

      let mode: BucketBackfillInput["mode"] | null = null;
      if (!config || config.criteriaHash == null) {
        mode = "first-time";
      } else if (config.criteriaHash !== currentHash) {
        mode = "reeval";
      }
      if (!mode) continue;

      const [job] = await db
        .insert(importJobs)
        .values({
          fileName: bucket.id,
          format: mode === "first-time" ? FIRST_TIME_FORMAT : REEVAL_FORMAT,
          status: "pending",
        })
        .returning({ id: importJobs.id });

      if (!job) continue;

      // Fire-and-forget trigger: resolves once the run is queued, not when the
      // backfill finishes.
      await bucketBackfillTask.runNoWait({
        jobId: job.id,
        bucketId: bucket.id,
        mode,
      });

      logger.info("Bucket backfill enqueued", {
        bucketId: bucket.id,
        mode,
        jobId: job.id,
      });
    } catch (err) {
      logger.warn("Bucket backfill enqueue failed", {
        bucketId: bucket.id,
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }
}