@hogsend/engine 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -6
- package/src/buckets/check-membership.ts +490 -0
- package/src/buckets/define-bucket.ts +52 -0
- package/src/buckets/membership-epoch.ts +186 -0
- package/src/buckets/registry-singleton.ts +21 -0
- package/src/buckets/registry.ts +62 -0
- package/src/container.ts +27 -1
- package/src/env.ts +6 -0
- package/src/index.ts +39 -1
- package/src/lib/bucket-emit.ts +107 -0
- package/src/lib/bucket-posthog-sync.ts +63 -0
- package/src/lib/ingestion.ts +25 -0
- package/src/routes/admin/buckets.ts +462 -0
- package/src/routes/admin/index.ts +2 -0
- package/src/routes/admin/metrics.ts +255 -0
- package/src/worker.ts +37 -0
- package/src/workflows/bucket-backfill.ts +593 -0
- package/src/workflows/bucket-reconcile.ts +1010 -0
|
@@ -0,0 +1,593 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import type { JsonObject } from "@hatchet-dev/typescript-sdk/v1/types.js";
|
|
3
|
+
import {
|
|
4
|
+
type BucketMeta,
|
|
5
|
+
type ConditionEval,
|
|
6
|
+
durationToMs,
|
|
7
|
+
evaluateCondition,
|
|
8
|
+
} from "@hogsend/core";
|
|
9
|
+
import {
|
|
10
|
+
bucketConfigs,
|
|
11
|
+
bucketMemberships,
|
|
12
|
+
contacts,
|
|
13
|
+
createDatabase,
|
|
14
|
+
type Database,
|
|
15
|
+
importJobs,
|
|
16
|
+
userEvents,
|
|
17
|
+
} from "@hogsend/db";
|
|
18
|
+
import { and, eq, gt, gte, inArray, isNull, sql } from "drizzle-orm";
|
|
19
|
+
import {
|
|
20
|
+
computeExpiresAt,
|
|
21
|
+
computeMaxDwellAt,
|
|
22
|
+
matchesEventCount,
|
|
23
|
+
} from "../buckets/membership-epoch.js";
|
|
24
|
+
import { getBucketRegistrySingleton } from "../buckets/registry-singleton.js";
|
|
25
|
+
import { getJourneyRegistrySingleton } from "../journeys/registry-singleton.js";
|
|
26
|
+
import { emitBucketTransition } from "../lib/bucket-emit.js";
|
|
27
|
+
import { hatchet } from "../lib/hatchet.js";
|
|
28
|
+
import type { Logger } from "../lib/logger.js";
|
|
29
|
+
import { createLogger } from "../lib/logger.js";
|
|
30
|
+
|
|
31
|
+
/**
 * Rows handled per batch: insert chunks, CAS-update chunks and keyset pages all
 * use this size. Reuses the import-contacts precedent (Section 6.6).
 */
const BATCH_SIZE = 500;

/**
 * `import_jobs.format` discriminators for the reused status record (Section 6.6).
 * One value per backfill mode: first-time materialization vs. criteria-change
 * re-evaluation.
 */
export const FIRST_TIME_FORMAT = "bucket-backfill";
const REEVAL_FORMAT = "bucket-reeval";
|
|
37
|
+
|
|
38
|
+
/**
 * Fingerprints a bucket's criteria (Section 6.6 B).
 *
 * The `ConditionEval` tree is first serialized canonically (object keys sorted,
 * so key order never changes the result) and the canonical text is then hashed
 * with sha256. The hex digest is what gets persisted on
 * `bucket_configs.criteriaHash` and compared on the next boot to detect a
 * criteria change and enqueue a re-evaluation.
 *
 * @param criteria - The bucket's criteria; `undefined` is hashed as `null`.
 * @returns The sha256 digest as a lowercase hex string.
 */
export function computeCriteriaHash(
  criteria: ConditionEval | undefined,
): string {
  const canonical = stableStringify(criteria ?? null);
  const hasher = createHash("sha256");
  hasher.update(canonical);
  return hasher.digest("hex");
}
|
|
51
|
+
|
|
52
|
+
/**
 * Deterministic JSON-style serializer used to fingerprint criteria.
 *
 * - Primitives are encoded with `JSON.stringify`.
 * - Arrays keep their element order.
 * - Plain objects are emitted with their keys sorted and with
 *   `undefined`-valued keys dropped, so two structurally equal objects always
 *   produce the same text regardless of insertion order.
 *
 * Values that `JSON.stringify` cannot encode on their own (`undefined`,
 * functions, symbols) are written as `null`. That matches how `JSON.stringify`
 * treats them inside arrays, keeps the return value a real string, and stops
 * `[undefined]` from collapsing into the same text as `[]`.
 *
 * @param value - Any JSON-like value.
 * @returns The canonical serialization of `value`.
 */
function stableStringify(value: unknown): string {
  if (value === null || typeof value !== "object") {
    // JSON.stringify yields `undefined` (not a string) for undefined, functions
    // and symbols; fall back to "null" so callers always get a string.
    const encoded: string | undefined = JSON.stringify(value);
    return encoded ?? "null";
  }

  if (Array.isArray(value)) {
    const items = value.map((item) => stableStringify(item));
    return `[${items.join(",")}]`;
  }

  const entries = Object.entries(value as Record<string, unknown>)
    .filter(([, v]) => v !== undefined)
    // Plain code-unit comparison: locale-independent, hence stable across hosts.
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
    .map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`);
  return `{${entries.join(",")}}`;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Input of the engine-owned backfill / criteria-change re-evaluation task
 * (Section 6.6). The task runs in one of two modes:
 *
 * - mode:"first-time" — a NEW bucket id appeared. The full member set is
 *   materialized via a SET-BASED query per criteria shape and inserted as
 *   `active` rows (`source:"backfill"`, onConflictDoNothing on the
 *   partial-active unique index). Live join emission is SUPPRESSED: historical
 *   matches must not fire `bucket:entered` into live journeys (the Customer.io
 *   rule).
 * - mode:"reeval" — an EXISTING bucket's criteria changed (detected via a
 *   `criteriaHash` diff at boot). A FULL diff is applied: new matchers are
 *   INSERTED as active rows (joins, NO emit) AND active members who no longer
 *   match are transitioned → `left` via CAS (leaves EMIT `bucket:left` so
 *   in-flight journeys exit).
 *
 * Progress is tracked in `import_jobs` (the precedent), discriminated by `format`
 * (`bucket-backfill` / `bucket-reeval`) with `fileName` carrying the bucketId, so
 * the Studio "building / live" badge derives from a real status record (Section
 * 11.3). Set-based, chunked, idempotent, resumable — never run in a migration.
 */
export interface BucketBackfillInput extends JsonObject {
  /** Id of the `import_jobs` row that tracks this run's status and progress. */
  jobId: string;
  /** Id of the bucket to process; resolved through the bucket registry. */
  bucketId: string;
  /** `"first-time"` for a new bucket, `"reeval"` after a criteria change. */
  mode: "first-time" | "reeval";
}
|
|
90
|
+
|
|
91
|
+
/**
 * Hatchet task that backfills or re-evaluates one bucket and mirrors the outcome
 * onto the `import_jobs` row identified by `input.jobId`.
 *
 * Flow:
 *  1. Resolve the bucket; an unregistered, manual or criteria-less bucket marks
 *     the job `failed` with `bucket_unregistered_or_manual` and returns.
 *  2. Mark the job `processing`.
 *  3. Insert new matchers (both modes, no join emission).
 *  4. In `reeval` mode only, flip no-longer-matching active members to `left`.
 *  5. Persist the criteria hash, then mark the job `completed`.
 *
 * Any error thrown in steps 3–5 is caught, recorded on the job as `failed`, and
 * returned as `{ status: "failed", reason }` rather than rethrown. The task is
 * declared with `retries: 0` and a 600s execution timeout.
 */
export const bucketBackfillTask = hatchet.task({
  name: "bucket-backfill",
  retries: 0,
  executionTimeout: "600s",
  fn: async (input: BucketBackfillInput) => {
    // NOTE(review): a database handle is created per run and never released
    // here — confirm whether createDatabase pools/reuses connections.
    const { db } = createDatabase({ url: process.env.DATABASE_URL ?? "" });
    const logger = createLogger(process.env.LOG_LEVEL ?? "info");
    const registry = getBucketRegistrySingleton();
    const journeyRegistry = getJourneyRegistrySingleton();

    // Fresh predicate per query: selects this run's status record.
    const thisJob = () => eq(importJobs.id, input.jobId);

    const target = registry.get(input.bucketId);
    if (!target || target.kind === "manual" || !target.criteria) {
      const reason = "bucket_unregistered_or_manual";
      await db
        .update(importJobs)
        .set({
          status: "failed",
          errors: [{ row: 0, error: reason }],
          updatedAt: new Date(),
        })
        .where(thisJob());
      return { status: "failed", reason };
    }

    await db
      .update(importJobs)
      .set({ status: "processing", updatedAt: new Date() })
      .where(thisJob());

    try {
      // (A/B) JOINS — new matchers become active rows. Neither mode emits on
      // join (Section 6.6).
      const joined = await backfillJoins({
        db,
        logger,
        bucket: target,
        jobId: input.jobId,
      });

      // (B only) LEAVES — active members who stopped matching are flipped to
      // `left` via CAS and EMIT bucket:left so in-flight journeys exit.
      const leftCount =
        input.mode === "reeval"
          ? await reevalLeaves({
              db,
              logger,
              journeyRegistry,
              bucket: target,
            })
          : 0;

      // Store the current fingerprint so the next boot diff is a no-op until
      // the criteria really change again (Section 6.6 B).
      await persistCriteriaHash(db, target);

      await db
        .update(importJobs)
        .set({
          status: "completed",
          processedRows: joined + leftCount,
          updatedAt: new Date(),
        })
        .where(thisJob());

      logger.info("Bucket backfill complete", {
        bucketId: target.id,
        mode: input.mode,
        joined,
        left: leftCount,
      });
      return { status: "completed", joined, left: leftCount };
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      await db
        .update(importJobs)
        .set({
          status: "failed",
          errors: [{ row: 0, error: message }],
          updatedAt: new Date(),
        })
        .where(thisJob());
      logger.error("Bucket backfill failed", { bucketId: target.id, message });
      return { status: "failed", reason: message };
    }
  },
});
|
|
176
|
+
|
|
177
|
+
/**
 * Materializes the bucket's current matchers as `active` membership rows.
 *
 * The matcher ids are resolved once (set-based SQL for a single-event / count
 * criterion, otherwise the chunked per-contact `evaluateCondition` fallback — the
 * documented O(P) exception) and then inserted in BATCH_SIZE chunks with
 * `source:"backfill"`. `onConflictDoNothing` leaves existing active rows
 * untouched, which makes re-runs idempotent. NO live join emission happens here
 * (Section 6.6).
 *
 * Side effects on the `import_jobs` row `jobId`: `totalRows` is set to the number
 * of matchers, and `processedRows` is updated after every chunk.
 *
 * @returns The number of rows actually inserted (NEW memberships only).
 */
async function backfillJoins(opts: {
  db: Database;
  logger: Logger;
  bucket: BucketMeta;
  jobId: string;
}): Promise<number> {
  const { db, bucket, jobId } = opts;
  const criteria = bucket.criteria as ConditionEval;

  let matcherIds: string[];
  if (criteria.type === "event") {
    matcherIds = await selectEventMatchers(db, criteria);
  } else {
    matcherIds = await selectCompositeMatchers(db, criteria);
  }

  await db
    .update(importJobs)
    .set({ totalRows: matcherIds.length, updatedAt: new Date() })
    .where(eq(importJobs.id, jobId));

  // Unconditional max-dwell TTL deadline, computed once and stamped on every
  // inserted row (mirrors the live join, check-membership.ts). It is null when
  // the bucket has no maxDwell; the TTL sweep (reconcileBucketTtlLeaves) filters
  // isNotNull(maxDwellAt), so an unset value would never be force-left.
  const maxDwellAt = computeMaxDwellAt(bucket);

  // Fix C (DEFERRED): backfilled fastExpiry rows are NOT armed with a
  // bucket:arm-expiry durable timer here. The next cron sweep picks them up
  // instead (reconcileBucketLeaves / reconcileBucketTtlLeaves are the
  // authoritative backstop). This is a conscious choice (cron cadence, default
  // 5m), not an omission: arming here would fan out one durable task per row.

  let inserted = 0;
  let offset = 0;
  while (offset < matcherIds.length) {
    const userIds = matcherIds.slice(offset, offset + BATCH_SIZE);
    offset += BATCH_SIZE;

    // userEmail is filled from the live contacts row where one exists.
    const contactRows = await db
      .select({ externalId: contacts.externalId, email: contacts.email })
      .from(contacts)
      .where(
        and(inArray(contacts.externalId, userIds), isNull(contacts.deletedAt)),
      );
    const emailByUser = new Map(
      contactRows.map(({ externalId, email }) => [externalId, email]),
    );

    // Fix A: entryCount = 1 + prior memberships for each (user, bucket), the
    // same monotonic ordinal the live join computes (check-membership.ts). A
    // FIRST-TIME backfill has no prior rows → entryCount 1; a REEVAL re-join of
    // a user with historical "left" rows advances the epoch correctly. This is
    // ONE batched GROUP BY per chunk (never per-user — the set-based path must
    // not reintroduce the O(P) serial-query trap).
    const priorRows = await db
      .select({
        userId: bucketMemberships.userId,
        cnt: sql<number>`count(*)::int`,
      })
      .from(bucketMemberships)
      .where(
        and(
          eq(bucketMemberships.bucketId, bucket.id),
          inArray(bucketMemberships.userId, userIds),
        ),
      )
      .groupBy(bucketMemberships.userId);
    const priorByUser = new Map(
      priorRows.map(({ userId, cnt }) => [userId, Number(cnt)]),
    );

    const membershipRows = userIds.map((userId) => {
      const priorEntries = priorByUser.get(userId) ?? 0;
      return {
        userId,
        userEmail: emailByUser.get(userId) ?? null,
        bucketId: bucket.id,
        status: "active" as const,
        source: "backfill" as const,
        entryCount: 1 + priorEntries,
        expiresAt: computeExpiresAt(bucket),
        maxDwellAt,
        lastEvaluatedAt: new Date(),
      };
    });

    const created = await db
      .insert(bucketMemberships)
      .values(membershipRows)
      .onConflictDoNothing()
      .returning({ id: bucketMemberships.id });
    inserted += created.length;

    // Publish running progress after each chunk.
    await db
      .update(importJobs)
      .set({ processedRows: inserted, updatedAt: new Date() })
      .where(eq(importJobs.id, jobId));
  }

  return inserted;
}
|
|
284
|
+
|
|
285
|
+
/**
 * Re-eval LEAVES (mode:"reeval" only).
 *
 * Loads every active, non-deleted membership of the bucket whose contact is also
 * non-deleted, and keeps those whose user is absent from the current matcher set.
 * Those leavers are flipped to `left` in BATCH_SIZE chunks. The UPDATE re-checks
 * `status = "active"` (CAS), so a row that changed in the meantime is skipped.
 * `bucket:left` is EMITTED for every row that was actually flipped (Section 6.6 B
 * asymmetry: criteria-change LEAVES emit).
 *
 * Matchers come from the set-based query for single-event criteria and from the
 * chunked per-contact evaluation otherwise.
 *
 * @returns The number of memberships that were transitioned to `left`.
 */
async function reevalLeaves(opts: {
  db: Database;
  logger: Logger;
  journeyRegistry: ReturnType<typeof getJourneyRegistrySingleton>;
  bucket: BucketMeta;
}): Promise<number> {
  const { db, logger, journeyRegistry, bucket } = opts;
  const criteria = bucket.criteria as ConditionEval;

  // Users who STILL match; any active member outside this set is a leaver.
  let stillMatching: string[];
  if (criteria.type === "event") {
    stillMatching = await selectEventMatchers(db, criteria);
  } else {
    stillMatching = await selectCompositeMatchers(db, criteria);
  }
  const matcherSet = new Set(stillMatching);

  const activeMembers = await db
    .select({
      id: bucketMemberships.id,
      userId: bucketMemberships.userId,
      userEmail: bucketMemberships.userEmail,
      entryCount: bucketMemberships.entryCount,
    })
    .from(bucketMemberships)
    .innerJoin(contacts, eq(contacts.externalId, bucketMemberships.userId))
    .where(
      and(
        eq(bucketMemberships.bucketId, bucket.id),
        eq(bucketMemberships.status, "active"),
        isNull(bucketMemberships.deletedAt),
        isNull(contacts.deletedAt),
      ),
    );

  const leavers = activeMembers.filter(
    (member) => !matcherSet.has(member.userId),
  );
  if (leavers.length === 0) {
    return 0;
  }

  let leftCount = 0;
  let offset = 0;
  while (offset < leavers.length) {
    const membershipIds = leavers
      .slice(offset, offset + BATCH_SIZE)
      .map((member) => member.id);
    offset += BATCH_SIZE;

    // CAS flip: only rows that are still active (and not deleted) transition.
    const flipped = await db
      .update(bucketMemberships)
      .set({
        status: "left",
        leftAt: new Date(),
        lastEvaluatedAt: new Date(),
        updatedAt: new Date(),
      })
      .where(
        and(
          eq(bucketMemberships.bucketId, bucket.id),
          eq(bucketMemberships.status, "active"),
          isNull(bucketMemberships.deletedAt),
          inArray(bucketMemberships.id, membershipIds),
        ),
      )
      .returning({
        userId: bucketMemberships.userId,
        userEmail: bucketMemberships.userEmail,
        entryCount: bucketMemberships.entryCount,
      });

    // Emit one transition at a time, only for rows this run actually flipped.
    for (const { userId, userEmail, entryCount } of flipped) {
      await emitBucketTransition({
        db,
        registry: journeyRegistry,
        hatchet,
        logger,
        kind: "left",
        bucket,
        userId,
        userEmail,
        epoch: entryCount,
        source: "backfill",
      });
    }

    leftCount += flipped.length;
  }

  return leftCount;
}
|
|
375
|
+
|
|
376
|
+
/**
 * Set-based lookup of the user ids matching a single-event criterion
 * (Section 6.6).
 *
 * When `criteria.within` is set, only events at or after `now - within` count.
 *
 * - `not_exists`: live contacts who fired the event at least once ever but have
 *   no occurrence inside the window (lapsed-only).
 * - any other check: per-user event counts (live contacts only) filtered
 *   through `matchesEventCount`.
 *
 * @returns The matching user ids.
 */
async function selectEventMatchers(
  db: Database,
  criteria: Extract<ConditionEval, { type: "event" }>,
): Promise<string[]> {
  const { eventName } = criteria;
  const cutoff = criteria.within
    ? new Date(Date.now() - durationToMs(criteria.within))
    : null;

  // not_exists (absence) → live contacts who EVER fired the event but have NONE
  // in the window (lapsed-only). A bare windowed `not_exists within W` is
  // treated as LAPSED-ONLY (never-active EXCLUDED) in BOTH this backfill and the
  // cron (bucket-reconcile.ts reconcileBucketJoins, the everFired floor), so the
  // two writers agree: brand-new never-active signups are NOT materialized for
  // an absence-within-window bucket — only users who once did X and then
  // stopped.
  if (criteria.check === "not_exists") {
    // everFired floor: users with AT LEAST ONE occurrence, no window applied
    // (mirrors the cron's `ever_fired` semi-join). This excludes never-active
    // contacts so both writers select the same lapsed-only cohort.
    const everFired = db
      .selectDistinct({ userId: userEvents.userId })
      .from(userEvents)
      .where(eq(userEvents.event, eventName))
      .as("ever_fired");

    // Users with an occurrence inside the window (or at any time when no window
    // is configured).
    const present = db
      .select({ userId: userEvents.userId })
      .from(userEvents)
      .where(
        and(
          eq(userEvents.event, eventName),
          cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
        ),
      )
      .groupBy(userEvents.userId)
      .as("present");

    // Anti-join: keep contacts that ever fired but have no `present` row.
    const lapsed = await db
      .select({ userId: contacts.externalId })
      .from(contacts)
      .innerJoin(everFired, eq(everFired.userId, contacts.externalId))
      .leftJoin(present, eq(present.userId, contacts.externalId))
      .where(and(isNull(contacts.deletedAt), isNull(present.userId)));
    return lapsed.map(({ userId }) => userId);
  }

  // exists / count: group counts per user, then filter by the operator. Fix B:
  // innerJoin live contacts (GDPR — only materialize memberships for non-deleted
  // contacts that actually exist), mirroring selectEventLeavers in
  // bucket-reconcile.ts. The not_exists branch above already filters
  // contacts.deletedAt; without this join the positive-event path could
  // materialize active rows for soft-deleted or orphan-event userIds, diverging
  // from the live/reconcile paths.
  const counted = await db
    .select({
      userId: userEvents.userId,
      cnt: sql<number>`count(*)::int`,
    })
    .from(userEvents)
    .innerJoin(contacts, eq(contacts.externalId, userEvents.userId))
    .where(
      and(
        eq(userEvents.event, eventName),
        isNull(contacts.deletedAt),
        cutoff ? gte(userEvents.occurredAt, cutoff) : undefined,
      ),
    )
    .groupBy(userEvents.userId);

  const matched: string[] = [];
  for (const { userId, cnt } of counted) {
    if (matchesEventCount(criteria, Number(cnt))) {
      matched.push(userId);
    }
  }
  return matched;
}
|
|
449
|
+
|
|
450
|
+
/**
 * Composite / multi-condition fallback (the documented O(P) exception,
 * Section 6.6): runs `evaluateCondition` once per live contact, passing the
 * contact's stored properties as the journey context (an empty object when the
 * contact has none).
 *
 * Contacts are walked with KEYSET PAGINATION on `contacts.externalId` (mirrors
 * reconcileBucketJoins' `externalId asc` paging): each page is
 * `WHERE externalId > :cursor ORDER BY externalId ASC LIMIT BATCH_SIZE`, and the
 * cursor then advances to the last externalId of that page. A page shorter than
 * BATCH_SIZE ends the scan, so the whole contacts table is never held in memory
 * at once.
 *
 * @returns The externalIds of all contacts for which the criteria evaluated true.
 */
async function selectCompositeMatchers(
  db: Database,
  criteria: ConditionEval,
): Promise<string[]> {
  const matchers: string[] = [];
  let cursor: string | null = null;

  while (true) {
    const page = await db
      .select({
        externalId: contacts.externalId,
        properties: contacts.properties,
      })
      .from(contacts)
      .where(
        and(
          isNull(contacts.deletedAt),
          // First page has no lower bound; later pages resume after the cursor.
          cursor != null ? gt(contacts.externalId, cursor) : undefined,
        ),
      )
      .orderBy(sql`${contacts.externalId} asc`)
      .limit(BATCH_SIZE);

    // Evaluated one contact at a time, in page order.
    for (const { externalId, properties } of page) {
      const journeyContext =
        (properties as Record<string, unknown> | null) ?? {};
      const isMember = await evaluateCondition({
        condition: criteria,
        ctx: { db, userId: externalId, journeyContext },
      });
      if (isMember) {
        matchers.push(externalId);
      }
    }

    // A short page (fewer rows than a full batch) means the scan is exhausted.
    if (page.length < BATCH_SIZE) {
      break;
    }
    cursor = page[page.length - 1]?.externalId ?? null;
    if (cursor == null) {
      break;
    }
  }

  return matchers;
}
|
|
506
|
+
|
|
507
|
+
/**
 * Stores the bucket's current criteria fingerprint on `bucket_configs`
 * (Section 6.6 B).
 *
 * Implemented as an upsert keyed on `bucketId`: a missing row is inserted, an
 * existing row gets its `criteriaHash` and `updatedAt` overwritten. Mirrors the
 * admin enable/disable onConflictDoUpdate target.
 */
async function persistCriteriaHash(
  db: Database,
  bucket: BucketMeta,
): Promise<void> {
  const criteriaHash = computeCriteriaHash(bucket.criteria);

  await db
    .insert(bucketConfigs)
    .values({ bucketId: bucket.id, criteriaHash })
    .onConflictDoUpdate({
      target: bucketConfigs.bucketId,
      set: { criteriaHash, updatedAt: new Date() },
    });
}
|
|
524
|
+
|
|
525
|
+
/**
 * Detect first-time / criteria-changed buckets at worker boot and enqueue a
 * backfill / re-eval job per bucket (Section 6.6 B).
 *
 * For each enabled dynamic bucket (manual or criteria-less buckets are skipped)
 * the stored `bucket_configs.criteriaHash` is compared with the current hash:
 *  - no config row / no stored hash → `first-time` backfill;
 *  - stored hash differs            → `reeval`;
 *  - hashes equal                   → no-op.
 *
 * A job is recorded as a `pending` `import_jobs` row (`fileName` = bucket id,
 * `format` = mode discriminator) and then triggered via
 * `bucketBackfillTask.runNoWait(...)`.
 *
 * Safe to call on every boot — equal hashes are skipped. Best-effort: errors are
 * logged per bucket and never thrown, so a failure to enqueue cannot crash
 * worker boot. If triggering the run fails AFTER the status record was created,
 * that record is marked `failed` (best-effort) instead of being left `pending`
 * forever with no run behind it.
 *
 * @param opts.db - Database handle used for the config lookup and job record.
 * @param opts.logger - Receives the enqueue / failure log lines.
 */
export async function enqueueBucketBackfills(opts: {
  db: Database;
  logger: Logger;
}): Promise<void> {
  const { db, logger } = opts;
  const registry = getBucketRegistrySingleton();

  for (const bucket of registry.getEnabled()) {
    if (bucket.kind === "manual" || !bucket.criteria) continue;

    // Holds the job id between "status record inserted" and "run triggered", so
    // the catch below can tell whether a `pending` row would be orphaned.
    let untriggeredJobId: string | null = null;

    try {
      const config = await db.query.bucketConfigs.findFirst({
        where: eq(bucketConfigs.bucketId, bucket.id),
      });
      const currentHash = computeCriteriaHash(bucket.criteria);

      let mode: BucketBackfillInput["mode"] | null = null;
      if (!config || config.criteriaHash == null) {
        mode = "first-time";
      } else if (config.criteriaHash !== currentHash) {
        mode = "reeval";
      }
      if (!mode) continue;

      const [job] = await db
        .insert(importJobs)
        .values({
          fileName: bucket.id,
          format: mode === "first-time" ? FIRST_TIME_FORMAT : REEVAL_FORMAT,
          status: "pending",
        })
        .returning({ id: importJobs.id });

      if (!job) continue;
      untriggeredJobId = job.id;

      // runNoWait (fire-and-forget): this is called from worker boot BEFORE the
      // listener starts, so awaiting the run would deadlock (the run needs the
      // listener that `_worker.start()` brings up). The triggered run queues and
      // executes once listening; the task itself persists the criteriaHash.
      await bucketBackfillTask.runNoWait({
        jobId: job.id,
        bucketId: bucket.id,
        mode,
      });
      untriggeredJobId = null;

      logger.info("Bucket backfill enqueued", {
        bucketId: bucket.id,
        mode,
        jobId: job.id,
      });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      logger.warn("Bucket backfill enqueue failed", {
        bucketId: bucket.id,
        error: message,
      });

      if (untriggeredJobId !== null) {
        // The status record exists but no run was triggered for it: close it
        // out so it does not report an in-progress build that will never run.
        try {
          await db
            .update(importJobs)
            .set({
              status: "failed",
              errors: [{ row: 0, error: message }],
              updatedAt: new Date(),
            })
            .where(eq(importJobs.id, untriggeredJobId));
        } catch (markErr) {
          // Still best-effort: never let cleanup break worker boot.
          logger.warn("Bucket backfill job could not be marked failed", {
            bucketId: bucket.id,
            jobId: untriggeredJobId,
            error: markErr instanceof Error ? markErr.message : String(markErr),
          });
        }
      }
    }
  }
}
|