@hogsend/engine 0.12.2 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hogsend/engine",
3
- "version": "0.12.2",
3
+ "version": "0.13.1",
4
4
  "type": "module",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -40,14 +40,14 @@
40
40
  "svix": "^1.95.1",
41
41
  "winston": "^3.19.0",
42
42
  "zod": "^4.4.3",
43
- "@hogsend/core": "^0.12.2",
44
- "@hogsend/db": "^0.12.2",
45
- "@hogsend/email": "^0.12.2",
46
- "@hogsend/plugin-posthog": "^0.12.2",
47
- "@hogsend/plugin-resend": "^0.12.2"
43
+ "@hogsend/core": "^0.13.1",
44
+ "@hogsend/db": "^0.13.1",
45
+ "@hogsend/email": "^0.13.1",
46
+ "@hogsend/plugin-posthog": "^0.13.1",
47
+ "@hogsend/plugin-resend": "^0.13.1"
48
48
  },
49
49
  "optionalDependencies": {
50
- "@hogsend/plugin-postmark": "^0.12.2"
50
+ "@hogsend/plugin-postmark": "^0.13.1"
51
51
  },
52
52
  "devDependencies": {
53
53
  "@types/node": "^22.15.3",
package/src/env.ts CHANGED
@@ -1,5 +1,13 @@
1
1
  import { createEnv } from "@t3-oss/env-core";
2
2
  import { z } from "zod";
3
+ import { addrSpecOf } from "./lib/from-address.js";
4
+
5
+ // A from address may be bare ("doug@hogsend.com") or carry a display name
6
+ // ("Doug at Hogsend <doug@hogsend.com>") — both are valid provider wire
7
+ // formats. Domain derivation (lib/from-address.ts) parses either form.
8
+ const fromAddress = z.string().refine((value) => addrSpecOf(value) !== null, {
9
+ message: 'Must be an email address or "Display Name <email>"',
10
+ });
3
11
 
4
12
  /**
5
13
  * The HTTP API contract version — surfaced in the OpenAPI document
@@ -35,6 +43,14 @@ export const env = createEnv({
35
43
  // (the single intended secret-logging exception) — rotate it immediately via
36
44
  // the Studio forgot/reset flow. Min length matches better-auth's policy.
37
45
  STUDIO_ADMIN_PASSWORD: z.string().min(8).optional(),
46
+ // --- First-boot data-plane key bootstrap (lib/boot-api-key.ts) ---
47
+ // When the api_keys table is COMPLETELY empty on API boot (a template
48
+ // deploy that never ran the local `pnpm bootstrap`), the engine mints one
49
+ // ingest-scoped key ("bootstrap-ingest") and prints the FULL key ONCE to
50
+ // the server log — the data-plane sibling of the first-admin password
51
+ // above. Set "false" to opt out. A string enum (not z.coerce.boolean) so
52
+ // an explicit "false" actually disables it.
53
+ HOGSEND_BOOTSTRAP_API_KEY: z.enum(["true", "false"]).default("true"),
38
54
  // Extra origins allowed to call the auth endpoints (beyond BETTER_AUTH_URL),
39
55
  // comma-separated. Needed when the Studio is served from a different origin
40
56
  // than the API — e.g. the `hogsend studio` CLI pointing at a remote instance.
@@ -44,7 +60,7 @@ export const env = createEnv({
44
60
  // (container.ts) and the future `emailProvidersFromEnv` preset. With this
45
61
  // optional, a Postmark-only deploy boots without a Resend key.
46
62
  RESEND_API_KEY: z.string().min(1).optional(),
47
- RESEND_FROM_EMAIL: z.string().email().default("noreply@hogsend.com"),
63
+ RESEND_FROM_EMAIL: fromAddress.default("noreply@hogsend.com"),
48
64
  // --- Provider-neutral email config (BYO email provider) ---
49
65
  // The active email provider id the container resolves from the
50
66
  // EmailProviderRegistry. Absent → "resend" (today's byte-for-byte default).
@@ -52,7 +68,7 @@ export const env = createEnv({
52
68
  // Neutral default-from address. The mailer's `defaultFrom` is
53
69
  // `EMAIL_FROM ?? RESEND_FROM_EMAIL`, so an unset EMAIL_FROM keeps today's
54
70
  // Resend-named default.
55
- EMAIL_FROM: z.string().email().optional(),
71
+ EMAIL_FROM: fromAddress.optional(),
56
72
  // The sending domain the domain-status service reports on. OVERRIDES the
57
73
  // default derivation (host part of EMAIL_FROM, falling back to the host of
58
74
  // RESEND_FROM_EMAIL) — set it when you send from a subaddress domain that
package/src/index.ts CHANGED
@@ -156,6 +156,8 @@ export {
156
156
  reportWorkerReady,
157
157
  type WorkerReadyInfo,
158
158
  } from "./lib/boot.js";
159
+ // --- First-boot data-plane key bootstrap (API process only, mirrors admin) ---
160
+ export { bootstrapApiKeyFromEnv } from "./lib/boot-api-key.js";
159
161
  // --- First-admin creation (CLI + boot bootstrap share this scrypt-correct path)
160
162
  export { bootstrapAdminFromEnv } from "./lib/bootstrap-admin.js";
161
163
  // --- Bucket transition emission (shared by real-time / cron / fast-expiry) ---
@@ -201,6 +203,7 @@ export type {
201
203
  // --- Enrollment guards ---
202
204
  export { checkEmailPreferences } from "./lib/enrollment-guards.js";
203
205
  export { isFrequencyCapped } from "./lib/frequency-cap.js";
206
+ export { addrSpecOf, hostOfFromAddress } from "./lib/from-address.js";
204
207
  export { hatchet } from "./lib/hatchet.js";
205
208
  // --- Ingestion pipeline ---
206
209
  export {
@@ -267,6 +267,14 @@ export function defineJourney(options: {
267
267
  return { stateId, status: "exited" };
268
268
  }
269
269
 
270
+ logger.error("Journey run failed", {
271
+ journeyId: meta.id,
272
+ journeyName: meta.name,
273
+ stateId,
274
+ userId,
275
+ error: message,
276
+ });
277
+
270
278
  await hatchet.events.push("journey:failed", {
271
279
  journeyId: meta.id,
272
280
  stateId,
@@ -0,0 +1,101 @@
1
+ import { apiKeys } from "@hogsend/db";
2
+ import { sql } from "drizzle-orm";
3
+ import type { HogsendClient } from "../container.js";
4
+ import { generateApiKey } from "./api-key-hash.js";
5
+
6
+ /** Name of the key minted by the first-boot bootstrap (visible in Studio/admin). */
7
+ export const BOOTSTRAP_API_KEY_NAME = "bootstrap-ingest";
8
+
9
+ /**
10
+ * Boot-time first-key bootstrap — the data-plane sibling of
11
+ * `bootstrapAdminFromEnv` (lib/bootstrap-admin.ts). A local scaffold runs
12
+ * `pnpm bootstrap`, which mints an ingest-scoped `hsk_` key into `api_keys`
13
+ * BEFORE first boot — but a template deploy (Railway) never runs that script,
14
+ * so a fresh deployed instance has NO data-plane key and the customer's first
15
+ * `POST /v1/events` has nothing to authenticate with. This closes that gap.
16
+ *
17
+ * Contract (all conditions must hold to mint):
18
+ * - `HOGSEND_BOOTSTRAP_API_KEY` is not `"false"` (default on; set `false` to
19
+ * opt out entirely).
20
+ * - The `api_keys` table has ZERO rows — revoked included, i.e. truly first
21
+ * boot. Any existing row (including the local-bootstrap key) ⇒ no-op, so
22
+ * the full key is naturally never logged twice.
23
+ *
24
+ * What is minted: one key named `bootstrap-ingest` with `scopes: ["ingest"]`
25
+ * — exactly what the scaffold's local bootstrap mints. Only the sha256 hash is
26
+ * stored (same `generateApiKey` the admin api-keys route uses); the FULL key is
27
+ * printed ONCE to the server log at warn level — the same intended
28
+ * secret-logging exception as the generated first-admin password ("shown
29
+ * once"). Rotate/revoke it any time via `POST /v1/admin/api-keys`.
30
+ *
31
+ * Concurrency: unlike the admin bootstrap there is no unique constraint to
32
+ * break a tie (two replicas would mint two different keys), so the zero-check +
33
+ * insert runs in a transaction serialized by a pg advisory xact lock — exactly
34
+ * one key is ever minted on a fresh table. Never fatal: any failure is logged
35
+ * and boot continues (the admin API remains the manual path).
36
+ *
37
+ * Runs in the API process only (not the worker) — same boot path as
38
+ * `bootstrapAdminFromEnv`, after the schema guard.
39
+ */
40
+ export async function bootstrapApiKeyFromEnv(opts: {
41
+ client: HogsendClient;
42
+ }): Promise<void> {
43
+ const { db, env, logger } = opts.client;
44
+
45
+ if (env.HOGSEND_BOOTSTRAP_API_KEY === "false") return;
46
+
47
+ try {
48
+ // Cheap pre-check outside the transaction: every boot after the first
49
+ // returns here without taking the lock.
50
+ const existing = await db.select({ id: apiKeys.id }).from(apiKeys).limit(1);
51
+ if (existing.length > 0) return;
52
+
53
+ const minted = await db.transaction(async (tx) => {
54
+ // Serialize concurrent replicas booting on a fresh DB: the loser blocks
55
+ // here, then sees the winner's row and no-ops. Lock is released on commit.
56
+ await tx.execute(
57
+ sql`select pg_advisory_xact_lock(hashtext('hogsend:bootstrap-api-key'))`,
58
+ );
59
+
60
+ const recheck = await tx
61
+ .select({ id: apiKeys.id })
62
+ .from(apiKeys)
63
+ .limit(1);
64
+ if (recheck.length > 0) return null;
65
+
66
+ const { key, prefix, hash } = generateApiKey();
67
+ await tx.insert(apiKeys).values({
68
+ name: BOOTSTRAP_API_KEY_NAME,
69
+ keyPrefix: prefix,
70
+ keyHash: hash,
71
+ scopes: ["ingest"],
72
+ createdBy: "boot",
73
+ });
74
+ return key;
75
+ });
76
+
77
+ if (!minted) {
78
+ logger.debug(
79
+ "[api-keys] First-boot key bootstrap skipped: a key already exists.",
80
+ );
81
+ return;
82
+ }
83
+
84
+ // The intended secret-logging exception (mirrors the generated first-admin
85
+ // password). Shown once — the table is non-empty from now on, so this
86
+ // branch is unreachable on every subsequent boot.
87
+ logger.warn(
88
+ `[api-keys] First-boot ingest API key (shown once — save it now): ${minted}`,
89
+ );
90
+ logger.warn(
91
+ "[api-keys] Use it as HOGSEND_API_KEY / `Authorization: Bearer <key>` " +
92
+ "for POST /v1/events. Rotate or revoke via POST /v1/admin/api-keys. " +
93
+ "Disable this bootstrap with HOGSEND_BOOTSTRAP_API_KEY=false.",
94
+ );
95
+ } catch (err) {
96
+ const message = err instanceof Error ? err.message : String(err);
97
+ logger.error("[api-keys] First-boot key bootstrap failed.", {
98
+ error: message,
99
+ });
100
+ }
101
+ }
@@ -1,5 +1,6 @@
1
1
  import type { DomainStatus, EmailProvider } from "@hogsend/core";
2
2
  import type { env as envSchema } from "../env.js";
3
+ import { hostOfFromAddress } from "./from-address.js";
3
4
  import type { Logger } from "./logger.js";
4
5
 
5
6
  /**
@@ -69,15 +70,34 @@ export interface DomainStatusService {
69
70
  const VERIFIED_TTL_MS = 10 * 60 * 1000;
70
71
  /** TTL while unverified/failed/unknown — keeps test-mode auto-exit ≤60 s. */
71
72
  const UNVERIFIED_TTL_MS = 60 * 1000;
73
+ /**
74
+ * Back-off TTL after a permission-denied (401/403) refresh failure — e.g. a
75
+ * send-only restricted Resend key that cannot read the domains API. Re-probing
76
+ * every UNVERIFIED_TTL_MS would warn-spam forever and can never succeed until
77
+ * the key changes, so we assume-verified (fail-open) and go quiet for 6 h.
78
+ * An explicit `getStatus({ refresh: true })` (admin route / CLI) still probes.
79
+ */
80
+ const PERMISSION_BLOCKED_TTL_MS = 6 * 60 * 60 * 1000;
72
81
 
73
- /** Extract the host part of an email address ("hello@x.com" → "x.com"). */
74
- function hostPartOf(email: string | undefined): string | null {
75
- if (!email) return null;
76
- const at = email.lastIndexOf("@");
77
- if (at === -1 || at === email.length - 1) return null;
78
- return email.slice(at + 1).toLowerCase();
82
+ /**
83
+ * Permission-style domains-API failure detection. Matches ONLY the providers'
84
+ * own structured messages — plugin-resend `domains.ts` and plugin-postmark
85
+ * `index.ts` both emit exactly "<Provider> domains API 401: <message>" or
86
+ * "<Provider> domains API request failed with status 403". Deliberately
87
+ * narrow: a bare 401/403/"forbidden" from an intermediary (WAF/proxy/CDN) in
88
+ * front of the provider must NOT match — it stays on the transient path so a
89
+ * blip can never displace a real cached status for the long back-off TTL.
90
+ */
91
+ function isPermissionDeniedMessage(message: string): boolean {
92
+ return /\bdomains API (?:request failed with status )?40[13]\b/.test(message);
79
93
  }
80
94
 
95
+ /**
96
+ * Extract the host part of a configured from address ("hello@x.com" or
97
+ * "Name <hello@x.com>" → "x.com") — display-name aware via from-address.ts.
98
+ */
99
+ const hostPartOf = hostOfFromAddress;
100
+
81
101
  /** The Resend unverified-domain from-address fallback (so a redirected mail
82
102
  * still delivers while the real sending domain isn't verified yet). */
83
103
  const RESEND_UNVERIFIED_FROM = "onboarding@resend.dev";
@@ -174,11 +194,19 @@ export function createDomainStatusService(deps: {
174
194
 
175
195
  let cache: { snapshot: EngineDomainStatus; fetchedAt: number } | null = null;
176
196
  let inflight: Promise<EngineDomainStatus> | null = null;
197
+ // Permission-denied (401/403) back-off state: blocked extends the cache TTL
198
+ // so the background refresh stops re-probing a key that can never read
199
+ // domains; warned gates the explanatory warn to exactly ONCE per restriction
200
+ // episode (a later successful probe resets both, so a NEW restriction after
201
+ // recovery warns again).
202
+ let permissionBlocked = false;
203
+ let permissionWarned = false;
177
204
 
178
205
  const isFresh = (): boolean => {
179
206
  if (!cache) return false;
180
- const ttl =
181
- cache.snapshot.status?.state === "verified"
207
+ const ttl = permissionBlocked
208
+ ? PERMISSION_BLOCKED_TTL_MS
209
+ : cache.snapshot.status?.state === "verified"
182
210
  ? VERIFIED_TTL_MS
183
211
  : UNVERIFIED_TTL_MS;
184
212
  return Date.now() - cache.fetchedAt < ttl;
@@ -210,13 +238,23 @@ export function createDomainStatusService(deps: {
210
238
  let previousActive = false;
211
239
 
212
240
  /** Log the entering/exiting transition exactly once per flip of `active`. */
213
- const logTransition = (testMode: TestModeState): void => {
241
+ const logTransition = (
242
+ testMode: TestModeState,
243
+ opts?: { assumedVerified?: boolean },
244
+ ): void => {
214
245
  if (testMode.active === previousActive) return;
215
246
  if (testMode.active) {
216
247
  logger.warn(
217
248
  "test mode ACTIVE — domain unverified, redirecting all sends",
218
249
  { redirectTo: testMode.redirectTo, reason: testMode.reason },
219
250
  );
251
+ } else if (opts?.assumedVerified) {
252
+ // Permission-block fail-open: verification was UNREADABLE, not
253
+ // confirmed — never claim "domain verified" on this path.
254
+ logger.warn(
255
+ "test mode exited — domain status unreadable (permission denied), failing open to LIVE sends",
256
+ { domain },
257
+ );
220
258
  } else {
221
259
  logger.info("test mode exited — domain verified, sends are LIVE", {
222
260
  domain,
@@ -230,7 +268,10 @@ export function createDomainStatusService(deps: {
230
268
  * `testMode` off the JUST-written cache and fire the transition log on a flip.
231
269
  * Test mode is computed last so it reads the fresh verification state.
232
270
  */
233
- const commitSnapshot = (status: DomainStatus | null): EngineDomainStatus => {
271
+ const commitSnapshot = (
272
+ status: DomainStatus | null,
273
+ opts?: { assumedVerified?: boolean },
274
+ ): EngineDomainStatus => {
234
275
  // Seed the cache with a placeholder testMode so `computeTestMode` reads the
235
276
  // fresh `status`, then overwrite the block with the resolved state.
236
277
  const snapshot: EngineDomainStatus = {
@@ -248,7 +289,7 @@ export function createDomainStatusService(deps: {
248
289
  cache = { snapshot, fetchedAt: Date.now() };
249
290
  const testMode = computeTestMode();
250
291
  snapshot.testMode = testMode;
251
- logTransition(testMode);
292
+ logTransition(testMode, opts);
252
293
  return snapshot;
253
294
  };
254
295
 
@@ -263,6 +304,10 @@ export function createDomainStatusService(deps: {
263
304
  // biome-ignore lint/style/noNonNullAssertion: `supported` guarantees it.
264
305
  const capability = provider.domains!;
265
306
  const providerStatus = await capability.get(domain);
307
+ // The key CAN read domains after all — clear any permission back-off so a
308
+ // key swap recovers immediately and a future restriction warns once again.
309
+ permissionBlocked = false;
310
+ permissionWarned = false;
266
311
  return commitSnapshot(
267
312
  // Provider doesn't know the domain yet → an explicit not_found status
268
313
  // (the Studio Setup view keys its add-domain form off this).
@@ -286,6 +331,41 @@ export function createDomainStatusService(deps: {
286
331
  return inflight;
287
332
  };
288
333
 
334
+ /**
335
+ * A 401/403 from the provider domains API (e.g. a send-only restricted
336
+ * Resend key): warn ONCE with what it means, then back off under the long
337
+ * {@link PERMISSION_BLOCKED_TTL_MS} so the background refresh stops
338
+ * re-probing + re-warning every UNVERIFIED_TTL_MS. When the cache already
339
+ * holds a REAL snapshot it is KEPT (TTL extended only) — overwriting a
340
+ * genuinely-unverified status with assumed-verified `null` would disarm an
341
+ * armed auto test-mode for 6h off one permission-shaped failure. Only a
342
+ * cold cache commits the assumed-verified `null` snapshot (fail-open
343
+ * verified, the existing contract — production mail is never redirected).
344
+ */
345
+ const markPermissionBlocked = (message: string): void => {
346
+ permissionBlocked = true;
347
+ if (!permissionWarned) {
348
+ permissionWarned = true;
349
+ logger.warn(
350
+ "domain-status: the email provider API key cannot read domains " +
351
+ "(permission denied). Keeping the last fetched domain status; " +
352
+ "without one, verification is assumed-verified (fail-open: " +
353
+ "production mail is never redirected) and HOGSEND_TEST_MODE=auto " +
354
+ "cannot arm. Set HOGSEND_TEST_MODE=true to force test-mode " +
355
+ "redirects, or use a full-access API key. Suppressing domain " +
356
+ "checks for 6h.",
357
+ { domain, providerId, error: message },
358
+ );
359
+ }
360
+ if (cache) {
361
+ // Preserve the last real snapshot as the truth; just push its
362
+ // freshness window out to the back-off TTL.
363
+ cache.fetchedAt = Date.now();
364
+ return;
365
+ }
366
+ commitSnapshot(null, { assumedVerified: true });
367
+ };
368
+
289
369
  return {
290
370
  async getStatus(opts?: { refresh?: boolean }): Promise<EngineDomainStatus> {
291
371
  if (opts?.refresh) {
@@ -316,10 +396,19 @@ export function createDomainStatusService(deps: {
316
396
  refreshIfStale(): void {
317
397
  if (isFresh()) return;
318
398
  void fetchDeduped().catch((error: unknown) => {
399
+ const message = error instanceof Error ? error.message : String(error);
400
+ // Permission-style failure (401/403, e.g. a send-only restricted key):
401
+ // one explanatory warn + long back-off instead of warn-spam forever.
402
+ if (isPermissionDeniedMessage(message)) {
403
+ markPermissionBlocked(message);
404
+ return;
405
+ }
406
+ // Transient failures (network, 5xx) keep the existing behavior: warn
407
+ // every stale refresh, short TTL, fail-open verified via the cache.
319
408
  logger.warn("domain-status refresh failed", {
320
409
  domain,
321
410
  providerId,
322
- error: error instanceof Error ? error.message : String(error),
411
+ error: message,
323
412
  });
324
413
  });
325
414
  },
@@ -0,0 +1,29 @@
1
+ /**
2
+ * From-address helpers. A configured from address may be a bare addr-spec
3
+ * ("doug@hogsend.com") or carry a display name ("Doug at Hogsend
4
+ * <doug@hogsend.com>") — both are valid on the wire for every supported
5
+ * provider. These helpers parse either form so env validation and
6
+ * domain derivation agree on what the address part is.
7
+ */
8
+
9
+ const ADDR_SPEC_RE = /^[^\s@<>]+@[^\s@<>]+\.[^\s@<>]+$/;
10
+
11
+ /**
12
+ * Extract the addr-spec from a from address ("Doug <d@x.com>" → "d@x.com",
13
+ * "d@x.com" → "d@x.com"). Returns null when no valid address is present.
14
+ */
15
+ export function addrSpecOf(value: string | undefined): string | null {
16
+ if (!value) return null;
17
+ const match = value.trim().match(/^[^<>]*<([^<>]+)>$/);
18
+ const addr = (match?.[1] ?? value).trim();
19
+ return ADDR_SPEC_RE.test(addr) ? addr.toLowerCase() : null;
20
+ }
21
+
22
+ /** Host part of a from address ("Doug <d@x.com>" → "x.com"). */
23
+ export function hostOfFromAddress(value: string | undefined): string | null {
24
+ const addr = addrSpecOf(value);
25
+ if (!addr) return null;
26
+ const at = addr.lastIndexOf("@");
27
+ if (at === -1 || at === addr.length - 1) return null;
28
+ return addr.slice(at + 1);
29
+ }
package/src/lib/logger.ts CHANGED
@@ -12,7 +12,9 @@ export function createLogger(level: string = "info") {
12
12
  winston.format.timestamp(),
13
13
  winston.format.errors({ stack: true }),
14
14
  ),
15
- defaultMeta: { service: "growthhog-api" },
15
+ // Service label for structured logs. Override per-deploy with SERVICE_NAME;
16
+ // the neutral default keeps scaffolded apps from inheriting dogfood branding.
17
+ defaultMeta: { service: process.env.SERVICE_NAME ?? "hogsend" },
16
18
  transports: [
17
19
  new winston.transports.Console({
18
20
  format: winston.format.combine(
@@ -1,6 +1,12 @@
1
- import { getClientSchemaVersion, getEngineSchemaVersion } from "@hogsend/db";
1
+ import {
2
+ type Database,
3
+ emailSends,
4
+ getClientSchemaVersion,
5
+ getEngineSchemaVersion,
6
+ journeyStates,
7
+ } from "@hogsend/db";
2
8
  import { createRoute, OpenAPIHono, z } from "@hono/zod-openapi";
3
- import { sql } from "drizzle-orm";
9
+ import { gte, sql } from "drizzle-orm";
4
10
  import type { AppEnv } from "../app.js";
5
11
  import { API_VERSION } from "../env.js";
6
12
  import { getRedis } from "../lib/redis.js";
@@ -29,6 +35,22 @@ const trackSchema = z.object({
29
35
  pending: z.array(z.string()),
30
36
  });
31
37
 
38
+ // Recent activity counts (last 24h). Surfaces silent failures — a failed
39
+ // journey or send otherwise only shows in worker logs while health stays
40
+ // green. Informational only: counts never affect `status`, and a query
41
+ // failure degrades each count to null rather than breaking health.
42
+ const activitySchema = z.object({
43
+ windowHours: z.number(),
44
+ journeys: z.object({
45
+ failed: z.number().nullable(),
46
+ completed: z.number().nullable(),
47
+ }),
48
+ emails: z.object({
49
+ failed: z.number().nullable(),
50
+ sent: z.number().nullable(),
51
+ }),
52
+ });
53
+
32
54
  const healthResponseSchema = z.object({
33
55
  status: z.enum(["healthy", "degraded", "migration_pending"]),
34
56
  uptime: z.number(),
@@ -43,6 +65,7 @@ const healthResponseSchema = z.object({
43
65
  engine: trackSchema,
44
66
  client: trackSchema,
45
67
  }),
68
+ activity: activitySchema,
46
69
  });
47
70
 
48
71
  const healthRoute = createRoute({
@@ -60,12 +83,114 @@ const healthRoute = createRoute({
60
83
  },
61
84
  });
62
85
 
86
+ const ACTIVITY_WINDOW_HOURS = 24;
87
+
88
+ type Activity = z.infer<typeof activitySchema>;
89
+
90
+ const NULL_ACTIVITY: Activity = {
91
+ windowHours: ACTIVITY_WINDOW_HOURS,
92
+ journeys: { failed: null, completed: null },
93
+ emails: { failed: null, sent: null },
94
+ };
95
+
96
+ // Reporting must never slow the healthcheck down: an unreachable DB makes the
97
+ // COUNT queries hang on connect (the component check above answers "down"
98
+ // fast, but a fresh query can queue behind the pool), so the whole thing is
99
+ // raced against a short deadline and degrades to nulls.
100
+ const ACTIVITY_TIMEOUT_MS = 1500;
101
+
102
+ // Cheap windowed COUNTs (one FILTER query per table; the time columns are
103
+ // indexed — email_sends_created_at_idx and journey_states_updated_at_idx —
104
+ // so each prunes by index instead of seq-scanning on every healthcheck hit).
105
+ // Never throws — any failure degrades to nulls so a reporting hiccup can't
106
+ // take the healthcheck down.
107
+ async function getRecentActivity(db: Database): Promise<Activity> {
108
+ return Promise.race([
109
+ queryRecentActivity(db),
110
+ new Promise<Activity>((resolve) =>
111
+ setTimeout(() => resolve(NULL_ACTIVITY), ACTIVITY_TIMEOUT_MS).unref?.(),
112
+ ),
113
+ ]);
114
+ }
115
+
116
+ async function queryRecentActivity(db: Database): Promise<Activity> {
117
+ const since = new Date(Date.now() - ACTIVITY_WINDOW_HOURS * 60 * 60 * 1000);
118
+ try {
119
+ const [journeyRows, emailRows] = await Promise.all([
120
+ db
121
+ .select({
122
+ failed: sql<number>`count(*) filter (where ${journeyStates.status} = 'failed')`,
123
+ completed: sql<number>`count(*) filter (where ${journeyStates.status} = 'completed')`,
124
+ })
125
+ .from(journeyStates)
126
+ // updatedAt (set on every status transition) so a journey entered
127
+ // days ago that failed/completed within the window still counts.
128
+ .where(gte(journeyStates.updatedAt, since)),
129
+ db
130
+ .select({
131
+ failed: sql<number>`count(*) filter (where ${emailSends.status} = 'failed')`,
132
+ sent: sql<number>`count(*) filter (where ${emailSends.status} in ('sent', 'delivered', 'opened', 'clicked', 'bounced', 'complained'))`,
133
+ })
134
+ .from(emailSends)
135
+ .where(gte(emailSends.createdAt, since)),
136
+ ]);
137
+ return {
138
+ windowHours: ACTIVITY_WINDOW_HOURS,
139
+ journeys: {
140
+ failed: Number(journeyRows[0]?.failed ?? 0),
141
+ completed: Number(journeyRows[0]?.completed ?? 0),
142
+ },
143
+ emails: {
144
+ failed: Number(emailRows[0]?.failed ?? 0),
145
+ sent: Number(emailRows[0]?.sent ?? 0),
146
+ },
147
+ };
148
+ } catch {
149
+ return NULL_ACTIVITY;
150
+ }
151
+ }
152
+
153
+ // A component that can't answer quickly IS down for healthcheck purposes —
154
+ // an unreachable Redis otherwise stalls the probe on ioredis reconnect
155
+ // backoff, and a connection-refused Postgres makes postgres-js retry the
156
+ // connect (default connect_timeout 30s) rather than reject, so EVERY db
157
+ // consumer in this handler must be raced against a deadline or /v1/health
158
+ // itself hangs.
159
+ const COMPONENT_TIMEOUT_MS = 1500;
160
+
161
+ // Race a read against the component deadline, degrading to `fallback`.
162
+ // Unlike checkComponent this preserves the read's value type.
163
+ async function withDeadline<T>(read: Promise<T>, fallback: T): Promise<T> {
164
+ return Promise.race([
165
+ read,
166
+ new Promise<T>((resolve) =>
167
+ setTimeout(() => resolve(fallback), COMPONENT_TIMEOUT_MS).unref?.(),
168
+ ),
169
+ ]);
170
+ }
171
+
172
+ // Degraded schema read: the timeout means the DB didn't answer, which the
173
+ // `database` component already reports — claiming `migration_pending` on top
174
+ // of that would be spurious, so an unreadable track degrades to in-sync.
175
+ const NULL_SCHEMA = {
176
+ required: null,
177
+ applied: null,
178
+ pending: [] as string[],
179
+ inSync: true,
180
+ };
181
+
63
182
  async function checkComponent(
64
183
  fn: () => Promise<void>,
65
184
  ): Promise<{ status: "up" | "down"; latencyMs: number }> {
66
185
  const start = performance.now();
186
+ const timeout = new Promise<never>((_, reject) =>
187
+ setTimeout(
188
+ () => reject(new Error("component check timed out")),
189
+ COMPONENT_TIMEOUT_MS,
190
+ ).unref?.(),
191
+ );
67
192
  try {
68
- await fn();
193
+ await Promise.race([fn(), timeout]);
69
194
  return {
70
195
  status: "up",
71
196
  latencyMs: Math.round(performance.now() - start),
@@ -83,23 +208,28 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
83
208
  async (c) => {
84
209
  const { db, clientJournal } = c.get("container");
85
210
 
86
- const [dbCheck, redisCheck, heartbeat, engine, client] = await Promise.all([
87
- checkComponent(async () => {
88
- await db.execute(sql`SELECT 1`);
89
- }),
90
- checkComponent(async () => {
91
- // Actively probe: getRedis() lazily creates + connects the client (with
92
- // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
93
- // a client if something had ALREADY created one — which nothing does when
94
- // PostHog is disabled so redis always read "down" even though it was
95
- // reachable. ioredis buffers the ping until connected (or rejects if the
96
- // host is genuinely unreachable a truthful "down").
97
- await getRedis().ping();
98
- }),
99
- getWorkerHeartbeat(),
100
- getEngineSchemaVersion(db),
101
- getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
102
- ]);
211
+ const [dbCheck, redisCheck, heartbeat, engine, client, activity] =
212
+ await Promise.all([
213
+ checkComponent(async () => {
214
+ await db.execute(sql`SELECT 1`);
215
+ }),
216
+ checkComponent(async () => {
217
+ // Actively probe: getRedis() lazily creates + connects the client (with
218
+ // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
219
+ // a client if something had ALREADY created one — which nothing does when
219
+ // PostHog is disabled — so redis always read "down" even though it was
221
+ // reachable. ioredis buffers the ping until connected (or rejects if the
222
+ // host is genuinely unreachable → a truthful "down").
223
+ await getRedis().ping();
224
+ }),
225
+ withDeadline(getWorkerHeartbeat(), { alive: false }),
226
+ withDeadline(getEngineSchemaVersion(db), NULL_SCHEMA),
227
+ withDeadline(
228
+ getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
229
+ NULL_SCHEMA,
230
+ ),
231
+ getRecentActivity(db),
232
+ ]);
103
233
 
104
234
  // `migration_pending` if EITHER track is behind. The engine track also gates
105
235
  // boot (fatal); the client track surfaces here non-fatally (client-owned).
@@ -139,6 +269,7 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
139
269
  lastSeenAt: heartbeat.lastSeenAt,
140
270
  },
141
271
  },
272
+ activity,
142
273
  },
143
274
  200,
144
275
  );
package/src/worker.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  } from "./buckets/registry.js";
6
6
  import type { HogsendClient } from "./container.js";
7
7
  import type { DefinedJourney } from "./journeys/define-journey.js";
8
- import { selectJourneyTasks } from "./journeys/registry.js";
8
+ import { parseEnabledFilter, selectJourneyTasks } from "./journeys/registry.js";
9
9
  import { reportWorkerReady } from "./lib/boot.js";
10
10
  import { hatchet } from "./lib/hatchet.js";
11
11
  import { getRedisIfConnected } from "./lib/redis.js";
@@ -49,6 +49,13 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
49
49
  const { container, journeys } = opts;
50
50
  const enabled = opts.enabledJourneys ?? container.env.ENABLED_JOURNEYS;
51
51
  const journeyTasks = selectJourneyTasks(journeys, enabled);
52
+ // The enabled journey IDs, logged at startup so a stale worker (one missing a
53
+ // newly added journey because the dev watcher never restarted it) is visible
54
+ // at a glance — counts alone can't show WHICH journeys are registered.
55
+ const journeyFilter = parseEnabledFilter(enabled);
56
+ const journeyIds = journeys
57
+ .filter((j) => journeyFilter === "*" || journeyFilter.has(j.meta.id))
58
+ .map((j) => j.meta.id);
52
59
 
53
60
  const enabledBuckets = opts.enabledBuckets ?? container.env.ENABLED_BUCKETS;
54
61
  // The single place a bucket's per-user fast-expiry timer task is constructed
@@ -111,6 +118,7 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
111
118
  // "ready" line only fires once `hatchet.worker()` resolves).
112
119
  container.logger.info("Hogsend worker starting", {
113
120
  hatchet: container.env.HATCHET_CLIENT_HOST_PORT,
121
+ journeys: journeyIds,
114
122
  });
115
123
 
116
124
  _worker = await hatchet.worker("hogsend-worker", { workflows });
@@ -1,7 +1,67 @@
1
- import { createDatabase } from "@hogsend/db";
1
+ import {
2
+ createDatabase,
3
+ type Database,
4
+ emailSends,
5
+ journeyStates,
6
+ } from "@hogsend/db";
7
+ import { and, eq, gte, sql } from "drizzle-orm";
2
8
  import { checkAlertRules } from "../lib/alerting.js";
3
9
  import { hatchet } from "../lib/hatchet.js";
4
- import { createLogger } from "../lib/logger.js";
10
+ import { createLogger, type Logger } from "../lib/logger.js";
11
+
12
+ const FAILURE_WINDOW_MINUTES = 60;
13
+
14
+ // Ruleless failure surfacing. The configured alert rules already cover failed
15
+ // journeys (journey_failure_spike) and failed sends (they drag delivery_issue's
16
+ // delivery rate down) — but a fresh install has NO alert_rules rows, so a
17
+ // provider 403 fails silently while health stays green. This logs an error
18
+ // for ANY failed send / failed journey state in the window, no rule required.
19
+ async function surfaceRecentFailures(opts: {
20
+ db: Database;
21
+ logger: Logger;
22
+ }): Promise<void> {
23
+ const { db, logger } = opts;
24
+ const since = new Date(Date.now() - FAILURE_WINDOW_MINUTES * 60 * 1000);
25
+
26
+ try {
27
+ const [journeyRows, emailRows] = await Promise.all([
28
+ db
29
+ .select({ count: sql<number>`count(*)` })
30
+ .from(journeyStates)
31
+ .where(
32
+ and(
33
+ eq(journeyStates.status, "failed"),
34
+ gte(journeyStates.updatedAt, since),
35
+ ),
36
+ ),
37
+ db
38
+ .select({ count: sql<number>`count(*)` })
39
+ .from(emailSends)
40
+ .where(
41
+ and(
42
+ eq(emailSends.status, "failed"),
43
+ gte(emailSends.createdAt, since),
44
+ ),
45
+ ),
46
+ ]);
47
+
48
+ const failedJourneys = Number(journeyRows[0]?.count ?? 0);
49
+ const failedEmails = Number(emailRows[0]?.count ?? 0);
50
+
51
+ if (failedJourneys > 0 || failedEmails > 0) {
52
+ logger.error("Recent failures detected", {
53
+ failedJourneys,
54
+ failedEmails,
55
+ windowMinutes: FAILURE_WINDOW_MINUTES,
56
+ hint: "Check journey_states.error_message and email_sends rows; /v1/health `activity` shows 24h counts",
57
+ });
58
+ }
59
+ } catch (err) {
60
+ logger.warn("Failed to check recent failures", {
61
+ error: err instanceof Error ? err.message : String(err),
62
+ });
63
+ }
64
+ }
5
65
 
6
66
  export const checkAlertsTask = hatchet.task({
7
67
  name: "check-alerts",
@@ -13,6 +73,8 @@ export const checkAlertsTask = hatchet.task({
13
73
  });
14
74
  const logger = createLogger(process.env.LOG_LEVEL ?? "info");
15
75
 
76
+ await surfaceRecentFailures({ db, logger });
77
+
16
78
  await checkAlertRules({
17
79
  db,
18
80
  logger,