@hogsend/engine 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hogsend/engine",
3
- "version": "0.12.1",
3
+ "version": "0.13.0",
4
4
  "type": "module",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -40,14 +40,14 @@
40
40
  "svix": "^1.95.1",
41
41
  "winston": "^3.19.0",
42
42
  "zod": "^4.4.3",
43
- "@hogsend/core": "^0.12.1",
44
- "@hogsend/db": "^0.12.1",
45
- "@hogsend/email": "^0.12.1",
46
- "@hogsend/plugin-posthog": "^0.12.1",
47
- "@hogsend/plugin-resend": "^0.12.1"
43
+ "@hogsend/core": "^0.13.0",
44
+ "@hogsend/db": "^0.13.0",
45
+ "@hogsend/email": "^0.13.0",
46
+ "@hogsend/plugin-posthog": "^0.13.0",
47
+ "@hogsend/plugin-resend": "^0.13.0"
48
48
  },
49
49
  "optionalDependencies": {
50
- "@hogsend/plugin-postmark": "^0.12.1"
50
+ "@hogsend/plugin-postmark": "^0.13.0"
51
51
  },
52
52
  "devDependencies": {
53
53
  "@types/node": "^22.15.3",
package/src/env.ts CHANGED
@@ -35,6 +35,14 @@ export const env = createEnv({
35
35
  // (the single intended secret-logging exception) — rotate it immediately via
36
36
  // the Studio forgot/reset flow. Min length matches better-auth's policy.
37
37
  STUDIO_ADMIN_PASSWORD: z.string().min(8).optional(),
38
+ // --- First-boot data-plane key bootstrap (lib/boot-api-key.ts) ---
39
+ // When the api_keys table is COMPLETELY empty on API boot (a template
40
+ // deploy that never ran the local `pnpm bootstrap`), the engine mints one
41
+ // ingest-scoped key ("bootstrap-ingest") and prints the FULL key ONCE to
42
+ // the server log — the data-plane sibling of the first-admin password
43
+ // above. Set "false" to opt out. A string enum (not z.coerce.boolean) so
44
+ // an explicit "false" actually disables it.
45
+ HOGSEND_BOOTSTRAP_API_KEY: z.enum(["true", "false"]).default("true"),
38
46
  // Extra origins allowed to call the auth endpoints (beyond BETTER_AUTH_URL),
39
47
  // comma-separated. Needed when the Studio is served from a different origin
40
48
  // than the API — e.g. the `hogsend studio` CLI pointing at a remote instance.
package/src/index.ts CHANGED
@@ -156,6 +156,8 @@ export {
156
156
  reportWorkerReady,
157
157
  type WorkerReadyInfo,
158
158
  } from "./lib/boot.js";
159
+ // --- First-boot data-plane key bootstrap (API process only, mirrors admin) ---
160
+ export { bootstrapApiKeyFromEnv } from "./lib/boot-api-key.js";
159
161
  // --- First-admin creation (CLI + boot bootstrap share this scrypt-correct path)
160
162
  export { bootstrapAdminFromEnv } from "./lib/bootstrap-admin.js";
161
163
  // --- Bucket transition emission (shared by real-time / cron / fast-expiry) ---
@@ -267,6 +267,14 @@ export function defineJourney(options: {
267
267
  return { stateId, status: "exited" };
268
268
  }
269
269
 
270
+ logger.error("Journey run failed", {
271
+ journeyId: meta.id,
272
+ journeyName: meta.name,
273
+ stateId,
274
+ userId,
275
+ error: message,
276
+ });
277
+
270
278
  await hatchet.events.push("journey:failed", {
271
279
  journeyId: meta.id,
272
280
  stateId,
@@ -0,0 +1,101 @@
1
+ import { apiKeys } from "@hogsend/db";
2
+ import { sql } from "drizzle-orm";
3
+ import type { HogsendClient } from "../container.js";
4
+ import { generateApiKey } from "./api-key-hash.js";
5
+
6
+ /** Name of the key minted by the first-boot bootstrap (visible in Studio/admin). */
7
+ export const BOOTSTRAP_API_KEY_NAME = "bootstrap-ingest";
8
+
9
+ /**
10
+ * Boot-time first-key bootstrap — the data-plane sibling of
11
+ * `bootstrapAdminFromEnv` (lib/bootstrap-admin.ts). A local scaffold runs
12
+ * `pnpm bootstrap`, which mints an ingest-scoped `hsk_` key into `api_keys`
13
+ * BEFORE first boot — but a template deploy (Railway) never runs that script,
14
+ * so a fresh deployed instance has NO data-plane key and the customer's first
15
+ * `POST /v1/events` has nothing to authenticate with. This closes that gap.
16
+ *
17
+ * Contract (all conditions must hold to mint):
18
+ * - `HOGSEND_BOOTSTRAP_API_KEY` is not `"false"` (default on; set `false` to
19
+ * opt out entirely).
20
+ * - The `api_keys` table has ZERO rows — revoked included, i.e. truly first
21
+ * boot. Any existing row (including the local-bootstrap key) ⇒ no-op, so
22
+ * the full key is naturally never logged twice.
23
+ *
24
+ * What is minted: one key named `bootstrap-ingest` with `scopes: ["ingest"]`
25
+ * — exactly what the scaffold's local bootstrap mints. Only the sha256 hash is
26
+ * stored (same `generateApiKey` the admin api-keys route uses); the FULL key is
27
+ * printed ONCE to the server log at warn level — the same intended
28
+ * secret-logging exception as the generated first-admin password ("shown
29
+ * once"). Rotate/revoke it any time via `POST /v1/admin/api-keys`.
30
+ *
31
+ * Concurrency: unlike the admin bootstrap there is no unique constraint to
32
+ * break a tie (two replicas would mint two different keys), so the zero-check +
33
+ * insert runs in a transaction serialized by a pg advisory xact lock — exactly
34
+ * one key is ever minted on a fresh table. Never fatal: any failure is logged
35
+ * and boot continues (the admin API remains the manual path).
36
+ *
37
+ * Runs in the API process only (not the worker) — same boot path as
38
+ * `bootstrapAdminFromEnv`, after the schema guard.
39
+ */
40
+ export async function bootstrapApiKeyFromEnv(opts: {
41
+ client: HogsendClient;
42
+ }): Promise<void> {
43
+ const { db, env, logger } = opts.client;
44
+
45
+ if (env.HOGSEND_BOOTSTRAP_API_KEY === "false") return;
46
+
47
+ try {
48
+ // Cheap pre-check outside the transaction: every boot after the first
49
+ // returns here without taking the lock.
50
+ const existing = await db.select({ id: apiKeys.id }).from(apiKeys).limit(1);
51
+ if (existing.length > 0) return;
52
+
53
+ const minted = await db.transaction(async (tx) => {
54
+ // Serialize concurrent replicas booting on a fresh DB: the loser blocks
55
+ // here, then sees the winner's row and no-ops. Lock is released on commit.
56
+ await tx.execute(
57
+ sql`select pg_advisory_xact_lock(hashtext('hogsend:bootstrap-api-key'))`,
58
+ );
59
+
60
+ const recheck = await tx
61
+ .select({ id: apiKeys.id })
62
+ .from(apiKeys)
63
+ .limit(1);
64
+ if (recheck.length > 0) return null;
65
+
66
+ const { key, prefix, hash } = generateApiKey();
67
+ await tx.insert(apiKeys).values({
68
+ name: BOOTSTRAP_API_KEY_NAME,
69
+ keyPrefix: prefix,
70
+ keyHash: hash,
71
+ scopes: ["ingest"],
72
+ createdBy: "boot",
73
+ });
74
+ return key;
75
+ });
76
+
77
+ if (!minted) {
78
+ logger.debug(
79
+ "[api-keys] First-boot key bootstrap skipped: a key already exists.",
80
+ );
81
+ return;
82
+ }
83
+
84
+ // The intended secret-logging exception (mirrors the generated first-admin
85
+ // password). Shown once — the table is non-empty from now on, so this
86
+ // branch is unreachable on every subsequent boot.
87
+ logger.warn(
88
+ `[api-keys] First-boot ingest API key (shown once — save it now): ${minted}`,
89
+ );
90
+ logger.warn(
91
+ "[api-keys] Use it as HOGSEND_API_KEY / `Authorization: Bearer <key>` " +
92
+ "for POST /v1/events. Rotate or revoke via POST /v1/admin/api-keys. " +
93
+ "Disable this bootstrap with HOGSEND_BOOTSTRAP_API_KEY=false.",
94
+ );
95
+ } catch (err) {
96
+ const message = err instanceof Error ? err.message : String(err);
97
+ logger.error("[api-keys] First-boot key bootstrap failed.", {
98
+ error: message,
99
+ });
100
+ }
101
+ }
@@ -69,6 +69,27 @@ export interface DomainStatusService {
69
69
  const VERIFIED_TTL_MS = 10 * 60 * 1000;
70
70
  /** TTL while unverified/failed/unknown — keeps test-mode auto-exit ≤60 s. */
71
71
  const UNVERIFIED_TTL_MS = 60 * 1000;
72
+ /**
73
+ * Back-off TTL after a permission-denied (401/403) refresh failure — e.g. a
74
+ * send-only restricted Resend key that cannot read the domains API. Re-probing
75
+ * every UNVERIFIED_TTL_MS would warn-spam forever and can never succeed until
76
+ * the key changes, so we assume-verified (fail-open) and go quiet for 6 h.
77
+ * An explicit `getStatus({ refresh: true })` (admin route / CLI) still probes.
78
+ */
79
+ const PERMISSION_BLOCKED_TTL_MS = 6 * 60 * 60 * 1000;
80
+
81
+ /**
82
+ * Permission-style domains-API failure detection. Matches ONLY the providers'
83
+ * own structured messages — plugin-resend `domains.ts` and plugin-postmark
84
+ * `index.ts` both emit exactly "<Provider> domains API 401: <message>" or
85
+ * "<Provider> domains API request failed with status 403". Deliberately
86
+ * narrow: a bare 401/403/"forbidden" from an intermediary (WAF/proxy/CDN) in
87
+ * front of the provider must NOT match — it stays on the transient path so a
88
+ * blip can never displace a real cached status for the long back-off TTL.
89
+ */
90
+ function isPermissionDeniedMessage(message: string): boolean {
91
+ return /\bdomains API (?:request failed with status )?40[13]\b/.test(message);
92
+ }
72
93
 
73
94
  /** Extract the host part of an email address ("hello@x.com" → "x.com"). */
74
95
  function hostPartOf(email: string | undefined): string | null {
@@ -174,11 +195,19 @@ export function createDomainStatusService(deps: {
174
195
 
175
196
  let cache: { snapshot: EngineDomainStatus; fetchedAt: number } | null = null;
176
197
  let inflight: Promise<EngineDomainStatus> | null = null;
198
+ // Permission-denied (401/403) back-off state: blocked extends the cache TTL
199
+ // so the background refresh stops re-probing a key that can never read
200
+ // domains; warned gates the explanatory warn to exactly ONCE per restriction
201
+ // episode (a later successful probe resets both, so a NEW restriction after
202
+ // recovery warns again).
203
+ let permissionBlocked = false;
204
+ let permissionWarned = false;
177
205
 
178
206
  const isFresh = (): boolean => {
179
207
  if (!cache) return false;
180
- const ttl =
181
- cache.snapshot.status?.state === "verified"
208
+ const ttl = permissionBlocked
209
+ ? PERMISSION_BLOCKED_TTL_MS
210
+ : cache.snapshot.status?.state === "verified"
182
211
  ? VERIFIED_TTL_MS
183
212
  : UNVERIFIED_TTL_MS;
184
213
  return Date.now() - cache.fetchedAt < ttl;
@@ -210,13 +239,23 @@ export function createDomainStatusService(deps: {
210
239
  let previousActive = false;
211
240
 
212
241
  /** Log the entering/exiting transition exactly once per flip of `active`. */
213
- const logTransition = (testMode: TestModeState): void => {
242
+ const logTransition = (
243
+ testMode: TestModeState,
244
+ opts?: { assumedVerified?: boolean },
245
+ ): void => {
214
246
  if (testMode.active === previousActive) return;
215
247
  if (testMode.active) {
216
248
  logger.warn(
217
249
  "test mode ACTIVE — domain unverified, redirecting all sends",
218
250
  { redirectTo: testMode.redirectTo, reason: testMode.reason },
219
251
  );
252
+ } else if (opts?.assumedVerified) {
253
+ // Permission-block fail-open: verification was UNREADABLE, not
254
+ // confirmed — never claim "domain verified" on this path.
255
+ logger.warn(
256
+ "test mode exited — domain status unreadable (permission denied), failing open to LIVE sends",
257
+ { domain },
258
+ );
220
259
  } else {
221
260
  logger.info("test mode exited — domain verified, sends are LIVE", {
222
261
  domain,
@@ -230,7 +269,10 @@ export function createDomainStatusService(deps: {
230
269
  * `testMode` off the JUST-written cache and fire the transition log on a flip.
231
270
  * Test mode is computed last so it reads the fresh verification state.
232
271
  */
233
- const commitSnapshot = (status: DomainStatus | null): EngineDomainStatus => {
272
+ const commitSnapshot = (
273
+ status: DomainStatus | null,
274
+ opts?: { assumedVerified?: boolean },
275
+ ): EngineDomainStatus => {
234
276
  // Seed the cache with a placeholder testMode so `computeTestMode` reads the
235
277
  // fresh `status`, then overwrite the block with the resolved state.
236
278
  const snapshot: EngineDomainStatus = {
@@ -248,7 +290,7 @@ export function createDomainStatusService(deps: {
248
290
  cache = { snapshot, fetchedAt: Date.now() };
249
291
  const testMode = computeTestMode();
250
292
  snapshot.testMode = testMode;
251
- logTransition(testMode);
293
+ logTransition(testMode, opts);
252
294
  return snapshot;
253
295
  };
254
296
 
@@ -263,6 +305,10 @@ export function createDomainStatusService(deps: {
263
305
  // biome-ignore lint/style/noNonNullAssertion: `supported` guarantees it.
264
306
  const capability = provider.domains!;
265
307
  const providerStatus = await capability.get(domain);
308
+ // The key CAN read domains after all — clear any permission back-off so a
309
+ // key swap recovers immediately and a future restriction warns once again.
310
+ permissionBlocked = false;
311
+ permissionWarned = false;
266
312
  return commitSnapshot(
267
313
  // Provider doesn't know the domain yet → an explicit not_found status
268
314
  // (the Studio Setup view keys its add-domain form off this).
@@ -286,6 +332,41 @@ export function createDomainStatusService(deps: {
286
332
  return inflight;
287
333
  };
288
334
 
335
+ /**
336
+ * A 401/403 from the provider domains API (e.g. a send-only restricted
337
+ * Resend key): warn ONCE with what it means, then back off under the long
338
+ * {@link PERMISSION_BLOCKED_TTL_MS} so the background refresh stops
339
+ * re-probing + re-warning every UNVERIFIED_TTL_MS. When the cache already
340
+ * holds a REAL snapshot it is KEPT (TTL extended only) — overwriting a
341
+ * genuinely-unverified status with assumed-verified `null` would disarm an
342
+ * armed auto test-mode for 6h off one permission-shaped failure. Only a
343
+ * cold cache commits the assumed-verified `null` snapshot (fail-open
344
+ * verified, the existing contract — production mail is never redirected).
345
+ */
346
+ const markPermissionBlocked = (message: string): void => {
347
+ permissionBlocked = true;
348
+ if (!permissionWarned) {
349
+ permissionWarned = true;
350
+ logger.warn(
351
+ "domain-status: the email provider API key cannot read domains " +
352
+ "(permission denied). Keeping the last fetched domain status; " +
353
+ "without one, verification is assumed-verified (fail-open: " +
354
+ "production mail is never redirected) and HOGSEND_TEST_MODE=auto " +
355
+ "cannot arm. Set HOGSEND_TEST_MODE=true to force test-mode " +
356
+ "redirects, or use a full-access API key. Suppressing domain " +
357
+ "checks for 6h.",
358
+ { domain, providerId, error: message },
359
+ );
360
+ }
361
+ if (cache) {
362
+ // Preserve the last real snapshot as the truth; just push its
363
+ // freshness window out to the back-off TTL.
364
+ cache.fetchedAt = Date.now();
365
+ return;
366
+ }
367
+ commitSnapshot(null, { assumedVerified: true });
368
+ };
369
+
289
370
  return {
290
371
  async getStatus(opts?: { refresh?: boolean }): Promise<EngineDomainStatus> {
291
372
  if (opts?.refresh) {
@@ -316,10 +397,19 @@ export function createDomainStatusService(deps: {
316
397
  refreshIfStale(): void {
317
398
  if (isFresh()) return;
318
399
  void fetchDeduped().catch((error: unknown) => {
400
+ const message = error instanceof Error ? error.message : String(error);
401
+ // Permission-style failure (401/403, e.g. a send-only restricted key):
402
+ // one explanatory warn + long back-off instead of warn-spam forever.
403
+ if (isPermissionDeniedMessage(message)) {
404
+ markPermissionBlocked(message);
405
+ return;
406
+ }
407
+ // Transient failures (network, 5xx) keep the existing behavior: warn
408
+ // every stale refresh, short TTL, fail-open verified via the cache.
319
409
  logger.warn("domain-status refresh failed", {
320
410
  domain,
321
411
  providerId,
322
- error: error instanceof Error ? error.message : String(error),
412
+ error: message,
323
413
  });
324
414
  });
325
415
  },
package/src/lib/logger.ts CHANGED
@@ -12,7 +12,9 @@ export function createLogger(level: string = "info") {
12
12
  winston.format.timestamp(),
13
13
  winston.format.errors({ stack: true }),
14
14
  ),
15
- defaultMeta: { service: "growthhog-api" },
15
+ // Service label for structured logs. Override per-deploy with SERVICE_NAME;
16
+ // the neutral default keeps scaffolded apps from inheriting dogfood branding.
17
+ defaultMeta: { service: process.env.SERVICE_NAME ?? "hogsend" },
16
18
  transports: [
17
19
  new winston.transports.Console({
18
20
  format: winston.format.combine(
@@ -1,6 +1,12 @@
1
- import { getClientSchemaVersion, getEngineSchemaVersion } from "@hogsend/db";
1
+ import {
2
+ type Database,
3
+ emailSends,
4
+ getClientSchemaVersion,
5
+ getEngineSchemaVersion,
6
+ journeyStates,
7
+ } from "@hogsend/db";
2
8
  import { createRoute, OpenAPIHono, z } from "@hono/zod-openapi";
3
- import { sql } from "drizzle-orm";
9
+ import { gte, sql } from "drizzle-orm";
4
10
  import type { AppEnv } from "../app.js";
5
11
  import { API_VERSION } from "../env.js";
6
12
  import { getRedis } from "../lib/redis.js";
@@ -29,6 +35,22 @@ const trackSchema = z.object({
29
35
  pending: z.array(z.string()),
30
36
  });
31
37
 
38
+ // Recent activity counts (last 24h). Surfaces silent failures — a failed
39
+ // journey or send otherwise only shows in worker logs while health stays
40
+ // green. Informational only: counts never affect `status`, and a query
41
+ // failure degrades each count to null rather than breaking health.
42
+ const activitySchema = z.object({
43
+ windowHours: z.number(),
44
+ journeys: z.object({
45
+ failed: z.number().nullable(),
46
+ completed: z.number().nullable(),
47
+ }),
48
+ emails: z.object({
49
+ failed: z.number().nullable(),
50
+ sent: z.number().nullable(),
51
+ }),
52
+ });
53
+
32
54
  const healthResponseSchema = z.object({
33
55
  status: z.enum(["healthy", "degraded", "migration_pending"]),
34
56
  uptime: z.number(),
@@ -43,6 +65,7 @@ const healthResponseSchema = z.object({
43
65
  engine: trackSchema,
44
66
  client: trackSchema,
45
67
  }),
68
+ activity: activitySchema,
46
69
  });
47
70
 
48
71
  const healthRoute = createRoute({
@@ -60,12 +83,114 @@ const healthRoute = createRoute({
60
83
  },
61
84
  });
62
85
 
86
+ const ACTIVITY_WINDOW_HOURS = 24;
87
+
88
+ type Activity = z.infer<typeof activitySchema>;
89
+
90
+ const NULL_ACTIVITY: Activity = {
91
+ windowHours: ACTIVITY_WINDOW_HOURS,
92
+ journeys: { failed: null, completed: null },
93
+ emails: { failed: null, sent: null },
94
+ };
95
+
96
+ // Reporting must never slow the healthcheck down: an unreachable DB makes the
97
+ // COUNT queries hang on connect (the component check above answers "down"
98
+ // fast, but a fresh query can queue behind the pool), so the whole thing is
99
+ // raced against a short deadline and degrades to nulls.
100
+ const ACTIVITY_TIMEOUT_MS = 1500;
101
+
102
+ // Cheap windowed COUNTs (one FILTER query per table; the time columns are
103
+ // indexed — email_sends_created_at_idx and journey_states_updated_at_idx —
104
+ // so each prunes by index instead of seq-scanning on every healthcheck hit).
105
+ // Never throws — any failure degrades to nulls so a reporting hiccup can't
106
+ // take the healthcheck down.
107
+ async function getRecentActivity(db: Database): Promise<Activity> {
108
+ return Promise.race([
109
+ queryRecentActivity(db),
110
+ new Promise<Activity>((resolve) =>
111
+ setTimeout(() => resolve(NULL_ACTIVITY), ACTIVITY_TIMEOUT_MS).unref?.(),
112
+ ),
113
+ ]);
114
+ }
115
+
116
+ async function queryRecentActivity(db: Database): Promise<Activity> {
117
+ const since = new Date(Date.now() - ACTIVITY_WINDOW_HOURS * 60 * 60 * 1000);
118
+ try {
119
+ const [journeyRows, emailRows] = await Promise.all([
120
+ db
121
+ .select({
122
+ failed: sql<number>`count(*) filter (where ${journeyStates.status} = 'failed')`,
123
+ completed: sql<number>`count(*) filter (where ${journeyStates.status} = 'completed')`,
124
+ })
125
+ .from(journeyStates)
126
+ // updatedAt (set on every status transition) so a journey entered
127
+ // days ago that failed/completed within the window still counts.
128
+ .where(gte(journeyStates.updatedAt, since)),
129
+ db
130
+ .select({
131
+ failed: sql<number>`count(*) filter (where ${emailSends.status} = 'failed')`,
132
+ sent: sql<number>`count(*) filter (where ${emailSends.status} in ('sent', 'delivered', 'opened', 'clicked', 'bounced', 'complained'))`,
133
+ })
134
+ .from(emailSends)
135
+ .where(gte(emailSends.createdAt, since)),
136
+ ]);
137
+ return {
138
+ windowHours: ACTIVITY_WINDOW_HOURS,
139
+ journeys: {
140
+ failed: Number(journeyRows[0]?.failed ?? 0),
141
+ completed: Number(journeyRows[0]?.completed ?? 0),
142
+ },
143
+ emails: {
144
+ failed: Number(emailRows[0]?.failed ?? 0),
145
+ sent: Number(emailRows[0]?.sent ?? 0),
146
+ },
147
+ };
148
+ } catch {
149
+ return NULL_ACTIVITY;
150
+ }
151
+ }
152
+
153
+ // A component that can't answer quickly IS down for healthcheck purposes —
154
+ // an unreachable Redis otherwise stalls the probe on ioredis reconnect
155
+ // backoff, and a connection-refused Postgres makes postgres-js retry the
156
+ // connect (default connect_timeout 30s) rather than reject, so EVERY db
157
+ // consumer in this handler must be raced against a deadline or /v1/health
158
+ // itself hangs.
159
+ const COMPONENT_TIMEOUT_MS = 1500;
160
+
161
+ // Race a read against the component deadline, degrading to `fallback`.
162
+ // Unlike checkComponent this preserves the read's value type.
163
+ async function withDeadline<T>(read: Promise<T>, fallback: T): Promise<T> {
164
+ return Promise.race([
165
+ read,
166
+ new Promise<T>((resolve) =>
167
+ setTimeout(() => resolve(fallback), COMPONENT_TIMEOUT_MS).unref?.(),
168
+ ),
169
+ ]);
170
+ }
171
+
172
+ // Degraded schema read: the timeout means the DB didn't answer, which the
173
+ // `database` component already reports — claiming `migration_pending` on top
174
+ // of that would be spurious, so an unreadable track degrades to in-sync.
175
+ const NULL_SCHEMA = {
176
+ required: null,
177
+ applied: null,
178
+ pending: [] as string[],
179
+ inSync: true,
180
+ };
181
+
63
182
  async function checkComponent(
64
183
  fn: () => Promise<void>,
65
184
  ): Promise<{ status: "up" | "down"; latencyMs: number }> {
66
185
  const start = performance.now();
186
+ const timeout = new Promise<never>((_, reject) =>
187
+ setTimeout(
188
+ () => reject(new Error("component check timed out")),
189
+ COMPONENT_TIMEOUT_MS,
190
+ ).unref?.(),
191
+ );
67
192
  try {
68
- await fn();
193
+ await Promise.race([fn(), timeout]);
69
194
  return {
70
195
  status: "up",
71
196
  latencyMs: Math.round(performance.now() - start),
@@ -83,23 +208,28 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
83
208
  async (c) => {
84
209
  const { db, clientJournal } = c.get("container");
85
210
 
86
- const [dbCheck, redisCheck, heartbeat, engine, client] = await Promise.all([
87
- checkComponent(async () => {
88
- await db.execute(sql`SELECT 1`);
89
- }),
90
- checkComponent(async () => {
91
- // Actively probe: getRedis() lazily creates + connects the client (with
92
- // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
93
- // a client if something had ALREADY created one — which nothing does when
94
- // PostHog is disabled so redis always read "down" even though it was
95
- // reachable. ioredis buffers the ping until connected (or rejects if the
96
- // host is genuinely unreachable → a truthful "down").
97
- await getRedis().ping();
98
- }),
99
- getWorkerHeartbeat(),
100
- getEngineSchemaVersion(db),
101
- getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
102
- ]);
211
+ const [dbCheck, redisCheck, heartbeat, engine, client, activity] =
212
+ await Promise.all([
213
+ checkComponent(async () => {
214
+ await db.execute(sql`SELECT 1`);
215
+ }),
216
+ checkComponent(async () => {
217
+ // Actively probe: getRedis() lazily creates + connects the client (with
218
+ // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
219
+ // a client if something had ALREADY created one — which nothing does when
220
+ // PostHog is disabled so redis always read "down" even though it was
221
+ // reachable. ioredis buffers the ping until connected (or rejects if the
222
+ // host is genuinely unreachable → a truthful "down").
223
+ await getRedis().ping();
224
+ }),
225
+ withDeadline(getWorkerHeartbeat(), { alive: false }),
226
+ withDeadline(getEngineSchemaVersion(db), NULL_SCHEMA),
227
+ withDeadline(
228
+ getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
229
+ NULL_SCHEMA,
230
+ ),
231
+ getRecentActivity(db),
232
+ ]);
103
233
 
104
234
  // `migration_pending` if EITHER track is behind. The engine track also gates
105
235
  // boot (fatal); the client track surfaces here non-fatally (client-owned).
@@ -139,6 +269,7 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
139
269
  lastSeenAt: heartbeat.lastSeenAt,
140
270
  },
141
271
  },
272
+ activity,
142
273
  },
143
274
  200,
144
275
  );
package/src/worker.ts CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  } from "./buckets/registry.js";
6
6
  import type { HogsendClient } from "./container.js";
7
7
  import type { DefinedJourney } from "./journeys/define-journey.js";
8
- import { selectJourneyTasks } from "./journeys/registry.js";
8
+ import { parseEnabledFilter, selectJourneyTasks } from "./journeys/registry.js";
9
9
  import { reportWorkerReady } from "./lib/boot.js";
10
10
  import { hatchet } from "./lib/hatchet.js";
11
11
  import { getRedisIfConnected } from "./lib/redis.js";
@@ -49,6 +49,13 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
49
49
  const { container, journeys } = opts;
50
50
  const enabled = opts.enabledJourneys ?? container.env.ENABLED_JOURNEYS;
51
51
  const journeyTasks = selectJourneyTasks(journeys, enabled);
52
+ // The enabled journey IDs, logged at startup so a stale worker (one missing a
53
+ // newly added journey because the dev watcher never restarted it) is visible
54
+ // at a glance — counts alone can't show WHICH journeys are registered.
55
+ const journeyFilter = parseEnabledFilter(enabled);
56
+ const journeyIds = journeys
57
+ .filter((j) => journeyFilter === "*" || journeyFilter.has(j.meta.id))
58
+ .map((j) => j.meta.id);
52
59
 
53
60
  const enabledBuckets = opts.enabledBuckets ?? container.env.ENABLED_BUCKETS;
54
61
  // The single place a bucket's per-user fast-expiry timer task is constructed
@@ -111,6 +118,7 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
111
118
  // "ready" line only fires once `hatchet.worker()` resolves).
112
119
  container.logger.info("Hogsend worker starting", {
113
120
  hatchet: container.env.HATCHET_CLIENT_HOST_PORT,
121
+ journeys: journeyIds,
114
122
  });
115
123
 
116
124
  _worker = await hatchet.worker("hogsend-worker", { workflows });
@@ -1,7 +1,67 @@
1
- import { createDatabase } from "@hogsend/db";
1
+ import {
2
+ createDatabase,
3
+ type Database,
4
+ emailSends,
5
+ journeyStates,
6
+ } from "@hogsend/db";
7
+ import { and, eq, gte, sql } from "drizzle-orm";
2
8
  import { checkAlertRules } from "../lib/alerting.js";
3
9
  import { hatchet } from "../lib/hatchet.js";
4
- import { createLogger } from "../lib/logger.js";
10
+ import { createLogger, type Logger } from "../lib/logger.js";
11
+
12
+ const FAILURE_WINDOW_MINUTES = 60;
13
+
14
+ // Ruleless failure surfacing. The configured alert rules already cover failed
15
+ // journeys (journey_failure_spike) and failed sends (they drag delivery_issue's
16
+ // delivery rate down) — but a fresh install has NO alert_rules rows, so a
17
+ // provider 403 fails silently while health stays green. This logs an error
18
+ // for ANY failed send / failed journey state in the window, no rule required.
19
+ async function surfaceRecentFailures(opts: {
20
+ db: Database;
21
+ logger: Logger;
22
+ }): Promise<void> {
23
+ const { db, logger } = opts;
24
+ const since = new Date(Date.now() - FAILURE_WINDOW_MINUTES * 60 * 1000);
25
+
26
+ try {
27
+ const [journeyRows, emailRows] = await Promise.all([
28
+ db
29
+ .select({ count: sql<number>`count(*)` })
30
+ .from(journeyStates)
31
+ .where(
32
+ and(
33
+ eq(journeyStates.status, "failed"),
34
+ gte(journeyStates.updatedAt, since),
35
+ ),
36
+ ),
37
+ db
38
+ .select({ count: sql<number>`count(*)` })
39
+ .from(emailSends)
40
+ .where(
41
+ and(
42
+ eq(emailSends.status, "failed"),
43
+ gte(emailSends.createdAt, since),
44
+ ),
45
+ ),
46
+ ]);
47
+
48
+ const failedJourneys = Number(journeyRows[0]?.count ?? 0);
49
+ const failedEmails = Number(emailRows[0]?.count ?? 0);
50
+
51
+ if (failedJourneys > 0 || failedEmails > 0) {
52
+ logger.error("Recent failures detected", {
53
+ failedJourneys,
54
+ failedEmails,
55
+ windowMinutes: FAILURE_WINDOW_MINUTES,
56
+ hint: "Check journey_states.error_message and email_sends rows; /v1/health `activity` shows 24h counts",
57
+ });
58
+ }
59
+ } catch (err) {
60
+ logger.warn("Failed to check recent failures", {
61
+ error: err instanceof Error ? err.message : String(err),
62
+ });
63
+ }
64
+ }
5
65
 
6
66
  export const checkAlertsTask = hatchet.task({
7
67
  name: "check-alerts",
@@ -13,6 +73,8 @@ export const checkAlertsTask = hatchet.task({
13
73
  });
14
74
  const logger = createLogger(process.env.LOG_LEVEL ?? "info");
15
75
 
76
+ await surfaceRecentFailures({ db, logger });
77
+
16
78
  await checkAlertRules({
17
79
  db,
18
80
  logger,