npm - @hogsend/engine - Versions diffs - 0.12.2 → 0.13.1 - Mend

@hogsend/engine 0.12.2 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/package.json +7 -7
package/src/env.ts +18 -2
package/src/index.ts +3 -0
package/src/journeys/define-journey.ts +8 -0
package/src/lib/boot-api-key.ts +101 -0
package/src/lib/domain-status.ts +101 -12
package/src/lib/from-address.ts +29 -0
package/src/lib/logger.ts +3 -1
package/src/routes/health.ts +151 -20
package/src/worker.ts +9 -1
package/src/workflows/check-alerts.ts +64 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hogsend/engine",
-  "version": "0.12.2",
+  "version": "0.13.1",
   "type": "module",
   "license": "MIT",
   "repository": {
@@ -40,14 +40,14 @@
     "svix": "^1.95.1",
     "winston": "^3.19.0",
     "zod": "^4.4.3",
-    "@hogsend/core": "^0.12.2",
-    "@hogsend/db": "^0.12.2",
-    "@hogsend/email": "^0.12.2",
-    "@hogsend/plugin-posthog": "^0.12.2",
-    "@hogsend/plugin-resend": "^0.12.2"
+    "@hogsend/core": "^0.13.1",
+    "@hogsend/db": "^0.13.1",
+    "@hogsend/email": "^0.13.1",
+    "@hogsend/plugin-posthog": "^0.13.1",
+    "@hogsend/plugin-resend": "^0.13.1"
   },
   "optionalDependencies": {
-    "@hogsend/plugin-postmark": "^0.12.2"
+    "@hogsend/plugin-postmark": "^0.13.1"
   },
   "devDependencies": {
     "@types/node": "^22.15.3",

package/src/env.ts CHANGED Viewed

@@ -1,5 +1,13 @@
 import { createEnv } from "@t3-oss/env-core";
 import { z } from "zod";
+import { addrSpecOf } from "./lib/from-address.js";
+// Zod schema for a configured "from" address. The value may be bare
+// ("doug@hogsend.com") or carry a display name ("Doug at Hogsend <doug@hogsend.com>")
+// — both are valid provider wire formats. Domain derivation (lib/from-address.ts) parses either form.
+const fromAddress = z.string().refine((value) => addrSpecOf(value) !== null, { // valid exactly when addrSpecOf can extract an address
+  message: 'Must be an email address or "Display Name <email>"',
+});
 /**
  * The HTTP API contract version — surfaced in the OpenAPI document
@@ -35,6 +43,14 @@ export const env = createEnv({
     // (the single intended secret-logging exception) — rotate it immediately via
     // the Studio forgot/reset flow. Min length matches better-auth's policy.
     STUDIO_ADMIN_PASSWORD: z.string().min(8).optional(),
+    // --- First-boot data-plane key bootstrap (lib/boot-api-key.ts) ---
+    // When the api_keys table is COMPLETELY empty on API boot (a template
+    // deploy that never ran the local `pnpm bootstrap`), the engine mints one
+    // ingest-scoped key ("bootstrap-ingest") and prints the FULL key ONCE to
+    // the server log — the data-plane sibling of the first-admin password
+    // above. Set "false" to opt out. A string enum (not z.coerce.boolean) so
+    // an explicit "false" actually disables it.
+    HOGSEND_BOOTSTRAP_API_KEY: z.enum(["true", "false"]).default("true"),
     // Extra origins allowed to call the auth endpoints (beyond BETTER_AUTH_URL),
     // comma-separated. Needed when the Studio is served from a different origin
     // than the API — e.g. the `hogsend studio` CLI pointing at a remote instance.
@@ -44,7 +60,7 @@ export const env = createEnv({
     // (container.ts) and the future `emailProvidersFromEnv` preset. With this
     // optional, a Postmark-only deploy boots without a Resend key.
     RESEND_API_KEY: z.string().min(1).optional(),
-    RESEND_FROM_EMAIL: z.string().email().default("noreply@hogsend.com"),
+    RESEND_FROM_EMAIL: fromAddress.default("noreply@hogsend.com"),
     // --- Provider-neutral email config (BYO email provider) ---
     // The active email provider id the container resolves from the
     // EmailProviderRegistry. Absent → "resend" (today's byte-for-byte default).
@@ -52,7 +68,7 @@ export const env = createEnv({
     // Neutral default-from address. The mailer's `defaultFrom` is
     // `EMAIL_FROM ?? RESEND_FROM_EMAIL`, so an unset EMAIL_FROM keeps today's
     // Resend-named default.
-    EMAIL_FROM: z.string().email().optional(),
+    EMAIL_FROM: fromAddress.optional(),
     // The sending domain the domain-status service reports on. OVERRIDES the
     // default derivation (host part of EMAIL_FROM, falling back to the host of
     // RESEND_FROM_EMAIL) — set it when you send from a subaddress domain that

package/src/index.ts CHANGED Viewed

@@ -156,6 +156,8 @@ export {
   reportWorkerReady,
   type WorkerReadyInfo,
 } from "./lib/boot.js";
+// --- First-boot data-plane key bootstrap (API process only, mirrors admin) ---
+export { bootstrapApiKeyFromEnv } from "./lib/boot-api-key.js";
 // --- First-admin creation (CLI + boot bootstrap share this scrypt-correct path)
 export { bootstrapAdminFromEnv } from "./lib/bootstrap-admin.js";
 // --- Bucket transition emission (shared by real-time / cron / fast-expiry) ---
@@ -201,6 +203,7 @@ export type {
 // --- Enrollment guards ---
 export { checkEmailPreferences } from "./lib/enrollment-guards.js";
 export { isFrequencyCapped } from "./lib/frequency-cap.js";
+export { addrSpecOf, hostOfFromAddress } from "./lib/from-address.js";
 export { hatchet } from "./lib/hatchet.js";
 // --- Ingestion pipeline ---
 export {

package/src/journeys/define-journey.ts CHANGED Viewed

@@ -267,6 +267,14 @@ export function defineJourney(options: {
           return { stateId, status: "exited" };
         }
+        logger.error("Journey run failed", {
+          journeyId: meta.id,
+          journeyName: meta.name,
+          stateId,
+          userId,
+          error: message,
+        });
         await hatchet.events.push("journey:failed", {
           journeyId: meta.id,
           stateId,

package/src/lib/boot-api-key.ts ADDED Viewed

@@ -0,0 +1,101 @@
+import { apiKeys } from "@hogsend/db";
+import { sql } from "drizzle-orm";
+import type { HogsendClient } from "../container.js";
+import { generateApiKey } from "./api-key-hash.js";
+/**
+ * Name of the key minted by the first-boot bootstrap (visible in Studio/admin).
+ * Written as the `name` field of the `api_keys` row that
+ * `bootstrapApiKeyFromEnv` inserts.
+ */
+export const BOOTSTRAP_API_KEY_NAME = "bootstrap-ingest";
+/**
+ * Boot-time first-key bootstrap — the data-plane sibling of
+ * `bootstrapAdminFromEnv` (lib/bootstrap-admin.ts). A local scaffold runs
+ * `pnpm bootstrap`, which mints an ingest-scoped `hsk_` key into `api_keys`
+ * BEFORE first boot — but a template deploy (Railway) never runs that script,
+ * so a fresh deployed instance has NO data-plane key and the customer's first
+ * `POST /v1/events` has nothing to authenticate with. This closes that gap.
+ *
+ * Contract (all conditions must hold to mint):
+ *  - `HOGSEND_BOOTSTRAP_API_KEY` is not `"false"` (default on; set `false` to
+ *    opt out entirely).
+ *  - The `api_keys` table has ZERO rows — revoked included, i.e. truly first
+ *    boot. Any existing row (including the local-bootstrap key) ⇒ no-op, so
+ *    the full key is naturally never logged twice.
+ *
+ * What is minted: one key named `bootstrap-ingest` with `scopes: ["ingest"]`
+ * — exactly what the scaffold's local bootstrap mints. Only the sha256 hash is
+ * stored (same `generateApiKey` the admin api-keys route uses); the FULL key is
+ * printed ONCE to the server log at warn level — the same intended
+ * secret-logging exception as the generated first-admin password ("shown
+ * once"). Rotate/revoke it any time via `POST /v1/admin/api-keys`.
+ *
+ * Concurrency: unlike the admin bootstrap there is no unique constraint to
+ * break a tie (two replicas would mint two different keys), so the zero-check +
+ * insert runs in a transaction serialized by a pg advisory xact lock — exactly
+ * one key is ever minted on a fresh table. Never fatal: any failure is logged
+ * and boot continues (the admin API remains the manual path).
+ *
+ * Runs in the API process only (not the worker) — same boot path as
+ * `bootstrapAdminFromEnv`, after the schema guard.
+ *
+ * @param opts.client - Engine client; only its `db`, `env` and `logger`
+ *   members are read here.
+ * @returns A promise that resolves once the bootstrap has minted a key,
+ *   skipped, or failed. Errors raised inside the `try` block are caught and
+ *   logged at error level instead of being rethrown.
+ */
+export async function bootstrapApiKeyFromEnv(opts: {
+  client: HogsendClient;
+}): Promise<void> {
+  const { db, env, logger } = opts.client;
+  if (env.HOGSEND_BOOTSTRAP_API_KEY === "false") return; // explicit opt-out: skip before touching the DB
+  try {
+    // Cheap pre-check outside the transaction: every boot after the first
+    // returns here without taking the lock.
+    const existing = await db.select({ id: apiKeys.id }).from(apiKeys).limit(1);
+    if (existing.length > 0) return;
+    const minted = await db.transaction(async (tx) => { // resolves to the plaintext key, or null when nothing was minted
+      // Serialize concurrent replicas booting on a fresh DB: the loser blocks
+      // here, then sees the winner's row and no-ops. Lock is released on commit.
+      await tx.execute(
+        sql`select pg_advisory_xact_lock(hashtext('hogsend:bootstrap-api-key'))`,
+      );
+      const recheck = await tx
+        .select({ id: apiKeys.id })
+        .from(apiKeys)
+        .limit(1);
+      if (recheck.length > 0) return null; // a row appeared since the pre-check — mint nothing
+      const { key, prefix, hash } = generateApiKey(); // prefix + hash are persisted below; `key` is only returned for the one-time log
+      await tx.insert(apiKeys).values({
+        name: BOOTSTRAP_API_KEY_NAME,
+        keyPrefix: prefix,
+        keyHash: hash,
+        scopes: ["ingest"],
+        createdBy: "boot",
+      });
+      return key;
+    });
+    if (!minted) { // null ⇒ the locked re-check found an existing row
+      logger.debug(
+        "[api-keys] First-boot key bootstrap skipped: a key already exists.",
+      );
+      return;
+    }
+    // The intended secret-logging exception (mirrors the generated first-admin
+    // password). Shown once — the table is non-empty from now on, so this
+    // branch is unreachable on every subsequent boot.
+    logger.warn(
+      `[api-keys] First-boot ingest API key (shown once — save it now): ${minted}`,
+    );
+    logger.warn(
+      "[api-keys] Use it as HOGSEND_API_KEY / `Authorization: Bearer <key>` " +
+        "for POST /v1/events. Rotate or revoke via POST /v1/admin/api-keys. " +
+        "Disable this bootstrap with HOGSEND_BOOTSTRAP_API_KEY=false.",
+    );
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err); // normalize non-Error throwables to a string
+    logger.error("[api-keys] First-boot key bootstrap failed.", {
+      error: message,
+    });
+  }
+}

package/src/lib/domain-status.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { DomainStatus, EmailProvider } from "@hogsend/core";
 import type { env as envSchema } from "../env.js";
+import { hostOfFromAddress } from "./from-address.js";
 import type { Logger } from "./logger.js";
 /**
@@ -69,15 +70,34 @@ export interface DomainStatusService {
 const VERIFIED_TTL_MS = 10 * 60 * 1000;
 /** TTL while unverified/failed/unknown — keeps test-mode auto-exit ≤60 s. */
 const UNVERIFIED_TTL_MS = 60 * 1000;
+/**
+ * Back-off TTL after a permission-denied (401/403) refresh failure — e.g. a
+ * send-only restricted Resend key that cannot read the domains API. Re-probing
+ * every UNVERIFIED_TTL_MS would warn-spam forever and can never succeed until
+ * the key changes, so we assume-verified (fail-open) and go quiet for 6 h.
+ * An explicit `getStatus({ refresh: true })` (admin route / CLI) still probes.
+ *
+ * `isFresh` uses this TTL in place of the verified/unverified TTLs for as
+ * long as `permissionBlocked` is set.
+ */
+const PERMISSION_BLOCKED_TTL_MS = 6 * 60 * 60 * 1000; // 6 hours in milliseconds
-/** Extract the host part of an email address ("hello@x.com" → "x.com"). */
-function hostPartOf(email: string | undefined): string | null {
-  if (!email) return null;
-  const at = email.lastIndexOf("@");
-  if (at === -1 || at === email.length - 1) return null;
-  return email.slice(at + 1).toLowerCase();
+/**
+ * Permission-style domains-API failure detection. Matches ONLY the providers'
+ * own structured messages — plugin-resend `domains.ts` and plugin-postmark
+ * `index.ts` both emit exactly "<Provider> domains API 401: <message>" or
+ * "<Provider> domains API request failed with status 403". Deliberately
+ * narrow: a bare 401/403/"forbidden" from an intermediary (WAF/proxy/CDN) in
+ * front of the provider must NOT match — it stays on the transient path so a
+ * blip can never displace a real cached status for the long back-off TTL.
+ *
+ * @param message - The error message text to classify.
+ * @returns `true` when the message contains "domains API 401"/"domains API
+ *   403" or "domains API request failed with status 401"/"… 403" (the pattern
+ *   accepts either status code in either phrasing); `false` otherwise.
+ */
+function isPermissionDeniedMessage(message: string): boolean {
+  return /\bdomains API (?:request failed with status )?40[13]\b/.test(message); // case-sensitive; \b stops e.g. "4011" from matching
+}
+/**
+ * Extract the host part of a configured from address ("hello@x.com" or
+ * "Name <hello@x.com>" → "x.com") — display-name aware via from-address.ts.
+ *
+ * A plain alias of `hostOfFromAddress` that keeps the `hostPartOf` name of the
+ * local helper it replaces; yields `null` when the input is undefined or does
+ * not contain a valid address.
+ */
+const hostPartOf = hostOfFromAddress;
 /** The Resend unverified-domain from-address fallback (so a redirected mail
  * still delivers while the real sending domain isn't verified yet). */
 const RESEND_UNVERIFIED_FROM = "onboarding@resend.dev";
@@ -174,11 +194,19 @@ export function createDomainStatusService(deps: {
   let cache: { snapshot: EngineDomainStatus; fetchedAt: number } | null = null;
   let inflight: Promise<EngineDomainStatus> | null = null;
+  // Permission-denied (401/403) back-off state: blocked extends the cache TTL
+  // so the background refresh stops re-probing a key that can never read
+  // domains; warned gates the explanatory warn to exactly ONCE per restriction
+  // episode (a later successful probe resets both, so a NEW restriction after
+  // recovery warns again).
+  let permissionBlocked = false;
+  let permissionWarned = false;
   const isFresh = (): boolean => {
     if (!cache) return false;
-    const ttl =
-      cache.snapshot.status?.state === "verified"
+    const ttl = permissionBlocked
+      ? PERMISSION_BLOCKED_TTL_MS
+      : cache.snapshot.status?.state === "verified"
         ? VERIFIED_TTL_MS
         : UNVERIFIED_TTL_MS;
     return Date.now() - cache.fetchedAt < ttl;
@@ -210,13 +238,23 @@ export function createDomainStatusService(deps: {
   let previousActive = false;
   /** Log the entering/exiting transition exactly once per flip of `active`. */
-  const logTransition = (testMode: TestModeState): void => {
+  const logTransition = (
+    testMode: TestModeState,
+    opts?: { assumedVerified?: boolean },
+  ): void => {
     if (testMode.active === previousActive) return;
     if (testMode.active) {
       logger.warn(
         "test mode ACTIVE — domain unverified, redirecting all sends",
         { redirectTo: testMode.redirectTo, reason: testMode.reason },
       );
+    } else if (opts?.assumedVerified) {
+      // Permission-block fail-open: verification was UNREADABLE, not
+      // confirmed — never claim "domain verified" on this path.
+      logger.warn(
+        "test mode exited — domain status unreadable (permission denied), failing open to LIVE sends",
+        { domain },
+      );
     } else {
       logger.info("test mode exited — domain verified, sends are LIVE", {
         domain,
@@ -230,7 +268,10 @@ export function createDomainStatusService(deps: {
    * `testMode` off the JUST-written cache and fire the transition log on a flip.
    * Test mode is computed last so it reads the fresh verification state.
    */
-  const commitSnapshot = (status: DomainStatus | null): EngineDomainStatus => {
+  const commitSnapshot = (
+    status: DomainStatus | null,
+    opts?: { assumedVerified?: boolean },
+  ): EngineDomainStatus => {
     // Seed the cache with a placeholder testMode so `computeTestMode` reads the
     // fresh `status`, then overwrite the block with the resolved state.
     const snapshot: EngineDomainStatus = {
@@ -248,7 +289,7 @@ export function createDomainStatusService(deps: {
     cache = { snapshot, fetchedAt: Date.now() };
     const testMode = computeTestMode();
     snapshot.testMode = testMode;
-    logTransition(testMode);
+    logTransition(testMode, opts);
     return snapshot;
   };
@@ -263,6 +304,10 @@ export function createDomainStatusService(deps: {
     // biome-ignore lint/style/noNonNullAssertion: `supported` guarantees it.
     const capability = provider.domains!;
     const providerStatus = await capability.get(domain);
+    // The key CAN read domains after all — clear any permission back-off so a
+    // key swap recovers immediately and a future restriction warns once again.
+    permissionBlocked = false;
+    permissionWarned = false;
     return commitSnapshot(
       // Provider doesn't know the domain yet → an explicit not_found status
       // (the Studio Setup view keys its add-domain form off this).
@@ -286,6 +331,41 @@ export function createDomainStatusService(deps: {
     return inflight;
   };
+  /**
+   * A 401/403 from the provider domains API (e.g. a send-only restricted
+   * Resend key): warn ONCE with what it means, then back off under the long
+   * {@link PERMISSION_BLOCKED_TTL_MS} so the background refresh stops
+   * re-probing + re-warning every UNVERIFIED_TTL_MS. When the cache already
+   * holds a REAL snapshot it is KEPT (TTL extended only) — overwriting a
+   * genuinely-unverified status with assumed-verified `null` would disarm an
+   * armed auto test-mode for 6h off one permission-shaped failure. Only a
+   * cold cache commits the assumed-verified `null` snapshot (fail-open
+   * verified, the existing contract — production mail is never redirected).
+   *
+   * @param message - The provider error message; attached to the one-time
+   *   warning as `error` and not otherwise inspected here.
+   */
+  const markPermissionBlocked = (message: string): void => {
+    permissionBlocked = true; // switches isFresh over to the long back-off TTL
+    if (!permissionWarned) {
+      permissionWarned = true; // set before logging so the warning is emitted at most once per episode
+      logger.warn(
+        "domain-status: the email provider API key cannot read domains " +
+          "(permission denied). Keeping the last fetched domain status; " +
+          "without one, verification is assumed-verified (fail-open: " +
+          "production mail is never redirected) and HOGSEND_TEST_MODE=auto " +
+          "cannot arm. Set HOGSEND_TEST_MODE=true to force test-mode " +
+          "redirects, or use a full-access API key. Suppressing domain " +
+          "checks for 6h.",
+        { domain, providerId, error: message },
+      );
+    }
+    if (cache) {
+      // Preserve the last real snapshot as the truth; just push its
+      // freshness window out to the back-off TTL.
+      cache.fetchedAt = Date.now();
+      return;
+    }
+    commitSnapshot(null, { assumedVerified: true }); // cold cache: commit a null status, flagged so the transition log does not claim "verified"
+  };
   return {
     async getStatus(opts?: { refresh?: boolean }): Promise<EngineDomainStatus> {
       if (opts?.refresh) {
@@ -316,10 +396,19 @@ export function createDomainStatusService(deps: {
     refreshIfStale(): void {
       if (isFresh()) return;
       void fetchDeduped().catch((error: unknown) => {
+        const message = error instanceof Error ? error.message : String(error);
+        // Permission-style failure (401/403, e.g. a send-only restricted key):
+        // one explanatory warn + long back-off instead of warn-spam forever.
+        if (isPermissionDeniedMessage(message)) {
+          markPermissionBlocked(message);
+          return;
+        }
+        // Transient failures (network, 5xx) keep the existing behavior: warn
+        // every stale refresh, short TTL, fail-open verified via the cache.
         logger.warn("domain-status refresh failed", {
           domain,
           providerId,
-          error: error instanceof Error ? error.message : String(error),
+          error: message,
         });
       });
     },

package/src/lib/from-address.ts ADDED Viewed

@@ -0,0 +1,29 @@
+/**
+ * From-address helpers. A configured from address may be a bare addr-spec
+ * ("doug@hogsend.com") or carry a display name ("Doug at Hogsend
+ * <doug@hogsend.com>") — both are valid on the wire for every supported
+ * provider. These helpers parse either form so env validation and
+ * domain derivation agree on what the address part is.
+ */
+const ADDR_SPEC_RE = /^[^\s@<>]+@[^\s@<>]+\.[^\s@<>]+$/; // local@domain.tld: exactly one "@", at least one "." after it, no whitespace or angle brackets anywhere
+/**
+ * Extract the addr-spec from a from address ("Doug <d@x.com>" → "d@x.com",
+ * "d@x.com" → "d@x.com"). Returns null when no valid address is present.
+ *
+ * @param value - A bare address or a `Display Name <address>` string; may be
+ *   undefined or empty.
+ * @returns The address part, trimmed and lower-cased (local part included),
+ *   or `null` when `value` is empty/undefined or the extracted text does not
+ *   match `ADDR_SPEC_RE`.
+ */
+export function addrSpecOf(value: string | undefined): string | null {
+  if (!value) return null; // undefined and "" both mean "no address"
+  const match = value.trim().match(/^[^<>]*<([^<>]+)>$/); // optional display name, then a single <...> pair ending the string
+  const addr = (match?.[1] ?? value).trim(); // no bracketed form ⇒ treat the whole value as the candidate address
+  return ADDR_SPEC_RE.test(addr) ? addr.toLowerCase() : null;
+}
+/**
+ * Host part of a from address ("Doug <d@x.com>" → "x.com").
+ *
+ * @param value - A bare address or a `Display Name <address>` string.
+ * @returns Everything after the last "@" of the parsed address (already
+ *   lower-cased by `addrSpecOf`), or `null` when no valid address is present.
+ */
+export function hostOfFromAddress(value: string | undefined): string | null {
+  const addr = addrSpecOf(value);
+  if (!addr) return null;
+  const at = addr.lastIndexOf("@");
+  if (at === -1 || at === addr.length - 1) return null; // defensive: a non-null addrSpecOf result already contains "@" followed by a non-empty domain
+  return addr.slice(at + 1);
+}

package/src/lib/logger.ts CHANGED Viewed

@@ -12,7 +12,9 @@ export function createLogger(level: string = "info") {
       winston.format.timestamp(),
       winston.format.errors({ stack: true }),
     ),
-    defaultMeta: { service: "growthhog-api" },
+    // Service label for structured logs. Override per-deploy with SERVICE_NAME;
+    // the neutral default keeps scaffolded apps from inheriting dogfood branding.
+    defaultMeta: { service: process.env.SERVICE_NAME ?? "hogsend" },
     transports: [
       new winston.transports.Console({
         format: winston.format.combine(

package/src/routes/health.ts CHANGED Viewed

@@ -1,6 +1,12 @@
-import { getClientSchemaVersion, getEngineSchemaVersion } from "@hogsend/db";
+import {
+  type Database,
+  emailSends,
+  getClientSchemaVersion,
+  getEngineSchemaVersion,
+  journeyStates,
+} from "@hogsend/db";
 import { createRoute, OpenAPIHono, z } from "@hono/zod-openapi";
-import { sql } from "drizzle-orm";
+import { gte, sql } from "drizzle-orm";
 import type { AppEnv } from "../app.js";
 import { API_VERSION } from "../env.js";
 import { getRedis } from "../lib/redis.js";
@@ -29,6 +35,22 @@ const trackSchema = z.object({
   pending: z.array(z.string()),
 });
+// Recent activity counts (last 24h). Surfaces silent failures — a failed
+// journey or send otherwise only shows in worker logs while health stays
+// green. Informational only: counts never affect `status`, and a query
+// failure degrades each count to null rather than breaking health.
+const activitySchema = z.object({
+  windowHours: z.number(), // length of the lookback window, in hours
+  journeys: z.object({
+    failed: z.number().nullable(), // null = count could not be read
+    completed: z.number().nullable(), // null = count could not be read
+  }),
+  emails: z.object({
+    failed: z.number().nullable(), // null = count could not be read
+    sent: z.number().nullable(), // null = count could not be read
+  }),
+});
 const healthResponseSchema = z.object({
   status: z.enum(["healthy", "degraded", "migration_pending"]),
   uptime: z.number(),
@@ -43,6 +65,7 @@ const healthResponseSchema = z.object({
     engine: trackSchema,
     client: trackSchema,
   }),
+  activity: activitySchema,
 });
 const healthRoute = createRoute({
@@ -60,12 +83,114 @@ const healthRoute = createRoute({
   },
 });
+const ACTIVITY_WINDOW_HOURS = 24; // lookback window for the activity counts, in hours
+type Activity = z.infer<typeof activitySchema>; // TS shape of the `activity` response block, derived from the schema
+const NULL_ACTIVITY: Activity = { // degraded value: every count null; one shared object returned on timeout or query failure
+  windowHours: ACTIVITY_WINDOW_HOURS,
+  journeys: { failed: null, completed: null },
+  emails: { failed: null, sent: null },
+};
+// Reporting must never slow the healthcheck down: an unreachable DB makes the
+// COUNT queries hang on connect (the component check above answers "down"
+// fast, but a fresh query can queue behind the pool), so the whole thing is
+// raced against a short deadline and degrades to nulls.
+const ACTIVITY_TIMEOUT_MS = 1500; // deadline for the whole activity read, in milliseconds
+// Cheap windowed COUNTs (one FILTER query per table; the time columns are
+// indexed — email_sends_created_at_idx and journey_states_updated_at_idx —
+// so each prunes by index instead of seq-scanning on every healthcheck hit).
+// Never throws — any failure degrades to nulls so a reporting hiccup can't
+// take the healthcheck down.
+async function getRecentActivity(db: Database): Promise<Activity> {
+  return Promise.race([ // whichever settles first wins: the real counts or the NULL_ACTIVITY deadline
+    queryRecentActivity(db),
+    new Promise<Activity>((resolve) =>
+      setTimeout(() => resolve(NULL_ACTIVITY), ACTIVITY_TIMEOUT_MS).unref?.(), // unref (where the timer handle has it) so the pending timer cannot keep the process alive. NOTE(review): the timer is never cleared, so it still fires after the query wins
+    ),
+  ]);
+}
+async function queryRecentActivity(db: Database): Promise<Activity> { // runs both COUNT queries in parallel; resolves to NULL_ACTIVITY instead of rejecting on failure
+  const since = new Date(Date.now() - ACTIVITY_WINDOW_HOURS * 60 * 60 * 1000); // start of the lookback window
+  try {
+    const [journeyRows, emailRows] = await Promise.all([
+      db
+        .select({
+          failed: sql<number>`count(*) filter (where ${journeyStates.status} = 'failed')`, // both counts come from one scan of the windowed rows
+          completed: sql<number>`count(*) filter (where ${journeyStates.status} = 'completed')`,
+        })
+        .from(journeyStates)
+        // updatedAt (set on every status transition) so a journey entered
+        // days ago that failed/completed within the window still counts.
+        .where(gte(journeyStates.updatedAt, since)),
+      db
+        .select({
+          failed: sql<number>`count(*) filter (where ${emailSends.status} = 'failed')`,
+          sent: sql<number>`count(*) filter (where ${emailSends.status} in ('sent', 'delivered', 'opened', 'clicked', 'bounced', 'complained'))`, // "sent" here counts every listed post-send status, bounced/complained included
+        })
+        .from(emailSends)
+        .where(gte(emailSends.createdAt, since)), // unlike journeys, emails are windowed by creation time
+    ]);
+    return {
+      windowHours: ACTIVITY_WINDOW_HOURS,
+      journeys: {
+        failed: Number(journeyRows[0]?.failed ?? 0), // Number(): presumably the driver may hand COUNT back as a string — confirm; `?? 0` covers a missing row
+        completed: Number(journeyRows[0]?.completed ?? 0),
+      },
+      emails: {
+        failed: Number(emailRows[0]?.failed ?? 0),
+        sent: Number(emailRows[0]?.sent ?? 0),
+      },
+    };
+  } catch { // any query error degrades to all-null counts; the error itself is not logged here
+    return NULL_ACTIVITY;
+  }
+}
+// A component that can't answer quickly IS down for healthcheck purposes —
+// an unreachable Redis otherwise stalls the probe on ioredis reconnect
+// backoff, and a connection-refused Postgres makes postgres-js retry the
+// connect (default connect_timeout 30s) rather than reject, so EVERY db
+// consumer in this handler must be raced against a deadline or /v1/health
+// itself hangs.
+const COMPONENT_TIMEOUT_MS = 1500; // per-read deadline in milliseconds, shared by checkComponent and withDeadline
+// Race a read against the component deadline, resolving to `fallback` when the deadline fires first.
+// Unlike checkComponent this preserves the read's value type.
+async function withDeadline<T>(read: Promise<T>, fallback: T): Promise<T> {
+  return Promise.race([
+    read, // NOTE(review): only slowness degrades to `fallback`; a rejection of `read` propagates to the caller unchanged
+    new Promise<T>((resolve) =>
+      setTimeout(() => resolve(fallback), COMPONENT_TIMEOUT_MS).unref?.(), // unref (where available) so the timer cannot keep the process alive. NOTE(review): never cleared, so it still fires after `read` wins
+    ),
+  ]);
+}
+// Degraded schema read: the timeout means the DB didn't answer, which the
+// `database` component already reports — claiming `migration_pending` on top
+// of that would be spurious, so an unreadable track degrades to in-sync.
+const NULL_SCHEMA = {
+  required: null, // unknown: the track could not be read
+  applied: null, // unknown: the track could not be read
+  pending: [] as string[], // typed empty list: nothing is reported as pending
+  inSync: true, // deliberately true so an unreadable track is not reported as behind
+};
 async function checkComponent(
   fn: () => Promise<void>,
 ): Promise<{ status: "up" | "down"; latencyMs: number }> {
   const start = performance.now();
+  const timeout = new Promise<never>((_, reject) =>
+    setTimeout(
+      () => reject(new Error("component check timed out")),
+      COMPONENT_TIMEOUT_MS,
+    ).unref?.(),
+  );
   try {
-    await fn();
+    await Promise.race([fn(), timeout]);
     return {
       status: "up",
       latencyMs: Math.round(performance.now() - start),
@@ -83,23 +208,28 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
   async (c) => {
     const { db, clientJournal } = c.get("container");
-    const [dbCheck, redisCheck, heartbeat, engine, client] = await Promise.all([
-      checkComponent(async () => {
-        await db.execute(sql`SELECT 1`);
-      }),
-      checkComponent(async () => {
-        // Actively probe: getRedis() lazily creates + connects the client (with
-        // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
-        // a client if something had ALREADY created one — which nothing does when
-        // PostHog is disabled — so redis always read "down" even though it was
-        // reachable. ioredis buffers the ping until connected (or rejects if the
-        // host is genuinely unreachable → a truthful "down").
-        await getRedis().ping();
-      }),
-      getWorkerHeartbeat(),
-      getEngineSchemaVersion(db),
-      getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
-    ]);
+    const [dbCheck, redisCheck, heartbeat, engine, client, activity] =
+      await Promise.all([
+        checkComponent(async () => {
+          await db.execute(sql`SELECT 1`);
+        }),
+        checkComponent(async () => {
+          // Actively probe: getRedis() lazily creates + connects the client (with
+          // family:0 for Railway IPv6). The old getRedisIfConnected() only returned
+          // a client if something had ALREADY created one — which nothing does when
+          // PostHog is disabled — so redis always read "down" even though it was
+          // reachable. ioredis buffers the ping until connected (or rejects if the
+          // host is genuinely unreachable → a truthful "down").
+          await getRedis().ping();
+        }),
+        withDeadline(getWorkerHeartbeat(), { alive: false }),
+        withDeadline(getEngineSchemaVersion(db), NULL_SCHEMA),
+        withDeadline(
+          getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
+          NULL_SCHEMA,
+        ),
+        getRecentActivity(db),
+      ]);
     // `migration_pending` if EITHER track is behind. The engine track also gates
     // boot (fatal); the client track surfaces here non-fatally (client-owned).
@@ -139,6 +269,7 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
             lastSeenAt: heartbeat.lastSeenAt,
           },
         },
+        activity,
       },
       200,
     );

package/src/worker.ts CHANGED Viewed

@@ -5,7 +5,7 @@ import {
 } from "./buckets/registry.js";
 import type { HogsendClient } from "./container.js";
 import type { DefinedJourney } from "./journeys/define-journey.js";
-import { selectJourneyTasks } from "./journeys/registry.js";
+import { parseEnabledFilter, selectJourneyTasks } from "./journeys/registry.js";
 import { reportWorkerReady } from "./lib/boot.js";
 import { hatchet } from "./lib/hatchet.js";
 import { getRedisIfConnected } from "./lib/redis.js";
@@ -49,6 +49,13 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
   const { container, journeys } = opts;
   const enabled = opts.enabledJourneys ?? container.env.ENABLED_JOURNEYS;
   const journeyTasks = selectJourneyTasks(journeys, enabled);
+  // The enabled journey IDs, logged at startup so a stale worker (one missing a
+  // newly added journey because the dev watcher never restarted it) is visible
+  // at a glance — counts alone can't show WHICH journeys are registered.
+  const journeyFilter = parseEnabledFilter(enabled);
+  const journeyIds = journeys
+    .filter((j) => journeyFilter === "*" || journeyFilter.has(j.meta.id))
+    .map((j) => j.meta.id);
   const enabledBuckets = opts.enabledBuckets ?? container.env.ENABLED_BUCKETS;
   // The single place a bucket's per-user fast-expiry timer task is constructed
@@ -111,6 +118,7 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
     // "ready" line only fires once `hatchet.worker()` resolves).
     container.logger.info("Hogsend worker starting", {
       hatchet: container.env.HATCHET_CLIENT_HOST_PORT,
+      journeys: journeyIds,
     });
     _worker = await hatchet.worker("hogsend-worker", { workflows });

package/src/workflows/check-alerts.ts CHANGED Viewed

@@ -1,7 +1,67 @@
-import { createDatabase } from "@hogsend/db";
+import {
+  createDatabase,
+  type Database,
+  emailSends,
+  journeyStates,
+} from "@hogsend/db";
+import { and, eq, gte, sql } from "drizzle-orm";
 import { checkAlertRules } from "../lib/alerting.js";
 import { hatchet } from "../lib/hatchet.js";
-import { createLogger } from "../lib/logger.js";
+import { createLogger, type Logger } from "../lib/logger.js";
+const FAILURE_WINDOW_MINUTES = 60; // lookback window for the ruleless failure check, in minutes
+// Ruleless failure surfacing. The configured alert rules already cover failed
+// journeys (journey_failure_spike) and failed sends (they drag delivery_issue's
+// delivery rate down) — but a fresh install has NO alert_rules rows, so a
+// provider 403 fails silently while health stays green. This logs an error
+// for ANY failed send / failed journey state in the window, no rule required.
+async function surfaceRecentFailures(opts: {
+  db: Database;
+  logger: Logger;
+}): Promise<void> {
+  const { db, logger } = opts;
+  const since = new Date(Date.now() - FAILURE_WINDOW_MINUTES * 60 * 1000); // start of the lookback window
+  try {
+    const [journeyRows, emailRows] = await Promise.all([ // the two COUNTs are independent, so they run in parallel
+      db
+        .select({ count: sql<number>`count(*)` })
+        .from(journeyStates)
+        .where(
+          and(
+            eq(journeyStates.status, "failed"),
+            gte(journeyStates.updatedAt, since), // journeys are windowed by last update
+          ),
+        ),
+      db
+        .select({ count: sql<number>`count(*)` })
+        .from(emailSends)
+        .where(
+          and(
+            eq(emailSends.status, "failed"),
+            gte(emailSends.createdAt, since), // emails are windowed by creation time
+          ),
+        ),
+    ]);
+    const failedJourneys = Number(journeyRows[0]?.count ?? 0); // Number(): presumably COUNT may arrive as a string — confirm; `?? 0` covers a missing row
+    const failedEmails = Number(emailRows[0]?.count ?? 0);
+    if (failedJourneys > 0 || failedEmails > 0) { // stay silent when the window is clean
+      logger.error("Recent failures detected", {
+        failedJourneys,
+        failedEmails,
+        windowMinutes: FAILURE_WINDOW_MINUTES,
+        hint: "Check journey_states.error_message and email_sends rows; /v1/health `activity` shows 24h counts",
+      });
+    }
+  } catch (err) { // best-effort: a failed check is logged at warn level and never rethrown
+    logger.warn("Failed to check recent failures", {
+      error: err instanceof Error ? err.message : String(err),
+    });
+  }
+}
 export const checkAlertsTask = hatchet.task({
   name: "check-alerts",
@@ -13,6 +73,8 @@ export const checkAlertsTask = hatchet.task({
     });
     const logger = createLogger(process.env.LOG_LEVEL ?? "info");
+    await surfaceRecentFailures({ db, logger });
     await checkAlertRules({
       db,
       logger,