@hogsend/engine 0.12.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -7
- package/src/env.ts +8 -0
- package/src/index.ts +2 -0
- package/src/journeys/define-journey.ts +8 -0
- package/src/lib/boot-api-key.ts +101 -0
- package/src/lib/domain-status.ts +96 -6
- package/src/lib/logger.ts +3 -1
- package/src/routes/health.ts +151 -20
- package/src/worker.ts +9 -1
- package/src/workflows/check-alerts.ts +64 -2
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hogsend/engine",
|
|
3
|
-
"version": "0.12.1",
|
|
3
|
+
"version": "0.13.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"repository": {
|
|
@@ -40,14 +40,14 @@
|
|
|
40
40
|
"svix": "^1.95.1",
|
|
41
41
|
"winston": "^3.19.0",
|
|
42
42
|
"zod": "^4.4.3",
|
|
43
|
-
"@hogsend/core": "^0.
|
|
44
|
-
"@hogsend/db": "^0.
|
|
45
|
-
"@hogsend/email": "^0.
|
|
46
|
-
"@hogsend/plugin-posthog": "^0.
|
|
47
|
-
"@hogsend/plugin-resend": "^0.
|
|
43
|
+
"@hogsend/core": "^0.13.0",
|
|
44
|
+
"@hogsend/db": "^0.13.0",
|
|
45
|
+
"@hogsend/email": "^0.13.0",
|
|
46
|
+
"@hogsend/plugin-posthog": "^0.13.0",
|
|
47
|
+
"@hogsend/plugin-resend": "^0.13.0"
|
|
48
48
|
},
|
|
49
49
|
"optionalDependencies": {
|
|
50
|
-
"@hogsend/plugin-postmark": "^0.
|
|
50
|
+
"@hogsend/plugin-postmark": "^0.13.0"
|
|
51
51
|
},
|
|
52
52
|
"devDependencies": {
|
|
53
53
|
"@types/node": "^22.15.3",
|
package/src/env.ts
CHANGED
|
@@ -35,6 +35,14 @@ export const env = createEnv({
|
|
|
35
35
|
// (the single intended secret-logging exception) — rotate it immediately via
|
|
36
36
|
// the Studio forgot/reset flow. Min length matches better-auth's policy.
|
|
37
37
|
STUDIO_ADMIN_PASSWORD: z.string().min(8).optional(),
|
|
38
|
+
// --- First-boot data-plane key bootstrap (lib/boot-api-key.ts) ---
|
|
39
|
+
// When the api_keys table is COMPLETELY empty on API boot (a template
|
|
40
|
+
// deploy that never ran the local `pnpm bootstrap`), the engine mints one
|
|
41
|
+
// ingest-scoped key ("bootstrap-ingest") and prints the FULL key ONCE to
|
|
42
|
+
// the server log — the data-plane sibling of the first-admin password
|
|
43
|
+
// above. Set "false" to opt out. A string enum (not z.coerce.boolean) so
|
|
44
|
+
// an explicit "false" actually disables it.
|
|
45
|
+
HOGSEND_BOOTSTRAP_API_KEY: z.enum(["true", "false"]).default("true"),
|
|
38
46
|
// Extra origins allowed to call the auth endpoints (beyond BETTER_AUTH_URL),
|
|
39
47
|
// comma-separated. Needed when the Studio is served from a different origin
|
|
40
48
|
// than the API — e.g. the `hogsend studio` CLI pointing at a remote instance.
|
package/src/index.ts
CHANGED
|
@@ -156,6 +156,8 @@ export {
|
|
|
156
156
|
reportWorkerReady,
|
|
157
157
|
type WorkerReadyInfo,
|
|
158
158
|
} from "./lib/boot.js";
|
|
159
|
+
// --- First-boot data-plane key bootstrap (API process only, mirrors admin) ---
|
|
160
|
+
export { bootstrapApiKeyFromEnv } from "./lib/boot-api-key.js";
|
|
159
161
|
// --- First-admin creation (CLI + boot bootstrap share this scrypt-correct path)
|
|
160
162
|
export { bootstrapAdminFromEnv } from "./lib/bootstrap-admin.js";
|
|
161
163
|
// --- Bucket transition emission (shared by real-time / cron / fast-expiry) ---
|
package/src/journeys/define-journey.ts
CHANGED
|
@@ -267,6 +267,14 @@ export function defineJourney(options: {
|
|
|
267
267
|
return { stateId, status: "exited" };
|
|
268
268
|
}
|
|
269
269
|
|
|
270
|
+
logger.error("Journey run failed", {
|
|
271
|
+
journeyId: meta.id,
|
|
272
|
+
journeyName: meta.name,
|
|
273
|
+
stateId,
|
|
274
|
+
userId,
|
|
275
|
+
error: message,
|
|
276
|
+
});
|
|
277
|
+
|
|
270
278
|
await hatchet.events.push("journey:failed", {
|
|
271
279
|
journeyId: meta.id,
|
|
272
280
|
stateId,
|
package/src/lib/boot-api-key.ts
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { apiKeys } from "@hogsend/db";
|
|
2
|
+
import { sql } from "drizzle-orm";
|
|
3
|
+
import type { HogsendClient } from "../container.js";
|
|
4
|
+
import { generateApiKey } from "./api-key-hash.js";
|
|
5
|
+
|
|
6
|
+
/** Name of the key minted by the first-boot bootstrap (visible in Studio/admin). */
|
|
7
|
+
export const BOOTSTRAP_API_KEY_NAME = "bootstrap-ingest";
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Boot-time first-key bootstrap — the data-plane sibling of
|
|
11
|
+
* `bootstrapAdminFromEnv` (lib/bootstrap-admin.ts). A local scaffold runs
|
|
12
|
+
* `pnpm bootstrap`, which mints an ingest-scoped `hsk_` key into `api_keys`
|
|
13
|
+
* BEFORE first boot — but a template deploy (Railway) never runs that script,
|
|
14
|
+
* so a fresh deployed instance has NO data-plane key and the customer's first
|
|
15
|
+
* `POST /v1/events` has nothing to authenticate with. This closes that gap.
|
|
16
|
+
*
|
|
17
|
+
* Contract (all conditions must hold to mint):
|
|
18
|
+
* - `HOGSEND_BOOTSTRAP_API_KEY` is not `"false"` (default on; set `false` to
|
|
19
|
+
* opt out entirely).
|
|
20
|
+
* - The `api_keys` table has ZERO rows — revoked included, i.e. truly first
|
|
21
|
+
* boot. Any existing row (including the local-bootstrap key) ⇒ no-op, so
|
|
22
|
+
* the full key is naturally never logged twice.
|
|
23
|
+
*
|
|
24
|
+
* What is minted: one key named `bootstrap-ingest` with `scopes: ["ingest"]`
|
|
25
|
+
* — exactly what the scaffold's local bootstrap mints. Only the sha256 hash is
|
|
26
|
+
* stored (same `generateApiKey` the admin api-keys route uses); the FULL key is
|
|
27
|
+
* printed ONCE to the server log at warn level — the same intended
|
|
28
|
+
* secret-logging exception as the generated first-admin password ("shown
|
|
29
|
+
* once"). Rotate/revoke it any time via `POST /v1/admin/api-keys`.
|
|
30
|
+
*
|
|
31
|
+
* Concurrency: unlike the admin bootstrap there is no unique constraint to
|
|
32
|
+
* break a tie (two replicas would mint two different keys), so the zero-check +
|
|
33
|
+
* insert runs in a transaction serialized by a pg advisory xact lock — exactly
|
|
34
|
+
* one key is ever minted on a fresh table. Never fatal: any failure is logged
|
|
35
|
+
* and boot continues (the admin API remains the manual path).
|
|
36
|
+
*
|
|
37
|
+
* Runs in the API process only (not the worker) — same boot path as
|
|
38
|
+
* `bootstrapAdminFromEnv`, after the schema guard.
|
|
39
|
+
*/
|
|
40
|
+
export async function bootstrapApiKeyFromEnv(opts: {
|
|
41
|
+
client: HogsendClient;
|
|
42
|
+
}): Promise<void> {
|
|
43
|
+
const { db, env, logger } = opts.client;
|
|
44
|
+
|
|
45
|
+
if (env.HOGSEND_BOOTSTRAP_API_KEY === "false") return;
|
|
46
|
+
|
|
47
|
+
try {
|
|
48
|
+
// Cheap pre-check outside the transaction: every boot after the first
|
|
49
|
+
// returns here without taking the lock.
|
|
50
|
+
const existing = await db.select({ id: apiKeys.id }).from(apiKeys).limit(1);
|
|
51
|
+
if (existing.length > 0) return;
|
|
52
|
+
|
|
53
|
+
const minted = await db.transaction(async (tx) => {
|
|
54
|
+
// Serialize concurrent replicas booting on a fresh DB: the loser blocks
|
|
55
|
+
// here, then sees the winner's row and no-ops. Lock is released on commit.
|
|
56
|
+
await tx.execute(
|
|
57
|
+
sql`select pg_advisory_xact_lock(hashtext('hogsend:bootstrap-api-key'))`,
|
|
58
|
+
);
|
|
59
|
+
|
|
60
|
+
const recheck = await tx
|
|
61
|
+
.select({ id: apiKeys.id })
|
|
62
|
+
.from(apiKeys)
|
|
63
|
+
.limit(1);
|
|
64
|
+
if (recheck.length > 0) return null;
|
|
65
|
+
|
|
66
|
+
const { key, prefix, hash } = generateApiKey();
|
|
67
|
+
await tx.insert(apiKeys).values({
|
|
68
|
+
name: BOOTSTRAP_API_KEY_NAME,
|
|
69
|
+
keyPrefix: prefix,
|
|
70
|
+
keyHash: hash,
|
|
71
|
+
scopes: ["ingest"],
|
|
72
|
+
createdBy: "boot",
|
|
73
|
+
});
|
|
74
|
+
return key;
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
if (!minted) {
|
|
78
|
+
logger.debug(
|
|
79
|
+
"[api-keys] First-boot key bootstrap skipped: a key already exists.",
|
|
80
|
+
);
|
|
81
|
+
return;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// The intended secret-logging exception (mirrors the generated first-admin
|
|
85
|
+
// password). Shown once — the table is non-empty from now on, so this
|
|
86
|
+
// branch is unreachable on every subsequent boot.
|
|
87
|
+
logger.warn(
|
|
88
|
+
`[api-keys] First-boot ingest API key (shown once — save it now): ${minted}`,
|
|
89
|
+
);
|
|
90
|
+
logger.warn(
|
|
91
|
+
"[api-keys] Use it as HOGSEND_API_KEY / `Authorization: Bearer <key>` " +
|
|
92
|
+
"for POST /v1/events. Rotate or revoke via POST /v1/admin/api-keys. " +
|
|
93
|
+
"Disable this bootstrap with HOGSEND_BOOTSTRAP_API_KEY=false.",
|
|
94
|
+
);
|
|
95
|
+
} catch (err) {
|
|
96
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
97
|
+
logger.error("[api-keys] First-boot key bootstrap failed.", {
|
|
98
|
+
error: message,
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
}
|
package/src/lib/domain-status.ts
CHANGED
|
@@ -69,6 +69,27 @@ export interface DomainStatusService {
|
|
|
69
69
|
const VERIFIED_TTL_MS = 10 * 60 * 1000;
|
|
70
70
|
/** TTL while unverified/failed/unknown — keeps test-mode auto-exit ≤60 s. */
|
|
71
71
|
const UNVERIFIED_TTL_MS = 60 * 1000;
|
|
72
|
+
/**
|
|
73
|
+
* Back-off TTL after a permission-denied (401/403) refresh failure — e.g. a
|
|
74
|
+
* send-only restricted Resend key that cannot read the domains API. Re-probing
|
|
75
|
+
* every UNVERIFIED_TTL_MS would warn-spam forever and can never succeed until
|
|
76
|
+
* the key changes, so we assume-verified (fail-open) and go quiet for 6 h.
|
|
77
|
+
* An explicit `getStatus({ refresh: true })` (admin route / CLI) still probes.
|
|
78
|
+
*/
|
|
79
|
+
const PERMISSION_BLOCKED_TTL_MS = 6 * 60 * 60 * 1000;
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Permission-style domains-API failure detection. Matches ONLY the providers'
|
|
83
|
+
* own structured messages — plugin-resend `domains.ts` and plugin-postmark
|
|
84
|
+
* `index.ts` both emit exactly "<Provider> domains API 401: <message>" or
|
|
85
|
+
* "<Provider> domains API request failed with status 403". Deliberately
|
|
86
|
+
* narrow: a bare 401/403/"forbidden" from an intermediary (WAF/proxy/CDN) in
|
|
87
|
+
* front of the provider must NOT match — it stays on the transient path so a
|
|
88
|
+
* blip can never displace a real cached status for the long back-off TTL.
|
|
89
|
+
*/
|
|
90
|
+
function isPermissionDeniedMessage(message: string): boolean {
|
|
91
|
+
return /\bdomains API (?:request failed with status )?40[13]\b/.test(message);
|
|
92
|
+
}
|
|
72
93
|
|
|
73
94
|
/** Extract the host part of an email address ("hello@x.com" → "x.com"). */
|
|
74
95
|
function hostPartOf(email: string | undefined): string | null {
|
|
@@ -174,11 +195,19 @@ export function createDomainStatusService(deps: {
|
|
|
174
195
|
|
|
175
196
|
let cache: { snapshot: EngineDomainStatus; fetchedAt: number } | null = null;
|
|
176
197
|
let inflight: Promise<EngineDomainStatus> | null = null;
|
|
198
|
+
// Permission-denied (401/403) back-off state: blocked extends the cache TTL
|
|
199
|
+
// so the background refresh stops re-probing a key that can never read
|
|
200
|
+
// domains; warned gates the explanatory warn to exactly ONCE per restriction
|
|
201
|
+
// episode (a later successful probe resets both, so a NEW restriction after
|
|
202
|
+
// recovery warns again).
|
|
203
|
+
let permissionBlocked = false;
|
|
204
|
+
let permissionWarned = false;
|
|
177
205
|
|
|
178
206
|
const isFresh = (): boolean => {
|
|
179
207
|
if (!cache) return false;
|
|
180
|
-
const ttl =
|
|
181
|
-
|
|
208
|
+
const ttl = permissionBlocked
|
|
209
|
+
? PERMISSION_BLOCKED_TTL_MS
|
|
210
|
+
: cache.snapshot.status?.state === "verified"
|
|
182
211
|
? VERIFIED_TTL_MS
|
|
183
212
|
: UNVERIFIED_TTL_MS;
|
|
184
213
|
return Date.now() - cache.fetchedAt < ttl;
|
|
@@ -210,13 +239,23 @@ export function createDomainStatusService(deps: {
|
|
|
210
239
|
let previousActive = false;
|
|
211
240
|
|
|
212
241
|
/** Log the entering/exiting transition exactly once per flip of `active`. */
|
|
213
|
-
const logTransition = (
|
|
242
|
+
const logTransition = (
|
|
243
|
+
testMode: TestModeState,
|
|
244
|
+
opts?: { assumedVerified?: boolean },
|
|
245
|
+
): void => {
|
|
214
246
|
if (testMode.active === previousActive) return;
|
|
215
247
|
if (testMode.active) {
|
|
216
248
|
logger.warn(
|
|
217
249
|
"test mode ACTIVE — domain unverified, redirecting all sends",
|
|
218
250
|
{ redirectTo: testMode.redirectTo, reason: testMode.reason },
|
|
219
251
|
);
|
|
252
|
+
} else if (opts?.assumedVerified) {
|
|
253
|
+
// Permission-block fail-open: verification was UNREADABLE, not
|
|
254
|
+
// confirmed — never claim "domain verified" on this path.
|
|
255
|
+
logger.warn(
|
|
256
|
+
"test mode exited — domain status unreadable (permission denied), failing open to LIVE sends",
|
|
257
|
+
{ domain },
|
|
258
|
+
);
|
|
220
259
|
} else {
|
|
221
260
|
logger.info("test mode exited — domain verified, sends are LIVE", {
|
|
222
261
|
domain,
|
|
@@ -230,7 +269,10 @@ export function createDomainStatusService(deps: {
|
|
|
230
269
|
* `testMode` off the JUST-written cache and fire the transition log on a flip.
|
|
231
270
|
* Test mode is computed last so it reads the fresh verification state.
|
|
232
271
|
*/
|
|
233
|
-
const commitSnapshot = (
|
|
272
|
+
const commitSnapshot = (
|
|
273
|
+
status: DomainStatus | null,
|
|
274
|
+
opts?: { assumedVerified?: boolean },
|
|
275
|
+
): EngineDomainStatus => {
|
|
234
276
|
// Seed the cache with a placeholder testMode so `computeTestMode` reads the
|
|
235
277
|
// fresh `status`, then overwrite the block with the resolved state.
|
|
236
278
|
const snapshot: EngineDomainStatus = {
|
|
@@ -248,7 +290,7 @@ export function createDomainStatusService(deps: {
|
|
|
248
290
|
cache = { snapshot, fetchedAt: Date.now() };
|
|
249
291
|
const testMode = computeTestMode();
|
|
250
292
|
snapshot.testMode = testMode;
|
|
251
|
-
logTransition(testMode);
|
|
293
|
+
logTransition(testMode, opts);
|
|
252
294
|
return snapshot;
|
|
253
295
|
};
|
|
254
296
|
|
|
@@ -263,6 +305,10 @@ export function createDomainStatusService(deps: {
|
|
|
263
305
|
// biome-ignore lint/style/noNonNullAssertion: `supported` guarantees it.
|
|
264
306
|
const capability = provider.domains!;
|
|
265
307
|
const providerStatus = await capability.get(domain);
|
|
308
|
+
// The key CAN read domains after all — clear any permission back-off so a
|
|
309
|
+
// key swap recovers immediately and a future restriction warns once again.
|
|
310
|
+
permissionBlocked = false;
|
|
311
|
+
permissionWarned = false;
|
|
266
312
|
return commitSnapshot(
|
|
267
313
|
// Provider doesn't know the domain yet → an explicit not_found status
|
|
268
314
|
// (the Studio Setup view keys its add-domain form off this).
|
|
@@ -286,6 +332,41 @@ export function createDomainStatusService(deps: {
|
|
|
286
332
|
return inflight;
|
|
287
333
|
};
|
|
288
334
|
|
|
335
|
+
/**
|
|
336
|
+
* A 401/403 from the provider domains API (e.g. a send-only restricted
|
|
337
|
+
* Resend key): warn ONCE with what it means, then back off under the long
|
|
338
|
+
* {@link PERMISSION_BLOCKED_TTL_MS} so the background refresh stops
|
|
339
|
+
* re-probing + re-warning every UNVERIFIED_TTL_MS. When the cache already
|
|
340
|
+
* holds a REAL snapshot it is KEPT (TTL extended only) — overwriting a
|
|
341
|
+
* genuinely-unverified status with assumed-verified `null` would disarm an
|
|
342
|
+
* armed auto test-mode for 6h off one permission-shaped failure. Only a
|
|
343
|
+
* cold cache commits the assumed-verified `null` snapshot (fail-open
|
|
344
|
+
* verified, the existing contract — production mail is never redirected).
|
|
345
|
+
*/
|
|
346
|
+
const markPermissionBlocked = (message: string): void => {
|
|
347
|
+
permissionBlocked = true;
|
|
348
|
+
if (!permissionWarned) {
|
|
349
|
+
permissionWarned = true;
|
|
350
|
+
logger.warn(
|
|
351
|
+
"domain-status: the email provider API key cannot read domains " +
|
|
352
|
+
"(permission denied). Keeping the last fetched domain status; " +
|
|
353
|
+
"without one, verification is assumed-verified (fail-open: " +
|
|
354
|
+
"production mail is never redirected) and HOGSEND_TEST_MODE=auto " +
|
|
355
|
+
"cannot arm. Set HOGSEND_TEST_MODE=true to force test-mode " +
|
|
356
|
+
"redirects, or use a full-access API key. Suppressing domain " +
|
|
357
|
+
"checks for 6h.",
|
|
358
|
+
{ domain, providerId, error: message },
|
|
359
|
+
);
|
|
360
|
+
}
|
|
361
|
+
if (cache) {
|
|
362
|
+
// Preserve the last real snapshot as the truth; just push its
|
|
363
|
+
// freshness window out to the back-off TTL.
|
|
364
|
+
cache.fetchedAt = Date.now();
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
commitSnapshot(null, { assumedVerified: true });
|
|
368
|
+
};
|
|
369
|
+
|
|
289
370
|
return {
|
|
290
371
|
async getStatus(opts?: { refresh?: boolean }): Promise<EngineDomainStatus> {
|
|
291
372
|
if (opts?.refresh) {
|
|
@@ -316,10 +397,19 @@ export function createDomainStatusService(deps: {
|
|
|
316
397
|
refreshIfStale(): void {
|
|
317
398
|
if (isFresh()) return;
|
|
318
399
|
void fetchDeduped().catch((error: unknown) => {
|
|
400
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
401
|
+
// Permission-style failure (401/403, e.g. a send-only restricted key):
|
|
402
|
+
// one explanatory warn + long back-off instead of warn-spam forever.
|
|
403
|
+
if (isPermissionDeniedMessage(message)) {
|
|
404
|
+
markPermissionBlocked(message);
|
|
405
|
+
return;
|
|
406
|
+
}
|
|
407
|
+
// Transient failures (network, 5xx) keep the existing behavior: warn
|
|
408
|
+
// every stale refresh, short TTL, fail-open verified via the cache.
|
|
319
409
|
logger.warn("domain-status refresh failed", {
|
|
320
410
|
domain,
|
|
321
411
|
providerId,
|
|
322
|
-
error:
|
|
412
|
+
error: message,
|
|
323
413
|
});
|
|
324
414
|
});
|
|
325
415
|
},
|
package/src/lib/logger.ts
CHANGED
|
@@ -12,7 +12,9 @@ export function createLogger(level: string = "info") {
|
|
|
12
12
|
winston.format.timestamp(),
|
|
13
13
|
winston.format.errors({ stack: true }),
|
|
14
14
|
),
|
|
15
|
-
|
|
15
|
+
// Service label for structured logs. Override per-deploy with SERVICE_NAME;
|
|
16
|
+
// the neutral default keeps scaffolded apps from inheriting dogfood branding.
|
|
17
|
+
defaultMeta: { service: process.env.SERVICE_NAME ?? "hogsend" },
|
|
16
18
|
transports: [
|
|
17
19
|
new winston.transports.Console({
|
|
18
20
|
format: winston.format.combine(
|
package/src/routes/health.ts
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
type Database,
|
|
3
|
+
emailSends,
|
|
4
|
+
getClientSchemaVersion,
|
|
5
|
+
getEngineSchemaVersion,
|
|
6
|
+
journeyStates,
|
|
7
|
+
} from "@hogsend/db";
|
|
2
8
|
import { createRoute, OpenAPIHono, z } from "@hono/zod-openapi";
|
|
3
|
-
import { sql } from "drizzle-orm";
|
|
9
|
+
import { gte, sql } from "drizzle-orm";
|
|
4
10
|
import type { AppEnv } from "../app.js";
|
|
5
11
|
import { API_VERSION } from "../env.js";
|
|
6
12
|
import { getRedis } from "../lib/redis.js";
|
|
@@ -29,6 +35,22 @@ const trackSchema = z.object({
|
|
|
29
35
|
pending: z.array(z.string()),
|
|
30
36
|
});
|
|
31
37
|
|
|
38
|
+
// Recent activity counts (last 24h). Surfaces silent failures — a failed
|
|
39
|
+
// journey or send otherwise only shows in worker logs while health stays
|
|
40
|
+
// green. Informational only: counts never affect `status`, and a query
|
|
41
|
+
// failure degrades each count to null rather than breaking health.
|
|
42
|
+
const activitySchema = z.object({
|
|
43
|
+
windowHours: z.number(),
|
|
44
|
+
journeys: z.object({
|
|
45
|
+
failed: z.number().nullable(),
|
|
46
|
+
completed: z.number().nullable(),
|
|
47
|
+
}),
|
|
48
|
+
emails: z.object({
|
|
49
|
+
failed: z.number().nullable(),
|
|
50
|
+
sent: z.number().nullable(),
|
|
51
|
+
}),
|
|
52
|
+
});
|
|
53
|
+
|
|
32
54
|
const healthResponseSchema = z.object({
|
|
33
55
|
status: z.enum(["healthy", "degraded", "migration_pending"]),
|
|
34
56
|
uptime: z.number(),
|
|
@@ -43,6 +65,7 @@ const healthResponseSchema = z.object({
|
|
|
43
65
|
engine: trackSchema,
|
|
44
66
|
client: trackSchema,
|
|
45
67
|
}),
|
|
68
|
+
activity: activitySchema,
|
|
46
69
|
});
|
|
47
70
|
|
|
48
71
|
const healthRoute = createRoute({
|
|
@@ -60,12 +83,114 @@ const healthRoute = createRoute({
|
|
|
60
83
|
},
|
|
61
84
|
});
|
|
62
85
|
|
|
86
|
+
const ACTIVITY_WINDOW_HOURS = 24;
|
|
87
|
+
|
|
88
|
+
type Activity = z.infer<typeof activitySchema>;
|
|
89
|
+
|
|
90
|
+
const NULL_ACTIVITY: Activity = {
|
|
91
|
+
windowHours: ACTIVITY_WINDOW_HOURS,
|
|
92
|
+
journeys: { failed: null, completed: null },
|
|
93
|
+
emails: { failed: null, sent: null },
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
// Reporting must never slow the healthcheck down: an unreachable DB makes the
|
|
97
|
+
// COUNT queries hang on connect (the component check above answers "down"
|
|
98
|
+
// fast, but a fresh query can queue behind the pool), so the whole thing is
|
|
99
|
+
// raced against a short deadline and degrades to nulls.
|
|
100
|
+
const ACTIVITY_TIMEOUT_MS = 1500;
|
|
101
|
+
|
|
102
|
+
// Cheap windowed COUNTs (one FILTER query per table; the time columns are
|
|
103
|
+
// indexed — email_sends_created_at_idx and journey_states_updated_at_idx —
|
|
104
|
+
// so each prunes by index instead of seq-scanning on every healthcheck hit).
|
|
105
|
+
// Never throws — any failure degrades to nulls so a reporting hiccup can't
|
|
106
|
+
// take the healthcheck down.
|
|
107
|
+
async function getRecentActivity(db: Database): Promise<Activity> {
|
|
108
|
+
return Promise.race([
|
|
109
|
+
queryRecentActivity(db),
|
|
110
|
+
new Promise<Activity>((resolve) =>
|
|
111
|
+
setTimeout(() => resolve(NULL_ACTIVITY), ACTIVITY_TIMEOUT_MS).unref?.(),
|
|
112
|
+
),
|
|
113
|
+
]);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
async function queryRecentActivity(db: Database): Promise<Activity> {
|
|
117
|
+
const since = new Date(Date.now() - ACTIVITY_WINDOW_HOURS * 60 * 60 * 1000);
|
|
118
|
+
try {
|
|
119
|
+
const [journeyRows, emailRows] = await Promise.all([
|
|
120
|
+
db
|
|
121
|
+
.select({
|
|
122
|
+
failed: sql<number>`count(*) filter (where ${journeyStates.status} = 'failed')`,
|
|
123
|
+
completed: sql<number>`count(*) filter (where ${journeyStates.status} = 'completed')`,
|
|
124
|
+
})
|
|
125
|
+
.from(journeyStates)
|
|
126
|
+
// updatedAt (set on every status transition) so a journey entered
|
|
127
|
+
// days ago that failed/completed within the window still counts.
|
|
128
|
+
.where(gte(journeyStates.updatedAt, since)),
|
|
129
|
+
db
|
|
130
|
+
.select({
|
|
131
|
+
failed: sql<number>`count(*) filter (where ${emailSends.status} = 'failed')`,
|
|
132
|
+
sent: sql<number>`count(*) filter (where ${emailSends.status} in ('sent', 'delivered', 'opened', 'clicked', 'bounced', 'complained'))`,
|
|
133
|
+
})
|
|
134
|
+
.from(emailSends)
|
|
135
|
+
.where(gte(emailSends.createdAt, since)),
|
|
136
|
+
]);
|
|
137
|
+
return {
|
|
138
|
+
windowHours: ACTIVITY_WINDOW_HOURS,
|
|
139
|
+
journeys: {
|
|
140
|
+
failed: Number(journeyRows[0]?.failed ?? 0),
|
|
141
|
+
completed: Number(journeyRows[0]?.completed ?? 0),
|
|
142
|
+
},
|
|
143
|
+
emails: {
|
|
144
|
+
failed: Number(emailRows[0]?.failed ?? 0),
|
|
145
|
+
sent: Number(emailRows[0]?.sent ?? 0),
|
|
146
|
+
},
|
|
147
|
+
};
|
|
148
|
+
} catch {
|
|
149
|
+
return NULL_ACTIVITY;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// A component that can't answer quickly IS down for healthcheck purposes —
|
|
154
|
+
// an unreachable Redis otherwise stalls the probe on ioredis reconnect
|
|
155
|
+
// backoff, and a connection-refused Postgres makes postgres-js retry the
|
|
156
|
+
// connect (default connect_timeout 30s) rather than reject, so EVERY db
|
|
157
|
+
// consumer in this handler must be raced against a deadline or /v1/health
|
|
158
|
+
// itself hangs.
|
|
159
|
+
const COMPONENT_TIMEOUT_MS = 1500;
|
|
160
|
+
|
|
161
|
+
// Race a read against the component deadline, degrading to `fallback`.
|
|
162
|
+
// Unlike checkComponent this preserves the read's value type.
|
|
163
|
+
async function withDeadline<T>(read: Promise<T>, fallback: T): Promise<T> {
|
|
164
|
+
return Promise.race([
|
|
165
|
+
read,
|
|
166
|
+
new Promise<T>((resolve) =>
|
|
167
|
+
setTimeout(() => resolve(fallback), COMPONENT_TIMEOUT_MS).unref?.(),
|
|
168
|
+
),
|
|
169
|
+
]);
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// Degraded schema read: the timeout means the DB didn't answer, which the
|
|
173
|
+
// `database` component already reports — claiming `migration_pending` on top
|
|
174
|
+
// of that would be spurious, so an unreadable track degrades to in-sync.
|
|
175
|
+
const NULL_SCHEMA = {
|
|
176
|
+
required: null,
|
|
177
|
+
applied: null,
|
|
178
|
+
pending: [] as string[],
|
|
179
|
+
inSync: true,
|
|
180
|
+
};
|
|
181
|
+
|
|
63
182
|
async function checkComponent(
|
|
64
183
|
fn: () => Promise<void>,
|
|
65
184
|
): Promise<{ status: "up" | "down"; latencyMs: number }> {
|
|
66
185
|
const start = performance.now();
|
|
186
|
+
const timeout = new Promise<never>((_, reject) =>
|
|
187
|
+
setTimeout(
|
|
188
|
+
() => reject(new Error("component check timed out")),
|
|
189
|
+
COMPONENT_TIMEOUT_MS,
|
|
190
|
+
).unref?.(),
|
|
191
|
+
);
|
|
67
192
|
try {
|
|
68
|
-
await fn();
|
|
193
|
+
await Promise.race([fn(), timeout]);
|
|
69
194
|
return {
|
|
70
195
|
status: "up",
|
|
71
196
|
latencyMs: Math.round(performance.now() - start),
|
|
@@ -83,23 +208,28 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
|
|
|
83
208
|
async (c) => {
|
|
84
209
|
const { db, clientJournal } = c.get("container");
|
|
85
210
|
|
|
86
|
-
const [dbCheck, redisCheck, heartbeat, engine, client] =
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
211
|
+
const [dbCheck, redisCheck, heartbeat, engine, client, activity] =
|
|
212
|
+
await Promise.all([
|
|
213
|
+
checkComponent(async () => {
|
|
214
|
+
await db.execute(sql`SELECT 1`);
|
|
215
|
+
}),
|
|
216
|
+
checkComponent(async () => {
|
|
217
|
+
// Actively probe: getRedis() lazily creates + connects the client (with
|
|
218
|
+
// family:0 for Railway IPv6). The old getRedisIfConnected() only returned
|
|
219
|
+
// a client if something had ALREADY created one — which nothing does when
|
|
220
|
+
// PostHog is disabled — so redis always read "down" even though it was
|
|
221
|
+
// reachable. ioredis buffers the ping until connected (or rejects if the
|
|
222
|
+
// host is genuinely unreachable → a truthful "down").
|
|
223
|
+
await getRedis().ping();
|
|
224
|
+
}),
|
|
225
|
+
withDeadline(getWorkerHeartbeat(), { alive: false }),
|
|
226
|
+
withDeadline(getEngineSchemaVersion(db), NULL_SCHEMA),
|
|
227
|
+
withDeadline(
|
|
228
|
+
getClientSchemaVersion(db, clientJournal ?? { entries: [] }),
|
|
229
|
+
NULL_SCHEMA,
|
|
230
|
+
),
|
|
231
|
+
getRecentActivity(db),
|
|
232
|
+
]);
|
|
103
233
|
|
|
104
234
|
// `migration_pending` if EITHER track is behind. The engine track also gates
|
|
105
235
|
// boot (fatal); the client track surfaces here non-fatally (client-owned).
|
|
@@ -139,6 +269,7 @@ export const healthRouter = new OpenAPIHono<AppEnv>().openapi(
|
|
|
139
269
|
lastSeenAt: heartbeat.lastSeenAt,
|
|
140
270
|
},
|
|
141
271
|
},
|
|
272
|
+
activity,
|
|
142
273
|
},
|
|
143
274
|
200,
|
|
144
275
|
);
|
package/src/worker.ts
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
} from "./buckets/registry.js";
|
|
6
6
|
import type { HogsendClient } from "./container.js";
|
|
7
7
|
import type { DefinedJourney } from "./journeys/define-journey.js";
|
|
8
|
-
import { selectJourneyTasks } from "./journeys/registry.js";
|
|
8
|
+
import { parseEnabledFilter, selectJourneyTasks } from "./journeys/registry.js";
|
|
9
9
|
import { reportWorkerReady } from "./lib/boot.js";
|
|
10
10
|
import { hatchet } from "./lib/hatchet.js";
|
|
11
11
|
import { getRedisIfConnected } from "./lib/redis.js";
|
|
@@ -49,6 +49,13 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
|
|
|
49
49
|
const { container, journeys } = opts;
|
|
50
50
|
const enabled = opts.enabledJourneys ?? container.env.ENABLED_JOURNEYS;
|
|
51
51
|
const journeyTasks = selectJourneyTasks(journeys, enabled);
|
|
52
|
+
// The enabled journey IDs, logged at startup so a stale worker (one missing a
|
|
53
|
+
// newly added journey because the dev watcher never restarted it) is visible
|
|
54
|
+
// at a glance — counts alone can't show WHICH journeys are registered.
|
|
55
|
+
const journeyFilter = parseEnabledFilter(enabled);
|
|
56
|
+
const journeyIds = journeys
|
|
57
|
+
.filter((j) => journeyFilter === "*" || journeyFilter.has(j.meta.id))
|
|
58
|
+
.map((j) => j.meta.id);
|
|
52
59
|
|
|
53
60
|
const enabledBuckets = opts.enabledBuckets ?? container.env.ENABLED_BUCKETS;
|
|
54
61
|
// The single place a bucket's per-user fast-expiry timer task is constructed
|
|
@@ -111,6 +118,7 @@ export function createWorker(opts: CreateWorkerOptions): Worker {
|
|
|
111
118
|
// "ready" line only fires once `hatchet.worker()` resolves).
|
|
112
119
|
container.logger.info("Hogsend worker starting", {
|
|
113
120
|
hatchet: container.env.HATCHET_CLIENT_HOST_PORT,
|
|
121
|
+
journeys: journeyIds,
|
|
114
122
|
});
|
|
115
123
|
|
|
116
124
|
_worker = await hatchet.worker("hogsend-worker", { workflows });
|
package/src/workflows/check-alerts.ts
CHANGED
|
@@ -1,7 +1,67 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
createDatabase,
|
|
3
|
+
type Database,
|
|
4
|
+
emailSends,
|
|
5
|
+
journeyStates,
|
|
6
|
+
} from "@hogsend/db";
|
|
7
|
+
import { and, eq, gte, sql } from "drizzle-orm";
|
|
2
8
|
import { checkAlertRules } from "../lib/alerting.js";
|
|
3
9
|
import { hatchet } from "../lib/hatchet.js";
|
|
4
|
-
import { createLogger } from "../lib/logger.js";
|
|
10
|
+
import { createLogger, type Logger } from "../lib/logger.js";
|
|
11
|
+
|
|
12
|
+
const FAILURE_WINDOW_MINUTES = 60;
|
|
13
|
+
|
|
14
|
+
// Ruleless failure surfacing. The configured alert rules already cover failed
|
|
15
|
+
// journeys (journey_failure_spike) and failed sends (they drag delivery_issue's
|
|
16
|
+
// delivery rate down) — but a fresh install has NO alert_rules rows, so a
|
|
17
|
+
// provider 403 fails silently while health stays green. This logs an error
|
|
18
|
+
// for ANY failed send / failed journey state in the window, no rule required.
|
|
19
|
+
async function surfaceRecentFailures(opts: {
|
|
20
|
+
db: Database;
|
|
21
|
+
logger: Logger;
|
|
22
|
+
}): Promise<void> {
|
|
23
|
+
const { db, logger } = opts;
|
|
24
|
+
const since = new Date(Date.now() - FAILURE_WINDOW_MINUTES * 60 * 1000);
|
|
25
|
+
|
|
26
|
+
try {
|
|
27
|
+
const [journeyRows, emailRows] = await Promise.all([
|
|
28
|
+
db
|
|
29
|
+
.select({ count: sql<number>`count(*)` })
|
|
30
|
+
.from(journeyStates)
|
|
31
|
+
.where(
|
|
32
|
+
and(
|
|
33
|
+
eq(journeyStates.status, "failed"),
|
|
34
|
+
gte(journeyStates.updatedAt, since),
|
|
35
|
+
),
|
|
36
|
+
),
|
|
37
|
+
db
|
|
38
|
+
.select({ count: sql<number>`count(*)` })
|
|
39
|
+
.from(emailSends)
|
|
40
|
+
.where(
|
|
41
|
+
and(
|
|
42
|
+
eq(emailSends.status, "failed"),
|
|
43
|
+
gte(emailSends.createdAt, since),
|
|
44
|
+
),
|
|
45
|
+
),
|
|
46
|
+
]);
|
|
47
|
+
|
|
48
|
+
const failedJourneys = Number(journeyRows[0]?.count ?? 0);
|
|
49
|
+
const failedEmails = Number(emailRows[0]?.count ?? 0);
|
|
50
|
+
|
|
51
|
+
if (failedJourneys > 0 || failedEmails > 0) {
|
|
52
|
+
logger.error("Recent failures detected", {
|
|
53
|
+
failedJourneys,
|
|
54
|
+
failedEmails,
|
|
55
|
+
windowMinutes: FAILURE_WINDOW_MINUTES,
|
|
56
|
+
hint: "Check journey_states.error_message and email_sends rows; /v1/health `activity` shows 24h counts",
|
|
57
|
+
});
|
|
58
|
+
}
|
|
59
|
+
} catch (err) {
|
|
60
|
+
logger.warn("Failed to check recent failures", {
|
|
61
|
+
error: err instanceof Error ? err.message : String(err),
|
|
62
|
+
});
|
|
63
|
+
}
|
|
64
|
+
}
|
|
5
65
|
|
|
6
66
|
export const checkAlertsTask = hatchet.task({
|
|
7
67
|
name: "check-alerts",
|
|
@@ -13,6 +73,8 @@ export const checkAlertsTask = hatchet.task({
|
|
|
13
73
|
});
|
|
14
74
|
const logger = createLogger(process.env.LOG_LEVEL ?? "info");
|
|
15
75
|
|
|
76
|
+
await surfaceRecentFailures({ db, logger });
|
|
77
|
+
|
|
16
78
|
await checkAlertRules({
|
|
17
79
|
db,
|
|
18
80
|
logger,
|