@checkstack/backend 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,111 @@
1
1
  # @checkstack/backend
2
2
 
3
+ ## 0.8.2
4
+
5
+ ### Patch Changes
6
+
7
+ - 302cd3f: fix: resilient startup routing + /health and /ready endpoints
8
+
9
+ Three fixes that together eliminate startup-race errors during boot and
10
+ hot-reload, plus a new readiness API for plugins.
11
+
12
+ 1. **TrieRouter swap (root cause).** Hono's default `SmartRouter` freezes
13
+ its matcher on the first request — any later `app.add()` throws
14
+ `MESSAGE_MATCHER_IS_ALREADY_BUILT`. Plugins register routes during
15
+ `init()` (and at runtime via `loadSinglePlugin`), so an early request
16
+ during boot would silently lock the matcher with only the module-load
17
+ routes, and every later route registration would fail. The backend
18
+ now uses `TrieRouter`, which is incremental — routes can be added at
19
+ any time, including after thousands of requests have been served.
20
+ This also future-proofs runtime plugin install.
21
+
22
+ 2. **Init gating + fail-loud.** Non-bypass requests now `await`
23
+ `routesReadyPromise` (with a 30s timeout that returns 503 + Retry-After) so
24
+ no traffic reaches Hono before the plugin API route is registered.
25
+ Init failures crash the process via `process.exit(1)` so docker/k8s
26
+ restart cleanly instead of silently serving a half-initialized
27
+ backend.
28
+
29
+ 3. **`/assets/*` fall-through.** The production frontend asset handler
30
+ now calls `next()` instead of `c.notFound()` on miss, so
31
+ plugin-asset routes registered later (`/assets/plugins/:pluginName/*`)
32
+ actually get a chance to match.
33
+
34
+ ### New: platform endpoints under `/.checkstack/*`
35
+
36
+ - `GET /.checkstack/health` — liveness, always 200 once the process is up.
37
+ - `GET /.checkstack/ready` — readiness, 503 until init completes and all
38
+ critical probes pass; 200 otherwise. Returns `{ ready, checks: [...] }`
39
+ with per-probe status, message/error and duration.
40
+
41
+ The leading `.checkstack/` prefix namespaces platform-level endpoints
42
+ away from plugin `/api/*`, runtime frontend assets, and the SPA wildcard,
43
+ leaving room for additional operator endpoints in the future.
44
+
45
+ ### New: plugin readiness API
46
+
47
+ Plugins can contribute readiness probes via the new
48
+ `coreServices.readinessRegistry` service:
49
+
50
+ ```ts
51
+ registerInit({
52
+ deps: { readiness: coreServices.readinessRegistry },
53
+ async init({ readiness }) {
54
+ readiness.register({
55
+ name: "queue.connected",
56
+ critical: true,
57
+ check: async () => ({
58
+ ok: pool.isConnected(),
59
+ message: pool.isConnected() ? undefined : "queue pool not connected",
60
+ }),
61
+ });
62
+ },
63
+ });
64
+ ```
65
+
66
+ Probes run in parallel, throwing probes are reported as `ok: false`,
67
+ and non-critical probes don't block readiness.
68
+
69
+ - Updated dependencies [302cd3f]
70
+ - @checkstack/backend-api@0.14.1
71
+ - @checkstack/cache-api@0.2.3
72
+ - @checkstack/queue-api@0.2.17
73
+ - @checkstack/signal-backend@0.2.2
74
+ - @checkstack/api-docs-common@0.1.10
75
+ - @checkstack/auth-common@0.6.4
76
+ - @checkstack/common@0.7.0
77
+ - @checkstack/drizzle-helper@0.0.4
78
+ - @checkstack/signal-common@0.2.0
79
+
80
+ ## 0.8.1
81
+
82
+ ### Patch Changes
83
+
84
+ - 2a749d3: fix: run afterPluginsReady in topological order; merge daily rollups on conflict
85
+
86
+ Two resilience fixes for the dependency chain:
87
+
88
+ 1. **Plugin loader**: Phase 3 (`afterPluginsReady`) now iterates plugins
89
+ in the same topologically-sorted order as Phase 2 (`init`). Previously
90
+ it iterated `pendingInits` in registration order, which raced
91
+ subscription-spec dependencies — catalog's afterPluginsReady registers
92
+ `catalog.system` and `catalog.group` notification targets, and emitting
93
+ plugins (incident, maintenance, …) call `registerSubscriptionSpec`
94
+ against those targets in their own afterPluginsReady. With registration
95
+ order, an emitter could run before catalog and hit
96
+ `Target type catalog.group is not registered`. Sorted order encodes
97
+ the dependency via `spec.target.ownerPlugin`, so the emitter now
98
+ always runs after the target owner.
99
+
100
+ 2. **Healthcheck retention job**: the daily rollup now upserts
101
+ `health_check_aggregates` with `ON CONFLICT DO UPDATE` instead of a
102
+ plain insert. Previously, late-arriving hourly aggregates (e.g. from
103
+ a satellite that was offline when the prior rollup ran) would crash
104
+ the rollup with a unique-constraint violation on
105
+ `(configuration_id, system_id, bucket_start, bucket_size, source_id)`.
106
+ The merge sums counts and folds min/max/p95 into the existing daily
107
+ row.
108
+
3
109
  ## 0.8.0
4
110
 
5
111
  ### Minor Changes
package/package.json CHANGED
@@ -1,26 +1,26 @@
1
1
  {
2
2
  "name": "@checkstack/backend",
3
- "version": "0.8.0",
3
+ "version": "0.8.2",
4
4
  "checkstack": {
5
5
  "type": "backend"
6
6
  },
7
7
  "type": "module",
8
8
  "scripts": {
9
9
  "dev": "bun --env-file=../../.env --watch src/index.ts",
10
- "typecheck": "tsc --noEmit",
10
+ "typecheck": "tsgo -b",
11
11
  "generate": "bun --env-file=../../.env run drizzle-kit generate",
12
12
  "lint": "bun run lint:code",
13
13
  "lint:code": "eslint . --max-warnings 0"
14
14
  },
15
15
  "dependencies": {
16
16
  "@checkstack/api-docs-common": "0.1.10",
17
- "@checkstack/auth-common": "0.6.3",
18
- "@checkstack/backend-api": "0.13.1",
17
+ "@checkstack/auth-common": "0.6.4",
18
+ "@checkstack/backend-api": "0.14.0",
19
19
  "@checkstack/common": "0.7.0",
20
20
  "@checkstack/drizzle-helper": "0.0.4",
21
- "@checkstack/cache-api": "0.2.1",
22
- "@checkstack/queue-api": "0.2.15",
23
- "@checkstack/signal-backend": "0.2.0",
21
+ "@checkstack/cache-api": "0.2.2",
22
+ "@checkstack/queue-api": "0.2.16",
23
+ "@checkstack/signal-backend": "0.2.1",
24
24
  "@checkstack/signal-common": "0.2.0",
25
25
  "@hono/zod-validator": "^0.7.6",
26
26
  "@orpc/client": "^1.13.14",
@@ -41,7 +41,7 @@
41
41
  "@types/bun": "latest",
42
42
  "@checkstack/tsconfig": "0.0.5",
43
43
  "@checkstack/scripts": "0.1.2",
44
- "@checkstack/test-utils-backend": "0.1.21",
44
+ "@checkstack/test-utils-backend": "0.1.22",
45
45
  "drizzle-kit": "^0.31.10"
46
46
  }
47
47
  }
package/src/index.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import type { Server } from "bun";
2
2
  import { type Context, Hono } from "hono";
3
+ import { TrieRouter } from "hono/router/trie-router";
3
4
  import { PluginManager } from "./plugin-manager";
4
5
  import { logger } from "hono/logger";
5
6
  import { migrate } from "drizzle-orm/node-postgres/migrator";
@@ -8,6 +9,7 @@ import path from "node:path";
8
9
  import fs from "node:fs";
9
10
  import { rootLogger } from "./logger";
10
11
  import { coreServices, coreHooks } from "@checkstack/backend-api";
12
+ import { extractErrorMessage } from "@checkstack/common";
11
13
  import { plugins } from "./schema";
12
14
  import { eq, and } from "drizzle-orm";
13
15
  import { PluginLocalInstaller } from "./services/plugin-installer";
@@ -52,9 +54,40 @@ import {
52
54
 
53
55
  import { cors } from "hono/cors";
54
56
 
55
- const app = new Hono();
57
+ // IMPORTANT: TrieRouter (not the default SmartRouter).
58
+ // SmartRouter freezes its matcher on the first incoming request — any later
59
+ // app.add() throws "Can not add a route since the matcher is already built".
60
+ // Plugins register routes asynchronously during init() and at runtime via
61
+ // loadSinglePlugin(), so we need an incremental router.
62
+ const app = new Hono({ router: new TrieRouter() });
56
63
  const pluginManager = new PluginManager();
57
64
 
65
+ /**
66
+ * Init lifecycle state.
67
+ *
68
+ * `initialized` flips to true after the entire init() completes (Phases 1-3).
69
+ * It feeds the "core.init" readiness probe consumed by /ready.
70
+ *
71
+ * `initError` is populated when init throws; the process is then exited so
72
+ * the supervisor (docker/k8s) restarts us — we never serve a half-initialized
73
+ * backend.
74
+ *
75
+ * The HTTP request gate does NOT key off these flags directly. It awaits
76
+ * `pluginManager.routesReadyPromise`, which resolves earlier — right after
77
+ * `/api/:pluginId/*` is added to the root router and BEFORE `afterPluginsReady`
78
+ * runs — so cross-plugin RPC calls during plugin boot don't deadlock on
79
+ * themselves.
80
+ */
81
+ let initError: Error | undefined;
82
+ let initialized = false;
83
+
84
+ /**
85
+ * Maximum time a gated request will wait for plugin routes to be registered
86
+ * before falling back to a 503 Service Unavailable. Without this, a wedged
87
+ * plugin would hang every non-bypass request forever.
88
+ */
89
+ const READY_WAIT_TIMEOUT_MS = 30_000;
90
+
58
91
  // WebSocket handler instance (initialized during init)
59
92
  let wsHandler: ReturnType<typeof createWebSocketHandler> | undefined;
60
93
 
@@ -82,6 +115,50 @@ app.use(
82
115
  );
83
116
  app.use("*", logger());
84
117
 
118
+ // =============================================================================
119
+ // PLATFORM ENDPOINTS — /.checkstack/*
120
+ // =============================================================================
121
+ //
122
+ // All "platform-level" endpoints (probes, future operator hooks) live under
123
+ // /.checkstack/* so they are clearly separated from plugin /api/*, runtime
124
+ // frontend assets, and the SPA wildcard. The leading dot keeps them out of
125
+ // any plugin URL space by construction.
126
+ //
127
+ // Health & readiness:
128
+ // - registered at module load; bypass the boot gate in `fetch()` so that
129
+ // orchestrators (Kubernetes, docker-compose) can probe a still-booting
130
+ // process.
131
+ // - /.checkstack/health = "process is alive"
132
+ // - /.checkstack/ready = "plugins initialized and all critical probes pass"
133
+
134
+ /** Liveness probe — answers as long as the process responds. */
135
+ app.get("/.checkstack/health", (c) => c.json({ status: "ok" }));
136
+
137
+ /**
138
+ * Readiness probe — aggregates plugin-contributed checks.
139
+ * - 503 while init is in flight or has failed
140
+ * - 503 if any critical probe is failing
141
+ * - 200 only when init completed AND all critical probes pass
142
+ */
143
+ app.get("/.checkstack/ready", async (c) => {
144
+ if (initError) {
145
+ return c.json(
146
+ { ready: false, error: initError.message, checks: [] },
147
+ 503,
148
+ { "Retry-After": "5" },
149
+ );
150
+ }
151
+ if (!initialized) {
152
+ return c.json(
153
+ { ready: false, reason: "initializing", checks: [] },
154
+ 503,
155
+ { "Retry-After": "1" },
156
+ );
157
+ }
158
+ const snapshot = await pluginManager.getReadinessRegistry().evaluate();
159
+ return c.json(snapshot, snapshot.ready ? 200 : 503);
160
+ });
161
+
85
162
  // SECURITY: Add missing standard security headers across all API responses
86
163
  app.use("/api/*", async (c, next) => {
87
164
  await next();
@@ -185,14 +262,16 @@ if (frontendDistPath && fs.existsSync(frontendDistPath)) {
185
262
  };
186
263
 
187
264
  // Serve static assets (JS, CSS, images, etc.)
188
- app.get("/assets/*", async (c) => {
265
+ // Fall through to next() on miss so plugin-asset routes (registered later
266
+ // during init at /assets/plugins/:pluginName/*) get a chance to match.
267
+ app.get("/assets/*", async (c, next) => {
189
268
  const assetPath = c.req.path.replace("/assets/", "");
190
269
  const filePath = path.join(frontendDistPath, "assets", assetPath);
191
270
 
192
271
  if (fs.existsSync(filePath)) {
193
272
  return serveFile(c, filePath);
194
273
  }
195
- return c.notFound();
274
+ return next();
196
275
  });
197
276
 
198
277
  // Serve vendor scripts (externalized React, react-router-dom, etc.)
@@ -441,16 +520,117 @@ const init = async () => {
441
520
  logger: rootLogger.child({ service: "WebSocket" }),
442
521
  });
443
522
 
523
+ // Register the core "init" readiness probe. Plugin-contributed probes are
524
+ // additive — see coreServices.readinessRegistry for the plugin-facing API.
525
+ pluginManager.getReadinessRegistry().register({
526
+ name: "core.init",
527
+ critical: true,
528
+ check: async () => ({ ok: initialized, message: initialized ? undefined : "init not complete" }),
529
+ });
530
+
444
531
  rootLogger.info("✅ Checkstack Core initialized.");
445
532
  };
446
533
 
447
- void init();
534
+ /**
535
+ * Fire-and-forget init. We deliberately don't `await` at the top level so the
536
+ * server can answer /health and /ready while plugins are still loading;
537
+ * non-bypass requests are gated via `waitForRoutesReady()` below.
538
+ */
539
+ // eslint-disable-next-line unicorn/prefer-top-level-await -- intentionally non-blocking; gates handled in waitForRoutesReady()
540
+ void (async () => {
541
+ try {
542
+ await init();
543
+ initialized = true;
544
+ } catch (error: unknown) {
545
+ initError = new Error(extractErrorMessage(error, "init failed"));
546
+ rootLogger.error(
547
+ "❌ FATAL: Checkstack Core init failed; the process will exit so the supervisor can restart it.",
548
+ initError,
549
+ );
550
+ // Give the logger one tick to flush, then exit so docker/k8s restarts us.
551
+ // A half-initialized backend silently serves broken state — restart is
552
+ // strictly better than continuing. We disable the no-process-exit rule
553
+ // because this IS the canonical fail-fast pattern for a long-running
554
+ // server entrypoint.
555
+ setTimeout(() => {
556
+ // eslint-disable-next-line unicorn/no-process-exit -- intentional fail-fast on init failure
557
+ process.exit(1);
558
+ }, 50);
559
+ }
560
+ })();
561
+
562
+ /**
563
+ * Paths that bypass the boot gate. Platform endpoints under /.checkstack/*
564
+ * MUST be reachable while the backend is still booting so orchestrators can
565
+ * probe it. Everything else waits until plugin routes are registered.
566
+ */
567
+ const BOOT_BYPASS_PREFIX = "/.checkstack/";
568
+
569
+ /**
570
+ * Wait until plugin RPC routes are registered on the root router (resolved
571
+ * inside `loadPlugins` BEFORE Phase 2 `init` / Phase 3 `afterPluginsReady`). Returns:
572
+ * - undefined when routes are ready → caller should proceed to Hono.
573
+ * - a 503 Response when init failed or the wait timed out.
574
+ *
575
+ * Why this gate, and why at this specific point:
576
+ * - Earlier (before /api/:pluginId/* is added), an incoming request would
577
+ * short-circuit through the SPA wildcard or 404 because the plugin route
578
+ * simply doesn't exist yet on the router.
579
+ * - Later (after full init), self-referencing RPC calls made from
580
+ * `afterPluginsReady` would deadlock waiting for init to complete — so
581
+ * we MUST open the gate before Phase 3 runs.
582
+ * - `loadPlugins()` resolves `routesReadyPromise` immediately after
583
+ * `registerApiRoute()`, which is the earliest point both conditions hold.
584
+ */
585
+ async function waitForRoutesReady(): Promise<Response | undefined> {
586
+ if (initError) {
587
+ return Response.json(
588
+ { error: "Backend init failed", message: initError.message },
589
+ { status: 503, headers: { "Retry-After": "5" } },
590
+ );
591
+ }
592
+ let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
593
+ // pluginManager.routesReadyPromise resolves from inside loadPlugins; it
594
+ // never rejects. The init catch handler logs + process.exit's separately.
595
+ const timedOut = await Promise.race([
596
+ pluginManager.routesReadyPromise.then(() => false),
597
+ new Promise<true>((resolve) => {
598
+ timeoutHandle = setTimeout(() => resolve(true), READY_WAIT_TIMEOUT_MS);
599
+ }),
600
+ ]);
601
+ if (timeoutHandle) clearTimeout(timeoutHandle);
602
+ if (timedOut) {
603
+ return Response.json(
604
+ { error: "Backend not ready", message: "boot timeout" },
605
+ { status: 503, headers: { "Retry-After": "5" } },
606
+ );
607
+ }
608
+ // Re-read after await — init may have rejected while we were waiting.
609
+ const errAfter = initError as Error | undefined;
610
+ if (errAfter) {
611
+ return Response.json(
612
+ { error: "Backend init failed", message: errAfter.message },
613
+ { status: 503, headers: { "Retry-After": "5" } },
614
+ );
615
+ }
616
+ return undefined;
617
+ }
448
618
 
449
619
  // Custom fetch handler that handles WebSocket upgrades
450
620
  const fetch = async (
451
621
  req: Request,
452
622
  server: Server<ServerWsData>
453
623
  ): Promise<Response | undefined> => {
624
+ const url = new URL(req.url);
625
+
626
+ // Platform endpoints (/.checkstack/*) bypass the boot gate so orchestrators
627
+ // can poll a booting process. Everything else waits until plugin routes
628
+ // are registered on the root router (resolved before Phase 2 init runs).
629
+ if (!url.pathname.startsWith(BOOT_BYPASS_PREFIX)) {
630
+ const stalled = await waitForRoutesReady();
631
+ if (stalled) return stalled;
632
+ }
633
+
454
634
  // Set the server reference for WebSocket pub/sub after startup
455
635
  if (wsHandler && !server.upgrade) {
456
636
  // Server doesn't support WebSocket upgrade (shouldn't happen with Bun)
@@ -461,8 +641,6 @@ const fetch = async (
461
641
  // Cast is safe: signal handler only reads its own fields via connectionType guard
462
642
  wsHandler?.setServer(server as unknown as Server<WebSocketData>);
463
643
 
464
- const url = new URL(req.url);
465
-
466
644
  // Handle WebSocket upgrade for signals
467
645
  if (url.pathname === "/api/signals/ws") {
468
646
  // Try to authenticate, but allow anonymous connections for broadcast signals
@@ -30,6 +30,10 @@ import {
30
30
  WebSocketRouteStoreImpl,
31
31
  createScopedWsRegistry,
32
32
  } from "../services/ws-route-registry";
33
+ import {
34
+ CoreReadinessRegistry,
35
+ createScopedReadinessRegistry,
36
+ } from "../services/readiness-registry";
33
37
 
34
38
  /**
35
39
  * Check if a PostgreSQL schema exists.
@@ -59,7 +63,11 @@ export function registerCoreServices({
59
63
  pluginRpcRouters: Map<string, unknown>;
60
64
  pluginHttpHandlers: Map<string, (req: Request) => Promise<Response>>;
61
65
  pluginContractRegistry: Map<string, unknown>;
62
- }): { collectorRegistry: CoreCollectorRegistry; wsStore: WebSocketRouteStoreImpl } {
66
+ }): {
67
+ collectorRegistry: CoreCollectorRegistry;
68
+ wsStore: WebSocketRouteStoreImpl;
69
+ readinessRegistry: CoreReadinessRegistry;
70
+ } {
63
71
  // 1. Database Factory (Scoped)
64
72
  registry.registerFactory(coreServices.database, async (metadata) => {
65
73
  const { pluginId, previousPluginIds } = metadata;
@@ -356,6 +364,17 @@ export function registerCoreServices({
356
364
  createScopedWsRegistry(globalWsStore, metadata.pluginId),
357
365
  );
358
366
 
367
+ // 11. Readiness Registry (Scoped Factory)
368
+ // Plugins contribute probes that are aggregated by the /ready endpoint.
369
+ const globalReadinessRegistry = new CoreReadinessRegistry();
370
+ registry.registerFactory(coreServices.readinessRegistry, () =>
371
+ createScopedReadinessRegistry(globalReadinessRegistry),
372
+ );
373
+
359
374
  // Return global registries for lifecycle cleanup
360
- return { collectorRegistry: globalCollectorRegistry, wsStore: globalWsStore };
375
+ return {
376
+ collectorRegistry: globalCollectorRegistry,
377
+ wsStore: globalWsStore,
378
+ readinessRegistry: globalReadinessRegistry,
379
+ };
361
380
  }
@@ -56,6 +56,14 @@ export interface PluginLoaderDeps {
56
56
  * Map of pluginId -> contract for OpenAPI generation.
57
57
  */
58
58
  pluginContractRegistry: Map<string, AnyContractRouter>;
59
+ /**
60
+ * Called once `/api/:pluginId/*` is added to the root router and Phase 2
61
+ * (per-plugin init) is about to start. From this point on, plugin RPC
62
+ * routers come online incrementally as each plugin initializes — so
63
+ * self-referencing HTTP calls (e.g. RPC made from `afterPluginsReady`)
64
+ * can be allowed through the boot-time request gate without deadlocking.
65
+ */
66
+ onApiRouteRegistered?: () => void;
59
67
  }
60
68
 
61
69
  /**
@@ -295,6 +303,19 @@ export async function loadPlugins({
295
303
  });
296
304
  registerApiRoute(rootRouter, apiHandler);
297
305
 
306
+ // Routes are now registered on the root router. Signal readiness so the
307
+ // server can stop blocking incoming requests in `waitForRoutesReady()`. We open
308
+ // the gate here (BEFORE Phase 2 / Phase 3) so that:
309
+ // - the static module-load endpoints (/api/plugins, /api/about, …) stop
310
+ // hanging behind the boot gate;
311
+ // - cross-plugin RPC calls made from `afterPluginsReady` can self-loop
312
+ // through the HTTP server without deadlocking on init completion.
313
+ // Plugin RPC routers come online incrementally as each plugin's Phase 2
314
+ // init runs; requests targeting a not-yet-initialized plugin fall through
315
+ // to the api-router's "Plugin metadata not found" 500, which is the
316
+ // pre-existing behavior and is preferable to a multi-second hang.
317
+ deps.onApiRouteRegistered?.();
318
+
298
319
  for (const id of sortedIds) {
299
320
  const p = pendingInits.find((x) => x.metadata.pluginId === id)!;
300
321
  rootLogger.info(`🚀 Initializing ${p.metadata.pluginId}...`);
@@ -515,7 +536,15 @@ export async function loadPlugins({
515
536
  );
516
537
  }
517
538
  }
518
- for (const p of pendingInits) {
539
+ // Run afterPluginsReady in topologically-sorted order, matching Phase
540
+ // 2 init order. Iterating `pendingInits` directly would use registration
541
+ // order, which races dependency chains: e.g. catalog's
542
+ // afterPluginsReady registers `catalog.group` as a notification target,
543
+ // and emitting plugins' afterPluginsReady call `registerSubscriptionSpec`
544
+ // against it — so the emitters MUST run after catalog. Topo order
545
+ // already encodes this dependency (via spec.target.ownerPlugin).
546
+ for (const id of sortedIds) {
547
+ const p = pendingInits.find((x) => x.metadata.pluginId === id)!;
519
548
  if (p.afterPluginsReady) {
520
549
  try {
521
550
  const resolvedDeps: Record<string, unknown> = {};
@@ -3,6 +3,7 @@ import { adminPool, db } from "./db";
3
3
  import { ServiceRegistry } from "./services/service-registry";
4
4
  import type { CoreCollectorRegistry } from "./services/collector-registry";
5
5
  import type { WebSocketRouteStoreImpl } from "./services/ws-route-registry";
6
+ import type { CoreReadinessRegistry } from "./services/readiness-registry";
6
7
  import {
7
8
  BackendPlugin,
8
9
  ServiceRef,
@@ -54,7 +55,21 @@ export class PluginManager {
54
55
  // Global WebSocket route store for server-level routing
55
56
  private wsStore: WebSocketRouteStoreImpl;
56
57
 
58
+ // Global readiness registry — plugins contribute probes, /ready aggregates them
59
+ private readinessRegistry: CoreReadinessRegistry;
60
+
61
+ // Resolves once `/api/:pluginId/*` is registered on the root router and
62
+ // Phase 2 (per-plugin init) is starting. The HTTP server awaits this
63
+ // promise to know when it is safe to stop gating incoming requests.
64
+ // Held as a deferred so the listener (server) can be wired up before
65
+ // loadPlugins() runs.
66
+ private resolveRoutesReady!: () => void;
67
+ readonly routesReadyPromise: Promise<void>;
68
+
57
69
  constructor() {
70
+ this.routesReadyPromise = new Promise<void>((resolve) => {
71
+ this.resolveRoutesReady = resolve;
72
+ });
58
73
  const registries = registerCoreServices({
59
74
  registry: this.registry,
60
75
  adminPool,
@@ -64,6 +79,15 @@ export class PluginManager {
64
79
  });
65
80
  this.collectorRegistry = registries.collectorRegistry;
66
81
  this.wsStore = registries.wsStore;
82
+ this.readinessRegistry = registries.readinessRegistry;
83
+ }
84
+
85
+ /**
86
+ * Get the global readiness registry so the server-level /ready endpoint
87
+ * can aggregate plugin-contributed probes.
88
+ */
89
+ getReadinessRegistry(): CoreReadinessRegistry {
90
+ return this.readinessRegistry;
67
91
  }
68
92
 
69
93
  /**
@@ -124,8 +148,13 @@ export class PluginManager {
124
148
  pluginMetadataRegistry: this.pluginMetadataRegistry,
125
149
  cleanupHandlers: this.cleanupHandlers,
126
150
  pluginContractRegistry: this.pluginContractRegistry,
151
+ onApiRouteRegistered: () => this.resolveRoutesReady(),
127
152
  },
128
153
  });
154
+ // Defensive: if loadPlugins returned without ever calling the callback
155
+ // (e.g. zero plugins discovered and no api route registered), unblock
156
+ // the server gate anyway — by this point Hono is fully configured.
157
+ this.resolveRoutesReady();
129
158
  }
130
159
 
131
160
  /**
@@ -0,0 +1,49 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import { Hono } from "hono";
3
+ import { TrieRouter } from "hono/router/trie-router";
4
+
5
+ /**
6
+ * Regression test for the "Hono router was already initialized" bug.
7
+ *
8
+ * Hono's default SmartRouter freezes its matcher on first request: any later
9
+ * `app.get/all/...` throws "Can not add a route since the matcher is already
10
+ * built". Plugins register routes during init() (and at runtime via
11
+ * loadSinglePlugin), so we use TrieRouter — which is incremental. If anyone
12
+ * ever swaps the router back to default, this test fails fast.
13
+ *
14
+ * See core/backend/src/index.ts where TrieRouter is wired up.
15
+ */
16
+ describe("Hono router (TrieRouter) supports incremental route registration", () => {
17
+ it("accepts routes added after the first request", async () => {
18
+ const app = new Hono({ router: new TrieRouter() });
19
+ app.get("/early", (c) => c.text("early"));
20
+
21
+ // Trigger matcher build (this is what freezes SmartRouter).
22
+ const r1 = await app.fetch(new Request("http://x/early"));
23
+ expect(await r1.text()).toBe("early");
24
+
25
+ // Add a route AFTER the matcher is "built". On SmartRouter this throws.
26
+ app.get("/late", (c) => c.text("late"));
27
+
28
+ const r2 = await app.fetch(new Request("http://x/late"));
29
+ expect(r2.status).toBe(200);
30
+ expect(await r2.text()).toBe("late");
31
+ });
32
+
33
+ it("accepts a parameterized route added after a request", async () => {
34
+ const app = new Hono({ router: new TrieRouter() });
35
+ app.get("/seed", (c) => c.text("seed"));
36
+ await app.fetch(new Request("http://x/seed"));
37
+
38
+ // This is the actual production scenario: /api/:pluginId/* is registered
39
+ // inside loadPlugins() during init, well after the first request may have
40
+ // already been handled.
41
+ app.all("/api/:pluginId/*", (c) =>
42
+ c.json({ pluginId: c.req.param("pluginId") }),
43
+ );
44
+
45
+ const r = await app.fetch(new Request("http://x/api/healthcheck/foo"));
46
+ expect(r.status).toBe(200);
47
+ expect(await r.json()).toEqual({ pluginId: "healthcheck" });
48
+ });
49
+ });
@@ -0,0 +1,124 @@
1
+ import { describe, it, expect, beforeEach, mock } from "bun:test";
2
+ import {
3
+ CoreReadinessRegistry,
4
+ createScopedReadinessRegistry,
5
+ } from "./readiness-registry";
6
+ import { createMockLogger } from "@checkstack/test-utils-backend";
7
+
8
+ const mockLogger = createMockLogger();
9
+ mock.module("../logger", () => ({
10
+ rootLogger: mockLogger,
11
+ }));
12
+
13
+ describe("CoreReadinessRegistry", () => {
14
+ let registry: CoreReadinessRegistry;
15
+
16
+ beforeEach(() => {
17
+ registry = new CoreReadinessRegistry();
18
+ });
19
+
20
+ it("starts empty", () => {
21
+ expect(registry.isEmpty()).toBe(true);
22
+ });
23
+
24
+ it("evaluates to ready=true with no probes", async () => {
25
+ const snapshot = await registry.evaluate();
26
+ expect(snapshot.ready).toBe(true);
27
+ expect(snapshot.checks).toHaveLength(0);
28
+ });
29
+
30
+ it("aggregates passing probes", async () => {
31
+ registry.register({
32
+ name: "db",
33
+ check: async () => ({ ok: true }),
34
+ });
35
+ registry.register({
36
+ name: "queue",
37
+ check: async () => ({ ok: true }),
38
+ });
39
+ const snapshot = await registry.evaluate();
40
+ expect(snapshot.ready).toBe(true);
41
+ expect(snapshot.checks).toHaveLength(2);
42
+ expect(snapshot.checks.every((c) => c.ok)).toBe(true);
43
+ });
44
+
45
+ it("ready=false when a critical probe fails", async () => {
46
+ registry.register({
47
+ name: "db",
48
+ check: async () => ({ ok: false, message: "down" }),
49
+ });
50
+ const snapshot = await registry.evaluate();
51
+ expect(snapshot.ready).toBe(false);
52
+ expect(snapshot.checks[0].message).toBe("down");
53
+ });
54
+
55
+ it("ready=true when only a non-critical probe fails", async () => {
56
+ registry.register({
57
+ name: "warmup",
58
+ critical: false,
59
+ check: async () => ({ ok: false }),
60
+ });
61
+ registry.register({
62
+ name: "db",
63
+ check: async () => ({ ok: true }),
64
+ });
65
+ const snapshot = await registry.evaluate();
66
+ expect(snapshot.ready).toBe(true);
67
+ });
68
+
69
+ it("treats thrown probes as failed and surfaces the error", async () => {
70
+ registry.register({
71
+ name: "boom",
72
+ check: async () => {
73
+ throw new Error("kaboom");
74
+ },
75
+ });
76
+ const snapshot = await registry.evaluate();
77
+ expect(snapshot.ready).toBe(false);
78
+ expect(snapshot.checks[0].ok).toBe(false);
79
+ expect(snapshot.checks[0].error).toBe("kaboom");
80
+ });
81
+
82
+ it("overwrites duplicate names with a warning", async () => {
83
+ registry.register({
84
+ name: "db",
85
+ check: async () => ({ ok: false }),
86
+ });
87
+ registry.register({
88
+ name: "db",
89
+ check: async () => ({ ok: true }),
90
+ });
91
+ const snapshot = await registry.evaluate();
92
+ expect(snapshot.checks).toHaveLength(1);
93
+ expect(snapshot.checks[0].ok).toBe(true);
94
+ });
95
+
96
+ it("runs probes in parallel (total time ~ slowest probe)", async () => {
97
+ const delay = (ms: number) => new Promise((r) => setTimeout(r, ms));
98
+ registry.register({
99
+ name: "slow-1",
100
+ check: async () => {
101
+ await delay(50);
102
+ return { ok: true };
103
+ },
104
+ });
105
+ registry.register({
106
+ name: "slow-2",
107
+ check: async () => {
108
+ await delay(50);
109
+ return { ok: true };
110
+ },
111
+ });
112
+ const start = Date.now();
113
+ await registry.evaluate();
114
+ const elapsed = Date.now() - start;
115
+ // Sequential would be ~100ms; parallel should be ~50ms.
116
+ expect(elapsed).toBeLessThan(95);
117
+ });
118
+
119
+ it("scoped registry forwards register() to the global", () => {
120
+ const scoped = createScopedReadinessRegistry(registry);
121
+ scoped.register({ name: "x", check: async () => ({ ok: true }) });
122
+ expect(registry.isEmpty()).toBe(false);
123
+ });
124
+ });
@@ -0,0 +1,103 @@
1
+ import type {
2
+ ReadinessCheck,
3
+ ReadinessCheckResult,
4
+ ReadinessRegistry,
5
+ } from "@checkstack/backend-api";
6
+ import { extractErrorMessage } from "@checkstack/common";
7
+ import { rootLogger } from "../logger";
8
+
9
+ /**
10
+ * Snapshot returned to /ready callers.
11
+ */
12
+ export interface ReadinessSnapshot {
13
+ ready: boolean;
14
+ checks: Array<{
15
+ name: string;
16
+ critical: boolean;
17
+ ok: boolean;
18
+ message?: string;
19
+ /** Set when the probe threw (a throwing probe is always reported as ok=false). */
20
+ error?: string;
21
+ /** Wall-clock duration for the probe (milliseconds). */
22
+ durationMs: number;
23
+ }>;
24
+ }
25
+
26
+ /**
27
+ * Core implementation backing both `coreServices.readinessRegistry` (plugin-facing)
28
+ * and the `/ready` endpoint (server-facing). Plugins call `register`; the server
29
+ * calls `evaluate()`.
30
+ */
31
+ export class CoreReadinessRegistry {
32
+ private checks: ReadinessCheck[] = [];
33
+
34
+ register(check: ReadinessCheck): void {
35
+ if (this.checks.some((c) => c.name === check.name)) {
36
+ rootLogger.warn(
37
+ `ReadinessRegistry: probe '${check.name}' is already registered. Overwriting.`,
38
+ );
39
+ this.checks = this.checks.filter((c) => c.name !== check.name);
40
+ }
41
+ this.checks.push(check);
42
+ rootLogger.debug(
43
+ ` -> Registered readiness probe '${check.name}' (critical=${check.critical ?? true})`,
44
+ );
45
+ }
46
+
47
+ /**
48
+ * Run every probe in parallel. Critical failures set `ready = false`.
49
+ * Throws are caught and reported as `ok: false`.
50
+ */
51
+ async evaluate(): Promise<ReadinessSnapshot> {
52
+ const results = await Promise.all(
53
+ this.checks.map(async (c) => {
54
+ const start = performance.now();
55
+ const critical = c.critical ?? true;
56
+ try {
57
+ const r: ReadinessCheckResult = await c.check();
58
+ return {
59
+ name: c.name,
60
+ critical,
61
+ ok: r.ok,
62
+ message: r.message,
63
+ durationMs: Math.round(performance.now() - start),
64
+ };
65
+ } catch (error) {
66
+ return {
67
+ name: c.name,
68
+ critical,
69
+ ok: false,
70
+ error: extractErrorMessage(error, String(error)),
71
+ durationMs: Math.round(performance.now() - start),
72
+ };
73
+ }
74
+ }),
75
+ );
76
+
77
+ const ready = results.every((r) => r.ok || !r.critical);
78
+ return { ready, checks: results };
79
+ }
80
+
81
+ /**
82
+ * Returns true while no probes are registered. Used to give a stable answer
83
+ * before plugins have had a chance to register their checks.
84
+ */
85
+ isEmpty(): boolean {
86
+ return this.checks.length === 0;
87
+ }
88
+ }
89
+
90
+ /**
91
+ * Plugin-facing scoped view (currently a plain pass-through to the underlying registry's `register` —
92
+ * we intentionally don't namespace probe names by plugin so operators can read
93
+ * them at a glance, but plugins are encouraged to prefix their own names).
94
+ */
95
+ export function createScopedReadinessRegistry(
96
+ global: CoreReadinessRegistry,
97
+ ): ReadinessRegistry {
98
+ return {
99
+ register(check: ReadinessCheck) {
100
+ global.register(check);
101
+ },
102
+ };
103
+ }
package/tsconfig.json CHANGED
@@ -2,5 +2,37 @@
2
2
  "extends": "@checkstack/tsconfig/backend.json",
3
3
  "include": [
4
4
  "src"
5
+ ],
6
+ "references": [
7
+ {
8
+ "path": "../api-docs-common"
9
+ },
10
+ {
11
+ "path": "../auth-common"
12
+ },
13
+ {
14
+ "path": "../backend-api"
15
+ },
16
+ {
17
+ "path": "../cache-api"
18
+ },
19
+ {
20
+ "path": "../common"
21
+ },
22
+ {
23
+ "path": "../drizzle-helper"
24
+ },
25
+ {
26
+ "path": "../queue-api"
27
+ },
28
+ {
29
+ "path": "../signal-backend"
30
+ },
31
+ {
32
+ "path": "../signal-common"
33
+ },
34
+ {
35
+ "path": "../test-utils-backend"
36
+ }
5
37
  ]
6
- }
38
+ }