@checkstack/backend 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/CHANGELOG.md +280 -0
  2. package/drizzle/0001_slim_mordo.sql +34 -0
  3. package/drizzle/meta/0001_snapshot.json +444 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +14 -9
  6. package/src/index.ts +460 -23
  7. package/src/plugin-deregistration.test.ts +137 -0
  8. package/src/plugin-manager/api-router.ts +35 -11
  9. package/src/plugin-manager/core-services.ts +21 -2
  10. package/src/plugin-manager/plugin-loader.ts +94 -0
  11. package/src/plugin-manager.ts +324 -105
  12. package/src/router-incremental.test.ts +49 -0
  13. package/src/schema.ts +79 -1
  14. package/src/services/compatibility-checker.test.ts +146 -0
  15. package/src/services/compatibility-checker.ts +137 -0
  16. package/src/services/dev-auth.test.ts +87 -0
  17. package/src/services/dev-auth.ts +56 -0
  18. package/src/services/plugin-artifact-store.ts +131 -0
  19. package/src/services/plugin-bundle-resolver.ts +76 -0
  20. package/src/services/plugin-event-recorder.ts +87 -0
  21. package/src/services/plugin-installers/catalog-installer.ts +33 -0
  22. package/src/services/plugin-installers/github-installer.ts +207 -0
  23. package/src/services/plugin-installers/install-from-tarball.ts +69 -0
  24. package/src/services/plugin-installers/installer-registry.ts +51 -0
  25. package/src/services/plugin-installers/npm-installer.ts +156 -0
  26. package/src/services/plugin-installers/plugin-install-error.ts +37 -0
  27. package/src/services/plugin-installers/tarball-installer.ts +80 -0
  28. package/src/services/plugin-installers/tarball-utils.test.ts +200 -0
  29. package/src/services/plugin-installers/tarball-utils.ts +172 -0
  30. package/src/services/plugin-manager-orchestrator.ts +522 -0
  31. package/src/services/plugin-manager-router.ts +219 -0
  32. package/src/services/readiness-registry.test.ts +124 -0
  33. package/src/services/readiness-registry.ts +103 -0
  34. package/src/utils/plugin-discovery.test.ts +6 -0
  35. package/src/utils/plugin-discovery.ts +6 -1
  36. package/tsconfig.json +36 -1
  37. package/src/plugin-lifecycle.test.ts +0 -276
  38. package/src/plugin-manager/plugin-admin-router.ts +0 -89
  39. package/src/services/plugin-installer.test.ts +0 -90
  40. package/src/services/plugin-installer.ts +0 -70
package/src/index.ts CHANGED
@@ -1,26 +1,45 @@
1
1
  import type { Server } from "bun";
2
2
  import { type Context, Hono } from "hono";
3
+ import { TrieRouter } from "hono/router/trie-router";
3
4
  import { PluginManager } from "./plugin-manager";
4
- import { logger } from "hono/logger";
5
5
  import { migrate } from "drizzle-orm/node-postgres/migrator";
6
6
  import { db } from "./db";
7
7
  import path from "node:path";
8
8
  import fs from "node:fs";
9
9
  import { rootLogger } from "./logger";
10
10
  import { coreServices, coreHooks } from "@checkstack/backend-api";
11
+ import { extractErrorMessage } from "@checkstack/common";
11
12
  import { plugins } from "./schema";
12
13
  import { eq, and } from "drizzle-orm";
13
- import { PluginLocalInstaller } from "./services/plugin-installer";
14
14
  import { QueuePluginRegistryImpl } from "./services/queue-plugin-registry";
15
15
  import { QueueManagerImpl } from "./services/queue-manager";
16
16
  import { CachePluginRegistryImpl } from "./services/cache-plugin-registry";
17
17
  import { CacheManagerImpl } from "./services/cache-manager";
18
+ import { PostgresPluginArtifactStore } from "./services/plugin-artifact-store";
19
+ import { DefaultPluginInstallerRegistry } from "./services/plugin-installers/installer-registry";
20
+ import { PluginEventRecorder } from "./services/plugin-event-recorder";
21
+ import { createPluginManagerRouter } from "./services/plugin-manager-router";
22
+ import {
23
+ pluginManagerAccessRules,
24
+ pluginMetadata as pluginManagerMetadata,
25
+ pluginManagerAccess,
26
+ } from "@checkstack/pluginmanager-common";
27
+ import {
28
+ extractPackageJson,
29
+ tryExtractBundle,
30
+ MAX_TARBALL_SIZE_BYTES,
31
+ } from "./services/plugin-installers/tarball-utils";
18
32
  import {
19
33
  createWebSocketHandler,
20
34
  SignalServiceImpl,
21
35
  type WebSocketData,
22
36
  } from "@checkstack/signal-backend";
23
- import type { WsConnectionHandlers } from "@checkstack/backend-api";
37
+ import type {
38
+ AuthService,
39
+ BackendPlugin,
40
+ WsConnectionHandlers,
41
+ } from "@checkstack/backend-api";
42
+ import { createDevAuthService } from "./services/dev-auth";
24
43
 
25
44
  // =============================================================================
26
45
  // SERVER-LEVEL WEBSOCKET DATA
@@ -44,7 +63,6 @@ import {
44
63
  PLUGIN_INSTALLED,
45
64
  PLUGIN_DEREGISTERED,
46
65
  } from "@checkstack/signal-common";
47
- import { createPluginAdminRouter } from "./plugin-manager/plugin-admin-router";
48
66
  import {
49
67
  pluginMetadata as apiDocsMetadata,
50
68
  apiDocsAccess,
@@ -52,9 +70,40 @@ import {
52
70
 
53
71
  import { cors } from "hono/cors";
54
72
 
55
- const app = new Hono();
73
+ // IMPORTANT: TrieRouter (not the default SmartRouter).
74
+ // SmartRouter freezes its matcher on the first incoming request — any later
75
+ // app.add() throws "Can not add a route since the matcher is already built".
76
+ // Plugins register routes asynchronously during init() and at runtime via
77
+ // loadSinglePlugin(), so we need an incremental router.
78
+ const app = new Hono({ router: new TrieRouter() });
56
79
  const pluginManager = new PluginManager();
57
80
 
81
+ /**
82
+ * Init lifecycle state.
83
+ *
84
+ * `initialized` flips to true after the entire init() completes (Phases 1-3).
85
+ * It feeds the "core.init" readiness probe consumed by /.checkstack/ready.
86
+ *
87
+ * `initError` is populated when init throws; the process is then exited so
88
+ * the supervisor (docker/k8s) restarts us — we never serve a half-initialized
89
+ * backend.
90
+ *
91
+ * The HTTP request gate does NOT key off these flags directly. It awaits
92
+ * `pluginManager.routesReadyPromise`, which resolves earlier — right after
93
+ * `/api/:pluginId/*` is added to the root router and BEFORE `afterPluginsReady`
94
+ * runs — so cross-plugin RPC calls during plugin boot don't deadlock on
95
+ * themselves.
96
+ */
97
+ let initError: Error | undefined;
98
+ let initialized = false;
99
+
100
+ /**
101
+ * Maximum time a gated request will wait for plugin routes to be registered
102
+ * before falling back to a 503 Service Unavailable. Without this, a wedged
103
+ * plugin would hang every request outside /.checkstack/* forever.
104
+ */
105
+ const READY_WAIT_TIMEOUT_MS = 30_000;
106
+
58
107
  // WebSocket handler instance (initialized during init)
59
108
  let wsHandler: ReturnType<typeof createWebSocketHandler> | undefined;
60
109
 
@@ -80,7 +129,82 @@ app.use(
80
129
  credentials: true,
81
130
  })
82
131
  );
83
- app.use("*", logger());
132
+ // Request/response logging through our rootLogger (winston) instead of
133
+ // hono/logger which bypasses winston and writes to stdout directly. Goes
134
+ // at debug level for healthy responses; warn for 4xx and error for 5xx so
135
+ // failures surface even with low verbosity. The 5xx branch additionally
136
+ // peeks the response body so the underlying error message lands in the
137
+ // log — Hono returns errors as JSON via `c.json({error}, 500)` which the
138
+ // default access log strips down to just the status code.
139
+ app.use("*", async (c, next) => {
140
+ const start = performance.now();
141
+ const method = c.req.method;
142
+ const path = c.req.path;
143
+ rootLogger.debug(`<-- ${method} ${path}`);
144
+ await next();
145
+ const elapsedMs = (performance.now() - start).toFixed(1);
146
+ const status = c.res.status;
147
+ const line = `--> ${method} ${path} ${status} ${elapsedMs}ms`;
148
+
149
+ if (status >= 500) {
150
+ let body: string | undefined;
151
+ try {
152
+ // Clone so the response stream remains consumable downstream.
153
+ body = await c.res.clone().text();
154
+ } catch {
155
+ // ignore — best-effort body capture only
156
+ }
157
+ rootLogger.error(body ? `${line} — ${body}` : line);
158
+ } else if (status >= 400) {
159
+ rootLogger.warn(line);
160
+ } else {
161
+ rootLogger.debug(line);
162
+ }
163
+ });
164
+
165
+ // =============================================================================
166
+ // PLATFORM ENDPOINTS — /.checkstack/*
167
+ // =============================================================================
168
+ //
169
+ // All "platform-level" endpoints (probes, future operator hooks) live under
170
+ // /.checkstack/* so they are clearly separated from plugin /api/*, runtime
171
+ // frontend assets, and the SPA wildcard. The leading dot keeps them out of
172
+ // any plugin URL space by construction.
173
+ //
174
+ // Health & readiness:
175
+ // - registered at module load; bypass the boot gate in `fetch()` so that
176
+ // orchestrators (Kubernetes, docker-compose) can probe a still-booting
177
+ // process.
178
+ // - /.checkstack/health = "process is alive"
179
+ // - /.checkstack/ready = "plugins initialized and all critical probes pass"
180
+
181
+ /** Liveness probe — answers as long as the process responds. */
182
+ app.get("/.checkstack/health", (c) => c.json({ status: "ok" }));
183
+
184
+ /**
185
+ * Readiness probe — aggregates plugin-contributed checks.
186
+ * - 503 while init is in flight or has failed
187
+ * - 503 if any critical probe is failing
188
+ * - 200 only when init completed AND all critical probes pass
189
+ */
190
+ app.get("/.checkstack/ready", async (c) => {
191
+ if (initError) {
192
+ return c.json(
193
+ { ready: false, error: initError.message, checks: [] },
194
+ 503,
195
+ { "Retry-After": "5" },
196
+ );
197
+ }
198
+ if (!initialized) {
199
+ return c.json(
200
+ { ready: false, reason: "initializing", checks: [] },
201
+ 503,
202
+ { "Retry-After": "1" },
203
+ );
204
+ }
205
+ const snapshot = await pluginManager.getReadinessRegistry().evaluate();
206
+ return c.json(snapshot, snapshot.ready ? 200 : 503);
207
+ });
84
208
 
85
209
  // SECURITY: Add missing standard security headers across all API responses
86
210
  app.use("/api/*", async (c, next) => {
@@ -185,14 +309,16 @@ if (frontendDistPath && fs.existsSync(frontendDistPath)) {
185
309
  };
186
310
 
187
311
  // Serve static assets (JS, CSS, images, etc.)
188
- app.get("/assets/*", async (c) => {
312
+ // Fall through to next() on miss so plugin-asset routes (registered later
313
+ // during init at /assets/plugins/:pluginName/*) get a chance to match.
314
+ app.get("/assets/*", async (c, next) => {
189
315
  const assetPath = c.req.path.replace("/assets/", "");
190
316
  const filePath = path.join(frontendDistPath, "assets", assetPath);
191
317
 
192
318
  if (fs.existsSync(filePath)) {
193
319
  return serveFile(c, filePath);
194
320
  }
195
- return c.notFound();
321
+ return next();
196
322
  });
197
323
 
198
324
  // Serve vendor scripts (externalized React, react-router-dom, etc.)
@@ -237,12 +363,6 @@ if (frontendDistPath && fs.existsSync(frontendDistPath)) {
237
363
  const init = async () => {
238
364
  rootLogger.info("🚀 Starting Checkstack Core...");
239
365
 
240
- // Register Plugin Installer Service
241
- const installer = new PluginLocalInstaller(
242
- path.join(process.cwd(), "runtime_plugins")
243
- );
244
- pluginManager.registerService(coreServices.pluginInstaller, installer);
245
-
246
366
  // 1. Run Core Migrations
247
367
  rootLogger.info("🔄 Running core migrations...");
248
368
  try {
@@ -294,6 +414,128 @@ const init = async () => {
294
414
  );
295
415
  pluginManager.registerService(coreServices.cacheManager, cacheManager);
296
416
 
417
+ // 1.9. Register Plugin Install Services (artifact store + installer registry)
418
+ rootLogger.debug("Registering plugin install services...");
419
+ const runtimePluginsDir = path.join(process.cwd(), "runtime_plugins");
420
+ fs.mkdirSync(runtimePluginsDir, { recursive: true });
421
+ const pluginArtifactStore = new PostgresPluginArtifactStore(db);
422
+ const pluginInstallerRegistry = new DefaultPluginInstallerRegistry({
423
+ runtimeDir: runtimePluginsDir,
424
+ artifactStore: pluginArtifactStore,
425
+ });
426
+ pluginManager.registerService(
427
+ coreServices.pluginArtifactStore,
428
+ pluginArtifactStore,
429
+ );
430
+ pluginManager.registerService(
431
+ coreServices.pluginInstallerRegistry,
432
+ pluginInstallerRegistry,
433
+ );
434
+ // Per-instance event recorder (instanceId is the bun process pid for now;
435
+ // upgrade to a stable instance id when multi-region deploys land).
436
+ const eventRecorder = new PluginEventRecorder(db, `bun-${process.pid}`);
437
+ pluginManager.setEventRecorder(eventRecorder);
438
+ pluginManager.setRuntimeDir(runtimePluginsDir);
439
+
440
+ // Tarball-upload endpoint backing the install UI's "Tarball Upload" tab.
441
+ //
442
+ // The user uploads a `.tgz` produced by `bunx @checkstack/scripts plugin-pack`
443
+ // (single-package or `--bundle` mode). We peek the bytes to derive the
444
+ // primary `(name, version)`, persist the artifact to plugin_artifacts, and
445
+ // return the `artifactId`. The frontend then submits a `PluginSource` of
446
+ // type "tarball" with that id to `previewInstall` / `install`.
447
+ //
448
+ // We deliberately keep this as a plain Hono route (not an oRPC procedure)
449
+ // because oRPC contracts are JSON-only — multipart bodies can't be
450
+ // expressed there. Auth + access are enforced manually below using the
451
+ // same auth service the rest of the platform uses.
452
+ app.post("/api/pluginmanager/upload-tarball", async (c) => {
453
+ const authService = await pluginManager.getService(coreServices.auth);
454
+ if (!authService) {
455
+ return c.json({ error: "Auth service not available" }, 503);
456
+ }
457
+ const user = await authService.authenticate(c.req.raw);
458
+ if (!user || user.type === "service") {
459
+ return c.json({ error: "Authentication required" }, 401);
460
+ }
461
+ const requiredAccess = `${pluginManagerMetadata.pluginId}.${pluginManagerAccess.install.id}`;
462
+ const accessRules = (
463
+ "accessRules" in user ? user.accessRules : []
464
+ ) as string[];
465
+ const anonymous = await authService.getAnonymousAccessRules();
466
+ if (!accessRules.includes(requiredAccess) && !anonymous.includes(requiredAccess)) {
467
+ return c.json({ error: "Access denied" }, 403);
468
+ }
469
+
470
+ const formData = await c.req.formData();
471
+ const file = formData.get("file");
472
+ if (!file || typeof file === "string") {
473
+ return c.json({ error: "Missing 'file' field in multipart body" }, 400);
474
+ }
475
+ const bytes = new Uint8Array(await file.arrayBuffer());
476
+ if (bytes.byteLength === 0) {
477
+ return c.json({ error: "Uploaded file is empty" }, 400);
478
+ }
479
+ if (bytes.byteLength > MAX_TARBALL_SIZE_BYTES) {
480
+ return c.json(
481
+ {
482
+ error: `Tarball exceeds maximum size: ${bytes.byteLength} > ${MAX_TARBALL_SIZE_BYTES} bytes`,
483
+ },
484
+ 413,
485
+ );
486
+ }
487
+
488
+ // Derive (name, version) by peeking the tarball. For bundle tarballs,
489
+ // use the primary's manifest entry; for single packages, the embedded
490
+ // package.json. Validation happens here too — a malformed tarball is
491
+ // rejected before any DB write.
492
+ let pluginName: string;
493
+ let version: string;
494
+ try {
495
+ const bundle = await tryExtractBundle(bytes);
496
+ if (bundle) {
497
+ pluginName = bundle.manifest.primary;
498
+ const primaryEntry = bundle.manifest.packages.find(
499
+ (p) => p.name === bundle.manifest.primary,
500
+ );
501
+ if (!primaryEntry) {
502
+ return c.json(
503
+ { error: `Bundle manifest missing primary entry '${pluginName}'` },
504
+ 400,
505
+ );
506
+ }
507
+ version = primaryEntry.version;
508
+ } else {
509
+ const meta = await extractPackageJson(bytes);
510
+ pluginName = meta.name;
511
+ version = meta.version;
512
+ }
513
+ } catch (error) {
514
+ return c.json(
515
+ { error: `Failed to peek tarball: ${extractErrorMessage(error)}` },
516
+ 400,
517
+ );
518
+ }
519
+
520
+ const { artifactId, contentHash } = await pluginArtifactStore.store({
521
+ pluginName,
522
+ version,
523
+ tarball: bytes,
524
+ });
525
+
526
+ rootLogger.info(
527
+ `📦 Tarball uploaded: ${pluginName}@${version} (artifactId=${artifactId}, ${bytes.byteLength} bytes)`,
528
+ );
529
+
530
+ return c.json({
531
+ artifactId,
532
+ pluginName,
533
+ version,
534
+ contentHash,
535
+ sizeBytes: bytes.byteLength,
536
+ });
537
+ });
538
+
297
539
  // Serve static assets for runtime frontend plugins only
298
540
  // Backend plugins don't need public assets - only frontend plugins do
299
541
  // e.g. /assets/plugins/my-plugin-frontend/index.js -> runtime_plugins/node_modules/my-plugin-frontend/dist/index.js
@@ -358,7 +600,88 @@ const init = async () => {
358
600
  }
359
601
 
360
602
  // 3. Load Plugins
361
- await pluginManager.loadPlugins(app);
603
+ //
604
+ // Dev-server mode (entered via `bunx @checkstack/scripts dev` from a
605
+ // plugin author's repo). Two env vars control it:
606
+ //
607
+ // - CHECKSTACK_DEV_PLUGIN_PATH: absolute path to a plugin module's
608
+ // directory whose `default` export is the BackendPlugin to load.
609
+ // When set, filesystem discovery is skipped — only this plugin and
610
+ // core services are loaded. Lets a plugin author iterate without
611
+ // a workspace checkout.
612
+ // - CHECKSTACK_DEV_AUTH=true: registers a synthetic auth service that
613
+ // auto-grants every access rule. Skips login flow entirely. Strictly
614
+ // refused on a known-prod NODE_ENV value to make accidental misuse
615
+ // loud.
616
+ const devPluginPath = process.env.CHECKSTACK_DEV_PLUGIN_PATH;
617
+ const devAuth = process.env.CHECKSTACK_DEV_AUTH === "true";
618
+ if (devAuth) {
619
+ if (process.env.NODE_ENV === "production") {
620
+ throw new Error(
621
+ "CHECKSTACK_DEV_AUTH=true is refused when NODE_ENV=production. " +
622
+ "Dev auth bypasses every access guard and must never run in prod.",
623
+ );
624
+ }
625
+ rootLogger.warn(
626
+ "🛠 Dev auth ENABLED — every access rule is auto-granted. Do NOT use in production.",
627
+ );
628
+ const devAuthService: AuthService = createDevAuthService({
629
+ getAllAccessRules: () => pluginManager.getAllAccessRules(),
630
+ });
631
+ pluginManager.registerService(coreServices.auth, devAuthService);
632
+ }
633
+
634
+ const manualPlugins: BackendPlugin[] = [];
635
+ if (devPluginPath) {
636
+ rootLogger.info(`🛠 Dev mode — loading plugin from ${devPluginPath}`);
637
+
638
+ // Co-load `@checkstack/*` backend deps the dev command resolved from
639
+ // the plugin's package.json. Without these, the plugin under dev's
640
+ // `init()` would hit unregistered services. The dev command always
641
+ // includes in-memory queue+cache providers when no other provider
642
+ // is in the dep graph, so coreServices.queueManager /
643
+ // coreServices.cacheManager have a registered strategy on boot.
644
+ const extraPathsRaw = process.env.CHECKSTACK_DEV_EXTRA_PLUGIN_PATHS;
645
+ const extraPaths: string[] = extraPathsRaw ? JSON.parse(extraPathsRaw) : [];
646
+ for (const extra of extraPaths) {
647
+ try {
648
+ const mod = await import(extra);
649
+ const exp = mod.default as BackendPlugin | undefined;
650
+ if (!exp || typeof exp.register !== "function") {
651
+ throw new Error(
652
+ `Module at ${extra} does not export a default BackendPlugin`,
653
+ );
654
+ }
655
+ manualPlugins.push(exp);
656
+ } catch (error) {
657
+ throw new Error(
658
+ `Failed to import co-loaded core plugin from ${extra}: ${extractErrorMessage(error)}`,
659
+ );
660
+ }
661
+ }
662
+
663
+ // Plugin under dev loads last; the platform's pendingInits topo-sort
664
+ // takes care of actual init order, but importing it last makes the
665
+ // boot log easier to read.
666
+ try {
667
+ const pluginModule = await import(devPluginPath);
668
+ const pluginExport = pluginModule.default as BackendPlugin | undefined;
669
+ if (!pluginExport || typeof pluginExport.register !== "function") {
670
+ throw new Error(
671
+ `Module at ${devPluginPath} does not export a default BackendPlugin`,
672
+ );
673
+ }
674
+ manualPlugins.push(pluginExport);
675
+ } catch (error) {
676
+ throw new Error(
677
+ `Failed to import dev plugin from ${devPluginPath}: ${extractErrorMessage(error)}`,
678
+ );
679
+ }
680
+ }
681
+
682
+ await pluginManager.loadPlugins(app, manualPlugins, {
683
+ skipDiscovery: !!devPluginPath,
684
+ });
362
685
 
363
686
  // 4. Wire up auth client for access-based signal filtering
364
687
  // This must happen AFTER plugins load so auth-backend is available
@@ -376,13 +699,28 @@ const init = async () => {
376
699
  );
377
700
  }
378
701
 
379
- // 5. Register plugin admin router (core admin endpoints)
380
- const pluginAdminRouter = createPluginAdminRouter({
702
+ // 4.5. Register the plugin-manager admin router (core router, not a regular
703
+ // plugin). Access rules from `@checkstack/pluginmanager-common` are also
704
+ // pushed into the access registry here so the autoAuthMiddleware can
705
+ // resolve them. We use the existing access-rule prefix scheme so the
706
+ // ids land as e.g. `pluginmanager.plugin.manage`.
707
+ pluginManager.registerCoreAccessRules(
708
+ pluginManagerMetadata.pluginId,
709
+ pluginManagerAccessRules,
710
+ );
711
+ pluginManager.registerCorePluginMetadata(pluginManagerMetadata);
712
+ const pluginManagerRouter = createPluginManagerRouter({
713
+ db,
381
714
  pluginManager,
382
- installer,
715
+ registry: pluginManager.getRegistry(),
716
+ eventRecorder,
717
+ workspaceRoot: path.resolve(import.meta.dir, "..", "..", ".."),
718
+ runtimeDir: runtimePluginsDir,
383
719
  });
384
- // Register as core router - available at /api/core/
385
- pluginManager.registerCoreRouter("core", pluginAdminRouter);
720
+ pluginManager.registerCoreRouter(
721
+ pluginManagerMetadata.pluginId,
722
+ pluginManagerRouter,
723
+ );
386
724
 
387
725
  // 5. Setup lifecycle listeners for multi-instance coordination
388
726
  await pluginManager.setupLifecycleListeners();
@@ -441,16 +779,117 @@ const init = async () => {
441
779
  logger: rootLogger.child({ service: "WebSocket" }),
442
780
  });
443
781
 
782
+ // Register the core "init" readiness probe. Plugin-contributed probes are
783
+ // additive — see coreServices.readinessRegistry for the plugin-facing API.
784
+ pluginManager.getReadinessRegistry().register({
785
+ name: "core.init",
786
+ critical: true,
787
+ check: async () => ({ ok: initialized, message: initialized ? undefined : "init not complete" }),
788
+ });
789
+
444
790
  rootLogger.info("✅ Checkstack Core initialized.");
445
791
  };
446
792
 
447
- void init();
793
+ /**
794
+ * Fire-and-forget init. We deliberately don't `await` at the top level so the
795
+ * server can answer /.checkstack/health and /.checkstack/ready while plugins are still loading;
796
+ * non-bypass requests are gated via `waitForRoutesReady()` below.
797
+ */
798
+ // eslint-disable-next-line unicorn/prefer-top-level-await -- intentionally non-blocking; gates handled in waitForRoutesReady()
799
+ void (async () => {
800
+ try {
801
+ await init();
802
+ initialized = true;
803
+ } catch (error: unknown) {
804
+ initError = new Error(extractErrorMessage(error, "init failed"));
805
+ rootLogger.error(
806
+ "❌ FATAL: Checkstack Core init failed; the process will exit so the supervisor can restart it.",
807
+ initError,
808
+ );
809
+ // Give the logger ~50 ms to flush, then exit so docker/k8s restarts us.
810
+ // A half-initialized backend silently serves broken state — restart is
811
+ // strictly better than continuing. We disable the no-process-exit rule
812
+ // because this IS the canonical fail-fast pattern for a long-running
813
+ // server entrypoint.
814
+ setTimeout(() => {
815
+ // eslint-disable-next-line unicorn/no-process-exit -- intentional fail-fast on init failure
816
+ process.exit(1);
817
+ }, 50);
818
+ }
819
+ })();
820
+
821
+ /**
822
+ * Paths that bypass the boot gate. Platform endpoints under /.checkstack/*
823
+ * MUST be reachable while the backend is still booting so orchestrators can
824
+ * probe it. Everything else waits until plugin routes are registered.
825
+ */
826
+ const BOOT_BYPASS_PREFIX = "/.checkstack/";
827
+
828
+ /**
829
+ * Wait until plugin RPC routes are registered on the root router (resolved
830
+ * inside `loadPlugins` BEFORE Phase 2 / `afterPluginsReady`). Returns:
831
+ * - undefined when routes are ready → caller should proceed to Hono.
832
+ * - a 503 Response when init failed or the wait timed out.
833
+ *
834
+ * Why this gate, and why at this specific point:
835
+ * - Earlier (before /api/:pluginId/* is added), an incoming request would
836
+ * short-circuit through the SPA wildcard or 404 because the plugin route
837
+ * simply doesn't exist yet on the router.
838
+ * - Later (after full init), self-referencing RPC calls made from
839
+ * `afterPluginsReady` would deadlock waiting for init to complete — so
840
+ * we MUST open the gate before `afterPluginsReady` runs.
841
+ * - `loadPlugins()` resolves `routesReadyPromise` immediately after
842
+ * `registerApiRoute()`, which is the earliest point both conditions hold.
843
+ */
844
+ async function waitForRoutesReady(): Promise<Response | undefined> {
845
+ if (initError) {
846
+ return Response.json(
847
+ { error: "Backend init failed", message: initError.message },
848
+ { status: 503, headers: { "Retry-After": "5" } },
849
+ );
850
+ }
851
+ let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
852
+ // pluginManager.routesReadyPromise resolves from inside loadPlugins; it
853
+ // never rejects. The init catch handler logs + process.exit's separately.
854
+ const timedOut = await Promise.race([
855
+ pluginManager.routesReadyPromise.then(() => false),
856
+ new Promise<true>((resolve) => {
857
+ timeoutHandle = setTimeout(() => resolve(true), READY_WAIT_TIMEOUT_MS);
858
+ }),
859
+ ]);
860
+ if (timeoutHandle) clearTimeout(timeoutHandle);
861
+ if (timedOut) {
862
+ return Response.json(
863
+ { error: "Backend not ready", message: "boot timeout" },
864
+ { status: 503, headers: { "Retry-After": "5" } },
865
+ );
866
+ }
867
+ // Re-read after await — init may have rejected while we were waiting.
868
+ const errAfter = initError as Error | undefined;
869
+ if (errAfter) {
870
+ return Response.json(
871
+ { error: "Backend init failed", message: errAfter.message },
872
+ { status: 503, headers: { "Retry-After": "5" } },
873
+ );
874
+ }
875
+ return undefined;
876
+ }
448
877
 
449
878
  // Custom fetch handler that handles WebSocket upgrades
450
879
  const fetch = async (
451
880
  req: Request,
452
881
  server: Server<ServerWsData>
453
882
  ): Promise<Response | undefined> => {
883
+ const url = new URL(req.url);
884
+
885
+ // Platform endpoints (/.checkstack/*) bypass the boot gate so orchestrators
886
+ // can poll a booting process. Everything else waits until plugin routes
887
+ // are registered on the root router (resolved before Phase 2 init runs).
888
+ if (!url.pathname.startsWith(BOOT_BYPASS_PREFIX)) {
889
+ const stalled = await waitForRoutesReady();
890
+ if (stalled) return stalled;
891
+ }
892
+
454
893
  // Set the server reference for WebSocket pub/sub after startup
455
894
  if (wsHandler && !server.upgrade) {
456
895
  // Server doesn't support WebSocket upgrade (shouldn't happen with Bun)
@@ -461,8 +900,6 @@ const fetch = async (
461
900
  // Cast is safe: signal handler only reads its own fields via connectionType guard
462
901
  wsHandler?.setServer(server as unknown as Server<WebSocketData>);
463
902
 
464
- const url = new URL(req.url);
465
-
466
903
  // Handle WebSocket upgrade for signals
467
904
  if (url.pathname === "/api/signals/ws") {
468
905
  // Try to authenticate, but allow anonymous connections for broadcast signals