threadforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/README.md +152 -0
  3. package/bin/forge.js +1050 -0
  4. package/bin/host-commands.js +344 -0
  5. package/bin/platform-commands.js +570 -0
  6. package/package.json +71 -0
  7. package/shared/auth.js +475 -0
  8. package/src/core/DirectMessageBus.js +364 -0
  9. package/src/core/EndpointResolver.js +247 -0
  10. package/src/core/ForgeContext.js +2227 -0
  11. package/src/core/ForgeHost.js +122 -0
  12. package/src/core/ForgePlatform.js +145 -0
  13. package/src/core/Ingress.js +768 -0
  14. package/src/core/Interceptors.js +420 -0
  15. package/src/core/MessageBus.js +310 -0
  16. package/src/core/Prometheus.js +305 -0
  17. package/src/core/RequestContext.js +413 -0
  18. package/src/core/RoutingStrategy.js +316 -0
  19. package/src/core/Supervisor.js +1306 -0
  20. package/src/core/ThreadAllocator.js +196 -0
  21. package/src/core/WorkerChannelManager.js +879 -0
  22. package/src/core/config.js +624 -0
  23. package/src/core/host-config.js +311 -0
  24. package/src/core/network-utils.js +166 -0
  25. package/src/core/platform-config.js +308 -0
  26. package/src/decorators/ServiceProxy.js +899 -0
  27. package/src/decorators/index.js +571 -0
  28. package/src/deploy/NginxGenerator.js +865 -0
  29. package/src/deploy/PlatformManifestGenerator.js +96 -0
  30. package/src/deploy/RouteManifestGenerator.js +112 -0
  31. package/src/deploy/index.js +984 -0
  32. package/src/frontend/FrontendDevLifecycle.js +65 -0
  33. package/src/frontend/FrontendPluginOrchestrator.js +187 -0
  34. package/src/frontend/SiteResolver.js +63 -0
  35. package/src/frontend/StaticMountRegistry.js +90 -0
  36. package/src/frontend/index.js +5 -0
  37. package/src/frontend/plugins/index.js +2 -0
  38. package/src/frontend/plugins/viteFrontend.js +79 -0
  39. package/src/frontend/types.js +35 -0
  40. package/src/index.js +56 -0
  41. package/src/internals.js +31 -0
  42. package/src/plugins/PluginManager.js +537 -0
  43. package/src/plugins/ScopedPostgres.js +192 -0
  44. package/src/plugins/ScopedRedis.js +142 -0
  45. package/src/plugins/index.js +1729 -0
  46. package/src/registry/ServiceRegistry.js +796 -0
  47. package/src/scaling/ScaleAdvisor.js +442 -0
  48. package/src/services/Service.js +195 -0
  49. package/src/services/worker-bootstrap.js +676 -0
  50. package/src/templates/auth-service.js +65 -0
  51. package/src/templates/identity-service.js +75 -0
@@ -0,0 +1,676 @@
+ /**
+  * Worker Bootstrap v3
+  *
+  * Loads services, builds proxy clients, and injects them.
+  *
+  * After bootstrap, a service can call:
+  *   await this.users.getUser('123')
+  *
+  * Which transparently routes through:
+  *   - Direct function call (colocated services, same process)
+  *   - UDS (different process, same machine)
+  *   - Supervisor IPC fallback (startup race)
+  */
+
+ import fs from "node:fs";
+ import path from "node:path";
+ import { pathToFileURL } from "node:url";
+ import { EndpointResolver } from "../core/EndpointResolver.js";
+ import { ForgeContext, NOT_HANDLED } from "../core/ForgeContext.js";
+ import { RequestContext } from "../core/RequestContext.js";
+ import { buildServiceProxies, createServiceProxy } from "../decorators/ServiceProxy.js";
+ import { resolveStaticMountsForService } from "../frontend/SiteResolver.js";
+
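To make the header's promise concrete, here is a sketch of the calling side this bootstrap enables. The `Orders` class, the `users`/`getUser` names, and the `threadforge` import path are illustrative assumptions, not part of the package; `this.users` is the proxy injected in Phase 3 below.

    import { Service } from "threadforge"; // assumed export location for the Service base class

    export default class Orders extends Service {
      async createOrder(userId) {
        // The same call works whether `users` is colocated (direct function call),
        // in another process on this machine (UDS), or still starting (IPC fallback).
        const user = await this.users.getUser(userId);
        return { owner: user.id };
      }
    }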
+ // A6: Consolidate all env var reads into a single structured config object
+ function parseWorkerConfig() {
+   return {
+     groupName: process.env.FORGE_GROUP_NAME,
+     serviceEntries: process.env.FORGE_SERVICE_ENTRIES,
+     serviceNames: process.env.FORGE_SERVICE_NAMES,
+     port: process.env.FORGE_PORT,
+     workerId: process.env.FORGE_WORKER_ID,
+     threadCount: process.env.FORGE_THREAD_COUNT,
+     mode: process.env.FORGE_MODE,
+     serviceTypes: process.env.FORGE_SERVICE_TYPES,
+     channels: process.env.FORGE_CHANNELS,
+     hostMeta: process.env.FORGE_HOST_META,
+     registryMode: process.env.FORGE_REGISTRY_MODE ?? "embedded",
+     registryHost: process.env.FORGE_HOST,
+     servicePorts: process.env.FORGE_SERVICE_PORTS,
+     plugins: process.env.FORGE_PLUGINS,
+     servicePlugins: process.env.FORGE_SERVICE_PLUGINS,
+     configPath: process.env.FORGE_CONFIG_PATH,
+     serviceEndpoints: process.env.FORGE_SERVICE_ENDPOINTS,
+     sites: process.env.FORGE_SITES,
+     sitesFile: process.env.FORGE_SITES_FILE,
+   };
+ }
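For orientation, a minimal sketch of how a parent process might populate these variables when forking this bootstrap. The variable names are the ones read above; the fork call, file path, and values are illustrative and not necessarily how the package's own Supervisor does it.

    import { fork } from "node:child_process";

    // Hypothetical supervisor-side spawn of one worker:
    fork("node_modules/threadforge/src/services/worker-bootstrap.js", [], {
      env: {
        ...process.env,
        FORGE_GROUP_NAME: "api",
        FORGE_SERVICE_ENTRIES: "users=./services/users.js,billing=./services/billing.js",
        FORGE_SERVICE_NAMES: "users,billing",
        FORGE_PORT: "3000",
        FORGE_WORKER_ID: "0",
        FORGE_THREAD_COUNT: "4",
        FORGE_MODE: "production",
      },
    });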
+
+ const workerConfig = parseWorkerConfig();
+ const hostMeta = workerConfig.hostMeta ? JSON.parse(workerConfig.hostMeta) : null;
+ let sites = null;
+ let sitesPayload = workerConfig.sites;
+ if (!sitesPayload && workerConfig.sitesFile) {
+   try {
+     sitesPayload = fs.readFileSync(workerConfig.sitesFile, "utf8");
+   } catch (err) {
+     console.warn(`[ThreadForge] Could not read FORGE_SITES_FILE, static mounts disabled: ${err.message}`);
+   }
+ }
+ if (sitesPayload) {
+   try {
+     sites = JSON.parse(sitesPayload);
+   } catch (err) {
+     console.warn(`[ThreadForge] Invalid FORGE_SITES payload, static mounts disabled: ${err.message}`);
+   }
+ }
+
+ async function _deliverRemoteEvent(url, body, headers, serviceName, maxRetries = 3) {
+   for (let attempt = 0; attempt < maxRetries; attempt++) {
+     try {
+       const resp = await fetch(url, {
+         method: 'POST',
+         headers,
+         body: JSON.stringify(body),
+         signal: AbortSignal.timeout(5000),
+       });
+       if (resp.ok) return;
+       // Non-retryable status codes
+       if (resp.status >= 400 && resp.status < 500) return;
+     } catch (err) {
+       if (attempt === maxRetries - 1) {
+         console.error(`[ThreadForge] Remote event delivery to ${serviceName} failed after ${maxRetries} attempts: ${err.message}`);
+         return;
+       }
+     }
+     // Exponential backoff: 100ms, 200ms, 400ms
+     await new Promise(resolve => setTimeout(resolve, 100 * 2 ** attempt));
+   }
+ }
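An illustrative call, mirroring how `_emitEvent` uses this helper later in the file; the host, port, and payload values are examples.

    await _deliverRemoteEvent(
      "http://10.0.0.7:8701/__forge/event",                          // example endpoint
      { from: "users", event: "user.created", data: { id: "123" } }, // wire body shape from _emitEvent
      { "Content-Type": "application/json" },
      "billing",                                                     // serviceName, used only in the failure log
    );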
+
+ /**
+  * Look up which project owns a given service name.
+  * Returns { projectId, schema, keyPrefix } or null.
+  */
+ function resolveProjectForService(serviceName) {
+   if (!hostMeta) return null;
+   for (const [projectId, meta] of Object.entries(hostMeta)) {
+     if (meta.services.includes(serviceName)) {
+       return { projectId, schema: meta.schema, keyPrefix: meta.keyPrefix };
+     }
+   }
+   return null;
+ }
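The field accesses above imply a FORGE_HOST_META value shaped roughly like the following sketch; the project IDs and values are illustrative.

    // Illustrative FORGE_HOST_META (shape inferred from the accesses above):
    process.env.FORGE_HOST_META = JSON.stringify({
      "proj-a": { services: ["users", "billing"], schema: "proj_a", keyPrefix: "proj_a:" },
      "proj-b": { services: ["search"], schema: "proj_b", keyPrefix: "proj_b:" },
    });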
+
+ // Hoisted reference so emergency shutdown handlers can access it
+ /** @type {Map<string, {service: object, ctx: import('../core/ForgeContext.js').ForgeContext}>|null} */
+ let _localServices = null;
+ let emergencyShutdownPromise = null;
+ // M-CORE-4: Module-level flag so emergencyShutdown can check if normal shutdown is running
+ let shutdownInProgress = false;
+
+ // Track unhandled rejections — exit only if 5+ within 60 seconds
+ let _rejectionCount = 0;
+ let _rejectionWindowStart = Date.now();
+ const REJECTION_THRESHOLD = 5;
+ const REJECTION_WINDOW_MS = 60_000;
+
+ async function emergencyShutdown(reason, err) {
+   if (emergencyShutdownPromise) {
+     await emergencyShutdownPromise.catch(() => {});
+     process.exit(1);
+     return;
+   }
+   // M-CORE-4: Skip service._stop() if normal shutdown is already running
+   // to prevent double-cleanup of services
+   const skipServiceStop = shutdownInProgress;
+   emergencyShutdownPromise = (async () => {
+     console.error(`[ThreadForge] ${reason}:`, err);
+     try {
+       if (_localServices && !skipServiceStop) {
+         for (const [, { service }] of _localServices) {
+           try {
+             await Promise.race([
+               service._stop?.(),
+               new Promise((_, reject) => setTimeout(() => reject(new Error('Stop timed out')), 5000)),
+             ]);
+           } catch {}
+         }
+       }
+     } catch {}
+     process.exit(1);
+   })();
+   await emergencyShutdownPromise;
+ }
+
+ function handleUnhandledRejection(err) {
+   const now = Date.now();
+
+   // Reset window if it has elapsed
+   if (now - _rejectionWindowStart > REJECTION_WINDOW_MS) {
+     _rejectionCount = 0;
+     _rejectionWindowStart = now;
+   }
+   _rejectionCount++;
+
+   console.error(`[ThreadForge] SEVERE: Unhandled rejection (${_rejectionCount}/${REJECTION_THRESHOLD} in window):`, err);
+   if (err?.stack) console.error(err.stack);
+
+   if (_rejectionCount >= REJECTION_THRESHOLD) {
+     console.error(`[ThreadForge] ${REJECTION_THRESHOLD}+ unhandled rejections within ${REJECTION_WINDOW_MS / 1000}s — exiting`);
+     emergencyShutdown("Repeated unhandled rejections", err);
+   }
+ }
+
+ async function bootstrap() {
+   // RT-H2: Register emergency handlers early so exceptions during any phase get cleanup
+   process.on("uncaughtException", (err) => emergencyShutdown("Uncaught exception", err));
+   process.on("unhandledRejection", (err) => handleUnhandledRejection(err));
+
+   // RT-C2: Re-entrancy guard for graceful shutdown (uses module-level `shutdownInProgress`)
+
+   // H-RT-4: Register signal handlers early so SIGTERM/SIGINT during any phase triggers cleanup
+   process.once("SIGTERM", () => shutdown("SIGTERM"));
+   process.once("SIGINT", () => shutdown("SIGINT"));
+
+   const entries = workerConfig.serviceEntries.split(",")
+     .filter(e => e.trim())
+     .map((e) => {
+       const eqIdx = e.indexOf("=");
+       if (eqIdx === -1) {
+         throw new Error(`Invalid FORGE_SERVICE_ENTRIES format: "${e}". Expected "name=path".`);
+       }
+       const name = e.slice(0, eqIdx).trim();
+       const entryPath = e.slice(eqIdx + 1).trim();
+       if (!name || !entryPath) {
+         throw new Error(`Invalid FORGE_SERVICE_ENTRIES entry: "${e}". Name and path are both required.`);
+       }
+       return { name, entry: entryPath };
+     });
+
+   if (entries.length === 0) {
+     throw new Error("FORGE_SERVICE_ENTRIES is empty or contains no valid entries");
+   }
+
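For example, the format accepted by the parser above:

    // FORGE_SERVICE_ENTRIES="users=./services/users.js,billing=./services/billing.js"
    // parses to:
    //   [
    //     { name: "users", entry: "./services/users.js" },
    //     { name: "billing", entry: "./services/billing.js" },
    //   ]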
+   const typeMap = {};
+   if (workerConfig.serviceTypes) {
+     for (const pair of workerConfig.serviceTypes.split(",")) {
+       const [name, type] = pair.split("=");
+       typeMap[name] = type;
+     }
+   }
+
+   // Parse declared channels to know which services we might talk to
+   let declaredChannels = [];
+   try {
+     declaredChannels = JSON.parse(workerConfig.channels || "[]");
+   } catch {}
+
+   const port = parseInt(workerConfig.port, 10);
+   const workerId = parseInt(workerConfig.workerId, 10);
+   const threadCount = parseInt(workerConfig.threadCount, 10);
+
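Example values for the two formats parsed above; the service names are illustrative, while "edge" and "internal" are the two types this file distinguishes later.

    // FORGE_SERVICE_TYPES="gateway=edge,users=internal"
    // FORGE_CHANNELS='[{"from":"gateway","to":"users"},{"from":"users","to":"billing"}]'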
+   // Phase 1: Load all service modules (P13: parallel imports)
+   /** @type {Map<string, {ServiceClass: Function, instance: object}>} */
+   const loaded = new Map();
+   /** @type {Map<string, Function>} service name → class (for proxy building) */
+   const serviceClasses = new Map();
+
+   const resolvedEntries = entries.map(({ name, entry }) => ({
+     name,
+     entry,
+     url: pathToFileURL(path.resolve(process.cwd(), entry)).href,
+   }));
+
+   const importResults = await Promise.all(
+     resolvedEntries.map(async ({ name, entry, url }) => {
+       try {
+         const mod = await import(url);
+         return { name, entry, mod, error: null };
+       } catch (err) {
+         return { name, entry, mod: null, error: err };
+       }
+     }),
+   );
+
+   for (const { name, entry, mod, error } of importResults) {
+     if (error) {
+       console.error(`[ThreadForge] Failed to load service "${name}" from ${entry}: ${error.message}`);
+       if (error.stack) console.error(error.stack);
+       throw new Error(`Service "${name}" failed to load from "${entry}": ${error.message}`);
+     }
+     const ServiceClass = mod.default ?? mod;
+
+     if (typeof ServiceClass !== "function") {
+       throw new Error(`Service "${entry}" must export a class. Got: ${typeof ServiceClass}`);
+     }
+
+     const instance = new ServiceClass();
+     loaded.set(name, { ServiceClass, instance });
+     serviceClasses.set(name, ServiceClass);
+   }
+
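A minimal module that satisfies this loader, as a sketch: the loader only requires a default-exported class constructible with no arguments. The `threadforge` import path is an assumption; the `onRequest` hook name comes from the `localRequest` dispatch later in this file.

    // ./services/users.js (hypothetical)
    import { Service } from "threadforge"; // adjust to wherever Service is actually exported

    export default class Users extends Service {
      // Invoked for cross-service requests routed through localRequest()/proxies
      async onRequest(from, payload) {
        return { ok: true, echo: payload };
      }
    }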
+   // Also register remote service classes if we know about them
+   // (from channels config). For remote services we don't have the
+   // class, so proxies will use dynamic dispatch.
+   const allConnectedServices = new Set();
+   for (const ch of declaredChannels) {
+     allConnectedServices.add(ch.from);
+     allConnectedServices.add(ch.to);
+   }
+
+   // Phase 2: Create contexts and local service registry
+   /** @type {Map<string, {service: object, ctx: ForgeContext}>} */
+   const localServices = new Map();
+   _localServices = localServices;
+
+   function localSend(fromName, target, payload) {
+     const local = localServices.get(target);
+     if (!local) return false;
+     Promise.resolve(local.service.onMessage(fromName, payload)).catch((err) => {
+       local.ctx?.logger?.error?.("onMessage error", { from: fromName, error: err.message });
+       local.ctx?.metrics?.increment?.("forge_local_send_errors_total", { target });
+     });
+     return true;
+   }
+
+   async function localRequest(fromName, target, payload) {
+     const local = localServices.get(target);
+     if (local) {
+       return local.service.onRequest(fromName, payload);
+     }
+     return NOT_HANDLED;
+   }
+
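Dispatch semantics of the two helpers, assuming services "a" and "b" are colocated in this worker and "c" is not:

    localSend("a", "b", payload);          // true: fire-and-forget b.onMessage("a", payload), errors logged
    await localRequest("a", "b", payload); // returns b.onRequest("a", payload)
    await localRequest("a", "c", payload); // NOT_HANDLED; the caller (ForgeContext) presumably
                                           // falls back to UDS/IPC, per the header comment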
+   // Create EndpointResolver from env — shared by all contexts in this worker
+   const endpointResolver = EndpointResolver.fromEnv();
+
+   for (const [name, { instance }] of loaded) {
+     const serviceType = typeMap[name] ?? "internal";
+     const isEdge = serviceType === "edge";
+
+     const ctx = new ForgeContext({
+       serviceName: name,
+       port: isEdge ? port : 0,
+       workerId,
+       threadCount,
+       mode: workerConfig.mode,
+       serviceType,
+       sendIPC: (msg) => {
+         if (process.send) process.send(msg);
+       },
+       localSend: (target, payload) => localSend(name, target, payload),
+       localRequest: (target, payload) => localRequest(name, target, payload),
+       staticMounts: resolveStaticMountsForService(name, sites),
+     });
+
+     ctx._endpointResolver = endpointResolver;
+
+     const projectInfo = resolveProjectForService(name);
+     if (projectInfo) {
+       ctx._projectId = projectInfo.projectId;
+       ctx._projectSchema = projectInfo.schema;
+       ctx._projectKeyPrefix = projectInfo.keyPrefix;
+     }
+
+     ctx._emitEvent = (eventName, data) => {
+       const eventPayload = { __forge_event: eventName, __forge_data: data };
+
+       for (const ch of declaredChannels) {
+         let target;
+         if (ch.from === name) target = ch.to;
+         else if (ch.to === name) target = ch.from;
+         else continue;
+
+         if (localSend(name, target, eventPayload)) continue;
+
+         const endpoint = endpointResolver.resolve(target);
+         if (endpoint?.remote) {
+           // Propagate RequestContext so events can be traced back to originating requests
+           const rctx = RequestContext.current();
+           const headers = { "Content-Type": "application/json" };
+           if (rctx) Object.assign(headers, rctx.toHeaders());
+           _deliverRemoteEvent(
+             `http://${endpoint.host}:${endpoint.port}/__forge/event`,
+             { from: name, event: eventName, data },
+             headers,
+             target,
+           );
+         } else {
+           ctx.send(target, eventPayload);
+         }
+       }
+     };
+
+     await instance._init(ctx);
+     ctx._serviceInstance = instance; // for /__forge/invoke endpoint
+     localServices.set(name, { service: instance, ctx });
+   }
+
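Concretely, a call like `ctx._emitEvent("user.created", { id: "123" })` from a service named "users" reaches each channel peer in one of two shapes, both taken from the code above:

    // Local peer:  onMessage("users", { __forge_event: "user.created", __forge_data: { id: "123" } })
    // Remote peer: HTTP POST http://<host>:<port>/__forge/event
    //              body: { from: "users", event: "user.created", data: { id: "123" } }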
+   // Phase 2b: Dynamic registry discovery
+   if (workerConfig.registryMode !== "embedded") {
+     try {
+       const { ServiceRegistry } = await import("../registry/ServiceRegistry.js");
+       const workerRegistry = new ServiceRegistry({
+         mode: workerConfig.registryMode,
+         host: workerConfig.registryHost || undefined,
+       });
+
+       workerRegistry.on("discovered", (reg) => {
+         if (reg.ports?.http) {
+           endpointResolver.set(reg.name, {
+             host: reg.host,
+             port: reg.ports.http,
+             remote: true,
+           });
+         }
+       });
+
+       workerRegistry.on("removed", (reg) => {
+         if (reg.ports?.http) {
+           endpointResolver.remove(reg.name, reg.host, reg.ports.http);
+         }
+       });
+
+       await workerRegistry.start();
+     } catch (err) {
+       // Don't crash on registry errors — static endpoints still work
+       console.error(`[ThreadForge] Worker registry init failed: ${err.message}`);
+     }
+   }
+
+   // Phase 3: Build and inject proxy clients
+   // A2: Delegate all proxy creation to ServiceProxy — no inline retry/circuit-breaker reimplementation
+   const servicePorts = JSON.parse(workerConfig.servicePorts || "{}");
+
+   for (const [name, { service }] of localServices) {
+     const ctx = localServices.get(name).ctx;
+     const proxies = buildServiceProxies(ctx, serviceClasses, localServices);
+
+     for (const svcName of allConnectedServices) {
+       if (svcName === name) continue;
+       if (proxies[svcName]) continue;
+
+       // Service not in serviceClasses (remote/external) — use dynamic proxy via ServiceProxy
+       proxies[svcName] = createServiceProxy(ctx, svcName, null, null, {});
+     }
+
+     service._setProxies(proxies);
+   }
+
+   // Phase 4: Connect plugins and inject into services
+   let pluginManager = null;
+   try {
+     const pluginNames = JSON.parse(workerConfig.plugins || "[]");
+     const servicePluginMap = JSON.parse(workerConfig.servicePlugins || "{}");
+
+     if (pluginNames.length > 0) {
+       const { PluginManager } = await import("../plugins/PluginManager.js");
+       pluginManager = new PluginManager();
+
+       if (workerConfig.configPath) {
+         const configMod = await import(workerConfig.configPath);
+         const config = configMod.default ?? configMod;
+         if (config.plugins) {
+           let plugins = config.plugins;
+
+           if (hostMeta) {
+             const { scopedPostgres } = await import("../plugins/ScopedPostgres.js");
+             const { scopedRedis } = await import("../plugins/ScopedRedis.js");
+             plugins = plugins.map((p) => {
+               if (p.name === "postgres") return scopedPostgres(p._options ?? {});
+               if (p.name === "redis") return scopedRedis(p._options ?? {});
+               return p;
+             });
+           }
+
+           pluginManager.register(plugins);
+         }
+       }
+
+       // P14: Connect plugins for all services in parallel
+       const pluginEntries = [...localServices.entries()];
+       const pluginResults = await Promise.all(
+         pluginEntries.map(async ([svcName, { service, ctx }]) => {
+           const svcPlugins = servicePluginMap[svcName];
+           const clients = await pluginManager.connectForService(svcPlugins, ctx);
+           return { svcName, service, ctx, clients, svcPlugins };
+         }),
+       );
+
+       for (const { service, ctx, clients, svcPlugins } of pluginResults) {
+         // Inject clients as properties on the service
+         for (const [injectName, client] of clients) {
+           if (injectName.startsWith("_")) continue; // skip internal plugins (cors, etc.)
+           service[injectName] = client;
+         }
+
+         // Apply plugin middleware
+         const middleware = pluginManager.getMiddleware(svcPlugins);
+         for (const mw of middleware) {
+           ctx.router.use(mw);
+         }
+
+         // Attach websocket lifecycle hooks for this service
+         ctx._wsPluginHooks = pluginManager.getWebSocketHooks(svcPlugins);
+       }
+     }
+   } catch (err) {
+     // Don't crash on plugin errors — log and continue
+     console.error(`[ThreadForge] Plugin init failed for ${workerConfig.groupName}: ${err.message}`);
+     if (err.stack) console.error(err.stack);
+     for (const [, { ctx }] of localServices) {
+       try {
+         ctx?.logger?.error(`Plugin init failed: ${err.message}`);
+       } catch {}
+     }
+   }
+
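Illustrative env values for this phase. FORGE_PLUGINS is parsed as a JSON array and FORGE_SERVICE_PLUGINS as a JSON map, per the code above; the per-service value shape is whatever `PluginManager.connectForService` accepts, assumed here to be an array of plugin names.

    // FORGE_PLUGINS='["postgres","redis"]'
    // FORGE_SERVICE_PLUGINS='{"users":["postgres"],"billing":["postgres","redis"]}'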
+   // O4: Correct shutdown order — stop accepting connections first, then drain, then stop services, then disconnect plugins
+   // RT-C1: shutdown is a function declaration (hoisted) to avoid TDZ when called from IPC handler
+   // RT-C2: re-entrancy guard prevents double shutdown from SIGTERM + IPC race
+   async function shutdown(signal) {
+     if (shutdownInProgress) return;
+     shutdownInProgress = true;
+
+     for (const [name, { ctx }] of localServices) {
+       ctx.logger.info(`Received ${signal}, shutting down ${name}...`);
+     }
+
+     // Step 1: Stop accepting new connections (server.close())
+     const serverClosePromises = [];
+     for (const [, { ctx }] of localServices) {
+       if (ctx._server) {
+         serverClosePromises.push(
+           new Promise((resolve) => {
+             ctx._server.close(() => resolve());
+           })
+         )
+       }
+     }
+
+     // Step 2: Wait for in-flight requests to drain (up to 5s)
+     if (serverClosePromises.length > 0) {
+       await Promise.race([
+         Promise.all(serverClosePromises),
+         new Promise((resolve) => setTimeout(resolve, 5000)),
+       ]);
+     }
+
+     // Step 3: Stop services (onStop hooks, with 5s timeout per service)
+     for (const [name, { service, ctx }] of localServices) {
+       try {
+         await Promise.race([
+           service._stop(),
+           new Promise((_, reject) => setTimeout(() => reject(new Error(`Service "${name}" stop timed out after 5s`)), 5000)),
+         ]);
+       } catch (err) {
+         ctx.logger.error("Shutdown error", { error: err.message });
+       }
+     }
+
+     // Step 4: Disconnect all plugins
+     if (pluginManager) {
+       try {
+         const errors = await pluginManager.disconnectAll(localServices.values().next().value?.ctx?.logger);
+         if (errors.length > 0) {
+           console.warn(`[ThreadForge] ${errors.length} plugins failed to disconnect cleanly`);
+         }
+       } catch (err) {
+         console.warn(`[ThreadForge] Plugin disconnect error: ${err.message}`);
+       }
+     }
+
+     process.exit(0);
+   }
+
+   // Wire IPC before starting so no messages are lost during startup
+   process.on("message", (msg, _handle) => {
+     if (typeof msg !== "object" || msg === null || !msg.type) return;
+
+     if (msg.type === "forge:shutdown") {
+       // Stop accepting new requests, let in-flight complete
+       shutdown("forge:shutdown");
+       return;
+     }
+
+     // IPC-C3: Socket messages handled only by WorkerChannelManager — skip here to avoid duplicates
+     if (msg.type === "forge:init-socket" || msg.type === "forge:socket-registry") return;
+
+     if (msg.type === "forge:health-check") {
+       for (const [, { ctx }] of localServices) {
+         ctx._handleIPCMessage(msg);
+       }
+       return;
+     }
+
+     if (msg.type === "forge:metrics-snapshot") {
+       if (!process.send) return;
+       try {
+         const chunks = [];
+         for (const [, { ctx }] of localServices) {
+           if (ctx?.metrics?.expose) {
+             chunks.push(ctx.metrics.expose());
+           }
+         }
+         process.send({
+           type: "forge:metrics-snapshot-response",
+           requestId: msg.requestId,
+           metrics: chunks.join("\n"),
+         });
+       } catch (err) {
+         process.send({
+           type: "forge:metrics-snapshot-response",
+           requestId: msg.requestId,
+           error: err.message,
+         });
+       }
+       return;
+     }
+
+     if (msg.type === "forge:message" || msg.type === "forge:request" || msg.type === "forge:response") {
+       for (const [, { ctx }] of localServices) {
+         ctx._handleIPCMessage(msg);
+       }
+     }
+   });
+
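A supervisor-side round trip against the metrics handler above might look like the following sketch; the message shapes come from the handler, while the `worker` handle and requestId are illustrative.

    worker.send({ type: "forge:metrics-snapshot", requestId: "r1" });
    worker.on("message", (msg) => {
      if (msg.type === "forge:metrics-snapshot-response" && msg.requestId === "r1") {
        // Either Prometheus exposition text (per-service chunks joined with "\n") or an error string
        console.log(msg.error ?? msg.metrics);
      }
    });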
+   // Request socket setup (supervisor may have sent init-socket before we were listening)
+   if (process.send) {
+     process.send({ type: "forge:worker-ready", group: workerConfig.groupName });
+   }
+
+   // Phase 5: Start all services — track started services for cleanup on failure (RT-H1)
+   const startedServices = [];
+   try {
+     for (const [name, { service, ctx }] of localServices) {
+       await service._start();
+       startedServices.push({ name, service, ctx });
+
+       const proxyNames = Object.keys(
+         Object.fromEntries(Object.entries(service).filter(([_k, v]) => v?.$name || v?.$isLocal !== undefined)),
+       );
+
+       // Reduce startup noise: emit framework startup metadata once per group.
+       if (workerId === 0) {
+         ctx.logger.info("Service started", {
+           group: workerConfig.groupName,
+           service: name,
+           type: typeMap[name] ?? "internal",
+           port: ctx.port || null,
+           worker: workerId,
+           pid: process.pid,
+           colocated: entries.length > 1 ? entries.map((e) => e.name) : undefined,
+           proxies: proxyNames.length > 0 ? proxyNames : undefined,
+         });
+       }
+     }
+   } catch (startErr) {
+     // Check if this is a fatal bind error (EPERM, EACCES, EADDRNOTAVAIL, EADDRINUSE)
+     if (startErr.fatalBindError) {
+       // Log clear error message and exit without triggering restart loop
+       console.error(`\n[ThreadForge] FATAL: ${startErr.userMessage || startErr.message}`);
+       console.error(`[ThreadForge] Service group "${workerConfig.groupName}" cannot start. Worker will not restart.\n`);
+       // Notify supervisor this is a fatal error (already sent via IPC from ForgeContext)
+       // Exit with code 100 to signal fatal configuration error (supervisor checks this)
+       process.exit(100);
+     }
+
+     // RT-H1: Stop already-started services in reverse order before re-throwing
+     for (let i = startedServices.length - 1; i >= 0; i--) {
+       const { name, service, ctx } = startedServices[i];
+       try {
+         ctx.logger.warn(`Rolling back service start for ${name} due to Phase 5 failure`);
+         await service._stop();
+       } catch (stopErr) {
+         ctx.logger.error(`Rollback stop failed for ${name}`, { error: stopErr.message });
+       }
+       try {
+         if (ctx._server) {
+           await new Promise((resolve) => ctx._server.close(resolve));
+         }
+       } catch {}
+     }
+     throw startErr;
+   }
+
+   // Phase 6: Auto-register health check (always, not just with plugins)
+   for (const [, { ctx }] of localServices) {
+     const existingHealthRoute = [...ctx.router.routes.values()].some((bucket) =>
+       bucket.some((r) => r.pattern === "/health"),
+     );
+     if (!existingHealthRoute) {
+       ctx.router.get("/health", async (_req, res) => {
+         const health = {
+           status: "ok",
+           service: ctx.serviceName,
+           pid: process.pid,
+         };
+         if (pluginManager) {
+           health.plugins = await pluginManager.healthCheck();
+         }
+         res.json(health);
+       });
+     }
+   }
+
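So a bare GET /health on any service in the group returns something like the following, with the plugins key present only when a PluginManager was created:

    // { "status": "ok", "service": "users", "pid": 12345, "plugins": { ... } }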
+   // Notify supervisor that this worker finished startup for readiness aggregation.
+   if (process.send) {
+     process.send({
+       type: "forge:group-ready",
+       group: workerConfig.groupName,
+       workerId,
+       pid: process.pid,
+       services: entries.map((e) => e.name),
+       port: Number.isFinite(port) && port > 0 ? port : null,
+     });
+   }
+
+   // Signal handlers are registered early in bootstrap(), alongside the uncaughtException handler at the top.
+ }
+
+ bootstrap().catch((err) => {
+   const services = workerConfig.serviceNames || "unknown";
+   console.error(`[ThreadForge] Worker bootstrap failed for group "${workerConfig.groupName}" (services: ${services}):`, err);
+   process.exit(1);
+ });