threadforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/README.md +152 -0
  3. package/bin/forge.js +1050 -0
  4. package/bin/host-commands.js +344 -0
  5. package/bin/platform-commands.js +570 -0
  6. package/package.json +71 -0
  7. package/shared/auth.js +475 -0
  8. package/src/core/DirectMessageBus.js +364 -0
  9. package/src/core/EndpointResolver.js +247 -0
  10. package/src/core/ForgeContext.js +2227 -0
  11. package/src/core/ForgeHost.js +122 -0
  12. package/src/core/ForgePlatform.js +145 -0
  13. package/src/core/Ingress.js +768 -0
  14. package/src/core/Interceptors.js +420 -0
  15. package/src/core/MessageBus.js +310 -0
  16. package/src/core/Prometheus.js +305 -0
  17. package/src/core/RequestContext.js +413 -0
  18. package/src/core/RoutingStrategy.js +316 -0
  19. package/src/core/Supervisor.js +1306 -0
  20. package/src/core/ThreadAllocator.js +196 -0
  21. package/src/core/WorkerChannelManager.js +879 -0
  22. package/src/core/config.js +624 -0
  23. package/src/core/host-config.js +311 -0
  24. package/src/core/network-utils.js +166 -0
  25. package/src/core/platform-config.js +308 -0
  26. package/src/decorators/ServiceProxy.js +899 -0
  27. package/src/decorators/index.js +571 -0
  28. package/src/deploy/NginxGenerator.js +865 -0
  29. package/src/deploy/PlatformManifestGenerator.js +96 -0
  30. package/src/deploy/RouteManifestGenerator.js +112 -0
  31. package/src/deploy/index.js +984 -0
  32. package/src/frontend/FrontendDevLifecycle.js +65 -0
  33. package/src/frontend/FrontendPluginOrchestrator.js +187 -0
  34. package/src/frontend/SiteResolver.js +63 -0
  35. package/src/frontend/StaticMountRegistry.js +90 -0
  36. package/src/frontend/index.js +5 -0
  37. package/src/frontend/plugins/index.js +2 -0
  38. package/src/frontend/plugins/viteFrontend.js +79 -0
  39. package/src/frontend/types.js +35 -0
  40. package/src/index.js +56 -0
  41. package/src/internals.js +31 -0
  42. package/src/plugins/PluginManager.js +537 -0
  43. package/src/plugins/ScopedPostgres.js +192 -0
  44. package/src/plugins/ScopedRedis.js +142 -0
  45. package/src/plugins/index.js +1729 -0
  46. package/src/registry/ServiceRegistry.js +796 -0
  47. package/src/scaling/ScaleAdvisor.js +442 -0
  48. package/src/services/Service.js +195 -0
  49. package/src/services/worker-bootstrap.js +676 -0
  50. package/src/templates/auth-service.js +65 -0
  51. package/src/templates/identity-service.js +75 -0
package/src/core/Supervisor.js
@@ -0,0 +1,1306 @@
1
+ import cluster from "node:cluster";
2
+ import { EventEmitter } from "node:events";
3
+ import fs from "node:fs";
4
+ import { createServer } from "node:http";
5
+ import { createServer as createNetServer } from "node:net";
6
+ import path from "node:path";
7
+ import { fileURLToPath } from "node:url";
8
+ import { ServiceRegistry } from "../registry/ServiceRegistry.js";
9
+ import { ScaleAdvisor } from "../scaling/ScaleAdvisor.js";
10
+ import { DirectMessageBus } from "./DirectMessageBus.js";
11
+ import { ThreadAllocator } from "./ThreadAllocator.js";
12
+
13
+ import { timingSafeEqual } from "node:crypto";
14
+ import { tmpdir } from "node:os";
15
+
16
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
17
+ const WORKER_BOOTSTRAP = path.join(__dirname, "..", "services", "worker-bootstrap.js");
18
+
19
+ // L1: Restart policy constants
20
+ const RESTART_BASE_BACKOFF_MS = 2000;
21
+ const RESTART_MAX_BACKOFF_MS = 60000;
22
+ const MAX_RESTARTS_PER_WINDOW = 5;
23
+ const RESTART_WINDOW_MS = 300000;
24
+ // L5: Rate-limit restart warnings — one per 5s per group
25
+ const RESTART_WARNING_INTERVAL_MS = 5000;
26
+ // C5: Overall shutdown deadline
27
+ const SHUTDOWN_DEADLINE_MS = 25000;
28
+ // C2: Forbidden env keys
29
+ const FORBIDDEN_ENV_KEYS = new Set(['PATH', 'LD_PRELOAD', 'LD_LIBRARY_PATH', 'NODE_OPTIONS', 'NODE_EXTRA_CA_CERTS']);
30
+ const VALID_ENV_KEY = /^[A-Z_][A-Z0-9_]*$/i;
31
+ // C3: Max env var size before file fallback
32
+ const MAX_ENDPOINT_ENV_SIZE = 65536;
33
+
34
+ function isExpectedWorkerIpcError(err) {
35
+ if (!err) return false;
36
+ if (
37
+ err.code === "EPIPE" ||
38
+ err.code === "ECONNRESET" ||
39
+ err.code === "ERR_IPC_CHANNEL_CLOSED" ||
40
+ err.code === "ERR_IPC_DISCONNECTED"
41
+ ) {
42
+ return true;
43
+ }
44
+ const msg = String(err.message ?? "").toLowerCase();
45
+ return (
46
+ msg.includes("channel closed") ||
47
+ msg.includes("ipc channel is already disconnected") ||
48
+ msg.includes("broken pipe")
49
+ );
50
+ }
51
+
52
+ /**
53
+ * Supervisor v2
54
+ *
55
+ * Key differences from v1:
56
+ *
57
+ * - Understands service types (edge/internal/background)
58
+ * - Only edge services get HTTP servers
59
+ * - Colocated services share a process (same event loop)
60
+ * - Channels are dependency-based, not full mesh
61
+ * - Thread allocation is per process group, not per service
62
+ */
63
+ export class Supervisor extends EventEmitter {
64
+ constructor(config, options = {}) {
65
+ super();
66
+
67
+ this.config = config;
68
+ this.services = config.services;
69
+ this.groups = config.groups;
70
+ this.channels = config.channels; // declared dependency channels
71
+ this.options = options;
72
+
73
+ this.allocator = new ThreadAllocator({
74
+ cpus: options.cpus,
75
+ reserved: options.reserved,
76
+ });
77
+
78
+ this.messageBus = new DirectMessageBus();
79
+
80
+ // Service registry — starts embedded, upgrades to multicast/external
81
+ this.registry = new ServiceRegistry({
82
+ mode: options.registryMode ?? "embedded",
83
+ host: options.host,
84
+ httpBasePort: options.httpBasePort ?? 4000,
85
+ });
86
+
87
+ // Scale advisor — monitors health and recommends actions
88
+ this.scaleAdvisor = new ScaleAdvisor(this.registry, {
89
+ evaluationIntervalMs: options.evaluationIntervalMs ?? 30000,
90
+ });
91
+
92
+ // Log scaling recommendations
93
+ this.scaleAdvisor.on("recommendation", (rec) => {
94
+ const icon = { scale_up: "↑", migrate: "→", split_out: "⊞", scale_down: "↓" };
95
+ console.log(`\n ${icon[rec.action] ?? "•"} SCALE: ${rec.service} — ${rec.action}`);
96
+ console.log(` ${rec.reason}`);
97
+ if (rec.details.command) console.log(` Run: ${rec.details.command}`);
98
+ });
99
+
100
+ // Plugins
101
+ this.plugins = config.plugins ?? [];
102
+ this._pluginEnv = {};
103
+
104
+ /** @type {Map<number, {groupName: string, services: string[], workerId: number}>} */
105
+ this.workerMap = new Map();
106
+
107
+ /** @type {Map<string, number[]>} group name → cluster worker IDs */
108
+ this.groupWorkers = new Map();
109
+
110
+ /** @type {Map<string, number>} */
111
+ this.allocation = new Map();
112
+
113
+ this._metricsServer = null;
114
+ this._shuttingDown = false;
115
+ this._restartHistory = new Map();
116
+ /** @type {Set<number>} worker IDs being intentionally removed during scale-down */
117
+ this._scalingDown = new Set();
118
+ /** @type {Map<string, NodeJS.Timeout>} pending delayed restarts keyed by cooldownKey */
119
+ this._pendingRestarts = new Map();
120
+ /** @type {Map<number, NodeJS.Timeout>} SIGKILL timers for scale-down workers (RT-H3) */
121
+ this._killTimers = new Map();
122
+ /** @type {Map<string, number>} L5: last restart warning time per group */
123
+ this._restartWarningTimes = new Map();
124
+ /** @type {string|null} C3: temp file path for large endpoint maps */
125
+ this._endpointTempFile = null;
126
+ /** @type {string|null} temp file path for large site maps */
127
+ this._sitesTempFile = null;
128
+ /** @type {number} */
129
+ this._metricsRequestSeq = 0;
130
+ /** @type {Map<string, {expected: Set<number>, chunks: string[], timer: NodeJS.Timeout, finish: Function}>} */
131
+ this._pendingMetricsSnapshots = new Map();
132
+ /** @type {Map<string, Set<number>>} */
133
+ this._groupReadyWorkers = new Map();
134
+ /** @type {Set<string>} */
135
+ this._groupReadyLogged = new Set();
136
+ /** @type {Object<string, number>} M-3: monotonic worker index per group for scale-up */
137
+ this._nextWorkerIndex = {};
138
+ /** @type {NodeJS.Timeout|null} H-5: heartbeat monitor interval */
139
+ this._heartbeatInterval = null;
140
+ /** @type {Map<number, number>} H-5: last heartbeat response time per worker ID */
141
+ this._lastHeartbeat = new Map();
142
+ /** @type {Map<number, boolean>} O1: per-worker readiness tracking */
143
+ this._workersReady = new Map();
144
+ /** @type {number} O15: total worker restart count */
145
+ this._workerRestartCount = 0;
146
+ /** @type {WeakSet<object>} workers that already have an error guard attached */
147
+ this._workerErrorGuards = new WeakSet();
148
+ }
149
+
150
+ async start() {
151
+ if (!cluster.isPrimary) {
152
+ throw new Error("Supervisor.start() must be called from the primary process");
153
+ }
154
+
155
+ // S6: Reject placeholder JWT_SECRET in production
156
+ if (process.env.NODE_ENV === 'production' && process.env.JWT_SECRET === 'CHANGE_ME_BEFORE_DEPLOY') {
157
+ console.error('FATAL: JWT_SECRET is set to the placeholder value "CHANGE_ME_BEFORE_DEPLOY". Set a real secret before deploying.');
158
+ process.exit(1);
159
+ }
160
+
161
+ // Register signal handlers early so signals during startup are caught
162
+ process.once("SIGTERM", () => this.shutdown());
163
+ process.once("SIGINT", () => this.shutdown());
164
+
165
+ // Preflight check: fail fast with a single clear error before forking workers.
166
+ await this._assertStartupPortsAvailable();
167
+
168
+ // Validate and collect plugin env vars before forking workers
169
+ /** @type {Set<string>} plugins that failed validation — workers will skip these */
170
+ this._failedPlugins = new Set();
171
+
172
+ if (this.plugins.length > 0) {
173
+ for (const plugin of this.plugins) {
174
+ const pName = plugin.name ?? "unknown";
175
+ try {
176
+ if (plugin.validate) {
177
+ await plugin.validate();
178
+ }
179
+ if (plugin.env) {
180
+ Object.assign(this._pluginEnv, plugin.env());
181
+ }
182
+ } catch (err) {
183
+ console.warn(` ⚠ Plugin "${pName}" unavailable: ${err.message}`);
184
+ this._failedPlugins.add(pName);
185
+ }
186
+ }
187
+
188
+ const available = this.plugins.filter((p) => !this._failedPlugins.has(p.name ?? "unknown"));
189
+ if (available.length > 0) {
190
+ console.log(` Plugins: ${available.map((p) => p.name).join(", ")}`);
191
+ }
192
+ if (this._failedPlugins.size > 0) {
193
+ console.warn(` Failed plugins: ${[...this._failedPlugins].join(", ")}`);
194
+ }
195
+ }
196
+
197
+ console.log(this._banner());
198
+
199
+ // Allocate threads per process group (not per service)
200
+ this._allocateGroups();
201
+
202
+ // Display allocation
203
+ this._printAllocation();
204
+
205
+ cluster.setupPrimary({
206
+ exec: WORKER_BOOTSTRAP,
207
+ silent: false,
208
+ });
209
+
210
+ // Register exit handler BEFORE forking so early crashes are caught
211
+ cluster.on("exit", (worker, code, signal) => {
212
+ this._handleWorkerExit(worker, code, signal);
213
+ });
214
+
215
+ // P21: Pre-serialize endpoint map once for all worker forks
216
+ this._cachedEndpointJson = JSON.stringify(this._buildEndpointMap());
217
+
218
+ // Fork workers for each process group
219
+ for (const [groupName, group] of Object.entries(this.groups)) {
220
+ const threadCount = this.allocation.get(groupName) ?? 1;
221
+ this.groupWorkers.set(groupName, []);
222
+ this._groupReadyWorkers.set(groupName, new Set());
223
+
224
+ for (let i = 0; i < threadCount; i++) {
225
+ this._forkGroupWorker(groupName, group, i);
226
+ }
227
+ }
228
+
229
+ await this._startMetricsServer();
230
+
231
+ // Start registry and scale advisor
232
+ await this.registry.start();
233
+ this.scaleAdvisor.start();
234
+
235
+ // Register all local services in the registry
236
+ for (const [name, svc] of Object.entries(this.services)) {
237
+ if (svc.type === "remote") continue;
238
+ const groupName = svc.group ?? `_isolated:${name}`;
239
+ this.registry.register({
240
+ name,
241
+ ports: { http: svc.port },
242
+ udsPath: null,
243
+ workers: this.allocation.get(groupName) ?? 1,
244
+ contract: {
245
+ methods: [], // populated by worker after loading class
246
+ events: [],
247
+ },
248
+ metadata: {
249
+ group: groupName,
250
+ },
251
+ });
252
+ }
253
+
254
+ // Print channel topology
255
+ this._printTopology();
256
+
257
+ // H4: Write PID file so `forge stop` can find us without the metrics endpoint
258
+ this._pidFilePath = path.join(process.cwd(), ".forge.pid");
259
+ try {
260
+ fs.writeFileSync(this._pidFilePath, String(process.pid));
261
+ } catch {}
262
+
263
+ console.log(`\n ⚡ ThreadForge runtime started\n`);
264
+ }
265
+
266
+ /**
267
+ * Allocate threads per process group.
268
+ *
269
+ * Each group gets threads based on the highest weight of its member
270
+ * services. Colocated services share their group's allocation.
271
+ */
272
+ _allocateGroups() {
273
+ // Build a services-like map for the allocator, keyed by group name
274
+ const groupConfigs = {};
275
+ for (const [groupName, group] of Object.entries(this.groups)) {
276
+ groupConfigs[groupName] = {
277
+ name: groupName,
278
+ port: group.port ?? 0,
279
+ threads: group.threads === 0 ? "auto" : group.threads,
280
+ weight: group.weight || 1,
281
+ mode: "cluster",
282
+ };
283
+ }
284
+
285
+ this.allocation = this.allocator.allocate(groupConfigs);
286
+ }
287
+
288
+ /**
289
+ * Fork a worker for a process group.
290
+ *
291
+ * The worker will load ALL services in the group within a single
292
+ * process. Colocated services communicate via direct function calls.
293
+ */
294
+ _forkGroupWorker(groupName, group, workerIndex) {
295
+ const serviceNames = group.services.map((s) => s.name);
296
+ const edgeService = group.services.find((s) => s.type === "edge");
297
+
298
+ // Build comma-separated entry points for all services in the group
299
+ const entries = group.services.map((s) => `${s.name}=${s.entry}`).join(",");
300
+
301
+ const env = {
302
+ ...process.env,
303
+ ...this._pluginEnv,
304
+ FORGE_GROUP_NAME: groupName,
305
+ FORGE_SERVICE_ENTRIES: entries,
306
+ FORGE_SERVICE_NAMES: serviceNames.join(","),
307
+ FORGE_PORT: edgeService ? String(edgeService.port) : "0", // 0 = no HTTP
308
+ FORGE_WORKER_ID: String(workerIndex),
309
+ FORGE_THREAD_COUNT: String(this.allocation.get(groupName) ?? 1),
310
+ FORGE_MODE: "cluster",
311
+ FORGE_SERVICE_TYPES: group.services.map((s) => `${s.name}=${s.type}`).join(","),
312
+ // Port map for HTTP-based service-to-service calls (backward compat)
313
+ FORGE_SERVICE_PORTS: JSON.stringify(
314
+ Object.fromEntries(
315
+ Object.entries(this.services)
316
+ .filter(([, s]) => s.port)
317
+ .map(([name, s]) => [name, s.port]),
318
+ ),
319
+ ),
320
+ // Full endpoint topology — includes remote hosts for multi-machine
321
+ // S10: Endpoint map may contain internal IPs — treat as trusted internal config, not user input.
322
+ // P21: Use cached JSON serialization; C3: File fallback when JSON exceeds 64KB
323
+ ...(() => {
324
+ const json = this._cachedEndpointJson ?? JSON.stringify(this._buildEndpointMap());
325
+ if (json.length > MAX_ENDPOINT_ENV_SIZE) {
326
+ if (!this._endpointTempFile) {
327
+ const tempFile = path.join(tmpdir(), `forge-endpoints-${process.pid}.json`);
328
+ // M-SEC-5: Restrict temp file permissions — contains internal topology
329
+ fs.writeFileSync(tempFile, json, { encoding: 'utf8', mode: 0o600 });
330
+ this._endpointTempFile = tempFile;
331
+ }
332
+ return { FORGE_SERVICE_ENDPOINTS_FILE: this._endpointTempFile };
333
+ }
334
+ return { FORGE_SERVICE_ENDPOINTS: json };
335
+ })(),
336
+ ...(() => {
337
+ const json = this.config._sites ? JSON.stringify(this.config._sites) : "";
338
+ if (!json) return { FORGE_SITES: "" };
339
+ if (json.length > MAX_ENDPOINT_ENV_SIZE) {
340
+ if (!this._sitesTempFile) {
341
+ const tempFile = path.join(tmpdir(), `forge-sites-${process.pid}.json`);
342
+ fs.writeFileSync(tempFile, json, { encoding: "utf8", mode: 0o600 });
343
+ this._sitesTempFile = tempFile;
344
+ }
345
+ return { FORGE_SITES_FILE: this._sitesTempFile };
346
+ }
347
+ return { FORGE_SITES: json };
348
+ })(),
349
+ // Registry mode and host for dynamic discovery
350
+ FORGE_REGISTRY_MODE: this.options.registryMode ?? "embedded",
351
+ FORGE_HOST: this.options.host ?? "",
352
+ // Plugin config — which plugins each service uses
353
+ FORGE_PLUGINS: JSON.stringify(this.plugins.map((p) => p.name)),
354
+ FORGE_CONFIG_PATH: this.config._configUrl ?? "",
355
+ FORGE_SERVICE_PLUGINS: JSON.stringify(Object.fromEntries(group.services.map((s) => [s.name, s.plugins ?? null]))),
356
+ FORGE_CHANNELS: JSON.stringify(
357
+ this.channels.filter((ch) => serviceNames.includes(ch.from) || serviceNames.includes(ch.to)),
358
+ ),
359
+ };
360
+
361
+ if (this.config._isHostMode) {
362
+ env.FORGE_HOST_META = this.config._hostMetaJSON ?? JSON.stringify(this.config._hostMeta);
363
+ }
364
+
365
+ if (this.config._isPlatformMode) {
366
+ env.FORGE_PLATFORM_MODE = "1";
367
+ }
368
+
369
+ // C2: Set per-service env overrides with validation
370
+ for (const svc of group.services) {
371
+ for (const [key, value] of Object.entries(svc.env ?? {})) {
372
+ if (!VALID_ENV_KEY.test(key)) {
373
+ throw new Error(`Service "${svc.name}": invalid env key "${key}" — must match /^[A-Z_][A-Z0-9_]*$/i`);
374
+ }
375
+ if (FORBIDDEN_ENV_KEYS.has(key.toUpperCase())) {
376
+ throw new Error(`Service "${svc.name}": env key "${key}" is forbidden (security risk)`);
377
+ }
378
+ env[`FORGE_ENV_${svc.name.toUpperCase()}_${key}`] = value;
379
+ }
380
+ }
381
+
382
+ const worker = cluster.fork(env);
383
+ this._attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex);
384
+
385
+ this.workerMap.set(worker.id, {
386
+ groupName,
387
+ services: serviceNames,
388
+ workerId: workerIndex,
389
+ });
390
+
391
+ // H-5: Initialize heartbeat timestamp so the monitor doesn't kill
392
+ // workers forked after startup (restarts, scale-up)
393
+ this._lastHeartbeat.set(worker.id, Date.now());
394
+
395
+ const workers = this.groupWorkers.get(groupName) ?? [];
396
+ workers.push(worker.id);
397
+ this.groupWorkers.set(groupName, workers);
398
+
399
+ // Register with message bus — using service names, not group name
400
+ // so IPC addressing is still by service name
401
+ for (const svcName of serviceNames) {
402
+ this.messageBus.registerWorker(svcName, worker, "cluster");
403
+ }
404
+
405
+ worker.on("message", (msg) => this._handleWorkerMessage(worker, msg));
406
+
407
+ return worker;
408
+ }
409
+
410
+ _attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex) {
411
+ if (!worker || this._workerErrorGuards.has(worker)) return;
412
+ this._workerErrorGuards.add(worker);
413
+ worker.on("error", (err) => {
414
+ if (isExpectedWorkerIpcError(err)) return;
415
+ if (this._shuttingDown) return;
416
+ console.error(
417
+ ` ⚠ Worker ${groupName}[${workerIndex}] (${serviceNames.join("+")}) IPC error: ${err?.message ?? err}`,
418
+ );
419
+ });
420
+ }
421
+
422
+ _sendWorkerMessage(worker, message, label = "worker message") {
423
+ if (!worker) return false;
424
+ if (typeof worker.isDead === "function" && worker.isDead()) return false;
425
+ if (typeof worker.isConnected === "function" && !worker.isConnected()) return false;
426
+ if (worker.process?.connected === false) return false;
427
+
428
+ try {
429
+ worker.send(message);
430
+ return true;
431
+ } catch (err) {
432
+ if (!isExpectedWorkerIpcError(err)) {
433
+ console.error(` ⚠ Failed to send ${label}: ${err?.message ?? err}`);
434
+ }
435
+ return false;
436
+ }
437
+ }
438
+
439
+ _handleWorkerMessage(worker, msg) {
440
+ if (msg?.type === "forge:group-ready") {
441
+ // O1: Mark worker as ready for the /health/ready readiness probe
442
+ this._workersReady.set(worker.id, true);
443
+
444
+ const info = this.workerMap.get(worker.id);
445
+ if (!info) return;
446
+
447
+ const groupName = info.groupName;
448
+ const readySet = this._groupReadyWorkers.get(groupName) ?? new Set();
449
+ readySet.add(worker.id);
450
+ this._groupReadyWorkers.set(groupName, readySet);
451
+
452
+ const expected = this.allocation.get(groupName) ?? 1;
453
+ if (readySet.size >= expected && !this._groupReadyLogged.has(groupName)) {
454
+ this._groupReadyLogged.add(groupName);
455
+ const group = this.groups[groupName];
456
+ const edgeService = group?.services?.find((s) => s.type === "edge");
457
+ const portLabel = edgeService?.port ? ` on port ${edgeService.port}` : "";
458
+ const svcLabel = group?.services?.map((s) => s.name).join(", ") ?? groupName;
459
+ console.log(` ✓ ${svcLabel}: ${expected} workers ready${portLabel}`);
460
+
461
+ // H-5: Start heartbeat monitor once all groups are ready
462
+ if (!this._heartbeatInterval) {
463
+ const allReady = [...this._groupReadyWorkers.entries()].every(
464
+ ([gn, set]) => set.size >= (this.allocation.get(gn) ?? 1)
465
+ );
466
+ if (allReady) {
467
+ this._startHeartbeatMonitor();
468
+ }
469
+ }
470
+ }
471
+ return;
472
+ }
473
+
474
+ if (msg?.type === "forge:fatal-error") {
475
+ const info = this.workerMap.get(worker.id);
476
+ const groupName = info?.groupName ?? "unknown";
477
+ const workerId = info?.workerId ?? "?";
478
+ console.error(` ✖ Worker ${groupName}[${workerId}] fatal error: ${msg.error} - ${msg.message}`);
479
+ if (msg.port) {
480
+ console.error(` ✖ Failed to bind to port ${msg.port}. Check permissions or port availability.`);
481
+ }
482
+ return;
483
+ }
484
+
485
+ // H-5: Track heartbeat responses from workers
486
+ if (msg?.type === "forge:heartbeat-response") {
487
+ this._lastHeartbeat.set(worker.id, Date.now());
488
+ return;
489
+ }
490
+
491
+ if (msg?.type === "forge:metrics-snapshot-response" && msg.requestId) {
492
+ const pending = this._pendingMetricsSnapshots.get(msg.requestId);
493
+ if (!pending) return;
494
+
495
+ if (typeof msg.metrics === "string" && msg.metrics.trim().length > 0) {
496
+ pending.chunks.push(msg.metrics);
497
+ }
498
+ if (msg.error) {
499
+ pending.chunks.push(`# Worker ${worker.id} metrics error: ${msg.error}`);
500
+ }
501
+
502
+ pending.expected.delete(worker.id);
503
+ if (pending.expected.size === 0) {
504
+ pending.finish();
505
+ }
506
+ }
507
+ }
508
+
509
+ _mergePrometheusExpositions(expositions) {
510
+ const lines = [];
511
+ const seenMeta = new Set();
512
+
513
+ for (const chunk of expositions) {
514
+ if (typeof chunk !== "string") continue;
515
+
516
+ for (const rawLine of chunk.split(/\r?\n/)) {
517
+ const line = rawLine.trimEnd();
518
+ if (!line) continue;
519
+
520
+ if (line.startsWith("# HELP ") || line.startsWith("# TYPE ")) {
521
+ if (seenMeta.has(line)) continue;
522
+ seenMeta.add(line);
523
+ }
524
+
525
+ lines.push(line);
526
+ }
527
+ }
528
+
529
+ if (lines.length === 0) {
530
+ return "# No worker metrics available\n";
531
+ }
532
+
533
+ return `${lines.join("\n")}\n`;
534
+ }
535
+
536
+ _collectMetricsSnapshot(timeoutMs = 1000) {
537
+ const activeWorkers = Object.values(cluster.workers).filter((worker) => worker && !worker.isDead());
538
+ if (activeWorkers.length === 0) {
539
+ return Promise.resolve("# No worker metrics available\n");
540
+ }
541
+
542
+ const requestId = `metrics-${process.pid}-${Date.now()}-${++this._metricsRequestSeq}`;
543
+
544
+ return new Promise((resolve) => {
545
+ const expected = new Set(activeWorkers.map((worker) => worker.id));
546
+ const chunks = [];
547
+ let finished = false;
548
+
549
+ const finish = () => {
550
+ if (finished) return;
551
+ finished = true;
552
+ const pending = this._pendingMetricsSnapshots.get(requestId);
553
+ if (pending?.timer) clearTimeout(pending.timer);
554
+ this._pendingMetricsSnapshots.delete(requestId);
555
+ resolve(this._mergePrometheusExpositions(chunks));
556
+ };
557
+
558
+ const timer = setTimeout(finish, timeoutMs);
559
+ if (typeof timer.unref === "function") timer.unref();
560
+
561
+ this._pendingMetricsSnapshots.set(requestId, { expected, chunks, timer, finish });
562
+
563
+ for (const worker of activeWorkers) {
564
+ const sent = this._sendWorkerMessage(worker, { type: "forge:metrics-snapshot", requestId }, "metrics snapshot request");
565
+ if (!sent) expected.delete(worker.id);
566
+ }
567
+
568
+ if (expected.size === 0) {
569
+ finish();
570
+ }
571
+ });
572
+ }
573
+
574
+ _handleWorkerExit(worker, code, signal) {
575
+ const info = this.workerMap.get(worker.id);
576
+ if (!info) return;
577
+
578
+ const { groupName, services, workerId } = info;
579
+
580
+ // CR-1: Find the worker's slot index in the group before removing it
581
+ const workers = this.groupWorkers.get(groupName) ?? [];
582
+ const workerSlotIndex = workers.indexOf(worker.id);
583
+
584
+ // CR-2: Always perform cleanup even during shutdown — only skip restart/fork logic
585
+ this.workerMap.delete(worker.id);
586
+ this._groupReadyWorkers.get(groupName)?.delete(worker.id);
587
+ this._lastHeartbeat.delete(worker.id);
588
+ this._workersReady.delete(worker.id);
589
+
590
+ // RT-H3: Clear any pending SIGKILL timer for this worker
591
+ const killTimer = this._killTimers.get(worker.id);
592
+ if (killTimer) {
593
+ clearTimeout(killTimer);
594
+ this._killTimers.delete(worker.id);
595
+ }
596
+
597
+ if (workerSlotIndex !== -1) workers.splice(workerSlotIndex, 1);
598
+
599
+ // Unregister from message bus
600
+ for (let i = 0; i < services.length; i++) {
601
+ const svcName = services[i];
602
+ this.messageBus.unregisterWorker(svcName, worker.id, {
603
+ suppressBroadcast: i < services.length - 1,
604
+ });
605
+ }
606
+
607
+ // CR-2: During shutdown, only do cleanup (above) — skip restart/fork logic
608
+ if (this._shuttingDown) return;
609
+
610
+ // If this worker was intentionally removed during scale-down, don't restart
611
+ if (this._scalingDown.has(worker.id)) {
612
+ this._scalingDown.delete(worker.id);
613
+ // MED-4: Clean up restart history for removed worker
614
+ const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
615
+ this._restartHistory.delete(cooldownKey);
616
+ console.log(` ↓ Worker ${groupName}[${workerId}] (${services.join("+")}) removed (scale-down)`);
617
+ return;
618
+ }
619
+
620
+ // Exit code 100 indicates fatal configuration error (e.g., EPERM on port bind)
621
+ // Don't restart — log clear message and stop
622
+ if (code === 100) {
623
+ console.error(` ✖ Worker ${groupName}[${workerId}] (${services.join("+")}) failed with fatal error — not restarting`);
624
+ console.error(` ✖ Check worker logs above for details (likely port permission issue)`);
625
+ // Clean up restart history to prevent future attempts
626
+ const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
627
+ this._restartHistory.delete(cooldownKey);
628
+ this._pendingRestarts.delete(cooldownKey);
629
+ return;
630
+ }
631
+
632
+ const reason = signal ? `signal ${signal}` : `code ${code}`;
633
+
634
+ // L5: Rate-limit restart warnings per group
635
+ const now = Date.now();
636
+ const lastWarning = this._restartWarningTimes.get(groupName) ?? 0;
637
+ if (now - lastWarning >= RESTART_WARNING_INTERVAL_MS) {
638
+ console.error(` ⚠ Worker ${groupName}[${workerId}] (${services.join("+")}) exited: ${reason}`);
639
+ this._restartWarningTimes.set(groupName, now);
640
+ }
641
+
642
+ // CR-1: Key by worker slot index (not cluster worker.id) so restart history persists across restarts
643
+ const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
644
+ const history = this._restartHistory.get(cooldownKey) ?? { count: 0, firstRestart: now, lastRestart: 0 };
645
+
646
+ // Reset counter if outside the restart window
647
+ if (now - history.firstRestart > RESTART_WINDOW_MS) {
648
+ history.count = 0;
649
+ history.firstRestart = now;
650
+ }
651
+
652
+ if (history.count >= MAX_RESTARTS_PER_WINDOW) {
653
+ console.error(` ⚠ ${groupName}[${workerId}] exceeded max restarts (${MAX_RESTARTS_PER_WINDOW} in ${RESTART_WINDOW_MS / 60000}min), not restarting`);
654
+ this._restartHistory.delete(cooldownKey);
655
+ return;
656
+ }
657
+
658
+ // Exponential backoff with constants
659
+ const backoffMs = Math.min(RESTART_BASE_BACKOFF_MS * 2 ** history.count, RESTART_MAX_BACKOFF_MS);
660
+ const timeSinceLast = now - history.lastRestart;
661
+
662
+ if (timeSinceLast < backoffMs) {
663
+ const remaining = backoffMs - timeSinceLast;
664
+ console.log(` ↻ Delaying restart for ${groupName}[${workerId}] (${remaining}ms remaining in backoff)`);
665
+
666
+ // Cancel any existing pending restart for this slot
667
+ const existingTimer = this._pendingRestarts.get(cooldownKey);
668
+ if (existingTimer) clearTimeout(existingTimer);
669
+
670
+ const timer = setTimeout(() => {
671
+ this._pendingRestarts.delete(cooldownKey);
672
+ if (this._shuttingDown) return;
673
+ if (!this.groups[groupName]) return;
674
+
675
+ history.count++;
676
+ history.lastRestart = Date.now();
677
+ this._restartHistory.set(cooldownKey, history);
678
+
679
+ console.log(
680
+ ` ↻ Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`,
681
+ );
682
+ // O15: Track worker restart metric
683
+ this._workerRestartCount++;
684
+ this._forkGroupWorker(groupName, this.groups[groupName], workerId);
685
+ }, remaining);
686
+
687
+ timer.unref();
688
+ this._pendingRestarts.set(cooldownKey, timer);
689
+ return;
690
+ }
691
+
692
+ if (!this.groups[groupName]) return;
693
+
694
+ history.count++;
695
+ history.lastRestart = now;
696
+ this._restartHistory.set(cooldownKey, history);
697
+
698
+ console.log(` ↻ Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`);
699
+ // O15: Track worker restart metric
700
+ this._workerRestartCount++;
701
+ if (this._shuttingDown) return;
702
+ this._forkGroupWorker(groupName, this.groups[groupName], workerId);
703
+ }
704
+
705
+ _startupPortsToCheck() {
706
+ const targets = [];
707
+ const seen = new Set();
708
+
709
+ for (const [name, svc] of Object.entries(this.services)) {
710
+ if (svc?.type !== "edge") continue;
711
+ if (!Number.isInteger(svc.port) || svc.port <= 0) continue;
712
+ if (seen.has(svc.port)) continue;
713
+ seen.add(svc.port);
714
+ targets.push({ port: svc.port, purpose: `service "${name}"` });
715
+ }
716
+
717
+ return targets;
718
+ }
719
+
720
+ _isPortAvailable(port, host = "127.0.0.1") {
721
+ if (!Number.isInteger(port) || port <= 0) return Promise.resolve(true);
722
+
723
+ return new Promise((resolve) => {
724
+ const probe = createNetServer();
725
+ let settled = false;
726
+
727
+ const finish = (available) => {
728
+ if (settled) return;
729
+ settled = true;
730
+ if (available) {
731
+ probe.close(() => resolve(true));
732
+ } else {
733
+ resolve(false);
734
+ }
735
+ };
736
+
737
+ probe.once("error", (err) => {
738
+ if (err.code === "EADDRINUSE" || err.code === "EACCES" || err.code === "EPERM") {
739
+ finish(false);
740
+ return;
741
+ }
742
+ finish(false);
743
+ });
744
+
745
+ probe.once("listening", () => finish(true));
746
+ probe.listen(port, host);
747
+ });
748
+ }
749
+
750
+ async _assertStartupPortsAvailable() {
751
+ const targets = this._startupPortsToCheck();
752
+ for (const { port, purpose } of targets) {
753
+ const available = await this._isPortAvailable(port);
754
+ if (!available) {
755
+ throw new Error(
756
+ `Startup preflight failed: port ${port} (${purpose}) is unavailable ` +
757
+ `(already in use or permission denied).`,
758
+ );
759
+ }
760
+ }
761
+ }
762
+
763
+ async scale(groupName, newCount) {
764
+ const group = this.groups[groupName];
765
+ if (!group) throw new Error(`Unknown group: ${groupName}`);
766
+
767
+ // M-2: Bounds checking for newCount
768
+ if (newCount < 1 || newCount > 64) {
769
+ throw new Error(`Invalid worker count ${newCount} for group "${groupName}": must be between 1 and 64`);
770
+ }
771
+
772
+ const currentIds = this.groupWorkers.get(groupName) ?? [];
773
+ const currentCount = currentIds.length;
774
+
775
+ if (newCount === currentCount) return;
776
+
777
+ if (newCount > currentCount) {
778
+ const toAdd = newCount - currentCount;
779
+ console.log(` ↑ Scaling ${groupName} ${currentCount} → ${newCount} (+${toAdd})`);
780
+ // H-3: Reset groupReadyLogged so ready message is logged again for new workers
781
+ this._groupReadyLogged.delete(groupName);
782
+ // Use a monotonic counter to avoid index collisions after scale-down + scale-up
783
+ if (this._nextWorkerIndex[groupName] === undefined) {
784
+ this._nextWorkerIndex[groupName] = currentCount;
785
+ }
786
+ for (let i = 0; i < toAdd; i++) {
787
+ const workerIndex = this._nextWorkerIndex[groupName]++;
788
+ // Clear any stale restart history so new workers don't inherit crash counts
789
+ this._restartHistory.delete(`${groupName}:slot${currentCount + i}`);
790
+ this._forkGroupWorker(groupName, group, workerIndex);
791
+ }
792
+ } else {
793
+ const toRemove = currentCount - newCount;
794
+ console.log(` ↓ Scaling ${groupName} ${currentCount} → ${newCount} (-${toRemove})`);
795
+ for (let i = 0; i < toRemove; i++) {
796
+ const wid = currentIds[currentIds.length - 1 - i];
797
+ this._scalingDown.add(wid);
798
+ const worker = cluster.workers[wid];
799
+ if (worker) {
800
+ worker.process.kill("SIGTERM");
801
+ // RT-H3: Force SIGKILL after 10s if worker hasn't exited
802
+ const killTimer = setTimeout(() => {
803
+ this._killTimers.delete(wid);
804
+ try {
805
+ if (!worker.isDead()) {
806
+ console.error(` ⚠ Worker ${wid} did not exit after SIGTERM, sending SIGKILL`);
807
+ worker.process.kill("SIGKILL");
808
+ }
809
+ } catch {}
810
+ }, 10_000);
811
+ killTimer.unref();
812
+ this._killTimers.set(wid, killTimer);
813
+ // H3: Clean up kill timer if worker exits before SIGKILL fires
814
+ worker.once('exit', () => {
815
+ const t = this._killTimers.get(wid);
816
+ if (t) {
817
+ clearTimeout(t);
818
+ this._killTimers.delete(wid);
819
+ }
820
+ });
821
+ }
822
+ }
823
+ }
824
+
825
+ this.allocation.set(groupName, newCount);
826
+ }
827
+
828
+ /**
829
+ * H-5: Start heartbeat monitor — checks worker health every 30s.
830
+ * Warns after 60s of silence, kills after 90s.
831
+ */
832
+ _startHeartbeatMonitor() {
833
+ // Initialize heartbeat timestamps for all current workers
834
+ const now = Date.now();
835
+ for (const wid of this.workerMap.keys()) {
836
+ this._lastHeartbeat.set(wid, now);
837
+ }
838
+
839
+ this._heartbeatInterval = setInterval(() => {
840
+ if (this._shuttingDown) return;
841
+
842
+ // Request health checks from message bus if available
843
+ if (typeof this.messageBus.requestHealthChecks === 'function') {
844
+ this.messageBus.requestHealthChecks();
845
+ }
846
+
847
+ const checkTime = Date.now();
848
+ for (const [wid, info] of this.workerMap) {
849
+ const lastSeen = this._lastHeartbeat.get(wid) ?? 0;
850
+ const elapsed = checkTime - lastSeen;
851
+
852
+ if (elapsed > 90_000) {
853
+ // 90s without response — kill the worker
854
+ console.error(` ✖ Worker ${info.groupName}[${info.workerId}] unresponsive for ${Math.round(elapsed / 1000)}s — sending SIGKILL`);
855
+ try {
856
+ const w = cluster.workers?.[wid];
857
+ if (w && !w.isDead()) {
858
+ w.process.kill('SIGKILL');
859
+ }
860
+ } catch {}
861
+ } else if (elapsed > 60_000) {
862
+ // 60s without response — log a warning
863
+ console.warn(` ⚠ Worker ${info.groupName}[${info.workerId}] no heartbeat for ${Math.round(elapsed / 1000)}s`);
864
+ }
865
+ }
866
+ }, 30_000);
867
+
868
+ this._heartbeatInterval.unref();
869
+ }
870
+
871
+ _stopHeartbeatMonitor() {
872
+ if (this._heartbeatInterval) {
873
+ clearInterval(this._heartbeatInterval);
874
+ this._heartbeatInterval = null;
875
+ }
876
+ }
877
+
878
+ async shutdown() {
879
+ if (this._shuttingDown) return;
880
+ this._shuttingDown = true;
881
+
882
+ // H-5: Stop heartbeat monitor
883
+ this._stopHeartbeatMonitor();
884
+
885
+ // Resolve in-flight /metrics scrapes so callers don't hang during shutdown
886
+ for (const pending of this._pendingMetricsSnapshots.values()) {
887
+ clearTimeout(pending.timer);
888
+ pending.finish();
889
+ }
890
+ this._pendingMetricsSnapshots.clear();
891
+
892
+ // Cancel any pending delayed restarts
893
+ for (const timer of this._pendingRestarts.values()) {
894
+ clearTimeout(timer);
895
+ }
896
+ this._pendingRestarts.clear();
897
+
898
+ // Cancel any pending SIGKILL timers from scale-down
899
+ for (const [, timer] of this._killTimers) { clearTimeout(timer); }
900
+ this._killTimers.clear();
901
+
902
+ console.log("\n Shutting down ThreadForge...\n");
903
+
904
+ // C5: Overall shutdown deadline — each phase races against remaining time
905
+ const deadlineStart = Date.now();
906
+ const withDeadline = (promise, label) => {
907
+ const remaining = SHUTDOWN_DEADLINE_MS - (Date.now() - deadlineStart);
908
+ if (remaining <= 0) {
909
+ console.warn(` ⚠ Shutdown deadline exceeded during: ${label} — skipping`);
910
+ return Promise.resolve();
911
+ }
912
+ return Promise.race([
913
+ promise,
914
+ new Promise((resolve) => {
915
+ const t = setTimeout(() => {
916
+ console.warn(` ⚠ Shutdown phase "${label}" exceeded deadline — skipping`);
917
+ resolve();
918
+ }, remaining);
919
+ t.unref();
920
+ }),
921
+ ]);
922
+ };
923
+
924
+ // Close metrics server first so health checks fail during shutdown
925
+ if (this._metricsServer) {
926
+ await withDeadline(
927
+ new Promise(resolve => this._metricsServer.close(resolve)),
928
+ 'metrics server close'
929
+ );
930
+ this._metricsServer = null;
931
+ }
932
+
933
+ // Step 1: Send graceful shutdown message to each worker
934
+ for (const id of Object.keys(cluster.workers)) {
935
+ const worker = cluster.workers[id];
936
+ if (worker) {
937
+ this._sendWorkerMessage(worker, { type: "forge:shutdown" }, "shutdown signal");
938
+ }
939
+ }
940
+
941
+ // Step 2: Wait for workers to drain HTTP connections and exit
942
+ // H-CORE-3: Unref timers so they don't keep the process alive after workers exit
943
+ await withDeadline(new Promise((resolve) => {
944
+ const check = setInterval(() => {
945
+ const alive = Object.keys(cluster.workers).filter((id) => {
946
+ const w = cluster.workers[id];
947
+ return w && !w.isDead();
948
+ });
949
+ if (alive.length === 0) {
950
+ clearInterval(check);
951
+ resolve();
952
+ }
953
+ }, 200);
954
+ check.unref();
955
+ const fallback = setTimeout(() => {
956
+ clearInterval(check);
957
+ resolve();
958
+ }, 10000);
959
+ fallback.unref();
960
+ }), 'graceful drain');
961
+
962
+ // Collect all worker PIDs before disconnect (workers may leave cluster.workers after disconnect)
963
+ const workerPids = new Set();
964
+ for (const id of Object.keys(cluster.workers)) {
965
+ const w = cluster.workers[id];
966
+ if (w?.process?.pid) workerPids.add(w.process.pid);
967
+ }
968
+
969
+ // Step 3: Disconnect remaining workers
970
+ for (const id of Object.keys(cluster.workers)) {
971
+ const worker = cluster.workers[id];
972
+ if (worker && !worker.isDead()) {
973
+ try {
974
+ worker.disconnect();
975
+ } catch {
976
+ // Worker may already be dead
977
+ }
978
+ }
979
+ }
980
+
981
+ // Step 4: Wait for disconnect to complete
982
+ await withDeadline(new Promise((resolve) => {
983
+ const check = setInterval(() => {
984
+ const alive = Object.keys(cluster.workers).filter((id) => {
985
+ const w = cluster.workers[id];
986
+ return w && !w.isDead();
987
+ });
988
+ if (alive.length === 0) {
989
+ clearInterval(check);
990
+ resolve();
991
+ }
992
+ }, 200);
993
+ check.unref();
994
+ const fallback = setTimeout(() => {
995
+ clearInterval(check);
996
+ resolve();
997
+ }, 5000);
998
+ fallback.unref();
999
+ }), 'disconnect');
1000
+
1001
+ // Step 5: Force kill any remaining workers
1002
+ for (const id of Object.keys(cluster.workers)) {
1003
+ const worker = cluster.workers[id];
1004
+ if (worker && !worker.isDead()) {
1005
+ console.error(` ⚠ Forcefully killing worker ${id}...`);
1006
+ worker.process.kill("SIGKILL");
1007
+ }
1008
+ }
1009
+
1010
+ // Also kill workers that disconnected from cluster but may still be alive
1011
+ for (const pid of workerPids) {
1012
+ try { process.kill(pid, 0); process.kill(pid, "SIGKILL"); } catch {}
1013
+ }
1014
+
1015
+ // C3: Clean up temp endpoint file if created
1016
+ if (this._endpointTempFile) {
1017
+ try { fs.unlinkSync(this._endpointTempFile); } catch {}
1018
+ this._endpointTempFile = null;
1019
+ }
1020
+ if (this._sitesTempFile) {
1021
+ try { fs.unlinkSync(this._sitesTempFile); } catch {}
1022
+ this._sitesTempFile = null;
1023
+ }
1024
+
1025
+ // H4: Clean up PID file
1026
+ if (this._pidFilePath) {
1027
+ try { fs.unlinkSync(this._pidFilePath); } catch {}
1028
+ this._pidFilePath = null;
1029
+ }
1030
+
1031
+ console.log(" All workers stopped. Goodbye.\n");
1032
+ this.messageBus.cleanup();
1033
+ this.scaleAdvisor.stop();
1034
+ // O14: Add deadline to registry.stop() to prevent hanging
1035
+ try {
1036
+ await Promise.race([
1037
+ this.registry.stop(),
1038
+ new Promise(resolve => setTimeout(resolve, 5000)),
1039
+ ]);
1040
+ } catch (err) {
1041
+ console.error(` ⚠ Registry stop failed: ${err.message}`);
1042
+ }
1043
+ // Let the caller decide whether to exit — don't force process.exit here
1044
+ // so tests and CLI wrappers can run post-shutdown cleanup
1045
+ }
1046
+
1047
+ async _startMetricsServer() {
1048
+ // Allow metricsPort: null or false to disable metrics entirely
1049
+ if (this.config.metricsPort === null || this.config.metricsPort === false) {
1050
+ console.log(` 📊 Metrics: disabled`);
1051
+ return;
1052
+ }
1053
+
1054
+ // Safety fallback to 9090 (config layer should already provide this default)
1055
+ const port = this.config.metricsPort ?? 9090;
1056
+
1057
+ return new Promise((resolve) => {
1058
+ this._metricsServer = createServer((req, res) => {
1059
+ const reqPath = new URL(req.url ?? "/", "http://localhost").pathname;
1060
+
1061
+ // S7: Auth gate for sensitive supervisor endpoints (matches worker-level FORGE_METRICS_TOKEN)
62
+ // SEC-C2: Gate registry endpoints too (they expose full service topology) before delegating to the registry handler
63
+ const sensitiveEndpoints = ['/status', '/metrics', '/scaling', '/_forge/topology', '/_forge/resolve'];
64
+ if (sensitiveEndpoints.includes(reqPath)) {
65
+ const metricsToken = process.env.FORGE_METRICS_TOKEN;
66
+ if (metricsToken) {
67
+ const auth = req.headers['authorization'] ?? '';
68
+ const expected = `Bearer ${metricsToken}`;
69
+ if (Buffer.byteLength(auth) !== Buffer.byteLength(expected) ||
70
+ !timingSafeEqual(Buffer.from(auth), Buffer.from(expected))) {
71
+ res.writeHead(401, { "Content-Type": "application/json" });
72
+ res.end(JSON.stringify({ error: "Unauthorized" }));
73
+ return;
74
+ }
75
+ }
76
+ }
77
+
78
+ // Let the registry handle its endpoints once the auth gate has passed
79
+ if (this.registry.httpHandler(req, res)) return;
1080
+
1081
+ if (reqPath === "/status") {
1082
+ res.writeHead(200, { "Content-Type": "application/json" });
1083
+ res.end(JSON.stringify(this._status(), null, 2));
1084
+ } else if (reqPath === "/metrics") {
1085
+ this._collectMetricsSnapshot()
1086
+ .then((payload) => {
1087
+ // O15: Prepend supervisor-level restart counter
1088
+ const supervisorMetrics =
1089
+ `# HELP forge_worker_restarts_total Total number of worker restarts\n` +
1090
+ `# TYPE forge_worker_restarts_total counter\n` +
1091
+ `forge_worker_restarts_total ${this._workerRestartCount}\n`;
1092
+ res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4; charset=utf-8" });
1093
+ res.end(supervisorMetrics + payload);
1094
+ })
1095
+ .catch((err) => {
1096
+ res.writeHead(500, { "Content-Type": "text/plain; charset=utf-8" });
1097
+ res.end(`# metrics collection failed: ${err.message}\n`);
1098
+ });
1099
+ return;
1100
+ } else if (reqPath === "/health" || reqPath === "/health/ready") {
1101
+ // O1: Readiness probe — 200 only when ALL workers have reported ready
1102
+ const totalWorkers = this.workerMap.size;
1103
+ const readyWorkers = [...this._workersReady.values()].filter(Boolean).length;
1104
+ if (totalWorkers > 0 && readyWorkers >= totalWorkers) {
1105
+ res.writeHead(200, { "Content-Type": "application/json" });
1106
+ res.end(JSON.stringify({ status: "ready", ready: readyWorkers, total: totalWorkers }));
1107
+ } else {
1108
+ res.writeHead(503, { "Content-Type": "application/json" });
1109
+ res.end(JSON.stringify({ status: "starting", ready: readyWorkers, total: totalWorkers }));
1110
+ }
1111
+ } else if (reqPath === "/health/live") {
1112
+ // O1: Liveness probe — always 200 if process is running
1113
+ res.writeHead(200, { "Content-Type": "text/plain" });
1114
+ res.end("ok");
1115
+ } else if (reqPath === "/scaling") {
1116
+ res.writeHead(200, { "Content-Type": "text/plain" });
1117
+ res.end(this.scaleAdvisor.report());
1118
+ } else {
1119
+ res.writeHead(404);
1120
+ res.end("Not found");
1121
+ }
1122
+ });
1123
+
1124
+ this._metricsServer.on("error", (err) => {
1125
+ // Enhanced error message with actionable guidance
1126
+ console.warn(` ⚠ Metrics server failed to bind port ${port}: ${err.message}`);
1127
+ console.warn(` To fix: Set metricsPort to a different port in your config, or set metricsPort: null to disable metrics.`);
1128
+ console.warn(` Example: defineServices(services, { metricsPort: 9091 }) or { metricsPort: null }`);
1129
+ this._metricsServer = null;
1130
+ resolve(); // non-fatal — supervisor continues without metrics
1131
+ });
1132
+
1133
+ // Set timeouts to prevent slowloris attacks
1134
+ this._metricsServer.timeout = 5000;
1135
+ this._metricsServer.requestTimeout = 5000;
1136
+ this._metricsServer.headersTimeout = 3000;
1137
+
1138
+ // RT-H4: Bind to localhost by default; these endpoints are unauthenticated unless FORGE_METRICS_TOKEN is set
1139
+ // C2: Allow override via FORGE_METRICS_BIND for containers (e.g. 0.0.0.0)
1140
+ const bindAddr = process.env.FORGE_METRICS_BIND || "127.0.0.1";
1141
+ // SEC-C2: Warn when metrics are exposed without auth
1142
+ if (bindAddr !== "127.0.0.1" && bindAddr !== "::1" && !process.env.FORGE_METRICS_TOKEN) {
1143
+ console.warn(` ⚠ Metrics server binding to ${bindAddr} without FORGE_METRICS_TOKEN — topology and metrics are publicly accessible`);
1144
+ console.warn(` Set FORGE_METRICS_TOKEN=<secret> to require Bearer auth on sensitive endpoints`);
1145
+ }
1146
+ this._metricsServer.listen(port, bindAddr, () => {
1147
+ console.log(` 📊 Metrics: http://${bindAddr}:${port}/status (Prometheus: /metrics)`);
1148
+ resolve();
1149
+ });
1150
+ });
1151
+ }
1152
+
1153
+ _status() {
1154
+ const groups = [];
1155
+ for (const [groupName, workerIds] of this.groupWorkers) {
1156
+ const group = this.groups[groupName];
1157
+ const pids = workerIds.map((wid) => cluster.workers[wid]?.process?.pid).filter(Boolean);
1158
+
1159
+ groups.push({
1160
+ group: groupName,
1161
+ services: group.services.map((s) => ({
1162
+ name: s.name,
1163
+ type: s.type,
1164
+ port: s.port,
1165
+ })),
1166
+ workers: workerIds.length,
1167
+ pids,
1168
+ });
1169
+ }
1170
+
1171
+ return {
1172
+ supervisorPid: process.pid,
1173
+ uptime: process.uptime(),
1174
+ totalCpus: this.allocator.totalCpus,
1175
+ nodeId: this.registry.nodeId,
1176
+ host: this.registry.host,
1177
+ registryMode: this.registry.mode,
1178
+ processGroups: groups,
1179
+ channels: this.channels,
1180
+ totalProcesses: Object.keys(cluster.workers).length,
1181
+ totalServices: Object.keys(this.services).length,
1182
+ remoteServices: Object.values(this.services).filter((s) => s.type === "remote").length,
1183
+ portsUsed: Object.values(this.services)
1184
+ .filter((s) => s.port)
1185
+ .map((s) => s.port),
1186
+ messageBus: this.messageBus.stats(),
1187
+ topology: this.registry.topology(),
1188
+ scalingRecommendations: this.scaleAdvisor.recommendations,
1189
+ };
1190
+ }
1191
+
1192
+ /**
1193
+ * Build endpoint map for workers.
1194
+ *
1195
+ * Maps each service to { host, port, remote } or an array of endpoints
1196
+ * for multi-instance services. Remote services get their address parsed
1197
+ * into host/port. Local services get host: '127.0.0.1'.
1198
+ */
1199
+ _buildEndpointMap() {
1200
+ const endpoints = {};
1201
+
1202
+ for (const [name, svc] of Object.entries(this.services)) {
1203
+ if (svc.type === "remote") {
1204
+ // Parse address: "http://host:port" or "host:port"
1205
+ const parsed = this._parseAddress(svc.address, name);
1206
+ if (parsed) {
1207
+ endpoints[name] = { host: parsed.host, port: parsed.port, remote: true };
1208
+ }
1209
+ } else if (svc.port) {
1210
+ endpoints[name] = { host: "127.0.0.1", port: svc.port, remote: false };
1211
+ } else if (svc.type === "internal" || svc.type === "background") {
1212
+ // Include internal/background services so workers can use DirectMessageBus
1213
+ // for cross-group calls instead of falling back to supervisor IPC
1214
+ const groupName = svc.group ?? `_isolated:${name}`;
1215
+ const socketPath = this.messageBus.getSocketPath?.(name);
1216
+ endpoints[name] = { host: "127.0.0.1", remote: false, uds: socketPath ?? null, group: groupName };
1217
+ }
1218
+ }
1219
+
1220
+ return endpoints;
1221
+ }
1222
+
1223
+ /**
1224
+ * Parse a service address string into host/port.
1225
+ * Supports: "http://host:port", "host:port", and other URL schemes
1226
+ */
1227
+ _parseAddress(address, serviceName) {
1228
+ if (!address) return null;
1229
+
1230
+ try {
1231
+ // Try as URL first (handles http://, https://, etc.)
1232
+ if (address.includes("://")) {
1233
+ const url = new URL(address);
1234
+ return {
1235
+ host: url.hostname,
1236
+ port: parseInt(url.port, 10) || (url.protocol === "https:" ? 443 : 80),
1237
+ };
1238
+ }
1239
+
1240
+ // Plain host:port
1241
+ const [host, portStr] = address.split(":");
1242
+ const port = parseInt(portStr, 10);
1243
+ if (host && port) return { host, port };
1244
+ } catch (err) {
1245
+ console.warn(` ⚠ Failed to parse address "${address}" for service "${serviceName ?? "unknown"}": ${err.message}`);
1246
+ }
1247
+
1248
+ console.warn(` ⚠ Invalid address "${address}" for service "${serviceName ?? "unknown"}" — skipping`);
1249
+ return null;
1250
+ }
1251
+
1252
+ _printAllocation() {
1253
+ console.log("");
1254
+ console.log(" ┌──────────────────┬───────────────────────────┬─────────┬────────┐");
1255
+ console.log(" │ Process Group │ Services │ Workers │ Port │");
1256
+ console.log(" ├──────────────────┼───────────────────────────┼─────────┼────────┤");
1257
+
1258
+ for (const [groupName, group] of Object.entries(this.groups)) {
1259
+ const name = groupName.replace("_isolated:", "").padEnd(16);
1260
+ const svcList = group.services
1261
+ .map((s) => {
1262
+ const badge = s.type === "edge" ? "⚡" : s.type === "background" ? "⏰" : "○";
1263
+ return `${badge} ${s.name}`;
1264
+ })
1265
+ .join(", ");
1266
+ const svcs = svcList.substring(0, 25).padEnd(25);
1267
+ const threads = String(this.allocation.get(groupName) ?? 1).padEnd(7);
1268
+ const port = group.port ? String(group.port).padEnd(6) : " — ";
1269
+ console.log(` │ ${name} │ ${svcs} │ ${threads} │ ${port} │`);
1270
+ }
1271
+
1272
+ console.log(" └──────────────────┴───────────────────────────┴─────────┴────────┘");
1273
+
1274
+ let totalProcesses = 0;
1275
+ for (const [, count] of this.allocation) totalProcesses += count;
1276
+ const edgePorts = Object.values(this.services).filter((s) => s.port).length;
1277
+
1278
+ console.log(
1279
+ ` Processes: ${totalProcesses} | Services: ${Object.keys(this.services).length} | Ports: ${edgePorts} | CPUs: ${this.allocator.totalCpus}`,
1280
+ );
1281
+ }
1282
+
1283
+ _printTopology() {
1284
+ if (this.channels.length === 0) return;
1285
+
1286
+ console.log("");
1287
+ console.log(" Channels:");
1288
+ for (const ch of this.channels) {
1289
+ console.log(` ${ch.from} ↔ ${ch.to}`);
1290
+ }
1291
+ console.log(` Total: ${this.channels.length} channels (dependency-based)`);
1292
+ }
1293
+
1294
+ _banner() {
1295
+ let version = "0.0.0";
1296
+ try {
1297
+ const pkg = JSON.parse(fs.readFileSync(new URL("../../package.json", import.meta.url), "utf8"));
1298
+ version = pkg.version;
1299
+ } catch {}
1300
+ return `
1301
+ ╔════════════════════════════════════╗
1302
+ ║ ⚡ ThreadForge v${version.padEnd(12)}║
1303
+ ║ Multi-threaded Service Runtime ║
1304
+ ╚════════════════════════════════════╝`;
1305
+ }
1306
+ }
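
For orientation, a minimal usage sketch of the Supervisor above. The config object is normally built elsewhere in this package (src/core/config.js and the forge CLI), and that shape is not part of this file, so the fields below are inferred from what Supervisor actually reads (services, groups, channels, plugins, metricsPort); all service names, entry paths, and port numbers are illustrative only.

// Hypothetical sketch: the config shape is inferred from the fields this class reads
// (constructor, _allocateGroups, _forkGroupWorker, _startMetricsServer); the real object
// is produced by the package's config layer, which is not shown in this diff.
import { Supervisor } from "./src/core/Supervisor.js"; // path relative to the unpacked package

const config = {
  // Flat service map, used for the port preflight, registry registration, and the endpoint map
  services: {
    api:     { type: "edge",       port: 3000, group: "web" },
    billing: { type: "internal",               group: "web" },
    mailer:  { type: "background",             group: "_isolated:mailer" },
  },
  // Process groups; every service in a group shares one worker process per allocated thread
  groups: {
    web: {
      port: 3000,
      threads: 0, // 0 is treated as "auto" by _allocateGroups / ThreadAllocator
      weight: 2,
      services: [
        { name: "api",     type: "edge",     entry: "./services/api.js",     port: 3000, env: {} },
        { name: "billing", type: "internal", entry: "./services/billing.js",             env: {} },
      ],
    },
    "_isolated:mailer": {
      threads: 1,
      weight: 1,
      services: [{ name: "mailer", type: "background", entry: "./services/mailer.js", env: {} }],
    },
  },
  channels: [{ from: "api", to: "billing" }], // dependency-based, not a full mesh
  plugins: [],
  metricsPort: 9090, // or null / false to disable the metrics server
};

const supervisor = new Supervisor(config, { registryMode: "embedded" });
await supervisor.start(); // must run in the cluster primary; forks one worker per allocated thread, per group

Each forked worker then consumes the FORGE_* environment contract assembled in _forkGroupWorker (FORGE_GROUP_NAME, FORGE_SERVICE_ENTRIES, FORGE_SERVICE_ENDPOINTS or FORGE_SERVICE_ENDPOINTS_FILE, FORGE_CHANNELS, and so on) inside package/src/services/worker-bootstrap.js.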