threadforge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +152 -0
- package/bin/forge.js +1050 -0
- package/bin/host-commands.js +344 -0
- package/bin/platform-commands.js +570 -0
- package/package.json +71 -0
- package/shared/auth.js +475 -0
- package/src/core/DirectMessageBus.js +364 -0
- package/src/core/EndpointResolver.js +247 -0
- package/src/core/ForgeContext.js +2227 -0
- package/src/core/ForgeHost.js +122 -0
- package/src/core/ForgePlatform.js +145 -0
- package/src/core/Ingress.js +768 -0
- package/src/core/Interceptors.js +420 -0
- package/src/core/MessageBus.js +310 -0
- package/src/core/Prometheus.js +305 -0
- package/src/core/RequestContext.js +413 -0
- package/src/core/RoutingStrategy.js +316 -0
- package/src/core/Supervisor.js +1306 -0
- package/src/core/ThreadAllocator.js +196 -0
- package/src/core/WorkerChannelManager.js +879 -0
- package/src/core/config.js +624 -0
- package/src/core/host-config.js +311 -0
- package/src/core/network-utils.js +166 -0
- package/src/core/platform-config.js +308 -0
- package/src/decorators/ServiceProxy.js +899 -0
- package/src/decorators/index.js +571 -0
- package/src/deploy/NginxGenerator.js +865 -0
- package/src/deploy/PlatformManifestGenerator.js +96 -0
- package/src/deploy/RouteManifestGenerator.js +112 -0
- package/src/deploy/index.js +984 -0
- package/src/frontend/FrontendDevLifecycle.js +65 -0
- package/src/frontend/FrontendPluginOrchestrator.js +187 -0
- package/src/frontend/SiteResolver.js +63 -0
- package/src/frontend/StaticMountRegistry.js +90 -0
- package/src/frontend/index.js +5 -0
- package/src/frontend/plugins/index.js +2 -0
- package/src/frontend/plugins/viteFrontend.js +79 -0
- package/src/frontend/types.js +35 -0
- package/src/index.js +56 -0
- package/src/internals.js +31 -0
- package/src/plugins/PluginManager.js +537 -0
- package/src/plugins/ScopedPostgres.js +192 -0
- package/src/plugins/ScopedRedis.js +142 -0
- package/src/plugins/index.js +1729 -0
- package/src/registry/ServiceRegistry.js +796 -0
- package/src/scaling/ScaleAdvisor.js +442 -0
- package/src/services/Service.js +195 -0
- package/src/services/worker-bootstrap.js +676 -0
- package/src/templates/auth-service.js +65 -0
- package/src/templates/identity-service.js +75 -0
package/src/core/Supervisor.js
@@ -0,0 +1,1306 @@
import cluster from "node:cluster";
import { EventEmitter } from "node:events";
import fs from "node:fs";
import { createServer } from "node:http";
import { createServer as createNetServer } from "node:net";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { ServiceRegistry } from "../registry/ServiceRegistry.js";
import { ScaleAdvisor } from "../scaling/ScaleAdvisor.js";
import { DirectMessageBus } from "./DirectMessageBus.js";
import { ThreadAllocator } from "./ThreadAllocator.js";

import { timingSafeEqual } from "node:crypto";
import { tmpdir } from "node:os";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const WORKER_BOOTSTRAP = path.join(__dirname, "..", "services", "worker-bootstrap.js");

// L1: Restart policy constants
const RESTART_BASE_BACKOFF_MS = 2000;
const RESTART_MAX_BACKOFF_MS = 60000;
const MAX_RESTARTS_PER_WINDOW = 5;
const RESTART_WINDOW_MS = 300000;
// L5: Rate-limit restart warnings — one per 5s per group
const RESTART_WARNING_INTERVAL_MS = 5000;
// C5: Overall shutdown deadline
const SHUTDOWN_DEADLINE_MS = 25000;
// C2: Forbidden env keys
const FORBIDDEN_ENV_KEYS = new Set(['PATH', 'LD_PRELOAD', 'LD_LIBRARY_PATH', 'NODE_OPTIONS', 'NODE_EXTRA_CA_CERTS']);
const VALID_ENV_KEY = /^[A-Z_][A-Z0-9_]*$/i;
// C3: Max env var size before file fallback
const MAX_ENDPOINT_ENV_SIZE = 65536;
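
// For reference (derived from the constants above and the backoff formula in
// _handleWorkerExit below): the computed restart backoff for a crashing worker
// slot doubles from the 2s base and is capped at 60s (2s, 4s, 8s, 16s, 32s, 60s),
// and at most MAX_RESTARTS_PER_WINDOW (5) restarts are attempted per
// RESTART_WINDOW_MS (5-minute) window before the slot is left down.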

function isExpectedWorkerIpcError(err) {
  if (!err) return false;
  if (
    err.code === "EPIPE" ||
    err.code === "ECONNRESET" ||
    err.code === "ERR_IPC_CHANNEL_CLOSED" ||
    err.code === "ERR_IPC_DISCONNECTED"
  ) {
    return true;
  }
  const msg = String(err.message ?? "").toLowerCase();
  return (
    msg.includes("channel closed") ||
    msg.includes("ipc channel is already disconnected") ||
    msg.includes("broken pipe")
  );
}

/**
 * Supervisor v2
 *
 * Key differences from v1:
 *
 * - Understands service types (edge/internal/background)
 * - Only edge services get HTTP servers
 * - Colocated services share a process (same event loop)
 * - Channels are dependency-based, not full mesh
 * - Thread allocation is per process group, not per service
 */
export class Supervisor extends EventEmitter {
  constructor(config, options = {}) {
    super();

    this.config = config;
    this.services = config.services;
    this.groups = config.groups;
    this.channels = config.channels; // declared dependency channels
    this.options = options;

    this.allocator = new ThreadAllocator({
      cpus: options.cpus,
      reserved: options.reserved,
    });

    this.messageBus = new DirectMessageBus();

    // Service registry — starts embedded, upgrades to multicast/external
    this.registry = new ServiceRegistry({
      mode: options.registryMode ?? "embedded",
      host: options.host,
      httpBasePort: options.httpBasePort ?? 4000,
    });

    // Scale advisor — monitors health and recommends actions
    this.scaleAdvisor = new ScaleAdvisor(this.registry, {
      evaluationIntervalMs: options.evaluationIntervalMs ?? 30000,
    });

    // Log scaling recommendations
    this.scaleAdvisor.on("recommendation", (rec) => {
      const icon = { scale_up: "↑", migrate: "→", split_out: "⊞", scale_down: "↓" };
      console.log(`\n ${icon[rec.action] ?? "•"} SCALE: ${rec.service} — ${rec.action}`);
      console.log(` ${rec.reason}`);
      if (rec.details.command) console.log(` Run: ${rec.details.command}`);
    });

    // Plugins
    this.plugins = config.plugins ?? [];
    this._pluginEnv = {};

    /** @type {Map<number, {groupName: string, services: string[], workerId: number}>} */
    this.workerMap = new Map();

    /** @type {Map<string, number[]>} group name → cluster worker IDs */
    this.groupWorkers = new Map();

    /** @type {Map<string, number>} */
    this.allocation = new Map();

    this._metricsServer = null;
    this._shuttingDown = false;
    this._restartHistory = new Map();
    /** @type {Set<number>} worker IDs being intentionally removed during scale-down */
    this._scalingDown = new Set();
    /** @type {Map<string, NodeJS.Timeout>} pending delayed restarts keyed by cooldownKey */
    this._pendingRestarts = new Map();
    /** @type {Map<number, NodeJS.Timeout>} SIGKILL timers for scale-down workers (RT-H3) */
    this._killTimers = new Map();
    /** @type {Map<string, number>} L5: last restart warning time per group */
    this._restartWarningTimes = new Map();
    /** @type {string|null} C3: temp file path for large endpoint maps */
    this._endpointTempFile = null;
    /** @type {string|null} temp file path for large site maps */
    this._sitesTempFile = null;
    /** @type {number} */
    this._metricsRequestSeq = 0;
    /** @type {Map<string, {expected: Set<number>, chunks: string[], timer: NodeJS.Timeout, finish: Function}>} */
    this._pendingMetricsSnapshots = new Map();
    /** @type {Map<string, Set<number>>} */
    this._groupReadyWorkers = new Map();
    /** @type {Set<string>} */
    this._groupReadyLogged = new Set();
    /** @type {Object<string, number>} M-3: monotonic worker index per group for scale-up */
    this._nextWorkerIndex = {};
    /** @type {NodeJS.Timeout|null} H-5: heartbeat monitor interval */
    this._heartbeatInterval = null;
    /** @type {Map<number, number>} H-5: last heartbeat response time per worker ID */
    this._lastHeartbeat = new Map();
    /** @type {Map<number, boolean>} O1: per-worker readiness tracking */
    this._workersReady = new Map();
    /** @type {number} O15: total worker restart count */
    this._workerRestartCount = 0;
    /** @type {WeakSet<object>} workers that already have an error guard attached */
    this._workerErrorGuards = new WeakSet();
  }

  async start() {
    if (!cluster.isPrimary) {
      throw new Error("Supervisor.start() must be called from the primary process");
    }

    // S6: Reject placeholder JWT_SECRET in production
    if (process.env.NODE_ENV === 'production' && process.env.JWT_SECRET === 'CHANGE_ME_BEFORE_DEPLOY') {
      console.error('FATAL: JWT_SECRET is set to the placeholder value "CHANGE_ME_BEFORE_DEPLOY". Set a real secret before deploying.');
      process.exit(1);
    }

    // Register signal handlers early so signals during startup are caught
    process.once("SIGTERM", () => this.shutdown());
    process.once("SIGINT", () => this.shutdown());

    // Preflight check: fail fast with a single clear error before forking workers.
    await this._assertStartupPortsAvailable();

    // Validate and collect plugin env vars before forking workers
    /** @type {Set<string>} plugins that failed validation — workers will skip these */
    this._failedPlugins = new Set();

    if (this.plugins.length > 0) {
      for (const plugin of this.plugins) {
        const pName = plugin.name ?? "unknown";
        try {
          if (plugin.validate) {
            await plugin.validate();
          }
          if (plugin.env) {
            Object.assign(this._pluginEnv, plugin.env());
          }
        } catch (err) {
          console.warn(` ⚠ Plugin "${pName}" unavailable: ${err.message}`);
          this._failedPlugins.add(pName);
        }
      }

      const available = this.plugins.filter((p) => !this._failedPlugins.has(p.name ?? "unknown"));
      if (available.length > 0) {
        console.log(` Plugins: ${available.map((p) => p.name).join(", ")}`);
      }
      if (this._failedPlugins.size > 0) {
        console.warn(` Failed plugins: ${[...this._failedPlugins].join(", ")}`);
      }
    }

    console.log(this._banner());

    // Allocate threads per process group (not per service)
    this._allocateGroups();

    // Display allocation
    this._printAllocation();

    cluster.setupPrimary({
      exec: WORKER_BOOTSTRAP,
      silent: false,
    });

    // Register exit handler BEFORE forking so early crashes are caught
    cluster.on("exit", (worker, code, signal) => {
      this._handleWorkerExit(worker, code, signal);
    });

    // P21: Pre-serialize endpoint map once for all worker forks
    this._cachedEndpointJson = JSON.stringify(this._buildEndpointMap());

    // Fork workers for each process group
    for (const [groupName, group] of Object.entries(this.groups)) {
      const threadCount = this.allocation.get(groupName) ?? 1;
      this.groupWorkers.set(groupName, []);
      this._groupReadyWorkers.set(groupName, new Set());

      for (let i = 0; i < threadCount; i++) {
        this._forkGroupWorker(groupName, group, i);
      }
    }

    await this._startMetricsServer();

    // Start registry and scale advisor
    await this.registry.start();
    this.scaleAdvisor.start();

    // Register all local services in the registry
    for (const [name, svc] of Object.entries(this.services)) {
      if (svc.type === "remote") continue;
      const groupName = svc.group ?? `_isolated:${name}`;
      this.registry.register({
        name,
        ports: { http: svc.port },
        udsPath: null,
        workers: this.allocation.get(groupName) ?? 1,
        contract: {
          methods: [], // populated by worker after loading class
          events: [],
        },
        metadata: {
          group: groupName,
        },
      });
    }

    // Print channel topology
    this._printTopology();

    // H4: Write PID file so `forge stop` can find us without the metrics endpoint
    this._pidFilePath = path.join(process.cwd(), ".forge.pid");
    try {
      fs.writeFileSync(this._pidFilePath, String(process.pid));
    } catch {}

    console.log(`\n ⚡ ThreadForge runtime started\n`);
  }

  /**
   * Allocate threads per process group.
   *
   * Each group gets threads based on the highest weight of its member
   * services. Colocated services share their group's allocation.
   */
  _allocateGroups() {
    // Build a services-like map for the allocator, keyed by group name
    const groupConfigs = {};
    for (const [groupName, group] of Object.entries(this.groups)) {
      groupConfigs[groupName] = {
        name: groupName,
        port: group.port ?? 0,
        threads: group.threads === 0 ? "auto" : group.threads,
        weight: group.weight || 1,
        mode: "cluster",
      };
    }

    this.allocation = this.allocator.allocate(groupConfigs);
  }

  /**
   * Fork a worker for a process group.
   *
   * The worker will load ALL services in the group within a single
   * process. Colocated services communicate via direct function calls.
   */
  _forkGroupWorker(groupName, group, workerIndex) {
    const serviceNames = group.services.map((s) => s.name);
    const edgeService = group.services.find((s) => s.type === "edge");

    // Build comma-separated entry points for all services in the group
    const entries = group.services.map((s) => `${s.name}=${s.entry}`).join(",");

    const env = {
      ...process.env,
      ...this._pluginEnv,
      FORGE_GROUP_NAME: groupName,
      FORGE_SERVICE_ENTRIES: entries,
      FORGE_SERVICE_NAMES: serviceNames.join(","),
      FORGE_PORT: edgeService ? String(edgeService.port) : "0", // 0 = no HTTP
      FORGE_WORKER_ID: String(workerIndex),
      FORGE_THREAD_COUNT: String(this.allocation.get(groupName) ?? 1),
      FORGE_MODE: "cluster",
      FORGE_SERVICE_TYPES: group.services.map((s) => `${s.name}=${s.type}`).join(","),
      // Port map for HTTP-based service-to-service calls (backward compat)
      FORGE_SERVICE_PORTS: JSON.stringify(
        Object.fromEntries(
          Object.entries(this.services)
            .filter(([, s]) => s.port)
            .map(([name, s]) => [name, s.port]),
        ),
      ),
      // Full endpoint topology — includes remote hosts for multi-machine
      // S10: Endpoint map may contain internal IPs — treat as trusted internal config, not user input.
      // P21: Use cached JSON serialization; C3: File fallback when JSON exceeds 64KB
      ...(() => {
        const json = this._cachedEndpointJson ?? JSON.stringify(this._buildEndpointMap());
        if (json.length > MAX_ENDPOINT_ENV_SIZE) {
          if (!this._endpointTempFile) {
            const tempFile = path.join(tmpdir(), `forge-endpoints-${process.pid}.json`);
            // M-SEC-5: Restrict temp file permissions — contains internal topology
            fs.writeFileSync(tempFile, json, { encoding: 'utf8', mode: 0o600 });
            this._endpointTempFile = tempFile;
          }
          return { FORGE_SERVICE_ENDPOINTS_FILE: this._endpointTempFile };
        }
        return { FORGE_SERVICE_ENDPOINTS: json };
      })(),
      ...(() => {
        const json = this.config._sites ? JSON.stringify(this.config._sites) : "";
        if (!json) return { FORGE_SITES: "" };
        if (json.length > MAX_ENDPOINT_ENV_SIZE) {
          if (!this._sitesTempFile) {
            const tempFile = path.join(tmpdir(), `forge-sites-${process.pid}.json`);
            fs.writeFileSync(tempFile, json, { encoding: "utf8", mode: 0o600 });
            this._sitesTempFile = tempFile;
          }
          return { FORGE_SITES_FILE: this._sitesTempFile };
        }
        return { FORGE_SITES: json };
      })(),
      // Registry mode and host for dynamic discovery
      FORGE_REGISTRY_MODE: this.options.registryMode ?? "embedded",
      FORGE_HOST: this.options.host ?? "",
      // Plugin config — which plugins each service uses
      FORGE_PLUGINS: JSON.stringify(this.plugins.map((p) => p.name)),
      FORGE_CONFIG_PATH: this.config._configUrl ?? "",
      FORGE_SERVICE_PLUGINS: JSON.stringify(Object.fromEntries(group.services.map((s) => [s.name, s.plugins ?? null]))),
      FORGE_CHANNELS: JSON.stringify(
        this.channels.filter((ch) => serviceNames.includes(ch.from) || serviceNames.includes(ch.to)),
      ),
    };

    if (this.config._isHostMode) {
      env.FORGE_HOST_META = this.config._hostMetaJSON ?? JSON.stringify(this.config._hostMeta);
    }

    if (this.config._isPlatformMode) {
      env.FORGE_PLATFORM_MODE = "1";
    }

    // C2: Set per-service env overrides with validation
    for (const svc of group.services) {
      for (const [key, value] of Object.entries(svc.env)) {
        if (!VALID_ENV_KEY.test(key)) {
          throw new Error(`Service "${svc.name}": invalid env key "${key}" — must match /^[A-Z_][A-Z0-9_]*$/i`);
        }
        if (FORBIDDEN_ENV_KEYS.has(key.toUpperCase())) {
          throw new Error(`Service "${svc.name}": env key "${key}" is forbidden (security risk)`);
        }
        env[`FORGE_ENV_${svc.name.toUpperCase()}_${key}`] = value;
      }
    }

    const worker = cluster.fork(env);
    this._attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex);

    this.workerMap.set(worker.id, {
      groupName,
      services: serviceNames,
      workerId: workerIndex,
    });

    // H-5: Initialize heartbeat timestamp so the monitor doesn't kill
    // workers forked after startup (restarts, scale-up)
    this._lastHeartbeat.set(worker.id, Date.now());

    const workers = this.groupWorkers.get(groupName) ?? [];
    workers.push(worker.id);
    this.groupWorkers.set(groupName, workers);

    // Register with message bus — using service names, not group name
    // so IPC addressing is still by service name
    for (const svcName of serviceNames) {
      this.messageBus.registerWorker(svcName, worker, "cluster");
    }

    worker.on("message", (msg) => this._handleWorkerMessage(worker, msg));

    return worker;
  }

  _attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex) {
    if (!worker || this._workerErrorGuards.has(worker)) return;
    this._workerErrorGuards.add(worker);
    worker.on("error", (err) => {
      if (isExpectedWorkerIpcError(err)) return;
      if (this._shuttingDown) return;
      console.error(
        ` ⚠ Worker ${groupName}[${workerIndex}] (${serviceNames.join("+")}) IPC error: ${err?.message ?? err}`,
      );
    });
  }

  _sendWorkerMessage(worker, message, label = "worker message") {
    if (!worker) return false;
    if (typeof worker.isDead === "function" && worker.isDead()) return false;
    if (typeof worker.isConnected === "function" && !worker.isConnected()) return false;
    if (worker.process?.connected === false) return false;

    try {
      worker.send(message);
      return true;
    } catch (err) {
      if (!isExpectedWorkerIpcError(err)) {
        console.error(` ⚠ Failed to send ${label}: ${err?.message ?? err}`);
      }
      return false;
    }
  }

  _handleWorkerMessage(worker, msg) {
    if (msg?.type === "forge:group-ready") {
      // O1: Mark worker as ready for the /health/ready readiness probe
      this._workersReady.set(worker.id, true);

      const info = this.workerMap.get(worker.id);
      if (!info) return;

      const groupName = info.groupName;
      const readySet = this._groupReadyWorkers.get(groupName) ?? new Set();
      readySet.add(worker.id);
      this._groupReadyWorkers.set(groupName, readySet);

      const expected = this.allocation.get(groupName) ?? 1;
      if (readySet.size >= expected && !this._groupReadyLogged.has(groupName)) {
        this._groupReadyLogged.add(groupName);
        const group = this.groups[groupName];
        const edgeService = group?.services?.find((s) => s.type === "edge");
        const portLabel = edgeService?.port ? ` on port ${edgeService.port}` : "";
        const svcLabel = group?.services?.map((s) => s.name).join(", ") ?? groupName;
        console.log(` ✓ ${svcLabel}: ${expected} workers ready${portLabel}`);

        // H-5: Start heartbeat monitor once all groups are ready
        if (!this._heartbeatInterval) {
          const allReady = [...this._groupReadyWorkers.entries()].every(
            ([gn, set]) => set.size >= (this.allocation.get(gn) ?? 1)
          );
          if (allReady) {
            this._startHeartbeatMonitor();
          }
        }
      }
      return;
    }

    if (msg?.type === "forge:fatal-error") {
      const info = this.workerMap.get(worker.id);
      const groupName = info?.groupName ?? "unknown";
      const workerId = info?.workerId ?? "?";
      console.error(` ✖ Worker ${groupName}[${workerId}] fatal error: ${msg.error} - ${msg.message}`);
      if (msg.port) {
        console.error(` ✖ Failed to bind to port ${msg.port}. Check permissions or port availability.`);
      }
      return;
    }

    // H-5: Track heartbeat responses from workers
    if (msg?.type === "forge:heartbeat-response") {
      this._lastHeartbeat.set(worker.id, Date.now());
      return;
    }

    if (msg?.type === "forge:metrics-snapshot-response" && msg.requestId) {
      const pending = this._pendingMetricsSnapshots.get(msg.requestId);
      if (!pending) return;

      if (typeof msg.metrics === "string" && msg.metrics.trim().length > 0) {
        pending.chunks.push(msg.metrics);
      }
      if (msg.error) {
        pending.chunks.push(`# Worker ${worker.id} metrics error: ${msg.error}`);
      }

      pending.expected.delete(worker.id);
      if (pending.expected.size === 0) {
        pending.finish();
      }
    }
  }

  _mergePrometheusExpositions(expositions) {
    const lines = [];
    const seenMeta = new Set();

    for (const chunk of expositions) {
      if (typeof chunk !== "string") continue;

      for (const rawLine of chunk.split(/\r?\n/)) {
        const line = rawLine.trimEnd();
        if (!line) continue;

        if (line.startsWith("# HELP ") || line.startsWith("# TYPE ")) {
          if (seenMeta.has(line)) continue;
          seenMeta.add(line);
        }

        lines.push(line);
      }
    }

    if (lines.length === 0) {
      return "# No worker metrics available\n";
    }

    return `${lines.join("\n")}\n`;
  }

  _collectMetricsSnapshot(timeoutMs = 1000) {
    const activeWorkers = Object.values(cluster.workers).filter((worker) => worker && !worker.isDead());
    if (activeWorkers.length === 0) {
      return Promise.resolve("# No worker metrics available\n");
    }

    const requestId = `metrics-${process.pid}-${Date.now()}-${++this._metricsRequestSeq}`;

    return new Promise((resolve) => {
      const expected = new Set(activeWorkers.map((worker) => worker.id));
      const chunks = [];
      let finished = false;

      const finish = () => {
        if (finished) return;
        finished = true;
        const pending = this._pendingMetricsSnapshots.get(requestId);
        if (pending?.timer) clearTimeout(pending.timer);
        this._pendingMetricsSnapshots.delete(requestId);
        resolve(this._mergePrometheusExpositions(chunks));
      };

      const timer = setTimeout(finish, timeoutMs);
      if (typeof timer.unref === "function") timer.unref();

      this._pendingMetricsSnapshots.set(requestId, { expected, chunks, timer, finish });

      for (const worker of activeWorkers) {
        const sent = this._sendWorkerMessage(worker, { type: "forge:metrics-snapshot", requestId }, "metrics snapshot request");
        if (!sent) expected.delete(worker.id);
      }

      if (expected.size === 0) {
        finish();
      }
    });
  }

  _handleWorkerExit(worker, code, signal) {
    const info = this.workerMap.get(worker.id);
    if (!info) return;

    const { groupName, services, workerId } = info;

    // CR-1: Find the worker's slot index in the group before removing it
    const workers = this.groupWorkers.get(groupName) ?? [];
    const workerSlotIndex = workers.indexOf(worker.id);

    // CR-2: Always perform cleanup even during shutdown — only skip restart/fork logic
    this.workerMap.delete(worker.id);
    this._groupReadyWorkers.get(groupName)?.delete(worker.id);
    this._lastHeartbeat.delete(worker.id);
    this._workersReady.delete(worker.id);

    // RT-H3: Clear any pending SIGKILL timer for this worker
    const killTimer = this._killTimers.get(worker.id);
    if (killTimer) {
      clearTimeout(killTimer);
      this._killTimers.delete(worker.id);
    }

    if (workerSlotIndex !== -1) workers.splice(workerSlotIndex, 1);

    // Unregister from message bus
    for (let i = 0; i < services.length; i++) {
      const svcName = services[i];
      this.messageBus.unregisterWorker(svcName, worker.id, {
        suppressBroadcast: i < services.length - 1,
      });
    }

    // CR-2: During shutdown, only do cleanup (above) — skip restart/fork logic
    if (this._shuttingDown) return;

    // If this worker was intentionally removed during scale-down, don't restart
    if (this._scalingDown.has(worker.id)) {
      this._scalingDown.delete(worker.id);
      // MED-4: Clean up restart history for removed worker
      const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
      this._restartHistory.delete(cooldownKey);
      console.log(` ↓ Worker ${groupName}[${workerId}] (${services.join("+")}) removed (scale-down)`);
      return;
    }

    // Exit code 100 indicates fatal configuration error (e.g., EPERM on port bind)
    // Don't restart — log clear message and stop
    if (code === 100) {
      console.error(` ✖ Worker ${groupName}[${workerId}] (${services.join("+")}) failed with fatal error — not restarting`);
      console.error(` ✖ Check worker logs above for details (likely port permission issue)`);
      // Clean up restart history to prevent future attempts
      const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
      this._restartHistory.delete(cooldownKey);
      this._pendingRestarts.delete(cooldownKey);
      return;
    }

    const reason = signal ? `signal ${signal}` : `code ${code}`;

    // L5: Rate-limit restart warnings per group
    const now = Date.now();
    const lastWarning = this._restartWarningTimes.get(groupName) ?? 0;
    if (now - lastWarning >= RESTART_WARNING_INTERVAL_MS) {
      console.error(` ⚠ Worker ${groupName}[${workerId}] (${services.join("+")}) exited: ${reason}`);
      this._restartWarningTimes.set(groupName, now);
    }

    // CR-1: Key by worker slot index (not cluster worker.id) so restart history persists across restarts
    const cooldownKey = `${groupName}:slot${workerSlotIndex}`;
    const history = this._restartHistory.get(cooldownKey) ?? { count: 0, firstRestart: now, lastRestart: 0 };

    // Reset counter if outside the restart window
    if (now - history.firstRestart > RESTART_WINDOW_MS) {
      history.count = 0;
      history.firstRestart = now;
    }

    if (history.count >= MAX_RESTARTS_PER_WINDOW) {
      console.error(` ⚠ ${groupName}[${workerId}] exceeded max restarts (${MAX_RESTARTS_PER_WINDOW} in ${RESTART_WINDOW_MS / 60000}min), not restarting`);
      this._restartHistory.delete(cooldownKey);
      return;
    }

    // Exponential backoff with constants
    const backoffMs = Math.min(RESTART_BASE_BACKOFF_MS * 2 ** history.count, RESTART_MAX_BACKOFF_MS);
    const timeSinceLast = now - history.lastRestart;

    if (timeSinceLast < backoffMs) {
      const remaining = backoffMs - timeSinceLast;
      console.log(` ↻ Delaying restart for ${groupName}[${workerId}] (${remaining}ms remaining in backoff)`);

      // Cancel any existing pending restart for this slot
      const existingTimer = this._pendingRestarts.get(cooldownKey);
      if (existingTimer) clearTimeout(existingTimer);

      const timer = setTimeout(() => {
        this._pendingRestarts.delete(cooldownKey);
        if (this._shuttingDown) return;
        if (!this.groups[groupName]) return;

        history.count++;
        history.lastRestart = Date.now();
        this._restartHistory.set(cooldownKey, history);

        console.log(
          ` ↻ Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`,
        );
        // O15: Track worker restart metric
        this._workerRestartCount++;
        this._forkGroupWorker(groupName, this.groups[groupName], workerId);
      }, remaining);

      timer.unref();
      this._pendingRestarts.set(cooldownKey, timer);
      return;
    }

    if (!this.groups[groupName]) return;

    history.count++;
    history.lastRestart = now;
    this._restartHistory.set(cooldownKey, history);

    console.log(` ↻ Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`);
    // O15: Track worker restart metric
    this._workerRestartCount++;
    if (this._shuttingDown) return;
    this._forkGroupWorker(groupName, this.groups[groupName], workerId);
  }

  _startupPortsToCheck() {
    const targets = [];
    const seen = new Set();

    for (const [name, svc] of Object.entries(this.services)) {
      if (svc?.type !== "edge") continue;
      if (!Number.isInteger(svc.port) || svc.port <= 0) continue;
      if (seen.has(svc.port)) continue;
      seen.add(svc.port);
      targets.push({ port: svc.port, purpose: `service "${name}"` });
    }

    return targets;
  }

  _isPortAvailable(port, host = "127.0.0.1") {
    if (!Number.isInteger(port) || port <= 0) return Promise.resolve(true);

    return new Promise((resolve) => {
      const probe = createNetServer();
      let settled = false;

      const finish = (available) => {
        if (settled) return;
        settled = true;
        if (available) {
          probe.close(() => resolve(true));
        } else {
          resolve(false);
        }
      };

      probe.once("error", (err) => {
        if (err.code === "EADDRINUSE" || err.code === "EACCES" || err.code === "EPERM") {
          finish(false);
          return;
        }
        finish(false);
      });

      probe.once("listening", () => finish(true));
      probe.listen(port, host);
    });
  }

  async _assertStartupPortsAvailable() {
    const targets = this._startupPortsToCheck();
    for (const { port, purpose } of targets) {
      const available = await this._isPortAvailable(port);
      if (!available) {
        throw new Error(
          `Startup preflight failed: port ${port} (${purpose}) is unavailable ` +
          `(already in use or permission denied).`,
        );
      }
    }
  }

  async scale(groupName, newCount) {
    const group = this.groups[groupName];
    if (!group) throw new Error(`Unknown group: ${groupName}`);

    // M-2: Bounds checking for newCount
    if (newCount < 1 || newCount > 64) {
      throw new Error(`Invalid worker count ${newCount} for group "${groupName}": must be between 1 and 64`);
    }

    const currentIds = this.groupWorkers.get(groupName) ?? [];
    const currentCount = currentIds.length;

    if (newCount === currentCount) return;

    if (newCount > currentCount) {
      const toAdd = newCount - currentCount;
      console.log(` ↑ Scaling ${groupName} ${currentCount} → ${newCount} (+${toAdd})`);
      // H-3: Reset groupReadyLogged so ready message is logged again for new workers
      this._groupReadyLogged.delete(groupName);
      // Use a monotonic counter to avoid index collisions after scale-down + scale-up
      if (this._nextWorkerIndex[groupName] === undefined) {
        this._nextWorkerIndex[groupName] = currentCount;
      }
      for (let i = 0; i < toAdd; i++) {
        const workerIndex = this._nextWorkerIndex[groupName]++;
        // Clear any stale restart history so new workers don't inherit crash counts
        this._restartHistory.delete(`${groupName}:${workerIndex}`);
        this._forkGroupWorker(groupName, group, workerIndex);
      }
    } else {
      const toRemove = currentCount - newCount;
      console.log(` ↓ Scaling ${groupName} ${currentCount} → ${newCount} (-${toRemove})`);
      for (let i = 0; i < toRemove; i++) {
        const wid = currentIds[currentIds.length - 1 - i];
        this._scalingDown.add(wid);
        const worker = cluster.workers[wid];
        if (worker) {
          worker.process.kill("SIGTERM");
          // RT-H3: Force SIGKILL after 10s if worker hasn't exited
          const killTimer = setTimeout(() => {
            this._killTimers.delete(wid);
            try {
              if (!worker.isDead()) {
                console.error(` ⚠ Worker ${wid} did not exit after SIGTERM, sending SIGKILL`);
                worker.process.kill("SIGKILL");
              }
            } catch {}
          }, 10_000);
          killTimer.unref();
          this._killTimers.set(wid, killTimer);
          // H3: Clean up kill timer if worker exits before SIGKILL fires
          worker.once('exit', () => {
            const t = this._killTimers.get(wid);
            if (t) {
              clearTimeout(t);
              this._killTimers.delete(wid);
            }
          });
        }
      }
    }

    this.allocation.set(groupName, newCount);
  }

  /**
   * H-5: Start heartbeat monitor — checks worker health every 30s.
   * Warns after 60s of silence, kills after 90s.
   */
  _startHeartbeatMonitor() {
    // Initialize heartbeat timestamps for all current workers
    const now = Date.now();
    for (const wid of this.workerMap.keys()) {
      this._lastHeartbeat.set(wid, now);
    }

    this._heartbeatInterval = setInterval(() => {
      if (this._shuttingDown) return;

      // Request health checks from message bus if available
      if (typeof this.messageBus.requestHealthChecks === 'function') {
        this.messageBus.requestHealthChecks();
      }

      const checkTime = Date.now();
      for (const [wid, info] of this.workerMap) {
        const lastSeen = this._lastHeartbeat.get(wid) ?? 0;
        const elapsed = checkTime - lastSeen;

        if (elapsed > 90_000) {
          // 90s without response — kill the worker
          console.error(` ✖ Worker ${info.groupName}[${info.workerId}] unresponsive for ${Math.round(elapsed / 1000)}s — sending SIGKILL`);
          try {
            const w = cluster.workers?.[wid];
            if (w && !w.isDead()) {
              w.process.kill('SIGKILL');
            }
          } catch {}
        } else if (elapsed > 60_000) {
          // 60s without response — log a warning
          console.warn(` ⚠ Worker ${info.groupName}[${info.workerId}] no heartbeat for ${Math.round(elapsed / 1000)}s`);
        }
      }
    }, 30_000);

    this._heartbeatInterval.unref();
  }

  _stopHeartbeatMonitor() {
    if (this._heartbeatInterval) {
      clearInterval(this._heartbeatInterval);
      this._heartbeatInterval = null;
    }
  }

  async shutdown() {
    if (this._shuttingDown) return;
    this._shuttingDown = true;

    // H-5: Stop heartbeat monitor
    this._stopHeartbeatMonitor();

    // Resolve in-flight /metrics scrapes so callers don't hang during shutdown
    for (const pending of this._pendingMetricsSnapshots.values()) {
      clearTimeout(pending.timer);
      pending.finish();
    }
    this._pendingMetricsSnapshots.clear();

    // Cancel any pending delayed restarts
    for (const timer of this._pendingRestarts.values()) {
      clearTimeout(timer);
    }
    this._pendingRestarts.clear();

    // Cancel any pending SIGKILL timers from scale-down
    for (const [, timer] of this._killTimers) { clearTimeout(timer); }
    this._killTimers.clear();

    console.log("\n Shutting down ThreadForge...\n");

    // C5: Overall shutdown deadline — each phase races against remaining time
    const deadlineStart = Date.now();
    const withDeadline = (promise, label) => {
      const remaining = SHUTDOWN_DEADLINE_MS - (Date.now() - deadlineStart);
      if (remaining <= 0) {
        console.warn(` ⚠ Shutdown deadline exceeded during: ${label} — skipping`);
        return Promise.resolve();
      }
      return Promise.race([
        promise,
        new Promise((resolve) => {
          const t = setTimeout(() => {
            console.warn(` ⚠ Shutdown phase "${label}" exceeded deadline — skipping`);
            resolve();
          }, remaining);
          t.unref();
        }),
      ]);
    };

    // Close metrics server first so health checks fail during shutdown
    if (this._metricsServer) {
      await withDeadline(
        new Promise(resolve => this._metricsServer.close(resolve)),
        'metrics server close'
      );
      this._metricsServer = null;
    }

    // Step 1: Send graceful shutdown message to each worker
    for (const id of Object.keys(cluster.workers)) {
      const worker = cluster.workers[id];
      if (worker) {
        this._sendWorkerMessage(worker, { type: "forge:shutdown" }, "shutdown signal");
      }
    }

    // Step 2: Wait for workers to drain HTTP connections and exit
    // H-CORE-3: Unref timers so they don't keep the process alive after workers exit
    await withDeadline(new Promise((resolve) => {
      const check = setInterval(() => {
        const alive = Object.keys(cluster.workers).filter((id) => {
          const w = cluster.workers[id];
          return w && !w.isDead();
        });
        if (alive.length === 0) {
          clearInterval(check);
          resolve();
        }
      }, 200);
      check.unref();
      const fallback = setTimeout(() => {
        clearInterval(check);
        resolve();
      }, 10000);
      fallback.unref();
    }), 'graceful drain');

    // Collect all worker PIDs before disconnect (workers may leave cluster.workers after disconnect)
    const workerPids = new Set();
    for (const id of Object.keys(cluster.workers)) {
      const w = cluster.workers[id];
      if (w?.process?.pid) workerPids.add(w.process.pid);
    }

    // Step 3: Disconnect remaining workers
    for (const id of Object.keys(cluster.workers)) {
      const worker = cluster.workers[id];
      if (worker && !worker.isDead()) {
        try {
          worker.disconnect();
        } catch {
          // Worker may already be dead
        }
      }
    }

    // Step 4: Wait for disconnect to complete
    await withDeadline(new Promise((resolve) => {
      const check = setInterval(() => {
        const alive = Object.keys(cluster.workers).filter((id) => {
          const w = cluster.workers[id];
          return w && !w.isDead();
        });
        if (alive.length === 0) {
          clearInterval(check);
          resolve();
        }
      }, 200);
      check.unref();
      const fallback = setTimeout(() => {
        clearInterval(check);
        resolve();
      }, 5000);
      fallback.unref();
    }), 'disconnect');

    // Step 5: Force kill any remaining workers
    for (const id of Object.keys(cluster.workers)) {
      const worker = cluster.workers[id];
      if (worker && !worker.isDead()) {
        console.error(` ⚠ Forcefully killing worker ${id}...`);
        worker.process.kill("SIGKILL");
      }
    }

    // Also kill workers that disconnected from cluster but may still be alive
    for (const pid of workerPids) {
      try { process.kill(pid, 0); process.kill(pid, "SIGKILL"); } catch {}
    }

    // C3: Clean up temp endpoint file if created
    if (this._endpointTempFile) {
      try { fs.unlinkSync(this._endpointTempFile); } catch {}
      this._endpointTempFile = null;
    }
    if (this._sitesTempFile) {
      try { fs.unlinkSync(this._sitesTempFile); } catch {}
      this._sitesTempFile = null;
    }

    // H4: Clean up PID file
    if (this._pidFilePath) {
      try { fs.unlinkSync(this._pidFilePath); } catch {}
      this._pidFilePath = null;
    }

    console.log(" All workers stopped. Goodbye.\n");
    this.messageBus.cleanup();
    this.scaleAdvisor.stop();
    // O14: Add deadline to registry.stop() to prevent hanging
    try {
      await Promise.race([
        this.registry.stop(),
        new Promise(resolve => setTimeout(resolve, 5000)),
      ]);
    } catch (err) {
      console.error(` ⚠ Registry stop failed: ${err.message}`);
    }
    // Let the caller decide whether to exit — don't force process.exit here
    // so tests and CLI wrappers can run post-shutdown cleanup
  }
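
  // Example scrape of the metrics server started below (illustrative; the port,
  // bind address and token come from your own config and environment): when
  // FORGE_METRICS_TOKEN is set, the sensitive endpoints require a Bearer header,
  //   curl -H "Authorization: Bearer $FORGE_METRICS_TOKEN" http://127.0.0.1:9090/metrics
  // while /health, /health/ready and /health/live remain unauthenticated.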
  async _startMetricsServer() {
    // Allow metricsPort: null or false to disable metrics entirely
    if (this.config.metricsPort === null || this.config.metricsPort === false) {
      console.log(` 📊 Metrics: disabled`);
      return;
    }

    // Safety fallback to 9090 (config layer should already provide this default)
    const port = this.config.metricsPort ?? 9090;

    return new Promise((resolve) => {
      this._metricsServer = createServer((req, res) => {
        const reqPath = new URL(req.url ?? "/", "http://localhost").pathname;

        // Let registry handle its endpoints first
        if (this.registry.httpHandler(req, res)) return;

        // S7: Auth gate for sensitive supervisor endpoints (matches worker-level FORGE_METRICS_TOKEN)
        // SEC-C2: Include registry endpoints — they expose full service topology
        const sensitiveEndpoints = ['/status', '/metrics', '/scaling', '/_forge/topology', '/_forge/resolve'];
        if (sensitiveEndpoints.includes(reqPath)) {
          const metricsToken = process.env.FORGE_METRICS_TOKEN;
          if (metricsToken) {
            const auth = req.headers['authorization'] ?? '';
            const expected = `Bearer ${metricsToken}`;
            if (auth.length !== expected.length ||
                !timingSafeEqual(Buffer.from(auth), Buffer.from(expected))) {
              res.writeHead(401, { "Content-Type": "application/json" });
              res.end(JSON.stringify({ error: "Unauthorized" }));
              return;
            }
          }
        }

        if (reqPath === "/status") {
          res.writeHead(200, { "Content-Type": "application/json" });
          res.end(JSON.stringify(this._status(), null, 2));
        } else if (reqPath === "/metrics") {
          this._collectMetricsSnapshot()
            .then((payload) => {
              // O15: Prepend supervisor-level restart counter
              const supervisorMetrics =
                `# HELP forge_worker_restarts_total Total number of worker restarts\n` +
                `# TYPE forge_worker_restarts_total counter\n` +
                `forge_worker_restarts_total ${this._workerRestartCount}\n`;
              res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4; charset=utf-8" });
              res.end(supervisorMetrics + payload);
            })
            .catch((err) => {
              res.writeHead(500, { "Content-Type": "text/plain; charset=utf-8" });
              res.end(`# metrics collection failed: ${err.message}\n`);
            });
          return;
        } else if (reqPath === "/health" || reqPath === "/health/ready") {
          // O1: Readiness probe — 200 only when ALL workers have reported ready
          const totalWorkers = this.workerMap.size;
          const readyWorkers = [...this._workersReady.values()].filter(Boolean).length;
          if (totalWorkers > 0 && readyWorkers >= totalWorkers) {
            res.writeHead(200, { "Content-Type": "application/json" });
            res.end(JSON.stringify({ status: "ready", ready: readyWorkers, total: totalWorkers }));
          } else {
            res.writeHead(503, { "Content-Type": "application/json" });
            res.end(JSON.stringify({ status: "starting", ready: readyWorkers, total: totalWorkers }));
          }
        } else if (reqPath === "/health/live") {
          // O1: Liveness probe — always 200 if process is running
          res.writeHead(200, { "Content-Type": "text/plain" });
          res.end("ok");
        } else if (reqPath === "/scaling") {
          res.writeHead(200, { "Content-Type": "text/plain" });
          res.end(this.scaleAdvisor.report());
        } else {
          res.writeHead(404);
          res.end("Not found");
        }
      });

      this._metricsServer.on("error", (err) => {
        // Enhanced error message with actionable guidance
        console.warn(` ⚠ Metrics server failed to bind port ${port}: ${err.message}`);
        console.warn(` To fix: Set metricsPort to a different port in your config, or set metricsPort: null to disable metrics.`);
        console.warn(` Example: defineServices(services, { metricsPort: 9091 }) or { metricsPort: null }`);
        this._metricsServer = null;
        resolve(); // non-fatal — supervisor continues without metrics
      });

      // Set timeouts to prevent slowloris attacks
      this._metricsServer.timeout = 5000;
      this._metricsServer.requestTimeout = 5000;
      this._metricsServer.headersTimeout = 3000;

      // RT-H4: Bind to localhost only — metrics endpoint has no auth
      // C2: Allow override via FORGE_METRICS_BIND for containers (e.g. 0.0.0.0)
      const bindAddr = process.env.FORGE_METRICS_BIND || "127.0.0.1";
      // SEC-C2: Warn when metrics are exposed without auth
      if (bindAddr !== "127.0.0.1" && bindAddr !== "::1" && !process.env.FORGE_METRICS_TOKEN) {
        console.warn(` ⚠ Metrics server binding to ${bindAddr} without FORGE_METRICS_TOKEN — topology and metrics are publicly accessible`);
        console.warn(` Set FORGE_METRICS_TOKEN=<secret> to require Bearer auth on sensitive endpoints`);
      }
      this._metricsServer.listen(port, bindAddr, () => {
        console.log(` 📊 Metrics: http://${bindAddr}:${port}/status (Prometheus: /metrics)`);
        resolve();
      });
    });
  }

  _status() {
    const groups = [];
    for (const [groupName, workerIds] of this.groupWorkers) {
      const group = this.groups[groupName];
      const pids = workerIds.map((wid) => cluster.workers[wid]?.process?.pid).filter(Boolean);

      groups.push({
        group: groupName,
        services: group.services.map((s) => ({
          name: s.name,
          type: s.type,
          port: s.port,
        })),
        workers: workerIds.length,
        pids,
      });
    }

    return {
      supervisorPid: process.pid,
      uptime: process.uptime(),
      totalCpus: this.allocator.totalCpus,
      nodeId: this.registry.nodeId,
      host: this.registry.host,
      registryMode: this.registry.mode,
      processGroups: groups,
      channels: this.channels,
      totalProcesses: Object.keys(cluster.workers).length,
      totalServices: Object.keys(this.services).length,
      remoteServices: Object.values(this.services).filter((s) => s.type === "remote").length,
      portsUsed: Object.values(this.services)
        .filter((s) => s.port)
        .map((s) => s.port),
      messageBus: this.messageBus.stats(),
      topology: this.registry.topology(),
      scalingRecommendations: this.scaleAdvisor.recommendations,
    };
  }

  /**
   * Build endpoint map for workers.
   *
   * Maps each service to { host, port, remote } or an array of endpoints
   * for multi-instance services. Remote services get their address parsed
   * into host/port. Local services get host: '127.0.0.1'.
   */
  _buildEndpointMap() {
    const endpoints = {};

    for (const [name, svc] of Object.entries(this.services)) {
      if (svc.type === "remote") {
        // Parse address: "http://host:port" or "host:port"
        const parsed = this._parseAddress(svc.address, name);
        if (parsed) {
          endpoints[name] = { host: parsed.host, port: parsed.port, remote: true };
        }
      } else if (svc.port) {
        endpoints[name] = { host: "127.0.0.1", port: svc.port, remote: false };
      } else if (svc.type === "internal" || svc.type === "background") {
        // Include internal/background services so workers can use DirectMessageBus
        // for cross-group calls instead of falling back to supervisor IPC
        const groupName = svc.group ?? `_isolated:${name}`;
        const socketPath = this.messageBus.getSocketPath?.(name);
        endpoints[name] = { host: "127.0.0.1", remote: false, uds: socketPath ?? null, group: groupName };
      }
    }

    return endpoints;
  }

  /**
   * Parse a service address string into host/port.
   * Supports: "http://host:port", "host:port", and other URL schemes
   */
  _parseAddress(address, serviceName) {
    if (!address) return null;

    try {
      // Try as URL first (handles http://, https://, etc.)
      if (address.includes("://")) {
        const url = new URL(address);
        return {
          host: url.hostname,
          port: parseInt(url.port, 10) || (url.protocol === "https:" ? 443 : 80),
        };
      }

      // Plain host:port
      const [host, portStr] = address.split(":");
      const port = parseInt(portStr, 10);
      if (host && port) return { host, port };
    } catch (err) {
      console.warn(` ⚠ Failed to parse address "${address}" for service "${serviceName ?? "unknown"}": ${err.message}`);
    }

    console.warn(` ⚠ Invalid address "${address}" for service "${serviceName ?? "unknown"}" — skipping`);
    return null;
  }

  _printAllocation() {
    console.log("");
    console.log(" ┌──────────────────┬───────────────────────────┬─────────┬────────┐");
    console.log(" │ Process Group │ Services │ Workers │ Port │");
    console.log(" ├──────────────────┼───────────────────────────┼─────────┼────────┤");

    for (const [groupName, group] of Object.entries(this.groups)) {
      const name = groupName.replace("_isolated:", "").padEnd(16);
      const svcList = group.services
        .map((s) => {
          const badge = s.type === "edge" ? "⚡" : s.type === "background" ? "⏰" : "○";
          return `${badge} ${s.name}`;
        })
        .join(", ");
      const svcs = svcList.substring(0, 25).padEnd(25);
      const threads = String(this.allocation.get(groupName) ?? 1).padEnd(7);
      const port = group.port ? String(group.port).padEnd(6) : " — ";
      console.log(` │ ${name} │ ${svcs} │ ${threads} │ ${port} │`);
    }

    console.log(" └──────────────────┴───────────────────────────┴─────────┴────────┘");

    let totalProcesses = 0;
    for (const [, count] of this.allocation) totalProcesses += count;
    const edgePorts = Object.values(this.services).filter((s) => s.port).length;

    console.log(
      ` Processes: ${totalProcesses} | Services: ${Object.keys(this.services).length} | Ports: ${edgePorts} | CPUs: ${this.allocator.totalCpus}`,
    );
  }

  _printTopology() {
    if (this.channels.length === 0) return;

    console.log("");
    console.log(" Channels:");
    for (const ch of this.channels) {
      console.log(` ${ch.from} ↔ ${ch.to}`);
    }
    console.log(` Total: ${this.channels.length} channels (dependency-based)`);
  }

  _banner() {
    let version = "0.0.0";
    try {
      const pkg = JSON.parse(fs.readFileSync(new URL("../../package.json", import.meta.url), "utf8"));
      version = pkg.version;
    } catch {}
    return `
╔════════════════════════════════════╗
║ ⚡ ThreadForge v${version.padEnd(12)}║
║ Multi-threaded Service Runtime ║
╚════════════════════════════════════╝`;
  }
}
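
For orientation, a minimal sketch of how this class is driven is shown below. The config shape (services, groups, channels, plugins, metricsPort) and the options object are inferred from what the constructor and start() read in the file above; in the published package this wiring is normally produced by the CLI and config layer (bin/forge.js, src/core/config.js), which are not part of this hunk, so the field values and the import path here are illustrative only.

    import { Supervisor } from "threadforge/src/core/Supervisor.js"; // illustrative import path

    // Hypothetical config: field names follow what the constructor and start() read.
    const config = {
      services: {
        api:  { type: "edge", port: 3000, entry: "./services/api.js", group: "web" },
        jobs: { type: "background", entry: "./services/jobs.js", group: "web" },
      },
      groups: {
        web: {
          port: 3000,
          threads: 0, // 0 is treated as "auto" by _allocateGroups
          weight: 1,
          services: [
            { name: "api", type: "edge", port: 3000, entry: "./services/api.js", env: {}, plugins: null },
            { name: "jobs", type: "background", entry: "./services/jobs.js", env: {}, plugins: null },
          ],
        },
      },
      channels: [{ from: "api", to: "jobs" }],
      plugins: [],
      metricsPort: 9090,
    };

    const supervisor = new Supervisor(config, { registryMode: "embedded" });
    await supervisor.start();                 // preflights ports, then forks one worker per allocated thread
    // later: await supervisor.scale("web", 4); // grow the "web" group to 4 workers
    // SIGTERM / SIGINT trigger supervisor.shutdown() automatically (handlers registered in start()).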