threadforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +152 -0
  3. package/bin/forge.js +1050 -0
  4. package/bin/host-commands.js +344 -0
  5. package/bin/platform-commands.js +570 -0
  6. package/package.json +71 -0
  7. package/shared/auth.js +475 -0
  8. package/src/core/DirectMessageBus.js +364 -0
  9. package/src/core/EndpointResolver.js +247 -0
  10. package/src/core/ForgeContext.js +2227 -0
  11. package/src/core/ForgeHost.js +122 -0
  12. package/src/core/ForgePlatform.js +145 -0
  13. package/src/core/Ingress.js +768 -0
  14. package/src/core/Interceptors.js +420 -0
  15. package/src/core/MessageBus.js +310 -0
  16. package/src/core/Prometheus.js +305 -0
  17. package/src/core/RequestContext.js +413 -0
  18. package/src/core/RoutingStrategy.js +316 -0
  19. package/src/core/Supervisor.js +1306 -0
  20. package/src/core/ThreadAllocator.js +196 -0
  21. package/src/core/WorkerChannelManager.js +879 -0
  22. package/src/core/config.js +624 -0
  23. package/src/core/host-config.js +311 -0
  24. package/src/core/network-utils.js +166 -0
  25. package/src/core/platform-config.js +308 -0
  26. package/src/decorators/ServiceProxy.js +899 -0
  27. package/src/decorators/index.js +571 -0
  28. package/src/deploy/NginxGenerator.js +865 -0
  29. package/src/deploy/PlatformManifestGenerator.js +96 -0
  30. package/src/deploy/RouteManifestGenerator.js +112 -0
  31. package/src/deploy/index.js +984 -0
  32. package/src/frontend/FrontendDevLifecycle.js +65 -0
  33. package/src/frontend/FrontendPluginOrchestrator.js +187 -0
  34. package/src/frontend/SiteResolver.js +63 -0
  35. package/src/frontend/StaticMountRegistry.js +90 -0
  36. package/src/frontend/index.js +5 -0
  37. package/src/frontend/plugins/index.js +2 -0
  38. package/src/frontend/plugins/viteFrontend.js +79 -0
  39. package/src/frontend/types.js +35 -0
  40. package/src/index.js +56 -0
  41. package/src/internals.js +31 -0
  42. package/src/plugins/PluginManager.js +537 -0
  43. package/src/plugins/ScopedPostgres.js +192 -0
  44. package/src/plugins/ScopedRedis.js +142 -0
  45. package/src/plugins/index.js +1729 -0
  46. package/src/registry/ServiceRegistry.js +796 -0
  47. package/src/scaling/ScaleAdvisor.js +442 -0
  48. package/src/services/Service.js +195 -0
  49. package/src/services/worker-bootstrap.js +676 -0
  50. package/src/templates/auth-service.js +65 -0
  51. package/src/templates/identity-service.js +75 -0
@@ -0,0 +1,879 @@
1
+ import net from "node:net";
2
+ import fs from "node:fs";
3
+ import path from "node:path";
4
+ import { randomBytes, randomUUID, createHmac, timingSafeEqual } from "node:crypto";
5
+
6
+ /**
7
+ * WorkerChannelManager — Unix Domain Socket Mesh (Worker Side)
8
+ *
9
+ * Each worker runs a UDS server. When it receives the socket registry
10
+ * from the supervisor, it connects directly to other workers' UDS servers.
11
+ *
12
+ * Protocol: Length-prefixed JSON frames
13
+ * [4 bytes: message length (UInt32BE)][JSON payload]
14
+ *
15
+ * This gives us:
16
+ * - Direct worker-to-worker communication (no supervisor in the path)
17
+ * - Full duplex (both sides can send at any time)
18
+ * - No serialization bottleneck at the supervisor
19
+ * - Automatic reconnection on socket errors
20
+ */
21
/** Maximum UDS message size: 16 MB */
const MAX_UDS_MESSAGE = 16 * 1024 * 1024;

/** Maximum number of pending requests before rejecting new ones */
const MAX_PENDING_REQUESTS = 10000;

/** Maximum socket write buffer size before refusing new writes (4 MB) */
const MAX_WRITE_BUFFER = 4 * 1024 * 1024;

/** Maximum parse errors before destroying a socket */
const MAX_PARSE_ERRORS = 3;

/** Handshake timeout for server-side inbound connections (ms) */
const HANDSHAKE_TIMEOUT = 5000;

/** Maximum reconnection attempts before giving up on a peer */
const MAX_RECONNECT_ATTEMPTS = 10;


/**
 * Create a reusable frame parser for length-prefixed JSON frames.
 * Encapsulates the buffer-list pattern to avoid O(n^2) Buffer.concat on every chunk.
 *
 * Frame format: [4 bytes: message length (UInt32BE)][JSON payload].
 *
 * All protocol violations (oversized chunk, buffer overflow, oversized frame,
 * malformed JSON) count toward a shared `parseErrorCount`; once it reaches
 * MAX_PARSE_ERRORS the socket is destroyed and internal buffers are released.
 *
 * @param {function(object): void} onFrame - called with each parsed JSON message
 * @param {string} serviceName - for error logging
 * @param {net.Socket} socket - socket to destroy on protocol errors
 * @returns {function(Buffer): void} data event handler
 */
function createFrameParser(onFrame, serviceName, socket) {
  const chunks = [];
  let totalLength = 0;
  let buffer = Buffer.alloc(0);
  let parseErrorCount = 0;

  return (chunk) => {
    // Reject oversized individual chunks before any allocation
    if (chunk.length > MAX_UDS_MESSAGE + 4) {
      parseErrorCount++;
      if (parseErrorCount >= MAX_PARSE_ERRORS) {
        console.error(`[${serviceName}] UDS chunk too large (${chunk.length} bytes), ${parseErrorCount} parse errors — destroying connection`);
        chunks.length = 0;
        totalLength = 0;
        buffer = Buffer.alloc(0);
        socket.destroy();
      } else {
        console.warn(`[${serviceName}] UDS chunk too large (${chunk.length} bytes), skipping (error ${parseErrorCount}/${MAX_PARSE_ERRORS})`);
      }
      return;
    }

    // Guard against buffer overflow (buffer + pending chunks + new chunk)
    if (buffer.length + totalLength + chunk.length > MAX_UDS_MESSAGE + 4) {
      parseErrorCount++;
      if (parseErrorCount >= MAX_PARSE_ERRORS) {
        console.error(`[${serviceName}] UDS buffer overflow detected, ${parseErrorCount} parse errors — destroying connection`);
        chunks.length = 0;
        totalLength = 0;
        buffer = Buffer.alloc(0);
        socket.destroy();
      } else {
        console.warn(`[${serviceName}] UDS buffer overflow detected, skipping frame (error ${parseErrorCount}/${MAX_PARSE_ERRORS})`);
        // Reset buffer to recover (best-effort: the stream may be desynced after this)
        chunks.length = 0;
        totalLength = 0;
        buffer = Buffer.alloc(0);
      }
      return;
    }

    chunks.push(chunk);
    totalLength += chunk.length;

    // Only concatenate when we might have a complete frame header
    if (buffer.length + totalLength < 4) return;

    // Merge chunks into buffer for frame parsing
    if (chunks.length > 0) {
      buffer = Buffer.concat([buffer, ...chunks]);
      chunks.length = 0;
      totalLength = 0;
    }

    // Parse length-prefixed frames
    while (buffer.length >= 4) {
      const msgLen = buffer.readUInt32BE(0);

      if (msgLen > MAX_UDS_MESSAGE) {
        parseErrorCount++;
        if (parseErrorCount >= MAX_PARSE_ERRORS) {
          console.error(`[${serviceName}] UDS message too large (${msgLen} bytes), ${parseErrorCount} parse errors — destroying socket`);
          buffer = Buffer.alloc(0);
          socket.destroy();
          return;
        }
        console.warn(`[${serviceName}] UDS message too large (${msgLen} bytes), skipping frame (error ${parseErrorCount}/${MAX_PARSE_ERRORS})`);
        // Skip past the 4-byte length prefix to try to recover
        buffer = buffer.subarray(4);
        continue;
      }

      if (buffer.length < 4 + msgLen) break; // wait for more data

      const msgBuf = buffer.subarray(4, 4 + msgLen);
      buffer = buffer.subarray(4 + msgLen);

      try {
        // COR-C2: Reviver prevents prototype pollution via __proto__/constructor/prototype keys
        const msg = JSON.parse(msgBuf.toString(), (key, value) => {
          if (key === '__proto__' || key === 'constructor' || key === 'prototype') return undefined;
          return value;
        });
        onFrame(msg);
      } catch (err) {
        // BUGFIX: malformed JSON previously only logged and never counted toward
        // MAX_PARSE_ERRORS, so a peer streaming well-framed garbage was never
        // disconnected. Count it and destroy the socket once the limit is hit,
        // consistent with every other protocol-error path above.
        parseErrorCount++;
        console.error(`[${serviceName}] Failed to parse UDS message:`, err.message);
        if (parseErrorCount >= MAX_PARSE_ERRORS) {
          console.error(`[${serviceName}] ${parseErrorCount} parse errors — destroying connection`);
          chunks.length = 0;
          totalLength = 0;
          buffer = Buffer.alloc(0);
          socket.destroy();
          return;
        }
      }
    }
  };
}
138
+
139
/**
 * Worker-side manager for the Unix-domain-socket mesh: runs this worker's UDS
 * server, dials peers from the supervisor's socket registry, and provides
 * send/broadcast/request messaging with supervisor-IPC fallback.
 */
export class WorkerChannelManager {
  /**
   * @param {string} serviceName
   * @param {number} workerId
   * @param {object} [options]
   * @param {string[]} [options.channels] - Service names this worker needs to communicate with (P1: dependency filtering)
   * @param {Function} [options.onChannelDead] - Called when a channel permanently fails reconnection
   */
  constructor(serviceName, workerId, options = {}) {
    this.serviceName = serviceName;
    this.workerId = workerId;

    /**
     * P1: Set of service names this worker is allowed to connect to.
     * If null, connects to everything (legacy behavior).
     * @type {Set<string>|null}
     */
    this._dependencies = options.channels ? new Set(options.channels) : null;

    /** @type {Function|null} Called with (key, attempts) when a channel exhausts reconnection attempts */
    this._onChannelDead = options.onChannelDead ?? null;

    /** @type {net.Server|null} */
    this._server = null;

    /** @type {string|null} */
    this._socketPath = null;

    /**
     * Outbound connections to other workers.
     * Key: "serviceName:workerId"
     * @type {Map<string, net.Socket>}
     */
    this.outbound = new Map();

    /**
     * Inbound connections from other workers.
     * Key: "serviceName:workerId" (identified via handshake)
     * @type {Map<string, net.Socket>}
     */
    this.inbound = new Map();

    /**
     * Service name → array of connection keys for round-robin.
     * @type {Map<string, string[]>}
     */
    this.serviceConnections = new Map();

    /** @type {Map<string, number>} per-target round-robin cursor */
    this.rrIndex = new Map();

    /** @type {Map<string, {resolve: Function, reject: Function, timer: NodeJS.Timeout, socketKey: string|null}>} */
    this.pendingRequests = new Map();

    // NOTE(review): incremented in request() but no longer used for request IDs
    // (IDs come from randomUUID); kept for backward compatibility.
    this.requestCounter = 0;

    /** Worker ID for unique request IDs; falls back to process.pid */
    this._workerId = workerId ?? process.pid;

    /** @type {Function|null} fire-and-forget message handler: (from, payload) */
    this.onMessage = null;

    /** @type {Function|null} request handler: (from, payload) → result | Promise */
    this.onRequest = null;

    /** @type {Function|null} - fallback to supervisor IPC */
    this._supervisorSend = null;

    /** @type {Object} current socket registry from supervisor */
    this._registry = {};

    /** @type {Map<string, number>} per-peer reconnect attempt counters */
    this._reconnectAttempts = new Map();

    /** @type {Map<string, NodeJS.Timeout>} reconnect timer refs for cleanup */
    this._reconnectTimers = new Map();

    /** P19: Reverse map from socket → key for O(1) lookup */
    this._socketKeyMap = new Map();

    /** @type {number} counter of backpressure events on send() */
    this.backpressureEvents = 0;
  }

  /**
   * Initialize — set up supervisor IPC listener.
   * @param {Function} supervisorSend - sends a message to the supervisor process
   */
  init(supervisorSend) {
    this._supervisorSend = supervisorSend;

    process.on("message", (msg) => {
      if (!msg || !msg.type) return;

      switch (msg.type) {
        case "forge:init-socket":
          this._startServer(msg.socketDir, msg.serviceName, msg.workerId);
          break;

        case "forge:socket-registry":
          this._updateRegistry(msg.registry);
          break;

        case "forge:health-check":
          supervisorSend({
            type: "forge:health-response",
            timestamp: msg.timestamp,
            uptime: process.uptime(),
            memory: process.memoryUsage(),
            pid: process.pid,
            directConnections: this.outbound.size,
          });
          break;
      }
    });
  }

  /**
   * Start our UDS server so other workers can connect to us.
   */
  _startServer(socketDir, serviceName, workerId) {
    // One channel manager exists per service in a colocated worker process.
    // Ignore init messages for sibling services; each sibling manager will
    // start its own socket server when it receives its matching message.
    if (serviceName !== this.serviceName) return;
    if (this._server) return; // already started

    this._socketPath = path.join(socketDir, `${serviceName}-${workerId}.sock`);

    this._server = net.createServer((socket) => {
      // A7: Start handshake timeout — close if no handshake within HANDSHAKE_TIMEOUT
      let handshakeCompleted = false;
      const handshakeTimer = setTimeout(() => {
        if (!handshakeCompleted) {
          console.warn(`[${this.serviceName}] Inbound handshake timeout — closing socket`);
          socket.destroy();
        }
      }, HANDSHAKE_TIMEOUT);
      handshakeTimer.unref();

      // Inbound connection from another worker — use shared frame parser
      const onData = createFrameParser(
        (msg) => {
          // A7: Clear handshake timer on handshake message
          if (msg.type === 'forge:handshake' && !handshakeCompleted) {
            handshakeCompleted = true;
            clearTimeout(handshakeTimer);
          }
          this._handleIncomingMessage(socket, msg);
        },
        this.serviceName,
        socket,
      );

      socket.on("data", onData);

      socket.on("error", () => {
        clearTimeout(handshakeTimer);
      });
      socket.on("close", () => {
        clearTimeout(handshakeTimer);
        // Remove from inbound
        for (const [key, s] of this.inbound) {
          if (s === socket) {
            this.inbound.delete(key);
            break;
          }
        }
      });
    });

    this._server.listen(this._socketPath, () => {
      // Tell supervisor we're ready
      this._supervisorSend({
        type: "forge:socket-ready",
        socketPath: this._socketPath,
        serviceName,
        workerId,
      });
    });

    this._server.on("error", (err) => {
      if (err.code === "EADDRINUSE") {
        // Stale socket file — unlink and retry once
        try {
          fs.unlinkSync(this._socketPath);
        } catch {}
        this._server.listen(this._socketPath, () => {
          this._supervisorSend({
            type: "forge:socket-ready",
            socketPath: this._socketPath,
            serviceName,
            workerId,
          });
        });
      } else {
        console.error(`[${this.serviceName}] UDS server error:`, err.message);
      }
    });
  }

  /**
   * Update our knowledge of the socket registry and connect to new peers.
   */
  _updateRegistry(registry) {
    this._registry = registry;

    const myKey = `${this.serviceName}:${this.workerId}`;

    for (const [key, socketPath] of Object.entries(registry)) {
      if (key === myKey) continue; // don't connect to ourselves
      if (this.outbound.has(key)) continue; // already connected

      // P1: Only connect to services in our dependency list (if specified)
      if (this._dependencies) {
        const [svcName] = key.split(":");
        if (!this._dependencies.has(svcName)) {
          continue;
        }
      }

      this._connectTo(key, socketPath);
    }
  }

  /**
   * Establish an outbound connection to another worker's UDS server.
   */
  _connectTo(key, socketPath) {
    const socket = net.createConnection(socketPath);
    let handshakeTimer = null;
    let dataReceived = false;

    socket.on("connect", () => {
      this.outbound.set(key, socket);
      this._socketKeyMap.set(socket, key); // P19: O(1) reverse lookup
      this._reconnectAttempts.set(key, 0);

      // CR-IPC-13: Start handshake timeout — destroy if no handshake ack within 5s
      handshakeTimer = setTimeout(() => {
        if (!dataReceived) {
          console.warn(`[${this.serviceName}] Handshake timeout for ${key}, destroying socket`);
          socket.destroy();
          this.outbound.delete(key);
        }
      }, 5000);
      handshakeTimer.unref();

      // Track service → connection keys for round-robin
      const [svcName] = key.split(":");
      if (!this.serviceConnections.has(svcName)) {
        this.serviceConnections.set(svcName, []);
      }
      const keys = this.serviceConnections.get(svcName);
      if (!keys.includes(key)) keys.push(key);

      // Handshake: tell the other side who we are
      const handshake = {
        type: "forge:handshake",
        from: this.serviceName,
        fromWorkerId: this.workerId,
      };
      const clusterSecret = process.env.FORGE_CLUSTER_SECRET;
      if (clusterSecret) {
        handshake.hmac = createHmac('sha256', clusterSecret)
          .update(`${this.serviceName}:${this.workerId}`)
          .digest('hex');
      }
      this._sendFrame(socket, handshake);
    });

    // Outbound connection — use shared frame parser
    const onData = createFrameParser(
      (msg) => this._handleIncomingMessage(socket, msg),
      this.serviceName,
      socket,
    );

    socket.on("data", (chunk) => {
      // CR-IPC-13: Clear handshake timeout on first data
      if (!dataReceived) {
        dataReceived = true;
        if (handshakeTimer) {
          clearTimeout(handshakeTimer);
          handshakeTimer = null;
        }
      }
      onData(chunk);
    });

    socket.on("error", () => {
      if (handshakeTimer) { clearTimeout(handshakeTimer); handshakeTimer = null; }
      this.outbound.delete(key);
      this._socketKeyMap.delete(socket); // P19
      const [svcName] = key.split(":");
      const keys = this.serviceConnections.get(svcName);
      if (keys) {
        const idx = keys.indexOf(key);
        if (idx !== -1) keys.splice(idx, 1);
      }
      // C2: Reject orphaned pending requests for this dead socket
      this._rejectPendingForSocket(key);
    });

    socket.on("close", () => {
      this.outbound.delete(key);
      this._socketKeyMap.delete(socket); // P19
      const [svcName] = key.split(":");
      const keys = this.serviceConnections.get(svcName);
      if (keys) {
        const idx = keys.indexOf(key);
        if (idx !== -1) keys.splice(idx, 1);
      }
      // C2: Reject orphaned pending requests for this dead socket
      this._rejectPendingForSocket(key);

      // A13: Stop retrying after MAX_RECONNECT_ATTEMPTS
      const existingTimer = this._reconnectTimers.get(key);
      if (existingTimer) clearTimeout(existingTimer);
      const attempts = this._reconnectAttempts.get(key) ?? 0;
      if (attempts >= MAX_RECONNECT_ATTEMPTS) {
        console.error(`[${this.serviceName}] Channel to ${key} permanently failed after ${attempts} reconnection attempts`);
        this._reconnectAttempts.delete(key);
        this._reconnectTimers.delete(key);
        if (this._onChannelDead) {
          this._onChannelDead(key, attempts);
        }
        return;
      }

      // Attempt reconnect with exponential backoff (plus jitter)
      this._reconnectAttempts.set(key, attempts + 1);
      const baseDelay = Math.min(60000, 1000 * 2 ** attempts);
      const delay = baseDelay + Math.random() * 1000;
      const timer = setTimeout(() => {
        this._reconnectTimers.delete(key);
        if (this._registry[key] && !this.outbound.has(key)) {
          this._connectTo(key, this._registry[key]);
        }
      }, delay);
      timer.unref();
      this._reconnectTimers.set(key, timer);
    });
  }

  /**
   * Send a length-prefixed JSON frame over a socket.
   * P6: Buffer.from(json) avoids double-scan of the string.
   * @throws {Error} when the socket write buffer exceeds MAX_WRITE_BUFFER
   * @returns {boolean} true if the write was accepted into the kernel buffer
   */
  _sendFrame(socket, msg) {
    // CR-IPC-4: Reject when buffer full — throw so callers can back off
    if (socket.writableLength > MAX_WRITE_BUFFER) {
      const key = this._socketKey(socket);
      throw new Error(`IPC write buffer full for ${key} — receiver too slow`);
    }
    // P6: Buffer.from gives us the buffer directly — byteLength = buf.length, no re-scan
    const json = JSON.stringify(msg);
    const body = Buffer.from(json);
    const frame = Buffer.allocUnsafe(4 + body.length);
    frame.writeUInt32BE(body.length, 0);
    body.copy(frame, 4);
    const ok = socket.write(frame);
    if (!ok && !socket._drainWarned) {
      socket._drainWarned = true;
      socket.once("drain", () => { socket._drainWarned = false; });
      console.warn(`[${this.serviceName}] UDS write buffer full, waiting for drain`);
    }
    return ok;
  }

  /**
   * Send a pre-built frame buffer over a socket (for broadcast optimization).
   * P6: Serialize once, send to all recipients.
   * @throws {Error} when the socket write buffer exceeds MAX_WRITE_BUFFER
   * @returns {boolean}
   */
  _sendRawFrame(socket, frameBuffer) {
    if (socket.writableLength > MAX_WRITE_BUFFER) {
      const key = this._socketKey(socket);
      throw new Error(`IPC write buffer full for ${key} — receiver too slow`);
    }
    const ok = socket.write(frameBuffer);
    if (!ok && !socket._drainWarned) {
      socket._drainWarned = true;
      socket.once("drain", () => { socket._drainWarned = false; });
      console.warn(`[${this.serviceName}] UDS write buffer full, waiting for drain`);
    }
    return ok;
  }

  /**
   * Build a length-prefixed frame buffer from a message object.
   * P6: Used by broadcast to serialize once and send to all.
   */
  _buildFrame(msg) {
    const json = JSON.stringify(msg);
    const body = Buffer.from(json);
    const frame = Buffer.allocUnsafe(4 + body.length);
    frame.writeUInt32BE(body.length, 0);
    body.copy(frame, 4);
    return frame;
  }

  /**
   * Handle a message from another worker (inbound or outbound socket).
   */
  _handleIncomingMessage(socket, msg) {
    switch (msg.type) {
      case "forge:handshake": {
        if (!msg.from || typeof msg.from !== 'string') {
          console.warn(`[${this.serviceName}] Invalid handshake: missing 'from' field`);
          break;
        }
        // S-IPC-1: Verify HMAC if cluster secret is configured
        const clusterSecret = process.env.FORGE_CLUSTER_SECRET;
        if (clusterSecret) {
          const expected = createHmac('sha256', clusterSecret)
            .update(`${msg.from}:${msg.fromWorkerId}`)
            .digest('hex');
          // Wrap in try/catch: if msg.hmac is not valid hex, Buffer.from
          // produces a different-length buffer and timingSafeEqual throws
          try {
            const expectedBuf = Buffer.from(expected, 'hex');
            const hmacBuf = msg.hmac ? Buffer.from(String(msg.hmac), 'hex') : Buffer.alloc(0);
            if (hmacBuf.length !== expectedBuf.length || !timingSafeEqual(expectedBuf, hmacBuf)) {
              throw new Error('HMAC mismatch');
            }
          } catch {
            console.error(`[${this.serviceName}] Handshake HMAC verification failed for ${msg.from}:${msg.fromWorkerId}`);
            socket.destroy();
            break;
          }
        }
        const key = `${msg.from}:${msg.fromWorkerId}`;
        this.inbound.set(key, socket);
        try {
          this._sendFrame(socket, {
            type: "forge:handshake-ack",
            from: this.serviceName,
            fromWorkerId: this.workerId,
          });
        } catch (err) {
          console.warn(`[${this.serviceName}] Failed to send handshake ack: ${err.message}`);
          socket.destroy();
        }
        break;
      }

      case "forge:handshake-ack": {
        // Ack is informational only; first data already cleared the handshake timer.
        break;
      }

      case "forge:message": {
        if (this.onMessage) {
          this.onMessage(msg.from, msg.payload);
        }
        break;
      }

      case "forge:request": {
        if (this.onRequest) {
          Promise.resolve(this.onRequest(msg.from, msg.payload))
            .then((result) => {
              try {
                this._sendFrame(socket, {
                  type: "forge:response",
                  requestId: msg.requestId,
                  payload: result,
                  error: null,
                });
              } catch (sendErr) {
                console.error(`[${this.serviceName}] Failed to send response: ${sendErr.message}`);
              }
            })
            .catch((err) => {
              try {
                this._sendFrame(socket, {
                  type: "forge:response",
                  requestId: msg.requestId,
                  payload: null,
                  error: { message: err.message, code: err.code, statusCode: err.statusCode },
                });
              } catch (sendErr) {
                console.error(`[${this.serviceName}] Failed to send error response: ${sendErr.message}`);
              }
            });
        }
        break;
      }

      case "forge:response": {
        const pending = this.pendingRequests.get(msg.requestId);
        if (pending) {
          clearTimeout(pending.timer);
          this.pendingRequests.delete(msg.requestId);
          if (msg.error) {
            const errObj = typeof msg.error === 'object' ? msg.error : { message: msg.error };
            const err = new Error(errObj.message);
            if (errObj.code) err.code = errObj.code;
            if (errObj.statusCode) err.statusCode = errObj.statusCode;
            pending.reject(err);
          } else {
            pending.resolve(msg.payload);
          }
        }
        break;
      }
    }
  }

  // -- Public API (called by ForgeContext) --

  /**
   * Send a fire-and-forget message. Direct UDS path if available.
   */
  send(target, payload) {
    const socket = this._pickSocket(target);

    if (socket) {
      try {
        const ok = this._sendFrame(socket, {
          type: "forge:message",
          from: this.serviceName,
          payload,
        });
        if (!ok) {
          this.backpressureEvents++;
          console.warn(`[${this.serviceName}] Backpressure on send() to "${target}" (total: ${this.backpressureEvents})`);
        }
      } catch (err) {
        this.backpressureEvents++;
        throw err;
      }
    } else {
      // Fallback to supervisor IPC
      this._supervisorSend({
        type: "forge:send",
        target,
        payload,
      });
    }
  }

  /**
   * Broadcast to all workers of a target service.
   */
  broadcast(target, payload) {
    const keys = this.serviceConnections.get(target) ?? [];

    if (keys.length > 0) {
      // P6: Serialize once, send the same buffer to all recipients
      const frame = this._buildFrame({
        type: "forge:message",
        from: this.serviceName,
        payload,
      });
      for (const key of keys) {
        const socket = this.outbound.get(key);
        if (socket) {
          try {
            this._sendRawFrame(socket, frame);
          } catch (err) {
            console.warn(`[${this.serviceName}] Broadcast to ${key} failed: ${err.message}`);
          }
        }
      }
    } else {
      this._supervisorSend({
        type: "forge:broadcast",
        target,
        payload,
      });
    }
  }

  /**
   * Request/response over direct UDS.
   * @returns {Promise<*>} resolves with the peer's response payload
   */
  request(target, payload, timeoutMs = 5000) {
    if (this.pendingRequests.size >= MAX_PENDING_REQUESTS) {
      return Promise.reject(new Error('Too many pending requests'));
    }

    const socket = this._pickSocket(target);
    this.requestCounter = (this.requestCounter + 1) % 1_000_000_000;
    const requestId = `req_${this._workerId}_${randomUUID()}`;

    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        this.pendingRequests.delete(requestId);
        reject(new Error(`Request to "${target}" timed out after ${timeoutMs}ms`));
      }, timeoutMs);

      const sKey = socket ? this._socketKey(socket) : null;
      this.pendingRequests.set(requestId, { resolve, reject, timer, socketKey: sKey });

      try {
        if (socket) {
          this._sendFrame(socket, {
            type: "forge:request",
            requestId,
            from: this.serviceName,
            payload,
          });
        } else {
          this._supervisorSend({
            type: "forge:request",
            requestId,
            target,
            payload,
            timeout: timeoutMs,
          });
        }
      } catch (err) {
        clearTimeout(timer);
        this.pendingRequests.delete(requestId);
        reject(err);
      }
    });
  }

  /**
   * Pick a socket to a target service (round-robin).
   * @returns {net.Socket|null}
   */
  _pickSocket(target) {
    const keys = this.serviceConnections.get(target);
    if (!keys || keys.length === 0) return null;

    const startIdx = (this.rrIndex.get(target) ?? 0) % keys.length;
    this.rrIndex.set(target, (startIdx + 1) % 1_000_000_000);

    // Try from startIdx, skip dead sockets
    const dead = [];
    let found = null;
    for (let attempt = 0; attempt < keys.length; attempt++) {
      const idx = (startIdx + attempt) % keys.length;
      const socket = this.outbound.get(keys[idx]);
      if (socket && !socket.destroyed) {
        found = socket;
        break;
      }
      dead.push(idx);
    }
    // BUGFIX: `dead` is collected in rotated order (startIdx + attempt mod length),
    // so it is NOT necessarily descending; splicing it in reverse array order
    // removed the wrong entries on wraparound. Sort descending first so each
    // splice leaves the remaining (smaller) indices valid.
    dead.sort((a, b) => b - a);
    for (const idx of dead) {
      keys.splice(idx, 1);
    }
    return found;
  }

  /**
   * P19: O(1) socket → key lookup via reverse Map (replaces linear scan).
   * @param {net.Socket} socket
   * @returns {string}
   */
  _socketKey(socket) {
    return this._socketKeyMap.get(socket) ?? 'unknown';
  }

  /**
   * Reject all pending requests that were sent over a specific socket.
   * @param {string} deadKey - the socket key that died
   */
  _rejectPendingForSocket(deadKey) {
    for (const [id, entry] of this.pendingRequests) {
      if (entry.socketKey === deadKey) {
        clearTimeout(entry.timer);
        this.pendingRequests.delete(id);
        entry.reject(new Error('Connection lost to peer'));
      }
    }
  }

  /**
   * @param {string} target
   * @returns {boolean} true when at least one direct UDS connection to `target` exists
   */
  hasDirectConnection(target) {
    const keys = this.serviceConnections.get(target);
    // Coerce to a real boolean (previously returned undefined for unknown targets)
    return keys !== undefined && keys.length > 0;
  }

  /**
   * Snapshot of current per-service connection counts and keys (for diagnostics).
   * @returns {Object<string, {connections: number, keys: string[]}>}
   */
  topology() {
    const result = {};
    for (const [service, keys] of this.serviceConnections) {
      result[service] = {
        connections: keys.length,
        keys,
      };
    }
    return result;
  }

  /**
   * Tear down the mesh: cancel timers, reject in-flight requests, destroy all
   * sockets, and close our UDS server (with a 5s timeout fallback).
   * @returns {Promise<void>}
   */
  destroy() {
    // Clear reconnect timers to prevent firing after shutdown
    for (const timer of this._reconnectTimers.values()) clearTimeout(timer);
    this._reconnectTimers.clear();

    // Clear all pending request timers and reject pending requests
    for (const entry of this.pendingRequests.values()) {
      if (entry.timer) clearTimeout(entry.timer);
      entry.reject(new Error('Channel destroyed'));
    }
    this.pendingRequests.clear();

    for (const [, socket] of this.outbound) {
      try {
        socket.destroy();
      } catch {}
    }
    for (const [, socket] of this.inbound) {
      try {
        socket.destroy();
      } catch {}
    }

    const cleanup = () => {
      this.outbound.clear();
      this.inbound.clear();
      this.serviceConnections.clear();
      this._socketKeyMap.clear();
    };

    if (this._server) {
      return Promise.race([
        new Promise((resolve) => {
          this._server.close(() => {
            cleanup();
            resolve();
          });
        }),
        new Promise((resolve) => {
          setTimeout(() => {
            try { this._server.close(); } catch {}
            cleanup();
            resolve();
          }, 5000).unref();
        }),
      ]);
    }

    cleanup();
    return Promise.resolve();
  }
}