threadforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/README.md +152 -0
  3. package/bin/forge.js +1050 -0
  4. package/bin/host-commands.js +344 -0
  5. package/bin/platform-commands.js +570 -0
  6. package/package.json +71 -0
  7. package/shared/auth.js +475 -0
  8. package/src/core/DirectMessageBus.js +364 -0
  9. package/src/core/EndpointResolver.js +247 -0
  10. package/src/core/ForgeContext.js +2227 -0
  11. package/src/core/ForgeHost.js +122 -0
  12. package/src/core/ForgePlatform.js +145 -0
  13. package/src/core/Ingress.js +768 -0
  14. package/src/core/Interceptors.js +420 -0
  15. package/src/core/MessageBus.js +310 -0
  16. package/src/core/Prometheus.js +305 -0
  17. package/src/core/RequestContext.js +413 -0
  18. package/src/core/RoutingStrategy.js +316 -0
  19. package/src/core/Supervisor.js +1306 -0
  20. package/src/core/ThreadAllocator.js +196 -0
  21. package/src/core/WorkerChannelManager.js +879 -0
  22. package/src/core/config.js +624 -0
  23. package/src/core/host-config.js +311 -0
  24. package/src/core/network-utils.js +166 -0
  25. package/src/core/platform-config.js +308 -0
  26. package/src/decorators/ServiceProxy.js +899 -0
  27. package/src/decorators/index.js +571 -0
  28. package/src/deploy/NginxGenerator.js +865 -0
  29. package/src/deploy/PlatformManifestGenerator.js +96 -0
  30. package/src/deploy/RouteManifestGenerator.js +112 -0
  31. package/src/deploy/index.js +984 -0
  32. package/src/frontend/FrontendDevLifecycle.js +65 -0
  33. package/src/frontend/FrontendPluginOrchestrator.js +187 -0
  34. package/src/frontend/SiteResolver.js +63 -0
  35. package/src/frontend/StaticMountRegistry.js +90 -0
  36. package/src/frontend/index.js +5 -0
  37. package/src/frontend/plugins/index.js +2 -0
  38. package/src/frontend/plugins/viteFrontend.js +79 -0
  39. package/src/frontend/types.js +35 -0
  40. package/src/index.js +56 -0
  41. package/src/internals.js +31 -0
  42. package/src/plugins/PluginManager.js +537 -0
  43. package/src/plugins/ScopedPostgres.js +192 -0
  44. package/src/plugins/ScopedRedis.js +142 -0
  45. package/src/plugins/index.js +1729 -0
  46. package/src/registry/ServiceRegistry.js +796 -0
  47. package/src/scaling/ScaleAdvisor.js +442 -0
  48. package/src/services/Service.js +195 -0
  49. package/src/services/worker-bootstrap.js +676 -0
  50. package/src/templates/auth-service.js +65 -0
  51. package/src/templates/identity-service.js +75 -0
@@ -0,0 +1,796 @@
1
+ /**
2
+ * Service Registry
3
+ *
4
+ * The registry is the brain of horizontal scaling. It answers one
5
+ * question for every proxy call: WHERE is this service?
6
+ *
7
+ * ═══════════════════════════════════════════════════════════════
8
+ * GROWTH STORY
9
+ * ═══════════════════════════════════════════════════════════════
10
+ *
11
+ * PHASE 1: Single machine ($20/mo VPS)
12
+ * Registry mode: 'embedded'
13
+ * - Runs in-process inside the supervisor
14
+ * - All services are local (UDS or colocated)
15
+ * - Registry is just a Map in memory
16
+ * - Zero operational overhead
17
+ *
18
+ * PHASE 2: 2-3 machines ($100/mo)
19
+ * Registry mode: 'multicast'
20
+ * - Each supervisor announces its services via UDP multicast
21
+ * on the local network (or via a simple HTTP gossip protocol)
22
+ * - Nodes discover each other automatically
23
+ * - No external dependencies (no etcd, no consul)
24
+ * - Services that move to another machine are auto-discovered
25
+ *
26
+ * PHASE 3: 10+ machines (serious scale)
27
+ * Registry mode: 'external'
28
+ * - Pluggable backend: Redis, etcd, Consul, or managed service
29
+ * - Full service mesh capabilities
30
+ * - Health checks, circuit breakers, canary deployments
31
+ *
32
+ * The proxy layer doesn't know or care which phase you're in.
33
+ * this.users.getUser('123') works identically in all three.
34
+ *
35
+ * ═══════════════════════════════════════════════════════════════
36
+ * REGISTRATION PROTOCOL
37
+ * ═══════════════════════════════════════════════════════════════
38
+ *
39
+ * When a service starts, it registers:
40
+ *
41
+ * {
42
+ * name: 'users',
43
+ * nodeId: 'node-abc123',
44
+ * host: '10.0.1.5',
45
+ * ports: {
46
+ * http: 4001, // HTTP port (for remote calls via /__forge/invoke)
47
+ * },
48
+ * udsPath: '/tmp/forge-1234/users-1.sock', // local only
49
+ * workers: 4,
50
+ * contract: { // what methods are available
51
+ * methods: ['getUser', 'createUser', 'listUsers'],
52
+ * events: ['user.created'],
53
+ * },
54
+ * health: {
55
+ * status: 'healthy',
56
+ * cpu: 23, // percent
57
+ * memory: 156, // MB
58
+ * rpcLatencyP50: 2, // ms
59
+ * rpcLatencyP99: 18, // ms
60
+ * pendingRequests: 12,
61
+ * lastHeartbeat: 1707500000000,
62
+ * },
63
+ * metadata: {
64
+ * version: '1.2.3',
65
+ * region: 'us-east-1',
66
+ * startedAt: 1707500000000,
67
+ * },
68
+ * }
69
+ *
70
+ * ═══════════════════════════════════════════════════════════════
71
+ * TOPOLOGY RESOLUTION
72
+ * ═══════════════════════════════════════════════════════════════
73
+ *
74
+ * When a proxy call is made, the registry resolves the transport:
75
+ *
76
+ * registry.resolve('users')
77
+ * → [
78
+ * { transport: 'local', instance: ... }, // colocated
79
+ * { transport: 'uds', path: '/tmp/...' }, // same machine
80
+ * { transport: 'http', address: '10.0.1.5:4001' }, // remote
81
+ * ]
82
+ *
83
+ * The proxy picks the BEST option:
84
+ * 1. Colocated (same process) → always preferred
85
+ * 2. UDS (same machine) → preferred over network
86
+ * 3. HTTP (closest region) → fallback
87
+ */
88
+
89
+ import crypto from "node:crypto";
90
+ import { EventEmitter } from "node:events";
91
+ import os from "node:os";
92
+
93
+ /**
94
+ * @typedef {Object} ServiceRegistration
95
+ * @property {string} name
96
+ * @property {string} nodeId
97
+ * @property {string} host
98
+ * @property {{http?: number}} ports
99
+ * @property {string|null} udsPath
100
+ * @property {number} workers
101
+ * @property {{methods: string[], events: string[]}} contract
102
+ * @property {ServiceHealth} health
103
+ * @property {Object} metadata
104
+ */
105
+
106
+ /**
107
+ * @typedef {Object} ServiceHealth
108
+ * @property {'healthy'|'degraded'|'unhealthy'|'draining'} status
109
+ * @property {number} cpu
110
+ * @property {number} memory
111
+ * @property {number} rpcLatencyP50
112
+ * @property {number} rpcLatencyP99
113
+ * @property {number} pendingRequests
114
+ * @property {number} lastHeartbeat
115
+ */
116
+
117
+ /**
118
+ * @typedef {Object} ResolvedEndpoint
119
+ * @property {'local'|'uds'|'http'} transport
120
+ * @property {string} nodeId
121
+ * @property {object} [instance] - for 'local'
122
+ * @property {string} [path] - for 'uds'
123
+ * @property {string} [address] - for 'http'
124
+ * @property {ServiceHealth} health
125
+ * @property {number} priority - lower = preferred
126
+ */
127
+
128
+ // ─── Base Registry ──────────────────────────────────────────
129
+
130
export class ServiceRegistry extends EventEmitter {
  /**
   * Tracks which services exist, on which node, and how to reach them.
   *
   * Events emitted: "registered", "deregistering", "deregistered",
   * "discovered", "unhealthy", "removed", "quorum-lost".
   *
   * @param {Object} [options]
   * @param {string} [options.mode='embedded'] - 'embedded' | 'multicast' | 'external'
   * @param {string} [options.nodeId] - Unique node identifier (random if omitted)
   * @param {string} [options.host] - This node's reachable address
   * @param {number} [options.heartbeatIntervalMs=5000]
   * @param {number} [options.healthTimeoutMs=15000] - Mark unhealthy after this
   * @param {number} [options.httpBasePort=4000] - Base port for auto HTTP binding
   * @param {string} [options.clusterSecret] - HMAC secret for multicast messages
   *   (falls back to FORGE_CLUSTER_SECRET)
   * @param {number} [options.staleReapMultiplier=3] - Remote entries are removed
   *   after healthTimeoutMs * staleReapMultiplier without an update
   */
  constructor(options = {}) {
    super();

    this.mode = options.mode ?? "embedded";
    this.nodeId = options.nodeId ?? `node-${crypto.randomBytes(4).toString("hex")}`;
    this.host = options.host ?? getLocalIP();
    this.heartbeatIntervalMs = options.heartbeatIntervalMs ?? 5000;
    this.healthTimeoutMs = options.healthTimeoutMs ?? 15000;
    this.httpBasePort = options.httpBasePort ?? 4000;
    this._clusterSecret = options.clusterSecret ?? process.env.FORGE_CLUSTER_SECRET ?? null;
    this.staleReapMultiplier = options.staleReapMultiplier ?? 3;

    /**
     * All known service registrations across all nodes.
     * Key: `${serviceName}@${nodeId}`
     * @type {Map<string, ServiceRegistration>}
     */
    this.registrations = new Map();

    /**
     * Services running on THIS node, keyed by service name.
     * @type {Map<string, ServiceRegistration>}
     */
    this.localRegistrations = new Map();

    /**
     * Colocated service instances (same process), keyed by service name.
     * @type {Map<string, {service: object, ctx: object}>}
     */
    this.localInstances = new Map();

    /** @type {NodeJS.Timeout|null} */
    this._heartbeatTimer = null;

    /** @type {NodeJS.Timeout|null} */
    this._reapTimer = null;

    /** @type {Object|null} - external backend installed via setBackend() */
    this._backend = null;

    /** @type {Map<string, NodeJS.Timeout>} pending drain timers from deregister() */
    this._drainTimers = new Map();

    /**
     * Split-brain protection: track known peer nodes for quorum checks.
     * @type {Map<string, number>} nodeId → last seen timestamp (local clock)
     */
    this._knownPeers = new Map();

    // Expected cluster size for quorum calculation (0 = disabled).
    // A non-numeric or negative env value disables the check; leaving NaN here
    // would make every quorum comparison false.
    const expectedSize = Number.parseInt(process.env.FORGE_EXPECTED_CLUSTER_SIZE || "0", 10);
    this._expectedClusterSize = Number.isFinite(expectedSize) && expectedSize > 0 ? expectedSize : 0;

    /** Multicast state; populated by _startMulticast(). */
    this._multicastSocket = null;
    this._multicastAddr = null;
    this._multicastPort = null;
    /** @type {NodeJS.Timeout|null} */
    this._multicastRateTimer = null;
    /** @type {Map<string, number>} messages seen per source IP in the current second */
    this._multicastRateCounts = new Map();
  }

  /**
   * Start the registry.
   *
   * Always starts the heartbeat and stale-reaper timers. In multicast mode it
   * additionally opens the discovery socket; in external mode it calls
   * _startExternal(), which currently always throws.
   *
   * @returns {Promise<ServiceRegistry>} this
   * @throws {Error} when the selected mode's backend cannot be started; the
   *   timers are cleared again before the error propagates.
   */
  async start() {
    // Calling start() twice must not leave the first pair of intervals running.
    this._clearTimers();

    this._heartbeatTimer = setInterval(() => this._sendHeartbeats(), this.heartbeatIntervalMs);
    this._heartbeatTimer.unref();

    this._reapTimer = setInterval(() => this._reapStale(), this.healthTimeoutMs);
    this._reapTimer.unref();

    try {
      if (this.mode === "multicast") {
        await this._startMulticast();
      } else if (this.mode === "external") {
        await this._startExternal();
      }
    } catch (err) {
      this._clearTimers();
      throw err;
    }

    return this;
  }

  /**
   * Register a service running on this node and announce it.
   *
   * `nodeId` and `host` are always overwritten with this node's values;
   * `health` and `metadata` defaults are filled in underneath whatever the
   * caller supplied.
   *
   * @param {Partial<ServiceRegistration> & {name: string}} registration
   * @returns {ServiceRegistration} the stored registration
   */
  register(registration) {
    const key = `${registration.name}@${this.nodeId}`;

    // A drain timer left over from deregister() would delete this fresh
    // registration when it fires, so cancel it first.
    const pendingDrain = this._drainTimers.get(key);
    if (pendingDrain) {
      clearTimeout(pendingDrain);
      this._drainTimers.delete(key);
    }

    const now = Date.now();
    const reg = {
      ...registration,
      nodeId: this.nodeId,
      host: this.host,
      health: {
        status: "healthy",
        cpu: 0,
        memory: 0,
        rpcLatencyP50: 0,
        rpcLatencyP99: 0,
        pendingRequests: 0,
        lastHeartbeat: now,
        ...registration.health,
      },
      metadata: {
        version: process.env.npm_package_version ?? "0.0.0",
        region: process.env.FORGE_REGION ?? "local",
        startedAt: now,
        ...registration.metadata,
      },
    };

    this.registrations.set(key, reg);
    this.localRegistrations.set(registration.name, reg);

    this.emit("registered", reg);
    this._announceToNetwork(reg);

    return reg;
  }

  /**
   * Register a colocated service instance (same process).
   *
   * @param {string} name
   * @param {{service: object, ctx: object}} instance
   */
  registerLocal(name, instance) {
    this.localInstances.set(name, instance);
  }

  /**
   * Deregister a local service (shutting down).
   *
   * The registration is marked "draining" and announced immediately, then
   * removed after a 5 second drain period. Calling this again while the drain
   * is pending is a no-op, so "deregistered" is emitted only once.
   *
   * @param {string} serviceName
   */
  deregister(serviceName) {
    const key = `${serviceName}@${this.nodeId}`;
    const reg = this.registrations.get(key);
    if (!reg || this._drainTimers.has(key)) return;

    reg.health.status = "draining";
    this.emit("deregistering", reg);

    // Announce to network so other nodes stop routing to us
    this._announceToNetwork(reg);

    const drainTimer = setTimeout(() => {
      this._drainTimers.delete(key);
      this.registrations.delete(key);
      this.localRegistrations.delete(serviceName);
      this.localInstances.delete(serviceName);
      this.emit("deregistered", { name: serviceName, nodeId: this.nodeId });
    }, 5000);
    drainTimer.unref();
    this._drainTimers.set(key, drainTimer);
  }

  /**
   * Receive a registration from another node.
   *
   * Entries that are malformed or that claim this node's own id are ignored.
   * The object is stamped with `lastSeen` (local clock) and stored as-is.
   *
   * @param {ServiceRegistration} reg
   */
  receiveRemoteRegistration(reg) {
    if (!reg || typeof reg.name !== "string" || typeof reg.nodeId !== "string") return;
    if (!reg.host || typeof reg.host !== "string") return;
    if (!reg.health || typeof reg.health.status !== "string") return;

    if (reg.nodeId === this.nodeId) return; // ignore self

    const key = `${reg.name}@${reg.nodeId}`;
    const isNew = !this.registrations.has(key);
    const now = Date.now();

    // Use the local clock so staleness checks are immune to remote clock skew
    reg.lastSeen = now;

    // Track peer nodes for split-brain detection
    this._knownPeers.set(reg.nodeId, now);

    this.registrations.set(key, reg);

    if (isNew) {
      this.emit("discovered", reg);
    }
  }

  /**
   * Check if we can see a quorum of the expected cluster.
   *
   * @returns {boolean} true if a majority of the expected nodes (including
   *   this one) was seen within 2x the health timeout, or if the expected
   *   cluster size is not configured.
   */
  hasQuorum() {
    if (this._expectedClusterSize <= 0) return true;

    const cutoff = Date.now() - this.healthTimeoutMs * 2;
    let activePeers = 0;
    for (const lastSeen of this._knownPeers.values()) {
      if (lastSeen > cutoff) activePeers++;
    }

    const visibleNodes = activePeers + 1; // +1 for self
    const quorum = Math.floor(this._expectedClusterSize / 2) + 1;
    return visibleNodes >= quorum;
  }

  /**
   * Merge new health fields into a local service's registration and refresh
   * its heartbeat timestamp. Unknown service names are ignored.
   *
   * @param {string} serviceName
   * @param {Partial<ServiceHealth>} health
   */
  updateHealth(serviceName, health) {
    const reg = this.localRegistrations.get(serviceName);
    if (reg) {
      Object.assign(reg.health, health, { lastHeartbeat: Date.now() });
    }
  }

  // ─── Resolution ─────────────────────────────────────────

  /**
   * Resolve endpoints for a service, ordered by preference.
   *
   * Unhealthy and draining registrations are skipped. Priorities:
   *   0. Local (colocated, same process)
   *   1. UDS (same machine, different process)
   *   2. HTTP (remote, same region)
   *   3. HTTP (remote, different region)
   * Ties are broken by lower rpcLatencyP50.
   *
   * @param {string} serviceName
   * @returns {ResolvedEndpoint[]}
   */
  resolve(serviceName) {
    const endpoints = [];
    const isColocated = this.localInstances.has(serviceName);
    const localRegion = process.env.FORGE_REGION ?? "local";
    let localAdded = false;

    for (const reg of this.registrations.values()) {
      if (reg.name !== serviceName) continue;

      const { status } = reg.health;
      if (status === "unhealthy" || status === "draining") continue;

      const isLocal = reg.nodeId === this.nodeId;

      if (isColocated && isLocal) {
        if (!localAdded) {
          endpoints.push({
            transport: "local",
            nodeId: reg.nodeId,
            instance: this.localInstances.get(serviceName),
            health: reg.health,
            priority: 0,
          });
          localAdded = true;
        }
      } else if (isLocal && reg.udsPath) {
        endpoints.push({
          transport: "uds",
          nodeId: reg.nodeId,
          path: reg.udsPath,
          health: reg.health,
          priority: 1,
        });
      } else if (reg.ports?.http) {
        const sameRegion = reg.metadata?.region === localRegion;
        endpoints.push({
          transport: "http",
          nodeId: reg.nodeId,
          address: `${reg.host}:${reg.ports.http}`,
          health: reg.health,
          priority: sameRegion ? 2 : 3,
        });
      }
    }

    endpoints.sort((a, b) => {
      if (a.priority !== b.priority) return a.priority - b.priority;
      return (a.health.rpcLatencyP50 ?? 0) - (b.health.rpcLatencyP50 ?? 0);
    });

    return endpoints;
  }

  /**
   * Resolve the BEST single endpoint for a service.
   *
   * @param {string} serviceName
   * @returns {ResolvedEndpoint|null} first entry of resolve(), or null
   */
  resolveBest(serviceName) {
    return this.resolve(serviceName)[0] ?? null;
  }

  /**
   * Get all known services and their locations.
   *
   * @returns {Object<string, Array<{nodeId: string, host: string,
   *   transport: 'colocated'|'local'|'http', status: string, cpu: number,
   *   workers: number}>>} entries grouped by service name
   */
  topology() {
    const services = new Map();

    for (const reg of this.registrations.values()) {
      if (!services.has(reg.name)) {
        services.set(reg.name, []);
      }

      const isLocal = reg.nodeId === this.nodeId;
      // Only this node's own registration can be colocated; a remote node
      // offering the same service name is still reached over HTTP.
      let transport = "http";
      if (isLocal) {
        transport = this.localInstances.has(reg.name) ? "colocated" : "local";
      }

      services.get(reg.name).push({
        nodeId: reg.nodeId,
        host: reg.host,
        transport,
        status: reg.health.status,
        cpu: reg.health.cpu,
        workers: reg.workers,
      });
    }

    return Object.fromEntries(services);
  }

  // ─── Multicast Discovery ────────────────────────────────

  /**
   * Multicast mode: open a UDP socket, join the multicast group and start
   * processing announcements from other nodes.
   *
   * Address and port come from FORGE_MULTICAST_ADDRESS / FORGE_MULTICAST_PORT
   * (defaults 239.255.42.42:42042).
   *
   * @throws {Error} in production when no cluster secret is configured, when
   *   the configured port is invalid, or when binding/joining fails.
   */
  async _startMulticast() {
    if (!this._clusterSecret) {
      if (process.env.NODE_ENV === "production") {
        throw new Error("[Registry] FORGE_CLUSTER_SECRET is required for multicast discovery in production");
      }
      console.warn(
        "[Registry] FORGE_CLUSTER_SECRET not set — multicast messages are unauthenticated. Set a shared secret for production use.",
      );
    }

    const dgram = await import("node:dgram");
    const multicastAddr = process.env.FORGE_MULTICAST_ADDRESS || "239.255.42.42";
    const multicastPort = Number.parseInt(process.env.FORGE_MULTICAST_PORT || "42042", 10);
    if (!Number.isInteger(multicastPort) || multicastPort <= 0 || multicastPort > 65535) {
      throw new Error(`[Registry] Invalid FORGE_MULTICAST_PORT: ${process.env.FORGE_MULTICAST_PORT}`);
    }

    const socket = dgram.createSocket({ type: "udp4", reuseAddr: true });

    // Rate limiting window: per-source counters are reset every second.
    this._multicastRateCounts.clear();
    const rateTimer = setInterval(() => this._multicastRateCounts.clear(), 1000);
    rateTimer.unref();
    this._multicastRateTimer = rateTimer;

    socket.on("message", (buf, rinfo) => this._handleMulticastMessage(buf, rinfo));

    try {
      await new Promise((resolve, reject) => {
        // Without a listener a bind failure would surface as an unhandled
        // 'error' event and this promise would never settle.
        socket.once("error", reject);
        socket.bind(multicastPort, () => {
          try {
            socket.addMembership(multicastAddr);
          } catch (err) {
            reject(err);
            return;
          }
          socket.removeListener("error", reject);
          resolve();
        });
      });
    } catch (err) {
      clearInterval(rateTimer);
      this._multicastRateTimer = null;
      try {
        socket.close();
      } catch {
        // close() throws only when the socket is already closed
      }
      throw err;
    }

    // Later socket errors (e.g. failed sends) must not crash the process.
    socket.on("error", (err) => {
      console.warn(`[ServiceRegistry] Multicast socket error: ${err.message}`);
    });

    this._multicastSocket = socket;
    this._multicastAddr = multicastAddr;
    this._multicastPort = multicastPort;
  }

  /**
   * Validate one incoming multicast datagram and apply the registrations it
   * carries. Invalid input is dropped, never thrown, because this runs inside
   * a socket event handler on data from the network.
   *
   * Order of checks: size → per-source rate limit → JSON parse → HMAC
   * signature (when a secret is configured) → message shape → replay window.
   *
   * @param {Buffer} buf - raw datagram
   * @param {{address: string}} rinfo - sender info from the dgram socket
   */
  _handleMulticastMessage(buf, rinfo) {
    if (buf.length > 8192) return;

    // Reject more than 10 messages per second from one source IP
    const srcAddr = rinfo.address;
    const seen = (this._multicastRateCounts.get(srcAddr) ?? 0) + 1;
    this._multicastRateCounts.set(srcAddr, seen);
    if (seen > 10) return;

    let msg;
    try {
      msg = JSON.parse(buf.toString());
    } catch {
      console.warn("[ServiceRegistry] Received malformed multicast message");
      return;
    }

    // JSON such as `null` or `42` parses fine but is not a message object;
    // bail out before any property access.
    if (msg === null || typeof msg !== "object") return;

    // Signature verification BEFORE trusting any field of the message
    if (this._clusterSecret) {
      const { sig, ...unsigned } = msg;
      const canonical = JSON.stringify(sortKeys(unsigned));
      const expected = crypto.createHmac("sha256", this._clusterSecret).update(canonical).digest("hex");
      try {
        const sigBuf = Buffer.from(typeof sig === "string" ? sig : "", "hex");
        const expBuf = Buffer.from(expected, "hex");
        if (sigBuf.length !== expBuf.length || !crypto.timingSafeEqual(sigBuf, expBuf)) {
          return;
        }
      } catch {
        // Treat any failure while comparing signatures as "not authentic"
        return;
      }
    }

    if (msg.type !== "forge:announce") return;
    if (msg.nodeId === this.nodeId) return; // Ignore own announcements

    // Replay protection: reject messages outside 2 heartbeat intervals
    if (typeof msg.timestamp === "number") {
      const maxMessageAge = (this.heartbeatIntervalMs ?? 30000) * 2;
      if (Math.abs(Date.now() - msg.timestamp) > maxMessageAge) return;
    }

    if (!Array.isArray(msg.services)) return;

    for (const reg of msg.services) {
      this.receiveRemoteRegistration(reg);
    }
  }

  /**
   * Announce a changed registration: broadcast all local services over
   * multicast and/or publish `reg` to the external backend, depending on mode.
   *
   * @param {ServiceRegistration} reg
   */
  _announceToNetwork(reg) {
    this._broadcastLocalServices();
    this._publishToBackend(reg);
  }

  /**
   * Multicast every local registration. When the combined payload would
   * exceed 1400 bytes the services are split into chunks of roughly 1200
   * bytes so each announcement fits into a single UDP packet.
   */
  _broadcastLocalServices() {
    if (this.mode !== "multicast" || !this._multicastSocket) return;

    const allServices = [...this.localRegistrations.values()];
    const probe = JSON.stringify({
      type: "forge:announce",
      nodeId: this.nodeId,
      services: allServices,
    });

    if (Buffer.byteLength(probe) <= 1400) {
      this._sendMulticastPayload(allServices);
      return;
    }

    const chunks = [];
    let current = [];
    let currentSize = 0;
    for (const svc of allServices) {
      const svcSize = Buffer.byteLength(JSON.stringify(svc));
      if (currentSize + svcSize > 1200 && current.length > 0) {
        chunks.push(current);
        current = [];
        currentSize = 0;
      }
      current.push(svc);
      currentSize += svcSize;
    }
    if (current.length > 0) chunks.push(current);

    for (const chunk of chunks) {
      this._sendMulticastPayload(chunk);
    }
  }

  /**
   * Write one registration to the external backend with a TTL equal to the
   * health timeout (in seconds). No-op unless mode is 'external' and a
   * backend has been set.
   *
   * @param {ServiceRegistration} reg
   */
  _publishToBackend(reg) {
    if (this.mode !== "external" || !this._backend) return;

    this._backend.set(`forge/services/${reg.name}/${this.nodeId}`, JSON.stringify(reg), {
      ttl: Math.ceil(this.healthTimeoutMs / 1000),
    });
  }

  /**
   * Send one signed "forge:announce" datagram carrying `services`.
   *
   * @param {ServiceRegistration[]} services
   */
  _sendMulticastPayload(services) {
    // The socket is gone after stop(); a late announce must not throw.
    if (!this._multicastSocket) return;

    const payload = {
      type: "forge:announce",
      nodeId: this.nodeId,
      services,
      timestamp: Date.now(),
    };

    // Sign canonical JSON (recursively sorted keys) so the receiver can
    // recompute the same HMAC regardless of key order.
    if (this._clusterSecret) {
      const canonical = JSON.stringify(sortKeys(payload));
      payload.sig = crypto.createHmac("sha256", this._clusterSecret).update(canonical).digest("hex");
    }

    const buf = Buffer.from(JSON.stringify(payload));
    this._multicastSocket.send(buf, this._multicastPort, this._multicastAddr);
  }

  // ─── External Backend ───────────────────────────────────

  /**
   * Placeholder for external backends.
   *
   * @throws {Error} always — external mode is not implemented yet.
   */
  async _startExternal() {
    throw new Error(
      'External registry backend not yet implemented. Planned backends: Redis, etcd, Consul. ' +
        'Use "embedded" or "multicast" mode for now.'
    );
  }

  /**
   * Set an external backend (Redis, etcd, etc.)
   *
   * The backend must implement:
   *   get(key) → string
   *   set(key, value, options) → void
   *   watch(prefix, callback) → void
   *   delete(key) → void
   *
   * @param {Object} backend
   */
  setBackend(backend) {
    this._backend = backend;
  }

  // ─── Health Management ──────────────────────────────────

  /**
   * Refresh health figures of all local services and announce them.
   *
   * CPU and memory are process-wide, so they are sampled once per tick and
   * shared by every registration. Sampling CPU per service would give all but
   * the first service a measurement over a near-zero interval.
   */
  _sendHeartbeats() {
    if (this.localRegistrations.size === 0) return;

    const now = Date.now();
    const cpu = getCpuUsage();
    const memory = Math.round(process.memoryUsage().rss / 1024 / 1024);

    for (const reg of this.localRegistrations.values()) {
      reg.health.lastHeartbeat = now;
      reg.health.cpu = cpu;
      reg.health.memory = memory;
      this._publishToBackend(reg);
    }

    // One multicast announcement already carries every local service.
    this._broadcastLocalServices();
  }

  /**
   * Mark remote registrations unhealthy after healthTimeoutMs without an
   * update, remove them after healthTimeoutMs * staleReapMultiplier, forget
   * peers not seen for that long, and emit "quorum-lost" when hasQuorum()
   * fails. Local registrations are never reaped.
   */
  _reapStale() {
    const now = Date.now();

    for (const [key, reg] of this.registrations) {
      if (reg.nodeId === this.nodeId) continue; // don't reap self

      // Prefer lastSeen (local receive time) over the remote's own timestamp
      const age = now - (reg.lastSeen ?? reg.health?.lastHeartbeat ?? 0);
      if (age <= this.healthTimeoutMs) continue;

      if (reg.health.status !== "unhealthy") {
        reg.health.status = "unhealthy";
        this.emit("unhealthy", reg);
      }

      if (age > this.healthTimeoutMs * this.staleReapMultiplier) {
        this.registrations.delete(key);
        this.emit("removed", reg);
      }
    }

    const peerCutoff = now - this.healthTimeoutMs * this.staleReapMultiplier;
    for (const [nodeId, lastSeen] of this._knownPeers) {
      if (lastSeen < peerCutoff) {
        this._knownPeers.delete(nodeId);
      }
    }

    if (!this.hasQuorum()) {
      this.emit("quorum-lost", {
        expectedSize: this._expectedClusterSize,
        activePeers: this._knownPeers.size,
        nodeId: this.nodeId,
      });
    }
  }

  /**
   * Expose the topology via HTTP for gossip / dashboards.
   *
   * Handles `/_forge/topology` (any method) and `GET /_forge/resolve?service=NAME`.
   *
   * @param {{url: string, method: string}} req
   * @param {{writeHead: Function, end: Function}} res
   * @returns {boolean} true when the request was handled, false when the
   *   caller should continue routing (including unparsable request targets).
   */
  httpHandler(req, res) {
    let parsedUrl;
    try {
      parsedUrl = new URL(req.url, "http://localhost");
    } catch {
      // e.g. a request target of "//" — not one of our routes
      return false;
    }

    if (parsedUrl.pathname === "/_forge/topology") {
      res.writeHead(200, { "Content-Type": "application/json" });
      res.end(
        JSON.stringify(
          {
            nodeId: this.nodeId,
            host: this.host,
            mode: this.mode,
            registrations: [...this.registrations.values()],
            topology: this.topology(),
          },
          null,
          2,
        ),
      );
      return true;
    }

    if (parsedUrl.pathname === "/_forge/resolve" && req.method === "GET") {
      const service = parsedUrl.searchParams.get("service");
      if (service) {
        res.writeHead(200, { "Content-Type": "application/json" });
        res.end(JSON.stringify(this.resolve(service), null, 2));
      } else {
        res.writeHead(400);
        res.end("Missing ?service= parameter");
      }
      return true;
    }

    return false;
  }

  /**
   * Stop timers, drop all local registrations immediately (no drain period,
   * one "deregistered" event each) and close the multicast socket.
   */
  async stop() {
    this._clearTimers();

    for (const timer of this._drainTimers.values()) {
      clearTimeout(timer);
    }
    this._drainTimers.clear();

    for (const name of [...this.localRegistrations.keys()]) {
      this.registrations.delete(`${name}@${this.nodeId}`);
      this.localRegistrations.delete(name);
      this.localInstances.delete(name);
      this.emit("deregistered", { name, nodeId: this.nodeId });
    }

    if (this._multicastRateTimer) {
      clearInterval(this._multicastRateTimer);
      this._multicastRateTimer = null;
    }
    this._multicastRateCounts.clear();

    if (this._multicastSocket) {
      // Drop the reference first so later announcements become no-ops instead
      // of writing to a closed socket.
      const socket = this._multicastSocket;
      this._multicastSocket = null;
      try {
        socket.close();
      } catch {
        // close() throws only when the socket is already closed
      }
    }
  }

  /** Clear the heartbeat and reaper intervals, if running. */
  _clearTimers() {
    if (this._heartbeatTimer) {
      clearInterval(this._heartbeatTimer);
      this._heartbeatTimer = null;
    }
    if (this._reapTimer) {
      clearInterval(this._reapTimer);
      this._reapTimer = null;
    }
  }
}
758
+
759
+ // ─── Utilities ──────────────────────────────────────────────
760
+
761
/**
 * Return a deep copy of `obj` whose object keys are in sorted order, so that
 * JSON.stringify() of the result is a canonical form suitable for signing.
 *
 * Arrays keep their element order; primitives and null are returned as-is.
 *
 * @param {*} obj - any JSON-compatible value
 * @returns {*} key-sorted copy
 */
function sortKeys(obj) {
  if (Array.isArray(obj)) return obj.map((item) => sortKeys(item));
  if (obj !== null && typeof obj === "object") {
    // Object.fromEntries defines every key as an own data property. Plain
    // assignment (`acc[k] = v`) would treat a "__proto__" key as a prototype
    // change, silently dropping that key from the canonical form and thus
    // from the signed content.
    const sortedEntries = Object.keys(obj)
      .sort()
      .map((key) => [key, sortKeys(obj[key])]);
    return Object.fromEntries(sortedEntries);
  }
  return obj;
}
768
+
769
/**
 * Find the first non-internal IPv4 address reported by the OS.
 *
 * @returns {string} the address, or "127.0.0.1" when no external IPv4
 *   interface is found
 */
function getLocalIP() {
  for (const addresses of Object.values(os.networkInterfaces())) {
    for (const iface of addresses ?? []) {
      // Node 18.0–18.3 report `family` as the number 4 instead of "IPv4".
      const isIPv4 = iface.family === "IPv4" || iface.family === 4;
      if (isIPv4 && !iface.internal) {
        return iface.address;
      }
    }
  }
  return "127.0.0.1";
}
780
+
781
// Baseline for the next getCpuUsage() call: cumulative process CPU time and
// the monotonic clock reading (nanoseconds) at which it was taken.
let lastCpuUsage = process.cpuUsage();
let lastCpuTime = process.hrtime.bigint();

/**
 * CPU usage of this process since the previous call, as a rounded percentage
 * of one core (user + system time divided by elapsed wall time).
 *
 * Calls less than 1 ms apart return 0 and leave the baseline untouched, so a
 * burst of calls cannot produce a measurement over a near-zero interval.
 *
 * @returns {number} non-negative integer percentage
 */
function getCpuUsage() {
  // hrtime is monotonic; Date.now() can jump backwards when the system clock
  // is adjusted, which would yield a negative percentage.
  const now = process.hrtime.bigint();
  const elapsedMs = Number(now - lastCpuTime) / 1e6;
  if (elapsedMs < 1) return 0;

  // Take a single snapshot and diff it against the baseline, so no CPU time
  // is lost between "measure" and "reset".
  const snapshot = process.cpuUsage();
  const busyMicros = (snapshot.user - lastCpuUsage.user) + (snapshot.system - lastCpuUsage.system);
  lastCpuUsage = snapshot;
  lastCpuTime = now;

  // busy time (µs → ms) / elapsed wall time (ms)
  const cpuPercent = (busyMicros / 1000 / elapsedMs) * 100;
  return Math.round(cpuPercent);
}