@zero-server/lifecycle 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/cluster.js ADDED
@@ -0,0 +1,666 @@
1
+ /**
2
+ * @module cluster
3
+ * @description Clustering support for zero-server applications.
4
+ * Forks worker processes, manages automatic restarts with backoff,
5
+ * and provides IPC messaging between the primary and workers.
6
+ *
7
+ * @example
8
+ * const { createApp, cluster } = require('@zero-server/sdk');
9
+ *
10
+ * cluster((mgr) => {
11
+ * const app = createApp();
12
+ * app.get('/', (req, res) => res.json({ pid: process.pid }));
13
+ * app.listen(3000);
14
+ * });
15
+ *
16
+ * @example | With Options
17
+ * cluster((mgr) => {
18
+ * const app = createApp();
19
+ * app.listen(3000);
20
+ * }, {
21
+ * workers: 4,
22
+ * respawn: true,
23
+ * respawnDelay: 1000,
24
+ * maxRespawnDelay: 30000,
25
+ * });
26
+ */
27
+ const cluster = require('cluster');
28
+ const os = require('os');
29
+ const log = require('./debug')('zero:cluster');
30
+
31
/**
 * Default cluster configuration. User options passed to the
 * ClusterManager constructor are spread over these values.
 * @private
 */
const DEFAULTS = {
    // os.cpus() may report an empty list when CPU information is unavailable;
    // clamp so the default never configures a cluster with zero workers.
    workers: Math.max(1, os.cpus().length),
    // Respawn workers that exit abnormally.
    respawn: true,
    // Delay (ms) before the first respawn attempt.
    respawnDelay: 1000,
    // Upper bound (ms) for the respawn delay after backoff.
    maxRespawnDelay: 30000,
    // Multiplier applied to the delay for each consecutive crash.
    backoffFactor: 2,
};
42
+
43
+ // -- Cluster Manager -------------------------------
44
+
45
/**
 * Manages a cluster of worker processes for a zero-server application.
 *
 * On the primary process it forks workers with `cluster.fork()`, respawns
 * workers that exit abnormally (with exponential backoff), relays typed IPC
 * messages, and coordinates rolling restarts and shutdown. On a worker
 * process only the messaging and metrics-reporting helpers do anything.
 *
 * All IPC messages use the envelope `{ _zhttp: true, type, data }`; anything
 * without the `_zhttp` marker is ignored.
 */
class ClusterManager
{
    /**
     * @constructor
     * @param {object} [opts] - Cluster configuration.
     * @param {number} [opts.workers] - Number of worker processes (default: CPU count).
     * @param {boolean} [opts.respawn=true] - Automatically respawn crashed workers.
     * @param {number} [opts.respawnDelay=1000] - Initial delay (ms) before respawning.
     * @param {number} [opts.maxRespawnDelay=30000] - Maximum respawn delay after backoff.
     * @param {number} [opts.backoffFactor=2] - Multiplier for exponential backoff.
     */
    constructor(opts = {})
    {
        this._opts = { ...DEFAULTS, ...opts };

        /** @type {Map<number, import('cluster').Worker>} Worker ID → Worker */
        this._workers = new Map();

        /**
         * Worker ID → number of consecutive crashes that led to this worker
         * being spawned (0 for a fresh worker). Carried over to the
         * replacement so the backoff actually grows.
         * @type {Map<number, number>}
         */
        this._crashCounts = new Map();

        /** @type {Map<number, number>} Worker ID → spawn time (ms since epoch) */
        this._spawnedAt = new Map();

        /** @type {Set<number>} IDs of workers being stopped on purpose by reload(). */
        this._retiring = new Set();

        /**
         * IPC message listeners keyed by type. Prototype-less so that message
         * types such as 'toString' or 'constructor' cannot resolve to
         * Object.prototype members.
         * @type {Object<string, Function[]>}
         */
        this._messageHandlers = Object.create(null);

        /** Whether the cluster is shutting down. @private */
        this._shuttingDown = false;

        /** In-flight (or completed) shutdown promise. @private */
        this._shutdownPromise = null;

        /** In-flight rolling-restart promise, if any. @private */
        this._reloadPromise = null;

        /** Whether fork has been called. @private */
        this._started = false;

        /** Whether the worker-side `process` message listener is installed. @private */
        this._workerListenerInstalled = false;

        /** Worker-side metrics reporting interval handle. @private */
        this._metricsTimer = null;

        /** Registry passed to enableMetrics() on the primary. @private */
        this._aggregateRegistry = null;
    }

    /**
     * Whether the current process is the primary (master) process.
     * Falls back to `cluster.isMaster` when `cluster.isPrimary` is undefined.
     * @type {boolean}
     */
    get isPrimary()
    {
        return cluster.isPrimary !== undefined ? cluster.isPrimary : cluster.isMaster;
    }

    /**
     * Whether the current process is a worker process.
     * @type {boolean}
     */
    get isWorker()
    {
        return cluster.isWorker;
    }

    /**
     * Number of configured workers.
     * @type {number}
     */
    get workerCount()
    {
        return this._opts.workers;
    }

    /**
     * IDs of all currently tracked workers.
     * @type {number[]}
     */
    get workerIds()
    {
        return Array.from(this._workers.keys());
    }

    /**
     * Number of currently tracked (alive) workers.
     * @type {number}
     */
    get activeWorkers()
    {
        return this._workers.size;
    }

    /**
     * Fork all worker processes and install the `exit` / `message` listeners
     * on the cluster module. Does nothing on a worker or when called twice.
     *
     * @returns {ClusterManager} this
     *
     * @example
     *   const mgr = new ClusterManager({ workers: 4 });
     *   if (mgr.isPrimary) {
     *       mgr.fork();
     *   }
     */
    fork()
    {
        if (!this.isPrimary || this._started) return this;
        this._started = true;

        const count = this._opts.workers;
        log.info('forking %d workers', count);

        for (let i = 0; i < count; i++)
        {
            this._spawnWorker();
        }

        cluster.on('exit', (worker, code, signal) =>
        {
            this._onWorkerExit(worker, code, signal);
        });

        // Relay IPC messages from workers
        cluster.on('message', (worker, message) =>
        {
            if (message && typeof message === 'object' && message._zhttp)
            {
                this._handleMessage(worker, message);
            }
        });

        return this;
    }

    /**
     * Handle a worker exit: drop its bookkeeping, log the reason and, when
     * the exit was abnormal and unintended, schedule a replacement.
     *
     * The respawn delay is `respawnDelay * backoffFactor^(crashes - 1)`,
     * capped at `maxRespawnDelay`. The crash count is handed to the
     * replacement, and starts over once a worker has stayed up for at least
     * `maxRespawnDelay` ms.
     *
     * @private
     * @param {import('cluster').Worker} worker - The worker that exited.
     * @param {number|null} code - Exit code (null when killed by a signal).
     * @param {string|null} signal - Signal that killed the worker, if any.
     */
    _onWorkerExit(worker, code, signal)
    {
        // Read the bookkeeping before it is dropped.
        const priorCrashes = this._crashCounts.get(worker.id) || 0;
        const spawnedAt = this._spawnedAt.get(worker.id);
        const retired = this._retiring.delete(worker.id);
        this._forgetWorker(worker.id);

        if (this._shuttingDown)
        {
            log.info('worker %d exited during shutdown', worker.id);
            return;
        }

        if (signal)
        {
            log.warn('worker %d killed by signal %s', worker.id, signal);
        }
        else if (code !== 0)
        {
            log.warn('worker %d exited with code %d', worker.id, code);
        }
        else
        {
            log.info('worker %d exited cleanly', worker.id);
        }

        if (!this._opts.respawn || code === 0) return;

        // A worker stopped on purpose (disconnect()/reload()) must not be
        // respawned, even when the kill fallback ended it with a signal.
        if (retired || worker.exitedAfterDisconnect === true) return;

        const uptime = spawnedAt === undefined ? 0 : Date.now() - spawnedAt;
        const stable = uptime >= this._opts.maxRespawnDelay;
        const crashes = (stable ? 0 : priorCrashes) + 1;
        const delay = Math.min(
            this._opts.respawnDelay * Math.pow(this._opts.backoffFactor, crashes - 1),
            this._opts.maxRespawnDelay
        );

        log.info('respawning worker in %dms (crash #%d)', delay, crashes);
        setTimeout(() =>
        {
            if (!this._shuttingDown) this._spawnWorker(crashes);
        }, delay);
    }

    /**
     * Spawn a single worker process and start tracking it.
     * @private
     * @param {number} [crashCount=0] - Consecutive crashes inherited from the worker being replaced.
     * @returns {import('cluster').Worker}
     */
    _spawnWorker(crashCount = 0)
    {
        const worker = cluster.fork();
        this._workers.set(worker.id, worker);
        this._crashCounts.set(worker.id, crashCount);
        this._spawnedAt.set(worker.id, Date.now());
        log.info('worker %d spawned (pid %d)', worker.id, worker.process.pid);
        return worker;
    }

    /**
     * Drop all per-worker bookkeeping for a worker ID.
     * @private
     * @param {number} workerId
     */
    _forgetWorker(workerId)
    {
        this._workers.delete(workerId);
        this._crashCounts.delete(workerId);
        this._spawnedAt.delete(workerId);
    }

    /**
     * Whether a message can currently be sent to a worker. Sending on a
     * closed IPC channel makes Node report an error instead of delivering.
     * @private
     * @param {import('cluster').Worker} [worker]
     * @returns {boolean}
     */
    _canSend(worker)
    {
        return Boolean(worker) && !worker.isDead() && worker.isConnected();
    }

    // -- IPC Messaging ---------------------------------

    /**
     * Send a typed message from the primary to all connected workers.
     *
     * @param {string} type - Message type identifier.
     * @param {*} data - Message payload (must be serialisable).
     *
     * @example | Primary
     *   mgr.broadcast('config:update', { maxConn: 100 });
     *
     * @example | Worker
     *   mgr.onMessage('config:update', (data) => {
     *       console.log('new config:', data);
     *   });
     */
    broadcast(type, data)
    {
        const msg = { _zhttp: true, type, data };
        for (const worker of this._workers.values())
        {
            if (this._canSend(worker)) worker.send(msg);
        }
    }

    /**
     * Send a typed message to a specific worker. Silently does nothing when
     * the worker is unknown, dead or disconnected.
     *
     * @param {number} workerId - Target worker ID.
     * @param {string} type - Message type identifier.
     * @param {*} data - Message payload.
     */
    sendTo(workerId, type, data)
    {
        const worker = this._workers.get(workerId);
        if (this._canSend(worker))
        {
            worker.send({ _zhttp: true, type, data });
        }
    }

    /**
     * Send a typed message from a worker to the primary process.
     * Does nothing on the primary, or once the IPC channel is closed
     * (e.g. after `process.disconnect()` during shutdown).
     *
     * @param {string} type - Message type identifier.
     * @param {*} data - Message payload.
     *
     * @example | In a Worker
     *   mgr.sendToPrimary('metrics', { reqCount: 150 });
     */
    sendToPrimary(type, data)
    {
        if (!this.isWorker || !process.connected) return;
        process.send({ _zhttp: true, type, data });
    }

    /**
     * Register a handler for a typed IPC message.
     * On the primary, receives messages from workers.
     * On workers, receives messages from the primary.
     *
     * @param {string} type - Message type to listen for.
     * @param {Function} fn - Handler `(data, worker?) => void`. `worker` is only present on the primary.
     * @returns {ClusterManager} this
     *
     * @example
     *   mgr.onMessage('metrics', (data, worker) => {
     *       console.log('worker', worker.id, 'reports:', data);
     *   });
     */
    onMessage(type, fn)
    {
        if (!this._messageHandlers[type]) this._messageHandlers[type] = [];
        this._messageHandlers[type].push(fn);

        // On a worker, install the primary → worker listener once; primary-side
        // delivery goes through the cluster 'message' listener set up by fork().
        if (this.isWorker && !this._workerListenerInstalled)
        {
            this._workerListenerInstalled = true;
            process.on('message', (message) =>
            {
                if (message && typeof message === 'object' && message._zhttp)
                {
                    this._dispatch(message.type, [message.data]);
                }
            });
        }

        return this;
    }

    /**
     * Handle an incoming IPC message from a worker.
     * @private
     * @param {import('cluster').Worker} worker
     * @param {{ type: string, data: * }} message
     */
    _handleMessage(worker, message)
    {
        this._dispatch(message.type, [message.data, worker]);
    }

    /**
     * Invoke every handler registered for a message type. Iterates over a
     * copy so handlers may register further handlers, and logs (rather than
     * propagates) handler errors so one bad handler cannot break delivery.
     * @private
     * @param {string} type - Message type.
     * @param {Array<*>} args - Arguments passed to each handler.
     */
    _dispatch(type, args)
    {
        const fns = this._messageHandlers[type];
        if (!fns) return;
        for (const fn of fns.slice())
        {
            try { fn(...args); }
            catch (err) { log.error('message handler error: %s', (err && err.message) || err); }
        }
    }

    // -- Per-Worker Metrics Aggregation ----------------

    /**
     * Enable automatic per-worker metrics aggregation.
     * On a worker, periodically sends `registry.toJSON()` to the primary as a
     * `'metrics:report'` message. On the primary, passes every received
     * report to `registry.merge()`.
     *
     * @param {import('./observe/metrics').MetricsRegistry} registry - Registry to aggregate into (on primary) or report from (on worker).
     * @param {object} [opts] - Options.
     * @param {number} [opts.interval=5000] - Reporting interval in ms.
     * @returns {ClusterManager} this
     *
     * @example
     *   const { MetricsRegistry, cluster } = require('@zero-server/sdk');
     *   const registry = new MetricsRegistry();
     *
     *   cluster((mgr) => {
     *       const app = createApp();
     *       app.use(metricsMiddleware({ registry }));
     *       mgr.enableMetrics(registry, { interval: 3000 });
     *       app.listen(3000);
     *   });
     */
    enableMetrics(registry, opts = {})
    {
        const interval = opts.interval || 5000;

        if (this.isWorker)
        {
            // Replace, rather than leak, a timer from an earlier call.
            this.disableMetrics();

            this._metricsTimer = setInterval(() =>
            {
                // Reporting is best-effort: a failing snapshot must not crash the worker.
                try { this.sendToPrimary('metrics:report', registry.toJSON()); }
                catch (err) { log.error('metrics report failed: %s', (err && err.message) || err); }
            }, interval);
            // Do not keep the worker alive just for metrics reporting.
            if (this._metricsTimer.unref) this._metricsTimer.unref();
        }
        else if (this.isPrimary)
        {
            // Primary: aggregate incoming metrics
            this._aggregateRegistry = registry;
            this.onMessage('metrics:report', (data) =>
            {
                registry.merge(data);
            });
        }

        return this;
    }

    /**
     * Stop the per-worker metrics reporting timer, if one is running.
     */
    disableMetrics()
    {
        if (this._metricsTimer)
        {
            clearInterval(this._metricsTimer);
            this._metricsTimer = null;
        }
    }

    // -- Sticky Sessions --------------------------------

    /**
     * Enable sticky sessions by hashing client IP addresses to specific workers,
     * so connections from the same client are handed to the same worker while
     * the set of workers is unchanged.
     *
     * Adds a `connection` listener to `server` that forwards each accepted
     * socket to the chosen worker as a `'sticky:connection'` IPC message.
     * Sockets are destroyed when no worker can take them. Does nothing on a
     * worker process or without a server.
     *
     * NOTE(review): this only adds a listener; listeners already attached to
     * the server's `connection` event are left in place — confirm the server
     * passed in does not also process the socket itself. The worker-side
     * handling of `'sticky:connection'` is not part of this class.
     *
     * @param {import('http').Server|import('https').Server} server - The HTTP server to attach to.
     * @param {object} [opts] - Options.
     * @param {Function} [opts.hash] - Custom hash function `(ip, workerCount) => workerIndex`.
     * @returns {ClusterManager} this
     *
     * @example
     *   if (mgr.isPrimary) {
     *       const server = http.createServer();
     *       mgr.enableSticky(server);
     *       server.listen(3000);
     *   }
     */
    enableSticky(server, opts = {})
    {
        if (!this.isPrimary || !server) return this;

        const hashFn = typeof opts.hash === 'function'
            ? opts.hash
            : _defaultIpHash;

        server.on('connection', (socket) =>
        {
            // Don't distribute if no workers or shutting down
            if (this._workers.size === 0 || this._shuttingDown)
            {
                socket.destroy();
                return;
            }

            const ip = socket.remoteAddress || '';
            const workerIds = Array.from(this._workers.keys());
            const idx = hashFn(ip, workerIds.length);
            // The modulo keeps an over-range index from a custom hash in
            // bounds; any other invalid index yields no worker below.
            const worker = this._workers.get(workerIds[idx % workerIds.length]);

            if (this._canSend(worker))
            {
                worker.send({ _zhttp: true, type: 'sticky:connection' }, socket);
            }
            else
            {
                socket.destroy();
            }
        });

        return this;
    }

    // -- Graceful Restart & Shutdown -------------------

    /**
     * Perform a rolling restart of all workers.
     * Workers are replaced one at a time: a replacement is spawned and given
     * up to 10 s to emit `listening`, then the old worker is disconnected and
     * given up to 10 s to exit before it is killed.
     *
     * Calling `reload()` while a rolling restart is already running returns
     * the in-flight promise instead of starting a second one. The restart
     * stops early if the cluster begins shutting down.
     *
     * @returns {Promise<void>} Resolves when the rolling restart has finished.
     *
     * @example
     *   process.on('SIGHUP', () => mgr.reload());
     */
    async reload()
    {
        if (!this.isPrimary || this._shuttingDown) return;

        if (!this._reloadPromise)
        {
            this._reloadPromise = this._rollWorkers().finally(() =>
            {
                this._reloadPromise = null;
            });
        }

        return this._reloadPromise;
    }

    /**
     * Replace every worker that is tracked when the call starts, one by one.
     * @private
     * @returns {Promise<void>}
     */
    async _rollWorkers()
    {
        const workerIds = Array.from(this._workers.keys());
        log.info('rolling restart of %d workers', workerIds.length);

        for (const id of workerIds)
        {
            // Never spawn new workers into a cluster that is shutting down.
            if (this._shuttingDown) return;

            const old = this._workers.get(id);
            if (!old || old.isDead()) continue;

            // Spawn replacement and wait for it to come online (bounded).
            const replacement = this._spawnWorker();
            await this._waitForWorkerEvent(replacement, 'listening', 10000);

            if (this._shuttingDown) return;

            if (!old.isDead())
            {
                // Mark as intentional so the exit handler does not respawn it.
                this._retiring.add(id);
                if (old.isConnected()) old.disconnect();

                const exited = await this._waitForWorkerEvent(old, 'exit', 10000);
                if (!exited && !old.isDead()) old.kill();
            }

            this._forgetWorker(id);
            log.info('replaced worker %d → %d', id, replacement.id);
        }

        log.info('rolling restart complete');
    }

    /**
     * Wait for a worker to emit an event, with an upper bound. The timer and
     * the listener are both released whichever happens first.
     * @private
     * @param {import('cluster').Worker} worker - Worker to observe.
     * @param {string} event - Event name to wait for.
     * @param {number} timeoutMs - Maximum time to wait in ms.
     * @returns {Promise<boolean>} `true` if the event fired, `false` on timeout.
     */
    _waitForWorkerEvent(worker, event, timeoutMs)
    {
        return new Promise((resolve) =>
        {
            let timer = null;

            const onEvent = () =>
            {
                clearTimeout(timer);
                resolve(true);
            };

            timer = setTimeout(() =>
            {
                worker.removeListener(event, onEvent);
                resolve(false);
            }, timeoutMs);
            if (timer.unref) timer.unref();

            worker.once(event, onEvent);
        });
    }

    /**
     * Shut down the entire cluster gracefully.
     * Sends `'shutdown'` IPC message to all workers, disconnects them, then
     * waits for them to exit. Workers that don't exit within the timeout are
     * killed. Repeated calls return the promise of the first call, so every
     * caller waits for the same shutdown to finish.
     *
     * @param {object} [opts] - Shutdown options.
     * @param {number} [opts.timeout=30000] - Maximum ms to wait for workers to exit.
     * @returns {Promise<void>} Resolves when all workers have exited or been killed.
     *
     * @example
     *   process.on('SIGTERM', async () => {
     *       await mgr.shutdown({ timeout: 10000 });
     *       process.exit(0);
     *   });
     */
    async shutdown(opts = {})
    {
        if (!this._shutdownPromise)
        {
            // _runShutdown sets _shuttingDown synchronously, before its first await.
            this._shutdownPromise = this._runShutdown(opts);
        }

        return this._shutdownPromise;
    }

    /**
     * Perform the actual shutdown sequence (see shutdown()).
     * @private
     * @param {object} opts - Shutdown options.
     * @returns {Promise<void>}
     */
    async _runShutdown(opts)
    {
        this._shuttingDown = true;

        const timeout = opts.timeout || 30000;
        log.info('cluster shutdown initiated (timeout=%dms)', timeout);

        // Signal all workers to shut down
        this.broadcast('shutdown', {});

        // Disconnect workers gracefully
        for (const worker of this._workers.values())
        {
            if (this._canSend(worker)) worker.disconnect();
        }

        // Wait for all workers to exit, but no longer than the timeout
        let timer = null;
        await Promise.race([
            this._waitForAllWorkers(),
            new Promise((resolve) =>
            {
                timer = setTimeout(resolve, timeout);
                if (timer.unref) timer.unref();
            }),
        ]);
        clearTimeout(timer);

        // Kill any remaining workers
        for (const worker of this._workers.values())
        {
            if (!worker.isDead())
            {
                log.warn('force-killing worker %d', worker.id);
                worker.kill();
            }
        }

        this._workers.clear();
        this._crashCounts.clear();
        this._spawnedAt.clear();
        log.info('cluster shutdown complete');
    }

    /**
     * Wait until no workers are tracked any more. Relies on the `exit`
     * listener installed by fork() (registered earlier, so it runs first)
     * to remove exited workers from the tracking map.
     * @private
     * @returns {Promise<void>}
     */
    _waitForAllWorkers()
    {
        if (this._workers.size === 0) return Promise.resolve();

        return new Promise((resolve) =>
        {
            const check = () =>
            {
                if (this._workers.size !== 0) return;
                // Release the listener so repeated waits do not pile up on `cluster`.
                cluster.removeListener('exit', check);
                resolve();
            };
            cluster.on('exit', check);
        });
    }
}
586
+
587
+ // -- Convenience Function --------------------------
588
+
589
/**
 * High-level clustering helper.
 *
 * On the primary process it forks the workers and installs signal handlers:
 * SIGTERM / SIGINT shut the cluster down and then exit the process (code 0,
 * or 1 if the shutdown failed); SIGHUP (non-Windows only) triggers a rolling
 * restart. On a worker process it registers a `'shutdown'` IPC handler that
 * disconnects from the primary, then runs `workerFn`.
 *
 * @param {Function} workerFn - Function to execute on each worker process.
 *   Receives the ClusterManager instance as argument. Not called on the primary.
 * @param {object} [opts] - Cluster options (see ClusterManager constructor).
 * @returns {ClusterManager} The cluster manager instance (on both primary and workers).
 *
 * @example
 *   const { cluster } = require('@zero-server/sdk');
 *
 *   cluster((mgr) => {
 *       const app = createApp();
 *       app.get('/', (req, res) => res.json({ pid: process.pid }));
 *       app.listen(3000);
 *   }, { workers: 4 });
 */
function clusterize(workerFn, opts = {})
{
    const mgr = new ClusterManager(opts);

    if (mgr.isPrimary)
    {
        mgr.fork();

        // Install signal handlers on primary
        const shutdownHandler = (signal) =>
        {
            log.info('primary received %s', signal);
            mgr.shutdown()
                .then(() => process.exit(0))
                .catch((err) =>
                {
                    // Report why the shutdown failed before exiting non-zero.
                    log.error('cluster shutdown failed: %s', (err && err.message) || err);
                    process.exit(1);
                });
        };

        process.on('SIGTERM', shutdownHandler);
        process.on('SIGINT', shutdownHandler);

        // SIGHUP triggers rolling restart (Unix only)
        if (process.platform !== 'win32')
        {
            process.on('SIGHUP', () =>
            {
                // A failed reload must not surface as an unhandled rejection.
                mgr.reload().catch((err) =>
                {
                    log.error('rolling restart failed: %s', (err && err.message) || err);
                });
            });
        }
    }
    else
    {
        // Worker process — listen for shutdown IPC from primary
        mgr.onMessage('shutdown', () =>
        {
            log.info('worker received shutdown message');
            // App-level shutdown is handled by the lifecycle manager via SIGTERM fallback.
            // The primary may already have closed the channel; disconnecting twice throws.
            if (process.connected) process.disconnect();
        });

        workerFn(mgr);
    }

    return mgr;
}
646
+
647
/**
 * Default IP hash for sticky sessions.
 *
 * Folds the UTF-16 code units of `ip` into a DJB2-style hash
 * (`hash * 33 + unit`, seeded with 5381) that is kept within 31 bits,
 * then reduces it modulo the number of workers.
 *
 * @private
 * @param {string} ip - Client IP address.
 * @param {number} count - Number of workers.
 * @returns {number} Worker index; 0 when `count` is zero or negative.
 */
function _defaultIpHash(ip, count)
{
    if (count <= 0) return 0;

    let acc = 5381;
    let pos = 0;
    while (pos < ip.length)
    {
        // Math.imul(acc, 33) is the 32-bit form of (acc << 5) + acc;
        // the mask keeps the running value non-negative.
        acc = (Math.imul(acc, 33) + ip.charCodeAt(pos)) & 0x7fffffff;
        pos += 1;
    }

    return acc % count;
}
665
+
666
+ module.exports = { ClusterManager, clusterize, _defaultIpHash };