@ynode/cluster 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +41 -3
  2. package/package.json +1 -1
  3. package/src/cluster.js +158 -5
package/README.md CHANGED
@@ -41,11 +41,31 @@ const startServer = async () => {
  };

  // Start the cluster
- run(startServer, {
- mode: "smart", // Enable auto-scaling (default)
+ const control = run(startServer, {
+ mode: "smart",
  minWorkers: 2,
- maxWorkers: 8 // Default is os.availableParallelism()
+ maxWorkers: 4
  });
+
+ // Access metrics
+ setInterval(() => {
+ console.log(control.getMetrics());
+ }, 5000);
+
+ // Trigger zero-downtime reload (e.g., on SIGHUP or API call)
+ // control.reload();
+ ```
+
+ ### Zero-Downtime Reload
+
+ You can reload the cluster (e.g. after a code deployment) without dropping connections using `control.reload()`. This will:
+ 1. Sequentially start a new worker.
+ 2. Wait for it to come online.
+ 3. Gracefully shutdown the old worker.
+
+ ```js
+ await control.reload();
+ console.log("Reload complete!");
  ```

  ## Configuration
@@ -65,6 +85,24 @@ The `run(startWorker, options)` function accepts the following options:
  | `autoScaleInterval` | `number` | `5000` | Interval (ms) for auto-scaling checks in "smart" mode. |
  | `shutdownSignals` | `string[]` | `['SIGINT', 'SIGTERM', 'SIGQUIT']` | Signals to listen for to trigger graceful shutdown. |
  | `shutdownTimeout` | `number` | `10000` | Time (ms) to wait for workers to shutdown before forced exit. |
+ | `scaleUpMemory` | `number` | `0` | Threshold (MB) for average heap usage to trigger scaling up. |
+ | `maxWorkerMemory` | `number` | `0` | Max heap usage (MB) for a worker before restart (Leak Protection). |
+ | `norestart` | `boolean` | `false` | If true, workers will not be restarted when they die. |
+
+ ## Accessing Metrics
+
+ The `run()` function returns a `ClusterManager` instance (when in cluster mode) which exposes current metrics.
+
+ ```javascript
+ const manager = run(startWorker, { mode: "smart" });
+
+ // In your monitoring loop or API endpoint:
+ if (manager) {
+ const metrics = manager.getMetrics();
+ console.log(`Current Lag: ${metrics.avgLag.toFixed(2)}ms`);
+ console.log(`Active Workers: ${metrics.workerCount}`);
+ }
+ ```

  ## Working with @ynode/autoshutdown

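Taken together, the options documented in the README changes above can be combined in a single `run()` call. The sketch below is illustrative only and is not part of the published diff: the option names, `getMetrics()` and `reload()` come from the README, the named `run` export is taken from `src/cluster.js`, and the specific threshold values are arbitrary examples.

```js
import { run } from "@ynode/cluster";

const startServer = async () => {
  // Start your HTTP server / workload here.
};

const control = run(startServer, {
  mode: "smart",
  minWorkers: 2,
  maxWorkers: 4,
  scaleUpMemory: 256,   // scale up when average heap usage exceeds 256 MB (illustrative)
  maxWorkerMemory: 512, // restart any worker whose heap exceeds 512 MB (illustrative)
  norestart: false      // default: dead workers are restarted
});

// The manager API (getMetrics/reload) exists only in the primary;
// in workers, run() simply returns whatever startServer() returned.
if (typeof control?.getMetrics === "function") {
  setInterval(() => {
    const { avgLag, workerCount } = control.getMetrics();
    console.log(`avgLag=${avgLag.toFixed(2)}ms, workers=${workerCount}`);
  }, 5000);
}
```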
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@ynode/cluster",
- "version": "1.2.0",
+ "version": "1.2.2",
  "description": "Smart, auto-scaling Node.js cluster manager that monitors event loop lag to optimize performance and resource usage.",
  "main": "src/cluster.js",
  "exports": {
package/src/cluster.js CHANGED
@@ -50,10 +50,36 @@ import os from "node:os";
  * @param {object} log - The logger instance.
  */
  export function run(startWorker, options = true, log = console) {
- const isEnabled = typeof options === "object" ? options.enabled : options;
+ const isEnabled = typeof options === "object" ? (options.enabled ?? true) : options;

  if (cluster.isWorker || !isEnabled) {
  log.info(`Running worker process.`);
+
+ // Start heartbeat loop if enabled (and we are clustering)
+ if (cluster.isWorker) {
+ ;
+ let lastCheck = Date.now();
+ setInterval(() => {
+ const now = Date.now();
+ // Approximate event loop lag
+ const lag = now - lastCheck - 2000;
+ lastCheck = now;
+
+ const memory = process.memoryUsage();
+
+ try {
+ process.send({
+ cmd: "heartbeat",
+ lag: Math.max(0, lag),
+ memory: memory.heapUsed // Use heapUsed for primary scaling/monitoring
+ });
+ } catch (err) {
+ // Ignore, channel probably closed
+ log.debug("Failed to send heartbeat to master", err);
+ }
+ }, 2000).unref();
+ }
+
  return startWorker();
  }

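The worker-side heartbeat added above estimates event loop lag from timer drift: the interval is scheduled every 2000 ms, so if a tick actually arrives after 2035 ms, roughly 35 ms of lag is reported (clamped at zero). The standalone sketch below is illustrative only, not code from the package; it demonstrates the same drift measurement in isolation.

```js
// Illustrative sketch of lag-by-timer-drift, the technique used in the hunk above.
const INTERVAL_MS = 2000;
let lastCheck = Date.now();

setInterval(() => {
  const now = Date.now();
  // Anything beyond the scheduled 2000 ms is time the event loop was busy.
  const lag = Math.max(0, now - lastCheck - INTERVAL_MS);
  lastCheck = now;
  console.log(`approx. event loop lag: ${lag}ms, heapUsed: ${process.memoryUsage().heapUsed} bytes`);
}, INTERVAL_MS);
// Note: the package calls .unref() on this timer so the heartbeat never keeps a worker alive.
```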
@@ -70,6 +96,9 @@ export function run(startWorker, options = true, log = console) {
  autoScaleInterval = 5000,
  shutdownSignals = ["SIGINT", "SIGTERM", "SIGQUIT"],
  shutdownTimeout = 10000,
+ scaleUpMemory = 0, // MB (0 = disabled)
+ maxWorkerMemory = 0, // MB (0 = disabled)
+ norestart = false,
  } = typeof options === "object" ? options : {};

  if (minWorkers > maxWorkers) {
@@ -120,6 +149,7 @@ export function run(startWorker, options = true, log = console) {

  worker.on("message", (msg) => {
  if (msg.cmd === "heartbeat") {
+ // console.log(`[Master] Heartbeat from ${worker.id}: ${msg.memory} bytes`);
  workerLoads.set(worker.id, {
  lag: msg.lag,
  lastSeen: Date.now(),
@@ -141,6 +171,10 @@ export function run(startWorker, options = true, log = console) {
  return log.info(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}.`);
  }

+ if (norestart) {
+ return log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Not restarting (norestart enabled).`);
+ }
+
  log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Restarting...`);
  try {
  cluster.fork();
@@ -179,10 +213,44 @@ export function run(startWorker, options = true, log = console) {
  }

  const avgLag = totalLag / count;
+ // Calculate Average Memory in MB
+ let totalMemory = 0; // Bytes
+ for (const stats of workerLoads.values()) {
+ if (stats.memory) {
+ totalMemory += stats.memory;
+ }
+ }
+ const avgMemoryMB = count > 0 ? (totalMemory / count) / 1024 / 1024 : 0;
+
  const currentWorkers = Object.keys(cluster.workers).length;

- if (avgLag > scaleUpThreshold && currentWorkers < maxWorkers) {
- log.info(`High load detected (Avg Lag: ${avgLag.toFixed(2)}ms). Scaling up...`);
+ // Leak Protection (Max Worker Memory)
+ if (maxWorkerMemory > 0) {
+ for (const [id, stats] of workerLoads.entries()) {
+ const memMB = stats.memory / 1024 / 1024;
+ // console.log(`[Master] Checking Worker ${id} Memory: ${memMB.toFixed(2)}MB (Limit: ${maxWorkerMemory}MB)`);
+ if (memMB > maxWorkerMemory) {
+ log.warn(`Worker ${id} exceeded memory limit (${memMB.toFixed(2)}MB > ${maxWorkerMemory}MB). Restarting...`);
+ const worker = cluster.workers[id];
+ if (worker) {
+ worker.kill();
+ }
+ // Exit handler will restart it
+ return; // Wait for restart
+ }
+ }
+ }
+
+ // Scale Up logic (Lag OR Memory)
+ const shouldScaleUpLag = avgLag > scaleUpThreshold;
+ const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;
+
+ if ((shouldScaleUpLag || shouldScaleUpMem) && currentWorkers < maxWorkers) {
+ const reason = shouldScaleUpMem
+ ? `High Memory (Avg: ${avgMemoryMB.toFixed(2)}MB)`
+ : `High Lag (Avg: ${avgLag.toFixed(2)}ms)`;
+
+ log.info(`${reason} detected. Scaling up...`);
  try {
  cluster.fork();
  } catch (err) {
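As a worked example of the new scaling math (sample numbers invented for illustration): three workers reporting heapUsed of 180 MB, 260 MB and 220 MB average to 220 MB, so a `scaleUpMemory` of 200 would trigger a scale-up even if the average lag stays under `scaleUpThreshold`. A small sketch of the same arithmetic:

```js
// Invented heartbeat samples; the formulas mirror the hunk above.
const samples = [
  { lag: 12, memory: 180 * 1024 * 1024 },
  { lag: 95, memory: 260 * 1024 * 1024 },
  { lag: 40, memory: 220 * 1024 * 1024 },
];

const count = samples.length;
const avgLag = samples.reduce((sum, s) => sum + s.lag, 0) / count;                       // 49 ms
const avgMemoryMB = samples.reduce((sum, s) => sum + s.memory, 0) / count / 1024 / 1024; // 220 MB

// Illustrative settings: scaleUpThreshold = 50 ms, scaleUpMemory = 200 MB.
console.log(avgLag > 50);                  // false -> lag alone would not scale up
console.log(200 > 0 && avgMemoryMB > 200); // true  -> memory pressure triggers the scale-up
```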
@@ -230,11 +298,96 @@ export function run(startWorker, options = true, log = console) {
  // Allow some time for workers to clean up
  if (shutdownTimeout > 0) {
  setTimeout(() => {
- log.warn(`Master force exiting after ${shutdownTimeout}s timeout.`);
+ log.warn(`Master force exiting after ${shutdownTimeout / 1000}s timeout.`);
  process.exit(0);
- }, shutdownTimeout * 1000).unref();
+ }, shutdownTimeout).unref();
  }
  });
  });
  }
+
+ // Expose metrics API
+ return {
+ getMetrics: () => {
+ const currentWorkers = Object.keys(cluster.workers).length;
+ let totalLag = 0;
+ let count = 0;
+ const workersData = [];
+
+ for (const [id, stats] of workerLoads.entries()) {
+ totalLag += stats.lag;
+ count++;
+
+ const worker = cluster.workers[id];
+ workersData.push({
+ id,
+ pid: worker?.process.pid,
+ lag: stats.lag,
+ memory: stats.memory,
+ lastSeen: stats.lastSeen,
+ upltime: worker && (Date.now() - stats.lastSeen)
+ });
+ }
+
+ const avgLag = count > 0 ? (totalLag / count) : 0;
+
+ return {
+ workers: workersData,
+ totalLag,
+ avgLag,
+ workerCount: currentWorkers,
+ maxWorkers,
+ minWorkers,
+ scaleUpThreshold,
+ scaleDownThreshold,
+ mode
+ };
+ },
+ reload: async () => {
+ if (isShuttingDown) {
+ return;
+ }
+ log.info("Starting zero-downtime cluster reload...");
+
+ // Get a snapshot of current workers to replace
+ const currentWorkers = Object.values(cluster.workers);
+
+ for (const oldWorker of currentWorkers) {
+ if (!oldWorker) {
+ continue;
+ }
+
+ // Fork a new worker
+ log.info("Spawning replacement worker...");
+ const newWorker = cluster.fork();
+
+ // Wait for the new worker to be online
+ await new Promise((resolve) => {
+ newWorker.once("online", resolve);
+ });
+
+ // Wait for the new worker to be listening (optional, but safer for zero-downtime)
+ // However, not all workers listen. strict zero-downtime usually implies listening.
+ // We'll stick to 'online' for generic support in v1,
+ // but maybe add a small delay or check?
+ // For now, 'online' means the process is up and running.
+
+ log.info(`Replacement worker ${newWorker.process.pid} is online. Gracefully shutting down old worker ${oldWorker.process.pid}...`);
+
+ // Gracefully disconnect the old worker
+ oldWorker.disconnect();
+
+ // We don't strictly wait for the old worker to die here to speed up deployment,
+ // but it handles its own shutdown.
+ // If we wanted strict serial replacement (one dies, then next starts), we'd wait.
+ // But typically we want overlap.
+
+ // Wait for disconnect confirmation or short timeout to proceed to next
+ const disconnectPromise = new Promise(resolve => oldWorker.once("disconnect", resolve));
+ const timeoutPromise = new Promise(resolve => setTimeout(resolve, 2000).unref());
+ await Promise.race([disconnectPromise, timeoutPromise]);
+ }
+ log.info("Cluster reload complete.");
+ }
+ };
  }
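The README example above leaves `control.reload()` commented out with a hint about SIGHUP. A minimal wiring sketch under that assumption (illustrative, not part of the diff; it reuses the `run`/`startServer` setup from the README and Node's standard `process.on("SIGHUP", ...)` signal handling):

```js
const control = run(startServer, { mode: "smart", minWorkers: 2, maxWorkers: 4 });

// Reload workers without downtime when the primary receives SIGHUP (e.g. after a deploy).
if (control && typeof control.reload === "function") {
  process.on("SIGHUP", () => {
    control.reload()
      .then(() => console.log("Reload complete!"))
      .catch((err) => console.error("Reload failed:", err));
  });
}
```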