@ynode/cluster 1.1.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +44 -3
  2. package/package.json +1 -1
  3. package/src/cluster.js +178 -26
package/README.md CHANGED
@@ -41,11 +41,31 @@ const startServer = async () => {
  };

  // Start the cluster
- run(startServer, {
- mode: "smart", // Enable auto-scaling (default)
+ const control = run(startServer, {
+ mode: "smart",
  minWorkers: 2,
- maxWorkers: 8 // Default is os.availableParallelism()
+ maxWorkers: 4
  });
+
+ // Access metrics
+ setInterval(() => {
+ console.log(control.getMetrics());
+ }, 5000);
+
+ // Trigger zero-downtime reload (e.g., on SIGHUP or API call)
+ // control.reload();
+ ```
+
+ ### Zero-Downtime Reload
+
+ You can reload the cluster (e.g. after a code deployment) without dropping connections using `control.reload()`. This will:
+ 1. Sequentially start a new worker.
+ 2. Wait for it to come online.
+ 3. Gracefully shutdown the old worker.
+
+ ```js
+ await control.reload();
+ console.log("Reload complete!");
  ```
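As the commented-out line in the usage example suggests, a reload is typically wired to a signal or an admin endpoint. A minimal sketch of the SIGHUP wiring, assuming it runs alongside the usage example above; the guard on `control.reload` is a defensive assumption, since the control object is only meaningful in the primary process:

```js
// Sketch: trigger a zero-downtime reload on SIGHUP (e.g. after a deploy).
// `control` is the value returned by run() in the example above; the guard
// is an assumption to avoid calling reload() from a worker process.
if (control && typeof control.reload === "function") {
  process.on("SIGHUP", async () => {
    console.log("SIGHUP received, reloading cluster...");
    await control.reload();
    console.log("Reload complete!");
  });
}
```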

  ## Configuration
@@ -62,6 +82,27 @@ The `run(startWorker, options)` function accepts the following options:
  | `scaleDownThreshold` | `number` | `10` | Event loop lag (ms) threshold to trigger scaling down. |
  | `scalingCooldown` | `number` | `10000` | Minimum time (ms) between scaling actions. |
  | `scaleDownGrace` | `number` | `30000` | Grace period (ms) after scaling up before scaling down is allowed. |
+ | `autoScaleInterval` | `number` | `5000` | Interval (ms) for auto-scaling checks in "smart" mode. |
+ | `shutdownSignals` | `string[]` | `['SIGINT', 'SIGTERM', 'SIGQUIT']` | Signals to listen for to trigger graceful shutdown. |
+ | `shutdownTimeout` | `number` | `10000` | Time (ms) to wait for workers to shutdown before forced exit. |
+ | `scaleUpMemory` | `number` | `0` | Threshold (MB) for average heap usage to trigger scaling up. |
+ | `maxWorkerMemory` | `number` | `0` | Max heap usage (MB) for a worker before restart (Leak Protection). |
+ | `norestart` | `boolean` | `false` | If true, workers will not be restarted when they die. |
+
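The new options above slot into the same `run()` call shown earlier. A sketch of a configuration exercising them; the specific values are illustrative rather than recommendations, and `startServer` is the function from the usage example:

```js
// Illustrative values only; defaults are listed in the table above.
run(startServer, {
  mode: "smart",
  minWorkers: 2,
  maxWorkers: 4,
  autoScaleInterval: 5000,               // check load every 5s
  scaleUpMemory: 512,                    // scale up when average heap exceeds 512 MB
  maxWorkerMemory: 1024,                 // restart any worker above 1 GB heap (leak protection)
  shutdownSignals: ["SIGINT", "SIGTERM"],
  shutdownTimeout: 15000,                // force-exit the primary after 15s
  norestart: false                       // keep restarting crashed workers (default)
});
```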
+ ## Accessing Metrics
+
+ The `run()` function returns a `ClusterManager` instance (when in cluster mode) which exposes current metrics.
+
+ ```javascript
+ const manager = run(startWorker, { mode: "smart" });
+
+ // In your monitoring loop or API endpoint:
+ if (manager) {
+ const metrics = manager.getMetrics();
+ console.log(`Current Lag: ${metrics.avgLag.toFixed(2)}ms`);
+ console.log(`Active Workers: ${metrics.workerCount}`);
+ }
+ ```

  ## Working with @ynode/autoshutdown

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@ynode/cluster",
- "version": "1.1.0",
+ "version": "1.2.2",
  "description": "Smart, auto-scaling Node.js cluster manager that monitors event loop lag to optimize performance and resource usage.",
  "main": "src/cluster.js",
  "exports": {
package/src/cluster.js CHANGED
@@ -50,10 +50,36 @@ import os from "node:os";
  * @param {object} log - The logger instance.
  */
  export function run(startWorker, options = true, log = console) {
- const isEnabled = typeof options === "object" ? options.enabled : options;
+ const isEnabled = typeof options === "object" ? (options.enabled ?? true) : options;

  if (cluster.isWorker || !isEnabled) {
  log.info(`Running worker process.`);
+
+ // Start heartbeat loop if enabled (and we are clustering)
+ if (cluster.isWorker) {
+ ;
+ let lastCheck = Date.now();
+ setInterval(() => {
+ const now = Date.now();
+ // Approximate event loop lag
+ const lag = now - lastCheck - 2000;
+ lastCheck = now;
+
+ const memory = process.memoryUsage();
+
+ try {
+ process.send({
+ cmd: "heartbeat",
+ lag: Math.max(0, lag),
+ memory: memory.heapUsed // Use heapUsed for primary scaling/monitoring
+ });
+ } catch (err) {
+ // Ignore, channel probably closed
+ log.debug("Failed to send heartbeat to master", err);
+ }
+ }, 2000).unref();
+ }
+
  return startWorker();
  }
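The heartbeat added in this hunk estimates event-loop lag from timer drift: a 2000 ms interval that fires N ms late implies the loop was blocked for roughly N ms, and the worker reports that figure (plus `heapUsed`) to the primary. A stripped-down sketch of the same technique outside the cluster context:

```js
// Minimal timer-drift lag estimate (same idea as the worker heartbeat above).
const INTERVAL = 2000;
let last = Date.now();

setInterval(() => {
  const now = Date.now();
  const lag = Math.max(0, now - last - INTERVAL); // how late the timer fired
  last = now;
  console.log(`approx event loop lag: ${lag}ms`);
}, INTERVAL).unref();
```

Node's `perf_hooks.monitorEventLoopDelay()` offers a higher-resolution measurement, but the drift approach needs no extra APIs, which is presumably why the heartbeat uses it.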

@@ -67,6 +93,12 @@ export function run(startWorker, options = true, log = console) {
  mode = "smart", // 'smart' or 'max'
  scalingCooldown = 10000,
  scaleDownGrace = 30000,
+ autoScaleInterval = 5000,
+ shutdownSignals = ["SIGINT", "SIGTERM", "SIGQUIT"],
+ shutdownTimeout = 10000,
+ scaleUpMemory = 0, // MB (0 = disabled)
+ maxWorkerMemory = 0, // MB (0 = disabled)
+ norestart = false,
  } = typeof options === "object" ? options : {};

  if (minWorkers > maxWorkers) {
@@ -117,6 +149,7 @@ export function run(startWorker, options = true, log = console) {

  worker.on("message", (msg) => {
  if (msg.cmd === "heartbeat") {
+ // console.log(`[Master] Heartbeat from ${worker.id}: ${msg.memory} bytes`);
  workerLoads.set(worker.id, {
  lag: msg.lag,
  lastSeen: Date.now(),
@@ -138,6 +171,10 @@ export function run(startWorker, options = true, log = console) {
  return log.info(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}.`);
  }

+ if (norestart) {
+ return log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Not restarting (norestart enabled).`);
+ }
+
  log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Restarting...`);
  try {
  cluster.fork();
@@ -176,10 +213,44 @@ export function run(startWorker, options = true, log = console) {
  }

  const avgLag = totalLag / count;
+ // Calculate Average Memory in MB
+ let totalMemory = 0; // Bytes
+ for (const stats of workerLoads.values()) {
+ if (stats.memory) {
+ totalMemory += stats.memory;
+ }
+ }
+ const avgMemoryMB = count > 0 ? (totalMemory / count) / 1024 / 1024 : 0;
+
  const currentWorkers = Object.keys(cluster.workers).length;

- if (avgLag > scaleUpThreshold && currentWorkers < maxWorkers) {
- log.info(`High load detected (Avg Lag: ${avgLag.toFixed(2)}ms). Scaling up...`);
+ // Leak Protection (Max Worker Memory)
+ if (maxWorkerMemory > 0) {
+ for (const [id, stats] of workerLoads.entries()) {
+ const memMB = stats.memory / 1024 / 1024;
+ // console.log(`[Master] Checking Worker ${id} Memory: ${memMB.toFixed(2)}MB (Limit: ${maxWorkerMemory}MB)`);
+ if (memMB > maxWorkerMemory) {
+ log.warn(`Worker ${id} exceeded memory limit (${memMB.toFixed(2)}MB > ${maxWorkerMemory}MB). Restarting...`);
+ const worker = cluster.workers[id];
+ if (worker) {
+ worker.kill();
+ }
+ // Exit handler will restart it
+ return; // Wait for restart
+ }
+ }
+ }
+
+ // Scale Up logic (Lag OR Memory)
+ const shouldScaleUpLag = avgLag > scaleUpThreshold;
+ const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;
+
+ if ((shouldScaleUpLag || shouldScaleUpMem) && currentWorkers < maxWorkers) {
+ const reason = shouldScaleUpMem
+ ? `High Memory (Avg: ${avgMemoryMB.toFixed(2)}MB)`
+ : `High Lag (Avg: ${avgLag.toFixed(2)}ms)`;
+
+ log.info(`${reason} detected. Scaling up...`);
  try {
  cluster.fork();
  } catch (err) {
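To make the scaling arithmetic above concrete: heartbeats carry `heapUsed` in bytes, the primary averages them, converts to MB, and forks when either the lag threshold or the optional memory threshold is exceeded. A worked check with assumed numbers (not real measurements):

```js
// Assumed heartbeat data, for illustration only.
const loads = [
  { lag: 3, memory: 600 * 1024 * 1024 }, // worker 1: 600 MB heap
  { lag: 5, memory: 700 * 1024 * 1024 }  // worker 2: 700 MB heap
];
const scaleUpThreshold = 50; // ms
const scaleUpMemory = 512;   // MB (0 would disable the memory check)

const avgLag = loads.reduce((s, w) => s + w.lag, 0) / loads.length;                        // 4 ms
const avgMemoryMB = loads.reduce((s, w) => s + w.memory, 0) / loads.length / 1024 / 1024;  // 650 MB

const shouldScaleUpLag = avgLag > scaleUpThreshold;                         // false
const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;  // true

console.log(shouldScaleUpLag || shouldScaleUpMem); // true -> cluster.fork() would be triggered
```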
@@ -208,34 +279,115 @@ export function run(startWorker, options = true, log = console) {

  return;
  }
- }, 5000).unref();
+ return;
+ }, autoScaleInterval).unref();
  }

  // Graceful shutdown handling for Master
- const signals = ["SIGINT", "SIGTERM", "SIGQUIT"];
-
- signals.forEach((signal) => {
- process.on(signal, () => {
- log.info(`Master received ${signal}, shutting down workers...`);
- isShuttingDown = true;
- for (const worker of Object.values(cluster.workers)) {
- if (worker && worker.isConnected()) {
- worker.send("shutdown");
+ if (Array.isArray(shutdownSignals) && shutdownSignals.length > 0) {
+ shutdownSignals.forEach((signal) => {
+ process.on(signal, () => {
+ log.info(`Master received ${signal}, shutting down workers...`);
+ isShuttingDown = true;
+ for (const worker of Object.values(cluster.workers)) {
+ if (worker && worker.isConnected()) {
+ worker.send("shutdown");
+ }
  }
- }

- // Allow some time for workers to clean up?
- // Ideally we wait for them to exit, but for now we just let the process exit eventually
- // or rely on the fact that existing "shutdown" message logic in worker handles close.
- // We can just exit the master after a timeout if we want to force it,
- // but usually letting workers exit will cause master to exit if all handles are closed.
- // For safety in this template, we'll force exit after a timeout.
- setTimeout(() => {
- log.info("Master force exiting after timeout.");
- process.exit(0);
- }, 10000).unref();
+ // Allow some time for workers to clean up
+ if (shutdownTimeout > 0) {
+ setTimeout(() => {
+ log.warn(`Master force exiting after ${shutdownTimeout / 1000}s timeout.`);
+ process.exit(0);
+ }, shutdownTimeout).unref();
+ }
+ });
  });
- });
- }
+ }

+ // Expose metrics API
+ return {
+ getMetrics: () => {
+ const currentWorkers = Object.keys(cluster.workers).length;
+ let totalLag = 0;
+ let count = 0;
+ const workersData = [];
+
+ for (const [id, stats] of workerLoads.entries()) {
+ totalLag += stats.lag;
+ count++;
+
+ const worker = cluster.workers[id];
+ workersData.push({
+ id,
+ pid: worker?.process.pid,
+ lag: stats.lag,
+ memory: stats.memory,
+ lastSeen: stats.lastSeen,
+ upltime: worker && (Date.now() - stats.lastSeen)
+ });
+ }
+
+ const avgLag = count > 0 ? (totalLag / count) : 0;
+
+ return {
+ workers: workersData,
+ totalLag,
+ avgLag,
+ workerCount: currentWorkers,
+ maxWorkers,
+ minWorkers,
+ scaleUpThreshold,
+ scaleDownThreshold,
+ mode
+ };
+ },
+ reload: async () => {
+ if (isShuttingDown) {
+ return;
+ }
+ log.info("Starting zero-downtime cluster reload...");
+
+ // Get a snapshot of current workers to replace
+ const currentWorkers = Object.values(cluster.workers);

+ for (const oldWorker of currentWorkers) {
+ if (!oldWorker) {
+ continue;
+ }
+
+ // Fork a new worker
+ log.info("Spawning replacement worker...");
+ const newWorker = cluster.fork();
+
+ // Wait for the new worker to be online
+ await new Promise((resolve) => {
+ newWorker.once("online", resolve);
+ });
+
+ // Wait for the new worker to be listening (optional, but safer for zero-downtime)
+ // However, not all workers listen. strict zero-downtime usually implies listening.
+ // We'll stick to 'online' for generic support in v1,
+ // but maybe add a small delay or check?
+ // For now, 'online' means the process is up and running.
+
+ log.info(`Replacement worker ${newWorker.process.pid} is online. Gracefully shutting down old worker ${oldWorker.process.pid}...`);
+
+ // Gracefully disconnect the old worker
+ oldWorker.disconnect();
+
+ // We don't strictly wait for the old worker to die here to speed up deployment,
+ // but it handles its own shutdown.
+ // If we wanted strict serial replacement (one dies, then next starts), we'd wait.
+ // But typically we want overlap.
+
+ // Wait for disconnect confirmation or short timeout to proceed to next
+ const disconnectPromise = new Promise(resolve => oldWorker.once("disconnect", resolve));
+ const timeoutPromise = new Promise(resolve => setTimeout(resolve, 2000).unref());
+ await Promise.race([disconnectPromise, timeoutPromise]);
+ }
+ log.info("Cluster reload complete.");
+ }
+ };
+ }
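Two worker-side expectations follow from the code above but are left to your `startWorker` implementation: graceful shutdown sends the string message "shutdown" to each connected worker, and reload disconnects old workers. A hedged sketch of one way a worker might honor both; `server` and the timeout value are assumptions for illustration, not part of the package:

```js
// Sketch of a worker-side handler for the "shutdown" message sent by the primary.
// `server` is assumed to be the http.Server created inside startWorker().
process.on("message", (msg) => {
  if (msg === "shutdown") {
    // Stop accepting new connections, let in-flight requests finish, then exit.
    server.close(() => process.exit(0));

    // Safety net: exit even if some sockets never close (assumed 8s, tune as needed).
    setTimeout(() => process.exit(1), 8000).unref();
  }
});

// reload() uses worker.disconnect(); the worker process sees a "disconnect" event.
process.on("disconnect", () => {
  server.close(() => process.exit(0));
});
```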