@ynode/cluster 1.1.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -3
- package/package.json +1 -1
- package/src/cluster.js +178 -26
package/README.md
CHANGED
@@ -41,11 +41,31 @@ const startServer = async () => {
 };

 // Start the cluster
-run(startServer, {
-  mode: "smart",
+const control = run(startServer, {
+  mode: "smart",
   minWorkers: 2,
-  maxWorkers:
+  maxWorkers: 4
 });
+
+// Access metrics
+setInterval(() => {
+  console.log(control.getMetrics());
+}, 5000);
+
+// Trigger zero-downtime reload (e.g., on SIGHUP or API call)
+// control.reload();
+```
+
+### Zero-Downtime Reload
+
+You can reload the cluster (e.g. after a code deployment) without dropping connections using `control.reload()`. This will:
+1. Sequentially start a new worker.
+2. Wait for it to come online.
+3. Gracefully shut down the old worker.
+
+```js
+await control.reload();
+console.log("Reload complete!");
 ```

 ## Configuration
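The reload comment in the example above mentions SIGHUP or an API call as triggers. A minimal sketch of the SIGHUP wiring, assuming `run` is imported from `@ynode/cluster` and `startServer` is the bootstrap function from the example (illustrative, not taken from the package):

```js
import { run } from "@ynode/cluster"; // assumed import of the exported run()

const startServer = async () => {
  // start your HTTP server / app here, as in the README example above
};

const control = run(startServer, { mode: "smart", minWorkers: 2, maxWorkers: 4 });

// Only the primary process receives the control object; workers get startServer()'s return value.
if (control && typeof control.reload === "function") {
  process.on("SIGHUP", async () => {
    await control.reload();
    console.log("Reload complete!");
  });
}
```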
@@ -62,6 +82,27 @@ The `run(startWorker, options)` function accepts the following options:
 | `scaleDownThreshold` | `number` | `10` | Event loop lag (ms) threshold to trigger scaling down. |
 | `scalingCooldown` | `number` | `10000` | Minimum time (ms) between scaling actions. |
 | `scaleDownGrace` | `number` | `30000` | Grace period (ms) after scaling up before scaling down is allowed. |
+| `autoScaleInterval` | `number` | `5000` | Interval (ms) for auto-scaling checks in "smart" mode. |
+| `shutdownSignals` | `string[]` | `['SIGINT', 'SIGTERM', 'SIGQUIT']` | Signals to listen for to trigger graceful shutdown. |
+| `shutdownTimeout` | `number` | `10000` | Time (ms) to wait for workers to shut down before forced exit. |
+| `scaleUpMemory` | `number` | `0` | Threshold (MB) for average heap usage to trigger scaling up. |
+| `maxWorkerMemory` | `number` | `0` | Max heap usage (MB) for a worker before restart (Leak Protection). |
+| `norestart` | `boolean` | `false` | If true, workers will not be restarted when they die. |
+
+## Accessing Metrics
+
+The `run()` function returns a `ClusterManager` instance (when in cluster mode) which exposes current metrics.
+
+```javascript
+const manager = run(startWorker, { mode: "smart" });
+
+// In your monitoring loop or API endpoint:
+if (manager) {
+  const metrics = manager.getMetrics();
+  console.log(`Current Lag: ${metrics.avgLag.toFixed(2)}ms`);
+  console.log(`Active Workers: ${metrics.workerCount}`);
+}
+```

 ## Working with @ynode/autoshutdown

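Purely as an illustration of the newly documented options (values are arbitrary, and `startServer` is the bootstrap from the earlier example):

```js
// Illustrative values only; option names come from the configuration table above.
const control = run(startServer, {
  mode: "smart",
  minWorkers: 2,
  maxWorkers: 8,
  autoScaleInterval: 5000,                // check load every 5 s
  scaleUpMemory: 512,                     // scale up when average heap use exceeds 512 MB
  maxWorkerMemory: 1024,                  // restart any worker whose heap exceeds 1 GB
  shutdownSignals: ["SIGINT", "SIGTERM"], // signals that trigger graceful shutdown
  shutdownTimeout: 15000,                 // force exit 15 s after a shutdown signal
  norestart: false                        // keep restarting crashed workers (default)
});
```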
package/package.json
CHANGED
package/src/cluster.js
CHANGED
@@ -50,10 +50,36 @@ import os from "node:os";
  * @param {object} log - The logger instance.
  */
 export function run(startWorker, options = true, log = console) {
-  const isEnabled = typeof options === "object" ? options.enabled : options;
+  const isEnabled = typeof options === "object" ? (options.enabled ?? true) : options;

   if (cluster.isWorker || !isEnabled) {
     log.info(`Running worker process.`);
+
+    // Start heartbeat loop if enabled (and we are clustering)
+    if (cluster.isWorker) {
+      ;
+      let lastCheck = Date.now();
+      setInterval(() => {
+        const now = Date.now();
+        // Approximate event loop lag
+        const lag = now - lastCheck - 2000;
+        lastCheck = now;
+
+        const memory = process.memoryUsage();
+
+        try {
+          process.send({
+            cmd: "heartbeat",
+            lag: Math.max(0, lag),
+            memory: memory.heapUsed // Use heapUsed for primary scaling/monitoring
+          });
+        } catch (err) {
+          // Ignore, channel probably closed
+          log.debug("Failed to send heartbeat to master", err);
+        }
+      }, 2000).unref();
+    }
+
     return startWorker();
   }

@@ -67,6 +93,12 @@ export function run(startWorker, options = true, log = console) {
     mode = "smart", // 'smart' or 'max'
     scalingCooldown = 10000,
     scaleDownGrace = 30000,
+    autoScaleInterval = 5000,
+    shutdownSignals = ["SIGINT", "SIGTERM", "SIGQUIT"],
+    shutdownTimeout = 10000,
+    scaleUpMemory = 0, // MB (0 = disabled)
+    maxWorkerMemory = 0, // MB (0 = disabled)
+    norestart = false,
   } = typeof options === "object" ? options : {};

   if (minWorkers > maxWorkers) {
@@ -117,6 +149,7 @@ export function run(startWorker, options = true, log = console) {

     worker.on("message", (msg) => {
       if (msg.cmd === "heartbeat") {
+        // console.log(`[Master] Heartbeat from ${worker.id}: ${msg.memory} bytes`);
         workerLoads.set(worker.id, {
           lag: msg.lag,
           lastSeen: Date.now(),
@@ -138,6 +171,10 @@ export function run(startWorker, options = true, log = console) {
       return log.info(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}.`);
     }

+    if (norestart) {
+      return log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Not restarting (norestart enabled).`);
+    }
+
     log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Restarting...`);
     try {
       cluster.fork();
@@ -176,10 +213,44 @@ export function run(startWorker, options = true, log = console) {
       }

       const avgLag = totalLag / count;
+      // Calculate Average Memory in MB
+      let totalMemory = 0; // Bytes
+      for (const stats of workerLoads.values()) {
+        if (stats.memory) {
+          totalMemory += stats.memory;
+        }
+      }
+      const avgMemoryMB = count > 0 ? (totalMemory / count) / 1024 / 1024 : 0;
+
       const currentWorkers = Object.keys(cluster.workers).length;

-
-
+      // Leak Protection (Max Worker Memory)
+      if (maxWorkerMemory > 0) {
+        for (const [id, stats] of workerLoads.entries()) {
+          const memMB = stats.memory / 1024 / 1024;
+          // console.log(`[Master] Checking Worker ${id} Memory: ${memMB.toFixed(2)}MB (Limit: ${maxWorkerMemory}MB)`);
+          if (memMB > maxWorkerMemory) {
+            log.warn(`Worker ${id} exceeded memory limit (${memMB.toFixed(2)}MB > ${maxWorkerMemory}MB). Restarting...`);
+            const worker = cluster.workers[id];
+            if (worker) {
+              worker.kill();
+            }
+            // Exit handler will restart it
+            return; // Wait for restart
+          }
+        }
+      }
+
+      // Scale Up logic (Lag OR Memory)
+      const shouldScaleUpLag = avgLag > scaleUpThreshold;
+      const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;
+
+      if ((shouldScaleUpLag || shouldScaleUpMem) && currentWorkers < maxWorkers) {
+        const reason = shouldScaleUpMem
+          ? `High Memory (Avg: ${avgMemoryMB.toFixed(2)}MB)`
+          : `High Lag (Avg: ${avgLag.toFixed(2)}ms)`;
+
+        log.info(`${reason} detected. Scaling up...`);
         try {
           cluster.fork();
         } catch (err) {
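A quick sanity check of the unit handling in the hunk above (not part of the package source): heartbeats report `heapUsed` in bytes, while `scaleUpMemory` and `maxWorkerMemory` are configured in MB.

```js
// Two workers reporting heapUsed of 300 MB and 500 MB (values stored in bytes):
const workerLoads = new Map([
  [1, { memory: 300 * 1024 * 1024 }],
  [2, { memory: 500 * 1024 * 1024 }],
]);

let totalMemory = 0;
for (const stats of workerLoads.values()) totalMemory += stats.memory;

const avgMemoryMB = (totalMemory / workerLoads.size) / 1024 / 1024;
console.log(avgMemoryMB); // 400

// With scaleUpMemory: 384, the 400 MB average triggers a scale-up;
// with maxWorkerMemory: 448, only the 500 MB worker would be restarted.
```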
@@ -208,34 +279,115 @@ export function run(startWorker, options = true, log = console) {

        return;
      }
-
+      return;
+    }, autoScaleInterval).unref();
   }

   // Graceful shutdown handling for Master
-
-
-
-
-
-
-
-
-
+  if (Array.isArray(shutdownSignals) && shutdownSignals.length > 0) {
+    shutdownSignals.forEach((signal) => {
+      process.on(signal, () => {
+        log.info(`Master received ${signal}, shutting down workers...`);
+        isShuttingDown = true;
+        for (const worker of Object.values(cluster.workers)) {
+          if (worker && worker.isConnected()) {
+            worker.send("shutdown");
+          }
        }
-      }

-
-
-
-
-
-
-
-        process.exit(0);
-      }, 10000).unref();
+        // Allow some time for workers to clean up
+        if (shutdownTimeout > 0) {
+          setTimeout(() => {
+            log.warn(`Master force exiting after ${shutdownTimeout / 1000}s timeout.`);
+            process.exit(0);
+          }, shutdownTimeout).unref();
+        }
+      });
    });
-    }
-  }
+  }

+  // Expose metrics API
+  return {
+    getMetrics: () => {
+      const currentWorkers = Object.keys(cluster.workers).length;
+      let totalLag = 0;
+      let count = 0;
+      const workersData = [];
+
+      for (const [id, stats] of workerLoads.entries()) {
+        totalLag += stats.lag;
+        count++;
+
+        const worker = cluster.workers[id];
+        workersData.push({
+          id,
+          pid: worker?.process.pid,
+          lag: stats.lag,
+          memory: stats.memory,
+          lastSeen: stats.lastSeen,
+          uptime: worker && (Date.now() - stats.lastSeen)
+        });
+      }
+
+      const avgLag = count > 0 ? (totalLag / count) : 0;
+
+      return {
+        workers: workersData,
+        totalLag,
+        avgLag,
+        workerCount: currentWorkers,
+        maxWorkers,
+        minWorkers,
+        scaleUpThreshold,
+        scaleDownThreshold,
+        mode
+      };
+    },
+    reload: async () => {
+      if (isShuttingDown) {
+        return;
+      }
+      log.info("Starting zero-downtime cluster reload...");
+
+      // Get a snapshot of current workers to replace
+      const currentWorkers = Object.values(cluster.workers);

+      for (const oldWorker of currentWorkers) {
+        if (!oldWorker) {
+          continue;
+        }
+
+        // Fork a new worker
+        log.info("Spawning replacement worker...");
+        const newWorker = cluster.fork();
+
+        // Wait for the new worker to be online
+        await new Promise((resolve) => {
+          newWorker.once("online", resolve);
+        });
+
+        // Wait for the new worker to be listening (optional, but safer for zero-downtime)
+        // However, not all workers listen. Strict zero-downtime usually implies listening.
+        // We'll stick to 'online' for generic support in v1,
+        // but maybe add a small delay or check?
+        // For now, 'online' means the process is up and running.
+
+        log.info(`Replacement worker ${newWorker.process.pid} is online. Gracefully shutting down old worker ${oldWorker.process.pid}...`);
+
+        // Gracefully disconnect the old worker
+        oldWorker.disconnect();
+
+        // We don't strictly wait for the old worker to die here to speed up deployment,
+        // but it handles its own shutdown.
+        // If we wanted strict serial replacement (one dies, then next starts), we'd wait.
+        // But typically we want overlap.
+
+        // Wait for disconnect confirmation or short timeout to proceed to next
+        const disconnectPromise = new Promise(resolve => oldWorker.once("disconnect", resolve));
+        const timeoutPromise = new Promise(resolve => setTimeout(resolve, 2000).unref());
+        await Promise.race([disconnectPromise, timeoutPromise]);
+      }
+      log.info("Cluster reload complete.");
+    }
+  };
+}