@ynode/cluster 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -3
- package/package.json +1 -1
- package/src/cluster.js +158 -5
package/README.md
CHANGED

@@ -41,11 +41,31 @@ const startServer = async () => {
 };
 
 // Start the cluster
-run(startServer, {
-  mode: "smart",
+const control = run(startServer, {
+  mode: "smart",
   minWorkers: 2,
-  maxWorkers:
+  maxWorkers: 4
 });
+
+// Access metrics
+setInterval(() => {
+  console.log(control.getMetrics());
+}, 5000);
+
+// Trigger zero-downtime reload (e.g., on SIGHUP or API call)
+// control.reload();
+```
+
+### Zero-Downtime Reload
+
+You can reload the cluster (e.g. after a code deployment) without dropping connections using `control.reload()`. This will:
+1. Sequentially start a new worker.
+2. Wait for it to come online.
+3. Gracefully shutdown the old worker.
+
+```js
+await control.reload();
+console.log("Reload complete!");
 ```
 
 ## Configuration
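The `// control.reload();` comment above leaves the trigger to the caller. A minimal sketch of the SIGHUP wiring it hints at, assuming the `control` handle returned by `run()` in the primary process (inside a worker, `run()` returns the worker's own `startServer()` result, hence the guard):

```js
// Sketch only: wire the reload hinted at in the README comment to a signal handler.
if (control && typeof control.reload === "function") {
  process.on("SIGHUP", async () => {
    await control.reload(); // rolling worker replacement
    console.log("Cluster reloaded via SIGHUP");
  });
}
```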
@@ -65,6 +85,24 @@ The `run(startWorker, options)` function accepts the following options:
 | `autoScaleInterval` | `number` | `5000` | Interval (ms) for auto-scaling checks in "smart" mode. |
 | `shutdownSignals` | `string[]` | `['SIGINT', 'SIGTERM', 'SIGQUIT']` | Signals to listen for to trigger graceful shutdown. |
 | `shutdownTimeout` | `number` | `10000` | Time (ms) to wait for workers to shutdown before forced exit. |
+| `scaleUpMemory` | `number` | `0` | Threshold (MB) for average heap usage to trigger scaling up. |
+| `maxWorkerMemory` | `number` | `0` | Max heap usage (MB) for a worker before restart (Leak Protection). |
+| `norestart` | `boolean` | `false` | If true, workers will not be restarted when they die. |
+
+## Accessing Metrics
+
+The `run()` function returns a `ClusterManager` instance (when in cluster mode) which exposes current metrics.
+
+```javascript
+const manager = run(startWorker, { mode: "smart" });
+
+// In your monitoring loop or API endpoint:
+if (manager) {
+  const metrics = manager.getMetrics();
+  console.log(`Current Lag: ${metrics.avgLag.toFixed(2)}ms`);
+  console.log(`Active Workers: ${metrics.workerCount}`);
+}
+```
 
 ## Working with @ynode/autoshutdown
 
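For orientation, a configuration sketch exercising the three options added to the table above; the numeric values are illustrative, not recommendations:

```js
// Sketch only: illustrative values for the new memory-related options.
// startServer is the worker entry point from the usage example above.
const control = run(startServer, {
  mode: "smart",
  minWorkers: 2,
  maxWorkers: 4,
  scaleUpMemory: 256,   // fork an extra worker when average heapUsed exceeds ~256 MB
  maxWorkerMemory: 512, // restart any single worker whose heapUsed exceeds ~512 MB
  norestart: false      // default: crashed workers are restarted
});
```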
package/package.json
CHANGED
package/src/cluster.js
CHANGED

@@ -50,10 +50,36 @@ import os from "node:os";
  * @param {object} log - The logger instance.
  */
 export function run(startWorker, options = true, log = console) {
-  const isEnabled = typeof options === "object" ? options.enabled : options;
+  const isEnabled = typeof options === "object" ? (options.enabled ?? true) : options;
 
   if (cluster.isWorker || !isEnabled) {
     log.info(`Running worker process.`);
+
+    // Start heartbeat loop if enabled (and we are clustering)
+    if (cluster.isWorker) {
+      ;
+      let lastCheck = Date.now();
+      setInterval(() => {
+        const now = Date.now();
+        // Approximate event loop lag
+        const lag = now - lastCheck - 2000;
+        lastCheck = now;
+
+        const memory = process.memoryUsage();
+
+        try {
+          process.send({
+            cmd: "heartbeat",
+            lag: Math.max(0, lag),
+            memory: memory.heapUsed // Use heapUsed for primary scaling/monitoring
+          });
+        } catch (err) {
+          // Ignore, channel probably closed
+          log.debug("Failed to send heartbeat to master", err);
+        }
+      }, 2000).unref();
+    }
+
     return startWorker();
   }
 
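The heartbeat's lag figure is the usual timer-drift approximation: a 2000 ms interval fires late by however long the event loop was blocked, and that delay is reported as lag. A self-contained sketch of the same technique, outside the package:

```js
// Timer-drift approximation of event loop lag (same idea as the heartbeat above).
const INTERVAL_MS = 2000;
let last = Date.now();

setInterval(() => {
  const now = Date.now();
  // If the loop was busy, the timer fires later than scheduled.
  const lag = Math.max(0, now - last - INTERVAL_MS);
  last = now;
  console.log(`approx event loop lag: ${lag}ms`);
}, INTERVAL_MS).unref(); // .unref() keeps the timer from holding the process open, as in the package
```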
@@ -70,6 +96,9 @@ export function run(startWorker, options = true, log = console) {
     autoScaleInterval = 5000,
     shutdownSignals = ["SIGINT", "SIGTERM", "SIGQUIT"],
     shutdownTimeout = 10000,
+    scaleUpMemory = 0, // MB (0 = disabled)
+    maxWorkerMemory = 0, // MB (0 = disabled)
+    norestart = false,
   } = typeof options === "object" ? options : {};
 
   if (minWorkers > maxWorkers) {

@@ -120,6 +149,7 @@ export function run(startWorker, options = true, log = console) {
 
     worker.on("message", (msg) => {
       if (msg.cmd === "heartbeat") {
+        // console.log(`[Master] Heartbeat from ${worker.id}: ${msg.memory} bytes`);
         workerLoads.set(worker.id, {
           lag: msg.lag,
           lastSeen: Date.now(),

@@ -141,6 +171,10 @@ export function run(startWorker, options = true, log = console) {
       return log.info(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}.`);
     }
 
+    if (norestart) {
+      return log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Not restarting (norestart enabled).`);
+    }
+
     log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Restarting...`);
     try {
       cluster.fork();

@@ -179,10 +213,44 @@ export function run(startWorker, options = true, log = console) {
     }
 
     const avgLag = totalLag / count;
+    // Calculate Average Memory in MB
+    let totalMemory = 0; // Bytes
+    for (const stats of workerLoads.values()) {
+      if (stats.memory) {
+        totalMemory += stats.memory;
+      }
+    }
+    const avgMemoryMB = count > 0 ? (totalMemory / count) / 1024 / 1024 : 0;
+
     const currentWorkers = Object.keys(cluster.workers).length;
 
-
-
+    // Leak Protection (Max Worker Memory)
+    if (maxWorkerMemory > 0) {
+      for (const [id, stats] of workerLoads.entries()) {
+        const memMB = stats.memory / 1024 / 1024;
+        // console.log(`[Master] Checking Worker ${id} Memory: ${memMB.toFixed(2)}MB (Limit: ${maxWorkerMemory}MB)`);
+        if (memMB > maxWorkerMemory) {
+          log.warn(`Worker ${id} exceeded memory limit (${memMB.toFixed(2)}MB > ${maxWorkerMemory}MB). Restarting...`);
+          const worker = cluster.workers[id];
+          if (worker) {
+            worker.kill();
+          }
+          // Exit handler will restart it
+          return; // Wait for restart
+        }
+      }
+    }
+
+    // Scale Up logic (Lag OR Memory)
+    const shouldScaleUpLag = avgLag > scaleUpThreshold;
+    const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;
+
+    if ((shouldScaleUpLag || shouldScaleUpMem) && currentWorkers < maxWorkers) {
+      const reason = shouldScaleUpMem
+        ? `High Memory (Avg: ${avgMemoryMB.toFixed(2)}MB)`
+        : `High Lag (Avg: ${avgLag.toFixed(2)}ms)`;
+
+      log.info(`${reason} detected. Scaling up...`);
       try {
         cluster.fork();
       } catch (err) {
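Two different thresholds are checked in this hunk: `maxWorkerMemory` caps each worker's own heap, while `scaleUpMemory` watches the fleet-wide average; heartbeats report `heapUsed` in bytes, so both checks divide by 1024 twice. A small arithmetic sketch with made-up numbers:

```js
// Illustrative numbers only (heapUsed reported by three workers, in bytes).
const heapUsed = [180e6, 220e6, 560e6];

const toMB = (bytes) => bytes / 1024 / 1024;
const avgMemoryMB = toMB(heapUsed.reduce((a, b) => a + b, 0) / heapUsed.length);

console.log(avgMemoryMB.toFixed(2)); // ~305.18 MB average
console.log(toMB(heapUsed[2]) > 512); // true  -> with maxWorkerMemory = 512, that worker is killed and restarted
console.log(avgMemoryMB > 256);       // true  -> with scaleUpMemory = 256, a scale-up fork is triggered
```

Note that in the hunk above the per-worker check returns early, so the leak-protection restart happens first and any scale-up follows on a later auto-scale tick.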
@@ -230,11 +298,96 @@ export function run(startWorker, options = true, log = console) {
         // Allow some time for workers to clean up
         if (shutdownTimeout > 0) {
           setTimeout(() => {
-            log.warn(`Master force exiting after ${shutdownTimeout}s timeout.`);
+            log.warn(`Master force exiting after ${shutdownTimeout / 1000}s timeout.`);
             process.exit(0);
-          }, shutdownTimeout
+          }, shutdownTimeout).unref();
         }
       });
     });
   }
+
+  // Expose metrics API
+  return {
+    getMetrics: () => {
+      const currentWorkers = Object.keys(cluster.workers).length;
+      let totalLag = 0;
+      let count = 0;
+      const workersData = [];
+
+      for (const [id, stats] of workerLoads.entries()) {
+        totalLag += stats.lag;
+        count++;
+
+        const worker = cluster.workers[id];
+        workersData.push({
+          id,
+          pid: worker?.process.pid,
+          lag: stats.lag,
+          memory: stats.memory,
+          lastSeen: stats.lastSeen,
+          upltime: worker && (Date.now() - stats.lastSeen)
+        });
+      }
+
+      const avgLag = count > 0 ? (totalLag / count) : 0;
+
+      return {
+        workers: workersData,
+        totalLag,
+        avgLag,
+        workerCount: currentWorkers,
+        maxWorkers,
+        minWorkers,
+        scaleUpThreshold,
+        scaleDownThreshold,
+        mode
+      };
+    },
+    reload: async () => {
+      if (isShuttingDown) {
+        return;
+      }
+      log.info("Starting zero-downtime cluster reload...");
+
+      // Get a snapshot of current workers to replace
+      const currentWorkers = Object.values(cluster.workers);
+
+      for (const oldWorker of currentWorkers) {
+        if (!oldWorker) {
+          continue;
+        }
+
+        // Fork a new worker
+        log.info("Spawning replacement worker...");
+        const newWorker = cluster.fork();
+
+        // Wait for the new worker to be online
+        await new Promise((resolve) => {
+          newWorker.once("online", resolve);
+        });
+
+        // Wait for the new worker to be listening (optional, but safer for zero-downtime)
+        // However, not all workers listen. strict zero-downtime usually implies listening.
+        // We'll stick to 'online' for generic support in v1,
+        // but maybe add a small delay or check?
+        // For now, 'online' means the process is up and running.
+
+        log.info(`Replacement worker ${newWorker.process.pid} is online. Gracefully shutting down old worker ${oldWorker.process.pid}...`);
+
+        // Gracefully disconnect the old worker
+        oldWorker.disconnect();
+
+        // We don't strictly wait for the old worker to die here to speed up deployment,
+        // but it handles its own shutdown.
+        // If we wanted strict serial replacement (one dies, then next starts), we'd wait.
+        // But typically we want overlap.
+
+        // Wait for disconnect confirmation or short timeout to proceed to next
+        const disconnectPromise = new Promise(resolve => oldWorker.once("disconnect", resolve));
+        const timeoutPromise = new Promise(resolve => setTimeout(resolve, 2000).unref());
+        await Promise.race([disconnectPromise, timeoutPromise]);
+      }
+      log.info("Cluster reload complete.");
+    }
+  };
 }
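One plausible way to surface the new `getMetrics()` data is a small HTTP health endpoint in the primary process. The sketch below assumes the named `run` export shown in `src/cluster.js` is importable from the package, and reuses the `startServer` pattern from the README; the route and port are illustrative:

```js
import http from "node:http";
import { run } from "@ynode/cluster"; // assumes the named export from src/cluster.js

// Stand-in for the startServer() worker entry point from the README example.
const startServer = async () => {
  // ... start your HTTP app here ...
};

// In the primary, run() returns the control object; in a worker it returns
// whatever startServer() produces, hence the guard below.
const manager = run(startServer, { mode: "smart", minWorkers: 2, maxWorkers: 4 });

if (manager && typeof manager.getMetrics === "function") {
  http.createServer((req, res) => {
    if (req.url === "/cluster-metrics") {
      res.setHeader("Content-Type", "application/json");
      res.end(JSON.stringify(manager.getMetrics()));
      return;
    }
    res.statusCode = 404;
    res.end();
  }).listen(9090); // illustrative port, served only by the primary process
}
```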