@ynode/cluster 1.1.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -3
- package/package.json +1 -1
- package/src/cluster.js +178 -26
package/README.md
CHANGED
@@ -41,11 +41,31 @@ const startServer = async () => {
 };

 // Start the cluster
-run(startServer, {
-  mode: "smart",
+const control = run(startServer, {
+  mode: "smart",
   minWorkers: 2,
-  maxWorkers:
+  maxWorkers: 4
 });
+
+// Access metrics
+setInterval(() => {
+  console.log(control.getMetrics());
+}, 5000);
+
+// Trigger zero-downtime reload (e.g., on SIGHUP or API call)
+// control.reload();
+```
+
+### Zero-Downtime Reload
+
+You can reload the cluster (e.g. after a code deployment) without dropping connections using `control.reload()`. This will:
+1. Sequentially start a new worker.
+2. Wait for it to come online.
+3. Gracefully shut down the old worker.
+
+```js
+await control.reload();
+console.log("Reload complete!");
 ```

 ## Configuration
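The reload comment in the example above mentions SIGHUP or an API call as triggers. A minimal sketch of the SIGHUP wiring, assuming `run` is imported from `@ynode/cluster` and `startServer` is the bootstrap function from the example (illustrative, not taken from the package):

```js
import { run } from "@ynode/cluster"; // assumed import of the exported run()

const startServer = async () => {
  // start your HTTP server / app here, as in the README example above
};

const control = run(startServer, { mode: "smart", minWorkers: 2, maxWorkers: 4 });

// Only the primary process receives the control object; workers get startServer()'s return value.
if (control && typeof control.reload === "function") {
  process.on("SIGHUP", async () => {
    await control.reload();
    console.log("Reload complete!");
  });
}
```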
@@ -62,6 +82,27 @@ The `run(startWorker, options)` function accepts the following options:
 | `scaleDownThreshold` | `number` | `10` | Event loop lag (ms) threshold to trigger scaling down. |
 | `scalingCooldown` | `number` | `10000` | Minimum time (ms) between scaling actions. |
 | `scaleDownGrace` | `number` | `30000` | Grace period (ms) after scaling up before scaling down is allowed. |
+| `autoScaleInterval` | `number` | `5000` | Interval (ms) for auto-scaling checks in "smart" mode. |
+| `shutdownSignals` | `string[]` | `['SIGINT', 'SIGTERM', 'SIGQUIT']` | Signals to listen for to trigger graceful shutdown. |
+| `shutdownTimeout` | `number` | `10000` | Time (ms) to wait for workers to shut down before forced exit. |
+| `scaleUpMemory` | `number` | `0` | Threshold (MB) for average heap usage to trigger scaling up. |
+| `maxWorkerMemory` | `number` | `0` | Max heap usage (MB) for a worker before restart (Leak Protection). |
+| `norestart` | `boolean` | `false` | If true, workers will not be restarted when they die. |
+
+## Accessing Metrics
+
+The `run()` function returns a `ClusterManager` instance (when in cluster mode) which exposes current metrics.
+
+```javascript
+const manager = run(startWorker, { mode: "smart" });
+
+// In your monitoring loop or API endpoint:
+if (manager) {
+  const metrics = manager.getMetrics();
+  console.log(`Current Lag: ${metrics.avgLag.toFixed(2)}ms`);
+  console.log(`Active Workers: ${metrics.workerCount}`);
+}
+```

 ## Working with @ynode/autoshutdown

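Purely as an illustration of the newly documented options (values are arbitrary, and `startServer` is the bootstrap from the earlier example):

```js
// Illustrative values only; option names come from the configuration table above.
const control = run(startServer, {
  mode: "smart",
  minWorkers: 2,
  maxWorkers: 8,
  autoScaleInterval: 5000,                // check load every 5 s
  scaleUpMemory: 512,                     // scale up when average heap use exceeds 512 MB
  maxWorkerMemory: 1024,                  // restart any worker whose heap exceeds 1 GB
  shutdownSignals: ["SIGINT", "SIGTERM"], // signals that trigger graceful shutdown
  shutdownTimeout: 15000,                 // force exit 15 s after a shutdown signal
  norestart: false                        // keep restarting crashed workers (default)
});
```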
package/package.json
CHANGED
package/src/cluster.js
CHANGED
@@ -50,10 +50,36 @@ import os from "node:os";
  * @param {object} log - The logger instance.
  */
 export function run(startWorker, options = true, log = console) {
-  const isEnabled = typeof options === "object" ? options.enabled : options;
+  const isEnabled = typeof options === "object" ? (options.enabled ?? true) : options;

   if (cluster.isWorker || !isEnabled) {
     log.info(`Running worker process.`);
+
+    // Start heartbeat loop if enabled (and we are clustering)
+    if (cluster.isWorker) {
+      ;
+      let lastCheck = Date.now();
+      setInterval(() => {
+        const now = Date.now();
+        // Approximate event loop lag
+        const lag = now - lastCheck - 2000;
+        lastCheck = now;
+
+        const memory = process.memoryUsage();
+
+        try {
+          process.send({
+            cmd: "heartbeat",
+            lag: Math.max(0, lag),
+            memory: memory.heapUsed // Use heapUsed for primary scaling/monitoring
+          });
+        } catch (err) {
+          // Ignore, channel probably closed
+          log.debug("Failed to send heartbeat to master", err);
+        }
+      }, 2000).unref();
+    }
+
     return startWorker();
   }

@@ -67,6 +93,12 @@ export function run(startWorker, options = true, log = console) {
     mode = "smart", // 'smart' or 'max'
     scalingCooldown = 10000,
     scaleDownGrace = 30000,
+    autoScaleInterval = 5000,
+    shutdownSignals = ["SIGINT", "SIGTERM", "SIGQUIT"],
+    shutdownTimeout = 10000,
+    scaleUpMemory = 0, // MB (0 = disabled)
+    maxWorkerMemory = 0, // MB (0 = disabled)
+    norestart = false,
   } = typeof options === "object" ? options : {};

   if (minWorkers > maxWorkers) {
@@ -117,6 +149,7 @@ export function run(startWorker, options = true, log = console) {

     worker.on("message", (msg) => {
       if (msg.cmd === "heartbeat") {
+        // console.log(`[Master] Heartbeat from ${worker.id}: ${msg.memory} bytes`);
         workerLoads.set(worker.id, {
           lag: msg.lag,
           lastSeen: Date.now(),
@@ -138,6 +171,10 @@ export function run(startWorker, options = true, log = console) {
       return log.info(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}.`);
     }

+    if (norestart) {
+      return log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Not restarting (norestart enabled).`);
+    }
+
     log.warn(`Worker [${worker.process.pid}: ${currentWorkers} of ${maxWorkers}] died. Code: ${code}, Signal: ${signal}. Restarting...`);
     try {
       cluster.fork();
@@ -176,10 +213,44 @@ export function run(startWorker, options = true, log = console) {
       }

       const avgLag = totalLag / count;
+      // Calculate Average Memory in MB
+      let totalMemory = 0; // Bytes
+      for (const stats of workerLoads.values()) {
+        if (stats.memory) {
+          totalMemory += stats.memory;
+        }
+      }
+      const avgMemoryMB = count > 0 ? (totalMemory / count) / 1024 / 1024 : 0;
+
       const currentWorkers = Object.keys(cluster.workers).length;

-
-
+      // Leak Protection (Max Worker Memory)
+      if (maxWorkerMemory > 0) {
+        for (const [id, stats] of workerLoads.entries()) {
+          const memMB = stats.memory / 1024 / 1024;
+          // console.log(`[Master] Checking Worker ${id} Memory: ${memMB.toFixed(2)}MB (Limit: ${maxWorkerMemory}MB)`);
+          if (memMB > maxWorkerMemory) {
+            log.warn(`Worker ${id} exceeded memory limit (${memMB.toFixed(2)}MB > ${maxWorkerMemory}MB). Restarting...`);
+            const worker = cluster.workers[id];
+            if (worker) {
+              worker.kill();
+            }
+            // Exit handler will restart it
+            return; // Wait for restart
+          }
+        }
+      }
+
+      // Scale Up logic (Lag OR Memory)
+      const shouldScaleUpLag = avgLag > scaleUpThreshold;
+      const shouldScaleUpMem = scaleUpMemory > 0 && avgMemoryMB > scaleUpMemory;
+
+      if ((shouldScaleUpLag || shouldScaleUpMem) && currentWorkers < maxWorkers) {
+        const reason = shouldScaleUpMem
+          ? `High Memory (Avg: ${avgMemoryMB.toFixed(2)}MB)`
+          : `High Lag (Avg: ${avgLag.toFixed(2)}ms)`;
+
+        log.info(`${reason} detected. Scaling up...`);
         try {
           cluster.fork();
         } catch (err) {
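A quick sanity check of the unit handling in the hunk above (not part of the package source): heartbeats report `heapUsed` in bytes, while `scaleUpMemory` and `maxWorkerMemory` are configured in MB.

```js
// Two workers reporting heapUsed of 300 MB and 500 MB (values stored in bytes):
const workerLoads = new Map([
  [1, { memory: 300 * 1024 * 1024 }],
  [2, { memory: 500 * 1024 * 1024 }],
]);

let totalMemory = 0;
for (const stats of workerLoads.values()) totalMemory += stats.memory;

const avgMemoryMB = (totalMemory / workerLoads.size) / 1024 / 1024;
console.log(avgMemoryMB); // 400

// With scaleUpMemory: 384, the 400 MB average triggers a scale-up;
// with maxWorkerMemory: 448, only the 500 MB worker would be restarted.
```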
@@ -208,34 +279,115 @@ export function run(startWorker, options = true, log = console) {

        return;
      }
-
+      return;
+    }, autoScaleInterval).unref();
   }

   // Graceful shutdown handling for Master
-
-
-
-
-
-
-
-
-
+  if (Array.isArray(shutdownSignals) && shutdownSignals.length > 0) {
+    shutdownSignals.forEach((signal) => {
+      process.on(signal, () => {
+        log.info(`Master received ${signal}, shutting down workers...`);
+        isShuttingDown = true;
+        for (const worker of Object.values(cluster.workers)) {
+          if (worker && worker.isConnected()) {
+            worker.send("shutdown");
+          }
        }
-      }

-
-
-
-
-
-
-
-        process.exit(0);
-      }, 10000).unref();
+        // Allow some time for workers to clean up
+        if (shutdownTimeout > 0) {
+          setTimeout(() => {
+            log.warn(`Master force exiting after ${shutdownTimeout / 1000}s timeout.`);
+            process.exit(0);
+          }, shutdownTimeout).unref();
+        }
+      });
    });
-    }
-  }
+  }

+  // Expose metrics API
+  return {
+    getMetrics: () => {
+      const currentWorkers = Object.keys(cluster.workers).length;
+      let totalLag = 0;
+      let count = 0;
+      const workersData = [];
+
+      for (const [id, stats] of workerLoads.entries()) {
+        totalLag += stats.lag;
+        count++;
+
+        const worker = cluster.workers[id];
+        workersData.push({
+          id,
+          pid: worker?.process.pid,
+          lag: stats.lag,
+          memory: stats.memory,
+          lastSeen: stats.lastSeen,
+          uptime: worker && (Date.now() - stats.lastSeen)
+        });
+      }
+
+      const avgLag = count > 0 ? (totalLag / count) : 0;
+
+      return {
+        workers: workersData,
+        totalLag,
+        avgLag,
+        workerCount: currentWorkers,
+        maxWorkers,
+        minWorkers,
+        scaleUpThreshold,
+        scaleDownThreshold,
+        mode
+      };
+    },
+    reload: async () => {
+      if (isShuttingDown) {
+        return;
+      }
+      log.info("Starting zero-downtime cluster reload...");
+
+      // Get a snapshot of current workers to replace
+      const currentWorkers = Object.values(cluster.workers);

+      for (const oldWorker of currentWorkers) {
+        if (!oldWorker) {
+          continue;
+        }
+
+        // Fork a new worker
+        log.info("Spawning replacement worker...");
+        const newWorker = cluster.fork();
+
+        // Wait for the new worker to be online
+        await new Promise((resolve) => {
+          newWorker.once("online", resolve);
+        });
+
+        // Wait for the new worker to be listening (optional, but safer for zero-downtime)
+        // However, not all workers listen. Strict zero-downtime usually implies listening.
+        // We'll stick to 'online' for generic support in v1,
+        // but maybe add a small delay or check?
+        // For now, 'online' means the process is up and running.
+
+        log.info(`Replacement worker ${newWorker.process.pid} is online. Gracefully shutting down old worker ${oldWorker.process.pid}...`);
+
+        // Gracefully disconnect the old worker
+        oldWorker.disconnect();
+
+        // We don't strictly wait for the old worker to die here to speed up deployment,
+        // but it handles its own shutdown.
+        // If we wanted strict serial replacement (one dies, then next starts), we'd wait.
+        // But typically we want overlap.
+
+        // Wait for disconnect confirmation or short timeout to proceed to next
+        const disconnectPromise = new Promise(resolve => oldWorker.once("disconnect", resolve));
+        const timeoutPromise = new Promise(resolve => setTimeout(resolve, 2000).unref());
+        await Promise.race([disconnectPromise, timeoutPromise]);
+      }
+      log.info("Cluster reload complete.");
+    }
+  };
+}