xypriss 1.2.4 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -3
- package/dist/cjs/mods/security/src/components/cache/index.js +1 -1
- package/dist/cjs/shared/logger/Logger.js +2 -0
- package/dist/cjs/shared/logger/Logger.js.map +1 -1
- package/dist/cjs/src/cluster/bun-cluster-manager.js +1567 -0
- package/dist/cjs/src/cluster/bun-cluster-manager.js.map +1 -0
- package/dist/cjs/src/cluster/cluster-manager.js +1 -1
- package/dist/cjs/src/cluster/cluster-manager.js.map +1 -1
- package/dist/cjs/src/cluster/index.js +25 -6
- package/dist/cjs/src/cluster/index.js.map +1 -1
- package/dist/cjs/src/cluster/memory-manager.js +486 -0
- package/dist/cjs/src/cluster/memory-manager.js.map +1 -0
- package/dist/cjs/src/cluster/modules/BunIPCManager.js +603 -0
- package/dist/cjs/src/cluster/modules/BunIPCManager.js.map +1 -0
- package/dist/cjs/src/cluster/modules/ClusterFactory.js +22 -1
- package/dist/cjs/src/cluster/modules/ClusterFactory.js.map +1 -1
- package/dist/cjs/src/cluster/modules/CpuMonitor.js +658 -0
- package/dist/cjs/src/cluster/modules/CpuMonitor.js.map +1 -0
- package/dist/cjs/src/cluster/modules/CrossPlatformMemory.js +257 -0
- package/dist/cjs/src/cluster/modules/CrossPlatformMemory.js.map +1 -0
- package/dist/cjs/src/cluster/modules/ProcessMonitor.js +513 -0
- package/dist/cjs/src/cluster/modules/ProcessMonitor.js.map +1 -0
- package/dist/cjs/src/plugins/server-maintenance-plugin.js +79 -14
- package/dist/cjs/src/plugins/server-maintenance-plugin.js.map +1 -1
- package/dist/cjs/src/server/FastServer.js +64 -43
- package/dist/cjs/src/server/FastServer.js.map +1 -1
- package/dist/cjs/src/server/components/fastapi/ClusterManagerComponent.js +226 -10
- package/dist/cjs/src/server/components/fastapi/ClusterManagerComponent.js.map +1 -1
- package/dist/cjs/src/server/const/Cluster.config.js +174 -31
- package/dist/cjs/src/server/const/Cluster.config.js.map +1 -1
- package/dist/cjs/src/server/const/default.js +11 -2
- package/dist/cjs/src/server/const/default.js.map +1 -1
- package/dist/cjs/src/server/utils/PortManager.js +26 -15
- package/dist/cjs/src/server/utils/PortManager.js.map +1 -1
- package/dist/esm/mods/security/src/components/cache/index.js +1 -1
- package/dist/esm/shared/logger/Logger.js +2 -0
- package/dist/esm/shared/logger/Logger.js.map +1 -1
- package/dist/esm/src/cluster/bun-cluster-manager.js +1565 -0
- package/dist/esm/src/cluster/bun-cluster-manager.js.map +1 -0
- package/dist/esm/src/cluster/cluster-manager.js +1 -1
- package/dist/esm/src/cluster/cluster-manager.js.map +1 -1
- package/dist/esm/src/cluster/index.js +25 -6
- package/dist/esm/src/cluster/index.js.map +1 -1
- package/dist/esm/src/cluster/memory-manager.js +484 -0
- package/dist/esm/src/cluster/memory-manager.js.map +1 -0
- package/dist/esm/src/cluster/modules/BunIPCManager.js +601 -0
- package/dist/esm/src/cluster/modules/BunIPCManager.js.map +1 -0
- package/dist/esm/src/cluster/modules/ClusterFactory.js +22 -1
- package/dist/esm/src/cluster/modules/ClusterFactory.js.map +1 -1
- package/dist/esm/src/cluster/modules/CpuMonitor.js +656 -0
- package/dist/esm/src/cluster/modules/CpuMonitor.js.map +1 -0
- package/dist/esm/src/cluster/modules/CrossPlatformMemory.js +255 -0
- package/dist/esm/src/cluster/modules/CrossPlatformMemory.js.map +1 -0
- package/dist/esm/src/cluster/modules/ProcessMonitor.js +511 -0
- package/dist/esm/src/cluster/modules/ProcessMonitor.js.map +1 -0
- package/dist/esm/src/plugins/server-maintenance-plugin.js +79 -14
- package/dist/esm/src/plugins/server-maintenance-plugin.js.map +1 -1
- package/dist/esm/src/server/FastServer.js +64 -43
- package/dist/esm/src/server/FastServer.js.map +1 -1
- package/dist/esm/src/server/components/fastapi/ClusterManagerComponent.js +226 -10
- package/dist/esm/src/server/components/fastapi/ClusterManagerComponent.js.map +1 -1
- package/dist/esm/src/server/const/Cluster.config.js +174 -31
- package/dist/esm/src/server/const/Cluster.config.js.map +1 -1
- package/dist/esm/src/server/const/default.js +11 -2
- package/dist/esm/src/server/const/default.js.map +1 -1
- package/dist/esm/src/server/utils/PortManager.js +26 -15
- package/dist/esm/src/server/utils/PortManager.js.map +1 -1
- package/dist/index.d.ts +90 -2
- package/package.json +6 -1
- package/dist/cjs/src/plugins/modules/network/index.js +0 -120
- package/dist/cjs/src/plugins/modules/network/index.js.map +0 -1
- package/dist/cjs/src/server/plugins/PluginEngine.js +0 -378
- package/dist/cjs/src/server/plugins/PluginEngine.js.map +0 -1
- package/dist/cjs/src/server/plugins/PluginRegistry.js +0 -339
- package/dist/cjs/src/server/plugins/PluginRegistry.js.map +0 -1
- package/dist/cjs/src/server/plugins/builtin/JWTAuthPlugin.js +0 -591
- package/dist/cjs/src/server/plugins/builtin/JWTAuthPlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/builtin/ResponseTimePlugin.js +0 -413
- package/dist/cjs/src/server/plugins/builtin/ResponseTimePlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/builtin/SmartCachePlugin.js +0 -843
- package/dist/cjs/src/server/plugins/builtin/SmartCachePlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/core/CachePlugin.js +0 -1975
- package/dist/cjs/src/server/plugins/core/CachePlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/core/PerformancePlugin.js +0 -894
- package/dist/cjs/src/server/plugins/core/PerformancePlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/core/SecurityPlugin.js +0 -799
- package/dist/cjs/src/server/plugins/core/SecurityPlugin.js.map +0 -1
- package/dist/cjs/src/server/plugins/types/PluginTypes.js +0 -47
- package/dist/cjs/src/server/plugins/types/PluginTypes.js.map +0 -1
- package/dist/esm/src/plugins/modules/network/index.js +0 -109
- package/dist/esm/src/plugins/modules/network/index.js.map +0 -1
- package/dist/esm/src/server/plugins/PluginEngine.js +0 -376
- package/dist/esm/src/server/plugins/PluginEngine.js.map +0 -1
- package/dist/esm/src/server/plugins/PluginRegistry.js +0 -337
- package/dist/esm/src/server/plugins/PluginRegistry.js.map +0 -1
- package/dist/esm/src/server/plugins/builtin/JWTAuthPlugin.js +0 -589
- package/dist/esm/src/server/plugins/builtin/JWTAuthPlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/builtin/ResponseTimePlugin.js +0 -411
- package/dist/esm/src/server/plugins/builtin/ResponseTimePlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/builtin/SmartCachePlugin.js +0 -841
- package/dist/esm/src/server/plugins/builtin/SmartCachePlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/core/CachePlugin.js +0 -1973
- package/dist/esm/src/server/plugins/core/CachePlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/core/PerformancePlugin.js +0 -872
- package/dist/esm/src/server/plugins/core/PerformancePlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/core/SecurityPlugin.js +0 -797
- package/dist/esm/src/server/plugins/core/SecurityPlugin.js.map +0 -1
- package/dist/esm/src/server/plugins/types/PluginTypes.js +0 -47
- package/dist/esm/src/server/plugins/types/PluginTypes.js.map +0 -1
|
@@ -0,0 +1,1565 @@
|
|
|
1
|
+
import { EventEmitter } from 'events';
|
|
2
|
+
import { logger } from '../../shared/logger/Logger.js';
|
|
3
|
+
import { MemoryManager } from './memory-manager.js';
|
|
4
|
+
import { randomBytes, createHash } from 'crypto';
|
|
5
|
+
import { performance } from 'perf_hooks';
|
|
6
|
+
import { CpuMonitor } from './modules/CpuMonitor.js';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Robust Bun-compatible cluster manager
|
|
10
|
+
* Uses Bun's native process spawning and IPC capabilities with enhanced security and reliability
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Bun-compatible cluster manager with enhanced security and robustness
|
|
14
|
+
*/
|
|
15
|
+
class BunClusterManager extends EventEmitter {
|
|
16
|
+
constructor(config, basePort = 8085) {
|
|
17
|
+
super();
|
|
18
|
+
this.workers = new Map();
|
|
19
|
+
this.isRunning = false;
|
|
20
|
+
this.startTime = 0;
|
|
21
|
+
this.maxShutdownTime = 30000; // 30 seconds
|
|
22
|
+
this.workerPorts = new Set();
|
|
23
|
+
this._validateConfig(config);
|
|
24
|
+
this.config = config;
|
|
25
|
+
this.basePort = basePort;
|
|
26
|
+
this.masterToken = this._generateSecureToken();
|
|
27
|
+
// Initialize security configuration
|
|
28
|
+
this.securityConfig = {
|
|
29
|
+
maxRestartAttempts: config.processManagement?.maxRestarts || 3,
|
|
30
|
+
restartWindow: 300000, // 5 minutes
|
|
31
|
+
maxMemoryPerWorker: this._parseMemoryString(config.resources?.maxMemoryPerWorker || "512MB"),
|
|
32
|
+
allowedSignals: ["SIGTERM", "SIGKILL", "SIGUSR1", "SIGUSR2"],
|
|
33
|
+
processTimeout: 30000, // 30 seconds
|
|
34
|
+
enableResourceLimits: true,
|
|
35
|
+
};
|
|
36
|
+
// Initialize memory manager with error handling
|
|
37
|
+
try {
|
|
38
|
+
this.memoryManager = new MemoryManager(config.resources);
|
|
39
|
+
this._setupMemoryManagement();
|
|
40
|
+
}
|
|
41
|
+
catch (error) {
|
|
42
|
+
logger.error("cluster", "Failed to initialize memory manager:", error);
|
|
43
|
+
throw new Error("Failed to initialize cluster manager");
|
|
44
|
+
}
|
|
45
|
+
// Initialize CPU monitor
|
|
46
|
+
this.cpuMonitor = new CpuMonitor({
|
|
47
|
+
enabled: true,
|
|
48
|
+
sampleInterval: 5000,
|
|
49
|
+
historySize: 100,
|
|
50
|
+
smoothingFactor: 0.3,
|
|
51
|
+
alertThresholds: {
|
|
52
|
+
warning: 70,
|
|
53
|
+
critical: 90,
|
|
54
|
+
},
|
|
55
|
+
});
|
|
56
|
+
// Setup graceful shutdown handlers
|
|
57
|
+
this._setupGracefulShutdown();
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Parse memory string to bytes
|
|
61
|
+
*/
|
|
62
|
+
_parseMemoryString(memoryStr) {
|
|
63
|
+
const units = {
|
|
64
|
+
B: 1,
|
|
65
|
+
KB: 1024,
|
|
66
|
+
MB: 1024 * 1024,
|
|
67
|
+
GB: 1024 * 1024 * 1024,
|
|
68
|
+
TB: 1024 * 1024 * 1024 * 1024,
|
|
69
|
+
};
|
|
70
|
+
const match = memoryStr.match(/^(\d+(?:\.\d+)?)\s*([KMGT]?B)$/i);
|
|
71
|
+
if (!match) {
|
|
72
|
+
throw new Error(`Invalid memory format: ${memoryStr}`);
|
|
73
|
+
}
|
|
74
|
+
const value = parseFloat(match[1]);
|
|
75
|
+
const unit = match[2].toUpperCase();
|
|
76
|
+
return value * (units[unit] || 1);
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Validate cluster configuration
|
|
80
|
+
*/
|
|
81
|
+
_validateConfig(config) {
|
|
82
|
+
if (!config) {
|
|
83
|
+
throw new Error("Cluster configuration is required");
|
|
84
|
+
}
|
|
85
|
+
if (typeof config.workers === "number" &&
|
|
86
|
+
(config.workers < 1 || config.workers > 64)) {
|
|
87
|
+
throw new Error("Worker count must be between 1 and 64");
|
|
88
|
+
}
|
|
89
|
+
if (config.resources?.maxMemoryPerWorker) {
|
|
90
|
+
const memoryLimit = this._parseMemoryString(config.resources.maxMemoryPerWorker);
|
|
91
|
+
if (memoryLimit < 64 * 1024 * 1024) {
|
|
92
|
+
throw new Error("Minimum memory limit is 64MB per worker");
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Generate a secure token for worker authentication
|
|
98
|
+
*/
|
|
99
|
+
_generateSecureToken() {
|
|
100
|
+
return randomBytes(32).toString("hex");
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Create a worker-specific security token
|
|
104
|
+
*/
|
|
105
|
+
_createWorkerToken(workerId) {
|
|
106
|
+
return createHash("sha256")
|
|
107
|
+
.update(this.masterToken)
|
|
108
|
+
.update(workerId)
|
|
109
|
+
.update(Date.now().toString())
|
|
110
|
+
.digest("hex");
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Allocate a safe port for a worker
|
|
114
|
+
*/
|
|
115
|
+
_allocatePort() {
|
|
116
|
+
let attempts = 0;
|
|
117
|
+
const maxAttempts = 100;
|
|
118
|
+
while (attempts < maxAttempts) {
|
|
119
|
+
const port = this.basePort + Math.floor(Math.random() * 1000) + 1;
|
|
120
|
+
if (!this.workerPorts.has(port) && port > 1024 && port < 65535) {
|
|
121
|
+
this.workerPorts.add(port);
|
|
122
|
+
return port;
|
|
123
|
+
}
|
|
124
|
+
attempts++;
|
|
125
|
+
}
|
|
126
|
+
throw new Error("Unable to allocate safe port for worker");
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Release a port back to the pool
|
|
130
|
+
*/
|
|
131
|
+
_releasePort(port) {
|
|
132
|
+
this.workerPorts.delete(port);
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Setup enhanced memory management event handlers
|
|
136
|
+
*/
|
|
137
|
+
_setupMemoryManagement() {
|
|
138
|
+
if (!this.config.resources?.memoryManagement?.enabled) {
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
this.memoryManager.on("memory_alert", (alert) => {
|
|
142
|
+
logger.warn("cluster", `Memory Alert: ${alert.message}`);
|
|
143
|
+
this._handleMemoryAlert(alert);
|
|
144
|
+
});
|
|
145
|
+
this.memoryManager.on("low_memory_mode_enabled", () => {
|
|
146
|
+
logger.info("cluster", "Cluster entering low memory mode");
|
|
147
|
+
this.emit("low_memory_mode", {
|
|
148
|
+
enabled: true,
|
|
149
|
+
timestamp: Date.now(),
|
|
150
|
+
});
|
|
151
|
+
this._enableEmergencyMode();
|
|
152
|
+
});
|
|
153
|
+
this.memoryManager.on("low_memory_mode_disabled", () => {
|
|
154
|
+
logger.info("cluster", "Cluster exiting low memory mode");
|
|
155
|
+
this.emit("low_memory_mode", {
|
|
156
|
+
enabled: false,
|
|
157
|
+
timestamp: Date.now(),
|
|
158
|
+
});
|
|
159
|
+
this._disableEmergencyMode();
|
|
160
|
+
});
|
|
161
|
+
this.memoryManager.on("error", (error) => {
|
|
162
|
+
logger.error("cluster", "Memory manager error:", error);
|
|
163
|
+
this.emit("error", { type: "memory_manager", error });
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
/**
|
|
167
|
+
* Handle memory alerts with appropriate actions
|
|
168
|
+
*/
|
|
169
|
+
async _handleMemoryAlert(alert) {
|
|
170
|
+
try {
|
|
171
|
+
switch (alert.action) {
|
|
172
|
+
case "scale_down":
|
|
173
|
+
await this._handleMemoryScaleDown(alert);
|
|
174
|
+
break;
|
|
175
|
+
case "restart_worker":
|
|
176
|
+
if (alert.workerId) {
|
|
177
|
+
await this._handleWorkerMemoryIssue(alert.workerId, alert);
|
|
178
|
+
}
|
|
179
|
+
break;
|
|
180
|
+
case "throttle":
|
|
181
|
+
this._handleMemoryThrottling(alert);
|
|
182
|
+
break;
|
|
183
|
+
default:
|
|
184
|
+
logger.warn("cluster", `Unknown memory alert action: ${alert.action}`);
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
catch (error) {
|
|
188
|
+
logger.error("cluster", "Error handling memory alert:", error);
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Setup graceful shutdown handlers
|
|
193
|
+
*/
|
|
194
|
+
_setupGracefulShutdown() {
|
|
195
|
+
const shutdownHandler = async (signal) => {
|
|
196
|
+
logger.info("cluster", `Received ${signal}, initiating graceful shutdown...`);
|
|
197
|
+
if (!this.shutdownPromise) {
|
|
198
|
+
this.shutdownPromise = this.stop(true);
|
|
199
|
+
}
|
|
200
|
+
await this.shutdownPromise;
|
|
201
|
+
process.exit(0);
|
|
202
|
+
};
|
|
203
|
+
process.on("SIGTERM", () => shutdownHandler("SIGTERM"));
|
|
204
|
+
process.on("SIGINT", () => shutdownHandler("SIGINT"));
|
|
205
|
+
// Handle uncaught exceptions
|
|
206
|
+
process.on("uncaughtException", (error) => {
|
|
207
|
+
logger.error("cluster", "Uncaught exception in cluster manager:", error);
|
|
208
|
+
this.stop(false).then(() => process.exit(1));
|
|
209
|
+
});
|
|
210
|
+
process.on("unhandledRejection", (reason, promise) => {
|
|
211
|
+
logger.error("cluster", "Unhandled rejection in cluster manager:", reason);
|
|
212
|
+
this.emit("error", {
|
|
213
|
+
type: "unhandled_rejection",
|
|
214
|
+
reason,
|
|
215
|
+
promise,
|
|
216
|
+
});
|
|
217
|
+
});
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* Start the Bun cluster with comprehensive error handling
|
|
221
|
+
*/
|
|
222
|
+
async start() {
|
|
223
|
+
if (this.isRunning) {
|
|
224
|
+
logger.warn("cluster", "Bun cluster is already running");
|
|
225
|
+
return;
|
|
226
|
+
}
|
|
227
|
+
try {
|
|
228
|
+
logger.info("cluster", "Starting Bun cluster manager...");
|
|
229
|
+
this.startTime = performance.now();
|
|
230
|
+
this.isRunning = true;
|
|
231
|
+
const workerCount = this._getOptimalWorkerCount();
|
|
232
|
+
logger.info("cluster", `Spawning ${workerCount} Bun workers`);
|
|
233
|
+
// Validate system resources before starting
|
|
234
|
+
await this._validateSystemResources(workerCount);
|
|
235
|
+
// Spawn workers with staggered startup
|
|
236
|
+
const spawnPromises = Array.from({ length: workerCount }, (_, i) => this._spawnWorkerWithRetry(i));
|
|
237
|
+
const workers = await Promise.allSettled(spawnPromises);
|
|
238
|
+
const successfulWorkers = workers.filter((result) => result.status === "fulfilled").length;
|
|
239
|
+
if (successfulWorkers === 0) {
|
|
240
|
+
throw new Error("Failed to start any workers");
|
|
241
|
+
}
|
|
242
|
+
if (successfulWorkers < workerCount) {
|
|
243
|
+
logger.warn("cluster", `Started ${successfulWorkers}/${workerCount} workers`);
|
|
244
|
+
}
|
|
245
|
+
// Start monitoring services
|
|
246
|
+
this._startHealthMonitoring();
|
|
247
|
+
this._startMetricsCollection();
|
|
248
|
+
this._startPerformanceMonitoring();
|
|
249
|
+
// Start memory monitoring if enabled
|
|
250
|
+
if (this.config.resources?.memoryManagement?.enabled !== false) {
|
|
251
|
+
this.memoryManager.startMonitoring();
|
|
252
|
+
}
|
|
253
|
+
// Start CPU monitoring
|
|
254
|
+
this.cpuMonitor.startMonitoring();
|
|
255
|
+
logger.info("cluster", `Bun cluster started with ${successfulWorkers} workers`);
|
|
256
|
+
this.emit("cluster:started", {
|
|
257
|
+
workerCount: successfulWorkers,
|
|
258
|
+
requestedCount: workerCount,
|
|
259
|
+
timestamp: Date.now(),
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
catch (error) {
|
|
263
|
+
this.isRunning = false;
|
|
264
|
+
logger.error("cluster", "Failed to start cluster:", error);
|
|
265
|
+
await this._cleanupPartialStart();
|
|
266
|
+
throw error;
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
/**
|
|
270
|
+
* Validate system resources before starting workers
|
|
271
|
+
*/
|
|
272
|
+
async _validateSystemResources(workerCount) {
|
|
273
|
+
try {
|
|
274
|
+
// Get actual system memory information
|
|
275
|
+
const systemMemory = await this.memoryManager.getSystemMemoryStats();
|
|
276
|
+
const availableMemory = systemMemory.freeMemory;
|
|
277
|
+
const requiredMemory = workerCount * this.securityConfig.maxMemoryPerWorker;
|
|
278
|
+
// Check if we have enough memory (leave 20% buffer)
|
|
279
|
+
const memoryWithBuffer = availableMemory * 0.8;
|
|
280
|
+
if (requiredMemory > memoryWithBuffer) {
|
|
281
|
+
// Check if we can reduce to a single worker with minimal memory
|
|
282
|
+
const minMemoryPerWorker = 128 * 1024 * 1024; // 128MB minimum
|
|
283
|
+
if (workerCount === 1 &&
|
|
284
|
+
minMemoryPerWorker <= memoryWithBuffer) {
|
|
285
|
+
logger.warn("cluster", `Reducing memory limit to ${Math.round(minMemoryPerWorker / 1024 / 1024)}MB per worker due to low system memory`);
|
|
286
|
+
this.securityConfig.maxMemoryPerWorker = minMemoryPerWorker;
|
|
287
|
+
return; // Allow startup with reduced memory
|
|
288
|
+
}
|
|
289
|
+
throw new Error(`Insufficient memory for ${workerCount} workers. Required: ${Math.round(requiredMemory / 1024 / 1024)}MB, Available: ${Math.round(memoryWithBuffer / 1024 / 1024)}MB (${Math.round(availableMemory / 1024 / 1024)}MB total free). Consider disabling clustering or increasing system memory.`);
|
|
290
|
+
}
|
|
291
|
+
logger.debug("cluster", `Memory validation passed: Required ${Math.round(requiredMemory / 1024 / 1024)}MB, Available ${Math.round(memoryWithBuffer / 1024 / 1024)}MB`);
|
|
292
|
+
}
|
|
293
|
+
catch (error) {
|
|
294
|
+
if (error instanceof Error &&
|
|
295
|
+
error.message.includes("Insufficient memory")) {
|
|
296
|
+
throw error;
|
|
297
|
+
}
|
|
298
|
+
// If memory manager fails, fall back to basic validation
|
|
299
|
+
logger.warn("cluster", "Failed to get system memory stats, using fallback validation:", error);
|
|
300
|
+
const os = await import('os');
|
|
301
|
+
const freeMemory = os.freemem();
|
|
302
|
+
const requiredMemory = workerCount * this.securityConfig.maxMemoryPerWorker;
|
|
303
|
+
const availableMemoryFallback = freeMemory * 0.8;
|
|
304
|
+
if (requiredMemory > availableMemoryFallback) {
|
|
305
|
+
// Try with minimal memory for single worker
|
|
306
|
+
const minMemoryPerWorker = 128 * 1024 * 1024; // 128MB minimum
|
|
307
|
+
if (workerCount === 1 &&
|
|
308
|
+
minMemoryPerWorker <= availableMemoryFallback) {
|
|
309
|
+
logger.warn("cluster", `Fallback: Reducing memory limit to ${Math.round(minMemoryPerWorker / 1024 / 1024)}MB per worker`);
|
|
310
|
+
this.securityConfig.maxMemoryPerWorker = minMemoryPerWorker;
|
|
311
|
+
return;
|
|
312
|
+
}
|
|
313
|
+
throw new Error(`Insufficient memory for ${workerCount} workers. Required: ${Math.round(requiredMemory / 1024 / 1024)}MB, Available: ${Math.round(availableMemoryFallback / 1024 / 1024)}MB (fallback). Consider disabling clustering or increasing system memory.`);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
// Validate port availability
|
|
317
|
+
if (this.basePort < 1024 || this.basePort > 65000) {
|
|
318
|
+
throw new Error(`Invalid base port: ${this.basePort}. Must be between 1024 and 65000`);
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
/**
|
|
322
|
+
* Cleanup after partial startup failure
|
|
323
|
+
*/
|
|
324
|
+
async _cleanupPartialStart() {
|
|
325
|
+
try {
|
|
326
|
+
const stopPromises = Array.from(this.workers.values()).map((worker) => this._stopWorker(worker.id, false));
|
|
327
|
+
await Promise.allSettled(stopPromises);
|
|
328
|
+
this.workers.clear();
|
|
329
|
+
this.workerPorts.clear();
|
|
330
|
+
}
|
|
331
|
+
catch (error) {
|
|
332
|
+
logger.error("cluster", "Error during cleanup:", error);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
/**
|
|
336
|
+
* Stop the Bun cluster with timeout protection
|
|
337
|
+
*/
|
|
338
|
+
async stop(graceful = true) {
|
|
339
|
+
if (!this.isRunning) {
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
logger.info("cluster", `Stopping Bun cluster (graceful: ${graceful})...`);
|
|
343
|
+
this.isRunning = false;
|
|
344
|
+
// Create shutdown timeout
|
|
345
|
+
const shutdownTimeout = new Promise((_, reject) => {
|
|
346
|
+
setTimeout(() => reject(new Error("Shutdown timeout exceeded")), this.maxShutdownTime);
|
|
347
|
+
});
|
|
348
|
+
try {
|
|
349
|
+
// Stop monitoring first
|
|
350
|
+
this._stopMonitoring();
|
|
351
|
+
// Stop memory manager
|
|
352
|
+
if (this.memoryManager) {
|
|
353
|
+
this.memoryManager.stopMonitoring?.();
|
|
354
|
+
}
|
|
355
|
+
// Stop CPU monitoring
|
|
356
|
+
this.cpuMonitor.stopMonitoring();
|
|
357
|
+
// Stop all workers
|
|
358
|
+
const stopPromises = Array.from(this.workers.values()).map((worker) => this._stopWorker(worker.id, graceful));
|
|
359
|
+
await Promise.race([Promise.all(stopPromises), shutdownTimeout]);
|
|
360
|
+
this.workers.clear();
|
|
361
|
+
this.workerPorts.clear();
|
|
362
|
+
logger.info("cluster", "Bun cluster stopped successfully");
|
|
363
|
+
this.emit("cluster:stopped", { timestamp: Date.now() });
|
|
364
|
+
}
|
|
365
|
+
catch (error) {
|
|
366
|
+
logger.error("cluster", "Error during cluster shutdown:", error);
|
|
367
|
+
// Force kill remaining workers
|
|
368
|
+
await this._forceKillAllWorkers();
|
|
369
|
+
throw error;
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
/**
|
|
373
|
+
* Force kill all workers in emergency situations
|
|
374
|
+
*/
|
|
375
|
+
async _forceKillAllWorkers() {
|
|
376
|
+
const forceKillPromises = Array.from(this.workers.values()).map(async (worker) => {
|
|
377
|
+
try {
|
|
378
|
+
worker.subprocess.kill("SIGKILL");
|
|
379
|
+
await worker.subprocess.exited;
|
|
380
|
+
this._releasePort(worker.port);
|
|
381
|
+
}
|
|
382
|
+
catch (error) {
|
|
383
|
+
logger.error("cluster", `Error force killing worker ${worker.id}:`, error);
|
|
384
|
+
}
|
|
385
|
+
});
|
|
386
|
+
await Promise.allSettled(forceKillPromises);
|
|
387
|
+
this.workers.clear();
|
|
388
|
+
this.workerPorts.clear();
|
|
389
|
+
}
|
|
390
|
+
/**
|
|
391
|
+
* Stop all monitoring services
|
|
392
|
+
*/
|
|
393
|
+
_stopMonitoring() {
|
|
394
|
+
if (this.healthCheckInterval) {
|
|
395
|
+
clearInterval(this.healthCheckInterval);
|
|
396
|
+
this.healthCheckInterval = undefined;
|
|
397
|
+
}
|
|
398
|
+
if (this.metricsInterval) {
|
|
399
|
+
clearInterval(this.metricsInterval);
|
|
400
|
+
this.metricsInterval = undefined;
|
|
401
|
+
}
|
|
402
|
+
if (this.performanceInterval) {
|
|
403
|
+
clearInterval(this.performanceInterval);
|
|
404
|
+
this.performanceInterval = undefined;
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
/**
|
|
408
|
+
* Spawn a worker with retry logic
|
|
409
|
+
*/
|
|
410
|
+
async _spawnWorkerWithRetry(index, retries = 3) {
|
|
411
|
+
let lastError = null;
|
|
412
|
+
for (let attempt = 0; attempt < retries; attempt++) {
|
|
413
|
+
try {
|
|
414
|
+
// Add delay between retry attempts
|
|
415
|
+
if (attempt > 0) {
|
|
416
|
+
await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
|
|
417
|
+
}
|
|
418
|
+
return await this._spawnWorker(index);
|
|
419
|
+
}
|
|
420
|
+
catch (error) {
|
|
421
|
+
lastError = error;
|
|
422
|
+
logger.warn("cluster", `Worker spawn attempt ${attempt + 1} failed:`, error);
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
throw lastError || new Error("Failed to spawn worker after retries");
|
|
426
|
+
}
|
|
427
|
+
/**
|
|
428
|
+
* Spawn a new Bun worker process with enhanced security
|
|
429
|
+
*/
|
|
430
|
+
async _spawnWorker(index) {
|
|
431
|
+
const workerId = `worker-${index}-${Date.now()}`;
|
|
432
|
+
const port = this._allocatePort();
|
|
433
|
+
const securityToken = this._createWorkerToken(workerId);
|
|
434
|
+
logger.debug("cluster", `Spawning Bun worker ${workerId} on port ${port}`);
|
|
435
|
+
try {
|
|
436
|
+
// Validate script path exists and is accessible
|
|
437
|
+
if (!process.argv[1]) {
|
|
438
|
+
throw new Error("Unable to determine script path for worker");
|
|
439
|
+
}
|
|
440
|
+
// Enhanced environment with security measures
|
|
441
|
+
const workerEnv = {
|
|
442
|
+
...this._getSecureEnvironment(),
|
|
443
|
+
WORKER_ID: workerId,
|
|
444
|
+
WORKER_PORT: port.toString(),
|
|
445
|
+
WORKER_SECURITY_TOKEN: securityToken,
|
|
446
|
+
MASTER_TOKEN: this.masterToken,
|
|
447
|
+
NODE_ENV: "worker",
|
|
448
|
+
CLUSTER_MODE: "true",
|
|
449
|
+
WORKER_MEMORY_LIMIT: this.securityConfig.maxMemoryPerWorker.toString(),
|
|
450
|
+
WORKER_MAX_REQUESTS: "10000", // Prevent memory leaks
|
|
451
|
+
};
|
|
452
|
+
const subprocess = Bun.spawn({
|
|
453
|
+
cmd: ["bun", "run", process.argv[1]],
|
|
454
|
+
env: workerEnv,
|
|
455
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
456
|
+
});
|
|
457
|
+
const worker = {
|
|
458
|
+
id: workerId,
|
|
459
|
+
subprocess,
|
|
460
|
+
port,
|
|
461
|
+
status: "starting",
|
|
462
|
+
startTime: Date.now(),
|
|
463
|
+
restarts: 0,
|
|
464
|
+
lastPing: Date.now(),
|
|
465
|
+
health: {
|
|
466
|
+
status: "unknown",
|
|
467
|
+
consecutiveFailures: 0,
|
|
468
|
+
},
|
|
469
|
+
securityToken,
|
|
470
|
+
performance: {
|
|
471
|
+
requestCount: 0,
|
|
472
|
+
errorCount: 0,
|
|
473
|
+
averageResponseTime: 0,
|
|
474
|
+
lastRequestTime: 0,
|
|
475
|
+
cpuUsage: 0,
|
|
476
|
+
memoryUsage: 0,
|
|
477
|
+
},
|
|
478
|
+
resourceLimits: {
|
|
479
|
+
maxMemory: this.securityConfig.maxMemoryPerWorker,
|
|
480
|
+
maxCpu: 80, // 80% CPU usage limit
|
|
481
|
+
},
|
|
482
|
+
restartHistory: [],
|
|
483
|
+
};
|
|
484
|
+
this.workers.set(workerId, worker);
|
|
485
|
+
// Setup process event handlers
|
|
486
|
+
subprocess.exited
|
|
487
|
+
.then((exitCode) => {
|
|
488
|
+
this._handleWorkerExit(workerId, exitCode);
|
|
489
|
+
})
|
|
490
|
+
.catch((error) => {
|
|
491
|
+
logger.error("cluster", `Worker ${workerId} exit handler error:`, error);
|
|
492
|
+
this._handleWorkerExit(workerId, -1);
|
|
493
|
+
});
|
|
494
|
+
// Setup stdout/stderr handling for better debugging
|
|
495
|
+
this._setupWorkerLogging(worker);
|
|
496
|
+
// Wait for worker to be ready with timeout
|
|
497
|
+
await this._waitForWorkerReady(worker);
|
|
498
|
+
worker.status = "running";
|
|
499
|
+
worker.health.status = "healthy";
|
|
500
|
+
// Register worker with IPC manager if available
|
|
501
|
+
if (this.ipcManager) {
|
|
502
|
+
this.ipcManager.registerWorker(workerId, worker.subprocess);
|
|
503
|
+
logger.debug("cluster", `Worker ${workerId} registered with IPC manager`);
|
|
504
|
+
}
|
|
505
|
+
logger.info("cluster", `Bun worker ${workerId} started on port ${port}`);
|
|
506
|
+
this.emit("worker:started", {
|
|
507
|
+
workerId,
|
|
508
|
+
port,
|
|
509
|
+
timestamp: Date.now(),
|
|
510
|
+
});
|
|
511
|
+
return worker;
|
|
512
|
+
}
|
|
513
|
+
catch (error) {
|
|
514
|
+
this._releasePort(port);
|
|
515
|
+
logger.error("cluster", `Failed to spawn Bun worker ${workerId}:`, error);
|
|
516
|
+
throw error;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
/**
|
|
520
|
+
* Get secure environment variables for workers
|
|
521
|
+
*/
|
|
522
|
+
_getSecureEnvironment() {
|
|
523
|
+
// Only pass safe environment variables to workers
|
|
524
|
+
const safeEnvVars = [
|
|
525
|
+
"NODE_ENV",
|
|
526
|
+
"PATH",
|
|
527
|
+
"HOME",
|
|
528
|
+
"USER",
|
|
529
|
+
"PWD",
|
|
530
|
+
"LOG_LEVEL",
|
|
531
|
+
"DEBUG",
|
|
532
|
+
"TZ",
|
|
533
|
+
];
|
|
534
|
+
const secureEnv = {};
|
|
535
|
+
for (const key of safeEnvVars) {
|
|
536
|
+
if (process.env[key]) {
|
|
537
|
+
secureEnv[key] = process.env[key];
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
return secureEnv;
|
|
541
|
+
}
|
|
542
|
+
/**
|
|
543
|
+
* Setup logging for worker process
|
|
544
|
+
*/
|
|
545
|
+
_setupWorkerLogging(worker) {
|
|
546
|
+
if (worker.subprocess.stdout) {
|
|
547
|
+
worker.subprocess.stdout.pipeTo(new WritableStream({
|
|
548
|
+
write(chunk) {
|
|
549
|
+
const data = new TextDecoder().decode(chunk);
|
|
550
|
+
logger.debug("cluster", `[${worker.id}] stdout: ${data.trim()}`);
|
|
551
|
+
},
|
|
552
|
+
}));
|
|
553
|
+
}
|
|
554
|
+
if (worker.subprocess.stderr) {
|
|
555
|
+
worker.subprocess.stderr.pipeTo(new WritableStream({
|
|
556
|
+
write(chunk) {
|
|
557
|
+
const data = new TextDecoder().decode(chunk);
|
|
558
|
+
logger.warn("cluster", `[${worker.id}] stderr: ${data.trim()}`);
|
|
559
|
+
},
|
|
560
|
+
}));
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
/**
|
|
564
|
+
* Check if worker is ready and responding with multiple strategies
|
|
565
|
+
*/
|
|
566
|
+
async _checkWorkerReadiness(worker) {
|
|
567
|
+
try {
|
|
568
|
+
// Check if process is still running
|
|
569
|
+
if (worker.subprocess.killed) {
|
|
570
|
+
logger.debug("cluster", `Worker ${worker.id} process is killed`);
|
|
571
|
+
return false;
|
|
572
|
+
}
|
|
573
|
+
// Strategy 1: Check if process is responsive (basic check)
|
|
574
|
+
if (!worker.subprocess.pid) {
|
|
575
|
+
logger.debug("cluster", `Worker ${worker.id} has no PID`);
|
|
576
|
+
return false;
|
|
577
|
+
}
|
|
578
|
+
// Strategy 2: Try IPC communication first (faster than port check)
|
|
579
|
+
if (this.ipcManager) {
|
|
580
|
+
try {
|
|
581
|
+
// Try to send a ping via IPC
|
|
582
|
+
const ipcReady = await this._checkWorkerIPCReadiness(worker);
|
|
583
|
+
if (ipcReady) {
|
|
584
|
+
logger.debug("cluster", `Worker ${worker.id} ready via IPC`);
|
|
585
|
+
return true;
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
catch (error) {
|
|
589
|
+
logger.debug("cluster", `Worker ${worker.id} IPC check failed:`, error);
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
// Strategy 3: Check port listening (fallback)
|
|
593
|
+
const isListening = await this._checkPortListening(worker.port);
|
|
594
|
+
if (isListening) {
|
|
595
|
+
logger.debug("cluster", `Worker ${worker.id} ready via port check`);
|
|
596
|
+
return true;
|
|
597
|
+
}
|
|
598
|
+
// Strategy 4: Check if worker has been running for a minimum time (more lenient)
|
|
599
|
+
const runningTime = Date.now() - worker.startTime;
|
|
600
|
+
if (runningTime > 3000) {
|
|
601
|
+
// 3 seconds minimum (reduced from 5)
|
|
602
|
+
logger.debug("cluster", `Worker ${worker.id} assumed ready after ${runningTime}ms (time-based)`);
|
|
603
|
+
return true;
|
|
604
|
+
}
|
|
605
|
+
// Strategy 5: If worker process is stable and not killed, assume it's working
|
|
606
|
+
if (runningTime > 1000 &&
|
|
607
|
+
!worker.subprocess.killed &&
|
|
608
|
+
worker.subprocess.pid) {
|
|
609
|
+
logger.debug("cluster", `Worker ${worker.id} process stable after ${runningTime}ms (process-based)`);
|
|
610
|
+
return true;
|
|
611
|
+
}
|
|
612
|
+
logger.debug("cluster", `Worker ${worker.id} not ready yet (running for ${runningTime}ms)`);
|
|
613
|
+
return false;
|
|
614
|
+
}
|
|
615
|
+
catch (error) {
|
|
616
|
+
logger.debug("cluster", `Worker ${worker.id} readiness check failed:`, error);
|
|
617
|
+
return false;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
/**
|
|
621
|
+
* Check worker readiness via IPC
|
|
622
|
+
*/
|
|
623
|
+
async _checkWorkerIPCReadiness(worker) {
|
|
624
|
+
try {
|
|
625
|
+
// Register worker with IPC manager temporarily for ping
|
|
626
|
+
if (!this.ipcManager)
|
|
627
|
+
return false;
|
|
628
|
+
this.ipcManager.registerWorker(worker.id, worker.subprocess);
|
|
629
|
+
// Try to ping the worker
|
|
630
|
+
const response = await Promise.race([
|
|
631
|
+
this.ipcManager.sendToWorker(worker.id, "ping", {}),
|
|
632
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error("IPC timeout")), 2000)),
|
|
633
|
+
]);
|
|
634
|
+
// Check if we got a real response or a mock response
|
|
635
|
+
if (response && typeof response === "object") {
|
|
636
|
+
if (response.status === "ok" &&
|
|
637
|
+
response.message === "IPC not fully supported in Bun mode") {
|
|
638
|
+
logger.debug("cluster", `Worker ${worker.id} IPC not fully supported, but worker is registered`);
|
|
639
|
+
return true; // Worker is registered, even if IPC isn't fully functional
|
|
640
|
+
}
|
|
641
|
+
if (response.status === "fallback") {
|
|
642
|
+
logger.debug("cluster", `Worker ${worker.id} IPC communication failed, but worker exists`);
|
|
643
|
+
return true; // Worker exists, even if IPC failed
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
return response !== undefined;
|
|
647
|
+
}
|
|
648
|
+
catch (error) {
|
|
649
|
+
return false;
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
/**
|
|
653
|
+
* Check if a port is listening
|
|
654
|
+
*/
|
|
655
|
+
async _checkPortListening(port) {
|
|
656
|
+
return new Promise((resolve) => {
|
|
657
|
+
const net = require("net");
|
|
658
|
+
const socket = new net.Socket();
|
|
659
|
+
const timeout = setTimeout(() => {
|
|
660
|
+
socket.destroy();
|
|
661
|
+
resolve(false);
|
|
662
|
+
}, 2000); // Increased timeout to 2 seconds
|
|
663
|
+
socket.on("connect", () => {
|
|
664
|
+
clearTimeout(timeout);
|
|
665
|
+
socket.destroy();
|
|
666
|
+
resolve(true);
|
|
667
|
+
});
|
|
668
|
+
socket.on("error", (error) => {
|
|
669
|
+
clearTimeout(timeout);
|
|
670
|
+
// Log the specific error for debugging
|
|
671
|
+
logger.debug("cluster", `Port ${port} connection error:`, error.code);
|
|
672
|
+
resolve(false);
|
|
673
|
+
});
|
|
674
|
+
try {
|
|
675
|
+
socket.connect(port, "localhost");
|
|
676
|
+
}
|
|
677
|
+
catch (error) {
|
|
678
|
+
clearTimeout(timeout);
|
|
679
|
+
logger.debug("cluster", `Port ${port} connect attempt failed:`, error);
|
|
680
|
+
resolve(false);
|
|
681
|
+
}
|
|
682
|
+
});
|
|
683
|
+
}
|
|
684
|
+
/**
|
|
685
|
+
* Wait for worker to be ready with progressive timeout and better diagnostics
|
|
686
|
+
*/
|
|
687
|
+
async _waitForWorkerReady(worker, timeout = 15000 // Further reduced to 15 seconds
|
|
688
|
+
) {
|
|
689
|
+
return new Promise((resolve, reject) => {
|
|
690
|
+
const checkInterval = 500; // Increased interval to reduce CPU usage
|
|
691
|
+
let attempts = 0;
|
|
692
|
+
const maxAttempts = Math.floor(timeout / checkInterval);
|
|
693
|
+
logger.debug("cluster", `Waiting for worker ${worker.id} to be ready (timeout: ${timeout}ms)`);
|
|
694
|
+
const timeoutId = setTimeout(() => {
|
|
695
|
+
logger.error("cluster", `Worker ${worker.id} startup timeout after ${timeout}ms`);
|
|
696
|
+
logger.error("cluster", `Worker ${worker.id} diagnostics:`, {
|
|
697
|
+
pid: worker.subprocess.pid,
|
|
698
|
+
killed: worker.subprocess.killed,
|
|
699
|
+
port: worker.port,
|
|
700
|
+
startTime: worker.startTime,
|
|
701
|
+
runningTime: Date.now() - worker.startTime,
|
|
702
|
+
});
|
|
703
|
+
reject(new Error(`Worker ${worker.id} failed to start within ${timeout}ms`));
|
|
704
|
+
}, timeout);
|
|
705
|
+
const checkReady = async () => {
|
|
706
|
+
try {
|
|
707
|
+
attempts++;
|
|
708
|
+
// Check if process is still running
|
|
709
|
+
if (worker.subprocess.killed) {
|
|
710
|
+
clearTimeout(timeoutId);
|
|
711
|
+
logger.error("cluster", `Worker ${worker.id} process died during startup`);
|
|
712
|
+
reject(new Error(`Worker ${worker.id} process died during startup`));
|
|
713
|
+
return;
|
|
714
|
+
}
|
|
715
|
+
// Log progress every 10 attempts (5 seconds)
|
|
716
|
+
if (attempts % 10 === 0) {
|
|
717
|
+
const runningTime = Date.now() - worker.startTime;
|
|
718
|
+
logger.debug("cluster", `Worker ${worker.id} still starting... (${runningTime}ms, attempt ${attempts}/${maxAttempts})`);
|
|
719
|
+
}
|
|
720
|
+
// Real readiness check - verify worker is actually responding
|
|
721
|
+
const isReady = await this._checkWorkerReadiness(worker);
|
|
722
|
+
if (isReady) {
|
|
723
|
+
clearTimeout(timeoutId);
|
|
724
|
+
const startupTime = Date.now() - worker.startTime;
|
|
725
|
+
logger.info("cluster", `Worker ${worker.id} ready after ${startupTime}ms`);
|
|
726
|
+
resolve();
|
|
727
|
+
return;
|
|
728
|
+
}
|
|
729
|
+
// Continue checking
|
|
730
|
+
setTimeout(checkReady, checkInterval);
|
|
731
|
+
}
|
|
732
|
+
catch (error) {
|
|
733
|
+
clearTimeout(timeoutId);
|
|
734
|
+
logger.error("cluster", `Worker ${worker.id} readiness check error:`, error);
|
|
735
|
+
reject(error);
|
|
736
|
+
}
|
|
737
|
+
};
|
|
738
|
+
// Start checking immediately
|
|
739
|
+
checkReady();
|
|
740
|
+
});
|
|
741
|
+
}
|
|
742
|
+
/**
|
|
743
|
+
* Stop a specific worker with enhanced safety measures
|
|
744
|
+
*/
|
|
745
|
+
async _stopWorker(workerId, graceful = true) {
|
|
746
|
+
const worker = this.workers.get(workerId);
|
|
747
|
+
if (!worker) {
|
|
748
|
+
return;
|
|
749
|
+
}
|
|
750
|
+
logger.debug("cluster", `Stopping Bun worker ${workerId} (graceful: ${graceful})`);
|
|
751
|
+
worker.status = "stopping";
|
|
752
|
+
try {
|
|
753
|
+
const stopPromise = this._executeWorkerStop(worker, graceful);
|
|
754
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
755
|
+
setTimeout(() => reject(new Error("Worker stop timeout")), this.securityConfig.processTimeout);
|
|
756
|
+
});
|
|
757
|
+
await Promise.race([stopPromise, timeoutPromise]);
|
|
758
|
+
worker.status = "stopped";
|
|
759
|
+
this._releasePort(worker.port);
|
|
760
|
+
// Unregister worker from IPC manager if available
|
|
761
|
+
if (this.ipcManager) {
|
|
762
|
+
this.ipcManager.unregisterWorker(workerId);
|
|
763
|
+
logger.debug("cluster", `Worker ${workerId} unregistered from IPC manager`);
|
|
764
|
+
}
|
|
765
|
+
logger.info("cluster", `Bun worker ${workerId} stopped`);
|
|
766
|
+
this.emit("worker:stopped", { workerId, timestamp: Date.now() });
|
|
767
|
+
}
|
|
768
|
+
catch (error) {
|
|
769
|
+
logger.error("cluster", `Error stopping Bun worker ${workerId}:`, error);
|
|
770
|
+
worker.status = "error";
|
|
771
|
+
// Force kill if graceful stop failed
|
|
772
|
+
try {
|
|
773
|
+
worker.subprocess.kill("SIGKILL");
|
|
774
|
+
await worker.subprocess.exited;
|
|
775
|
+
this._releasePort(worker.port);
|
|
776
|
+
}
|
|
777
|
+
catch (forceError) {
|
|
778
|
+
logger.error("cluster", `Error force killing worker ${workerId}:`, forceError);
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
/**
|
|
783
|
+
* Execute worker stop with proper signal handling
|
|
784
|
+
*/
|
|
785
|
+
async _executeWorkerStop(worker, graceful) {
|
|
786
|
+
if (graceful) {
|
|
787
|
+
// Send SIGTERM for graceful shutdown
|
|
788
|
+
worker.subprocess.kill("SIGTERM");
|
|
789
|
+
// Wait for graceful shutdown with timeout
|
|
790
|
+
const gracefulTimeout = setTimeout(() => {
|
|
791
|
+
logger.warn("cluster", `Worker ${worker.id} graceful shutdown timeout, force killing`);
|
|
792
|
+
worker.subprocess.kill("SIGKILL");
|
|
793
|
+
}, 5000);
|
|
794
|
+
await worker.subprocess.exited;
|
|
795
|
+
clearTimeout(gracefulTimeout);
|
|
796
|
+
}
|
|
797
|
+
else {
|
|
798
|
+
worker.subprocess.kill("SIGKILL");
|
|
799
|
+
await worker.subprocess.exited;
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
/**
|
|
803
|
+
* Handle worker process exit with enhanced tracking
|
|
804
|
+
*/
|
|
805
|
+
async _handleWorkerExit(workerId, exitCode) {
|
|
806
|
+
const worker = this.workers.get(workerId);
|
|
807
|
+
if (!worker) {
|
|
808
|
+
return;
|
|
809
|
+
}
|
|
810
|
+
const exitReason = this._determineExitReason(exitCode);
|
|
811
|
+
logger.warn("cluster", `Bun worker ${workerId} exited: ${exitReason}`);
|
|
812
|
+
// Update worker state
|
|
813
|
+
worker.status = "stopped";
|
|
814
|
+
worker.health.status = "unhealthy";
|
|
815
|
+
worker.health.consecutiveFailures++;
|
|
816
|
+
// Unregister worker from IPC manager if available
|
|
817
|
+
if (this.ipcManager) {
|
|
818
|
+
this.ipcManager.unregisterWorker(workerId);
|
|
819
|
+
logger.debug("cluster", `Worker ${workerId} unregistered from IPC manager`);
|
|
820
|
+
}
|
|
821
|
+
// Add to restart history
|
|
822
|
+
worker.restartHistory.push({
|
|
823
|
+
timestamp: Date.now(),
|
|
824
|
+
reason: exitReason,
|
|
825
|
+
exitCode: exitCode || undefined,
|
|
826
|
+
});
|
|
827
|
+
this.emit("worker:exit", {
|
|
828
|
+
workerId,
|
|
829
|
+
exitCode,
|
|
830
|
+
reason: exitReason,
|
|
831
|
+
timestamp: Date.now(),
|
|
832
|
+
});
|
|
833
|
+
// Check if restart is needed and allowed
|
|
834
|
+
if (this._shouldRestartWorker(worker)) {
|
|
835
|
+
await this._attemptWorkerRestart(worker);
|
|
836
|
+
}
|
|
837
|
+
else {
|
|
838
|
+
logger.warn("cluster", `Worker ${workerId} will not be restarted: ${this._getRestartBlockReason(worker)}`);
|
|
839
|
+
this.workers.delete(workerId);
|
|
840
|
+
this._releasePort(worker.port);
|
|
841
|
+
}
|
|
842
|
+
}
|
|
843
|
+
/**
|
|
844
|
+
* Determine the reason for worker exit
|
|
845
|
+
*/
|
|
846
|
+
_determineExitReason(exitCode) {
|
|
847
|
+
if (exitCode === null)
|
|
848
|
+
return "killed";
|
|
849
|
+
if (exitCode === 0)
|
|
850
|
+
return "normal_exit";
|
|
851
|
+
if (exitCode === 1)
|
|
852
|
+
return "error_exit";
|
|
853
|
+
if (exitCode === 130)
|
|
854
|
+
return "sigint";
|
|
855
|
+
if (exitCode === 143)
|
|
856
|
+
return "sigterm";
|
|
857
|
+
if (exitCode === 137)
|
|
858
|
+
return "sigkill";
|
|
859
|
+
return `exit_code_${exitCode}`;
|
|
860
|
+
}
|
|
861
|
+
/**
|
|
862
|
+
* Check if worker should be restarted
|
|
863
|
+
*/
|
|
864
|
+
_shouldRestartWorker(worker) {
|
|
865
|
+
if (!this.isRunning)
|
|
866
|
+
return false;
|
|
867
|
+
const autoRestart = this.config.processManagement?.respawn !== false;
|
|
868
|
+
if (!autoRestart)
|
|
869
|
+
return false;
|
|
870
|
+
// Check restart count limits
|
|
871
|
+
if (worker.restarts >= this.securityConfig.maxRestartAttempts)
|
|
872
|
+
return false;
|
|
873
|
+
// Check restart frequency (prevent restart loops)
|
|
874
|
+
const recentRestarts = worker.restartHistory.filter((r) => Date.now() - r.timestamp < this.securityConfig.restartWindow).length;
|
|
875
|
+
return recentRestarts < this.securityConfig.maxRestartAttempts;
|
|
876
|
+
}
|
|
877
|
+
/**
|
|
878
|
+
* Get reason why restart is blocked
|
|
879
|
+
*/
|
|
880
|
+
_getRestartBlockReason(worker) {
|
|
881
|
+
if (!this.isRunning)
|
|
882
|
+
return "cluster_shutting_down";
|
|
883
|
+
if (!this.config.processManagement?.respawn)
|
|
884
|
+
return "auto_restart_disabled";
|
|
885
|
+
if (worker.restarts >= this.securityConfig.maxRestartAttempts)
|
|
886
|
+
return "max_restarts_exceeded";
|
|
887
|
+
const recentRestarts = worker.restartHistory.filter((r) => Date.now() - r.timestamp < this.securityConfig.restartWindow).length;
|
|
888
|
+
if (recentRestarts >= this.securityConfig.maxRestartAttempts)
|
|
889
|
+
return "restart_frequency_limit";
|
|
890
|
+
return "unknown";
|
|
891
|
+
}
|
|
892
|
+
/**
|
|
893
|
+
* Attempt to restart a worker with backoff
|
|
894
|
+
*/
|
|
895
|
+
async _attemptWorkerRestart(worker) {
|
|
896
|
+
logger.info("cluster", `Restarting Bun worker ${worker.id}...`);
|
|
897
|
+
worker.restarts++;
|
|
898
|
+
// Calculate backoff delay
|
|
899
|
+
const backoffDelay = Math.min(1000 * Math.pow(2, worker.restarts - 1), 30000);
|
|
900
|
+
await new Promise((resolve) => setTimeout(resolve, backoffDelay));
|
|
901
|
+
try {
|
|
902
|
+
const index = parseInt(worker.id.split("-")[1]) || 0;
|
|
903
|
+
this.workers.delete(worker.id);
|
|
904
|
+
this._releasePort(worker.port);
|
|
905
|
+
await this._spawnWorkerWithRetry(index, 2);
|
|
906
|
+
logger.info("cluster", `Successfully restarted worker (was ${worker.id})`);
|
|
907
|
+
this.emit("worker:restarted", {
|
|
908
|
+
oldWorkerId: worker.id,
|
|
909
|
+
restartCount: worker.restarts,
|
|
910
|
+
timestamp: Date.now(),
|
|
911
|
+
});
|
|
912
|
+
}
|
|
913
|
+
catch (error) {
|
|
914
|
+
logger.error("cluster", `Failed to restart worker ${worker.id}:`, error);
|
|
915
|
+
this.workers.delete(worker.id);
|
|
916
|
+
this._releasePort(worker.port);
|
|
917
|
+
this.emit("worker:restart_failed", {
|
|
918
|
+
workerId: worker.id,
|
|
919
|
+
error: error instanceof Error ? error.message : String(error),
|
|
920
|
+
timestamp: Date.now(),
|
|
921
|
+
});
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
/**
|
|
925
|
+
* Get optimal worker count with system constraints
|
|
926
|
+
*/
|
|
927
|
+
_getOptimalWorkerCount() {
|
|
928
|
+
if (typeof this.config.workers === "number") {
|
|
929
|
+
return Math.max(1, Math.min(this.config.workers, 32)); // Cap at 32 workers
|
|
930
|
+
}
|
|
931
|
+
if (this.config.workers === "auto") {
|
|
932
|
+
const cpuCount = navigator.hardwareConcurrency || 4;
|
|
933
|
+
// Use OS-level memory information for better accuracy
|
|
934
|
+
const os = require("os");
|
|
935
|
+
const totalSystemMemory = os.totalmem();
|
|
936
|
+
const freeSystemMemory = os.freemem();
|
|
937
|
+
// Calculate based on available memory (leave 20% buffer)
|
|
938
|
+
const usableMemory = Math.min(totalSystemMemory * 0.6, freeSystemMemory * 0.8);
|
|
939
|
+
const memoryBasedCount = Math.floor(usableMemory / this.securityConfig.maxMemoryPerWorker);
|
|
940
|
+
// Use the minimum of CPU-based and memory-based counts
|
|
941
|
+
const optimalCount = Math.max(1, Math.min(cpuCount - 1, memoryBasedCount, 16));
|
|
942
|
+
logger.debug("cluster", `Optimal worker calculation: CPU=${cpuCount - 1}, Memory=${memoryBasedCount}, Selected=${optimalCount}`);
|
|
943
|
+
return optimalCount;
|
|
944
|
+
}
|
|
945
|
+
return 2; // Safe default
|
|
946
|
+
}
|
|
947
|
+
/**
|
|
948
|
+
* Start comprehensive health monitoring
|
|
949
|
+
*/
|
|
950
|
+
_startHealthMonitoring() {
|
|
951
|
+
this.healthCheckInterval = setInterval(async () => {
|
|
952
|
+
try {
|
|
953
|
+
await this._performHealthCheck();
|
|
954
|
+
}
|
|
955
|
+
catch (error) {
|
|
956
|
+
logger.error("cluster", "Health check error:", error);
|
|
957
|
+
}
|
|
958
|
+
}, 15000); // Check every 15 seconds
|
|
959
|
+
}
|
|
960
|
+
/**
|
|
961
|
+
* Perform comprehensive health check on all workers
|
|
962
|
+
*/
|
|
963
|
+
async _performHealthCheck() {
|
|
964
|
+
const healthPromises = Array.from(this.workers.values()).map(async (worker) => {
|
|
965
|
+
try {
|
|
966
|
+
// Check process status
|
|
967
|
+
if (worker.subprocess.killed) {
|
|
968
|
+
worker.health.status = "unhealthy";
|
|
969
|
+
worker.health.consecutiveFailures++;
|
|
970
|
+
worker.health.lastError = "Process killed";
|
|
971
|
+
return;
|
|
972
|
+
}
|
|
973
|
+
// Check memory usage if available
|
|
974
|
+
const memoryUsage = await this._getWorkerMemoryUsage(worker);
|
|
975
|
+
if (memoryUsage > worker.resourceLimits.maxMemory) {
|
|
976
|
+
worker.health.status = "unhealthy";
|
|
977
|
+
worker.health.consecutiveFailures++;
|
|
978
|
+
worker.health.lastError = `Memory limit exceeded: ${Math.round(memoryUsage / 1024 / 1024)}MB`;
|
|
979
|
+
this.emit("worker:memory_exceeded", {
|
|
980
|
+
workerId: worker.id,
|
|
981
|
+
memoryUsage,
|
|
982
|
+
limit: worker.resourceLimits.maxMemory,
|
|
983
|
+
});
|
|
984
|
+
return;
|
|
985
|
+
}
|
|
986
|
+
// Update performance metrics
|
|
987
|
+
worker.performance.memoryUsage = memoryUsage;
|
|
988
|
+
// Health check passed
|
|
989
|
+
worker.health.status = "healthy";
|
|
990
|
+
worker.health.consecutiveFailures = 0;
|
|
991
|
+
worker.lastPing = Date.now();
|
|
992
|
+
}
|
|
993
|
+
catch (error) {
|
|
994
|
+
worker.health.status = "unhealthy";
|
|
995
|
+
worker.health.consecutiveFailures++;
|
|
996
|
+
worker.health.lastError =
|
|
997
|
+
error instanceof Error ? error.message : String(error);
|
|
998
|
+
logger.warn("cluster", `Health check failed for worker ${worker.id}:`, error);
|
|
999
|
+
}
|
|
1000
|
+
});
|
|
1001
|
+
await Promise.allSettled(healthPromises);
|
|
1002
|
+
// Check overall cluster health
|
|
1003
|
+
const unhealthyWorkers = this.getAllWorkers().filter((w) => w.health.status === "unhealthy");
|
|
1004
|
+
if (unhealthyWorkers.length > 0) {
|
|
1005
|
+
this.emit("cluster:health_degraded", {
|
|
1006
|
+
unhealthyCount: unhealthyWorkers.length,
|
|
1007
|
+
totalCount: this.workers.size,
|
|
1008
|
+
timestamp: Date.now(),
|
|
1009
|
+
});
|
|
1010
|
+
}
|
|
1011
|
+
}
|
|
1012
|
+
/**
|
|
1013
|
+
* Get worker memory usage using actual process monitoring
|
|
1014
|
+
*/
|
|
1015
|
+
async _getWorkerMemoryUsage(worker) {
|
|
1016
|
+
try {
|
|
1017
|
+
// Use Bun's process monitoring if available
|
|
1018
|
+
if (worker.subprocess && !worker.subprocess.killed) {
|
|
1019
|
+
// For Bun processes, we need to use system-level monitoring
|
|
1020
|
+
// since Bun doesn't expose process.memoryUsage() for subprocesses
|
|
1021
|
+
const pid = worker.subprocess.pid;
|
|
1022
|
+
if (pid) {
|
|
1023
|
+
return await this._getProcessMemoryUsage(pid);
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
// Fallback to estimated usage if process monitoring fails
|
|
1027
|
+
logger.warn("cluster", `Unable to get actual memory usage for worker ${worker.id}, using fallback`);
|
|
1028
|
+
return 64 * 1024 * 1024; // 64MB fallback
|
|
1029
|
+
}
|
|
1030
|
+
catch (error) {
|
|
1031
|
+
logger.error("cluster", `Error getting memory usage for worker ${worker.id}:`, error);
|
|
1032
|
+
return 64 * 1024 * 1024; // 64MB fallback
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
/**
|
|
1036
|
+
* Get actual memory usage for a process by PID
|
|
1037
|
+
*/
|
|
1038
|
+
async _getProcessMemoryUsage(pid) {
|
|
1039
|
+
try {
|
|
1040
|
+
if (process.platform === "linux") {
|
|
1041
|
+
const fs = await import('fs');
|
|
1042
|
+
const statm = await fs.promises.readFile(`/proc/${pid}/statm`, "utf8");
|
|
1043
|
+
const pages = parseInt(statm.split(" ")[1]); // RSS in pages
|
|
1044
|
+
const pageSize = 4096; // Standard page size on Linux
|
|
1045
|
+
return pages * pageSize;
|
|
1046
|
+
}
|
|
1047
|
+
else if (process.platform === "darwin") {
|
|
1048
|
+
// macOS implementation using ps command
|
|
1049
|
+
const { spawn } = await import('child_process');
|
|
1050
|
+
return new Promise((resolve, reject) => {
|
|
1051
|
+
const ps = spawn("ps", [
|
|
1052
|
+
"-o",
|
|
1053
|
+
"rss=",
|
|
1054
|
+
"-p",
|
|
1055
|
+
pid.toString(),
|
|
1056
|
+
]);
|
|
1057
|
+
let output = "";
|
|
1058
|
+
ps.stdout.on("data", (data) => {
|
|
1059
|
+
output += data.toString();
|
|
1060
|
+
});
|
|
1061
|
+
ps.on("close", (code) => {
|
|
1062
|
+
if (code === 0) {
|
|
1063
|
+
const rssKB = parseInt(output.trim());
|
|
1064
|
+
resolve(rssKB * 1024); // Convert KB to bytes
|
|
1065
|
+
}
|
|
1066
|
+
else {
|
|
1067
|
+
reject(new Error(`ps command failed with code ${code}`));
|
|
1068
|
+
}
|
|
1069
|
+
});
|
|
1070
|
+
ps.on("error", reject);
|
|
1071
|
+
});
|
|
1072
|
+
}
|
|
1073
|
+
else {
|
|
1074
|
+
// Windows or other platforms - use fallback
|
|
1075
|
+
throw new Error(`Memory monitoring not implemented for platform: ${process.platform}`);
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
catch (error) {
|
|
1079
|
+
throw new Error(`Failed to get process memory usage: ${error}`);
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
/**
|
|
1083
|
+
* Start metrics collection with detailed tracking
|
|
1084
|
+
*/
|
|
1085
|
+
_startMetricsCollection() {
|
|
1086
|
+
this.metricsInterval = setInterval(async () => {
|
|
1087
|
+
try {
|
|
1088
|
+
await this._collectMetrics();
|
|
1089
|
+
}
|
|
1090
|
+
catch (error) {
|
|
1091
|
+
logger.error("cluster", "Metrics collection error:", error);
|
|
1092
|
+
}
|
|
1093
|
+
}, 60000); // Collect every minute
|
|
1094
|
+
}
|
|
1095
|
+
/**
|
|
1096
|
+
* Collect comprehensive cluster metrics
|
|
1097
|
+
*/
|
|
1098
|
+
async _collectMetrics() {
|
|
1099
|
+
const workers = Array.from(this.workers.values());
|
|
1100
|
+
const activeWorkers = workers.filter((w) => w.health.status === "healthy");
|
|
1101
|
+
const totalRequests = workers.reduce((sum, w) => sum + w.performance.requestCount, 0);
|
|
1102
|
+
const totalErrors = workers.reduce((sum, w) => sum + w.performance.errorCount, 0);
|
|
1103
|
+
const avgResponseTimes = workers
|
|
1104
|
+
.filter((w) => w.performance.averageResponseTime > 0)
|
|
1105
|
+
.map((w) => w.performance.averageResponseTime);
|
|
1106
|
+
const averageResponseTime = avgResponseTimes.length > 0
|
|
1107
|
+
? avgResponseTimes.reduce((sum, time) => sum + time, 0) /
|
|
1108
|
+
avgResponseTimes.length
|
|
1109
|
+
: 0;
|
|
1110
|
+
const memoryUsage = process.memoryUsage();
|
|
1111
|
+
const workerMemoryUsage = workers.reduce((sum, w) => sum + w.performance.memoryUsage, 0);
|
|
1112
|
+
const metrics = {
|
|
1113
|
+
totalWorkers: workers.length,
|
|
1114
|
+
activeWorkers: activeWorkers.length,
|
|
1115
|
+
totalRequests,
|
|
1116
|
+
averageResponseTime: Math.round(averageResponseTime * 100) / 100, // Round to 2 decimal places
|
|
1117
|
+
memoryUsage: memoryUsage.heapUsed + workerMemoryUsage,
|
|
1118
|
+
cpuUsage: await this._calculateCpuUsage(),
|
|
1119
|
+
uptime: performance.now() - this.startTime,
|
|
1120
|
+
errorRate: totalRequests > 0 ? (totalErrors / totalRequests) * 100 : 0,
|
|
1121
|
+
restartCount: workers.reduce((sum, w) => sum + w.restarts, 0),
|
|
1122
|
+
};
|
|
1123
|
+
this.emit("metrics:collected", { metrics, timestamp: Date.now() });
|
|
1124
|
+
return metrics;
|
|
1125
|
+
}
    /**
     * Calculate CPU usage for the cluster using sophisticated monitoring
     */
    async _calculateCpuUsage() {
        const workers = this.getAllWorkers();
        return await this.cpuMonitor.calculateClusterCpuUsage(workers);
    }
    /**
     * Start performance monitoring
     */
    _startPerformanceMonitoring() {
        this.performanceInterval = setInterval(() => {
            this._updateWorkerPerformanceMetrics();
        }, 30000); // Update every 30 seconds
    }
    /**
     * Update worker performance metrics with real data
     */
    _updateWorkerPerformanceMetrics() {
        for (const [, worker] of this.workers) {
            // Update memory usage with actual data
            this._getWorkerMemoryUsage(worker)
                .then((memoryUsage) => {
                    worker.performance.memoryUsage = memoryUsage;
                })
                .catch((error) => {
                    logger.debug("cluster", `Failed to update memory usage for worker ${worker.id}:`, error);
                });
            // Update CPU usage if available
            this._getWorkerCpuUsage(worker)
                .then((cpuUsage) => {
                    worker.performance.cpuUsage = cpuUsage;
                })
                .catch((error) => {
                    logger.debug("cluster", `Failed to update CPU usage for worker ${worker.id}:`, error);
                });
            // Decay old metrics to prevent infinite growth
            const timeSinceLastUpdate = Date.now() -
                (worker.performance.lastRequestTime || worker.startTime);
            if (timeSinceLastUpdate > 300000) {
                // 5 minutes - decay counters
                worker.performance.requestCount = Math.floor(worker.performance.requestCount * 0.9);
                worker.performance.errorCount = Math.floor(worker.performance.errorCount * 0.9);
            }
        }
    }
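    // Illustrative note (not from the published source): with the 0.9 decay
    // factor above, a worker idle for over five minutes with requestCount 200
    // drops to 180 on the next 30-second sweep, then 162, and so on, keeping
    // the counters bounded without resetting them abruptly.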
    /**
     * Get actual CPU usage for a worker
     */
    async _getWorkerCpuUsage(worker) {
        try {
            if (worker.subprocess && !worker.subprocess.killed) {
                const pid = worker.subprocess.pid;
                if (pid) {
                    return await this._getProcessCpuUsage(pid);
                }
            }
            return 0;
        }
        catch (error) {
            logger.debug("cluster", `Error getting CPU usage for worker ${worker.id}:`, error);
            return 0;
        }
    }
    /**
     * Get actual CPU usage for a process by PID using sophisticated monitoring
     */
    async _getProcessCpuUsage(pid) {
        return await this.cpuMonitor.getProcessCpuUsage(pid);
    }
    /**
     * Enable emergency mode for resource conservation
     */
    _enableEmergencyMode() {
        logger.warn("cluster", "Enabling emergency mode - reducing resource usage");
        // Reduce monitoring frequency
        if (this.healthCheckInterval) {
            clearInterval(this.healthCheckInterval);
            this.healthCheckInterval = setInterval(() => this._performHealthCheck(), 60000); // 1 minute
        }
        if (this.metricsInterval) {
            clearInterval(this.metricsInterval);
            this.metricsInterval = setInterval(() => this._collectMetrics(), 300000); // 5 minutes
        }
        this.emit("emergency_mode", { enabled: true, timestamp: Date.now() });
    }
    /**
     * Disable emergency mode and restore normal operation
     */
    _disableEmergencyMode() {
        logger.info("cluster", "Disabling emergency mode - restoring normal operation");
        // Restore normal monitoring frequency
        if (this.healthCheckInterval) {
            clearInterval(this.healthCheckInterval);
            this._startHealthMonitoring();
        }
        if (this.metricsInterval) {
            clearInterval(this.metricsInterval);
            this._startMetricsCollection();
        }
        this.emit("emergency_mode", { enabled: false, timestamp: Date.now() });
    }
    // Public API methods (maintaining compatibility)
    /**
     * Get all workers
     */
    getAllWorkers() {
        return Array.from(this.workers.values());
    }
    /**
     * Get active workers
     */
    getActiveWorkers() {
        return this.getAllWorkers().filter((w) => w.health.status === "healthy");
    }
    /**
     * Set IPC manager for worker communication
     */
    setIPCManager(ipcManager) {
        this.ipcManager = ipcManager;
        logger.debug("cluster", "IPC Manager set for Bun cluster");
        // Register existing workers with IPC manager
        for (const [workerId, worker] of this.workers) {
            if (worker.subprocess && worker.status === "running") {
                this.ipcManager.registerWorker(workerId, worker.subprocess);
            }
        }
    }
    /**
     * Get cluster metrics
     */
    async getMetrics() {
        return this._collectMetrics();
    }
    /**
     * Check cluster health with detailed information
     */
    async checkHealth() {
        const workers = this.getAllWorkers();
        const activeWorkers = this.getActiveWorkers();
        const healthyPercentage = workers.length > 0
            ? (activeWorkers.length / workers.length) * 100
            : 0;
        const uptime = performance.now() - this.startTime;
        const unhealthyWorkers = workers.filter((w) => w.health.status === "unhealthy");
        const criticalIssues = unhealthyWorkers.filter((w) => w.health.consecutiveFailures >= 3);
        return {
            healthy: healthyPercentage >= 70 && criticalIssues.length === 0,
            details: {
                totalWorkers: workers.length,
                activeWorkers: activeWorkers.length,
                healthyPercentage: Math.round(healthyPercentage),
                uptime: Math.round(uptime),
                criticalIssues: criticalIssues.length,
                memoryUsage: process.memoryUsage().heapUsed,
                isEmergencyMode: false, // Will be implemented when MemoryManager is updated
                lastHealthCheck: Date.now(),
            },
        };
    }
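    // Illustrative note (not from the published source): with 10 workers of
    // which 7 are healthy, healthyPercentage is 70, so checkHealth() still
    // reports healthy: true provided no worker has reached 3 consecutive
    // health-check failures.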
    /**
     * Scale up workers with validation
     */
    async scaleUp(count = 1) {
        if (!this.isRunning) {
            throw new Error("Cannot scale up: cluster is not running");
        }
        if (count <= 0 || count > 16) {
            throw new Error("Invalid scale up count: must be between 1 and 16");
        }
        logger.info("cluster", `Scaling up Bun cluster by ${count} workers`);
        // Validate resources before scaling
        const currentCount = this.workers.size;
        const newCount = currentCount + count;
        await this._validateSystemResources(newCount);
        const spawnPromises = Array.from({ length: count }, (_, i) => this._spawnWorkerWithRetry(currentCount + i));
        const results = await Promise.allSettled(spawnPromises);
        const successful = results.filter((r) => r.status === "fulfilled").length;
        logger.info("cluster", `Scale up completed: ${successful}/${count} workers started`);
        this.emit("cluster:scaled_up", {
            requested: count,
            successful,
            newTotal: this.workers.size,
            timestamp: Date.now(),
        });
        if (successful === 0) {
            throw new Error("Failed to start any new workers during scale up");
        }
    }
    /**
     * Scale down workers with safety checks
     */
    async scaleDown(count = 1) {
        if (!this.isRunning) {
            throw new Error("Cannot scale down: cluster is not running");
        }
        const activeWorkers = this.getActiveWorkers();
        if (activeWorkers.length <= 1) {
            throw new Error("Cannot scale down: must maintain at least one active worker");
        }
        const actualCount = Math.min(count, activeWorkers.length - 1);
        logger.info("cluster", `Scaling down Bun cluster by ${actualCount} workers`);
        // Select workers to stop (the most recently started ones: the list is
        // sorted by ascending startTime and sliced from the end)
        const workersToStop = activeWorkers
            .sort((a, b) => a.startTime - b.startTime)
            .slice(-actualCount);
        const stopPromises = workersToStop.map(async (worker) => {
            await this._stopWorker(worker.id, true);
            this.workers.delete(worker.id);
            return worker.id;
        });
        const results = await Promise.allSettled(stopPromises);
        const successful = results.filter((r) => r.status === "fulfilled").length;
        logger.info("cluster", `Scale down completed: ${successful}/${actualCount} workers stopped`);
        this.emit("cluster:scaled_down", {
            requested: actualCount,
            successful,
            newTotal: this.workers.size,
            timestamp: Date.now(),
        });
    }
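    // Illustrative usage (not from the published source; `cluster` is a running
    // BunClusterManager, and on() assumes the EventEmitter API implied by
    // this.emit above):
    //   await cluster.scaleUp(2);   // validates resources, then spawns 2 workers
    //   await cluster.scaleDown(1); // stops 1 worker, never dropping below 1
    //   cluster.on("cluster:scaled_up", (e) => console.log(e.successful, e.newTotal));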
    /**
     * Handle memory-based scale down with safety measures
     */
    async _handleMemoryScaleDown(alert) {
        const enhancedWorkers = Array.from(this.workers.values()).filter((w) => w.health.status === "healthy");
        if (enhancedWorkers.length <= 1) {
            logger.warn("cluster", "Cannot scale down further - only one worker remaining");
            return;
        }
        // Find the worker using the most memory
        const workerToStop = enhancedWorkers.reduce((prev, current) => prev.performance.memoryUsage > current.performance.memoryUsage
            ? prev
            : current);
        logger.info("cluster", `Scaling down due to memory pressure - stopping worker ${workerToStop.id}`);
        try {
            await this._stopWorker(workerToStop.id, true);
            this.workers.delete(workerToStop.id);
            this.emit("worker:scaled_down_memory", {
                workerId: workerToStop.id,
                memoryUsage: workerToStop.performance.memoryUsage,
                alert,
                timestamp: Date.now(),
            });
        }
        catch (error) {
            logger.error("cluster", `Failed to scale down worker ${workerToStop.id}:`, error);
        }
    }
    /**
     * Handle worker memory issues with enhanced recovery
     */
    async _handleWorkerMemoryIssue(workerId, alert) {
        const worker = this.workers.get(workerId);
        if (!worker) {
            return;
        }
        logger.warn("cluster", `Handling memory issue for worker ${workerId}: ${alert.message}`);
        // Add to restart history
        worker.restartHistory.push({
            timestamp: Date.now(),
            reason: `memory_issue: ${alert.message}`,
        });
        try {
            // Force stop for memory issues (no graceful shutdown)
            await this._stopWorker(workerId, false);
            // Wait before restart to allow memory cleanup
            await new Promise((resolve) => setTimeout(resolve, 5000));
            // Restart with the same index
            const index = parseInt(workerId.split("-")[1]) || 0;
            await this._spawnWorkerWithRetry(index, 2);
            this.emit("worker:memory_restart", {
                oldWorkerId: workerId,
                alert,
                timestamp: Date.now(),
            });
        }
        catch (error) {
            logger.error("cluster", `Failed to restart worker ${workerId} after memory issue:`, error);
            this.workers.delete(workerId);
        }
    }
    /**
     * Handle memory throttling with appropriate measures
     */
    _handleMemoryThrottling(alert) {
        logger.info("cluster", `Implementing memory throttling: ${alert.message}`);
        // Reduce monitoring frequency to save memory
        this._enableEmergencyMode();
        // Emit throttling event for application to handle
        this.emit("memory_throttling", {
            alert,
            timestamp: Date.now(),
            action: "reduce_concurrency",
            recommendations: {
                reduceWorkerCount: true,
                enableCompression: true,
                clearCaches: true,
                deferNonCriticalTasks: true,
            },
        });
    }
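    // Illustrative listener (not from the published source; on() assumes the
    // EventEmitter API implied by this.emit, and appCache is hypothetical):
    //   cluster.on("memory_throttling", ({ recommendations }) => {
    //       if (recommendations.clearCaches) appCache.clear();
    //   });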
    /**
     * Get memory optimization recommendations
     */
    getMemoryRecommendations() {
        const enhancedWorkers = Array.from(this.workers.values());
        const totalMemory = enhancedWorkers.reduce((sum, w) => sum + w.performance.memoryUsage, 0);
        const avgMemoryPerWorker = enhancedWorkers.length > 0
            ? totalMemory / enhancedWorkers.length
            : 0;
        return {
            currentWorkerCount: enhancedWorkers.length,
            optimalWorkerCount: this.getOptimalWorkerCountForMemory(),
            averageMemoryPerWorker: Math.round(avgMemoryPerWorker / 1024 / 1024), // MB
            recommendations: this.memoryManager.getMemoryOptimizationRecommendations?.() || {
                scaleDown: enhancedWorkers.length >
                    this.getOptimalWorkerCountForMemory(),
                enableCompression: true,
                optimizeGarbageCollection: true,
                monitorMemoryLeaks: totalMemory > 1024 * 1024 * 1024, // > 1GB
            },
            timestamp: Date.now(),
        };
    }
    /**
     * Get optimal worker count based on memory constraints
     */
    getOptimalWorkerCountForMemory() {
        // Use OS-level memory information for accurate calculation
        const os = require("os");
        const freeMemory = os.freemem();
        const totalMemory = os.totalmem();
        // Use the smaller of 80% of free memory or 60% of total memory (conservative approach)
        const available = Math.min(freeMemory * 0.8, totalMemory * 0.6);
        const perWorker = this.securityConfig.maxMemoryPerWorker;
        const memoryBasedCount = Math.floor(available / perWorker);
        const result = Math.max(1, Math.min(memoryBasedCount, this._getOptimalWorkerCount()));
        logger.debug("cluster", `Memory-based worker count: ${memoryBasedCount} (available: ${Math.round(available / 1024 / 1024)}MB, per worker: ${Math.round(perWorker / 1024 / 1024)}MB)`);
        return result;
    }
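    // Illustrative note (not from the published source; sample numbers only):
    // with 8 GB free and 16 GB total, available = min(8 GB * 0.8, 16 GB * 0.6)
    // = 6.4 GB; at maxMemoryPerWorker = 512 MB that floors to 12 workers, then
    // clamps into the range [1, _getOptimalWorkerCount()].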
    /**
     * Enable low memory mode with comprehensive measures
     */
    enableLowMemoryMode() {
        logger.info("cluster", "Manually enabling low memory mode");
        this.memoryManager.enableLowMemoryMode?.();
        this._enableEmergencyMode();
    }
    /**
     * Disable low memory mode and restore normal operation
     */
    disableLowMemoryMode() {
        logger.info("cluster", "Manually disabling low memory mode");
        this.memoryManager.disableLowMemoryMode?.();
        this._disableEmergencyMode();
    }
    /**
     * Get detailed worker information for debugging
     */
    getWorkerDetails(workerId) {
        if (workerId) {
            const worker = this.workers.get(workerId);
            if (!worker) {
                return null;
            }
            return {
                id: worker.id,
                port: worker.port,
                status: worker.status,
                health: worker.health,
                performance: worker.performance,
                uptime: Date.now() - worker.startTime,
                restarts: worker.restarts,
                restartHistory: worker.restartHistory.slice(-5), // Last 5 restarts
                resourceLimits: worker.resourceLimits,
            };
        }
        return Array.from(this.workers.values()).map((worker) => ({
            id: worker.id,
            port: worker.port,
            status: worker.status,
            health: worker.health.status,
            uptime: Date.now() - worker.startTime,
            restarts: worker.restarts,
            memoryUsage: Math.round(worker.performance.memoryUsage / 1024 / 1024), // MB
            requestCount: worker.performance.requestCount,
        }));
    }
    /**
     * Force restart of a specific worker (for debugging/maintenance)
     */
    async forceRestartWorker(workerId) {
        const worker = this.workers.get(workerId);
        if (!worker) {
            throw new Error(`Worker ${workerId} not found`);
        }
        logger.info("cluster", `Force restarting worker ${workerId}`);
        worker.restartHistory.push({
            timestamp: Date.now(),
            reason: "manual_restart",
        });
        await this._attemptWorkerRestart(worker);
    }
    /**
     * Get cluster status summary
     */
    getStatus() {
        const workers = this.getAllWorkers();
        const activeWorkers = this.getActiveWorkers();
        const enhancedWorkers = Array.from(this.workers.values());
        return {
            isRunning: this.isRunning,
            uptime: performance.now() - this.startTime,
            workers: {
                total: workers.length,
                active: activeWorkers.length,
                starting: workers.filter((w) => w.status === "starting").length,
                stopping: workers.filter((w) => w.status === "stopping").length,
                unhealthy: workers.filter((w) => w.health.status === "unhealthy").length,
            },
            performance: {
                totalRequests: enhancedWorkers.reduce((sum, w) => sum + w.performance.requestCount, 0),
                totalErrors: enhancedWorkers.reduce((sum, w) => sum + w.performance.errorCount, 0),
                totalRestarts: workers.reduce((sum, w) => sum + w.restarts, 0),
            },
            memory: {
                masterUsage: Math.round(process.memoryUsage().heapUsed / 1024 / 1024), // MB
                workerUsage: Math.round(enhancedWorkers.reduce((sum, w) => sum + w.performance.memoryUsage, 0) /
                    1024 /
                    1024), // MB
                isLowMemoryMode: false, // Will be implemented in MemoryManager
            },
            timestamp: Date.now(),
        };
    }
}
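// Illustrative usage sketch (not part of the published file; the constructor
// arguments, a start() method, and worker ids of the form "worker-0" are
// assumptions inferred from the code above, and on() assumes the EventEmitter
// API implied by this.emit):
//
//   const cluster = new BunClusterManager(config);
//   await cluster.start?.();
//   cluster.on("metrics:collected", ({ metrics }) => {
//       console.log(`${metrics.activeWorkers}/${metrics.totalWorkers} workers healthy`);
//   });
//   const { healthy, details } = await cluster.checkHealth();
//   if (!healthy && details.criticalIssues > 0) {
//       await cluster.forceRestartWorker("worker-0");
//   }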

export { BunClusterManager };
//# sourceMappingURL=bun-cluster-manager.js.map