aetherframework-cluster 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +90 -0
- package/README.md +1049 -0
- package/index.js +288 -0
- package/package.json +41 -0
- package/src/core/ClusterManager.js +109 -0
- package/src/core/HealthMonitor.js +571 -0
- package/src/core/LoadBalancer.js +531 -0
- package/src/core/WorkerManager.js +619 -0
- package/src/examples/advanced-cluster.js +150 -0
- package/src/examples/basic-cluster.js +107 -0
- package/src/examples/benchmark-cluster.js +112 -0
- package/src/examples/simple-app.js +52 -0
- package/src/middleware/cluster-health.js +330 -0
- package/src/middleware/graceful-shutdown.js +443 -0
- package/src/middleware/process-monitor.js +925 -0
- package/src/middleware/worker-stats.js +879 -0
- package/src/utils/cpu-detector.js +78 -0
- package/src/utils/env-loader.js +140 -0
- package/src/utils/signal-handler.js +90 -0
|
@@ -0,0 +1,879 @@
|
|
|
1
|
+
// packages/cluster/src/middleware/worker-stats.js
|
|
2
|
+
import { createHash, timingSafeEqual } from 'crypto';
import { EventEmitter } from 'events';
import os from 'os';
|
|
3
|
+
|
|
4
|
+
/** Rate-limit settings used when none, or only some, are configured. */
const DEFAULT_RATE_LIMIT = Object.freeze({ windowMs: 60000, max: 60 });

/** Return a shallow copy of `source` without keys whose value is `undefined`. */
function omitUndefined(source) {
  const result = {};
  for (const [key, value] of Object.entries(source || {})) {
    if (value !== undefined) {
      result[key] = value;
    }
  }
  return result;
}

/** Parse a base-10 integer; return `fallback` unless the result is > 0. */
function positiveInt(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return parsed > 0 ? parsed : fallback;
}

/** Turn numeric strings (e.g. an error rate of "9.50") into numbers so they sort numerically. */
function toComparable(value) {
  if (typeof value === 'string' && value.trim() !== '' && Number.isFinite(Number(value))) {
    return Number(value);
  }
  return value;
}

/**
 * Compare two secrets without leaking their contents through timing.
 * Both sides are hashed first so `timingSafeEqual` always sees equal lengths.
 * Non-string inputs never match.
 */
function constantTimeEquals(provided, expected) {
  if (typeof provided !== 'string' || typeof expected !== 'string') {
    return false;
  }
  const left = createHash('sha256').update(provided).digest();
  const right = createHash('sha256').update(expected).digest();
  return timingSafeEqual(left, right);
}

/** Escape a value for use inside a Prometheus label (`\`, `"` and newline). */
function escapeLabelValue(value) {
  return String(value)
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n');
}

/**
 * Worker Statistics Middleware.
 *
 * Exposes read-only monitoring endpoints below `options.path`:
 *   <path>                 all statistics (cached for `cacheDuration` ms)
 *   <path>/workers         filterable, sortable, paginated worker list
 *   <path>/workers/<pid>   one worker
 *   <path>/cluster         cluster statistics and recommendations
 *   <path>/metrics         metrics as JSON, or Prometheus text with ?format=prometheus
 *   <path>/health          aggregated health checks
 *
 * All data is read from the `workerManager` passed to the constructor.
 */
class WorkerStatsMiddleware extends EventEmitter {
  /**
   * @param {Object} workerManager - Source of worker/cluster data (see the methods called below)
   * @param {Object} [options]
   * @param {string} [options.path='/cluster/stats'] - Base path of the endpoints
   * @param {Function|Object|null} [options.auth=null] - Auth callback `(ctx) => boolean|Promise<boolean>`,
   *   or `{ apiKey }`, or `{ username, password }`
   * @param {Object|false|null} [options.rateLimit] - `{ windowMs, max }`; a falsy value disables limiting
   * @param {number} [options.cacheDuration=5000] - Lifetime of the all-stats cache in ms
   * @param {boolean} [options.includeDetails=true]
   */
  constructor(workerManager, options = {}) {
    super();

    this.workerManager = workerManager;

    // Defaults first, caller values second. Keys explicitly set to `undefined`
    // are dropped so they cannot wipe out a default (e.g. `{ path: undefined }`).
    this.options = {
      path: '/cluster/stats',
      auth: null,
      rateLimit: { ...DEFAULT_RATE_LIMIT },
      cacheDuration: 5000, // 5 seconds
      includeDetails: true,
      ...omitUndefined(options)
    };

    // Complete a partial rate-limit config: a missing windowMs would otherwise
    // give a NaN reset time and block a client forever once it hits `max`.
    const { rateLimit } = this.options;
    if (rateLimit) {
      this.options.rateLimit = {
        ...DEFAULT_RATE_LIMIT,
        ...(typeof rateLimit === 'object' ? omitUndefined(rateLimit) : {})
      };
    }

    this.statsCache = null;
    this.cacheTimestamp = 0;
    this.requestCounts = new Map();
    this.lastRateLimitSweep = 0;
  }

  /**
   * Get middleware function for HTTP framework integration.
   *
   * The returned function expects a context with `path`, `query`, `headers`,
   * `ip`, `set()`, `status` and `body`, plus a `next` callback.
   *
   * @returns {Function} Async middleware `(ctx, next)`
   */
  middleware() {
    return async (ctx, next) => {
      const basePath = this.options.path;

      // Only handle the base path itself and paths below it; a plain prefix
      // test would also swallow siblings such as "/cluster/statsfoo".
      const isStatsRoute = ctx.path === basePath || ctx.path.startsWith(`${basePath}/`);
      if (!isStatsRoute) {
        return next();
      }

      // Awaited because a user-supplied auth callback may be async; an
      // un-awaited Promise is truthy and would let every request through.
      if (this.options.auth && !(await this.checkAuth(ctx))) {
        ctx.status = 401;
        ctx.body = {
          status: 'error',
          message: 'Unauthorized',
          timestamp: new Date().toISOString()
        };
        return;
      }

      if (!this.checkRateLimit(ctx)) {
        ctx.status = 429;
        ctx.body = {
          status: 'error',
          message: 'Too many requests',
          timestamp: new Date().toISOString(),
          retryAfter: Math.ceil(this.options.rateLimit.windowMs / 1000)
        };
        return;
      }

      try {
        const workerPrefix = `${basePath}/workers/`;
        const workerSegment = ctx.path.startsWith(workerPrefix)
          ? ctx.path.slice(workerPrefix.length)
          : '';

        if (ctx.path === basePath) {
          await this.handleAllStats(ctx);
        } else if (ctx.path === `${basePath}/workers`) {
          await this.handleWorkersStats(ctx);
        } else if (workerSegment !== '' && !workerSegment.includes('/')) {
          // "<path>/workers/<pid>": the PID is taken from the URL itself.
          await this.handleWorkerStats(ctx, workerSegment);
        } else if (ctx.path === `${basePath}/cluster`) {
          await this.handleClusterStats(ctx);
        } else if (ctx.path === `${basePath}/metrics`) {
          await this.handleMetrics(ctx);
        } else if (ctx.path === `${basePath}/health`) {
          await this.handleHealthStats(ctx);
        } else {
          ctx.status = 404;
          ctx.body = {
            status: 'error',
            message: 'Stats endpoint not found',
            timestamp: new Date().toISOString()
          };
        }
      } catch (error) {
        console.error('Worker stats middleware error:', error);

        ctx.status = 500;
        ctx.body = {
          status: 'error',
          timestamp: new Date().toISOString(),
          message: 'Failed to get worker statistics',
          error: error.message,
          stack: process.env.NODE_ENV === 'development' ? error.stack : undefined
        };
      }
    };
  }

  /**
   * Handle request for all statistics.
   * Serves the cached snapshot while it is younger than `cacheDuration`,
   * unless the request carries `?nocache=true`.
   *
   * @param {Object} ctx - Context object
   */
  async handleAllStats(ctx) {
    const query = ctx.query || {};
    const now = Date.now();
    const cacheFresh = query.nocache !== 'true' &&
      (now - this.cacheTimestamp) < this.options.cacheDuration;
    // Report a hit only when a snapshot was actually served from the cache.
    const cacheHit = cacheFresh && Boolean(this.statsCache);

    let stats;
    if (cacheHit) {
      stats = this.statsCache;
    } else {
      stats = await this.getAllStats();
      this.statsCache = stats;
      this.cacheTimestamp = now;
    }

    ctx.set('X-Stats-Cache', cacheHit ? 'HIT' : 'MISS');
    ctx.set('X-Stats-Timestamp', new Date().toISOString());
    ctx.set('X-Stats-Version', '1.0.0');

    ctx.status = 200;
    ctx.body = {
      status: 'success',
      timestamp: new Date().toISOString(),
      cache: cacheHit,
      // Spread last: the snapshot's own `timestamp` says when it was taken.
      ...stats
    };
  }

  /**
   * Handle request for workers statistics.
   *
   * Query parameters: `state`, `healthy` ('true'|'false'), `minUptime`,
   * `maxErrorRate`, `sort` ("field" or "field:desc"), `page`, `limit`,
   * `details` ('true').
   *
   * @param {Object} ctx - Context object
   */
  async handleWorkersStats(ctx) {
    const workers = this.workerManager.getAllWorkerStats();
    const query = ctx.query || {};

    // Work on a copy so sorting never reorders the manager's own array.
    let filteredWorkers = [...workers];

    if (query.state) {
      filteredWorkers = filteredWorkers.filter(w => w.state === query.state);
    }

    if (query.healthy === 'true') {
      filteredWorkers = filteredWorkers.filter(w => w.isAlive && w.errorRate < 10);
    } else if (query.healthy === 'false') {
      filteredWorkers = filteredWorkers.filter(w => !w.isAlive || w.errorRate >= 10);
    }

    // A non-numeric filter value is ignored; comparing against NaN would
    // silently drop every worker.
    if (query.minUptime) {
      const minUptime = Number.parseInt(query.minUptime, 10);
      if (Number.isFinite(minUptime)) {
        filteredWorkers = filteredWorkers.filter(w => w.uptime >= minUptime);
      }
    }

    if (query.maxErrorRate) {
      const maxErrorRate = Number.parseFloat(query.maxErrorRate);
      if (Number.isFinite(maxErrorRate)) {
        filteredWorkers = filteredWorkers.filter(w => w.errorRate <= maxErrorRate);
      }
    }

    // `sort` arrives as an array when the parameter is repeated; only a
    // plain string is honoured.
    if (typeof query.sort === 'string' && query.sort !== '') {
      const [field, order] = query.sort.split(':');
      const direction = order === 'desc' ? -1 : 1;

      filteredWorkers.sort((a, b) => {
        const left = toComparable(a[field]);
        const right = toComparable(b[field]);
        if (left < right) return -1 * direction;
        if (left > right) return 1 * direction;
        return 0;
      });
    }

    // Pagination: zero, negative or non-numeric values fall back to defaults.
    const page = positiveInt(query.page, 1);
    const limit = positiveInt(query.limit, 20);
    const start = (page - 1) * limit;
    const end = start + limit;
    const paginatedWorkers = filteredWorkers.slice(start, end);

    const summary = {
      total: workers.length,
      filtered: filteredWorkers.length,
      active: workers.filter(w => w.state === 'active').length,
      ready: workers.filter(w => w.state === 'ready').length,
      idle: workers.filter(w => w.state === 'idle').length,
      dead: workers.filter(w => !w.isAlive).length,
      totalRequests: workers.reduce((sum, w) => sum + w.requests, 0),
      totalErrors: workers.reduce((sum, w) => sum + w.errors, 0),
      averageErrorRate: workers.length > 0 ?
        workers.reduce((sum, w) => sum + parseFloat(w.errorRate), 0) / workers.length : 0
    };

    ctx.status = 200;
    ctx.body = {
      status: 'success',
      timestamp: new Date().toISOString(),
      summary,
      pagination: {
        page,
        limit,
        total: filteredWorkers.length,
        pages: Math.ceil(filteredWorkers.length / limit),
        hasNext: end < filteredWorkers.length,
        hasPrev: page > 1
      },
      workers: paginatedWorkers.map(worker => this.formatWorkerStats(worker, query.details === 'true'))
    };
  }

  /**
   * Handle request for single worker statistics.
   *
   * @param {Object} ctx - Context object
   * @param {string|number} [pidParam] - Worker PID; when omitted it is read
   *   from `ctx.params.pid` (for callers that use a router)
   */
  async handleWorkerStats(ctx, pidParam) {
    const query = ctx.query || {};
    const rawPid = pidParam !== undefined
      ? pidParam
      : (ctx.params ? ctx.params.pid : undefined);
    // Digits only, so inputs such as "12abc" or "0x10" are rejected.
    const pid = /^\d+$/.test(String(rawPid)) ? Number.parseInt(rawPid, 10) : NaN;

    if (Number.isNaN(pid)) {
      ctx.status = 400;
      ctx.body = {
        status: 'error',
        message: 'Invalid worker PID',
        timestamp: new Date().toISOString()
      };
      return;
    }

    const worker = this.workerManager.getWorkerByPid(pid);

    if (!worker) {
      ctx.status = 404;
      ctx.body = {
        status: 'error',
        message: `Worker with PID ${pid} not found`,
        timestamp: new Date().toISOString()
      };
      return;
    }

    const stats = this.workerManager.getWorkerStats(pid);
    const health = this.workerManager.checkWorkerHealth(pid);
    const load = this.workerManager.getWorkerLoad(pid);
    const restartCount = this.workerManager.getRestartCount(pid);
    const wantDetails = query.details === 'true';

    const response = {
      status: 'success',
      timestamp: new Date().toISOString(),
      worker: {
        pid: worker.pid,
        workerId: worker.id,
        state: worker.state,
        metadata: worker.metadata,
        startTime: worker.startTime,
        lastHeartbeat: worker.lastHeartbeat,
        isAlive: this.workerManager.isWorkerAlive(pid)
      },
      stats: this.formatWorkerStats(stats, wantDetails),
      health,
      performance: {
        load,
        restartCount,
        shouldRestart: this.workerManager.shouldRestartWorker(pid),
        lastRequestTime: stats.lastRequestTime,
        requestsPerMinute: this.calculateRequestsPerMinute(pid)
      }
    };

    if (wantDetails) {
      response.detailed = {
        connections: this.workerManager.getAllWorkerLoads().get(pid) || 0,
        lastError: stats.lastError,
        uptime: Date.now() - worker.startTime,
        memoryUsage: stats.memoryUsage,
        cpuUsage: stats.cpuUsage,
        environment: process.env.NODE_ENV || 'development',
        nodeVersion: process.version,
        platform: process.platform,
        arch: process.arch
      };
    }

    ctx.status = 200;
    ctx.body = response;
  }

  /**
   * Handle request for cluster statistics.
   * `?history=true` adds the result of `getHistoricalStats()`.
   *
   * @param {Object} ctx - Context object
   */
  async handleClusterStats(ctx) {
    const query = ctx.query || {};
    const clusterStats = this.workerManager.getClusterStats();
    const metrics = this.workerManager.getMetrics();

    const response = {
      status: 'success',
      timestamp: new Date().toISOString(),
      cluster: {
        ...clusterStats,
        workersByState: this.workerManager.groupWorkersByState(this.workerManager.getAllWorkerStats())
      },
      metrics: {
        ...metrics,
        currentTime: Date.now(),
        system: {
          memory: process.memoryUsage(),
          cpu: process.cpuUsage(),
          uptime: process.uptime(),
          platform: process.platform,
          arch: process.arch,
          nodeVersion: process.version
        }
      },
      recommendations: this.getClusterRecommendations(clusterStats)
    };

    if (query.history === 'true') {
      response.history = this.getHistoricalStats();
    }

    ctx.status = 200;
    ctx.body = response;
  }

  /**
   * Handle request for metrics.
   * Responds with JSON unless `?format=prometheus` is given, in which case
   * the request is delegated to `handlePrometheusMetrics`.
   *
   * @param {Object} ctx - Context object
   */
  async handleMetrics(ctx) {
    const query = ctx.query || {};
    const format = query.format || 'json';

    if (format === 'prometheus') {
      await this.handlePrometheusMetrics(ctx);
      return;
    }

    const workers = this.workerManager.getAllWorkerStats();
    const clusterStats = this.workerManager.getClusterStats();
    const metrics = this.workerManager.getMetrics();
    // Sampled once so usage, total and ratio describe the same instant.
    const memory = process.memoryUsage();

    const response = {
      status: 'success',
      timestamp: new Date().toISOString(),
      metrics: {
        // Worker metrics
        worker_count: workers.length,
        worker_active: workers.filter(w => w.state === 'active').length,
        worker_ready: workers.filter(w => w.state === 'ready').length,
        worker_idle: workers.filter(w => w.state === 'idle').length,
        worker_dead: workers.filter(w => !w.isAlive).length,

        // Performance metrics
        total_requests: metrics.totalRequests,
        total_errors: metrics.totalErrors,
        total_restarts: metrics.totalRestarts,
        error_rate: clusterStats.performance.errorRate,
        average_requests_per_worker: clusterStats.performance.averageRequestsPerWorker,
        average_load_per_worker: clusterStats.performance.averageLoadPerWorker,

        // System metrics
        system_uptime: process.uptime(),
        cluster_uptime: metrics.uptime / 1000, // Convert to seconds
        memory_usage: memory.heapUsed,
        memory_total: memory.heapTotal,
        memory_ratio: memory.heapUsed / memory.heapTotal,

        // Timestamps
        current_timestamp: Date.now(),
        start_timestamp: metrics.startTime
      },
      workers: workers.map(worker => ({
        pid: worker.pid,
        worker_id: worker.workerId,
        state: worker.state,
        uptime: worker.uptime,
        requests: worker.requests,
        errors: worker.errors,
        error_rate: worker.errorRate,
        load: worker.load,
        is_alive: worker.isAlive,
        last_request_time: worker.lastRequestTime
      }))
    };

    ctx.status = 200;
    ctx.body = response;
  }

  /**
   * Handle request for Prometheus metrics format.
   *
   * Each metric family is written as one `# HELP` line, one `# TYPE` line
   * and then all of its samples, as the text exposition format requires.
   *
   * @param {Object} ctx - Context object
   */
  async handlePrometheusMetrics(ctx) {
    const workers = this.workerManager.getAllWorkerStats();
    const clusterStats = this.workerManager.getClusterStats();
    const metrics = this.workerManager.getMetrics();
    const memoryUsage = process.memoryUsage();

    const lines = [];
    const addFamily = (name, help, type, samples) => {
      lines.push(`# HELP ${name} ${help}`, `# TYPE ${name} ${type}`, ...samples, '');
    };
    const countState = state => workers.filter(w => w.state === state).length;

    // Worker metrics
    addFamily('worker_count', 'Total number of workers', 'gauge',
      [`worker_count ${workers.length}`]);
    addFamily('worker_active', 'Number of active workers', 'gauge',
      [`worker_active ${countState('active')}`]);
    addFamily('worker_ready', 'Number of ready workers', 'gauge',
      [`worker_ready ${countState('ready')}`]);
    addFamily('worker_idle', 'Number of idle workers', 'gauge',
      [`worker_idle ${countState('idle')}`]);

    // Performance metrics
    addFamily('total_requests', 'Total number of requests', 'counter',
      [`total_requests ${metrics.totalRequests}`]);
    addFamily('total_errors', 'Total number of errors', 'counter',
      [`total_errors ${metrics.totalErrors}`]);
    addFamily('error_rate', 'Error rate percentage', 'gauge',
      [`error_rate ${clusterStats.performance.errorRate}`]);

    // System metrics
    addFamily('memory_usage_bytes', 'Memory usage in bytes', 'gauge',
      [`memory_usage_bytes ${memoryUsage.heapUsed}`]);
    addFamily('memory_total_bytes', 'Total memory in bytes', 'gauge',
      [`memory_total_bytes ${memoryUsage.heapTotal}`]);

    // Worker-specific metrics: one family per metric, one sample per worker.
    if (workers.length > 0) {
      const labels = w =>
        `{pid="${escapeLabelValue(w.pid)}",worker_id="${escapeLabelValue(w.workerId)}"}`;

      addFamily('worker_requests_total', 'Total requests per worker', 'counter',
        workers.map(w => `worker_requests_total${labels(w)} ${w.requests}`));
      addFamily('worker_errors_total', 'Total errors per worker', 'counter',
        workers.map(w => `worker_errors_total${labels(w)} ${w.errors}`));
      // NOTE(review): the unit of `worker.uptime` is not visible here; confirm it is seconds.
      addFamily('worker_uptime_seconds', 'Worker uptime in seconds', 'gauge',
        workers.map(w => `worker_uptime_seconds${labels(w)} ${w.uptime}`));
      addFamily('worker_load_current', 'Current worker load', 'gauge',
        workers.map(w => `worker_load_current${labels(w)} ${w.load}`));
    }

    ctx.set('Content-Type', 'text/plain; version=0.0.4');
    ctx.status = 200;
    ctx.body = lines.join('\n');
  }

  /**
   * Handle request for health statistics.
   *
   * Runs four checks (worker availability, error rate, memory usage, worker
   * health). The overall status is 'critical' when no worker is active or
   * ready, 'degraded' when any other check fails, else 'healthy'. Only
   * 'critical' maps to HTTP 503; the other two respond with 200.
   *
   * @param {Object} ctx - Context object
   */
  async handleHealthStats(ctx) {
    const workers = this.workerManager.getAllWorkerStats();

    const healthStatus = {
      status: 'healthy',
      timestamp: new Date().toISOString(),
      checks: []
    };
    // A failing check lowers 'healthy' to 'degraded' but never masks 'critical'.
    const markDegraded = () => {
      if (healthStatus.status === 'healthy') {
        healthStatus.status = 'degraded';
      }
    };

    // Check worker availability
    const activeWorkers = workers.filter(w => w.state === 'active' || w.state === 'ready').length;
    if (activeWorkers === 0) {
      healthStatus.status = 'critical';
      healthStatus.checks.push({
        name: 'worker_availability',
        status: 'critical',
        message: 'No active workers available'
      });
    } else if (activeWorkers < workers.length * 0.5) {
      healthStatus.status = 'degraded';
      healthStatus.checks.push({
        name: 'worker_availability',
        status: 'degraded',
        message: `Less than 50% of workers active: ${activeWorkers}/${workers.length}`
      });
    } else {
      healthStatus.checks.push({
        name: 'worker_availability',
        status: 'healthy',
        message: `${activeWorkers}/${workers.length} workers active`
      });
    }

    // Check error rate
    const totalRequests = workers.reduce((sum, w) => sum + w.requests, 0);
    const totalErrors = workers.reduce((sum, w) => sum + w.errors, 0);
    const errorRate = totalRequests > 0 ? (totalErrors / totalRequests) * 100 : 0;

    if (errorRate > 10) {
      markDegraded();
      healthStatus.checks.push({
        name: 'error_rate',
        status: 'degraded',
        message: `High error rate: ${errorRate.toFixed(2)}%`
      });
    } else {
      healthStatus.checks.push({
        name: 'error_rate',
        status: 'healthy',
        message: `Error rate: ${errorRate.toFixed(2)}%`
      });
    }

    // Check memory usage (heap of the process serving this request)
    const memoryUsage = process.memoryUsage();
    const memoryRatio = memoryUsage.heapUsed / memoryUsage.heapTotal;

    if (memoryRatio > 0.8) {
      markDegraded();
      healthStatus.checks.push({
        name: 'memory_usage',
        status: 'degraded',
        message: `High memory usage: ${(memoryRatio * 100).toFixed(1)}%`
      });
    } else {
      healthStatus.checks.push({
        name: 'memory_usage',
        status: 'healthy',
        message: `Memory usage: ${(memoryRatio * 100).toFixed(1)}%`
      });
    }

    // Check worker health
    const unhealthyWorkers = workers.filter(w => !w.isAlive || w.errorRate >= 10);
    if (unhealthyWorkers.length > 0) {
      markDegraded();
      healthStatus.checks.push({
        name: 'worker_health',
        status: 'degraded',
        message: `${unhealthyWorkers.length} unhealthy workers`,
        details: unhealthyWorkers.map(w => ({
          pid: w.pid,
          errorRate: w.errorRate,
          isAlive: w.isAlive
        }))
      });
    } else {
      healthStatus.checks.push({
        name: 'worker_health',
        status: 'healthy',
        message: 'All workers healthy'
      });
    }

    healthStatus.summary = {
      totalWorkers: workers.length,
      activeWorkers,
      unhealthyWorkers: unhealthyWorkers.length,
      totalRequests,
      totalErrors,
      errorRate: errorRate.toFixed(2),
      memoryUsage: `${(memoryRatio * 100).toFixed(1)}%`,
      uptime: Math.floor(process.uptime())
    };

    ctx.status = healthStatus.status === 'critical' ? 503 : 200;
    ctx.body = healthStatus;
  }

  /**
   * Get all statistics.
   *
   * @returns {Promise<Object>} Workers, cluster stats, metrics and process info
   */
  async getAllStats() {
    const workers = this.workerManager.getAllWorkerStats();
    const clusterStats = this.workerManager.getClusterStats();
    const metrics = this.workerManager.getMetrics();

    return {
      workers: workers.map(worker => this.formatWorkerStats(worker)),
      cluster: clusterStats,
      metrics,
      system: {
        memory: process.memoryUsage(),
        cpu: process.cpuUsage(),
        uptime: process.uptime(),
        platform: process.platform,
        arch: process.arch,
        version: process.version,
        pid: process.pid,
        // NOTE(review): these read environment variables named `isPrimary` and
        // `workerId`; nothing in this file sets them — confirm the worker
        // manager exports them when it forks.
        isWorker: !process.env.isPrimary,
        workerId: process.env.workerId || 'master'
      },
      timestamp: new Date().toISOString(),
      version: '1.0.0'
    };
  }

  /**
   * Format worker statistics for response.
   *
   * @param {Object} stats - Worker statistics
   * @param {boolean} includeDetails - Whether to include detailed information
   * @returns {Object} Formatted statistics
   */
  formatWorkerStats(stats, includeDetails = false) {
    const formatted = {
      pid: stats.pid,
      workerId: stats.workerId,
      state: stats.state,
      uptime: stats.uptime,
      requests: stats.requests,
      errors: stats.errors,
      errorRate: stats.errorRate,
      load: stats.load,
      isAlive: stats.isAlive,
      lastRequestTime: stats.lastRequestTime,
      restartCount: stats.restartCount,
      health: this.getWorkerHealthStatus(stats)
    };

    if (includeDetails) {
      formatted.details = {
        memoryUsage: stats.memoryUsage,
        cpuUsage: stats.cpuUsage,
        lastHeartbeat: stats.lastHeartbeat,
        metadata: stats.metadata,
        lastError: stats.lastError,
        environment: process.env.NODE_ENV || 'development'
      };
    }

    return formatted;
  }

  /**
   * Get worker health status.
   *
   * @param {Object} stats - Worker statistics
   * @returns {string} 'dead', 'unhealthy' (error rate >= 10), 'healthy'
   *   (active or ready), 'idle', or 'unknown'
   */
  getWorkerHealthStatus(stats) {
    if (!stats.isAlive) {
      return 'dead';
    }

    if (parseFloat(stats.errorRate) >= 10) {
      return 'unhealthy';
    }

    if (stats.state === 'active' || stats.state === 'ready') {
      return 'healthy';
    }

    if (stats.state === 'idle') {
      return 'idle';
    }

    return 'unknown';
  }

  /**
   * Calculate a worker's average requests per minute since it started.
   *
   * The rate is total requests divided by the worker's lifetime
   * (`Date.now() - worker.startTime`). Dividing by the time since the last
   * request instead would grow without bound right after each request.
   *
   * @param {number} pid - Worker PID
   * @returns {number} Requests per minute; 0 when the worker, its start time
   *   or its request count is unavailable
   */
  calculateRequestsPerMinute(pid) {
    const stats = this.workerManager.getWorkerStats(pid);
    if (!stats || !stats.requests) {
      return 0;
    }

    const worker = this.workerManager.getWorkerByPid(pid);
    if (!worker || !worker.startTime) {
      return 0;
    }

    const elapsedMinutes = (Date.now() - worker.startTime) / 1000 / 60;
    // Also rejects NaN and a start time in the future.
    if (!(elapsedMinutes > 0)) {
      return 0;
    }

    return stats.requests / elapsedMinutes;
  }

  /**
   * Get cluster recommendations.
   *
   * @param {Object} clusterStats - Cluster statistics with `workers.{idle,active,total}`
   *   and `performance.{errorRate,totalRestarts}`
   * @returns {Array} Recommendations
   */
  getClusterRecommendations(clusterStats) {
    const recommendations = [];
    const { idle, active, total } = clusterStats.workers;

    // Check idle worker ratio (skipped for an empty cluster: 0/0 is NaN)
    if (total > 0 && idle / total > 0.7) {
      recommendations.push({
        type: 'optimization',
        priority: 'low',
        message: 'High idle worker ratio',
        suggestion: 'Consider reducing the number of workers to save resources',
        details: `Idle workers: ${idle}/${total}`
      });
    }

    // Check if all workers are active and fewer than CPU cores.
    // `os` is imported at module level; `require` does not exist in an ES module.
    const cpuCount = os.cpus().length;
    if (total > 0 && active === total && total < cpuCount) {
      recommendations.push({
        type: 'scaling',
        priority: 'medium',
        message: 'All workers are active',
        suggestion: `Consider increasing the number of workers to ${cpuCount} (CPU cores)`,
        details: `Current: ${total}, CPU cores: ${cpuCount}`
      });
    }

    // Check error rate
    if (parseFloat(clusterStats.performance.errorRate) > 10) {
      recommendations.push({
        type: 'performance',
        priority: 'high',
        message: 'High error rate detected',
        suggestion: 'Investigate application logs and error handling',
        details: `Error rate: ${clusterStats.performance.errorRate}%`
      });
    }

    // Check worker restarts
    if (clusterStats.performance.totalRestarts > 0) {
      recommendations.push({
        type: 'stability',
        priority: 'medium',
        message: 'Worker restarts detected',
        suggestion: 'Monitor worker stability and investigate restart causes',
        details: `Total restarts: ${clusterStats.performance.totalRestarts}`
      });
    }

    return recommendations;
  }

  /**
   * Get historical statistics.
   * Placeholder: no history is stored, so this always returns an empty array.
   *
   * @returns {Array} Historical stats
   */
  getHistoricalStats() {
    return [];
  }

  /**
   * Check authentication.
   *
   * - Function `auth`: its result is returned as-is and may be a Promise,
   *   so callers must `await` this method.
   * - `{ apiKey }`: compared against the `x-api-key` header or `apiKey` query parameter.
   * - `{ username, password }`: compared against HTTP Basic credentials.
   *
   * @param {Object} ctx - Context object
   * @returns {boolean|Promise<boolean>} Authentication status
   */
  checkAuth(ctx) {
    const auth = this.options.auth;

    if (typeof auth === 'function') {
      return auth(ctx);
    }

    if (auth !== null && typeof auth === 'object') {
      const headers = ctx.headers || {};
      const query = ctx.query || {};

      if (auth.apiKey) {
        const apiKey = headers['x-api-key'] || query.apiKey;
        return constantTimeEquals(apiKey, auth.apiKey);
      }

      if (auth.username && auth.password) {
        const authHeader = headers.authorization;
        if (!authHeader || !authHeader.startsWith('Basic ')) {
          return false;
        }

        const credentials = Buffer.from(authHeader.slice(6), 'base64').toString();
        // Split on the FIRST colon only: the password may itself contain ':'.
        const separator = credentials.indexOf(':');
        if (separator === -1) {
          return false;
        }

        // Both comparisons always run so timing does not reveal which one failed.
        const usernameOk = constantTimeEquals(credentials.slice(0, separator), auth.username);
        const passwordOk = constantTimeEquals(credentials.slice(separator + 1), auth.password);
        return usernameOk && passwordOk;
      }
    }

    // NOTE(review): an `auth` value that matches none of the shapes above is
    // treated as "no auth required" (kept for backward compatibility).
    return true;
  }

  /**
   * Check rate limiting (fixed window per client IP).
   *
   * @param {Object} ctx - Context object
   * @returns {boolean} `true` when the request may proceed
   */
  checkRateLimit(ctx) {
    const config = this.options.rateLimit;
    if (!config) {
      return true;
    }

    const clientIp = ctx.ip || (ctx.request && ctx.request.ip);
    const now = Date.now();
    const windowMs = config.windowMs;
    const max = config.max;

    // Drop expired entries at most once per window so the map cannot grow
    // without bound as new client addresses appear.
    if (now - this.lastRateLimitSweep >= windowMs) {
      this.clearRateLimitCache();
      this.lastRateLimitSweep = now;
    }

    const clientData = this.requestCounts.get(clientIp);

    if (!clientData) {
      this.requestCounts.set(clientIp, {
        count: 1,
        resetTime: now + windowMs
      });
      return true;
    }

    // Reset if window has passed
    if (now > clientData.resetTime) {
      clientData.count = 1;
      clientData.resetTime = now + windowMs;
      return true;
    }

    if (clientData.count >= max) {
      return false;
    }

    clientData.count++;
    return true;
  }

  /**
   * Clear rate limit cache: removes entries whose window has expired.
   */
  clearRateLimitCache() {
    const now = Date.now();
    for (const [ip, data] of this.requestCounts.entries()) {
      if (now > data.resetTime) {
        this.requestCounts.delete(ip);
      }
    }
  }

  /**
   * Get middleware configuration.
   *
   * @returns {Object} The options plus `cacheEnabled`, `rateLimitEnabled`
   *   and `authEnabled` flags
   */
  getConfig() {
    return {
      ...this.options,
      cacheEnabled: this.options.cacheDuration > 0,
      rateLimitEnabled: !!this.options.rateLimit,
      authEnabled: !!this.options.auth
    };
  }

  /**
   * Clear statistics cache so the next all-stats request is recomputed.
   */
  clearCache() {
    this.statsCache = null;
    this.cacheTimestamp = 0;
  }

  /**
   * Reset rate limiting: forget every client's request count.
   */
  resetRateLimit() {
    this.requestCounts.clear();
  }
}
|
|
878
|
+
|
|
879
|
+
export default WorkerStatsMiddleware;
|