aetherframework-cluster 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,571 @@
1
+ // packages/cluster/src/core/HealthMonitor.js
2
+ import { EventEmitter } from 'events';
3
+
4
/**
 * Health Monitor - Monitors the health of worker processes and the cluster.
 * Provides health check endpoints and monitoring capabilities.
 *
 * Checks are named async functions kept in a Map. A check reports a problem
 * either by throwing (recorded as `unhealthy`) or by resolving to an object
 * whose `status` is `'degraded'` / `'unhealthy'` (recorded with that status).
 * Any other resolved value is recorded as `healthy`.
 *
 * Events emitted:
 *  - `check:completed` (checkResult) - a check ran and is healthy or degraded
 *  - `check:failed`    (checkResult) - a check threw, timed out, or reported unhealthy
 *  - `health:report`   (healthReport) - after every `runAllChecks()` pass
 */
class HealthMonitor extends EventEmitter {
  /**
   * @param {Object} clusterManager - Object exposing `getStats()`. The default
   *   checks read `workers` (each with `state`, `pid`, `lastRequestTime`,
   *   `requests`, `errors`), `total`, `totalRequests`, `errors` and `uptime`
   *   from its return value.
   * @param {Object} [options]
   * @param {number} [options.checkInterval=30000] - Period of automatic checks in ms
   * @param {number} [options.timeout=5000] - Default per-check timeout in ms
   * @param {number} [options.memoryThreshold=0.8] - Max heapUsed/heapTotal ratio
   * @param {number} [options.cpuThreshold=0.7] - Stored only; no check in this class reads it
   * @param {number} [options.maxErrorRate=10] - Max cluster error rate in percent
   * @param {number} [options.maxRestarts=5] - Stored only; no check in this class reads it
   */
  constructor(clusterManager, options = {}) {
    super();

    this.clusterManager = clusterManager;

    // Spread first, then apply defaults with `??`, so that an explicit
    // `undefined`/`null` cannot erase a default while `0` is still honoured.
    this.options = {
      ...options,
      checkInterval: options.checkInterval ?? 30000,
      timeout: options.timeout ?? 5000,
      memoryThreshold: options.memoryThreshold ?? 0.8,
      cpuThreshold: options.cpuThreshold ?? 0.7,
      maxErrorRate: options.maxErrorRate ?? 10,
      maxRestarts: options.maxRestarts ?? 5
    };

    this.healthChecks = new Map();
    this.healthHistory = [];
    this.maxHistorySize = 100;
    this.checkInterval = null; // handle returned by setInterval while started
    this.lastCheckTime = null;

    // Register default health checks
    this.registerDefaultChecks();
  }

  /**
   * Register the built-in health checks: worker_availability, memory_usage,
   * error_rate, uptime and worker_health.
   */
  registerDefaultChecks() {
    // Worker availability check: fails when no worker, or fewer than half, is online
    this.registerCheck('worker_availability', async () => {
      const stats = this.clusterManager.getStats();
      const activeWorkers = stats.workers.filter(w => w.state === 'online').length;

      if (activeWorkers === 0) {
        throw new Error('No active workers available');
      }

      if (activeWorkers < stats.total * 0.5) {
        throw new Error(`Less than 50% of workers active: ${activeWorkers}/${stats.total}`);
      }

      return {
        status: 'healthy',
        activeWorkers,
        totalWorkers: stats.total,
        percentage: (activeWorkers / stats.total) * 100
      };
    });

    // Memory usage check: compares this process's heapUsed/heapTotal to the threshold
    this.registerCheck('memory_usage', async () => {
      const memoryUsage = process.memoryUsage();
      const memoryRatio = memoryUsage.heapUsed / memoryUsage.heapTotal;

      if (memoryRatio > this.options.memoryThreshold) {
        throw new Error(`High memory usage: ${(memoryRatio * 100).toFixed(1)}% (threshold: ${this.options.memoryThreshold * 100}%)`);
      }

      return {
        status: 'healthy',
        heapUsed: memoryUsage.heapUsed,
        heapTotal: memoryUsage.heapTotal,
        ratio: memoryRatio,
        threshold: this.options.memoryThreshold
      };
    });

    // Error rate check: cluster-wide errors as a percentage of total requests
    this.registerCheck('error_rate', async () => {
      const stats = this.clusterManager.getStats();
      const errorRate = stats.totalRequests > 0 ? (stats.errors / stats.totalRequests) * 100 : 0;

      if (errorRate > this.options.maxErrorRate) {
        throw new Error(`High error rate: ${errorRate.toFixed(2)}% (threshold: ${this.options.maxErrorRate}%)`);
      }

      return {
        status: 'healthy',
        totalRequests: stats.totalRequests,
        totalErrors: stats.errors,
        errorRate,
        threshold: this.options.maxErrorRate
      };
    });

    // Uptime check: reports degraded during the first 60 seconds of process uptime
    this.registerCheck('uptime', async () => {
      const uptime = process.uptime();
      const stats = this.clusterManager.getStats();

      if (uptime < 60) {
        return {
          status: 'degraded',
          uptime,
          message: 'System recently started'
        };
      }

      return {
        status: 'healthy',
        uptime,
        clusterUptime: stats.uptime
      };
    });

    // Worker health check: flags individual workers that are idle or error-prone
    this.registerCheck('worker_health', async () => {
      const stats = this.clusterManager.getStats();
      const unhealthyWorkers = [];

      for (const worker of stats.workers) {
        // A worker with no recorded request is treated as idle forever
        const idleTime = worker.lastRequestTime ? Date.now() - worker.lastRequestTime : Infinity;

        if (idleTime > 300000) { // 5 minutes
          unhealthyWorkers.push({
            pid: worker.pid,
            issue: 'Worker idle for too long',
            idleTime: Math.floor(idleTime / 1000)
          });
        }

        // Check error rate for individual workers
        const workerErrorRate = worker.requests > 0 ? (worker.errors / worker.requests) * 100 : 0;
        if (workerErrorRate > 20) { // 20% error rate threshold
          unhealthyWorkers.push({
            pid: worker.pid,
            issue: 'High error rate',
            errorRate: workerErrorRate.toFixed(2)
          });
        }
      }

      if (unhealthyWorkers.length > 0) {
        return {
          status: 'degraded',
          unhealthyWorkers,
          totalWorkers: stats.workers.length,
          message: `${unhealthyWorkers.length} workers have issues`
        };
      }

      return {
        status: 'healthy',
        totalWorkers: stats.workers.length,
        message: 'All workers healthy'
      };
    });
  }

  /**
   * Register a custom health check. Registering an existing name replaces
   * the previous check together with its history.
   * @param {string} name - Check name
   * @param {Function} checkFn - Check function returning Promise
   * @param {Object} options - Check options; `timeout` (ms) overrides the monitor default
   */
  registerCheck(name, checkFn, options = {}) {
    this.healthChecks.set(name, {
      fn: checkFn,
      options: {
        timeout: this.options.timeout,
        ...options
      },
      lastRun: null,
      lastResult: null,
      history: []
    });

    console.log(`✅ Registered health check: ${name}`);
  }

  /**
   * Unregister a health check
   * @param {string} name - Check name
   * @returns {boolean} Success status
   */
  unregisterCheck(name) {
    return this.healthChecks.delete(name);
  }

  /**
   * Run a specific health check, bounded by its timeout.
   *
   * The returned promise rejects only when `name` is unknown; a failing check
   * resolves to a result whose `status` is `'unhealthy'`.
   *
   * @param {string} name - Check name
   * @returns {Promise<Object>} Check result: `{ name, status, duration, timestamp }`
   *   plus `data` (check returned) or `error`/`stack` (check threw or timed out)
   * @throws {Error} If no check is registered under `name`
   */
  async runCheck(name) {
    const check = this.healthChecks.get(name);
    if (!check) {
      throw new Error(`Health check "${name}" not found`);
    }

    const startTime = Date.now();
    let timer = null;
    let outcome;

    try {
      // Run check with timeout
      const data = await Promise.race([
        check.fn(),
        new Promise((_, reject) => {
          timer = setTimeout(
            () => reject(new Error(`Health check "${name}" timeout`)),
            check.options.timeout
          );
        })
      ]);

      // Honour a non-healthy status reported by the check itself; anything
      // else that resolves counts as healthy.
      const reported = data !== null && typeof data === 'object' ? data.status : undefined;
      const status = reported === 'degraded' || reported === 'unhealthy' ? reported : 'healthy';
      outcome = { status, data };
    } catch (error) {
      // Checks may throw non-Error values; normalise them into a message.
      const isError = error instanceof Error;
      outcome = {
        status: 'unhealthy',
        error: isError ? error.message : String(error),
        stack: isError ? error.stack : undefined
      };
    } finally {
      // Without this the losing timer stays armed and keeps the event loop
      // alive for up to `timeout` ms after every check.
      clearTimeout(timer);
    }

    const endTime = Date.now();
    const { status, ...details } = outcome;
    const checkResult = {
      name,
      status,
      duration: endTime - startTime,
      timestamp: new Date().toISOString(),
      ...details
    };

    this._recordCheckResult(check, checkResult, endTime);

    // Emit outside the try/catch so a throwing listener cannot turn a
    // finished check into a second, bogus "unhealthy" record.
    this.emit(status === 'unhealthy' ? 'check:failed' : 'check:completed', checkResult);
    return checkResult;
  }

  /**
   * Store a check result on its registry entry and trim the per-check history
   * to `maxHistorySize` entries. Internal helper for `runCheck`.
   * @param {Object} check - Registry entry from `healthChecks`
   * @param {Object} checkResult - Result to record
   * @param {number} finishedAt - Completion time in epoch milliseconds
   */
  _recordCheckResult(check, checkResult, finishedAt) {
    check.lastRun = finishedAt;
    check.lastResult = checkResult;
    check.history.push(checkResult);

    // Limit history size
    if (check.history.length > this.maxHistorySize) {
      check.history = check.history.slice(-this.maxHistorySize);
    }
  }

  /**
   * Run all health checks in parallel and build an aggregate report.
   *
   * Overall status is `healthy` when every check is healthy, `unhealthy` when
   * more than half of the checks are unhealthy, and `degraded` otherwise.
   *
   * @returns {Promise<Object>} Overall health status with `status`,
   *   `timestamp`, `duration`, `checks`, `warnings`, `errors`, `recommendations`
   */
  async runAllChecks() {
    const startTime = Date.now();
    this.lastCheckTime = startTime;

    const checkNames = Array.from(this.healthChecks.keys());

    // Run all checks in parallel
    const results = await Promise.allSettled(checkNames.map(name => this.runCheck(name)));
    const duration = Date.now() - startTime;

    // Process results
    const checkResults = [];
    const warnings = [];
    const errors = [];
    let healthyChecks = 0;
    let degradedChecks = 0;
    let unhealthyChecks = 0;

    results.forEach((result, i) => {
      const name = checkNames[i];

      if (result.status !== 'fulfilled') {
        // Check failed to run (e.g. it was unregistered before it started)
        const reason = result.reason?.message ?? String(result.reason);
        unhealthyChecks++;
        errors.push(`${name}: ${reason}`);
        checkResults.push({
          name,
          status: 'unhealthy',
          duration: 0,
          timestamp: new Date().toISOString(),
          error: reason
        });
        return;
      }

      const value = result.value;
      checkResults.push(value);

      if (value.status === 'healthy') {
        healthyChecks++;
        return;
      }

      if (value.status === 'degraded') {
        degradedChecks++;
      } else {
        unhealthyChecks++;
      }
      // Failed checks carry `error`; self-reported problems carry `data.message`.
      warnings.push(`${name}: ${value.error ?? value.data?.message ?? value.status}`);
    });

    // Determine overall status
    let overallStatus = 'healthy';
    if (unhealthyChecks > 0 || degradedChecks > 0) {
      overallStatus = 'degraded';
    }
    if (unhealthyChecks > checkNames.length * 0.5) {
      overallStatus = 'unhealthy';
    }

    const healthReport = {
      status: overallStatus,
      timestamp: new Date().toISOString(),
      duration,
      checks: {
        total: checkNames.length,
        healthy: healthyChecks,
        degraded: degradedChecks,
        unhealthy: unhealthyChecks,
        results: checkResults
      },
      warnings: warnings.length > 0 ? warnings : null,
      errors: errors.length > 0 ? errors : null,
      recommendations: this.getRecommendations(checkResults)
    };

    // Add to history
    this.healthHistory.push({
      timestamp: healthReport.timestamp,
      status: healthReport.status,
      healthyChecks,
      degradedChecks,
      unhealthyChecks,
      duration
    });

    // Limit history size
    if (this.healthHistory.length > this.maxHistorySize) {
      this.healthHistory = this.healthHistory.slice(-this.maxHistorySize);
    }

    this.emit('health:report', healthReport);
    return healthReport;
  }

  /**
   * Get health check recommendations for every non-healthy built-in check.
   * @param {Array} checkResults - Array of check results
   * @returns {Array} Recommendations; `['All systems operational']` when none apply
   */
  getRecommendations(checkResults) {
    const recommendations = [];

    for (const result of checkResults) {
      if (result.status === 'healthy') {
        continue;
      }

      switch (result.name) {
        case 'worker_availability':
          recommendations.push('Consider restarting inactive workers or increasing worker count');
          break;
        case 'memory_usage':
          recommendations.push('Monitor memory usage, consider optimizing memory or scaling horizontally');
          break;
        case 'error_rate':
          recommendations.push('Investigate high error rate, check application logs and error handling');
          break;
        case 'uptime':
          if (result.data && result.data.status === 'degraded') {
            recommendations.push('System recently started, monitor for stability');
          }
          break;
        case 'worker_health':
          if (result.data && result.data.unhealthyWorkers) {
            recommendations.push(`Restart ${result.data.unhealthyWorkers.length} unhealthy workers`);
          }
          break;
      }
    }

    return recommendations.length > 0 ? recommendations : ['All systems operational'];
  }

  /**
   * Start periodic health checks: one pass immediately, then one every
   * `options.checkInterval` ms. Calling it while already started only warns.
   */
  start() {
    if (this.checkInterval) {
      console.warn('Health monitor already started');
      return;
    }

    console.log('🚀 Starting health monitor...');

    // Run initial check
    this.runAllChecks().catch(error => {
      console.error('Initial health check failed:', error);
    });

    // Start periodic checks
    this.checkInterval = setInterval(() => {
      this.runAllChecks().catch(error => {
        console.error('Periodic health check failed:', error);
      });
    }, this.options.checkInterval);

    console.log(`✅ Health monitor started with ${this.options.checkInterval}ms interval`);
  }

  /**
   * Stop periodic health checks. Does nothing when the monitor is not started.
   */
  stop() {
    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = null;
      console.log('🛑 Health monitor stopped');
    }
  }

  /**
   * Get health check handler for HTTP endpoint.
   *
   * The handler assigns `ctx.status` / `ctx.body` and calls `ctx.set(name, value)`
   * (presumably a Koa-style context - confirm against the router in use).
   * It answers 503 when the overall status is `unhealthy` or the handler
   * itself fails, and 200 otherwise.
   *
   * @returns {Function} HTTP handler function
   */
  getHandler() {
    return async (ctx) => {
      // Stack traces are only sent to clients in development, both for the
      // handler's own failure and for individual failed checks.
      const exposeStack = process.env.NODE_ENV === 'development';

      try {
        const health = await this.runAllChecks();

        // Set response status based on health
        ctx.status = health.status === 'unhealthy' ? 503 : 200;

        // Add health headers
        ctx.set('X-Health-Status', health.status);
        ctx.set('X-Health-Checks', `${health.checks.healthy}/${health.checks.total}`);
        ctx.set('X-Health-Timestamp', health.timestamp);

        const results = exposeStack
          ? health.checks.results
          : health.checks.results.map(({ stack, ...rest }) => rest);

        // Return health report
        ctx.body = {
          ...health,
          checks: { ...health.checks, results },
          // Add system information
          system: {
            pid: process.pid,
            uptime: process.uptime(),
            memory: process.memoryUsage(),
            cpu: process.cpuUsage(),
            version: process.version,
            platform: process.platform,
            arch: process.arch
          },
          // Add cluster information if available
          cluster: this.clusterManager ? this.clusterManager.getStats() : null
        };

      } catch (error) {
        console.error('Health check handler error:', error);

        ctx.status = 503;
        ctx.body = {
          status: 'error',
          timestamp: new Date().toISOString(),
          message: 'Health check failed',
          error: error.message,
          stack: exposeStack ? error.stack : undefined
        };
      }
    };
  }

  /**
   * Get detailed health information: per-check state, a summary of the
   * aggregate history, the active configuration and the last run time.
   * @returns {Object} Detailed health information
   */
  getDetailedHealth() {
    const checks = [];

    for (const [name, check] of this.healthChecks.entries()) {
      checks.push({
        name,
        lastRun: check.lastRun ? new Date(check.lastRun).toISOString() : null,
        lastResult: check.lastResult,
        historySize: check.history.length,
        options: check.options
      });
    }

    // Tally overall statuses in a single pass over the history
    const tally = { healthy: 0, degraded: 0, unhealthy: 0 };
    for (const entry of this.healthHistory) {
      if (Object.hasOwn(tally, entry.status)) {
        tally[entry.status]++;
      }
    }

    return {
      checks,
      history: {
        records: this.healthHistory.length,
        latest: this.healthHistory[this.healthHistory.length - 1] || null,
        summary: {
          totalChecks: this.healthHistory.length,
          healthyChecks: tally.healthy,
          degradedChecks: tally.degraded,
          unhealthyChecks: tally.unhealthy
        }
      },
      config: this.options,
      lastCheck: this.lastCheckTime ? new Date(this.lastCheckTime).toISOString() : null
    };
  }

  /**
   * Get health check status
   * @param {string} name - Check name
   * @returns {Object|null} Check status with the last 10 results, or null if unknown
   */
  getCheckStatus(name) {
    const check = this.healthChecks.get(name);
    if (!check) {
      return null;
    }

    return {
      name,
      lastRun: check.lastRun,
      lastResult: check.lastResult,
      history: check.history.slice(-10), // Last 10 results
      options: check.options
    };
  }

  /**
   * Get all check statuses
   * @returns {Array} Array of check statuses, in registration order
   */
  getAllCheckStatuses() {
    const statuses = [];

    for (const name of this.healthChecks.keys()) {
      const status = this.getCheckStatus(name);
      if (status) {
        statuses.push(status);
      }
    }

    return statuses;
  }

  /**
   * Reset health check history: clears the aggregate history and every
   * check's own history, last run time and last result.
   */
  resetHistory() {
    this.healthHistory = [];

    for (const check of this.healthChecks.values()) {
      check.history = [];
      check.lastRun = null;
      check.lastResult = null;
    }

    console.log('🔄 Health check history reset');
  }
}
570
+
571
+ export default HealthMonitor;