@gravito/zenith 0.1.0-beta.1 → 1.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ALERTING_GUIDE.md +71 -0
- package/QUASAR_MASTER_PLAN.md +137 -0
- package/dist/bin.js +38061 -26911
- package/dist/client/assets/index-BSTyMCFd.css +1 -0
- package/dist/client/assets/index-oXEse8ih.js +436 -0
- package/dist/client/index.html +2 -2
- package/dist/server/index.js +38061 -26911
- package/package.json +52 -48
- package/specs/PULSE_SPEC.md +86 -0
- package/src/client/App.tsx +2 -0
- package/src/client/Layout.tsx +30 -11
- package/src/client/Sidebar.tsx +2 -1
- package/src/client/WorkerStatus.tsx +25 -21
- package/src/client/components/BrandIcons.tsx +63 -0
- package/src/client/components/PageHeader.tsx +34 -0
- package/src/client/pages/OverviewPage.tsx +18 -20
- package/src/client/pages/PulsePage.tsx +396 -0
- package/src/client/pages/QueuesPage.tsx +1 -3
- package/src/client/pages/SettingsPage.tsx +586 -78
- package/src/client/pages/WorkersPage.tsx +1 -1
- package/src/client/pages/index.ts +1 -0
- package/src/server/index.ts +148 -8
- package/src/server/services/AlertService.ts +189 -41
- package/src/server/services/CommandService.ts +137 -0
- package/src/server/services/PulseService.ts +80 -0
- package/src/server/services/QueueService.ts +58 -4
- package/src/shared/types.ts +97 -0
- package/tsconfig.json +2 -2
- package/PULSE_IMPLEMENTATION_PLAN.md +0 -111
- package/dist/client/assets/index-DGYEwTDL.css +0 -1
- package/dist/client/assets/index-oyTdySX0.js +0 -421
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { useQuery, useQueryClient } from '@tanstack/react-query'
|
|
2
2
|
import { motion } from 'framer-motion'
|
|
3
3
|
import { AlertCircle, Clock, Cpu, Gauge, MemoryStick, RefreshCcw, Server, Zap } from 'lucide-react'
|
|
4
|
-
import
|
|
4
|
+
import { useEffect } from 'react'
|
|
5
5
|
import { cn } from '../utils'
|
|
6
6
|
|
|
7
7
|
interface Worker {
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { LoginPage } from './LoginPage'
|
|
2
2
|
export { MetricsPage } from './MetricsPage'
|
|
3
3
|
export { OverviewPage } from './OverviewPage'
|
|
4
|
+
export { PulsePage } from './PulsePage'
|
|
4
5
|
export { QueuesPage } from './QueuesPage'
|
|
5
6
|
export { SchedulesPage } from './SchedulesPage'
|
|
6
7
|
export { SettingsPage } from './SettingsPage'
|
package/src/server/index.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import { DB } from '@gravito/atlas'
|
|
2
2
|
import { Photon } from '@gravito/photon'
|
|
3
|
+
import { QuasarAgent } from '@gravito/quasar'
|
|
3
4
|
import { MySQLPersistence, SQLitePersistence } from '@gravito/stream'
|
|
5
|
+
import fs from 'fs'
|
|
4
6
|
import { serveStatic } from 'hono/bun'
|
|
5
7
|
import { getCookie } from 'hono/cookie'
|
|
6
8
|
import { streamSSE } from 'hono/streaming'
|
|
9
|
+
import os from 'os'
|
|
10
|
+
import path from 'path'
|
|
11
|
+
import { fileURLToPath } from 'url'
|
|
7
12
|
import {
|
|
8
13
|
authMiddleware,
|
|
9
14
|
createSession,
|
|
@@ -11,6 +16,8 @@ import {
|
|
|
11
16
|
isAuthEnabled,
|
|
12
17
|
verifyPassword,
|
|
13
18
|
} from './middleware/auth'
|
|
19
|
+
import { CommandService } from './services/CommandService'
|
|
20
|
+
import { PulseService } from './services/PulseService'
|
|
14
21
|
import { QueueService } from './services/QueueService'
|
|
15
22
|
|
|
16
23
|
const app = new Photon()
|
|
@@ -58,14 +65,26 @@ if (dbDriver === 'sqlite' || process.env.DB_HOST) {
|
|
|
58
65
|
|
|
59
66
|
// Service Initialization
|
|
60
67
|
const queueService = new QueueService(REDIS_URL, QUEUE_PREFIX, persistence)
|
|
68
|
+
const pulseService = new PulseService(REDIS_URL)
|
|
69
|
+
const commandService = new CommandService(REDIS_URL)
|
|
61
70
|
|
|
62
71
|
queueService
|
|
63
72
|
.connect()
|
|
73
|
+
.then(() => pulseService.connect())
|
|
74
|
+
.then(() => commandService.connect())
|
|
64
75
|
.then(() => {
|
|
76
|
+
// Start Self-Monitoring (Quasar)
|
|
77
|
+
const agent = new QuasarAgent({
|
|
78
|
+
service: 'flux-console',
|
|
79
|
+
redisUrl: REDIS_URL,
|
|
80
|
+
})
|
|
81
|
+
agent.start().catch((err) => console.error('[FluxConsole] Quasar Agent Error:', err))
|
|
82
|
+
|
|
65
83
|
console.log(`[FluxConsole] Connected to Redis at ${REDIS_URL}`)
|
|
66
84
|
// Start background metrics recording (Reduced from 5s to 2s for better real-time feel)
|
|
67
|
-
setInterval(() => {
|
|
68
|
-
|
|
85
|
+
setInterval(async () => {
|
|
86
|
+
const nodes = await pulseService.getNodes()
|
|
87
|
+
queueService.recordStatusMetrics(nodes).catch(console.error)
|
|
69
88
|
}, 2000)
|
|
70
89
|
|
|
71
90
|
// Start Scheduler Tick (Reduced from 10s to 5s)
|
|
@@ -74,7 +93,10 @@ queueService
|
|
|
74
93
|
}, 5000)
|
|
75
94
|
|
|
76
95
|
// Record initial snapshot
|
|
77
|
-
|
|
96
|
+
pulseService
|
|
97
|
+
.getNodes()
|
|
98
|
+
.then((nodes) => queueService.recordStatusMetrics(nodes))
|
|
99
|
+
.catch(console.error)
|
|
78
100
|
})
|
|
79
101
|
.catch((err) => {
|
|
80
102
|
console.error('[FluxConsole] Failed to connect to Redis', err)
|
|
@@ -328,19 +350,80 @@ api.get('/metrics/history', async (c) => {
|
|
|
328
350
|
|
|
329
351
|
api.get('/system/status', (c) => {
|
|
330
352
|
const mem = process.memoryUsage()
|
|
353
|
+
const totalMem = os.totalmem()
|
|
354
|
+
|
|
355
|
+
// Find package.json (relative to this file in src/server/index.ts)
|
|
356
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
|
357
|
+
const pkgPath = path.resolve(__dirname, '../../package.json')
|
|
358
|
+
let pkg = { version: '0.1.0-unknown', name: '@gravito/zenith' }
|
|
359
|
+
try {
|
|
360
|
+
pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8'))
|
|
361
|
+
} catch (_e) {
|
|
362
|
+
// fallback
|
|
363
|
+
}
|
|
364
|
+
|
|
331
365
|
return c.json({
|
|
332
366
|
node: process.version,
|
|
333
367
|
memory: {
|
|
334
368
|
rss: `${(mem.rss / 1024 / 1024).toFixed(2)} MB`,
|
|
335
369
|
heapUsed: `${(mem.heapUsed / 1024 / 1024).toFixed(2)} MB`,
|
|
336
|
-
total:
|
|
370
|
+
total: `${(totalMem / 1024 / 1024 / 1024).toFixed(2)} GB`,
|
|
337
371
|
},
|
|
338
|
-
|
|
372
|
+
version: pkg.version,
|
|
373
|
+
package: pkg.name,
|
|
374
|
+
engine: `Zenith ${pkg.version}`,
|
|
339
375
|
uptime: process.uptime(),
|
|
340
|
-
env:
|
|
376
|
+
env:
|
|
377
|
+
process.env.NODE_ENV === 'production'
|
|
378
|
+
? `production (${os.hostname()})`
|
|
379
|
+
: `development (${os.hostname()})`,
|
|
380
|
+
redisUrl: process.env.REDIS_URL || 'redis://localhost:6379',
|
|
341
381
|
})
|
|
342
382
|
})
|
|
343
383
|
|
|
384
|
+
// --- Pulse Monitoring ---
|
|
385
|
+
api.get('/pulse/nodes', async (c) => {
|
|
386
|
+
try {
|
|
387
|
+
const nodes = await pulseService.getNodes()
|
|
388
|
+
return c.json({ nodes })
|
|
389
|
+
} catch (_err) {
|
|
390
|
+
return c.json({ error: 'Failed to fetch pulse nodes' }, 500)
|
|
391
|
+
}
|
|
392
|
+
})
|
|
393
|
+
|
|
394
|
+
// --- Pulse Remote Control (Phase 3) ---
|
|
395
|
+
api.post('/pulse/command', async (c) => {
|
|
396
|
+
try {
|
|
397
|
+
const { service, nodeId, type, queue, jobKey, driver, action } = await c.req.json()
|
|
398
|
+
|
|
399
|
+
// Validate required fields
|
|
400
|
+
if (!service || !nodeId || !type || !queue || !jobKey) {
|
|
401
|
+
return c.json({ error: 'Missing required fields: service, nodeId, type, queue, jobKey' }, 400)
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
// Validate command type
|
|
405
|
+
if (type !== 'RETRY_JOB' && type !== 'DELETE_JOB' && type !== 'LARAVEL_ACTION') {
|
|
406
|
+
return c.json({ error: 'Invalid command type. Allowed: RETRY_JOB, DELETE_JOB, LARAVEL_ACTION' }, 400)
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
const commandId = await commandService.sendCommand(service, nodeId, type, {
|
|
410
|
+
queue,
|
|
411
|
+
jobKey,
|
|
412
|
+
driver: driver || 'redis',
|
|
413
|
+
action,
|
|
414
|
+
})
|
|
415
|
+
|
|
416
|
+
return c.json({
|
|
417
|
+
success: true,
|
|
418
|
+
commandId,
|
|
419
|
+
message: `Command ${type} sent to ${nodeId}. Observe job state for result.`,
|
|
420
|
+
})
|
|
421
|
+
} catch (err) {
|
|
422
|
+
console.error('[CommandService] Error:', err)
|
|
423
|
+
return c.json({ error: 'Failed to send command' }, 500)
|
|
424
|
+
}
|
|
425
|
+
})
|
|
426
|
+
|
|
344
427
|
api.post('/queues/:name/jobs/delete', async (c) => {
|
|
345
428
|
const queueName = c.req.param('name')
|
|
346
429
|
const { type, raw } = await c.req.json()
|
|
@@ -454,9 +537,23 @@ api.get('/logs/stream', async (c) => {
|
|
|
454
537
|
})
|
|
455
538
|
})
|
|
456
539
|
|
|
540
|
+
// 4. Poll Pulse Nodes per client (simple polling for now)
|
|
541
|
+
const pulseInterval = setInterval(async () => {
|
|
542
|
+
try {
|
|
543
|
+
const nodes = await pulseService.getNodes()
|
|
544
|
+
await stream.writeSSE({
|
|
545
|
+
data: JSON.stringify({ nodes }),
|
|
546
|
+
event: 'pulse',
|
|
547
|
+
})
|
|
548
|
+
} catch (err) {
|
|
549
|
+
// ignore errors
|
|
550
|
+
}
|
|
551
|
+
}, 2000)
|
|
552
|
+
|
|
457
553
|
stream.onAbort(() => {
|
|
458
554
|
unsubscribeLogs()
|
|
459
555
|
unsubscribeStats()
|
|
556
|
+
clearInterval(pulseInterval)
|
|
460
557
|
})
|
|
461
558
|
|
|
462
559
|
// Keep alive
|
|
@@ -508,17 +605,60 @@ api.delete('/schedules/:id', async (c) => {
|
|
|
508
605
|
})
|
|
509
606
|
|
|
510
607
|
// --- Alerting ---
|
|
511
|
-
api.get('/alerts/config', (c) => {
|
|
608
|
+
api.get('/alerts/config', async (c) => {
|
|
512
609
|
return c.json({
|
|
513
610
|
rules: queueService.alerts.getRules(),
|
|
514
|
-
|
|
611
|
+
config: queueService.alerts.getConfig(),
|
|
612
|
+
maintenance: await queueService.getMaintenanceConfig(),
|
|
515
613
|
})
|
|
516
614
|
})
|
|
517
615
|
|
|
616
|
+
api.post('/maintenance/config', async (c) => {
|
|
617
|
+
const config = await c.req.json()
|
|
618
|
+
try {
|
|
619
|
+
await queueService.saveMaintenanceConfig(config)
|
|
620
|
+
return c.json({ success: true })
|
|
621
|
+
} catch (err) {
|
|
622
|
+
return c.json({ error: 'Failed to save maintenance config' }, 500)
|
|
623
|
+
}
|
|
624
|
+
})
|
|
625
|
+
|
|
626
|
+
api.post('/alerts/config', async (c) => {
|
|
627
|
+
const config = await c.req.json()
|
|
628
|
+
try {
|
|
629
|
+
await queueService.alerts.saveConfig(config)
|
|
630
|
+
return c.json({ success: true })
|
|
631
|
+
} catch (err) {
|
|
632
|
+
return c.json({ error: 'Failed to save alert config' }, 500)
|
|
633
|
+
}
|
|
634
|
+
})
|
|
635
|
+
|
|
636
|
+
api.post('/alerts/rules', async (c) => {
|
|
637
|
+
const rule = await c.req.json()
|
|
638
|
+
try {
|
|
639
|
+
await queueService.alerts.addRule(rule)
|
|
640
|
+
return c.json({ success: true })
|
|
641
|
+
} catch (err) {
|
|
642
|
+
return c.json({ error: 'Failed to add rule' }, 500)
|
|
643
|
+
}
|
|
644
|
+
})
|
|
645
|
+
|
|
646
|
+
api.delete('/alerts/rules/:id', async (c) => {
|
|
647
|
+
const id = c.req.param('id')
|
|
648
|
+
try {
|
|
649
|
+
await queueService.alerts.deleteRule(id)
|
|
650
|
+
return c.json({ success: true })
|
|
651
|
+
} catch (err) {
|
|
652
|
+
return c.json({ error: 'Failed to delete rule' }, 500)
|
|
653
|
+
}
|
|
654
|
+
})
|
|
655
|
+
|
|
518
656
|
api.post('/alerts/test', async (c) => {
|
|
519
657
|
try {
|
|
658
|
+
const nodes = await pulseService.getNodes()
|
|
520
659
|
queueService.alerts.check({
|
|
521
660
|
queues: [],
|
|
661
|
+
nodes,
|
|
522
662
|
workers: [
|
|
523
663
|
{
|
|
524
664
|
id: 'test-node',
|
|
@@ -1,57 +1,110 @@
|
|
|
1
1
|
import { EventEmitter } from 'events'
|
|
2
|
+
import { Redis } from 'ioredis'
|
|
3
|
+
import nodemailer from 'nodemailer'
|
|
4
|
+
import type { AlertConfig, AlertEvent, AlertRule, PulseNode } from '../../shared/types'
|
|
2
5
|
import type { WorkerReport } from './QueueService'
|
|
3
6
|
|
|
4
|
-
export interface AlertRule {
|
|
5
|
-
id: string
|
|
6
|
-
name: string
|
|
7
|
-
type: 'backlog' | 'failure' | 'worker_lost'
|
|
8
|
-
threshold: number
|
|
9
|
-
queue?: string // Optional: specific queue or all
|
|
10
|
-
cooldownMinutes: number
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
export interface AlertEvent {
|
|
14
|
-
ruleId: string
|
|
15
|
-
timestamp: number
|
|
16
|
-
message: string
|
|
17
|
-
severity: 'warning' | 'critical'
|
|
18
|
-
}
|
|
19
|
-
|
|
20
7
|
export class AlertService {
|
|
8
|
+
private redis: Redis
|
|
21
9
|
private rules: AlertRule[] = []
|
|
10
|
+
private config: AlertConfig = { channels: {} }
|
|
22
11
|
private cooldowns: Map<string, number> = new Map()
|
|
23
|
-
private webhookUrl: string | null = process.env.SLACK_WEBHOOK_URL || null
|
|
24
12
|
private emitter = new EventEmitter()
|
|
13
|
+
private readonly RULES_KEY = 'gravito:zenith:alerts:rules'
|
|
14
|
+
private readonly CONFIG_KEY = 'gravito:zenith:alerts:config'
|
|
25
15
|
|
|
26
|
-
constructor() {
|
|
27
|
-
|
|
16
|
+
constructor(redisUrl: string) {
|
|
17
|
+
this.redis = new Redis(redisUrl, {
|
|
18
|
+
lazyConnect: true,
|
|
19
|
+
})
|
|
20
|
+
|
|
21
|
+
// Initial default rules
|
|
28
22
|
this.rules = [
|
|
29
23
|
{
|
|
30
24
|
id: 'global_failure_spike',
|
|
31
25
|
name: 'High Failure Rate',
|
|
32
26
|
type: 'failure',
|
|
33
|
-
threshold: 50,
|
|
27
|
+
threshold: 50,
|
|
34
28
|
cooldownMinutes: 30,
|
|
35
29
|
},
|
|
36
30
|
{
|
|
37
31
|
id: 'global_backlog_critical',
|
|
38
32
|
name: 'Queue Backlog Warning',
|
|
39
33
|
type: 'backlog',
|
|
40
|
-
threshold: 1000,
|
|
34
|
+
threshold: 1000,
|
|
41
35
|
cooldownMinutes: 60,
|
|
42
36
|
},
|
|
43
37
|
{
|
|
44
38
|
id: 'no_workers_online',
|
|
45
39
|
name: 'All Workers Offline',
|
|
46
40
|
type: 'worker_lost',
|
|
47
|
-
threshold: 1,
|
|
41
|
+
threshold: 1,
|
|
48
42
|
cooldownMinutes: 15,
|
|
49
43
|
},
|
|
50
44
|
]
|
|
45
|
+
|
|
46
|
+
// Default configuration (with env fallback for Slack)
|
|
47
|
+
if (process.env.SLACK_WEBHOOK_URL) {
|
|
48
|
+
this.config.channels.slack = {
|
|
49
|
+
enabled: true,
|
|
50
|
+
webhookUrl: process.env.SLACK_WEBHOOK_URL,
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
this.init().catch((err) => console.error('[AlertService] Failed to initialize:', err))
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
async connect() {
|
|
58
|
+
if (this.redis.status === 'wait') {
|
|
59
|
+
await this.redis.connect()
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
private async init() {
|
|
64
|
+
await this.loadRules()
|
|
65
|
+
await this.loadConfig()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
async loadRules() {
|
|
69
|
+
try {
|
|
70
|
+
const data = await this.redis.get(this.RULES_KEY)
|
|
71
|
+
if (data) {
|
|
72
|
+
this.rules = JSON.parse(data)
|
|
73
|
+
}
|
|
74
|
+
} catch (err) {
|
|
75
|
+
console.error('[AlertService] Error loading rules from Redis:', err)
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
async loadConfig() {
|
|
80
|
+
try {
|
|
81
|
+
const data = await this.redis.get(this.CONFIG_KEY)
|
|
82
|
+
if (data) {
|
|
83
|
+
this.config = JSON.parse(data)
|
|
84
|
+
}
|
|
85
|
+
} catch (err) {
|
|
86
|
+
console.error('[AlertService] Error loading config from Redis:', err)
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
async saveRules(rules: AlertRule[]) {
|
|
91
|
+
this.rules = rules
|
|
92
|
+
await this.redis.set(this.RULES_KEY, JSON.stringify(rules))
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
async saveConfig(config: AlertConfig) {
|
|
96
|
+
this.config = config
|
|
97
|
+
await this.redis.set(this.CONFIG_KEY, JSON.stringify(config))
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
async addRule(rule: AlertRule) {
|
|
101
|
+
this.rules.push(rule)
|
|
102
|
+
await this.saveRules(this.rules)
|
|
51
103
|
}
|
|
52
104
|
|
|
53
|
-
|
|
54
|
-
this.
|
|
105
|
+
async deleteRule(id: string) {
|
|
106
|
+
this.rules = this.rules.filter((r) => r.id !== id)
|
|
107
|
+
await this.saveRules(this.rules)
|
|
55
108
|
}
|
|
56
109
|
|
|
57
110
|
onAlert(callback: (event: AlertEvent) => void) {
|
|
@@ -61,10 +114,10 @@ export class AlertService {
|
|
|
61
114
|
|
|
62
115
|
/**
|
|
63
116
|
* Evaluates rules against provided data.
|
|
64
|
-
* Extremely lightweight: only uses existing metrics data.
|
|
65
117
|
*/
|
|
66
118
|
async check(data: {
|
|
67
119
|
queues: any[]
|
|
120
|
+
nodes: Record<string, PulseNode[]>
|
|
68
121
|
workers: WorkerReport[]
|
|
69
122
|
totals: { waiting: number; delayed: number; failed: number }
|
|
70
123
|
}) {
|
|
@@ -83,21 +136,35 @@ export class AlertService {
|
|
|
83
136
|
|
|
84
137
|
// 2. Evaluate Rule
|
|
85
138
|
switch (rule.type) {
|
|
86
|
-
case 'backlog':
|
|
87
|
-
|
|
139
|
+
case 'backlog': {
|
|
140
|
+
const targetValue = rule.queue
|
|
141
|
+
? data.queues.find((q) => q.name === rule.queue)?.waiting || 0
|
|
142
|
+
: data.totals.waiting
|
|
143
|
+
|
|
144
|
+
if (targetValue >= rule.threshold) {
|
|
88
145
|
fired = true
|
|
89
146
|
severity = 'critical'
|
|
90
|
-
message =
|
|
147
|
+
message = rule.queue
|
|
148
|
+
? `Queue backlog on ${rule.queue}: ${targetValue} jobs waiting.`
|
|
149
|
+
: `Queue backlog detected: ${targetValue} jobs waiting across all queues.`
|
|
91
150
|
}
|
|
92
151
|
break
|
|
152
|
+
}
|
|
93
153
|
|
|
94
|
-
case 'failure':
|
|
95
|
-
|
|
154
|
+
case 'failure': {
|
|
155
|
+
const targetValue = rule.queue
|
|
156
|
+
? data.queues.find((q) => q.name === rule.queue)?.failed || 0
|
|
157
|
+
: data.totals.failed
|
|
158
|
+
|
|
159
|
+
if (targetValue >= rule.threshold) {
|
|
96
160
|
fired = true
|
|
97
161
|
severity = 'warning'
|
|
98
|
-
message =
|
|
162
|
+
message = rule.queue
|
|
163
|
+
? `High failure count on ${rule.queue}: ${targetValue} jobs failed.`
|
|
164
|
+
: `High failure count: ${targetValue} jobs are currently in failed state.`
|
|
99
165
|
}
|
|
100
166
|
break
|
|
167
|
+
}
|
|
101
168
|
|
|
102
169
|
case 'worker_lost':
|
|
103
170
|
if (data.workers.length < rule.threshold) {
|
|
@@ -106,6 +173,36 @@ export class AlertService {
|
|
|
106
173
|
message = `System Incident: Zero worker nodes detected! Jobs will not be processed.`
|
|
107
174
|
}
|
|
108
175
|
break
|
|
176
|
+
|
|
177
|
+
case 'node_cpu':
|
|
178
|
+
// Check all pulse nodes
|
|
179
|
+
for (const serviceNodes of Object.values(data.nodes)) {
|
|
180
|
+
for (const node of serviceNodes) {
|
|
181
|
+
if (node.cpu.process >= rule.threshold) {
|
|
182
|
+
fired = true
|
|
183
|
+
severity = 'warning'
|
|
184
|
+
message = `High CPU Usage on ${node.service} (${node.id}): ${node.cpu.process}%`
|
|
185
|
+
break
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (fired) break
|
|
189
|
+
}
|
|
190
|
+
break
|
|
191
|
+
|
|
192
|
+
case 'node_ram':
|
|
193
|
+
for (const serviceNodes of Object.values(data.nodes)) {
|
|
194
|
+
for (const node of serviceNodes) {
|
|
195
|
+
const usagePercent = (node.memory.process.rss / node.memory.system.total) * 100
|
|
196
|
+
if (usagePercent >= rule.threshold) {
|
|
197
|
+
fired = true
|
|
198
|
+
severity = 'warning'
|
|
199
|
+
message = `High RAM Usage on ${node.service} (${node.id}): ${usagePercent.toFixed(1)}%`
|
|
200
|
+
break
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
if (fired) break
|
|
204
|
+
}
|
|
205
|
+
break
|
|
109
206
|
}
|
|
110
207
|
|
|
111
208
|
// 3. Dispatch if fired
|
|
@@ -124,14 +221,26 @@ export class AlertService {
|
|
|
124
221
|
}
|
|
125
222
|
}
|
|
126
223
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
* Fire-and-forget to ensure zero impact on main loop latency.
|
|
130
|
-
*/
|
|
131
|
-
private notify(event: AlertEvent) {
|
|
132
|
-
if (!this.webhookUrl) return
|
|
224
|
+
private async notify(event: AlertEvent) {
|
|
225
|
+
const { slack, discord, email } = this.config.channels
|
|
133
226
|
|
|
134
|
-
//
|
|
227
|
+
// 1. Notify Slack
|
|
228
|
+
if (slack?.enabled && slack.webhookUrl) {
|
|
229
|
+
this.sendToWebhook(slack.webhookUrl, 'Slack', event).catch(console.error)
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// 2. Notify Discord
|
|
233
|
+
if (discord?.enabled && discord.webhookUrl) {
|
|
234
|
+
this.sendToWebhook(discord.webhookUrl, 'Discord', event).catch(console.error)
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// 3. Notify Email
|
|
238
|
+
if (email?.enabled) {
|
|
239
|
+
this.sendEmail(email, event).catch(console.error)
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
private async sendToWebhook(url: string, platform: string, event: AlertEvent) {
|
|
135
244
|
const payload = {
|
|
136
245
|
text: `*Flux Console Alert [${event.severity.toUpperCase()}]*\n${event.message}\n_Time: ${new Date(event.timestamp).toISOString()}_`,
|
|
137
246
|
attachments: [
|
|
@@ -140,21 +249,60 @@ export class AlertService {
|
|
|
140
249
|
fields: [
|
|
141
250
|
{ title: 'Rule', value: event.ruleId, short: true },
|
|
142
251
|
{ title: 'Severity', value: event.severity, short: true },
|
|
252
|
+
{ title: 'Platform', value: platform, short: true },
|
|
143
253
|
],
|
|
144
254
|
},
|
|
145
255
|
],
|
|
146
256
|
}
|
|
147
257
|
|
|
148
|
-
fetch(
|
|
258
|
+
const res = await fetch(url, {
|
|
149
259
|
method: 'POST',
|
|
150
260
|
headers: { 'Content-Type': 'application/json' },
|
|
151
261
|
body: JSON.stringify(payload),
|
|
152
|
-
})
|
|
153
|
-
|
|
262
|
+
})
|
|
263
|
+
|
|
264
|
+
if (!res.ok) {
|
|
265
|
+
throw new Error(`Failed to send to ${platform}: ${await res.text()}`)
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
private async sendEmail(config: any, event: AlertEvent) {
|
|
270
|
+
const transporter = nodemailer.createTransport({
|
|
271
|
+
host: config.smtpHost,
|
|
272
|
+
port: config.smtpPort,
|
|
273
|
+
secure: config.smtpPort === 465,
|
|
274
|
+
auth: {
|
|
275
|
+
user: config.smtpUser,
|
|
276
|
+
pass: config.smtpPass,
|
|
277
|
+
},
|
|
278
|
+
})
|
|
279
|
+
|
|
280
|
+
await transporter.sendMail({
|
|
281
|
+
from: config.from,
|
|
282
|
+
to: config.to,
|
|
283
|
+
subject: `[Zenith Alert] ${event.severity.toUpperCase()}: ${event.ruleId}`,
|
|
284
|
+
text: `${event.message}\n\nTimestamp: ${new Date(event.timestamp).toISOString()}\nSeverity: ${event.severity}`,
|
|
285
|
+
html: `
|
|
286
|
+
<div style="font-family: sans-serif; padding: 20px; border: 1px solid #eee; border-radius: 10px;">
|
|
287
|
+
<h2 style="color: ${event.severity === 'critical' ? '#ef4444' : '#f59e0b'}">
|
|
288
|
+
Zenith Alert: ${event.severity.toUpperCase()}
|
|
289
|
+
</h2>
|
|
290
|
+
<p style="font-size: 16px;">${event.message}</p>
|
|
291
|
+
<hr />
|
|
292
|
+
<p style="font-size: 12px; color: #666;">
|
|
293
|
+
Rule ID: ${event.ruleId}<br />
|
|
294
|
+
Time: ${new Date(event.timestamp).toISOString()}
|
|
295
|
+
</p>
|
|
296
|
+
</div>
|
|
297
|
+
`,
|
|
154
298
|
})
|
|
155
299
|
}
|
|
156
300
|
|
|
157
301
|
getRules() {
|
|
158
302
|
return this.rules
|
|
159
303
|
}
|
|
304
|
+
|
|
305
|
+
getConfig() {
|
|
306
|
+
return this.config
|
|
307
|
+
}
|
|
160
308
|
}
|