@gravito/zenith 0.1.0-beta.1 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
  import { useQuery, useQueryClient } from '@tanstack/react-query'
  import { motion } from 'framer-motion'
  import { AlertCircle, Clock, Cpu, Gauge, MemoryStick, RefreshCcw, Server, Zap } from 'lucide-react'
- import React, { useEffect } from 'react'
+ import { useEffect } from 'react'
  import { cn } from '../utils'
 
  interface Worker {
@@ -1,6 +1,7 @@
  export { LoginPage } from './LoginPage'
  export { MetricsPage } from './MetricsPage'
  export { OverviewPage } from './OverviewPage'
+ export { PulsePage } from './PulsePage'
  export { QueuesPage } from './QueuesPage'
  export { SchedulesPage } from './SchedulesPage'
  export { SettingsPage } from './SettingsPage'
@@ -1,9 +1,14 @@
  import { DB } from '@gravito/atlas'
  import { Photon } from '@gravito/photon'
+ import { QuasarAgent } from '@gravito/quasar'
  import { MySQLPersistence, SQLitePersistence } from '@gravito/stream'
+ import fs from 'fs'
  import { serveStatic } from 'hono/bun'
  import { getCookie } from 'hono/cookie'
  import { streamSSE } from 'hono/streaming'
+ import os from 'os'
+ import path from 'path'
+ import { fileURLToPath } from 'url'
  import {
  authMiddleware,
  createSession,
@@ -11,6 +16,8 @@ import {
  isAuthEnabled,
  verifyPassword,
  } from './middleware/auth'
+ import { CommandService } from './services/CommandService'
+ import { PulseService } from './services/PulseService'
  import { QueueService } from './services/QueueService'
 
  const app = new Photon()
@@ -58,14 +65,26 @@ if (dbDriver === 'sqlite' || process.env.DB_HOST) {
 
  // Service Initialization
  const queueService = new QueueService(REDIS_URL, QUEUE_PREFIX, persistence)
+ const pulseService = new PulseService(REDIS_URL)
+ const commandService = new CommandService(REDIS_URL)
 
  queueService
  .connect()
+ .then(() => pulseService.connect())
+ .then(() => commandService.connect())
  .then(() => {
+ // Start Self-Monitoring (Quasar)
+ const agent = new QuasarAgent({
+ service: 'flux-console',
+ redisUrl: REDIS_URL,
+ })
+ agent.start().catch((err) => console.error('[FluxConsole] Quasar Agent Error:', err))
+
  console.log(`[FluxConsole] Connected to Redis at ${REDIS_URL}`)
  // Start background metrics recording (Reduced from 5s to 2s for better real-time feel)
- setInterval(() => {
- queueService.recordStatusMetrics().catch(console.error)
+ setInterval(async () => {
+ const nodes = await pulseService.getNodes()
+ queueService.recordStatusMetrics(nodes).catch(console.error)
  }, 2000)
 
  // Start Scheduler Tick (Reduced from 10s to 5s)
@@ -74,7 +93,10 @@ queueService
  }, 5000)
 
  // Record initial snapshot
- queueService.recordStatusMetrics().catch(console.error)
+ pulseService
+ .getNodes()
+ .then((nodes) => queueService.recordStatusMetrics(nodes))
+ .catch(console.error)
  })
  .catch((err) => {
  console.error('[FluxConsole] Failed to connect to Redis', err)
@@ -328,19 +350,80 @@ api.get('/metrics/history', async (c) => {
 
  api.get('/system/status', (c) => {
  const mem = process.memoryUsage()
+ const totalMem = os.totalmem()
+
+ // Find package.json (relative to this file in src/server/index.ts)
+ const __dirname = path.dirname(fileURLToPath(import.meta.url))
+ const pkgPath = path.resolve(__dirname, '../../package.json')
+ let pkg = { version: '0.1.0-unknown', name: '@gravito/zenith' }
+ try {
+ pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8'))
+ } catch (_e) {
+ // fallback
+ }
+
  return c.json({
  node: process.version,
  memory: {
  rss: `${(mem.rss / 1024 / 1024).toFixed(2)} MB`,
  heapUsed: `${(mem.heapUsed / 1024 / 1024).toFixed(2)} MB`,
- total: '4.00 GB', // Hardcoded limit for demo aesthetic
+ total: `${(totalMem / 1024 / 1024 / 1024).toFixed(2)} GB`,
  },
- engine: 'v0.1.0-beta.1',
+ version: pkg.version,
+ package: pkg.name,
+ engine: `Zenith ${pkg.version}`,
  uptime: process.uptime(),
- env: process.env.NODE_ENV || 'production-east-1',
+ env:
+ process.env.NODE_ENV === 'production'
+ ? `production (${os.hostname()})`
+ : `development (${os.hostname()})`,
+ redisUrl: process.env.REDIS_URL || 'redis://localhost:6379',
  })
  })
 
+ // --- Pulse Monitoring ---
+ api.get('/pulse/nodes', async (c) => {
+ try {
+ const nodes = await pulseService.getNodes()
+ return c.json({ nodes })
+ } catch (_err) {
+ return c.json({ error: 'Failed to fetch pulse nodes' }, 500)
+ }
+ })
+
+ // --- Pulse Remote Control (Phase 3) ---
+ api.post('/pulse/command', async (c) => {
+ try {
+ const { service, nodeId, type, queue, jobKey, driver, action } = await c.req.json()
+
+ // Validate required fields
+ if (!service || !nodeId || !type || !queue || !jobKey) {
+ return c.json({ error: 'Missing required fields: service, nodeId, type, queue, jobKey' }, 400)
+ }
+
+ // Validate command type
+ if (type !== 'RETRY_JOB' && type !== 'DELETE_JOB' && type !== 'LARAVEL_ACTION') {
+ return c.json({ error: 'Invalid command type. Allowed: RETRY_JOB, DELETE_JOB, LARAVEL_ACTION' }, 400)
+ }
+
+ const commandId = await commandService.sendCommand(service, nodeId, type, {
+ queue,
+ jobKey,
+ driver: driver || 'redis',
+ action,
+ })
+
+ return c.json({
+ success: true,
+ commandId,
+ message: `Command ${type} sent to ${nodeId}. Observe job state for result.`,
+ })
+ } catch (err) {
+ console.error('[CommandService] Error:', err)
+ return c.json({ error: 'Failed to send command' }, 500)
+ }
+ })
+
  api.post('/queues/:name/jobs/delete', async (c) => {
  const queueName = c.req.param('name')
  const { type, raw } = await c.req.json()
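
Note: a minimal client-side sketch of how the new remote-control endpoint above might be called. The base path and the example values (service name, node id, queue, job key) are assumptions; the field names and allowed command types mirror the validation shown in the diff.

    // Hypothetical caller for the Pulse command endpoint (path prefix assumed).
    async function retryJobOnNode(baseUrl: string): Promise<string> {
      const res = await fetch(`${baseUrl}/api/pulse/command`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          service: 'flux-console', // target service name (example value)
          nodeId: 'node-1',        // target node id (example value)
          type: 'RETRY_JOB',       // one of RETRY_JOB | DELETE_JOB | LARAVEL_ACTION
          queue: 'emails',
          jobKey: 'job:123',
          driver: 'redis',         // the server defaults to 'redis' when omitted
        }),
      })
      if (!res.ok) throw new Error(`Command failed: ${res.status}`)
      const { commandId } = await res.json()
      return commandId
    }
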
@@ -454,9 +537,23 @@ api.get('/logs/stream', async (c) => {
  })
  })
 
+ // 4. Poll Pulse Nodes per client (simple polling for now)
+ const pulseInterval = setInterval(async () => {
+ try {
+ const nodes = await pulseService.getNodes()
+ await stream.writeSSE({
+ data: JSON.stringify({ nodes }),
+ event: 'pulse',
+ })
+ } catch (err) {
+ // ignore errors
+ }
+ }, 2000)
+
  stream.onAbort(() => {
  unsubscribeLogs()
  unsubscribeStats()
+ clearInterval(pulseInterval)
  })
 
  // Keep alive
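
Note: a browser-side sketch (not part of the package) of consuming the new 'pulse' SSE event emitted every 2 seconds on the log stream above; the '/api' prefix is an assumption.

    // Subscribe to the SSE stream and pick up the periodic 'pulse' payloads.
    const source = new EventSource('/api/logs/stream')
    source.addEventListener('pulse', (e) => {
      // Payload shape per the diff: { nodes: Record<string, PulseNode[]> }
      const { nodes } = JSON.parse((e as MessageEvent).data)
      console.log('Pulse nodes update:', nodes)
    })
    source.onerror = () => source.close() // stop listening on broken connections
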
@@ -508,17 +605,60 @@ api.delete('/schedules/:id', async (c) => {
  })
 
  // --- Alerting ---
- api.get('/alerts/config', (c) => {
+ api.get('/alerts/config', async (c) => {
  return c.json({
  rules: queueService.alerts.getRules(),
- webhookEnabled: !!process.env.SLACK_WEBHOOK_URL,
+ config: queueService.alerts.getConfig(),
+ maintenance: await queueService.getMaintenanceConfig(),
  })
  })
 
+ api.post('/maintenance/config', async (c) => {
+ const config = await c.req.json()
+ try {
+ await queueService.saveMaintenanceConfig(config)
+ return c.json({ success: true })
+ } catch (err) {
+ return c.json({ error: 'Failed to save maintenance config' }, 500)
+ }
+ })
+
+ api.post('/alerts/config', async (c) => {
+ const config = await c.req.json()
+ try {
+ await queueService.alerts.saveConfig(config)
+ return c.json({ success: true })
+ } catch (err) {
+ return c.json({ error: 'Failed to save alert config' }, 500)
+ }
+ })
+
+ api.post('/alerts/rules', async (c) => {
+ const rule = await c.req.json()
+ try {
+ await queueService.alerts.addRule(rule)
+ return c.json({ success: true })
+ } catch (err) {
+ return c.json({ error: 'Failed to add rule' }, 500)
+ }
+ })
+
+ api.delete('/alerts/rules/:id', async (c) => {
+ const id = c.req.param('id')
+ try {
+ await queueService.alerts.deleteRule(id)
+ return c.json({ success: true })
+ } catch (err) {
+ return c.json({ error: 'Failed to delete rule' }, 500)
+ }
+ })
+
  api.post('/alerts/test', async (c) => {
  try {
+ const nodes = await pulseService.getNodes()
  queueService.alerts.check({
  queues: [],
+ nodes,
  workers: [
  {
  id: 'test-node',
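
Note: a sketch of registering one of the new node-level rules through the POST /alerts/rules endpoint added above. The AlertRule shape now comes from shared/types, which is not shown in this diff; the fields below follow the defaults and the removed interface, so treat them as assumptions.

    // Register a CPU alert rule; the 'node_cpu' check compares node.cpu.process against threshold.
    await fetch('/api/alerts/rules', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        id: 'node_cpu_high',
        name: 'High Node CPU',
        type: 'node_cpu',
        threshold: 85,        // fires when a node reports >= 85% process CPU
        cooldownMinutes: 15,
      }),
    })
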
@@ -1,57 +1,110 @@
  import { EventEmitter } from 'events'
+ import { Redis } from 'ioredis'
+ import nodemailer from 'nodemailer'
+ import type { AlertConfig, AlertEvent, AlertRule, PulseNode } from '../../shared/types'
  import type { WorkerReport } from './QueueService'
 
- export interface AlertRule {
- id: string
- name: string
- type: 'backlog' | 'failure' | 'worker_lost'
- threshold: number
- queue?: string // Optional: specific queue or all
- cooldownMinutes: number
- }
-
- export interface AlertEvent {
- ruleId: string
- timestamp: number
- message: string
- severity: 'warning' | 'critical'
- }
-
  export class AlertService {
+ private redis: Redis
  private rules: AlertRule[] = []
+ private config: AlertConfig = { channels: {} }
  private cooldowns: Map<string, number> = new Map()
- private webhookUrl: string | null = process.env.SLACK_WEBHOOK_URL || null
  private emitter = new EventEmitter()
+ private readonly RULES_KEY = 'gravito:zenith:alerts:rules'
+ private readonly CONFIG_KEY = 'gravito:zenith:alerts:config'
 
- constructor() {
- // Default Rules
+ constructor(redisUrl: string) {
+ this.redis = new Redis(redisUrl, {
+ lazyConnect: true,
+ })
+
+ // Initial default rules
  this.rules = [
  {
  id: 'global_failure_spike',
  name: 'High Failure Rate',
  type: 'failure',
- threshold: 50, // More than 50 failed jobs
+ threshold: 50,
  cooldownMinutes: 30,
  },
  {
  id: 'global_backlog_critical',
  name: 'Queue Backlog Warning',
  type: 'backlog',
- threshold: 1000, // More than 1000 waiting jobs
+ threshold: 1000,
  cooldownMinutes: 60,
  },
  {
  id: 'no_workers_online',
  name: 'All Workers Offline',
  type: 'worker_lost',
- threshold: 1, // < 1 worker
+ threshold: 1,
  cooldownMinutes: 15,
  },
  ]
+
+ // Default configuration (with env fallback for Slack)
+ if (process.env.SLACK_WEBHOOK_URL) {
+ this.config.channels.slack = {
+ enabled: true,
+ webhookUrl: process.env.SLACK_WEBHOOK_URL,
+ }
+ }
+
+ this.init().catch((err) => console.error('[AlertService] Failed to initialize:', err))
+ }
+
+ async connect() {
+ if (this.redis.status === 'wait') {
+ await this.redis.connect()
+ }
+ }
+
+ private async init() {
+ await this.loadRules()
+ await this.loadConfig()
+ }
+
+ async loadRules() {
+ try {
+ const data = await this.redis.get(this.RULES_KEY)
+ if (data) {
+ this.rules = JSON.parse(data)
+ }
+ } catch (err) {
+ console.error('[AlertService] Error loading rules from Redis:', err)
+ }
+ }
+
+ async loadConfig() {
+ try {
+ const data = await this.redis.get(this.CONFIG_KEY)
+ if (data) {
+ this.config = JSON.parse(data)
+ }
+ } catch (err) {
+ console.error('[AlertService] Error loading config from Redis:', err)
+ }
+ }
+
+ async saveRules(rules: AlertRule[]) {
+ this.rules = rules
+ await this.redis.set(this.RULES_KEY, JSON.stringify(rules))
+ }
+
+ async saveConfig(config: AlertConfig) {
+ this.config = config
+ await this.redis.set(this.CONFIG_KEY, JSON.stringify(config))
+ }
+
+ async addRule(rule: AlertRule) {
+ this.rules.push(rule)
+ await this.saveRules(this.rules)
  }
 
- setWebhook(url: string | null) {
- this.webhookUrl = url
+ async deleteRule(id: string) {
+ this.rules = this.rules.filter((r) => r.id !== id)
+ await this.saveRules(this.rules)
  }
 
  onAlert(callback: (event: AlertEvent) => void) {
@@ -61,10 +114,10 @@ export class AlertService {
 
  /**
  * Evaluates rules against provided data.
- * Extremely lightweight: only uses existing metrics data.
  */
  async check(data: {
  queues: any[]
+ nodes: Record<string, PulseNode[]>
  workers: WorkerReport[]
  totals: { waiting: number; delayed: number; failed: number }
  }) {
@@ -83,21 +136,35 @@ export class AlertService {
 
  // 2. Evaluate Rule
  switch (rule.type) {
- case 'backlog':
- if (data.totals.waiting >= rule.threshold) {
+ case 'backlog': {
+ const targetValue = rule.queue
+ ? data.queues.find((q) => q.name === rule.queue)?.waiting || 0
+ : data.totals.waiting
+
+ if (targetValue >= rule.threshold) {
  fired = true
  severity = 'critical'
- message = `Queue backlog detected: ${data.totals.waiting} jobs waiting across all queues.`
+ message = rule.queue
+ ? `Queue backlog on ${rule.queue}: ${targetValue} jobs waiting.`
+ : `Queue backlog detected: ${targetValue} jobs waiting across all queues.`
  }
  break
+ }
 
- case 'failure':
- if (data.totals.failed >= rule.threshold) {
+ case 'failure': {
+ const targetValue = rule.queue
+ ? data.queues.find((q) => q.name === rule.queue)?.failed || 0
+ : data.totals.failed
+
+ if (targetValue >= rule.threshold) {
  fired = true
  severity = 'warning'
- message = `High failure count: ${data.totals.failed} jobs are currently in failed state.`
+ message = rule.queue
+ ? `High failure count on ${rule.queue}: ${targetValue} jobs failed.`
+ : `High failure count: ${targetValue} jobs are currently in failed state.`
  }
  break
+ }
 
  case 'worker_lost':
  if (data.workers.length < rule.threshold) {
@@ -106,6 +173,36 @@ export class AlertService {
  message = `System Incident: Zero worker nodes detected! Jobs will not be processed.`
  }
  break
+
+ case 'node_cpu':
+ // Check all pulse nodes
+ for (const serviceNodes of Object.values(data.nodes)) {
+ for (const node of serviceNodes) {
+ if (node.cpu.process >= rule.threshold) {
+ fired = true
+ severity = 'warning'
+ message = `High CPU Usage on ${node.service} (${node.id}): ${node.cpu.process}%`
+ break
+ }
+ }
+ if (fired) break
+ }
+ break
+
+ case 'node_ram':
+ for (const serviceNodes of Object.values(data.nodes)) {
+ for (const node of serviceNodes) {
+ const usagePercent = (node.memory.process.rss / node.memory.system.total) * 100
+ if (usagePercent >= rule.threshold) {
+ fired = true
+ severity = 'warning'
+ message = `High RAM Usage on ${node.service} (${node.id}): ${usagePercent.toFixed(1)}%`
+ break
+ }
+ }
+ if (fired) break
+ }
+ break
  }
 
  // 3. Dispatch if fired
@@ -124,14 +221,26 @@ export class AlertService {
  }
  }
 
- /**
- * Send notification to external channels.
- * Fire-and-forget to ensure zero impact on main loop latency.
- */
- private notify(event: AlertEvent) {
- if (!this.webhookUrl) return
+ private async notify(event: AlertEvent) {
+ const { slack, discord, email } = this.config.channels
 
- // Simple Slack formatting
+ // 1. Notify Slack
+ if (slack?.enabled && slack.webhookUrl) {
+ this.sendToWebhook(slack.webhookUrl, 'Slack', event).catch(console.error)
+ }
+
+ // 2. Notify Discord
+ if (discord?.enabled && discord.webhookUrl) {
+ this.sendToWebhook(discord.webhookUrl, 'Discord', event).catch(console.error)
+ }
+
+ // 3. Notify Email
+ if (email?.enabled) {
+ this.sendEmail(email, event).catch(console.error)
+ }
+ }
+
+ private async sendToWebhook(url: string, platform: string, event: AlertEvent) {
  const payload = {
  text: `*Flux Console Alert [${event.severity.toUpperCase()}]*\n${event.message}\n_Time: ${new Date(event.timestamp).toISOString()}_`,
  attachments: [
@@ -140,21 +249,60 @@ export class AlertService {
  fields: [
  { title: 'Rule', value: event.ruleId, short: true },
  { title: 'Severity', value: event.severity, short: true },
+ { title: 'Platform', value: platform, short: true },
  ],
  },
  ],
  }
 
- fetch(this.webhookUrl, {
+ const res = await fetch(url, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify(payload),
- }).catch((err) => {
- console.error('[AlertService] Failed to send notification:', err.message)
+ })
+
+ if (!res.ok) {
+ throw new Error(`Failed to send to ${platform}: ${await res.text()}`)
+ }
+ }
+
+ private async sendEmail(config: any, event: AlertEvent) {
+ const transporter = nodemailer.createTransport({
+ host: config.smtpHost,
+ port: config.smtpPort,
+ secure: config.smtpPort === 465,
+ auth: {
+ user: config.smtpUser,
+ pass: config.smtpPass,
+ },
+ })
+
+ await transporter.sendMail({
+ from: config.from,
+ to: config.to,
+ subject: `[Zenith Alert] ${event.severity.toUpperCase()}: ${event.ruleId}`,
+ text: `${event.message}\n\nTimestamp: ${new Date(event.timestamp).toISOString()}\nSeverity: ${event.severity}`,
+ html: `
+ <div style="font-family: sans-serif; padding: 20px; border: 1px solid #eee; border-radius: 10px;">
+ <h2 style="color: ${event.severity === 'critical' ? '#ef4444' : '#f59e0b'}">
+ Zenith Alert: ${event.severity.toUpperCase()}
+ </h2>
+ <p style="font-size: 16px;">${event.message}</p>
+ <hr />
+ <p style="font-size: 12px; color: #666;">
+ Rule ID: ${event.ruleId}<br />
+ Time: ${new Date(event.timestamp).toISOString()}
+ </p>
+ </div>
+ `,
  })
  }
 
  getRules() {
  return this.rules
  }
+
+ getConfig() {
+ return this.config
+ }
  }
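
Note: the shared/types module imported at the top of AlertService is not included in this diff. A rough sketch of the shapes implied by how the fields are used in check() and notify(); this is an inference, not the published definitions.

    // Inferred from usage in AlertService; the actual shared/types definitions may differ.
    interface PulseNode {
      id: string
      service: string
      cpu: { process: number } // percentage, compared against the node_cpu threshold
      memory: { process: { rss: number }; system: { total: number } }
    }

    interface AlertConfig {
      channels: {
        slack?: { enabled: boolean; webhookUrl: string }
        discord?: { enabled: boolean; webhookUrl: string }
        email?: {
          enabled: boolean
          smtpHost: string
          smtpPort: number
          smtpUser: string
          smtpPass: string
          from: string
          to: string
        }
      }
    }
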