@gravito/zenith 0.1.0-beta.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +9 -0
  2. package/dist/bin.js +38846 -27303
  3. package/dist/client/assets/index-C332gZ-J.css +1 -0
  4. package/dist/client/assets/index-D4HibwTK.js +436 -0
  5. package/dist/client/index.html +2 -2
  6. package/dist/server/index.js +38846 -27303
  7. package/docs/ALERTING_GUIDE.md +71 -0
  8. package/docs/LARAVEL_ZENITH_ROADMAP.md +109 -0
  9. package/docs/QUASAR_MASTER_PLAN.md +140 -0
  10. package/package.json +52 -48
  11. package/scripts/debug_redis_keys.ts +24 -0
  12. package/specs/PULSE_SPEC.md +86 -0
  13. package/src/client/App.tsx +2 -0
  14. package/src/client/Layout.tsx +18 -0
  15. package/src/client/Sidebar.tsx +2 -1
  16. package/src/client/WorkerStatus.tsx +121 -76
  17. package/src/client/components/BrandIcons.tsx +138 -0
  18. package/src/client/components/ConfirmDialog.tsx +0 -1
  19. package/src/client/components/JobInspector.tsx +18 -6
  20. package/src/client/components/PageHeader.tsx +38 -0
  21. package/src/client/pages/OverviewPage.tsx +17 -20
  22. package/src/client/pages/PulsePage.tsx +478 -0
  23. package/src/client/pages/QueuesPage.tsx +1 -3
  24. package/src/client/pages/SettingsPage.tsx +640 -78
  25. package/src/client/pages/WorkersPage.tsx +71 -3
  26. package/src/client/pages/index.ts +1 -0
  27. package/src/server/index.ts +311 -11
  28. package/src/server/services/AlertService.ts +189 -41
  29. package/src/server/services/CommandService.ts +137 -0
  30. package/src/server/services/PulseService.ts +80 -0
  31. package/src/server/services/QueueService.ts +63 -6
  32. package/src/shared/types.ts +99 -0
  33. package/tsconfig.json +2 -2
  34. package/ARCHITECTURE.md +0 -88
  35. package/BATCH_OPERATIONS_IMPLEMENTATION.md +0 -159
  36. package/EVOLUTION_BLUEPRINT.md +0 -112
  37. package/JOBINSPECTOR_SCROLL_FIX.md +0 -152
  38. package/PULSE_IMPLEMENTATION_PLAN.md +0 -111
  39. package/TESTING_BATCH_OPERATIONS.md +0 -252
  40. package/dist/client/assets/index-DGYEwTDL.css +0 -1
  41. package/dist/client/assets/index-oyTdySX0.js +0 -421
  42. /package/{DEPLOYMENT.md → docs/DEPLOYMENT.md} +0 -0
  43. /package/{DOCS_INTERNAL.md → docs/DOCS_INTERNAL.md} +0 -0
  44. /package/{QUICK_TEST_GUIDE.md → docs/QUICK_TEST_GUIDE.md} +0 -0
  45. /package/{ROADMAP.md → docs/ROADMAP.md} +0 -0
package/src/server/services/AlertService.ts
@@ -1,57 +1,110 @@
  import { EventEmitter } from 'events'
+ import { Redis } from 'ioredis'
+ import nodemailer from 'nodemailer'
+ import type { AlertConfig, AlertEvent, AlertRule, PulseNode } from '../../shared/types'
  import type { WorkerReport } from './QueueService'

- export interface AlertRule {
-   id: string
-   name: string
-   type: 'backlog' | 'failure' | 'worker_lost'
-   threshold: number
-   queue?: string // Optional: specific queue or all
-   cooldownMinutes: number
- }
-
- export interface AlertEvent {
-   ruleId: string
-   timestamp: number
-   message: string
-   severity: 'warning' | 'critical'
- }
-
  export class AlertService {
+   private redis: Redis
    private rules: AlertRule[] = []
+   private config: AlertConfig = { channels: {} }
    private cooldowns: Map<string, number> = new Map()
-   private webhookUrl: string | null = process.env.SLACK_WEBHOOK_URL || null
    private emitter = new EventEmitter()
+   private readonly RULES_KEY = 'gravito:zenith:alerts:rules'
+   private readonly CONFIG_KEY = 'gravito:zenith:alerts:config'

-   constructor() {
-     // Default Rules
+   constructor(redisUrl: string) {
+     this.redis = new Redis(redisUrl, {
+       lazyConnect: true,
+     })
+
+     // Initial default rules
      this.rules = [
        {
          id: 'global_failure_spike',
          name: 'High Failure Rate',
          type: 'failure',
-         threshold: 50, // More than 50 failed jobs
+         threshold: 50,
          cooldownMinutes: 30,
        },
        {
          id: 'global_backlog_critical',
          name: 'Queue Backlog Warning',
          type: 'backlog',
-         threshold: 1000, // More than 1000 waiting jobs
+         threshold: 1000,
          cooldownMinutes: 60,
        },
        {
          id: 'no_workers_online',
          name: 'All Workers Offline',
          type: 'worker_lost',
-         threshold: 1, // < 1 worker
+         threshold: 1,
          cooldownMinutes: 15,
        },
      ]
+
+     // Default configuration (with env fallback for Slack)
+     if (process.env.SLACK_WEBHOOK_URL) {
+       this.config.channels.slack = {
+         enabled: true,
+         webhookUrl: process.env.SLACK_WEBHOOK_URL,
+       }
+     }
+
+     this.init().catch((err) => console.error('[AlertService] Failed to initialize:', err))
+   }
+
+   async connect() {
+     if (this.redis.status === 'wait') {
+       await this.redis.connect()
+     }
+   }
+
+   private async init() {
+     await this.loadRules()
+     await this.loadConfig()
+   }
+
+   async loadRules() {
+     try {
+       const data = await this.redis.get(this.RULES_KEY)
+       if (data) {
+         this.rules = JSON.parse(data)
+       }
+     } catch (err) {
+       console.error('[AlertService] Error loading rules from Redis:', err)
+     }
+   }
+
+   async loadConfig() {
+     try {
+       const data = await this.redis.get(this.CONFIG_KEY)
+       if (data) {
+         this.config = JSON.parse(data)
+       }
+     } catch (err) {
+       console.error('[AlertService] Error loading config from Redis:', err)
+     }
+   }
+
+   async saveRules(rules: AlertRule[]) {
+     this.rules = rules
+     await this.redis.set(this.RULES_KEY, JSON.stringify(rules))
+   }
+
+   async saveConfig(config: AlertConfig) {
+     this.config = config
+     await this.redis.set(this.CONFIG_KEY, JSON.stringify(config))
+   }
+
+   async addRule(rule: AlertRule) {
+     this.rules.push(rule)
+     await this.saveRules(this.rules)
    }

-   setWebhook(url: string | null) {
-     this.webhookUrl = url
+   async deleteRule(id: string) {
+     this.rules = this.rules.filter((r) => r.id !== id)
+     await this.saveRules(this.rules)
    }

    onAlert(callback: (event: AlertEvent) => void) {
@@ -61,10 +114,10 @@ export class AlertService {

    /**
     * Evaluates rules against provided data.
-    * Extremely lightweight: only uses existing metrics data.
     */
    async check(data: {
      queues: any[]
+     nodes: Record<string, PulseNode[]>
      workers: WorkerReport[]
      totals: { waiting: number; delayed: number; failed: number }
    }) {
@@ -83,21 +136,35 @@

        // 2. Evaluate Rule
        switch (rule.type) {
-         case 'backlog':
-           if (data.totals.waiting >= rule.threshold) {
+         case 'backlog': {
+           const targetValue = rule.queue
+             ? data.queues.find((q) => q.name === rule.queue)?.waiting || 0
+             : data.totals.waiting
+
+           if (targetValue >= rule.threshold) {
              fired = true
              severity = 'critical'
-             message = `Queue backlog detected: ${data.totals.waiting} jobs waiting across all queues.`
+             message = rule.queue
+               ? `Queue backlog on ${rule.queue}: ${targetValue} jobs waiting.`
+               : `Queue backlog detected: ${targetValue} jobs waiting across all queues.`
            }
            break
+         }

-         case 'failure':
-           if (data.totals.failed >= rule.threshold) {
+         case 'failure': {
+           const targetValue = rule.queue
+             ? data.queues.find((q) => q.name === rule.queue)?.failed || 0
+             : data.totals.failed
+
+           if (targetValue >= rule.threshold) {
              fired = true
              severity = 'warning'
-             message = `High failure count: ${data.totals.failed} jobs are currently in failed state.`
+             message = rule.queue
+               ? `High failure count on ${rule.queue}: ${targetValue} jobs failed.`
+               : `High failure count: ${targetValue} jobs are currently in failed state.`
            }
            break
+         }

          case 'worker_lost':
            if (data.workers.length < rule.threshold) {
@@ -106,6 +173,36 @@
              message = `System Incident: Zero worker nodes detected! Jobs will not be processed.`
            }
            break
+
+         case 'node_cpu':
+           // Check all pulse nodes
+           for (const serviceNodes of Object.values(data.nodes)) {
+             for (const node of serviceNodes) {
+               if (node.cpu.process >= rule.threshold) {
+                 fired = true
+                 severity = 'warning'
+                 message = `High CPU Usage on ${node.service} (${node.id}): ${node.cpu.process}%`
+                 break
+               }
+             }
+             if (fired) break
+           }
+           break
+
+         case 'node_ram':
+           for (const serviceNodes of Object.values(data.nodes)) {
+             for (const node of serviceNodes) {
+               const usagePercent = (node.memory.process.rss / node.memory.system.total) * 100
+               if (usagePercent >= rule.threshold) {
+                 fired = true
+                 severity = 'warning'
+                 message = `High RAM Usage on ${node.service} (${node.id}): ${usagePercent.toFixed(1)}%`
+                 break
+               }
+             }
+             if (fired) break
+           }
+           break
        }

        // 3. Dispatch if fired
@@ -124,14 +221,26 @@
      }
    }

-   /**
-    * Send notification to external channels.
-    * Fire-and-forget to ensure zero impact on main loop latency.
-    */
-   private notify(event: AlertEvent) {
-     if (!this.webhookUrl) return
+   private async notify(event: AlertEvent) {
+     const { slack, discord, email } = this.config.channels

-     // Simple Slack formatting
+     // 1. Notify Slack
+     if (slack?.enabled && slack.webhookUrl) {
+       this.sendToWebhook(slack.webhookUrl, 'Slack', event).catch(console.error)
+     }
+
+     // 2. Notify Discord
+     if (discord?.enabled && discord.webhookUrl) {
+       this.sendToWebhook(discord.webhookUrl, 'Discord', event).catch(console.error)
+     }
+
+     // 3. Notify Email
+     if (email?.enabled) {
+       this.sendEmail(email, event).catch(console.error)
+     }
+   }
+
+   private async sendToWebhook(url: string, platform: string, event: AlertEvent) {
      const payload = {
        text: `*Flux Console Alert [${event.severity.toUpperCase()}]*\n${event.message}\n_Time: ${new Date(event.timestamp).toISOString()}_`,
        attachments: [
@@ -140,21 +249,60 @@
            fields: [
              { title: 'Rule', value: event.ruleId, short: true },
              { title: 'Severity', value: event.severity, short: true },
+             { title: 'Platform', value: platform, short: true },
            ],
          },
        ],
      }

-     fetch(this.webhookUrl, {
+     const res = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
-     }).catch((err) => {
-       console.error('[AlertService] Failed to send notification:', err.message)
+     })
+
+     if (!res.ok) {
+       throw new Error(`Failed to send to ${platform}: ${await res.text()}`)
+     }
+   }
+
+   private async sendEmail(config: any, event: AlertEvent) {
+     const transporter = nodemailer.createTransport({
+       host: config.smtpHost,
+       port: config.smtpPort,
+       secure: config.smtpPort === 465,
+       auth: {
+         user: config.smtpUser,
+         pass: config.smtpPass,
+       },
+     })
+
+     await transporter.sendMail({
+       from: config.from,
+       to: config.to,
+       subject: `[Zenith Alert] ${event.severity.toUpperCase()}: ${event.ruleId}`,
+       text: `${event.message}\n\nTimestamp: ${new Date(event.timestamp).toISOString()}\nSeverity: ${event.severity}`,
+       html: `
+         <div style="font-family: sans-serif; padding: 20px; border: 1px solid #eee; border-radius: 10px;">
+           <h2 style="color: ${event.severity === 'critical' ? '#ef4444' : '#f59e0b'}">
+             Zenith Alert: ${event.severity.toUpperCase()}
+           </h2>
+           <p style="font-size: 16px;">${event.message}</p>
+           <hr />
+           <p style="font-size: 12px; color: #666;">
+             Rule ID: ${event.ruleId}<br />
+             Time: ${new Date(event.timestamp).toISOString()}
+           </p>
+         </div>
+       `,
      })
    }

    getRules() {
      return this.rules
    }
+
+   getConfig() {
+     return this.config
+   }
  }
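
Usage note (not part of the diff): a minimal sketch of driving the reworked AlertService, using only methods visible in the hunks above; the import path, Redis URL, and rule values are illustrative.

    import { AlertService } from './services/AlertService'

    // Rules now persist in Redis under gravito:zenith:alerts:rules,
    // so they survive restarts (previously they lived only in memory).
    const alerts = new AlertService('redis://localhost:6379')
    await alerts.connect()

    // Per-queue scoping via the optional `queue` field, as evaluated in check().
    await alerts.addRule({
      id: 'emails_backlog',
      name: 'Email Queue Backlog',
      type: 'backlog',
      threshold: 250,
      queue: 'emails',
      cooldownMinutes: 15,
    })

    alerts.onAlert((event) => console.log(event.severity, event.message))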
package/src/server/services/CommandService.ts (new file)
@@ -0,0 +1,137 @@
+ import type { CommandType, QuasarCommand } from '@gravito/quasar'
+ import { Redis } from 'ioredis'
+
+ /**
+  * CommandService handles sending commands from Zenith to Quasar agents.
+  *
+  * This is the "control center" that publishes commands to Redis Pub/Sub.
+  * Agents subscribe and execute commands locally.
+  */
+ export class CommandService {
+   private redis: Redis
+
+   constructor(redisUrl: string) {
+     this.redis = new Redis(redisUrl, {
+       lazyConnect: true,
+     })
+   }
+
+   async connect(): Promise<void> {
+     if (this.redis.status !== 'ready' && this.redis.status !== 'connecting') {
+       await this.redis.connect()
+     }
+   }
+
+   /**
+    * Send a command to a specific Quasar agent.
+    *
+    * @param service - Target service name
+    * @param nodeId - Target node ID (or '*' for broadcast)
+    * @param type - Command type
+    * @param payload - Command payload
+    * @returns Command ID
+    */
+   async sendCommand(
+     service: string,
+     nodeId: string,
+     type: CommandType,
+     payload: QuasarCommand['payload']
+   ): Promise<string> {
+     const commandId = crypto.randomUUID()
+
+     const command: QuasarCommand = {
+       id: commandId,
+       type,
+       targetNodeId: nodeId,
+       payload,
+       timestamp: Date.now(),
+       issuer: 'zenith',
+     }
+
+     const channel = `gravito:quasar:cmd:${service}:${nodeId}`
+
+     await this.redis.publish(channel, JSON.stringify(command))
+
+     console.log(`[CommandService] 📤 Sent ${type} to ${channel}`)
+     return commandId
+   }
+
+   /**
+    * Retry a job on a specific node.
+    */
+   async retryJob(
+     service: string,
+     nodeId: string,
+     queue: string,
+     jobKey: string,
+     driver: 'redis' | 'laravel' = 'redis'
+   ): Promise<string> {
+     return this.sendCommand(service, nodeId, 'RETRY_JOB', {
+       queue,
+       jobKey,
+       driver,
+     })
+   }
+
+   /**
+    * Delete a job on a specific node.
+    */
+   async deleteJob(
+     service: string,
+     nodeId: string,
+     queue: string,
+     jobKey: string,
+     driver: 'redis' | 'laravel' = 'redis'
+   ): Promise<string> {
+     return this.sendCommand(service, nodeId, 'DELETE_JOB', {
+       queue,
+       jobKey,
+       driver,
+     })
+   }
+
+   /**
+    * Broadcast a retry command to all nodes of a service.
+    */
+   async broadcastRetryJob(
+     service: string,
+     queue: string,
+     jobKey: string,
+     driver: 'redis' | 'laravel' = 'redis'
+   ): Promise<string> {
+     return this.retryJob(service, '*', queue, jobKey, driver)
+   }
+
+   /**
+    * Broadcast a delete command to all nodes of a service.
+    */
+   async broadcastDeleteJob(
+     service: string,
+     queue: string,
+     jobKey: string,
+     driver: 'redis' | 'laravel' = 'redis'
+   ): Promise<string> {
+     return this.deleteJob(service, '*', queue, jobKey, driver)
+   }
+
+   /**
+    * Send a Laravel-specific action (retry-all, restart) to a node.
+    */
+   async laravelAction(
+     service: string,
+     nodeId: string,
+     action: 'retry-all' | 'restart' | 'retry',
+     jobId?: string
+   ): Promise<string> {
+     return this.sendCommand(service, nodeId, 'LARAVEL_ACTION', {
+       queue: 'default',
+       jobKey: '*',
+       action,
+       jobId,
+     })
+   }
+
+   async disconnect(): Promise<void> {
+     await this.redis.quit()
+   }
+ }
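
Usage note (not part of the diff): the agent half of this protocol lives in @gravito/quasar and is not shown here. A hypothetical consumer, assuming only the channel naming and JSON payload visible in sendCommand() above; service and node names are illustrative.

    import { Redis } from 'ioredis'

    // Channel scheme from sendCommand(): gravito:quasar:cmd:<service>:<nodeId>.
    // Broadcasts are published to the literal ':*' channel, so a plain
    // SUBSCRIBE (not PSUBSCRIBE) on both channels is enough.
    const sub = new Redis('redis://localhost:6379')
    await sub.subscribe('gravito:quasar:cmd:billing:node-1', 'gravito:quasar:cmd:billing:*')

    sub.on('message', (_channel, raw) => {
      const command = JSON.parse(raw) // QuasarCommand
      // Dispatch on command.type: RETRY_JOB, DELETE_JOB, LARAVEL_ACTION, ...
    })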
package/src/server/services/PulseService.ts (new file)
@@ -0,0 +1,80 @@
+ import { Redis } from 'ioredis'
+ import type { PulseNode } from '../../shared/types'
+
+ export class PulseService {
+   private redis: Redis
+   private prefix = 'gravito:quasar:node:'
+
+   constructor(redisUrl: string) {
+     this.redis = new Redis(redisUrl, {
+       lazyConnect: true,
+     })
+   }
+
+   async connect() {
+     await this.redis.connect()
+   }
+
+   /**
+    * Discovers active Pulse nodes using SCAN.
+    */
+   async getNodes(): Promise<Record<string, PulseNode[]>> {
+     const nodes: PulseNode[] = []
+     let cursor = '0'
+     const now = Date.now()
+
+     do {
+       // Scan for pulse keys
+       const result = await this.redis.scan(cursor, 'MATCH', `${this.prefix}*`, 'COUNT', 100)
+       cursor = result[0]
+       const keys = result[1]
+
+       if (keys.length > 0) {
+         // Fetch values
+         const values = await this.redis.mget(...keys)
+
+         values.forEach((v) => {
+           if (v) {
+             try {
+               const node = JSON.parse(v) as PulseNode
+               // Filter out stale nodes if TTL didn't catch them yet (grace period 60s)
+               if (now - node.timestamp < 60000) {
+                 nodes.push(node)
+               }
+             } catch (_e) {
+               // Ignore malformed
+             }
+           }
+         })
+       }
+     } while (cursor !== '0')
+
+     // Group by service
+     const grouped: Record<string, PulseNode[]> = {}
+
+     // Sort nodes by service name, then by node id for stable UI positions
+     nodes.sort((a, b) => {
+       const sComp = a.service.localeCompare(b.service)
+       if (sComp !== 0) return sComp
+       return a.id.localeCompare(b.id)
+     })
+
+     for (const node of nodes) {
+       if (!grouped[node.service]) {
+         grouped[node.service] = []
+       }
+       grouped[node.service].push(node)
+     }
+
+     return grouped
+   }
+
+   /**
+    * Manually record a heartbeat (for this Zenith server itself).
+    */
+   async recordHeartbeat(node: PulseNode): Promise<void> {
+     const key = `${this.prefix}${node.service}:${node.id}`
+     // TTL 30 seconds
+     await this.redis.set(key, JSON.stringify(node), 'EX', 30)
+   }
+ }
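
Usage note (not part of the diff): a sketch of the heartbeat side of this service, assuming `pulse` is a connected PulseService instance. The PulseNode field names are inferred from the reads above (timestamp, cpu.process, memory.process.rss, memory.system.total); the real type lives in src/shared/types.ts and may carry more fields.

    import os from 'os'

    // Re-publish well inside the 30s TTL so getNodes() never sees us as stale.
    setInterval(() => {
      pulse
        .recordHeartbeat({
          service: 'zenith',
          id: os.hostname(),
          timestamp: Date.now(),
          cpu: { process: 0 }, // placeholder; a real agent samples process CPU
          memory: {
            process: { rss: process.memoryUsage().rss },
            system: { total: os.totalmem() },
          },
        } as any)
        .catch(console.error)
    }, 10_000)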
package/src/server/services/QueueService.ts
@@ -51,7 +51,7 @@ export class QueueService {
    private logThrottleReset = Date.now()
    private readonly MAX_LOGS_PER_SEC = 50
    private manager: QueueManager
-   public alerts = new AlertService()
+   public alerts: AlertService

    constructor(
      redisUrl: string,
@@ -84,10 +84,11 @@
        },
        persistence,
      })
+     this.alerts = new AlertService(redisUrl)
    }

    async connect() {
-     await Promise.all([this.redis.connect(), this.subRedis.connect()])
+     await Promise.all([this.redis.connect(), this.subRedis.connect(), this.alerts.connect()])

      // Setup single Redis subscription
      await this.subRedis.subscribe('flux_console:logs')
@@ -123,6 +124,58 @@
          }
        }
      })
+
+     // Start Maintenance Loop
+     this.runMaintenanceLoop()
+   }
+
+   private async runMaintenanceLoop() {
+     // Initial delay to avoid startup congestion
+     setTimeout(() => {
+       const loop = async () => {
+         try {
+           await this.checkMaintenance()
+         } catch (err) {
+           console.error('[Maintenance] Task Error:', err)
+         }
+         // Check every hour (3600000 ms)
+         setTimeout(loop, 3600000)
+       }
+       loop()
+     }, 1000 * 30) // 30 seconds after boot
+   }
+
+   private async checkMaintenance() {
+     const config = await this.getMaintenanceConfig()
+     if (!config.autoCleanup) return
+
+     const now = Date.now()
+     const lastRun = config.lastRun || 0
+     const ONE_DAY = 24 * 60 * 60 * 1000
+
+     if (now - lastRun >= ONE_DAY) {
+       console.log(
+         `[Maintenance] Starting Auto-Cleanup (Retention: ${config.retentionDays} days)...`
+       )
+       const deleted = await this.cleanupArchive(config.retentionDays)
+       console.log(`[Maintenance] Cleanup Complete. Removed ${deleted} records.`)
+
+       // Update Last Run
+       await this.saveMaintenanceConfig({
+         ...config,
+         lastRun: now,
+       })
+     }
+   }
+
+   async getMaintenanceConfig(): Promise<any> {
+     const data = await this.redis.get('gravito:zenith:maintenance:config')
+     if (data) return JSON.parse(data)
+     return { autoCleanup: false, retentionDays: 30 }
+   }
+
+   async saveMaintenanceConfig(config: any): Promise<void> {
+     await this.redis.set('gravito:zenith:maintenance:config', JSON.stringify(config))
    }

    /**
@@ -291,7 +344,10 @@
    /**
     * Records a snapshot of current global statistics for sparklines.
     */
-   async recordStatusMetrics(): Promise<void> {
+   async recordStatusMetrics(
+     nodes: Record<string, any> = {},
+     injectedWorkers?: any[]
+   ): Promise<void> {
      const stats = await this.listQueues()
      const totals = stats.reduce(
        (acc, q) => {
@@ -312,7 +368,7 @@
      pipe.set(`flux_console:metrics:failed:${now}`, totals.failed, 'EX', 3600)

      // Also record worker count
-     const workers = await this.listWorkers()
+     const workers = injectedWorkers || (await this.listWorkers())
      pipe.set(`flux_console:metrics:workers:${now}`, workers.length, 'EX', 3600)

      await pipe.exec()
@@ -328,7 +384,8 @@
      this.alerts
        .check({
          queues: stats,
-         workers,
+         nodes: nodes as any,
+         workers: workers as any,
          totals,
        })
        .catch((err) => console.error('[AlertService] Rule Evaluation Error:', err))
@@ -403,7 +460,7 @@
      }
    } while (cursor !== '0')

-     return workers
+     return workers.sort((a, b) => a.id.localeCompare(b.id))
    }

    /**
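
Usage note (not part of the diff): how the pieces above fit together. recordStatusMetrics() now accepts the node map so alerts.check() can evaluate the new node_cpu / node_ram rules. A hypothetical polling loop, assuming connected `queues: QueueService` and `pulse: PulseService` instances:

    setInterval(async () => {
      const nodes = await pulse.getNodes()
      // Forwarded into this.alerts.check({ queues, nodes, workers, totals })
      await queues.recordStatusMetrics(nodes)
    }, 5_000) // illustrative interval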