@gravito/zenith 0.1.0-beta.1 → 1.0.0
This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two released versions.
- package/CHANGELOG.md +9 -0
- package/dist/bin.js +38846 -27303
- package/dist/client/assets/index-C332gZ-J.css +1 -0
- package/dist/client/assets/index-D4HibwTK.js +436 -0
- package/dist/client/index.html +2 -2
- package/dist/server/index.js +38846 -27303
- package/docs/ALERTING_GUIDE.md +71 -0
- package/docs/LARAVEL_ZENITH_ROADMAP.md +109 -0
- package/docs/QUASAR_MASTER_PLAN.md +140 -0
- package/package.json +52 -48
- package/scripts/debug_redis_keys.ts +24 -0
- package/specs/PULSE_SPEC.md +86 -0
- package/src/client/App.tsx +2 -0
- package/src/client/Layout.tsx +18 -0
- package/src/client/Sidebar.tsx +2 -1
- package/src/client/WorkerStatus.tsx +121 -76
- package/src/client/components/BrandIcons.tsx +138 -0
- package/src/client/components/ConfirmDialog.tsx +0 -1
- package/src/client/components/JobInspector.tsx +18 -6
- package/src/client/components/PageHeader.tsx +38 -0
- package/src/client/pages/OverviewPage.tsx +17 -20
- package/src/client/pages/PulsePage.tsx +478 -0
- package/src/client/pages/QueuesPage.tsx +1 -3
- package/src/client/pages/SettingsPage.tsx +640 -78
- package/src/client/pages/WorkersPage.tsx +71 -3
- package/src/client/pages/index.ts +1 -0
- package/src/server/index.ts +311 -11
- package/src/server/services/AlertService.ts +189 -41
- package/src/server/services/CommandService.ts +137 -0
- package/src/server/services/PulseService.ts +80 -0
- package/src/server/services/QueueService.ts +63 -6
- package/src/shared/types.ts +99 -0
- package/tsconfig.json +2 -2
- package/ARCHITECTURE.md +0 -88
- package/BATCH_OPERATIONS_IMPLEMENTATION.md +0 -159
- package/EVOLUTION_BLUEPRINT.md +0 -112
- package/JOBINSPECTOR_SCROLL_FIX.md +0 -152
- package/PULSE_IMPLEMENTATION_PLAN.md +0 -111
- package/TESTING_BATCH_OPERATIONS.md +0 -252
- package/dist/client/assets/index-DGYEwTDL.css +0 -1
- package/dist/client/assets/index-oyTdySX0.js +0 -421
- /package/{DEPLOYMENT.md → docs/DEPLOYMENT.md} +0 -0
- /package/{DOCS_INTERNAL.md → docs/DOCS_INTERNAL.md} +0 -0
- /package/{QUICK_TEST_GUIDE.md → docs/QUICK_TEST_GUIDE.md} +0 -0
- /package/{ROADMAP.md → docs/ROADMAP.md} +0 -0
package/src/server/services/AlertService.ts

```diff
@@ -1,57 +1,110 @@
 import { EventEmitter } from 'events'
+import { Redis } from 'ioredis'
+import nodemailer from 'nodemailer'
+import type { AlertConfig, AlertEvent, AlertRule, PulseNode } from '../../shared/types'
 import type { WorkerReport } from './QueueService'
 
-export interface AlertRule {
-  id: string
-  name: string
-  type: 'backlog' | 'failure' | 'worker_lost'
-  threshold: number
-  queue?: string // Optional: specific queue or all
-  cooldownMinutes: number
-}
-
-export interface AlertEvent {
-  ruleId: string
-  timestamp: number
-  message: string
-  severity: 'warning' | 'critical'
-}
-
 export class AlertService {
+  private redis: Redis
   private rules: AlertRule[] = []
+  private config: AlertConfig = { channels: {} }
   private cooldowns: Map<string, number> = new Map()
-  private webhookUrl: string | null = process.env.SLACK_WEBHOOK_URL || null
   private emitter = new EventEmitter()
+  private readonly RULES_KEY = 'gravito:zenith:alerts:rules'
+  private readonly CONFIG_KEY = 'gravito:zenith:alerts:config'
 
-  constructor() {
-
+  constructor(redisUrl: string) {
+    this.redis = new Redis(redisUrl, {
+      lazyConnect: true,
+    })
+
+    // Initial default rules
     this.rules = [
       {
         id: 'global_failure_spike',
         name: 'High Failure Rate',
         type: 'failure',
-        threshold: 50,
+        threshold: 50,
         cooldownMinutes: 30,
       },
       {
         id: 'global_backlog_critical',
         name: 'Queue Backlog Warning',
         type: 'backlog',
-        threshold: 1000,
+        threshold: 1000,
         cooldownMinutes: 60,
       },
       {
         id: 'no_workers_online',
         name: 'All Workers Offline',
         type: 'worker_lost',
-        threshold: 1,
+        threshold: 1,
         cooldownMinutes: 15,
       },
     ]
+
+    // Default configuration (with env fallback for Slack)
+    if (process.env.SLACK_WEBHOOK_URL) {
+      this.config.channels.slack = {
+        enabled: true,
+        webhookUrl: process.env.SLACK_WEBHOOK_URL,
+      }
+    }
+
+    this.init().catch((err) => console.error('[AlertService] Failed to initialize:', err))
+  }
+
+  async connect() {
+    if (this.redis.status === 'wait') {
+      await this.redis.connect()
+    }
+  }
+
+  private async init() {
+    await this.loadRules()
+    await this.loadConfig()
+  }
+
+  async loadRules() {
+    try {
+      const data = await this.redis.get(this.RULES_KEY)
+      if (data) {
+        this.rules = JSON.parse(data)
+      }
+    } catch (err) {
+      console.error('[AlertService] Error loading rules from Redis:', err)
+    }
+  }
+
+  async loadConfig() {
+    try {
+      const data = await this.redis.get(this.CONFIG_KEY)
+      if (data) {
+        this.config = JSON.parse(data)
+      }
+    } catch (err) {
+      console.error('[AlertService] Error loading config from Redis:', err)
+    }
+  }
+
+  async saveRules(rules: AlertRule[]) {
+    this.rules = rules
+    await this.redis.set(this.RULES_KEY, JSON.stringify(rules))
+  }
+
+  async saveConfig(config: AlertConfig) {
+    this.config = config
+    await this.redis.set(this.CONFIG_KEY, JSON.stringify(config))
+  }
+
+  async addRule(rule: AlertRule) {
+    this.rules.push(rule)
+    await this.saveRules(this.rules)
   }
 
-
-    this.
+  async deleteRule(id: string) {
+    this.rules = this.rules.filter((r) => r.id !== id)
+    await this.saveRules(this.rules)
   }
 
   onAlert(callback: (event: AlertEvent) => void) {
@@ -61,10 +114,10 @@ export class AlertService {
 
   /**
    * Evaluates rules against provided data.
-   * Extremely lightweight: only uses existing metrics data.
    */
   async check(data: {
     queues: any[]
+    nodes: Record<string, PulseNode[]>
     workers: WorkerReport[]
     totals: { waiting: number; delayed: number; failed: number }
   }) {
@@ -83,21 +136,35 @@ export class AlertService {
 
       // 2. Evaluate Rule
       switch (rule.type) {
-        case 'backlog':
-
+        case 'backlog': {
+          const targetValue = rule.queue
+            ? data.queues.find((q) => q.name === rule.queue)?.waiting || 0
+            : data.totals.waiting
+
+          if (targetValue >= rule.threshold) {
             fired = true
             severity = 'critical'
-            message =
+            message = rule.queue
+              ? `Queue backlog on ${rule.queue}: ${targetValue} jobs waiting.`
+              : `Queue backlog detected: ${targetValue} jobs waiting across all queues.`
           }
           break
+        }
 
-        case 'failure':
-
+        case 'failure': {
+          const targetValue = rule.queue
+            ? data.queues.find((q) => q.name === rule.queue)?.failed || 0
+            : data.totals.failed
+
+          if (targetValue >= rule.threshold) {
             fired = true
             severity = 'warning'
-            message =
+            message = rule.queue
+              ? `High failure count on ${rule.queue}: ${targetValue} jobs failed.`
+              : `High failure count: ${targetValue} jobs are currently in failed state.`
           }
           break
+        }
 
         case 'worker_lost':
           if (data.workers.length < rule.threshold) {
@@ -106,6 +173,36 @@ export class AlertService {
             message = `System Incident: Zero worker nodes detected! Jobs will not be processed.`
           }
           break
+
+        case 'node_cpu':
+          // Check all pulse nodes
+          for (const serviceNodes of Object.values(data.nodes)) {
+            for (const node of serviceNodes) {
+              if (node.cpu.process >= rule.threshold) {
+                fired = true
+                severity = 'warning'
+                message = `High CPU Usage on ${node.service} (${node.id}): ${node.cpu.process}%`
+                break
+              }
+            }
+            if (fired) break
+          }
+          break
+
+        case 'node_ram':
+          for (const serviceNodes of Object.values(data.nodes)) {
+            for (const node of serviceNodes) {
+              const usagePercent = (node.memory.process.rss / node.memory.system.total) * 100
+              if (usagePercent >= rule.threshold) {
+                fired = true
+                severity = 'warning'
+                message = `High RAM Usage on ${node.service} (${node.id}): ${usagePercent.toFixed(1)}%`
+                break
+              }
+            }
+            if (fired) break
+          }
+          break
       }
 
       // 3. Dispatch if fired
@@ -124,14 +221,26 @@ export class AlertService {
     }
   }
 
-
-
-   * Fire-and-forget to ensure zero impact on main loop latency.
-   */
-  private notify(event: AlertEvent) {
-    if (!this.webhookUrl) return
+  private async notify(event: AlertEvent) {
+    const { slack, discord, email } = this.config.channels
 
-    //
+    // 1. Notify Slack
+    if (slack?.enabled && slack.webhookUrl) {
+      this.sendToWebhook(slack.webhookUrl, 'Slack', event).catch(console.error)
+    }
+
+    // 2. Notify Discord
+    if (discord?.enabled && discord.webhookUrl) {
+      this.sendToWebhook(discord.webhookUrl, 'Discord', event).catch(console.error)
+    }
+
+    // 3. Notify Email
+    if (email?.enabled) {
+      this.sendEmail(email, event).catch(console.error)
+    }
+  }
+
+  private async sendToWebhook(url: string, platform: string, event: AlertEvent) {
     const payload = {
       text: `*Flux Console Alert [${event.severity.toUpperCase()}]*\n${event.message}\n_Time: ${new Date(event.timestamp).toISOString()}_`,
       attachments: [
@@ -140,21 +249,60 @@ export class AlertService {
           fields: [
             { title: 'Rule', value: event.ruleId, short: true },
            { title: 'Severity', value: event.severity, short: true },
+            { title: 'Platform', value: platform, short: true },
           ],
         },
       ],
     }
 
-    fetch(
+    const res = await fetch(url, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
       body: JSON.stringify(payload),
-    })
-
+    })
+
+    if (!res.ok) {
+      throw new Error(`Failed to send to ${platform}: ${await res.text()}`)
+    }
+  }
+
+  private async sendEmail(config: any, event: AlertEvent) {
+    const transporter = nodemailer.createTransport({
+      host: config.smtpHost,
+      port: config.smtpPort,
+      secure: config.smtpPort === 465,
+      auth: {
+        user: config.smtpUser,
+        pass: config.smtpPass,
+      },
+    })
+
+    await transporter.sendMail({
+      from: config.from,
+      to: config.to,
+      subject: `[Zenith Alert] ${event.severity.toUpperCase()}: ${event.ruleId}`,
+      text: `${event.message}\n\nTimestamp: ${new Date(event.timestamp).toISOString()}\nSeverity: ${event.severity}`,
+      html: `
+        <div style="font-family: sans-serif; padding: 20px; border: 1px solid #eee; border-radius: 10px;">
+          <h2 style="color: ${event.severity === 'critical' ? '#ef4444' : '#f59e0b'}">
+            Zenith Alert: ${event.severity.toUpperCase()}
+          </h2>
+          <p style="font-size: 16px;">${event.message}</p>
+          <hr />
+          <p style="font-size: 12px; color: #666;">
+            Rule ID: ${event.ruleId}<br />
+            Time: ${new Date(event.timestamp).toISOString()}
+          </p>
+        </div>
+      `,
     })
   }
 
   getRules() {
     return this.rules
   }
+
+  getConfig() {
+    return this.config
+  }
 }
```
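For orientation, here is a minimal sketch of how the new Redis-backed alert configuration might be populated from application code. The exact `AlertConfig` and `AlertRule` shapes now live in `src/shared/types.ts`, which this diff does not show, so every field name below that the service itself does not read (and the import path) is an assumption.

```ts
// Illustrative sketch only; the real AlertConfig type is defined in
// src/shared/types.ts and is not part of this excerpt.
import { AlertService } from './src/server/services/AlertService'

async function configureAlerts(redisUrl: string) {
  const alerts = new AlertService(redisUrl)
  await alerts.connect()

  // Persist a multi-channel configuration under gravito:zenith:alerts:config.
  // Cast to any because the exact channel field names are assumed here.
  await alerts.saveConfig({
    channels: {
      slack: { enabled: true, webhookUrl: 'https://hooks.slack.com/services/…' },
      discord: { enabled: false, webhookUrl: '' },
      email: {
        enabled: true,
        smtpHost: 'smtp.example.com',
        smtpPort: 465,
        smtpUser: 'alerts@example.com',
        smtpPass: process.env.SMTP_PASS ?? '',
        from: 'alerts@example.com',
        to: 'oncall@example.com',
      },
    },
  } as any)

  // Add a per-queue rule on top of the built-in defaults.
  await alerts.addRule({
    id: 'emails_backlog',
    name: 'Email queue backlog',
    type: 'backlog',
    queue: 'emails',
    threshold: 500,
    cooldownMinutes: 15,
  })
}
```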
package/src/server/services/CommandService.ts

```diff
@@ -0,0 +1,137 @@
+import type { CommandType, QuasarCommand } from '@gravito/quasar'
+import { Redis } from 'ioredis'
+
+/**
+ * CommandService handles sending commands from Zenith to Quasar agents.
+ *
+ * This is the "control center" that publishes commands to Redis Pub/Sub.
+ * Agents subscribe and execute commands locally.
+ */
+export class CommandService {
+  private redis: Redis
+
+  constructor(redisUrl: string) {
+    this.redis = new Redis(redisUrl, {
+      lazyConnect: true,
+    })
+  }
+
+  async connect(): Promise<void> {
+    if (this.redis.status !== 'ready' && this.redis.status !== 'connecting') {
+      await this.redis.connect()
+    }
+  }
+
+  /**
+   * Send a command to a specific Quasar agent.
+   *
+   * @param service - Target service name
+   * @param nodeId - Target node ID (or '*' for broadcast)
+   * @param type - Command type
+   * @param payload - Command payload
+   * @returns Command ID
+   */
+  async sendCommand(
+    service: string,
+    nodeId: string,
+    type: CommandType,
+    payload: QuasarCommand['payload']
+  ): Promise<string> {
+    const commandId = crypto.randomUUID()
+
+    const command: QuasarCommand = {
+      id: commandId,
+      type,
+      targetNodeId: nodeId,
+      payload,
+      timestamp: Date.now(),
+      issuer: 'zenith',
+    }
+
+    const channel = `gravito:quasar:cmd:${service}:${nodeId}`
+
+    await this.redis.publish(channel, JSON.stringify(command))
+
+    console.log(`[CommandService] 📤 Sent ${type} to ${channel}`)
+    return commandId
+  }
+
+  /**
+   * Retry a job on a specific node.
+   */
+  async retryJob(
+    service: string,
+    nodeId: string,
+    queue: string,
+    jobKey: string,
+    driver: 'redis' | 'laravel' = 'redis'
+  ): Promise<string> {
+    return this.sendCommand(service, nodeId, 'RETRY_JOB', {
+      queue,
+      jobKey,
+      driver,
+    })
+  }
+
+  /**
+   * Delete a job on a specific node.
+   */
+  async deleteJob(
+    service: string,
+    nodeId: string,
+    queue: string,
+    jobKey: string,
+    driver: 'redis' | 'laravel' = 'redis'
+  ): Promise<string> {
+    return this.sendCommand(service, nodeId, 'DELETE_JOB', {
+      queue,
+      jobKey,
+      driver,
+    })
+  }
+
+  /**
+   * Broadcast a retry command to all nodes of a service.
+   */
+  async broadcastRetryJob(
+    service: string,
+    queue: string,
+    jobKey: string,
+    driver: 'redis' | 'laravel' = 'redis'
+  ): Promise<string> {
+    return this.retryJob(service, '*', queue, jobKey, driver)
+  }
+
+  /**
+   * Broadcast a delete command to all nodes of a service.
+   */
+  async broadcastDeleteJob(
+    service: string,
+    queue: string,
+    jobKey: string,
+    driver: 'redis' | 'laravel' = 'redis'
+  ): Promise<string> {
+    return this.deleteJob(service, '*', queue, jobKey, driver)
+  }
+
+  /**
+   * Send a Laravel-specific action (retry-all, restart) to a node.
+   */
+  async laravelAction(
+    service: string,
+    nodeId: string,
+    action: 'retry-all' | 'restart' | 'retry',
+    jobId?: string
+  ): Promise<string> {
+    return this.sendCommand(service, nodeId, 'LARAVEL_ACTION', {
+      queue: 'default',
+      jobKey: '*',
+      action,
+      jobId,
+    })
+  }
+
+  async disconnect(): Promise<void> {
+    await this.redis.quit()
+  }
+}
```
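The doc comment above describes the publishing side only; agents are expected to subscribe to the `gravito:quasar:cmd:<service>:<nodeId>` channels and act on the commands locally, and that agent code is not part of this package. A minimal sketch of calling the new service, with placeholder service, node, queue, and job identifiers:

```ts
// Sketch of the publishing side, using only methods shown in the diff above.
// 'api', 'node-1', 'emails', and 'failed:12345' are placeholders; '*' broadcasts.
import { CommandService } from './src/server/services/CommandService'

async function main(redisUrl: string) {
  const commands = new CommandService(redisUrl)
  await commands.connect()

  // Retry a single failed job on one node…
  await commands.retryJob('api', 'node-1', 'emails', 'failed:12345', 'redis')

  // …or broadcast a delete to every node of the service.
  await commands.broadcastDeleteJob('api', 'emails', 'failed:12345')

  await commands.disconnect()
}
```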
package/src/server/services/PulseService.ts

```diff
@@ -0,0 +1,80 @@
+import { Redis } from 'ioredis'
+import type { PulseNode } from '../../shared/types'
+
+export class PulseService {
+  private redis: Redis
+  private prefix = 'gravito:quasar:node:'
+
+  constructor(redisUrl: string) {
+    this.redis = new Redis(redisUrl, {
+      lazyConnect: true,
+    })
+  }
+
+  async connect() {
+    await this.redis.connect()
+  }
+
+  /**
+   * Discovers active Pulse nodes using SCAN.
+   */
+  async getNodes(): Promise<Record<string, PulseNode[]>> {
+    const nodes: PulseNode[] = []
+    let cursor = '0'
+    const now = Date.now()
+
+    do {
+      // Scan for pulse keys
+      const result = await this.redis.scan(cursor, 'MATCH', `${this.prefix}*`, 'COUNT', 100)
+      cursor = result[0]
+      const keys = result[1]
+
+      if (keys.length > 0) {
+        // Fetch values
+        const values = await this.redis.mget(...keys)
+
+        values.forEach((v) => {
+          if (v) {
+            try {
+              const node = JSON.parse(v) as PulseNode
+              // Filter out stale nodes if TTL didn't catch them yet (grace period 60s)
+              if (now - node.timestamp < 60000) {
+                nodes.push(node)
+              }
+            } catch (_e) {
+              // Ignore malformed
+            }
+          }
+        })
+      }
+    } while (cursor !== '0')
+
+    // Group by service
+    const grouped: Record<string, PulseNode[]> = {}
+
+    // Sort nodes by service name, then by node id for stable UI positions
+    nodes.sort((a, b) => {
+      const sComp = a.service.localeCompare(b.service)
+      if (sComp !== 0) return sComp
+      return a.id.localeCompare(b.id)
+    })
+
+    for (const node of nodes) {
+      if (!grouped[node.service]) {
+        grouped[node.service] = []
+      }
+      grouped[node.service].push(node)
+    }
+
+    return grouped
+  }
+
+  /**
+   * Manually record a heartbeat (for this Zenith server itself).
+   */
+  async recordHeartbeat(node: PulseNode): Promise<void> {
+    const key = `${this.prefix}${node.service}:${node.id}`
+    // TTL 30 seconds
+    await this.redis.set(key, JSON.stringify(node), 'EX', 30)
+  }
+}
```
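A hypothetical heartbeat emitter built on `recordHeartbeat` might look like the sketch below. Only the `PulseNode` fields that this diff actually reads (`id`, `service`, `timestamp`, `cpu.process`, `memory.process.rss`, `memory.system.total`) are filled in; the full type lives in `src/shared/types.ts` and may carry more data, hence the cast.

```ts
// Hypothetical heartbeat emitter; the CPU value is a stub, not a real sampler.
import os from 'node:os'
import { PulseService } from './src/server/services/PulseService'

async function startHeartbeat(redisUrl: string) {
  const pulse = new PulseService(redisUrl)
  await pulse.connect()

  setInterval(async () => {
    await pulse.recordHeartbeat({
      id: `${os.hostname()}:${process.pid}`,
      service: 'zenith',
      timestamp: Date.now(),
      cpu: { process: 0 }, // replace with a real CPU sampler
      memory: {
        process: { rss: process.memoryUsage().rss },
        system: { total: os.totalmem() },
      },
    } as any) // cast because the full PulseNode type is not shown in this diff
  }, 10_000) // well inside the 30 s key TTL
}
```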
package/src/server/services/QueueService.ts

```diff
@@ -51,7 +51,7 @@ export class QueueService {
   private logThrottleReset = Date.now()
   private readonly MAX_LOGS_PER_SEC = 50
   private manager: QueueManager
-  public alerts
+  public alerts: AlertService
 
   constructor(
     redisUrl: string,
@@ -84,10 +84,11 @@ export class QueueService {
       },
       persistence,
     })
+    this.alerts = new AlertService(redisUrl)
   }
 
   async connect() {
-    await Promise.all([this.redis.connect(), this.subRedis.connect()])
+    await Promise.all([this.redis.connect(), this.subRedis.connect(), this.alerts.connect()])
 
     // Setup single Redis subscription
     await this.subRedis.subscribe('flux_console:logs')
@@ -123,6 +124,58 @@ export class QueueService {
         }
       }
     })
+
+    // Start Maintenance Loop
+    this.runMaintenanceLoop()
+  }
+
+  private async runMaintenanceLoop() {
+    // Initial delay to avoid startup congestion
+    setTimeout(() => {
+      const loop = async () => {
+        try {
+          await this.checkMaintenance()
+        } catch (err) {
+          console.error('[Maintenance] Task Error:', err)
+        }
+        // Check every hour (3600000 ms)
+        setTimeout(loop, 3600000)
+      }
+      loop()
+    }, 1000 * 30) // 30 seconds after boot
+  }
+
+  private async checkMaintenance() {
+    const config = await this.getMaintenanceConfig()
+    if (!config.autoCleanup) return
+
+    const now = Date.now()
+    const lastRun = config.lastRun || 0
+    const ONE_DAY = 24 * 60 * 60 * 1000
+
+    if (now - lastRun >= ONE_DAY) {
+      console.log(
+        `[Maintenance] Starting Auto-Cleanup (Retention: ${config.retentionDays} days)...`
+      )
+      const deleted = await this.cleanupArchive(config.retentionDays)
+      console.log(`[Maintenance] Cleanup Complete. Removed ${deleted} records.`)
+
+      // Update Last Run
+      await this.saveMaintenanceConfig({
+        ...config,
+        lastRun: now,
+      })
+    }
+  }
+
+  async getMaintenanceConfig(): Promise<any> {
+    const data = await this.redis.get('gravito:zenith:maintenance:config')
+    if (data) return JSON.parse(data)
+    return { autoCleanup: false, retentionDays: 30 }
+  }
+
+  async saveMaintenanceConfig(config: any): Promise<void> {
+    await this.redis.set('gravito:zenith:maintenance:config', JSON.stringify(config))
   }
 
   /**
@@ -291,7 +344,10 @@ export class QueueService {
   /**
    * Records a snapshot of current global statistics for sparklines.
    */
-  async recordStatusMetrics(
+  async recordStatusMetrics(
+    nodes: Record<string, any> = {},
+    injectedWorkers?: any[]
+  ): Promise<void> {
     const stats = await this.listQueues()
     const totals = stats.reduce(
       (acc, q) => {
@@ -312,7 +368,7 @@ export class QueueService {
     pipe.set(`flux_console:metrics:failed:${now}`, totals.failed, 'EX', 3600)
 
     // Also record worker count
-    const workers = await this.listWorkers()
+    const workers = injectedWorkers || (await this.listWorkers())
     pipe.set(`flux_console:metrics:workers:${now}`, workers.length, 'EX', 3600)
 
     await pipe.exec()
@@ -328,7 +384,8 @@ export class QueueService {
     this.alerts
       .check({
         queues: stats,
-
+        nodes: nodes as any,
+        workers: workers as any,
         totals,
       })
       .catch((err) => console.error('[AlertService] Rule Evaluation Error:', err))
@@ -403,7 +460,7 @@ export class QueueService {
       }
     } while (cursor !== '0')
 
-    return workers
+    return workers.sort((a, b) => a.id.localeCompare(b.id))
   }
 
   /**
```