rrq 0.4.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two package versions as published to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
- rrq/cli.py +340 -91
- rrq/cli_commands/__init__.py +1 -0
- rrq/cli_commands/base.py +102 -0
- rrq/cli_commands/commands/__init__.py +1 -0
- rrq/cli_commands/commands/debug.py +551 -0
- rrq/cli_commands/commands/dlq.py +853 -0
- rrq/cli_commands/commands/jobs.py +516 -0
- rrq/cli_commands/commands/monitor.py +776 -0
- rrq/cli_commands/commands/queues.py +539 -0
- rrq/cli_commands/utils.py +161 -0
- rrq/client.py +39 -35
- rrq/constants.py +10 -0
- rrq/cron.py +75 -15
- rrq/hooks.py +217 -0
- rrq/job.py +5 -5
- rrq/registry.py +0 -3
- rrq/settings.py +13 -1
- rrq/store.py +333 -55
- rrq/worker.py +199 -139
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/METADATA +208 -24
- rrq-0.7.0.dist-info/RECORD +26 -0
- rrq-0.4.0.dist-info/RECORD +0 -16
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/WHEEL +0 -0
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/entry_points.txt +0 -0
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/licenses/LICENSE +0 -0
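Most of the new code lives under rrq/cli_commands/. Judging from the monitor module shown below, each command group is a class built on AsyncCommand (from rrq/cli_commands/base.py) that attaches its click commands to the shared CLI group via register(). The following is a minimal sketch of that wiring, not code taken from rrq/cli.py; in particular, the no-argument MonitorCommands() constructor is an assumption:

    import click

    from rrq.cli_commands.commands.monitor import MonitorCommands


    @click.group()
    def cli() -> None:
        """RRQ command-line interface (sketch only)."""


    # Each command class registers its subcommands on the shared click group.
    MonitorCommands().register(cli)

    if __name__ == "__main__":
        cli()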
rrq/cli_commands/commands/monitor.py (new file, +776 lines)
@@ -0,0 +1,776 @@
+"""Real-time monitoring dashboard for RRQ"""
+
+import asyncio
+from collections import defaultdict, deque
+from datetime import datetime
+from typing import Dict, List
+
+import click
+from rich.align import Align
+from rich.layout import Layout
+from rich.live import Live
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+
+from rrq.constants import (
+    HEALTH_KEY_PREFIX,
+    JOB_KEY_PREFIX,
+    QUEUE_KEY_PREFIX,
+    DLQ_KEY_PREFIX,
+)
+from rrq.cli_commands.base import AsyncCommand, load_app_settings, get_job_store
+from ..utils import (
+    console,
+    format_duration,
+    format_queue_name,
+    format_status,
+    format_timestamp,
+)
+
+# Error truncation lengths for consistency with DLQ commands
+ERROR_DISPLAY_LENGTH = 50  # For consistent display across DLQ and monitor
+
+
+class MonitorCommands(AsyncCommand):
+    """Real-time monitoring commands"""
+
+    def register(self, cli_group: click.Group) -> None:
+        """Register monitor commands"""
+
+        @cli_group.command("monitor")
+        @click.option(
+            "--settings",
+            "settings_object_path",
+            type=str,
+            help="Python settings path (e.g., myapp.settings.rrq_settings)",
+        )
+        @click.option(
+            "--refresh",
+            type=float,
+            default=1.0,
+            help="Refresh interval in seconds",
+        )
+        @click.option(
+            "--queues",
+            multiple=True,
+            help="Specific queues to monitor (default: all)",
+        )
+        def monitor(settings_object_path: str, refresh: float, queues: tuple):
+            """Launch real-time monitoring dashboard"""
+            self.make_async(self._monitor)(settings_object_path, refresh, queues)
+
+    async def _monitor(
+        self, settings_object_path: str, refresh: float, queues: tuple
+    ) -> None:
+        """Run the monitoring dashboard"""
+        settings = load_app_settings(settings_object_path)
+        dashboard = Dashboard(settings, refresh, queues)
+
+        try:
+            await dashboard.run()
+        except KeyboardInterrupt:
+            pass
+
+
+class Dashboard:
+    """Real-time monitoring dashboard"""
+
+    def __init__(self, settings, refresh_interval: float, queue_filter: tuple):
+        self.settings = settings
+        self.refresh_interval = refresh_interval
+        self.queue_filter = list(queue_filter) if queue_filter else None
+        self.job_store = None
+
+        # Metrics storage
+        self.queue_sizes = defaultdict(lambda: deque(maxlen=60))  # 60 data points
+        self.processing_rates = defaultdict(lambda: deque(maxlen=60))
+        self.error_counts = defaultdict(int)
+        self.dlq_stats = {"total_jobs": 0, "newest_error": None, "top_errors": {}}
+        self.last_update = datetime.now()
+
+        # Event streaming for real-time updates
+        self._last_event_id = "0"
+        self._event_buffer = deque(maxlen=100)
+
+    async def run(self):
+        """Run the dashboard"""
+        self.job_store = await get_job_store(self.settings)
+
+        try:
+            layout = self.create_layout()
+
+            with Live(
+                layout, refresh_per_second=1 / self.refresh_interval, console=console
+            ) as _:
+                while True:
+                    await self.update_metrics()
+                    self.update_layout(layout)
+                    await asyncio.sleep(self.refresh_interval)
+        finally:
+            await self.job_store.aclose()
+
+    def create_layout(self) -> Layout:
+        """Create the dashboard layout"""
+        layout = Layout(name="root")
+
+        layout.split_column(
+            Layout(name="header", size=3),
+            Layout(name="main"),
+            Layout(name="footer", size=3),
+        )
+
+        layout["main"].split_row(
+            Layout(name="queues", ratio=1),
+            Layout(name="workers", ratio=1),
+            Layout(name="dlq", ratio=1),
+        )
+
+        layout["queues"].split_column(
+            Layout(name="queue_stats", ratio=2),
+            Layout(name="queue_chart", ratio=1),
+        )
+
+        layout["workers"].split_column(
+            Layout(name="worker_list", ratio=2),
+            Layout(name="recent_jobs", ratio=1),
+        )
+
+        layout["dlq"].split_column(
+            Layout(name="dlq_stats", ratio=2),
+            Layout(name="dlq_errors", ratio=1),
+        )
+
+        return layout
+
+    async def update_metrics(self):
+        """Update all metrics using hybrid monitoring approach"""
+        try:
+            # Check for real-time events first
+            await self._process_monitoring_events()
+
+            # Use hybrid approach for queue/worker metrics
+            queue_data = await self._update_queue_metrics_optimized()
+            await self._update_worker_metrics_optimized()
+
+            # Update queue size tracking
+            for queue_name, size in queue_data.items():
+                self.queue_sizes[queue_name].append(size)
+
+            # Get recent job information (keep optimized scan)
+            self.recent_jobs = await self._get_recent_jobs_optimized()
+
+            # Get DLQ information
+            await self._update_dlq_stats()
+
+            self.last_update = datetime.now()
+        except Exception as e:
+            console.print(f"[red]Error updating metrics: {e}[/red]")
+            # Continue with cached data if available
+
+    async def _process_monitoring_events(self):
+        """Process real-time monitoring events from Redis streams"""
+        try:
+            events = await self.job_store.consume_monitor_events(
+                last_id=self._last_event_id,
+                count=50,
+                block=10,  # Short non-blocking read
+            )
+
+            for stream_name, event_list in events:
+                for event_id, event_data in event_list:
+                    # Update last processed event ID
+                    self._last_event_id = (
+                        event_id.decode() if isinstance(event_id, bytes) else event_id
+                    )
+
+                    # Process event based on type
+                    event_type = event_data.get(b"event_type", b"").decode()
+                    if event_type == "queue_activity":
+                        queue_name = event_data.get(b"queue_name", b"").decode()
+                        if queue_name:
+                            # Trigger immediate refresh for this queue
+                            await self._refresh_queue_size(queue_name)
+                    elif event_type == "worker_heartbeat":
+                        worker_id = event_data.get(b"worker_id", b"").decode()
+                        if worker_id:
+                            # Trigger immediate worker refresh
+                            await self._refresh_worker_status(worker_id)
+
+        except Exception:
+            # Events are optional - continue without them if there's an issue
+            pass
+
+    async def _refresh_queue_size(self, queue_name: str):
+        """Immediately refresh size for a specific queue"""
+        try:
+            queue_key = f"{QUEUE_KEY_PREFIX}{queue_name}"
+            size = await self.job_store.redis.zcard(queue_key)
+            self.queue_sizes[queue_name].append(size)
+        except Exception:
+            pass
+
+    async def _refresh_worker_status(self, worker_id: str):
+        """Immediately refresh status for a specific worker"""
+        try:
+            health_data, ttl = await self.job_store.get_worker_health(worker_id)
+            if health_data:
+                # Update worker in current list
+                for i, worker in enumerate(self.workers):
+                    if worker["id"] == worker_id:
+                        self.workers[i].update(
+                            {
+                                "status": health_data.get("status", "unknown"),
+                                "active_jobs": health_data.get("active_jobs", 0),
+                                "queues": health_data.get("queues", []),
+                                "last_heartbeat": health_data.get("timestamp"),
+                                "ttl": ttl,
+                            }
+                        )
+                        break
+        except Exception:
+            pass
+
+    async def _get_recent_jobs(self, limit: int = 10) -> List[Dict]:
+        """Get recently processed jobs"""
+        jobs = []
+        job_pattern = f"{JOB_KEY_PREFIX}*"
+
+        # Sample recent jobs
+        count = 0
+        async for key in self.job_store.redis.scan_iter(match=job_pattern):
+            if count >= limit * 2:  # Sample more to find recent ones
+                break
+
+            job_id = key.decode().replace(JOB_KEY_PREFIX, "")
+            job_dict = await self.job_store.get_job_data_dict(job_id)
+            if job_dict:
+                # Only include recently updated jobs
+                if "completed_at" in job_dict or "started_at" in job_dict:
+                    jobs.append(
+                        {
+                            "id": job_id,
+                            "function": job_dict.get("function_name", "unknown"),
+                            "status": job_dict.get("status", "unknown"),
+                            "started_at": float(job_dict.get("started_at", 0)),
+                            "completed_at": float(job_dict.get("completed_at", 0)),
+                        }
+                    )
+
+            count += 1
+
+        # Sort by most recent activity
+        jobs.sort(
+            key=lambda x: x.get("completed_at") or x.get("started_at") or 0,
+            reverse=True,
+        )
+
+        return jobs[:limit]
+
+    async def _update_queue_metrics_optimized(self) -> Dict[str, int]:
+        """Hybrid queue metrics collection using active registries and efficient batch operations"""
+        # Use the hybrid monitoring approach: get active queues from registry
+        try:
+            # Get recently active queues from the registry (O(log N) operation)
+            active_queue_names = await self.job_store.get_active_queues(
+                max_age_seconds=300
+            )
+
+            # Apply filtering if specified
+            if self.queue_filter:
+                active_queue_names = [
+                    q for q in active_queue_names if q in self.queue_filter
+                ]
+
+            # Use batch operation to get queue sizes efficiently
+            if active_queue_names:
+                queue_data = await self.job_store.batch_get_queue_sizes(
+                    active_queue_names
+                )
+            else:
+                queue_data = {}
+
+            # Fallback to legacy scan for first run or if no active queues found
+            if not queue_data:
+                queue_data = await self._legacy_scan_queue_metrics()
+
+        except Exception:
+            # Fallback to legacy scan on any error
+            queue_data = await self._legacy_scan_queue_metrics()
+
+        return queue_data
+
+    async def _legacy_scan_queue_metrics(self) -> Dict[str, int]:
+        """Legacy scan-based queue metrics as fallback"""
+        queue_keys = []
+        queue_pattern = f"{QUEUE_KEY_PREFIX}*"
+
+        # Perform limited scan (max 100 keys at a time)
+        scan_count = 0
+        try:
+            async for key in self.job_store.redis.scan_iter(
+                match=queue_pattern, count=50
+            ):
+                queue_keys.append(key)
+                scan_count += 1
+                if scan_count >= 100:  # Limit scan operations
+                    break
+        except TypeError:
+            # Handle mocks that don't support count parameter
+            async for key in self.job_store.redis.scan_iter(match=queue_pattern):
+                queue_keys.append(key)
+                scan_count += 1
+                if scan_count >= 100:  # Limit scan operations
+                    break
+
+        # Apply filtering early and get sizes individually (compatible with tests)
+        queue_data = {}
+        for key in queue_keys:
+            queue_name = key.decode().replace(QUEUE_KEY_PREFIX, "")
+            if not self.queue_filter or queue_name in self.queue_filter:
+                size = await self.job_store.redis.zcard(key)
+                queue_data[queue_name] = size
+
+        return queue_data
+
+    async def _update_worker_metrics_optimized(self):
+        """Hybrid worker metrics collection using active registries"""
+        try:
+            # Use the hybrid monitoring approach: get active workers from registry
+            active_worker_ids = await self.job_store.get_active_workers(
+                max_age_seconds=60
+            )
+
+            # Get worker health data efficiently
+            workers = []
+            for worker_id in active_worker_ids:
+                health_data, ttl = await self.job_store.get_worker_health(worker_id)
+
+                if health_data:
+                    workers.append(
+                        {
+                            "id": worker_id,
+                            "status": health_data.get("status", "unknown"),
+                            "active_jobs": health_data.get("active_jobs", 0),
+                            "queues": health_data.get("queues", []),
+                            "last_heartbeat": health_data.get("timestamp"),
+                            "ttl": ttl,
+                        }
+                    )
+
+            # Fallback to legacy scan if no active workers found
+            if not workers:
+                workers = await self._legacy_scan_worker_metrics()
+
+        except Exception:
+            # Fallback to legacy scan on any error
+            workers = await self._legacy_scan_worker_metrics()
+
+        self.workers = workers
+
+    async def _legacy_scan_worker_metrics(self) -> list:
+        """Legacy scan-based worker metrics as fallback"""
+        worker_keys = []
+        health_pattern = f"{HEALTH_KEY_PREFIX}*"
+
+        scan_count = 0
+        try:
+            async for key in self.job_store.redis.scan_iter(
+                match=health_pattern, count=50
+            ):
+                worker_keys.append(key)
+                scan_count += 1
+                if scan_count >= 50:  # Limit worker scans
+                    break
+        except TypeError:
+            # Handle mocks that don't support count parameter
+            async for key in self.job_store.redis.scan_iter(match=health_pattern):
+                worker_keys.append(key)
+                scan_count += 1
+                if scan_count >= 50:  # Limit worker scans
+                    break
+
+        # Get worker health individually (compatible with tests)
+        workers = []
+        for key in worker_keys:
+            worker_id = key.decode().replace(HEALTH_KEY_PREFIX, "")
+            health_data, ttl = await self.job_store.get_worker_health(worker_id)
+
+            if health_data:
+                workers.append(
+                    {
+                        "id": worker_id,
+                        "status": health_data.get("status", "unknown"),
+                        "active_jobs": health_data.get("active_jobs", 0),
+                        "queues": health_data.get("queues", []),
+                        "last_heartbeat": health_data.get("timestamp"),
+                        "ttl": ttl,
+                    }
+                )
+
+        return workers
+
+    async def _get_recent_jobs_optimized(self, limit: int = 10) -> List[Dict]:
+        """Optimized recent jobs collection with limited scanning"""
+        jobs = []
+        job_pattern = f"{JOB_KEY_PREFIX}*"
+
+        # Limit scan iterations more aggressively for recent jobs
+        job_keys = []
+        scan_count = 0
+        try:
+            async for key in self.job_store.redis.scan_iter(
+                match=job_pattern, count=20
+            ):
+                job_keys.append(key)
+                scan_count += 1
+                if scan_count >= limit * 3:  # Scan 3x the needed amount max
+                    break
+        except TypeError:
+            # Handle mocks that don't support count parameter
+            async for key in self.job_store.redis.scan_iter(match=job_pattern):
+                job_keys.append(key)
+                scan_count += 1
+                if scan_count >= limit * 3:  # Scan 3x the needed amount max
+                    break
+
+        # Get job data individually (compatible with tests)
+        if job_keys:
+            recent_jobs = []
+            for key in job_keys:
+                job_id = key.decode().replace(JOB_KEY_PREFIX, "")
+                job_dict = await self.job_store.get_job_data_dict(job_id)
+                if job_dict:
+                    try:
+                        # Only include recently updated jobs
+                        if "completed_at" in job_dict or "started_at" in job_dict:
+                            recent_jobs.append(
+                                {
+                                    "id": job_id,
+                                    "function": job_dict.get(
+                                        "function_name", "unknown"
+                                    ),
+                                    "status": job_dict.get("status", "unknown"),
+                                    "started_at": float(job_dict.get("started_at", 0)),
+                                    "completed_at": float(
+                                        job_dict.get("completed_at", 0)
+                                    ),
+                                }
+                            )
+                    except (ValueError, UnicodeDecodeError):
+                        continue
+
+            # Sort by most recent activity
+            recent_jobs.sort(
+                key=lambda x: x.get("completed_at") or x.get("started_at") or 0,
+                reverse=True,
+            )
+            jobs = recent_jobs[:limit]
+
+        return jobs
+
+    def update_layout(self, layout: Layout):
+        """Update the layout with current data"""
+        # Header
+        layout["header"].update(self._create_header())
+
+        # Queue stats
+        layout["queue_stats"].update(self._create_queue_stats())
+
+        # Queue chart
+        layout["queue_chart"].update(self._create_queue_chart())
+
+        # Worker list
+        layout["worker_list"].update(self._create_worker_list())
+
+        # Recent jobs
+        layout["recent_jobs"].update(self._create_recent_jobs())
+
+        # DLQ stats
+        layout["dlq_stats"].update(self._create_dlq_stats())
+
+        # DLQ errors
+        layout["dlq_errors"].update(self._create_dlq_errors())
+
+        # Footer
+        layout["footer"].update(self._create_footer())
+
+    def _create_header(self) -> Panel:
+        """Create header panel"""
+        header_text = Text()
+        header_text.append("RRQ Monitor", style="bold cyan")
+        header_text.append(" | ", style="dim")
+        header_text.append(
+            f"Last Update: {self.last_update.strftime('%H:%M:%S')}", style="dim"
+        )
+
+        return Panel(
+            Align.center(header_text),
+            style="cyan",
+        )
+
+    def _create_queue_stats(self) -> Panel:
+        """Create queue statistics table"""
+        table = Table(show_header=True, header_style="bold magenta", expand=True)
+        table.add_column("Queue", style="cyan")
+        table.add_column("Size", justify="right")
+        table.add_column("Trend", justify="center")
+        table.add_column("Rate", justify="right")
+
+        total_size = 0
+        for queue_name in sorted(self.queue_sizes.keys()):
+            sizes = list(self.queue_sizes[queue_name])
+            current_size = sizes[-1] if sizes else 0
+            total_size += current_size
+
+            # Calculate trend
+            trend = "→"
+            trend_style = "dim"
+            if len(sizes) >= 2:
+                diff = sizes[-1] - sizes[-2]
+                if diff > 0:
+                    trend = "↑"
+                    trend_style = "red"
+                elif diff < 0:
+                    trend = "↓"
+                    trend_style = "green"
+
+            # Calculate processing rate (jobs/min)
+            rate = "N/A"
+            if len(sizes) >= 10:
+                # Average change over last 10 samples
+                recent_changes = [sizes[i] - sizes[i - 1] for i in range(-9, 0)]
+                avg_change = sum(recent_changes) / len(recent_changes)
+                if avg_change < 0:  # Negative means jobs are being processed
+                    rate = f"{abs(avg_change * 60 / self.refresh_interval):.1f}/min"
+
+            table.add_row(
+                format_queue_name(queue_name),
+                str(current_size),
+                Text(trend, style=trend_style),
+                rate,
+            )
+
+        # Add total row
+        table.add_row(
+            Text("TOTAL", style="bold"),
+            Text(str(total_size), style="bold"),
+            "",
+            "",
+        )
+
+        return Panel(table, title="Queue Statistics", border_style="blue")
+
+    def _create_queue_chart(self) -> Panel:
+        """Create queue size sparkline chart"""
+        lines = []
+
+        for queue_name in sorted(self.queue_sizes.keys()):
+            sizes = list(self.queue_sizes[queue_name])
+            if not sizes:
+                continue
+
+            # Create sparkline
+            max_val = max(sizes) if sizes else 1
+            min_val = min(sizes) if sizes else 0
+
+            if max_val == min_val:
+                sparkline = "─" * 20
+            else:
+                sparkline = ""
+                for val in sizes[-20:]:  # Last 20 values
+                    normalized = (val - min_val) / (max_val - min_val)
+                    spark_chars = " ▁▂▃▄▅▆▇█"
+                    idx = int(normalized * (len(spark_chars) - 1))
+                    sparkline += spark_chars[idx]
+
+            line = f"{queue_name:>12}: {sparkline} [{sizes[-1] if sizes else 0}]"
+            lines.append(line)
+
+        content = "\n".join(lines) if lines else "No queue data"
+        return Panel(content, title="Queue Trends (60s)", border_style="green")
+
+    def _create_worker_list(self) -> Panel:
+        """Create worker status table"""
+        table = Table(show_header=True, header_style="bold magenta", expand=True)
+        table.add_column("Worker", style="cyan")
+        table.add_column("Status", justify="center")
+        table.add_column("Jobs", justify="right")
+        table.add_column("Heartbeat", style="dim")
+
+        if not self.workers:
+            table.add_row(
+                "[dim italic]No active workers[/dim italic]",
+                "",
+                "",
+                "",
+            )
+        else:
+            for worker in sorted(self.workers, key=lambda x: x["id"]):
+                # Status with color
+                status_colors = {
+                    "running": "green",
+                    "idle": "yellow",
+                    "stopped": "red",
+                    "initializing": "blue",
+                }
+                status_color = status_colors.get(worker["status"], "white")
+                status_text = Text(worker["status"].upper(), style=status_color)
+
+                # Worker ID (truncated)
+                worker_id = (
+                    worker["id"][:12] + "..."
+                    if len(worker["id"]) > 15
+                    else worker["id"]
+                )
+
+                table.add_row(
+                    worker_id,
+                    status_text,
+                    str(worker["active_jobs"]),
+                    format_timestamp(worker["last_heartbeat"]),
+                )
+
+        return Panel(table, title="Active Workers", border_style="blue")
+
+    def _create_recent_jobs(self) -> Panel:
+        """Create recent jobs table"""
+        table = Table(show_header=True, header_style="bold magenta", expand=True)
+        table.add_column("Job", style="cyan")
+        table.add_column("Function", style="yellow")
+        table.add_column("Status", justify="center")
+        table.add_column("Duration", justify="right")
+
+        if not self.recent_jobs:
+            table.add_row(
+                "[dim italic]No recent jobs[/dim italic]",
+                "",
+                "",
+                "",
+            )
+        else:
+            for job in self.recent_jobs[:5]:  # Show top 5
+                # Calculate duration
+                duration = None
+                if job.get("completed_at") and job.get("started_at"):
+                    duration = job["completed_at"] - job["started_at"]
+
+                # Truncate IDs
+                job_id = job["id"][:8] + "..."
+                function = (
+                    job["function"][:20] + "..."
+                    if len(job["function"]) > 20
+                    else job["function"]
+                )
+
+                table.add_row(
+                    job_id,
+                    function,
+                    format_status(job["status"]),
+                    format_duration(duration) if duration else "N/A",
+                )
+
+        return Panel(table, title="Recent Jobs", border_style="green")
+
+    def _create_footer(self) -> Panel:
+        """Create footer panel"""
+        footer_text = Text()
+        footer_text.append("Press ", style="dim")
+        footer_text.append("Ctrl+C", style="bold yellow")
+        footer_text.append(" to exit", style="dim")
+
+        return Panel(
+            Align.center(footer_text),
+            style="dim",
+        )
+
+    async def _update_dlq_stats(self):
+        """Update DLQ statistics"""
+        dlq_name = self.settings.default_dlq_name
+        dlq_key = f"{DLQ_KEY_PREFIX}{dlq_name}"
+
+        # Get total DLQ job count
+        self.dlq_stats["total_jobs"] = await self.job_store.redis.llen(dlq_key)
+
+        if self.dlq_stats["total_jobs"] > 0:
+            # Get some recent DLQ jobs for error analysis
+            job_ids = await self.job_store.redis.lrange(dlq_key, 0, 9)  # Get first 10
+            job_ids = [job_id.decode("utf-8") for job_id in job_ids]
+
+            errors = []
+            newest_time = 0
+
+            for job_id in job_ids:
+                job_data = await self.job_store.get_job(job_id)
+                if job_data:
+                    error = job_data.get("last_error", "Unknown error")
+                    completion_time = job_data.get("completion_time", 0)
+
+                    if isinstance(completion_time, str):
+                        try:
+                            from datetime import datetime
+
+                            completion_time = datetime.fromisoformat(
+                                completion_time.replace("Z", "+00:00")
+                            ).timestamp()
+                        except (ValueError, TypeError):
+                            completion_time = 0
+
+                    if completion_time > newest_time:
+                        newest_time = completion_time
+                        self.dlq_stats["newest_error"] = (
+                            error[:ERROR_DISPLAY_LENGTH] + "..."
+                            if len(error) > ERROR_DISPLAY_LENGTH
+                            else error
+                        )
+
+                    errors.append(
+                        error[:ERROR_DISPLAY_LENGTH] + "..."
+                        if len(error) > ERROR_DISPLAY_LENGTH
+                        else error
+                    )
+
+            # Count error types
+            error_counts = {}
+            for error in errors:
+                error_counts[error] = error_counts.get(error, 0) + 1
+
+            self.dlq_stats["top_errors"] = dict(
+                sorted(error_counts.items(), key=lambda x: x[1], reverse=True)[:5]
+            )
+        else:
+            self.dlq_stats["newest_error"] = None
+            self.dlq_stats["top_errors"] = {}
+
+    def _create_dlq_stats(self) -> Panel:
+        """Create DLQ statistics panel"""
+        table = Table(show_header=True, header_style="bold red", expand=True)
+        table.add_column("Metric", style="cyan")
+        table.add_column("Value", justify="right")
+
+        table.add_row("Total Jobs", str(self.dlq_stats["total_jobs"]))
+
+        if self.dlq_stats["newest_error"]:
+            table.add_row("Latest Error", self.dlq_stats["newest_error"])
+        else:
+            table.add_row("Latest Error", "None")
+
+        return Panel(
+            table,
+            title=f"Dead Letter Queue ({self.settings.default_dlq_name})",
+            border_style="red",
+        )
+
+    def _create_dlq_errors(self) -> Panel:
+        """Create DLQ error patterns panel"""
+        table = Table(show_header=True, header_style="bold red", expand=True)
+        table.add_column("Error Pattern", style="red")
+        table.add_column("Count", justify="right")
+
+        if self.dlq_stats["top_errors"]:
+            for error, count in self.dlq_stats["top_errors"].items():
+                table.add_row(error, str(count))
+        else:
+            table.add_row("No errors", "0")
+
+        return Panel(table, title="Top Error Patterns", border_style="red")
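For reference, the Dashboard class added above can also be driven directly, which is essentially what the `rrq monitor` command does. A minimal sketch, assuming an importable settings object; the `myapp.settings` module is hypothetical and mirrors the placeholder path in the `--settings` help text, and the `rrq` console-script name is an assumption (entry_points.txt is not shown in this diff):

    import asyncio

    from rrq.cli_commands.commands.monitor import Dashboard
    from myapp.settings import rrq_settings  # hypothetical RRQ settings instance

    # Roughly equivalent CLI invocation (console-script name assumed):
    #   rrq monitor --settings myapp.settings.rrq_settings --refresh 1.0 --queues default
    dashboard = Dashboard(rrq_settings, refresh_interval=1.0, queue_filter=("default",))

    try:
        asyncio.run(dashboard.run())  # refreshes until interrupted
    except KeyboardInterrupt:
        pass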