jettask 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. jettask/constants.py +213 -0
  2. jettask/core/app.py +525 -205
  3. jettask/core/cli.py +193 -185
  4. jettask/core/consumer_manager.py +126 -34
  5. jettask/core/context.py +3 -0
  6. jettask/core/enums.py +137 -0
  7. jettask/core/event_pool.py +501 -168
  8. jettask/core/message.py +147 -0
  9. jettask/core/offline_worker_recovery.py +181 -114
  10. jettask/core/task.py +10 -174
  11. jettask/core/task_batch.py +153 -0
  12. jettask/core/unified_manager_base.py +243 -0
  13. jettask/core/worker_scanner.py +54 -54
  14. jettask/executors/asyncio.py +184 -64
  15. jettask/webui/backend/config.py +51 -0
  16. jettask/webui/backend/data_access.py +2083 -92
  17. jettask/webui/backend/data_api.py +3294 -0
  18. jettask/webui/backend/dependencies.py +261 -0
  19. jettask/webui/backend/init_meta_db.py +158 -0
  20. jettask/webui/backend/main.py +1358 -69
  21. jettask/webui/backend/main_unified.py +78 -0
  22. jettask/webui/backend/main_v2.py +394 -0
  23. jettask/webui/backend/namespace_api.py +295 -0
  24. jettask/webui/backend/namespace_api_old.py +294 -0
  25. jettask/webui/backend/namespace_data_access.py +611 -0
  26. jettask/webui/backend/queue_backlog_api.py +727 -0
  27. jettask/webui/backend/queue_stats_v2.py +521 -0
  28. jettask/webui/backend/redis_monitor_api.py +476 -0
  29. jettask/webui/backend/unified_api_router.py +1601 -0
  30. jettask/webui/db_init.py +204 -32
  31. jettask/webui/frontend/package-lock.json +492 -1
  32. jettask/webui/frontend/package.json +4 -1
  33. jettask/webui/frontend/src/App.css +105 -7
  34. jettask/webui/frontend/src/App.jsx +49 -20
  35. jettask/webui/frontend/src/components/NamespaceSelector.jsx +166 -0
  36. jettask/webui/frontend/src/components/QueueBacklogChart.jsx +298 -0
  37. jettask/webui/frontend/src/components/QueueBacklogTrend.jsx +638 -0
  38. jettask/webui/frontend/src/components/QueueDetailsTable.css +65 -0
  39. jettask/webui/frontend/src/components/QueueDetailsTable.jsx +487 -0
  40. jettask/webui/frontend/src/components/QueueDetailsTableV2.jsx +465 -0
  41. jettask/webui/frontend/src/components/ScheduledTaskFilter.jsx +423 -0
  42. jettask/webui/frontend/src/components/TaskFilter.jsx +425 -0
  43. jettask/webui/frontend/src/components/TimeRangeSelector.css +21 -0
  44. jettask/webui/frontend/src/components/TimeRangeSelector.jsx +160 -0
  45. jettask/webui/frontend/src/components/layout/AppLayout.css +95 -0
  46. jettask/webui/frontend/src/components/layout/AppLayout.jsx +49 -0
  47. jettask/webui/frontend/src/components/layout/Header.css +34 -10
  48. jettask/webui/frontend/src/components/layout/Header.jsx +31 -23
  49. jettask/webui/frontend/src/components/layout/SideMenu.css +137 -0
  50. jettask/webui/frontend/src/components/layout/SideMenu.jsx +209 -0
  51. jettask/webui/frontend/src/components/layout/TabsNav.css +244 -0
  52. jettask/webui/frontend/src/components/layout/TabsNav.jsx +206 -0
  53. jettask/webui/frontend/src/components/layout/UserInfo.css +197 -0
  54. jettask/webui/frontend/src/components/layout/UserInfo.jsx +197 -0
  55. jettask/webui/frontend/src/contexts/NamespaceContext.jsx +72 -0
  56. jettask/webui/frontend/src/contexts/TabsContext.backup.jsx +245 -0
  57. jettask/webui/frontend/src/main.jsx +1 -0
  58. jettask/webui/frontend/src/pages/Alerts.jsx +684 -0
  59. jettask/webui/frontend/src/pages/Dashboard.jsx +1330 -0
  60. jettask/webui/frontend/src/pages/QueueDetail.jsx +1109 -10
  61. jettask/webui/frontend/src/pages/QueueMonitor.jsx +236 -115
  62. jettask/webui/frontend/src/pages/Queues.jsx +5 -1
  63. jettask/webui/frontend/src/pages/ScheduledTasks.jsx +809 -0
  64. jettask/webui/frontend/src/pages/Settings.jsx +800 -0
  65. jettask/webui/frontend/src/services/api.js +7 -5
  66. jettask/webui/frontend/src/utils/suppressWarnings.js +22 -0
  67. jettask/webui/frontend/src/utils/userPreferences.js +154 -0
  68. jettask/webui/multi_namespace_consumer.py +543 -0
  69. jettask/webui/pg_consumer.py +983 -246
  70. jettask/webui/static/dist/assets/index-7129cfe1.css +1 -0
  71. jettask/webui/static/dist/assets/index-8d1935cc.js +774 -0
  72. jettask/webui/static/dist/index.html +2 -2
  73. jettask/webui/task_center.py +216 -0
  74. jettask/webui/task_center_client.py +150 -0
  75. jettask/webui/unified_consumer_manager.py +193 -0
  76. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/METADATA +1 -1
  77. jettask-0.2.4.dist-info/RECORD +134 -0
  78. jettask/webui/pg_consumer_slow.py +0 -1099
  79. jettask/webui/pg_consumer_test.py +0 -678
  80. jettask/webui/static/dist/assets/index-823408e8.css +0 -1
  81. jettask/webui/static/dist/assets/index-9968b0b8.js +0 -543
  82. jettask/webui/test_pg_consumer_recovery.py +0 -547
  83. jettask/webui/test_recovery_simple.py +0 -492
  84. jettask/webui/test_self_recovery.py +0 -467
  85. jettask-0.2.1.dist-info/RECORD +0 -91
  86. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/WHEEL +0 -0
  87. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/entry_points.txt +0 -0
  88. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/licenses/LICENSE +0 -0
  89. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/top_level.txt +0 -0
jettask/webui/backend/data_api.py (new file)
@@ -0,0 +1,3294 @@
+ """
+ Data query API routes.
+ Every endpoint requires an explicit namespace parameter.
+ """
+ from fastapi import APIRouter, HTTPException, Query, Request
+ from typing import List, Dict, Optional
+ from pydantic import BaseModel
+ from datetime import datetime, timedelta, timezone
+ import logging
+ import time
+ import traceback
+ from sqlalchemy import text
+
+ from namespace_data_access import get_namespace_data_access
+
+ logger = logging.getLogger(__name__)
+
+ router = APIRouter(prefix="/api/data", tags=["data"])
+
+ # Global data-access instance
+ data_access = get_namespace_data_access()
+
+
+ async def handle_database_connection_error(e: Exception, namespace: str, operation_name: str):
+     """
+     Handle a database connection error: reset the connection and raise an appropriate HTTPException.
+
+     Args:
+         e: the original exception
+         namespace: namespace name
+         operation_name: operation name (used in the error message)
+     """
+     error_msg = str(e)
+
+     # Is this a connection-related error?
+     is_connection_error = (
+         "password authentication failed" in error_msg or
+         "connection failed" in error_msg or
+         "could not connect to server" in error_msg
+     )
+
+     if is_connection_error:
+         # Reset the connection so it can be re-initialized
+         try:
+             await data_access.reset_connection(namespace)
+             logger.info(f"Reset database connection for namespace {namespace}")
+         except Exception as reset_error:
+             logger.error(f"Failed to reset connection for namespace {namespace}: {reset_error}")
+
+         if "password authentication failed" in error_msg or "connection failed" in error_msg:
+             raise HTTPException(
+                 status_code=500,
+                 detail=f"Database connection for namespace '{namespace}' failed; check this namespace's database configuration"
+             )
+         else:
+             raise HTTPException(
+                 status_code=500,
+                 detail=f"Database service for namespace '{namespace}' is unavailable; check that the database is running"
+             )
+     else:
+         # Any other kind of error
+         raise HTTPException(status_code=500, detail=f"{operation_name} failed: {error_msg}")
+
+
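A minimal usage sketch (assumed, not part of the diff) of how the endpoints below delegate to this helper; the endpoint path is hypothetical:

    @router.get("/example/{namespace}")
    async def example_endpoint(namespace: str):
        try:
            return await data_access.get_queue_stats(namespace)
        except Exception as e:
            # Resets the cached connection on auth/connect failures, then re-raises as HTTPException
            await handle_database_connection_error(e, namespace, "example query")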
+ def build_queue_filter_and_params(queues: Optional[str] = None):
+     """
+     Build the queue filter clause and its bound parameters.
+
+     Args:
+         queues: comma-separated list of queue names
+
+     Returns:
+         tuple: (queue_filter_sql, queue_list, queue_params_dict)
+     """
+     queue_list = []
+     if queues:
+         queue_list = [q.strip() for q in queues.split(',') if q.strip()]
+
+     queue_filter = ""
+     queue_params = {}
+
+     if queue_list:
+         queue_placeholders = ','.join([f':queue_{i}' for i in range(len(queue_list))])
+         queue_filter = f"AND t.queue IN ({queue_placeholders})"
+
+         # Add one bound parameter per queue
+         for i, queue in enumerate(queue_list):
+             queue_params[f'queue_{i}'] = queue
+
+     return queue_filter, queue_list, queue_params
+
+
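For example, build_queue_filter_and_params("orders, emails") returns the fragment "AND t.queue IN (:queue_0,:queue_1)", the list ['orders', 'emails'], and the bound parameters {'queue_0': 'orders', 'queue_1': 'emails'}, which callers merge into the parameter dict of the surrounding query.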
+ class TimeRangeQuery(BaseModel):
+     start_time: Optional[datetime] = None
+     end_time: Optional[datetime] = None
+     time_range: Optional[str] = "15m"
+     queues: Optional[List[str]] = None
+     filters: Optional[List[Dict]] = None
+
+
+ class TimeRangeResult:
+     """Result of time-range resolution."""
+     def __init__(self, start_time: datetime, end_time: datetime, interval: str, interval_seconds: int, granularity: str):
+         self.start_time = start_time
+         self.end_time = end_time
+         self.interval = interval
+         self.interval_seconds = interval_seconds
+         self.granularity = granularity
+
+
+ def parse_time_range_query(query: TimeRangeQuery) -> TimeRangeResult:
+     """
+     Parse a TimeRangeQuery object and return the resolved time range and interval.
+
+     Args:
+         query: TimeRangeQuery object
+
+     Returns:
+         TimeRangeResult with start_time, end_time, interval, interval_seconds, granularity
+     """
+     end_time = datetime.now(timezone.utc)
+
+     if query.time_range and query.time_range != 'custom':
+         # Parse the time-range string
+         if query.time_range.endswith('m'):
+             minutes = int(query.time_range[:-1])
+             start_time = end_time - timedelta(minutes=minutes)
+         elif query.time_range.endswith('h'):
+             hours = int(query.time_range[:-1])
+             start_time = end_time - timedelta(hours=hours)
+         elif query.time_range.endswith('d'):
+             days = int(query.time_range[:-1])
+             start_time = end_time - timedelta(days=days)
+         else:
+             start_time = end_time - timedelta(minutes=15)
+     else:
+         # Use the custom time range, or the defaults
+         start_time = query.start_time or (end_time - timedelta(minutes=15))
+         end_time = query.end_time or end_time
+
+     # Compute the interval dynamically
+     return calculate_dynamic_interval(start_time, end_time)
+
+
+ def parse_time_range_string(time_range: str) -> TimeRangeResult:
+     """
+     Parse a time-range string and return the resolved time range and interval.
+
+     Args:
+         time_range: time-range string such as '1h', '30m', '7d'
+
+     Returns:
+         TimeRangeResult with start_time, end_time, interval, interval_seconds, granularity
+     """
+     end_time = datetime.now(timezone.utc)
+
+     if time_range.endswith('m'):
+         minutes = int(time_range[:-1])
+         start_time = end_time - timedelta(minutes=minutes)
+     elif time_range.endswith('h'):
+         hours = int(time_range[:-1])
+         start_time = end_time - timedelta(hours=hours)
+     elif time_range.endswith('d'):
+         days = int(time_range[:-1])
+         start_time = end_time - timedelta(days=days)
+     else:
+         start_time = end_time - timedelta(hours=1)
+
+     # Compute the interval dynamically
+     return calculate_dynamic_interval(start_time, end_time)
+
+
+ def calculate_dynamic_interval(start_time: datetime, end_time: datetime, target_points: int = 200) -> TimeRangeResult:
+     """
+     Pick a suitable bucket interval for the given time range.
+
+     Args:
+         start_time: range start
+         end_time: range end
+         target_points: target number of data points, 200 by default
+
+     Returns:
+         TimeRangeResult carrying the interval information
+     """
+     duration = (end_time - start_time).total_seconds()
+     ideal_interval_seconds = duration / target_points
+
+     # Choose the closest sensible interval
+     if ideal_interval_seconds <= 1:
+         interval_seconds = 1
+         interval = '1 second'
+         granularity = 'second'
+     elif ideal_interval_seconds <= 5:
+         interval_seconds = 5
+         interval = '5 seconds'
+         granularity = 'second'
+     elif ideal_interval_seconds <= 10:
+         interval_seconds = 10
+         interval = '10 seconds'
+         granularity = 'second'
+     elif ideal_interval_seconds <= 30:
+         interval_seconds = 30
+         interval = '30 seconds'
+         granularity = 'second'
+     elif ideal_interval_seconds <= 60:
+         interval_seconds = 60
+         interval = '1 minute'
+         granularity = 'minute'
+     elif ideal_interval_seconds <= 120:
+         interval_seconds = 120
+         interval = '2 minutes'
+         granularity = 'minute'
+     elif ideal_interval_seconds <= 300:
+         interval_seconds = 300
+         interval = '5 minutes'
+         granularity = 'minute'
+     elif ideal_interval_seconds <= 600:
+         interval_seconds = 600
+         interval = '10 minutes'
+         granularity = 'minute'
+     elif ideal_interval_seconds <= 1800:
+         interval_seconds = 1800
+         interval = '30 minutes'
+         granularity = 'minute'
+     elif ideal_interval_seconds <= 3600:
+         interval_seconds = 3600
+         interval = '1 hour'
+         granularity = 'hour'
+     else:
+         interval_seconds = 86400
+         interval = '1 day'
+         granularity = 'day'
+
+     return TimeRangeResult(start_time, end_time, interval, interval_seconds, granularity)
+
+
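Worked example: a 1-hour window with the default 200 target points gives an ideal interval of 3600 / 200 = 18 s, which the ladder rounds up to '30 seconds' buckets; a 24-hour window gives 432 s and selects '10 minutes'; anything coarser than an hour per point falls through to '1 day'.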
+ class QueueStatsResponse(BaseModel):
+     queue_name: str
+     length: int
+     consumer_groups: int
+     consumers: int
+     pending: int
+
+
+ class TaskDetailResponse(BaseModel):
+     id: str
+     status: str
+     name: str
+     queue: str
+     worker_id: Optional[str]
+     created_at: Optional[str]
+     started_at: Optional[str]
+     completed_at: Optional[str]
+     result: Optional[str]
+     error: Optional[str]
+     retry_count: int
+
+
+ class ScheduledTaskResponse(BaseModel):
+     id: int
+     name: str
+     queue: str
+     schedule: str
+     task_data: dict
+     enabled: bool
+     last_run_at: Optional[str]
+     next_run_at: Optional[str]
+     execution_count: int
+     created_at: Optional[str]
+     updated_at: Optional[str]
+     description: Optional[str]
+     max_retries: Optional[int]
+     retry_delay: Optional[int]
+     timeout: Optional[int]
+     priority: Optional[int]
+
+
+ @router.get("/namespaces", response_model=List[dict])
+ async def list_available_namespaces():
+     """List all available namespaces."""
+     try:
+         namespaces = await data_access.manager.list_namespaces()
+         # Return only the basic fields
+         return [
+             {
+                 'id': ns.get('id'),
+                 'name': ns.get('name'),
+                 'description': ns.get('description', ''),
+                 'created_at': ns.get('created_at')
+             }
+             for ns in namespaces
+         ]
+     except Exception as e:
+         logger.error(f"Failed to list namespaces: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
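A minimal wiring sketch (assumed; the application module and import path are hypothetical, not part of the diff):

    from fastapi import FastAPI
    from data_api import router

    app = FastAPI()
    app.include_router(router)
    # GET /api/data/namespaces now serves the basic namespace list above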
+ @router.get("/queues/{namespace}", response_model=List[QueueStatsResponse])
+ async def get_queue_stats(namespace: str):
+     """
+     Get queue statistics for the given namespace.
+
+     Args:
+         namespace: namespace name
+     """
+     try:
+         stats = await data_access.get_queue_stats(namespace)
+         return stats
+     except Exception as e:
+         logger.error(f"Failed to get queue stats: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/tasks/{namespace}/{task_id}", response_model=TaskDetailResponse)
+ async def get_task_detail(namespace: str, task_id: str):
+     """
+     Get the details of a task in the given namespace.
+
+     Args:
+         namespace: namespace name
+         task_id: task ID
+     """
+     try:
+         task = await data_access.get_task_detail(namespace, task_id)
+         if not task:
+             raise HTTPException(status_code=404, detail="Task not found")
+         return task
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Failed to get task detail: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/scheduled-tasks/{namespace}")
+ async def get_scheduled_tasks(
+     namespace: str,
+     limit: int = Query(100, description="Number of records to return"),
+     offset: int = Query(0, description="Offset")
+ ):
+     """
+     List the scheduled tasks in the given namespace.
+
+     Args:
+         namespace: namespace name
+         limit: number of records to return
+         offset: offset
+     """
+     try:
+         result = await data_access.get_scheduled_tasks(namespace, limit, offset)
+         return result
+     except Exception as e:
+         logger.error(f"Failed to list scheduled tasks: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/queue-history/{namespace}/{queue_name}")
+ async def get_queue_history(
+     namespace: str,
+     queue_name: str,
+     hours: int = Query(24, description="History window (hours)"),
+     interval: int = Query(1, description="Interval (hours)")
+ ):
+     """
+     Get historical data for a queue in the given namespace.
+
+     Args:
+         namespace: namespace name
+         queue_name: queue name
+         hours: history window in hours
+         interval: interval in hours
+     """
+     try:
+         history = await data_access.get_queue_history(
+             namespace, queue_name, hours, interval
+         )
+         return history
+     except Exception as e:
+         logger.error(f"Failed to get queue history: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post("/queue-timeline/{namespace}")
+ async def get_queue_timeline(namespace: str, query: TimeRangeQuery):
+     """
+     Get timeline data for the queues of the given namespace.
+
+     Args:
+         namespace: namespace name
+         query: time-range query parameters
+     """
+     try:
+         # Use the namespace from the path (TimeRangeQuery has no namespace attribute)
+
+         conn = await data_access.manager.get_connection(namespace)
+
+         # Resolve the time range with the shared helper
+         time_range_result = parse_time_range_query(query)
+         start_time = time_range_result.start_time
+         end_time = time_range_result.end_time
+
+         # Without a PostgreSQL config, return mock data
+         if not conn.pg_config:
+             # Generate a mock time series
+             timeline_data = []
+             duration = (end_time - start_time).total_seconds()
+             num_points = min(50, max(10, int(duration / 60)))  # 10-50 data points
+
+             for i in range(num_points):
+                 timestamp = start_time + timedelta(seconds=i * duration / num_points)
+                 for queue_name in (query.queues or ['default']):
+                     timeline_data.append({
+                         'time': timestamp.isoformat(),
+                         'queue': queue_name,
+                         'value': 100 + i * 2  # simulated growth
+                     })
+
+             return {
+                 "data": timeline_data,
+                 "granularity": "1m"
+             }
+
+         # Query the time series from PostgreSQL
+         async with await conn.get_pg_session() as session:
+             try:
+                 # Compute the interval dynamically, targeting about 200 time points
+                 duration = (end_time - start_time).total_seconds()
+                 TARGET_POINTS = 200  # target number of data points
+                 ideal_interval_seconds = duration / TARGET_POINTS
+
+                 # Normalize the interval to a sensible value
+                 if ideal_interval_seconds <= 1:
+                     interval = '1 second'
+                     granularity = 'second'
+                 elif ideal_interval_seconds <= 5:
+                     interval = '5 seconds'
+                     granularity = 'second'
+                 elif ideal_interval_seconds <= 10:
+                     interval = '10 seconds'
+                     granularity = 'second'
+                 elif ideal_interval_seconds <= 30:
+                     interval = '30 seconds'
+                     granularity = 'second'
+                 elif ideal_interval_seconds <= 60:
+                     interval = '1 minute'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 120:
+                     interval = '2 minutes'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 300:
+                     interval = '5 minutes'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 600:
+                     interval = '10 minutes'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 900:
+                     interval = '15 minutes'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 1800:
+                     interval = '30 minutes'
+                     granularity = 'minute'
+                 elif ideal_interval_seconds <= 3600:
+                     interval = '1 hour'
+                     granularity = 'hour'
+                 elif ideal_interval_seconds <= 7200:
+                     interval = '2 hours'
+                     granularity = 'hour'
+                 elif ideal_interval_seconds <= 14400:
+                     interval = '4 hours'
+                     granularity = 'hour'
+                 elif ideal_interval_seconds <= 21600:
+                     interval = '6 hours'
+                     granularity = 'hour'
+                 elif ideal_interval_seconds <= 43200:
+                     interval = '12 hours'
+                     granularity = 'hour'
+                 else:
+                     interval = '1 day'
+                     granularity = 'day'
+
+                 timeline_data = []
+
+                 # Interval in seconds, used for time-bucket alignment
+                 interval_seconds_map = {
+                     '1 second': 1, '5 seconds': 5, '10 seconds': 10, '30 seconds': 30,
+                     '1 minute': 60, '2 minutes': 120, '5 minutes': 300, '10 minutes': 600,
+                     '15 minutes': 900, '30 minutes': 1800, '1 hour': 3600, '2 hours': 7200,
+                     '4 hours': 14400, '6 hours': 21600, '12 hours': 43200, '1 day': 86400
+                 }
+                 interval_seconds = interval_seconds_map.get(interval, 60)  # default: 1 minute
+
+                 for queue_name in (query.queues or []):
+                     # Generate a time series aligned to fixed bucket boundaries.
+                     # Note: must match the base queue name and all of its priority queues.
+                     query_sql = text(f"""
+                         WITH time_series AS (
+                             -- Generate a series aligned to fixed boundaries
+                             SELECT generate_series(
+                                 to_timestamp(FLOOR(EXTRACT(epoch FROM CAST(:start_time AS timestamptz)) / {interval_seconds}) * {interval_seconds}),
+                                 to_timestamp(CEILING(EXTRACT(epoch FROM CAST(:end_time AS timestamptz)) / {interval_seconds}) * {interval_seconds} + {interval_seconds}),
+                                 CAST(:interval_val AS interval)
+                             ) AS time_bucket
+                         ),
+                         task_counts AS (
+                             SELECT
+                                 -- Align task timestamps to the same boundaries
+                                 to_timestamp(
+                                     FLOOR(EXTRACT(epoch FROM t.created_at) / {interval_seconds}) * {interval_seconds}
+                                 ) AS time_bucket,
+                                 COUNT(t.stream_id) as count,
+                                 COUNT(CASE WHEN t.stream_id NOT IN (SELECT stream_id FROM task_runs) THEN 1 END) as pending,
+                                 COUNT(CASE WHEN EXISTS (SELECT 1 FROM task_runs tr WHERE tr.stream_id = t.stream_id AND tr.status = 'pending') THEN 1 END) as processing
+                             FROM tasks t
+                             WHERE t.namespace = :namespace
+                                 -- Match the base queue name and all priority queues (e.g. shared_queue, shared_queue:0, shared_queue:5)
+                                 AND (t.queue = :queue_name OR t.queue LIKE :queue_pattern)
+                                 AND t.created_at >= :start_time
+                                 AND t.created_at <= :end_time
+                             GROUP BY 1
+                         )
+                         SELECT
+                             ts.time_bucket,
+                             COALESCE(tc.count, 0) as value,
+                             COALESCE(tc.pending, 0) as pending,
+                             COALESCE(tc.processing, 0) as processing
+                         FROM time_series ts
+                         LEFT JOIN task_counts tc ON ts.time_bucket = tc.time_bucket
+                         ORDER BY ts.time_bucket
+                     """)
+
+                     result = await session.execute(query_sql, {
+                         'namespace': namespace,
+                         'queue_name': queue_name,
+                         'queue_pattern': f'{queue_name}:%',  # match all priority queues
+                         'start_time': start_time,
+                         'end_time': end_time,
+                         'interval_val': interval
+                     })
+
+                     # Materialize the rows so indices are available
+                     rows = list(result)
+                     end_index = len(rows) - 1
+
+                     for idx, row in enumerate(rows):
+                         timeline_data.append({
+                             'time': row.time_bucket.isoformat(),
+                             'queue': queue_name,
+                             # Interior zero points become None so the chart interpolates; endpoints keep their value
+                             'value': (int(row.value) or None) if (idx > 0 and end_index != idx) else int(row.value)
+                         })
+
+                 return {
+                     "data": timeline_data,
+                     "granularity": granularity
+                 }
+
+             except Exception as e:
+                 logger.warning(f"Time-series query failed; falling back to the current snapshot: {e}")
+                 traceback.print_exc()
+
+                 # If the query fails, return the current Redis snapshot instead
+                 redis_client = await conn.get_redis_client()
+                 try:
+                     timeline_data = []
+                     for queue_name in (query.queues or []):
+                         queue_key = f"{conn.redis_prefix}:QUEUE:{queue_name}"
+                         queue_len = await redis_client.xlen(queue_key)
+
+                         timeline_data.append({
+                             'time': end_time.isoformat(),
+                             'queue': queue_name,
+                             'value': queue_len
+                         })
+
+                     return {
+                         "data": timeline_data,
+                         "granularity": "snapshot"
+                     }
+                 finally:
+                     await redis_client.aclose()
+
+     except Exception as e:
+         logger.error(f"Failed to get queue timeline: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
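The fixed-boundary alignment in both CTEs is plain epoch arithmetic. A worked example (values assumed for illustration): with 30-second buckets, a task created at epoch 1700000014 lands in to_timestamp(FLOOR(1700000014 / 30) * 30) = epoch 1700000010; generate_series is aligned the same way, so the LEFT JOIN matches buckets exactly and empty buckets still surface with a value of 0.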
+ @router.get("/queue-consumers/{namespace}/{queue_name}")
+ async def get_queue_consumers(namespace: str, queue_name: str):
+     """
+     Get consumer information for a queue in the given namespace.
+
+     Args:
+         namespace: namespace name
+         queue_name: queue name
+     """
+     try:
+         conn = await data_access.manager.get_connection(namespace)
+         redis_client = await conn.get_redis_client()
+
+         try:
+             queue_key = f"{conn.redis_prefix}:QUEUE:{queue_name}"
+
+             # Fetch consumer-group info
+             groups_info = await redis_client.xinfo_groups(queue_key)
+
+             result = {
+                 'queue_name': queue_name,
+                 'consumer_groups': []
+             }
+
+             for group in groups_info:
+                 # Fetch per-consumer details
+                 consumers_info = await redis_client.xinfo_consumers(queue_key, group['name'])
+
+                 group_data = {
+                     'name': group['name'],
+                     'consumers': group.get('consumers', 0),
+                     'pending': group.get('pending', 0),
+                     'last_delivered_id': group.get('last-delivered-id'),
+                     'consumer_details': []
+                 }
+
+                 for consumer in consumers_info:
+                     group_data['consumer_details'].append({
+                         'name': consumer.get('name'),
+                         'pending': consumer.get('pending', 0),
+                         'idle': consumer.get('idle', 0)
+                     })
+
+                 result['consumer_groups'].append(group_data)
+
+             return result
+
+         finally:
+             await redis_client.aclose()
+
+     except Exception as e:
+         logger.error(f"Failed to get consumer info: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/system-stats/{namespace}")
+ async def get_system_stats(namespace: str):
+     """
+     Get system-level statistics for the given namespace.
+
+     Args:
+         namespace: namespace name
+     """
+     try:
+         conn = await data_access.manager.get_connection(namespace)
+         redis_client = await conn.get_redis_client()
+
+         try:
+             # Count keys of each type
+             stats = {
+                 'namespace': namespace,
+                 'queues': 0,
+                 'tasks': 0,
+                 'delayed_tasks': 0,
+                 'workers': 0
+             }
+
+             # Count queues
+             queue_pattern = f"{conn.redis_prefix}:QUEUE:*"
+             async for _ in redis_client.scan_iter(match=queue_pattern):
+                 stats['queues'] += 1
+
+             # Count tasks
+             task_pattern = f"{conn.redis_prefix}:TASK:*"
+             async for _ in redis_client.scan_iter(match=task_pattern):
+                 stats['tasks'] += 1
+
+             # Count delayed tasks
+             delayed_pattern = f"{conn.redis_prefix}:DELAYED_QUEUE:*"
+             async for key in redis_client.scan_iter(match=delayed_pattern):
+                 count = await redis_client.zcard(key)
+                 stats['delayed_tasks'] += count
+
+             # Count workers
+             worker_pattern = f"{conn.redis_prefix}:WORKER:*"
+             async for _ in redis_client.scan_iter(match=worker_pattern):
+                 stats['workers'] += 1
+
+             return stats
+
+         finally:
+             await redis_client.aclose()
+
+     except Exception as e:
+         logger.error(f"Failed to get system stats: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post("/queue-details/{namespace}")
+ async def get_queue_details(namespace: str, query: TimeRangeQuery):
+     """
+     Get detailed information for the queues of the given namespace.
+
+     Args:
+         namespace: namespace name
+         query: query parameters (queue list and time range)
+     """
+     try:
+         conn = await data_access.manager.get_connection(namespace)
+         redis_client = await conn.get_redis_client()
+
+         # Resolve the time range
+         end_time = datetime.now(timezone.utc)
+         if query.time_range and query.time_range != 'custom':
+             # Parse the time-range string
+             if query.time_range.endswith('m'):
+                 minutes = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(minutes=minutes)
+             elif query.time_range.endswith('h'):
+                 hours = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(hours=hours)
+             elif query.time_range.endswith('d'):
+                 days = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(days=days)
+             else:
+                 start_time = end_time - timedelta(minutes=15)
+         else:
+             # Use the custom time range, or the defaults
+             start_time = query.start_time or (end_time - timedelta(minutes=15))
+             end_time = query.end_time or end_time
+
+         result = []
+
+         for queue_name in query.queues:
+             queue_key = f"{conn.redis_prefix}:QUEUE:{queue_name}"
+
+             # Queue length from Redis
+             queue_len = await redis_client.xlen(queue_key)
+
+             # Consumer-group info
+             try:
+                 groups_info = await redis_client.xinfo_groups(queue_key)
+                 consumer_groups = len(groups_info)
+                 total_consumers = sum(g.get('consumers', 0) for g in groups_info)
+
+                 # Messages claimed by consumers but not yet acknowledged (invisible messages)
+                 invisible_messages_count = 0
+                 for group in groups_info:
+                     try:
+                         consumers = await redis_client.xinfo_consumers(queue_key, group['name'])
+                         for consumer in consumers:
+                             invisible_messages_count += consumer.get('pending', 0)  # claimed but unacknowledged
+                     except:
+                         pass
+
+                 # Note: these variables are not used directly here any more;
+                 # visible_messages_count and invisible_messages_count are recomputed below
+                 # from the database's pending tasks.
+
+             except:
+                 consumer_groups = 0
+                 total_consumers = 0
+                 invisible_messages_count = 0
+                 # visible_messages_count is computed after the database data is fetched
+
+             # Count active workers
+             active_workers = 0
+             try:
+                 worker_keys = []
+                 async for key in redis_client.scan_iter(match=f"{conn.redis_prefix}:WORKER:*"):
+                     worker_keys.append(key)
+
+                 for worker_key in worker_keys:
+                     worker_info = await redis_client.hgetall(worker_key)
+                     if worker_info:
+                         last_heartbeat = worker_info.get('last_heartbeat')
+                         if last_heartbeat:
+                             try:
+                                 heartbeat_time = float(last_heartbeat)
+                                 if time.time() - heartbeat_time < 60:  # heartbeat within the last 60 s
+                                     worker_queues = worker_info.get('queues', '')
+                                     if queue_name in worker_queues:
+                                         active_workers += 1
+                             except:
+                                 pass
+             except Exception as e:
+                 logger.warning(f"Failed to count active workers: {e}")
+                 traceback.print_exc()
+
+             # Statistics from PostgreSQL
+             total_tasks = 0
+             pending_tasks_db = 0  # pending tasks according to the database
+             completed_tasks = 0
+             failed_tasks = 0
+             processing_tasks = 0
+             enqueue_rate = 0
+             dequeue_rate = 0
+             tasks_per_minute = 0
+
+             if conn.pg_config:
+                 try:
+                     async with await conn.get_pg_session() as session:
+                         # Task statistics query
+                         query_sql = text("""
+                             SELECT
+                                 COUNT(DISTINCT t.stream_id) as total,
+                                 COUNT(DISTINCT CASE WHEN t.stream_id NOT IN (SELECT stream_id FROM task_runs) THEN t.stream_id END) as pending,
+                                 COUNT(DISTINCT CASE WHEN tr.status = 'success' THEN t.stream_id END) as completed,
+                                 COUNT(DISTINCT CASE WHEN tr.status = 'error' THEN t.stream_id END) as failed,
+                                 COUNT(DISTINCT CASE WHEN tr.status = 'pending' THEN t.stream_id END) as processing,
+                                 COUNT(DISTINCT CASE WHEN t.created_at >= :recent_time THEN t.stream_id END) as recent_created,
+                                 COUNT(DISTINCT CASE WHEN tr.status = 'success' AND tr.end_time >= :recent_time THEN t.stream_id END) as recent_completed
+                             FROM tasks t
+                             LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
+                             WHERE t.namespace = :namespace
+                                 -- Match the base queue name and all priority queues
+                                 AND (t.queue = :queue_name OR t.queue LIKE :queue_pattern)
+                                 AND t.created_at >= :start_time
+                                 AND t.created_at <= :end_time
+                         """)
+
+                         # Timestamp one minute ago, used for rate calculations
+                         recent_time = end_time - timedelta(minutes=1)
+
+                         params = {
+                             'namespace': namespace,
+                             'queue_name': queue_name,
+                             'queue_pattern': f'{queue_name}:%',  # match all priority queues
+                             'start_time': start_time,
+                             'end_time': end_time,
+                             'recent_time': recent_time
+                         }
+                         print(f'PostgreSQL query params: {params}')
+
+                         result_db = await session.execute(query_sql, params)
+
+                         row = result_db.first()
+                         print(f'PostgreSQL query result: {row=}')
+
+                         # Extra debugging: count this queue's tasks regardless of the time range
+                         debug_query = text("SELECT COUNT(*) as count FROM tasks WHERE namespace = :namespace AND (queue = :queue_name OR queue LIKE :queue_pattern)")
+                         debug_result = await session.execute(debug_query, {
+                             'namespace': namespace,
+                             'queue_name': queue_name,
+                             'queue_pattern': f'{queue_name}:%'
+                         })
+                         debug_row = debug_result.first()
+                         print(f'Total tasks for this queue (all time): {debug_row.count if debug_row else 0}')
+                         if row:
+                             total_tasks = row.total or 0
+                             pending_tasks_db = row.pending or 0  # pending tasks according to the database
+                             completed_tasks = row.completed or 0
+                             failed_tasks = row.failed or 0
+                             processing_tasks = row.processing or 0
+
+                             # Rates, based on the last minute
+                             enqueue_rate = row.recent_created or 0
+                             dequeue_rate = row.recent_completed or 0
+
+                             # Average processing speed over the whole window
+                             time_diff_minutes = (end_time - start_time).total_seconds() / 60
+                             if time_diff_minutes > 0:
+                                 tasks_per_minute = round(total_tasks / time_diff_minutes, 2)
+
+                 except Exception as e:
+                     logger.warning(f"Database query failed: {e}")
+                     traceback.print_exc()
+
+             # Recompute visible and invisible messages as requested.
+             # The computation is based on the window's pending tasks in the database,
+             # while Redis's invisible_messages_count is real-time and may include tasks
+             # outside the window. To keep the logic consistent:
+             # - if the window has no pending tasks, both visible and invisible are 0;
+             # - otherwise, invisible is the smaller of the Redis value and the DB pending count.
+             if pending_tasks_db > 0:
+                 # Invisible messages cannot exceed the total number of pending tasks
+                 actual_invisible = min(invisible_messages_count, pending_tasks_db)
+                 visible_messages_count = pending_tasks_db - actual_invisible
+             else:
+                 visible_messages_count = 0
+                 actual_invisible = 0
+
+             result.append({
+                 'queue_name': queue_name,
+                 # Database statistics scoped to the query conditions
+                 'message_count': total_tasks,  # tasks matching the query (from the database)
+                 'visible_messages': visible_messages_count,  # visible = DB pending - min(Redis invisible, DB pending)
+                 'invisible_messages': actual_invisible,  # invisible = min(Redis invisible, DB pending)
+                 'processing': processing_tasks,  # tasks in 'processing' state in the database
+                 'queue_length': queue_len,  # real-time Redis queue length (kept for reference)
+
+                 # Historical statistics (from PostgreSQL), scoped to the selected window
+                 'completed': completed_tasks,  # completed within the window
+                 'failed': failed_tasks,  # failed within the window
+                 'consumption_rate': tasks_per_minute,  # consumption speed
+                 'enqueue_rate': enqueue_rate,  # enqueue rate
+                 'dequeue_rate': dequeue_rate,  # dequeue rate
+
+                 # Mixed data
+                 'success_rate': round((completed_tasks / total_tasks * 100) if total_tasks > 0 else 0, 2),  # success rate (historical)
+                 'queue_status': 'active' if queue_len > 0 or active_workers > 0 else 'idle',  # status (real-time)
+                 'active_workers': active_workers,  # active workers
+                 'consumer_groups': consumer_groups,  # consumer-group count
+                 'consumers': total_consumers,  # consumer count
+
+                 # Debug information (optional)
+                 'historical_tasks': total_tasks,  # tasks within the window (for debugging)
+             })
+
+         await redis_client.aclose()
+         return {'success': True, 'data': result}
+
+     except Exception as e:
+         logger.error(f"Failed to get queue details: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
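A worked example of the visible/invisible split above (numbers assumed): if the database reports 10 pending tasks in the window while Redis shows 14 claimed-but-unacknowledged messages, then actual_invisible = min(14, 10) = 10 and visible_messages = 0; with only 3 claimed messages, actual_invisible = 3 and visible_messages = 7.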
+ @router.post("/queue-flow-rates/{namespace}")
+ async def get_queue_flow_rates(namespace: str, query: TimeRangeQuery):
+     """
+     Get flow rates (enqueued, completed, failed) for the queues of the given namespace.
+
+     Args:
+         namespace: namespace name
+         query: time-range query parameters
+     """
+     try:
+         print(f'Request params: get_queue_flow_rates {namespace=}, {query=}')
+
+         # Resolve the time range with the shared helper
+         time_range_result = parse_time_range_query(query)
+         start_time = time_range_result.start_time
+         end_time = time_range_result.end_time
+
+         print(f'Time range: {start_time=}, {end_time=}')
+
+         # Namespace-scoped data access
+         conn = await data_access.manager.get_connection(namespace)
+
+         # Without a PostgreSQL config, return empty data
+         if not conn.pg_config:
+             return {"data": [], "granularity": "minute"}
+
+         async with await conn.get_pg_session() as session:
+             # If no queues were specified, use all of them
+             if not query.queues or len(query.queues) == 0:
+                 # Fetch all queue names
+                 queue_sql = text("""
+                     SELECT DISTINCT queue
+                     FROM tasks
+                     WHERE namespace = :namespace
+                     ORDER BY queue
+                 """)
+                 queue_result = await session.execute(queue_sql, {'namespace': namespace})
+                 all_queues = [row.queue for row in queue_result.fetchall()]
+
+                 print(f'All queues: {all_queues=}')
+
+                 if not all_queues:
+                     return {"data": [], "granularity": "minute"}
+
+                 # Aggregate flow data across all queues
+                 queue_conditions = "(" + " OR ".join([f"t.queue = '{queue}'" for queue in all_queues]) + ")"
+             else:
+                 # Use the specified queue (supports base names and priority queues)
+                 queue_name = query.queues[0]
+                 # Match the exact queue name or a priority-suffixed one (e.g. shared_queue or shared_queue:5)
+                 queue_conditions = f"(t.queue = '{queue_name}' OR t.queue LIKE '{queue_name}:%')"
+                 print(f'Specified queue: {queue_name=}, condition: {queue_conditions}')
+
+             # Use the precomputed interval information
+             interval = time_range_result.interval
+             interval_seconds = time_range_result.interval_seconds
+             granularity = time_range_result.granularity
+
+             # Flow-rate query
+             flow_sql = text(f"""
+                 WITH time_series AS (
+                     SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
+                     FROM generate_series(
+                         :start_time ::timestamptz,
+                         :end_time ::timestamptz + INTERVAL '{interval_seconds} seconds',
+                         :interval_val ::interval
+                     ) AS ts
+                 ),
+                 enqueue_counts AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM t.created_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) as enqueued
+                     FROM tasks t
+                     WHERE t.namespace = :namespace
+                         AND ({queue_conditions})
+                         AND t.created_at >= :start_time
+                         AND t.created_at <= :end_time
+                     GROUP BY time_bucket
+                 ),
+                 complete_counts AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM tr.end_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) as completed
+                     FROM task_runs tr
+                     JOIN tasks t ON tr.stream_id = t.stream_id
+                     WHERE t.namespace = :namespace
+                         AND ({queue_conditions})
+                         AND tr.end_time >= :start_time
+                         AND tr.end_time <= :end_time
+                         AND tr.status = 'success'
+                     GROUP BY time_bucket
+                 ),
+                 failed_counts AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM tr.end_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) as failed
+                     FROM task_runs tr
+                     JOIN tasks t ON tr.stream_id = t.stream_id
+                     WHERE t.namespace = :namespace
+                         AND ({queue_conditions})
+                         AND tr.end_time >= :start_time
+                         AND tr.end_time <= :end_time
+                         AND tr.status = 'error'
+                     GROUP BY time_bucket
+                 )
+                 SELECT
+                     ts.time_bucket,
+                     COALESCE(eq.enqueued, 0) as enqueued,
+                     COALESCE(cc.completed, 0) as completed,
+                     COALESCE(fc.failed, 0) as failed
+                 FROM time_series ts
+                 LEFT JOIN enqueue_counts eq ON ts.time_bucket = eq.time_bucket
+                 LEFT JOIN complete_counts cc ON ts.time_bucket = cc.time_bucket
+                 LEFT JOIN failed_counts fc ON ts.time_bucket = fc.time_bucket
+                 ORDER BY ts.time_bucket
+             """)
+
+             # First, check whether the tasks table has any data at all (ignoring the time range)
+             test_sql = text("""
+                 SELECT COUNT(*) as total_count,
+                     COUNT(CASE WHEN created_at >= :start_time AND created_at <= :end_time THEN 1 END) as range_count,
+                     MIN(created_at) as min_time,
+                     MAX(created_at) as max_time
+                 FROM tasks
+                 WHERE namespace = :namespace
+             """)
+             test_result = await session.execute(test_sql, {
+                 'namespace': namespace,
+                 'start_time': start_time,
+                 'end_time': end_time
+             })
+             test_row = test_result.fetchone()
+             print(f'tasks table - total rows: {test_row.total_count}, in range: {test_row.range_count}')
+             print(f'tasks table time span: {test_row.min_time} to {test_row.max_time}')
+
+             result = await session.execute(flow_sql, {
+                 'namespace': namespace,
+                 'start_time': start_time,
+                 'end_time': end_time,
+                 'interval_val': interval
+             })
+
+             # Format the data
+             data = []
+             rows = result.fetchall()
+             print(f'Result rows: {len(rows)}')
+             end_index = len(rows) - 1
+
+             for idx, row in enumerate(rows):
+                 time_point = row.time_bucket.isoformat()
+
+                 # Use the actual values even when they are 0,
+                 # to stay consistent with the tasks endpoint's behaviour
+                 data.append({'time': time_point, 'value': row.enqueued, 'metric': 'enqueue rate'})
+                 data.append({'time': time_point, 'value': row.completed, 'metric': 'completion rate'})
+                 data.append({'time': time_point, 'value': row.failed, 'metric': 'failures'})
+
+             return {"data": data, "granularity": granularity}
+
+         # The new implementation below is temporarily commented out
+         '''
+         # Get the namespace connection
+         conn = await data_access.manager.get_connection(namespace)
+
+         # Without a PostgreSQL config, return mock data
+         if not conn.pg_config:
+             # Generate mock data
+             end_time = datetime.now(timezone.utc)
+             start_time = end_time - timedelta(minutes=15)
+
+             data = []
+             num_points = 10
+             for i in range(num_points):
+                 timestamp = start_time + timedelta(seconds=i * 90)
+                 time_str = timestamp.isoformat()
+
+                 data.append({'time': time_str, 'value': 10 + i, 'metric': 'enqueue rate'})
+                 data.append({'time': time_str, 'value': 8 + i, 'metric': 'completion rate'})
+                 data.append({'time': time_str, 'value': 1, 'metric': 'failures'})
+
+             return {"data": data, "granularity": "minute"}
+
+         # Resolve the time range
+         end_time = datetime.now(timezone.utc)
+         if query.time_range and query.time_range != 'custom':
+             # Parse the time-range string
+             if query.time_range.endswith('m'):
+                 minutes = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(minutes=minutes)
+             elif query.time_range.endswith('h'):
+                 hours = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(hours=hours)
+             elif query.time_range.endswith('d'):
+                 days = int(query.time_range[:-1])
+                 start_time = end_time - timedelta(days=days)
+             else:
+                 start_time = end_time - timedelta(minutes=15)
+         else:
+             # Use the custom time range, or the defaults
+             start_time = query.start_time or (end_time - timedelta(minutes=15))
+             end_time = query.end_time or end_time
+
+         # Query the flow data from PostgreSQL
+         async with await conn.get_pg_session() as session:
+             # Compute the interval dynamically
+             duration = (end_time - start_time).total_seconds()
+             TARGET_POINTS = 200
+             ideal_interval_seconds = duration / TARGET_POINTS
+
+             # Choose a suitable interval
+             if ideal_interval_seconds <= 1:
+                 interval = '1 second'
+                 granularity = 'second'
+             elif ideal_interval_seconds <= 5:
+                 interval = '5 seconds'
+                 granularity = 'second'
+             elif ideal_interval_seconds <= 10:
+                 interval = '10 seconds'
+                 granularity = 'second'
+             elif ideal_interval_seconds <= 30:
+                 interval = '30 seconds'
+                 granularity = 'second'
+             elif ideal_interval_seconds <= 60:
+                 interval = '1 minute'
+                 granularity = 'minute'
+             elif ideal_interval_seconds <= 300:
+                 interval = '5 minutes'
+                 granularity = 'minute'
+             elif ideal_interval_seconds <= 600:
+                 interval = '10 minutes'
+                 granularity = 'minute'
+             elif ideal_interval_seconds <= 900:
+                 interval = '15 minutes'
+                 granularity = 'minute'
+             elif ideal_interval_seconds <= 1800:
+                 interval = '30 minutes'
+                 granularity = 'minute'
+             elif ideal_interval_seconds <= 3600:
+                 interval = '1 hour'
+                 granularity = 'hour'
+             else:
+                 interval = '1 day'
+                 granularity = 'day'
+
+             # Build the filter conditions
+             filter_conditions = []
+             if query.filters:
+                 for filter_item in query.filters:
+                     field = filter_item.get('field')
+                     operator = filter_item.get('operator')
+                     value = filter_item.get('value')
+
+                     if field and operator and value:
+                         if operator == 'eq':
+                             filter_conditions.append(f"AND {field} = '{value}'")
+                         elif operator == 'ne':
+                             filter_conditions.append(f"AND {field} != '{value}'")
+                         elif operator == 'contains':
+                             filter_conditions.append(f"AND {field} LIKE '%{value}%'")
+
+             extra_where = " ".join(filter_conditions)
+
+             # Pick the aggregation granularity from the interval
+             if granularity == 'second':
+                 if ideal_interval_seconds <= 1:
+                     time_trunc = 'second'
+                     interval_seconds = 1
+                 elif ideal_interval_seconds <= 5:
+                     time_trunc = '5 seconds'
+                     interval_seconds = 5
+                 elif ideal_interval_seconds <= 10:
+                     time_trunc = '10 seconds'
+                     interval_seconds = 10
+                 else:
+                     time_trunc = '30 seconds'
+                     interval_seconds = 30
+             elif granularity == 'minute':
+                 time_trunc = 'minute'
+                 interval_seconds = 60
+             elif granularity == 'hour':
+                 time_trunc = 'hour'
+                 interval_seconds = 3600
+             else:
+                 time_trunc = 'day'
+                 interval_seconds = 86400
+
+             # Query enqueue rate, completion rate, and failure count.
+             # Important: the generated series must be aligned to the same time buckets.
+             query_sql = text(f"""
+                 WITH time_series AS (
+                     SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
+                     FROM generate_series(
+                         :start_time ::timestamptz,
+                         :end_time ::timestamptz,
+                         :interval_val ::interval
+                     ) AS ts
+                 ),
+                 enqueued_rate AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM created_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) AS count
+                     FROM tasks
+                     WHERE namespace = :namespace
+                         AND queue_name = :queue_name
+                         AND created_at >= :start_time
+                         AND created_at <= :end_time
+                     GROUP BY 1
+                 ),
+                 completed_rate AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM completed_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) AS count
+                     FROM tasks
+                     WHERE namespace = :namespace
+                         AND queue_name = :queue_name
+                         AND completed_at >= :start_time
+                         AND completed_at <= :end_time
+                         AND status = 'success'
+                     GROUP BY 1
+                 ),
+                 failed_rate AS (
+                     SELECT
+                         to_timestamp(FLOOR(EXTRACT(epoch FROM completed_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
+                         COUNT(*) AS count
+                     FROM tasks
+                     WHERE namespace = :namespace
+                         AND queue_name = :queue_name
+                         AND completed_at >= :start_time
+                         AND completed_at <= :end_time
+                         AND status = 'error'
+                     GROUP BY 1
+                 )
+                 SELECT
+                     ts.time_bucket,
+                     COALESCE(e.count, 0) AS enqueued,
+                     COALESCE(c.count, 0) AS completed,
+                     COALESCE(f.count, 0) AS failed
+                 FROM time_series ts
+                 LEFT JOIN enqueued_rate e ON ts.time_bucket = e.time_bucket
+                 LEFT JOIN completed_rate c ON ts.time_bucket = c.time_bucket
+                 LEFT JOIN failed_rate f ON ts.time_bucket = f.time_bucket
+                 ORDER BY ts.time_bucket
+             """)
+
+             result = await session.execute(query_sql, {
+                 'namespace': namespace,
+                 'queue_name': queue_name,
+                 'start_time': start_time,
+                 'end_time': end_time,
+                 'interval_val': interval
+             })
+
+             # Format the data for the frontend
+             data = []
+             rows = result.fetchall()
+             end_index = len(rows) - 1
+
+             for idx, row in enumerate(rows):
+                 time_str = row.time_bucket.isoformat()
+
+                 # For interior points, turn 0 into None so the chart connects across gaps;
+                 # keep zeros only for the first and last points
+                 enqueued_val = row.enqueued if row.enqueued > 0 or idx == 0 or idx == end_index else None
+                 completed_val = row.completed if row.completed > 0 or idx == 0 or idx == end_index else None
+                 failed_val = row.failed if row.failed > 0 or idx == 0 or idx == end_index else None
+
+                 data.append({'time': time_str, 'value': enqueued_val, 'metric': 'enqueue rate'})
+                 data.append({'time': time_str, 'value': completed_val, 'metric': 'completion rate'})
+                 data.append({'time': time_str, 'value': failed_val, 'metric': 'failures'})
+
+             return {"data": data, "granularity": granularity}
+         '''
+
+     except Exception as e:
+         logger.error(f"Failed to get queue flow rates: {e}")
+         traceback.print_exc()
+
+         await handle_database_connection_error(e, namespace, "queue flow data query")
+
+
+ @router.get("/dashboard-stats/{namespace}")
+ async def get_dashboard_stats(
+     namespace: str,
+     time_range: str = "24h",
+     queues: Optional[str] = Query(None, description="Comma-separated list of queue names")
+ ):
+     """
+     Get dashboard statistics (task totals, successes, failures, success rate, throughput, and so on).
+
+     Args:
+         namespace: namespace name
+         time_range: time range (e.g. '1h', '24h', '7d')
+     """
+     try:
+         conn = await data_access.manager.get_connection(namespace)
+
+         # Without a PostgreSQL config, return empty data
+         if not conn.pg_config:
+             return {
+                 "success": True,
+                 "data": {
+                     "total_tasks": 0,
+                     "completed_tasks": 0,
+                     "failed_tasks": 0,
+                     "running_tasks": 0,
+                     "pending_tasks": 0,
+                     "success_rate": 0,
+                     "throughput": 0,
+                     "avg_processing_time": 0,
+                     "total_queues": 0
+                 }
+             }
+
+         # Resolve the time range
+         end_time = datetime.now(timezone.utc)
+         if time_range.endswith('m'):
+             minutes = int(time_range[:-1])
+             start_time = end_time - timedelta(minutes=minutes)
+         elif time_range.endswith('h'):
+             hours = int(time_range[:-1])
+             start_time = end_time - timedelta(hours=hours)
+         elif time_range.endswith('d'):
+             days = int(time_range[:-1])
+             start_time = end_time - timedelta(days=days)
+         else:
+             start_time = end_time - timedelta(hours=24)  # default: 24 hours
+
+         # Build the queue filter
+         queue_filter, queue_list, queue_params = build_queue_filter_and_params(queues)
+         print(f'🔍 Dashboard Stats - received queues param: {queues}')
+         print(f'🔍 Dashboard Stats - parsed queue list: {queue_list}')
+         print(f'🔍 Dashboard Stats - SQL filter: {queue_filter}')
+         print(f'🔍 Dashboard Stats - query params: {queue_params}')
+         async with await conn.get_pg_session() as session:
+             # Fetch task statistics.
+             # Fix: correctly identify pending tasks (rows in tasks with no task_runs row)
+             stats_sql = text(f"""
+                 WITH task_stats AS (
+                     SELECT
+                         t.stream_id,
+                         t.created_at,
+                         t.queue,
+                         tr.status,
+                         tr.execution_time,
+                         tr.end_time
+                     FROM tasks t
+                     LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
+                     WHERE t.namespace = :namespace
+                         AND t.created_at >= :start_time
+                         AND t.created_at <= :end_time
+                         {queue_filter}
+                 )
+                 SELECT
+                     COUNT(DISTINCT stream_id) as total_tasks,
+                     COUNT(DISTINCT CASE WHEN status = 'success' THEN stream_id END) as completed_tasks,
+                     COUNT(DISTINCT CASE WHEN status = 'error' THEN stream_id END) as failed_tasks,
+                     COUNT(DISTINCT CASE WHEN status = 'running' THEN stream_id END) as running_tasks,
+                     -- pending tasks: rows in tasks with no task_runs row (status IS NULL) or status = 'pending'
+                     COUNT(DISTINCT CASE WHEN status IS NULL OR status = 'pending' THEN stream_id END) as pending_tasks,
+                     COUNT(DISTINCT queue) as total_queues,
+                     AVG(CASE WHEN status = 'success' AND execution_time IS NOT NULL
+                         THEN execution_time END) as avg_execution_time
+                 FROM task_stats
+             """)
+
+             # Prepare the query parameters
+             query_params = {
+                 'namespace': namespace,
+                 'start_time': start_time,
+                 'end_time': end_time,
+                 **queue_params
+             }
+
+             print(f'🔍 Dashboard Stats - final SQL: {stats_sql}')
+             print(f'🔍 Dashboard Stats - final query params: {query_params}')
+
+             # Debugging: inspect the actual queue names in the database
+             debug_sql = text("""
+                 SELECT DISTINCT t.queue
+                 FROM tasks t
+                 WHERE t.namespace = :namespace
+                     AND t.created_at >= :start_time
+                     AND t.created_at <= :end_time
+                 LIMIT 10
+             """)
+             debug_result = await session.execute(debug_sql, {
+                 'namespace': namespace,
+                 'start_time': start_time,
+                 'end_time': end_time
+             })
+             actual_queues = [row.queue for row in debug_result.fetchall()]
+             print(f'🔍 Dashboard Stats - actual queue names in the database: {actual_queues}')
+
+             result = await session.execute(stats_sql, query_params)
+
+             row = result.first()
+
+             if row:
+                 total_tasks = row.total_tasks or 0
+                 completed_tasks = row.completed_tasks or 0
+                 failed_tasks = row.failed_tasks or 0
+                 pending_tasks = row.pending_tasks or 0
+                 total_queues = row.total_queues or 0
+                 avg_execution_time = row.avg_execution_time or 0
+
+                 # Compute the number of currently running tasks from task_runs (start_time <= now < end_time)
+                 running_tasks = 0
+                 try:
+                     running_sql = text(f"""
+                         SELECT COUNT(*) as total_running
+                         FROM task_runs tr
+                         JOIN tasks t ON tr.stream_id = t.stream_id
+                         WHERE t.namespace = :namespace
+                             AND tr.start_time IS NOT NULL
+                             AND tr.start_time <= NOW()
+                             AND (tr.end_time IS NULL OR tr.end_time > NOW())
+                             {queue_filter}
+                     """)
+
+                     running_result = await session.execute(running_sql, query_params)
+
+                     running_row = running_result.first()
+                     running_tasks = int(running_row.total_running) if running_row else 0
+
+                 except Exception as e:
+                     logger.warning(f"Failed to compute current concurrent task count; using the default: {e}")
+                     traceback.print_exc()
+                     running_tasks = row.running_tasks or 0
+
+                 # Compute the success rate
+                 success_rate = round((completed_tasks / total_tasks * 100) if total_tasks > 0 else 0, 1)
+
+                 # Compute throughput (tasks completed per minute).
+                 # Probe escalating time windows to find the most suitable basis.
+                 recent_end_time = datetime.now(timezone.utc)
+                 throughput = 0
+
+                 # Try several windows, short to long, until one has enough data
+                 time_windows = [
+                     (5, "last 5 minutes"),
+                     (10, "last 10 minutes"),
+                     (30, "last 30 minutes"),
+                     (60, "last 1 hour")
+                 ]
+
+                 for window_minutes, window_desc in time_windows:
+                     recent_start_time = recent_end_time - timedelta(minutes=window_minutes)
+
+                     recent_query = text(f"""
+                         SELECT COUNT(DISTINCT t.stream_id) as recent_completed
+                         FROM tasks t
+                         LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
+                         WHERE t.namespace = :namespace
+                             AND tr.status = 'success'
+                             AND tr.end_time >= :recent_start_time
+                             AND tr.end_time <= :recent_end_time
+                             {queue_filter}
+                     """)
+
+                     # Parameters for the throughput query
+                     throughput_params = {
+                         'namespace': namespace,
+                         'recent_start_time': recent_start_time,
+                         'recent_end_time': recent_end_time,
+                         **queue_params
+                     }
+
+                     recent_result = await session.execute(recent_query, throughput_params)
+
+                     recent_row = recent_result.first()
+                     recent_completed = recent_row.recent_completed if recent_row else 0
+
+                     print(f'🔍 Dashboard Stats - throughput window {window_desc}: {recent_completed} tasks completed')
+
+                     # If this window has enough data (at least 5 tasks), use it
+                     if recent_completed >= 5:
+                         throughput = round(recent_completed / window_minutes, 1)
+                         logger.info(f"Throughput from {window_desc}: {recent_completed} tasks / {window_minutes} min = {throughput} tasks/min")
+                         break
+                     elif recent_completed > 0:
+                         # With only a little data, still compute it but keep looking for a better window
+                         throughput = round(recent_completed / window_minutes, 1)
+
+                 # If no window had any data, throughput stays 0
+                 if throughput == 0:
+                     logger.info("No tasks completed in the last hour; throughput is 0")
+
+                 # Convert execution_time from seconds to milliseconds
+                 avg_processing_time = round(avg_execution_time * 1000 if avg_execution_time else 0, 1)
+
+                 # Also fetch the task-count distribution (grouped by queue, regardless of status)
+                 distribution_sql = text(f"""
+                     SELECT
+                         t.queue,
+                         COUNT(DISTINCT t.stream_id) as count
+                     FROM tasks t
+                     WHERE t.namespace = :namespace
+                         AND t.created_at >= :start_time
+                         AND t.created_at <= :end_time
+                         {queue_filter}
+                     GROUP BY t.queue
+                     ORDER BY count DESC, t.queue
+                 """)
+
+                 distribution_result = await session.execute(distribution_sql, query_params)
+
+                 # Format the distribution for a pie chart (by queue only, regardless of status)
+                 distribution_data = []
+
+                 for row in distribution_result.fetchall():
+                     queue = row.queue
+                     count = row.count
+
+                     if count > 0:
+                         distribution_data.append({
+                             'type': queue,  # use the queue name directly, no status suffix
+                             'value': count,
+                             'queue': queue,
+                             'status': 'all'  # all statuses
+                         })
+
+                 # Fall back to a default when there is no data
+                 if not distribution_data:
+                     distribution_data = [
+                         {'type': 'no data', 'value': 1, 'queue': '', 'status': 'empty'}
+                     ]
+
+                 return {
+                     "success": True,
+                     "data": {
+                         "total_tasks": total_tasks,
+                         "completed_tasks": completed_tasks,
+                         "failed_tasks": failed_tasks,
+                         "running_tasks": running_tasks,
+                         "pending_tasks": pending_tasks,
+                         "success_rate": success_rate,
+                         "throughput": throughput,
+                         "avg_processing_time": avg_processing_time,
+                         "total_queues": total_queues,
+                         "time_range": time_range,
+                         "start_time": start_time.isoformat(),
+                         "end_time": end_time.isoformat(),
+                         "task_distribution": distribution_data  # added: task status distribution data
+                     }
+                 }
+             else:
+                 return {
+                     "success": True,
+                     "data": {
+                         "total_tasks": 0,
+                         "completed_tasks": 0,
+                         "failed_tasks": 0,
+                         "running_tasks": 0,
+                         "pending_tasks": 0,
+                         "success_rate": 0,
+                         "throughput": 0,
+                         "avg_processing_time": 0,
+                         "total_queues": 0
+                     }
+                 }
+
+     except Exception as e:
+         logger.error(f"Failed to get dashboard stats: {e}")
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
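The escalating-window throughput loop reads as: prefer the shortest recent window with at least five completions. A worked example (numbers assumed): if the last 5 minutes saw 2 completions and the last 10 minutes saw 12, the loop first records a provisional 2 / 5 = 0.4 tasks/min, then settles on 12 / 10 = 1.2 tasks/min and breaks.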
1588
+ @router.get("/queue-backlog-trend/{namespace}")
1589
+ async def get_queue_backlog_trend(
1590
+ namespace: str,
1591
+ time_range: str = "1h",
1592
+ queue_name: Optional[str] = None,
1593
+ queues: Optional[str] = Query(None, description="逗号分隔的队列名称列表")
1594
+ ):
1595
+ """
1596
+ 获取队列积压趋势数据(排队任务数)
1597
+
1598
+ Args:
1599
+ namespace: 命名空间名称
1600
+ time_range: 时间范围(如'1h', '24h', '7d')
1601
+ queue_name: 队列名称(可选,不指定则获取所有队列的总积压)
1602
+ """
1603
+ try:
1604
+ conn = await data_access.manager.get_connection(namespace)
1605
+
1606
+ # 使用公共工具函数处理时间范围
1607
+ time_range_result = parse_time_range_string(time_range)
1608
+ start_time = time_range_result.start_time
1609
+ end_time = time_range_result.end_time
1610
+ interval = time_range_result.interval
1611
+ interval_seconds = time_range_result.interval_seconds
1612
+
1613
+ # 如果没有PostgreSQL配置,返回模拟数据
1614
+ if not conn.pg_config:
1615
+ # 生成模拟的排队任务数趋势
1616
+ data = []
1617
+ num_points = min(50, max(10, int((end_time - start_time).total_seconds() / interval_seconds)))
1618
+
1619
+ for i in range(num_points):
1620
+ timestamp = start_time + timedelta(seconds=i * interval_seconds)
1621
+ data.append({
1622
+ 'time': timestamp.isoformat(),
1623
+ 'value': max(0, 5 + int(2 * (0.5 - abs(i - num_points/2) / (num_points/2)))), # 模拟波动
1624
+ 'metric': '排队任务数'
1625
+ })
1626
+
1627
+ return {"data": data, "granularity": "minute"}
1628
+
1629
+ async with await conn.get_pg_session() as session:
1630
+ # 从stream_backlog_monitor表获取数据,使用pending_count字段
1631
+ try:
1632
+ backlog_sql = text(f"""
1633
+ WITH time_series AS (
1634
+ SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
1635
+ FROM generate_series(
1636
+ :start_time ::timestamptz,
1637
+ :end_time ::timestamptz,
1638
+ :interval_val ::interval
1639
+ ) AS ts
1640
+ ),
1641
+ backlog_data AS (
1642
+ SELECT
1643
+ to_timestamp(FLOOR(EXTRACT(epoch FROM created_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
1644
+ -- 使用pending_count字段,它表示实际的待处理任务数
1645
+ -- 如果没有pending_count,则使用0
1646
+ MAX(COALESCE(pending_count, 0)) as max_pending
1647
+ FROM stream_backlog_monitor
1648
+ WHERE namespace = :namespace
1649
+ AND created_at >= :start_time
1650
+ AND created_at <= :end_time
1651
+ {f"AND stream_name = :queue_name" if queue_name else ""}
1652
+ GROUP BY 1
1653
+ )
1654
+ SELECT
1655
+ ts.time_bucket,
1656
+ COALESCE(bd.max_pending, 0) as pending_value
1657
+ FROM time_series ts
1658
+ LEFT JOIN backlog_data bd ON ts.time_bucket = bd.time_bucket
1659
+ ORDER BY ts.time_bucket
1660
+ """)
1661
+
1662
+ params = {
1663
+ 'namespace': namespace,
1664
+ 'start_time': start_time,
1665
+ 'end_time': end_time,
1666
+ 'interval_val': interval
1667
+ }
1668
+ if queue_name:
1669
+ params['queue_name'] = queue_name
1670
+
1671
+ result = await session.execute(backlog_sql, params)
1672
+ rows = result.fetchall()
1673
+
1674
+ # 直接使用pending_count数据,不需要检查是否有非零值
1675
+ data = []
1676
+ for idx, row in enumerate(rows):
1677
+ # 第0个和最后一个元素的value不能为null,其他的可以为null
1678
+ is_first_or_last = idx == 0 or idx == len(rows) - 1
1679
+ value = int(row.pending_value) if row.pending_value > 0 else (0 if is_first_or_last else None)
1680
+
1681
+ data.append({
1682
+ 'time': row.time_bucket.isoformat(),
1683
+ 'value': value,
1684
+ 'metric': '排队任务数'
1685
+ })
1686
+ return {"data": data, "granularity": "minute"}
1687
+
1688
+ except Exception as e:
1689
+ logger.error(f"从stream_backlog_monitor获取数据失败: {e}")
1690
+ traceback.print_exc()
1691
+ raise HTTPException(status_code=500, detail=f"获取排队任务数据失败: {str(e)}")
1692
+
1693
+ except Exception as e:
1694
+ logger.error(f"获取队列积压趋势失败: {e}")
1695
+ traceback.print_exc()
1696
+ raise HTTPException(status_code=500, detail=str(e))
1697
+
1698
+
1699
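Every trend endpoint in this router uses the same gap-filling shape: a generate_series time axis, data bucketed by flooring the epoch to a multiple of interval_seconds, and a LEFT JOIN so empty buckets survive. The bucketing step, sketched in plain Python (the helper name is illustrative):

    from datetime import datetime, timezone

    def bucket(ts: datetime, interval_seconds: int) -> datetime:
        # same arithmetic as FLOOR(EXTRACT(epoch FROM ts) / n) * n in the CTEs above
        epoch = int(ts.timestamp())
        return datetime.fromtimestamp(epoch - epoch % interval_seconds, tz=timezone.utc)

    bucket(datetime(2024, 1, 1, 12, 7, 42, tzinfo=timezone.utc), 300)
    # -> 2024-01-01 12:05:00+00:00, the 5-minute bucket the sample lands in
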
+ @router.get("/task-creation-latency/{namespace}")
1700
+ async def get_task_creation_latency(
1701
+ namespace: str,
1702
+ time_range: str = "1h"
1703
+ ):
1704
+ """
1705
+ 获取任务创建延时趋势数据
1706
+
1707
+ Args:
1708
+ namespace: 命名空间名称
1709
+ time_range: 时间范围(如'1h', '24h', '7d')
1710
+ """
1711
+ try:
1712
+ conn = await data_access.manager.get_connection(namespace)
1713
+
1714
+ # 计算时间范围
1715
+ end_time = datetime.now(timezone.utc)
1716
+ if time_range.endswith('m'):
1717
+ minutes = int(time_range[:-1])
1718
+ start_time = end_time - timedelta(minutes=minutes)
1719
+ interval = '1 minute'
1720
+ interval_seconds = 60
1721
+ elif time_range.endswith('h'):
1722
+ hours = int(time_range[:-1])
1723
+ start_time = end_time - timedelta(hours=hours)
1724
+ interval = '5 minutes' if hours <= 6 else '10 minutes'
1725
+ interval_seconds = 300 if hours <= 6 else 600
1726
+ elif time_range.endswith('d'):
1727
+ days = int(time_range[:-1])
1728
+ start_time = end_time - timedelta(days=days)
1729
+ interval = '1 hour'
1730
+ interval_seconds = 3600
1731
+ else:
1732
+ start_time = end_time - timedelta(hours=1)
1733
+ interval = '1 minute'
1734
+ interval_seconds = 60
1735
+
1736
+ # 如果没有PostgreSQL配置,返回模拟数据
1737
+ if not conn.pg_config:
1738
+ data = []
1739
+ num_points = min(30, max(10, int((end_time - start_time).total_seconds() / interval_seconds)))
1740
+
1741
+ for i in range(num_points):
1742
+ timestamp = start_time + timedelta(seconds=i * interval_seconds)
1743
+ data.append({
1744
+ 'time': timestamp.isoformat(),
1745
+ 'value': 50 + 20 * (0.5 - abs((i - num_points/2) / (num_points/2))), # 模拟延时波动
1746
+ 'type': '创建延时'
1747
+ })
1748
+
1749
+ return {"data": data, "granularity": "minute"}
1750
+
1751
+ async with await conn.get_pg_session() as session:
1752
+ # 计算任务创建延时(从提交到开始处理的时间)
1753
+ try:
1754
+ latency_sql = text(f"""
1755
+ WITH time_series AS (
1756
+ SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
1757
+ FROM generate_series(
1758
+ :start_time ::timestamptz,
1759
+ :end_time ::timestamptz,
1760
+ :interval_val ::interval
1761
+ ) AS ts
1762
+ ),
1763
+ task_latency AS (
1764
+ SELECT
1765
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.start_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
1766
+ AVG(EXTRACT(epoch FROM (tr.start_time - t.created_at))) as avg_latency_seconds
1767
+ FROM tasks t
1768
+ JOIN task_runs tr ON t.stream_id = tr.stream_id
1769
+ WHERE t.namespace = :namespace
1770
+ AND tr.start_time >= :start_time
1771
+ AND tr.start_time <= :end_time
1772
+ AND tr.start_time IS NOT NULL
1773
+ GROUP BY 1
1774
+ )
1775
+ SELECT
1776
+ ts.time_bucket,
1777
+ COALESCE(tl.avg_latency_seconds, 0) as latency_ms
1778
+ FROM time_series ts
1779
+ LEFT JOIN task_latency tl ON ts.time_bucket = tl.time_bucket
1780
+ ORDER BY ts.time_bucket
1781
+ """)
1782
+
1783
+ result = await session.execute(latency_sql, {
1784
+ 'namespace': namespace,
1785
+ 'start_time': start_time,
1786
+ 'end_time': end_time,
1787
+ 'interval_val': interval
1788
+ })
1789
+
1790
+ data = []
1791
+ for row in result.fetchall():
1792
+ # 转换为毫秒
1793
+ latency_ms = row.latency_ms * 1000 if row.latency_ms > 0 else None
1794
+ data.append({
1795
+ 'time': row.time_bucket.isoformat(),
1796
+ 'value': round(latency_ms, 1) if latency_ms else None,
1797
+ 'type': '创建延时'
1798
+ })
1799
+
1800
+ return {"data": data, "granularity": "minute"}
1801
+
1802
+ except Exception as e:
1803
+ logger.error(f"获取任务创建延时失败: {e}")
1804
+ traceback.print_exc()
1805
+ raise HTTPException(status_code=500, detail=f"获取创建延时数据失败: {str(e)}")
1806
+
1807
+ except Exception as e:
1808
+ logger.error(f"获取任务创建延时失败: {e}")
1809
+ traceback.print_exc()
1810
+ raise HTTPException(status_code=500, detail=str(e))
1811
+
1812
+
1813
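The '1h'/'24h'/'7d' parsing above is repeated nearly verbatim in several handlers below, while two other endpoints already delegate to parse_time_range_string/parse_time_range_query. A consolidation along these lines (hypothetical; the shared helpers may differ in detail) would collapse the duplication:

    from datetime import datetime, timedelta, timezone

    def parse_range(time_range: str):
        end = datetime.now(timezone.utc)
        try:
            n = int(time_range[:-1])
        except ValueError:
            n = 0
        if n and time_range.endswith('m'):
            return end - timedelta(minutes=n), end, '1 minute', 60
        if n and time_range.endswith('h'):
            interval, secs = ('5 minutes', 300) if n <= 6 else ('10 minutes', 600)
            return end - timedelta(hours=n), end, interval, secs
        if n and time_range.endswith('d'):
            return end - timedelta(days=n), end, '1 hour', 3600
        return end - timedelta(hours=1), end, '1 minute', 60  # default: last hour
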
+ @router.get("/top-queues/{namespace}")
1814
+ async def get_top_queues(
1815
+ namespace: str,
1816
+ metric: str = Query("backlog", description="指标类型: backlog(积压) 或 error(错误率)"),
1817
+ limit: int = 10,
1818
+ time_range: str = "24h",
1819
+ queues: Optional[str] = Query(None, description="逗号分隔的队列名称列表")
1820
+ ):
1821
+ """
1822
+ 获取队列排行榜 - 支持积压和错误率两种指标
1823
+
1824
+ Args:
1825
+ namespace: 命名空间名称
1826
+ metric: 指标类型 (backlog/error)
1827
+ limit: 返回的队列数量限制
1828
+ time_range: 时间范围
1829
+ """
1830
+ if metric == "backlog":
1831
+ return await _get_top_backlog_queues(namespace, limit, queues)
1832
+ elif metric == "error":
1833
+ return await _get_top_error_queues(namespace, limit, time_range, queues)
1834
+ else:
1835
+ raise HTTPException(status_code=400, detail=f"不支持的指标类型: {metric}")
1836
+
1837
+
1838
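For reference, a call to the consolidated leaderboard endpoint. The host, port, and the `default` namespace are placeholders, and any prefix the router is mounted under would need to be prepended:

    import asyncio
    import httpx

    async def main():
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.get(
                "/top-queues/default",
                params={"metric": "backlog", "limit": 5, "time_range": "1h"},
            )
            resp.raise_for_status()
            for item in resp.json()["data"]:
                print(item["queue"], item["backlog"], item["status"])

    asyncio.run(main())
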
+ @router.get("/top-backlog-queues/{namespace}")
1839
+ async def get_top_backlog_queues(
1840
+ namespace: str,
1841
+ limit: int = 10,
1842
+ time_range: str = Query("1h", description="时间范围,如1h、24h、7d"),
1843
+ queues: Optional[str] = Query(None, description="逗号分隔的队列名称列表")
1844
+ ):
1845
+ """
1846
+ 获取积压最多的队列Top10 (已废弃,请使用 /top-queues/{namespace}?metric=backlog)
1847
+
1848
+ Args:
1849
+ namespace: 命名空间名称
1850
+ limit: 返回的队列数量限制
1851
+ time_range: 时间范围,如1h、24h、7d
1852
+ """
1853
+ return await _get_top_backlog_queues(namespace, limit, time_range, queues)
1854
+
1855
+
1856
+async def _get_top_backlog_queues(
+    namespace: str,
+    limit: int = 10,
+    time_range: str = "1h",
+    queues: Optional[str] = None
+):
+    """
+    Internal helper: get the top queues by backlog.
+    """
+    try:
+        conn = await data_access.manager.get_connection(namespace)
+
+        # Work out the time window
+        end_time = datetime.now(timezone.utc)
+        if time_range.endswith('h'):
+            hours = int(time_range[:-1])
+            start_time = end_time - timedelta(hours=hours)
+        elif time_range.endswith('d'):
+            days = int(time_range[:-1])
+            start_time = end_time - timedelta(days=days)
+        else:
+            # Default to one hour
+            start_time = end_time - timedelta(hours=1)
+
+        # Without a PostgreSQL config, return empty data
+        if not conn.pg_config:
+            return {
+                "success": True,
+                "data": []
+            }
+
+        async with await conn.get_pg_session() as session:
+            try:
+                # Parse the queue filter parameter
+                queue_list = []
+                if queues:
+                    queue_list = [q.strip() for q in queues.split(',') if q.strip()]
+
+                # Prefer the latest backlog figures from stream_backlog_monitor;
+                # backlog_unprocessed is the total backlog (undelivered plus
+                # delivered-but-unprocessed messages)
+                if queue_list:
+                    backlog_sql = text("""
+                        SELECT
+                            stream_name as queue,
+                            MAX(backlog_unprocessed) as backlog,
+                            CASE
+                                WHEN MAX(backlog_unprocessed) > 100 THEN 'critical'
+                                WHEN MAX(backlog_unprocessed) > 50 THEN 'warning'
+                                ELSE 'normal'
+                            END as status
+                        FROM stream_backlog_monitor
+                        WHERE namespace = :namespace
+                            AND created_at >= :start_time
+                            AND created_at <= :end_time
+                            AND stream_name = ANY(:queues)
+                        GROUP BY stream_name
+                        HAVING MAX(backlog_unprocessed) > 0
+                        ORDER BY backlog DESC
+                        LIMIT :limit
+                    """)
+
+                    result = await session.execute(backlog_sql, {
+                        'namespace': namespace,
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'queues': queue_list,
+                        'limit': limit
+                    })
+                else:
+                    backlog_sql = text("""
+                        SELECT
+                            stream_name as queue,
+                            MAX(backlog_unprocessed) as backlog,
+                            CASE
+                                WHEN MAX(backlog_unprocessed) > 100 THEN 'critical'
+                                WHEN MAX(backlog_unprocessed) > 50 THEN 'warning'
+                                ELSE 'normal'
+                            END as status
+                        FROM stream_backlog_monitor
+                        WHERE namespace = :namespace
+                            AND created_at >= :start_time
+                            AND created_at <= :end_time
+                        GROUP BY stream_name
+                        HAVING MAX(backlog_unprocessed) > 0
+                        ORDER BY backlog DESC
+                        LIMIT :limit
+                    """)
+
+                    result = await session.execute(backlog_sql, {
+                        'namespace': namespace,
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'limit': limit
+                    })
+
+                backlog_queues = []
+                for row in result.fetchall():
+                    backlog_queues.append({
+                        "queue": row.queue,
+                        "backlog": int(row.backlog),
+                        "status": row.status
+                    })
+
+                if backlog_queues:
+                    return {"success": True, "data": backlog_queues}
+
+            except Exception as e:
+                logger.warning(f"Failed to read backlog data from stream_backlog_monitor: {e}")
+                traceback.print_exc()
+
+            # No backlog-monitor data: fall back to counting pending tasks in the tasks table
+            try:
+                task_sql = text("""
+                    SELECT
+                        t.queue,
+                        COUNT(DISTINCT t.stream_id) as backlog,
+                        CASE
+                            WHEN COUNT(DISTINCT t.stream_id) > 1000 THEN 'critical'
+                            WHEN COUNT(DISTINCT t.stream_id) > 500 THEN 'warning'
+                            ELSE 'normal'
+                        END as status
+                    FROM tasks t
+                    LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
+                    WHERE t.namespace = :namespace
+                        AND (tr.stream_id IS NULL OR tr.status = 'pending')
+                        AND t.created_at > NOW() - INTERVAL '24 hour'
+                    GROUP BY t.queue
+                    ORDER BY backlog DESC
+                    LIMIT :limit
+                """)
+
+                result = await session.execute(task_sql, {
+                    'namespace': namespace,
+                    'limit': limit
+                })
+
+                backlog_queues = []
+                for row in result.fetchall():
+                    backlog_queues.append({
+                        "queue": row.queue,
+                        "backlog": int(row.backlog),
+                        "status": row.status
+                    })
+
+                return {"success": True, "data": backlog_queues}
+
+            except Exception as e:
+                logger.error(f"Failed to read backlog data from the tasks table: {e}")
+                traceback.print_exc()
+                raise HTTPException(status_code=500, detail=f"Failed to fetch backlog ranking data: {str(e)}")
+
+    except Exception as e:
+        logger.error(f"Failed to fetch backlog ranking: {e}")
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/top-error-queues/{namespace}")
2014
+ async def get_top_error_queues(
2015
+ namespace: str,
2016
+ limit: int = 10,
2017
+ time_range: str = "24h",
2018
+ queues: Optional[str] = Query(None, description="逗号分隔的队列名称列表")
2019
+ ):
2020
+ """
2021
+ 获取错误率最高的队列Top10 (已废弃,请使用 /top-queues/{namespace}?metric=error)
2022
+
2023
+ Args:
2024
+ namespace: 命名空间名称
2025
+ limit: 返回的队列数量限制
2026
+ time_range: 时间范围
2027
+ """
2028
+ return await _get_top_error_queues(namespace, limit, time_range, queues)
2029
+
2030
+
2031
+async def _get_top_error_queues(
+    namespace: str,
+    limit: int = 10,
+    time_range: str = "24h",
+    queues: Optional[str] = None
+):
+    """
+    Internal helper: get the top queues by error rate.
+    """
+    try:
+        conn = await data_access.manager.get_connection(namespace)
+
+        # Work out the time window
+        end_time = datetime.now(timezone.utc)
+        if time_range.endswith('h'):
+            hours = int(time_range[:-1])
+            start_time = end_time - timedelta(hours=hours)
+        elif time_range.endswith('d'):
+            days = int(time_range[:-1])
+            start_time = end_time - timedelta(days=days)
+        else:
+            start_time = end_time - timedelta(hours=24)
+
+        # Without a PostgreSQL config, return empty data
+        if not conn.pg_config:
+            return {
+                "success": True,
+                "data": []
+            }
+
+        async with await conn.get_pg_session() as session:
+            try:
+                # Apply the queues filter when given (this parameter was previously ignored)
+                queue_list = [q.strip() for q in queues.split(',') if q.strip()] if queues else []
+                queue_filter = "AND queue = ANY(:queues)" if queue_list else ""
+
+                # Query task_runs directly: only executed tasks can have error records.
+                # The queue name is extracted from consumer_group, whose format is
+                # usually namespace:QUEUE:queue_name:task_name
+                error_sql = text(f"""
+                    WITH queue_stats AS (
+                        SELECT
+                            CASE
+                                WHEN consumer_group LIKE :queue_prefix || '%'
+                                THEN SPLIT_PART(consumer_group, ':', 3)
+                                ELSE consumer_group
+                            END as queue,
+                            COUNT(DISTINCT stream_id) as total,
+                            COUNT(DISTINCT CASE WHEN status IN ('failed', 'error', 'timeout') THEN stream_id END) as failed
+                        FROM task_runs
+                        WHERE created_at >= :start_time
+                            AND created_at <= :end_time
+                            AND consumer_group LIKE :namespace_prefix || '%'
+                        GROUP BY 1
+                    )
+                    SELECT
+                        queue,
+                        total,
+                        failed,
+                        ROUND(failed * 100.0 / NULLIF(total, 0), 1) as error_rate
+                    FROM queue_stats
+                    WHERE failed > 0
+                        {queue_filter}
+                    ORDER BY error_rate DESC, failed DESC
+                    LIMIT :limit
+                """)
+
+                params = {
+                    'namespace_prefix': f'{namespace}:',
+                    'queue_prefix': f'{namespace}:QUEUE:',
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'limit': limit
+                }
+                if queue_list:
+                    params['queues'] = queue_list
+
+                result = await session.execute(error_sql, params)
+
+                error_queues = []
+                for row in result.fetchall():
+                    error_queues.append({
+                        "queue": row.queue,
+                        "errorRate": str(row.error_rate or 0),
+                        "failed": int(row.failed),
+                        "total": int(row.total)
+                    })
+
+                return {"success": True, "data": error_queues}
+
+            except Exception as e:
+                logger.error(f"Failed to fetch error-rate ranking data: {e}")
+                traceback.print_exc()
+                raise HTTPException(status_code=500, detail=f"Failed to fetch error-rate ranking data: {str(e)}")
+
+    except Exception as e:
+        logger.error(f"Failed to fetch error-rate ranking: {e}")
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+
+
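The CASE/SPLIT_PART expression above recovers the queue name from the consumer-group string. The same parse in plain Python, for the stated namespace:QUEUE:queue_name:task_name layout:

    def queue_from_consumer_group(consumer_group: str, namespace: str) -> str:
        prefix = f"{namespace}:QUEUE:"
        if consumer_group.startswith(prefix):
            # SPLIT_PART(consumer_group, ':', 3) is 1-indexed, so field 3 is the queue name
            return consumer_group.split(":")[2]
        return consumer_group  # unknown layout: fall back to the raw group name

    queue_from_consumer_group("prod:QUEUE:emails:send_welcome", "prod")  # -> 'emails'
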
+ @router.get("/task-concurrency-trend/{namespace}")
2123
+ async def get_task_concurrency_trend(
2124
+ namespace: str,
2125
+ time_range: str = "1h"
2126
+ ):
2127
+ """
2128
+ 获取任务执行数量趋势数据(每个时间间隔内开始执行的任务数量)
2129
+
2130
+ Args:
2131
+ namespace: 命名空间名称
2132
+ time_range: 时间范围(如'1h', '24h', '7d')
2133
+ """
2134
+ try:
2135
+ conn = await data_access.manager.get_connection(namespace)
2136
+
2137
+ # 计算时间范围
2138
+ end_time = datetime.now(timezone.utc)
2139
+ if time_range.endswith('m'):
2140
+ minutes = int(time_range[:-1])
2141
+ start_time = end_time - timedelta(minutes=minutes)
2142
+ interval = '1 minute'
2143
+ interval_seconds = 60
2144
+ elif time_range.endswith('h'):
2145
+ hours = int(time_range[:-1])
2146
+ start_time = end_time - timedelta(hours=hours)
2147
+ interval = '5 minutes' if hours <= 6 else '10 minutes'
2148
+ interval_seconds = 300 if hours <= 6 else 600
2149
+ elif time_range.endswith('d'):
2150
+ days = int(time_range[:-1])
2151
+ start_time = end_time - timedelta(days=days)
2152
+ interval = '1 hour'
2153
+ interval_seconds = 3600
2154
+ else:
2155
+ start_time = end_time - timedelta(hours=1)
2156
+ interval = '1 minute'
2157
+ interval_seconds = 60
2158
+
2159
+ async with await conn.get_pg_session() as session:
2160
+ try:
2161
+ # 计算每个时间间隔内执行的任务数量(更实用的指标)
2162
+ concurrency_sql = text(f"""
2163
+ WITH time_series AS (
2164
+ SELECT
2165
+ to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket_start,
2166
+ to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) + INTERVAL '{interval_seconds} seconds' AS time_bucket_end
2167
+ FROM generate_series(
2168
+ :start_time ::timestamptz,
2169
+ :end_time ::timestamptz,
2170
+ :interval_val ::interval
2171
+ ) AS ts
2172
+ )
2173
+ SELECT
2174
+ ts.time_bucket_start as time_bucket,
2175
+ COUNT(tr.stream_id) as concurrent_count
2176
+ FROM time_series ts
2177
+ LEFT JOIN task_runs tr ON (
2178
+ EXISTS (
2179
+ SELECT 1 FROM tasks t
2180
+ WHERE t.stream_id = tr.stream_id
2181
+ AND t.namespace = :namespace
2182
+ )
2183
+ AND tr.start_time IS NOT NULL
2184
+ AND tr.start_time >= ts.time_bucket_start
2185
+ AND tr.start_time < ts.time_bucket_end
2186
+ )
2187
+ GROUP BY ts.time_bucket_start
2188
+ ORDER BY ts.time_bucket_start
2189
+ """)
2190
+
2191
+ result = await session.execute(concurrency_sql, {
2192
+ 'namespace': namespace,
2193
+ 'start_time': start_time,
2194
+ 'end_time': end_time,
2195
+ 'interval_val': interval
2196
+ })
2197
+
2198
+ data = []
2199
+ for row in result.fetchall():
2200
+ concurrent_count = row.concurrent_count or 0
2201
+ data.append({
2202
+ 'time': row.time_bucket.isoformat(),
2203
+ 'value': int(concurrent_count),
2204
+ 'type': '执行数量'
2205
+ })
2206
+
2207
+ return {"data": data, "granularity": "minute"}
2208
+
2209
+ except Exception as e:
2210
+ logger.error(f"获取任务并发数据失败: {e}")
2211
+ traceback.print_exc()
2212
+ # 如果查询失败,从Redis获取当前并发数作为静态数据
2213
+ try:
2214
+ redis_client = await conn.get_redis_client()
2215
+
2216
+ # 统计当前正在处理的任务数
2217
+ worker_pattern = f"{conn.redis_prefix}:WORKER:*"
2218
+ current_concurrent = 0
2219
+
2220
+ async for worker_key in redis_client.scan_iter(match=worker_pattern):
2221
+ worker_info = await redis_client.hgetall(worker_key)
2222
+ if worker_info and worker_info.get('status') == 'busy':
2223
+ current_concurrent += 1
2224
+
2225
+ await redis_client.aclose()
2226
+
2227
+ # 生成静态数据点
2228
+ data = []
2229
+ num_points = min(20, max(5, int((end_time - start_time).total_seconds() / interval_seconds)))
2230
+
2231
+ for i in range(num_points):
2232
+ timestamp = start_time + timedelta(seconds=i * interval_seconds)
2233
+ data.append({
2234
+ 'time': timestamp.isoformat(),
2235
+ 'value': current_concurrent if i == num_points - 1 else None,
2236
+ 'type': '并发数'
2237
+ })
2238
+
2239
+ return {"data": data, "granularity": "minute"}
2240
+
2241
+ except Exception as redis_error:
2242
+ logger.error(f"从Redis获取并发数据也失败: {redis_error}")
2243
+ raise HTTPException(status_code=500, detail=f"获取并发数据失败: {str(e)}")
2244
+
2245
+ except Exception as e:
2246
+ logger.error(f"获取任务并发趋势失败: {e}")
2247
+ traceback.print_exc()
2248
+ raise HTTPException(status_code=500, detail=str(e))
2249
+
2250
+
2251
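The bucketed COUNT above tallies tasks that started inside each interval; true instantaneous concurrency would instead count runs whose lifespan overlaps the bucket. A sketch of that overlap test, as a hypothetical alternative rather than part of the router:

    from datetime import datetime

    def overlaps(start: datetime, end: datetime, bucket_start: datetime, bucket_end: datetime) -> bool:
        # a run [start, end) counts as concurrent in a bucket when the intervals intersect
        return start < bucket_end and end > bucket_start

In SQL the same condition would read tr.start_time < ts.time_bucket_end AND tr.end_time > ts.time_bucket_start.
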
+ @router.post("/tasks/{namespace}")
2252
+ async def get_namespace_tasks(namespace: str, request: Request):
2253
+ """
2254
+ 获取指定命名空间中队列的任务列表
2255
+
2256
+ Args:
2257
+ namespace: 命名空间名称
2258
+ request: 请求体,包含queue_name、分页、筛选等参数
2259
+ """
2260
+ try:
2261
+ # 解析请求体
2262
+ body = await request.json()
2263
+ queue_name = body.get('queue_name')
2264
+ page = body.get('page', 1)
2265
+ page_size = body.get('page_size', 20)
2266
+ filters = body.get('filters', [])
2267
+ sort_field = body.get('sort_field')
2268
+ sort_order = body.get('sort_order', 'desc')
2269
+
2270
+ # 处理时间范围参数
2271
+ start_time = body.get('start_time')
2272
+ end_time = body.get('end_time')
2273
+ time_range = body.get('time_range')
2274
+
2275
+ if not queue_name:
2276
+ raise HTTPException(status_code=400, detail="queue_name is required")
2277
+
2278
+ # 获取命名空间连接
2279
+ conn = await data_access.manager.get_connection(namespace)
2280
+
2281
+ # 如果没有PostgreSQL配置,返回空数据
2282
+ if not conn.pg_config:
2283
+ return {
2284
+ "success": True,
2285
+ "data": [],
2286
+ "total": 0,
2287
+ "page": page,
2288
+ "page_size": page_size
2289
+ }
2290
+
2291
+ # 如果提供了时间范围,计算起止时间
2292
+ if not start_time or not end_time:
2293
+ if time_range and time_range != 'custom':
2294
+ now = datetime.now(timezone.utc)
2295
+ time_range_map = {
2296
+ "15m": timedelta(minutes=15),
2297
+ "30m": timedelta(minutes=30),
2298
+ "1h": timedelta(hours=1),
2299
+ "3h": timedelta(hours=3),
2300
+ "6h": timedelta(hours=6),
2301
+ "12h": timedelta(hours=12),
2302
+ "24h": timedelta(hours=24),
2303
+ "7d": timedelta(days=7),
2304
+ "30d": timedelta(days=30),
2305
+ }
2306
+
2307
+ delta = time_range_map.get(time_range)
2308
+ if delta:
2309
+ end_time = now
2310
+ start_time = end_time - delta
2311
+
2312
+ # 如果有时间范围,将其转换为datetime对象
2313
+ if start_time and isinstance(start_time, str):
2314
+ start_time = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
2315
+ if end_time and isinstance(end_time, str):
2316
+ end_time = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
2317
+
2318
+ # 从PostgreSQL查询任务数据
2319
+ async with await conn.get_pg_session() as session:
2320
+ # 构建基础查询(支持基础队列名和优先级队列)
2321
+ query_conditions = ["t.namespace = :namespace", "(t.queue = :queue_name OR t.queue LIKE :queue_pattern)"]
2322
+ query_params = {
2323
+ 'namespace': namespace,
2324
+ 'queue_name': queue_name,
2325
+ 'queue_pattern': f'{queue_name}:%' # 匹配所有优先级队列
2326
+ }
2327
+
2328
+ # 添加时间范围条件
2329
+ if start_time:
2330
+ query_conditions.append("t.created_at >= :start_time")
2331
+ query_params['start_time'] = start_time
2332
+ if end_time:
2333
+ query_conditions.append("t.created_at <= :end_time")
2334
+ query_params['end_time'] = end_time
2335
+
2336
+ # 添加筛选条件
2337
+ for filter_item in filters:
2338
+ # 跳过被禁用的筛选条件
2339
+ if filter_item.get('enabled') == False:
2340
+ continue
2341
+
2342
+ field = filter_item.get('field')
2343
+ operator = filter_item.get('operator')
2344
+ value = filter_item.get('value')
2345
+
2346
+ if not field or not operator:
2347
+ continue
2348
+
2349
+ # 特殊处理id字段(映射到stream_id)
2350
+ if field == 'id':
2351
+ actual_field = 'stream_id'
2352
+ table_prefix = 't.'
2353
+ elif field == 'scheduled_task_id':
2354
+ # scheduled_task_id特殊处理,需要转换为字符串
2355
+ actual_field = field
2356
+ table_prefix = 't.'
2357
+ # 将值转换为字符串
2358
+ if operator == 'eq':
2359
+ query_conditions.append(f"{table_prefix}{actual_field} = :{field}")
2360
+ query_params[field] = str(value)
2361
+ elif operator == 'ne':
2362
+ query_conditions.append(f"{table_prefix}{actual_field} != :{field}")
2363
+ query_params[field] = str(value)
2364
+ elif operator == 'contains':
2365
+ query_conditions.append(f"{table_prefix}{actual_field} LIKE :{field}")
2366
+ query_params[field] = f'%{str(value)}%'
2367
+ continue # 跳过后续的通用处理
2368
+ else:
2369
+ actual_field = field
2370
+ # 根据字段决定使用哪个表的别名
2371
+ table_prefix = 't.'
2372
+ if field in ['status', 'task_name', 'worker_id', 'consumer_group']:
2373
+ table_prefix = 'tr.'
2374
+ elif field in ['queue', 'namespace', 'stream_id']:
2375
+ table_prefix = 't.'
2376
+
2377
+ # 简单的相等筛选
2378
+ if operator == 'eq':
2379
+ # 对于status字段的特殊处理
2380
+ if field == 'status' and value == 'pending':
2381
+ # pending状态:task_runs中没有记录(status为NULL)或status='pending'
2382
+ query_conditions.append(f"({table_prefix}{actual_field} IS NULL OR {table_prefix}{actual_field} = :{field})")
2383
+ query_params[field] = value
2384
+ else:
2385
+ query_conditions.append(f"{table_prefix}{actual_field} = :{field}")
2386
+ query_params[field] = value
2387
+ elif operator == 'ne':
2388
+ # 对于status字段的特殊处理,需要包含NULL值
2389
+ if field == 'status':
2390
+ # 使用COALESCE处理NULL值,将NULL视为'pending'
2391
+ query_conditions.append(f"(COALESCE({table_prefix}{actual_field}, 'pending') != :{field})")
2392
+ query_params[field] = value
2393
+ else:
2394
+ query_conditions.append(f"{table_prefix}{actual_field} != :{field}")
2395
+ query_params[field] = value
2396
+ elif operator == 'contains':
2397
+ query_conditions.append(f"{table_prefix}{actual_field} LIKE :{field}")
2398
+ query_params[field] = f'%{value}%'
2399
+
2400
+ # 构建WHERE子句
2401
+ where_clause = " AND ".join(query_conditions)
2402
+
2403
+ # 计算总数(需要JOIN因为WHERE条件可能涉及task_runs表)
2404
+ count_sql = text(f"""
2405
+ SELECT COUNT(DISTINCT t.stream_id)
2406
+ FROM tasks t
2407
+ LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
2408
+ WHERE {where_clause}
2409
+ """)
2410
+ total_result = await session.execute(count_sql, query_params)
2411
+ total = total_result.scalar()
2412
+
2413
+ # 构建排序子句
2414
+ order_clause = "t.created_at DESC" # 默认排序
2415
+ if sort_field:
2416
+ # 根据字段添加正确的表别名
2417
+ if sort_field in ['created_at', 'queue', 'stream_id']:
2418
+ order_clause = f"t.{sort_field} {sort_order.upper()}"
2419
+ elif sort_field in ['status', 'task_name', 'worker_id', 'consumer_group', 'started_at', 'completed_at']:
2420
+ order_clause = f"tr.{sort_field} {sort_order.upper()}"
2421
+ else:
2422
+ order_clause = f"{sort_field} {sort_order.upper()}"
2423
+
2424
+ # 查询任务列表(连接tasks和task_runs表)
2425
+ offset = (page - 1) * page_size
2426
+ query_sql = text(f"""
2427
+ SELECT
2428
+ t.stream_id as id,
2429
+ COALESCE(tr.status, 'pending') as status,
2430
+ COALESCE(tr.task_name, t.payload->>'task_name', 'unknown') as task_name,
2431
+ t.queue as queue_name,
2432
+ tr.consumer_group,
2433
+ tr.worker_id,
2434
+ t.created_at,
2435
+ tr.start_time as started_at,
2436
+ tr.end_time as completed_at,
2437
+ tr.duration,
2438
+ tr.execution_time,
2439
+ t.payload as task_data,
2440
+ tr.result,
2441
+ tr.error_message,
2442
+ tr.retry_count
2443
+ FROM tasks t
2444
+ LEFT JOIN task_runs tr ON t.stream_id = tr.stream_id
2445
+ WHERE {where_clause}
2446
+ ORDER BY {order_clause}
2447
+ LIMIT :limit OFFSET :offset
2448
+ """)
2449
+
2450
+ query_params['limit'] = page_size
2451
+ query_params['offset'] = offset
2452
+
2453
+ result = await session.execute(query_sql, query_params)
2454
+
2455
+ # 格式化数据
2456
+ tasks = []
2457
+ for row in result:
2458
+ # 使用error_message字段或从result中提取错误信息
2459
+ error = row.error_message
2460
+ retry_count = row.retry_count if row.retry_count else 0
2461
+
2462
+ if not error and row.result:
2463
+ try:
2464
+ import json
2465
+ result_data = json.loads(row.result) if isinstance(row.result, str) else row.result
2466
+ if isinstance(result_data, dict):
2467
+ error = result_data.get('error')
2468
+ except:
2469
+ pass
2470
+
2471
+ task = {
2472
+ 'id': row.id,
2473
+ 'status': row.status,
2474
+ 'task_name': row.task_name, # 改为task_name以匹配前端
2475
+ 'queue': row.queue_name,
2476
+ 'consumer_group': row.consumer_group if row.consumer_group else '-', # 添加消费者组字段
2477
+ 'worker_id': row.worker_id,
2478
+ 'created_at': row.created_at.isoformat() if row.created_at else None,
2479
+ 'started_at': row.started_at.isoformat() if row.started_at else None,
2480
+ 'completed_at': row.completed_at.isoformat() if row.completed_at else None,
2481
+ 'error': error,
2482
+ 'retry_count': retry_count
2483
+ }
2484
+
2485
+ # 执行时间(从开始到完成)- 直接使用秒数值
2486
+ if row.execution_time is not None:
2487
+ task['execution_time'] = row.execution_time # 返回数字类型,前端会格式化
2488
+ else:
2489
+ task['execution_time'] = None
2490
+
2491
+
2492
+ # 计算总耗时(从创建到完成)
2493
+ if row.created_at and row.completed_at:
2494
+ duration = (row.completed_at - row.created_at).total_seconds()
2495
+ task['duration'] = duration # 返回数字类型,前端会格式化
2496
+ elif row.created_at and row.status == 'running':
2497
+ # 如果任务还在运行,计算从创建到现在的时间
2498
+ duration = (datetime.now(timezone.utc) - row.created_at).total_seconds()
2499
+ task['duration'] = duration
2500
+ else:
2501
+ task['duration'] = None
2502
+
2503
+ tasks.append(task)
2504
+
2505
+ return {
2506
+ "success": True,
2507
+ "data": tasks,
2508
+ "total": total,
2509
+ "page": page,
2510
+ "page_size": page_size
2511
+ }
2512
+
2513
+ except Exception as e:
2514
+ import traceback
2515
+ traceback.print_exc()
2516
+ logger.error(f"获取任务列表失败: {e}")
2517
+
2518
+ await handle_database_connection_error(e, namespace, "获取任务列表")
2519
+
2520
+
2521
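The endpoint takes its parameters in a JSON body rather than the query string. A representative call, with illustrative host, namespace, and filter values:

    import asyncio
    import httpx

    payload = {
        "queue_name": "emails",
        "page": 1,
        "page_size": 20,
        "time_range": "24h",
        "filters": [
            {"field": "status", "operator": "eq", "value": "pending", "enabled": True}
        ],
        "sort_field": "created_at",
        "sort_order": "desc",
    }

    async def main():
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.post("/tasks/default", json=payload)
            body = resp.json()
            print(body["total"], "tasks;", len(body["data"]), "on this page")

    asyncio.run(main())
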
+ @router.get("/message-offset-trend/{namespace}")
2522
+ async def get_message_offset_trend(
2523
+ namespace: str,
2524
+ time_range: str = "1h"
2525
+ ):
2526
+ """
2527
+ 获取消息offset趋势数据(投递和确认进度)
2528
+
2529
+ Args:
2530
+ namespace: 命名空间名称
2531
+ time_range: 时间范围(如'1h', '24h', '7d')
2532
+ """
2533
+ try:
2534
+ conn = await data_access.manager.get_connection(namespace)
2535
+
2536
+ # 计算时间范围
2537
+ end_time = datetime.now(timezone.utc)
2538
+ if time_range.endswith('m'):
2539
+ minutes = int(time_range[:-1])
2540
+ start_time = end_time - timedelta(minutes=minutes)
2541
+ interval = '1 minute'
2542
+ interval_seconds = 60
2543
+ elif time_range.endswith('h'):
2544
+ hours = int(time_range[:-1])
2545
+ start_time = end_time - timedelta(hours=hours)
2546
+ interval = '5 minutes' if hours <= 6 else '10 minutes'
2547
+ interval_seconds = 300 if hours <= 6 else 600
2548
+ elif time_range.endswith('d'):
2549
+ days = int(time_range[:-1])
2550
+ start_time = end_time - timedelta(days=days)
2551
+ interval = '1 hour'
2552
+ interval_seconds = 3600
2553
+ else:
2554
+ start_time = end_time - timedelta(hours=1)
2555
+ interval = '1 minute'
2556
+ interval_seconds = 60
2557
+
2558
+ # 如果没有PostgreSQL配置,返回模拟数据
2559
+ if not conn.pg_config:
2560
+ data = []
2561
+ num_points = min(30, max(10, int((end_time - start_time).total_seconds() / interval_seconds)))
2562
+
2563
+ for i in range(num_points):
2564
+ timestamp = start_time + timedelta(seconds=i * interval_seconds)
2565
+ base_offset = 1000 + i * 10
2566
+ data.extend([
2567
+ {
2568
+ 'time': timestamp.isoformat(),
2569
+ 'value': base_offset + 10,
2570
+ 'type': '已发布Offset'
2571
+ },
2572
+ {
2573
+ 'time': timestamp.isoformat(),
2574
+ 'value': base_offset + 5,
2575
+ 'type': '已投递Offset'
2576
+ },
2577
+ {
2578
+ 'time': timestamp.isoformat(),
2579
+ 'value': base_offset,
2580
+ 'type': '已确认Offset'
2581
+ }
2582
+ ])
2583
+
2584
+ return {"data": data, "granularity": "minute"}
2585
+
2586
+ async with await conn.get_pg_session() as session:
2587
+ try:
2588
+ # 从stream_backlog_monitor表获取offset进度的时间序列数据
2589
+ offset_sql = text(f"""
2590
+ WITH time_series AS (
2591
+ SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
2592
+ FROM generate_series(
2593
+ :start_time ::timestamptz,
2594
+ :end_time ::timestamptz,
2595
+ :interval_val ::interval
2596
+ ) AS ts
2597
+ ),
2598
+ offset_aggregated AS (
2599
+ SELECT
2600
+ to_timestamp(FLOOR(EXTRACT(epoch FROM created_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2601
+ MAX(last_delivered_offset) as max_delivered_offset,
2602
+ MAX(last_acked_offset) as max_acked_offset,
2603
+ MAX(last_published_offset) as max_published_offset
2604
+ FROM stream_backlog_monitor
2605
+ WHERE namespace = :namespace
2606
+ AND created_at >= :start_time
2607
+ AND created_at <= :end_time
2608
+ GROUP BY time_bucket
2609
+ )
2610
+ SELECT
2611
+ ts.time_bucket,
2612
+ COALESCE(oa.max_delivered_offset, 0) as delivered_offset,
2613
+ COALESCE(oa.max_acked_offset, 0) as acked_offset,
2614
+ COALESCE(oa.max_published_offset, 0) as published_offset
2615
+ FROM time_series ts
2616
+ LEFT JOIN offset_aggregated oa ON ts.time_bucket = oa.time_bucket
2617
+ ORDER BY ts.time_bucket
2618
+ """)
2619
+
2620
+ result = await session.execute(offset_sql, {
2621
+ 'namespace': namespace,
2622
+ 'start_time': start_time,
2623
+ 'end_time': end_time,
2624
+ 'interval_val': interval
2625
+ })
2626
+
2627
+ data = []
2628
+ for row in result.fetchall():
2629
+ timestamp = row.time_bucket.isoformat()
2630
+
2631
+ # 添加已发布offset数据
2632
+ data.append({
2633
+ 'time': timestamp,
2634
+ 'value': int(row.published_offset),
2635
+ 'type': '已发布Offset'
2636
+ })
2637
+
2638
+ # 添加已投递offset数据
2639
+ data.append({
2640
+ 'time': timestamp,
2641
+ 'value': int(row.delivered_offset),
2642
+ 'type': '已投递Offset'
2643
+ })
2644
+
2645
+ # 添加已确认offset数据
2646
+ data.append({
2647
+ 'time': timestamp,
2648
+ 'value': int(row.acked_offset),
2649
+ 'type': '已确认Offset'
2650
+ })
2651
+
2652
+ return {"data": data, "granularity": "minute"}
2653
+
2654
+ except Exception as e:
2655
+ logger.error(f"查询offset趋势数据失败: {e}")
2656
+ traceback.print_exc()
2657
+ # 返回模拟数据
2658
+ data = []
2659
+ num_points = min(20, max(10, int((end_time - start_time).total_seconds() / interval_seconds)))
2660
+
2661
+ for i in range(num_points):
2662
+ timestamp = start_time + timedelta(seconds=i * interval_seconds)
2663
+ base_offset = 1000 + i * 10
2664
+ data.extend([
2665
+ {
2666
+ 'time': timestamp.isoformat(),
2667
+ 'value': base_offset + 10,
2668
+ 'type': '已发布Offset'
2669
+ },
2670
+ {
2671
+ 'time': timestamp.isoformat(),
2672
+ 'value': base_offset + 5,
2673
+ 'type': '已投递Offset'
2674
+ },
2675
+ {
2676
+ 'time': timestamp.isoformat(),
2677
+ 'value': base_offset,
2678
+ 'type': '已确认Offset'
2679
+ }
2680
+ ])
2681
+
2682
+ return {"data": data, "granularity": "minute"}
2683
+
2684
+ except Exception as e:
2685
+ logger.error(f"获取消息offset趋势失败: {e}")
2686
+ traceback.print_exc()
2687
+ raise HTTPException(status_code=500, detail=str(e))
2688
+
2689
+
2690
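The three series are cumulative stream offsets, so the gaps between them are the backlog figures used elsewhere in this router: published minus delivered is the undelivered backlog, and delivered minus acked is the delivered-but-unprocessed count. A sketch of that arithmetic (field names follow stream_backlog_monitor; the equivalence with backlog_unprocessed is as described in the backlog helper above):

    def backlog_from_offsets(published: int, delivered: int, acked: int) -> dict:
        return {
            "undelivered": published - delivered,    # not yet handed to any consumer
            "pending": delivered - acked,            # handed out but not acknowledged
            "total_unprocessed": published - acked,  # should match backlog_unprocessed
        }

    backlog_from_offsets(1010, 1005, 1000)
    # -> {'undelivered': 5, 'pending': 5, 'total_unprocessed': 10}
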
+ @router.post("/dashboard-overview-stats/{namespace}")
2691
+ async def get_dashboard_overview_stats(namespace: str, query: TimeRangeQuery):
2692
+ """
2693
+ 获取概览页面的统一统计数据
2694
+ 包含:任务处理趋势、任务并发数量、任务处理时间、任务执行延时
2695
+
2696
+ Args:
2697
+ namespace: 命名空间名称
2698
+ query: 时间范围查询参数
2699
+
2700
+ Returns:
2701
+ 统一的时间序列数据,包含所有概览图表需要的指标和granularity字段
2702
+ """
2703
+ try:
2704
+ conn = await data_access.manager.get_connection(namespace)
2705
+
2706
+ # 如果没有PostgreSQL配置,返回空数据
2707
+ if not conn.pg_config:
2708
+ return {
2709
+ "task_trend": [],
2710
+ "concurrency": [],
2711
+ "processing_time": [],
2712
+ "creation_latency": [],
2713
+ "granularity": "minute"
2714
+ }
2715
+
2716
+ # 使用公共工具函数处理时间范围
2717
+ time_range_result = parse_time_range_query(query)
2718
+ start_time = time_range_result.start_time
2719
+ end_time = time_range_result.end_time
2720
+ interval = time_range_result.interval
2721
+ interval_seconds = time_range_result.interval_seconds
2722
+ granularity = time_range_result.granularity
2723
+
2724
+ # 构建队列筛选条件
2725
+ queues_str = ','.join(query.queues) if query.queues else None
2726
+ queue_filter, queue_list, queue_params = build_queue_filter_and_params(queues_str)
2727
+ print(f'🔍 Dashboard Overview - 收到队列参数: {query.queues}')
2728
+ print(f'🔍 Dashboard Overview - SQL筛选条件: {queue_filter}')
2729
+
2730
+ async with await conn.get_pg_session() as session:
2731
+ # 统一查询所有概览页面需要的数据
2732
+ sql = text(f"""
2733
+ WITH time_series AS (
2734
+ SELECT to_timestamp(FLOOR(EXTRACT(epoch FROM ts) / {interval_seconds}) * {interval_seconds}) AS time_bucket
2735
+ FROM generate_series(
2736
+ :start_time ::timestamptz,
2737
+ :end_time ::timestamptz + INTERVAL '{interval_seconds} seconds',
2738
+ :interval_val ::interval
2739
+ ) AS ts
2740
+ ),
2741
+ enqueue_counts AS (
2742
+ SELECT
2743
+ to_timestamp(FLOOR(EXTRACT(epoch FROM t.created_at) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2744
+ COUNT(DISTINCT t.stream_id) as enqueued
2745
+ FROM tasks t
2746
+ WHERE t.namespace = :namespace
2747
+ AND t.created_at >= :start_time
2748
+ AND t.created_at <= :end_time
2749
+ {queue_filter}
2750
+ GROUP BY time_bucket
2751
+ ),
2752
+ complete_counts AS (
2753
+ SELECT
2754
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.end_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2755
+ COUNT(DISTINCT t.stream_id) as completed
2756
+ FROM task_runs tr
2757
+ JOIN tasks t ON tr.stream_id = t.stream_id
2758
+ WHERE t.namespace = :namespace
2759
+ AND tr.end_time >= :start_time
2760
+ AND tr.end_time <= :end_time
2761
+ AND tr.status = 'success'
2762
+ -- 只统计在时间范围内创建的任务的完成情况
2763
+ AND t.created_at >= :start_time
2764
+ AND t.created_at <= :end_time
2765
+ {queue_filter}
2766
+ GROUP BY time_bucket
2767
+ ),
2768
+ failed_counts AS (
2769
+ SELECT
2770
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.end_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2771
+ COUNT(DISTINCT t.stream_id) as failed
2772
+ FROM task_runs tr
2773
+ JOIN tasks t ON tr.stream_id = t.stream_id
2774
+ WHERE t.namespace = :namespace
2775
+ AND tr.end_time >= :start_time
2776
+ AND tr.end_time <= :end_time
2777
+ AND tr.status = 'error'
2778
+ -- 只统计在时间范围内创建的任务的失败情况
2779
+ AND t.created_at >= :start_time
2780
+ AND t.created_at <= :end_time
2781
+ {queue_filter}
2782
+ GROUP BY time_bucket
2783
+ ),
2784
+ concurrency_data AS (
2785
+ -- 计算每个时间桶内任务开始时的并发数
2786
+ -- 统计唯一任务,避免重试导致的重复计数
2787
+ SELECT
2788
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.start_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2789
+ COUNT(DISTINCT t.stream_id) as concurrent_tasks
2790
+ FROM task_runs tr
2791
+ JOIN tasks t ON tr.stream_id = t.stream_id
2792
+ WHERE t.namespace = :namespace
2793
+ AND tr.start_time >= :start_time
2794
+ AND tr.start_time <= :end_time
2795
+ AND tr.start_time IS NOT NULL
2796
+ AND tr.end_time IS NOT NULL
2797
+ -- 只统计在时间范围内创建的任务
2798
+ AND t.created_at >= :start_time
2799
+ AND t.created_at <= :end_time
2800
+ {queue_filter}
2801
+ GROUP BY time_bucket
2802
+ ),
2803
+ processing_time_data AS (
2804
+ SELECT
2805
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.end_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2806
+ AVG(CASE WHEN tr.status = 'success' AND tr.execution_time > 0
2807
+ THEN tr.execution_time END) as avg_processing_time,
2808
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY
2809
+ CASE WHEN tr.status = 'success' AND tr.execution_time > 0
2810
+ THEN tr.execution_time END) as p50_processing_time,
2811
+ PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY
2812
+ CASE WHEN tr.status = 'success' AND tr.execution_time > 0
2813
+ THEN tr.execution_time END) as p90_processing_time
2814
+ FROM task_runs tr
2815
+ JOIN tasks t ON tr.stream_id = t.stream_id
2816
+ WHERE t.namespace = :namespace
2817
+ AND tr.end_time >= :start_time
2818
+ AND tr.end_time <= :end_time
2819
+ AND tr.status = 'success'
2820
+ {queue_filter}
2821
+ GROUP BY time_bucket
2822
+ ),
2823
+ creation_latency_data AS (
2824
+ SELECT
2825
+ to_timestamp(FLOOR(EXTRACT(epoch FROM tr.start_time) / {interval_seconds}) * {interval_seconds}) AS time_bucket,
2826
+ AVG(EXTRACT(EPOCH FROM (tr.start_time - t.created_at))) as avg_creation_latency,
2827
+ PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY
2828
+ EXTRACT(EPOCH FROM (tr.start_time - t.created_at))) as p50_creation_latency,
2829
+ PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY
2830
+ EXTRACT(EPOCH FROM (tr.start_time - t.created_at))) as p90_creation_latency
2831
+ FROM task_runs tr
2832
+ JOIN tasks t ON tr.stream_id = t.stream_id
2833
+ WHERE t.namespace = :namespace
2834
+ AND tr.start_time >= :start_time
2835
+ AND tr.start_time <= :end_time
2836
+ AND tr.start_time IS NOT NULL
2837
+ {queue_filter}
2838
+ GROUP BY time_bucket
2839
+ )
2840
+ SELECT
2841
+ ts.time_bucket,
2842
+ COALESCE(eq.enqueued, 0) as enqueued,
2843
+ COALESCE(cc.completed, 0) as completed,
2844
+ COALESCE(fc.failed, 0) as failed,
2845
+ COALESCE(cd.concurrent_tasks, 0) as concurrent_tasks,
2846
+ ROUND(ptd.avg_processing_time::numeric, 6) as avg_processing_time,
2847
+ ROUND(ptd.p50_processing_time::numeric, 6) as p50_processing_time,
2848
+ ROUND(ptd.p90_processing_time::numeric, 6) as p90_processing_time,
2849
+ ROUND(cld.avg_creation_latency::numeric, 3) as avg_creation_latency,
2850
+ ROUND(cld.p50_creation_latency::numeric, 3) as p50_creation_latency,
2851
+ ROUND(cld.p90_creation_latency::numeric, 3) as p90_creation_latency
2852
+ FROM time_series ts
2853
+ LEFT JOIN enqueue_counts eq ON ts.time_bucket = eq.time_bucket
2854
+ LEFT JOIN complete_counts cc ON ts.time_bucket = cc.time_bucket
2855
+ LEFT JOIN failed_counts fc ON ts.time_bucket = fc.time_bucket
2856
+ LEFT JOIN concurrency_data cd ON ts.time_bucket = cd.time_bucket
2857
+ LEFT JOIN processing_time_data ptd ON ts.time_bucket = ptd.time_bucket
2858
+ LEFT JOIN creation_latency_data cld ON ts.time_bucket = cld.time_bucket
2859
+ ORDER BY ts.time_bucket
2860
+ """)
2861
+
2862
+ # 准备查询参数
2863
+ query_params = {
2864
+ 'namespace': namespace,
2865
+ 'start_time': start_time,
2866
+ 'end_time': end_time,
2867
+ 'interval_val': interval,
2868
+ **queue_params
2869
+ }
2870
+
2871
+ print(f'🔍 Dashboard Overview - 最终查询参数: {query_params}')
2872
+
2873
+ result = await session.execute(sql, query_params)
2874
+
2875
+
2876
+ # 格式化数据,按业务分组
2877
+ task_trend = []
2878
+ concurrency = []
2879
+ processing_time = []
2880
+ creation_latency = []
2881
+
2882
+ rows = result.fetchall()
2883
+ end_index = len(rows) - 1
2884
+
2885
+ for idx, row in enumerate(rows):
2886
+ time_str = row.time_bucket.isoformat()
2887
+
2888
+ # 任务处理趋势数据
2889
+ enqueued_val = row.enqueued if row.enqueued > 0 or idx == 0 or idx == end_index else None
2890
+ completed_val = row.completed if row.completed > 0 or idx == 0 or idx == end_index else None
2891
+ failed_val = row.failed if row.failed > 0 or idx == 0 or idx == end_index else None
2892
+
2893
+ task_trend.extend([
2894
+ {'time': time_str, 'value': enqueued_val, 'metric': '入队速率'},
2895
+ {'time': time_str, 'value': completed_val, 'metric': '完成速率'},
2896
+ {'time': time_str, 'value': failed_val, 'metric': '失败数'}
2897
+ ])
2898
+
2899
+ # 任务并发数量 - 直接显示计算的并发数(包括0)
2900
+ concurrent_val = row.concurrent_tasks or 0
2901
+
2902
+ concurrency.append({
2903
+ 'time': time_str,
2904
+ 'value': concurrent_val,
2905
+ 'metric': '并发任务数'
2906
+ })
2907
+
2908
+ # 任务处理时间(转换为毫秒)
2909
+ if row.avg_processing_time is not None:
2910
+ avg_time_val = round(float(row.avg_processing_time * 1000), 1)
2911
+ else:
2912
+ avg_time_val = None if idx != 0 and idx != end_index else 0
2913
+
2914
+ if row.p50_processing_time is not None:
2915
+ p50_time_val = round(float(row.p50_processing_time * 1000), 1)
2916
+ else:
2917
+ p50_time_val = None if idx != 0 and idx != end_index else 0
2918
+
2919
+ if row.p90_processing_time is not None:
2920
+ p90_time_val = round(float(row.p90_processing_time * 1000), 1)
2921
+ else:
2922
+ p90_time_val = None if idx != 0 and idx != end_index else 0
2923
+
2924
+ processing_time.extend([
2925
+ {'time': time_str, 'value': avg_time_val, 'metric': '平均处理时间'},
2926
+ {'time': time_str, 'value': p50_time_val, 'metric': 'P50处理时间'},
2927
+ {'time': time_str, 'value': p90_time_val, 'metric': 'P90处理时间'}
2928
+ ])
2929
+
2930
+ # 任务执行延时(秒)
2931
+ if row.avg_creation_latency is not None:
2932
+ avg_latency_val = round(float(row.avg_creation_latency), 3)
2933
+ else:
2934
+ avg_latency_val = None if idx != 0 and idx != end_index else 0
2935
+
2936
+ if row.p50_creation_latency is not None:
2937
+ p50_latency_val = round(float(row.p50_creation_latency), 3)
2938
+ else:
2939
+ p50_latency_val = None if idx != 0 and idx != end_index else 0
2940
+
2941
+ if row.p90_creation_latency is not None:
2942
+ p90_latency_val = round(float(row.p90_creation_latency), 3)
2943
+ else:
2944
+ p90_latency_val = None if idx != 0 and idx != end_index else 0
2945
+
2946
+ creation_latency.extend([
2947
+ {'time': time_str, 'value': avg_latency_val, 'metric': '平均执行延时'},
2948
+ {'time': time_str, 'value': p50_latency_val, 'metric': 'P50执行延时'},
2949
+ {'time': time_str, 'value': p90_latency_val, 'metric': 'P90执行延时'}
2950
+ ])
2951
+
2952
+ total_data_points = len(task_trend) + len(concurrency) + len(processing_time) + len(creation_latency)
2953
+ logger.info(f"获取到 {total_data_points} 条概览统计数据,粒度: {granularity}")
2954
+
2955
+ return {
2956
+ "task_trend": task_trend,
2957
+ "concurrency": concurrency,
2958
+ "processing_time": processing_time,
2959
+ "creation_latency": creation_latency,
2960
+ "granularity": granularity
2961
+ }
2962
+
2963
+ except Exception as e:
2964
+ logger.error(f"获取概览统计数据失败: {e}")
2965
+ traceback.print_exc()
2966
+ return {
2967
+ "task_trend": [],
2968
+ "concurrency": [],
2969
+ "processing_time": [],
2970
+ "creation_latency": [],
2971
+ "granularity": "minute"
2972
+ }
2973
+
2974
+
2975
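Several of the loops above apply the same rule: keep real values, force the first and last buckets to something non-null so the chart spans the whole window, and let empty interior buckets stay None so the line breaks. The pattern as a small helper, hypothetical rather than part of the router:

    def edge_preserving(value, idx: int, last_idx: int):
        # keep real values; pin the endpoints to 0 so the series covers the
        # full x-extent; interior gaps stay None so the chart breaks the line
        if value is not None and value > 0:
            return value
        return 0 if idx in (0, last_idx) else None
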
+ @router.get("/task-processing-time-trend/{namespace}")
2976
+ async def get_task_processing_time_trend(
2977
+ namespace: str,
2978
+ time_range: str = "24h"
2979
+ ):
2980
+ """
2981
+ 获取任务处理时间趋势数据(用于时间序列图表)
2982
+
2983
+ Args:
2984
+ namespace: 命名空间名称
2985
+ time_range: 时间范围(如'1h', '24h', '7d')
2986
+
2987
+ Returns:
2988
+ 按时间间隔分组的处理时间统计(平均值、P50、P90等)
2989
+ """
2990
+ try:
2991
+ conn = await data_access.manager.get_connection(namespace)
2992
+
2993
+ # 如果没有PostgreSQL配置,返回空数据
2994
+ if not conn.pg_config:
2995
+ return {
2996
+ "success": True,
2997
+ "data": []
2998
+ }
2999
+
3000
+ # 计算时间范围和间隔
3001
+ end_time = datetime.now(timezone.utc)
3002
+ interval = "15 minutes" # 默认15分钟间隔
3003
+
3004
+ if time_range.endswith('m'):
3005
+ minutes = int(time_range[:-1])
3006
+ start_time = end_time - timedelta(minutes=minutes)
3007
+ if minutes <= 60:
3008
+ interval = "5 minutes"
3009
+ elif minutes <= 240:
3010
+ interval = "15 minutes"
3011
+ else:
3012
+ interval = "1 hour"
3013
+ elif time_range.endswith('h'):
3014
+ hours = int(time_range[:-1])
3015
+ start_time = end_time - timedelta(hours=hours)
3016
+ if hours <= 6:
3017
+ interval = "15 minutes"
3018
+ elif hours <= 24:
3019
+ interval = "1 hour"
3020
+ else:
3021
+ interval = "4 hours"
3022
+ elif time_range.endswith('d'):
3023
+ days = int(time_range[:-1])
3024
+ start_time = end_time - timedelta(days=days)
3025
+ if days <= 1:
3026
+ interval = "1 hour"
3027
+ elif days <= 7:
3028
+ interval = "4 hours"
3029
+ else:
3030
+ interval = "1 day"
3031
+ else:
3032
+ start_time = end_time - timedelta(hours=24) # 默认24小时
3033
+ interval = "1 hour"
3034
+
3035
+ async with await conn.get_pg_session() as session:
3036
+ # 首先检查execution_time的样本数据
3037
+ sample_sql = text("""
3038
+ SELECT
3039
+ tr.execution_time,
3040
+ tr.duration,
3041
+ tr.start_time,
3042
+ tr.end_time,
3043
+ t.queue
3044
+ FROM task_runs tr
3045
+ JOIN tasks t ON tr.stream_id = t.stream_id
3046
+ WHERE t.namespace = :namespace
3047
+ AND tr.end_time >= :start_time
3048
+ AND tr.end_time <= :end_time
3049
+ AND tr.status = 'success'
3050
+ AND (tr.execution_time IS NOT NULL OR tr.duration IS NOT NULL)
3051
+ ORDER BY tr.end_time DESC
3052
+ LIMIT 10
3053
+ """)
3054
+
3055
+ sample_result = await session.execute(sample_sql, {
3056
+ 'namespace': namespace,
3057
+ 'start_time': start_time,
3058
+ 'end_time': end_time
3059
+ })
3060
+
3061
+ logger.info("=== 样本execution_time数据 ===")
3062
+ for sample_row in sample_result:
3063
+ logger.info(f"execution_time={sample_row.execution_time}, duration={sample_row.duration}, "
3064
+ f"queue={sample_row.queue}, start_time={sample_row.start_time}, end_time={sample_row.end_time}")
3065
+
3066
+ # 查询处理时间趋势数据 - 使用duration字段如果execution_time为空
3067
+ sql = text("""
3068
+ WITH processing_stats AS (
3069
+ SELECT
3070
+ date_trunc('hour', tr.end_time) as time_bucket,
3071
+ COALESCE(tr.execution_time, tr.duration) as processing_time,
3072
+ t.queue
3073
+ FROM task_runs tr
3074
+ JOIN tasks t ON tr.stream_id = t.stream_id
3075
+ WHERE t.namespace = :namespace
3076
+ AND tr.end_time >= :start_time
3077
+ AND tr.end_time <= :end_time
3078
+ AND tr.status = 'success'
3079
+ AND (tr.execution_time IS NOT NULL OR tr.duration IS NOT NULL)
3080
+ AND COALESCE(tr.execution_time, tr.duration) > 0
3081
+ )
3082
+ SELECT
3083
+ ps.time_bucket,
3084
+ COUNT(ps.processing_time) as task_count,
3085
+ ROUND(AVG(ps.processing_time)::numeric, 6) as avg_processing_time,
3086
+ ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ps.processing_time)::numeric, 6) as p50_processing_time,
3087
+ ROUND(PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY ps.processing_time)::numeric, 6) as p90_processing_time,
3088
+ ROUND(MIN(ps.processing_time)::numeric, 6) as min_processing_time,
3089
+ ROUND(MAX(ps.processing_time)::numeric, 6) as max_processing_time
3090
+ FROM processing_stats ps
3091
+ GROUP BY ps.time_bucket
3092
+ ORDER BY ps.time_bucket
3093
+ """)
3094
+
3095
+ result = await session.execute(sql, {
3096
+ 'namespace': namespace,
3097
+ 'start_time': start_time,
3098
+ 'end_time': end_time
3099
+ })
3100
+
3101
+ data = []
3102
+ for row in result:
3103
+ # 调试日志
3104
+ logger.info(f"处理时间数据行: time_bucket={row.time_bucket}, task_count={row.task_count}, "
3105
+ f"avg_processing_time={row.avg_processing_time}, "
3106
+ f"p50_processing_time={row.p50_processing_time}, "
3107
+ f"p90_processing_time={row.p90_processing_time}")
3108
+
3109
+ # 将处理时间从秒转换为毫秒
3110
+ data.append({
3111
+ 'time': row.time_bucket.isoformat() if row.time_bucket else None,
3112
+ 'metric': '平均处理时间',
3113
+ 'value': round(float(row.avg_processing_time * 1000), 1) if row.avg_processing_time else 0,
3114
+ 'queue': 'all',
3115
+ 'task_count': row.task_count or 0
3116
+ })
3117
+ data.append({
3118
+ 'time': row.time_bucket.isoformat() if row.time_bucket else None,
3119
+ 'metric': 'P50处理时间',
3120
+ 'value': round(float(row.p50_processing_time * 1000), 1) if row.p50_processing_time else 0,
3121
+ 'queue': 'all',
3122
+ 'task_count': row.task_count or 0
3123
+ })
3124
+ data.append({
3125
+ 'time': row.time_bucket.isoformat() if row.time_bucket else None,
3126
+ 'metric': 'P90处理时间',
3127
+ 'value': round(float(row.p90_processing_time * 1000), 1) if row.p90_processing_time else 0,
3128
+ 'queue': 'all',
3129
+ 'task_count': row.task_count or 0
3130
+ })
3131
+
3132
+ logger.info(f"获取到 {len(data)} 条处理时间趋势数据")
3133
+
3134
+ return {
3135
+ "success": True,
3136
+ "data": data,
3137
+ "time_range": time_range,
3138
+ "start_time": start_time.isoformat(),
3139
+ "end_time": end_time.isoformat()
3140
+ }
3141
+
3142
+ except Exception as e:
3143
+ logger.error(f"获取任务处理时间趋势数据失败: {e}")
3144
+ traceback.print_exc()
3145
+ return {
3146
+ "success": False,
3147
+ "error": str(e),
3148
+ "data": []
3149
+ }
3150
+
3151
+
3152
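PERCENTILE_CONT interpolates between samples, which matches Python's statistics.quantiles with method='inclusive'. A quick way to cross-check the P50/P90 figures the query reports, with made-up sample values:

    import statistics

    samples = [0.8, 1.1, 1.3, 2.4, 9.7]  # execution times in seconds
    q = statistics.quantiles(samples, n=100, method='inclusive')
    p50, p90 = q[49], q[89]              # the 50th and 90th of the 99 cut points
    print(round(p50 * 1000, 1), round(p90 * 1000, 1))  # in ms, as the endpoint returns
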
+ @router.get("/task-status-distribution/{namespace}")
3153
+ async def get_task_status_distribution(
3154
+ namespace: str,
3155
+ time_range: str = "24h",
3156
+ queues: Optional[str] = Query(None, description="逗号分隔的队列名称列表")
3157
+ ):
3158
+ """
3159
+ 获取任务状态分布数据,按队列分组统计
3160
+
3161
+ Args:
3162
+ namespace: 命名空间名称
3163
+ time_range: 时间范围(如'1h', '24h', '7d')
3164
+
3165
+ Returns:
3166
+ 按队列分组的任务状态分布数据,用于饼图展示
3167
+ """
3168
+ try:
3169
+ conn = await data_access.manager.get_connection(namespace)
3170
+
3171
+ if not conn.pg_config:
3172
+ return {"data": []}
3173
+
3174
+ # 计算时间范围
3175
+ end_time = datetime.now(timezone.utc)
3176
+ if time_range.endswith('m'):
3177
+ minutes = int(time_range[:-1])
3178
+ start_time = end_time - timedelta(minutes=minutes)
3179
+ elif time_range.endswith('h'):
3180
+ hours = int(time_range[:-1])
3181
+ start_time = end_time - timedelta(hours=hours)
3182
+ elif time_range.endswith('d'):
3183
+ days = int(time_range[:-1])
3184
+ start_time = end_time - timedelta(days=days)
3185
+ else:
3186
+ start_time = end_time - timedelta(hours=24)
3187
+
3188
+ # 解析队列列表
3189
+ queue_list = []
3190
+ if queues:
3191
+ queue_list = [q.strip() for q in queues.split(',') if q.strip()]
3192
+
3193
+ async with await conn.get_pg_session() as session:
3194
+ # 构建队列筛选条件
3195
+ queue_filter = ""
3196
+ if queue_list:
3197
+ queue_placeholders = ','.join([f':queue_{i}' for i in range(len(queue_list))])
3198
+ queue_filter = f"AND t.queue IN ({queue_placeholders})"
3199
+
3200
+ # 查询任务状态分布数据
3201
+ distribution_sql = text(f"""
3202
+ SELECT
3203
+ t.queue,
3204
+ tr.status,
3205
+ COUNT(*) as count
3206
+ FROM task_runs tr
3207
+ JOIN tasks t ON tr.stream_id = t.stream_id
3208
+ WHERE t.namespace = :namespace
3209
+ AND tr.end_time >= :start_time
3210
+ AND tr.end_time <= :end_time
3211
+ AND tr.status IS NOT NULL
3212
+ {queue_filter}
3213
+ GROUP BY t.queue, tr.status
3214
+ ORDER BY t.queue, tr.status
3215
+ """)
3216
+
3217
+ # 准备查询参数
3218
+ query_params = {
3219
+ 'namespace': namespace,
3220
+ 'start_time': start_time,
3221
+ 'end_time': end_time
3222
+ }
3223
+
3224
+ # 添加队列参数
3225
+ for i, queue in enumerate(queue_list):
3226
+ query_params[f'queue_{i}'] = queue
3227
+
3228
+ result = await session.execute(distribution_sql, query_params)
3229
+
3230
+ # 处理数据,按队列分组
3231
+ queue_data = {}
3232
+ total_by_status = {}
3233
+
3234
+ for row in result.fetchall():
3235
+ queue = row.queue
3236
+ status = row.status
3237
+ count = row.count
3238
+
3239
+ # 按队列统计
3240
+ if queue not in queue_data:
3241
+ queue_data[queue] = {'success': 0, 'error': 0, 'timeout': 0}
3242
+
3243
+ if status == 'success':
3244
+ queue_data[queue]['success'] += count
3245
+ elif status == 'error':
3246
+ queue_data[queue]['error'] += count
3247
+ elif status == 'timeout':
3248
+ queue_data[queue]['timeout'] += count
3249
+
3250
+ # 按状态统计总数
3251
+ if status not in total_by_status:
3252
+ total_by_status[status] = 0
3253
+ total_by_status[status] += count
3254
+
3255
+ # 格式化数据为饼图格式
3256
+ data = []
3257
+
3258
+ # 方案1: 按队列分组显示
3259
+ for queue, counts in queue_data.items():
3260
+ total_queue = counts['success'] + counts['error'] + counts['timeout']
3261
+ if total_queue > 0:
3262
+ data.append({
3263
+ 'type': f'{queue} (成功)',
3264
+ 'value': counts['success'],
3265
+ 'queue': queue,
3266
+ 'status': 'success'
3267
+ })
3268
+ if counts['error'] > 0:
3269
+ data.append({
3270
+ 'type': f'{queue} (失败)',
3271
+ 'value': counts['error'],
3272
+ 'queue': queue,
3273
+ 'status': 'error'
3274
+ })
3275
+ if counts['timeout'] > 0:
3276
+ data.append({
3277
+ 'type': f'{queue} (超时)',
3278
+ 'value': counts['timeout'],
3279
+ 'queue': queue,
3280
+ 'status': 'timeout'
3281
+ })
3282
+
3283
+ # 如果没有数据,返回默认值
3284
+ if not data:
3285
+ data = [
3286
+ {'type': '暂无数据', 'value': 1, 'queue': '', 'status': 'empty'}
3287
+ ]
3288
+
3289
+ return {"data": data}
3290
+
3291
+ except Exception as e:
3292
+ logger.error(f"获取任务状态分布失败: {e}")
3293
+ traceback.print_exc()
3294
+ raise HTTPException(status_code=500, detail=str(e))
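
The grouping loop above can be expressed more compactly with a defaultdict keyed by queue; an equivalent of the same aggregation, assuming rows of (queue, status, count) tuples:

    from collections import defaultdict

    def group_by_queue(rows):
        queue_data = defaultdict(lambda: {'success': 0, 'error': 0, 'timeout': 0})
        for queue, status, count in rows:
            if status in ('success', 'error', 'timeout'):
                queue_data[queue][status] += count
        return dict(queue_data)

    group_by_queue([('emails', 'success', 40), ('emails', 'error', 2)])
    # -> {'emails': {'success': 40, 'error': 2, 'timeout': 0}}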