jettask 0.2.20__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. jettask/__init__.py +4 -0
  2. jettask/cli.py +12 -8
  3. jettask/config/lua_scripts.py +37 -0
  4. jettask/config/nacos_config.py +1 -1
  5. jettask/core/app.py +313 -340
  6. jettask/core/container.py +4 -4
  7. jettask/{persistence → core}/namespace.py +93 -27
  8. jettask/core/task.py +16 -9
  9. jettask/core/unified_manager_base.py +136 -26
  10. jettask/db/__init__.py +67 -0
  11. jettask/db/base.py +137 -0
  12. jettask/{utils/db_connector.py → db/connector.py} +130 -26
  13. jettask/db/models/__init__.py +16 -0
  14. jettask/db/models/scheduled_task.py +196 -0
  15. jettask/db/models/task.py +77 -0
  16. jettask/db/models/task_run.py +85 -0
  17. jettask/executor/__init__.py +0 -15
  18. jettask/executor/core.py +76 -31
  19. jettask/executor/process_entry.py +29 -114
  20. jettask/executor/task_executor.py +4 -0
  21. jettask/messaging/event_pool.py +928 -685
  22. jettask/messaging/scanner.py +30 -0
  23. jettask/persistence/__init__.py +28 -103
  24. jettask/persistence/buffer.py +170 -0
  25. jettask/persistence/consumer.py +330 -249
  26. jettask/persistence/manager.py +304 -0
  27. jettask/persistence/persistence.py +391 -0
  28. jettask/scheduler/__init__.py +15 -3
  29. jettask/scheduler/{task_crud.py → database.py} +61 -57
  30. jettask/scheduler/loader.py +2 -2
  31. jettask/scheduler/{scheduler_coordinator.py → manager.py} +23 -6
  32. jettask/scheduler/models.py +14 -10
  33. jettask/scheduler/schedule.py +166 -0
  34. jettask/scheduler/scheduler.py +12 -11
  35. jettask/schemas/__init__.py +50 -1
  36. jettask/schemas/backlog.py +43 -6
  37. jettask/schemas/namespace.py +70 -19
  38. jettask/schemas/queue.py +19 -3
  39. jettask/schemas/responses.py +493 -0
  40. jettask/task/__init__.py +0 -2
  41. jettask/task/router.py +3 -0
  42. jettask/test_connection_monitor.py +1 -1
  43. jettask/utils/__init__.py +7 -5
  44. jettask/utils/db_init.py +8 -4
  45. jettask/utils/namespace_dep.py +167 -0
  46. jettask/utils/queue_matcher.py +186 -0
  47. jettask/utils/rate_limit/concurrency_limiter.py +7 -1
  48. jettask/utils/stream_backlog.py +1 -1
  49. jettask/webui/__init__.py +0 -1
  50. jettask/webui/api/__init__.py +4 -4
  51. jettask/webui/api/alerts.py +806 -71
  52. jettask/webui/api/example_refactored.py +400 -0
  53. jettask/webui/api/namespaces.py +390 -45
  54. jettask/webui/api/overview.py +300 -54
  55. jettask/webui/api/queues.py +971 -267
  56. jettask/webui/api/scheduled.py +1249 -56
  57. jettask/webui/api/settings.py +129 -7
  58. jettask/webui/api/workers.py +442 -0
  59. jettask/webui/app.py +46 -2329
  60. jettask/webui/middleware/__init__.py +6 -0
  61. jettask/webui/middleware/namespace_middleware.py +135 -0
  62. jettask/webui/services/__init__.py +146 -0
  63. jettask/webui/services/heartbeat_service.py +251 -0
  64. jettask/webui/services/overview_service.py +60 -51
  65. jettask/webui/services/queue_monitor_service.py +426 -0
  66. jettask/webui/services/redis_monitor_service.py +87 -0
  67. jettask/webui/services/settings_service.py +174 -111
  68. jettask/webui/services/task_monitor_service.py +222 -0
  69. jettask/webui/services/timeline_pg_service.py +452 -0
  70. jettask/webui/services/timeline_service.py +189 -0
  71. jettask/webui/services/worker_monitor_service.py +467 -0
  72. jettask/webui/utils/__init__.py +11 -0
  73. jettask/webui/utils/time_utils.py +122 -0
  74. jettask/worker/lifecycle.py +8 -2
  75. {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/METADATA +1 -1
  76. jettask-0.2.24.dist-info/RECORD +142 -0
  77. jettask/executor/executor.py +0 -338
  78. jettask/persistence/backlog_monitor.py +0 -567
  79. jettask/persistence/base.py +0 -2334
  80. jettask/persistence/db_manager.py +0 -516
  81. jettask/persistence/maintenance.py +0 -81
  82. jettask/persistence/message_consumer.py +0 -259
  83. jettask/persistence/models.py +0 -49
  84. jettask/persistence/offline_recovery.py +0 -196
  85. jettask/persistence/queue_discovery.py +0 -215
  86. jettask/persistence/task_persistence.py +0 -218
  87. jettask/persistence/task_updater.py +0 -583
  88. jettask/scheduler/add_execution_count.sql +0 -11
  89. jettask/scheduler/add_priority_field.sql +0 -26
  90. jettask/scheduler/add_scheduler_id.sql +0 -25
  91. jettask/scheduler/add_scheduler_id_index.sql +0 -10
  92. jettask/scheduler/make_scheduler_id_required.sql +0 -28
  93. jettask/scheduler/migrate_interval_seconds.sql +0 -9
  94. jettask/scheduler/performance_optimization.sql +0 -45
  95. jettask/scheduler/run_scheduler.py +0 -186
  96. jettask/scheduler/schema.sql +0 -84
  97. jettask/task/task_executor.py +0 -318
  98. jettask/webui/api/analytics.py +0 -323
  99. jettask/webui/config.py +0 -90
  100. jettask/webui/models/__init__.py +0 -3
  101. jettask/webui/models/namespace.py +0 -63
  102. jettask/webui/namespace_manager/__init__.py +0 -10
  103. jettask/webui/namespace_manager/multi.py +0 -593
  104. jettask/webui/namespace_manager/unified.py +0 -193
  105. jettask/webui/run.py +0 -46
  106. jettask-0.2.20.dist-info/RECORD +0 -145
  107. {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/WHEEL +0 -0
  108. {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/entry_points.txt +0 -0
  109. {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/licenses/LICENSE +0 -0
  110. {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/top_level.txt +0 -0
@@ -3,9 +3,11 @@
3
3
  提供轻量级的路由入口,业务逻辑在 SettingsService 中实现
4
4
  """
5
5
  from fastapi import APIRouter, HTTPException
6
+ from typing import Dict, Any
6
7
  import logging
7
8
  import traceback
8
9
 
10
+ from jettask.schemas import SystemSettingsResponse, DatabaseStatusResponse
9
11
  from jettask.webui.services.settings_service import SettingsService
10
12
 
11
13
  logger = logging.getLogger(__name__)
@@ -16,11 +18,71 @@ router = APIRouter(prefix="/settings", tags=["settings"])
16
18
 
17
19
  # ============ 系统配置接口 ============
18
20
 
19
- @router.get("/system")
20
- async def get_system_settings():
21
+ @router.get(
22
+ "/system",
23
+ summary="获取系统配置信息",
24
+ description="获取系统级别的配置信息,包括 API 版本、服务名称、运行环境、数据库状态等",
25
+ response_model=SystemSettingsResponse,
26
+ responses={
27
+ 200: {
28
+ "description": "成功返回系统配置信息"
29
+ },
30
+ 500: {
31
+ "description": "服务器内部错误",
32
+ "content": {
33
+ "application/json": {
34
+ "example": {
35
+ "detail": "获取系统配置失败: Configuration error"
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ )
42
+ async def get_system_settings() -> Dict[str, Any]:
21
43
  """
22
- 获取系统配置信息
23
- 返回系统级别的配置,如数据库连接信息、API配置等
44
+ ## 获取系统配置信息
45
+
46
+ 返回 JetTask WebUI 的系统级配置信息,用于系统管理和监控。
47
+
48
+ **配置信息包括**:
49
+ - API 版本号
50
+ - 服务名称
51
+ - 运行环境 (development/staging/production)
52
+ - 调试模式状态
53
+ - 数据库连接状态和配置
54
+
55
+ **使用场景**:
56
+ - 系统设置页面
57
+ - 运维监控
58
+ - 故障排查
59
+ - 环境验证
60
+
61
+ **示例请求**:
62
+ ```bash
63
+ curl -X GET "http://localhost:8001/api/v1/settings/system"
64
+ ```
65
+
66
+ **示例响应**:
67
+ ```json
68
+ {
69
+ "success": true,
70
+ "data": {
71
+ "api_version": "v1",
72
+ "service_name": "JetTask WebUI",
73
+ "environment": "development",
74
+ "debug_mode": true,
75
+ "database": {
76
+ "connected": true,
77
+ "host": "localhost",
78
+ "port": 5432,
79
+ "database": "jettask",
80
+ "pool_size": 10,
81
+ "active_connections": 3
82
+ }
83
+ }
84
+ }
85
+ ```
24
86
  """
25
87
  try:
26
88
  return SettingsService.get_system_settings()
@@ -29,10 +91,70 @@ async def get_system_settings():
29
91
  raise HTTPException(status_code=500, detail=str(e))
30
92
 
31
93
 
32
- @router.get("/database-status")
33
- async def check_database_status():
94
+ @router.get(
95
+ "/database-status",
96
+ summary="检查数据库连接状态",
97
+ description="检查数据库(PostgreSQL/MySQL)的连接状态和性能指标",
98
+ response_model=DatabaseStatusResponse,
99
+ responses={
100
+ 200: {
101
+ "description": "成功返回数据库状态"
102
+ },
103
+ 500: {
104
+ "description": "服务器内部错误或数据库连接失败",
105
+ "content": {
106
+ "application/json": {
107
+ "example": {
108
+ "detail": "数据库状态检查失败: Connection refused"
109
+ }
110
+ }
111
+ }
112
+ }
113
+ }
114
+ )
115
+ async def check_database_status() -> Dict[str, Any]:
34
116
  """
35
- 检查数据库连接状态
117
+ ## 检查数据库连接状态
118
+
119
+ 检查 JetTask 数据库的连接状态、配置信息和性能指标。
120
+
121
+ **返回信息包括**:
122
+ - 连接状态 (connected: true/false)
123
+ - 主机地址和端口
124
+ - 数据库名称
125
+ - 连接池大小
126
+ - 当前活跃连接数
127
+
128
+ **使用场景**:
129
+ - 系统健康检查
130
+ - 数据库监控
131
+ - 故障诊断
132
+ - 容量规划
133
+
134
+ **示例请求**:
135
+ ```bash
136
+ curl -X GET "http://localhost:8001/api/v1/settings/database-status"
137
+ ```
138
+
139
+ **示例响应**:
140
+ ```json
141
+ {
142
+ "success": true,
143
+ "data": {
144
+ "connected": true,
145
+ "host": "localhost",
146
+ "port": 5432,
147
+ "database": "jettask",
148
+ "pool_size": 10,
149
+ "active_connections": 3
150
+ }
151
+ }
152
+ ```
153
+
154
+ **注意事项**:
155
+ - 此接口会实际连接数据库进行检查
156
+ - 如果数据库不可用,将返回 500 错误
157
+ - 建议配置监控告警
36
158
  """
37
159
  try:
38
160
  return await SettingsService.check_database_status()
@@ -0,0 +1,442 @@
1
+ """
2
+ Worker 监控模块 - Worker 状态监控、心跳管理、离线历史
3
+
4
+ 提供 Worker 相关的监控和管理功能
5
+ """
6
+ from fastapi import APIRouter, HTTPException, Request, Query, Path
7
+ from typing import Optional, List, Dict, Any
8
+ import logging
9
+
10
+ from jettask.schemas import (
11
+ WorkersResponse,
12
+ WorkerSummaryResponse,
13
+ WorkerOfflineHistoryResponse
14
+ )
15
+
16
+ router = APIRouter(prefix="/workers", tags=["workers"])
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # ============ Worker 监控 ============
21
+
22
# NOTE(review): this two-segment pattern also matches "/heartbeat/stats" and
# "/heartbeat/{worker_id}", which are declared further down on the same
# router. Routes are tried in registration order, so those handlers look
# unreachable -- verify, and declare the fixed-prefix routes first if so.
@router.get(
    "/{namespace}/{queue_name}",
    summary="获取队列的 Worker 列表",
    description="获取指定队列所有 Worker 的实时心跳信息和状态",
    response_model=WorkersResponse,
    responses={
        200: {
            "description": "成功返回 Worker 列表"
        },
        500: {
            "description": "服务器内部错误",
            "content": {
                "application/json": {
                    "example": {"detail": "Monitor service not initialized"}
                }
            }
        }
    }
)
async def get_queue_workers(
    request: Request,
    namespace: str = Path(..., description="命名空间名称", example="default"),
    queue_name: str = Path(..., description="队列名称", example="email_queue")
) -> Dict[str, Any]:
    """Return the heartbeat records of every worker serving one queue.

    The data comes from ``request.app.state.monitor.get_worker_heartbeats``,
    called with ``queue_name`` only. ``namespace`` is echoed back in the
    response but is not passed to the monitor.

    Args:
        request: Incoming request; used to reach ``app.state.monitor``.
        namespace: Namespace taken from the URL path.
        queue_name: Queue whose workers are listed.

    Returns:
        A dict with the keys ``success``, ``namespace``, ``queue`` and
        ``workers``.

    Raises:
        HTTPException: 500 when the monitor is not attached to the
            application state, or when the monitor call fails.

    Example:
        curl -X GET "http://localhost:8001/api/v1/workers/default/email_queue"
    """
    try:
        # The monitor instance is expected to be attached to app.state.
        if not hasattr(request.app.state, 'monitor'):
            raise HTTPException(status_code=500, detail="Monitor service not initialized")

        monitor = request.app.state.monitor
        workers = await monitor.get_worker_heartbeats(queue_name)

        return {
            "success": True,
            "namespace": namespace,
            "queue": queue_name,
            "workers": workers
        }
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client with
        # their original detail instead of being re-wrapped below.
        raise
    except Exception as e:
        logger.error(f"获取队列 {queue_name} 的 Worker 信息失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
92
+
93
+
94
@router.get(
    "/{namespace}/{queue_name}/summary",
    summary="获取队列 Worker 汇总统计",
    description="获取指定队列 Worker 的汇总统计信息,包括总数、在线数、离线数等",
    response_model=WorkerSummaryResponse,
    responses={
        200: {
            "description": "成功返回汇总统计"
        },
        500: {
            "description": "服务器内部错误"
        }
    }
)
async def get_queue_worker_summary(
    request: Request,
    namespace: str = Path(..., description="命名空间名称", example="default"),
    queue_name: str = Path(..., description="队列名称", example="email_queue"),
    fast: bool = Query(False, description="是否使用快速模式(不包含历史数据)", example=False)
) -> Dict[str, Any]:
    """Return aggregate worker statistics for one queue.

    ``fast`` selects which monitor method produces the summary:
    ``get_queue_worker_summary_fast`` when true, otherwise
    ``get_queue_worker_summary``. Both receive ``queue_name`` only;
    ``namespace`` is echoed back in the response but not passed on.

    Args:
        request: Incoming request; used to reach ``app.state.monitor``.
        namespace: Namespace taken from the URL path.
        queue_name: Queue whose workers are summarised.
        fast: Use the fast summary variant. The parameter description says
            it omits historical data; that behaviour lives in the monitor
            and is not visible in this module.

    Returns:
        A dict with the keys ``success``, ``namespace``, ``queue`` and
        ``summary``.

    Raises:
        HTTPException: 500 when the monitor is not attached to the
            application state, or when the monitor call fails.

    Example:
        curl -X GET "http://localhost:8001/api/v1/workers/default/email_queue/summary"
        curl -X GET "http://localhost:8001/api/v1/workers/default/email_queue/summary?fast=true"
    """
    try:
        if not hasattr(request.app.state, 'monitor'):
            raise HTTPException(status_code=500, detail="Monitor service not initialized")

        monitor = request.app.state.monitor

        if fast:
            summary = await monitor.get_queue_worker_summary_fast(queue_name)
        else:
            summary = await monitor.get_queue_worker_summary(queue_name)

        return {
            "success": True,
            "namespace": namespace,
            "queue": queue_name,
            "summary": summary
        }
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client with
        # their original detail instead of being re-wrapped below.
        raise
    except Exception as e:
        logger.error(f"获取队列 {queue_name} 的 Worker 汇总统计失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
170
+
171
+
172
@router.get(
    "/offline-history",
    summary="获取全局 Worker 离线历史",
    description="获取所有 Worker 的离线历史记录,支持时间范围筛选",
    response_model=WorkerOfflineHistoryResponse,
    responses={
        200: {
            "description": "成功返回离线历史"
        },
        500: {
            "description": "服务器内部错误"
        }
    }
)
async def get_workers_offline_history(
    request: Request,
    limit: int = Query(100, ge=1, le=1000, description="返回记录数量限制", example=100),
    start_time: Optional[float] = Query(None, description="开始时间戳(Unix 时间戳)", example=1697644800),
    end_time: Optional[float] = Query(None, description="结束时间戳(Unix 时间戳)", example=1697731200)
) -> Dict[str, Any]:
    """Return worker offline-history records without any queue filter.

    The three query parameters are forwarded positionally to
    ``request.app.state.monitor.get_worker_offline_history``.

    Args:
        request: Incoming request; used to reach ``app.state.monitor``.
        limit: Maximum number of records requested (1-1000, default 100).
        start_time: Optional lower bound as a Unix timestamp.
        end_time: Optional upper bound as a Unix timestamp.

    Returns:
        A dict with the keys ``success``, ``history`` and ``total``, where
        ``total`` is the number of records in this response.

    Raises:
        HTTPException: 500 when the monitor is not attached to the
            application state, or when the monitor call fails.

    Example:
        curl -X GET "http://localhost:8001/api/v1/workers/offline-history?limit=100"
        curl -X GET "http://localhost:8001/api/v1/workers/offline-history?start_time=1697644800&end_time=1697731200&limit=50"

    Note:
        The original notes say records are ordered by offline time,
        newest first; ordering is decided by the monitor and cannot be
        confirmed from this module.
    """
    try:
        if not hasattr(request.app.state, 'monitor'):
            raise HTTPException(status_code=500, detail="Monitor service not initialized")

        monitor = request.app.state.monitor
        history = await monitor.get_worker_offline_history(limit, start_time, end_time)

        return {
            "success": True,
            "history": history,
            "total": len(history)
        }
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client with
        # their original detail instead of being re-wrapped below.
        raise
    except Exception as e:
        logger.error(f"获取 Worker 离线历史失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
240
+
241
+
242
@router.get(
    "/{namespace}/{queue_name}/offline-history",
    summary="获取队列 Worker 离线历史",
    description="获取指定队列的 Worker 离线历史记录",
    response_model=WorkerOfflineHistoryResponse,
    responses={
        200: {
            "description": "成功返回队列离线历史"
        },
        500: {
            "description": "服务器内部错误"
        }
    }
)
async def get_queue_workers_offline_history(
    request: Request,
    namespace: str = Path(..., description="命名空间名称", example="default"),
    queue_name: str = Path(..., description="队列名称", example="email_queue"),
    limit: int = Query(100, ge=1, le=1000, description="返回记录数量限制", example=100),
    start_time: Optional[float] = Query(None, description="开始时间戳(Unix 时间戳)"),
    end_time: Optional[float] = Query(None, description="结束时间戳(Unix 时间戳)")
) -> Dict[str, Any]:
    """Return worker offline-history records that involve one queue.

    The monitor offers no per-queue query here, so this handler fetches
    ``limit * 10`` unfiltered records, keeps those whose comma-separated
    ``queues`` field contains ``queue_name``, and truncates to ``limit``.
    A worker serving several queues is included when any of them matches.
    ``namespace`` is echoed back in the response but not used for
    filtering.

    Args:
        request: Incoming request; used to reach ``app.state.monitor``.
        namespace: Namespace taken from the URL path.
        queue_name: Queue to filter the records by.
        limit: Maximum number of records returned (1-1000, default 100).
        start_time: Optional lower bound as a Unix timestamp.
        end_time: Optional upper bound as a Unix timestamp.

    Returns:
        A dict with the keys ``success``, ``namespace``, ``queue``,
        ``history`` and ``total``. ``total`` counts the records returned
        after truncation, not every match that exists.

    Raises:
        HTTPException: 500 when the monitor is not attached to the
            application state, or when the monitor call fails.

    Example:
        curl -X GET "http://localhost:8001/api/v1/workers/default/email_queue/offline-history?limit=50"
    """
    try:
        if not hasattr(request.app.state, 'monitor'):
            raise HTTPException(status_code=500, detail="Monitor service not initialized")

        monitor = request.app.state.monitor

        # Over-fetch by 10x and filter locally. This is a heuristic: when
        # matching records are sparse, fewer than `limit` may be returned
        # even though older matches exist.
        all_history = await monitor.get_worker_offline_history(limit * 10, start_time, end_time)
        queue_history = [
            record for record in all_history
            # `or ''` keeps a record whose 'queues' value is None from
            # failing the whole request; such a record simply never matches.
            if queue_name in (record.get('queues') or '').split(',')
        ][:limit]

        return {
            "success": True,
            "namespace": namespace,
            "queue": queue_name,
            "history": queue_history,
            "total": len(queue_history)
        }
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client with
        # their original detail instead of being re-wrapped below.
        raise
    except Exception as e:
        logger.error(f"获取队列 {queue_name} 的 Worker 离线历史失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
306
+
307
+
308
+ # ============ Worker 心跳监控 ============
309
+
310
# NOTE(review): "/heartbeat/stats" also matches the "/{namespace}/{queue_name}"
# route declared earlier on this router. Routes are tried in registration
# order, so this handler is probably shadowed -- verify, and move the
# fixed-prefix routes above the parameterised one if so.
@router.get(
    "/heartbeat/stats",
    summary="获取心跳监控统计",
    description="获取所有 Worker 的心跳监控统计信息",
    responses={
        200: {
            "description": "成功返回心跳统计",
            "content": {
                "application/json": {
                    "example": {
                        "total_workers": 50,
                        "online_workers": 45,
                        "offline_workers": 3,
                        "timeout_workers": 2
                    }
                }
            }
        },
        500: {
            "description": "服务器内部错误"
        }
    }
)
async def get_heartbeat_stats(request: Request) -> Dict[str, Any]:
    """Return heartbeat statistics across all workers.

    The result of ``request.app.state.monitor.get_heartbeat_stats()`` is
    returned as-is, without the ``success`` envelope that the queue-scoped
    endpoints in this module use.

    Args:
        request: Incoming request; used to reach ``app.state.monitor``.

    Returns:
        Whatever the monitor reports. The documented example contains
        ``total_workers``, ``online_workers``, ``offline_workers`` and
        ``timeout_workers``.

    Raises:
        HTTPException: 500 when the monitor is not attached to the
            application state, or when the monitor call fails.

    Example:
        curl -X GET "http://localhost:8001/api/v1/workers/heartbeat/stats"
    """
    try:
        if not hasattr(request.app.state, 'monitor'):
            raise HTTPException(status_code=500, detail="Monitor service not initialized")

        monitor = request.app.state.monitor
        stats = await monitor.get_heartbeat_stats()

        return stats
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client with
        # their original detail instead of being re-wrapped below.
        raise
    except Exception as e:
        logger.error(f"获取心跳统计信息失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
372
+
373
+
374
+ @router.get(
375
+ "/heartbeat/{worker_id}",
376
+ summary="检查 Worker 心跳状态",
377
+ description="检查指定 Worker 的心跳状态,判断是否在线",
378
+ responses={
379
+ 200: {
380
+ "description": "成功返回心跳状态",
381
+ "content": {
382
+ "application/json": {
383
+ "example": {
384
+ "worker_id": "worker-001",
385
+ "is_online": True
386
+ }
387
+ }
388
+ }
389
+ },
390
+ 500: {
391
+ "description": "服务器内部错误"
392
+ }
393
+ }
394
+ )
395
+ async def check_worker_heartbeat(
396
+ request: Request,
397
+ worker_id: str = Path(..., description="Worker ID", example="worker-001")
398
+ ) -> Dict[str, Any]:
399
+ """
400
+ ## 检查 Worker 心跳状态
401
+
402
+ 检查指定 Worker 的心跳状态,判断该 Worker 是否在线。
403
+
404
+ **返回信息**:
405
+ - Worker ID
406
+ - 是否在线(true/false)
407
+
408
+ **使用场景**:
409
+ - 故障诊断
410
+ - Worker 健康检查
411
+ - 自动化运维脚本
412
+ - 监控告警
413
+
414
+ **示例请求**:
415
+ ```bash
416
+ curl -X GET "http://localhost:8001/api/v1/workers/heartbeat/worker-001"
417
+ ```
418
+
419
+ **判断逻辑**:
420
+ - 如果 Worker 最后心跳时间在超时时间内,返回 `is_online: true`
421
+ - 如果超过超时时间,返回 `is_online: false`
422
+ - 如果从未收到心跳,返回 `is_online: false`
423
+
424
+ **注意事项**:
425
+ - 在线状态基于最后心跳时间判断
426
+ - 默认心跳超时时间为30秒(可配置)
427
+ - Worker ID 区分大小写
428
+ """
429
+ try:
430
+ if not hasattr(request.app.state, 'monitor'):
431
+ raise HTTPException(status_code=500, detail="Monitor service not initialized")
432
+
433
+ monitor = request.app.state.monitor
434
+ is_online = await monitor.check_worker_heartbeat(worker_id)
435
+
436
+ return {
437
+ "worker_id": worker_id,
438
+ "is_online": is_online
439
+ }
440
+ except Exception as e:
441
+ logger.error(f"检查 Worker {worker_id} 心跳状态失败: {e}", exc_info=True)
442
+ raise HTTPException(status_code=500, detail=str(e))