jettask 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +2 -0
- jettask/cli.py +12 -8
- jettask/config/lua_scripts.py +37 -0
- jettask/config/nacos_config.py +1 -1
- jettask/core/app.py +313 -340
- jettask/core/container.py +4 -4
- jettask/{persistence → core}/namespace.py +93 -27
- jettask/core/task.py +16 -9
- jettask/core/unified_manager_base.py +136 -26
- jettask/db/__init__.py +67 -0
- jettask/db/base.py +137 -0
- jettask/{utils/db_connector.py → db/connector.py} +130 -26
- jettask/db/models/__init__.py +16 -0
- jettask/db/models/scheduled_task.py +196 -0
- jettask/db/models/task.py +77 -0
- jettask/db/models/task_run.py +85 -0
- jettask/executor/__init__.py +0 -15
- jettask/executor/core.py +76 -31
- jettask/executor/process_entry.py +29 -114
- jettask/executor/task_executor.py +4 -0
- jettask/messaging/event_pool.py +928 -685
- jettask/messaging/scanner.py +30 -0
- jettask/persistence/__init__.py +28 -103
- jettask/persistence/buffer.py +170 -0
- jettask/persistence/consumer.py +330 -249
- jettask/persistence/manager.py +304 -0
- jettask/persistence/persistence.py +391 -0
- jettask/scheduler/__init__.py +15 -3
- jettask/scheduler/{task_crud.py → database.py} +61 -57
- jettask/scheduler/loader.py +2 -2
- jettask/scheduler/{scheduler_coordinator.py → manager.py} +23 -6
- jettask/scheduler/models.py +14 -10
- jettask/scheduler/schedule.py +166 -0
- jettask/scheduler/scheduler.py +12 -11
- jettask/schemas/__init__.py +50 -1
- jettask/schemas/backlog.py +43 -6
- jettask/schemas/namespace.py +70 -19
- jettask/schemas/queue.py +19 -3
- jettask/schemas/responses.py +493 -0
- jettask/task/__init__.py +0 -2
- jettask/task/router.py +3 -0
- jettask/test_connection_monitor.py +1 -1
- jettask/utils/__init__.py +7 -5
- jettask/utils/db_init.py +8 -4
- jettask/utils/namespace_dep.py +167 -0
- jettask/utils/queue_matcher.py +186 -0
- jettask/utils/rate_limit/concurrency_limiter.py +7 -1
- jettask/utils/stream_backlog.py +1 -1
- jettask/webui/__init__.py +0 -1
- jettask/webui/api/__init__.py +4 -4
- jettask/webui/api/alerts.py +806 -71
- jettask/webui/api/example_refactored.py +400 -0
- jettask/webui/api/namespaces.py +390 -45
- jettask/webui/api/overview.py +300 -54
- jettask/webui/api/queues.py +971 -267
- jettask/webui/api/scheduled.py +1249 -56
- jettask/webui/api/settings.py +129 -7
- jettask/webui/api/workers.py +442 -0
- jettask/webui/app.py +46 -2329
- jettask/webui/middleware/__init__.py +6 -0
- jettask/webui/middleware/namespace_middleware.py +135 -0
- jettask/webui/services/__init__.py +146 -0
- jettask/webui/services/heartbeat_service.py +251 -0
- jettask/webui/services/overview_service.py +60 -51
- jettask/webui/services/queue_monitor_service.py +426 -0
- jettask/webui/services/redis_monitor_service.py +87 -0
- jettask/webui/services/settings_service.py +174 -111
- jettask/webui/services/task_monitor_service.py +222 -0
- jettask/webui/services/timeline_pg_service.py +452 -0
- jettask/webui/services/timeline_service.py +189 -0
- jettask/webui/services/worker_monitor_service.py +467 -0
- jettask/webui/utils/__init__.py +11 -0
- jettask/webui/utils/time_utils.py +122 -0
- jettask/worker/lifecycle.py +8 -2
- {jettask-0.2.23.dist-info → jettask-0.2.24.dist-info}/METADATA +1 -1
- jettask-0.2.24.dist-info/RECORD +142 -0
- jettask/executor/executor.py +0 -338
- jettask/persistence/backlog_monitor.py +0 -567
- jettask/persistence/base.py +0 -2334
- jettask/persistence/db_manager.py +0 -516
- jettask/persistence/maintenance.py +0 -81
- jettask/persistence/message_consumer.py +0 -259
- jettask/persistence/models.py +0 -49
- jettask/persistence/offline_recovery.py +0 -196
- jettask/persistence/queue_discovery.py +0 -215
- jettask/persistence/task_persistence.py +0 -218
- jettask/persistence/task_updater.py +0 -583
- jettask/scheduler/add_execution_count.sql +0 -11
- jettask/scheduler/add_priority_field.sql +0 -26
- jettask/scheduler/add_scheduler_id.sql +0 -25
- jettask/scheduler/add_scheduler_id_index.sql +0 -10
- jettask/scheduler/make_scheduler_id_required.sql +0 -28
- jettask/scheduler/migrate_interval_seconds.sql +0 -9
- jettask/scheduler/performance_optimization.sql +0 -45
- jettask/scheduler/run_scheduler.py +0 -186
- jettask/scheduler/schema.sql +0 -84
- jettask/task/task_executor.py +0 -318
- jettask/webui/api/analytics.py +0 -323
- jettask/webui/config.py +0 -90
- jettask/webui/models/__init__.py +0 -3
- jettask/webui/models/namespace.py +0 -63
- jettask/webui/namespace_manager/__init__.py +0 -10
- jettask/webui/namespace_manager/multi.py +0 -593
- jettask/webui/namespace_manager/unified.py +0 -193
- jettask/webui/run.py +0 -46
- jettask-0.2.23.dist-info/RECORD +0 -145
- {jettask-0.2.23.dist-info → jettask-0.2.24.dist-info}/WHEEL +0 -0
- {jettask-0.2.23.dist-info → jettask-0.2.24.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.23.dist-info → jettask-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.23.dist-info → jettask-0.2.24.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
"""
|
2
|
+
Namespace 中间件 - 自动注入命名空间上下文
|
3
|
+
|
4
|
+
这个中间件会自动检测路由中的 {namespace} 参数,并将 NamespaceContext 注入到 request.state.ns
|
5
|
+
这样所有路由都无需手动使用 Depends(get_namespace_context),直接访问 request.state.ns 即可
|
6
|
+
"""
|
7
|
+
import logging
|
8
|
+
import re
|
9
|
+
from starlette.middleware.base import BaseHTTPMiddleware
|
10
|
+
from starlette.requests import Request
|
11
|
+
from starlette.responses import JSONResponse
|
12
|
+
from typing import Callable
|
13
|
+
|
14
|
+
from jettask.utils.namespace_dep import NamespaceContext
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class NamespaceMiddleware(BaseHTTPMiddleware):
    """
    Middleware that injects a namespace context into each request.

    What it does:
    1. Extracts the namespace segment from the request path.
    2. Asks the application's namespace manager for that namespace's connection.
    3. Stores a ``NamespaceContext`` on ``request.state.ns``.
    4. Answers with a JSON error response when the namespace cannot be resolved.

    Usage:
    ```python
    # register in app.py
    app.add_middleware(NamespaceMiddleware)

    # use in a route
    @router.get("/{namespace}/queues")
    async def get_queues(request: Request):
        ns = request.state.ns  # injected by the middleware
        redis_client = await ns.get_redis_client()
        # ... business logic
    ```
    """

    # Path prefixes that never carry a namespace; matching requests are
    # forwarded untouched.
    EXCLUDED_PATHS = [
        '/api/v1/namespaces',  # namespace management itself
        '/api/v1/overview/',   # root path (health checks, etc.)
        '/docs',               # API docs
        '/openapi.json',       # OpenAPI schema
        '/redoc',              # ReDoc docs
        '/health',             # health check
    ]

    # Values of the second path segment that are fixed route parts rather than
    # a namespace, e.g. /api/v1/queues/redis/monitor/{namespace}.
    NON_NAMESPACE_SEGMENTS = frozenset({'redis', 'tasks-v2', 'statistics'})

    # /api/v1/<resource>/{namespace}/...
    _NAMESPACE_PATTERN = re.compile(r'/api/v1/[^/]+/([^/]+)')
    # /api/v1/<resource>/<fixed>/<fixed>/{namespace}
    _NESTED_NAMESPACE_PATTERN = re.compile(r'/api/v1/[^/]+/[^/]+/[^/]+/([^/]+)')

    def _extract_namespace(self, path: str):
        """Return the namespace segment found in ``path``, or ``None`` if there is none."""
        match = self._NAMESPACE_PATTERN.search(path)
        if match is None:
            return None

        namespace = match.group(1)
        if namespace not in self.NON_NAMESPACE_SEGMENTS:
            return namespace

        # The second segment is a fixed route part; the namespace, if present,
        # sits two segments further down.
        nested = self._NESTED_NAMESPACE_PATTERN.search(path)
        return nested.group(1) if nested else None

    async def dispatch(self, request: Request, call_next: Callable):
        """
        Resolve the namespace for the request and attach it to ``request.state.ns``.

        Args:
            request: the incoming HTTP request
            call_next: the next middleware or route handler

        Returns:
            The downstream response, or a JSON error response:
            404 when ``get_connection`` raises ``ValueError`` (namespace missing
            or misconfigured), 500 when the namespace data access object is not
            set on the application state or any other error occurs.
        """
        path = request.url.path

        # Excluded prefixes and paths without a namespace segment pass through.
        if path.startswith(tuple(self.EXCLUDED_PATHS)):
            return await call_next(request)

        namespace = self._extract_namespace(path)
        if namespace is None:
            return await call_next(request)

        if not hasattr(request.app.state, 'namespace_data_access'):
            logger.error("namespace_data_access 未初始化")
            return JSONResponse(
                status_code=500,
                content={"detail": "Namespace data access not initialized"}
            )

        manager = request.app.state.namespace_data_access.manager

        try:
            connection = await manager.get_connection(namespace)

            request.state.ns = NamespaceContext(
                namespace_name=namespace,
                connection=connection,
                manager=manager
            )

            logger.debug(f"已为请求 {path} 注入命名空间上下文: {namespace}")

        except ValueError as e:
            logger.warning(f"命名空间 '{namespace}' 不存在或配置错误: {e}")
            return JSONResponse(
                status_code=404,
                content={"detail": f"命名空间 '{namespace}' 不存在或配置错误"}
            )
        except Exception as e:
            # The full error (which may contain connection details) goes to the
            # log only; the client receives a generic message.
            logger.error(f"获取命名空间 '{namespace}' 连接失败: {e}", exc_info=True)
            return JSONResponse(
                status_code=500,
                content={"detail": "获取命名空间连接失败"}
            )

        return await call_next(request)
|
@@ -11,6 +11,144 @@ from .analytics_service import AnalyticsService
|
|
11
11
|
from .settings_service import SettingsService
|
12
12
|
from .task_service import TaskService
|
13
13
|
|
14
|
+
# 监控服务
|
15
|
+
from .redis_monitor_service import RedisMonitorService
|
16
|
+
from .task_monitor_service import TaskMonitorService
|
17
|
+
from .worker_monitor_service import WorkerMonitorService
|
18
|
+
from .queue_monitor_service import QueueMonitorService
|
19
|
+
from .heartbeat_service import HeartbeatService
|
20
|
+
from .timeline_service import TimelineService
|
21
|
+
from .timeline_pg_service import TimelinePgService
|
22
|
+
|
23
|
+
|
24
|
+
class MonitorService:
    """
    Unified monitoring facade.

    Builds one shared Redis monitor service plus the task, worker, queue,
    heartbeat and timeline sub-services on top of it, and forwards each public
    method to the matching sub-service.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379", redis_prefix: str = "jettask"):
        """
        Create the shared Redis service and the sub-services that use it.

        Args:
            redis_url: Redis connection URL
            redis_prefix: Redis key prefix
        """
        # Shared Redis service handed to every sub-service.
        self.redis_service = RedisMonitorService(redis_url, redis_prefix)

        self.task_service = TaskMonitorService(self.redis_service)
        self.worker_service = WorkerMonitorService(self.redis_service)
        self.queue_service = QueueMonitorService(self.redis_service)
        self.heartbeat_service = HeartbeatService(self.redis_service)
        self.timeline_service = TimelineService(self.redis_service)

    async def connect(self):
        """Connect the shared Redis service."""
        await self.redis_service.connect()

    async def close(self):
        """Stop the heartbeat scanner, then close the Redis service."""
        try:
            await self.heartbeat_service.stop_heartbeat_scanner()
        finally:
            # Close the Redis service even if stopping the scanner raised.
            await self.redis_service.close()

    async def __aenter__(self):
        """Async context manager entry: connect and return this service."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the service."""
        await self.close()

    # ==================== Task Monitor Methods ====================

    async def get_task_info(self, stream_id: str, queue_name: str):
        """Return detailed information for a single task."""
        return await self.task_service.get_task_info(stream_id, queue_name)

    async def get_stream_info(self, queue_name: str):
        """Return statistics for the queue's stream."""
        return await self.task_service.get_stream_info(queue_name)

    async def get_queue_tasks(self, queue_name: str, start: str = "-", end: str = "+", count: int = 100, reverse: bool = False):
        """Return the list of tasks in a queue."""
        return await self.task_service.get_queue_tasks(queue_name, start, end, count, reverse)

    # ==================== Worker Monitor Methods ====================

    async def get_worker_heartbeats(self, queue_name: str):
        """Return worker heartbeat information for a queue."""
        return await self.worker_service.get_worker_heartbeats(queue_name)

    async def get_queue_worker_summary(self, queue_name: str):
        """Return the worker summary for a queue (including historical data)."""
        return await self.worker_service.get_queue_worker_summary(queue_name)

    async def get_queue_worker_summary_fast(self, queue_name: str):
        """Return the worker summary for a queue (fast variant, online workers only)."""
        return await self.worker_service.get_queue_worker_summary_fast(queue_name)

    async def get_worker_offline_history(self, limit: int = 100, start_time=None, end_time=None):
        """Return the worker offline history records."""
        return await self.worker_service.get_worker_offline_history(limit, start_time, end_time)

    # ==================== Queue Monitor Methods ====================

    async def get_all_queues(self):
        """Return all queue names."""
        return await self.queue_service.get_all_queues()

    async def get_queue_stats(self, queue_name: str):
        """Return queue statistics (RabbitMQ-compatible format)."""
        return await self.queue_service.get_queue_stats(queue_name)

    # ==================== Heartbeat Service Methods ====================

    async def start_heartbeat_scanner(self):
        """Start the heartbeat scanner."""
        await self.heartbeat_service.start_heartbeat_scanner()

    async def stop_heartbeat_scanner(self):
        """Stop the heartbeat scanner."""
        await self.heartbeat_service.stop_heartbeat_scanner()

    async def check_worker_heartbeat(self, worker_id: str):
        """Check the heartbeat status of a single worker."""
        return await self.heartbeat_service.check_worker_heartbeat(worker_id)

    async def get_heartbeat_stats(self):
        """Return heartbeat monitoring statistics."""
        return await self.heartbeat_service.get_heartbeat_stats()

    # ==================== Timeline Service Methods ====================

    async def get_redis_timeline(self, queue_name: str, **kwargs):
        """Return Redis Stream timeline data; extra keyword arguments are forwarded."""
        return await self.timeline_service.get_redis_timeline(queue_name, **kwargs)

    # ==================== Utility Methods ====================

    def get_prefixed_queue_name(self, queue_name: str) -> str:
        """Return the queue name with the Redis prefix applied."""
        return self.redis_service.get_prefixed_queue_name(queue_name)

    @property
    def redis(self):
        """The Redis client held by the shared Redis service."""
        return self.redis_service.redis

    @property
    def redis_prefix(self) -> str:
        """The Redis key prefix held by the shared Redis service."""
        return self.redis_service.redis_prefix
|
150
|
+
|
151
|
+
|
14
152
|
__all__ = [
|
15
153
|
'OverviewService',
|
16
154
|
'QueueService',
|
@@ -19,4 +157,12 @@ __all__ = [
|
|
19
157
|
'AnalyticsService',
|
20
158
|
'SettingsService',
|
21
159
|
'TaskService',
|
160
|
+
'MonitorService',
|
161
|
+
'RedisMonitorService',
|
162
|
+
'TaskMonitorService',
|
163
|
+
'WorkerMonitorService',
|
164
|
+
'QueueMonitorService',
|
165
|
+
'HeartbeatService',
|
166
|
+
'TimelineService',
|
167
|
+
'TimelinePgService',
|
22
168
|
]
|
@@ -0,0 +1,251 @@
|
|
1
|
+
"""
|
2
|
+
心跳监控服务
|
3
|
+
|
4
|
+
提供 Worker 心跳检查和自动离线标记功能
|
5
|
+
"""
|
6
|
+
import asyncio
|
7
|
+
import logging
|
8
|
+
import time
|
9
|
+
from typing import Optional
|
10
|
+
|
11
|
+
from .redis_monitor_service import RedisMonitorService
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class HeartbeatService:
    """
    Worker heartbeat monitoring service.

    Runs a background scanner that marks workers offline once their heartbeat
    is older than their timeout, and offers on-demand heartbeat checks and
    aggregate statistics.
    """

    def __init__(self, redis_service: "RedisMonitorService", scanner_interval: int = 5, heartbeat_timeout: int = 30):
        """
        Initialise the heartbeat monitoring service.

        Args:
            redis_service: the shared Redis monitor service instance
            scanner_interval: seconds between scans, default 5
            heartbeat_timeout: heartbeat timeout in seconds used when a worker
                record carries no ``heartbeat_timeout`` field, default 30
        """
        self.redis_service = redis_service
        self.scanner_interval = scanner_interval
        self.default_heartbeat_timeout = heartbeat_timeout

        # Background scanner task and its run flag.
        self.scanner_task: Optional[asyncio.Task] = None
        self._scanner_running = False

    @property
    def redis(self):
        """The Redis client held by the Redis service."""
        return self.redis_service.redis

    @property
    def redis_prefix(self) -> str:
        """The Redis key prefix held by the Redis service."""
        return self.redis_service.redis_prefix

    @property
    def worker_state_manager(self):
        """The WorkerStateManager held by the Redis service."""
        return self.redis_service.worker_state_manager

    def _create_worker_manager(self):
        """Build a WorkerStateManager bound to the current Redis client and prefix."""
        from jettask.worker.lifecycle import WorkerStateManager

        return WorkerStateManager(
            redis_client=self.redis,
            redis_prefix=self.redis_prefix
        )

    def _heartbeat_state(self, worker_data, now, alive_default=None):
        """
        Parse the heartbeat fields of one worker record.

        Args:
            worker_data: mapping holding ``last_heartbeat``, ``is_alive`` and
                ``heartbeat_timeout`` entries (any of them may be absent)
            now: current time in seconds, as returned by ``time.time()``
            alive_default: value assumed for ``is_alive`` when the field is absent

        Returns:
            Tuple ``(is_alive, heartbeat_age, heartbeat_timeout)``.

        Raises:
            ValueError, TypeError: when a numeric field cannot be converted.
        """
        last_heartbeat = float(worker_data.get('last_heartbeat', 0))
        raw_alive = worker_data.get('is_alive', alive_default)
        # Only the string 'true' (in any letter case) counts as alive.
        is_alive = isinstance(raw_alive, str) and raw_alive.lower() == 'true'
        heartbeat_timeout = float(
            worker_data.get('heartbeat_timeout', self.default_heartbeat_timeout)
        )
        return is_alive, now - last_heartbeat, heartbeat_timeout

    def _scanner_settings(self) -> dict:
        """Return the scanner fields included in every statistics result."""
        return {
            'scanner_running': self._scanner_running,
            'scanner_interval': self.scanner_interval,
            'heartbeat_timeout': self.default_heartbeat_timeout
        }

    async def start_heartbeat_scanner(self):
        """Start the heartbeat scanner unless a scanner task is already active."""
        task_active = self.scanner_task is not None and not self.scanner_task.done()
        if self._scanner_running and task_active:
            logger.warning("心跳扫描器已经在运行中")
            return

        # Also reached when the flag is still set but the previous task has
        # finished, so a scanner that ended on its own can be started again.
        self._scanner_running = True
        self.scanner_task = asyncio.create_task(self._heartbeat_scanner())
        logger.info("心跳扫描器任务已创建并启动")

    async def stop_heartbeat_scanner(self):
        """Clear the run flag, then cancel the scanner task and wait for it to finish."""
        self._scanner_running = False
        if self.scanner_task and not self.scanner_task.done():
            self.scanner_task.cancel()
            try:
                await self.scanner_task
            except asyncio.CancelledError:
                logger.info("心跳扫描器已取消")
        logger.info("心跳扫描器已停止")

    async def _mark_timed_out_workers(self, worker_manager, worker_ids) -> int:
        """
        Mark every alive worker whose heartbeat has timed out as offline.

        Errors on an individual worker are logged and do not stop the pass.

        Returns:
            The number of workers marked offline.
        """
        current_time = time.time()
        logger.debug(f"检查 {len(worker_ids)} 个 Worker 的心跳状态")

        # Fetch all worker records in one call.
        all_workers_info = await worker_manager.get_all_workers_info(only_alive=False)

        offline_count = 0
        for worker_id in worker_ids:
            worker_data = all_workers_info.get(worker_id)
            if not worker_data:
                continue

            try:
                is_alive, heartbeat_age, heartbeat_timeout = self._heartbeat_state(
                    worker_data, current_time
                )
                consumer_id = worker_data.get('consumer_id', worker_id)

                if is_alive and heartbeat_age > heartbeat_timeout:
                    logger.info(
                        f"Worker {consumer_id} 心跳超时 ({heartbeat_age:.1f}s > {heartbeat_timeout}s),标记为离线"
                    )

                    await worker_manager.set_worker_offline(
                        worker_id=worker_id,
                        reason="heartbeat_timeout"
                    )
                    offline_count += 1

            except Exception as e:
                logger.error(f"检查 worker {worker_id} 心跳时出错: {e}", exc_info=True)

        return offline_count

    async def _heartbeat_scanner(self):
        """
        Background loop that periodically checks worker heartbeats.

        On each pass it lists all workers and marks those that are flagged
        alive but whose heartbeat is older than their timeout as offline, then
        sleeps for ``scanner_interval`` seconds. The loop ends when the run
        flag is cleared or the task is cancelled.
        """
        logger.info(f"心跳扫描器启动 (扫描间隔: {self.scanner_interval}s, 超时阈值: {self.default_heartbeat_timeout}s)")

        while self._scanner_running:
            try:
                worker_manager = self._create_worker_manager()
                worker_ids = await worker_manager.get_all_workers()

                if worker_ids:
                    offline_count = await self._mark_timed_out_workers(worker_manager, worker_ids)
                    if offline_count > 0:
                        logger.info(f"本次扫描标记了 {offline_count} 个 Worker 为离线")

                # Wait for the next pass.
                await asyncio.sleep(self.scanner_interval)

            except asyncio.CancelledError:
                logger.info("心跳扫描器收到取消信号")
                break
            except Exception as e:
                logger.error(f"心跳扫描器出错: {e}", exc_info=True)
                await asyncio.sleep(self.scanner_interval)

        logger.info("心跳扫描器已停止运行")

    async def check_worker_heartbeat(self, worker_id: str) -> bool:
        """
        Check the heartbeat status of a single worker.

        Args:
            worker_id: Worker ID

        Returns:
            True when the worker record exists, is flagged alive (a missing
            ``is_alive`` field counts as alive) and its heartbeat is within the
            timeout; False otherwise, including when any error occurs.
        """
        try:
            worker_key = f"{self.redis_prefix}:WORKER:{worker_id}"
            worker_data = await self.redis.hgetall(worker_key)

            if not worker_data:
                logger.warning(f"Worker {worker_id} 不存在")
                return False

            is_alive, heartbeat_age, heartbeat_timeout = self._heartbeat_state(
                worker_data, time.time(), alive_default='true'
            )

            return is_alive and heartbeat_age <= heartbeat_timeout

        except Exception as e:
            logger.error(f"检查 worker {worker_id} 心跳状态时出错: {e}", exc_info=True)
            return False

    async def get_heartbeat_stats(self) -> dict:
        """
        Return heartbeat monitoring statistics.

        Returns:
            Dict with ``total_workers``, ``online_workers``, ``timeout_workers``,
            ``offline_workers``, ``scanner_running``, ``scanner_interval`` and
            ``heartbeat_timeout``. When the statistics cannot be gathered the
            counts are zero and an ``error`` entry holds the error text.
        """
        empty_counts = {
            'total_workers': 0,
            'online_workers': 0,
            'timeout_workers': 0,
            'offline_workers': 0
        }

        try:
            worker_manager = self._create_worker_manager()
            worker_ids = await worker_manager.get_all_workers()

            if not worker_ids:
                return {**empty_counts, **self._scanner_settings()}

            all_workers_info = await worker_manager.get_all_workers_info(only_alive=False)

            current_time = time.time()
            online_count = 0
            timeout_count = 0
            offline_count = 0

            for worker_id in worker_ids:
                worker_data = all_workers_info.get(worker_id)
                if not worker_data:
                    continue

                try:
                    is_alive, heartbeat_age, heartbeat_timeout = self._heartbeat_state(
                        worker_data, current_time
                    )
                except (TypeError, ValueError) as e:
                    # A malformed record is skipped instead of failing the whole call.
                    logger.error(f"检查 worker {worker_id} 心跳时出错: {e}", exc_info=True)
                    continue

                if not is_alive:
                    offline_count += 1
                elif heartbeat_age > heartbeat_timeout:
                    timeout_count += 1
                else:
                    online_count += 1

            return {
                'total_workers': len(worker_ids),
                'online_workers': online_count,
                'timeout_workers': timeout_count,
                'offline_workers': offline_count,
                **self._scanner_settings()
            }

        except Exception as e:
            logger.error(f"获取心跳统计信息时出错: {e}", exc_info=True)
            return {
                **empty_counts,
                **self._scanner_settings(),
                'error': str(e)
            }
|