jettask 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +10 -3
- jettask/cli.py +314 -228
- jettask/config/__init__.py +9 -1
- jettask/config/config.py +245 -0
- jettask/config/env_loader.py +381 -0
- jettask/config/lua_scripts.py +158 -0
- jettask/config/nacos_config.py +132 -5
- jettask/core/__init__.py +1 -1
- jettask/core/app.py +1573 -666
- jettask/core/app_importer.py +33 -16
- jettask/core/container.py +532 -0
- jettask/core/task.py +1 -4
- jettask/core/unified_manager_base.py +2 -2
- jettask/executor/__init__.py +38 -0
- jettask/executor/core.py +625 -0
- jettask/executor/executor.py +338 -0
- jettask/executor/orchestrator.py +290 -0
- jettask/executor/process_entry.py +638 -0
- jettask/executor/task_executor.py +317 -0
- jettask/messaging/__init__.py +68 -0
- jettask/messaging/event_pool.py +2188 -0
- jettask/messaging/reader.py +519 -0
- jettask/messaging/registry.py +266 -0
- jettask/messaging/scanner.py +369 -0
- jettask/messaging/sender.py +312 -0
- jettask/persistence/__init__.py +118 -0
- jettask/persistence/backlog_monitor.py +567 -0
- jettask/{backend/data_access.py → persistence/base.py} +58 -57
- jettask/persistence/consumer.py +315 -0
- jettask/{core → persistence}/db_manager.py +23 -22
- jettask/persistence/maintenance.py +81 -0
- jettask/persistence/message_consumer.py +259 -0
- jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
- jettask/persistence/offline_recovery.py +196 -0
- jettask/persistence/queue_discovery.py +215 -0
- jettask/persistence/task_persistence.py +218 -0
- jettask/persistence/task_updater.py +583 -0
- jettask/scheduler/__init__.py +2 -2
- jettask/scheduler/loader.py +6 -5
- jettask/scheduler/run_scheduler.py +1 -1
- jettask/scheduler/scheduler.py +7 -7
- jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
- jettask/task/__init__.py +16 -0
- jettask/{router.py → task/router.py} +26 -8
- jettask/task/task_center/__init__.py +9 -0
- jettask/task/task_executor.py +318 -0
- jettask/task/task_registry.py +291 -0
- jettask/test_connection_monitor.py +73 -0
- jettask/utils/__init__.py +31 -1
- jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
- jettask/utils/db_connector.py +1629 -0
- jettask/{db_init.py → utils/db_init.py} +1 -1
- jettask/utils/rate_limit/__init__.py +30 -0
- jettask/utils/rate_limit/concurrency_limiter.py +665 -0
- jettask/utils/rate_limit/config.py +145 -0
- jettask/utils/rate_limit/limiter.py +41 -0
- jettask/utils/rate_limit/manager.py +269 -0
- jettask/utils/rate_limit/qps_limiter.py +154 -0
- jettask/utils/rate_limit/task_limiter.py +384 -0
- jettask/utils/serializer.py +3 -0
- jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
- jettask/utils/time_sync.py +173 -0
- jettask/webui/__init__.py +27 -0
- jettask/{api/v1 → webui/api}/alerts.py +1 -1
- jettask/{api/v1 → webui/api}/analytics.py +2 -2
- jettask/{api/v1 → webui/api}/namespaces.py +1 -1
- jettask/{api/v1 → webui/api}/overview.py +1 -1
- jettask/{api/v1 → webui/api}/queues.py +3 -3
- jettask/{api/v1 → webui/api}/scheduled.py +1 -1
- jettask/{api/v1 → webui/api}/settings.py +1 -1
- jettask/{api.py → webui/app.py} +253 -145
- jettask/webui/namespace_manager/__init__.py +10 -0
- jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
- jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
- jettask/{run.py → webui/run.py} +2 -2
- jettask/{services → webui/services}/__init__.py +1 -3
- jettask/{services → webui/services}/overview_service.py +34 -16
- jettask/{services → webui/services}/queue_service.py +1 -1
- jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
- jettask/{services → webui/services}/settings_service.py +1 -1
- jettask/worker/__init__.py +53 -0
- jettask/worker/lifecycle.py +1507 -0
- jettask/worker/manager.py +583 -0
- jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
- jettask-0.2.20.dist-info/RECORD +145 -0
- jettask/__main__.py +0 -140
- jettask/api/__init__.py +0 -103
- jettask/backend/__init__.py +0 -1
- jettask/backend/api/__init__.py +0 -3
- jettask/backend/api/v1/__init__.py +0 -17
- jettask/backend/api/v1/monitoring.py +0 -431
- jettask/backend/api/v1/namespaces.py +0 -504
- jettask/backend/api/v1/queues.py +0 -342
- jettask/backend/api/v1/tasks.py +0 -367
- jettask/backend/core/__init__.py +0 -3
- jettask/backend/core/cache.py +0 -221
- jettask/backend/core/database.py +0 -200
- jettask/backend/core/exceptions.py +0 -102
- jettask/backend/dependencies.py +0 -261
- jettask/backend/init_meta_db.py +0 -158
- jettask/backend/main.py +0 -1426
- jettask/backend/main_unified.py +0 -78
- jettask/backend/main_v2.py +0 -394
- jettask/backend/models/__init__.py +0 -3
- jettask/backend/models/requests.py +0 -236
- jettask/backend/models/responses.py +0 -230
- jettask/backend/namespace_api_old.py +0 -267
- jettask/backend/services/__init__.py +0 -3
- jettask/backend/start.py +0 -42
- jettask/backend/unified_api_router.py +0 -1541
- jettask/cleanup_deprecated_tables.sql +0 -16
- jettask/core/consumer_manager.py +0 -1695
- jettask/core/delay_scanner.py +0 -256
- jettask/core/event_pool.py +0 -1700
- jettask/core/heartbeat_process.py +0 -222
- jettask/core/task_batch.py +0 -153
- jettask/core/worker_scanner.py +0 -271
- jettask/executors/__init__.py +0 -5
- jettask/executors/asyncio.py +0 -876
- jettask/executors/base.py +0 -30
- jettask/executors/common.py +0 -148
- jettask/executors/multi_asyncio.py +0 -309
- jettask/gradio_app.py +0 -570
- jettask/integrated_gradio_app.py +0 -1088
- jettask/main.py +0 -0
- jettask/monitoring/__init__.py +0 -3
- jettask/pg_consumer.py +0 -1896
- jettask/run_monitor.py +0 -22
- jettask/run_webui.py +0 -148
- jettask/scheduler/multi_namespace_scheduler.py +0 -294
- jettask/scheduler/unified_manager.py +0 -450
- jettask/task_center_client.py +0 -150
- jettask/utils/serializer_optimized.py +0 -33
- jettask/webui_exceptions.py +0 -67
- jettask-0.2.19.dist-info/RECORD +0 -150
- /jettask/{constants.py → config/constants.py} +0 -0
- /jettask/{backend/config.py → config/task_center.py} +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
- /jettask/{models.py → persistence/models.py} +0 -0
- /jettask/scheduler/{manager.py → task_crud.py} +0 -0
- /jettask/{schema.sql → schemas/schema.sql} +0 -0
- /jettask/{task_center.py → task/task_center/client.py} +0 -0
- /jettask/{monitoring → utils}/file_watcher.py +0 -0
- /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
- /jettask/{api/v1 → webui/api}/__init__.py +0 -0
- /jettask/{webui_config.py → webui/config.py} +0 -0
- /jettask/{webui_models → webui/models}/__init__.py +0 -0
- /jettask/{webui_models → webui/models}/namespace.py +0 -0
- /jettask/{services → webui/services}/alert_service.py +0 -0
- /jettask/{services → webui/services}/analytics_service.py +0 -0
- /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
- /jettask/{services → webui/services}/task_service.py +0 -0
- /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
- /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/worker/lifecycle.py (new file)
@@ -0,0 +1,1507 @@
+"""
+Worker lifecycle management.
+
+Consolidates functionality from the following modules:
+- state_manager.py: worker state management
+- heartbeat_thread.py: heartbeat thread management
+- scanner.py: worker timeout scanning
+- core.py: WorkerLifecycle, WorkerStatistics
+- consumer_manager.py: heartbeat and statistics logic from HeartbeatConsumerStrategy
+"""
+
+import os
+import socket
+import uuid
+import time
+import asyncio
+import logging
+import threading
+import json
+from typing import Dict, List, Optional, Set, Callable, Any
+from collections import defaultdict, namedtuple
+from redis.asyncio.lock import Lock as AsyncLock
+import redis
+import redis.asyncio as aioredis
+
+logger = logging.getLogger(__name__)
+
+
+# ============================================================================
+# Worker state management
+# ============================================================================
+
+class WorkerStateManager:
+    """Worker state manager - the single entry point for worker state.
+
+    ⚠️ Important: every modification of worker state must go through this
+    class; never manipulate Redis directly!
+
+    Responsibilities:
+    1. Centralize reads and writes of all worker state fields
+    2. Publish a Redis Pub/Sub signal when key state changes
+    3. Provide a state-change listener mechanism
+    4. Maintain the worker's entry in the ACTIVE_WORKERS sorted set
+    """
+
+    def __init__(self, redis_client: aioredis.Redis, redis_prefix: str = "jettask", event_pool=None):
+        """Initialize the worker state manager.
+
+        Args:
+            redis_client: async Redis client
+            redis_prefix: Redis key prefix
+            event_pool: optional EventPool instance, used for event-driven message recovery
+        """
+        self.redis = redis_client
+        self.redis_prefix = redis_prefix
+        self.active_workers_key = f"{redis_prefix}:ACTIVE_WORKERS"
+        self.event_pool = event_pool
+
+        # Pub/Sub channel name
+        self.worker_state_channel = f"{redis_prefix}:WORKER_STATE_CHANGE"
+
+        # Listener subscription
+        self._pubsub = None
+        self._listener_task: Optional[asyncio.Task] = None
+        self._running = False
+        self._callbacks: Set[Callable] = set()
+
+        # Pub/Sub configuration
+        self._health_check_interval = 60
+        self._health_check_task: Optional[asyncio.Task] = None
+
+    def _get_worker_key(self, worker_id: str) -> str:
+        """Get the Redis key for a worker."""
+        return f"{self.redis_prefix}:WORKER:{worker_id}"
+
+    async def initialize_worker(self, worker_id: str, worker_info: Dict[str, Any]):
+        """Initialize a worker (first creation)."""
+        worker_key = self._get_worker_key(worker_id)
+        current_time = time.time()
+
+        worker_info.setdefault('is_alive', 'true')
+        worker_info.setdefault('messages_transferred', 'false')
+        worker_info.setdefault('created_at', str(current_time))
+        worker_info.setdefault('last_heartbeat', str(current_time))
+
+        pipeline = self.redis.pipeline()
+        pipeline.hset(worker_key, mapping=worker_info)
+        pipeline.zadd(self.active_workers_key, {worker_id: current_time})
+        await pipeline.execute()
+
+        logger.debug(f"Initialized worker {worker_id}")
+
+    async def set_worker_online(self, worker_id: str, worker_data: dict = None):
+        """Mark a worker as online."""
+        worker_key = self._get_worker_key(worker_id)
+        old_alive = await self.redis.hget(worker_key, 'is_alive')
+        old_alive = old_alive.decode('utf-8') if isinstance(old_alive, bytes) else old_alive
+
+        current_time = time.time()
+        pipeline = self.redis.pipeline()
+        pipeline.hset(worker_key, 'is_alive', 'true')
+        pipeline.hset(worker_key, 'last_heartbeat', str(current_time))
+
+        # When a worker goes from offline to online, reset messages_transferred:
+        # this is a new worker instance that has not transferred any messages yet.
+        if old_alive != 'true':
+            pipeline.hset(worker_key, 'messages_transferred', 'false')
+
+        if worker_data:
+            pipeline.hset(worker_key, mapping=worker_data)
+
+        pipeline.zadd(self.active_workers_key, {worker_id: current_time})
+        await pipeline.execute()
+
+        if old_alive != 'true':
+            await self._publish_state_change(worker_id, 'online')
+            logger.debug(f"Worker {worker_id} is now ONLINE")
+
+    async def set_worker_offline(self, worker_id: str, reason: str = "unknown"):
+        """Mark a worker as offline."""
+        worker_key = self._get_worker_key(worker_id)
+        old_alive = await self.redis.hget(worker_key, 'is_alive')
+        old_alive = old_alive.decode('utf-8') if isinstance(old_alive, bytes) else old_alive
+
+        current_time = time.time()
+        pipeline = self.redis.pipeline()
+        pipeline.hset(worker_key, 'messages_transferred', 'false')  # reset the transfer flag so another worker can take over its messages
+        pipeline.hset(worker_key, 'is_alive', 'false')
+        pipeline.hset(worker_key, 'offline_reason', reason)
+        pipeline.hset(worker_key, 'offline_time', str(current_time))
+        pipeline.zrem(self.active_workers_key, worker_id)
+        await pipeline.execute()
+
+        if old_alive == 'true':
+            await self._publish_state_change(worker_id, 'offline', reason)
+            logger.debug(f"Worker {worker_id} is now OFFLINE (reason: {reason})")
+
+    async def update_worker_heartbeat(self, worker_id: str, heartbeat_data: dict = None):
+        """Update a worker's heartbeat (and ensure it is marked online)."""
+        worker_key = self._get_worker_key(worker_id)
+        current_time = time.time()
+
+        pipeline = self.redis.pipeline()
+        pipeline.hset(worker_key, 'is_alive', 'true')
+        pipeline.hset(worker_key, 'last_heartbeat', str(current_time))
+
+        if heartbeat_data:
+            pipeline.hset(worker_key, mapping=heartbeat_data)
+
+        pipeline.zadd(self.active_workers_key, {worker_id: current_time})
+        await pipeline.execute()
+
+    async def update_worker_field(self, worker_id: str, field: str, value: str):
+        """Update a single field on a worker."""
+        worker_key = self._get_worker_key(worker_id)
+        await self.redis.hset(worker_key, field, value)
+
+    async def update_worker_fields(self, worker_id: str, fields: Dict[str, Any]):
+        """Update multiple fields on a worker in one call."""
+        worker_key = self._get_worker_key(worker_id)
+        await self.redis.hset(worker_key, mapping=fields)
+
+    async def increment_queue_stats(self, worker_id: str, queue: str,
+                                    running_tasks_delta: int = None,
+                                    success_count_increment: int = None,
+                                    failed_count_increment: int = None,
+                                    total_count_increment: int = None,
+                                    processing_time_increment: float = None,
+                                    latency_time_increment: float = None):
+        """Incrementally update a worker's cumulative statistics for one queue."""
+        worker_key = self._get_worker_key(worker_id)
+        pipeline = self.redis.pipeline()
+
+        if running_tasks_delta is not None and running_tasks_delta != 0:
+            pipeline.hincrby(worker_key, f'{queue}:running_tasks', running_tasks_delta)
+
+        if success_count_increment is not None:
+            pipeline.hincrby(worker_key, f'{queue}:success_count', success_count_increment)
+
+        if failed_count_increment is not None:
+            pipeline.hincrby(worker_key, f'{queue}:failed_count', failed_count_increment)
+
+        if total_count_increment is not None:
+            pipeline.hincrby(worker_key, f'{queue}:total_count', total_count_increment)
+
+        if processing_time_increment is not None:
+            pipeline.hincrbyfloat(worker_key, f'{queue}:total_processing_time', processing_time_increment)
+
+        if latency_time_increment is not None:
+            pipeline.hincrbyfloat(worker_key, f'{queue}:total_latency_time', latency_time_increment)
+
+        await pipeline.execute()
+
+    async def get_queue_total_stats(self, worker_id: str, queue: str) -> dict:
+        """Get the cumulative statistics for one queue."""
+        worker_key = self._get_worker_key(worker_id)
+        fields = [
+            f'{queue}:total_count',
+            f'{queue}:total_processing_time',
+            f'{queue}:total_latency_time'
+        ]
+        values = await self.redis.hmget(worker_key, fields)
+
+        return {
+            'total_count': int(values[0]) if values[0] else 0,
+            'total_processing_time': float(values[1]) if values[1] else 0.0,
+            'total_latency_time': float(values[2]) if values[2] else 0.0
+        }
+
+    async def update_queue_stats(self, worker_id: str, queue: str,
+                                 running_tasks: int = None,
+                                 avg_processing_time: float = None,
+                                 avg_latency_time: float = None):
+        """Update a worker's statistics for one queue."""
+        worker_key = self._get_worker_key(worker_id)
+        pipeline = self.redis.pipeline()
+
+        if running_tasks is not None:
+            pipeline.hset(worker_key, f'{queue}:running_tasks', str(running_tasks))
+
+        if avg_processing_time is not None:
+            pipeline.hset(worker_key, f'{queue}:avg_processing_time', f'{avg_processing_time:.3f}')
+
+        if avg_latency_time is not None:
+            pipeline.hset(worker_key, f'{queue}:avg_latency_time', f'{avg_latency_time:.3f}')
+
+        await pipeline.execute()
+
+    async def mark_messages_transferred(self, worker_id: str, transferred: bool = True):
+        """Mark whether the worker's messages have been transferred."""
+        worker_key = self._get_worker_key(worker_id)
+        await self.redis.hset(worker_key, 'messages_transferred', 'true' if transferred else 'false')
+
+    async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, str]]:
+        """Get a worker's complete information."""
+        worker_key = self._get_worker_key(worker_id)
+        data = await self.redis.hgetall(worker_key)
+
+        if not data:
+            return None
+
+        result = {}
+        for k, v in data.items():
+            key = k.decode('utf-8') if isinstance(k, bytes) else k
+            value = v.decode('utf-8') if isinstance(v, bytes) else v
+            result[key] = value
+
+        return result
+
+    async def get_worker_field(self, worker_id: str, field: str) -> Optional[str]:
+        """Get a single field of a worker."""
+        worker_key = self._get_worker_key(worker_id)
+        value = await self.redis.hget(worker_key, field)
+
+        if value is None:
+            return None
+
+        return value.decode('utf-8') if isinstance(value, bytes) else value
+
+    async def is_worker_alive(self, worker_id: str) -> bool:
+        """Check whether a worker is online."""
+        is_alive = await self.get_worker_field(worker_id, 'is_alive')
+        return is_alive == 'true'
+
+    async def get_all_workers_info(self, only_alive: bool = True) -> Dict[str, Dict[str, str]]:
+        """Get information for all workers."""
+        pattern = f"{self.redis_prefix}:WORKER:*"
+        result = {}
+
+        cursor = 0
+        while True:
+            cursor, keys = await self.redis.scan(cursor, match=pattern, count=100)
+
+            for key in keys:
+                if isinstance(key, bytes):
+                    key = key.decode('utf-8')
+
+                parts = key.split(":")
+                if len(parts) >= 3:
+                    worker_id = parts[2]
+                    worker_info = await self.get_worker_info(worker_id)
+                    if worker_info:
+                        if only_alive and worker_info.get('is_alive') != 'true':
+                            continue
+                        result[worker_id] = worker_info
+
+            if cursor == 0:
+                break
+
+        return result
+
+    async def delete_worker(self, worker_id: str):
+        """Delete all data for a worker."""
+        worker_key = self._get_worker_key(worker_id)
+        pipeline = self.redis.pipeline()
+        pipeline.delete(worker_key)
+        pipeline.zrem(self.active_workers_key, worker_id)
+        await pipeline.execute()
+        logger.debug(f"Deleted worker {worker_id}")
+
+    async def _publish_state_change(self, worker_id: str, state: str, reason: str = None):
+        """Publish a state-change signal."""
+        message = {
+            'worker_id': worker_id,
+            'state': state,
+            'timestamp': asyncio.get_event_loop().time()
+        }
+
+        if reason:
+            message['reason'] = reason
+
+        await self.redis.publish(
+            self.worker_state_channel,
+            json.dumps(message)
+        )
+
+        logger.debug(f"Published state change: {message}")
+
+    async def start_listener(self):
+        """Start the state-change listener."""
+        if self._running:
+            logger.warning("Worker state listener already running")
+            return
+
+        self._running = True
+        self._pubsub = await self._create_and_subscribe_pubsub()
+        self._listener_task = asyncio.create_task(self._listen_loop())
+        self._health_check_task = asyncio.create_task(self._health_check_loop())
+
+        logger.debug(f"Started worker state listener on channel: {self.worker_state_channel}")
+
+    async def stop_listener(self):
+        """Stop the state-change listener."""
+        if not self._running:
+            return
+
+        self._running = False
+
+        if self._listener_task:
+            self._listener_task.cancel()
+            try:
+                await self._listener_task
+            except asyncio.CancelledError:
+                pass
+
+        if self._health_check_task:
+            self._health_check_task.cancel()
+            try:
+                await self._health_check_task
+            except asyncio.CancelledError:
+                pass
+
+        if self._pubsub:
+            await self._pubsub.unsubscribe(self.worker_state_channel)
+            await self._pubsub.close()
+
+        logger.debug("Stopped worker state listener")
+
+    async def _create_and_subscribe_pubsub(self):
+        """Create a PubSub connection and subscribe to the channel."""
+        if self._pubsub:
+            try:
+                await self._pubsub.close()
+            except:
+                pass
+
+        pubsub = self.redis.pubsub()
+        await pubsub.subscribe(self.worker_state_channel)
+
+        logger.debug(f"Created and subscribed to Redis Pub/Sub channel: {self.worker_state_channel}")
+        return pubsub
+
+    async def _health_check_loop(self):
+        """Periodically check the health of the Pub/Sub connection."""
+        while self._running:
+            try:
+                await asyncio.sleep(self._health_check_interval)
+
+                if not self._running:
+                    break
+
+                if self._pubsub and self._pubsub.connection:
+                    try:
+                        await asyncio.wait_for(self._pubsub.ping(), timeout=5.0)
+                        logger.debug("Pub/Sub health check: OK")
+                    except Exception as e:
+                        logger.warning(f"Pub/Sub health check failed: {e}")
+                else:
+                    logger.warning("Pub/Sub connection is None")
+
+            except asyncio.CancelledError:
+                logger.debug("Health check loop cancelled")
+                break
+            except Exception as e:
+                logger.error(f"Error in health check loop: {e}")
+
+    async def _listen_loop(self):
+        """Listen loop (with automatic reconnection)."""
+        retry_delay = 1
+        max_retry_delay = 30
+
+        while self._running:
+            try:
+                async for message in self._pubsub.listen():
+                    if message['type'] == 'message':
+                        try:
+                            data = json.loads(message['data'])
+
+                            if data.get('state') == 'offline' and self.event_pool:
+                                worker_id = data.get('worker_id')
+                                if worker_id:
+                                    logger.info(f"[StateManager] Worker {worker_id} offline event received")
+                                    asyncio.create_task(
+                                        self.event_pool.handle_worker_offline_event(worker_id)
+                                    )
+
+                            for callback in self._callbacks:
+                                try:
+                                    if asyncio.iscoroutinefunction(callback):
+                                        await callback(data)
+                                    else:
+                                        callback(data)
+                                except Exception as e:
+                                    logger.error(f"Error in state change callback: {e}")
+
+                        except Exception as e:
+                            logger.error(f"Error processing state change message: {e}")
+
+                retry_delay = 1
+
+            except asyncio.CancelledError:
+                logger.debug("Listen loop cancelled")
+                break
+            except Exception as e:
+                logger.error(f"Error in listen loop: {e}")
+
+                if not self._running:
+                    break
+
+                logger.warning(f"Attempting to reconnect to Redis Pub/Sub in {retry_delay} seconds...")
+                await asyncio.sleep(retry_delay)
+
+                try:
+                    self._pubsub = await self._create_and_subscribe_pubsub()
+                    logger.info(f"Successfully reconnected to Redis Pub/Sub")
+                    retry_delay = 1
+                except Exception as reconnect_error:
+                    logger.error(f"Failed to reconnect to Redis Pub/Sub: {reconnect_error}")
+                    retry_delay = min(retry_delay * 2, max_retry_delay)
+
+        logger.debug("Listen loop exited")
+
+    def register_callback(self, callback: Callable):
+        """Register a state-change callback."""
+        self._callbacks.add(callback)
+        logger.debug(f"Registered state change callback: {callback.__name__}")
+
+    def unregister_callback(self, callback: Callable):
+        """Unregister a state-change callback."""
+        self._callbacks.discard(callback)
+        logger.debug(f"Unregistered state change callback: {callback.__name__}")
+
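Since every state write above is coupled to a Pub/Sub signal, a consumer only needs to register a callback to observe online/offline transitions. A minimal sketch of that pattern, assuming a local Redis at redis://localhost:6379/0 and the module path this diff introduces (jettask/worker/lifecycle.py):

    import asyncio
    import redis.asyncio as aioredis

    from jettask.worker.lifecycle import WorkerStateManager

    async def main():
        r = aioredis.from_url("redis://localhost:6379/0", decode_responses=True)
        state = WorkerStateManager(r, redis_prefix="jettask")

        async def on_change(event: dict):
            # event carries worker_id, state ('online'/'offline'), timestamp,
            # and optionally reason, as built by _publish_state_change() above
            print("state change:", event)

        state.register_callback(on_change)
        await state.start_listener()

        await state.initialize_worker("demo-worker", {"host": "demo", "pid": "1"})
        await state.set_worker_offline("demo-worker", reason="demo")
        await asyncio.sleep(1)  # give the listener a moment to deliver the event

        await state.stop_listener()
        await r.aclose()  # redis-py >= 5

    asyncio.run(main())
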
+# ============================================================================
+# Heartbeat management
+# ============================================================================
+
+class HeartbeatTaskManager:
+    """Coroutine-based heartbeat manager (runs on a dedicated event-loop thread in the main process; lightweight)."""
+
+    def __init__(self, redis_client, worker_key: str, worker_id: str, redis_prefix: str,
+                 interval: float = 5.0, heartbeat_timeout: float = 15.0, loop: asyncio.AbstractEventLoop = None):
+        """Initialize the heartbeat task manager.
+
+        Args:
+            redis_client: async Redis client
+            worker_key: the worker's Redis key
+            worker_id: worker ID
+            redis_prefix: Redis prefix
+            interval: heartbeat interval (seconds)
+            heartbeat_timeout: heartbeat timeout (seconds)
+            loop: event loop (if None, a new one is created on the current thread)
+        """
+        self.redis_client = redis_client
+        self.worker_key = worker_key
+        self.worker_id = worker_id
+        self.redis_prefix = redis_prefix
+        self.interval = interval
+        self.heartbeat_timeout = heartbeat_timeout
+        self.queues: Set[str] = set()
+        self._last_heartbeat_time = None
+        self._loop = loop
+
+        # Heartbeat task and stop event
+        self._task: Optional[asyncio.Task] = None
+        self._stop_event: Optional[asyncio.Event] = None
+        self._first_heartbeat_done: Optional[asyncio.Event] = None
+        self._thread: Optional[threading.Thread] = None
+        self._thread_ready: Optional[threading.Event] = None
+
+    @classmethod
+    async def create_and_start(cls, redis_client, redis_prefix: str, queues: List[str] = None,
+                               interval: float = 5.0, worker_state=None):
+        """
+        Create a heartbeat manager and start it; generate a worker_id, then wait for the first successful heartbeat.
+
+        Args:
+            redis_client: async Redis client
+            redis_prefix: Redis prefix
+            queues: list of queues
+            interval: heartbeat interval
+            worker_state: WorkerState instance (used to look up a reusable worker_id)
+
+        Returns:
+            HeartbeatTaskManager instance (exposes worker_id and worker_key attributes)
+        """
+        from jettask.worker.manager import WorkerNaming
+
+        # 1. Generate the worker_id
+        naming = WorkerNaming()
+
+        # Build the hostname prefix
+        try:
+            hostname = socket.gethostname()
+            ip = socket.gethostbyname(hostname)
+            prefix = hostname if hostname != 'localhost' else ip
+        except:
+            prefix = os.environ.get('HOSTNAME', 'unknown')
+
+        # Try to reuse an offline worker_id
+        reusable_id = None
+        if worker_state:
+            reusable_id = await naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
+
+        # Reuse or generate the worker_id
+        if reusable_id:
+            worker_id = reusable_id
+            logger.info(f"[PID {os.getpid()}] Reusing offline worker ID: {worker_id}")
+        else:
+            worker_id = naming.generate_worker_id(prefix)
+            logger.info(f"[PID {os.getpid()}] Generated new worker ID: {worker_id}")
+
+        worker_key = f"{redis_prefix}:WORKER:{worker_id}"
+
+        # 2. Create the heartbeat manager
+        manager = cls(
+            redis_client=redis_client,
+            worker_key=worker_key,
+            worker_id=worker_id,
+            redis_prefix=redis_prefix,
+            interval=interval
+        )
+
+        # 3. Configure the queues
+        if queues:
+            for queue in queues:
+                manager.queues.add(queue)
+
+        # 4. Start the heartbeat task
+        await manager.start()
+
+        # 5. Wait for the first successful heartbeat (up to 10 seconds)
+        try:
+            await asyncio.wait_for(manager._first_heartbeat_done.wait(), timeout=10)
+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout waiting for first heartbeat for worker {worker_id}")
+
+        logger.info(f"Heartbeat task started for worker {worker_id}")
+        return manager
+
+    async def start(self):
+        """Start the heartbeat task."""
+        if self._task and not self._task.done():
+            logger.warning("Heartbeat task already running")
+            return
+
+        self._stop_event = asyncio.Event()
+        self._first_heartbeat_done = asyncio.Event()
+        self._task = asyncio.create_task(self._heartbeat_loop())
+
+    async def stop(self):
+        """Stop the heartbeat task."""
+        if not self._task:
+            return
+
+        logger.debug(f"Stopping heartbeat task for worker {self.worker_id}")
+        self._stop_event.set()
+
+        try:
+            await asyncio.wait_for(self._task, timeout=2.0)
+        except asyncio.TimeoutError:
+            logger.warning("Heartbeat task did not stop in time, cancelling...")
+            self._task.cancel()
+            try:
+                await self._task
+            except asyncio.CancelledError:
+                pass
+
+        logger.debug("Heartbeat task stopped")
+
+    async def _heartbeat_loop(self):
+        """Heartbeat loop (runs as a coroutine)."""
+        hostname = socket.gethostname()
+        pid = str(os.getpid())
+
+        logger.info(f"Heartbeat task starting for worker {self.worker_id}")
+
+        heartbeat_count = 0
+        last_log_time = time.time()
+        first_heartbeat = True
+
+        while not self._stop_event.is_set():
+            try:
+                current_time = time.time()
+
+                needs_full_init = False
+                publish_online_signal = False
+
+                old_alive = await self.redis_client.hget(self.worker_key, 'is_alive')
+                consumer_id = await self.redis_client.hget(self.worker_key, 'consumer_id')
+
+                if not consumer_id:
+                    needs_full_init = True
+                    publish_online_signal = True
+                    logger.warning(f"Worker {self.worker_id} key missing critical fields, reinitializing...")
+                elif first_heartbeat and old_alive != b'true' and old_alive != 'true':
+                    publish_online_signal = True
+
+                # Mark the first heartbeat as done
+                if first_heartbeat:
+                    first_heartbeat = False
+
+                if needs_full_init:
+                    worker_info = {
+                        'consumer_id': self.worker_id,
+                        'host': hostname,
+                        'pid': pid,
+                        'created_at': str(current_time),
+                        'last_heartbeat': str(current_time),
+                        'is_alive': 'true',
+                        'messages_transferred': 'false',
+                        'heartbeat_timeout': str(self.heartbeat_timeout),
+                    }
+
+                    if self.queues:
+                        worker_info['queues'] = ','.join(sorted(self.queues))
+
+                    await self.redis_client.hset(self.worker_key, mapping=worker_info)
+                    logger.info(f"Reinitialized worker {self.worker_id} with full info")
+                else:
+                    # Build the heartbeat update payload
+                    heartbeat_update = {
+                        'last_heartbeat': str(current_time),
+                        'is_alive': 'true',
+                        'host': hostname
+                    }
+
+                    # If this is an offline-to-online transition (reused worker ID), reset messages_transferred
+                    if publish_online_signal:
+                        heartbeat_update['messages_transferred'] = 'false'
+                        logger.debug(f"Worker {self.worker_id} reused, reset messages_transferred=false")
+
+                    await self.redis_client.hset(self.worker_key, mapping=heartbeat_update)
+
+                await self.redis_client.zadd(
+                    f"{self.redis_prefix}:ACTIVE_WORKERS",
+                    {self.worker_id: current_time}
+                )
+
+                if publish_online_signal:
+                    state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
+                    message = json.dumps({
+                        'worker_id': self.worker_id,
+                        'state': 'online',
+                        'timestamp': current_time
+                    })
+                    result = await self.redis_client.publish(state_change_channel, message)
+                    logger.info(f"Worker {self.worker_id} is now ONLINE, published to {result} subscribers")
+
+                    workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
+                    await self.redis_client.sadd(workers_registry_key, self.worker_id)
+
+                self._last_heartbeat_time = current_time
+                heartbeat_count += 1
+
+                # If this was the first heartbeat, notify waiting coroutines
+                if heartbeat_count == 1:
+                    self._first_heartbeat_done.set()
+                    logger.debug(f"First heartbeat completed for worker {self.worker_id}")
+
+                if current_time - last_log_time >= 30:
+                    logger.debug(f"Heartbeat task: sent {heartbeat_count} heartbeats for worker {self.worker_id}")
+                    last_log_time = current_time
+                    heartbeat_count = 0
+
+            except Exception as e:
+                logger.error(f"Error in heartbeat task: {e}", exc_info=True)
+
+            # Wait for the next heartbeat
+            try:
+                await asyncio.wait_for(self._stop_event.wait(), timeout=self.interval)
+                break  # the stop event was set; exit the loop
+            except asyncio.TimeoutError:
+                pass  # timing out is normal; continue with the next heartbeat
+
+        logger.info(f"Heartbeat task stopped for worker {self.worker_id}")
+
+    async def mark_offline(self, reason: str = "shutdown"):
+        """Mark the worker as offline."""
+        try:
+            current_time = time.time()
+            state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
+
+            pipeline = self.redis_client.pipeline()
+            pipeline.hset(self.worker_key, 'is_alive', 'false')
+            pipeline.hset(self.worker_key, 'offline_reason', reason)
+            pipeline.hset(self.worker_key, 'offline_time', str(current_time))
+            pipeline.hset(self.worker_key, 'messages_transferred', 'false')
+            pipeline.zrem(f"{self.redis_prefix}:ACTIVE_WORKERS", self.worker_id)
+
+            message = json.dumps({
+                'worker_id': self.worker_id,
+                'state': 'offline',
+                'reason': reason,
+                'timestamp': current_time
+            })
+            pipeline.publish(state_change_channel, message)
+            await pipeline.execute()
+
+            logger.info(f"Worker {self.worker_id} marked as offline (reason: {reason})")
+        except Exception as e:
+            logger.error(f"Error marking worker offline: {e}", exc_info=True)
+
+
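create_and_start() bundles worker-ID generation (via WorkerNaming from jettask/worker/manager.py, also added in this release), queue setup, and the first-heartbeat handshake. A usage sketch under the same local-Redis assumption as above:

    import asyncio
    import redis.asyncio as aioredis

    from jettask.worker.lifecycle import HeartbeatTaskManager

    async def main():
        r = aioredis.from_url("redis://localhost:6379/0", decode_responses=True)

        # Returns once the first heartbeat is written (or after the 10 s wait).
        hb = await HeartbeatTaskManager.create_and_start(
            redis_client=r,
            redis_prefix="jettask",
            queues=["default"],
            interval=5.0,
        )
        print("worker:", hb.worker_id, "key:", hb.worker_key)

        await asyncio.sleep(12)                   # a couple of heartbeat rounds
        await hb.stop()                           # stop the loop first...
        await hb.mark_offline(reason="shutdown")  # ...then publish the offline signal
        await r.aclose()

    asyncio.run(main())
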
+class HeartbeatThreadManager:
+    """Thread-based heartbeat manager (runs in the CLI main process)."""
+
+    def __init__(self, redis_client=None, worker_key=None, worker_id=None, redis_prefix=None,
+                 interval=5.0, redis_url=None, consumer_id=None, heartbeat_interval=None,
+                 heartbeat_timeout=15.0):
+        """Initialize the heartbeat thread manager."""
+        if redis_url is not None:
+            from jettask.utils.db_connector import get_sync_redis_client
+            self.redis_client = get_sync_redis_client(redis_url, decode_responses=True)
+            self.redis_url = redis_url
+            self.consumer_id = consumer_id
+            self.heartbeat_interval = heartbeat_interval or 5.0
+            self.heartbeat_timeout = heartbeat_timeout
+            self.worker_key = None
+            self.worker_id = None
+            self.redis_prefix = redis_prefix
+            self.interval = self.heartbeat_interval
+            self.queues: Set[str] = set()
+            self._last_heartbeat_time = None
+            self._last_heartbeat_time_lock = threading.Lock()
+        else:
+            self.redis_client = redis_client
+            self.worker_key = worker_key
+            self.worker_id = worker_id
+            self.redis_prefix = redis_prefix
+            self.interval = interval
+            self.redis_url = None
+            self.consumer_id = worker_id
+            self.heartbeat_interval = interval
+            self.heartbeat_timeout = 15.0
+            self.queues: Set[str] = set()
+            self._last_heartbeat_time = None
+            self._last_heartbeat_time_lock = threading.Lock()
+
+        self._stop_event = threading.Event()
+        self._thread = None
+        self.heartbeat_process = self
+
+        # Event used to wait for the first heartbeat
+        self._first_heartbeat_done = threading.Event()
+
+    @classmethod
+    def create_and_start(cls, redis_client, redis_prefix: str, queues: List[str] = None,
+                         interval: float = 5.0, worker_state=None):
+        """
+        Create a heartbeat manager and start it; generate a worker_id, then wait for the first successful heartbeat.
+
+        Args:
+            redis_client: Redis client
+            redis_prefix: Redis prefix
+            queues: list of queues
+            interval: heartbeat interval
+            worker_state: WorkerState instance (used to look up a reusable worker_id)
+
+        Returns:
+            HeartbeatThreadManager instance (exposes worker_id and worker_key attributes)
+        """
+        from jettask.worker.manager import WorkerNaming
+
+        # 1. Generate the worker_id
+        naming = WorkerNaming()
+
+        # Build the hostname prefix
+        try:
+            hostname = socket.gethostname()
+            ip = socket.gethostbyname(hostname)
+            prefix = hostname if hostname != 'localhost' else ip
+        except:
+            prefix = os.environ.get('HOSTNAME', 'unknown')
+
+        # Try to reuse an offline worker_id (synchronously)
+        reusable_id = None
+        if worker_state:
+            import asyncio
+            try:
+                loop = asyncio.get_event_loop()
+                if not loop.is_running():
+                    reusable_id = loop.run_until_complete(
+                        naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
+                    )
+            except RuntimeError:
+                # No event loop; create a new one
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                try:
+                    reusable_id = loop.run_until_complete(
+                        naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
+                    )
+                finally:
+                    loop.close()
+
+        # Reuse or generate the worker_id
+        if reusable_id:
+            worker_id = reusable_id
+            logger.info(f"[PID {os.getpid()}] Reusing offline worker ID: {worker_id}")
+        else:
+            worker_id = naming.generate_worker_id(prefix)
+            logger.info(f"[PID {os.getpid()}] Generated new worker ID: {worker_id}")
+
+        worker_key = f"{redis_prefix}:WORKER:{worker_id}"
+
+        # 2. Create the heartbeat manager
+        manager = cls(
+            redis_client=redis_client,
+            worker_key=worker_key,
+            worker_id=worker_id,
+            redis_prefix=redis_prefix,
+            interval=interval
+        )
+
+        # 3. Configure the queues
+        if queues:
+            for queue in queues:
+                manager.queues.add(queue)
+
+        # 4. Start the heartbeat thread
+        manager.start()
+
+        # 5. Wait for the first successful heartbeat (up to 10 seconds)
+        if not manager._first_heartbeat_done.wait(timeout=10):
+            logger.warning(f"Timeout waiting for first heartbeat for worker {worker_id}")
+
+        # Return the manager; callers can read manager.worker_id and manager.worker_key
+        return manager
+
+    def start(self):
+        """Start the heartbeat thread."""
+        if self._thread and self._thread.is_alive():
+            logger.warning("Heartbeat thread already running")
+            return
+
+        self._stop_event.clear()
+        self._thread = threading.Thread(
+            target=self._heartbeat_loop,
+            name=f"Heartbeat-{self.worker_id}",
+            daemon=True
+        )
+        self._thread.start()
+        logger.info(f"Heartbeat thread started for worker {self.worker_id}")
+
+    def stop(self, timeout=2.0):
+        """Stop the heartbeat thread."""
+        if not self._thread:
+            return
+
+        logger.debug(f"Stopping heartbeat thread for worker {self.worker_id}")
+        self._stop_event.set()
+        self._thread.join(timeout=timeout)
+
+        if self._thread.is_alive():
+            logger.warning("Heartbeat thread did not stop in time")
+        else:
+            logger.debug("Heartbeat thread stopped")
+
+    def _heartbeat_loop(self):
+        """Heartbeat loop (runs on the thread)."""
+        hostname = socket.gethostname()
+        pid = str(os.getpid())
+
+        logger.info(f"Heartbeat thread starting for worker {self.worker_id}")
+
+        heartbeat_count = 0
+        last_log_time = time.time()
+        first_heartbeat = True
+
+        while not self._stop_event.is_set():
+            try:
+                current_time = time.time()
+
+                needs_full_init = False
+                publish_online_signal = False
+
+                old_alive = self.redis_client.hget(self.worker_key, 'is_alive')
+                consumer_id = self.redis_client.hget(self.worker_key, 'consumer_id')
+
+                if not consumer_id:
+                    needs_full_init = True
+                    publish_online_signal = True
+                    logger.warning(f"Worker {self.worker_id} key missing critical fields, reinitializing...")
+                elif first_heartbeat and old_alive != 'true':
+                    publish_online_signal = True
+
+                # Mark the first heartbeat as done (after the first iteration's checks have run)
+                if first_heartbeat:
+                    first_heartbeat = False
+
+                if needs_full_init:
+                    worker_info = {
+                        'consumer_id': self.worker_id,
+                        'host': hostname,
+                        'pid': pid,
+                        'created_at': str(current_time),
+                        'last_heartbeat': str(current_time),
+                        'is_alive': 'true',
+                        'messages_transferred': 'false',
+                        'heartbeat_timeout': str(self.heartbeat_timeout),
+                    }
+
+                    if self.queues:
+                        worker_info['queues'] = ','.join(sorted(self.queues))
+
+                    self.redis_client.hset(self.worker_key, mapping=worker_info)
+                    logger.info(f"Reinitialized worker {self.worker_id} with full info")
+                else:
+                    # Build the heartbeat update payload
+                    heartbeat_update = {
+                        'last_heartbeat': str(current_time),
+                        'is_alive': 'true',
+                        'host': hostname
+                    }
+
+                    # If this is an offline-to-online transition (reused worker ID), reset messages_transferred
+                    if publish_online_signal:
+                        heartbeat_update['messages_transferred'] = 'false'
+                        logger.debug(f"Worker {self.worker_id} reused, reset messages_transferred=false")
+
+                    self.redis_client.hset(self.worker_key, mapping=heartbeat_update)
+
+                self.redis_client.zadd(
+                    f"{self.redis_prefix}:ACTIVE_WORKERS",
+                    {self.worker_id: current_time}
+                )
+
+                if publish_online_signal:
+                    state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
+                    message = json.dumps({
+                        'worker_id': self.worker_id,
+                        'state': 'online',
+                        'timestamp': current_time
+                    })
+                    result = self.redis_client.publish(state_change_channel, message)
+                    logger.info(f"Worker {self.worker_id} is now ONLINE, published to {result} subscribers")
+
+                    workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
+                    self.redis_client.sadd(workers_registry_key, self.worker_id)
+
+                with self._last_heartbeat_time_lock:
+                    self._last_heartbeat_time = current_time
+
+                heartbeat_count += 1
+
+                # If this was the first heartbeat, notify waiting threads
+                if heartbeat_count == 1:
+                    self._first_heartbeat_done.set()
+                    logger.debug(f"First heartbeat completed for worker {self.worker_id}")
+
+                if current_time - last_log_time >= 30:
+                    logger.debug(f"Heartbeat thread: sent {heartbeat_count} heartbeats for worker {self.worker_id}")
+                    last_log_time = current_time
+                    heartbeat_count = 0
+
+            except Exception as e:
+                logger.error(f"Error in heartbeat thread: {e}", exc_info=True)
+                if "Timeout connecting" in str(e) or "Connection" in str(e):
+                    try:
+                        self.redis_client.close()
+                    except:
+                        pass
+                    try:
+                        if self.redis_url:
+                            from jettask.utils.db_connector import get_sync_redis_client
+                            self.redis_client = get_sync_redis_client(
+                                redis_url=self.redis_url,
+                                decode_responses=True,
+                            )
+                            logger.info(f"Reconnected to Redis for heartbeat thread {self.worker_id}")
+                    except Exception as reconnect_error:
+                        logger.error(f"Failed to reconnect Redis: {reconnect_error}")
+                    time.sleep(5)
+
+            self._stop_event.wait(timeout=self.interval)
+
+        logger.info(f"Heartbeat thread exiting for worker {self.worker_id}")
+        try:
+            current_time = time.time()
+            pipeline = self.redis_client.pipeline()
+            pipeline.hset(self.worker_key, mapping={
+                'is_alive': 'false',
+                'offline_time': str(current_time),
+                'shutdown_reason': 'heartbeat_stopped',
+                'messages_transferred': 'false'
+            })
+
+            state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
+            message = json.dumps({
+                'worker_id': self.worker_id,
+                'state': 'offline',
+                'timestamp': current_time
+            })
+            pipeline.publish(state_change_channel, message)
+            pipeline.execute()
+
+            logger.info(f"Worker {self.worker_id} marked as offline")
+        except Exception as e:
+            logger.error(f"Error marking worker offline: {e}", exc_info=True)
+
+    def add_queue(self, queue: str, worker_key: str):
+        """Add a queue."""
+        self.queues.add(queue)
+
+        if self.worker_key is None:
+            self.worker_key = worker_key
+            parts = worker_key.split(':')
+            if len(parts) >= 3:
+                self.redis_prefix = parts[0]
+                self.worker_id = parts[2]
+            else:
+                logger.error(f"Invalid worker_key format: {worker_key}")
+                raise ValueError(f"Invalid worker_key format: {worker_key}")
+
+        if self._thread is not None and self._thread.is_alive():
+            logger.debug(f"Heartbeat thread already running, added queue {queue}")
+            return
+
+        self.start()
+        logger.debug(f"Started single heartbeat thread for worker {self.worker_id}")
+
+    def remove_queue(self, queue: str):
+        """Remove a queue."""
+        if queue in self.queues:
+            self.queues.remove(queue)
+            logger.debug(f"Removed queue {queue} from heartbeat monitoring")
+
+        if not self.queues:
+            self.stop()
+            logger.debug("No more queues, stopped heartbeat thread")
+
+    def stop_all(self):
+        """Stop the heartbeat thread."""
+        self.stop()
+        self.queues.clear()
+
+    def is_healthy(self) -> bool:
+        """Check whether the heartbeat thread is healthy."""
+        if not self._thread:
+            return len(self.queues) == 0
+
+        if not self._thread.is_alive():
+            logger.error(f"Heartbeat thread for worker {self.worker_id} is not alive")
+            return False
+        return True
+
+    def get_last_heartbeat_time(self) -> Optional[float]:
+        """Get the time of the most recent heartbeat."""
+        with self._last_heartbeat_time_lock:
+            return self._last_heartbeat_time
+
+    def is_heartbeat_timeout(self) -> bool:
+        """Check whether the heartbeat has timed out."""
+        last_heartbeat = self.get_last_heartbeat_time()
+        if last_heartbeat is None:
+            return False
+
+        current_time = time.time()
+        return (current_time - last_heartbeat) > self.heartbeat_timeout
+
+
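HeartbeatThreadManager mirrors the same Redis protocol for synchronous callers such as the CLI process; its create_and_start() blocks until the first heartbeat lands or 10 seconds pass. A sketch, assuming a synchronous redis-py client on the same local instance:

    import time
    import redis

    from jettask.worker.lifecycle import HeartbeatThreadManager

    r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)

    hb = HeartbeatThreadManager.create_and_start(
        redis_client=r,
        redis_prefix="jettask",
        queues=["default"],
        interval=5.0,
    )
    print("worker:", hb.worker_id, "healthy:", hb.is_healthy())

    time.sleep(12)                        # heartbeats continue on the daemon thread
    print("timed out:", hb.is_heartbeat_timeout())
    hb.stop()                             # the loop's exit path marks the worker offline
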
|
1092
|
+
# ============================================================================
|
1093
|
+
# Worker 扫描器
|
1094
|
+
# ============================================================================
|
1095
|
+
|
1096
|
+
class WorkerScanner:
|
1097
|
+
"""使用 Redis Sorted Set 优化的 Worker 扫描器
|
1098
|
+
|
1099
|
+
核心优化:
|
1100
|
+
1. O(log N) 的超时检测复杂度
|
1101
|
+
2. 自动一致性维护
|
1102
|
+
3. 原子性操作保证数据一致
|
1103
|
+
"""
|
1104
|
+
|
1105
|
+
def __init__(self, sync_redis, async_redis, redis_prefix: str = 'jettask',
|
1106
|
+
heartbeat_timeout: float = 3.0, worker_prefix: str = 'WORKER',
|
1107
|
+
worker_state_manager=None):
|
1108
|
+
self.redis = sync_redis
|
1109
|
+
self.async_redis = async_redis
|
1110
|
+
self.redis_prefix = redis_prefix
|
1111
|
+
self.worker_prefix = worker_prefix
|
1112
|
+
self.heartbeat_timeout = heartbeat_timeout
|
1113
|
+
self.active_workers_key = f"{redis_prefix}:ACTIVE_WORKERS"
|
1114
|
+
self.worker_state_manager = worker_state_manager
|
1115
|
+
|
1116
|
+
self._initialized = False
|
1117
|
+
self._last_full_sync = 0
|
1118
|
+
self._full_sync_interval = 60
|
1119
|
+
self._scan_counter = 0
|
1120
|
+
self._partial_check_interval = 10
|
1121
|
+
|
1122
|
+
async def scan_timeout_workers(self) -> List[Dict]:
|
1123
|
+
"""快速扫描超时的 worker - O(log N) 复杂度"""
|
1124
|
+
self._scan_counter += 1
|
1125
|
+
if self._scan_counter >= self._partial_check_interval:
|
1126
|
+
self._scan_counter = 0
|
1127
|
+
asyncio.create_task(self._partial_check())
|
1128
|
+
|
1129
|
+
current_time = time.time()
|
1130
|
+
max_possible_timeout = 300
|
1131
|
+
cutoff_time = current_time - max_possible_timeout
|
1132
|
+
|
1133
|
+
potential_timeout_worker_ids = await self.async_redis.zrangebyscore(
|
1134
|
+
self.active_workers_key,
|
1135
|
+
min=0,
|
1136
|
+
max=current_time - 1
|
1137
|
+
)
|
1138
|
+
|
1139
|
+
if not potential_timeout_worker_ids:
|
1140
|
+
return []
|
1141
|
+
|
1142
|
+
if self.worker_state_manager:
|
1143
|
+
all_workers_info = await self.worker_state_manager.get_all_workers_info(only_alive=False)
|
1144
|
+
workers_data = [all_workers_info.get(wid) for wid in potential_timeout_worker_ids]
|
1145
|
+
else:
|
1146
|
+
pipeline = self.async_redis.pipeline()
|
1147
|
+
for worker_id in potential_timeout_worker_ids:
|
1148
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1149
|
+
pipeline.hgetall(worker_key)
|
1150
|
+
workers_data = await pipeline.execute()
|
1151
|
+
|
1152
|
+
result = []
|
1153
|
+
cleanup_pipeline = self.async_redis.pipeline()
|
1154
|
+
need_cleanup = False
|
1155
|
+
|
1156
|
+
for worker_id, worker_data in zip(potential_timeout_worker_ids, workers_data):
|
1157
|
+
if not worker_data:
|
1158
|
+
cleanup_pipeline.zrem(self.active_workers_key, worker_id)
|
1159
|
+
workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
|
1160
|
+
cleanup_pipeline.srem(workers_registry_key, worker_id)
|
1161
|
+
need_cleanup = True
|
1162
|
+
continue
|
1163
|
+
|
1164
|
+
worker_heartbeat_timeout = float(worker_data.get('heartbeat_timeout', self.heartbeat_timeout))
|
1165
|
+
last_heartbeat = float(worker_data.get('last_heartbeat', 0))
|
1166
|
+
worker_cutoff_time = current_time - worker_heartbeat_timeout
|
1167
|
+
|
1168
|
+
if last_heartbeat >= worker_cutoff_time:
|
1169
|
+
cleanup_pipeline.zadd(self.active_workers_key, {worker_id: last_heartbeat})
|
1170
|
+
need_cleanup = True
|
1171
|
+
continue
|
1172
|
+
|
1173
|
+
is_alive = worker_data.get('is_alive', 'true') == 'true' if self.worker_state_manager else worker_data.get('is_alive', 'true').lower() == 'true'
|
1174
|
+
if not is_alive:
|
1175
|
+
cleanup_pipeline.zrem(self.active_workers_key, worker_id)
|
1176
|
+
need_cleanup = True
|
1177
|
+
continue
|
1178
|
+
|
1179
|
+
logger.debug(f"Worker {worker_id} timeout: last_heartbeat={last_heartbeat}, timeout={worker_heartbeat_timeout}s")
|
1180
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1181
|
+
result.append({
|
1182
|
+
'worker_key': worker_key,
|
1183
|
+
'worker_data': worker_data,
|
1184
|
+
'worker_id': worker_id
|
1185
|
+
})
|
1186
|
+
|
1187
|
+
if need_cleanup:
|
1188
|
+
await cleanup_pipeline.execute()
|
1189
|
+
|
1190
|
+
if result:
|
1191
|
+
logger.info(f"Found {len(result)} timeout workers")
|
1192
|
+
|
1193
|
+
return result
|
1194
|
+
|
1195
|
+
async def update_heartbeat(self, worker_id: str, heartbeat_time: Optional[float] = None):
|
1196
|
+
"""原子性更新心跳"""
|
1197
|
+
if heartbeat_time is None:
|
1198
|
+
heartbeat_time = time.time()
|
1199
|
+
|
1200
|
+
pipeline = self.async_redis.pipeline()
|
1201
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1202
|
+
|
1203
|
+
pipeline.hset(worker_key, 'last_heartbeat', str(heartbeat_time))
|
1204
|
+
pipeline.zadd(self.active_workers_key, {worker_id: heartbeat_time})
|
1205
|
+
|
1206
|
+
await pipeline.execute()
|
1207
|
+
|
1208
|
+
async def add_worker(self, worker_id: str, worker_data: Dict):
|
1209
|
+
"""添加新 worker"""
|
1210
|
+
heartbeat_time = float(worker_data.get('last_heartbeat', time.time()))
|
1211
|
+
|
1212
|
+
pipeline = self.async_redis.pipeline()
|
1213
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1214
|
+
|
1215
|
+
pipeline.hset(worker_key, mapping=worker_data)
|
1216
|
+
pipeline.zadd(self.active_workers_key, {worker_id: heartbeat_time})
|
1217
|
+
|
1218
|
+
await pipeline.execute()
|
1219
|
+
logger.debug(f"Added worker {worker_id} to system")
|
1220
|
+
|
1221
|
+
async def remove_worker(self, worker_id: str):
|
1222
|
+
"""移除 worker"""
|
1223
|
+
if self.worker_state_manager:
|
1224
|
+
await self.worker_state_manager.set_worker_offline(worker_id, reason="heartbeat_timeout")
|
1225
|
+
else:
|
1226
|
+
pipeline = self.async_redis.pipeline()
|
1227
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1228
|
+
|
1229
|
+
pipeline.hset(worker_key, 'is_alive', 'false')
|
1230
|
+
pipeline.zrem(self.active_workers_key, worker_id)
|
1231
|
+
|
1232
|
+
await pipeline.execute()
|
1233
|
+
logger.debug(f"Removed worker {worker_id} from active set (direct mode)")
|
1234
|
+
|
1235
|
+
if self.worker_state_manager:
|
1236
|
+
await self.async_redis.zrem(self.active_workers_key, worker_id)
|
1237
|
+
|
1238
|
+
async def cleanup_stale_workers(self, max_age_seconds: float = 3600):
|
1239
|
+
"""清理过期的 worker 记录"""
|
1240
|
+
current_time = time.time()
|
1241
|
+
cutoff_time = current_time - max_age_seconds
|
1242
|
+
|
1243
|
+
stale_worker_ids = await self.async_redis.zrangebyscore(
|
1244
|
+
self.active_workers_key,
|
1245
|
+
min=0,
|
1246
|
+
max=cutoff_time
|
1247
|
+
)
|
1248
|
+
|
1249
|
+
if not stale_worker_ids:
|
1250
|
+
return 0
|
1251
|
+
|
1252
|
+
pipeline = self.async_redis.pipeline()
|
1253
|
+
|
1254
|
+
for worker_id in stale_worker_ids:
|
1255
|
+
worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
|
1256
|
+
pipeline.delete(worker_key)
|
1257
|
+
|
1258
|
+
pipeline.zrem(self.active_workers_key, *stale_worker_ids)
|
1259
|
+
|
1260
|
+
await pipeline.execute()
|
1261
|
+
|
1262
|
+
logger.info(f"Cleaned up {len(stale_worker_ids)} stale worker records")
|
1263
|
+
return len(stale_worker_ids)
|
1264
|
+
|
1265
|
+
    async def _partial_check(self):
        """Partial consistency check."""
        try:
            sample_size = min(10, await self.async_redis.zcard(self.active_workers_key))
            if sample_size == 0:
                return

            # Sample a few random members and reconcile their ZSET scores
            # against the heartbeat stored in each worker's hash.
            random_workers = await self.async_redis.zrandmember(
                self.active_workers_key, sample_size, withscores=True
            )

            for worker_id, zset_score in random_workers:
                worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
                hash_heartbeat = await self.async_redis.hget(worker_key, 'last_heartbeat')

                if not hash_heartbeat:
                    # Orphaned ZSET entry: the hash no longer exists.
                    await self.async_redis.zrem(self.active_workers_key, worker_id)
                    logger.debug(f"Partial check: removed {worker_id}")
                else:
                    hash_time = float(hash_heartbeat)
                    if abs(hash_time - zset_score) > 1.0:
                        # Drifted by more than a second: the hash wins.
                        await self.async_redis.zadd(self.active_workers_key, {worker_id: hash_time})
                        logger.debug(f"Partial check: synced {worker_id}")

        except Exception as e:
            logger.debug(f"Partial check error: {e}")
    async def get_active_count(self) -> int:
        """Get the number of active workers - O(1)."""
        return await self.async_redis.zcard(self.active_workers_key)


# ============================================================================
# Worker lifecycle
# ============================================================================
class WorkerLifecycle:
    """Worker lifecycle management.

    Responsibilities:
    - Initialize a worker (generate ID, register, start heartbeat)
    - Clean up a worker (stop heartbeat, unregister, mark offline)
    """

    def __init__(
        self,
        redis_client,
        async_redis_client,
        redis_prefix: str,
        naming: 'WorkerNaming',
        state_manager: 'WorkerStateManager',
        registry: 'WorkerRegistry',
        heartbeat_class
    ):
        """Initialize the lifecycle manager."""
        self.redis_client = redis_client
        self.async_redis_client = async_redis_client
        self.redis_prefix = redis_prefix
        self.naming = naming
        self.state = state_manager
        self.registry = registry
        self.heartbeat_class = heartbeat_class
        self.active_heartbeats: Dict[str, Any] = {}
    async def initialize_worker(
        self,
        prefix: str,
        queues: List[str],
        reuse_offline: bool = True
    ) -> str:
        """Initialize a worker."""
        worker_id = None
        if reuse_offline:
            # Prefer recycling an offline worker's ID before minting a new one.
            worker_id = await self.naming.find_reusable_worker_id(prefix, self.registry)

        if not worker_id:
            worker_id = self.naming.generate_worker_id(prefix)

        logger.info(f"Initializing worker: {worker_id}")

        await self.state.set_worker_online(
            worker_id=worker_id,
            queues=queues,
            pid=os.getpid(),
            host=socket.gethostname()
        )

        await self.registry.register(worker_id)

        worker_key = f"{self.redis_prefix}:WORKER:{worker_id}"
        heartbeat = self.heartbeat_class(
            redis_client=self.redis_client,
            worker_key=worker_key,
            worker_id=worker_id,
            redis_prefix=self.redis_prefix,
            interval=5.0
        )

        for queue in queues:
            heartbeat.queues.add(queue)

        heartbeat.start()
        self.active_heartbeats[worker_id] = heartbeat

        logger.info(f"Worker initialized successfully: {worker_id}")
        return worker_id
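A hedged usage sketch of the lifecycle API above, assuming a fully constructed `WorkerLifecycle` instance (the queue name is illustrative):

```python
import socket

async def run_worker(lifecycle: WorkerLifecycle) -> None:
    # Acquire (or mint) a worker ID, register it, and start the heartbeat.
    worker_id = await lifecycle.initialize_worker(
        prefix=socket.gethostname(),
        queues=["default"],
        reuse_offline=True,
    )
    try:
        ...  # consume and execute tasks here
    finally:
        # Stop the heartbeat, mark offline, and unregister.
        await lifecycle.cleanup_worker(worker_id)
```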
    async def cleanup_worker(self, worker_id: str):
        """Clean up worker resources."""
        logger.info(f"Cleaning up worker: {worker_id}")

        try:
            if worker_id in self.active_heartbeats:
                heartbeat = self.active_heartbeats[worker_id]
                heartbeat.stop()
                del self.active_heartbeats[worker_id]

            await self.state.set_worker_offline(worker_id)
            await self.registry.unregister(worker_id)

            logger.info(f"Worker cleaned up successfully: {worker_id}")
        except Exception as e:
            logger.error(f"Error cleaning up worker {worker_id}: {e}")
            raise
    async def record_task_start(self, worker_id: str, queue: str):
        """Record a task start."""
        await self.state.increment_queue_stats(
            worker_id=worker_id,
            queue=queue,
            running_tasks_delta=1
        )
    async def record_task_finish(
        self,
        worker_id: str,
        queue: str,
        success: bool,
        duration: float
    ):
        """Record a task completion."""
        await self.state.increment_queue_stats(
            worker_id=worker_id,
            queue=queue,
            running_tasks_delta=-1,
            success_count_increment=1 if success else 0,
            failed_count_increment=0 if success else 1,
            total_count_increment=1,
            processing_time_increment=duration
        )

        # Update the average processing time.
        stats = await self.state.get_queue_total_stats(worker_id, queue)
        if stats['total_count'] > 0:
            avg_time = stats['total_processing_time'] / stats['total_count']
            await self.state.update_queue_stats(
                worker_id=worker_id,
                queue=queue,
                avg_processing_time=avg_time
            )
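The average is recomputed from the accumulated totals each time rather than adjusted incrementally, so it always agrees with whatever the counters currently hold. A small worked example of the same arithmetic (durations chosen to be exact in binary floating point):

```python
# After three tasks taking 0.25s, 0.5s and 0.75s:
total_processing_time = 0.25 + 0.5 + 0.75   # 1.5
total_count = 3
avg_processing_time = total_processing_time / total_count
assert avg_processing_time == 0.5
```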
    async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, Any]]:
        """Get worker info."""
        return await self.state.get_worker_info(worker_id)


# ============================================================================
# Compatibility layer: HeartbeatConsumerStrategy (for backward compatibility)
# ============================================================================
class HeartbeatConsumerStrategy:
    """
    Compatibility class - provides backward compatibility for legacy code.

    ⚠️ Deprecated: use WorkerManager and WorkerNaming instead.
    """

    def __init__(self, redis_client, config: Dict = None, app=None):
        self.redis = redis_client
        self.config = config or {}
        self.app = app
        self.redis_prefix = self.config.get('redis_prefix', 'jettask')

        # If the app already carries a worker_id, use it directly
        # (child processes reuse the parent process's ID).
        if app and hasattr(app, 'worker_id') and app.worker_id:
            self.consumer_id = app.worker_id
            self._worker_key = app.worker_key or f'{self.redis_prefix}:WORKER:{app.worker_id}'
            logger.info(f"[PID {os.getpid()}] HeartbeatConsumerStrategy using provided worker_id: {self.consumer_id}")
        else:
            self.consumer_id = None
            self._worker_key = None

        # Derive the hostname prefix.
        try:
            hostname = socket.gethostname()
            ip = socket.gethostbyname(hostname)
            prefix = hostname if hostname != 'localhost' else ip
        except Exception:
            prefix = os.environ.get('HOSTNAME', 'unknown')

        self.hostname_prefix = prefix
    def _ensure_consumer_id(self):
        """Ensure a consumer_id has been created (legacy compatibility)."""
        if self.consumer_id is None:
            # Generate one via WorkerNaming.
            from .manager import WorkerNaming
            naming = WorkerNaming()
            self.consumer_id = naming.generate_worker_id(self.hostname_prefix)
            self._worker_key = f'{self.redis_prefix}:WORKER:{self.consumer_id}'
            logger.info(f"[PID {os.getpid()}] Generated NEW worker ID: {self.consumer_id}")
        else:
            logger.debug(f"[PID {os.getpid()}] Reusing existing worker ID: {self.consumer_id}")
    def get_consumer_name(self, queue: str) -> str:
        """
        Get the consumer name.

        Unified group_name architecture: every queue (including priority
        queues) derives its consumer name from the base queue name.
        For example, robust_bench2 and robust_bench2:8 both use
        "YYDG-xxx-robust_bench2".
        """
        self._ensure_consumer_id()

        # Extract the base queue name (strip any priority suffix).
        base_queue = queue
        if ':' in queue and queue.rsplit(':', 1)[1].isdigit():
            base_queue = queue.rsplit(':', 1)[0]

        return f"{self.consumer_id}-{base_queue}"
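The rule above treats a trailing `:<digits>` segment as a priority suffix. A standalone restatement, as a hypothetical helper with a few checks:

```python
def base_queue_name(queue: str) -> str:
    """Strip a trailing numeric priority suffix, if present."""
    if ':' in queue and queue.rsplit(':', 1)[1].isdigit():
        return queue.rsplit(':', 1)[0]
    return queue

assert base_queue_name("robust_bench2") == "robust_bench2"
assert base_queue_name("robust_bench2:8") == "robust_bench2"
assert base_queue_name("emails:high") == "emails:high"  # non-numeric suffix kept
```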
    def cleanup(self):
        """Clean up resources (legacy compatibility)."""
        pass


__all__ = [
    'WorkerStateManager',
    'HeartbeatThreadManager',
    'WorkerScanner',
    'WorkerLifecycle',
    'HeartbeatConsumerStrategy',  # compatibility
]