jettask 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (165)
  1. jettask/__init__.py +60 -2
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.18.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/worker/lifecycle.py (new file)
@@ -0,0 +1,1507 @@
1
+ """
2
+ Worker 生命周期管理
3
+
4
+ 整合了以下模块的功能:
5
+ - state_manager.py: Worker 状态管理
6
+ - heartbeat_thread.py: 心跳线程管理
7
+ - scanner.py: Worker 超时扫描
8
+ - core.py: WorkerLifecycle, WorkerStatistics
9
+ - consumer_manager.py: HeartbeatConsumerStrategy 的心跳和统计逻辑
10
+ """
11
+
12
+ import os
13
+ import socket
14
+ import uuid
15
+ import time
16
+ import asyncio
17
+ import logging
18
+ import threading
19
+ import json
20
+ from typing import Dict, List, Optional, Set, Callable, Any
21
+ from collections import defaultdict, namedtuple
22
+ from redis.asyncio.lock import Lock as AsyncLock
23
+ import redis
24
+ import redis.asyncio as aioredis
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ============================================================================
30
+ # Worker state management
31
+ # ============================================================================
32
+
33
+ class WorkerStateManager:
34
+ """Worker状态管理器 - Worker状态的唯一管理入口
35
+
36
+ ⚠️ 重要:所有Worker状态的修改都必须通过这个类进行,不要直接操作Redis!
37
+
38
+ 职责:
39
+ 1. 统一管理worker所有状态字段的读写
40
+ 2. 当关键状态变更时,通过Redis Pub/Sub发送信号
41
+ 3. 提供状态变更监听机制
42
+ 4. 维护worker在ACTIVE_WORKERS sorted set中的记录
43
+ """
44
+
45
+ def __init__(self, redis_client: aioredis.Redis, redis_prefix: str = "jettask", event_pool=None):
46
+ """初始化Worker状态管理器
47
+
48
+ Args:
49
+ redis_client: 异步Redis客户端
50
+ redis_prefix: Redis key前缀
51
+ event_pool: EventPool实例(可选),用于事件驱动的消息恢复
52
+ """
53
+ self.redis = redis_client
54
+ self.redis_prefix = redis_prefix
55
+ self.active_workers_key = f"{redis_prefix}:ACTIVE_WORKERS"
56
+ self.event_pool = event_pool
57
+
58
+ # Pub/Sub channel name
59
+ self.worker_state_channel = f"{redis_prefix}:WORKER_STATE_CHANGE"
60
+
61
+ # Listener subscription state
62
+ self._pubsub = None
63
+ self._listener_task: Optional[asyncio.Task] = None
64
+ self._running = False
65
+ self._callbacks: Set[Callable] = set()
66
+
67
+ # Pub/Sub configuration
68
+ self._health_check_interval = 60
69
+ self._health_check_task: Optional[asyncio.Task] = None
70
+
71
+ def _get_worker_key(self, worker_id: str) -> str:
72
+ """获取worker的Redis key"""
73
+ return f"{self.redis_prefix}:WORKER:{worker_id}"
74
+
75
+ async def initialize_worker(self, worker_id: str, worker_info: Dict[str, Any]):
76
+ """初始化worker(首次创建)"""
77
+ worker_key = self._get_worker_key(worker_id)
78
+ current_time = time.time()
79
+
80
+ worker_info.setdefault('is_alive', 'true')
81
+ worker_info.setdefault('messages_transferred', 'false')
82
+ worker_info.setdefault('created_at', str(current_time))
83
+ worker_info.setdefault('last_heartbeat', str(current_time))
84
+
85
+ pipeline = self.redis.pipeline()
86
+ pipeline.hset(worker_key, mapping=worker_info)
87
+ pipeline.zadd(self.active_workers_key, {worker_id: current_time})
88
+ await pipeline.execute()
89
+
90
+ logger.debug(f"Initialized worker {worker_id}")
91
+
92
+ async def set_worker_online(self, worker_id: str, worker_data: dict = None):
93
+ """设置worker为在线状态"""
94
+ worker_key = self._get_worker_key(worker_id)
95
+ old_alive = await self.redis.hget(worker_key, 'is_alive')
96
+ old_alive = old_alive.decode('utf-8') if isinstance(old_alive, bytes) else old_alive
97
+
98
+ current_time = time.time()
99
+ pipeline = self.redis.pipeline()
100
+ pipeline.hset(worker_key, 'is_alive', 'true')
101
+ pipeline.hset(worker_key, 'last_heartbeat', str(current_time))
102
+
103
+ # When a worker goes from offline back to online, reset messages_transferred:
104
+ # this is a new worker instance whose messages have not been transferred yet
105
+ if old_alive != 'true':
106
+ pipeline.hset(worker_key, 'messages_transferred', 'false')
107
+
108
+ if worker_data:
109
+ pipeline.hset(worker_key, mapping=worker_data)
110
+
111
+ pipeline.zadd(self.active_workers_key, {worker_id: current_time})
112
+ await pipeline.execute()
113
+
114
+ if old_alive != 'true':
115
+ await self._publish_state_change(worker_id, 'online')
116
+ logger.debug(f"Worker {worker_id} is now ONLINE")
117
+
118
+ async def set_worker_offline(self, worker_id: str, reason: str = "unknown"):
119
+ """设置worker为离线状态"""
120
+ worker_key = self._get_worker_key(worker_id)
121
+ old_alive = await self.redis.hget(worker_key, 'is_alive')
122
+ old_alive = old_alive.decode('utf-8') if isinstance(old_alive, bytes) else old_alive
123
+
124
+ current_time = time.time()
125
+ pipeline = self.redis.pipeline()
126
+ pipeline.hset(worker_key, 'messages_transferred', 'false') # Reset the transfer flag so other workers can take over its messages
127
+ pipeline.hset(worker_key, 'is_alive', 'false')
128
+ pipeline.hset(worker_key, 'offline_reason', reason)
129
+ pipeline.hset(worker_key, 'offline_time', str(current_time))
130
+ pipeline.zrem(self.active_workers_key, worker_id)
131
+ await pipeline.execute()
132
+
133
+ if old_alive == 'true':
134
+ await self._publish_state_change(worker_id, 'offline', reason)
135
+ logger.debug(f"Worker {worker_id} is now OFFLINE (reason: {reason})")
136
+
137
+ async def update_worker_heartbeat(self, worker_id: str, heartbeat_data: dict = None):
138
+ """更新worker心跳(确保在线状态)"""
139
+ worker_key = self._get_worker_key(worker_id)
140
+ current_time = time.time()
141
+
142
+ pipeline = self.redis.pipeline()
143
+ pipeline.hset(worker_key, 'is_alive', 'true')
144
+ pipeline.hset(worker_key, 'last_heartbeat', str(current_time))
145
+
146
+ if heartbeat_data:
147
+ pipeline.hset(worker_key, mapping=heartbeat_data)
148
+
149
+ pipeline.zadd(self.active_workers_key, {worker_id: current_time})
150
+ await pipeline.execute()
151
+
152
+ async def update_worker_field(self, worker_id: str, field: str, value: str):
153
+ """更新worker的单个字段"""
154
+ worker_key = self._get_worker_key(worker_id)
155
+ await self.redis.hset(worker_key, field, value)
156
+
157
+ async def update_worker_fields(self, worker_id: str, fields: Dict[str, Any]):
158
+ """批量更新worker的多个字段"""
159
+ worker_key = self._get_worker_key(worker_id)
160
+ await self.redis.hset(worker_key, mapping=fields)
161
+
162
+ async def increment_queue_stats(self, worker_id: str, queue: str,
163
+ running_tasks_delta: int = None,
164
+ success_count_increment: int = None,
165
+ failed_count_increment: int = None,
166
+ total_count_increment: int = None,
167
+ processing_time_increment: float = None,
168
+ latency_time_increment: float = None):
169
+ """增量更新worker在特定队列上的累积统计信息"""
170
+ worker_key = self._get_worker_key(worker_id)
171
+ pipeline = self.redis.pipeline()
172
+
173
+ if running_tasks_delta is not None and running_tasks_delta != 0:
174
+ pipeline.hincrby(worker_key, f'{queue}:running_tasks', running_tasks_delta)
175
+
176
+ if success_count_increment is not None:
177
+ pipeline.hincrby(worker_key, f'{queue}:success_count', success_count_increment)
178
+
179
+ if failed_count_increment is not None:
180
+ pipeline.hincrby(worker_key, f'{queue}:failed_count', failed_count_increment)
181
+
182
+ if total_count_increment is not None:
183
+ pipeline.hincrby(worker_key, f'{queue}:total_count', total_count_increment)
184
+
185
+ if processing_time_increment is not None:
186
+ pipeline.hincrbyfloat(worker_key, f'{queue}:total_processing_time', processing_time_increment)
187
+
188
+ if latency_time_increment is not None:
189
+ pipeline.hincrbyfloat(worker_key, f'{queue}:total_latency_time', latency_time_increment)
190
+
191
+ await pipeline.execute()
192
+
193
+ async def get_queue_total_stats(self, worker_id: str, queue: str) -> dict:
194
+ """获取队列的累积统计数据"""
195
+ worker_key = self._get_worker_key(worker_id)
196
+ fields = [
197
+ f'{queue}:total_count',
198
+ f'{queue}:total_processing_time',
199
+ f'{queue}:total_latency_time'
200
+ ]
201
+ values = await self.redis.hmget(worker_key, fields)
202
+
203
+ return {
204
+ 'total_count': int(values[0]) if values[0] else 0,
205
+ 'total_processing_time': float(values[1]) if values[1] else 0.0,
206
+ 'total_latency_time': float(values[2]) if values[2] else 0.0
207
+ }
208
+
209
+ async def update_queue_stats(self, worker_id: str, queue: str,
210
+ running_tasks: int = None,
211
+ avg_processing_time: float = None,
212
+ avg_latency_time: float = None):
213
+ """更新worker在特定队列上的统计信息"""
214
+ worker_key = self._get_worker_key(worker_id)
215
+ pipeline = self.redis.pipeline()
216
+
217
+ if running_tasks is not None:
218
+ pipeline.hset(worker_key, f'{queue}:running_tasks', str(running_tasks))
219
+
220
+ if avg_processing_time is not None:
221
+ pipeline.hset(worker_key, f'{queue}:avg_processing_time', f'{avg_processing_time:.3f}')
222
+
223
+ if avg_latency_time is not None:
224
+ pipeline.hset(worker_key, f'{queue}:avg_latency_time', f'{avg_latency_time:.3f}')
225
+
226
+ await pipeline.execute()
227
+
228
+ async def mark_messages_transferred(self, worker_id: str, transferred: bool = True):
229
+ """标记worker的消息是否已转移"""
230
+ worker_key = self._get_worker_key(worker_id)
231
+ await self.redis.hset(worker_key, 'messages_transferred', 'true' if transferred else 'false')
232
+
233
+ async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, str]]:
234
+ """获取worker的完整信息"""
235
+ worker_key = self._get_worker_key(worker_id)
236
+ data = await self.redis.hgetall(worker_key)
237
+
238
+ if not data:
239
+ return None
240
+
241
+ result = {}
242
+ for k, v in data.items():
243
+ key = k.decode('utf-8') if isinstance(k, bytes) else k
244
+ value = v.decode('utf-8') if isinstance(v, bytes) else v
245
+ result[key] = value
246
+
247
+ return result
248
+
249
+ async def get_worker_field(self, worker_id: str, field: str) -> Optional[str]:
250
+ """获取worker的单个字段值"""
251
+ worker_key = self._get_worker_key(worker_id)
252
+ value = await self.redis.hget(worker_key, field)
253
+
254
+ if value is None:
255
+ return None
256
+
257
+ return value.decode('utf-8') if isinstance(value, bytes) else value
258
+
259
+ async def is_worker_alive(self, worker_id: str) -> bool:
260
+ """检查worker是否在线"""
261
+ is_alive = await self.get_worker_field(worker_id, 'is_alive')
262
+ return is_alive == 'true'
263
+
264
+ async def get_all_workers_info(self, only_alive: bool = True) -> Dict[str, Dict[str, str]]:
265
+ """获取所有worker的信息"""
266
+ pattern = f"{self.redis_prefix}:WORKER:*"
267
+ result = {}
268
+
269
+ cursor = 0
270
+ while True:
271
+ cursor, keys = await self.redis.scan(cursor, match=pattern, count=100)
272
+
273
+ for key in keys:
274
+ if isinstance(key, bytes):
275
+ key = key.decode('utf-8')
276
+
277
+ parts = key.split(":")
278
+ if len(parts) >= 3:
279
+ worker_id = parts[2]
280
+ worker_info = await self.get_worker_info(worker_id)
281
+ if worker_info:
282
+ if only_alive and worker_info.get('is_alive') != 'true':
283
+ continue
284
+ result[worker_id] = worker_info
285
+
286
+ if cursor == 0:
287
+ break
288
+
289
+ return result
290
+
291
+ async def delete_worker(self, worker_id: str):
292
+ """删除worker的所有数据"""
293
+ worker_key = self._get_worker_key(worker_id)
294
+ pipeline = self.redis.pipeline()
295
+ pipeline.delete(worker_key)
296
+ pipeline.zrem(self.active_workers_key, worker_id)
297
+ await pipeline.execute()
298
+ logger.debug(f"Deleted worker {worker_id}")
299
+
300
+ async def _publish_state_change(self, worker_id: str, state: str, reason: str = None):
301
+ """发布状态变更信号"""
302
+ message = {
303
+ 'worker_id': worker_id,
304
+ 'state': state,
305
+ 'timestamp': asyncio.get_event_loop().time()
306
+ }
307
+
308
+ if reason:
309
+ message['reason'] = reason
310
+
311
+ await self.redis.publish(
312
+ self.worker_state_channel,
313
+ json.dumps(message)
314
+ )
315
+
316
+ logger.debug(f"Published state change: {message}")
317
+
318
+ async def start_listener(self):
319
+ """启动状态变更监听器"""
320
+ if self._running:
321
+ logger.warning("Worker state listener already running")
322
+ return
323
+
324
+ self._running = True
325
+ self._pubsub = await self._create_and_subscribe_pubsub()
326
+ self._listener_task = asyncio.create_task(self._listen_loop())
327
+ self._health_check_task = asyncio.create_task(self._health_check_loop())
328
+
329
+ logger.debug(f"Started worker state listener on channel: {self.worker_state_channel}")
330
+
331
+ async def stop_listener(self):
332
+ """停止状态变更监听器"""
333
+ if not self._running:
334
+ return
335
+
336
+ self._running = False
337
+
338
+ if self._listener_task:
339
+ self._listener_task.cancel()
340
+ try:
341
+ await self._listener_task
342
+ except asyncio.CancelledError:
343
+ pass
344
+
345
+ if self._health_check_task:
346
+ self._health_check_task.cancel()
347
+ try:
348
+ await self._health_check_task
349
+ except asyncio.CancelledError:
350
+ pass
351
+
352
+ if self._pubsub:
353
+ await self._pubsub.unsubscribe(self.worker_state_channel)
354
+ await self._pubsub.close()
355
+
356
+ logger.debug("Stopped worker state listener")
357
+
358
+ async def _create_and_subscribe_pubsub(self):
359
+ """创建 PubSub 连接并订阅频道"""
360
+ if self._pubsub:
361
+ try:
362
+ await self._pubsub.close()
363
+ except:
364
+ pass
365
+
366
+ pubsub = self.redis.pubsub()
367
+ await pubsub.subscribe(self.worker_state_channel)
368
+
369
+ logger.debug(f"Created and subscribed to Redis Pub/Sub channel: {self.worker_state_channel}")
370
+ return pubsub
371
+
372
+ async def _health_check_loop(self):
373
+ """定期检查 Pub/Sub 连接健康状态"""
374
+ while self._running:
375
+ try:
376
+ await asyncio.sleep(self._health_check_interval)
377
+
378
+ if not self._running:
379
+ break
380
+
381
+ if self._pubsub and self._pubsub.connection:
382
+ try:
383
+ await asyncio.wait_for(self._pubsub.ping(), timeout=5.0)
384
+ logger.debug("Pub/Sub health check: OK")
385
+ except Exception as e:
386
+ logger.warning(f"Pub/Sub health check failed: {e}")
387
+ else:
388
+ logger.warning("Pub/Sub connection is None")
389
+
390
+ except asyncio.CancelledError:
391
+ logger.debug("Health check loop cancelled")
392
+ break
393
+ except Exception as e:
394
+ logger.error(f"Error in health check loop: {e}")
395
+
396
+ async def _listen_loop(self):
397
+ """监听循环(支持自动重连)"""
398
+ retry_delay = 1
399
+ max_retry_delay = 30
400
+
401
+ while self._running:
402
+ try:
403
+ async for message in self._pubsub.listen():
404
+ if message['type'] == 'message':
405
+ try:
406
+ data = json.loads(message['data'])
407
+
408
+ if data.get('state') == 'offline' and self.event_pool:
409
+ worker_id = data.get('worker_id')
410
+ if worker_id:
411
+ logger.info(f"[StateManager] Worker {worker_id} offline event received")
412
+ asyncio.create_task(
413
+ self.event_pool.handle_worker_offline_event(worker_id)
414
+ )
415
+
416
+ for callback in self._callbacks:
417
+ try:
418
+ if asyncio.iscoroutinefunction(callback):
419
+ await callback(data)
420
+ else:
421
+ callback(data)
422
+ except Exception as e:
423
+ logger.error(f"Error in state change callback: {e}")
424
+
425
+ except Exception as e:
426
+ logger.error(f"Error processing state change message: {e}")
427
+
428
+ retry_delay = 1
429
+
430
+ except asyncio.CancelledError:
431
+ logger.debug("Listen loop cancelled")
432
+ break
433
+ except Exception as e:
434
+ logger.error(f"Error in listen loop: {e}")
435
+
436
+ if not self._running:
437
+ break
438
+
439
+ logger.warning(f"Attempting to reconnect to Redis Pub/Sub in {retry_delay} seconds...")
440
+ await asyncio.sleep(retry_delay)
441
+
442
+ try:
443
+ self._pubsub = await self._create_and_subscribe_pubsub()
444
+ logger.info(f"Successfully reconnected to Redis Pub/Sub")
445
+ retry_delay = 1
446
+ except Exception as reconnect_error:
447
+ logger.error(f"Failed to reconnect to Redis Pub/Sub: {reconnect_error}")
448
+ retry_delay = min(retry_delay * 2, max_retry_delay)
449
+
450
+ logger.debug("Listen loop exited")
451
+
452
+ def register_callback(self, callback: Callable):
453
+ """注册状态变更回调"""
454
+ self._callbacks.add(callback)
455
+ logger.debug(f"Registered state change callback: {callback.__name__}")
456
+
457
+ def unregister_callback(self, callback: Callable):
458
+ """注销状态变更回调"""
459
+ self._callbacks.discard(callback)
460
+ logger.debug(f"Unregistered state change callback: {callback.__name__}")
461
+
462
+
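For orientation, a minimal usage sketch of WorkerStateManager (not from the package source); the Redis URL, prefix, and worker ID are illustrative assumptions.

import asyncio
import redis.asyncio as aioredis

from jettask.worker.lifecycle import WorkerStateManager

async def main():
    # Assumed local Redis instance; adjust the URL for your environment.
    redis_client = aioredis.from_url("redis://localhost:6379/0", decode_responses=True)
    state = WorkerStateManager(redis_client, redis_prefix="jettask")

    # React to signals published on the jettask:WORKER_STATE_CHANGE channel.
    def on_change(event: dict):
        print("state change:", event)

    state.register_callback(on_change)
    await state.start_listener()

    # Register a hypothetical worker, flip it online, then offline.
    await state.initialize_worker("demo-worker-1", {"host": "localhost", "pid": "1234"})
    await state.set_worker_online("demo-worker-1")
    print(await state.is_worker_alive("demo-worker-1"))   # True
    await state.set_worker_offline("demo-worker-1", reason="shutdown")

    await state.stop_listener()
    await redis_client.close()

asyncio.run(main())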
463
+ # ============================================================================
464
+ # Heartbeat management
465
+ # ============================================================================
466
+
467
+ class HeartbeatTaskManager:
468
+ """基于协程的心跳管理器(在主进程的独立事件循环线程中运行,轻量级)"""
469
+
470
+ def __init__(self, redis_client, worker_key: str, worker_id: str, redis_prefix: str,
471
+ interval: float = 5.0, heartbeat_timeout: float = 15.0, loop: asyncio.AbstractEventLoop = None):
472
+ """初始化心跳任务管理器
473
+
474
+ Args:
475
+ redis_client: 异步 Redis 客户端
476
+ worker_key: Worker 的 Redis key
477
+ worker_id: Worker ID
478
+ redis_prefix: Redis 前缀
479
+ interval: 心跳间隔(秒)
480
+ heartbeat_timeout: 心跳超时时间(秒)
481
+ loop: 事件循环(如果为None,会在当前线程创建新的)
482
+ """
483
+ self.redis_client = redis_client
484
+ self.worker_key = worker_key
485
+ self.worker_id = worker_id
486
+ self.redis_prefix = redis_prefix
487
+ self.interval = interval
488
+ self.heartbeat_timeout = heartbeat_timeout
489
+ self.queues: Set[str] = set()
490
+ self._last_heartbeat_time = None
491
+ self._loop = loop
492
+
493
+ # Heartbeat task and stop event
494
+ self._task: Optional[asyncio.Task] = None
495
+ self._stop_event: Optional[asyncio.Event] = None
496
+ self._first_heartbeat_done: Optional[asyncio.Event] = None
497
+ self._thread: Optional[threading.Thread] = None
498
+ self._thread_ready: Optional[threading.Event] = None
499
+
500
+ @classmethod
501
+ async def create_and_start(cls, redis_client, redis_prefix: str, queues: List[str] = None,
502
+ interval: float = 5.0, worker_state=None):
503
+ """
504
+ 创建心跳管理器并启动,生成 worker_id 后等待首次心跳成功
505
+
506
+ Args:
507
+ redis_client: 异步 Redis 客户端
508
+ redis_prefix: Redis 前缀
509
+ queues: 队列列表
510
+ interval: 心跳间隔
511
+ worker_state: WorkerState 实例(用于查找可复用的 worker_id)
512
+
513
+ Returns:
514
+ HeartbeatTaskManager 实例(包含 worker_id 和 worker_key 属性)
515
+ """
516
+ from jettask.worker.manager import WorkerNaming
517
+
518
+ # 1. Generate the worker_id
519
+ naming = WorkerNaming()
520
+
521
+ # Build the hostname prefix
522
+ try:
523
+ hostname = socket.gethostname()
524
+ ip = socket.gethostbyname(hostname)
525
+ prefix = hostname if hostname != 'localhost' else ip
526
+ except:
527
+ prefix = os.environ.get('HOSTNAME', 'unknown')
528
+
529
+ # Try to reuse an offline worker_id
530
+ reusable_id = None
531
+ if worker_state:
532
+ reusable_id = await naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
533
+
534
+ # Generate a new worker_id or reuse an existing one
535
+ if reusable_id:
536
+ worker_id = reusable_id
537
+ logger.info(f"[PID {os.getpid()}] Reusing offline worker ID: {worker_id}")
538
+ else:
539
+ worker_id = naming.generate_worker_id(prefix)
540
+ logger.info(f"[PID {os.getpid()}] Generated new worker ID: {worker_id}")
541
+
542
+ worker_key = f"{redis_prefix}:WORKER:{worker_id}"
543
+
544
+ # 2. Create the heartbeat manager
545
+ manager = cls(
546
+ redis_client=redis_client,
547
+ worker_key=worker_key,
548
+ worker_id=worker_id,
549
+ redis_prefix=redis_prefix,
550
+ interval=interval
551
+ )
552
+
553
+ # 3. Register the queues
554
+ if queues:
555
+ for queue in queues:
556
+ manager.queues.add(queue)
557
+
558
+ # 4. Start the heartbeat task
559
+ await manager.start()
560
+
561
+ # 5. Wait for the first successful heartbeat (up to 10 seconds)
562
+ try:
563
+ await asyncio.wait_for(manager._first_heartbeat_done.wait(), timeout=10)
564
+ except asyncio.TimeoutError:
565
+ logger.warning(f"Timeout waiting for first heartbeat for worker {worker_id}")
566
+
567
+ logger.info(f"Heartbeat task started for worker {worker_id}")
568
+ return manager
569
+
570
+ async def start(self):
571
+ """启动心跳任务"""
572
+ if self._task and not self._task.done():
573
+ logger.warning("Heartbeat task already running")
574
+ return
575
+
576
+ self._stop_event = asyncio.Event()
577
+ self._first_heartbeat_done = asyncio.Event()
578
+ self._task = asyncio.create_task(self._heartbeat_loop())
579
+
580
+ async def stop(self):
581
+ """停止心跳任务"""
582
+ if not self._task:
583
+ return
584
+
585
+ logger.debug(f"Stopping heartbeat task for worker {self.worker_id}")
586
+ self._stop_event.set()
587
+
588
+ try:
589
+ await asyncio.wait_for(self._task, timeout=2.0)
590
+ except asyncio.TimeoutError:
591
+ logger.warning("Heartbeat task did not stop in time, cancelling...")
592
+ self._task.cancel()
593
+ try:
594
+ await self._task
595
+ except asyncio.CancelledError:
596
+ pass
597
+
598
+ logger.debug("Heartbeat task stopped")
599
+
600
+ async def _heartbeat_loop(self):
601
+ """心跳循环(在协程中运行)"""
602
+ hostname = socket.gethostname()
603
+ pid = str(os.getpid())
604
+
605
+ logger.info(f"Heartbeat task starting for worker {self.worker_id}")
606
+
607
+ heartbeat_count = 0
608
+ last_log_time = time.time()
609
+ first_heartbeat = True
610
+
611
+ while not self._stop_event.is_set():
612
+ try:
613
+ current_time = time.time()
614
+
615
+ needs_full_init = False
616
+ publish_online_signal = False
617
+
618
+ old_alive = await self.redis_client.hget(self.worker_key, 'is_alive')
619
+ consumer_id = await self.redis_client.hget(self.worker_key, 'consumer_id')
620
+
621
+ if not consumer_id:
622
+ needs_full_init = True
623
+ publish_online_signal = True
624
+ logger.warning(f"Worker {self.worker_id} key missing critical fields, reinitializing...")
625
+ elif first_heartbeat and old_alive != b'true' and old_alive != 'true':
626
+ publish_online_signal = True
627
+
628
+ # Mark the first heartbeat as completed
629
+ if first_heartbeat:
630
+ first_heartbeat = False
631
+
632
+ if needs_full_init:
633
+ worker_info = {
634
+ 'consumer_id': self.worker_id,
635
+ 'host': hostname,
636
+ 'pid': pid,
637
+ 'created_at': str(current_time),
638
+ 'last_heartbeat': str(current_time),
639
+ 'is_alive': 'true',
640
+ 'messages_transferred': 'false',
641
+ 'heartbeat_timeout': str(self.heartbeat_timeout),
642
+ }
643
+
644
+ if self.queues:
645
+ worker_info['queues'] = ','.join(sorted(self.queues))
646
+
647
+ await self.redis_client.hset(self.worker_key, mapping=worker_info)
648
+ logger.info(f"Reinitialized worker {self.worker_id} with full info")
649
+ else:
650
+ # Build the heartbeat update payload
651
+ heartbeat_update = {
652
+ 'last_heartbeat': str(current_time),
653
+ 'is_alive': 'true',
654
+ 'host': hostname
655
+ }
656
+
657
+ # If going from offline to online (reused worker ID), reset messages_transferred
658
+ if publish_online_signal:
659
+ heartbeat_update['messages_transferred'] = 'false'
660
+ logger.debug(f"Worker {self.worker_id} reused, reset messages_transferred=false")
661
+
662
+ await self.redis_client.hset(self.worker_key, mapping=heartbeat_update)
663
+
664
+ await self.redis_client.zadd(
665
+ f"{self.redis_prefix}:ACTIVE_WORKERS",
666
+ {self.worker_id: current_time}
667
+ )
668
+
669
+ if publish_online_signal:
670
+ state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
671
+ message = json.dumps({
672
+ 'worker_id': self.worker_id,
673
+ 'state': 'online',
674
+ 'timestamp': current_time
675
+ })
676
+ result = await self.redis_client.publish(state_change_channel, message)
677
+ logger.info(f"Worker {self.worker_id} is now ONLINE, published to {result} subscribers")
678
+
679
+ workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
680
+ await self.redis_client.sadd(workers_registry_key, self.worker_id)
681
+
682
+ self._last_heartbeat_time = current_time
683
+ heartbeat_count += 1
684
+
685
+ # If this is the first heartbeat, notify waiting coroutines
686
+ if heartbeat_count == 1:
687
+ self._first_heartbeat_done.set()
688
+ logger.debug(f"First heartbeat completed for worker {self.worker_id}")
689
+
690
+ if current_time - last_log_time >= 30:
691
+ logger.debug(f"Heartbeat task: sent {heartbeat_count} heartbeats for worker {self.worker_id}")
692
+ last_log_time = current_time
693
+ heartbeat_count = 0
694
+
695
+ except Exception as e:
696
+ logger.error(f"Error in heartbeat task: {e}", exc_info=True)
697
+
698
+ # Wait for the next heartbeat
699
+ try:
700
+ await asyncio.wait_for(self._stop_event.wait(), timeout=self.interval)
701
+ break # Stop event was set, exit the loop
702
+ except asyncio.TimeoutError:
703
+ pass # Timeouts are expected; continue with the next heartbeat
704
+
705
+ logger.info(f"Heartbeat task stopped for worker {self.worker_id}")
706
+
707
+ async def mark_offline(self, reason: str = "shutdown"):
708
+ """标记 worker 为离线状态"""
709
+ try:
710
+ current_time = time.time()
711
+ state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
712
+
713
+ pipeline = self.redis_client.pipeline()
714
+ pipeline.hset(self.worker_key, 'is_alive', 'false')
715
+ pipeline.hset(self.worker_key, 'offline_reason', reason)
716
+ pipeline.hset(self.worker_key, 'offline_time', str(current_time))
717
+ pipeline.hset(self.worker_key, 'messages_transferred', 'false')
718
+ pipeline.zrem(f"{self.redis_prefix}:ACTIVE_WORKERS", self.worker_id)
719
+
720
+ message = json.dumps({
721
+ 'worker_id': self.worker_id,
722
+ 'state': 'offline',
723
+ 'reason': reason,
724
+ 'timestamp': current_time
725
+ })
726
+ pipeline.publish(state_change_channel, message)
727
+ await pipeline.execute()
728
+
729
+ logger.info(f"Worker {self.worker_id} marked as offline (reason: {reason})")
730
+ except Exception as e:
731
+ logger.error(f"Error marking worker offline: {e}", exc_info=True)
732
+
733
+
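A sketch (not from the package source) of driving the coroutine-based heartbeat from an async entry point via the create_and_start() factory above; the Redis URL and queue names are assumptions.

import asyncio
import redis.asyncio as aioredis

from jettask.worker.lifecycle import HeartbeatTaskManager

async def run_worker():
    redis_client = aioredis.from_url("redis://localhost:6379/0", decode_responses=True)

    # Generates (or reuses) a worker_id, starts the heartbeat coroutine and
    # waits up to 10 seconds for the first heartbeat to land in Redis.
    hb = await HeartbeatTaskManager.create_and_start(
        redis_client=redis_client,
        redis_prefix="jettask",
        queues=["orders", "emails"],
        interval=5.0,
    )
    print("worker id:", hb.worker_id, "key:", hb.worker_key)

    try:
        await asyncio.sleep(30)      # stand-in for the real worker loop
    finally:
        await hb.mark_offline(reason="shutdown")
        await hb.stop()
        await redis_client.close()

asyncio.run(run_worker())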
734
+ class HeartbeatThreadManager:
735
+ """基于线程的心跳管理器(在 CLI 主进程中运行)"""
736
+
737
+ def __init__(self, redis_client=None, worker_key=None, worker_id=None, redis_prefix=None,
738
+ interval=5.0, redis_url=None, consumer_id=None, heartbeat_interval=None,
739
+ heartbeat_timeout=15.0):
740
+ """初始化心跳线程管理器"""
741
+ if redis_url is not None:
742
+ from jettask.utils.db_connector import get_sync_redis_client
743
+ self.redis_client = get_sync_redis_client(redis_url, decode_responses=True)
744
+ self.redis_url = redis_url
745
+ self.consumer_id = consumer_id
746
+ self.heartbeat_interval = heartbeat_interval or 5.0
747
+ self.heartbeat_timeout = heartbeat_timeout
748
+ self.worker_key = None
749
+ self.worker_id = None
750
+ self.redis_prefix = redis_prefix
751
+ self.interval = self.heartbeat_interval
752
+ self.queues: Set[str] = set()
753
+ self._last_heartbeat_time = None
754
+ self._last_heartbeat_time_lock = threading.Lock()
755
+ else:
756
+ self.redis_client = redis_client
757
+ self.worker_key = worker_key
758
+ self.worker_id = worker_id
759
+ self.redis_prefix = redis_prefix
760
+ self.interval = interval
761
+ self.redis_url = None
762
+ self.consumer_id = worker_id
763
+ self.heartbeat_interval = interval
764
+ self.heartbeat_timeout = 15.0
765
+ self.queues: Set[str] = set()
766
+ self._last_heartbeat_time = None
767
+ self._last_heartbeat_time_lock = threading.Lock()
768
+
769
+ self._stop_event = threading.Event()
770
+ self._thread = None
771
+ self.heartbeat_process = self
772
+
773
+ # Event used to wait for the first heartbeat
774
+ self._first_heartbeat_done = threading.Event()
775
+
776
+ @classmethod
777
+ def create_and_start(cls, redis_client, redis_prefix: str, queues: List[str] = None,
778
+ interval: float = 5.0, worker_state=None):
779
+ """
780
+ 创建心跳管理器并启动,生成 worker_id 后等待首次心跳成功
781
+
782
+ Args:
783
+ redis_client: Redis 客户端
784
+ redis_prefix: Redis 前缀
785
+ queues: 队列列表
786
+ interval: 心跳间隔
787
+ worker_state: WorkerState 实例(用于查找可复用的 worker_id)
788
+
789
+ Returns:
790
+ HeartbeatThreadManager 实例(包含 worker_id 和 worker_key 属性)
791
+ """
792
+ from jettask.worker.manager import WorkerNaming
793
+
794
+ # 1. Generate the worker_id
795
+ naming = WorkerNaming()
796
+
797
+ # Build the hostname prefix
798
+ try:
799
+ hostname = socket.gethostname()
800
+ ip = socket.gethostbyname(hostname)
801
+ prefix = hostname if hostname != 'localhost' else ip
802
+ except:
803
+ prefix = os.environ.get('HOSTNAME', 'unknown')
804
+
805
+ # Try to reuse an offline worker_id (synchronously)
806
+ reusable_id = None
807
+ if worker_state:
808
+ import asyncio
809
+ try:
810
+ loop = asyncio.get_event_loop()
811
+ if not loop.is_running():
812
+ reusable_id = loop.run_until_complete(
813
+ naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
814
+ )
815
+ except RuntimeError:
816
+ # No running event loop; create a new one
817
+ loop = asyncio.new_event_loop()
818
+ asyncio.set_event_loop(loop)
819
+ try:
820
+ reusable_id = loop.run_until_complete(
821
+ naming.find_reusable_worker_id(prefix=prefix, worker_state=worker_state)
822
+ )
823
+ finally:
824
+ loop.close()
825
+
826
+ # Generate a new worker_id or reuse an existing one
827
+ if reusable_id:
828
+ worker_id = reusable_id
829
+ logger.info(f"[PID {os.getpid()}] Reusing offline worker ID: {worker_id}")
830
+ else:
831
+ worker_id = naming.generate_worker_id(prefix)
832
+ logger.info(f"[PID {os.getpid()}] Generated new worker ID: {worker_id}")
833
+
834
+ worker_key = f"{redis_prefix}:WORKER:{worker_id}"
835
+
836
+ # 2. Create the heartbeat manager
837
+ manager = cls(
838
+ redis_client=redis_client,
839
+ worker_key=worker_key,
840
+ worker_id=worker_id,
841
+ redis_prefix=redis_prefix,
842
+ interval=interval
843
+ )
844
+
845
+ # 3. Register the queues
846
+ if queues:
847
+ for queue in queues:
848
+ manager.queues.add(queue)
849
+
850
+ # 4. Start the heartbeat thread
851
+ manager.start()
852
+
853
+ # 5. Wait for the first successful heartbeat (up to 10 seconds)
854
+ if not manager._first_heartbeat_done.wait(timeout=10):
855
+ logger.warning(f"Timeout waiting for first heartbeat for worker {worker_id}")
856
+
857
+ # Return the manager object; callers can access manager.worker_id and manager.worker_key
858
+ return manager
859
+
860
+ def start(self):
861
+ """启动心跳线程"""
862
+ if self._thread and self._thread.is_alive():
863
+ logger.warning("Heartbeat thread already running")
864
+ return
865
+
866
+ self._stop_event.clear()
867
+ self._thread = threading.Thread(
868
+ target=self._heartbeat_loop,
869
+ name=f"Heartbeat-{self.worker_id}",
870
+ daemon=True
871
+ )
872
+ self._thread.start()
873
+ logger.info(f"Heartbeat thread started for worker {self.worker_id}")
874
+
875
+ def stop(self, timeout=2.0):
876
+ """停止心跳线程"""
877
+ if not self._thread:
878
+ return
879
+
880
+ logger.debug(f"Stopping heartbeat thread for worker {self.worker_id}")
881
+ self._stop_event.set()
882
+ self._thread.join(timeout=timeout)
883
+
884
+ if self._thread.is_alive():
885
+ logger.warning("Heartbeat thread did not stop in time")
886
+ else:
887
+ logger.debug("Heartbeat thread stopped")
888
+
889
+ def _heartbeat_loop(self):
890
+ """心跳循环(在线程中运行)"""
891
+ hostname = socket.gethostname()
892
+ pid = str(os.getpid())
893
+
894
+ logger.info(f"Heartbeat thread starting for worker {self.worker_id}")
895
+
896
+ heartbeat_count = 0
897
+ last_log_time = time.time()
898
+ first_heartbeat = True
899
+
900
+ while not self._stop_event.is_set():
901
+ try:
902
+ current_time = time.time()
903
+
904
+ needs_full_init = False
905
+ publish_online_signal = False
906
+
907
+ old_alive = self.redis_client.hget(self.worker_key, 'is_alive')
908
+ consumer_id = self.redis_client.hget(self.worker_key, 'consumer_id')
909
+
910
+ if not consumer_id:
911
+ needs_full_init = True
912
+ publish_online_signal = True
913
+ logger.warning(f"Worker {self.worker_id} key missing critical fields, reinitializing...")
914
+ elif first_heartbeat and old_alive != 'true':
915
+ publish_online_signal = True
916
+
917
+ # Mark the first heartbeat as completed (after the first-heartbeat logic has run)
918
+ if first_heartbeat:
919
+ first_heartbeat = False
920
+
921
+ if needs_full_init:
922
+ worker_info = {
923
+ 'consumer_id': self.worker_id,
924
+ 'host': hostname,
925
+ 'pid': pid,
926
+ 'created_at': str(current_time),
927
+ 'last_heartbeat': str(current_time),
928
+ 'is_alive': 'true',
929
+ 'messages_transferred': 'false',
930
+ 'heartbeat_timeout': str(self.heartbeat_timeout),
931
+ }
932
+
933
+ if self.queues:
934
+ worker_info['queues'] = ','.join(sorted(self.queues))
935
+
936
+ self.redis_client.hset(self.worker_key, mapping=worker_info)
937
+ logger.info(f"Reinitialized worker {self.worker_id} with full info")
938
+ else:
939
+ # Build the heartbeat update payload
940
+ heartbeat_update = {
941
+ 'last_heartbeat': str(current_time),
942
+ 'is_alive': 'true',
943
+ 'host': hostname
944
+ }
945
+
946
+ # If going from offline to online (reused worker ID), reset messages_transferred
947
+ if publish_online_signal:
948
+ heartbeat_update['messages_transferred'] = 'false'
949
+ logger.debug(f"Worker {self.worker_id} reused, reset messages_transferred=false")
950
+
951
+ self.redis_client.hset(self.worker_key, mapping=heartbeat_update)
952
+
953
+ self.redis_client.zadd(
954
+ f"{self.redis_prefix}:ACTIVE_WORKERS",
955
+ {self.worker_id: current_time}
956
+ )
957
+
958
+ if publish_online_signal:
959
+ state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
960
+ message = json.dumps({
961
+ 'worker_id': self.worker_id,
962
+ 'state': 'online',
963
+ 'timestamp': current_time
964
+ })
965
+ result = self.redis_client.publish(state_change_channel, message)
966
+ logger.info(f"Worker {self.worker_id} is now ONLINE, published to {result} subscribers")
967
+
968
+ workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
969
+ self.redis_client.sadd(workers_registry_key, self.worker_id)
970
+
971
+ with self._last_heartbeat_time_lock:
972
+ self._last_heartbeat_time = current_time
973
+
974
+ heartbeat_count += 1
975
+
976
+ # If this is the first heartbeat, notify waiting threads
977
+ if heartbeat_count == 1:
978
+ self._first_heartbeat_done.set()
979
+ logger.debug(f"First heartbeat completed for worker {self.worker_id}")
980
+
981
+ if current_time - last_log_time >= 30:
982
+ logger.debug(f"Heartbeat thread: sent {heartbeat_count} heartbeats for worker {self.worker_id}")
983
+ last_log_time = current_time
984
+ heartbeat_count = 0
985
+
986
+ except Exception as e:
987
+ logger.error(f"Error in heartbeat thread: {e}", exc_info=True)
988
+ if "Timeout connecting" in str(e) or "Connection" in str(e):
989
+ try:
990
+ self.redis_client.close()
991
+ except:
992
+ pass
993
+ try:
994
+ if self.redis_url:
995
+ from jettask.utils.db_connector import get_sync_redis_client
996
+ self.redis_client = get_sync_redis_client(
997
+ redis_url=self.redis_url,
998
+ decode_responses=True,
999
+ )
1000
+ logger.info(f"Reconnected to Redis for heartbeat thread {self.worker_id}")
1001
+ except Exception as reconnect_error:
1002
+ logger.error(f"Failed to reconnect Redis: {reconnect_error}")
1003
+ time.sleep(5)
1004
+
1005
+ self._stop_event.wait(timeout=self.interval)
1006
+
1007
+ logger.info(f"Heartbeat thread exiting for worker {self.worker_id}")
1008
+ try:
1009
+ current_time = time.time()
1010
+ pipeline = self.redis_client.pipeline()
1011
+ pipeline.hset(self.worker_key, mapping={
1012
+ 'is_alive': 'false',
1013
+ 'offline_time': str(current_time),
1014
+ 'shutdown_reason': 'heartbeat_stopped',
1015
+ 'messages_transferred': 'false'
1016
+ })
1017
+
1018
+ state_change_channel = f"{self.redis_prefix}:WORKER_STATE_CHANGE"
1019
+ message = json.dumps({
1020
+ 'worker_id': self.worker_id,
1021
+ 'state': 'offline',
1022
+ 'timestamp': current_time
1023
+ })
1024
+ pipeline.publish(state_change_channel, message)
1025
+ pipeline.execute()
1026
+
1027
+ logger.info(f"Worker {self.worker_id} marked as offline")
1028
+ except Exception as e:
1029
+ logger.error(f"Error marking worker offline: {e}", exc_info=True)
1030
+
1031
+ def add_queue(self, queue: str, worker_key: str):
1032
+ """添加队列"""
1033
+ self.queues.add(queue)
1034
+
1035
+ if self.worker_key is None:
1036
+ self.worker_key = worker_key
1037
+ parts = worker_key.split(':')
1038
+ if len(parts) >= 3:
1039
+ self.redis_prefix = parts[0]
1040
+ self.worker_id = parts[2]
1041
+ else:
1042
+ logger.error(f"Invalid worker_key format: {worker_key}")
1043
+ raise ValueError(f"Invalid worker_key format: {worker_key}")
1044
+
1045
+ if self._thread is not None and self._thread.is_alive():
1046
+ logger.debug(f"Heartbeat thread already running, added queue {queue}")
1047
+ return
1048
+
1049
+ self.start()
1050
+ logger.debug(f"Started single heartbeat thread for worker {self.worker_id}")
1051
+
1052
+ def remove_queue(self, queue: str):
1053
+ """移除队列"""
1054
+ if queue in self.queues:
1055
+ self.queues.remove(queue)
1056
+ logger.debug(f"Removed queue {queue} from heartbeat monitoring")
1057
+
1058
+ if not self.queues:
1059
+ self.stop()
1060
+ logger.debug("No more queues, stopped heartbeat thread")
1061
+
1062
+ def stop_all(self):
1063
+ """停止心跳线程"""
1064
+ self.stop()
1065
+ self.queues.clear()
1066
+
1067
+ def is_healthy(self) -> bool:
1068
+ """检查心跳线程是否健康"""
1069
+ if not self._thread:
1070
+ return len(self.queues) == 0
1071
+
1072
+ if not self._thread.is_alive():
1073
+ logger.error(f"Heartbeat thread for worker {self.worker_id} is not alive")
1074
+ return False
1075
+ return True
1076
+
1077
+ def get_last_heartbeat_time(self) -> Optional[float]:
1078
+ """获取最后一次心跳时间"""
1079
+ with self._last_heartbeat_time_lock:
1080
+ return self._last_heartbeat_time
1081
+
1082
+ def is_heartbeat_timeout(self) -> bool:
1083
+ """检查心跳是否已超时"""
1084
+ last_heartbeat = self.get_last_heartbeat_time()
1085
+ if last_heartbeat is None:
1086
+ return False
1087
+
1088
+ current_time = time.time()
1089
+ return (current_time - last_heartbeat) > self.heartbeat_timeout
1090
+
1091
+
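A sketch (not from the package source) of the thread-based variant in a synchronous CLI process; the synchronous redis-py client and queue name are assumptions.

import time
import redis

from jettask.worker.lifecycle import HeartbeatThreadManager

# Synchronous client with decode_responses=True so hget() returns str,
# matching the comparisons in _heartbeat_loop() above.
redis_client = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)

hb = HeartbeatThreadManager.create_and_start(
    redis_client=redis_client,
    redis_prefix="jettask",
    queues=["orders"],
    interval=5.0,
)
print("worker id:", hb.worker_id)

try:
    while True:                      # stand-in for the CLI's real work loop
        time.sleep(10)
        if not hb.is_healthy() or hb.is_heartbeat_timeout():
            break
finally:
    hb.stop_all()                    # stops the thread; the loop marks the worker offline on exit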
1092
+ # ============================================================================
1093
+ # Worker scanner
1094
+ # ============================================================================
1095
+
1096
+ class WorkerScanner:
1097
+ """使用 Redis Sorted Set 优化的 Worker 扫描器
1098
+
1099
+ 核心优化:
1100
+ 1. O(log N) 的超时检测复杂度
1101
+ 2. 自动一致性维护
1102
+ 3. 原子性操作保证数据一致
1103
+ """
1104
+
1105
+ def __init__(self, sync_redis, async_redis, redis_prefix: str = 'jettask',
1106
+ heartbeat_timeout: float = 3.0, worker_prefix: str = 'WORKER',
1107
+ worker_state_manager=None):
1108
+ self.redis = sync_redis
1109
+ self.async_redis = async_redis
1110
+ self.redis_prefix = redis_prefix
1111
+ self.worker_prefix = worker_prefix
1112
+ self.heartbeat_timeout = heartbeat_timeout
1113
+ self.active_workers_key = f"{redis_prefix}:ACTIVE_WORKERS"
1114
+ self.worker_state_manager = worker_state_manager
1115
+
1116
+ self._initialized = False
1117
+ self._last_full_sync = 0
1118
+ self._full_sync_interval = 60
1119
+ self._scan_counter = 0
1120
+ self._partial_check_interval = 10
1121
+
1122
+ async def scan_timeout_workers(self) -> List[Dict]:
1123
+ """快速扫描超时的 worker - O(log N) 复杂度"""
1124
+ self._scan_counter += 1
1125
+ if self._scan_counter >= self._partial_check_interval:
1126
+ self._scan_counter = 0
1127
+ asyncio.create_task(self._partial_check())
1128
+
1129
+ current_time = time.time()
1130
+ max_possible_timeout = 300
1131
+ cutoff_time = current_time - max_possible_timeout
1132
+
1133
+ potential_timeout_worker_ids = await self.async_redis.zrangebyscore(
1134
+ self.active_workers_key,
1135
+ min=0,
1136
+ max=current_time - 1
1137
+ )
1138
+
1139
+ if not potential_timeout_worker_ids:
1140
+ return []
1141
+
1142
+ if self.worker_state_manager:
1143
+ all_workers_info = await self.worker_state_manager.get_all_workers_info(only_alive=False)
1144
+ workers_data = [all_workers_info.get(wid) for wid in potential_timeout_worker_ids]
1145
+ else:
1146
+ pipeline = self.async_redis.pipeline()
1147
+ for worker_id in potential_timeout_worker_ids:
1148
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1149
+ pipeline.hgetall(worker_key)
1150
+ workers_data = await pipeline.execute()
1151
+
1152
+ result = []
1153
+ cleanup_pipeline = self.async_redis.pipeline()
1154
+ need_cleanup = False
1155
+
1156
+ for worker_id, worker_data in zip(potential_timeout_worker_ids, workers_data):
1157
+ if not worker_data:
1158
+ cleanup_pipeline.zrem(self.active_workers_key, worker_id)
1159
+ workers_registry_key = f"{self.redis_prefix}:REGISTRY:WORKERS"
1160
+ cleanup_pipeline.srem(workers_registry_key, worker_id)
1161
+ need_cleanup = True
1162
+ continue
1163
+
1164
+ worker_heartbeat_timeout = float(worker_data.get('heartbeat_timeout', self.heartbeat_timeout))
1165
+ last_heartbeat = float(worker_data.get('last_heartbeat', 0))
1166
+ worker_cutoff_time = current_time - worker_heartbeat_timeout
1167
+
1168
+ if last_heartbeat >= worker_cutoff_time:
1169
+ cleanup_pipeline.zadd(self.active_workers_key, {worker_id: last_heartbeat})
1170
+ need_cleanup = True
1171
+ continue
1172
+
1173
+ is_alive = worker_data.get('is_alive', 'true') == 'true' if self.worker_state_manager else worker_data.get('is_alive', 'true').lower() == 'true'
1174
+ if not is_alive:
1175
+ cleanup_pipeline.zrem(self.active_workers_key, worker_id)
1176
+ need_cleanup = True
1177
+ continue
1178
+
1179
+ logger.debug(f"Worker {worker_id} timeout: last_heartbeat={last_heartbeat}, timeout={worker_heartbeat_timeout}s")
1180
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1181
+ result.append({
1182
+ 'worker_key': worker_key,
1183
+ 'worker_data': worker_data,
1184
+ 'worker_id': worker_id
1185
+ })
1186
+
1187
+ if need_cleanup:
1188
+ await cleanup_pipeline.execute()
1189
+
1190
+ if result:
1191
+ logger.info(f"Found {len(result)} timeout workers")
1192
+
1193
+ return result
1194
+
1195
+ async def update_heartbeat(self, worker_id: str, heartbeat_time: Optional[float] = None):
1196
+ """原子性更新心跳"""
1197
+ if heartbeat_time is None:
1198
+ heartbeat_time = time.time()
1199
+
1200
+ pipeline = self.async_redis.pipeline()
1201
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1202
+
1203
+ pipeline.hset(worker_key, 'last_heartbeat', str(heartbeat_time))
1204
+ pipeline.zadd(self.active_workers_key, {worker_id: heartbeat_time})
1205
+
1206
+ await pipeline.execute()
1207
+
1208
+ async def add_worker(self, worker_id: str, worker_data: Dict):
1209
+ """添加新 worker"""
1210
+ heartbeat_time = float(worker_data.get('last_heartbeat', time.time()))
1211
+
1212
+ pipeline = self.async_redis.pipeline()
1213
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1214
+
1215
+ pipeline.hset(worker_key, mapping=worker_data)
1216
+ pipeline.zadd(self.active_workers_key, {worker_id: heartbeat_time})
1217
+
1218
+ await pipeline.execute()
1219
+ logger.debug(f"Added worker {worker_id} to system")
1220
+
1221
+ async def remove_worker(self, worker_id: str):
1222
+ """移除 worker"""
1223
+ if self.worker_state_manager:
1224
+ await self.worker_state_manager.set_worker_offline(worker_id, reason="heartbeat_timeout")
1225
+ else:
1226
+ pipeline = self.async_redis.pipeline()
1227
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1228
+
1229
+ pipeline.hset(worker_key, 'is_alive', 'false')
1230
+ pipeline.zrem(self.active_workers_key, worker_id)
1231
+
1232
+ await pipeline.execute()
1233
+ logger.debug(f"Removed worker {worker_id} from active set (direct mode)")
1234
+
1235
+ if self.worker_state_manager:
1236
+ await self.async_redis.zrem(self.active_workers_key, worker_id)
1237
+
1238
+ async def cleanup_stale_workers(self, max_age_seconds: float = 3600):
1239
+ """清理过期的 worker 记录"""
1240
+ current_time = time.time()
1241
+ cutoff_time = current_time - max_age_seconds
1242
+
1243
+ stale_worker_ids = await self.async_redis.zrangebyscore(
1244
+ self.active_workers_key,
1245
+ min=0,
1246
+ max=cutoff_time
1247
+ )
1248
+
1249
+ if not stale_worker_ids:
1250
+ return 0
1251
+
1252
+ pipeline = self.async_redis.pipeline()
1253
+
1254
+ for worker_id in stale_worker_ids:
1255
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1256
+ pipeline.delete(worker_key)
1257
+
1258
+ pipeline.zrem(self.active_workers_key, *stale_worker_ids)
1259
+
1260
+ await pipeline.execute()
1261
+
1262
+ logger.info(f"Cleaned up {len(stale_worker_ids)} stale worker records")
1263
+ return len(stale_worker_ids)
1264
+
1265
+ async def _partial_check(self):
1266
+ """部分一致性检查"""
1267
+ try:
1268
+ sample_size = min(10, await self.async_redis.zcard(self.active_workers_key))
1269
+ if sample_size == 0:
1270
+ return
1271
+
1272
+ random_workers = await self.async_redis.zrandmember(
1273
+ self.active_workers_key, sample_size, withscores=True
1274
+ )
1275
+
1276
+ for worker_id, zset_score in random_workers:
1277
+ worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{worker_id}"
1278
+ hash_heartbeat = await self.async_redis.hget(worker_key, 'last_heartbeat')
1279
+
1280
+ if not hash_heartbeat:
1281
+ await self.async_redis.zrem(self.active_workers_key, worker_id)
1282
+ logger.debug(f"Partial check: removed {worker_id}")
1283
+ else:
1284
+ hash_time = float(hash_heartbeat)
1285
+ if abs(hash_time - zset_score) > 1.0:
1286
+ await self.async_redis.zadd(self.active_workers_key, {worker_id: hash_time})
1287
+ logger.debug(f"Partial check: synced {worker_id}")
1288
+
1289
+ except Exception as e:
1290
+ logger.debug(f"Partial check error: {e}")
1291
+
1292
+ async def get_active_count(self) -> int:
1293
+ """获取活跃 worker 数量 - O(1)"""
1294
+ return await self.async_redis.zcard(self.active_workers_key)
1295
+
1296
+
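A sketch (not from the package source) of a monitor loop built on WorkerScanner; the scan interval and timeout values are assumptions, and offline handling simply delegates to remove_worker().

import asyncio
import redis.asyncio as aioredis

from jettask.worker.lifecycle import WorkerScanner, WorkerStateManager

async def monitor(redis_url: str = "redis://localhost:6379/0"):
    async_redis = aioredis.from_url(redis_url, decode_responses=True)
    state = WorkerStateManager(async_redis, redis_prefix="jettask")
    scanner = WorkerScanner(
        sync_redis=None,                 # the calls below only use the async client
        async_redis=async_redis,
        redis_prefix="jettask",
        heartbeat_timeout=15.0,
        worker_state_manager=state,
    )

    while True:
        # O(log N) range query on ACTIVE_WORKERS, then per-worker timeout checks.
        for timed_out in await scanner.scan_timeout_workers():
            worker_id = timed_out["worker_id"]
            print("worker timed out:", worker_id)
            await scanner.remove_worker(worker_id)   # marks it offline via the state manager
        await asyncio.sleep(5)

asyncio.run(monitor())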
1297
+ # ============================================================================
1298
+ # Worker lifecycle
1299
+ # ============================================================================
1300
+
1301
+ class WorkerLifecycle:
1302
+ """Worker 生命周期管理
1303
+
1304
+ 职责:
1305
+ - 初始化 Worker(生成ID、注册、启动心跳)
1306
+ - 清理 Worker(停止心跳、注销、离线标记)
1307
+ """
1308
+
1309
+ def __init__(
1310
+ self,
1311
+ redis_client,
1312
+ async_redis_client,
1313
+ redis_prefix: str,
1314
+ naming: 'WorkerNaming',
1315
+ state_manager: 'WorkerStateManager',
1316
+ registry: 'WorkerRegistry',
1317
+ heartbeat_class
1318
+ ):
1319
+ """初始化生命周期管理器"""
1320
+ self.redis_client = redis_client
1321
+ self.async_redis_client = async_redis_client
1322
+ self.redis_prefix = redis_prefix
1323
+ self.naming = naming
1324
+ self.state = state_manager
1325
+ self.registry = registry
1326
+ self.heartbeat_class = heartbeat_class
1327
+ self.active_heartbeats: Dict[str, Any] = {}
1328
+
1329
+ async def initialize_worker(
1330
+ self,
1331
+ prefix: str,
1332
+ queues: List[str],
1333
+ reuse_offline: bool = True
1334
+ ) -> str:
1335
+ """初始化 Worker"""
1336
+ worker_id = None
1337
+ if reuse_offline:
1338
+ worker_id = await self.naming.find_reusable_worker_id(prefix, self.registry)
1339
+
1340
+ if not worker_id:
1341
+ worker_id = self.naming.generate_worker_id(prefix)
1342
+
1343
+ logger.info(f"Initializing worker: {worker_id}")
1344
+
1345
+ await self.state.set_worker_online(
1346
+ worker_id=worker_id,
1347
+ queues=queues,
1348
+ pid=os.getpid(),
1349
+ host=socket.gethostname()
1350
+ )
1351
+
1352
+ await self.registry.register(worker_id)
1353
+
1354
+ worker_key = f"{self.redis_prefix}:WORKER:{worker_id}"
1355
+ heartbeat = self.heartbeat_class(
1356
+ redis_client=self.redis_client,
1357
+ worker_key=worker_key,
1358
+ worker_id=worker_id,
1359
+ redis_prefix=self.redis_prefix,
1360
+ interval=5.0
1361
+ )
1362
+
1363
+ for queue in queues:
1364
+ heartbeat.queues.add(queue)
1365
+
1366
+ heartbeat.start()
1367
+ self.active_heartbeats[worker_id] = heartbeat
1368
+
1369
+ logger.info(f"Worker initialized successfully: {worker_id}")
1370
+ return worker_id
1371
+
1372
+ async def cleanup_worker(self, worker_id: str):
1373
+ """清理 Worker 资源"""
1374
+ logger.info(f"Cleaning up worker: {worker_id}")
1375
+
1376
+ try:
1377
+ if worker_id in self.active_heartbeats:
1378
+ heartbeat = self.active_heartbeats[worker_id]
1379
+ heartbeat.stop()
1380
+ del self.active_heartbeats[worker_id]
1381
+
1382
+ await self.state.set_worker_offline(worker_id)
1383
+ await self.registry.unregister(worker_id)
1384
+
1385
+ logger.info(f"Worker cleaned up successfully: {worker_id}")
1386
+ except Exception as e:
1387
+ logger.error(f"Error cleaning up worker {worker_id}: {e}")
1388
+ raise
1389
+
1390
+ async def record_task_start(self, worker_id: str, queue: str):
1391
+ """记录任务开始"""
1392
+ await self.state.increment_queue_stats(
1393
+ worker_id=worker_id,
1394
+ queue=queue,
1395
+ running_tasks_delta=1
1396
+ )
1397
+
1398
+ async def record_task_finish(
1399
+ self,
1400
+ worker_id: str,
1401
+ queue: str,
1402
+ success: bool,
1403
+ duration: float
1404
+ ):
1405
+ """记录任务完成"""
1406
+ await self.state.increment_queue_stats(
1407
+ worker_id=worker_id,
1408
+ queue=queue,
1409
+ running_tasks_delta=-1,
1410
+ success_count_increment=1 if success else 0,
1411
+ failed_count_increment=0 if success else 1,
1412
+ total_count_increment=1,
1413
+ processing_time_increment=duration
1414
+ )
1415
+
1416
+ # Update the average processing time
1417
+ stats = await self.state.get_queue_total_stats(worker_id, queue)
1418
+ if stats['total_count'] > 0:
1419
+ avg_time = stats['total_processing_time'] / stats['total_count']
1420
+ await self.state.update_queue_stats(
1421
+ worker_id=worker_id,
1422
+ queue=queue,
1423
+ avg_processing_time=avg_time
1424
+ )
1425
+
1426
+ async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, Any]]:
1427
+ """获取 Worker 信息"""
1428
+ return await self.state.get_worker_info(worker_id)
1429
+
1430
+
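A sketch (not from the package source) of how a task runner might wrap execution with the statistics hooks above; `lifecycle` is assumed to be an already-constructed WorkerLifecycle and `handler` an awaitable task callable.

import time

async def run_task(lifecycle, worker_id: str, queue: str, handler, payload):
    """Wrap one task execution with WorkerLifecycle's statistics hooks."""
    await lifecycle.record_task_start(worker_id, queue)
    started = time.time()
    success = False
    try:
        result = await handler(payload)
        success = True
        return result
    finally:
        # Updates running_tasks, the success/failed counters and the average
        # processing time maintained through the state manager.
        await lifecycle.record_task_finish(
            worker_id=worker_id,
            queue=queue,
            success=success,
            duration=time.time() - started,
        )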
1431
+ # ============================================================================
1432
+ # Compatibility layer: HeartbeatConsumerStrategy (for backward compatibility)
1433
+ # ============================================================================
1434
+
1435
+ class HeartbeatConsumerStrategy:
1436
+ """
1437
+ 兼容性类 - 为旧代码提供向后兼容
1438
+
1439
+ ⚠️ 已废弃: 请使用 WorkerManager 和 WorkerNaming 代替
1440
+ """
1441
+
1442
+ def __init__(self, redis_client, config: Dict = None, app=None):
1443
+ self.redis = redis_client
1444
+ self.config = config or {}
1445
+ self.app = app
1446
+ self.redis_prefix = config.get('redis_prefix', 'jettask')
1447
+
1448
+ # If the app passed in a worker_id, use it directly (a child process reuses the main process's ID)
1449
+ if app and hasattr(app, 'worker_id') and app.worker_id:
1450
+ self.consumer_id = app.worker_id
1451
+ self._worker_key = app.worker_key or f'{self.redis_prefix}:WORKER:{app.worker_id}'
1452
+ logger.info(f"[PID {os.getpid()}] HeartbeatConsumerStrategy using provided worker_id: {self.consumer_id}")
1453
+ else:
1454
+ self.consumer_id = None
1455
+ self._worker_key = None
1456
+
1457
+ # Build the hostname prefix
1458
+ try:
1459
+ hostname = socket.gethostname()
1460
+ ip = socket.gethostbyname(hostname)
1461
+ prefix = hostname if hostname != 'localhost' else ip
1462
+ except:
1463
+ prefix = os.environ.get('HOSTNAME', 'unknown')
1464
+
1465
+ self.hostname_prefix = prefix
1466
+
1467
+ def _ensure_consumer_id(self):
1468
+ """确保consumer_id已创建(兼容旧代码)"""
1469
+ import os
1470
+ if self.consumer_id is None:
1471
+ # Generate it with WorkerNaming
1472
+ from .manager import WorkerNaming
1473
+ naming = WorkerNaming()
1474
+ self.consumer_id = naming.generate_worker_id(self.hostname_prefix)
1475
+ self._worker_key = f'{self.redis_prefix}:WORKER:{self.consumer_id}'
1476
+ logger.info(f"[PID {os.getpid()}] Generated NEW worker ID: {self.consumer_id}")
1477
+ else:
1478
+ logger.debug(f"[PID {os.getpid()}] Reusing existing worker ID: {self.consumer_id}")
1479
+
1480
+ def get_consumer_name(self, queue: str) -> str:
1481
+ """
1482
+ 获取消费者名称
1483
+
1484
+ 统一 group_name 架构:所有队列(包括优先级队列)使用基础队列名生成 consumer name
1485
+ 例如:robust_bench2 和 robust_bench2:8 都使用 "YYDG-xxx-robust_bench2"
1486
+ """
1487
+ self._ensure_consumer_id()
1488
+
1489
+ # Extract the base queue name (strip the priority suffix)
1490
+ base_queue = queue
1491
+ if ':' in queue and queue.rsplit(':', 1)[1].isdigit():
1492
+ base_queue = queue.rsplit(':', 1)[0]
1493
+
1494
+ return f"{self.consumer_id}-{base_queue}"
1495
+
1496
+ def cleanup(self):
1497
+ """清理资源(兼容旧代码)"""
1498
+ pass
1499
+
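A small illustration (not from the package source) of the unified group_name behaviour described in get_consumer_name(): a priority-suffixed queue resolves to the same consumer name as its base queue. The Redis client and config values are assumptions.

import redis

from jettask.worker.lifecycle import HeartbeatConsumerStrategy

strategy = HeartbeatConsumerStrategy(
    redis_client=redis.Redis(),              # assumed local Redis
    config={"redis_prefix": "jettask"},
)

# Both the base queue and its priority variant map to the same consumer name,
# e.g. something like "YYDG-xxx-robust_bench2" as in the docstring above.
print(strategy.get_consumer_name("robust_bench2"))
print(strategy.get_consumer_name("robust_bench2:8"))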
1500
+
1501
+ __all__ = [
1502
+ 'WorkerStateManager',
1503
+ 'HeartbeatThreadManager',
1504
+ 'WorkerScanner',
1505
+ 'WorkerLifecycle',
1506
+ 'HeartbeatConsumerStrategy', # backward compatibility
1507
+ ]