jettask 0.2.19__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +12 -3
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/METADATA +2 -71
  86. jettask-0.2.23.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.19.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/top_level.txt +0 -0
jettask/messaging/event_pool.py
@@ -0,0 +1,2188 @@
+ """
+ EventPool - core event pool implementation.
+
+ Responsibilities:
+ 1. Task queue management and message dispatch
+ 2. Consumer management and lifecycle control
+ 3. Priority queue handling
+ 4. Offline worker recovery
+
+ Core component integrations:
+ - MessageSender/Reader: message sending and reading (via the container)
+ - QueueRegistry: queue registration management
+ - ConsumerManager: consumer lifecycle management
+ """
+
+ from ..utils.serializer import dumps_str, loads_str
+ import time
+ import threading
+ import logging
+ import asyncio
+ import json
+ from collections import defaultdict, deque, Counter
+ from typing import List, Optional, TYPE_CHECKING, Union
+ import traceback
+ import redis
+ from redis import asyncio as aioredis
+
+ from ..utils.db_connector import get_sync_redis_client, get_async_redis_client
+
+ from ..utils.helpers import get_hostname
+ import os
+ from jettask.worker.manager import ConsumerManager
+ from jettask.worker.recovery import OfflineWorkerRecovery
+ from .scanner import DelayedMessageScanner
+
+ logger = logging.getLogger('app')
+
+ # Lua script: atomically update the maximum value stored in a Redis hash field
+ UPDATE_MAX_OFFSET_LUA = """
+ local hash_key = KEYS[1]
+ local field = KEYS[2]
+ local new_value = tonumber(ARGV[1])
+
+ local current = redis.call('HGET', hash_key, field)
+ if current == false or tonumber(current) < new_value then
+     redis.call('HSET', hash_key, field, new_value)
+     return 1
+ else
+     return 0
+ end
+ """
+
+ class EventPool(object):
+     STATE_MACHINE_NAME = "STATE_MACHINE"
+     TIMEOUT = 60 * 5
+
+     def __init__(
+         self,
+         redis_client: redis.StrictRedis,
+         async_redis_client: aioredis.StrictRedis,
+         queues: list = None,
+         redis_url: str = None,
+         consumer_strategy: str = None,
+         consumer_config: dict = None,
+         redis_prefix: str = None,
+         app=None,  # reference to the owning app
+     ) -> None:
+         self.redis_client = redis_client
+         self.async_redis_client = async_redis_client
+         # Create Redis clients for binary data (used for Stream operations).
+         # These reuse the global (singleton) client instances directly.
+         self.binary_redis_client = get_sync_redis_client(redis_url, decode_responses=False)
+         self.async_binary_redis_client = get_async_redis_client(redis_url, decode_responses=False)
+
+         self.queues = queues
+         self._redis_url = redis_url or 'redis://localhost:6379/0'
+         self.redis_prefix = redis_prefix or 'jettask'
+         self.app = app  # keep a reference to the app
+
+         # Initialize the consumer manager.
+         # The consumer_strategy parameter is no longer used; only the HEARTBEAT strategy remains.
+         # Make sure the config carries the queues, redis_url and redis_prefix.
+         manager_config = consumer_config or {}
+         manager_config['queues'] = queues or []
+         manager_config['redis_prefix'] = redis_prefix or 'jettask'
+         manager_config['redis_url'] = redis_url or 'redis://localhost:6379/0'
+
+         # Keep the consumer_config for later use.
+         self.consumer_config = manager_config
+
+         self.consumer_manager = ConsumerManager(
+             redis_client=redis_client,
+             config=manager_config,
+             app=app
+         )
+
+         # Create and inject the HeartbeatConsumerStrategy (compatibility layer).
+         from jettask.worker.lifecycle import HeartbeatConsumerStrategy
+         import os
+         heartbeat_strategy = HeartbeatConsumerStrategy(
+             redis_client=redis_client,
+             config=manager_config,
+             app=app
+         )
+         self.consumer_manager.set_heartbeat_strategy(heartbeat_strategy)
+
+         # Create the queue registry, used to recover priority queues.
+         from jettask.messaging.registry import QueueRegistry
+         self.queue_registry = QueueRegistry(
+             redis_client=self.redis_client,
+             async_redis_client=self.async_redis_client,
+             redis_prefix=self.redis_prefix
+         )
+
+         # Mapping of prefixed queue names.
+         self.prefixed_queues = {}
+
+         # Priority queue management (simplified: read directly from Redis, no caching anymore).
+
+         # Used to track broadcast messages.
+         self._broadcast_message_tracker = {}
+
+         self.solo_routing_tasks = {}
+         self.solo_running_state = {}
+         self.solo_urgent_retry = {}
+         self.batch_routing_tasks = {}
+         self.task_scheduler = {}
+         self.running_task_state_mappings = {}
+         self.delay_tasks = []
+         self.solo_agg_task = {}
+         self.rlock = threading.RLock()
+         self._claimed_message_ids = set()  # track claimed message IDs to avoid duplicate processing
+         self._stop_reading = False  # flag used to stop reading
+         self._queue_stop_flags = {queue: False for queue in (queues or [])}  # per-queue stop flags
+         # Key of the distributed lock used for delayed tasks.
+         self._delay_lock_key = f"{self.redis_prefix}:DELAY_LOCK"
+
+         # Initialize the delayed message scanner (with priority-queue discovery and
+         # consumer-group ensure callbacks).
+         scan_interval = manager_config.get('scan_interval', 0.05)
+         self.delayed_scanner = DelayedMessageScanner(
+             async_binary_redis_client=self.async_binary_redis_client,
+             redis_prefix=self.redis_prefix,
+             scan_interval=scan_interval,
+             batch_size=100,
+             priority_discovery_callback=self._discover_priority_queues_for_scanner,
+             ensure_consumer_group_callback=self._ensure_consumer_group_for_scanner
+         )
+
+         # Delayed-task lists and locks (used to communicate with the scanner).
+         self._delayed_tasks_lists = {}
+         self._delayed_tasks_locks = {}
+
+     def _put_task(self, event_queue: Union[deque, asyncio.Queue], task, urgent: bool = False):
+         """Unified method for putting a task onto a queue."""
+         # For a deque, keep the original behaviour.
+         if isinstance(event_queue, deque):
+             if urgent:
+                 event_queue.appendleft(task)
+             else:
+                 event_queue.append(task)
+         # For an asyncio.Queue, items can only be added in order (the queue has no priority support).
+         elif isinstance(event_queue, asyncio.Queue):
+             # An asyncio.Queue has to be used from an async context; the interface is kept here
+             # and the actual put happens in the async method below.
+             pass
+
+     async def _async_put_task(self, event_queue: asyncio.Queue, task, urgent: bool = False):
+         """Async method for putting a task onto a queue."""
+         await event_queue.put(task)
+
+     def init_routing(self):
+         for queue in self.queues:
+             self.solo_agg_task[queue] = defaultdict(list)
+             self.solo_routing_tasks[queue] = defaultdict(list)
+             self.solo_running_state[queue] = defaultdict(bool)
+             self.batch_routing_tasks[queue] = defaultdict(list)
+             self.task_scheduler[queue] = defaultdict(int)
+             self.running_task_state_mappings[queue] = defaultdict(dict)
+
+     def get_prefixed_queue_name(self, queue: str) -> str:
+         """Prepend the Redis prefix to a queue name."""
+         return f"{self.redis_prefix}:QUEUE:{queue}"
+
+
+     def get_redis_client(self, asyncio: bool = False, binary: bool = False):
+         """Get a Redis client.
+
+         Args:
+             asyncio: whether to return the async client
+             binary: whether to return the binary client (used for Stream operations)
+         """
+         if binary:
+             return self.async_binary_redis_client if asyncio else self.binary_redis_client
+         return self.async_redis_client if asyncio else self.redis_client
+
+ def _batch_send_event_sync(self, prefixed_queue, messages: List[dict], pipe):
197
+ """批量发送事件(同步)"""
198
+ # 使用Lua脚本批量发送消息并添加自增offset
199
+ lua_script = """
200
+ local stream_key = KEYS[1]
201
+ local prefix = ARGV[1]
202
+ local results = {}
203
+
204
+ -- 使用Hash存储所有队列的offset
205
+ local offsets_hash = prefix .. ':QUEUE_OFFSETS'
206
+
207
+ -- 从stream_key中提取队列名(去掉prefix:QUEUE:前缀)
208
+ local queue_name = string.gsub(stream_key, '^' .. prefix .. ':QUEUE:', '')
209
+
210
+ -- 将队列添加到全局队列注册表(包括所有队列,包括优先级队列)
211
+ local queues_registry_key = prefix .. ':REGISTRY:QUEUES'
212
+ redis.call('SADD', queues_registry_key, queue_name)
213
+
214
+ -- 从ARGV[2]开始,每个参数是一个消息的data
215
+ for i = 2, #ARGV do
216
+ local data = ARGV[i]
217
+
218
+ -- 使用HINCRBY原子递增offset(如果不存在会自动创建并设为1)
219
+ local current_offset = redis.call('HINCRBY', offsets_hash, queue_name, 1)
220
+
221
+ -- 添加消息到Stream(包含offset字段)
222
+ local stream_id = redis.call('XADD', stream_key, '*',
223
+ 'data', data,
224
+ 'offset', current_offset)
225
+
226
+ table.insert(results, stream_id)
227
+ end
228
+
229
+ return results
230
+ """
231
+
232
+ # 准备Lua脚本参数
233
+ lua_args = [self.redis_prefix.encode() if isinstance(self.redis_prefix, str) else self.redis_prefix]
234
+
235
+ for message in messages:
236
+ # 确保消息格式正确
237
+ if 'data' in message:
238
+ data = message['data'] if isinstance(message['data'], bytes) else dumps_str(message['data'])
239
+ else:
240
+ data = dumps_str(message)
241
+ lua_args.append(data)
242
+
243
+ # 获取同步Redis客户端
244
+ client = self.get_redis_client(asyncio=False, binary=True)
245
+
246
+ # 执行Lua脚本
247
+ results = client.eval(
248
+ lua_script,
249
+ 1, # 1个KEY
250
+ prefixed_queue, # KEY[1]: stream key
251
+ *lua_args # ARGV: prefix, data1, data2, ...
252
+ )
253
+
254
+ # 解码所有返回的Stream ID
255
+ return [r.decode('utf-8') if isinstance(r, bytes) else r for r in results]
256
+
257
+ async def _batch_send_event(self, prefixed_queue, messages: List[dict], pipe):
258
+ """批量发送事件(异步)"""
259
+ # 使用Lua脚本批量发送消息并添加自增offset
260
+ lua_script = """
261
+ local stream_key = KEYS[1]
262
+ local prefix = ARGV[1]
263
+ local results = {}
264
+
265
+ -- 使用Hash存储所有队列的offset
266
+ local offsets_hash = prefix .. ':QUEUE_OFFSETS'
267
+
268
+ -- 从stream_key中提取队列名(去掉prefix:QUEUE:前缀)
269
+ local queue_name = string.gsub(stream_key, '^' .. prefix .. ':QUEUE:', '')
270
+
271
+ -- 将队列添加到全局队列注册表(包括所有队列,包括优先级队列)
272
+ local queues_registry_key = prefix .. ':REGISTRY:QUEUES'
273
+ redis.call('SADD', queues_registry_key, queue_name)
274
+
275
+ -- 从ARGV[2]开始,每个参数是一个消息的data
276
+ for i = 2, #ARGV do
277
+ local data = ARGV[i]
278
+
279
+ -- 使用HINCRBY原子递增offset(如果不存在会自动创建并设为1)
280
+ local current_offset = redis.call('HINCRBY', offsets_hash, queue_name, 1)
281
+
282
+ -- 添加消息到Stream(包含offset字段)
283
+ local stream_id = redis.call('XADD', stream_key, '*',
284
+ 'data', data,
285
+ 'offset', current_offset)
286
+
287
+ table.insert(results, stream_id)
288
+ end
289
+
290
+ return results
291
+ """
292
+
293
+ # 准备Lua脚本参数
294
+ lua_args = [self.redis_prefix.encode() if isinstance(self.redis_prefix, str) else self.redis_prefix]
295
+
296
+ for message in messages:
297
+ # 确保消息格式正确
298
+ if 'data' in message:
299
+ data = message['data'] if isinstance(message['data'], bytes) else dumps_str(message['data'])
300
+ else:
301
+ data = dumps_str(message)
302
+ lua_args.append(data)
303
+
304
+ # 获取异步Redis客户端(不使用pipe,直接使用client)
305
+ client = self.get_redis_client(asyncio=True, binary=True)
306
+
307
+ # 执行Lua脚本
308
+ results = await client.eval(
309
+ lua_script,
310
+ 1, # 1个KEY
311
+ prefixed_queue, # KEY[1]: stream key
312
+ *lua_args # ARGV: prefix, data1, data2, ...
313
+ )
314
+
315
+ # 解码所有返回的Stream ID
316
+ return [r.decode('utf-8') if isinstance(r, bytes) else r for r in results]
317
+
318
+ def is_urgent(self, routing_key):
319
+ is_urgent = self.solo_urgent_retry.get(routing_key, False)
320
+ if is_urgent == True:
321
+ del self.solo_urgent_retry[routing_key]
322
+ return is_urgent
323
+
324
+ async def scan_priority_queues(self, base_queue: str) -> list:
325
+ """扫描Redis中的优先级队列
326
+
327
+ Args:
328
+ base_queue: 基础队列名(不带优先级后缀)
329
+
330
+ Returns:
331
+ 按优先级排序的队列列表
332
+ """
333
+ pattern = f"{self.redis_prefix}:QUEUE:{base_queue}:*"
334
+
335
+ try:
336
+ # 使用 QueueRegistry 获取优先级队列,避免 scan
337
+ from jettask.messaging.registry import QueueRegistry
338
+ registry = QueueRegistry(
339
+ redis_client=self.redis_client,
340
+ async_redis_client=self.async_redis_client,
341
+ redis_prefix=self.redis_prefix
342
+ )
343
+
344
+ # 获取基础队列的所有优先级队列
345
+ priority_queue_names = await registry.get_priority_queues_for_base(base_queue)
346
+ priority_queues = set(priority_queue_names)
347
+
348
+ # 如果没有优先级队列,检查是否有带优先级后缀的队列
349
+ if not priority_queues:
350
+ all_queues = await registry.get_all_queues()
351
+ for queue in all_queues:
352
+ if queue.startswith(f"{base_queue}:"):
353
+ priority_queues.add(queue)
354
+
355
+ # 添加基础队列(无优先级)
356
+ priority_queues.add(base_queue)
357
+
358
+ # 按优先级排序(数字越小优先级越高)
359
+ sorted_queues = []
360
+ for q in priority_queues:
361
+ if ':' in q:
362
+ base, priority = q.rsplit(':', 1)
363
+ if base == base_queue and priority.isdigit():
364
+ sorted_queues.append((int(priority), q))
365
+ else:
366
+ sorted_queues.append((float('inf'), q)) # 非数字优先级放最后
367
+ else:
368
+ sorted_queues.append((float('inf'), q)) # 无优先级放最后
369
+
370
+ sorted_queues.sort(key=lambda x: x[0])
371
+ return [q[1] for q in sorted_queues]
372
+
373
+ except Exception as e:
374
+ import traceback
375
+ logger.error(f"Error scanning priority queues for {base_queue}: {e}\n{traceback.format_exc()}")
376
+ return [base_queue] # 返回基础队列作为fallback
377
+
378
+ async def _ensure_consumer_group_and_record_info(
379
+ self,
380
+ prefixed_queue: str,
381
+ task_name: str,
382
+ consumer_name: str = None,
383
+ base_group_name: str = None
384
+ ) -> str:
385
+ """统一的方法:创建 consumer group 并记录 group_info
386
+
387
+ Args:
388
+ prefixed_queue: 带前缀的队列名(如 "test5:QUEUE:robust_bench2:6")
389
+ task_name: 任务名
390
+ consumer_name: consumer 名称(可选,如果不提供会自动获取)
391
+ base_group_name: 基础队列的 group_name(可选,优先级队列会使用这个)
392
+
393
+ Returns:
394
+ str: 使用的 group_name
395
+ """
396
+ # 提取实际队列名和基础队列名
397
+ actual_queue_name = prefixed_queue.replace(f"{self.redis_prefix}:QUEUE:", "")
398
+ if ':' in actual_queue_name and actual_queue_name.rsplit(':', 1)[1].isdigit():
399
+ base_queue = actual_queue_name.rsplit(':', 1)[0]
400
+ is_priority_queue = True
401
+ else:
402
+ base_queue = actual_queue_name
403
+ is_priority_queue = False
404
+
405
+ # 如果没有提供 consumer_name,从基础队列获取
406
+ if consumer_name is None:
407
+ consumer_name = self.consumer_manager.get_consumer_name(base_queue)
408
+
409
+ # 所有队列(包括优先级队列)都使用基础队列的 group_name
410
+ if base_group_name:
411
+ # 优先级队列使用传入的基础 group_name
412
+ group_name = base_group_name
413
+ else:
414
+ # 基础队列:构建自己的 group_name
415
+ base_prefixed_queue = self.get_prefixed_queue_name(base_queue)
416
+ group_name = f"{base_prefixed_queue}:{task_name}"
417
+
418
+ # 创建 consumer group
419
+ try:
420
+ await self.async_redis_client.xgroup_create(
421
+ name=prefixed_queue,
422
+ groupname=group_name,
423
+ id="0",
424
+ mkstream=True
425
+ )
426
+ logger.debug(f"Created consumer group {group_name} for queue {prefixed_queue}")
427
+ except Exception as e:
428
+ if "BUSYGROUP" in str(e):
429
+ logger.debug(f"Consumer group {group_name} already exists for queue {prefixed_queue}")
430
+ else:
431
+ logger.warning(f"Error creating consumer group {group_name} for {prefixed_queue}: {e}")
432
+
433
+ # 只为基础队列记录 group_info(优先级队列共享基础队列的 group_info)
434
+ if not is_priority_queue and self.consumer_manager:
435
+ await self.consumer_manager.record_group_info_async(
436
+ actual_queue_name, task_name, group_name, consumer_name
437
+ )
438
+
439
+ return group_name
440
+
441
+ async def _discover_priority_queues_for_scanner(self, base_queue: str) -> list:
442
+ """为Scanner提供的优先级队列发现回调(返回不带前缀的队列名)
443
+
444
+ Args:
445
+ base_queue: 基础队列名(不带前缀)
446
+
447
+ Returns:
448
+ 优先级队列列表(不带前缀,例如 ['queue:1', 'queue:3'])
449
+ """
450
+ from jettask.messaging.registry import QueueRegistry
451
+ registry = QueueRegistry(self.redis_client, self.async_redis_client, self.redis_prefix)
452
+
453
+ # 获取基础队列的所有优先级队列
454
+ priority_queue_names = await registry.get_priority_queues_for_base(base_queue)
455
+
456
+ # 过滤出带数字后缀的优先级队列
457
+ priority_queues = []
458
+ for pq_name in priority_queue_names:
459
+ if ':' in pq_name and pq_name.rsplit(':', 1)[1].isdigit():
460
+ priority_queues.append(pq_name)
461
+
462
+ # 按优先级排序(数字越小优先级越高)
463
+ return sorted(priority_queues, key=lambda x: int(x.split(':')[-1]))
464
+
465
+ async def _ensure_consumer_group_for_scanner(self, queue: str) -> None:
466
+ """为Scanner提供的消费者组确保回调(当发现新的优先级队列时调用)
467
+
468
+ Args:
469
+ queue: 队列名(不带前缀,可能包含优先级后缀,如 'robust_bench2:6')
470
+ """
471
+ try:
472
+ # 获取带前缀的队列名
473
+ prefixed_queue = self.get_prefixed_queue_name(queue)
474
+
475
+ # 提取基础队列名(移除优先级后缀)
476
+ if ':' in queue and queue.rsplit(':', 1)[1].isdigit():
477
+ base_queue = queue.rsplit(':', 1)[0]
478
+ else:
479
+ base_queue = queue
480
+
481
+ # 如果当前队列在监听的队列列表中,为所有注册的 task 创建消费者组
482
+ if base_queue in self.queues:
483
+ # 从 app 获取所有已注册的 task
484
+ if self.app and hasattr(self.app, '_tasks'):
485
+ for task_name in self.app._tasks.keys():
486
+ # 使用统一的方法创建 consumer group 并记录 group_info
487
+ await self._ensure_consumer_group_and_record_info(
488
+ prefixed_queue, task_name
489
+ )
490
+ logger.info(f"Ensured consumer group for task {task_name} on queue {prefixed_queue}")
491
+ else:
492
+ logger.warning(f"App or tasks not available, cannot ensure consumer groups for {queue}")
493
+ else:
494
+ logger.debug(f"Queue {base_queue} not in monitored queues, skipping consumer group creation")
495
+
496
+ except Exception as e:
497
+ logger.error(f"Error ensuring consumer group for queue {queue}: {e}", exc_info=True)
498
+
499
+ async def get_priority_queues_direct(self, base_queue: str) -> list:
500
+ """直接从Redis获取优先级队列列表(不使用缓存)
501
+
502
+ Args:
503
+ base_queue: 基础队列名
504
+
505
+ Returns:
506
+ 优先级队列列表(已加上前缀)
507
+ """
508
+ # 直接从注册表获取优先级队列
509
+ from jettask.messaging.registry import QueueRegistry
510
+ registry = QueueRegistry(self.redis_client, self.async_redis_client, self.redis_prefix)
511
+
512
+ # 获取基础队列的所有优先级队列
513
+ priority_queue_names = await registry.get_priority_queues_for_base(base_queue)
514
+ priority_queues = []
515
+
516
+ for pq_name in priority_queue_names:
517
+ # 只添加优先级队列(带数字后缀的)
518
+ if ':' in pq_name and pq_name.rsplit(':', 1)[1].isdigit():
519
+ # 构建完整的队列名
520
+ prefixed_pq = f"{self.redis_prefix}:QUEUE:{pq_name}"
521
+ priority_queues.append(prefixed_pq)
522
+
523
+ # 按优先级排序(数字越小优先级越高)
524
+ return sorted(priority_queues, key=lambda x: int(x.split(':')[-1]) if x.split(':')[-1].isdigit() else float('inf'))
525
+
526
+
527
+ @classmethod
528
+ def separate_by_key(cls, lst):
529
+ groups = {}
530
+ for item in lst:
531
+ key = item[0]['routing_key']
532
+ if key not in groups:
533
+ groups[key] = []
534
+ groups[key].append(item)
535
+ result = []
536
+ group_values = list(groups.values())
537
+ while True:
538
+ exists_data = False
539
+ for values in group_values:
540
+ try:
541
+ result.append(values.pop(0))
542
+ exists_data = True
543
+ except:
544
+ pass
545
+ if not exists_data:
546
+ break
547
+ return result
548
+
549
+ async def _unified_task_checker(self, event_queue: asyncio.Queue, checker_type: str = 'solo_agg'):
550
+ """统一的任务检查器,减少代码重复"""
551
+ last_solo_running_state = defaultdict(dict)
552
+ last_wait_time = defaultdict(int)
553
+ queue_batch_tasks = defaultdict(list)
554
+ left_queue_batch_tasks = defaultdict(list)
555
+
556
+ # 延迟任务专用状态
557
+ delay_tasks = getattr(self, 'delay_tasks', []) if checker_type == 'delay' else []
558
+
559
+ while True:
560
+ has_work = False
561
+ current_time = time.time()
562
+
563
+ if checker_type == 'delay':
564
+ # 延迟任务逻辑
565
+ put_count = 0
566
+ need_del_index = []
567
+ for i in range(len(delay_tasks)):
568
+ schedule_time = delay_tasks[i][0]
569
+ task = delay_tasks[i][1]
570
+ if schedule_time <= current_time:
571
+ try:
572
+ await self._async_put_task(event_queue, task)
573
+ need_del_index.append(i)
574
+ put_count += 1
575
+ has_work = True
576
+ except IndexError:
577
+ pass
578
+ for i in reversed(need_del_index):  # delete from the back so earlier indexes stay valid
579
+ del delay_tasks[i]
580
+
581
+ elif checker_type == 'solo_agg':
582
+ # Solo聚合任务逻辑
583
+ for queue in self.queues:
584
+ for agg_key, tasks in self.solo_agg_task[queue].items():
585
+ if not tasks:
586
+ continue
587
+
588
+ has_work = True
589
+ need_del_index = []
590
+ need_lock_routing_keys = []
591
+ sort_by_tasks = self.separate_by_key(tasks)
592
+ max_wait_time = 5
593
+ max_records = 3
594
+
595
+ for index, (routing, task) in enumerate(sort_by_tasks):
596
+ routing_key = routing['routing_key']
597
+ max_records = routing.get('max_records', 1)
598
+ max_wait_time = routing.get('max_wait_time', 0)
599
+
600
+ with self.rlock:
601
+ if self.solo_running_state[queue].get(routing_key, 0) > 0:
602
+ continue
603
+
604
+ if len(queue_batch_tasks[queue] + left_queue_batch_tasks[queue]) >= max_records:
605
+ break
606
+
607
+ task["routing"] = routing
608
+
609
+ if self.is_urgent(routing_key):
610
+ left_queue_batch_tasks[queue].append(task)
611
+ else:
612
+ queue_batch_tasks[queue].append(task)
613
+ need_lock_routing_keys.append(routing_key)
614
+ need_del_index.append(index)
615
+
616
+ for routing_key, count in Counter(need_lock_routing_keys).items():
617
+ with self.rlock:
618
+ self.solo_running_state[queue][routing_key] = count
619
+
620
+ if last_solo_running_state[queue] != self.solo_running_state[queue]:
621
+ last_solo_running_state[queue] = self.solo_running_state[queue].copy()
622
+
623
+ tasks = [task for index, task in enumerate(sort_by_tasks) if index not in need_del_index]
624
+ self.solo_agg_task[queue][agg_key] = tasks
625
+
626
+ if (len(queue_batch_tasks[queue] + left_queue_batch_tasks[queue]) >= max_records or
627
+ (last_wait_time[queue] and last_wait_time[queue] < current_time - max_wait_time)):
628
+ for task in queue_batch_tasks[queue]:
629
+ await self._async_put_task(event_queue, task)
630
+ for task in left_queue_batch_tasks[queue]:
631
+ await self._async_put_task(event_queue, task)
632
+ queue_batch_tasks[queue] = []
633
+ left_queue_batch_tasks[queue] = []
634
+ last_wait_time[queue] = 0
635
+ elif last_wait_time[queue] == 0:
636
+ last_wait_time[queue] = current_time
637
+
638
+ # 统一的睡眠策略
639
+ sleep_time = self._get_optimal_sleep_time(has_work, checker_type)
640
+ await asyncio.sleep(sleep_time)
641
+
642
+ def _get_optimal_sleep_time(self, has_work: bool, checker_type: str) -> float:
643
+ """获取最优睡眠时间"""
644
+ if checker_type == 'delay':
645
+ return 0.001 if has_work else 1.0
646
+ elif has_work:
647
+ return 0.001 # 有工作时极短休眠
648
+ else:
649
+ return 0.01 # 无工作时短暂休眠
650
+
651
+
652
+ async def async_check_solo_agg_tasks(self, event_queue: asyncio.Queue):
653
+ """异步版本的聚合任务检查"""
654
+ await self._unified_task_checker(event_queue, checker_type='solo_agg')
655
+
656
+ async def check_solo_agg_tasks(self, event_queue: asyncio.Queue):
657
+ """聚合任务检查"""
658
+ await self._unified_task_checker(event_queue, checker_type='solo_agg')
659
+
660
+ def check_sole_tasks(self, event_queue: Union[deque, asyncio.Queue]):
661
+ agg_task_mappings = {queue: defaultdict(list) for queue in self.queues}
662
+ agg_wait_task_mappings = {queue: defaultdict(float) for queue in self.queues}
663
+ task_max_wait_time_mapping = {}
664
+ make_up_for_index_mappings = {queue: defaultdict(int) for queue in self.queues}
665
+ while True:
666
+ put_count = 0
667
+ for queue in self.queues:
668
+ agg_task = agg_task_mappings[queue]
669
+ for routing_key, tasks in self.solo_routing_tasks[queue].items():
670
+ schedule_time = self.task_scheduler[queue][routing_key]
671
+ if tasks:
672
+ for task in tasks:
673
+ prev_routing = task[0]
674
+ if agg_key:= prev_routing.get('agg_key'):
675
+ if not self.running_task_state_mappings[queue][agg_key]:
676
+ self.solo_running_state[queue][routing_key] = False
677
+ break
678
+ if (
679
+ schedule_time <= time.time()
680
+ and self.solo_running_state[queue][routing_key] == False
681
+ ) :
682
+ try:
683
+ routing, task = tasks.pop(0)
684
+ except IndexError:
685
+ continue
686
+ task["routing"] = routing
687
+
688
+ agg_key = routing.get('agg_key')
689
+ if agg_key is not None:
690
+ start_time = agg_wait_task_mappings[queue][agg_key]
691
+ if not start_time:
692
+ agg_wait_task_mappings[queue][agg_key] = time.time()
693
+ start_time = agg_wait_task_mappings[queue][agg_key]
694
+ agg_task[agg_key].append(task)
695
+ max_wait_time = routing.get('max_wait_time', 3)
696
+ task_max_wait_time_mapping[agg_key] = max_wait_time
697
+ if len(agg_task[agg_key])>=routing.get('max_records', 100) or time.time()-start_time>=max_wait_time:
698
+ logger.debug(f'{agg_key=} {len(agg_task[agg_key])} 已满,准备发车!{routing.get("max_records", 100)} {time.time()-start_time} {max_wait_time}')
699
+ for task in agg_task[agg_key]:
700
+ task['routing']['version'] = 1
701
+ self.running_task_state_mappings[queue][agg_key][task['event_id']] = time.time()
702
+ self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
703
+ agg_task[agg_key] = []
704
+ make_up_for_index_mappings[queue][agg_key] = 0
705
+ agg_wait_task_mappings[queue][agg_key] = 0
706
+ else:
707
+ self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
708
+ self.solo_running_state[queue][routing_key] = True
709
+ put_count += 1
710
+ for agg_key in agg_task.keys():
711
+ if not agg_task[agg_key]:
712
+ continue
713
+ start_time = agg_wait_task_mappings[queue][agg_key]
714
+ max_wait_time = task_max_wait_time_mapping[agg_key]
715
+ if make_up_for_index_mappings[queue][agg_key]>= len(agg_task[agg_key])-1:
716
+ make_up_for_index_mappings[queue][agg_key] = 0
717
+ routing = agg_task[agg_key][make_up_for_index_mappings[queue][agg_key]]['routing']
718
+ routing_key = routing['routing_key']
719
+ self.solo_running_state[queue][routing_key] = False
720
+ make_up_for_index_mappings[queue][agg_key] += 1
721
+ if time.time()-start_time>=max_wait_time:
722
+ logger.debug(f'{agg_key=} {len(agg_task[agg_key])}被迫发车! {time.time()-start_time} {max_wait_time}')
723
+ for task in agg_task[agg_key]:
724
+ task['routing']['version'] = 1
725
+ self.running_task_state_mappings[queue][agg_key][task['event_id']] = time.time()
726
+ self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
727
+ agg_task[agg_key] = []
728
+ make_up_for_index_mappings[queue][agg_key] = 0
729
+ agg_wait_task_mappings[queue][agg_key] = 0
730
+ # 优化:根据处理任务数量动态调整休眠时间
731
+ if not put_count:
732
+ time.sleep(0.001)
733
+ elif put_count < 5:
734
+ time.sleep(0.0005) # 少量任务时极短休眠
735
+
736
+ async def check_batch_tasks(self, event_queue: asyncio.Queue):
737
+ """批量任务检查 - 已简化为统一检查器"""
738
+ # 批量任务逻辑已整合到其他检查器中,这个函数保留以兼容
739
+ await asyncio.sleep(0.1)
740
+
741
+ async def check_delay_tasks(self, event_queue: asyncio.Queue):
742
+ """延迟任务检查"""
743
+ await self._unified_task_checker(event_queue, checker_type='delay')
744
+
745
+ def _handle_redis_error(self, error: Exception, consecutive_errors: int, queue: str = None) -> tuple[bool, int]:
746
+ """处理Redis错误的通用方法
747
+ 返回: (should_recreate_connection, new_consecutive_errors)
748
+ """
749
+ if isinstance(error, redis.exceptions.ConnectionError):
750
+ logger.error(f'Redis连接错误: {error}')
751
+ logger.error(traceback.format_exc())
752
+ consecutive_errors += 1
753
+ if consecutive_errors >= 5:
754
+ logger.error(f'连续连接失败{consecutive_errors}次,重新创建连接')
755
+ return True, 0
756
+ return False, consecutive_errors
757
+
758
+ elif isinstance(error, redis.exceptions.ResponseError):
759
+ if "NOGROUP" in str(error) and queue:
760
+ logger.warning(f'队列 {queue} 或消费者组不存在')
761
+ return False, consecutive_errors
762
+ else:
763
+ logger.error(f'Redis错误: {error}')
764
+ logger.error(traceback.format_exc())
765
+ consecutive_errors += 1
766
+ return False, consecutive_errors
767
+ else:
768
+ logger.error(f'意外错误: {error}')
769
+ logger.error(traceback.format_exc())
770
+ consecutive_errors += 1
771
+ return False, consecutive_errors
772
+
773
+ def _process_message_common(self, event_id: str, event_data: dict, queue: str, event_queue, is_async: bool = False, consumer_name: str = None, group_name: str = None):
774
+ """通用的消息处理逻辑,供同步和异步版本使用"""
775
+ # 检查消息是否已被认领,防止重复处理
776
+ if event_id in self._claimed_message_ids:
777
+ logger.debug(f"跳过已认领的消息 {event_id}")
778
+ return event_id
779
+
780
+ # 解析消息中的实际数据
781
+ # event_data 格式: {b'data': b'{"name": "...", "event_id": "...", ...}'}
782
+ actual_event_id = event_id # 默认使用Stream ID
783
+ parsed_event_data = None # 解析后的数据
784
+
785
+ # 检查是否有data字段(Stream消息格式)
786
+ if 'data' in event_data or b'data' in event_data:
787
+ data_field = event_data.get('data') or event_data.get(b'data')
788
+ if data_field:
789
+ try:
790
+ # 直接解析二进制数据,不需要解码
791
+ if isinstance(data_field, bytes):
792
+ parsed_data = loads_str(data_field)
793
+ else:
794
+ parsed_data = data_field
795
+ # 检查是否有原始的event_id(延迟任务会有)
796
+ if 'event_id' in parsed_data:
797
+ actual_event_id = parsed_data['event_id']
798
+ # 使用解析后的数据作为event_data
799
+ parsed_event_data = parsed_data
800
+ except (ValueError, UnicodeDecodeError):
801
+ pass # 解析失败,使用默认的Stream ID
802
+
803
+ # 如果成功解析了数据,使用解析后的数据;否则使用原始数据
804
+ final_event_data = parsed_event_data if parsed_event_data is not None else event_data
805
+
806
+ routing = final_event_data.get("routing")
807
+
808
+ # 从消息体中获取实际的队列名(可能包含优先级后缀)
809
+ # 这确保ACK使用正确的stream key
810
+ actual_queue = final_event_data.get('queue', queue)
811
+
812
+ # 如果没有传入group_name,使用默认值(prefixed_queue)
813
+ if not group_name:
814
+ prefixed_queue = self.get_prefixed_queue_name(queue)
815
+ group_name = prefixed_queue
816
+
817
+ # 提取并确保 offset 在 event_data 中(关键:确保延迟任务的 offset 能被传递到 executor)
818
+ offset = None
819
+ if 'offset' in final_event_data:
820
+ try:
821
+ offset = int(final_event_data['offset'])
822
+ except (ValueError, TypeError):
823
+ pass
824
+ # 如果 final_event_data 中没有 offset,从原始 event_data 中提取(Stream 消息格式)
825
+ elif 'offset' in event_data or b'offset' in event_data:
826
+ offset_field = event_data.get('offset') or event_data.get(b'offset')
827
+ if offset_field:
828
+ try:
829
+ offset = int(offset_field)
830
+ # 将 offset 添加到 final_event_data 中,确保 executor 能提取
831
+ final_event_data['offset'] = offset
832
+ except (ValueError, TypeError):
833
+ pass
834
+
835
+ task_item = {
836
+ "queue": actual_queue, # 使用消息体中的实际队列名(可能包含优先级)
837
+ "event_id": actual_event_id,
838
+ "event_data": final_event_data, # 使用解析后的数据(包含 offset)
839
+ "consumer": consumer_name, # 添加消费者信息
840
+ "group_name": group_name, # 添加group_name用于ACK
841
+ }
842
+
843
+ push_flag = True
844
+ if routing:
845
+ # routing 现在直接是对象,不需要反序列化
846
+ if agg_key := routing.get('agg_key'):
847
+ self.solo_agg_task[queue][agg_key].append(
848
+ [routing, task_item]
849
+ )
850
+ push_flag = False
851
+
852
+ if push_flag:
853
+ if is_async:
854
+ # 这里不能直接await,需要返回一个标记
855
+ return ('async_put', task_item)
856
+ else:
857
+ self._put_task(event_queue, task_item)
858
+
859
+ return event_id
860
+
861
+ async def _start_offline_worker_processor_with_restart(self, queue: str):
862
+ """启动带自动重启机制的离线worker处理器"""
863
+ async def supervisor():
864
+ """监督器任务,负责重启失败的处理器"""
865
+ restart_count = 0
866
+ max_restarts = 10
867
+
868
+ while not self._stop_reading and restart_count < max_restarts:
869
+ try:
870
+ logger.debug(f"Starting offline worker processor for queue {queue} (attempt {restart_count + 1})")
871
+ await self._process_offline_workers(queue)
872
+ # 如果正常退出(stop_reading为True),则不重启
873
+ if self._stop_reading:
874
+ logger.debug(f"Offline worker processor for queue {queue} stopped normally")
875
+ break
876
+ except asyncio.CancelledError:
877
+ logger.debug(f"Offline worker processor for queue {queue} cancelled")
878
+ break
879
+ except Exception as e:
880
+ restart_count += 1
881
+ import traceback
882
+ logger.error(f"Offline worker processor for queue {queue} crashed: {e}")
883
+ logger.error(traceback.format_exc())
884
+ if restart_count < max_restarts:
885
+ wait_time = min(restart_count * 5, 30) # 递增等待时间,最多30秒
886
+ logger.debug(f"Restarting offline worker processor for queue {queue} in {wait_time} seconds...")
887
+ await asyncio.sleep(wait_time)
888
+ else:
889
+ logger.error(f"Offline worker processor for queue {queue} failed {max_restarts} times, giving up")
890
+
891
+ # 创建监督器任务
892
+ asyncio.create_task(supervisor())
893
+
894
+ async def _execute_recovery_for_queue(self, queue: str, log_prefix: str = "Recovery") -> int:
895
+ """
896
+ 执行单个队列的消息恢复(封装通用逻辑)
897
+
898
+ Args:
899
+ queue: 队列名称
900
+ log_prefix: 日志前缀,用于区分调用场景(如"Recovery Event"或"Recovery Fallback")
901
+
902
+ Returns:
903
+ int: 恢复的消息数量
904
+ """
905
+ # 获取该队列的恢复器(如果已创建)
906
+ recovery_key = f"recovery_{queue}"
907
+ recovery = getattr(self, recovery_key, None)
908
+
909
+ if not recovery:
910
+ # 创建新的恢复器
911
+ recovery = OfflineWorkerRecovery(
912
+ async_redis_client=self.async_binary_redis_client,
913
+ redis_prefix=self.redis_prefix,
914
+ worker_prefix='WORKER',
915
+ consumer_manager=self.consumer_manager,
916
+ queue_registry=self.queue_registry,
917
+ worker_state=self.app.worker_state if self.app else None
918
+ )
919
+ setattr(self, recovery_key, recovery)
920
+
921
+ # 获取当前 consumer
922
+ # 统一 group_name 架构:所有队列(包括优先级队列)使用同一个 consumer name
923
+ base_queue = queue
924
+ if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
925
+ base_queue = queue.rsplit(':', 1)[0]
926
+
927
+ try:
928
+ current_consumer = self.consumer_manager.get_consumer_name(base_queue)
929
+ # 不再为优先级队列添加后缀
930
+ except Exception as e:
931
+ logger.warning(f"[{log_prefix}] Failed to get consumer for queue {queue}: {e}")
932
+ raise
933
+
934
+ # 创建一个回调函数,根据 task_name 获取对应的 event_queue
935
+ def get_event_queue_by_task(task_name: str):
936
+ """根据 task_name 获取对应的 event_queue"""
937
+ event_queue_dict = getattr(self, '_event_queue_dict', None)
938
+ if event_queue_dict:
939
+ return event_queue_dict.get(task_name)
940
+ return None
941
+
942
+ # 执行恢复(传入 event_queue_callback)
943
+ recovered = await recovery.recover_offline_workers(
944
+ queue=queue,
945
+ event_queue=None, # 保持为 None,通过 callback 传递
946
+ current_consumer_name=current_consumer,
947
+ event_queue_callback=get_event_queue_by_task # 传入回调函数
948
+ )
949
+
950
+ return recovered
951
+
952
+ async def handle_worker_offline_event(self, worker_id: str, queues: list = None):
953
+ """
954
+ 处理 Worker 离线事件(事件驱动)
955
+ 当收到 Worker 离线通知时立即处理消息转移
956
+
957
+ Args:
958
+ worker_id: 离线的 Worker ID
959
+ queues: Worker 负责的队列列表(可选,如果不提供则从 Redis 读取)
960
+ """
961
+ try:
962
+ logger.info(f"[Recovery Event] Received offline event for worker {worker_id}")
963
+
964
+ # 如果没有提供队列列表,从 Redis 读取
965
+ if not queues:
966
+ worker_key = f"{self.redis_prefix}:WORKER:{worker_id}"
967
+ worker_data = await self.async_redis_client.hgetall(worker_key)
968
+ if worker_data:
969
+ queues_str = worker_data.get(b'queues', b'').decode('utf-8') if isinstance(worker_data.get(b'queues'), bytes) else worker_data.get('queues', '')
970
+ queues = queues_str.split(',') if queues_str else []
971
+
972
+ if not queues:
973
+ logger.warning(f"[Recovery Event] No queues found for worker {worker_id}, skipping recovery")
974
+ return
975
+
976
+ # 获取 event_queue 字典(从 EventPool 保存的引用中)
977
+ event_queue_dict = getattr(self, '_event_queue_dict', None)
978
+ if not event_queue_dict:
979
+ logger.warning(f"[Recovery Event] No event_queue_dict available, recovered messages will not be executed")
980
+
981
+ # 为每个队列触发恢复
982
+ for queue in queues:
983
+ if not queue.strip():
984
+ continue
985
+
986
+ logger.info(f"[Recovery Event] Triggering recovery for worker {worker_id} on queue {queue}")
987
+
988
+ try:
989
+ # 使用封装的方法执行恢复
990
+ recovered = await self._execute_recovery_for_queue(queue, log_prefix="Recovery Event")
991
+
992
+ if recovered > 0:
993
+ logger.info(f"[Recovery Event] Recovered {recovered} messages from worker {worker_id} on queue {queue}")
994
+ else:
995
+ logger.debug(f"[Recovery Event] No messages to recover from worker {worker_id} on queue {queue}")
996
+ except Exception as e:
997
+ logger.warning(f"[Recovery Event] Failed to recover queue {queue}: {e}")
998
+ continue
999
+
1000
+ except Exception as e:
1001
+ logger.error(f"[Recovery Event] Error handling offline event for worker {worker_id}: {e}", exc_info=True)
1002
+
1003
+ async def _process_offline_workers(self, queue: str):
1004
+ """定期检测离线worker并使用XCLAIM转移其pending消息(兜底机制)"""
1005
+ logger.debug(f"Started offline worker processor for queue {queue}")
1006
+
1007
+ # 等待consumer manager初始化
1008
+ base_queue = queue
1009
+ if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
1010
+ base_queue = queue.rsplit(':', 1)[0]
1011
+
1012
+ wait_times = [0.1, 0.2, 0.4, 0.8, 1.6, 3.2]
1013
+ for wait_time in wait_times:
1014
+ try:
1015
+ current_consumer = self.consumer_manager.get_consumer_name(base_queue)
1016
+ if current_consumer:
1017
+ if base_queue != queue:
1018
+ current_consumer = f"{current_consumer}:{queue.rsplit(':', 1)[-1]}"
1019
+ logger.debug(f"Consumer manager initialized for queue {queue}, consumer: {current_consumer}")
1020
+ break
1021
+ except Exception as e:
1022
+ logger.debug(f"Consumer manager not ready yet, waiting {wait_time}s: {e}")
1023
+ await asyncio.sleep(wait_time)
1024
+
1025
+ logger.debug(f"Offline worker processor for queue {queue} is now active")
1026
+
1027
+ # 扫描间隔(拉长到30秒,作为兜底)
1028
+ scan_interval = 30
1029
+
1030
+ scan_count = 0
1031
+ while not self._stop_reading:
1032
+ try:
1033
+ scan_count += 1
1034
+ # 每10次扫描记录一次日志(现在是5分钟一次)
1035
+ # if scan_count % 10 == 1:
1036
+ logger.debug(f"[Recovery Fallback] Periodic scan active for queue {queue} (scan #{scan_count})")
1037
+
1038
+ # 使用封装的方法执行兜底扫描
1039
+ recovered = await self._execute_recovery_for_queue(queue, log_prefix="Recovery Fallback")
1040
+
1041
+ if recovered > 0:
1042
+ logger.warning(f"[Recovery Fallback] Found {recovered} messages in fallback scan for queue {queue} - event-driven recovery may have missed them")
1043
+
1044
+ except Exception as e:
1045
+ logger.error(f"Error in offline worker processor for queue {queue}: {e}", exc_info=True)
1046
+
1047
+ # 等待下一次扫描
1048
+ await asyncio.sleep(scan_interval)
1049
+
1050
+ logger.debug(f"Stopped offline worker processor for queue {queue}")
1051
+
1052
+ async def _perform_self_recovery(self, queues: set, event_queue: dict):
1053
+ """
1054
+ 在worker启动时执行"自我恢复"
1055
+
1056
+ 场景:Worker复用了离线worker ID,但此时worker已经变为在线状态(is_alive=true),
1057
+ 周期性扫描只查找is_alive=false的worker,会漏掉当前worker之前的pending消息。
1058
+
1059
+ 解决方案:主动恢复"当前worker"的pending消息,无论is_alive状态如何。
1060
+
1061
+ Args:
1062
+ queues: 需要恢复的队列集合(包括优先级队列)
1063
+ event_queue: 事件队列字典
1064
+ """
1065
+ logger.info("[Recovery Self] Starting self-recovery for current worker...")
1066
+
1067
+ # 获取当前 worker ID
1068
+ current_worker_id = None
1069
+ if self.app and hasattr(self.app, 'worker_id'):
1070
+ current_worker_id = self.app.worker_id
1071
+
1072
+ if not current_worker_id:
1073
+ logger.debug("[Recovery Self] No worker_id available, skipping self-recovery")
1074
+ return
1075
+
1076
+ worker_key = f"{self.redis_prefix}:WORKER:{current_worker_id}"
1077
+ logger.info(f"[Recovery Self] Checking pending messages for worker: {current_worker_id}")
1078
+
1079
+ # event_queue callback
1080
+ def get_event_queue_by_task(task_name: str):
1081
+ """根据 task_name 获取对应的 event_queue"""
1082
+ if event_queue:
1083
+ return event_queue.get(task_name)
1084
+ return None
1085
+
1086
+ total_recovered = 0
1087
+
1088
+ # 按队列恢复消息
1089
+ for queue in queues:
1090
+ try:
1091
+ # 获取基础队列名
1092
+ base_queue = queue
1093
+ if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
1094
+ base_queue = queue.rsplit(':', 1)[0]
1095
+
1096
+ # 等待 consumer manager 初始化
1097
+ current_consumer = None
1098
+ for _ in range(5):
1099
+ try:
1100
+ current_consumer = self.consumer_manager.get_consumer_name(base_queue)
1101
+ if current_consumer:
1102
+ # 优先级队列需要添加后缀
1103
+ if base_queue != queue:
1104
+ priority_suffix = queue.rsplit(':', 1)[-1]
1105
+ current_consumer = f"{current_consumer}:{priority_suffix}"
1106
+ break
1107
+ except:
1108
+ pass
1109
+ await asyncio.sleep(0.1)
1110
+
1111
+ if not current_consumer:
1112
+ logger.warning(f"[Recovery Self] Cannot get consumer for queue {queue}, skipping")
1113
+ continue
1114
+
1115
+ # 构建 stream_key
1116
+ stream_key = f"{self.redis_prefix}:QUEUE:{queue}"
1117
+
1118
+ # 获取 group_info
1119
+ worker_data = await self.async_redis_client.hgetall(worker_key)
1120
+ if not worker_data:
1121
+ logger.debug(f"[Recovery Self] Worker {current_worker_id} has no data")
1122
+ continue
1123
+
1124
+ # 解码 worker_data
1125
+ decoded_worker_data = {}
1126
+ for k, v in worker_data.items():
1127
+ key = k.decode('utf-8') if isinstance(k, bytes) else k
1128
+ value = v.decode('utf-8') if isinstance(v, bytes) else v
1129
+ decoded_worker_data[key] = value
1130
+
1131
+ # 提取 group_info
1132
+ group_infos = []
1133
+ for key, value in decoded_worker_data.items():
1134
+ if key.startswith('group_info:'):
1135
+ try:
1136
+ group_info = json.loads(value)
1137
+ if group_info.get('queue') == base_queue:
1138
+ group_infos.append(group_info)
1139
+ except Exception as e:
1140
+ logger.error(f"[Recovery Self] Error parsing group_info: {e}")
1141
+
1142
+ if not group_infos:
1143
+ logger.debug(f"[Recovery Self] No group_info for queue {queue}")
1144
+ continue
1145
+
1146
+ # 尝试恢复每个 group 的 pending 消息
1147
+ for group_info in group_infos:
1148
+ try:
1149
+ task_name = group_info.get('task_name')
1150
+ group_name = group_info.get('group_name')
1151
+
1152
+ if not task_name or not group_name:
1153
+ continue
1154
+
1155
+ # 构建离线 consumer 名称
1156
+ # 统一 group_name 架构:所有队列使用同一个 consumer name
1157
+ offline_consumer_name = group_info.get('consumer_name')
1158
+
1159
+ # 检查是否有 pending 消息
1160
+ pending_info = await self.async_binary_redis_client.xpending(stream_key, group_name)
1161
+ if not pending_info or pending_info.get('pending', 0) == 0:
1162
+ continue
1163
+
1164
+ # 查询详细的 pending 消息
1165
+ detailed_pending = await self.async_binary_redis_client.xpending_range(
1166
+ stream_key, group_name,
1167
+ min='-', max='+', count=100,
1168
+ consumername=offline_consumer_name
1169
+ )
1170
+
1171
+ if not detailed_pending:
1172
+ continue
1173
+
1174
+ logger.info(
1175
+ f"[Recovery Self] Found {len(detailed_pending)} pending messages "
1176
+ f"for worker {current_worker_id}, queue {queue}, task {task_name}"
1177
+ )
1178
+
1179
+ # 认领消息
1180
+ message_ids = [msg['message_id'] for msg in detailed_pending]
1181
+ claimed_messages = await self.async_binary_redis_client.xclaim(
1182
+ stream_key, group_name, current_consumer,
1183
+ min_idle_time=0, # 立即认领
1184
+ message_ids=message_ids
1185
+ )
1186
+
1187
+ if claimed_messages:
1188
+ logger.info(
1189
+ f"[Recovery Self] Claimed {len(claimed_messages)} messages "
1190
+ f"from {offline_consumer_name} to {current_consumer}"
1191
+ )
1192
+
1193
+ # 将消息放入 event_queue
1194
+ task_event_queue = get_event_queue_by_task(task_name)
1195
+ if task_event_queue:
1196
+ for msg_id, msg_data in claimed_messages:
1197
+ if isinstance(msg_id, bytes):
1198
+ msg_id = msg_id.decode('utf-8')
1199
+
1200
+ data_field = msg_data.get(b'data') or msg_data.get('data')
1201
+ if data_field:
1202
+ try:
1203
+ import msgpack
1204
+ parsed_data = msgpack.unpackb(data_field, raw=False)
1205
+ parsed_data['_task_name'] = task_name
1206
+ parsed_data['queue'] = queue
1207
+
1208
+ task_item = {
1209
+ 'queue': queue,
1210
+ 'event_id': msg_id,
1211
+ 'event_data': parsed_data,
1212
+ 'consumer': current_consumer,
1213
+ 'group_name': group_name
1214
+ }
1215
+
1216
+ await task_event_queue.put(task_item)
1217
+ total_recovered += 1
1218
+ except Exception as e:
1219
+ logger.error(f"[Recovery Self] Error processing message: {e}")
1220
+ else:
1221
+ logger.warning(f"[Recovery Self] No event_queue for task {task_name}")
1222
+
1223
+ except Exception as e:
1224
+ logger.error(f"[Recovery Self] Error recovering group {group_info}: {e}", exc_info=True)
1225
+
1226
+ except Exception as e:
1227
+ logger.error(f"[Recovery Self] Error recovering queue {queue}: {e}", exc_info=True)
1228
+
1229
+ if total_recovered > 0:
1230
+ logger.info(f"[Recovery Self] Self-recovery completed: recovered {total_recovered} messages")
1231
+ else:
1232
+ logger.info("[Recovery Self] Self-recovery completed: no pending messages found")
1233
+
1234
+ async def _update_read_offset(self, queue: str, group_name: str, offset: int):
1235
+ """更新已读取的offset(只更新最大值)
1236
+
1237
+ Args:
1238
+ queue: 队列名(不带前缀,可能包含优先级后缀,如 "robust_bench2:8")
1239
+ group_name: consumer group名称(格式:{prefix}:QUEUE:{base_queue}:{task_name})
1240
+ offset: 读取的offset值
1241
+ """
1242
+ try:
1243
+ if offset is None:
1244
+ return
1245
+
1246
+ read_offset_key = f"{self.redis_prefix}:READ_OFFSETS"
1247
+
1248
+ # 从 group_name 中提取 task_name(最后一段)
1249
+ task_name = group_name.split(':')[-1]
1250
+
1251
+ # 构建 field:队列名(含优先级)+ 任务名
1252
+ # 例如:robust_bench2:8:benchmark_task
1253
+ field = f"{queue}:{task_name}"
1254
+
1255
+ # 使用Lua脚本原子地更新最大offset
1256
+ await self.async_redis_client.eval(
1257
+ UPDATE_MAX_OFFSET_LUA,
1258
+ 2, # keys数量
1259
+ read_offset_key, # KEYS[1]
1260
+ field, # KEYS[2]
1261
+ offset # ARGV[1]
1262
+ )
1263
+ logger.debug(f"Updated read offset for {field}: {offset}")
1264
+ except Exception as e:
1265
+ logger.error(f"Error updating read offset: {e}")
1266
+
1267
+ async def listening_event(self, event_queue: dict, prefetch_multiplier: int = 1):
1268
+ """监听事件 - 为每个task创建独立的consumer group
1269
+
1270
+ Args:
1271
+ event_queue: dict[str, asyncio.Queue] - 按task_name隔离的队列字典
1272
+ prefetch_multiplier: 预取倍数
1273
+ """
1274
+ # 验证参数类型
1275
+ if not isinstance(event_queue, dict):
1276
+ raise TypeError(f"event_queue must be a dict[str, asyncio.Queue], got {type(event_queue)}")
1277
+
1278
+ # 保存 event_queue 字典的引用,供事件驱动的恢复使用
1279
+ self._event_queue_dict = event_queue
1280
+
1281
+ logger.info(f"Using task-isolated event queue mode for tasks: {list(event_queue.keys())}")
1282
+
1283
+ # 创建一个字典来存储每个队列的延迟任务 - 使用list + Lock更高效
1284
+ for queue in self.queues:
1285
+ self._delayed_tasks_lists[queue] = []
1286
+ self._delayed_tasks_locks[queue] = asyncio.Lock()
1287
+
1288
+ # 保存所有创建的任务,以便清理时能够取消它们
1289
+ self._background_tasks = []
1290
+
1291
+ # group信息将在每个task监听时记录
1292
+
1293
+ async def listen_event_by_task(queue, task_name):
1294
+ """为单个任务监听事件"""
1295
+ # 恢复读取历史 pending 消息的逻辑
1296
+ check_backlog = {} # {queue_name: bool} - 首次读取 pending 消息
1297
+ lastid = {} # 每个队列的lastid - 首次为 "0",后续为 ">"
1298
+ consecutive_errors = 0
1299
+ max_consecutive_errors = 5
1300
+
1301
+ # 获取当前task使用的event_queue
1302
+ task_event_queue = event_queue.get(task_name)
1303
+ if not task_event_queue:
1304
+ logger.error(f"No event queue found for task {task_name}")
1305
+ return
1306
+
1307
+ # 获取任务对象
1308
+ task = self.app._tasks.get(task_name)
1309
+ if not task:
1310
+ logger.error(f"Task {task_name} not found")
1311
+ return
1312
+
1313
+ # 定义必要的变量
1314
+ prefixed_queue = self.get_prefixed_queue_name(queue)
1315
+ consumer_name = self.consumer_manager.get_consumer_name(queue)
1316
+ # 使用函数名作为group_name,实现任务隔离(用于后续消息处理)
1317
+ group_name = f"{prefixed_queue}:{task_name}"
1318
+
1319
+ # 直接获取所有优先级队列(包括默认队列)
1320
+ priority_queues = await self.get_priority_queues_direct(queue)
1321
+ all_queues = [prefixed_queue] + priority_queues # 默认队列 + 优先级队列
1322
+
1323
+ # 为基础队列创建 consumer group 并记录 group_info
1324
+ base_group_name = await self._ensure_consumer_group_and_record_info(
1325
+ prefixed_queue, task_name, consumer_name
1326
+ )
1327
+
1328
+ # 为优先级队列创建 consumer group(共享基础队列的 group_name)
1329
+ for pq in priority_queues:
1330
+ await self._ensure_consumer_group_and_record_info(
1331
+ pq, task_name, consumer_name, base_group_name=base_group_name
1332
+ )
1333
+
1334
+ # ✅ 初始化每个队列:首次读取 pending 消息(从 "0" 开始)
1335
+ for q in all_queues:
1336
+ lastid[q] = "0" # 首次读取历史消息
1337
+ check_backlog[q] = True # 首次读取 pending 消息
1338
+
1339
+ # 获取该队列的延迟任务列表和锁
1340
+ delayed_list = self._delayed_tasks_lists.get(queue)
1341
+ delayed_lock = self._delayed_tasks_locks.get(queue)
1342
+
1343
+ # 记录上次优先级队列更新时间和上次group_info检查时间
1344
+ last_priority_update = time.time()
1345
+ last_group_info_check = time.time()
1346
+
1347
+ while not self._stop_reading:
1348
+ # 定期直接从Redis获取优先级队列(每1秒检查一次)
1349
+ current_time = time.time()
1350
+ if current_time - last_priority_update >= 1: # 简化为固定1秒间隔
1351
+ new_priority_queues = await self.get_priority_queues_direct(queue)
1352
+
1353
+ # 如果优先级队列有变化,更新本地变量
1354
+ if new_priority_queues != priority_queues:
1355
+ logger.debug(f"Priority queues updated for {queue}: {priority_queues} -> {new_priority_queues}")
1356
+ priority_queues = new_priority_queues
1357
+ all_queues = [prefixed_queue] + priority_queues
1358
+
1359
+ # 为新的优先级队列创建consumer group(共享基础队列的 group_name)
1360
+ for q in all_queues:
1361
+ if q not in lastid: # 这是新队列
1362
+ await self._ensure_consumer_group_and_record_info(
1363
+ q, task_name, consumer_name, base_group_name=group_name
1364
+ )
1365
+ logger.debug(f"Ensured consumer group for new priority queue {q}")
1366
+
1367
+ # ✅ 初始化新队列:读取历史 pending 消息
1368
+ lastid[q] = "0"
1369
+ check_backlog[q] = True
1370
+
1371
+ last_priority_update = current_time
1372
+
1373
+ # 定期检查并恢复group_info(每10秒检查一次)
1374
+ if current_time - last_group_info_check >= 10:
1375
+ # 检查worker key中是否缺失group_info
1376
+ if self.consumer_manager:
1377
+ worker_key = self.consumer_manager._heartbeat_strategy._worker_key
1378
+ try:
1379
+ # 检查第一个队列的group_info是否存在
1380
+ first_queue = all_queues[0] if all_queues else prefixed_queue
1381
+ first_group_name = f"{first_queue}:{task_name}"
1382
+ field_name = f"group_info:{first_group_name}"
1383
+
1384
+ group_info_exists = await self.async_redis_client.hexists(worker_key, field_name)
1385
+
1386
+ # 如果group_info不存在,说明worker key可能被重建了,需要恢复group_info
1387
+ if not group_info_exists:
1388
+ logger.info(f"Detected missing group_info for task {task_name}, restoring...")
1389
+
1390
+ # 恢复基础队列的 group_info
1391
+ await self._ensure_consumer_group_and_record_info(prefixed_queue, task_name, consumer_name)
1392
+
1393
+ # 为优先级队列重新创建 consumer group(共享基础队列的 group_name)
1394
+ for pq in priority_queues:
1395
+ await self._ensure_consumer_group_and_record_info(
1396
+ pq, task_name, consumer_name, base_group_name=group_name
1397
+ )
1398
+
1399
+ logger.info(f"Restored group_info and consumer groups for {len(all_queues)} queues for task {task_name}")
1400
+ except Exception as e:
1401
+ logger.error(f"Error checking/restoring group_info: {e}", exc_info=True)
1402
+
1403
+ last_group_info_check = current_time
1404
+
1405
+ # 批量获取并处理延迟任务(使用list更高效)
1406
+ if delayed_list:
1407
+ # 原子地交换list内容
1408
+ async with delayed_lock:
1409
+ if delayed_list:
1410
+ # 快速拷贝并清空原list
1411
+ tasks_to_process = delayed_list.copy()
1412
+ delayed_list.clear()
1413
+ else:
1414
+ tasks_to_process = []
1415
+
1416
+ # 处理所有延迟任务
1417
+ if tasks_to_process:
1418
+ my_tasks = [] # 属于当前task的任务
1419
+ other_tasks = [] # 属于其他task的任务
1420
+
1421
+ for delayed_task in tasks_to_process:
1422
+ # Format returned by the Scanner: {'event_id': '...', 'queue': '...'}
1423
+ # There is no data field; the payload must be fetched via XCLAIM
1424
+
1425
+ # Note: the new Scanner only returns message IDs, not the data
1426
+ # The data is fetched later via XCLAIM
1427
+ task_data = delayed_task.get('data', None)
1428
+
1429
+ # 如果task_data存在(兼容旧版本Scanner)
1430
+ if task_data:
1431
+ if isinstance(task_data, str):
1432
+ import json
1433
+ task_data = json.loads(task_data)
1434
+
1435
+ # 检查消息是否指定了目标task
1436
+ target_tasks = task_data.get('_target_tasks', None)
1437
+ if target_tasks and task_name not in target_tasks:
1438
+ # 这个消息不是给当前task的
1439
+ other_tasks.append(delayed_task)
1440
+ continue
1441
+
1442
+ # 当前task处理这个任务
1443
+ # task_data可能为None,会在后续通过XCLAIM获取
1444
+ my_tasks.append((delayed_task, task_data))
1445
+
1446
+ # 处理属于当前task的所有任务
1447
+ # 按队列分组延迟任务的 offset(因为可能来自不同的优先级队列)
1448
+ max_offsets_by_queue = {}
1449
+
1450
+ for delayed_task, task_data in my_tasks:
1451
+ event_id = delayed_task.get('event_id', f"delayed-{time.time()}")
1452
+
1453
+ # 获取任务来自哪个队列(可能包含优先级后缀)
1454
+ task_queue = delayed_task.get('queue', queue)
1455
+
1456
+ # 如果task_data为None,说明Scanner只返回了消息ID
1457
+ # 需要使用XCLAIM从Stream中claim消息并转移所有权
1458
+ if task_data is None:
1459
+ prefixed_queue = self.get_prefixed_queue_name(task_queue)
1460
+ try:
1461
+ # 使用XCLAIM转移消息所有权
1462
+ # min_idle_time设为0,强制claim
1463
+ claimed_messages = await self.async_binary_redis_client.xclaim(
1464
+ name=prefixed_queue,
1465
+ groupname=group_name,
1466
+ consumername=consumer_name,
1467
+ min_idle_time=0, # 立即claim,不管idle时间
1468
+ message_ids=[event_id]
1469
+ )
1470
+
1471
+ if not claimed_messages:
1472
+ logger.warning(f"Failed to claim delayed message {event_id} from queue {task_queue}")
1473
+ continue
1474
+
1475
+ # 解析claimed消息
1476
+ claimed_msg = claimed_messages[0] # [(stream_id, fields)]
1477
+ if isinstance(claimed_msg, (list, tuple)) and len(claimed_msg) >= 2:
1478
+ fields = claimed_msg[1]
1479
+
1480
+ # 将fields转换为字典
1481
+ task_data_dict = {}
1482
+ if isinstance(fields, dict):
1483
+ task_data_dict = fields
1484
+ elif isinstance(fields, list):
1485
+ for j in range(0, len(fields), 2):
1486
+ if j + 1 < len(fields):
1487
+ key = fields[j]
1488
+ value = fields[j + 1]
1489
+ task_data_dict[key] = value
1490
+
1491
+ # 解析data字段
1492
+ data_field = task_data_dict.get('data') or task_data_dict.get(b'data')
1493
+ if data_field:
1494
+ task_data = loads_str(data_field)
1495
+
1496
+ # 提取 offset 字段(关键:确保延迟任务的 offset 能被记录)
1497
+ offset_field = task_data_dict.get('offset') or task_data_dict.get(b'offset')
1498
+ if offset_field:
1499
+ try:
1500
+ offset_value = int(offset_field) if isinstance(offset_field, (int, str)) else int(offset_field.decode())
1501
+ task_data['offset'] = offset_value
1502
+ except (ValueError, TypeError, AttributeError):
1503
+ logger.debug(f"Failed to extract offset from claimed message {event_id}")
1504
+ else:
1505
+ logger.warning(f"No data field in claimed message {event_id}")
1506
+ continue
1507
+ else:
1508
+ logger.warning(f"Invalid claimed message format for {event_id}")
1509
+ continue
1510
+
1511
+ except Exception as e:
1512
+ logger.error(f"Error claiming delayed message {event_id}: {e}", exc_info=True)
1513
+ continue
1514
+
1515
+ task_data['_task_name'] = task_name
1516
+
1517
+ # Record the delay precision (for debugging)
1518
+ if 'execute_at' in task_data:
1519
+ delay_error = time.time() - task_data['execute_at']
1520
+ if abs(delay_error) > 0.1: # only log errors above 100ms
1521
+ logger.debug(f'Delayed task {event_id} execution error: {delay_error*1000:.1f}ms')
1522
+
1523
+ # 收集每个队列的最大offset
1524
+ if 'offset' in task_data:
1525
+ try:
1526
+ message_offset = int(task_data['offset'])
1527
+ if task_queue not in max_offsets_by_queue or message_offset > max_offsets_by_queue[task_queue]:
1528
+ max_offsets_by_queue[task_queue] = message_offset
1529
+ except (ValueError, TypeError):
1530
+ pass
1531
+
1532
+ # 所有队列(包括优先级队列)都使用基础队列的 group_name
1533
+ result = self._process_message_common(
1534
+ event_id, task_data, task_queue, task_event_queue,
1535
+ is_async=True, consumer_name=consumer_name, group_name=group_name
1536
+ )
1537
+ if isinstance(result, tuple) and result[0] == 'async_put':
1538
+ await self._async_put_task(task_event_queue, result[1])
1539
+
1540
+ # 批量更新每个队列的最大offset(所有队列使用同一个 group_name)
1541
+ for task_queue, max_offset in max_offsets_by_queue.items():
1542
+ asyncio.create_task(self._update_read_offset(task_queue, group_name, max_offset))
1543
+
1544
+ # 把不属于当前task的任务放回list
1545
+ if other_tasks:
1546
+ async with delayed_lock:
1547
+ delayed_list.extend(other_tasks)
1548
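
A hedged, standalone sketch of the XCLAIM-based recovery used above for delayed tasks that arrive as bare message IDs (assumes a JSON-encoded data field and redis.asyncio; jettask itself uses its own loads_str serializer, and the offset extraction is omitted):

    import json
    import redis.asyncio as redis

    async def claim_message(r: redis.Redis, stream: str, group: str,
                            consumer: str, message_id: str) -> dict | None:
        # min_idle_time=0 forcibly transfers ownership regardless of idle time.
        claimed = await r.xclaim(
            name=stream, groupname=group, consumername=consumer,
            min_idle_time=0, message_ids=[message_id],
        )
        if not claimed:
            return None  # already ACKed or deleted by another consumer
        _msg_id, fields = claimed[0]
        raw = fields.get(b"data") or fields.get("data")
        return json.loads(raw) if raw else None
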
+
1549
+ # Handle normal Stream messages (with priority-queue support)
1550
+ # Implement true priority consumption:
1551
+ # 1. First check whether the in-memory event_queue is full
1552
+ # 2. Read from higher-priority queues first
1553
+ # 3. Only read lower-priority queues once the higher-priority ones are empty
1554
+ # 4. Never exceed the prefetch_multiplier limit
1555
+
1556
+ # 检查内存队列是否已满
1557
+ current_queue_size = task_event_queue.qsize() if hasattr(task_event_queue, 'qsize') else 0
1558
+ if current_queue_size >= prefetch_multiplier:
1559
+ # 内存队列已满,等待处理
1560
+ await asyncio.sleep(0.01) # 短暂等待
1561
+ continue
1562
+
1563
+ messages = []
1564
+ messages_needed = prefetch_multiplier - current_queue_size # 还能读取的消息数
1565
+
1566
+ if messages_needed <= 0:
1567
+ # 不需要读取更多消息
1568
+ await asyncio.sleep(0.01)
1569
+ continue
1570
+
1571
+ # 优化:预先检查哪些队列有待读取的消息,避免在空队列上浪费时间
1572
+ # ✅ 但如果队列需要读取 pending 消息(check_backlog=True),则跳过该队列的 offset 检查
1573
+ queues_with_messages = []
1574
+
1575
+ try:
1576
+ # 批量获取已发送和已读取的offset
1577
+ queue_offsets_key = f"{self.redis_prefix}:QUEUE_OFFSETS"
1578
+ read_offsets_key = f"{self.redis_prefix}:READ_OFFSETS"
1579
+
1580
+ # 使用pipeline批量获取offset
1581
+ pipe = self.async_redis_client.pipeline()
1582
+
1583
+ # 获取所有队列的已发送offset
1584
+ for q in all_queues:
1585
+ # 从队列名中提取实际的队列名(去掉前缀)
1586
+ actual_queue = q.replace(f"{self.redis_prefix}:QUEUE:", "")
1587
+ pipe.hget(queue_offsets_key, actual_queue)
1588
+
1589
+ # 提取 task_name(从 group_name 中)
1590
+ task_name = group_name.split(':')[-1]
1591
+
1592
+ # 获取所有队列的已读取offset
1593
+ for q in all_queues:
1594
+ actual_queue = q.replace(f"{self.redis_prefix}:QUEUE:", "")
1595
+ # field 格式:队列名(含优先级):任务名
1596
+ field = f"{actual_queue}:{task_name}"
1597
+ pipe.hget(read_offsets_key, field)
1598
+
1599
+ results = await pipe.execute()
1600
+
1601
+ # 分析结果,确定哪些队列有待读取的消息
1602
+ half_len = len(all_queues)
1603
+ for i, q in enumerate(all_queues):
1604
+ # ✅ 如果该队列需要读取 pending 消息,直接加入列表,跳过 offset 检查
1605
+ if check_backlog.get(q, False):
1606
+ queues_with_messages.append(q)
1607
+ logger.debug(f"Queue {q} needs to read pending messages, skipping offset check")
1608
+ continue
1609
+
1610
+ sent_offset = results[i] # 已发送的offset
1611
+ read_offset = results[half_len + i] # 已读取的offset
1612
+
1613
+ # 转换为整数
1614
+ sent = int(sent_offset) if sent_offset else 0
1615
+ read = int(read_offset) if read_offset else 0
1616
+
1617
+ # 如果已发送的offset大于已读取的offset,说明有消息待读取
1618
+ if sent > read:
1619
+ queues_with_messages.append(q)
1620
+ logger.debug(f"Queue {q} has {sent - read} unread messages (sent={sent}, read={read})")
1621
+
1622
+ # 如果没有队列有消息,记录下来(不再使用原始队列列表避免空读)
1623
+ if not queues_with_messages:
1624
+ logger.debug("No queues have unread messages, will wait for new messages")
1625
+
1626
+ except Exception as e:
1627
+ # 出错时回退到原始逻辑
1628
+ logger.debug(f"Failed to check queue offsets: {e}")
1629
+ queues_with_messages = all_queues
1630
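
A minimal sketch of the sent-vs-read offset comparison performed above, assuming the QUEUE_OFFSETS / READ_OFFSETS hash layout and field naming described in the comments (the helper name is illustrative):

    import redis.asyncio as redis

    async def queues_with_backlog(r: redis.Redis, prefix: str,
                                  queues: list[str], task_name: str) -> list[str]:
        pipe = r.pipeline()
        for q in queues:
            pipe.hget(f"{prefix}:QUEUE_OFFSETS", q)                    # last offset written
        for q in queues:
            pipe.hget(f"{prefix}:READ_OFFSETS", f"{q}:{task_name}")    # last offset read
        results = await pipe.execute()
        half = len(queues)
        pending = []
        for i, q in enumerate(queues):
            sent = int(results[i] or 0)
            read = int(results[half + i] or 0)
            if sent > read:
                pending.append(q)
        return pending
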
+
1631
+ # print(f'{queues_with_messages=}')
1632
+ # 按优先级顺序读取有消息的队列
1633
+ for q in queues_with_messages:
1634
+ if messages_needed <= 0:
1635
+ break # 已经读取足够的消息
1636
+
1637
+ q_bytes = q.encode() if isinstance(q, str) else q
1638
+ # 针对具体队列检查是否需要读取历史消息
1639
+ if check_backlog.get(q, True):
1640
+ myid = lastid.get(q, "0-0")
1641
+ else:
1642
+ myid = ">"
1643
+ myid_bytes = myid.encode() if isinstance(myid, str) else myid
1644
+
1645
+ try:
1646
+ # print(f'{myid_bytes=} {consumer_name=} {check_backlog=} {q_bytes=}')
1647
+ # 所有队列(包括优先级队列)都使用基础队列的 group_name
1648
+ # 从当前优先级队列读取(最多读取messages_needed个)
1649
+ q_messages = await self.async_binary_redis_client.xreadgroup(
1650
+ groupname=group_name,
1651
+ consumername=consumer_name,
1652
+ streams={q_bytes: myid_bytes},
1653
+ count=messages_needed, # only read the number still needed
1654
+ block=100 # block for at most 100 ms
1655
+ )
1656
+ if q_messages:
1657
+ logger.info(f"Read messages from {q}: {len(q_messages[0][1]) if q_messages else 0} messages")
1658
+ # if check_backlog.get(q, True):
1659
+ # print(f'先处理历史消息:{q_bytes=} {group_name=} {q_messages=}')
1660
+ # 记录从哪个队列读取的
1661
+ messages.extend(q_messages)
1662
+ messages_read = len(q_messages[0][1]) if q_messages else 0
1663
+ messages_needed -= messages_read
1664
+
1665
+ # 如果高优先级队列还有消息,继续从该队列读取
1666
+ # 直到该队列空了或者达到prefetch限制
1667
+ if messages_read > 0 and messages_needed > 0:
1668
+ # 该队列可能还有更多消息,下次循环继续优先从这个队列读
1669
+ # 但现在先处理已读取的消息
1670
+ break # 跳出for循环,处理已有消息
1671
+
1672
+ except Exception as e:
1673
+ if "NOGROUP" in str(e):
1674
+ # consumer group 不存在(可能是 Redis 被清空了),重新创建
1675
+ logger.warning(f"NOGROUP error for queue {q}, recreating consumer group...")
1676
+ try:
1677
+ # 为队列创建 consumer group(共享基础队列的 group_name)
1678
+ await self._ensure_consumer_group_and_record_info(
1679
+ q, task_name, consumer_name, base_group_name=group_name
1680
+ )
1681
+ logger.info(f"Recreated consumer group for queue {q}")
1682
+
1683
+ # 重新初始化这个队列的 lastid 和 check_backlog
1684
+ lastid[q] = "0"
1685
+ check_backlog[q] = True
1686
+
1687
+ # 确保这个队列在 all_queues 中(可能因 Redis 清空而丢失)
1688
+ if q not in all_queues:
1689
+ all_queues.append(q)
1690
+ # 同时更新 priority_queues(如果是优先级队列)
1691
+ if q != prefixed_queue and q not in priority_queues:
1692
+ priority_queues.append(q)
1693
+ logger.info(f"Re-added queue {q} to all_queues after NOGROUP recovery")
1694
+ except Exception as recreate_error:
1695
+ logger.error(f"Failed to recreate consumer group for {q}: {recreate_error}")
1696
+ else:
1697
+ logger.debug(f"Error reading from queue {q}: {e}")
1698
+ continue
1699
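
For clarity, a simplified model of the backlog-then-new read pattern used in this loop: "0" re-reads the consumer's own pending entries, ">" asks only for never-delivered entries, and the per-pass budget caps prefetching. This is a sketch, not the jettask implementation, and it does not preserve the strict priority break.

    import redis.asyncio as redis

    async def read_batch(r: redis.Redis, group: str, consumer: str,
                         streams: list[str], budget: int,
                         check_backlog: dict[str, bool],
                         lastid: dict[str, str]) -> list:
        collected = []
        for stream in streams:
            if budget <= 0:
                break
            # "0" replays this consumer's pending entries; ">" reads new ones.
            start_id = lastid.get(stream, "0") if check_backlog.get(stream, True) else ">"
            batch = await r.xreadgroup(
                groupname=group, consumername=consumer,
                streams={stream: start_id}, count=budget, block=100,
            )
            if batch:
                _name, entries = batch[0]
                if not entries:
                    check_backlog[stream] = False  # backlog drained, switch to ">"
                for entry_id, _fields in entries:
                    lastid[stream] = entry_id
                budget -= len(entries)
                collected.extend(batch)
        return collected
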
+
1700
+
1701
+ try:
1702
+ # logger.debug(f'{group_name=} {consumer_name=} {block_time=}')
1703
+ consecutive_errors = 0
1704
+ # if check_backlog and messages:
1705
+ # logger.debug(f'先消费之前的消息 {group_name=} ')
1706
+ # logger.debug(f'{check_backlog=} {messages=}')
1707
+
1708
+ # 上报已投递的offset(用于积压监控)
1709
+ try:
1710
+ from jettask.utils.stream_backlog import report_delivered_offset
1711
+ # 对每个stream的消息上报offset
1712
+ for msg in messages:
1713
+ stream_name = msg[0]
1714
+ if isinstance(stream_name, bytes):
1715
+ stream_name = stream_name.decode('utf-8')
1716
+ # 提取队列名(去掉前缀)
1717
+ queue_name = stream_name.replace(f"{self.redis_prefix}:STREAM:", "")
1718
+ await report_delivered_offset(
1719
+ self.async_redis_client,
1720
+ self.redis_prefix,
1721
+ queue_name,
1722
+ group_name,
1723
+ [msg]
1724
+ )
1725
+ except Exception as e:
1726
+ # 监控失败不影响主流程
1727
+ logger.debug(f"Failed to report delivered offset: {e}")
1728
+
1729
+ # 收集需要跳过的消息ID
1730
+ skip_message_ids = []
1731
+
1732
+ # 用于记录每个队列的最大offset(批量更新)
1733
+ max_offsets_per_queue = {}
1734
+
1735
+ for message in messages:
1736
+ # print(f'{message=}')
1737
+ # message[0]是stream名称,message[1]是消息列表
1738
+ stream_name = message[0]
1739
+ if isinstance(stream_name, bytes):
1740
+ stream_name = stream_name.decode('utf-8')
1741
+
1742
+ # 根据这个具体队列的消息数量,更新该队列的check_backlog状态
1743
+ if len(message[1]) == 0:
1744
+ # 这个队列没有历史消息了,下次读取最新消息
1745
+ check_backlog[stream_name] = False
1746
+
1747
+ for event in message[1]:
1748
+ event_id = event[0]
1749
+ # 更新对应队列的lastid
1750
+ lastid[stream_name] = event_id
1751
+ # 将bytes类型的event_id转换为字符串
1752
+ if isinstance(event_id, bytes):
1753
+ event_id = event_id.decode('utf-8')
1754
+ event_data = event[1]
1755
+
1756
+ # 解析消息内容,决定是否处理
1757
+ should_process = True
1758
+
1759
+ try:
1760
+ # 解析data字段中的消息
1761
+ if b'data' in event_data or 'data' in event_data:
1762
+ data_field = event_data.get(b'data') or event_data.get('data')
1763
+
1764
+ # 直接解析二进制数据,不需要解码
1765
+ parsed_data = loads_str(data_field)
1766
+
1767
+ # 跳过延迟任务(延迟任务由延迟扫描器处理)
1768
+ # 但如果任务已到期,或者正在从 pending 恢复,则应该处理
1769
+ if parsed_data.get('is_delayed') == 1:
1770
+ # 检查是否已到期
1771
+ execute_at = parsed_data.get('execute_at')
1772
+ current_time = time.time()
1773
+
1774
+ if execute_at and execute_at > current_time:
1775
+ # 未到期,跳过(由Scanner处理)
1776
+ should_process = False
1777
+ continue
1778
+
1779
+ # 已到期或无execute_at字段,继续处理
1780
+ # 这种情况可能是:
1781
+ # 1. 延迟任务已到期,正在被执行
1782
+ # 2. 从 pending 恢复的已到期任务
1783
+ logger.debug(f"Processing expired delayed task {event_id}")
1784
+
1785
+ # 每个task都有独立的consumer group
1786
+ # 检查消息是否指定了目标task(用于精确路由)
1787
+ target_tasks = parsed_data.get('_target_tasks', None)
1788
+ if target_tasks and task_name not in target_tasks:
1789
+ # 这个消息指定了其他task处理
1790
+ should_process = False
1791
+
1792
+ if should_process:
1793
+ # 添加task_name到数据中(用于执行器识别任务)
1794
+ parsed_data['_task_name'] = task_name
1795
+
1796
+ # 提取offset字段(如果存在)
1797
+ offset_field = event_data.get(b'offset') or event_data.get('offset')
1798
+ message_offset = None
1799
+ if offset_field:
1800
+ # 将offset添加到parsed_data中
1801
+ if isinstance(offset_field, bytes):
1802
+ offset_field = offset_field.decode('utf-8')
1803
+ parsed_data['offset'] = offset_field
1804
+ try:
1805
+ message_offset = int(offset_field)
1806
+ except (ValueError, TypeError):
1807
+ pass
1808
+
1809
+ # 更新event_data
1810
+ event_data.clear()
1811
+ for key, value in parsed_data.items():
1812
+ event_data[key] = value
1813
+
1814
+ # 收集每个队列的最大offset(不要每条消息都记录)
1815
+ if message_offset is not None:
1816
+ # 从stream_name提取实际的队列名
1817
+ actual_queue_name = stream_name.replace(f"{self.redis_prefix}:QUEUE:", "")
1818
+ # 更新该队列的最大offset
1819
+ if actual_queue_name not in max_offsets_per_queue:
1820
+ max_offsets_per_queue[actual_queue_name] = message_offset
1821
+ else:
1822
+ max_offsets_per_queue[actual_queue_name] = max(max_offsets_per_queue[actual_queue_name], message_offset)
1823
+
1824
+ logger.debug(f"Task {task_name} will process message {event_id}")
1825
+ else:
1826
+ # 没有data字段,跳过消息
1827
+ should_process = False
1828
+ except Exception as e:
1829
+ logger.error(f"Task {task_name}: Error parsing message data: {e}")
1830
+
1831
+ if should_process:
1832
+ # 处理消息 - 消息会被放入队列,由执行器处理并ACK
1833
+ # 使用消息体中的实际队列名(可能包含优先级)
1834
+ actual_queue = event_data.get('queue', queue)
1835
+
1836
+ # 统一 group_name 架构:所有队列(包括优先级队列)使用同一个 consumer name
1837
+ # 不再需要为优先级队列添加后缀
1838
+ result = self._process_message_common(
1839
+ event_id, event_data, actual_queue, task_event_queue,
1840
+ is_async=True, consumer_name=consumer_name, group_name=group_name
1841
+ )
1842
+ if isinstance(result, tuple) and result[0] == 'async_put':
1843
+ await self._async_put_task(task_event_queue, result[1])
1844
+ logger.debug(f"Put task {event_id} into task_event_queue")
1845
+ # 注意:这里不ACK,由执行器在处理完成后ACK
1846
+ else:
1847
+ # 不属于当前task的消息,收集起来批量ACK
1848
+ skip_message_ids.append(event_id)
1849
+
1850
+
1851
+ # 批量ACK不需要的消息(所有队列使用同一个 group_name)
1852
+ if skip_message_ids:
1853
+ group_name_bytes = group_name.encode() if isinstance(group_name, str) else group_name
1854
+ for q in all_queues:
1855
+ q_bytes = q.encode() if isinstance(q, str) else q
1856
+ try:
1857
+ await self.async_binary_redis_client.xack(q_bytes, group_name_bytes, *skip_message_ids)
1858
+ except Exception:
1859
+ pass # ignore ACK errors
1860
+ logger.debug(f"Task {task_name} batch ACKed {len(skip_message_ids)} skipped messages")
1861
+
1862
+ # 批量更新每个队列的最大已读取offset(所有队列使用同一个 group_name)
1863
+ if max_offsets_per_queue:
1864
+ for queue_name, max_offset in max_offsets_per_queue.items():
1865
+ asyncio.create_task(self._update_read_offset(queue_name, group_name, max_offset))
1866
+ logger.debug(f"Updated read offsets for {len(max_offsets_per_queue)} queues")
1867
+
1868
+ except Exception as e:
1869
+ error_msg = str(e)
1870
+ # import traceback
1871
+ # traceback.print_exc()
1872
+ logger.error(f"Error in task listener {task_name}: {e}")
1873
+
1874
+ # 特殊处理:如果是NOGROUP错误,尝试重新创建consumer group
1875
+ if "NOGROUP" in error_msg:
1876
+ logger.warning(f"Detected NOGROUP error for {task_name}, attempting to recreate consumer groups...")
1877
+ try:
1878
+ # 为所有队列创建consumer group并记录group_info(使用统一方法)
1879
+ for q in all_queues:
1880
+ await self._ensure_consumer_group_and_record_info(q, task_name, consumer_name)
1881
+ logger.info(f"Recreated consumer groups for {len(all_queues)} queues for task {task_name}")
1882
+
1883
+ # 重新初始化所有队列的 lastid 和 check_backlog
1884
+ for q in all_queues:
1885
+ lastid[q] = "0"
1886
+ check_backlog[q] = True
1887
+
1888
+ # 重新创建成功,重置错误计数器
1889
+ consecutive_errors = 0
1890
+ continue
1891
+ except Exception as create_error:
1892
+ logger.error(f"Failed to recreate consumer groups for {task_name}: {create_error}")
1893
+
1894
+ consecutive_errors += 1
1895
+ if consecutive_errors >= max_consecutive_errors:
1896
+ logger.error(f"Too many errors for task {task_name}, restarting...")
1897
+ consecutive_errors = 0
1898
+ await asyncio.sleep(min(consecutive_errors, 5))
1899
+
1900
+ logger.debug(f"Starting event listeners for queues: {self.queues}")
1901
+ tasks = []
1902
+
1903
+ if not (self.app and hasattr(self.app, '_tasks_by_queue')):
1904
+ raise RuntimeError("No app or tasks registered, cannot start listeners")
1905
+
1906
+ # 为每个队列注册延迟任务回调
1907
+ async def handle_expired_tasks(queue: str, tasks: list):
1908
+ """处理到期的延迟任务"""
1909
+ if tasks:
1910
+ async with self._delayed_tasks_locks[queue]:
1911
+ self._delayed_tasks_lists[queue].extend(tasks)
1912
+
1913
+ for queue in self.queues:
1914
+ # 创建队列专用的回调函数
1915
+ import functools
1916
+ callback = functools.partial(handle_expired_tasks, queue)
1917
+ self.delayed_scanner.register_callback(queue, callback)
1918
+
1919
+ # 启动延迟消息扫描器
1920
+ await self.delayed_scanner.start(self.queues)
1921
+ logger.info(f"Delayed message scanner started for queues: {self.queues}")
1922
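
A small, runnable sketch of the per-queue callback wiring shown above, with a stand-in scanner class (only the register_callback-style hook of DelayedMessageScanner is assumed; the class and method names below are illustrative):

    import asyncio
    import functools

    class DummyScanner:
        # Stand-in for DelayedMessageScanner: hands expired tasks to the
        # callback registered for each queue.
        def __init__(self):
            self._callbacks = {}
        def register_callback(self, queue, cb):
            self._callbacks[queue] = cb
        async def fire(self, queue, tasks):
            await self._callbacks[queue](tasks)

    async def main():
        delayed_lists = {"default": []}
        delayed_locks = {"default": asyncio.Lock()}

        async def handle_expired(queue, tasks):
            # Append expired delayed tasks under the queue's lock.
            async with delayed_locks[queue]:
                delayed_lists[queue].extend(tasks)

        scanner = DummyScanner()
        for queue in delayed_lists:
            scanner.register_callback(queue, functools.partial(handle_expired, queue))
        await scanner.fire("default", [{"event_id": "1-0", "queue": "default"}])
        print(delayed_lists)

    asyncio.run(main())
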
+
1923
+ # 为每个队列启动离线worker处理器(带自动重启)
1924
+ # 包括优先级队列
1925
+ all_recovery_queues = set(self.queues)
1926
+ for base_queue in self.queues:
1927
+ # 扫描优先级队列
1928
+ priority_queues = await self.scan_priority_queues(base_queue)
1929
+ for pq in priority_queues:
1930
+ if pq != base_queue: # 不重复添加基础队列
1931
+ all_recovery_queues.add(pq)
1932
+
1933
+ # ✅ Before starting the offline worker processors, trigger one round of "self-recovery"
1934
+ # This handles the "reused worker ID" scenario:
1935
+ # - After a worker reuses an offline worker's ID, it immediately shows up as online
1936
+ # - But at that point the worker's previous pending messages have not been recovered yet
1937
+ # - The periodic scan only looks for workers with is_alive=false, so it misses a just-reused worker
1938
+ # - Therefore the current worker's pending messages must be recovered proactively at startup
1939
+ # logger.info("[Recovery] Performing initial self-recovery check on startup...")
1940
+ # try:
1941
+ # await self._perform_self_recovery(all_recovery_queues, event_queue)
1942
+ # except Exception as e:
1943
+ # logger.error(f"Error during initial self-recovery: {e}", exc_info=True)
1944
+
1945
+ # 为所有队列(包括优先级队列)启动离线worker处理器
1946
+ for queue in all_recovery_queues:
1947
+ logger.debug(f"Starting offline worker processor for queue: {queue}")
1948
+ offline_processor_task = asyncio.create_task(
1949
+ self._start_offline_worker_processor_with_restart(queue) # 移除 event_queue 参数
1950
+ )
1951
+ tasks.append(offline_processor_task)
1952
+ self._background_tasks.append(offline_processor_task)
1953
+
1954
+ # # 为每个task创建独立的listener
1955
+ for queue in self.queues:
1956
+ task_names = self.app._tasks_by_queue.get(queue, [])
1957
+ if not task_names:
1958
+ raise RuntimeError(f"No tasks registered for queue '{queue}'. Cannot start worker without tasks.")
1959
+
1960
+ for task_name in task_names:
1961
+ logger.debug(f"Starting listener for task: {task_name} on queue: {queue}")
1962
+ task = asyncio.create_task(listen_event_by_task(queue, task_name))
1963
+ tasks.append(task)
1964
+ self._background_tasks.append(task)
1965
+
1966
+ try:
1967
+ # 等待所有任务
1968
+ await asyncio.gather(*tasks)
1969
+ except asyncio.CancelledError:
1970
+ logger.debug("listening_event tasks cancelled, cleaning up...")
1971
+
1972
+ # 停止延迟消息扫描器
1973
+ await self.delayed_scanner.stop()
1974
+
1975
+ # 取消所有后台任务
1976
+ for task in self._background_tasks:
1977
+ if not task.done():
1978
+ task.cancel()
1979
+ # 等待所有任务完成(使用return_exceptions=True避免再次抛出异常)
1980
+ if self._background_tasks:
1981
+ try:
1982
+ await asyncio.wait_for(
1983
+ asyncio.gather(*self._background_tasks, return_exceptions=True),
1984
+ timeout=0.2
1985
+ )
1986
+ except asyncio.TimeoutError:
1987
+ logger.debug("Some background tasks did not complete in time")
1988
+ raise
1989
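
The shutdown path above follows a common cancel-then-bounded-wait pattern; a generic sketch:

    import asyncio

    async def shutdown_background_tasks(tasks: list[asyncio.Task], timeout: float = 0.2) -> None:
        # Cancel anything still running, then wait briefly; return_exceptions=True
        # keeps CancelledError from propagating out of gather().
        for task in tasks:
            if not task.done():
                task.cancel()
        if tasks:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*tasks, return_exceptions=True), timeout=timeout
                )
            except asyncio.TimeoutError:
                pass  # some tasks did not finish in time; stop waiting
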
+
1990
+ # 注意:延迟任务扫描逻辑已迁移到 DelayedMessageScanner 独立模块
1991
+ # 旧的 _scan_and_load_delayed_tasks_to_list 和 _scan_and_load_delayed_tasks 方法已删除
1992
+
1993
+ async def _claim_delayed_tasks(self, queue: str, event_queue: asyncio.Queue, prefetch_multiplier: int):
1994
+ """处理延迟队列中的到期任务"""
1995
+ try:
1996
+ # 检查队列大小,如果已满则不处理
1997
+ if event_queue.qsize() >= max(prefetch_multiplier // 2, 1):
1998
+ return
1999
+
2000
+ current_time = time.time()
2001
+ delayed_queue_key = f"{self.redis_prefix}:DELAYED_QUEUE:{queue}"
2002
+ consumer_name = self.consumer_manager.get_consumer_name(queue)
2003
+ prefixed_queue = self.get_prefixed_queue_name(queue)
2004
+
2005
+ # 计算需要获取的任务数量
2006
+ count_to_claim = max(1, prefetch_multiplier - event_queue.qsize())
2007
+
2008
+ # Lua脚本:原子性地获取到期任务、认领、删除成功认领的任务
2009
+ lua_script = """
2010
+ local delayed_queue_key = KEYS[1]
2011
+ local stream_key = KEYS[2]
2012
+ local group_name = KEYS[3]
2013
+ local consumer_name = ARGV[1]
2014
+ local current_time = ARGV[2]
2015
+ local limit = ARGV[3]
2016
+
2017
+ -- 获取到期的任务ID(这些是Stream消息ID)
2018
+ local expired_tasks = redis.call('ZRANGEBYSCORE', delayed_queue_key, 0, current_time, 'LIMIT', 0, limit)
2019
+
2020
+ if #expired_tasks == 0 then
2021
+ return {}
2022
+ end
2023
+
2024
+ local successfully_claimed = {}
2025
+ local claimed_messages = {}
2026
+
2027
+ -- 尝试认领每个任务
2028
+ for i, task_id in ipairs(expired_tasks) do
2029
+ -- 先检查消息的pending信息
2030
+ local pending_info = redis.call('XPENDING', stream_key, group_name, task_id, task_id, 1)
2031
+
2032
+ if #pending_info > 0 then
2033
+ -- pending_info[1] 格式: {id, consumer, idle_time, delivery_count}
2034
+ local idle_time = pending_info[1][3]
2035
+
2036
+ -- 只认领空闲时间超过1秒的消息(避免认领刚被读取的消息)
2037
+ if idle_time > 1000 then
2038
+ -- 使用XCLAIM认领消息
2039
+ local claim_result = redis.call('XCLAIM', stream_key, group_name, consumer_name, 0, task_id)
2040
+
2041
+ if #claim_result > 0 then
2042
+ -- 认领成功,记录任务ID
2043
+ table.insert(successfully_claimed, task_id)
2044
+ -- 保存认领到的消息内容
2045
+ for j, msg in ipairs(claim_result) do
2046
+ table.insert(claimed_messages, msg)
2047
+ end
2048
+ end
2049
+ end
2050
+ else
2051
+ -- 消息不在pending列表中,可能还没被读取,跳过
2052
+ -- 但保留在ZSET中,等待正常读取
2053
+ end
2054
+ end
2055
+
2056
+ -- 只删除成功认领的任务
2057
+ if #successfully_claimed > 0 then
2058
+ redis.call('ZREM', delayed_queue_key, unpack(successfully_claimed))
2059
+ end
2060
+
2061
+ -- 返回认领到的消息
2062
+ return claimed_messages
2063
+ """
2064
+
2065
+ # 注册Lua脚本(如果还没有注册)
2066
+ if not hasattr(self, '_atomic_claim_script'):
2067
+ self._atomic_claim_script = self.async_redis_client.register_script(lua_script)
2068
+
2069
+ # 执行Lua脚本
2070
+ try:
2071
+ claimed_messages = await self._atomic_claim_script(
2072
+ keys=[delayed_queue_key, prefixed_queue, prefixed_queue],
2073
+ args=[consumer_name, str(current_time), str(count_to_claim)]
2074
+ )
2075
+
2076
+ if not claimed_messages:
2077
+ return
2078
+
2079
+ # claimed_messages 是嵌套列表,每个元素是 [msg_id, msg_data_fields]
2080
+ # 其中 msg_data_fields 是扁平的键值对列表
2081
+ for claimed_message in claimed_messages:
2082
+ if isinstance(claimed_message, list) and len(claimed_message) >= 2:
2083
+ msg_id = claimed_message[0]
2084
+ msg_data_fields = claimed_message[1]
2085
+
2086
+ # 解析消息数据
2087
+ msg_data = {}
2088
+ if isinstance(msg_data_fields, list):
2089
+ for j in range(0, len(msg_data_fields), 2):
2090
+ if j + 1 < len(msg_data_fields):
2091
+ key = msg_data_fields[j]
2092
+ value = msg_data_fields[j + 1]
2093
+ # 保持bytes格式以匹配正常消息处理
2094
+ if isinstance(key, str):
2095
+ key = key.encode()
2096
+ if isinstance(value, str):
2097
+ value = value.encode()
2098
+ msg_data[key] = value
2099
+
2100
+ # 清除延迟标记
2101
+ if b'data' in msg_data:
2102
+ data_field = msg_data.get(b'data')
2103
+ if data_field:
2104
+ try:
2105
+ # 直接解析二进制数据
2106
+ parsed_data = loads_str(data_field)
2107
+ # 清除延迟标记,避免再次被延迟
2108
+ parsed_data['is_delayed'] = 0
2109
+ # dumps_str 现在直接返回二进制
2110
+ updated_data = dumps_str(parsed_data)
2111
+ msg_data[b'data'] = updated_data
2112
+ except Exception:
2113
+ pass # keep the original payload if re-serialization fails
2114
+
2115
+ # 处理消息
2116
+ result = self._process_message_common(
2117
+ msg_id, msg_data, queue, event_queue,
2118
+ is_async=True, consumer_name=consumer_name
2119
+ )
2120
+ if isinstance(result, tuple) and result[0] == 'async_put':
2121
+ await self._async_put_task(event_queue, result[1])
2122
+
2123
+ logger.debug(f"Claimed and processed delayed task {msg_id} from queue {queue}")
2124
+
2125
+ logger.debug(f"Processed {len(claimed_messages)} delayed tasks for queue {queue}")
2126
+
2127
+ except Exception as e:
2128
+ logger.error(f"Error executing atomic claim script: {e}")
2129
+
2130
+ except Exception as e:
2131
+ logger.error(f"Error processing delayed tasks for queue {queue}: {e}")
2132
+ # 错误不应该阻塞主流程
2133
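
A much smaller example of the register_script pattern used in _claim_delayed_tasks above: the Lua body here only pops due members from a sorted set, without the XPENDING/XCLAIM steps (key and argument names are illustrative):

    import time
    import redis.asyncio as redis

    POP_DUE = """
    local due = redis.call('ZRANGEBYSCORE', KEYS[1], 0, ARGV[1], 'LIMIT', 0, ARGV[2])
    if #due > 0 then
        redis.call('ZREM', KEYS[1], unpack(due))
    end
    return due
    """

    async def pop_due_members(r: redis.Redis, zset_key: str, limit: int = 10) -> list:
        # register_script caches the script by SHA and uses EVALSHA on later calls.
        script = r.register_script(POP_DUE)
        return await script(keys=[zset_key], args=[time.time(), limit])
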
+ def read_pending(self, groupname: str, queue: str, asyncio: bool = False):
2134
+ client = self.get_redis_client(asyncio, binary=True)
2135
+ prefixed_queue = self.get_prefixed_queue_name(queue)
2136
+ return client.xpending(prefixed_queue, groupname)
2137
+
2138
+ def ack(self, queue, event_id, asyncio: bool = False):
2139
+ client = self.get_redis_client(asyncio, binary=True)
2140
+ prefixed_queue = self.get_prefixed_queue_name(queue)
2141
+ result = client.xack(prefixed_queue, prefixed_queue, event_id)
2142
+ # 清理已认领的消息ID
2143
+ if event_id in self._claimed_message_ids:
2144
+ self._claimed_message_ids.remove(event_id)
2145
+ return result
2146
+ def _safe_redis_operation(self, operation, *args, max_retries=3, **kwargs):
2147
+ """
2148
+ Safe Redis operation with a retry mechanism.
2149
+
2150
+ Note: the Redis connection pool is configured with infinite retry (InfiniteRetry) and handles connection failures automatically.
2151
+ The retries here mainly cover transient application-level errors.
2152
+ """
2153
+ for attempt in range(max_retries):
2154
+ try:
2155
+ return operation(*args, **kwargs)
2156
+ except (redis.exceptions.TimeoutError, redis.exceptions.ConnectionError) as e:
2157
+ if attempt == max_retries - 1:
2158
+ logger.error(f"Redis操作失败,已重试{max_retries}次: {e}")
2159
+ raise
2160
+
2161
+ logger.warning(f"Redis操作失败,第{attempt + 1}次重试: {e}")
2162
+ # 不需要手动重新创建连接,连接池会自动重试
2163
+ time.sleep(min(2 ** attempt, 5)) # 指数退避,最多5秒
2164
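
An asyncio counterpart of the retry helper above, as a sketch (the original is synchronous and uses time.sleep):

    import asyncio
    import logging
    import redis.exceptions

    logger = logging.getLogger(__name__)

    async def safe_redis_operation(operation, *args, max_retries: int = 3, **kwargs):
        # Retry transient Redis failures with exponential backoff capped at 5 seconds.
        for attempt in range(max_retries):
            try:
                return await operation(*args, **kwargs)
            except (redis.exceptions.TimeoutError, redis.exceptions.ConnectionError) as e:
                if attempt == max_retries - 1:
                    logger.error("Redis operation failed after %d retries: %s", max_retries, e)
                    raise
                logger.warning("Redis operation failed, retry %d: %s", attempt + 1, e)
                await asyncio.sleep(min(2 ** attempt, 5))
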
+
2165
+ def cleanup(self):
2166
+ """清理EventPool资源"""
2167
+ # 立即设置停止标志,阻止后台任务继续处理
2168
+ self._stop_reading = True
2169
+
2170
+ # 只有在有实际资源需要清理时才打印日志
2171
+ has_active_resources = False
2172
+
2173
+ # 检查是否有活跃的消费者管理器
2174
+ if hasattr(self, 'consumer_manager') and self.consumer_manager:
2175
+ # 检查消费者管理器是否真的有活动
2176
+ if hasattr(self.consumer_manager, '_heartbeat_strategy'):
2177
+ strategy = self.consumer_manager._heartbeat_strategy
2178
+ if strategy and hasattr(strategy, 'consumer_id') and strategy.consumer_id:
2179
+ has_active_resources = True
2180
+
2181
+ if has_active_resources:
2182
+ logger.debug("Cleaning up EventPool resources...")
2183
+ self.consumer_manager.cleanup()
2184
+ logger.debug("EventPool cleanup completed")
2185
+ else:
2186
+ # 静默清理
2187
+ if hasattr(self, 'consumer_manager') and self.consumer_manager:
2188
+ self.consumer_manager.cleanup()