jettask 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. jettask/__init__.py +60 -2
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.18.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
@@ -1,1700 +0,0 @@
1
- from ..utils.serializer import dumps_str, loads_str
2
- import time
3
- import threading
4
- import logging
5
- import asyncio
6
- from collections import defaultdict, deque, Counter
7
- from typing import List, Optional, TYPE_CHECKING, Union
8
-
9
- import redis
10
- from redis import asyncio as aioredis
11
-
12
-
13
- from ..utils.helpers import get_hostname
14
- import os
15
- from .consumer_manager import ConsumerManager, ConsumerStrategy
16
- from .offline_worker_recovery import OfflineWorkerRecovery
17
-
18
- logger = logging.getLogger('app')
19
-
20
-
21
- class EventPool(object):
22
- STATE_MACHINE_NAME = "STATE_MACHINE"
23
- TIMEOUT = 60 * 5
24
-
25
- def __init__(
26
- self,
27
- redis_client: redis.StrictRedis,
28
- async_redis_client: aioredis.StrictRedis,
29
- queues: list = None,
30
- redis_url: str = None,
31
- consumer_strategy: str = None,
32
- consumer_config: dict = None,
33
- redis_prefix: str = None,
34
- app=None, # 添加app参数
35
- ) -> None:
36
- self.redis_client = redis_client
37
- self.async_redis_client = async_redis_client
38
- print(f'{redis_url=}')
39
- # 创建用于二进制数据的Redis客户端(用于Stream操作)
40
- from ..core.app import get_binary_redis_pool, get_async_binary_redis_pool
41
- binary_pool = get_binary_redis_pool(redis_url or 'redis://localhost:6379/0')
42
- self.binary_redis_client = redis.StrictRedis(connection_pool=binary_pool)
43
- async_binary_pool = get_async_binary_redis_pool(redis_url or 'redis://localhost:6379/0')
44
- self.async_binary_redis_client = aioredis.StrictRedis(connection_pool=async_binary_pool)
45
-
46
- self.queues = queues
47
- self._redis_url = redis_url or 'redis://localhost:6379/0'
48
- self.redis_prefix = redis_prefix or 'jettask'
49
- self.app = app # 保存app引用
50
-
51
- # 初始化消费者管理器
52
- strategy = ConsumerStrategy(consumer_strategy) if consumer_strategy else ConsumerStrategy.HEARTBEAT
53
- # 确保配置中包含队列信息、redis_url和redis_prefix
54
- manager_config = consumer_config or {}
55
- manager_config['queues'] = queues or []
56
- manager_config['redis_prefix'] = redis_prefix or 'jettask'
57
- manager_config['redis_url'] = redis_url or 'redis://localhost:6379/0'
58
-
59
- # 保存consumer_config供后续使用
60
- self.consumer_config = manager_config
61
-
62
- self.consumer_manager = ConsumerManager(
63
- redis_client=redis_client,
64
- strategy=strategy,
65
- config=manager_config
66
- )
67
-
68
- # 创建带前缀的队列名称映射
69
- self.prefixed_queues = {}
70
-
71
- # 优先级队列管理
72
- self.priority_queues = {} # {base_queue: [queue:1, queue:2, ...]}
73
- self.priority_queues_lock = threading.Lock()
74
- self._last_priority_scan = {} # 记录上次扫描时间
75
- self._priority_scan_interval = 5 # 5秒扫描一次,及时发现新的优先级队列
76
-
77
- # 用于跟踪广播消息
78
- self._broadcast_message_tracker = {}
79
-
80
- self.solo_routing_tasks = {}
81
- self.solo_running_state = {}
82
- self.solo_urgent_retry = {}
83
- self.batch_routing_tasks = {}
84
- self.task_scheduler = {}
85
- self.running_task_state_mappings = {}
86
- self.delay_tasks = []
87
- self.solo_agg_task = {}
88
- self.rlock = threading.RLock()
89
- self._claimed_message_ids = set() # 跟踪已认领的消息ID,防止重复处理
90
- self._stop_reading = False # 用于控制停止读取的标志
91
- self._queue_stop_flags = {queue: False for queue in (queues or [])} # 每个队列的停止标志
92
- # 延迟任务分布式锁的key
93
- self._delay_lock_key = f"{self.redis_prefix}:DELAY_LOCK"
94
-
95
- def _put_task(self, event_queue: Union[deque, asyncio.Queue], task, urgent: bool = False):
96
- """统一的任务放入方法"""
97
- # 如果是deque,使用原有逻辑
98
- if isinstance(event_queue, deque):
99
- if urgent:
100
- event_queue.appendleft(task)
101
- else:
102
- event_queue.append(task)
103
- # 如果是asyncio.Queue,则暂时只能按顺序放入(Queue不支持优先级)
104
- elif isinstance(event_queue, asyncio.Queue):
105
- # 对于asyncio.Queue,我们需要在async上下文中操作
106
- # 这里先保留接口,具体实现在async方法中
107
- pass
108
-
109
- async def _async_put_task(self, event_queue: asyncio.Queue, task, urgent: bool = False):
110
- """异步任务放入方法"""
111
- await event_queue.put(task)
112
-
113
- def init_routing(self):
114
- for queue in self.queues:
115
- self.solo_agg_task[queue] = defaultdict(list)
116
- self.solo_routing_tasks[queue] = defaultdict(list)
117
- self.solo_running_state[queue] = defaultdict(bool)
118
- self.batch_routing_tasks[queue] = defaultdict(list)
119
- self.task_scheduler[queue] = defaultdict(int)
120
- self.running_task_state_mappings[queue] = defaultdict(dict)
121
-
122
- def get_prefixed_queue_name(self, queue: str) -> str:
123
- """为队列名称添加前缀"""
124
- return f"{self.redis_prefix}:QUEUE:{queue}"
125
-
126
-
127
- def get_redis_client(self, asyncio: bool = False, binary: bool = False):
128
- """获取Redis客户端
129
-
130
- Args:
131
- asyncio: 是否使用异步客户端
132
- binary: 是否使用二进制客户端(用于Stream操作)
133
- """
134
- if binary:
135
- return self.async_binary_redis_client if asyncio else self.binary_redis_client
136
- return self.async_redis_client if asyncio else self.redis_client
137
-
138
-
139
- async def _batch_send_event(self, prefixed_queue, messages: List[dict], pipe):
140
- # 使用Lua脚本批量发送消息并添加自增offset
141
- lua_script = """
142
- local stream_key = KEYS[1]
143
- local prefix = ARGV[1]
144
- local results = {}
145
-
146
- -- 使用Hash存储所有队列的offset
147
- local offsets_hash = prefix .. ':QUEUE_OFFSETS'
148
-
149
- -- 从stream_key中提取队列名(去掉prefix:QUEUE:前缀)
150
- local queue_name = string.gsub(stream_key, '^' .. prefix .. ':QUEUE:', '')
151
-
152
- -- 从ARGV[2]开始,每个参数是一个消息的data
153
- for i = 2, #ARGV do
154
- local data = ARGV[i]
155
-
156
- -- 使用HINCRBY原子递增offset(如果不存在会自动创建并设为1)
157
- local current_offset = redis.call('HINCRBY', offsets_hash, queue_name, 1)
158
-
159
- -- 添加消息到Stream(包含offset字段)
160
- local stream_id = redis.call('XADD', stream_key, '*',
161
- 'data', data,
162
- 'offset', current_offset)
163
-
164
- table.insert(results, stream_id)
165
- end
166
-
167
- return results
168
- """
169
-
170
- # 准备Lua脚本参数
171
- lua_args = [self.redis_prefix.encode() if isinstance(self.redis_prefix, str) else self.redis_prefix]
172
-
173
- for message in messages:
174
- # 确保消息格式正确
175
- if 'data' in message:
176
- data = message['data'] if isinstance(message['data'], bytes) else dumps_str(message['data'])
177
- else:
178
- data = dumps_str(message)
179
- lua_args.append(data)
180
-
181
- # 获取异步Redis客户端(不使用pipe,直接使用client)
182
- client = self.get_redis_client(asyncio=True, binary=True)
183
-
184
- # 执行Lua脚本
185
- results = await client.eval(
186
- lua_script,
187
- 1, # 1个KEY
188
- prefixed_queue, # KEY[1]: stream key
189
- *lua_args # ARGV: prefix, data1, data2, ...
190
- )
191
-
192
- # 解码所有返回的Stream ID
193
- return [r.decode('utf-8') if isinstance(r, bytes) else r for r in results]
194
-
195
- def is_urgent(self, routing_key):
196
- is_urgent = self.solo_urgent_retry.get(routing_key, False)
197
- if is_urgent == True:
198
- del self.solo_urgent_retry[routing_key]
199
- return is_urgent
200
-
201
- async def scan_priority_queues(self, base_queue: str) -> list:
202
- """扫描Redis中的优先级队列
203
-
204
- Args:
205
- base_queue: 基础队列名(不带优先级后缀)
206
-
207
- Returns:
208
- 按优先级排序的队列列表
209
- """
210
- pattern = f"{self.redis_prefix}:QUEUE:{base_queue}:*"
211
-
212
- try:
213
- # 使用SCAN命令扫描匹配的键
214
- cursor = 0
215
- priority_queues = set()
216
-
217
- while True:
218
- cursor, keys = await self.async_binary_redis_client.scan(
219
- cursor=cursor,
220
- match=pattern.encode() if isinstance(pattern, str) else pattern,
221
- count=100
222
- )
223
-
224
- for key in keys:
225
- # 解析键名获取优先级
226
- key_str = key.decode() if isinstance(key, bytes) else key
227
- # 提取优先级后缀
228
- parts = key_str.split(':')
229
- if len(parts) >= 4: # jettask:QUEUE:base_queue:priority
230
- queue_with_priority = ':'.join(parts[2:]) # base_queue:priority
231
- priority_queues.add(queue_with_priority)
232
-
233
- if cursor == 0:
234
- break
235
-
236
- # 添加基础队列(无优先级)
237
- priority_queues.add(base_queue)
238
-
239
- # 按优先级排序(数字越小优先级越高)
240
- sorted_queues = []
241
- for q in priority_queues:
242
- if ':' in q:
243
- base, priority = q.rsplit(':', 1)
244
- if base == base_queue and priority.isdigit():
245
- sorted_queues.append((int(priority), q))
246
- else:
247
- sorted_queues.append((float('inf'), q)) # 非数字优先级放最后
248
- else:
249
- sorted_queues.append((float('inf'), q)) # 无优先级放最后
250
-
251
- sorted_queues.sort(key=lambda x: x[0])
252
- return [q[1] for q in sorted_queues]
253
-
254
- except Exception as e:
255
- logger.error(f"Error scanning priority queues for {base_queue}: {e}")
256
- return [base_queue] # 返回基础队列作为fallback
257
-
258
- async def update_priority_queues_cache(self, base_queue: str):
259
- """更新优先级队列缓存
260
-
261
- Args:
262
- base_queue: 基础队列名
263
- """
264
- current_time = time.time()
265
-
266
- # 检查是否需要重新扫描
267
- last_scan = self._last_priority_scan.get(base_queue, 0)
268
- if current_time - last_scan < self._priority_scan_interval:
269
- return # 未到扫描时间
270
-
271
- # 扫描并更新缓存
272
- priority_queues = await self.scan_priority_queues(base_queue)
273
-
274
- with self.priority_queues_lock:
275
- self.priority_queues[base_queue] = priority_queues
276
- self._last_priority_scan[base_queue] = current_time
277
-
278
- # if len(priority_queues) > 1:
279
- # logger.info(f"Updated priority queues for {base_queue}: {priority_queues}")
280
-
281
- def get_priority_queues(self, base_queue: str) -> list:
282
- """获取优先级队列列表(从缓存)
283
-
284
- Args:
285
- base_queue: 基础队列名
286
-
287
- Returns:
288
- 优先级队列列表(已加上前缀)
289
- """
290
- with self.priority_queues_lock:
291
- queues = self.priority_queues.get(base_queue, [])
292
- # 返回已加上前缀的队列名
293
- result = []
294
- for q in queues:
295
- if q == base_queue:
296
- continue # 跳过基础队列,在listen_event_by_task中已经处理
297
- # 只返回优先级队列(带优先级后缀的)
298
- if ':' in q and q.rsplit(':', 1)[1].isdigit():
299
- result.append(f"{self.redis_prefix}:QUEUE:{q}")
300
- return result
301
-
302
- @classmethod
303
- def separate_by_key(cls, lst):
304
- groups = {}
305
- for item in lst:
306
- key = item[0]['routing_key']
307
- if key not in groups:
308
- groups[key] = []
309
- groups[key].append(item)
310
- result = []
311
- group_values = list(groups.values())
312
- while True:
313
- exists_data = False
314
- for values in group_values:
315
- try:
316
- result.append(values.pop(0))
317
- exists_data = True
318
- except:
319
- pass
320
- if not exists_data:
321
- break
322
- return result
323
-
324
- async def _unified_task_checker(self, event_queue: asyncio.Queue, checker_type: str = 'solo_agg'):
325
- """统一的任务检查器,减少代码重复"""
326
- last_solo_running_state = defaultdict(dict)
327
- last_wait_time = defaultdict(int)
328
- queue_batch_tasks = defaultdict(list)
329
- left_queue_batch_tasks = defaultdict(list)
330
-
331
- # 延迟任务专用状态
332
- delay_tasks = getattr(self, 'delay_tasks', []) if checker_type == 'delay' else []
333
-
334
- while True:
335
- has_work = False
336
- current_time = time.time()
337
-
338
- if checker_type == 'delay':
339
- # 延迟任务逻辑
340
- put_count = 0
341
- need_del_index = []
342
- for i in range(len(delay_tasks)):
343
- schedule_time = delay_tasks[i][0]
344
- task = delay_tasks[i][1]
345
- if schedule_time <= current_time:
346
- try:
347
- await self._async_put_task(event_queue, task)
348
- need_del_index.append(i)
349
- put_count += 1
350
- has_work = True
351
- except IndexError:
352
- pass
353
- for i in need_del_index:
354
- del delay_tasks[i]
355
-
356
- elif checker_type == 'solo_agg':
357
- # Solo聚合任务逻辑
358
- for queue in self.queues:
359
- for agg_key, tasks in self.solo_agg_task[queue].items():
360
- if not tasks:
361
- continue
362
-
363
- has_work = True
364
- need_del_index = []
365
- need_lock_routing_keys = []
366
- sort_by_tasks = self.separate_by_key(tasks)
367
- max_wait_time = 5
368
- max_records = 3
369
-
370
- for index, (routing, task) in enumerate(sort_by_tasks):
371
- routing_key = routing['routing_key']
372
- max_records = routing.get('max_records', 1)
373
- max_wait_time = routing.get('max_wait_time', 0)
374
-
375
- with self.rlock:
376
- if self.solo_running_state[queue].get(routing_key, 0) > 0:
377
- continue
378
-
379
- if len(queue_batch_tasks[queue] + left_queue_batch_tasks[queue]) >= max_records:
380
- break
381
-
382
- task["routing"] = routing
383
-
384
- if self.is_urgent(routing_key):
385
- left_queue_batch_tasks[queue].append(task)
386
- else:
387
- queue_batch_tasks[queue].append(task)
388
- need_lock_routing_keys.append(routing_key)
389
- need_del_index.append(index)
390
-
391
- for routing_key, count in Counter(need_lock_routing_keys).items():
392
- with self.rlock:
393
- self.solo_running_state[queue][routing_key] = count
394
-
395
- if last_solo_running_state[queue] != self.solo_running_state[queue]:
396
- last_solo_running_state[queue] = self.solo_running_state[queue].copy()
397
-
398
- tasks = [task for index, task in enumerate(sort_by_tasks) if index not in need_del_index]
399
- self.solo_agg_task[queue][agg_key] = tasks
400
-
401
- if (len(queue_batch_tasks[queue] + left_queue_batch_tasks[queue]) >= max_records or
402
- (last_wait_time[queue] and last_wait_time[queue] < current_time - max_wait_time)):
403
- for task in queue_batch_tasks[queue]:
404
- await self._async_put_task(event_queue, task)
405
- for task in left_queue_batch_tasks[queue]:
406
- await self._async_put_task(event_queue, task)
407
- queue_batch_tasks[queue] = []
408
- left_queue_batch_tasks[queue] = []
409
- last_wait_time[queue] = 0
410
- elif last_wait_time[queue] == 0:
411
- last_wait_time[queue] = current_time
412
-
413
- # 统一的睡眠策略
414
- sleep_time = self._get_optimal_sleep_time(has_work, checker_type)
415
- await asyncio.sleep(sleep_time)
416
-
417
- def _get_optimal_sleep_time(self, has_work: bool, checker_type: str) -> float:
418
- """获取最优睡眠时间"""
419
- if checker_type == 'delay':
420
- return 0.001 if has_work else 1.0
421
- elif has_work:
422
- return 0.001 # 有工作时极短休眠
423
- else:
424
- return 0.01 # 无工作时短暂休眠
425
-
426
-
427
- async def async_check_solo_agg_tasks(self, event_queue: asyncio.Queue):
428
- """异步版本的聚合任务检查"""
429
- await self._unified_task_checker(event_queue, checker_type='solo_agg')
430
-
431
- async def check_solo_agg_tasks(self, event_queue: asyncio.Queue):
432
- """聚合任务检查"""
433
- await self._unified_task_checker(event_queue, checker_type='solo_agg')
434
-
435
- def check_sole_tasks(self, event_queue: Union[deque, asyncio.Queue]):
436
- agg_task_mappings = {queue: defaultdict(list) for queue in self.queues}
437
- agg_wait_task_mappings = {queue: defaultdict(float) for queue in self.queues}
438
- task_max_wait_time_mapping = {}
439
- make_up_for_index_mappings = {queue: defaultdict(int) for queue in self.queues}
440
- while True:
441
- put_count = 0
442
- for queue in self.queues:
443
- agg_task = agg_task_mappings[queue]
444
- for routing_key, tasks in self.solo_routing_tasks[queue].items():
445
- schedule_time = self.task_scheduler[queue][routing_key]
446
- if tasks:
447
- for task in tasks:
448
- prev_routing = task[0]
449
- if agg_key:= prev_routing.get('agg_key'):
450
- if not self.running_task_state_mappings[queue][agg_key]:
451
- self.solo_running_state[queue][routing_key] = False
452
- break
453
- if (
454
- schedule_time <= time.time()
455
- and self.solo_running_state[queue][routing_key] == False
456
- ) :
457
- try:
458
- routing, task = tasks.pop(0)
459
- except IndexError:
460
- continue
461
- task["routing"] = routing
462
-
463
- agg_key = routing.get('agg_key')
464
- if agg_key is not None:
465
- start_time = agg_wait_task_mappings[queue][agg_key]
466
- if not start_time:
467
- agg_wait_task_mappings[queue][agg_key] = time.time()
468
- start_time = agg_wait_task_mappings[queue][agg_key]
469
- agg_task[agg_key].append(task)
470
- max_wait_time = routing.get('max_wait_time', 3)
471
- task_max_wait_time_mapping[agg_key] = max_wait_time
472
- if len(agg_task[agg_key])>=routing.get('max_records', 100) or time.time()-start_time>=max_wait_time:
473
- logger.info(f'{agg_key=} {len(agg_task[agg_key])} 已满,准备发车!{routing.get("max_records", 100)} {time.time()-start_time} {max_wait_time}')
474
- for task in agg_task[agg_key]:
475
- task['routing']['version'] = 1
476
- self.running_task_state_mappings[queue][agg_key][task['event_id']] = time.time()
477
- self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
478
- agg_task[agg_key] = []
479
- make_up_for_index_mappings[queue][agg_key] = 0
480
- agg_wait_task_mappings[queue][agg_key] = 0
481
- else:
482
- self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
483
- self.solo_running_state[queue][routing_key] = True
484
- put_count += 1
485
- for agg_key in agg_task.keys():
486
- if not agg_task[agg_key]:
487
- continue
488
- start_time = agg_wait_task_mappings[queue][agg_key]
489
- max_wait_time = task_max_wait_time_mapping[agg_key]
490
- if make_up_for_index_mappings[queue][agg_key]>= len(agg_task[agg_key])-1:
491
- make_up_for_index_mappings[queue][agg_key] = 0
492
- routing = agg_task[agg_key][make_up_for_index_mappings[queue][agg_key]]['routing']
493
- routing_key = routing['routing_key']
494
- self.solo_running_state[queue][routing_key] = False
495
- make_up_for_index_mappings[queue][agg_key] += 1
496
- if time.time()-start_time>=max_wait_time:
497
- logger.info(f'{agg_key=} {len(agg_task[agg_key])}被迫发车! {time.time()-start_time} {max_wait_time}')
498
- for task in agg_task[agg_key]:
499
- task['routing']['version'] = 1
500
- self.running_task_state_mappings[queue][agg_key][task['event_id']] = time.time()
501
- self._put_task(event_queue, task, urgent=self.is_urgent(routing_key))
502
- agg_task[agg_key] = []
503
- make_up_for_index_mappings[queue][agg_key] = 0
504
- agg_wait_task_mappings[queue][agg_key] = 0
505
- # 优化:根据处理任务数量动态调整休眠时间
506
- if not put_count:
507
- time.sleep(0.001)
508
- elif put_count < 5:
509
- time.sleep(0.0005) # 少量任务时极短休眠
510
-
511
- async def check_batch_tasks(self, event_queue: asyncio.Queue):
512
- """批量任务检查 - 已简化为统一检查器"""
513
- # 批量任务逻辑已整合到其他检查器中,这个函数保留以兼容
514
- await asyncio.sleep(0.1)
515
-
516
- async def check_delay_tasks(self, event_queue: asyncio.Queue):
517
- """延迟任务检查"""
518
- await self._unified_task_checker(event_queue, checker_type='delay')
519
-
520
- def _handle_redis_error(self, error: Exception, consecutive_errors: int, queue: str = None) -> tuple[bool, int]:
521
- """处理Redis错误的通用方法
522
- 返回: (should_recreate_connection, new_consecutive_errors)
523
- """
524
- if isinstance(error, redis.exceptions.ConnectionError):
525
- logger.error(f'Redis连接错误: {error}')
526
- consecutive_errors += 1
527
- if consecutive_errors >= 5:
528
- logger.error(f'连续连接失败{consecutive_errors}次,重新创建连接')
529
- return True, 0
530
- return False, consecutive_errors
531
-
532
- elif isinstance(error, redis.exceptions.ResponseError):
533
- if "NOGROUP" in str(error) and queue:
534
- logger.warning(f'队列 {queue} 或消费者组不存在')
535
- return False, consecutive_errors
536
- else:
537
- logger.error(f'Redis错误: {error}')
538
- consecutive_errors += 1
539
- return False, consecutive_errors
540
- else:
541
- logger.error(f'意外错误: {error}')
542
- consecutive_errors += 1
543
- return False, consecutive_errors
544
-
545
- def _process_message_common(self, event_id: str, event_data: dict, queue: str, event_queue, is_async: bool = False, consumer_name: str = None, group_name: str = None):
546
- """通用的消息处理逻辑,供同步和异步版本使用"""
547
- # 检查消息是否已被认领,防止重复处理
548
- if event_id in self._claimed_message_ids:
549
- logger.debug(f"跳过已认领的消息 {event_id}")
550
- return event_id
551
-
552
- # 解析消息中的实际数据
553
- # event_data 格式: {b'data': b'{"name": "...", "event_id": "...", ...}'}
554
- actual_event_id = event_id # 默认使用Stream ID
555
- parsed_event_data = None # 解析后的数据
556
-
557
- # 检查是否有data字段(Stream消息格式)
558
- if 'data' in event_data or b'data' in event_data:
559
- data_field = event_data.get('data') or event_data.get(b'data')
560
- if data_field:
561
- try:
562
- # 直接解析二进制数据,不需要解码
563
- if isinstance(data_field, bytes):
564
- parsed_data = loads_str(data_field)
565
- else:
566
- parsed_data = data_field
567
- # 检查是否有原始的event_id(延迟任务会有)
568
- if 'event_id' in parsed_data:
569
- actual_event_id = parsed_data['event_id']
570
- # 使用解析后的数据作为event_data
571
- parsed_event_data = parsed_data
572
- except (ValueError, UnicodeDecodeError):
573
- pass # 解析失败,使用默认的Stream ID
574
-
575
- # 如果成功解析了数据,使用解析后的数据;否则使用原始数据
576
- final_event_data = parsed_event_data if parsed_event_data is not None else event_data
577
-
578
- routing = final_event_data.get("routing")
579
-
580
- # 从消息体中获取实际的队列名(可能包含优先级后缀)
581
- # 这确保ACK使用正确的stream key
582
- actual_queue = final_event_data.get('queue', queue)
583
-
584
- # 如果没有传入group_name,使用默认值(prefixed_queue)
585
- if not group_name:
586
- prefixed_queue = self.get_prefixed_queue_name(queue)
587
- group_name = prefixed_queue
588
-
589
- task_item = {
590
- "queue": actual_queue, # 使用消息体中的实际队列名(可能包含优先级)
591
- "event_id": actual_event_id,
592
- "event_data": final_event_data, # 使用解析后的数据
593
- "consumer": consumer_name, # 添加消费者信息
594
- "group_name": group_name, # 添加group_name用于ACK
595
- }
596
-
597
- push_flag = True
598
- if routing:
599
- # routing 现在直接是对象,不需要反序列化
600
- if agg_key := routing.get('agg_key'):
601
- self.solo_agg_task[queue][agg_key].append(
602
- [routing, task_item]
603
- )
604
- push_flag = False
605
-
606
- if push_flag:
607
- if is_async:
608
- # 这里不能直接await,需要返回一个标记
609
- return ('async_put', task_item)
610
- else:
611
- self._put_task(event_queue, task_item)
612
-
613
- return event_id
614
-
615
- async def _start_offline_worker_processor_with_restart(self, queue: str, event_queue: asyncio.Queue):
616
- """启动带自动重启机制的离线worker处理器"""
617
- async def supervisor():
618
- """监督器任务,负责重启失败的处理器"""
619
- restart_count = 0
620
- max_restarts = 10
621
-
622
- while not self._stop_reading and restart_count < max_restarts:
623
- try:
624
- logger.info(f"Starting offline worker processor for queue {queue} (attempt {restart_count + 1})")
625
- await self._process_offline_workers(queue, event_queue)
626
- # 如果正常退出(stop_reading为True),则不重启
627
- if self._stop_reading:
628
- logger.info(f"Offline worker processor for queue {queue} stopped normally")
629
- break
630
- except asyncio.CancelledError:
631
- logger.info(f"Offline worker processor for queue {queue} cancelled")
632
- break
633
- except Exception as e:
634
- restart_count += 1
635
- import traceback
636
- traceback.print_exc()
637
- logger.error(f"Offline worker processor for queue {queue} crashed: {e}")
638
- if restart_count < max_restarts:
639
- wait_time = min(restart_count * 5, 30) # 递增等待时间,最多30秒
640
- logger.info(f"Restarting offline worker processor for queue {queue} in {wait_time} seconds...")
641
- await asyncio.sleep(wait_time)
642
- else:
643
- logger.error(f"Offline worker processor for queue {queue} failed {max_restarts} times, giving up")
644
-
645
- # 创建监督器任务
646
- asyncio.create_task(supervisor())
647
-
648
- async def _process_offline_workers(self, queue: str, event_queue: asyncio.Queue):
649
- """定期检测离线worker并使用XCLAIM转移其pending消息 - 使用独立的恢复模块"""
650
- logger.info(f"Started offline worker processor for queue {queue}")
651
-
652
- # 创建离线worker恢复器
653
- recovery = OfflineWorkerRecovery(
654
- async_redis_client=self.async_binary_redis_client,
655
- redis_prefix=self.redis_prefix,
656
- worker_prefix='WORKER',
657
- consumer_manager=self.consumer_manager
658
- )
659
-
660
- # 等待consumer manager初始化
661
- # 对于优先级队列,使用基础队列名来获取consumer
662
- base_queue = queue
663
- if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
664
- base_queue = queue.rsplit(':', 1)[0]
665
-
666
- wait_times = [0.1, 0.2, 0.4, 0.8, 1.6, 3.2]
667
- for wait_time in wait_times:
668
- try:
669
- current_consumer = self.consumer_manager.get_consumer_name(base_queue)
670
- if current_consumer:
671
- # 对于优先级队列,consumer名称需要添加队列后缀
672
- if base_queue != queue:
673
- current_consumer = f"{current_consumer}:{queue.rsplit(':', 1)[-1]}"
674
- logger.info(f"Consumer manager initialized for queue {queue}, consumer: {current_consumer}")
675
- break
676
- except Exception as e:
677
- logger.debug(f"Consumer manager not ready yet, waiting {wait_time}s: {e}")
678
- await asyncio.sleep(wait_time)
679
-
680
- logger.info(f"Offline worker processor for queue {queue} is now active")
681
-
682
- # 扫描间隔
683
- scan_interval = 2 # 每30秒扫描一次
684
-
685
- while not self._stop_reading:
686
- try:
687
- # 直接恢复所有任务(每个task都有独立的consumer group)
688
- recovered = await recovery.recover_offline_workers(
689
- queue=queue,
690
- event_queue=event_queue,
691
- current_consumer_name=current_consumer
692
- )
693
- if recovered > 0:
694
- logger.info(f"Recovered {recovered} messages on queue {queue}")
695
-
696
- except Exception as e:
697
- import traceback
698
- traceback.print_exc()
699
- logger.error(f"Error in offline worker processor for queue {queue}: {e}")
700
-
701
- # 等待下一次扫描
702
- await asyncio.sleep(scan_interval)
703
-
704
- logger.info(f"Stopped offline worker processor for queue {queue}")
705
-
706
-
707
- async def listening_event(self, event_queue: asyncio.Queue, prefetch_multiplier: int = 1):
708
- """监听事件 - 为每个task创建独立的consumer group"""
709
-
710
- # 创建一个字典来存储每个队列的延迟任务 - 使用list + Lock更高效
711
- delayed_tasks_lists = {}
712
- delayed_tasks_locks = {}
713
- for queue in self.queues:
714
- delayed_tasks_lists[queue] = []
715
- delayed_tasks_locks[queue] = asyncio.Lock()
716
-
717
- # group信息将在每个task监听时记录
718
-
719
- async def scan_delayed_tasks_for_queue(queue: str, task_list: list, task_lock: asyncio.Lock):
720
- """为单个队列独立扫描延迟任务"""
721
- base_interval = self.consumer_config.get('scan_interval', 0.05) # 基础间隔50ms
722
- max_interval = 0.5 # 最大间隔500ms
723
-
724
- logger.info(f'Starting delayed task scanner for queue {queue}, interval={base_interval}')
725
-
726
- while not self._stop_reading:
727
- try:
728
- current_time = time.time()
729
-
730
- # 扫描并获取下一个任务的到期时间
731
- await self._scan_and_load_delayed_tasks_to_list(queue, task_list, task_lock)
732
-
733
- # 动态调整扫描间隔
734
- # 如果有任务被加载,使用较短的间隔
735
- # 否则可以使用较长的间隔以节省CPU
736
- if task_list:
737
- sleep_time = base_interval
738
- else:
739
- # 检查下一个任务的到期时间
740
- delayed_queue_key = f"{self.redis_prefix}:DELAYED_QUEUE:{queue}"
741
- result = await self.async_binary_redis_client.zrange(
742
- delayed_queue_key, 0, 0, withscores=True
743
- )
744
-
745
- if result:
746
- next_task_time = result[0][1]
747
- # 计算到下一个任务的时间,但不超过max_interval
748
- sleep_time = min(max_interval, max(base_interval, next_task_time - current_time - 0.01))
749
- else:
750
- sleep_time = max_interval
751
-
752
- except Exception as e:
753
- import traceback
754
- # traceback.print_exc()
755
- logger.error(f"Error scanning delayed tasks for queue {queue}: {e}")
756
- sleep_time = base_interval
757
-
758
- await asyncio.sleep(sleep_time)
759
-
760
- async def listen_event_by_task(queue, task_name):
761
- """为单个任务监听事件"""
762
- # 为每个队列单独管理check_backlog状态
763
- check_backlog = {} # {queue_name: bool}
764
- lastid = {} # 每个队列的lastid
765
- consecutive_errors = 0
766
- max_consecutive_errors = 5
767
-
768
- # 获取任务对象
769
- task = self.app._tasks.get(task_name)
770
- if not task:
771
- logger.error(f"Task {task_name} not found")
772
- return
773
-
774
- # 定义必要的变量
775
- prefixed_queue = self.get_prefixed_queue_name(queue)
776
- # 使用函数名作为group_name,实现任务隔离
777
- group_name = f"{prefixed_queue}:{task_name}"
778
- print(f'{group_name=} {task_name=}')
779
- consumer_name = self.consumer_manager.get_consumer_name(queue)
780
-
781
- # 记录group信息到worker hash表
782
- if self.consumer_manager:
783
- await self.consumer_manager.record_group_info_async(queue, task_name, group_name, consumer_name)
784
-
785
- # 初始化优先级队列扫描
786
- await self.update_priority_queues_cache(queue)
787
-
788
- # 获取所有优先级队列(包括默认队列)
789
- priority_queues = self.get_priority_queues(queue)
790
- all_queues = [prefixed_queue] + priority_queues # 默认队列 + 优先级队列
791
-
792
- # 为每个队列创建consumer group(如果不存在)
793
- for q in all_queues:
794
- try:
795
- await self.async_redis_client.xgroup_create(
796
- name=q,
797
- groupname=group_name,
798
- id="0",
799
- mkstream=True
800
- )
801
- logger.info(f"Created consumer group {group_name} for queue {q}")
802
- except Exception as e:
803
- if "BUSYGROUP" in str(e):
804
- logger.debug(f"Consumer group {group_name} already exists for queue {q}")
805
- else:
806
- logger.warning(f"Error creating consumer group for {q}: {e}")
807
-
808
- # 初始化每个队列的lastid和check_backlog
809
- for q in all_queues:
810
- lastid[q] = "0-0"
811
- check_backlog[q] = True # 每个队列都需要检查历史消息
812
-
813
- # 获取该队列的延迟任务列表和锁
814
- delayed_list = delayed_tasks_lists.get(queue)
815
- delayed_lock = delayed_tasks_locks.get(queue)
816
-
817
- # 记录上次优先级队列更新时间
818
- last_priority_update = time.time()
819
-
820
- while not self._stop_reading:
821
- # 定期更新优先级队列缓存
822
- current_time = time.time()
823
- if current_time - last_priority_update >= self._priority_scan_interval:
824
- await self.update_priority_queues_cache(queue)
825
- new_priority_queues = self.get_priority_queues(queue)
826
-
827
- # 如果优先级队列有变化,更新本地变量
828
- if new_priority_queues != priority_queues:
829
- logger.info(f"Priority queues updated for {queue}: {priority_queues} -> {new_priority_queues}")
830
- priority_queues = new_priority_queues
831
- all_queues = [prefixed_queue] + priority_queues
832
-
833
- # 为新的优先级队列创建consumer group
834
- for q in all_queues:
835
- if q not in lastid: # 这是新队列
836
- try:
837
- await self.async_redis_client.xgroup_create(
838
- name=q,
839
- groupname=group_name,
840
- id="0",
841
- mkstream=True
842
- )
843
- logger.info(f"Created consumer group {group_name} for new priority queue {q}")
844
- except Exception as e:
845
- if "BUSYGROUP" not in str(e):
846
- logger.warning(f"Error creating consumer group for {q}: {e}")
847
-
848
- # 初始化新队列的状态
849
- lastid[q] = "0-0"
850
- check_backlog[q] = True
851
-
852
- last_priority_update = current_time
853
-
854
- # 批量获取并处理延迟任务(使用list更高效)
855
- if delayed_list:
856
- # 原子地交换list内容
857
- async with delayed_lock:
858
- if delayed_list:
859
- # 快速拷贝并清空原list
860
- tasks_to_process = delayed_list.copy()
861
- delayed_list.clear()
862
- else:
863
- tasks_to_process = []
864
-
865
- # 处理所有延迟任务
866
- if tasks_to_process:
867
- my_tasks = [] # 属于当前task的任务
868
- other_tasks = [] # 属于其他task的任务
869
-
870
- for delayed_task in tasks_to_process:
871
- # 对于延迟任务,不需要检查name字段
872
- # 因为延迟任务在发送时没有指定特定的task
873
- # 所有监听这个队列的task都可以处理
874
- task_data = delayed_task.get('data', {})
875
- if isinstance(task_data, str):
876
- import json
877
- task_data = json.loads(task_data)
878
-
879
- # 每个task都有独立的consumer group,都能独立处理消息
880
- # 检查消息是否指定了目标task
881
- target_tasks = task_data.get('_target_tasks', None)
882
- if target_tasks and task_name not in target_tasks:
883
- # 这个消息不是给当前task的
884
- other_tasks.append(delayed_task)
885
- else:
886
- # 当前task处理这个任务
887
- my_tasks.append((delayed_task, task_data))
888
-
889
- # 处理属于当前task的所有任务
890
- for delayed_task, task_data in my_tasks:
891
- event_id = delayed_task.get('event_id', f"delayed-{time.time()}")
892
- task_data['_task_name'] = task_name
893
-
894
- # 记录延迟精度(用于调试)
895
- if 'execute_at' in task_data:
896
- delay_error = time.time() - task_data['execute_at']
897
- if abs(delay_error) > 0.1: # 超过100ms才记录
898
- logger.info(f'延迟任务 {event_id} 执行误差: {delay_error*1000:.1f}ms')
899
-
900
- result = self._process_message_common(
901
- event_id, task_data, queue, event_queue,
902
- is_async=True, consumer_name=consumer_name, group_name=group_name
903
- )
904
- if isinstance(result, tuple) and result[0] == 'async_put':
905
- await self._async_put_task(event_queue, result[1])
906
-
907
- # 把不属于当前task的任务放回list
908
- if other_tasks:
909
- async with delayed_lock:
910
- delayed_list.extend(other_tasks)
911
-
912
- # 处理正常的Stream消息(支持优先级队列)
913
- # 实现真正的优先级消费:
914
- # 1. 先检查event_queue是否已满
915
- # 2. 优先从高优先级队列读取
916
- # 3. 只有高优先级队列空了才读取低优先级
917
- # 4. 不超过prefetch_multiplier限制
918
-
919
- # 检查内存队列是否已满
920
- current_queue_size = event_queue.qsize() if hasattr(event_queue, 'qsize') else 0
921
- if current_queue_size >= prefetch_multiplier:
922
- # 内存队列已满,等待处理
923
- await asyncio.sleep(0.01) # 短暂等待
924
- continue
925
-
926
- messages = []
927
- messages_needed = prefetch_multiplier - current_queue_size # 还能读取的消息数
928
-
929
- if messages_needed <= 0:
930
- # 不需要读取更多消息
931
- await asyncio.sleep(0.01)
932
- continue
933
-
934
- # 按优先级顺序读取队列
935
- for q in all_queues:
936
- if messages_needed <= 0:
937
- break # 已经读取足够的消息
938
-
939
- q_bytes = q.encode() if isinstance(q, str) else q
940
- # 针对具体队列检查是否需要读取历史消息
941
- if check_backlog.get(q, True):
942
- myid = lastid.get(q, "0-0")
943
- else:
944
- myid = ">"
945
- myid_bytes = myid.encode() if isinstance(myid, str) else myid
946
-
947
- try:
948
- # 为优先级队列使用正确的consumer名称
949
- # 如果是优先级队列,consumer名称需要带优先级后缀
950
- q_consumer_name = consumer_name
951
- if q != prefixed_queue and ':' in q:
952
- # 这是优先级队列,添加优先级后缀
953
- priority_suffix = q.rsplit(':', 1)[-1]
954
- q_consumer_name = f"{consumer_name}:{priority_suffix}"
955
- # 从当前优先级队列读取(最多读取messages_needed个)
956
- q_messages = await self.async_binary_redis_client.xreadgroup(
957
- groupname=group_name,
958
- consumername=q_consumer_name,
959
- streams={q_bytes: myid_bytes},
960
- count=messages_needed, # 只读取需要的数量
961
- block=1 # 非阻塞
962
- )
963
-
964
- if q_messages:
965
- # logger.debug(f"Read messages from {q}: {len(q_messages[0][1]) if q_messages else 0} messages")
966
- # if check_backlog.get(q, True):
967
- # print(f'先处理历史消息:{q_bytes=} {group_name=} {q_messages=}')
968
- # 记录从哪个队列读取的
969
- messages.extend(q_messages)
970
- messages_read = len(q_messages[0][1]) if q_messages else 0
971
- messages_needed -= messages_read
972
-
973
- # 如果高优先级队列还有消息,继续从该队列读取
974
- # 直到该队列空了或者达到prefetch限制
975
- if messages_read > 0 and messages_needed > 0:
976
- # 该队列可能还有更多消息,下次循环继续优先从这个队列读
977
- # 但现在先处理已读取的消息
978
- break # 跳出for循环,处理已有消息
979
-
980
- except Exception as e:
981
- if "NOGROUP" in str(e):
982
- # 创建consumer group
983
- try:
984
- await self.async_redis_client.xgroup_create(
985
- name=q,
986
- groupname=group_name,
987
- id="0",
988
- mkstream=True
989
- )
990
- logger.debug(f"Created consumer group {group_name} for {q}")
991
- # 重试读取
992
- q_messages = await self.async_binary_redis_client.xreadgroup(
993
- groupname=group_name,
994
- consumername=q_consumer_name,
995
- streams={q_bytes: myid_bytes},
996
- count=messages_needed,
997
- block=0
998
- )
999
- if q_messages:
1000
- messages.extend(q_messages)
1001
- messages_read = len(q_messages[0][1]) if q_messages else 0
1002
- messages_needed -= messages_read
1003
- except:
1004
- pass
1005
- else:
1006
- logger.debug(f"Error reading from queue {q}: {e}")
1007
- continue
1008
-
1009
- # 如果没有读取到任何消息,在最高优先级队列上阻塞等待
1010
- if not messages:
1011
- # 优先在高优先级队列上等待
1012
- # 如果有优先级队列,在第一个优先级队列上等待
1013
- # 否则在默认队列上等待
1014
- wait_queue = all_queues[0] if all_queues else prefixed_queue
1015
-
1016
- try:
1017
- q_bytes = wait_queue.encode() if isinstance(wait_queue, str) else wait_queue
1018
- if check_backlog.get(wait_queue, True):
1019
- myid = lastid.get(wait_queue, "0-0")
1020
- else:
1021
- myid = ">"
1022
- myid_bytes = myid.encode() if isinstance(myid, str) else myid
1023
-
1024
- # 为等待队列使用正确的consumer名称
1025
- wait_consumer_name = consumer_name
1026
- if wait_queue != prefixed_queue and ':' in wait_queue:
1027
- # 这是优先级队列,添加优先级后缀
1028
- priority_suffix = wait_queue.rsplit(':', 1)[-1]
1029
- wait_consumer_name = f"{consumer_name}:{priority_suffix}"
1030
-
1031
- messages = await self.async_binary_redis_client.xreadgroup(
1032
- groupname=group_name,
1033
- consumername=wait_consumer_name,
1034
- streams={q_bytes: myid_bytes},
1035
- count=prefetch_multiplier,
1036
- block=100 # 阻塞100ms
1037
- )
1038
- except Exception as e:
1039
- # 忽略错误,下次循环重试
1040
- pass
1041
-
1042
- try:
1043
- # logger.info(f'{group_name=} {consumer_name=} {block_time=}')
1044
- consecutive_errors = 0
1045
- # if check_backlog and messages:
1046
- # logger.info(f'先消费之前的消息 {group_name=} ')
1047
- # logger.info(f'{check_backlog=} {messages=}')
1048
-
1049
- # 上报已投递的offset(用于积压监控)
1050
- try:
1051
- from jettask.monitor.stream_backlog_monitor import report_delivered_offset
1052
- # 对每个stream的消息上报offset
1053
- for msg in messages:
1054
- stream_name = msg[0]
1055
- if isinstance(stream_name, bytes):
1056
- stream_name = stream_name.decode('utf-8')
1057
- # 提取队列名(去掉前缀)
1058
- queue_name = stream_name.replace(f"{self.redis_prefix}:STREAM:", "")
1059
- await report_delivered_offset(
1060
- self.async_redis_client,
1061
- self.redis_prefix,
1062
- queue_name,
1063
- group_name,
1064
- [msg]
1065
- )
1066
- except Exception as e:
1067
- # 监控失败不影响主流程
1068
- logger.debug(f"Failed to report delivered offset: {e}")
1069
-
1070
- # 收集需要跳过的消息ID
1071
- skip_message_ids = []
1072
-
1073
- for message in messages:
1074
- # print(f'{message=}')
1075
- # message[0]是stream名称,message[1]是消息列表
1076
- stream_name = message[0]
1077
- if isinstance(stream_name, bytes):
1078
- stream_name = stream_name.decode('utf-8')
1079
-
1080
- # 根据这个具体队列的消息数量,更新该队列的check_backlog状态
1081
- if len(message[1]) == 0:
1082
- # 这个队列没有历史消息了,下次读取最新消息
1083
- check_backlog[stream_name] = False
1084
-
1085
- for event in message[1]:
1086
- event_id = event[0]
1087
- # 更新对应队列的lastid
1088
- lastid[stream_name] = event_id
1089
- # 将bytes类型的event_id转换为字符串
1090
- if isinstance(event_id, bytes):
1091
- event_id = event_id.decode('utf-8')
1092
- event_data = event[1]
1093
-
1094
- # 解析消息内容,决定是否处理
1095
- should_process = True
1096
-
1097
- try:
1098
- # 解析data字段中的消息
1099
- if b'data' in event_data or 'data' in event_data:
1100
- data_field = event_data.get(b'data') or event_data.get('data')
1101
-
1102
- # 直接解析二进制数据,不需要解码
1103
- parsed_data = loads_str(data_field)
1104
- # 跳过延迟任务(延迟任务由延迟扫描器处理)
1105
- if parsed_data.get('is_delayed') == 1:
1106
- should_process = False
1107
- continue
1108
-
1109
- # 每个task都有独立的consumer group
1110
- # 检查消息是否指定了目标task(用于精确路由)
1111
- target_tasks = parsed_data.get('_target_tasks', None)
1112
- if target_tasks and task_name not in target_tasks:
1113
- # 这个消息指定了其他task处理
1114
- should_process = False
1115
-
1116
- if should_process:
1117
- # 添加task_name到数据中(用于执行器识别任务)
1118
- parsed_data['_task_name'] = task_name
1119
-
1120
- # 提取offset字段(如果存在)
1121
- offset_field = event_data.get(b'offset') or event_data.get('offset')
1122
- if offset_field:
1123
- # 将offset添加到parsed_data中
1124
- if isinstance(offset_field, bytes):
1125
- offset_field = offset_field.decode('utf-8')
1126
- parsed_data['offset'] = offset_field
1127
-
1128
- # 更新event_data
1129
- event_data.clear()
1130
- for key, value in parsed_data.items():
1131
- event_data[key] = value
1132
-
1133
- logger.debug(f"Task {task_name} will process message {event_id}")
1134
- else:
1135
- # 没有data字段,跳过消息
1136
- should_process = False
1137
- except Exception as e:
1138
- logger.error(f"Task {task_name}: Error parsing message data: {e}")
1139
-
1140
- if should_process:
1141
- # 处理消息 - 消息会被放入队列,由执行器处理并ACK
1142
- # 使用消息体中的实际队列名(可能包含优先级)
1143
- actual_queue = event_data.get('queue', queue)
1144
-
1145
- # 确定实际的consumer名称(对于优先级队列需要带后缀)
1146
- actual_consumer_name = consumer_name
1147
- # 从stream_name判断是否是优先级队列
1148
- if stream_name != prefixed_queue and ':' in stream_name:
1149
- # 这是优先级队列,添加优先级后缀
1150
- priority_suffix = stream_name.rsplit(':', 1)[-1]
1151
- actual_consumer_name = f"{consumer_name}:{priority_suffix}"
1152
-
1153
- result = self._process_message_common(
1154
- event_id, event_data, actual_queue, event_queue,
1155
- is_async=True, consumer_name=actual_consumer_name, group_name=group_name
1156
- )
1157
- if isinstance(result, tuple) and result[0] == 'async_put':
1158
- await self._async_put_task(event_queue, result[1])
1159
- logger.debug(f"Put task {event_id} into event_queue")
1160
- # 注意:这里不ACK,由执行器在处理完成后ACK
1161
- else:
1162
- # 不属于当前task的消息,收集起来批量ACK
1163
- skip_message_ids.append(event_id)
1164
-
1165
-
1166
- # 批量ACK不需要的消息(需要按队列分组)
1167
- if skip_message_ids:
1168
- # 这里简化处理,对所有队列都尝试ACK(实际只会在对应队列ACK成功)
1169
- group_name_bytes = group_name.encode() if isinstance(group_name, str) else group_name
1170
- for q in all_queues:
1171
- q_bytes = q.encode() if isinstance(q, str) else q
1172
- try:
1173
- await self.async_binary_redis_client.xack(q_bytes, group_name_bytes, *skip_message_ids)
1174
- except:
1175
- pass # 忽略ACK错误
1176
- logger.debug(f"Task {task_name} batch ACKed {len(skip_message_ids)} skipped messages")
1177
-
1178
- except Exception as e:
1179
- error_msg = str(e)
1180
- # import traceback
1181
- # traceback.print_exc()
1182
- logger.error(f"Error in task listener {task_name}: {e}")
1183
-
1184
- # 特殊处理:如果是NOGROUP错误,尝试重新创建consumer group
1185
- if "NOGROUP" in error_msg:
1186
- logger.info(f"Detected NOGROUP error for {task_name}, attempting to recreate consumer group...")
1187
- try:
1188
- # 为所有队列创建consumer group
1189
- for q in all_queues:
1190
- try:
1191
- await self.async_redis_client.xgroup_create(
1192
- name=q,
1193
- groupname=group_name,
1194
- id="0",
1195
- mkstream=True
1196
- )
1197
- logger.info(f"Successfully created consumer group {group_name} for queue {q}")
1198
- except:
1199
- pass # 可能已存在
1200
- logger.info(f"Consumer groups created/verified for task {task_name}")
1201
- # 重新创建成功,重置错误计数器
1202
- consecutive_errors = 0
1203
- continue
1204
- except Exception as create_error:
1205
- logger.error(f"Failed to recreate consumer group for {task_name}: {create_error}")
1206
-
1207
- consecutive_errors += 1
1208
- if consecutive_errors >= max_consecutive_errors:
1209
- logger.error(f"Too many errors for task {task_name}, restarting...")
1210
- consecutive_errors = 0
1211
- await asyncio.sleep(min(consecutive_errors, 5))
1212
-
1213
- logger.info(f"Starting event listeners for queues: {self.queues}")
1214
- tasks = []
1215
-
1216
- if not (self.app and hasattr(self.app, '_tasks_by_queue')):
1217
- raise RuntimeError("No app or tasks registered, cannot start listeners")
1218
-
1219
- # 为每个队列创建独立的延迟任务扫描器
1220
- for queue in self.queues:
1221
- logger.info(f"Starting delayed task scanner for queue: {queue}")
1222
- scanner_task = asyncio.create_task(
1223
- scan_delayed_tasks_for_queue(
1224
- queue,
1225
- delayed_tasks_lists[queue],
1226
- delayed_tasks_locks[queue]
1227
- )
1228
- )
1229
- tasks.append(scanner_task)
1230
-
1231
- # 为每个队列启动离线worker处理器(带自动重启)
1232
- # 包括优先级队列
1233
- all_recovery_queues = set(self.queues)
1234
- for base_queue in self.queues:
1235
- # 扫描优先级队列
1236
- priority_queues = await self.scan_priority_queues(base_queue)
1237
- for pq in priority_queues:
1238
- if pq != base_queue: # 不重复添加基础队列
1239
- all_recovery_queues.add(pq)
1240
-
1241
- # 为所有队列(包括优先级队列)启动离线worker处理器
1242
- for queue in all_recovery_queues:
1243
- logger.info(f"Starting offline worker processor for queue: {queue}")
1244
- offline_processor_task = asyncio.create_task(
1245
- self._start_offline_worker_processor_with_restart(queue, event_queue)
1246
- )
1247
- tasks.append(offline_processor_task)
1248
-
1249
- # # 为每个task创建独立的listener
1250
- for queue in self.queues:
1251
- task_names = self.app._tasks_by_queue.get(queue, [])
1252
- if not task_names:
1253
- raise RuntimeError(f"No tasks registered for queue '{queue}'. Cannot start worker without tasks.")
1254
-
1255
- for task_name in task_names:
1256
- logger.info(f"Starting listener for task: {task_name} on queue: {queue}")
1257
- task = asyncio.create_task(listen_event_by_task(queue, task_name))
1258
- tasks.append(task)
1259
-
1260
- # 等待所有任务
1261
- await asyncio.gather(*tasks)
1262
-
1263
- async def _scan_and_load_delayed_tasks_to_list(self, queue: str, task_list: list, task_lock: asyncio.Lock):
1264
- """扫描延迟队列并将到期任务加载到list(更高效)"""
1265
- try:
1266
- current_time = time.time()
1267
- delayed_queue_key = f"{self.redis_prefix}:DELAYED_QUEUE:{queue}"
1268
- prefixed_queue = self.get_prefixed_queue_name(queue)
1269
-
1270
- # 使用Lua脚本原子地获取并移除到期的任务
1271
- lua_script = """
1272
- local delayed_queue_key = KEYS[1]
1273
- local stream_key = KEYS[2]
1274
- local current_time = ARGV[1]
1275
- local limit = ARGV[2]
1276
-
1277
- -- 获取到期的任务ID(这些是Stream消息ID)
1278
- local expired_task_ids = redis.call('ZRANGEBYSCORE', delayed_queue_key, 0, current_time, 'LIMIT', 0, limit)
1279
-
1280
- if #expired_task_ids == 0 then
1281
- return {}
1282
- end
1283
-
1284
- local tasks_with_data = {}
1285
-
1286
- -- 获取每个任务的实际数据
1287
- for i, task_id in ipairs(expired_task_ids) do
1288
- -- 从Stream中读取任务数据
1289
- local messages = redis.call('XRANGE', stream_key, task_id, task_id)
1290
- if #messages > 0 then
1291
- -- 移除延迟队列中的任务
1292
- redis.call('ZREM', delayed_queue_key, task_id)
1293
- -- 添加到结果中
1294
- table.insert(tasks_with_data, messages[1])
1295
- end
1296
- end
1297
-
1298
- return tasks_with_data
1299
- """
1300
-
1301
- # 注册Lua脚本(使用二进制客户端)
1302
- if not hasattr(self, '_scan_delayed_script'):
1303
- self._scan_delayed_script = self.async_binary_redis_client.register_script(lua_script)
1304
-
1305
- # 执行脚本,每次最多获取100个到期任务(提高批处理效率)
1306
- expired_tasks = await self._scan_delayed_script(
1307
- keys=[delayed_queue_key, prefixed_queue],
1308
- args=[str(current_time), "100"]
1309
- )
1310
- # 移除频繁的debug日志,只在有任务时记录
1311
- if not expired_tasks:
1312
- return
1313
-
1314
- # 批量处理任务并添加到list
1315
- tasks_to_add = []
1316
- for task in expired_tasks:
1317
- try:
1318
- if isinstance(task, list) and len(task) >= 2:
1319
- # task格式: [stream_id, fields]
1320
- stream_id = task[0]
1321
- fields = task[1]
1322
-
1323
- # 将fields转换为字典(保持二进制格式)
1324
- task_data = {}
1325
- if isinstance(fields, list):
1326
- for j in range(0, len(fields), 2):
1327
- if j + 1 < len(fields):
1328
- key = fields[j]
1329
- value = fields[j + 1]
1330
- # 保持原始格式,不解码
1331
- task_data[key] = value
1332
-
1333
- # 解析data字段
1334
- data_field = task_data.get('data') or task_data.get(b'data')
1335
- if data_field:
1336
- # 使用loads_str来解析(它能处理二进制和字符串)
1337
- data = loads_str(data_field)
1338
- # 添加event_id
1339
- data['event_id'] = stream_id if isinstance(stream_id, str) else stream_id.decode('utf-8')
1340
-
1341
- # 添加到列表
1342
- tasks_to_add.append({'event_id': data['event_id'], 'data': data})
1343
-
1344
- except Exception as e:
1345
- logger.error(f"Error processing delayed task: {e}")
1346
-
1347
- # 批量添加到list(使用锁保证线程安全)
1348
- if tasks_to_add:
1349
- async with task_lock:
1350
- task_list.extend(tasks_to_add)
1351
- logger.info(f"Added {len(tasks_to_add)} delayed tasks to list for queue {queue}")
1352
-
1353
- except Exception as e:
1354
- logger.error(f"Error scanning delayed tasks for queue {queue}: {e}")
1355
-
1356
- async def _scan_and_load_delayed_tasks(self, queue: str, memory_queue: asyncio.Queue):
- """Scan the delayed queue and load due tasks into the in-memory queue"""
- try:
- current_time = time.time()
- delayed_queue_key = f"{self.redis_prefix}:DELAYED_QUEUE:{queue}"
- prefixed_queue = self.get_prefixed_queue_name(queue)
-
- # Use a Lua script to fetch and remove due tasks atomically
- lua_script = """
- local delayed_queue_key = KEYS[1]
- local stream_key = KEYS[2]
- local current_time = ARGV[1]
- local limit = ARGV[2]
-
- -- Fetch the IDs of due tasks (these are Stream message IDs)
- local expired_task_ids = redis.call('ZRANGEBYSCORE', delayed_queue_key, 0, current_time, 'LIMIT', 0, limit)
-
- if #expired_task_ids == 0 then
- return {}
- end
-
- local tasks_with_data = {}
-
- -- Fetch the actual payload of each task
- for i, task_id in ipairs(expired_task_ids) do
- -- Read the task data from the Stream
- local messages = redis.call('XRANGE', stream_key, task_id, task_id)
- if #messages > 0 then
- -- Remove the task from the delayed queue
- redis.call('ZREM', delayed_queue_key, task_id)
- -- Append it to the result set
- table.insert(tasks_with_data, messages[1])
- end
- end
-
- return tasks_with_data
- """
-
- # Register the Lua script (using the binary client)
- if not hasattr(self, '_scan_delayed_script'):
- self._scan_delayed_script = self.async_binary_redis_client.register_script(lua_script)
-
- # Run the script; fetch at most 100 due tasks per pass to improve batching
- expired_tasks = await self._scan_delayed_script(
- keys=[delayed_queue_key, prefixed_queue],
- args=[str(current_time), "100"]
- )
-
- if not expired_tasks:
- return
-
- # Process the returned tasks
- for task in expired_tasks:
- try:
- if isinstance(task, list) and len(task) >= 2:
- # task format: [stream_id, fields]
- stream_id = task[0]
- fields = task[1]
-
- # Convert fields into a dict (keep the binary form)
- task_data = {}
- if isinstance(fields, list):
- for j in range(0, len(fields), 2):
- if j + 1 < len(fields):
- key = fields[j]
- value = fields[j + 1]
- # Keep the raw form; do not decode
- task_data[key] = value
-
- # Parse the data field
- data_field = task_data.get('data') or task_data.get(b'data')
- if data_field:
- # Parse with loads_str (it handles both binary and string input)
- data = loads_str(data_field)
- # Attach the event_id
- data['event_id'] = stream_id if isinstance(stream_id, str) else stream_id.decode('utf-8')
-
- # Put the task into the in-memory queue
- await memory_queue.put({'event_id': data['event_id'], 'data': data})
- logger.debug(f"Loaded delayed task {data['event_id']} to memory queue for queue {queue}")
-
- except Exception as e:
- logger.error(f"Error processing delayed task: {e}")
-
- except Exception as e:
- logger.error(f"Error scanning delayed tasks for queue {queue}: {e}")
-
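Both removed scan methods embed the same ZRANGEBYSCORE → XRANGE → ZREM Lua script. A standalone sketch of that pattern with a plain redis-py asyncio client is shown below so the atomic step can be reasoned about in isolation; the pop_due_tasks wrapper and its key arguments are assumptions for illustration, not the package's API.

# Standalone sketch of the scan-and-remove pattern used above
# (assumes a plain redis.asyncio client and the same key layout).
import time
import redis.asyncio as aioredis

SCAN_DUE_TASKS = """
local due = redis.call('ZRANGEBYSCORE', KEYS[1], 0, ARGV[1], 'LIMIT', 0, ARGV[2])
local out = {}
for _, id in ipairs(due) do
    local msgs = redis.call('XRANGE', KEYS[2], id, id)
    if #msgs > 0 then
        redis.call('ZREM', KEYS[1], id)
        table.insert(out, msgs[1])
    end
end
return out
"""

async def pop_due_tasks(client: aioredis.Redis, delayed_key: str, stream_key: str, limit: int = 100):
    script = client.register_script(SCAN_DUE_TASKS)
    # Returns a list of [message_id, flat_field_list] pairs, the same shape
    # that the Python code above unpacks into a dict.
    return await script(keys=[delayed_key, stream_key], args=[str(time.time()), str(limit)])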
1443
- async def _claim_delayed_tasks(self, queue: str, event_queue: asyncio.Queue, prefetch_multiplier: int):
- """Process due tasks from the delayed queue"""
- try:
- # Check the queue size; skip this pass if it is already full
- if event_queue.qsize() >= max(prefetch_multiplier // 2, 1):
- return
-
- current_time = time.time()
- delayed_queue_key = f"{self.redis_prefix}:DELAYED_QUEUE:{queue}"
- consumer_name = self.consumer_manager.get_consumer_name(queue)
- prefixed_queue = self.get_prefixed_queue_name(queue)
-
- # Work out how many tasks to claim
- count_to_claim = max(1, prefetch_multiplier - event_queue.qsize())
-
- # Lua script: atomically fetch due tasks, claim them, and delete the successfully claimed ones
- lua_script = """
- local delayed_queue_key = KEYS[1]
- local stream_key = KEYS[2]
- local group_name = KEYS[3]
- local consumer_name = ARGV[1]
- local current_time = ARGV[2]
- local limit = ARGV[3]
-
- -- Fetch the IDs of due tasks (these are Stream message IDs)
- local expired_tasks = redis.call('ZRANGEBYSCORE', delayed_queue_key, 0, current_time, 'LIMIT', 0, limit)
-
- if #expired_tasks == 0 then
- return {}
- end
-
- local successfully_claimed = {}
- local claimed_messages = {}
-
- -- Try to claim each task
- for i, task_id in ipairs(expired_tasks) do
- -- First check the message's pending info
- local pending_info = redis.call('XPENDING', stream_key, group_name, task_id, task_id, 1)
-
- if #pending_info > 0 then
- -- pending_info[1] format: {id, consumer, idle_time, delivery_count}
- local idle_time = pending_info[1][3]
-
- -- Only claim messages idle for more than 1 second (avoid claiming messages that were just read)
- if idle_time > 1000 then
- -- Claim the message with XCLAIM
- local claim_result = redis.call('XCLAIM', stream_key, group_name, consumer_name, 0, task_id)
-
- if #claim_result > 0 then
- -- Claim succeeded; record the task ID
- table.insert(successfully_claimed, task_id)
- -- Save the claimed message payload
- for j, msg in ipairs(claim_result) do
- table.insert(claimed_messages, msg)
- end
- end
- end
- else
- -- The message is not in the pending list, so it probably has not been read yet; skip it
- -- but keep it in the ZSET so the normal read path can pick it up
- end
- end
-
- -- Only remove the tasks that were claimed successfully
- if #successfully_claimed > 0 then
- redis.call('ZREM', delayed_queue_key, unpack(successfully_claimed))
- end
-
- -- Return the claimed messages
- return claimed_messages
- """
-
- # Register the Lua script (if not registered yet)
- if not hasattr(self, '_atomic_claim_script'):
- self._atomic_claim_script = self.async_redis_client.register_script(lua_script)
-
- # Execute the Lua script
- try:
- claimed_messages = await self._atomic_claim_script(
- keys=[delayed_queue_key, prefixed_queue, prefixed_queue],
- args=[consumer_name, str(current_time), str(count_to_claim)]
- )
-
- if not claimed_messages:
- return
-
- # claimed_messages is a nested list; each element is [msg_id, msg_data_fields]
- # where msg_data_fields is a flat list of key/value pairs
- for claimed_message in claimed_messages:
- if isinstance(claimed_message, list) and len(claimed_message) >= 2:
- msg_id = claimed_message[0]
- msg_data_fields = claimed_message[1]
-
- # Parse the message data
- msg_data = {}
- if isinstance(msg_data_fields, list):
- for j in range(0, len(msg_data_fields), 2):
- if j + 1 < len(msg_data_fields):
- key = msg_data_fields[j]
- value = msg_data_fields[j + 1]
- # Keep bytes form to match normal message handling
- if isinstance(key, str):
- key = key.encode()
- if isinstance(value, str):
- value = value.encode()
- msg_data[key] = value
-
- # Clear the delay flag
- if b'data' in msg_data:
- data_field = msg_data.get(b'data')
- if data_field:
- try:
- # Parse the binary payload directly
- parsed_data = loads_str(data_field)
- # Clear the delay flag so the task is not delayed again
- parsed_data['is_delayed'] = 0
- # dumps_str now returns binary directly
- updated_data = dumps_str(parsed_data)
- msg_data[b'data'] = updated_data
- except:
- pass
-
- # Process the message
- result = self._process_message_common(
- msg_id, msg_data, queue, event_queue,
- is_async=True, consumer_name=consumer_name
- )
- if isinstance(result, tuple) and result[0] == 'async_put':
- await self._async_put_task(event_queue, result[1])
-
- logger.info(f"Claimed and processed delayed task {msg_id} from queue {queue}")
-
- logger.info(f"Processed {len(claimed_messages)} delayed tasks for queue {queue}")
-
- except Exception as e:
- logger.error(f"Error executing atomic claim script: {e}")
-
- except Exception as e:
- logger.error(f"Error processing delayed tasks for queue {queue}: {e}")
- # Errors must not block the main loop
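The idle-time guard in the claim script (only XCLAIM messages that have been pending for more than 1000 ms) can also be expressed with plain redis-py calls. The sketch below is a non-atomic illustration of the same check, under the assumption that the field names match redis-py's parsed XPENDING reply; claim_if_idle and its arguments are hypothetical, and the Lua version above remains the atomic variant.

# Non-atomic sketch of the XPENDING/XCLAIM idle-time guard used above
# (assumed stream/group names; loses the Lua script's atomicity).
import redis.asyncio as aioredis

async def claim_if_idle(client: aioredis.Redis, stream: str, group: str,
                        consumer: str, msg_id: str, min_idle_ms: int = 1000):
    # XPENDING over a single-ID range reports (id, consumer, idle, deliveries)
    pending = await client.xpending_range(stream, group, min=msg_id, max=msg_id, count=1)
    if not pending or pending[0]["time_since_delivered"] < min_idle_ms:
        return None  # not yet read by the group, or delivered too recently
    claimed = await client.xclaim(stream, group, consumer,
                                  min_idle_time=min_idle_ms, message_ids=[msg_id])
    return claimed[0] if claimed else None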
1583
- def read_pending(self, groupname: str, queue: str, asyncio: bool = False):
- # Use the binary client for this call
- client = self.get_redis_client(asyncio, binary=True)
- prefixed_queue = self.get_prefixed_queue_name(queue)
- return client.xpending(prefixed_queue, groupname)
-
- def ack(self, queue, event_id, asyncio: bool = False):
- # Use the binary client for this call
- client = self.get_redis_client(asyncio, binary=True)
- prefixed_queue = self.get_prefixed_queue_name(queue)
- result = client.xack(prefixed_queue, prefixed_queue, event_id)
- # Drop the message ID from the claimed set
- if event_id in self._claimed_message_ids:
- self._claimed_message_ids.remove(event_id)
- return result
-
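The ack path above pairs with a consumer-group read. A minimal, hedged read/ack loop with plain redis-py is sketched below; the group, consumer, and stream names plus the handle() callback are placeholders, not jettask's consumer.

# Illustrative read/ack loop (not the package's actual consumer; names assumed).
import redis

def drain_once(client: redis.Redis, stream: str, group: str, consumer: str, count: int = 10):
    # '>' asks for messages never delivered to this group before
    entries = client.xreadgroup(group, consumer, {stream: ">"}, count=count, block=1000)
    for _stream_key, messages in entries or []:
        for msg_id, fields in messages:
            handle(fields)                      # hypothetical task handler
            client.xack(stream, group, msg_id)  # acknowledge only after success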
1599
- def _recreate_redis_connection(self):
- """Recreate the Redis connections"""
- try:
- logger.info("Recreating Redis connections...")
-
- # Close the existing connections
- if hasattr(self.redis_client, 'connection_pool'):
- try:
- self.redis_client.connection_pool.disconnect()
- except:
- pass
-
- if hasattr(self.async_redis_client, 'connection_pool'):
- try:
- self.async_redis_client.connection_pool.disconnect()
- except:
- pass
-
- # Recreate the connection pools and clients
- from ..core.app import get_redis_pool, get_async_redis_pool, get_binary_redis_pool, get_async_binary_redis_pool
- import redis
- from redis import asyncio as aioredis
-
- redis_url = self._redis_url
-
- # Recreate the sync connection
- pool = get_redis_pool(redis_url)
- new_redis_client = redis.StrictRedis(connection_pool=pool)
-
- # Recreate the async connection
- async_pool = get_async_redis_pool(redis_url)
- new_async_redis_client = aioredis.StrictRedis(connection_pool=async_pool)
-
- # Recreate the binary connections
- binary_pool = get_binary_redis_pool(redis_url)
- new_binary_redis_client = redis.StrictRedis(connection_pool=binary_pool)
-
- async_binary_pool = get_async_binary_redis_pool(redis_url)
- new_async_binary_redis_client = aioredis.StrictRedis(connection_pool=async_binary_pool)
-
- # Test the new connection
- new_redis_client.ping()
-
- # Swap in the new connections
- self.redis_client = new_redis_client
- self.async_redis_client = new_async_redis_client
- self.binary_redis_client = new_binary_redis_client
- self.async_binary_redis_client = new_async_binary_redis_client
-
- logger.info("Redis connections recreated successfully")
-
- except Exception as e:
- logger.error(f"Failed to recreate Redis connections: {e}")
- # If recreation fails, try to reset the existing connection pools
- try:
- if hasattr(self.redis_client, 'connection_pool'):
- self.redis_client.connection_pool.reset()
- if hasattr(self.async_redis_client, 'connection_pool'):
- self.async_redis_client.connection_pool.reset()
- logger.info("Existing connection pools have been reset")
- except Exception as reset_error:
- logger.error(f"Failed to reset connection pools: {reset_error}")
-
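The pool-factory helpers imported above are jettask internals; the underlying "disconnect the old pool, build a new one, ping before swapping" pattern can be sketched with plain redis-py as follows. rebuild_sync_client is an illustrative name, not part of the package.

# Minimal sketch of the disconnect-and-rebuild pattern (plain redis-py only).
import redis

def rebuild_sync_client(old_client: redis.Redis, redis_url: str) -> redis.Redis:
    try:
        old_client.connection_pool.disconnect()  # drop stale sockets
    except Exception:
        pass
    new_client = redis.Redis(connection_pool=redis.ConnectionPool.from_url(redis_url))
    new_client.ping()  # fail fast before callers start using it
    return new_client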
1662
- def _safe_redis_operation(self, operation, *args, max_retries=3, **kwargs):
- """Run a Redis operation safely, with a retry mechanism"""
- for attempt in range(max_retries):
- try:
- return operation(*args, **kwargs)
- except (redis.exceptions.TimeoutError, redis.exceptions.ConnectionError) as e:
- if attempt == max_retries - 1:
- logger.error(f"Redis operation failed after {max_retries} retries: {e}")
- raise
-
- logger.warning(f"Redis operation failed, retry {attempt + 1}: {e}")
- if attempt == 0:  # Recreate the connection on the first failure
- self._recreate_redis_connection()
- time.sleep(min(2 ** attempt, 5))  # Exponential backoff, capped at 5 seconds
-
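As a hedged usage sketch: the retry helper above wraps any synchronous Redis call so that transient timeouts trigger one reconnect and then exponential backoff (1 s, 2 s, 4 s, capped at 5 s). The instance name and queue below are hypothetical.

# Hypothetical usage of _safe_redis_operation; `pool` stands in for an
# EventPool instance and "orders" for a real queue name.
stream_len = pool._safe_redis_operation(
    pool.redis_client.xlen,                    # the operation to retry
    pool.get_prefixed_queue_name("orders"),    # its positional argument
    max_retries=3,
)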
1677
- def cleanup(self):
- """Clean up EventPool resources"""
- # Set the stop flag immediately so background tasks stop processing
- self._stop_reading = True
-
- # Only log when there are actual resources to clean up
- has_active_resources = False
-
- # Check whether there is an active consumer manager
- if hasattr(self, 'consumer_manager') and self.consumer_manager:
- # Check whether the consumer manager actually has activity
- if hasattr(self.consumer_manager, '_heartbeat_strategy'):
- strategy = self.consumer_manager._heartbeat_strategy
- if strategy and hasattr(strategy, 'consumer_id') and strategy.consumer_id:
- has_active_resources = True
-
- if has_active_resources:
- logger.info("Cleaning up EventPool resources...")
- self.consumer_manager.cleanup()
- logger.info("EventPool cleanup completed")
- else:
- # Clean up silently
- if hasattr(self, 'consumer_manager') and self.consumer_manager:
- self.consumer_manager.cleanup()