jettask 0.2.20__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +4 -0
- jettask/cli.py +12 -8
- jettask/config/lua_scripts.py +37 -0
- jettask/config/nacos_config.py +1 -1
- jettask/core/app.py +313 -340
- jettask/core/container.py +4 -4
- jettask/{persistence → core}/namespace.py +93 -27
- jettask/core/task.py +16 -9
- jettask/core/unified_manager_base.py +136 -26
- jettask/db/__init__.py +67 -0
- jettask/db/base.py +137 -0
- jettask/{utils/db_connector.py → db/connector.py} +130 -26
- jettask/db/models/__init__.py +16 -0
- jettask/db/models/scheduled_task.py +196 -0
- jettask/db/models/task.py +77 -0
- jettask/db/models/task_run.py +85 -0
- jettask/executor/__init__.py +0 -15
- jettask/executor/core.py +76 -31
- jettask/executor/process_entry.py +29 -114
- jettask/executor/task_executor.py +4 -0
- jettask/messaging/event_pool.py +928 -685
- jettask/messaging/scanner.py +30 -0
- jettask/persistence/__init__.py +28 -103
- jettask/persistence/buffer.py +170 -0
- jettask/persistence/consumer.py +330 -249
- jettask/persistence/manager.py +304 -0
- jettask/persistence/persistence.py +391 -0
- jettask/scheduler/__init__.py +15 -3
- jettask/scheduler/{task_crud.py → database.py} +61 -57
- jettask/scheduler/loader.py +2 -2
- jettask/scheduler/{scheduler_coordinator.py → manager.py} +23 -6
- jettask/scheduler/models.py +14 -10
- jettask/scheduler/schedule.py +166 -0
- jettask/scheduler/scheduler.py +12 -11
- jettask/schemas/__init__.py +50 -1
- jettask/schemas/backlog.py +43 -6
- jettask/schemas/namespace.py +70 -19
- jettask/schemas/queue.py +19 -3
- jettask/schemas/responses.py +493 -0
- jettask/task/__init__.py +0 -2
- jettask/task/router.py +3 -0
- jettask/test_connection_monitor.py +1 -1
- jettask/utils/__init__.py +7 -5
- jettask/utils/db_init.py +8 -4
- jettask/utils/namespace_dep.py +167 -0
- jettask/utils/queue_matcher.py +186 -0
- jettask/utils/rate_limit/concurrency_limiter.py +7 -1
- jettask/utils/stream_backlog.py +1 -1
- jettask/webui/__init__.py +0 -1
- jettask/webui/api/__init__.py +4 -4
- jettask/webui/api/alerts.py +806 -71
- jettask/webui/api/example_refactored.py +400 -0
- jettask/webui/api/namespaces.py +390 -45
- jettask/webui/api/overview.py +300 -54
- jettask/webui/api/queues.py +971 -267
- jettask/webui/api/scheduled.py +1249 -56
- jettask/webui/api/settings.py +129 -7
- jettask/webui/api/workers.py +442 -0
- jettask/webui/app.py +46 -2329
- jettask/webui/middleware/__init__.py +6 -0
- jettask/webui/middleware/namespace_middleware.py +135 -0
- jettask/webui/services/__init__.py +146 -0
- jettask/webui/services/heartbeat_service.py +251 -0
- jettask/webui/services/overview_service.py +60 -51
- jettask/webui/services/queue_monitor_service.py +426 -0
- jettask/webui/services/redis_monitor_service.py +87 -0
- jettask/webui/services/settings_service.py +174 -111
- jettask/webui/services/task_monitor_service.py +222 -0
- jettask/webui/services/timeline_pg_service.py +452 -0
- jettask/webui/services/timeline_service.py +189 -0
- jettask/webui/services/worker_monitor_service.py +467 -0
- jettask/webui/utils/__init__.py +11 -0
- jettask/webui/utils/time_utils.py +122 -0
- jettask/worker/lifecycle.py +8 -2
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/METADATA +1 -1
- jettask-0.2.24.dist-info/RECORD +142 -0
- jettask/executor/executor.py +0 -338
- jettask/persistence/backlog_monitor.py +0 -567
- jettask/persistence/base.py +0 -2334
- jettask/persistence/db_manager.py +0 -516
- jettask/persistence/maintenance.py +0 -81
- jettask/persistence/message_consumer.py +0 -259
- jettask/persistence/models.py +0 -49
- jettask/persistence/offline_recovery.py +0 -196
- jettask/persistence/queue_discovery.py +0 -215
- jettask/persistence/task_persistence.py +0 -218
- jettask/persistence/task_updater.py +0 -583
- jettask/scheduler/add_execution_count.sql +0 -11
- jettask/scheduler/add_priority_field.sql +0 -26
- jettask/scheduler/add_scheduler_id.sql +0 -25
- jettask/scheduler/add_scheduler_id_index.sql +0 -10
- jettask/scheduler/make_scheduler_id_required.sql +0 -28
- jettask/scheduler/migrate_interval_seconds.sql +0 -9
- jettask/scheduler/performance_optimization.sql +0 -45
- jettask/scheduler/run_scheduler.py +0 -186
- jettask/scheduler/schema.sql +0 -84
- jettask/task/task_executor.py +0 -318
- jettask/webui/api/analytics.py +0 -323
- jettask/webui/config.py +0 -90
- jettask/webui/models/__init__.py +0 -3
- jettask/webui/models/namespace.py +0 -63
- jettask/webui/namespace_manager/__init__.py +0 -10
- jettask/webui/namespace_manager/multi.py +0 -593
- jettask/webui/namespace_manager/unified.py +0 -193
- jettask/webui/run.py +0 -46
- jettask-0.2.20.dist-info/RECORD +0 -145
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/WHEEL +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/top_level.txt +0 -0
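The listing above is only a per-file line count between the two published wheels. For readers who want to reproduce a comparable summary locally, here is a minimal standard-library sketch; the wheel file names are placeholders, and it does not detect the renames shown in braces above (renamed files simply appear as one removal plus one addition).

```python
import difflib
import zipfile

def diff_wheels(old_whl: str, new_whl: str) -> None:
    """Print a rough `path +added -removed` summary for two wheel archives."""
    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_names, new_names = set(old.namelist()), set(new.namelist())
        for name in sorted(old_names | new_names):
            old_text = old.read(name).decode("utf-8", "replace").splitlines() if name in old_names else []
            new_text = new.read(name).decode("utf-8", "replace").splitlines() if name in new_names else []
            added = removed = 0
            for line in difflib.unified_diff(old_text, new_text, lineterm=""):
                if line.startswith("+") and not line.startswith("+++"):
                    added += 1
                elif line.startswith("-") and not line.startswith("---"):
                    removed += 1
            if added or removed:
                print(f"{name} +{added} -{removed}")

# diff_wheels("jettask-0.2.20-py3-none-any.whl", "jettask-0.2.24-py3-none-any.whl")
```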
jettask/messaging/event_pool.py
CHANGED
@@ -25,13 +25,14 @@ import traceback
 import redis
 from redis import asyncio as aioredis
 
-from
+from jettask.db.connector import get_sync_redis_client, get_async_redis_client
 
 from ..utils.helpers import get_hostname
 import os
 from jettask.worker.manager import ConsumerManager
 from jettask.worker.recovery import OfflineWorkerRecovery
 from .scanner import DelayedMessageScanner
+from jettask.config.lua_scripts import LUA_SCRIPT_BATCH_SEND_EVENT
 
 logger = logging.getLogger('app')
 
@@ -69,13 +70,22 @@ class EventPool(object):
         self.async_redis_client = async_redis_client
         # 创建用于二进制数据的Redis客户端(用于Stream操作)
         # 直接使用全局客户端实例(单例)
-
-        self.
-
-
+        # 使用无限超时支持PubSub长连接
+        self.binary_redis_client = get_sync_redis_client(redis_url, decode_responses=False, socket_timeout=None)
+        self.async_binary_redis_client = get_async_redis_client(redis_url, decode_responses=False, socket_timeout=None)
+
         self._redis_url = redis_url or 'redis://localhost:6379/0'
         self.redis_prefix = redis_prefix or 'jettask'
         self.app = app  # 保存app引用
+
+        # ✅ 在初始化阶段分离通配符模式和静态队列
+        # self.queues 始终只存储静态队列(或动态发现的队列)
+        # self.wildcard_patterns 存储通配符模式,用于动态队列发现
+        from jettask.utils.queue_matcher import separate_wildcard_and_static_queues
+
+        self.wildcard_patterns, static_queues = separate_wildcard_and_static_queues(queues or [])
+        self.queues = static_queues  # self.queues 只包含静态队列
+        self.wildcard_mode = len(self.wildcard_patterns) > 0  # 是否启用通配符模式
 
         # 初始化消费者管理器
         # consumer_strategy 参数已移除,现在只使用 HEARTBEAT 策略
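This hunk replaces ad-hoc client construction with the shared factories from jettask.db.connector (the renamed utils/db_connector.py). A minimal usage sketch, using only the keyword arguments visible in the diff; the Redis URL is a placeholder and any other parameters the factories accept are not shown here.

```python
from jettask.db.connector import get_sync_redis_client, get_async_redis_client

redis_url = "redis://localhost:6379/0"  # placeholder

# Binary (bytes-mode) clients for Stream payloads, mirroring EventPool.__init__ above:
# decode_responses=False keeps raw bytes, and socket_timeout=None keeps long-lived
# blocking reads (XREADGROUP / PubSub) from timing out.
binary_client = get_sync_redis_client(redis_url, decode_responses=False, socket_timeout=None)
async_binary_client = get_async_redis_client(redis_url, decode_responses=False, socket_timeout=None)
```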
@@ -168,6 +178,57 @@ class EventPool(object):
         """异步任务放入方法"""
         await event_queue.put(task)
 
+    async def discover_and_update_queues(self, wildcard_patterns: List[str]) -> List[str]:
+        """
+        动态发现并更新队列列表(支持通配符模式)
+
+        根据通配符模式从注册表中匹配队列,并更新 self.queues
+        self.queues 会包含:原有的静态队列 + 新发现的匹配队列
+
+        Args:
+            wildcard_patterns: 通配符模式列表,例如 ['test*']
+                - 'test*' 表示匹配所有test开头的队列
+
+        Returns:
+            List[str]: 新发现的队列列表(只返回新增的,不包括已存在的)
+
+        Example:
+            >>> await ep.discover_and_update_queues(['test*'])
+            ['test1', 'test2']  # 假设这两个是新发现的
+        """
+        from jettask.utils.queue_matcher import discover_matching_queues
+
+        # 1. 从注册表获取所有已注册的队列
+        all_registered_queues = await self.queue_registry.get_all_queues()
+
+        # 将 bytes 转为 str(如果需要)
+        all_registered_queues = {
+            q.decode('utf-8') if isinstance(q, bytes) else q
+            for q in all_registered_queues
+        }
+
+        # 2. 根据通配符模式匹配队列
+        matched_queues = discover_matching_queues(wildcard_patterns, all_registered_queues)
+
+        # 3. 计算新增的队列(matched_queues 中不在 self.queues 中的)
+        current_queues = set(self.queues or [])
+        new_queues = matched_queues - current_queues
+
+        # 4. 更新 self.queues(合并现有队列和新匹配的队列)
+        if new_queues:
+            updated_queues = sorted(current_queues | matched_queues)
+            logger.info(
+                f"队列动态发现: 新增={list(new_queues)}, "
+                f"总计={len(updated_queues)}"
+            )
+            self.queues = updated_queues
+
+            # 为新队列添加停止标志
+            for queue in new_queues:
+                self._queue_stop_flags[queue] = False
+
+        return list(new_queues)
+
     def init_routing(self):
         for queue in self.queues:
             self.solo_agg_task[queue] = defaultdict(list)
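The wildcard support added above leans on two helpers from the new jettask/utils/queue_matcher.py, which this diff does not show. Below is a hypothetical sketch of what those helpers could look like, inferred only from how they are called in the hunk; the fnmatch-based matching is an assumption, not the package's actual implementation.

```python
# Hypothetical stand-ins for jettask.utils.queue_matcher (the real module is not shown here).
from fnmatch import fnmatch
from typing import Iterable, List, Set, Tuple

def separate_wildcard_and_static_queues(queues: Iterable[str]) -> Tuple[List[str], List[str]]:
    """Split configured queue names into wildcard patterns and plain (static) names."""
    patterns = [q for q in queues if any(ch in q for ch in "*?[")]
    static = [q for q in queues if q not in patterns]
    return patterns, static

def discover_matching_queues(patterns: Iterable[str], registered: Set[str]) -> Set[str]:
    """Return every registered queue that matches at least one wildcard pattern."""
    return {q for q in registered if any(fnmatch(q, p) for p in patterns)}

# separate_wildcard_and_static_queues(["orders", "test*"])      -> (["test*"], ["orders"])
# discover_matching_queues(["test*"], {"test1", "test2", "orders"}) -> {"test1", "test2"}
```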
@@ -195,40 +256,6 @@ class EventPool(object):
 
     def _batch_send_event_sync(self, prefixed_queue, messages: List[dict], pipe):
         """批量发送事件(同步)"""
-        # 使用Lua脚本批量发送消息并添加自增offset
-        lua_script = """
-        local stream_key = KEYS[1]
-        local prefix = ARGV[1]
-        local results = {}
-
-        -- 使用Hash存储所有队列的offset
-        local offsets_hash = prefix .. ':QUEUE_OFFSETS'
-
-        -- 从stream_key中提取队列名(去掉prefix:QUEUE:前缀)
-        local queue_name = string.gsub(stream_key, '^' .. prefix .. ':QUEUE:', '')
-
-        -- 将队列添加到全局队列注册表(包括所有队列,包括优先级队列)
-        local queues_registry_key = prefix .. ':REGISTRY:QUEUES'
-        redis.call('SADD', queues_registry_key, queue_name)
-
-        -- 从ARGV[2]开始,每个参数是一个消息的data
-        for i = 2, #ARGV do
-            local data = ARGV[i]
-
-            -- 使用HINCRBY原子递增offset(如果不存在会自动创建并设为1)
-            local current_offset = redis.call('HINCRBY', offsets_hash, queue_name, 1)
-
-            -- 添加消息到Stream(包含offset字段)
-            local stream_id = redis.call('XADD', stream_key, '*',
-                'data', data,
-                'offset', current_offset)
-
-            table.insert(results, stream_id)
-        end
-
-        return results
-        """
-
         # 准备Lua脚本参数
         lua_args = [self.redis_prefix.encode() if isinstance(self.redis_prefix, str) else self.redis_prefix]
 
@@ -245,7 +272,7 @@ class EventPool(object):
 
         # 执行Lua脚本
         results = client.eval(
-
+            LUA_SCRIPT_BATCH_SEND_EVENT,
             1,  # 1个KEY
             prefixed_queue,  # KEY[1]: stream key
             *lua_args  # ARGV: prefix, data1, data2, ...
@@ -256,40 +283,6 @@ class EventPool(object):
 
     async def _batch_send_event(self, prefixed_queue, messages: List[dict], pipe):
         """批量发送事件(异步)"""
-        # 使用Lua脚本批量发送消息并添加自增offset
-        lua_script = """
-        local stream_key = KEYS[1]
-        local prefix = ARGV[1]
-        local results = {}
-
-        -- 使用Hash存储所有队列的offset
-        local offsets_hash = prefix .. ':QUEUE_OFFSETS'
-
-        -- 从stream_key中提取队列名(去掉prefix:QUEUE:前缀)
-        local queue_name = string.gsub(stream_key, '^' .. prefix .. ':QUEUE:', '')
-
-        -- 将队列添加到全局队列注册表(包括所有队列,包括优先级队列)
-        local queues_registry_key = prefix .. ':REGISTRY:QUEUES'
-        redis.call('SADD', queues_registry_key, queue_name)
-
-        -- 从ARGV[2]开始,每个参数是一个消息的data
-        for i = 2, #ARGV do
-            local data = ARGV[i]
-
-            -- 使用HINCRBY原子递增offset(如果不存在会自动创建并设为1)
-            local current_offset = redis.call('HINCRBY', offsets_hash, queue_name, 1)
-
-            -- 添加消息到Stream(包含offset字段)
-            local stream_id = redis.call('XADD', stream_key, '*',
-                'data', data,
-                'offset', current_offset)
-
-            table.insert(results, stream_id)
-        end
-
-        return results
-        """
-
         # 准备Lua脚本参数
         lua_args = [self.redis_prefix.encode() if isinstance(self.redis_prefix, str) else self.redis_prefix]
 
@@ -306,7 +299,7 @@ class EventPool(object):
 
         # 执行Lua脚本
         results = await client.eval(
-
+            LUA_SCRIPT_BATCH_SEND_EVENT,
             1,  # 1个KEY
             prefixed_queue,  # KEY[1]: stream key
             *lua_args  # ARGV: prefix, data1, data2, ...
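Both send paths now evaluate the shared LUA_SCRIPT_BATCH_SEND_EVENT constant from the new jettask/config/lua_scripts.py instead of the inline string deleted above. A minimal sketch of calling it directly, following the same KEYS/ARGV layout as the eval calls in the hunks; the Redis URL and payloads are placeholders.

```python
import redis
from jettask.config.lua_scripts import LUA_SCRIPT_BATCH_SEND_EVENT

client = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=False)  # placeholder URL
prefix = b"jettask"                           # ARGV[1]: key prefix for QUEUE_OFFSETS / REGISTRY:QUEUES
prefixed_queue = "jettask:QUEUE:orders"       # KEYS[1]: the target stream (illustrative queue name)
payloads = [b'{"task": "demo", "args": []}']  # ARGV[2..]: one pre-serialized message body each

stream_ids = client.eval(
    LUA_SCRIPT_BATCH_SEND_EVENT,
    1,                # number of KEYS
    prefixed_queue,   # KEYS[1]
    prefix,           # ARGV[1]
    *payloads,        # ARGV[2..]
)
print(stream_ids)     # one XADD stream id per message, each stored with an auto-incremented offset field
```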
@@ -1263,706 +1256,959 @@ class EventPool(object):
                 logger.debug(f"Updated read offset for {field}: {offset}")
             except Exception as e:
                 logger.error(f"Error updating read offset: {e}")
-
-    async def listening_event(self, event_queue: dict, prefetch_multiplier: int = 1):
-        """监听事件 - 为每个task创建独立的consumer group
 
-
-        event_queue: dict[str, asyncio.Queue] - 按task_name隔离的队列字典
-        prefetch_multiplier: 预取倍数
-        """
-        # 验证参数类型
-        if not isinstance(event_queue, dict):
-            raise TypeError(f"event_queue must be a dict[str, asyncio.Queue], got {type(event_queue)}")
+    # ==================== 通配符队列发现相关方法 ====================
 
-
-
+    async def _initial_queue_discovery(self):
+        """初始队列发现(启动时执行一次)- 仅在通配符模式下使用"""
+        if not self.wildcard_mode:
+            return
 
-
+        try:
+            logger.info("[QueueDiscovery] Performing initial queue discovery...")
 
-
-
-
-
+            # 从 QUEUE_REGISTRY 获取所有队列
+            queue_members = await self.async_redis_client.smembers(
+                self._queue_registry_key.encode()
+            )
 
-
-
+            discovered_queues = set()
+            for queue_bytes in queue_members:
+                queue_name = queue_bytes.decode('utf-8') if isinstance(queue_bytes, bytes) else str(queue_bytes)
+                discovered_queues.add(queue_name)
 
-
-
-
-        """为单个任务监听事件"""
-        # 恢复读取历史 pending 消息的逻辑
-        check_backlog = {}  # {queue_name: bool} - 首次读取 pending 消息
-        lastid = {}  # 每个队列的lastid - 首次为 "0",后续为 ">"
-        consecutive_errors = 0
-        max_consecutive_errors = 5
-
-        # 获取当前task使用的event_queue
-        task_event_queue = event_queue.get(task_name)
-        if not task_event_queue:
-            logger.error(f"No event queue found for task {task_name}")
-            return
+            if not discovered_queues:
+                # 如果注册表为空,尝试从现有数据初始化
+                logger.warning("[QueueDiscovery] QUEUE_REGISTRY is empty, initializing from existing data...")
 
-
-
-        if not task:
-            logger.error(f"Task {task_name} not found")
-            return
+                await self.queue_registry.initialize_from_existing_data()
+                discovered_queues = await self.queue_registry.get_all_queues()
 
-
-        prefixed_queue = self.get_prefixed_queue_name(queue)
-        consumer_name = self.consumer_manager.get_consumer_name(queue)
-        # 使用函数名作为group_name,实现任务隔离(用于后续消息处理)
-        group_name = f"{prefixed_queue}:{task_name}"
+            logger.info(f"[QueueDiscovery] Initial discovery found {len(discovered_queues)} queues: {discovered_queues}")
 
-        #
-
-
+            # 更新队列列表
+            self._discovered_queues = discovered_queues
+            # 过滤掉通配符本身,只保留实际队列
+            self.queues = [q for q in discovered_queues if q != '*']
 
-        #
-
-
-        )
+            # 更新 ConsumerManager 的队列配置
+            if self.consumer_manager:
+                self.consumer_config['queues'] = self.queues
 
-
-
-
-
+        except Exception as e:
+            logger.error(f"[QueueDiscovery] Initial discovery failed: {e}", exc_info=True)
+            self._discovered_queues = set()
+            self.queues = []
+
+    async def _discover_queues_loop(self):
+        """定期发现新队列(仅在通配符模式下运行)"""
+        if not self.wildcard_mode:
+            return
+
+        logger.info("[QueueDiscovery] Starting wildcard queue discovery loop...")
+
+        while not self._stop_reading:
+            try:
+                # 从 QUEUE_REGISTRY 获取所有队列
+                queue_members = await self.async_redis_client.smembers(
+                    self._queue_registry_key.encode()
                 )
 
-
-
-
-
-
-        # 获取该队列的延迟任务列表和锁
-        delayed_list = self._delayed_tasks_lists.get(queue)
-        delayed_lock = self._delayed_tasks_locks.get(queue)
-
-        # 记录上次优先级队列更新时间和上次group_info检查时间
-        last_priority_update = time.time()
-        last_group_info_check = time.time()
-
-        while not self._stop_reading:
-            # 定期直接从Redis获取优先级队列(每1秒检查一次)
-            current_time = time.time()
-            if current_time - last_priority_update >= 1:  # 简化为固定1秒间隔
-                new_priority_queues = await self.get_priority_queues_direct(queue)
-
-                # 如果优先级队列有变化,更新本地变量
-                if new_priority_queues != priority_queues:
-                    logger.debug(f"Priority queues updated for {queue}: {priority_queues} -> {new_priority_queues}")
-                    priority_queues = new_priority_queues
-                    all_queues = [prefixed_queue] + priority_queues
-
-                    # 为新的优先级队列创建consumer group(共享基础队列的 group_name)
-                    for q in all_queues:
-                        if q not in lastid:  # 这是新队列
-                            await self._ensure_consumer_group_and_record_info(
-                                q, task_name, consumer_name, base_group_name=group_name
-                            )
-                            logger.debug(f"Ensured consumer group for new priority queue {q}")
+                current_queues = set()
+                for queue_bytes in queue_members:
+                    queue_name = queue_bytes.decode('utf-8') if isinstance(queue_bytes, bytes) else str(queue_bytes)
+                    current_queues.add(queue_name)
 
-
-
-                            check_backlog[q] = True
+                # 发现新队列
+                new_queues = current_queues - self._discovered_queues
 
-
+                if new_queues:
+                    logger.info(f"[QueueDiscovery] Discovered new queues: {new_queues}")
 
-
-
-                # 检查worker key中是否缺失group_info
-                if self.consumer_manager:
-                    worker_key = self.consumer_manager._heartbeat_strategy._worker_key
-                    try:
-                        # 检查第一个队列的group_info是否存在
-                        first_queue = all_queues[0] if all_queues else prefixed_queue
-                        first_group_name = f"{first_queue}:{task_name}"
-                        field_name = f"group_info:{first_group_name}"
+                    # 为新队列创建监听任务
+                    await self._start_listeners_for_new_queues(new_queues)
 
-
+                    # 更新已发现队列集合
+                    self._discovered_queues.update(new_queues)
 
-
-
-                        logger.info(f"Detected missing group_info for task {task_name}, restoring...")
+                    # 更新 self.queues(过滤掉通配符)
+                    self.queues = [q for q in self._discovered_queues if q != '*']
 
-
-
+                    # 更新 ConsumerManager 的队列配置
+                    if self.consumer_manager:
+                        self.consumer_config['queues'] = self.queues
 
-
-
-
-
-
+                # 检查已删除的队列
+                removed_queues = self._discovered_queues - current_queues
+                if removed_queues:
+                    logger.info(f"[QueueDiscovery] Queues removed: {removed_queues}")
+                    self._discovered_queues -= removed_queues
+                    self.queues = [q for q in self._discovered_queues if q != '*']
 
-
-
-
+                    # 更新 ConsumerManager 的队列配置
+                    if self.consumer_manager:
+                        self.consumer_config['queues'] = self.queues
 
-
-
-            # 批量获取并处理延迟任务(使用list更高效)
-            if delayed_list:
-                # 原子地交换list内容
-                async with delayed_lock:
-                    if delayed_list:
-                        # 快速拷贝并清空原list
-                        tasks_to_process = delayed_list.copy()
-                        delayed_list.clear()
-                    else:
-                        tasks_to_process = []
-
-                # 处理所有延迟任务
-                if tasks_to_process:
-                    my_tasks = []  # 属于当前task的任务
-                    other_tasks = []  # 属于其他task的任务
-
-                    for delayed_task in tasks_to_process:
-                        # Scanner 返回的格式:{'event_id': '...', 'queue': '...'}
-                        # 没有 data 字段,需要通过 XCLAIM 获取
-
-                        # 注意:新版本Scanner只返回消息ID,不返回数据
-                        # 数据将在后续通过XCLAIM获取
-                        task_data = delayed_task.get('data', None)
-
-                        # 如果task_data存在(兼容旧版本Scanner)
-                        if task_data:
-                            if isinstance(task_data, str):
-                                import json
-                                task_data = json.loads(task_data)
-
-                            # 检查消息是否指定了目标task
-                            target_tasks = task_data.get('_target_tasks', None)
-                            if target_tasks and task_name not in target_tasks:
-                                # 这个消息不是给当前task的
-                                other_tasks.append(delayed_task)
-                                continue
+                # 每10秒检查一次
+                await asyncio.sleep(10)
 
-
-
-
-
-                    # 处理属于当前task的所有任务
-                    # 按队列分组延迟任务的 offset(因为可能来自不同的优先级队列)
-                    max_offsets_by_queue = {}
+            except Exception as e:
+                logger.error(f"[QueueDiscovery] Error in discovery loop: {e}", exc_info=True)
+                await asyncio.sleep(10)
 
-
-
+    async def _start_listeners_for_new_queues(self, new_queues: set):
+        """为新发现的队列启动监听任务
 
-
-
+        Args:
+            new_queues: 新发现的队列集合
+        """
+        if not (self.app and hasattr(self.app, '_tasks_by_queue')):
+            logger.error("[QueueDiscovery] No app or tasks registered")
+            return
 
-
-
-                        if task_data is None:
-                            prefixed_queue = self.get_prefixed_queue_name(task_queue)
-                            try:
-                                # 使用XCLAIM转移消息所有权
-                                # min_idle_time设为0,强制claim
-                                claimed_messages = await self.async_binary_redis_client.xclaim(
-                                    name=prefixed_queue,
-                                    groupname=group_name,
-                                    consumername=consumer_name,
-                                    min_idle_time=0,  # 立即claim,不管idle时间
-                                    message_ids=[event_id]
-                                )
-
-                                if not claimed_messages:
-                                    logger.warning(f"Failed to claim delayed message {event_id} from queue {task_queue}")
-                                    continue
+        # 获取通配符任务(queue="*" 的任务)
+        wildcard_tasks = self.app._tasks_by_queue.get('*', [])
 
-
-
-
-                                fields = claimed_msg[1]
-
-                                # 将fields转换为字典
-                                task_data_dict = {}
-                                if isinstance(fields, dict):
-                                    task_data_dict = fields
-                                elif isinstance(fields, list):
-                                    for j in range(0, len(fields), 2):
-                                        if j + 1 < len(fields):
-                                            key = fields[j]
-                                            value = fields[j + 1]
-                                            task_data_dict[key] = value
-
-                                # 解析data字段
-                                data_field = task_data_dict.get('data') or task_data_dict.get(b'data')
-                                if data_field:
-                                    task_data = loads_str(data_field)
-
-                                    # 提取 offset 字段(关键:确保延迟任务的 offset 能被记录)
-                                    offset_field = task_data_dict.get('offset') or task_data_dict.get(b'offset')
-                                    if offset_field:
-                                        try:
-                                            offset_value = int(offset_field) if isinstance(offset_field, (int, str)) else int(offset_field.decode())
-                                            task_data['offset'] = offset_value
-                                        except (ValueError, TypeError, AttributeError):
-                                            logger.debug(f"Failed to extract offset from claimed message {event_id}")
-                                else:
-                                    logger.warning(f"No data field in claimed message {event_id}")
-                                    continue
-                            else:
-                                logger.warning(f"Invalid claimed message format for {event_id}")
-                                continue
+        if not wildcard_tasks:
+            logger.warning("[QueueDiscovery] No wildcard tasks registered (queue='*')")
+            return
 
-
-
-
+        # 获取当前的 event_queue 字典(从 listening_event 传递过来的)
+        # 注意:这需要在 listening_event 中保存 event_queue 的引用
+        event_queue_dict = getattr(self, '_event_queue_dict', None)
+        if not event_queue_dict:
+            logger.error("[QueueDiscovery] Event queue dict not found")
+            return
 
-
+        for queue in new_queues:
+            # 初始化延迟任务列表和锁
+            if queue not in self._delayed_tasks_lists:
+                self._delayed_tasks_lists[queue] = []
+                self._delayed_tasks_locks[queue] = asyncio.Lock()
 
-
-
-
-
-                        logger.debug(f'延迟任务 {event_id} 执行误差: {delay_error*1000:.1f}ms')
+            # 为新队列注册延迟任务回调
+            import functools
+            callback = functools.partial(self._handle_expired_tasks, queue)
+            self.delayed_scanner.register_callback(queue, callback)
 
-
-
-
-
-
-
-
-
+            # 更新延迟扫描器监听的队列列表(添加新队列)
+            # 注意:DelayedMessageScanner 需要支持动态添加队列
+            if hasattr(self.delayed_scanner, 'add_queue'):
+                await self.delayed_scanner.add_queue(queue)
+
+            # 为每个通配符任务创建监听器
+            for task_name in wildcard_tasks:
+                logger.info(f"[QueueDiscovery] Starting listener: {task_name} on queue: {queue}")
+
+                # 创建监听任务(复用 listening_event 中的 listen_event_by_task 逻辑)
+                # 注意:这里需要能够访问 listen_event_by_task 函数
+                # 我们将在 listening_event 中将其保存为实例方法
+                if hasattr(self, '_listen_event_by_task'):
+                    task = asyncio.create_task(
+                        self._listen_event_by_task(queue, task_name)
+                    )
+                    self._background_tasks.append(task)
 
-
-
-
-
-
-                    if isinstance(result, tuple) and result[0] == 'async_put':
-                        await self._async_put_task(task_event_queue, result[1])
+            # 为新队列启动离线worker恢复
+            offline_task = asyncio.create_task(
+                self._start_offline_worker_processor_with_restart(queue)
+            )
+            self._background_tasks.append(offline_task)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            messages_needed = prefetch_multiplier - current_queue_size  # 还能读取的消息数
-
-            if messages_needed <= 0:
-                # 不需要读取更多消息
-                await asyncio.sleep(0.01)
-                continue
+    async def _handle_expired_tasks(self, queue: str, tasks: list):
+        """处理到期的延迟任务(回调函数)
+
+        Args:
+            queue: 队列名称
+            tasks: 到期的任务列表
+        """
+        if tasks:
+            # 确保延迟任务列表已初始化
+            if queue not in self._delayed_tasks_lists:
+                self._delayed_tasks_lists[queue] = []
+                self._delayed_tasks_locks[queue] = asyncio.Lock()
+
+            async with self._delayed_tasks_locks[queue]:
+                self._delayed_tasks_lists[queue].extend(tasks)
+
+    # ==================== 结束:通配符队列发现相关方法 ====================
+
+    # 为每个队列注册延迟任务回调
+    async def handle_expired_tasks(self, queue: str, tasks: list):
+        """处理到期的延迟任务"""
+        if tasks:
+            async with self._delayed_tasks_locks[queue]:
+                self._delayed_tasks_lists[queue].extend(tasks)
1570
1442
|
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1443
|
+
async def listen_event_by_task(self, event_queue, queue, task_name, prefetch_multiplier):
|
1444
|
+
"""为单个任务监听事件"""
|
1445
|
+
# 恢复读取历史 pending 消息的逻辑
|
1446
|
+
check_backlog = {} # {queue_name: bool} - 首次读取 pending 消息
|
1447
|
+
lastid = {} # 每个队列的lastid - 首次为 "0",后续为 ">"
|
1448
|
+
consecutive_errors = 0
|
1449
|
+
max_consecutive_errors = 5
|
1450
|
+
|
1451
|
+
# 获取当前task使用的event_queue
|
1452
|
+
task_event_queue = event_queue.get(task_name)
|
1453
|
+
if not task_event_queue:
|
1454
|
+
logger.error(f"No event queue found for task {task_name}")
|
1455
|
+
return
|
1574
1456
|
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1457
|
+
# 获取任务对象
|
1458
|
+
task = self.app._tasks.get(task_name)
|
1459
|
+
if not task:
|
1460
|
+
logger.error(f"Task {task_name} not found")
|
1461
|
+
return
|
1579
1462
|
|
1580
|
-
|
1581
|
-
|
1463
|
+
# 定义必要的变量
|
1464
|
+
prefixed_queue = self.get_prefixed_queue_name(queue)
|
1465
|
+
consumer_name = self.consumer_manager.get_consumer_name(queue)
|
1466
|
+
# 使用函数名作为group_name,实现任务隔离(用于后续消息处理)
|
1467
|
+
group_name = f"{prefixed_queue}:{task_name}"
|
1582
1468
|
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1586
|
-
actual_queue = q.replace(f"{self.redis_prefix}:QUEUE:", "")
|
1587
|
-
pipe.hget(queue_offsets_key, actual_queue)
|
1469
|
+
# 直接获取所有优先级队列(包括默认队列)
|
1470
|
+
priority_queues = await self.get_priority_queues_direct(queue)
|
1471
|
+
all_queues = [prefixed_queue] + priority_queues # 默认队列 + 优先级队列
|
1588
1472
|
|
1589
|
-
|
1590
|
-
|
1473
|
+
# 为基础队列创建 consumer group 并记录 group_info
|
1474
|
+
base_group_name = await self._ensure_consumer_group_and_record_info(
|
1475
|
+
prefixed_queue, task_name, consumer_name
|
1476
|
+
)
|
1477
|
+
|
1478
|
+
# 为优先级队列创建 consumer group(共享基础队列的 group_name)
|
1479
|
+
for pq in priority_queues:
|
1480
|
+
await self._ensure_consumer_group_and_record_info(
|
1481
|
+
pq, task_name, consumer_name, base_group_name=base_group_name
|
1482
|
+
)
|
1591
1483
|
|
1592
|
-
|
1484
|
+
# ✅ 初始化每个队列:首次读取 pending 消息(从 "0" 开始)
|
1485
|
+
for q in all_queues:
|
1486
|
+
lastid[q] = "0" # 首次读取历史消息
|
1487
|
+
check_backlog[q] = True # 首次读取 pending 消息
|
1488
|
+
|
1489
|
+
# 获取该队列的延迟任务列表和锁
|
1490
|
+
delayed_list = self._delayed_tasks_lists.get(queue)
|
1491
|
+
delayed_lock = self._delayed_tasks_locks.get(queue)
|
1492
|
+
|
1493
|
+
# 记录上次优先级队列更新时间和上次group_info检查时间
|
1494
|
+
last_priority_update = time.time()
|
1495
|
+
last_group_info_check = time.time()
|
1496
|
+
|
1497
|
+
while not self._stop_reading:
|
1498
|
+
# 定期直接从Redis获取优先级队列(每1秒检查一次)
|
1499
|
+
current_time = time.time()
|
1500
|
+
if current_time - last_priority_update >= 1: # 简化为固定1秒间隔
|
1501
|
+
new_priority_queues = await self.get_priority_queues_direct(queue)
|
1502
|
+
|
1503
|
+
# 如果优先级队列有变化,更新本地变量
|
1504
|
+
if new_priority_queues != priority_queues:
|
1505
|
+
logger.debug(f"Priority queues updated for {queue}: {priority_queues} -> {new_priority_queues}")
|
1506
|
+
priority_queues = new_priority_queues
|
1507
|
+
all_queues = [prefixed_queue] + priority_queues
|
1508
|
+
|
1509
|
+
# 为新的优先级队列创建consumer group(共享基础队列的 group_name)
|
1593
1510
|
for q in all_queues:
|
1594
|
-
|
1595
|
-
|
1596
|
-
|
1597
|
-
|
1598
|
-
|
1599
|
-
results = await pipe.execute()
|
1600
|
-
|
1601
|
-
# 分析结果,确定哪些队列有待读取的消息
|
1602
|
-
half_len = len(all_queues)
|
1603
|
-
for i, q in enumerate(all_queues):
|
1604
|
-
# ✅ 如果该队列需要读取 pending 消息,直接加入列表,跳过 offset 检查
|
1605
|
-
if check_backlog.get(q, False):
|
1606
|
-
queues_with_messages.append(q)
|
1607
|
-
logger.debug(f"Queue {q} needs to read pending messages, skipping offset check")
|
1608
|
-
continue
|
1511
|
+
if q not in lastid: # 这是新队列
|
1512
|
+
await self._ensure_consumer_group_and_record_info(
|
1513
|
+
q, task_name, consumer_name, base_group_name=group_name
|
1514
|
+
)
|
1515
|
+
logger.debug(f"Ensured consumer group for new priority queue {q}")
|
1609
1516
|
|
1610
|
-
|
1611
|
-
|
1517
|
+
# ✅ 初始化新队列:读取历史 pending 消息
|
1518
|
+
lastid[q] = "0"
|
1519
|
+
check_backlog[q] = True
|
1612
1520
|
|
1613
|
-
|
1614
|
-
sent = int(sent_offset) if sent_offset else 0
|
1615
|
-
read = int(read_offset) if read_offset else 0
|
1521
|
+
last_priority_update = current_time
|
1616
1522
|
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1523
|
+
# 定期检查并恢复group_info(每10秒检查一次)
|
1524
|
+
if current_time - last_group_info_check >= 10:
|
1525
|
+
# 检查worker key中是否缺失group_info
|
1526
|
+
if self.consumer_manager:
|
1527
|
+
worker_key = self.consumer_manager._heartbeat_strategy._worker_key
|
1528
|
+
try:
|
1529
|
+
# 检查第一个队列的group_info是否存在
|
1530
|
+
first_queue = all_queues[0] if all_queues else prefixed_queue
|
1531
|
+
first_group_name = f"{first_queue}:{task_name}"
|
1532
|
+
field_name = f"group_info:{first_group_name}"
|
1621
1533
|
|
1622
|
-
|
1623
|
-
if not queues_with_messages:
|
1624
|
-
logger.debug("No queues have unread messages, will wait for new messages")
|
1534
|
+
group_info_exists = await self.async_redis_client.hexists(worker_key, field_name)
|
1625
1535
|
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1536
|
+
# 如果group_info不存在,说明worker key可能被重建了,需要恢复group_info
|
1537
|
+
if not group_info_exists:
|
1538
|
+
logger.info(f"Detected missing group_info for task {task_name}, restoring...")
|
1539
|
+
|
1540
|
+
# 恢复基础队列的 group_info
|
1541
|
+
await self._ensure_consumer_group_and_record_info(prefixed_queue, task_name, consumer_name)
|
1542
|
+
|
1543
|
+
# 为优先级队列重新创建 consumer group(共享基础队列的 group_name)
|
1544
|
+
for pq in priority_queues:
|
1545
|
+
await self._ensure_consumer_group_and_record_info(
|
1546
|
+
pq, task_name, consumer_name, base_group_name=group_name
|
1547
|
+
)
|
1548
|
+
|
1549
|
+
logger.info(f"Restored group_info and consumer groups for {len(all_queues)} queues for task {task_name}")
|
1550
|
+
except Exception as e:
|
1551
|
+
logger.error(f"Error checking/restoring group_info: {e}", exc_info=True)
|
1552
|
+
|
1553
|
+
last_group_info_check = current_time
|
1554
|
+
|
1555
|
+
# 批量获取并处理延迟任务(使用list更高效)
|
1556
|
+
if delayed_list:
|
1557
|
+
# 原子地交换list内容
|
1558
|
+
async with delayed_lock:
|
1559
|
+
if delayed_list:
|
1560
|
+
# 快速拷贝并清空原list
|
1561
|
+
tasks_to_process = delayed_list.copy()
|
1562
|
+
delayed_list.clear()
|
1563
|
+
else:
|
1564
|
+
tasks_to_process = []
|
1630
1565
|
|
1631
|
-
#
|
1632
|
-
|
1633
|
-
|
1634
|
-
|
1635
|
-
break # 已经读取足够的消息
|
1566
|
+
# 处理所有延迟任务
|
1567
|
+
if tasks_to_process:
|
1568
|
+
my_tasks = [] # 属于当前task的任务
|
1569
|
+
other_tasks = [] # 属于其他task的任务
|
1636
1570
|
|
1637
|
-
|
1638
|
-
|
1639
|
-
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1643
|
-
|
1571
|
+
for delayed_task in tasks_to_process:
|
1572
|
+
# Scanner 返回的格式:{'event_id': '...', 'queue': '...'}
|
1573
|
+
# 没有 data 字段,需要通过 XCLAIM 获取
|
1574
|
+
|
1575
|
+
# 注意:新版本Scanner只返回消息ID,不返回数据
|
1576
|
+
# 数据将在后续通过XCLAIM获取
|
1577
|
+
task_data = delayed_task.get('data', None)
|
1578
|
+
|
1579
|
+
# 如果task_data存在(兼容旧版本Scanner)
|
1580
|
+
if task_data:
|
1581
|
+
if isinstance(task_data, str):
|
1582
|
+
import json
|
1583
|
+
task_data = json.loads(task_data)
|
1584
|
+
|
1585
|
+
# 检查消息是否指定了目标task
|
1586
|
+
target_tasks = task_data.get('_target_tasks', None)
|
1587
|
+
if target_tasks and task_name not in target_tasks:
|
1588
|
+
# 这个消息不是给当前task的
|
1589
|
+
other_tasks.append(delayed_task)
|
1590
|
+
continue
|
1591
|
+
|
1592
|
+
# 当前task处理这个任务
|
1593
|
+
# task_data可能为None,会在后续通过XCLAIM获取
|
1594
|
+
my_tasks.append((delayed_task, task_data))
|
1644
1595
|
|
1645
|
-
|
1646
|
-
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
|
1653
|
-
|
1654
|
-
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1659
|
-
# print(f'先处理历史消息:{q_bytes=} {group_name=} {q_messages=}')
|
1660
|
-
# 记录从哪个队列读取的
|
1661
|
-
messages.extend(q_messages)
|
1662
|
-
messages_read = len(q_messages[0][1]) if q_messages else 0
|
1663
|
-
messages_needed -= messages_read
|
1664
|
-
|
1665
|
-
# 如果高优先级队列还有消息,继续从该队列读取
|
1666
|
-
# 直到该队列空了或者达到prefetch限制
|
1667
|
-
if messages_read > 0 and messages_needed > 0:
|
1668
|
-
# 该队列可能还有更多消息,下次循环继续优先从这个队列读
|
1669
|
-
# 但现在先处理已读取的消息
|
1670
|
-
break # 跳出for循环,处理已有消息
|
1671
|
-
|
1672
|
-
except Exception as e:
|
1673
|
-
if "NOGROUP" in str(e):
|
1674
|
-
# consumer group 不存在(可能是 Redis 被清空了),重新创建
|
1675
|
-
logger.warning(f"NOGROUP error for queue {q}, recreating consumer group...")
|
1596
|
+
# 处理属于当前task的所有任务
|
1597
|
+
# 按队列分组延迟任务的 offset(因为可能来自不同的优先级队列)
|
1598
|
+
max_offsets_by_queue = {}
|
1599
|
+
|
1600
|
+
for delayed_task, task_data in my_tasks:
|
1601
|
+
event_id = delayed_task.get('event_id', f"delayed-{time.time()}")
|
1602
|
+
|
1603
|
+
# 获取任务来自哪个队列(可能包含优先级后缀)
|
1604
|
+
task_queue = delayed_task.get('queue', queue)
|
1605
|
+
|
1606
|
+
# 如果task_data为None,说明Scanner只返回了消息ID
|
1607
|
+
# 需要使用XCLAIM从Stream中claim消息并转移所有权
|
1608
|
+
if task_data is None:
|
1609
|
+
prefixed_queue = self.get_prefixed_queue_name(task_queue)
|
1676
1610
|
try:
|
1677
|
-
#
|
1678
|
-
|
1679
|
-
|
1611
|
+
# 使用XCLAIM转移消息所有权
|
1612
|
+
# min_idle_time设为0,强制claim
|
1613
|
+
claimed_messages = await self.async_binary_redis_client.xclaim(
|
1614
|
+
name=prefixed_queue,
|
1615
|
+
groupname=group_name,
|
1616
|
+
consumername=consumer_name,
|
1617
|
+
min_idle_time=0, # 立即claim,不管idle时间
|
1618
|
+
message_ids=[event_id]
|
1680
1619
|
)
|
1681
|
-
|
1682
|
-
|
1683
|
-
|
1684
|
-
|
1685
|
-
|
1686
|
-
|
1687
|
-
|
1688
|
-
if
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1620
|
+
|
1621
|
+
if not claimed_messages:
|
1622
|
+
logger.warning(f"Failed to claim delayed message {event_id} from queue {task_queue}")
|
1623
|
+
continue
|
1624
|
+
|
1625
|
+
# 解析claimed消息
|
1626
|
+
claimed_msg = claimed_messages[0] # [(stream_id, fields)]
|
1627
|
+
if isinstance(claimed_msg, (list, tuple)) and len(claimed_msg) >= 2:
|
1628
|
+
fields = claimed_msg[1]
|
1629
|
+
|
1630
|
+
# 将fields转换为字典
|
1631
|
+
task_data_dict = {}
|
1632
|
+
if isinstance(fields, dict):
|
1633
|
+
task_data_dict = fields
|
1634
|
+
elif isinstance(fields, list):
|
1635
|
+
for j in range(0, len(fields), 2):
|
1636
|
+
if j + 1 < len(fields):
|
1637
|
+
key = fields[j]
|
1638
|
+
value = fields[j + 1]
|
1639
|
+
task_data_dict[key] = value
|
1640
|
+
|
1641
|
+
# 解析data字段
|
1642
|
+
data_field = task_data_dict.get('data') or task_data_dict.get(b'data')
|
1643
|
+
if data_field:
|
1644
|
+
task_data = loads_str(data_field)
|
1645
|
+
|
1646
|
+
# 提取 offset 字段(关键:确保延迟任务的 offset 能被记录)
|
1647
|
+
offset_field = task_data_dict.get('offset') or task_data_dict.get(b'offset')
|
1648
|
+
if offset_field:
|
1649
|
+
try:
|
1650
|
+
offset_value = int(offset_field) if isinstance(offset_field, (int, str)) else int(offset_field.decode())
|
1651
|
+
task_data['offset'] = offset_value
|
1652
|
+
except (ValueError, TypeError, AttributeError):
|
1653
|
+
logger.debug(f"Failed to extract offset from claimed message {event_id}")
|
1654
|
+
else:
|
1655
|
+
logger.warning(f"No data field in claimed message {event_id}")
|
1656
|
+
continue
|
1657
|
+
else:
|
1658
|
+
logger.warning(f"Invalid claimed message format for {event_id}")
|
1659
|
+
continue
|
1660
|
+
|
1661
|
+
except Exception as e:
|
1662
|
+
logger.error(f"Error claiming delayed message {event_id}: {e}", exc_info=True)
|
1663
|
+
continue
|
1664
|
+
|
1665
|
+
task_data['_task_name'] = task_name
|
1666
|
+
|
1667
|
+
# 记录延迟精度(用于调试)
|
1668
|
+
if 'execute_at' in task_data:
|
1669
|
+
delay_error = time.time() - task_data['execute_at']
|
1670
|
+
if abs(delay_error) > 0.1: # 超过100ms才记录
|
1671
|
+
logger.debug(f'延迟任务 {event_id} 执行误差: {delay_error*1000:.1f}ms')
|
1672
|
+
|
1673
|
+
# 收集每个队列的最大offset
|
1674
|
+
if 'offset' in task_data:
|
1675
|
+
try:
|
1676
|
+
message_offset = int(task_data['offset'])
|
1677
|
+
if task_queue not in max_offsets_by_queue or message_offset > max_offsets_by_queue[task_queue]:
|
1678
|
+
max_offsets_by_queue[task_queue] = message_offset
|
1679
|
+
except (ValueError, TypeError):
|
1680
|
+
pass
|
1681
|
+
|
1682
|
+
# 所有队列(包括优先级队列)都使用基础队列的 group_name
|
1683
|
+
result = self._process_message_common(
|
1684
|
+
event_id, task_data, task_queue, task_event_queue,
|
1685
|
+
is_async=True, consumer_name=consumer_name, group_name=group_name
|
1686
|
+
)
|
1687
|
+
if isinstance(result, tuple) and result[0] == 'async_put':
|
1688
|
+
await self._async_put_task(task_event_queue, result[1])
|
1689
|
+
|
1690
|
+
# 批量更新每个队列的最大offset(所有队列使用同一个 group_name)
|
1691
|
+
for task_queue, max_offset in max_offsets_by_queue.items():
|
1692
|
+
asyncio.create_task(self._update_read_offset(task_queue, group_name, max_offset))
|
1693
|
+
|
1694
|
+
# 把不属于当前task的任务放回list
|
1695
|
+
if other_tasks:
|
1696
|
+
async with delayed_lock:
|
1697
|
+
delayed_list.extend(other_tasks)
|
1698
|
+
|
1699
|
+
# 处理正常的Stream消息(支持优先级队列)
|
1700
|
+
# 实现真正的优先级消费:
|
1701
|
+
# 1. 先检查event_queue是否已满
|
1702
|
+
# 2. 优先从高优先级队列读取
|
1703
|
+
# 3. 只有高优先级队列空了才读取低优先级
|
1704
|
+
# 4. 不超过prefetch_multiplier限制
|
1705
|
+
|
1706
|
+
# 检查内存队列是否已满
|
1707
|
+
current_queue_size = task_event_queue.qsize() if hasattr(task_event_queue, 'qsize') else 0
|
1708
|
+
if current_queue_size >= prefetch_multiplier:
|
1709
|
+
# 内存队列已满,等待处理
|
1710
|
+
await asyncio.sleep(0.01) # 短暂等待
|
1711
|
+
continue
|
1712
|
+
|
1713
|
+
messages = []
|
1714
|
+
messages_needed = prefetch_multiplier - current_queue_size # 还能读取的消息数
|
1715
|
+
|
1716
|
+
if messages_needed <= 0:
|
1717
|
+
# 不需要读取更多消息
|
1718
|
+
await asyncio.sleep(0.01)
|
1719
|
+
continue
|
1720
|
+
|
1721
|
+
# 优化:预先检查哪些队列有待读取的消息,避免在空队列上浪费时间
|
1722
|
+
# ✅ 但如果队列需要读取 pending 消息(check_backlog=True),则跳过该队列的 offset 检查
|
1723
|
+
queues_with_messages = []
|
1724
|
+
|
1725
|
+
try:
|
1726
|
+
# 批量获取已发送和已读取的offset
|
1727
|
+
queue_offsets_key = f"{self.redis_prefix}:QUEUE_OFFSETS"
|
1728
|
+
read_offsets_key = f"{self.redis_prefix}:READ_OFFSETS"
|
1729
|
+
|
1730
|
+
# 使用pipeline批量获取offset
|
1731
|
+
pipe = self.async_redis_client.pipeline()
|
1732
|
+
|
1733
|
+
# 获取所有队列的已发送offset
|
1734
|
+
for q in all_queues:
|
1735
|
+
# 从队列名中提取实际的队列名(去掉前缀)
|
1736
|
+
actual_queue = q.replace(f"{self.redis_prefix}:QUEUE:", "")
|
1737
|
+
pipe.hget(queue_offsets_key, actual_queue)
|
1738
|
+
|
1739
|
+
# 提取 task_name(从 group_name 中)
|
1740
|
+
task_name = group_name.split(':')[-1]
|
1741
|
+
|
1742
|
+
# 获取所有队列的已读取offset
|
1743
|
+
for q in all_queues:
|
1744
|
+
actual_queue = q.replace(f"{self.redis_prefix}:QUEUE:", "")
|
1745
|
+
# field 格式:队列名(含优先级):任务名
|
1746
|
+
field = f"{actual_queue}:{task_name}"
|
1747
|
+
pipe.hget(read_offsets_key, field)
|
1748
|
+
|
1749
|
+
results = await pipe.execute()
|
1750
|
+
|
1751
|
+
# 分析结果,确定哪些队列有待读取的消息
|
1752
|
+
half_len = len(all_queues)
|
1753
|
+
for i, q in enumerate(all_queues):
|
1754
|
+
# ✅ 如果该队列需要读取 pending 消息,直接加入列表,跳过 offset 检查
|
1755
|
+
if check_backlog.get(q, False):
|
1756
|
+
queues_with_messages.append(q)
|
1757
|
+
logger.debug(f"Queue {q} needs to read pending messages, skipping offset check")
|
1698
1758
|
continue
|
1699
|
-
|
1759
|
+
|
1760
|
+
sent_offset = results[i] # 已发送的offset
|
1761
|
+
read_offset = results[half_len + i] # 已读取的offset
|
1762
|
+
|
1763
|
+
# 转换为整数
|
1764
|
+
sent = int(sent_offset) if sent_offset else 0
|
1765
|
+
read = int(read_offset) if read_offset else 0
|
1766
|
+
|
1767
|
+
# 如果已发送的offset大于已读取的offset,说明有消息待读取
|
1768
|
+
if sent > read:
|
1769
|
+
queues_with_messages.append(q)
|
1770
|
+
logger.debug(f"Queue {q} has {sent - read} unread messages (sent={sent}, read={read})")
|
1771
|
+
|
1772
|
+
# 如果没有队列有消息,记录下来(不再使用原始队列列表避免空读)
|
1773
|
+
if not queues_with_messages:
|
1774
|
+
logger.debug("No queues have unread messages, will wait for new messages")
|
1775
|
+
|
1776
|
+
except Exception as e:
|
1777
|
+
# 出错时回退到原始逻辑
|
1778
|
+
logger.debug(f"Failed to check queue offsets: {e}")
|
1779
|
+
queues_with_messages = all_queues
|
1780
|
+
|
1781
|
+
# print(f'{queues_with_messages=}')
|
1782
|
+
# 按优先级顺序读取有消息的队列
|
1783
|
+
for q in queues_with_messages:
|
1784
|
+
if messages_needed <= 0:
|
1785
|
+
break # 已经读取足够的消息
|
1786
|
+
|
1787
|
+
q_bytes = q.encode() if isinstance(q, str) else q
|
1788
|
+
# 针对具体队列检查是否需要读取历史消息
|
1789
|
+
if check_backlog.get(q, True):
|
1790
|
+
myid = lastid.get(q, "0-0")
|
1791
|
+
else:
|
1792
|
+
myid = ">"
|
1793
|
+
myid_bytes = myid.encode() if isinstance(myid, str) else myid
|
1700
1794
|
|
1701
1795
|
try:
|
1702
|
-
#
|
1703
|
-
|
1704
|
-
#
|
1705
|
-
|
1706
|
-
|
1707
|
-
|
1708
|
-
|
1709
|
-
|
1710
|
-
|
1711
|
-
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1718
|
-
|
1719
|
-
|
1720
|
-
|
1721
|
-
|
1722
|
-
|
1723
|
-
|
1796
|
+
# print(f'{myid_bytes=} {consumer_name=} {check_backlog=} {q_bytes=}')
|
1797
|
+
# 所有队列(包括优先级队列)都使用基础队列的 group_name
|
1798
|
+
# 从当前优先级队列读取(最多读取messages_needed个)
|
1799
|
+
q_messages = await self.async_binary_redis_client.xreadgroup(
|
1800
|
+
groupname=group_name,
|
1801
|
+
consumername=consumer_name,
|
1802
|
+
streams={q_bytes: myid_bytes},
|
1803
|
+
count=messages_needed, # 只读取需要的数量
|
1804
|
+
block=100 # 非阻塞
|
1805
|
+
)
|
1806
|
+
# logger.info(f'{group_name=} {q_bytes=} {consumer_name=} {q_messages=}')
|
1807
|
+
if q_messages:
|
1808
|
+
# logger.info(f"Read messages from {q}: {len(q_messages[0][1]) if q_messages else 0} messages")
|
1809
|
+
# if check_backlog.get(q, True):
|
1810
|
+
# print(f'先处理历史消息:{q_bytes=} {group_name=} {q_messages=}')
|
1811
|
+
# 记录从哪个队列读取的
|
1812
|
+
messages.extend(q_messages)
|
1813
|
+
messages_read = len(q_messages[0][1]) if q_messages else 0
|
1814
|
+
messages_needed -= messages_read
|
1815
|
+
|
1816
|
+
# 如果高优先级队列还有消息,继续从该队列读取
|
1817
|
+
# 直到该队列空了或者达到prefetch限制
|
1818
|
+
if messages_read > 0 and messages_needed > 0:
|
1819
|
+
# 该队列可能还有更多消息,下次循环继续优先从这个队列读
|
1820
|
+
# 但现在先处理已读取的消息
|
1821
|
+
break # 跳出for循环,处理已有消息
|
1822
|
+
|
1823
|
+
except Exception as e:
|
1824
|
+
if "NOGROUP" in str(e):
|
1825
|
+
# consumer group 不存在(可能是 Redis 被清空了),重新创建
|
1826
|
+
logger.warning(f"NOGROUP error for queue {q}, recreating consumer group...")
|
1827
|
+
try:
|
1828
|
+
# 为队列创建 consumer group(共享基础队列的 group_name)
|
1829
|
+
await self._ensure_consumer_group_and_record_info(
|
1830
|
+
q, task_name, consumer_name, base_group_name=group_name
|
1724
1831
|
)
|
1725
|
-
|
1726
|
-
|
1727
|
-
|
1728
|
-
|
1729
|
-
|
1730
|
-
|
1731
|
-
|
1732
|
-
|
1733
|
-
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1738
|
-
|
1832
|
+
logger.info(f"Recreated consumer group for queue {q}")
|
1833
|
+
|
1834
|
+
# 重新初始化这个队列的 lastid 和 check_backlog
|
1835
|
+
lastid[q] = "0"
|
1836
|
+
check_backlog[q] = True
|
1837
|
+
|
1838
|
+
# 确保这个队列在 all_queues 中(可能因 Redis 清空而丢失)
|
1839
|
+
if q not in all_queues:
|
1840
|
+
all_queues.append(q)
|
1841
|
+
# 同时更新 priority_queues(如果是优先级队列)
|
1842
|
+
if q != prefixed_queue and q not in priority_queues:
|
1843
|
+
priority_queues.append(q)
|
1844
|
+
logger.info(f"Re-added queue {q} to all_queues after NOGROUP recovery")
|
1845
|
+
except Exception as recreate_error:
|
1846
|
+
logger.error(f"Failed to recreate consumer group for {q}: {recreate_error}")
|
1847
|
+
else:
|
1848
|
+
logger.debug(f"Error reading from queue {q}: {e}")
|
1849
|
+
continue
|
1850
|
+
|
1851
|
+
|
1852
|
+
try:
|
1853
|
+
# logger.debug(f'{group_name=} {consumer_name=} {block_time=}')
|
1854
|
+
consecutive_errors = 0
|
1855
|
+
# if check_backlog and messages:
|
1856
|
+
# logger.debug(f'先消费之前的消息 {group_name=} ')
|
1857
|
+
# logger.debug(f'{check_backlog=} {messages=}')
|
1858
|
+
|
1859
|
+
# 上报已投递的offset(用于积压监控)
|
1860
|
+
try:
|
1861
|
+
from jettask.utils.stream_backlog import report_delivered_offset
|
1862
|
+
# 对每个stream的消息上报offset
|
1863
|
+
for msg in messages:
|
1864
|
+
stream_name = msg[0]
|
1739
1865
|
if isinstance(stream_name, bytes):
|
1740
1866
|
stream_name = stream_name.decode('utf-8')
|
1867
|
+
# 提取队列名(去掉前缀)
|
1868
|
+
queue_name = stream_name.replace(f"{self.redis_prefix}:STREAM:", "")
|
1869
|
+
await report_delivered_offset(
|
1870
|
+
self.async_redis_client,
|
1871
|
+
self.redis_prefix,
|
1872
|
+
queue_name,
|
1873
|
+
group_name,
|
1874
|
+
[msg]
|
1875
|
+
)
|
1876
|
+
except Exception as e:
|
1877
|
+
# 监控失败不影响主流程
|
1878
|
+
logger.debug(f"Failed to report delivered offset: {e}")
|
1879
|
+
|
1880
|
+
# 收集需要跳过的消息ID
|
1881
|
+
skip_message_ids = []
|
1882
|
+
|
1883
|
+
# 用于记录每个队列的最大offset(批量更新)
|
1884
|
+
max_offsets_per_queue = {}
|
1885
|
+
|
1886
|
+
for message in messages:
|
1887
|
+
# print(f'{message=}')
|
1888
|
+
# message[0]是stream名称,message[1]是消息列表
|
1889
|
+
stream_name = message[0]
|
1890
|
+
if isinstance(stream_name, bytes):
|
1891
|
+
stream_name = stream_name.decode('utf-8')
|
1892
|
+
|
1893
|
+
# 根据这个具体队列的消息数量,更新该队列的check_backlog状态
|
1894
|
+
if len(message[1]) == 0:
|
1895
|
+
# 这个队列没有历史消息了,下次读取最新消息
|
1896
|
+
check_backlog[stream_name] = False
|
1897
|
+
|
1898
|
+
for event in message[1]:
|
1899
|
+
event_id = event[0]
|
1900
|
+
# 更新对应队列的lastid
|
1901
|
+
lastid[stream_name] = event_id
|
1902
|
+
# 将bytes类型的event_id转换为字符串
|
1903
|
+
if isinstance(event_id, bytes):
|
1904
|
+
event_id = event_id.decode('utf-8')
|
1905
|
+
event_data = event[1]
|
1741
1906
|
|
1742
|
-
#
|
1743
|
-
|
1744
|
-
# 这个队列没有历史消息了,下次读取最新消息
|
1745
|
-
check_backlog[stream_name] = False
|
1907
|
+
# 解析消息内容,决定是否处理
|
1908
|
+
should_process = True
|
1746
1909
|
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1760
|
-
|
1761
|
-
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1910
|
+
try:
|
1911
|
+
# 解析data字段中的消息
|
1912
|
+
if b'data' in event_data or 'data' in event_data:
|
1913
|
+
data_field = event_data.get(b'data') or event_data.get('data')
|
1914
|
+
|
1915
|
+
# 直接解析二进制数据,不需要解码
|
1916
|
+
parsed_data = loads_str(data_field)
|
1917
|
+
|
1918
|
+
# 跳过延迟任务(延迟任务由延迟扫描器处理)
|
1919
|
+
# 但如果任务已到期,或者正在从 pending 恢复,则应该处理
|
1920
|
+
if parsed_data.get('is_delayed') == 1:
|
1921
|
+
# 检查是否已到期
|
1922
|
+
execute_at = parsed_data.get('execute_at')
|
1923
|
+
current_time = time.time()
|
1924
|
+
|
1925
|
+
if execute_at and execute_at > current_time:
|
1926
|
+
# 未到期,跳过(由Scanner处理)
|
1927
|
+
should_process = False
|
1928
|
+
continue
|
1766
1929
|
|
1767
|
-
#
|
1768
|
-
#
|
1769
|
-
|
1770
|
-
|
1771
|
-
|
1772
|
-
|
1773
|
-
|
1774
|
-
|
1775
|
-
|
1776
|
-
|
1777
|
-
|
1778
|
-
|
1779
|
-
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
logger.debug(f"Processing expired delayed task {event_id}")
|
1930
|
+
# 已到期或无execute_at字段,继续处理
|
1931
|
+
# 这种情况可能是:
|
1932
|
+
# 1. 延迟任务已到期,正在被执行
|
1933
|
+
# 2. 从 pending 恢复的已到期任务
|
1934
|
+
logger.debug(f"Processing expired delayed task {event_id}")
|
1935
|
+
|
1936
|
+
# 每个task都有独立的consumer group
|
1937
|
+
# 检查消息是否指定了目标task(用于精确路由)
|
1938
|
+
target_tasks = parsed_data.get('_target_tasks', None)
|
1939
|
+
if target_tasks and task_name not in target_tasks:
|
1940
|
+
# 这个消息指定了其他task处理
|
1941
|
+
should_process = False
|
1942
|
+
|
1943
|
+
if should_process:
|
1944
|
+
# 添加task_name到数据中(用于执行器识别任务)
|
1945
|
+
parsed_data['_task_name'] = task_name
|
1784
1946
|
|
1785
|
-
#
|
1786
|
-
|
1787
|
-
|
1788
|
-
if
|
1789
|
-
#
|
1790
|
-
|
1947
|
+
# 提取offset字段(如果存在)
|
1948
|
+
offset_field = event_data.get(b'offset') or event_data.get('offset')
|
1949
|
+
message_offset = None
|
1950
|
+
if offset_field:
|
1951
|
+
# 将offset添加到parsed_data中
|
1952
|
+
if isinstance(offset_field, bytes):
|
1953
|
+
offset_field = offset_field.decode('utf-8')
|
1954
|
+
parsed_data['offset'] = offset_field
|
1955
|
+
try:
|
1956
|
+
message_offset = int(offset_field)
|
1957
|
+
except (ValueError, TypeError):
|
1958
|
+
pass
|
1791
1959
|
|
1792
|
-
|
1793
|
-
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1797
|
-
|
1798
|
-
|
1799
|
-
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
1805
|
-
|
1806
|
-
|
1807
|
-
|
1808
|
-
|
1809
|
-
# 更新event_data
|
1810
|
-
event_data.clear()
|
1811
|
-
for key, value in parsed_data.items():
|
1812
|
-
event_data[key] = value
|
1813
|
-
|
1814
|
-
# 收集每个队列的最大offset(不要每条消息都记录)
|
1815
|
-
if message_offset is not None:
|
1816
|
-
# 从stream_name提取实际的队列名
|
1817
|
-
actual_queue_name = stream_name.replace(f"{self.redis_prefix}:QUEUE:", "")
|
1818
|
-
# 更新该队列的最大offset
|
1819
|
-
if actual_queue_name not in max_offsets_per_queue:
|
1820
|
-
max_offsets_per_queue[actual_queue_name] = message_offset
|
1821
|
-
else:
|
1822
|
-
max_offsets_per_queue[actual_queue_name] = max(max_offsets_per_queue[actual_queue_name], message_offset)
|
1823
|
-
|
1824
|
-
logger.debug(f"Task {task_name} will process message {event_id}")
|
1825
|
-
else:
|
1826
|
-
# 没有data字段,跳过消息
|
1827
|
-
should_process = False
|
1828
|
-
except Exception as e:
|
1829
|
-
logger.error(f"Task {task_name}: Error parsing message data: {e}")
|
1830
|
-
|
1831
|
-
if should_process:
|
1832
|
-
# 处理消息 - 消息会被放入队列,由执行器处理并ACK
|
1833
|
-
# 使用消息体中的实际队列名(可能包含优先级)
|
1834
|
-
actual_queue = event_data.get('queue', queue)
|
1835
|
-
|
1836
|
-
# 统一 group_name 架构:所有队列(包括优先级队列)使用同一个 consumer name
|
1837
|
-
# 不再需要为优先级队列添加后缀
|
1838
|
-
result = self._process_message_common(
|
1839
|
-
event_id, event_data, actual_queue, task_event_queue,
|
1840
|
-
          | -                             is_async=True, consumer_name=consumer_name, group_name=group_name
1841      | -                         )
1842      | -                         if isinstance(result, tuple) and result[0] == 'async_put':
1843      | -                             await self._async_put_task(task_event_queue, result[1])
1844      | -                             logger.debug(f"Put task {event_id} into task_event_queue")
1845      | -                             # Note: no ACK here; the executor ACKs after it finishes processing
     1960 | +                             # Update event_data
     1961 | +                             event_data.clear()
     1962 | +                             for key, value in parsed_data.items():
     1963 | +                                 event_data[key] = value
     1964 | +
     1965 | +                             # Collect the max offset per queue (don't record it for every message)
     1966 | +                             if message_offset is not None:
     1967 | +                                 # Extract the actual queue name from stream_name
     1968 | +                                 actual_queue_name = stream_name.replace(f"{self.redis_prefix}:QUEUE:", "")
     1969 | +                                 # Update this queue's max offset
     1970 | +                                 if actual_queue_name not in max_offsets_per_queue:
     1971 | +                                     max_offsets_per_queue[actual_queue_name] = message_offset
     1972 | +                                 else:
     1973 | +                                     max_offsets_per_queue[actual_queue_name] = max(max_offsets_per_queue[actual_queue_name], message_offset)
     1974 | +
     1975 | +                             logger.debug(f"Task {task_name} will process message {event_id}")
1846 1976 |                           else:
1847      | -                             #
1848      | -
1849      | -
1850      | -
1851      | -                     # Batch-ACK the unneeded messages (all queues share the same group_name)
1852      | -                     if skip_message_ids:
1853      | -                         group_name_bytes = group_name.encode() if isinstance(group_name, str) else group_name
1854      | -                         for q in all_queues:
1855      | -                             q_bytes = q.encode() if isinstance(q, str) else q
1856      | -                             try:
1857      | -                                 await self.async_binary_redis_client.xack(q_bytes, group_name_bytes, *skip_message_ids)
1858      | -                             except:
1859      | -                                 pass  # Ignore ACK errors
1860      | -                         logger.debug(f"Task {task_name} batch ACKed {len(skip_message_ids)} skipped messages")
1861      | -
1862      | -                     # Batch-update each queue's max read offset (all queues share the same group_name)
1863      | -                     if max_offsets_per_queue:
1864      | -                         for queue_name, max_offset in max_offsets_per_queue.items():
1865      | -                             asyncio.create_task(self._update_read_offset(queue_name, group_name, max_offset))
1866      | -                         logger.debug(f"Updated read offsets for {len(max_offsets_per_queue)} queues")
     1977 | +                             # No data field, skip the message
     1978 | +                             should_process = False
     1979 | +                     except Exception as e:
     1980 | +                         logger.error(f"Task {task_name}: Error parsing message data: {e}")
1867 1981 |
1868      | -
1869      | -
1870      | -
1871      | -
1872      | -
1873      | -
1874      | -
1875      | -
1876      | -
     1982 | +                     if should_process:
     1983 | +                         # Process the message - it is put onto the queue, then handled and ACKed by the executor
     1984 | +                         # Use the actual queue name from the message body (may include a priority)
     1985 | +                         actual_queue = event_data.get('queue', queue)
     1986 | +
     1987 | +                         # Unified group_name architecture: all queues (including priority queues) use the same consumer name
     1988 | +                         # No longer necessary to add a suffix for priority queues
     1989 | +                         result = self._process_message_common(
     1990 | +                             event_id, event_data, actual_queue, task_event_queue,
     1991 | +                             is_async=True, consumer_name=consumer_name, group_name=group_name
     1992 | +                         )
     1993 | +                         if isinstance(result, tuple) and result[0] == 'async_put':
     1994 | +                             await self._async_put_task(task_event_queue, result[1])
     1995 | +                             logger.debug(f"Put task {event_id} into task_event_queue")
     1996 | +                             # Note: no ACK here; the executor ACKs after it finishes processing
     1997 | +                     else:
     1998 | +                         # Message does not belong to the current task; collect it for batch ACK
     1999 | +                         skip_message_ids.append(event_id)
     2000 | +
     2001 | +
     2002 | +                     # Batch-ACK the unneeded messages (all queues share the same group_name)
     2003 | +                     if skip_message_ids:
     2004 | +                         group_name_bytes = group_name.encode() if isinstance(group_name, str) else group_name
     2005 | +                         for q in all_queues:
     2006 | +                             q_bytes = q.encode() if isinstance(q, str) else q
1877 2007 |                               try:
1878      | -
1879      | -
1880      | -
1881      | -
1882      | -
1883      | -
1884      | -
1885      | -
1886      | -
1887      | -
1888      | -                         # Recreation succeeded, reset the error counter
1889      | -                         consecutive_errors = 0
1890      | -                         continue
1891      | -                     except Exception as create_error:
1892      | -                         logger.error(f"Failed to recreate consumer groups for {task_name}: {create_error}")
     2008 | +                                 await self.async_binary_redis_client.xack(q_bytes, group_name_bytes, *skip_message_ids)
     2009 | +                             except:
     2010 | +                                 pass  # Ignore ACK errors
     2011 | +                         logger.debug(f"Task {task_name} batch ACKed {len(skip_message_ids)} skipped messages")
     2012 | +
     2013 | +                     # Batch-update each queue's max read offset (all queues share the same group_name)
     2014 | +                     if max_offsets_per_queue:
     2015 | +                         for queue_name, max_offset in max_offsets_per_queue.items():
     2016 | +                             asyncio.create_task(self._update_read_offset(queue_name, group_name, max_offset))
     2017 | +                         logger.debug(f"Updated read offsets for {len(max_offsets_per_queue)} queues")
1893 2018 |
1894      | -
1895      | -
1896      | -
     2019 | +             except Exception as e:
     2020 | +                 error_msg = str(e)
     2021 | +                 # import traceback
     2022 | +                 # traceback.print_exc()
     2023 | +                 logger.error(f"Error in task listener {task_name}: {e}")
     2024 | +
     2025 | +                 # Special handling: on a NOGROUP error, try to recreate the consumer groups
     2026 | +                 if "NOGROUP" in error_msg:
     2027 | +                     logger.warning(f"Detected NOGROUP error for {task_name}, attempting to recreate consumer groups...")
     2028 | +                     try:
     2029 | +                         # Create the consumer group for every queue and record group_info (using the unified helper)
     2030 | +                         for q in all_queues:
     2031 | +                             await self._ensure_consumer_group_and_record_info(q, task_name, consumer_name)
     2032 | +                         logger.info(f"Recreated consumer groups for {len(all_queues)} queues for task {task_name}")
     2033 | +
     2034 | +                         # Re-initialize lastid and check_backlog for all queues
     2035 | +                         for q in all_queues:
     2036 | +                             lastid[q] = "0"
     2037 | +                             check_backlog[q] = True
     2038 | +
     2039 | +                         # Recreation succeeded, reset the error counter
1897 2040 |                           consecutive_errors = 0
1898      | -
1899      | -
1900      | -
     2041 | +                         continue
     2042 | +                     except Exception as create_error:
     2043 | +                         logger.error(f"Failed to recreate consumer groups for {task_name}: {create_error}")
     2044 | +
     2045 | +                 consecutive_errors += 1
     2046 | +                 if consecutive_errors >= max_consecutive_errors:
     2047 | +                     logger.error(f"Too many errors for task {task_name}, restarting...")
     2048 | +                     consecutive_errors = 0
     2049 | +                 await asyncio.sleep(min(consecutive_errors, 5))
     2050 | +
     2051 | +
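The NOGROUP branch added above recreates the consumer groups and resets `lastid`/`check_backlog` to `"0"` before continuing the read loop. A minimal, self-contained sketch of the same recovery pattern against a plain Redis Stream, using `redis.asyncio` directly; the stream, group, and consumer names here are illustrative, not jettask's:

```python
# Sketch only: recover from a NOGROUP error by recreating the consumer group
# and re-reading the backlog, mirroring the lastid reset in the diff above.
import redis.asyncio as redis
from redis import exceptions as redis_exceptions


async def read_with_group_recovery(r: redis.Redis, stream: str, group: str, consumer: str):
    try:
        # ">" asks only for messages never delivered to this group
        return await r.xreadgroup(group, consumer, {stream: ">"}, count=10, block=1000)
    except redis_exceptions.ResponseError as e:
        if "NOGROUP" not in str(e):
            raise
        # Group (or stream) is gone: recreate it, then restart from the backlog ("0")
        try:
            await r.xgroup_create(stream, group, id="0", mkstream=True)
        except redis_exceptions.ResponseError:
            pass  # another worker may have recreated it concurrently
        return await r.xreadgroup(group, consumer, {stream: "0"}, count=10, block=1000)
```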
     2052 | +     async def _listen_queues(self, queues: List[str], event_queue: dict, prefetch_multiplier: int):
     2053 | +         """
     2054 | +         Start listener tasks for the specified queues
     2055 | +
     2056 | +         Args:
     2057 | +             queues: list of queues to listen on
     2058 | +             event_queue: dict[str, asyncio.Queue] - queues isolated per task_name
     2059 | +             prefetch_multiplier: prefetch multiplier
     2060 | +
     2061 | +         Returns:
     2062 | +             List[asyncio.Task]: the created listener tasks
     2063 | +         """
1901 2064 |           tasks = []
1902      | -
     2065 | +
1903 2066 |           if not (self.app and hasattr(self.app, '_tasks_by_queue')):
1904 2067 |               raise RuntimeError("No app or tasks registered, cannot start listeners")
1905      | -
1906      | -         # Register a delayed-task callback for each queue
1907      | -         async def handle_expired_tasks(queue: str, tasks: list):
1908      | -             """Handle delayed tasks that have come due"""
1909      | -             if tasks:
1910      | -                 async with self._delayed_tasks_locks[queue]:
1911      | -                     self._delayed_tasks_lists[queue].extend(tasks)
1912 2068 |
1913      | -
     2069 | +         # Initialize the delayed-task list for each queue (if not already present)
     2070 | +         for queue in queues:
     2071 | +             if queue not in self._delayed_tasks_lists:
     2072 | +                 self._delayed_tasks_lists[queue] = []
     2073 | +                 self._delayed_tasks_locks[queue] = asyncio.Lock()
     2074 | +
     2075 | +         # Register the delayed-scanner callback for each queue
     2076 | +         for queue in queues:
1914 2077 |               # Create a queue-specific callback function
1915 2078 |               import functools
1916      | -             callback = functools.partial(handle_expired_tasks, queue)
     2079 | +             callback = functools.partial(self.handle_expired_tasks, queue)
1917 2080 |               self.delayed_scanner.register_callback(queue, callback)
1918 2081 |
1919      | -         #
1920      | -         await self.delayed_scanner.
1921      | -         logger.info(f"
1922      | -
     2082 | +         # Add the queues to the delayed message scanner
     2083 | +         await self.delayed_scanner.add_queues(queues)
     2084 | +         logger.info(f"Added queues to delayed message scanner: {queues}")
     2085 | +
1923 2086 |           # Start an offline-worker processor for each queue (with auto-restart)
1924 2087 |           # Including priority queues
1925      | -         all_recovery_queues = set(
1926      | -         for base_queue in
     2088 | +         all_recovery_queues = set(queues)
     2089 | +         for base_queue in queues:
1927 2090 |               # Scan priority queues
1928 2091 |               priority_queues = await self.scan_priority_queues(base_queue)
1929 2092 |               for pq in priority_queues:
1930 2093 |                   if pq != base_queue:  # Don't add the base queue twice
1931 2094 |                       all_recovery_queues.add(pq)
1932      | -
1933      | -         # ✅ Before starting the offline-worker processors, first trigger one round of "self-recovery"
1934      | -         # This handles the "reused worker ID" scenario:
1935      | -         # - After a worker reuses an offline worker's ID, it immediately goes online
1936      | -         # - But at that point the worker's previous pending messages have not been recovered yet
1937      | -         # - The periodic scan only looks for workers with is_alive=false, so it misses a freshly reused worker
1938      | -         # - So the "current worker's" pending messages must be recovered proactively at startup
1939      | -         # logger.info("[Recovery] Performing initial self-recovery check on startup...")
1940      | -         # try:
1941      | -         #     await self._perform_self_recovery(all_recovery_queues, event_queue)
1942      | -         # except Exception as e:
1943      | -         #     logger.error(f"Error during initial self-recovery: {e}", exc_info=True)
1944 2095 |
1945 2096 |           # Start the offline-worker processor for all queues (including priority queues)
1946 2097 |           for queue in all_recovery_queues:
1947 2098 |               logger.debug(f"Starting offline worker processor for queue: {queue}")
1948 2099 |               offline_processor_task = asyncio.create_task(
1949      | -                 self._start_offline_worker_processor_with_restart(queue)
     2100 | +                 self._start_offline_worker_processor_with_restart(queue)
1950 2101 |               )
1951 2102 |               tasks.append(offline_processor_task)
1952 2103 |               self._background_tasks.append(offline_processor_task)
1953      | -
1954      | -         #
1955      | -         for queue in
1956      | -
     2104 | +
     2105 | +         # Create an independent listener for each task
     2106 | +         for queue in queues:
     2107 | +             # Use the utility helper to find matching tasks
     2108 | +             from jettask.utils.queue_matcher import find_matching_tasks
     2109 | +
     2110 | +             task_names = find_matching_tasks(queue, self.app._tasks_by_queue, self.wildcard_mode)
     2111 | +
     2112 | +             if task_names and self.wildcard_mode:
     2113 | +                 # Log wildcard matches (only in wildcard mode and when tasks are found)
     2114 | +                 if queue not in self.app._tasks_by_queue:
     2115 | +                     logger.info(f"队列 '{queue}' 通过通配符匹配找到任务: {task_names}")
     2116 | +
1957 2117 |               if not task_names:
1958      | -
1959      | -
     2118 | +                 # In wildcard mode, a queue with no tasks is not an error (it may be a dynamically discovered queue)
     2119 | +                 if not self.wildcard_mode:
     2120 | +                     raise RuntimeError(f"No tasks registered for queue '{queue}'. Cannot start worker without tasks.")
     2121 | +                 else:
     2122 | +                     logger.debug(f"No tasks registered for queue '{queue}', skipping...")
     2123 | +                 continue
     2124 | +
1960 2125 |               for task_name in task_names:
1961      | -                 logger.
1962      | -                 task = asyncio.create_task(listen_event_by_task(queue, task_name))
     2126 | +                 logger.info(f"Starting listener for task: {task_name} on queue: {queue}")
     2127 | +                 task = asyncio.create_task(self.listen_event_by_task(event_queue, queue, task_name, prefetch_multiplier))
1963 2128 |                   tasks.append(task)
1964 2129 |                   self._background_tasks.append(task)
1965      | -
     2130 | +
     2131 | +         return tasks
     2132 | +
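The rewrite above replaces the per-call local `handle_expired_tasks` closure with a method and binds the queue name via `functools.partial` before registering it on the scanner. A minimal sketch of that callback-binding pattern; `scanner.register_callback` is assumed to match the interface shown in the diff, and everything else is illustrative:

```python
import asyncio
import functools


class DelayedTaskBuffer:
    """Illustrative stand-in for the per-queue lists/locks kept on the event pool."""

    def __init__(self) -> None:
        self._lists: dict[str, list] = {}
        self._locks: dict[str, asyncio.Lock] = {}

    def ensure_queue(self, queue: str) -> None:
        # Mirrors the "initialize if not already present" step in the diff
        self._lists.setdefault(queue, [])
        self._locks.setdefault(queue, asyncio.Lock())

    async def handle_expired_tasks(self, queue: str, tasks: list) -> None:
        # The scanner only supplies the expired tasks; the queue is pre-bound below
        if tasks:
            async with self._locks[queue]:
                self._lists[queue].extend(tasks)


def register_callbacks(buffer: DelayedTaskBuffer, scanner, queues: list[str]) -> None:
    for queue in queues:
        buffer.ensure_queue(queue)
        scanner.register_callback(queue, functools.partial(buffer.handle_expired_tasks, queue))
```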
     2133 | +     async def _dynamic_queue_discovery(self, wildcard_patterns: List[str], event_queue: dict, prefetch_multiplier: int, interval: float = 5.0):
     2134 | +         """
     2135 | +         Background task for dynamic queue discovery
     2136 | +
     2137 | +         Args:
     2138 | +             wildcard_patterns: list of wildcard patterns
     2139 | +             event_queue: dict of task event queues
     2140 | +             prefetch_multiplier: prefetch multiplier
     2141 | +             interval: check interval (seconds)
     2142 | +         """
     2143 | +         logger.info(f"启动动态队列发现任务,通配符模式: {wildcard_patterns}, 检查间隔: {interval}秒")
     2144 | +
     2145 | +         while not self._stop_reading:
     2146 | +             try:
     2147 | +                 # Call the queue-discovery method, which returns newly discovered queues
     2148 | +                 new_queues = await self.discover_and_update_queues(wildcard_patterns)
     2149 | +
     2150 | +                 if new_queues:
     2151 | +                     # logger.info(f"发现新队列: {new_queues}")
     2152 | +                     # Start listening on the new queues
     2153 | +                     new_tasks = await self._listen_queues(new_queues, event_queue, prefetch_multiplier)
     2154 | +                     # logger.info(f"已为 {len(new_queues)} 个新队列启动监听,创建了 {len(new_tasks)} 个任务")
     2155 | +
     2156 | +                 # Wait for the next check
     2157 | +                 await asyncio.sleep(interval)
     2158 | +
     2159 | +             except asyncio.CancelledError:
     2160 | +                 logger.info("动态队列发现任务已取消")
     2161 | +                 break
     2162 | +             except Exception as e:
     2163 | +                 logger.error(f"动态队列发现出错: {e}", exc_info=True)
     2164 | +                 await asyncio.sleep(interval)
     2165 | +
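`_dynamic_queue_discovery` polls `discover_and_update_queues` and feeds any new queues back into `_listen_queues`. A rough sketch of what such a discovery step can look like with a wildcard match over Redis keys; the `{prefix}:QUEUE:` naming follows the stream-name handling earlier in this diff, while the helper names and matching logic are assumptions, not jettask's implementation:

```python
import asyncio
import fnmatch
import redis.asyncio as redis


async def discover_new_queues(r: redis.Redis, prefix: str, patterns: list[str], known: set[str]) -> list[str]:
    """Return queue names matching any wildcard pattern that have not been seen before."""
    found: list[str] = []
    async for key in r.scan_iter(match=f"{prefix}:QUEUE:*", count=500):
        name = key.decode() if isinstance(key, bytes) else key
        name = name.split(f"{prefix}:QUEUE:", 1)[1]
        if name not in known and any(fnmatch.fnmatch(name, p) for p in patterns):
            known.add(name)
            found.append(name)
    return found


async def discovery_loop(r: redis.Redis, prefix: str, patterns: list[str], start_listening, interval: float = 5.0):
    """Poll for new queues and hand them to a start_listening(queues) coroutine."""
    known: set[str] = set()
    while True:
        try:
            new_queues = await discover_new_queues(r, prefix, patterns, known)
            if new_queues:
                await start_listening(new_queues)
            await asyncio.sleep(interval)
        except asyncio.CancelledError:
            break
```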
     2166 | +     async def listening_event(self, event_queue: dict, prefetch_multiplier: int = 1):
     2167 | +         """Listen for events - create an independent consumer group for each task
     2168 | +
     2169 | +         Args:
     2170 | +             event_queue: dict[str, asyncio.Queue] - queues isolated per task_name
     2171 | +             prefetch_multiplier: prefetch multiplier
     2172 | +         """
     2173 | +         # Validate argument types
     2174 | +         if not isinstance(event_queue, dict):
     2175 | +             raise TypeError(f"event_queue must be a dict[str, asyncio.Queue], got {type(event_queue)}")
     2176 | +
     2177 | +         # Keep a reference to the event_queue dict for event-driven recovery
     2178 | +         self._event_queue_dict = event_queue
     2179 | +
     2180 | +         logger.info(f"Using task-isolated event queue mode for tasks: {list(event_queue.keys())}")
     2181 | +
     2182 | +         # Keep all created tasks so they can be cancelled during cleanup
     2183 | +         self._background_tasks = []
     2184 | +
     2185 | +         logger.info(f"静态队列: {self.queues}, 通配符模式: {self.wildcard_patterns}")
     2186 | +
     2187 | +         # Create the delayed-task dicts
     2188 | +         self._delayed_tasks_lists = {}
     2189 | +         self._delayed_tasks_locks = {}
     2190 | +
     2191 | +         # Start the delayed message scanner (with the static queues first; more are added dynamically later)
     2192 | +         await self.delayed_scanner.start(self.queues)
     2193 | +         logger.info(f"Delayed message scanner started for static queues: {self.queues}")
     2194 | +
     2195 | +         tasks = []
     2196 | +
     2197 | +         # 1. First start listening on the static queues (those without wildcards)
     2198 | +         if self.queues:
     2199 | +             static_tasks = await self._listen_queues(self.queues, event_queue, prefetch_multiplier)
     2200 | +             tasks.extend(static_tasks)
     2201 | +             logger.info(f"已为 {len(self.queues)} 个静态队列启动监听")
     2202 | +
     2203 | +         # 2. If there are wildcard patterns, start the dynamic queue-discovery task
     2204 | +         if self.wildcard_patterns:
     2205 | +             discovery_task = asyncio.create_task(
     2206 | +                 self._dynamic_queue_discovery(self.wildcard_patterns, event_queue, prefetch_multiplier, interval=5.0)
     2207 | +             )
     2208 | +             tasks.append(discovery_task)
     2209 | +             self._background_tasks.append(discovery_task)
     2210 | +             logger.info(f"已启动动态队列发现任务,通配符模式: {self.wildcard_patterns}")
     2211 | +
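Since `listening_event` now requires a dict keyed by task name rather than a single queue, a caller has to build one `asyncio.Queue` per registered task before starting the pool. A hedged usage sketch; the way task names are derived from `app._tasks_by_queue` here is an assumption based on the mapping used above, not a documented jettask API:

```python
import asyncio


async def run_listeners(pool) -> None:
    # One asyncio.Queue per task name, matching the dict[str, asyncio.Queue]
    # shape that listening_event validates at the top of the method.
    task_names = {name for names in pool.app._tasks_by_queue.values() for name in names}
    event_queue = {name: asyncio.Queue() for name in task_names}
    await pool.listening_event(event_queue, prefetch_multiplier=1)
```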
1966 2212 |           try:
1967 2213 |               # Wait for all tasks
1968 2214 |               await asyncio.gather(*tasks)

@@ -1987,9 +2233,6 @@ class EventPool(object):

1987 2233 |               logger.debug("Some background tasks did not complete in time")
1988 2234 |               raise
1989 2235 |
1990      | -         # Note: the delayed-task scanning logic has moved into the standalone DelayedMessageScanner module
1991      | -         # The old _scan_and_load_delayed_tasks_to_list and _scan_and_load_delayed_tasks methods have been removed
1992      | -
1993 2236 |       async def _claim_delayed_tasks(self, queue: str, event_queue: asyncio.Queue, prefetch_multiplier: int):
1994 2237 |           """Handle expired tasks from the delayed queue"""
1995 2238 |           try:
|