jettask 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- jettask/constants.py +213 -0
- jettask/core/app.py +525 -205
- jettask/core/cli.py +193 -185
- jettask/core/consumer_manager.py +126 -34
- jettask/core/context.py +3 -0
- jettask/core/enums.py +137 -0
- jettask/core/event_pool.py +501 -168
- jettask/core/message.py +147 -0
- jettask/core/offline_worker_recovery.py +181 -114
- jettask/core/task.py +10 -174
- jettask/core/task_batch.py +153 -0
- jettask/core/unified_manager_base.py +243 -0
- jettask/core/worker_scanner.py +54 -54
- jettask/executors/asyncio.py +184 -64
- jettask/webui/backend/config.py +51 -0
- jettask/webui/backend/data_access.py +2083 -92
- jettask/webui/backend/data_api.py +3294 -0
- jettask/webui/backend/dependencies.py +261 -0
- jettask/webui/backend/init_meta_db.py +158 -0
- jettask/webui/backend/main.py +1358 -69
- jettask/webui/backend/main_unified.py +78 -0
- jettask/webui/backend/main_v2.py +394 -0
- jettask/webui/backend/namespace_api.py +295 -0
- jettask/webui/backend/namespace_api_old.py +294 -0
- jettask/webui/backend/namespace_data_access.py +611 -0
- jettask/webui/backend/queue_backlog_api.py +727 -0
- jettask/webui/backend/queue_stats_v2.py +521 -0
- jettask/webui/backend/redis_monitor_api.py +476 -0
- jettask/webui/backend/unified_api_router.py +1601 -0
- jettask/webui/db_init.py +204 -32
- jettask/webui/frontend/package-lock.json +492 -1
- jettask/webui/frontend/package.json +4 -1
- jettask/webui/frontend/src/App.css +105 -7
- jettask/webui/frontend/src/App.jsx +49 -20
- jettask/webui/frontend/src/components/NamespaceSelector.jsx +166 -0
- jettask/webui/frontend/src/components/QueueBacklogChart.jsx +298 -0
- jettask/webui/frontend/src/components/QueueBacklogTrend.jsx +638 -0
- jettask/webui/frontend/src/components/QueueDetailsTable.css +65 -0
- jettask/webui/frontend/src/components/QueueDetailsTable.jsx +487 -0
- jettask/webui/frontend/src/components/QueueDetailsTableV2.jsx +465 -0
- jettask/webui/frontend/src/components/ScheduledTaskFilter.jsx +423 -0
- jettask/webui/frontend/src/components/TaskFilter.jsx +425 -0
- jettask/webui/frontend/src/components/TimeRangeSelector.css +21 -0
- jettask/webui/frontend/src/components/TimeRangeSelector.jsx +160 -0
- jettask/webui/frontend/src/components/layout/AppLayout.css +95 -0
- jettask/webui/frontend/src/components/layout/AppLayout.jsx +49 -0
- jettask/webui/frontend/src/components/layout/Header.css +34 -10
- jettask/webui/frontend/src/components/layout/Header.jsx +31 -23
- jettask/webui/frontend/src/components/layout/SideMenu.css +137 -0
- jettask/webui/frontend/src/components/layout/SideMenu.jsx +209 -0
- jettask/webui/frontend/src/components/layout/TabsNav.css +244 -0
- jettask/webui/frontend/src/components/layout/TabsNav.jsx +206 -0
- jettask/webui/frontend/src/components/layout/UserInfo.css +197 -0
- jettask/webui/frontend/src/components/layout/UserInfo.jsx +197 -0
- jettask/webui/frontend/src/contexts/NamespaceContext.jsx +72 -0
- jettask/webui/frontend/src/contexts/TabsContext.backup.jsx +245 -0
- jettask/webui/frontend/src/main.jsx +1 -0
- jettask/webui/frontend/src/pages/Alerts.jsx +684 -0
- jettask/webui/frontend/src/pages/Dashboard.jsx +1330 -0
- jettask/webui/frontend/src/pages/QueueDetail.jsx +1109 -10
- jettask/webui/frontend/src/pages/QueueMonitor.jsx +236 -115
- jettask/webui/frontend/src/pages/Queues.jsx +5 -1
- jettask/webui/frontend/src/pages/ScheduledTasks.jsx +809 -0
- jettask/webui/frontend/src/pages/Settings.jsx +800 -0
- jettask/webui/frontend/src/services/api.js +7 -5
- jettask/webui/frontend/src/utils/suppressWarnings.js +22 -0
- jettask/webui/frontend/src/utils/userPreferences.js +154 -0
- jettask/webui/multi_namespace_consumer.py +543 -0
- jettask/webui/pg_consumer.py +983 -246
- jettask/webui/static/dist/assets/index-7129cfe1.css +1 -0
- jettask/webui/static/dist/assets/index-8d1935cc.js +774 -0
- jettask/webui/static/dist/index.html +2 -2
- jettask/webui/task_center.py +216 -0
- jettask/webui/task_center_client.py +150 -0
- jettask/webui/unified_consumer_manager.py +193 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/METADATA +1 -1
- jettask-0.2.4.dist-info/RECORD +134 -0
- jettask/webui/pg_consumer_slow.py +0 -1099
- jettask/webui/pg_consumer_test.py +0 -678
- jettask/webui/static/dist/assets/index-823408e8.css +0 -1
- jettask/webui/static/dist/assets/index-9968b0b8.js +0 -543
- jettask/webui/test_pg_consumer_recovery.py +0 -547
- jettask/webui/test_recovery_simple.py +0 -492
- jettask/webui/test_self_recovery.py +0 -467
- jettask-0.2.1.dist-info/RECORD +0 -91
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/WHEEL +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/top_level.txt +0 -0
jettask/webui/pg_consumer.py
CHANGED
@@ -4,9 +4,10 @@
 import asyncio
 import json
 import logging
+import msgpack
 import os
 import time
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Set
 from datetime import datetime, timezone
 from collections import defaultdict
 
@@ -19,22 +20,36 @@ from sqlalchemy import text
 from jettask.webui.config import PostgreSQLConfig, RedisConfig
 from jettask.core.consumer_manager import ConsumerManager, ConsumerStrategy
 from jettask.core.offline_worker_recovery import OfflineWorkerRecovery
+from jettask.constants import is_internal_consumer, TASK_STATUS_PRIORITY
 
 logger = logging.getLogger(__name__)
 
+# Debug file writing is commented out to avoid permission issues
+# logger_f = open(f'./pg_consumer.txt', 'a+')
 
+# Use the task status priorities defined in constants.py
+# STATUS_PRIORITY is imported from constants.py as TASK_STATUS_PRIORITY
 class PostgreSQLConsumer:
-    """PostgreSQL consumer that reads tasks from Redis queues and persists them to PostgreSQL
+    """PostgreSQL consumer that reads tasks from Redis queues and persists them to PostgreSQL.
+
+    Supports multi-tenant (namespace) isolation.
+    """
 
     def __init__(self, pg_config: PostgreSQLConfig, redis_config: RedisConfig, prefix: str = "jettask",
-                 node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT
+                 node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT,
+                 namespace_id: str = None, namespace_name: str = None,
+                 enable_backlog_monitor: bool = True, backlog_monitor_interval: int = 1):
         self.pg_config = pg_config
         self.redis_config = redis_config
         self.prefix = prefix
+
+        # Namespace support
+        self.namespace_id = namespace_id
+        self.namespace_name = namespace_name or "default"
         self.redis_client: Optional[Redis] = None
         self.async_engine = None
         self.AsyncSessionLocal = None
-        self.consumer_group = f"{prefix}
+        self.consumer_group = f"{prefix}_pg_consumer"
 
         # Node identifier
         import socket
@@ -69,28 +84,42 @@ class PostgreSQLConsumer:
         self.min_batch_size = 500
         self.max_batch_size = 5000
 
+        # Stream backlog monitoring configuration
+        self.enable_backlog_monitor = enable_backlog_monitor  # whether backlog monitoring is enabled
+        self.backlog_monitor_interval = backlog_monitor_interval  # collection interval (seconds)
+        self.backlog_monitor_lock_key = f"{prefix}:BACKLOG_MONITOR_LOCK"  # distributed lock key
+        self.backlog_monitor_lock_ttl = backlog_monitor_interval * 2  # lock TTL (seconds), twice the collection interval
+
     async def start(self):
         """Start the consumer"""
         logger.info(f"Starting PostgreSQL consumer (simplified) on node: {self.node_id}")
 
         # Connect to Redis
-
-
-
-
-
-            decode_responses
-
+        # Build the connection parameters; only pass the password when it is non-empty
+        async_redis_params = {
+            'host': self.redis_config.host,
+            'port': self.redis_config.port,
+            'db': self.redis_config.db,
+            'decode_responses': False
+        }
+        if self.redis_config.password:
+            async_redis_params['password'] = self.redis_config.password
+
+        self.redis_client = await redis.Redis(**async_redis_params)
 
         # Initialize ConsumerManager (requires a synchronous Redis client)
         import redis as sync_redis
-
-
-
-
-
-            decode_responses
-
+        # Build the connection parameters; only pass the password when it is non-empty
+        sync_redis_params = {
+            'host': self.redis_config.host,
+            'port': self.redis_config.port,
+            'db': self.redis_config.db,
+            'decode_responses': True  # string mode, consistent with the other components
+        }
+        if self.redis_config.password:
+            sync_redis_params['password'] = self.redis_config.password
+
+        sync_redis_client = sync_redis.StrictRedis(**sync_redis_params)
 
         # Configure ConsumerManager
         # The initial queue list contains TASK_CHANGES; other queues are added dynamically
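The hunk above only includes the Redis password in the connection kwargs when one is actually configured, so an empty password does not trigger AUTH against a password-less server. A minimal standalone sketch of that pattern, assuming an illustrative config object (not part of the package):

import redis.asyncio as redis

def build_redis_kwargs(cfg, decode_responses: bool = False) -> dict:
    # Only add 'password' when it is non-empty.
    params = {
        "host": cfg.host,
        "port": cfg.port,
        "db": cfg.db,
        "decode_responses": decode_responses,
    }
    if cfg.password:
        params["password"] = cfg.password
    return params

# client = redis.Redis(**build_redis_kwargs(cfg))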
@@ -109,7 +138,7 @@ class PostgreSQLConsumer:
 
         # Get a stable consumer_id (using TASK_CHANGES as the reference queue)
         self.consumer_id = self.consumer_manager.get_consumer_name('TASK_CHANGES')
-        logger.
+        logger.debug(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
 
         # Create the SQLAlchemy async engine
         if self.pg_config.dsn.startswith('postgresql://'):
@@ -127,7 +156,7 @@ class PostgreSQLConsumer:
         )
 
         # Warm up the connection pool
-        logger.
+        logger.debug("Pre-warming database connection pool...")
         async with self.async_engine.begin() as conn:
             await conn.execute(text("SELECT 1"))
 
@@ -138,8 +167,6 @@ class PostgreSQLConsumer:
             expire_on_commit=False
         )
 
-        # Initialize the database schema
-        await self._init_database()
 
         self._running = True
 
@@ -163,16 +190,23 @@ class PostgreSQLConsumer:
             asyncio.create_task(self._start_offline_recovery())  # offline worker recovery service
         ]
 
+        # If backlog monitoring is enabled, add the monitoring task
+        if self.enable_backlog_monitor:
+            self._tasks.append(
+                asyncio.create_task(self._stream_backlog_monitor())  # stream backlog monitor
+            )
+            logger.info(f"Stream backlog monitor enabled with {self.backlog_monitor_interval}s interval")
+
         # With the HEARTBEAT strategy, ConsumerManager manages the heartbeat automatically
         if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and self.consumer_manager:
             # Start the heartbeat (handled inside ConsumerManager)
-            logger.
+            logger.debug("Heartbeat is managed by ConsumerManager")
 
-        logger.
+        logger.debug("PostgreSQL consumer started successfully")
 
     async def stop(self):
         """Stop the consumer"""
-        logger.
+        logger.debug("Stopping PostgreSQL consumer...")
         self._running = False
 
         # Stop the offline recovery service
@@ -190,7 +224,7 @@ class PostgreSQLConsumer:
         if self.consumer_manager:
             try:
                 self.consumer_manager.cleanup()
-                logger.
+                logger.debug(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
             except Exception as e:
                 logger.error(f"Error cleaning up ConsumerManager: {e}")
 
@@ -201,38 +235,30 @@ class PostgreSQLConsumer:
         if self.async_engine:
             await self.async_engine.dispose()
 
-        logger.
-
-    async def _init_database(self):
-        """Initialize the database schema"""
-        # Use a path relative to the current file
-        import os
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        schema_path = os.path.join(current_dir, "schema.sql")
-        try:
-            with open(schema_path, 'r') as f:
-                schema_sql = f.read()
-
-            async with self.AsyncSessionLocal() as session:
-                await session.execute(text(schema_sql))
-                await session.commit()
-                logger.info("Database schema initialized")
-        except FileNotFoundError:
-            logger.warning(f"Schema file not found at {schema_path}, skipping initialization")
-        except Exception as e:
-            logger.error(f"Failed to initialize database schema: {e}")
-
+        logger.debug("PostgreSQL consumer stopped")
+
     async def _initial_queue_discovery(self):
         """Initial queue discovery, executed once at startup"""
         try:
             pattern = f"{self.prefix}:QUEUE:*"
             new_queues = set()
+            logger.info(f"Starting initial queue discovery with pattern: {pattern}")
 
             async for key in self.redis_client.scan_iter(match=pattern, count=100):
-
-
+                # Extract the queue name from the key; possible formats:
+                # - prefix:QUEUE:queue_name (regular queue)
+                # - prefix:QUEUE:queue_name:priority (priority queue)
+                key_str = key.decode('utf-8')
+                parts = key_str.split(":")
+                if len(parts) >= 3:
+                    # Drop the prefix and the QUEUE segment
+                    queue_parts = parts[2:]  # the queue name starts at the 3rd segment
+                    queue_name = ":".join(queue_parts)  # re-join, keeping the priority part
+                    new_queues.add(queue_name)
+                    logger.info(f"Found queue: {queue_name} from key: {key_str}")
 
             if new_queues:
+                logger.info(f"Initial queue discovery found {len(new_queues)} queues: {new_queues}")
                 # Merge all queues: TASK_CHANGES plus the dynamically discovered queues
                 all_queues = list(new_queues) + ['TASK_CHANGES']
 
@@ -256,7 +282,7 @@ class PostgreSQLConsumer:
                     'queues',
                     ','.join(all_queues)
                 )
-                logger.
+                logger.debug(f"Initial queue discovery - found queues: {all_queues}")
         except Exception as e:
             logger.error(f"Error updating initial worker queues: {e}")
 
@@ -271,24 +297,34 @@ class PostgreSQLConsumer:
         try:
             pattern = f"{self.prefix}:QUEUE:*"
             new_queues = set()
-
+            # logger.info(f'{pattern=}')
             async for key in self.redis_client.scan_iter(match=pattern, count=100):
-
-
+                # Extract the queue name from the key; possible formats:
+                # - prefix:QUEUE:queue_name (regular queue)
+                # - prefix:QUEUE:queue_name:priority (priority queue)
+                key_str = key.decode('utf-8')
+                parts = key_str.split(":")
+                if len(parts) >= 3:
+                    # Drop the prefix and the QUEUE segment
+                    queue_parts = parts[2:]  # the queue name starts at the 3rd segment
+                    queue_name = ":".join(queue_parts)  # re-join, keeping the priority part
+                    new_queues.add(queue_name)
 
             # Create consumer groups for newly discovered queues
             for queue in new_queues - self._known_queues:
+                # Build the stream_key correctly, keeping the priority part
                 stream_key = f"{self.prefix}:QUEUE:{queue}"
                 try:
                     await self.redis_client.xgroup_create(
                         stream_key, self.consumer_group, id='0', mkstream=True
                     )
-                    logger.info(f"Created consumer group for new queue: {queue}")
+                    logger.info(f"Created consumer group for new queue: {queue} with stream_key: {stream_key}")
                 except redis.ResponseError:
                     pass
 
             # Update the ConsumerManager queue list (synchronous operation)
             if new_queues != self._known_queues:
+                logger.info(f"Queue discovery: found {len(new_queues)} queues: {new_queues}")
                 # Merge all queues: TASK_CHANGES plus the dynamically discovered queues
                 all_queues = list(new_queues) + ['TASK_CHANGES']
 
@@ -312,7 +348,7 @@ class PostgreSQLConsumer:
                     'queues',
                     ','.join(all_queues)
                 )
-                logger.
+                logger.debug(f"Updated ConsumerManager queues: {all_queues}")
         except Exception as e:
             logger.error(f"Error updating worker queues: {e}")
 
@@ -326,8 +362,22 @@ class PostgreSQLConsumer:
             await asyncio.sleep(10)
 
     async def _consume_queue(self, queue_name: str):
-        """
-
+        """Consume tasks from a single queue (including priority queues)"""
+        # logger.info(f"Starting to consume queue: {queue_name}")
+        # Determine whether this is a priority queue
+        is_priority_queue = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
+
+        if is_priority_queue:
+            # Priority queue format: base_queue:priority (e.g. robust_bench2:2)
+            base_queue = queue_name.rsplit(':', 1)[0]
+            priority = queue_name.rsplit(':', 1)[1]
+            stream_key = f"{self.prefix}:QUEUE:{base_queue}:{priority}"
+        else:
+            # Regular queue
+            stream_key = f"{self.prefix}:QUEUE:{queue_name}"
+
+        logger.debug(f"Consuming queue: {queue_name}, stream_key: {stream_key}, is_priority: {is_priority_queue}")
+
         check_backlog = True
         lastid = "0-0"
 
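The hunk above treats a trailing numeric segment in the queue name as a priority marker when resolving the stream key. A small, hypothetical helper that restates that convention (the function name and prefix value are illustrative, not part of the package):

def resolve_stream_key(prefix: str, queue_name: str) -> str:
    # "robust_bench2:2" -> priority queue, "robust_bench2" -> regular queue
    is_priority = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
    if is_priority:
        base_queue, priority = queue_name.rsplit(':', 1)
        return f"{prefix}:QUEUE:{base_queue}:{priority}"
    return f"{prefix}:QUEUE:{queue_name}"

assert resolve_stream_key("jettask", "robust_bench2:2") == "jettask:QUEUE:robust_bench2:2"
assert resolve_stream_key("jettask", "robust_bench2") == "jettask:QUEUE:robust_bench2"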
@@ -350,7 +400,6 @@ class PostgreSQLConsumer:
                     count=10000,
                     block=1000 if not check_backlog else 0
                 )
-
                 if not messages or (messages and len(messages[0][1]) == 0):
                     check_backlog = False
                     continue
@@ -369,7 +418,7 @@ class PostgreSQLConsumer:
                         await self.redis_client.xgroup_create(
                             stream_key, self.consumer_group, id='0', mkstream=True
                         )
-                        logger.
+                        logger.debug(f"Recreated consumer group for queue: {queue_name}")
                         check_backlog = True
                         lastid = "0-0"
                     except:
@@ -379,7 +428,7 @@ class PostgreSQLConsumer:
                 self._consecutive_errors[queue_name] += 1
 
                 if self._consecutive_errors[queue_name] > 10:
-                    logger.
+                    logger.debug(f"Too many errors for queue {queue_name}, will retry later")
                     await asyncio.sleep(30)
                     self._consecutive_errors[queue_name] = 0
 
@@ -392,19 +441,18 @@ class PostgreSQLConsumer:
         """Start consumer tasks for all queues"""
         discover_task = asyncio.create_task(self._discover_queues())
         queue_tasks = {}
-
         while self._running:
             try:
                 for queue in self._known_queues:
                     if queue not in queue_tasks or queue_tasks[queue].done():
                         queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
-                        logger.
+                        logger.debug(f"Started consumer task for queue: {queue}")
 
                 for queue in list(queue_tasks.keys()):
                     if queue not in self._known_queues:
                         queue_tasks[queue].cancel()
                         del queue_tasks[queue]
-                        logger.
+                        logger.debug(f"Stopped consumer task for removed queue: {queue}")
 
                 await asyncio.sleep(10)
 
@@ -428,7 +476,6 @@ class PostgreSQLConsumer:
                 continue
 
             stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
-            queue_name = stream_key_str.split(":")[-1]
             msg_ids_to_ack = []
 
             for msg_id, data in stream_messages:
@@ -439,7 +486,7 @@ class PostgreSQLConsumer:
                 msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
 
                 # Parse the message using the shared helper
-                task_info = self._parse_stream_message(msg_id_str, data
+                task_info = self._parse_stream_message(msg_id_str, data)
                 if task_info:
                     tasks_to_insert.append(task_info)
                     msg_ids_to_ack.append(msg_id)
@@ -479,23 +526,92 @@ class PostgreSQLConsumer:
                 logger.error(f"Error executing batch ACK: {e}")
 
     async def _insert_tasks(self, tasks: List[Dict[str, Any]]):
-        """Batch-insert tasks into PostgreSQL"""
+        """Batch-insert tasks into PostgreSQL (tasks table only)"""
         if not tasks:
             return
 
+        logger.info(f"Attempting to insert {len(tasks)} tasks to tasks table")
        try:
            async with self.AsyncSessionLocal() as session:
-
-
-
-
-
-
+                # Insert into the tasks table - a batch INSERT that ignores conflicts
+                # Since stream_id is unique in practice, duplicates can simply be skipped
+                tasks_query = text("""
+                    INSERT INTO tasks (stream_id, queue, namespace, scheduled_task_id,
+                                       payload, priority, created_at, source, metadata)
+                    VALUES (:stream_id, :queue, :namespace, :scheduled_task_id,
+                            CAST(:payload AS jsonb), :priority, :created_at, :source, CAST(:metadata AS jsonb))
+                    ON CONFLICT DO NOTHING
+                    RETURNING stream_id;
                """)
 
-
-
-
+                # Prepare the rows for the tasks table
+                tasks_data = []
+                for task in tasks:
+                    task_data = json.loads(task['task_data'])
+
+                    # Get scheduled_task_id from the task data
+                    scheduled_task_id = task_data.get('scheduled_task_id') or task.get('scheduled_task_id')
+
+                    # Determine the task source from whether scheduled_task_id is present
+                    if scheduled_task_id:
+                        source = 'scheduler'  # scheduled task
+                    else:
+                        source = 'redis_stream'  # regular task
+
+                    tasks_data.append({
+                        'stream_id': task['id'],  # the Redis Stream ID serves as stream_id
+                        'queue': task['queue_name'],
+                        'namespace': self.namespace_name,
+                        'scheduled_task_id': str(scheduled_task_id) if scheduled_task_id else None,
+                        'payload': task['task_data'],  # the full task data
+                        'priority': task['priority'],
+                        'created_at': task['created_at'],
+                        'source': source,
+                        'metadata': task.get('metadata', '{}')
+                    })
+
+                # Batch insert - use executemany for better performance
+                logger.debug(f"Executing batch insert with {len(tasks_data)} tasks")
+
+                try:
+                    # Batch insert via executemany
+                    result = await session.execute(tasks_query, tasks_data)
+
+                    # Number of rows actually inserted
+                    inserted_count = result.rowcount
+
+                    # if inserted_count > 0:
+                    #     logger.info(f"Successfully inserted {inserted_count} new tasks to tasks table")
+                    # else:
+                    #     logger.info(f"No new tasks inserted (all may be duplicates)")
+
+                    await session.commit()
+                    logger.debug("Tasks table batch insert transaction completed")
+
+                except Exception as e:
+                    logger.error(f"Error in batch insert, trying fallback: {e}")
+                    await session.rollback()
+
+                    # If the batch insert fails, fall back to small batches (10 rows each)
+                    batch_size = 10
+                    total_inserted = 0
+
+                    for i in range(0, len(tasks_data), batch_size):
+                        batch = tasks_data[i:i+batch_size]
+                        try:
+                            result = await session.execute(tasks_query, batch)
+                            batch_inserted = result.rowcount
+                            if batch_inserted > 0:
+                                total_inserted += batch_inserted
+                            await session.commit()
+                        except Exception as batch_error:
+                            logger.error(f"Batch {i//batch_size + 1} failed: {batch_error}")
+                            await session.rollback()
+
+                    if total_inserted > 0:
+                        logger.info(f"Fallback insert completed: {total_inserted} tasks inserted")
+                    else:
+                        logger.info(f"No new tasks inserted in fallback mode")
 
         except Exception as e:
             logger.error(f"Error inserting tasks to PostgreSQL: {e}")
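The batch insert above relies on INSERT ... ON CONFLICT DO NOTHING so that replayed stream messages remain idempotent. A trimmed sketch of the same pattern with SQLAlchemy's async session and a simplified, assumed column list (for illustration only):

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

INSERT_SQL = text("""
    INSERT INTO tasks (stream_id, queue, namespace, payload, created_at)
    VALUES (:stream_id, :queue, :namespace, CAST(:payload AS jsonb), :created_at)
    ON CONFLICT DO NOTHING
""")

async def insert_tasks(session: AsyncSession, rows: list[dict]) -> None:
    # executemany-style call: one statement, a list of parameter dicts.
    # Duplicate stream_ids are silently skipped by the conflict clause.
    if not rows:
        return
    await session.execute(INSERT_SQL, rows)
    await session.commit()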
@@ -507,21 +623,21 @@ class PostgreSQLConsumer:
 
         # Use the consumer name managed by ConsumerManager
         # so that ConsumerManager can correctly track and recover this stream's pending messages
-        consumer_name = self.consumer_manager.get_consumer_name('
+        consumer_name = self.consumer_manager.get_consumer_name('TASK_CHANGES')
 
         # Create the consumer group
         try:
             await self.redis_client.xgroup_create(
                 change_stream_key, consumer_group, id='0', mkstream=True
             )
-            logger.
+            logger.debug(f"Created consumer group for task changes stream")
         except redis.ResponseError:
             pass
 
         # Mirror listen_event_by_task: handle pending messages first, then new ones
         check_backlog = True
         lastid = "0-0"
-        batch_size =
+        batch_size = 1000
 
         while self._running:
             try:
@@ -549,8 +665,8 @@ class PostgreSQLConsumer:
                 else:
                     check_backlog = False
 
-
-
+                # Collect message IDs and their corresponding task_id
+                msg_to_task = {}  # msg_id -> task_id mapping
 
                 for _, stream_messages in messages:
                     for msg_id, data in stream_messages:
@@ -561,21 +677,56 @@ class PostgreSQLConsumer:
                         else:
                             lastid = str(msg_id)
 
-
-                        if
-
-
-
-
+                        task_key = data[b'id']
+                        task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
+
+                        # Extract the stream_id from the full task_key
+                        # Format: namespace:TASK:stream_id:queue_name
+                        stream_id = None
+                        if ':TASK:' in task_key:
+                            parts = task_key.split(':TASK:')
+                            if len(parts) == 2:
+                                # Then take the stream_id from the right-hand part
+                                right_parts = parts[1].split(':')
+                                if right_parts:
+                                    stream_id = right_parts[0]  # extract the stream_id
+
+                        if stream_id:
+                            # Store a tuple: (stream_id, task_key)
+                            msg_to_task[msg_id] = (stream_id, task_key)
+                        else:
+                            logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
                     except Exception as e:
-
-
-
-
-
+                        import traceback
+                        traceback.print_exc()
+                        logger.error(f"Error processing change event {msg_id}: {e} {data=}")
+                        # Messages that fail to parse are still ACKed to avoid endless retries
+                        await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
 
-                if
-
+                if msg_to_task:
+                    # Batch-update the tasks; returns the task_ids that were updated successfully
+                    # The values of msg_to_task are now (stream_id, task_key) tuples
+                    id_tuples = list(set(msg_to_task.values()))
+                    logger.info(f"Processing {len(id_tuples)} task updates from change stream")
+                    # logger_f.write(f'{id_tuples=} \n')
+                    successful_tuples = await self._update_tasks_by_event(id_tuples)
+
+                    # Only ACK messages that were updated successfully
+                    ack_ids = []
+                    failed_count = 0
+                    for msg_id, id_tuple in msg_to_task.items():
+                        if successful_tuples and id_tuple in successful_tuples:
+                            ack_ids.append(msg_id)
+                        else:
+                            failed_count += 1
+
+                    if ack_ids:
+                        await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
+                        if len(ack_ids) > 0:
+                            logger.info(f"Updated {len(ack_ids)} task statuses")
+
+                    if failed_count > 0:
+                        logger.debug(f"Failed to update {failed_count} tasks, will retry")
 
             except redis.ResponseError as e:
                 if "NOGROUP" in str(e):
@@ -584,7 +735,7 @@ class PostgreSQLConsumer:
                     await self.redis_client.xgroup_create(
                         change_stream_key, consumer_group, id='0', mkstream=True
                     )
-                    logger.
+                    logger.debug(f"Recreated consumer group for task changes stream")
                     check_backlog = True
                     lastid = "0-0"
                 except:
@@ -596,36 +747,114 @@ class PostgreSQLConsumer:
                 logger.error(f"Error in consume_task_changes: {e}", exc_info=True)
                 await asyncio.sleep(1)
 
-    async def _update_tasks_by_event(self,
-        """Batch-update task status based on event IDs
-
-
+    async def _update_tasks_by_event(self, id_tuples: List[tuple]) -> Set[tuple]:
+        """Batch-update task status based on event IDs
+
+        Args:
+            id_tuples: a list of tuples, each of the form (stream_id, task_key)
+
+        Returns:
+            The set of tuples that were updated successfully
+        """
+        if not id_tuples:
+            return set()
+
+        successful_tuples = set()
 
         try:
             pipeline = self.redis_client.pipeline()
-            for
-                task_key = f"{self.prefix}:TASK:{task_id}"
+            for stream_id, task_key in id_tuples:
                 pipeline.hgetall(task_key)
 
             redis_values = await pipeline.execute()
             updates = []
+            valid_tuples = []  # track the valid tuples
+            if len(id_tuples) != len(redis_values):
+                logger.error(f'Mismatch: {len(id_tuples)=} {len(redis_values)=}')
+                # Do not raise; keep processing whatever can be processed
 
-            for i,
+            for i, (stream_id, task_key) in enumerate(id_tuples):
+                if i >= len(redis_values):
+                    logger.error(f'Missing redis value for task_key={task_key}')
+                    continue
+
                 hash_data = redis_values[i]
 
                 if not hash_data:
+                    logger.debug(f'No hash data for task_key={task_key}')
                     continue
 
-
-
-
-
+                try:
+                    # Parse the consumer_group out of the task_key
+                    # task_key format: namespace:TASK:stream_id:group_name
+                    # where group_name is the full consumer_group (format: jettask:QUEUE:queue_name:task_name)
+                    parts = task_key.split(':', 3)  # split into at most 4 parts
+                    if len(parts) == 4:
+                        # parts[0] = namespace (e.g. 'default')
+                        # parts[1] = 'TASK'
+                        # parts[2] = stream_id
+                        # parts[3] = group_name (consumer_group)
+                        consumer_group = parts[3]  # use group_name directly as the consumer_group
+                        logger.debug(f"Extracted consumer_group from task_key: {consumer_group}")
+                    else:
+                        logger.warning(f"Cannot parse consumer_group from task_key: {task_key}")
+                        continue
+
+                    # Extract the task_name from the consumer_group
+                    # consumer_group format: prefix:QUEUE:queue:task_name (e.g. jettask:QUEUE:robust_bench2:robust_benchmark.benchmark_task)
+                    task_name = None
+                    if consumer_group:
+                        parts = consumer_group.split(':')
+                        if len(parts) >= 4:
+                            # The last part is the task_name
+                            task_name = parts[-1]
+                            logger.debug(f"Extracted task_name '{task_name}' from consumer_group '{consumer_group}'")
+
+                    # Use the stream_id as the task ID
+                    update_info = self._parse_task_hash(stream_id, hash_data)
+                    if update_info:
+                        # Add consumer_group and task_name to the update info
+                        update_info['consumer_group'] = consumer_group
+                        update_info['task_name'] = task_name or 'unknown'  # fall back to 'unknown' if task_name cannot be extracted
+                        # consumer_name is the worker_id (the worker that actually executed the task)
+                        update_info['consumer_name'] = update_info.get('worker_id')
+                        updates.append(update_info)
+                        valid_tuples.append((stream_id, task_key))
+                    else:
+                        logger.debug(f'Failed to parse stream_id={stream_id} hash_data={hash_data}')
+                except Exception as e:
+                    logger.error(f'Error parsing task stream_id={stream_id}: {e}')
+                    continue
             if updates:
-
-
-
+                logger.info(f"Attempting to update {len(updates)} tasks, first few: {[u['id'] for u in updates[:3]]}")
+                # logger_f.write(f'{updates=} \n')
+                try:
+                    # _update_tasks now returns the set of IDs that were updated successfully
+                    batch_successful = await self._update_tasks(updates)
+                    # Map the successful stream_ids back to tuples
+                    for stream_id in batch_successful:
+                        for tuple_item in valid_tuples:
+                            if tuple_item[0] == stream_id:  # stream_id matches
+                                successful_tuples.add(tuple_item)
+                    if batch_successful:
+                        logger.info(f"Successfully updated {len(batch_successful)} tasks from change events")
+                    else:
+                        logger.warning(f"No tasks were successfully updated")
+                except Exception as e:
+                    logger.error(f"Error in batch update: {e}")
+                    # The batch update failed; try updating one at a time
+                    for update, tuple_item in zip(updates, valid_tuples):
+                        try:
+                            single_successful = await self._update_tasks([update])
+                            if update['id'] in single_successful:
+                                successful_tuples.add(tuple_item)
+                        except Exception as single_error:
+                            logger.error(f"Failed to update task {tuple_item[0]}: {single_error}")
 
         except Exception as e:
             logger.error(f"Error updating tasks by event: {e}", exc_info=True)
+        logger.debug(f'{successful_tuples=}')
+        return successful_tuples
 
     def _parse_task_hash(self, task_id: str, hash_data: dict) -> Optional[dict]:
         """Parse the Redis hash data"""
@@ -679,13 +908,12 @@ class PostgreSQLConsumer:
 
         update_info['worker_id'] = hash_dict.get('consumer') or hash_dict.get('worker_id')
 
-        # Convert numeric values
+        # Convert numeric values - store the raw seconds value directly
         for num_field in ['execution_time', 'duration']:
             if hash_dict.get(num_field):
                 try:
                     num_str = hash_dict[num_field]
-
-                    num_str = num_str[2:-1]
+                    # Store the float seconds value directly
                     update_info[num_field] = float(num_str)
                 except:
                     pass
@@ -707,103 +935,158 @@ class PostgreSQLConsumer:
 
         return None
 
-    async def _update_tasks(self, updates: List[Dict[str, Any]]):
-        """
+    async def _update_tasks(self, updates: List[Dict[str, Any]]) -> Set[str]:
+        """Batch-update task status (UPSERT into the task_runs table)
+
+        Returns:
+            The set of stream_ids that were updated successfully
+        """
         if not updates:
-            return
+            return set()
 
         try:
             async with self.AsyncSessionLocal() as session:
-                #
-
-
-
-
-
-
-
-
-
-
-
+                # V3 structure: UPSERT into the task_runs table
+                stream_ids = [u['id'] for u in updates]
+                logger.info(f"Upserting {len(stream_ids)} task_runs records")
+
+                # A partitioned table needs a different UPSERT strategy:
+                # try UPDATE first, and INSERT only when no row was updated
+                upsert_query = text("""
+                    WITH updated AS (
+                        UPDATE task_runs SET
+                            consumer_name = COALESCE(CAST(:consumer_name AS TEXT), consumer_name),
+                            status = CASE
+                                WHEN CAST(:status AS TEXT) IS NULL THEN status
+                                WHEN status = 'pending' THEN COALESCE(CAST(:status AS TEXT), status)
+                                WHEN status = 'running' AND CAST(:status AS TEXT) IN ('success', 'failed', 'timeout', 'skipped') THEN CAST(:status AS TEXT)
+                                WHEN status IN ('success', 'failed', 'timeout', 'skipped') THEN status
+                                ELSE COALESCE(CAST(:status AS TEXT), status)
+                            END,
+                            result = CASE
+                                WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN result
+                                ELSE COALESCE(CAST(:result AS jsonb), result)
+                            END,
+                            error_message = CASE
+                                WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN error_message
+                                ELSE COALESCE(CAST(:error_message AS TEXT), error_message)
+                            END,
+                            start_time = COALESCE(CAST(:started_at AS TIMESTAMPTZ), start_time),
+                            end_time = CASE
+                                WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN end_time
+                                ELSE COALESCE(CAST(:completed_at AS TIMESTAMPTZ), end_time)
+                            END,
+                            worker_id = COALESCE(CAST(:worker_id AS TEXT), worker_id),
+                            duration = COALESCE(CAST(:duration AS DOUBLE PRECISION), duration),
+                            execution_time = COALESCE(CAST(:execution_time AS DOUBLE PRECISION), execution_time),
+                            updated_at = CURRENT_TIMESTAMP
+                        WHERE stream_id = :stream_id AND consumer_group = :consumer_group
+                        RETURNING stream_id
+                    )
+                    INSERT INTO task_runs (
+                        stream_id, task_name, consumer_group, consumer_name, status, result, error_message,
+                        start_time, end_time, worker_id, duration, execution_time,
+                        created_at, updated_at
+                    )
+                    SELECT
+                        :stream_id, :task_name, :consumer_group, :consumer_name,
+                        COALESCE(CAST(:status AS TEXT), 'pending'),
+                        CAST(:result AS jsonb),
+                        CAST(:error_message AS TEXT),
+                        CAST(:started_at AS TIMESTAMPTZ),
+                        CAST(:completed_at AS TIMESTAMPTZ),
+                        CAST(:worker_id AS TEXT),
+                        CAST(:duration AS DOUBLE PRECISION),
+                        CAST(:execution_time AS DOUBLE PRECISION),
+                        CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
+                    WHERE NOT EXISTS (SELECT 1 FROM updated)
+                    RETURNING stream_id;
                """)
 
-                #
-
+                # Rename the parameter for each update (id -> stream_id)
+                run_updates = []
+                for update in updates:
+                    run_update = update.copy()
+                    run_update['stream_id'] = run_update.pop('id')  # rename id to stream_id
+                    # consumer_group is already present in update_info; nothing extra to do
+                    run_updates.append(run_update)
 
-                #
-
-
+                # Execute the UPSERTs in batches - transaction batching for better performance
+                successful_count = 0
+                batch_size = 20  # 20 records per batch
 
-
-
-                task_ids = [u['id'] for u in updates]
-
-                # First use an in-memory set for fast filtering
-                async with self._processed_ids_lock:
-                    # Filter the IDs that may exist (IDs in the in-memory set definitely exist)
-                    known_existing_ids = set(task_ids) & self._processed_task_ids
+                for i in range(0, len(run_updates), batch_size):
+                    batch = run_updates[i:i+batch_size]
 
-
-
-
-
-
-
-                    logger.debug(f"Memory cache hit: avoided DB query for {len(known_existing_ids)} IDs")
-                elif potential_missing_ids:
-                    # Only query IDs that are not in the in-memory set, to narrow the query
-                    logger.debug(f"Memory cache partial hit: checking {len(potential_missing_ids)} IDs in DB (skipped {len(known_existing_ids)} cached IDs)")
-                    check_query = text("""
-                        SELECT id FROM tasks WHERE id = ANY(:ids)
-                    """)
-                    check_result = await session.execute(check_query, {'ids': list(potential_missing_ids)})
-                    existing_in_db = {row[0] for row in check_result}
-
-                    # Update the in-memory set (add the newly discovered IDs)
-                    async with self._processed_ids_lock:
-                        self._processed_task_ids.update(existing_in_db)
+                    try:
+                        # Process the whole batch in one transaction
+                        for run_update in batch:
+                            result = await session.execute(upsert_query, run_update)
+                            if result.rowcount > 0:
+                                successful_count += 1
 
-                #
-
-
-                    missing_ids = set()
-                else:
-                    # All records were updated successfully
-                    missing_ids = set()
-
-                if missing_ids:
-                    # Put the missing task updates on the retry queue instead of creating them immediately
-                    async with self._pending_updates_lock:
-                        # Build a map of the update info
-                        update_map = {u['id']: u for u in updates if u['id'] in missing_ids}
+                        # Commit the batch
+                        await session.commit()
+                        logger.debug(f"Batch {i//batch_size + 1} committed: {len(batch)} records")
 
-
-
-
-                        # This ensures that only the most recent update is retried
-                        if task_id in self._pending_updates:
-                            logger.debug(f"Replacing old pending update for task {task_id} with newer one")
-
-                        # Save the update info to retry later (overwriting the old one)
-                        self._pending_updates[task_id] = update_map[task_id]
+                    except Exception as e:
+                        logger.error(f"Batch {i//batch_size + 1} failed, trying individual records: {e}")
+                        await session.rollback()
 
-                #
-
-
-
-
-
-
-
+                        # If the batch fails, fall back to processing this batch record by record
+                        for run_update in batch:
+                            try:
+                                result = await session.execute(upsert_query, run_update)
+                                await session.commit()
+                                if result.rowcount > 0:
+                                    successful_count += 1
+                            except Exception as individual_error:
+                                logger.error(f"Individual upsert failed for {run_update.get('stream_id')}: {individual_error}")
+                                await session.rollback()
 
-
+                # Log the number of successful updates
+                if successful_count > 0:
+                    logger.info(f"Upserted {successful_count}/{len(run_updates)} task_runs records")
 
-                #
-
+                # Check which tasks are in a terminal state and should be deleted from Redis
+                completed_task_keys = []
+                for update in updates:
+                    status = update.get('status')
+                    # If the status is terminal (success, error, cancel, etc.)
+                    if status in ['success', 'error', 'failed', 'cancel', 'cancelled', 'timeout', 'skipped']:
+                        # Build the task_key
+                        # task_key format: namespace:TASK:stream_id:group_name
+                        stream_id = update['id']
+                        consumer_group = update.get('consumer_group')
+                        if consumer_group:
+                            # Extract the namespace from the consumer_group
+                            # consumer_group format: prefix:QUEUE:queue:task_name
+                            parts = consumer_group.split(':', 1)
+                            namespace = parts[0] if parts else 'default'
+                            task_key = f"{namespace}:TASK:{stream_id}:{consumer_group}"
+                            completed_task_keys.append(task_key)
+                            logger.info(f"Task {stream_id} with status {status} will be deleted from Redis: {task_key}")
+
+                # Delete the completed tasks from Redis
+                if completed_task_keys:
+                    try:
+                        pipeline = self.redis_client.pipeline()
+                        for task_key in completed_task_keys:
+                            pipeline.delete(task_key)
+                        deleted_results = await pipeline.execute()
+                        deleted_count = sum(1 for r in deleted_results if r > 0)
+                        if deleted_count > 0:
+                            logger.info(f"Deleted {deleted_count} completed tasks from Redis")
+                    except Exception as e:
+                        logger.error(f"Error deleting completed tasks from Redis: {e}")
+
+                # The UPSERT always succeeds, so return all stream_ids
+                # No complex error handling is needed because the UPSERT is atomic
+                return set(stream_ids)
 
         except Exception as e:
-            logger.error(f"Error
+            logger.error(f"Error upserting task statuses: {e}")
+            return set()  # return an empty set on error
 
     async def _retry_pending_updates(self):
         """Periodically retry pending task updates"""
@@ -821,10 +1104,12 @@ class PostgreSQLConsumer:
                     self._pending_updates.clear()
 
                 if pending_items:
-                    logger.info(f"Retrying {len(pending_items)} pending task updates")
 
                     # Retry the updates
                     updates = [update_info for _, update_info in pending_items]
+                    logger.debug(f"Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}")
+                    logger_f.write(f'{time.time()=} Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}\n')
+                    logger_f.flush()
                     await self._update_tasks(updates)
 
             except Exception as e:
@@ -833,7 +1118,7 @@ class PostgreSQLConsumer:
 
     async def _start_offline_recovery(self):
        """Start the offline worker recovery service to recover messages from offline PG_CONSUMERs"""
-        logger.
+        logger.debug("Starting offline worker recovery service for PG_CONSUMER")
 
         # Wait for consumer manager initialization and queue discovery
         # await asyncio.sleep(5)
@@ -843,30 +1128,30 @@ class PostgreSQLConsumer:
                 total_recovered = 0
 
                 # 1. Recover messages from regular queues
-                for queue in self._known_queues:
-
-
-
-
-
-
-
+                # for queue in self._known_queues:
+                #     # logger.info(f'{queue=}')
+                #     try:
+                #         recovered = await self.offline_recovery.recover_offline_workers(
+                #             queue=queue,
+                #             current_consumer_name=self.consumer_id,
+                #             process_message_callback=self._process_recovered_queue_message
+                #         )
 
-
-
-
+                #         if recovered > 0:
+                #             logger.info(f"Recovered {recovered} messages from queue {queue}")
+                #             total_recovered += recovered
 
-
-
+                #     except Exception as e:
+                #         logger.error(f"Error recovering queue {queue}: {e}")
 
                 # 2. Recover messages from the TASK_CHANGES stream
                 recovered = await self._recover_task_changes_offline_messages()
                 if recovered > 0:
-                    logger.
+                    logger.debug(f"Recovered {recovered} TASK_CHANGES messages")
                     total_recovered += recovered
 
                 if total_recovered > 0:
-                    logger.
+                    logger.debug(f"Total recovered {total_recovered} messages in this cycle")
 
                 # Scan every 30 seconds
                 await asyncio.sleep(1)
@@ -909,37 +1194,34 @@ class PostgreSQLConsumer:
             logger.error(f"Error in recover_task_changes_offline_messages: {e}")
             return 0
 
-    async def _process_recovered_queue_message(self, msg_id, msg_data, queue, consumer_id):
-        """Handle a recovered regular-queue message (matches the OfflineWorkerRecovery callback interface)"""
-        try:
-            logger.info(f"Processing recovered message {msg_id} from queue {queue}, offline worker {consumer_id}")
-
-            # Parse the task info
-            task_info = self._parse_stream_message(msg_id, msg_data, queue)
-            if task_info:
-                # Batch-insert into the database
-                await self._batch_insert_tasks([task_info])
-
-            # ACK the message
-            stream_key = f"{self.prefix}:QUEUE:{queue}"
-            await self.redis_client.xack(stream_key, self.consumer_group, msg_id)
-
-        except Exception as e:
-            logger.error(f"Error processing recovered queue message {msg_id}: {e}")
-
     async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
         """Handle a recovered TASK_CHANGES message (matches the OfflineWorkerRecovery callback interface)"""
         try:
-
-
-            if
-
-
+            logger.debug(f'Handling recovered TASK_CHANGES message (OfflineWorkerRecovery callback) {msg_data=}')
+            # Parse the message - task_id is now used instead of event_id
+            if b'task_id' in msg_data:
+                # Unpack the task_id with msgpack
+                compressed_task_id = msg_data[b'task_id']
+                task_key = msgpack.unpackb(compressed_task_id)
+                task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
 
-
+                # Extract the stream_id from the full task_key
+                # Format: namespace:TASK:stream_id:queue_name
+                stream_id = None
+                if ':TASK:' in task_key:
+                    parts = task_key.split(':TASK:')
+                    if len(parts) == 2:
+                        # Then take the stream_id from the right-hand part
+                        right_parts = parts[1].split(':')
+                        if right_parts:
+                            stream_id = right_parts[0]  # extract the stream_id
 
-
-
+                if stream_id:
+                    logger.debug(f"Processing recovered TASK_CHANGES message: {stream_id} from offline worker {consumer_id}")
+                    # Update the task status - pass a (stream_id, task_key) tuple
+                    await self._update_tasks_by_event([(stream_id, task_key)])
+                else:
+                    logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
 
             # ACK the message
             change_stream_key = f"{self.prefix}:TASK_CHANGES"
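Both this hunk and _update_tasks_by_event parse keys of the form namespace:TASK:stream_id:consumer_group. A compact, hypothetical helper that restates that key layout (the helper itself is not part of the package):

from typing import Optional, Tuple

def parse_task_key(task_key: str) -> Optional[Tuple[str, str]]:
    # Expected layout: namespace:TASK:stream_id:consumer_group, where the
    # consumer_group itself may contain further colons.
    parts = task_key.split(':', 3)
    if len(parts) != 4 or parts[1] != 'TASK':
        return None
    _, _, stream_id, consumer_group = parts
    return stream_id, consumer_group

# parse_task_key("default:TASK:123-0:jettask:QUEUE:q1:mytask")
# -> ("123-0", "jettask:QUEUE:q1:mytask")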
@@ -960,10 +1242,11 @@ class PostgreSQLConsumer:
 
                 if current_time - last_analyze_time > analyze_interval:
                     async with self.AsyncSessionLocal() as session:
-                        logger.
+                        logger.debug("Running ANALYZE on tasks and task_runs tables...")
                         await session.execute(text("ANALYZE tasks"))
+                        await session.execute(text("ANALYZE task_runs"))
                         await session.commit()
-                        logger.
+                        logger.debug("ANALYZE completed successfully for both tables")
                     last_analyze_time = current_time
 
                 await asyncio.sleep(300)  # check every 5 minutes
@@ -972,11 +1255,442 @@ class PostgreSQLConsumer:
|
|
972
1255
|
logger.error(f"Error in database maintenance: {e}")
|
973
1256
|
await asyncio.sleep(60)
|
974
1257
|
|
975
|
-
def
|
1258
|
+
async def _stream_backlog_monitor(self):
|
1259
|
+
"""Stream积压监控任务 - 使用分布式锁确保只有一个实例采集"""
|
1260
|
+
# await asyncio.sleep(10) # 启动后延迟10秒开始
|
1261
|
+
|
1262
|
+
while self._running:
|
1263
|
+
try:
|
1264
|
+
# 尝试获取分布式锁
|
1265
|
+
lock_acquired = await self._try_acquire_monitor_lock()
|
1266
|
+
|
1267
|
+
if lock_acquired:
|
1268
|
+
try:
|
1269
|
+
logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
|
1270
|
+
await self._collect_stream_backlog_metrics()
|
1271
|
+
logger.debug("Stream backlog metrics collected successfully")
|
1272
|
+
finally:
|
1273
|
+
# 释放锁
|
1274
|
+
await self._release_monitor_lock()
|
1275
|
+
else:
|
1276
|
+
logger.debug("Another instance is collecting backlog metrics, skipping...")
|
1277
|
+
|
1278
|
+
# 等待下一次采集
|
1279
|
+
await asyncio.sleep(self.backlog_monitor_interval)
|
1280
|
+
|
1281
|
+
except Exception as e:
|
1282
|
+
logger.error(f"Error in stream backlog monitor: {e}")
|
1283
|
+
await asyncio.sleep(30) # 出错后等待30秒
|
1284
|
+
|
1285
|
+
async def _try_acquire_monitor_lock(self) -> bool:
|
1286
|
+
"""尝试获取监控锁(使用Redis原生锁)"""
|
1287
|
+
try:
|
1288
|
+
# 使用SET NX EX命令实现分布式锁
|
1289
|
+
# NX: 只在键不存在时设置
|
1290
|
+
# EX: 设置过期时间(秒)
|
1291
|
+
result = await self.redis_client.set(
|
1292
|
+
self.backlog_monitor_lock_key.encode(),
|
1293
|
+
self.node_id.encode(), # 锁的值为当前节点ID
|
1294
|
+
nx=True, # 只在不存在时设置
|
1295
|
+
ex=self.backlog_monitor_lock_ttl # 过期时间
|
1296
|
+
)
|
1297
|
+
return result is not None
|
1298
|
+
except Exception as e:
|
1299
|
+
logger.error(f"Error acquiring monitor lock: {e}")
|
1300
|
+
return False
|
1301
|
+
|
1302
|
+
async def _release_monitor_lock(self):
|
1303
|
+
"""释放监控锁(只释放自己持有的锁)"""
|
1304
|
+
try:
|
1305
|
+
# 使用Lua脚本确保只释放自己持有的锁
|
1306
|
+
lua_script = """
|
1307
|
+
if redis.call("get", KEYS[1]) == ARGV[1] then
|
1308
|
+
return redis.call("del", KEYS[1])
|
1309
|
+
else
|
1310
|
+
return 0
|
1311
|
+
end
|
1312
|
+
"""
|
1313
|
+
await self.redis_client.eval(
|
1314
|
+
lua_script,
|
1315
|
+
1,
|
1316
|
+
self.backlog_monitor_lock_key.encode(),
|
1317
|
+
self.node_id.encode()
|
1318
|
+
)
|
1319
|
+
except Exception as e:
|
1320
|
+
logger.error(f"Error releasing monitor lock: {e}")
|
1321
|
+
|
1322
|
+
async def _collect_stream_backlog_metrics(self):
|
1323
|
+
"""采集Stream积压指标并保存到数据库(使用offset方式)"""
|
1324
|
+
try:
|
1325
|
+
# 获取所有队列的最新offset (QUEUE_OFFSETS)
|
1326
|
+
queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
|
1327
|
+
queue_offsets = {}
|
1328
|
+
try:
|
1329
|
+
# 使用decode_responses=False的客户端,手动解码
|
1330
|
+
raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
|
1331
|
+
for k, v in raw_queue_offsets.items():
|
1332
|
+
queue_name = k.decode() if isinstance(k, bytes) else k
|
1333
|
+
offset_value = v.decode() if isinstance(v, bytes) else v
|
1334
|
+
queue_offsets[queue_name] = int(offset_value)
|
1335
|
+
except Exception as e:
|
1336
|
+
logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")
|
1337
|
+
|
1338
|
+
# 获取所有任务组的消费offset (TASK_OFFSETS)
|
1339
|
+
task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
|
1340
|
+
task_offsets = {}
|
1341
|
+
try:
|
1342
|
+
raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
|
1343
|
+
for k, v in raw_task_offsets.items():
|
1344
|
+
task_key = k.decode() if isinstance(k, bytes) else k
|
1345
|
+
offset_value = v.decode() if isinstance(v, bytes) else v
|
1346
|
+
task_offsets[task_key] = int(offset_value)
|
1347
|
+
except Exception as e:
|
1348
|
+
logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")
|
1349
|
+
|
1350
|
+
# 使用SCAN命令扫描所有队列Stream(包括普通队列和优先级队列)
|
1351
|
+
stream_info_map = {} # {queue_name: [(stream_key, priority), ...]}
|
1352
|
+
pattern = f"{self.prefix}:QUEUE:*".encode()
|
1353
|
+
cursor = 0
|
1354
|
+
|
1355
|
+
# 使用SCAN命令,增大count参数以提高效率
|
1356
|
+
while True:
|
1357
|
+
cursor, keys = await self.redis_client.scan(cursor, match=pattern, count=10000)
|
1358
|
+
|
1359
|
+
for key in keys:
|
1360
|
+
key_str = key.decode()
|
1361
|
+
# 移除前缀 "prefix:QUEUE:"
|
1362
|
+
queue_part = key_str.replace(f"{self.prefix}:QUEUE:", "")
|
1363
|
+
|
1364
|
+
# 检查是否是优先级队列(格式:queue_name:priority)
|
1365
|
+
parts = queue_part.split(':')
|
1366
|
+
if len(parts) == 2 and parts[1].isdigit():
|
1367
|
+
# 优先级队列
|
1368
|
+
queue_name = parts[0]
|
1369
|
+
priority = int(parts[1])
|
1370
|
+
if queue_name not in stream_info_map:
|
1371
|
+
stream_info_map[queue_name] = []
|
1372
|
+
stream_info_map[queue_name].append((key, priority))
|
1373
|
+
elif ':' not in queue_part:
|
1374
|
+
# 普通队列(不包含冒号)
|
1375
|
+
queue_name = queue_part
|
1376
|
+
if queue_name not in stream_info_map:
|
1377
|
+
stream_info_map[queue_name] = []
|
1378
|
+
stream_info_map[queue_name].append((key, 0)) # 普通队列优先级为0
|
1379
|
+
# 忽略其他格式的键(如消费组等)
|
1380
|
+
|
1381
|
+
if cursor == 0:
|
1382
|
+
break
|
1383
|
+
|
1384
|
+
if not stream_info_map:
|
1385
|
+
logger.debug("No streams found for backlog monitoring")
|
1386
|
+
return
|
1387
|
+
|
1388
|
+
# 调试日志(使用debug级别避免刷屏)
|
1389
|
+
logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
|
1390
|
+
for queue_name, stream_list in stream_info_map.items():
|
1391
|
+
priorities = [p for _, p in stream_list]
|
1392
|
+
# 筛选出非0优先级(0表示普通队列)
|
1393
|
+
+                high_priorities = [p for p in priorities if p > 0]
+                if high_priorities:
+                    logger.debug(f" - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
+                else:
+                    logger.debug(f" - {queue_name}: regular queue only (priority=0)")
+
+            # Collect metrics for each queue (aggregated across all priorities)
+            metrics = []
+            current_time = datetime.now(timezone.utc)
+
+            for queue_name, stream_list in stream_info_map.items():
+                # Process each priority queue separately
+                for stream_key, priority in stream_list:
+                    try:
+                        # Get the latest offset for this queue (taking priority queues into account)
+                        if priority > 0:
+                            # Priority queue key format: queue_name:priority
+                            queue_key = f"{queue_name}:{priority}"
+                        else:
+                            queue_key = queue_name
+                        last_published_offset = queue_offsets.get(queue_key, 0)
+
+                        # Get the Stream info
+                        stream_info = await self.redis_client.xinfo_stream(stream_key)
+                        stream_length = stream_info.get(b'length', 0)
+
+                        # Get the consumer group info
+                        has_consumer_groups = False
+                        try:
+                            groups = await self.redis_client.xinfo_groups(stream_key)
+
+                            for group in groups:
+                                # Normalize group_name
+                                raw_name = group.get('name', b'')
+                                if isinstance(raw_name, bytes):
+                                    group_name = raw_name.decode() if raw_name else ''
+                                else:
+                                    group_name = str(raw_name) if raw_name else ''
+
+                                if not group_name:
+                                    group_name = 'unknown'
+
+                                # Filter out internal consumer groups
+                                if is_internal_consumer(group_name):
+                                    # logger.info(f"Skipping internal consumer group: {group_name}")
+                                    continue
+
+                                # pending is already an int
+                                pending_count = group.get('pending', 0)
+
+                                # Get this group's consumed offset from TASK_OFFSETS
+                                # Key format: f"{queue_name}:{group_name}" (priority not included)
+                                task_offset_key = f"{queue_name}:{group_name}"
+                                last_acked_offset = task_offsets.get(task_offset_key, 0)
+
+                                # Compute the backlog metrics
+                                # 1. Total backlog = latest queue offset - offset acked by the consumer group
+                                total_backlog = max(0, last_published_offset - last_acked_offset)
+
+                                # 2. Undelivered backlog = total backlog - pending count
+                                backlog_undelivered = max(0, total_backlog - pending_count)
+
+                                # 3. Delivered but unacked = pending count
+                                backlog_delivered_unacked = pending_count
+
+                                # 4. Delivered offset = acked offset + pending count
+                                last_delivered_offset = last_acked_offset + pending_count
+
+                                # Create one record per consumer group
+                                metrics.append({
+                                    'namespace': self.namespace_name,
+                                    'stream_name': queue_name,
+                                    'priority': priority,  # include the priority field
+                                    'consumer_group': group_name,
+                                    'last_published_offset': last_published_offset,
+                                    'last_delivered_offset': last_delivered_offset,
+                                    'last_acked_offset': last_acked_offset,
+                                    'pending_count': pending_count,
+                                    'backlog_undelivered': backlog_undelivered,
+                                    'backlog_unprocessed': total_backlog,
+                                    'created_at': current_time
+                                })
+                                has_consumer_groups = True
+
+                        except Exception as e:
+                            # This queue has no consumer groups
+                            logger.debug(f"No consumer groups for stream {stream_key.decode()}: {e}")
+
+                        # If there are no consumer groups, save Stream-level metrics
+                        if not has_consumer_groups and last_published_offset > 0:
+                            metrics.append({
+                                'namespace': self.namespace_name,
+                                'stream_name': queue_name,
+                                'priority': priority,  # include the priority field
+                                'consumer_group': None,
+                                'last_published_offset': last_published_offset,
+                                'last_delivered_offset': 0,
+                                'last_acked_offset': 0,
+                                'pending_count': 0,
+                                'backlog_undelivered': last_published_offset,
+                                'backlog_unprocessed': last_published_offset,
+                                'created_at': current_time
+                            })
+
+                    except Exception as e:
+                        logger.error(f"Error collecting metrics for stream {stream_key.decode()}: {e}")
+                        continue
+
+            # Persist the metrics to the database
+            if metrics:
+                await self._save_backlog_metrics(metrics)
+                # logger.info(f"Collected backlog metrics for {len(metrics)} stream/group combinations {time.time() }")
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            logger.error(f"Error collecting stream backlog metrics: {e}")
+
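
The arithmetic above derives three backlog figures from the published offset, the acked offset, and the group's pending count. A minimal, self-contained sketch of that relationship (the helper and dataclass names are illustrative, not part of jettask):

    from dataclasses import dataclass

    @dataclass
    class Backlog:
        total: int               # published but not yet acked
        undelivered: int         # not yet handed to any consumer
        delivered_unacked: int   # handed out (pending) but not yet acked

    def compute_backlog(last_published: int, last_acked: int, pending: int) -> Backlog:
        # Mirrors the calculation in _collect_stream_backlog_metrics:
        # total = published - acked (clamped at zero), undelivered = total - pending
        # (clamped at zero), and delivered-but-unacked is simply the pending count.
        total = max(0, last_published - last_acked)
        return Backlog(
            total=total,
            undelivered=max(0, total - pending),
            delivered_unacked=pending,
        )

    # Example: 120 published, 100 acked, 15 pending
    # -> total backlog 20, undelivered 5, delivered-but-unacked 15
    print(compute_backlog(120, 100, 15))
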
+    async def _save_backlog_metrics(self, metrics: List[Dict]):
+        """Save backlog metrics to the database (only rows whose values changed)"""
+        if not metrics:
+            return
+
+        # logger.info(f"Processing {len(metrics)} metrics for deduplication")
+
+        try:
+            async with self.AsyncSessionLocal() as session:
+                # New records to save
+                metrics_to_save = []
+
+                # Use batched queries to keep lookups fast
+                metric_keys = {}  # for quick lookups
+
+                for metric in metrics:
+                    # Build the unique key: namespace + stream_name + consumer_group + priority
+                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
+                    metric_keys[unique_key] = metric
+
+                # logger.info(f"Checking {len(metric_keys)} unique metric combinations")
+
+                # Fetch the latest records in batches so the SQL does not grow too long
+                last_records = {}
+                metric_list = list(metric_keys.values())
+                batch_size = 50  # 50 combinations per query
+
+                for i in range(0, len(metric_list), batch_size):
+                    batch = metric_list[i:i + batch_size]
+
+                    # Build a parameterized query
+                    conditions = []
+                    params = {}
+                    for idx, metric in enumerate(batch):
+                        param_prefix = f"p{i + idx}"
+                        conditions.append(f"""
+                            (namespace = :{param_prefix}_ns
+                             AND stream_name = :{param_prefix}_sn
+                             AND consumer_group = :{param_prefix}_cg
+                             AND priority = :{param_prefix}_pr)
+                        """)
+                        params[f"{param_prefix}_ns"] = metric['namespace']
+                        params[f"{param_prefix}_sn"] = metric['stream_name']
+                        params[f"{param_prefix}_cg"] = metric['consumer_group']
+                        params[f"{param_prefix}_pr"] = metric['priority']
+
+                    if conditions:
+                        # Use a window function to get the latest record for each combination
+                        query_sql = text(f"""
+                            WITH latest_records AS (
+                                SELECT
+                                    namespace,
+                                    stream_name,
+                                    consumer_group,
+                                    priority,
+                                    last_published_offset,
+                                    last_delivered_offset,
+                                    last_acked_offset,
+                                    pending_count,
+                                    backlog_undelivered,
+                                    backlog_unprocessed,
+                                    ROW_NUMBER() OVER (
+                                        PARTITION BY namespace, stream_name, consumer_group, priority
+                                        ORDER BY created_at DESC
+                                    ) as rn
+                                FROM stream_backlog_monitor
+                                WHERE ({' OR '.join(conditions)})
+                            )
+                            SELECT
+                                namespace,
+                                stream_name,
+                                consumer_group,
+                                priority,
+                                last_published_offset,
+                                last_delivered_offset,
+                                last_acked_offset,
+                                pending_count,
+                                backlog_undelivered,
+                                backlog_unprocessed
+                            FROM latest_records
+                            WHERE rn = 1
+                        """)
+
+                        result = await session.execute(query_sql, params)
+                        for row in result:
+                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
+                            last_records[key] = row
+                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")
+
+                # Deduplicate each metric against its latest stored record
+                for unique_key, metric in metric_keys.items():
+                    should_save = False
+
+                    if unique_key not in last_records:
+                        # No history yet, so save it
+                        should_save = True
+                        # logger.info(f"New metric for {unique_key}, will save")
+                    else:
+                        # Compare the key fields to see whether anything changed
+                        last_record = last_records[unique_key]
+
+                        # Verbose debug logging
+                        changes = []
+                        logger.debug(f"Comparing for {unique_key}:")
+                        logger.debug(f"  DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
+                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
+                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
+                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
+                        logger.debug(f"  New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
+                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
+                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
+                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")
+
+                        # Normalize types before comparing (cast everything to int)
+                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
+                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0
+
+                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
+                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0
+
+                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
+                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0
+
+                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
+                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0
+
+                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
+                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0
+
+                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
+                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0
+
+                        if db_published != new_published:
+                            changes.append(f"published: {db_published} -> {new_published}")
+                        if db_delivered != new_delivered:
+                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
+                        if db_acked != new_acked:
+                            changes.append(f"acked: {db_acked} -> {new_acked}")
+                        if db_pending != new_pending:
+                            changes.append(f"pending: {db_pending} -> {new_pending}")
+                        if db_undelivered != new_undelivered:
+                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
+                        if db_unprocessed != new_unprocessed:
+                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")
+
+                        if changes:
+                            should_save = True
+                            # logger.info(f"Metric changed for {unique_key}: {', '.join(changes)}")
+                        else:
+                            logger.debug(f"Metric unchanged for {unique_key}, skipping")
+
+                    if should_save:
+                        metrics_to_save.append(metric)
+
+                # Insert the monitoring rows that changed
+                if metrics_to_save:
+                    insert_sql = text("""
+                        INSERT INTO stream_backlog_monitor
+                        (namespace, stream_name, priority, consumer_group, last_published_offset,
+                         last_delivered_offset, last_acked_offset, pending_count,
+                         backlog_undelivered, backlog_unprocessed, created_at)
+                        VALUES
+                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
+                         :last_delivered_offset, :last_acked_offset, :pending_count,
+                         :backlog_undelivered, :backlog_unprocessed, :created_at)
+                    """)
+
+                    # Insert one row at a time (SQLAlchemy's execute is not passed a bulk parameter list here)
+                    for metric_data in metrics_to_save:
+                        await session.execute(insert_sql, metric_data)
+
+                    await session.commit()
+                    # logger.info(f"Saved {len(metrics_to_save)} changed metrics out of {len(metrics)} total")
+                else:
+                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")
+
+        except Exception as e:
+            logger.error(f"Error saving backlog metrics to database: {e}")
+
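
The deduplication above only inserts a row when at least one offset or counter differs from the latest stored record for the same (namespace, stream_name, consumer_group, priority) combination. A small hypothetical helper that captures the comparison rule, assuming plain dicts on both sides (not part of jettask):

    from typing import Dict

    WATCHED_FIELDS = (
        'last_published_offset', 'last_delivered_offset', 'last_acked_offset',
        'pending_count', 'backlog_undelivered', 'backlog_unprocessed',
    )

    def metric_changed(last_row: Dict, new_metric: Dict) -> bool:
        """True when any watched field differs after normalizing None to 0 and casting to int."""
        for field in WATCHED_FIELDS:
            old = int(last_row.get(field) or 0)
            new = int(new_metric.get(field) or 0)
            if old != new:
                return True
        return False

    # A metric with no stored history (empty dict here) always counts as changed:
    print(metric_changed({}, {'pending_count': 3}))                    # True
    print(metric_changed({'pending_count': 3}, {'pending_count': 3}))  # False
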
+    def _parse_stream_message(self, task_id: str, data: dict) -> Optional[dict]:
         """Parse a Stream message into task info (returns the full set of fields)"""
         try:
             from jettask.utils.serializer import loads_str
-
             if b'data' in data:
                 task_data = loads_str(data[b'data'])
             else:
@@ -991,7 +1705,15 @@ class PostgreSQLConsumer:
                 else:
                     value = v
                 task_data[key] = value
-
+            # If a namespace is configured, check whether the message belongs to it
+            # if self.namespace_id:
+            #     msg_namespace_id = task_data.get('__namespace_id')
+            #     # Skip if the message has no namespace_id and this is not the default namespace
+            #     if msg_namespace_id != self.namespace_id:
+            #         if not (msg_namespace_id is None and self.namespace_id == 'default'):
+            #             logger.debug(f"Skipping message from different namespace: {msg_namespace_id} != {self.namespace_id}")
+            #             return None
+            queue_name = task_data['queue']
             task_name = task_data.get('name', task_data.get('task', 'unknown'))
             created_at = None
             if 'trigger_time' in task_data:
@@ -1000,7 +1722,6 @@ class PostgreSQLConsumer:
                 created_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)
             except:
                 pass
-
             # Return the full set of fields, including those that may be None
             return {
                 'id': task_id,
@@ -1016,12 +1737,16 @@ class PostgreSQLConsumer:
                 'created_at': created_at,
                 'started_at': None,  # new task has not started yet
                 'completed_at': None,  # new task has not completed yet
+                'scheduled_task_id': task_data.get('scheduled_task_id'),  # scheduled-task ID
+                'metadata': json.dumps(task_data.get('metadata', {})),
                 'worker_id': None,  # new task has no worker assigned yet
                 'execution_time': None,  # new task has no execution time yet
                 'duration': None,  # new task has no duration yet
-                '
+                'namespace_id': self.namespace_id  # add the namespace ID
             }
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             logger.error(f"Error parsing stream message for task {task_id}: {e}")
             return None
 
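
For a rough sense of what `_parse_stream_message` produces from a raw Stream entry, here is a trimmed-down sketch. It substitutes `json` for `jettask.utils.serializer.loads_str`, keeps only a few of the returned fields, and uses made-up sample values, so it is an illustration rather than the actual parser:

    import json
    from datetime import datetime, timezone

    def parse_entry(task_id: str, fields: dict) -> dict:
        # Stand-in for the serializer: assume the payload is JSON under the b'data' key.
        task_data = json.loads(fields[b'data'])
        created_at = None
        if 'trigger_time' in task_data:
            created_at = datetime.fromtimestamp(float(task_data['trigger_time']), tz=timezone.utc)
        return {
            'id': task_id,
            'queue': task_data['queue'],
            'task_name': task_data.get('name', task_data.get('task', 'unknown')),
            'created_at': created_at,
            'scheduled_task_id': task_data.get('scheduled_task_id'),
            'metadata': json.dumps(task_data.get('metadata', {})),
        }

    entry = {b'data': json.dumps({'queue': 'default', 'name': 'demo', 'trigger_time': 1700000000}).encode()}
    print(parse_entry('1700000000000-0', entry))
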
@@ -1030,7 +1755,19 @@ class PostgreSQLConsumer:
 async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig,
                           consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
     """Run the PostgreSQL consumer"""
-
+    # Read the monitor configuration from environment variables
+    enable_backlog_monitor = os.getenv('JETTASK_ENABLE_BACKLOG_MONITOR', 'true').lower() == 'true'
+    backlog_monitor_interval = int(os.getenv('JETTASK_BACKLOG_MONITOR_INTERVAL', '60'))
+
+    logger.info(f"Backlog monitor config: enabled={enable_backlog_monitor}, interval={backlog_monitor_interval}s")
+
+    consumer = PostgreSQLConsumer(
+        pg_config,
+        redis_config,
+        consumer_strategy=consumer_strategy,
+        enable_backlog_monitor=enable_backlog_monitor,
+        backlog_monitor_interval=backlog_monitor_interval
+    )
 
     try:
         await consumer.start()
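
The hunk above wires the backlog monitor to two environment variables. A minimal sketch of how a deployment might set them before the consumer process starts; the variable names are taken from the diff, everything else is illustrative:

    import os

    # Enable the backlog monitor and poll every 30 seconds instead of the 60-second default.
    os.environ['JETTASK_ENABLE_BACKLOG_MONITOR'] = 'true'
    os.environ['JETTASK_BACKLOG_MONITOR_INTERVAL'] = '30'

    # run_pg_consumer() then picks these up via os.getenv(), exactly as in the hunk above:
    enabled = os.getenv('JETTASK_ENABLE_BACKLOG_MONITOR', 'true').lower() == 'true'
    interval = int(os.getenv('JETTASK_BACKLOG_MONITOR_INTERVAL', '60'))
    print(enabled, interval)  # -> True 30
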
@@ -1038,7 +1775,7 @@ async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig
             await asyncio.sleep(1)
 
     except KeyboardInterrupt:
-        logger.
+        logger.debug("Received interrupt signal")
     finally:
         await consumer.stop()
 
@@ -1080,9 +1817,9 @@ def main():
     elif strategy_name == 'HEARTBEAT':
         consumer_strategy = ConsumerStrategy.HEARTBEAT
     else:
-        logger.
+        logger.debug(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")
 
-    logger.
+    logger.debug(f"Using consumer strategy: {consumer_strategy.value}")
 
     asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy))
 