jettask 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +60 -2
- jettask/cli.py +314 -228
- jettask/config/__init__.py +9 -1
- jettask/config/config.py +245 -0
- jettask/config/env_loader.py +381 -0
- jettask/config/lua_scripts.py +158 -0
- jettask/config/nacos_config.py +132 -5
- jettask/core/__init__.py +1 -1
- jettask/core/app.py +1573 -666
- jettask/core/app_importer.py +33 -16
- jettask/core/container.py +532 -0
- jettask/core/task.py +1 -4
- jettask/core/unified_manager_base.py +2 -2
- jettask/executor/__init__.py +38 -0
- jettask/executor/core.py +625 -0
- jettask/executor/executor.py +338 -0
- jettask/executor/orchestrator.py +290 -0
- jettask/executor/process_entry.py +638 -0
- jettask/executor/task_executor.py +317 -0
- jettask/messaging/__init__.py +68 -0
- jettask/messaging/event_pool.py +2188 -0
- jettask/messaging/reader.py +519 -0
- jettask/messaging/registry.py +266 -0
- jettask/messaging/scanner.py +369 -0
- jettask/messaging/sender.py +312 -0
- jettask/persistence/__init__.py +118 -0
- jettask/persistence/backlog_monitor.py +567 -0
- jettask/{backend/data_access.py → persistence/base.py} +58 -57
- jettask/persistence/consumer.py +315 -0
- jettask/{core → persistence}/db_manager.py +23 -22
- jettask/persistence/maintenance.py +81 -0
- jettask/persistence/message_consumer.py +259 -0
- jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
- jettask/persistence/offline_recovery.py +196 -0
- jettask/persistence/queue_discovery.py +215 -0
- jettask/persistence/task_persistence.py +218 -0
- jettask/persistence/task_updater.py +583 -0
- jettask/scheduler/__init__.py +2 -2
- jettask/scheduler/loader.py +6 -5
- jettask/scheduler/run_scheduler.py +1 -1
- jettask/scheduler/scheduler.py +7 -7
- jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
- jettask/task/__init__.py +16 -0
- jettask/{router.py → task/router.py} +26 -8
- jettask/task/task_center/__init__.py +9 -0
- jettask/task/task_executor.py +318 -0
- jettask/task/task_registry.py +291 -0
- jettask/test_connection_monitor.py +73 -0
- jettask/utils/__init__.py +31 -1
- jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
- jettask/utils/db_connector.py +1629 -0
- jettask/{db_init.py → utils/db_init.py} +1 -1
- jettask/utils/rate_limit/__init__.py +30 -0
- jettask/utils/rate_limit/concurrency_limiter.py +665 -0
- jettask/utils/rate_limit/config.py +145 -0
- jettask/utils/rate_limit/limiter.py +41 -0
- jettask/utils/rate_limit/manager.py +269 -0
- jettask/utils/rate_limit/qps_limiter.py +154 -0
- jettask/utils/rate_limit/task_limiter.py +384 -0
- jettask/utils/serializer.py +3 -0
- jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
- jettask/utils/time_sync.py +173 -0
- jettask/webui/__init__.py +27 -0
- jettask/{api/v1 → webui/api}/alerts.py +1 -1
- jettask/{api/v1 → webui/api}/analytics.py +2 -2
- jettask/{api/v1 → webui/api}/namespaces.py +1 -1
- jettask/{api/v1 → webui/api}/overview.py +1 -1
- jettask/{api/v1 → webui/api}/queues.py +3 -3
- jettask/{api/v1 → webui/api}/scheduled.py +1 -1
- jettask/{api/v1 → webui/api}/settings.py +1 -1
- jettask/{api.py → webui/app.py} +253 -145
- jettask/webui/namespace_manager/__init__.py +10 -0
- jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
- jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
- jettask/{run.py → webui/run.py} +2 -2
- jettask/{services → webui/services}/__init__.py +1 -3
- jettask/{services → webui/services}/overview_service.py +34 -16
- jettask/{services → webui/services}/queue_service.py +1 -1
- jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
- jettask/{services → webui/services}/settings_service.py +1 -1
- jettask/worker/__init__.py +53 -0
- jettask/worker/lifecycle.py +1507 -0
- jettask/worker/manager.py +583 -0
- jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
- {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
- jettask-0.2.20.dist-info/RECORD +145 -0
- jettask/__main__.py +0 -140
- jettask/api/__init__.py +0 -103
- jettask/backend/__init__.py +0 -1
- jettask/backend/api/__init__.py +0 -3
- jettask/backend/api/v1/__init__.py +0 -17
- jettask/backend/api/v1/monitoring.py +0 -431
- jettask/backend/api/v1/namespaces.py +0 -504
- jettask/backend/api/v1/queues.py +0 -342
- jettask/backend/api/v1/tasks.py +0 -367
- jettask/backend/core/__init__.py +0 -3
- jettask/backend/core/cache.py +0 -221
- jettask/backend/core/database.py +0 -200
- jettask/backend/core/exceptions.py +0 -102
- jettask/backend/dependencies.py +0 -261
- jettask/backend/init_meta_db.py +0 -158
- jettask/backend/main.py +0 -1426
- jettask/backend/main_unified.py +0 -78
- jettask/backend/main_v2.py +0 -394
- jettask/backend/models/__init__.py +0 -3
- jettask/backend/models/requests.py +0 -236
- jettask/backend/models/responses.py +0 -230
- jettask/backend/namespace_api_old.py +0 -267
- jettask/backend/services/__init__.py +0 -3
- jettask/backend/start.py +0 -42
- jettask/backend/unified_api_router.py +0 -1541
- jettask/cleanup_deprecated_tables.sql +0 -16
- jettask/core/consumer_manager.py +0 -1695
- jettask/core/delay_scanner.py +0 -256
- jettask/core/event_pool.py +0 -1700
- jettask/core/heartbeat_process.py +0 -222
- jettask/core/task_batch.py +0 -153
- jettask/core/worker_scanner.py +0 -271
- jettask/executors/__init__.py +0 -5
- jettask/executors/asyncio.py +0 -876
- jettask/executors/base.py +0 -30
- jettask/executors/common.py +0 -148
- jettask/executors/multi_asyncio.py +0 -309
- jettask/gradio_app.py +0 -570
- jettask/integrated_gradio_app.py +0 -1088
- jettask/main.py +0 -0
- jettask/monitoring/__init__.py +0 -3
- jettask/pg_consumer.py +0 -1896
- jettask/run_monitor.py +0 -22
- jettask/run_webui.py +0 -148
- jettask/scheduler/multi_namespace_scheduler.py +0 -294
- jettask/scheduler/unified_manager.py +0 -450
- jettask/task_center_client.py +0 -150
- jettask/utils/serializer_optimized.py +0 -33
- jettask/webui_exceptions.py +0 -67
- jettask-0.2.18.dist-info/RECORD +0 -150
- /jettask/{constants.py → config/constants.py} +0 -0
- /jettask/{backend/config.py → config/task_center.py} +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
- /jettask/{models.py → persistence/models.py} +0 -0
- /jettask/scheduler/{manager.py → task_crud.py} +0 -0
- /jettask/{schema.sql → schemas/schema.sql} +0 -0
- /jettask/{task_center.py → task/task_center/client.py} +0 -0
- /jettask/{monitoring → utils}/file_watcher.py +0 -0
- /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
- /jettask/{api/v1 → webui/api}/__init__.py +0 -0
- /jettask/{webui_config.py → webui/config.py} +0 -0
- /jettask/{webui_models → webui/models}/__init__.py +0 -0
- /jettask/{webui_models → webui/models}/namespace.py +0 -0
- /jettask/{services → webui/services}/alert_service.py +0 -0
- /jettask/{services → webui/services}/analytics_service.py +0 -0
- /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
- /jettask/{services → webui/services}/task_service.py +0 -0
- /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
- /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
- {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
- {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/pg_consumer.py
DELETED
@@ -1,1896 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
"""简化版的 PostgreSQL Consumer - 只保留必要功能"""
|
3
|
-
|
4
|
-
import asyncio
|
5
|
-
import json
|
6
|
-
import logging
|
7
|
-
import msgpack
|
8
|
-
import os
|
9
|
-
import time
|
10
|
-
from typing import Dict, List, Optional, Any, Set
|
11
|
-
from datetime import datetime, timezone
|
12
|
-
from collections import defaultdict
|
13
|
-
|
14
|
-
import redis.asyncio as redis
|
15
|
-
from redis.asyncio import Redis
|
16
|
-
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
17
|
-
from sqlalchemy.orm import sessionmaker
|
18
|
-
from sqlalchemy import text
|
19
|
-
|
20
|
-
from jettask.webui_config import PostgreSQLConfig, RedisConfig
|
21
|
-
from jettask.core.consumer_manager import ConsumerManager, ConsumerStrategy
|
22
|
-
from jettask.core.offline_worker_recovery import OfflineWorkerRecovery
|
23
|
-
from jettask.constants import is_internal_consumer, TASK_STATUS_PRIORITY
|
24
|
-
|
25
|
-
logger = logging.getLogger(__name__)
|
26
|
-
|
27
|
-
# 注释掉调试文件写入,避免权限问题
|
28
|
-
# logger_f = open(f'./pg_consumer.txt', 'a+')
|
29
|
-
|
30
|
-
# 使用 constants.py 中定义的任务状态优先级
|
31
|
-
# STATUS_PRIORITY 已从 constants.py 导入为 TASK_STATUS_PRIORITY
|
32
|
-
class PostgreSQLConsumer:
|
33
|
-
"""PostgreSQL消费者,从Redis队列消费任务并持久化到PostgreSQL
|
34
|
-
|
35
|
-
支持多租户(命名空间)隔离
|
36
|
-
"""
|
37
|
-
|
38
|
-
def __init__(self, pg_config: PostgreSQLConfig, redis_config: RedisConfig, prefix: str = "jettask",
|
39
|
-
node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT,
|
40
|
-
namespace_id: str = None, namespace_name: str = None,
|
41
|
-
enable_backlog_monitor: bool = True, backlog_monitor_interval: int = 1):
|
42
|
-
self.pg_config = pg_config
|
43
|
-
self.redis_config = redis_config
|
44
|
-
self.prefix = prefix
|
45
|
-
|
46
|
-
# 命名空间支持
|
47
|
-
self.namespace_id = namespace_id
|
48
|
-
self.namespace_name = namespace_name or "default"
|
49
|
-
self.redis_client: Optional[Redis] = None
|
50
|
-
self.async_engine = None
|
51
|
-
self.AsyncSessionLocal = None
|
52
|
-
self.consumer_group = f"{prefix}_pg_consumer"
|
53
|
-
|
54
|
-
# 节点标识
|
55
|
-
import socket
|
56
|
-
hostname = socket.gethostname()
|
57
|
-
self.node_id = node_id or f"{hostname}_{os.getpid()}"
|
58
|
-
|
59
|
-
# 使用 ConsumerManager 来管理 consumer_id
|
60
|
-
self.consumer_strategy = consumer_strategy
|
61
|
-
self.consumer_manager = None # 将在 start() 中初始化
|
62
|
-
self.consumer_id = None # 将从 ConsumerManager 获取
|
63
|
-
|
64
|
-
self._running = False
|
65
|
-
self._tasks = []
|
66
|
-
self._known_queues = set()
|
67
|
-
self._consecutive_errors = defaultdict(int)
|
68
|
-
|
69
|
-
# 内存中维护已处理的任务ID集合(用于优化查询)
|
70
|
-
self._processed_task_ids = set()
|
71
|
-
self._processed_ids_lock = asyncio.Lock() # 保护并发访问
|
72
|
-
# 定期清理过期的ID(防止内存无限增长)
|
73
|
-
self._processed_ids_max_size = 100000 # 最多保存10万个ID
|
74
|
-
self._processed_ids_cleanup_interval = 300 # 每5分钟清理一次
|
75
|
-
|
76
|
-
# 待重试的任务更新(任务ID -> 更新信息)
|
77
|
-
self._pending_updates = {}
|
78
|
-
self._pending_updates_lock = asyncio.Lock()
|
79
|
-
self._max_pending_updates = 10000 # 最多保存1万个待重试更新
|
80
|
-
self._retry_interval = 5 # 每5秒重试一次
|
81
|
-
|
82
|
-
# 动态批次大小
|
83
|
-
self.batch_size = 2000
|
84
|
-
self.min_batch_size = 500
|
85
|
-
self.max_batch_size = 5000
|
86
|
-
|
87
|
-
# Stream积压监控配置
|
88
|
-
self.enable_backlog_monitor = enable_backlog_monitor # 是否启用积压监控
|
89
|
-
self.backlog_monitor_interval = backlog_monitor_interval # 监控采集间隔(秒)
|
90
|
-
self.backlog_monitor_lock_key = f"{prefix}:BACKLOG_MONITOR_LOCK" # 分布式锁键
|
91
|
-
self.backlog_monitor_lock_ttl = backlog_monitor_interval * 2 # 锁的TTL(秒),设为采集间隔的2倍
|
92
|
-
|
93
|
-
# 队列注册表(替代scan命令)
|
94
|
-
self.queue_registry_key = f"{prefix}:QUEUE_REGISTRY" # 队列注册表的Redis key
|
95
|
-
self.stream_registry_key = f"{prefix}:STREAM_REGISTRY" # Stream注册表的Redis key(用于积压监控)
|
96
|
-
|
97
|
-
async def start(self):
|
98
|
-
"""启动消费者"""
|
99
|
-
logger.info(f"Starting PostgreSQL consumer (simplified) on node: {self.node_id}")
|
100
|
-
|
101
|
-
# 连接Redis
|
102
|
-
# 构建连接参数,只在密码非空时传递
|
103
|
-
async_redis_params = {
|
104
|
-
'host': self.redis_config.host,
|
105
|
-
'port': self.redis_config.port,
|
106
|
-
'db': self.redis_config.db,
|
107
|
-
'decode_responses': False
|
108
|
-
}
|
109
|
-
if self.redis_config.password:
|
110
|
-
async_redis_params['password'] = self.redis_config.password
|
111
|
-
|
112
|
-
self.redis_client = await redis.Redis(**async_redis_params)
|
113
|
-
|
114
|
-
# 初始化 ConsumerManager(需要同步的 Redis 客户端)
|
115
|
-
import redis as sync_redis
|
116
|
-
# 构建连接参数,只在密码非空时传递
|
117
|
-
sync_redis_params = {
|
118
|
-
'host': self.redis_config.host,
|
119
|
-
'port': self.redis_config.port,
|
120
|
-
'db': self.redis_config.db,
|
121
|
-
'decode_responses': True # 使用字符串模式,与其他组件保持一致
|
122
|
-
}
|
123
|
-
if self.redis_config.password:
|
124
|
-
sync_redis_params['password'] = self.redis_config.password
|
125
|
-
|
126
|
-
sync_redis_client = sync_redis.StrictRedis(**sync_redis_params)
|
127
|
-
|
128
|
-
# 配置 ConsumerManager
|
129
|
-
# 初始队列列表包含TASK_CHANGES,其他队列会动态添加
|
130
|
-
initial_queues = ['TASK_CHANGES'] # TASK_CHANGES是固定的
|
131
|
-
consumer_config = {
|
132
|
-
'redis_prefix': self.prefix,
|
133
|
-
'queues': initial_queues,
|
134
|
-
'worker_prefix': 'PG_CONSUMER', # 使用不同的前缀,与task worker区分开
|
135
|
-
}
|
136
|
-
|
137
|
-
self.consumer_manager = ConsumerManager(
|
138
|
-
redis_client=sync_redis_client,
|
139
|
-
strategy=self.consumer_strategy,
|
140
|
-
config=consumer_config
|
141
|
-
)
|
142
|
-
|
143
|
-
# 获取稳定的 consumer_id(使用TASK_CHANGES作为基准队列)
|
144
|
-
self.consumer_id = self.consumer_manager.get_consumer_name('TASK_CHANGES')
|
145
|
-
logger.debug(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
|
146
|
-
|
147
|
-
# 创建SQLAlchemy异步引擎
|
148
|
-
if self.pg_config.dsn.startswith('postgresql://'):
|
149
|
-
dsn = self.pg_config.dsn.replace('postgresql://', 'postgresql+asyncpg://', 1)
|
150
|
-
else:
|
151
|
-
dsn = self.pg_config.dsn
|
152
|
-
|
153
|
-
self.async_engine = create_async_engine(
|
154
|
-
dsn,
|
155
|
-
pool_size=50,
|
156
|
-
max_overflow=20,
|
157
|
-
pool_pre_ping=True,
|
158
|
-
pool_recycle=300,
|
159
|
-
echo=False
|
160
|
-
)
|
161
|
-
|
162
|
-
# 预热连接池
|
163
|
-
logger.debug("Pre-warming database connection pool...")
|
164
|
-
async with self.async_engine.begin() as conn:
|
165
|
-
await conn.execute(text("SELECT 1"))
|
166
|
-
|
167
|
-
# 创建异步会话工厂
|
168
|
-
self.AsyncSessionLocal = sessionmaker(
|
169
|
-
self.async_engine,
|
170
|
-
class_=AsyncSession,
|
171
|
-
expire_on_commit=False
|
172
|
-
)
|
173
|
-
|
174
|
-
|
175
|
-
self._running = True
|
176
|
-
|
177
|
-
# 先进行一次队列发现,确保ConsumerManager有正确的队列列表
|
178
|
-
await self._initial_queue_discovery()
|
179
|
-
|
180
|
-
# 创建离线worker恢复器(用于恢复TASK_CHANGES stream的离线消息)
|
181
|
-
self.offline_recovery = OfflineWorkerRecovery(
|
182
|
-
async_redis_client=self.redis_client,
|
183
|
-
redis_prefix=self.prefix,
|
184
|
-
worker_prefix='PG_CONSUMER', # 使用PG_CONSUMER前缀
|
185
|
-
consumer_manager=self.consumer_manager
|
186
|
-
)
|
187
|
-
|
188
|
-
# 启动消费任务(简化版:只保留必要的任务)
|
189
|
-
self._tasks = [
|
190
|
-
asyncio.create_task(self._consume_queues()), # 消费新任务
|
191
|
-
asyncio.create_task(self._consume_task_changes()), # 消费任务变更事件
|
192
|
-
asyncio.create_task(self._database_maintenance()), # 数据库维护
|
193
|
-
asyncio.create_task(self._retry_pending_updates()), # 重试待更新的任务
|
194
|
-
asyncio.create_task(self._start_offline_recovery()) # 离线worker恢复服务
|
195
|
-
]
|
196
|
-
|
197
|
-
# 如果启用了积压监控,添加监控任务
|
198
|
-
if self.enable_backlog_monitor:
|
199
|
-
self._tasks.append(
|
200
|
-
asyncio.create_task(self._stream_backlog_monitor()) # Stream积压监控
|
201
|
-
)
|
202
|
-
logger.info(f"Stream backlog monitor enabled with {self.backlog_monitor_interval}s interval")
|
203
|
-
|
204
|
-
# 如果使用 HEARTBEAT 策略,ConsumerManager 会自动管理心跳
|
205
|
-
if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and self.consumer_manager:
|
206
|
-
# 启动心跳(ConsumerManager 内部会处理)
|
207
|
-
logger.debug("Heartbeat is managed by ConsumerManager")
|
208
|
-
|
209
|
-
logger.debug("PostgreSQL consumer started successfully")
|
210
|
-
|
211
|
-
async def stop(self):
|
212
|
-
"""停止消费者"""
|
213
|
-
logger.debug("Stopping PostgreSQL consumer...")
|
214
|
-
self._running = False
|
215
|
-
|
216
|
-
# 停止离线恢复服务
|
217
|
-
if hasattr(self, 'offline_recovery'):
|
218
|
-
self.offline_recovery.stop() # stop() 不是异步方法
|
219
|
-
|
220
|
-
# 取消所有任务
|
221
|
-
for task in self._tasks:
|
222
|
-
task.cancel()
|
223
|
-
|
224
|
-
# 等待任务完成
|
225
|
-
await asyncio.gather(*self._tasks, return_exceptions=True)
|
226
|
-
|
227
|
-
# 清理 ConsumerManager
|
228
|
-
if self.consumer_manager:
|
229
|
-
try:
|
230
|
-
self.consumer_manager.cleanup()
|
231
|
-
logger.debug(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
|
232
|
-
except Exception as e:
|
233
|
-
logger.error(f"Error cleaning up ConsumerManager: {e}")
|
234
|
-
|
235
|
-
# 关闭连接
|
236
|
-
if self.redis_client:
|
237
|
-
await self.redis_client.close()
|
238
|
-
|
239
|
-
if self.async_engine:
|
240
|
-
await self.async_engine.dispose()
|
241
|
-
|
242
|
-
logger.debug("PostgreSQL consumer stopped")
|
243
|
-
|
244
|
-
async def _initial_queue_discovery(self):
|
245
|
-
"""初始队列发现,在启动时执行一次 - 使用队列注册表替代scan"""
|
246
|
-
try:
|
247
|
-
new_queues = set()
|
248
|
-
logger.info(f"Starting initial queue discovery from queue registry: {self.queue_registry_key}")
|
249
|
-
|
250
|
-
# 从队列注册表获取所有队列
|
251
|
-
queue_members = await self.redis_client.smembers(self.queue_registry_key.encode())
|
252
|
-
for queue_name_bytes in queue_members:
|
253
|
-
queue_name = queue_name_bytes.decode('utf-8') if isinstance(queue_name_bytes, bytes) else str(queue_name_bytes)
|
254
|
-
new_queues.add(queue_name)
|
255
|
-
logger.info(f"Found registered queue: {queue_name}")
|
256
|
-
|
257
|
-
# 如果注册表为空,进行一次性的scan作为初始化(仅在首次运行时)
|
258
|
-
if not new_queues:
|
259
|
-
logger.warning(f"Queue registry is empty, performing one-time scan initialization...")
|
260
|
-
pattern = f"{self.prefix}:QUEUE:*"
|
261
|
-
async for key in self.redis_client.scan_iter(match=pattern, count=100):
|
262
|
-
key_str = key.decode('utf-8')
|
263
|
-
parts = key_str.split(":")
|
264
|
-
if len(parts) >= 3:
|
265
|
-
# 去掉前缀和QUEUE部分
|
266
|
-
queue_parts = parts[2:] # 从第3部分开始是队列名
|
267
|
-
queue_name = ":".join(queue_parts) # 重新组合,保留优先级部分
|
268
|
-
new_queues.add(queue_name)
|
269
|
-
logger.info(f"Found queue during scan: {queue_name} from key: {key_str}")
|
270
|
-
|
271
|
-
# 将发现的队列添加到注册表中
|
272
|
-
if new_queues:
|
273
|
-
pipeline = self.redis_client.pipeline()
|
274
|
-
for queue_name in new_queues:
|
275
|
-
pipeline.sadd(self.queue_registry_key.encode(), queue_name.encode())
|
276
|
-
await pipeline.execute()
|
277
|
-
logger.info(f"Registered {len(new_queues)} queues to registry during initialization")
|
278
|
-
|
279
|
-
if new_queues:
|
280
|
-
logger.info(f"Initial queue discovery found {len(new_queues)} queues: {new_queues}")
|
281
|
-
# 合并所有队列:TASK_CHANGES + 动态发现的队列
|
282
|
-
all_queues = list(new_queues) + ['TASK_CHANGES']
|
283
|
-
|
284
|
-
# 更新ConsumerManager的配置
|
285
|
-
if self.consumer_manager:
|
286
|
-
self.consumer_manager.config['queues'] = all_queues
|
287
|
-
|
288
|
-
# 更新worker的队列信息
|
289
|
-
# 获取实际的consumer_id(从心跳策略中)
|
290
|
-
if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
|
291
|
-
actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
|
292
|
-
else:
|
293
|
-
# 从consumer_name中提取(格式:consumer_id-queue)
|
294
|
-
actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
|
295
|
-
|
296
|
-
worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
|
297
|
-
try:
|
298
|
-
# 使用同步Redis客户端更新
|
299
|
-
self.consumer_manager.redis_client.hset(
|
300
|
-
worker_key,
|
301
|
-
'queues',
|
302
|
-
','.join(all_queues)
|
303
|
-
)
|
304
|
-
logger.debug(f"Initial queue discovery - found queues: {all_queues}")
|
305
|
-
except Exception as e:
|
306
|
-
logger.error(f"Error updating initial worker queues: {e}")
|
307
|
-
|
308
|
-
self._known_queues = new_queues
|
309
|
-
|
310
|
-
except Exception as e:
|
311
|
-
logger.error(f"Error in initial queue discovery: {e}")
|
312
|
-
|
313
|
-
async def _discover_queues(self):
|
314
|
-
"""定期发现新队列 - 使用队列注册表替代scan"""
|
315
|
-
while self._running:
|
316
|
-
try:
|
317
|
-
new_queues = set()
|
318
|
-
|
319
|
-
# 从队列注册表获取所有队列
|
320
|
-
queue_members = await self.redis_client.smembers(self.queue_registry_key.encode())
|
321
|
-
for queue_name_bytes in queue_members:
|
322
|
-
queue_name = queue_name_bytes.decode('utf-8') if isinstance(queue_name_bytes, bytes) else str(queue_name_bytes)
|
323
|
-
new_queues.add(queue_name)
|
324
|
-
|
325
|
-
# 优化:添加日志,只在队列数量或内容发生变化时记录
|
326
|
-
if len(new_queues) != len(self._known_queues) or new_queues != self._known_queues:
|
327
|
-
logger.debug(f"Queue registry contains {len(new_queues)} queues: {sorted(new_queues)}")
|
328
|
-
|
329
|
-
# 为新发现的队列创建消费者组(注意:新队列应该通过生产者自动注册)
|
330
|
-
new_discovered = new_queues - self._known_queues
|
331
|
-
if new_discovered:
|
332
|
-
for queue in new_discovered:
|
333
|
-
# 正确构建stream_key,保留优先级部分
|
334
|
-
stream_key = f"{self.prefix}:QUEUE:{queue}"
|
335
|
-
try:
|
336
|
-
await self.redis_client.xgroup_create(
|
337
|
-
stream_key, self.consumer_group, id='0', mkstream=True
|
338
|
-
)
|
339
|
-
logger.info(f"Created consumer group for new queue: {queue} with stream_key: {stream_key}")
|
340
|
-
except redis.ResponseError:
|
341
|
-
pass
|
342
|
-
|
343
|
-
# 更新ConsumerManager的队列列表(同步操作)
|
344
|
-
if new_queues != self._known_queues:
|
345
|
-
logger.info(f"Queue discovery: found {len(new_queues)} queues: {new_queues}")
|
346
|
-
# 合并所有队列:TASK_CHANGES + 动态发现的队列
|
347
|
-
all_queues = list(new_queues) + ['TASK_CHANGES']
|
348
|
-
|
349
|
-
# 更新ConsumerManager的配置
|
350
|
-
if self.consumer_manager:
|
351
|
-
self.consumer_manager.config['queues'] = all_queues
|
352
|
-
|
353
|
-
# 更新worker的队列信息
|
354
|
-
# 获取实际的consumer_id(从心跳策略中)
|
355
|
-
if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
|
356
|
-
actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
|
357
|
-
else:
|
358
|
-
# 从consumer_name中提取(格式:consumer_id-queue)
|
359
|
-
actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
|
360
|
-
|
361
|
-
worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
|
362
|
-
try:
|
363
|
-
# 使用同步Redis客户端更新
|
364
|
-
self.consumer_manager.redis_client.hset(
|
365
|
-
worker_key,
|
366
|
-
'queues',
|
367
|
-
','.join(all_queues)
|
368
|
-
)
|
369
|
-
logger.debug(f"Updated ConsumerManager queues: {all_queues}")
|
370
|
-
except Exception as e:
|
371
|
-
logger.error(f"Error updating worker queues: {e}")
|
372
|
-
|
373
|
-
self._known_queues = new_queues
|
374
|
-
await asyncio.sleep(10) # 保持较短的检查间隔,确保新队列能及时发现
|
375
|
-
|
376
|
-
except Exception as e:
|
377
|
-
import traceback
|
378
|
-
traceback.print_exc()
|
379
|
-
logger.error(f"Error discovering queues: {e}")
|
380
|
-
await asyncio.sleep(10)
|
381
|
-
|
382
|
-
async def _consume_queue(self, queue_name: str):
|
383
|
-
"""消费单个队列的任务(包括优先级队列)"""
|
384
|
-
# logger.info(f"Starting to consume queue: {queue_name}")
|
385
|
-
# 判断是否是优先级队列
|
386
|
-
is_priority_queue = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
|
387
|
-
|
388
|
-
if is_priority_queue:
|
389
|
-
# 优先级队列格式:base_queue:priority (如 robust_bench2:2)
|
390
|
-
base_queue = queue_name.rsplit(':', 1)[0]
|
391
|
-
priority = queue_name.rsplit(':', 1)[1]
|
392
|
-
stream_key = f"{self.prefix}:QUEUE:{base_queue}:{priority}"
|
393
|
-
else:
|
394
|
-
# 普通队列
|
395
|
-
stream_key = f"{self.prefix}:QUEUE:{queue_name}"
|
396
|
-
|
397
|
-
logger.debug(f"Consuming queue: {queue_name}, stream_key: {stream_key}, is_priority: {is_priority_queue}")
|
398
|
-
|
399
|
-
check_backlog = True
|
400
|
-
lastid = "0-0"
|
401
|
-
|
402
|
-
# pg_consumer 应该使用统一的 consumer_id,而不是为每个队列创建新的
|
403
|
-
# 因为 pg_consumer 的职责是消费所有队列的消息并写入数据库
|
404
|
-
# 它不是真正的任务执行者,所以不需要为每个队列创建独立的 consumer
|
405
|
-
consumer_name = self.consumer_id
|
406
|
-
|
407
|
-
# ConsumerManager会自动处理离线worker的pending消息恢复
|
408
|
-
# 不需要手动恢复
|
409
|
-
|
410
|
-
while self._running and queue_name in self._known_queues:
|
411
|
-
try:
|
412
|
-
myid = lastid if check_backlog else ">"
|
413
|
-
|
414
|
-
messages = await self.redis_client.xreadgroup(
|
415
|
-
self.consumer_group,
|
416
|
-
consumer_name, # 使用ConsumerManager管理的consumer_name
|
417
|
-
{stream_key: myid},
|
418
|
-
count=10000,
|
419
|
-
block=1000 if not check_backlog else 0
|
420
|
-
)
|
421
|
-
if not messages or (messages and len(messages[0][1]) == 0):
|
422
|
-
check_backlog = False
|
423
|
-
continue
|
424
|
-
|
425
|
-
if messages:
|
426
|
-
await self._process_messages(messages)
|
427
|
-
self._consecutive_errors[queue_name] = 0
|
428
|
-
|
429
|
-
if messages[0] and messages[0][1]:
|
430
|
-
lastid = messages[0][1][-1][0].decode('utf-8') if isinstance(messages[0][1][-1][0], bytes) else messages[0][1][-1][0]
|
431
|
-
check_backlog = len(messages[0][1]) >= 2000
|
432
|
-
|
433
|
-
except redis.ResponseError as e:
|
434
|
-
if "NOGROUP" in str(e):
|
435
|
-
try:
|
436
|
-
await self.redis_client.xgroup_create(
|
437
|
-
stream_key, self.consumer_group, id='0', mkstream=True
|
438
|
-
)
|
439
|
-
logger.debug(f"Recreated consumer group for queue: {queue_name}")
|
440
|
-
check_backlog = True
|
441
|
-
lastid = "0-0"
|
442
|
-
except:
|
443
|
-
pass
|
444
|
-
else:
|
445
|
-
logger.error(f"Redis error for queue {queue_name}: {e}")
|
446
|
-
self._consecutive_errors[queue_name] += 1
|
447
|
-
|
448
|
-
if self._consecutive_errors[queue_name] > 10:
|
449
|
-
logger.debug(f"Too many errors for queue {queue_name}, will retry later")
|
450
|
-
await asyncio.sleep(30)
|
451
|
-
self._consecutive_errors[queue_name] = 0
|
452
|
-
|
453
|
-
except Exception as e:
|
454
|
-
logger.error(f"Error consuming queue {queue_name}: {e}", exc_info=True)
|
455
|
-
self._consecutive_errors[queue_name] += 1
|
456
|
-
await asyncio.sleep(1)
|
457
|
-
|
458
|
-
async def _consume_queues(self):
|
459
|
-
"""启动所有队列的消费任务"""
|
460
|
-
discover_task = asyncio.create_task(self._discover_queues())
|
461
|
-
queue_tasks = {}
|
462
|
-
while self._running:
|
463
|
-
try:
|
464
|
-
for queue in self._known_queues:
|
465
|
-
if queue not in queue_tasks or queue_tasks[queue].done():
|
466
|
-
queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
|
467
|
-
logger.debug(f"Started consumer task for queue: {queue}")
|
468
|
-
|
469
|
-
for queue in list(queue_tasks.keys()):
|
470
|
-
if queue not in self._known_queues:
|
471
|
-
queue_tasks[queue].cancel()
|
472
|
-
del queue_tasks[queue]
|
473
|
-
logger.debug(f"Stopped consumer task for removed queue: {queue}")
|
474
|
-
|
475
|
-
await asyncio.sleep(10)
|
476
|
-
|
477
|
-
except Exception as e:
|
478
|
-
logger.error(f"Error in consume_queues manager: {e}")
|
479
|
-
await asyncio.sleep(5)
|
480
|
-
|
481
|
-
discover_task.cancel()
|
482
|
-
for task in queue_tasks.values():
|
483
|
-
task.cancel()
|
484
|
-
|
485
|
-
await asyncio.gather(discover_task, *queue_tasks.values(), return_exceptions=True)
|
486
|
-
|
487
|
-
async def _process_messages(self, messages: List):
|
488
|
-
"""处理消息并保存到PostgreSQL"""
|
489
|
-
tasks_to_insert = []
|
490
|
-
ack_batch = []
|
491
|
-
|
492
|
-
for stream_key, stream_messages in messages:
|
493
|
-
if not stream_messages:
|
494
|
-
continue
|
495
|
-
|
496
|
-
stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
|
497
|
-
msg_ids_to_ack = []
|
498
|
-
|
499
|
-
for msg_id, data in stream_messages:
|
500
|
-
try:
|
501
|
-
if not msg_id or not data:
|
502
|
-
continue
|
503
|
-
|
504
|
-
msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
|
505
|
-
|
506
|
-
# 使用公共方法解析消息
|
507
|
-
task_info = self._parse_stream_message(msg_id_str, data)
|
508
|
-
if task_info:
|
509
|
-
tasks_to_insert.append(task_info)
|
510
|
-
msg_ids_to_ack.append(msg_id)
|
511
|
-
|
512
|
-
except Exception as e:
|
513
|
-
logger.error(f"Error processing message {msg_id}: {e}")
|
514
|
-
|
515
|
-
if msg_ids_to_ack:
|
516
|
-
ack_batch.append((stream_key, msg_ids_to_ack))
|
517
|
-
|
518
|
-
if tasks_to_insert:
|
519
|
-
await self._insert_tasks(tasks_to_insert)
|
520
|
-
|
521
|
-
# 将成功插入的任务ID添加到内存集合中
|
522
|
-
async with self._processed_ids_lock:
|
523
|
-
for task in tasks_to_insert:
|
524
|
-
self._processed_task_ids.add(task['id'])
|
525
|
-
|
526
|
-
# 如果集合过大,清理最早的一半
|
527
|
-
if len(self._processed_task_ids) > self._processed_ids_max_size:
|
528
|
-
# 只保留最新的一半ID
|
529
|
-
ids_list = list(self._processed_task_ids)
|
530
|
-
keep_count = self._processed_ids_max_size // 2
|
531
|
-
self._processed_task_ids = set(ids_list[-keep_count:])
|
532
|
-
logger.debug(f"Cleaned processed IDs cache, kept {keep_count} most recent IDs")
|
533
|
-
|
534
|
-
if ack_batch:
|
535
|
-
pipeline = self.redis_client.pipeline()
|
536
|
-
for stream_key, msg_ids in ack_batch:
|
537
|
-
pipeline.xack(stream_key, self.consumer_group, *msg_ids)
|
538
|
-
|
539
|
-
try:
|
540
|
-
await pipeline.execute()
|
541
|
-
total_acked = sum(len(msg_ids) for _, msg_ids in ack_batch)
|
542
|
-
logger.debug(f"Successfully ACKed {total_acked} messages")
|
543
|
-
except Exception as e:
|
544
|
-
logger.error(f"Error executing batch ACK: {e}")
|
545
|
-
|
546
|
-
async def _insert_tasks(self, tasks: List[Dict[str, Any]]):
|
547
|
-
"""批量插入任务到PostgreSQL(只处理tasks表)"""
|
548
|
-
if not tasks:
|
549
|
-
return
|
550
|
-
|
551
|
-
logger.info(f"Attempting to insert {len(tasks)} tasks to tasks table")
|
552
|
-
try:
|
553
|
-
async with self.AsyncSessionLocal() as session:
|
554
|
-
# 插入tasks表 - 使用批量INSERT忽略冲突
|
555
|
-
# 由于stream_id在实践中是唯一的,我们可以简单地忽略重复
|
556
|
-
tasks_query = text("""
|
557
|
-
INSERT INTO tasks (stream_id, queue, namespace, scheduled_task_id,
|
558
|
-
payload, priority, created_at, source, metadata)
|
559
|
-
VALUES (:stream_id, :queue, :namespace, :scheduled_task_id,
|
560
|
-
CAST(:payload AS jsonb), :priority, :created_at, :source, CAST(:metadata AS jsonb))
|
561
|
-
ON CONFLICT DO NOTHING
|
562
|
-
RETURNING stream_id;
|
563
|
-
""")
|
564
|
-
|
565
|
-
# 准备tasks表的数据
|
566
|
-
tasks_data = []
|
567
|
-
for task in tasks:
|
568
|
-
task_data = json.loads(task['task_data'])
|
569
|
-
|
570
|
-
# 从task_data中获取scheduled_task_id
|
571
|
-
scheduled_task_id = task_data.get('scheduled_task_id') or task.get('scheduled_task_id')
|
572
|
-
|
573
|
-
# 根据是否有scheduled_task_id来判断任务来源
|
574
|
-
if scheduled_task_id:
|
575
|
-
source = 'scheduler' # 定时任务
|
576
|
-
else:
|
577
|
-
source = 'redis_stream' # 普通任务
|
578
|
-
|
579
|
-
tasks_data.append({
|
580
|
-
'stream_id': task['id'], # Redis Stream ID作为stream_id
|
581
|
-
'queue': task['queue_name'],
|
582
|
-
'namespace': self.namespace_name,
|
583
|
-
'scheduled_task_id': str(scheduled_task_id) if scheduled_task_id else None,
|
584
|
-
'payload': task['task_data'], # 完整的任务数据
|
585
|
-
'priority': task['priority'],
|
586
|
-
'created_at': task['created_at'],
|
587
|
-
'source': source,
|
588
|
-
'metadata': task.get('metadata', '{}')
|
589
|
-
})
|
590
|
-
|
591
|
-
# 批量插入 - 使用executemany提高性能
|
592
|
-
logger.debug(f"Executing batch insert with {len(tasks_data)} tasks")
|
593
|
-
|
594
|
-
try:
|
595
|
-
# 使用executemany批量插入
|
596
|
-
result = await session.execute(tasks_query, tasks_data)
|
597
|
-
|
598
|
-
# 获取实际插入的记录数
|
599
|
-
inserted_count = result.rowcount
|
600
|
-
|
601
|
-
# if inserted_count > 0:
|
602
|
-
# logger.info(f"Successfully inserted {inserted_count} new tasks to tasks table")
|
603
|
-
# else:
|
604
|
-
# logger.info(f"No new tasks inserted (all may be duplicates)")
|
605
|
-
|
606
|
-
await session.commit()
|
607
|
-
logger.debug("Tasks table batch insert transaction completed")
|
608
|
-
|
609
|
-
except Exception as e:
|
610
|
-
logger.error(f"Error in batch insert, trying fallback: {e}")
|
611
|
-
await session.rollback()
|
612
|
-
|
613
|
-
# 如果批量插入失败,降级为小批量插入(每批10条)
|
614
|
-
batch_size = 10
|
615
|
-
total_inserted = 0
|
616
|
-
|
617
|
-
for i in range(0, len(tasks_data), batch_size):
|
618
|
-
batch = tasks_data[i:i+batch_size]
|
619
|
-
try:
|
620
|
-
result = await session.execute(tasks_query, batch)
|
621
|
-
batch_inserted = result.rowcount
|
622
|
-
if batch_inserted > 0:
|
623
|
-
total_inserted += batch_inserted
|
624
|
-
await session.commit()
|
625
|
-
except Exception as batch_error:
|
626
|
-
logger.error(f"Batch {i//batch_size + 1} failed: {batch_error}")
|
627
|
-
await session.rollback()
|
628
|
-
|
629
|
-
if total_inserted > 0:
|
630
|
-
logger.info(f"Fallback insert completed: {total_inserted} tasks inserted")
|
631
|
-
else:
|
632
|
-
logger.info(f"No new tasks inserted in fallback mode")
|
633
|
-
|
634
|
-
except Exception as e:
|
635
|
-
logger.error(f"Error inserting tasks to PostgreSQL: {e}")
|
636
|
-
|
637
|
-
async def _consume_task_changes(self):
|
638
|
-
"""消费任务变更事件流 - 基于事件驱动的更新(支持pending消息恢复)"""
|
639
|
-
change_stream_key = f"{self.prefix}:TASK_CHANGES"
|
640
|
-
consumer_group = f"{self.prefix}_changes_consumer"
|
641
|
-
|
642
|
-
# 使用 ConsumerManager 管理的 consumer name
|
643
|
-
# 这样 ConsumerManager 才能正确跟踪和恢复这个流的待处理消息
|
644
|
-
consumer_name = self.consumer_manager.get_consumer_name('TASK_CHANGES')
|
645
|
-
|
646
|
-
# 创建消费者组
|
647
|
-
try:
|
648
|
-
await self.redis_client.xgroup_create(
|
649
|
-
change_stream_key, consumer_group, id='0', mkstream=True
|
650
|
-
)
|
651
|
-
logger.debug(f"Created consumer group for task changes stream")
|
652
|
-
except redis.ResponseError:
|
653
|
-
pass
|
654
|
-
|
655
|
-
# 模仿 listen_event_by_task 的写法:先处理pending消息,再处理新消息
|
656
|
-
check_backlog = True
|
657
|
-
lastid = "0-0"
|
658
|
-
batch_size = 1000
|
659
|
-
|
660
|
-
while self._running:
|
661
|
-
try:
|
662
|
-
# 决定读取位置:如果有backlog,从lastid开始;否则读取新消息
|
663
|
-
if check_backlog:
|
664
|
-
myid = lastid
|
665
|
-
else:
|
666
|
-
myid = ">"
|
667
|
-
|
668
|
-
messages = await self.redis_client.xreadgroup(
|
669
|
-
consumer_group,
|
670
|
-
consumer_name, # 使用 ConsumerManager 管理的 consumer name
|
671
|
-
{change_stream_key: myid},
|
672
|
-
count=batch_size,
|
673
|
-
block=1000 if not check_backlog else 0 # backlog时不阻塞
|
674
|
-
)
|
675
|
-
|
676
|
-
if not messages:
|
677
|
-
check_backlog = False
|
678
|
-
continue
|
679
|
-
|
680
|
-
# 检查是否还有更多backlog消息
|
681
|
-
if messages and len(messages[0][1]) > 0:
|
682
|
-
check_backlog = len(messages[0][1]) >= batch_size
|
683
|
-
else:
|
684
|
-
check_backlog = False
|
685
|
-
|
686
|
-
# 收集消息ID和对应的task_id
|
687
|
-
msg_to_task = {} # msg_id -> task_id 映射
|
688
|
-
|
689
|
-
for _, stream_messages in messages:
|
690
|
-
for msg_id, data in stream_messages:
|
691
|
-
try:
|
692
|
-
# 更新lastid(无论消息是否处理成功)
|
693
|
-
if isinstance(msg_id, bytes):
|
694
|
-
lastid = msg_id.decode('utf-8')
|
695
|
-
else:
|
696
|
-
lastid = str(msg_id)
|
697
|
-
|
698
|
-
task_key = data[b'id']
|
699
|
-
task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
|
700
|
-
|
701
|
-
# 从完整的task_key格式提取stream_id
|
702
|
-
# 格式: namespace:TASK:stream_id:queue_name
|
703
|
-
stream_id = None
|
704
|
-
if ':TASK:' in task_key:
|
705
|
-
parts = task_key.split(':TASK:')
|
706
|
-
if len(parts) == 2:
|
707
|
-
# 再从右边部分提取stream_id
|
708
|
-
right_parts = parts[1].split(':')
|
709
|
-
if right_parts:
|
710
|
-
stream_id = right_parts[0] # 提取stream_id
|
711
|
-
|
712
|
-
if stream_id:
|
713
|
-
# 存储元组: (stream_id, task_key)
|
714
|
-
msg_to_task[msg_id] = (stream_id, task_key)
|
715
|
-
else:
|
716
|
-
logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
|
717
|
-
except Exception as e:
|
718
|
-
import traceback
|
719
|
-
traceback.print_exc()
|
720
|
-
logger.error(f"Error processing change event {msg_id}: {e} {data=}")
|
721
|
-
# 解析失败的消息也应该ACK,避免一直重试
|
722
|
-
await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
|
723
|
-
|
724
|
-
if msg_to_task:
|
725
|
-
# 批量更新任务,返回成功更新的task_id列表
|
726
|
-
# msg_to_task 的值现在是元组 (stream_id, task_key)
|
727
|
-
id_tuples = list(set(msg_to_task.values()))
|
728
|
-
logger.info(f"Processing {len(id_tuples)} task updates from change stream")
|
729
|
-
# logger_f.write(f'{id_tuples=} \n')
|
730
|
-
successful_tuples = await self._update_tasks_by_event(id_tuples)
|
731
|
-
|
732
|
-
# 只ACK成功更新的消息
|
733
|
-
ack_ids = []
|
734
|
-
failed_count = 0
|
735
|
-
for msg_id, id_tuple in msg_to_task.items():
|
736
|
-
if successful_tuples and id_tuple in successful_tuples:
|
737
|
-
ack_ids.append(msg_id)
|
738
|
-
else:
|
739
|
-
failed_count += 1
|
740
|
-
|
741
|
-
if ack_ids:
|
742
|
-
await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
|
743
|
-
if len(ack_ids) > 0:
|
744
|
-
logger.info(f"Updated {len(ack_ids)} task statuses")
|
745
|
-
|
746
|
-
if failed_count > 0:
|
747
|
-
logger.debug(f"Failed to update {failed_count} tasks, will retry")
|
748
|
-
|
749
|
-
except redis.ResponseError as e:
|
750
|
-
if "NOGROUP" in str(e):
|
751
|
-
# 如果消费者组不存在,重新创建
|
752
|
-
try:
|
753
|
-
await self.redis_client.xgroup_create(
|
754
|
-
change_stream_key, consumer_group, id='0', mkstream=True
|
755
|
-
)
|
756
|
-
logger.debug(f"Recreated consumer group for task changes stream")
|
757
|
-
check_backlog = True
|
758
|
-
lastid = "0-0"
|
759
|
-
except:
|
760
|
-
pass
|
761
|
-
else:
|
762
|
-
logger.error(f"Redis error in consume_task_changes: {e}")
|
763
|
-
await asyncio.sleep(1)
|
764
|
-
except Exception as e:
|
765
|
-
logger.error(f"Error in consume_task_changes: {e}", exc_info=True)
|
766
|
-
await asyncio.sleep(1)
|
767
|
-
|
768
|
-
async def _update_tasks_by_event(self, id_tuples: List[tuple]) -> Set[tuple]:
|
769
|
-
"""基于事件ID批量更新任务状态
|
770
|
-
|
771
|
-
Args:
|
772
|
-
id_tuples: 元组列表,每个元组为 (stream_id, task_key)
|
773
|
-
|
774
|
-
Returns:
|
775
|
-
成功更新的元组集合
|
776
|
-
"""
|
777
|
-
if not id_tuples:
|
778
|
-
return set()
|
779
|
-
|
780
|
-
successful_tuples = set()
|
781
|
-
|
782
|
-
try:
|
783
|
-
pipeline = self.redis_client.pipeline()
|
784
|
-
for stream_id, task_key in id_tuples:
|
785
|
-
pipeline.hgetall(task_key)
|
786
|
-
|
787
|
-
redis_values = await pipeline.execute()
|
788
|
-
updates = []
|
789
|
-
valid_tuples = [] # 记录有效的元组
|
790
|
-
if len(id_tuples) != len(redis_values):
|
791
|
-
logger.error(f'Mismatch: {len(id_tuples)=} {len(redis_values)=}')
|
792
|
-
# 不抛出异常,继续处理能处理的
|
793
|
-
|
794
|
-
for i, (stream_id, task_key) in enumerate(id_tuples):
|
795
|
-
if i >= len(redis_values):
|
796
|
-
logger.error(f'Missing redis value for task_key={task_key}')
|
797
|
-
continue
|
798
|
-
|
799
|
-
hash_data = redis_values[i]
|
800
|
-
|
801
|
-
if not hash_data:
|
802
|
-
logger.debug(f'No hash data for task_key={task_key}')
|
803
|
-
continue
|
804
|
-
|
805
|
-
try:
|
806
|
-
# 从task_key解析出consumer_group
|
807
|
-
# task_key格式: namespace:TASK:stream_id:group_name
|
808
|
-
# 其中group_name就是完整的consumer_group(格式: jettask:QUEUE:queue_name:task_name)
|
809
|
-
parts = task_key.split(':', 3) # 最多分割成4部分
|
810
|
-
if len(parts) == 4:
|
811
|
-
# parts[0] = namespace (如 'default')
|
812
|
-
# parts[1] = 'TASK'
|
813
|
-
# parts[2] = stream_id
|
814
|
-
# parts[3] = group_name (consumer_group)
|
815
|
-
consumer_group = parts[3] # 直接使用group_name作为consumer_group
|
816
|
-
logger.debug(f"Extracted consumer_group from task_key: {consumer_group}")
|
817
|
-
else:
|
818
|
-
logger.warning(f"Cannot parse consumer_group from task_key: {task_key}")
|
819
|
-
continue
|
820
|
-
|
821
|
-
# 从consumer_group中提取task_name
|
822
|
-
# consumer_group格式: prefix:QUEUE:queue:task_name (如 jettask:QUEUE:robust_bench2:robust_benchmark.benchmark_task)
|
823
|
-
task_name = None
|
824
|
-
if consumer_group:
|
825
|
-
parts = consumer_group.split(':')
|
826
|
-
if len(parts) >= 4:
|
827
|
-
# 最后一部分是task_name
|
828
|
-
task_name = parts[-1]
|
829
|
-
logger.debug(f"Extracted task_name '{task_name}' from consumer_group '{consumer_group}'")
|
830
|
-
|
831
|
-
# 使用stream_id作为任务ID
|
832
|
-
update_info = self._parse_task_hash(stream_id, hash_data)
|
833
|
-
if update_info:
|
834
|
-
# 添加consumer_group和task_name到更新信息中
|
835
|
-
update_info['consumer_group'] = consumer_group
|
836
|
-
update_info['task_name'] = task_name or 'unknown' # 如果无法提取task_name,使用'unknown'
|
837
|
-
# consumer_name就是worker_id(执行任务的实际worker)
|
838
|
-
update_info['consumer_name'] = update_info.get('worker_id')
|
839
|
-
updates.append(update_info)
|
840
|
-
valid_tuples.append((stream_id, task_key))
|
841
|
-
else:
|
842
|
-
logger.debug(f'Failed to parse stream_id={stream_id} hash_data={hash_data}')
|
843
|
-
except Exception as e:
|
844
|
-
logger.error(f'Error parsing task stream_id={stream_id}: {e}')
|
845
|
-
continue
|
846
|
-
if updates:
|
847
|
-
logger.info(f"Attempting to update {len(updates)} tasks, first few: {[u['id'] for u in updates[:3]]}")
|
848
|
-
# logger_f.write(f'{updates=} \n')
|
849
|
-
try:
|
850
|
-
# _update_tasks 现在返回成功更新的ID集合
|
851
|
-
batch_successful = await self._update_tasks(updates)
|
852
|
-
# 将成功的stream_id映射回元组
|
853
|
-
for stream_id in batch_successful:
|
854
|
-
for tuple_item in valid_tuples:
|
855
|
-
if tuple_item[0] == stream_id: # stream_id匹配
|
856
|
-
successful_tuples.add(tuple_item)
|
857
|
-
if batch_successful:
|
858
|
-
logger.info(f"Successfully updated {len(batch_successful)} tasks from change events")
|
859
|
-
else:
|
860
|
-
logger.warning(f"No tasks were successfully updated")
|
861
|
-
except Exception as e:
|
862
|
-
logger.error(f"Error in batch update: {e}")
|
863
|
-
# 批量更新失败,尝试逐个更新
|
864
|
-
for update, tuple_item in zip(updates, valid_tuples):
|
865
|
-
try:
|
866
|
-
single_successful = await self._update_tasks([update])
|
867
|
-
if update['id'] in single_successful:
|
868
|
-
successful_tuples.add(tuple_item)
|
869
|
-
except Exception as single_error:
|
870
|
-
logger.error(f"Failed to update task {tuple_item[0]}: {single_error}")
|
871
|
-
|
872
|
-
except Exception as e:
|
873
|
-
logger.error(f"Error updating tasks by event: {e}", exc_info=True)
|
874
|
-
logger.debug(f'{successful_tuples=}')
|
875
|
-
return successful_tuples
|
876
|
-
|
877
|
-
def _parse_task_hash(self, task_id: str, hash_data: dict) -> Optional[dict]:
|
878
|
-
"""解析Redis Hash数据"""
|
879
|
-
update_info = {
|
880
|
-
'id': task_id,
|
881
|
-
'status': None,
|
882
|
-
'result': None,
|
883
|
-
'error_message': None,
|
884
|
-
'started_at': None,
|
885
|
-
'completed_at': None,
|
886
|
-
'worker_id': None,
|
887
|
-
'execution_time': None,
|
888
|
-
'duration': None
|
889
|
-
}
|
890
|
-
|
891
|
-
try:
|
892
|
-
from jettask.utils.serializer import loads_str
|
893
|
-
|
894
|
-
hash_dict = {}
|
895
|
-
for k, v in hash_data.items():
|
896
|
-
key = k.decode('utf-8') if isinstance(k, bytes) else k
|
897
|
-
if isinstance(v, bytes):
|
898
|
-
try:
|
899
|
-
value = loads_str(v)
|
900
|
-
if isinstance(value, (dict, list)):
|
901
|
-
value = json.dumps(value, ensure_ascii=False)
|
902
|
-
else:
|
903
|
-
value = str(value)
|
904
|
-
except:
|
905
|
-
try:
|
906
|
-
value = v.decode('utf-8')
|
907
|
-
except:
|
908
|
-
value = str(v)
|
909
|
-
else:
|
910
|
-
value = v
|
911
|
-
hash_dict[key] = value
|
912
|
-
|
913
|
-
update_info['status'] = hash_dict.get('status')
|
914
|
-
update_info['error_message'] = hash_dict.get('error_msg') or hash_dict.get('exception')
|
915
|
-
|
916
|
-
# 转换时间戳
|
917
|
-
for time_field in ['started_at', 'completed_at']:
|
918
|
-
if hash_dict.get(time_field):
|
919
|
-
try:
|
920
|
-
time_str = hash_dict[time_field]
|
921
|
-
if isinstance(time_str, str) and time_str.startswith("b'") and time_str.endswith("'"):
|
922
|
-
time_str = time_str[2:-1]
|
923
|
-
update_info[time_field] = datetime.fromtimestamp(float(time_str), tz=timezone.utc)
|
924
|
-
except:
|
925
|
-
pass
|
926
|
-
|
927
|
-
update_info['worker_id'] = hash_dict.get('consumer') or hash_dict.get('worker_id')
|
928
|
-
|
929
|
-
# 转换数值 - 直接存储原始秒数值
|
930
|
-
for num_field in ['execution_time', 'duration']:
|
931
|
-
if hash_dict.get(num_field):
|
932
|
-
try:
|
933
|
-
num_str = hash_dict[num_field]
|
934
|
-
# 直接存储浮点数秒值
|
935
|
-
update_info[num_field] = float(num_str)
|
936
|
-
except:
|
937
|
-
pass
|
938
|
-
|
939
|
-
# 处理result
|
940
|
-
if 'result' in hash_dict:
|
941
|
-
result_str = hash_dict['result']
|
942
|
-
if result_str == 'null':
|
943
|
-
update_info['result'] = None
|
944
|
-
else:
|
945
|
-
update_info['result'] = result_str
|
946
|
-
|
947
|
-
# 只返回有数据的更新
|
948
|
-
if any(v is not None for k, v in update_info.items() if k != 'id'):
|
949
|
-
return update_info
|
950
|
-
|
951
|
-
except Exception as e:
|
952
|
-
logger.error(f"Failed to parse hash data for task {task_id}: {e}")
|
953
|
-
|
954
|
-
return None
|
955
|
-
|
956
|
-
async def _update_tasks(self, updates: List[Dict[str, Any]]) -> Set[str]:
|
957
|
-
"""批量更新任务状态(使用UPSERT逻辑处理task_runs表)
|
958
|
-
|
959
|
-
Returns:
|
960
|
-
成功更新的stream_id集合
|
961
|
-
"""
|
962
|
-
if not updates:
|
963
|
-
return set()
|
964
|
-
|
965
|
-
try:
|
966
|
-
async with self.AsyncSessionLocal() as session:
|
967
|
-
# V3结构:使用UPSERT逻辑处理task_runs表
|
968
|
-
stream_ids = [u['id'] for u in updates]
|
969
|
-
logger.info(f"Upserting {len(stream_ids)} task_runs records")
|
970
|
-
|
971
|
-
# 对于分区表,我们需要使用不同的UPSERT策略
|
972
|
-
# 先尝试UPDATE,如果没有更新到任何行,则INSERT
|
973
|
-
upsert_query = text("""
|
974
|
-
WITH updated AS (
|
975
|
-
UPDATE task_runs SET
|
976
|
-
consumer_name = COALESCE(CAST(:consumer_name AS TEXT), consumer_name),
|
977
|
-
status = CASE
|
978
|
-
WHEN CAST(:status AS TEXT) IS NULL THEN status
|
979
|
-
WHEN status = 'pending' THEN COALESCE(CAST(:status AS TEXT), status)
|
980
|
-
WHEN status = 'running' AND CAST(:status AS TEXT) IN ('success', 'failed', 'timeout', 'skipped') THEN CAST(:status AS TEXT)
|
981
|
-
WHEN status IN ('success', 'failed', 'timeout', 'skipped') THEN status
|
982
|
-
ELSE COALESCE(CAST(:status AS TEXT), status)
|
983
|
-
END,
|
984
|
-
result = CASE
|
985
|
-
WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN result
|
986
|
-
ELSE COALESCE(CAST(:result AS jsonb), result)
|
987
|
-
END,
|
988
|
-
error_message = CASE
|
989
|
-
WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN error_message
|
990
|
-
ELSE COALESCE(CAST(:error_message AS TEXT), error_message)
|
991
|
-
END,
|
992
|
-
start_time = COALESCE(CAST(:started_at AS TIMESTAMPTZ), start_time),
|
993
|
-
end_time = CASE
|
994
|
-
WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN end_time
|
995
|
-
ELSE COALESCE(CAST(:completed_at AS TIMESTAMPTZ), end_time)
|
996
|
-
END,
|
997
|
-
worker_id = COALESCE(CAST(:worker_id AS TEXT), worker_id),
|
998
|
-
duration = COALESCE(CAST(:duration AS DOUBLE PRECISION), duration),
|
999
|
-
execution_time = COALESCE(CAST(:execution_time AS DOUBLE PRECISION), execution_time),
|
1000
|
-
updated_at = CURRENT_TIMESTAMP
|
1001
|
-
WHERE stream_id = :stream_id AND consumer_group = :consumer_group
|
1002
|
-
RETURNING stream_id
|
1003
|
-
)
|
1004
|
-
INSERT INTO task_runs (
|
1005
|
-
stream_id, task_name, consumer_group, consumer_name, status, result, error_message,
|
1006
|
-
start_time, end_time, worker_id, duration, execution_time,
|
1007
|
-
created_at, updated_at
|
1008
|
-
)
|
1009
|
-
SELECT
|
1010
|
-
:stream_id, :task_name, :consumer_group, :consumer_name,
|
1011
|
-
COALESCE(CAST(:status AS TEXT), 'pending'),
|
1012
|
-
CAST(:result AS jsonb),
|
1013
|
-
CAST(:error_message AS TEXT),
|
1014
|
-
CAST(:started_at AS TIMESTAMPTZ),
|
1015
|
-
CAST(:completed_at AS TIMESTAMPTZ),
|
1016
|
-
CAST(:worker_id AS TEXT),
|
1017
|
-
CAST(:duration AS DOUBLE PRECISION),
|
1018
|
-
CAST(:execution_time AS DOUBLE PRECISION),
|
1019
|
-
CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
|
1020
|
-
WHERE NOT EXISTS (SELECT 1 FROM updated)
|
1021
|
-
RETURNING stream_id;
|
1022
|
-
""")
|
1023
|
-
|
1024
|
-
# 为每个更新转换参数名称(从id改为stream_id)
|
1025
|
-
run_updates = []
|
1026
|
-
for update in updates:
|
1027
|
-
run_update = update.copy()
|
1028
|
-
run_update['stream_id'] = run_update.pop('id') # 将id改为stream_id
|
1029
|
-
# consumer_group 已经在 update_info 中了,不需要额外处理
|
1030
|
-
run_updates.append(run_update)
|
1031
|
-
|
1032
|
-
# 批量执行UPSERT - 使用事务批处理提高性能
|
1033
|
-
successful_count = 0
|
1034
|
-
batch_size = 20 # 每批处理20条记录
|
1035
|
-
|
1036
|
-
for i in range(0, len(run_updates), batch_size):
|
1037
|
-
batch = run_updates[i:i+batch_size]
|
1038
|
-
|
1039
|
-
try:
|
1040
|
-
# 在一个事务中处理整批
|
1041
|
-
for run_update in batch:
|
1042
|
-
result = await session.execute(upsert_query, run_update)
|
1043
|
-
if result.rowcount > 0:
|
1044
|
-
successful_count += 1
|
1045
|
-
|
1046
|
-
# 批量提交
|
1047
|
-
await session.commit()
|
1048
|
-
logger.debug(f"Batch {i//batch_size + 1} committed: {len(batch)} records")
|
1049
|
-
|
1050
|
-
except Exception as e:
|
1051
|
-
logger.error(f"Batch {i//batch_size + 1} failed, trying individual records: {e}")
|
1052
|
-
await session.rollback()
|
1053
|
-
|
1054
|
-
# 如果批处理失败,回退到逐个处理这批记录
|
1055
|
-
for run_update in batch:
|
1056
|
-
try:
|
1057
|
-
result = await session.execute(upsert_query, run_update)
|
1058
|
-
await session.commit()
|
1059
|
-
if result.rowcount > 0:
|
1060
|
-
successful_count += 1
|
1061
|
-
except Exception as individual_error:
|
1062
|
-
logger.error(f"Individual upsert failed for {run_update.get('stream_id')}: {individual_error}")
|
1063
|
-
await session.rollback()
|
1064
|
-
|
1065
|
-
# 记录成功更新的数量
|
1066
|
-
if successful_count > 0:
|
1067
|
-
logger.info(f"Upserted {successful_count}/{len(run_updates)} task_runs records")
|
1068
|
-
|
1069
|
-
# 检查哪些任务是完成状态,需要从Redis中删除
|
1070
|
-
completed_task_keys = []
|
1071
|
-
for update in updates:
|
1072
|
-
status = update.get('status')
|
1073
|
-
# 如果状态是完成状态(success, error, cancel等)
|
1074
|
-
if status in ['success', 'error', 'failed', 'cancel', 'cancelled', 'timeout', 'skipped']:
|
1075
|
-
# 构建task_key
|
1076
|
-
# task_key格式: namespace:TASK:stream_id:group_name
|
1077
|
-
stream_id = update['id']
|
1078
|
-
consumer_group = update.get('consumer_group')
|
1079
|
-
if consumer_group:
|
1080
|
-
# 从consumer_group提取namespace
|
1081
|
-
# consumer_group格式: prefix:QUEUE:queue:task_name
|
1082
|
-
parts = consumer_group.split(':', 1)
|
1083
|
-
namespace = parts[0] if parts else 'default'
|
1084
|
-
task_key = f"{namespace}:TASK:{stream_id}:{consumer_group}"
|
1085
|
-
completed_task_keys.append(task_key)
|
1086
|
-
logger.info(f"Task {stream_id} with status {status} will be deleted from Redis: {task_key}")
|
1087
|
-
|
1088
|
-
# 从Redis中删除已完成的任务
|
1089
|
-
if completed_task_keys:
|
1090
|
-
try:
|
1091
|
-
pipeline = self.redis_client.pipeline()
|
1092
|
-
for task_key in completed_task_keys:
|
1093
|
-
pipeline.delete(task_key)
|
1094
|
-
deleted_results = await pipeline.execute()
|
1095
|
-
deleted_count = sum(1 for r in deleted_results if r > 0)
|
1096
|
-
if deleted_count > 0:
|
1097
|
-
logger.info(f"Deleted {deleted_count} completed tasks from Redis")
|
1098
|
-
except Exception as e:
|
1099
|
-
logger.error(f"Error deleting completed tasks from Redis: {e}")
|
1100
|
-
|
1101
|
-
# UPSERT 操作总是成功的,返回所有stream_id
|
1102
|
-
# 不需要复杂的错误处理,因为UPSERT保证了操作的原子性
|
1103
|
-
return set(stream_ids)
|
1104
|
-
|
1105
|
-
except Exception as e:
|
1106
|
-
logger.error(f"Error upserting task statuses: {e}")
|
1107
|
-
return set() # 出错时返回空集
|
1108
|
-
|
1109
|
-
async def _retry_pending_updates(self):
|
1110
|
-
"""定期重试待更新的任务"""
|
1111
|
-
while self._running:
|
1112
|
-
try:
|
1113
|
-
await asyncio.sleep(self._retry_interval) # 等待一段时间
|
1114
|
-
|
1115
|
-
# 获取待重试的更新
|
1116
|
-
async with self._pending_updates_lock:
|
1117
|
-
if not self._pending_updates:
|
1118
|
-
continue
|
1119
|
-
|
1120
|
-
# 取出所有待重试的更新
|
1121
|
-
pending_items = list(self._pending_updates.items())
|
1122
|
-
self._pending_updates.clear()
|
1123
|
-
|
1124
|
-
if pending_items:
|
1125
|
-
|
1126
|
-
# 重新尝试更新
|
1127
|
-
updates = [update_info for _, update_info in pending_items]
|
1128
|
-
logger.debug(f"Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}")
|
1129
|
-
logger_f.write(f'{time.time()=} Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}\n')
|
1130
|
-
logger_f.flush()
|
1131
|
-
await self._update_tasks(updates)
|
1132
|
-
|
1133
|
-
except Exception as e:
|
1134
|
-
logger.error(f"Error in retry pending updates: {e}")
|
1135
|
-
await asyncio.sleep(5)
|
1136
|
-
|
1137
|
-
async def _start_offline_recovery(self):
|
1138
|
-
"""启动离线worker恢复服务,恢复离线PG_CONSUMER的消息"""
|
1139
|
-
logger.debug("Starting offline worker recovery service for PG_CONSUMER")
|
1140
|
-
|
1141
|
-
# 等待consumer manager初始化和队列发现
|
1142
|
-
# await asyncio.sleep(5)
|
1143
|
-
|
1144
|
-
while self._running:
|
1145
|
-
try:
|
1146
|
-
total_recovered = 0
|
1147
|
-
|
1148
|
-
# 1. 恢复普通队列的消息
|
1149
|
-
# for queue in self._known_queues:
|
1150
|
-
# # logger.info(f'{queue=}')
|
1151
|
-
# try:
|
1152
|
-
# recovered = await self.offline_recovery.recover_offline_workers(
|
1153
|
-
# queue=queue,
|
1154
|
-
# current_consumer_name=self.consumer_id,
|
1155
|
-
# process_message_callback=self._process_recovered_queue_message
|
1156
|
-
# )
|
1157
|
-
|
1158
|
-
# if recovered > 0:
|
1159
|
-
# logger.info(f"Recovered {recovered} messages from queue {queue}")
|
1160
|
-
# total_recovered += recovered
|
1161
|
-
|
1162
|
-
# except Exception as e:
|
1163
|
-
# logger.error(f"Error recovering queue {queue}: {e}")
|
1164
|
-
|
1165
|
-
# 2. 恢复TASK_CHANGES stream的消息
|
1166
|
-
recovered = await self._recover_task_changes_offline_messages()
|
1167
|
-
if recovered > 0:
|
1168
|
-
logger.debug(f"Recovered {recovered} TASK_CHANGES messages")
|
1169
|
-
total_recovered += recovered
|
1170
|
-
|
1171
|
-
if total_recovered > 0:
|
1172
|
-
logger.debug(f"Total recovered {total_recovered} messages in this cycle")
|
1173
|
-
|
1174
|
-
# 每30秒扫描一次
|
1175
|
-
await asyncio.sleep(1)
|
1176
|
-
|
1177
|
-
except Exception as e:
|
1178
|
-
logger.error(f"Error in offline recovery service: {e}")
|
1179
|
-
await asyncio.sleep(10)
|
1180
|
-
|
1181
|
-
async def _recover_task_changes_offline_messages(self) -> int:
|
1182
|
-
"""恢复TASK_CHANGES stream的离线消息"""
|
1183
|
-
# 使用 OfflineWorkerRecovery 的标准接口
|
1184
|
-
try:
|
1185
|
-
# 为TASK_CHANGES定义自定义的队列格式化器
|
1186
|
-
def task_changes_formatter(queue):
|
1187
|
-
# 对于TASK_CHANGES,直接返回stream key(不加QUEUE:前缀)
|
1188
|
-
if queue == 'TASK_CHANGES':
|
1189
|
-
return f"{self.prefix}:TASK_CHANGES"
|
1190
|
-
else:
|
1191
|
-
return f"{self.prefix}:QUEUE:{queue}"
|
1192
|
-
|
1193
|
-
# 创建专门用于TASK_CHANGES的恢复器
|
1194
|
-
task_changes_recovery = OfflineWorkerRecovery(
|
1195
|
-
async_redis_client=self.redis_client,
|
1196
|
-
redis_prefix=self.prefix,
|
1197
|
-
worker_prefix='PG_CONSUMER',
|
1198
|
-
queue_formatter=task_changes_formatter
|
1199
|
-
)
|
1200
|
-
|
1201
|
-
# 调用标准的恢复方法
|
1202
|
-
# TASK_CHANGES作为队列名传入,会被正确处理
|
1203
|
-
recovered = await task_changes_recovery.recover_offline_workers(
|
1204
|
-
queue='TASK_CHANGES', # 这个队列名会用于查找离线worker
|
1205
|
-
current_consumer_name=self.consumer_id,
|
1206
|
-
process_message_callback=self._process_recovered_task_change_v2
|
1207
|
-
)
|
1208
|
-
|
1209
|
-
return recovered
|
1210
|
-
|
1211
|
-
except Exception as e:
|
1212
|
-
logger.error(f"Error in recover_task_changes_offline_messages: {e}")
|
1213
|
-
return 0
|
1214
|
-
|
1215
|
-
-    async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
-        """Process a recovered TASK_CHANGES message (matches the OfflineWorkerRecovery callback interface)."""
-        try:
-            logger.debug(f'Processing recovered TASK_CHANGES message {msg_data=}')
-            # Parse the message - it now carries task_id instead of event_id
-            if b'task_id' in msg_data:
-                # Unpack task_id with msgpack
-                compressed_task_id = msg_data[b'task_id']
-                task_key = msgpack.unpackb(compressed_task_id)
-                task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
-
-                # Extract stream_id from the full task_key
-                # Format: namespace:TASK:stream_id:queue_name
-                stream_id = None
-                if ':TASK:' in task_key:
-                    parts = task_key.split(':TASK:')
-                    if len(parts) == 2:
-                        # Take stream_id from the right-hand part
-                        right_parts = parts[1].split(':')
-                        if right_parts:
-                            stream_id = right_parts[0]  # Extract stream_id
-
-                if stream_id:
-                    logger.debug(f"Processing recovered TASK_CHANGES message: {stream_id} from offline worker {consumer_id}")
-                    # Update task state - pass a (stream_id, task_key) tuple
-                    await self._update_tasks_by_event([(stream_id, task_key)])
-                else:
-                    logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
-
-            # ACK the message
-            change_stream_key = f"{self.prefix}:TASK_CHANGES"
-            consumer_group = f"{self.prefix}_changes_consumer"
-            await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
-
-        except Exception as e:
-            logger.error(f"Error processing recovered task change {msg_id}: {e}")
-
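The recovery callback above pulls the Redis stream ID out of a task key shaped like namespace:TASK:stream_id:queue_name. A minimal, self-contained sketch of just that parsing step; the helper name and the sample key are illustrative assumptions, not part of the package:

    from typing import Optional

    def extract_stream_id(task_key: str) -> Optional[str]:
        """Return the stream_id segment of 'namespace:TASK:stream_id:queue_name', if present."""
        if ':TASK:' not in task_key:
            return None
        _, _, rest = task_key.partition(':TASK:')
        stream_id = rest.split(':', 1)[0]
        return stream_id or None

    # Hypothetical keys, for illustration only:
    assert extract_stream_id('jettask:TASK:1726000000000-0:orders') == '1726000000000-0'
    assert extract_stream_id('malformed-key') is None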
-    async def _database_maintenance(self):
-        """Run periodic database maintenance tasks."""
-        last_analyze_time = 0
-        analyze_interval = 7200  # Run ANALYZE every 2 hours
-
-        while self._running:
-            try:
-                current_time = time.time()
-
-                if current_time - last_analyze_time > analyze_interval:
-                    async with self.AsyncSessionLocal() as session:
-                        logger.debug("Running ANALYZE on tasks and task_runs tables...")
-                        await session.execute(text("ANALYZE tasks"))
-                        await session.execute(text("ANALYZE task_runs"))
-                        await session.commit()
-                        logger.debug("ANALYZE completed successfully for both tables")
-                    last_analyze_time = current_time
-
-                await asyncio.sleep(300)  # Check every 5 minutes
-
-            except Exception as e:
-                logger.error(f"Error in database maintenance: {e}")
-                await asyncio.sleep(60)
-
-    async def _stream_backlog_monitor(self):
-        """Stream backlog monitoring task - a distributed lock ensures only one instance collects."""
-        # await asyncio.sleep(10)  # Delay 10 seconds after startup
-
-        while self._running:
-            try:
-                # Try to acquire the distributed lock
-                lock_acquired = await self._try_acquire_monitor_lock()
-
-                if lock_acquired:
-                    try:
-                        logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
-                        await self._collect_stream_backlog_metrics()
-                        logger.debug("Stream backlog metrics collected successfully")
-                    finally:
-                        # Release the lock
-                        await self._release_monitor_lock()
-                else:
-                    logger.debug("Another instance is collecting backlog metrics, skipping...")
-
-                # Wait for the next collection cycle
-                await asyncio.sleep(self.backlog_monitor_interval)
-
-            except Exception as e:
-                logger.error(f"Error in stream backlog monitor: {e}")
-                await asyncio.sleep(30)  # Wait 30 seconds after an error
-
-    async def _try_acquire_monitor_lock(self) -> bool:
-        """Try to acquire the monitor lock (a native Redis lock)."""
-        try:
-            # Implement a distributed lock with SET NX EX
-            # NX: only set the key if it does not exist
-            # EX: expiry time in seconds
-            result = await self.redis_client.set(
-                self.backlog_monitor_lock_key.encode(),
-                self.node_id.encode(),  # The lock value is this node's ID
-                nx=True,  # Only set if absent
-                ex=self.backlog_monitor_lock_ttl  # Expiry time
-            )
-            return result is not None
-        except Exception as e:
-            logger.error(f"Error acquiring monitor lock: {e}")
-            return False
-
-    async def _release_monitor_lock(self):
-        """Release the monitor lock (only the lock this node holds)."""
-        try:
-            # Use a Lua script so we only delete a lock we own
-            lua_script = """
-            if redis.call("get", KEYS[1]) == ARGV[1] then
-                return redis.call("del", KEYS[1])
-            else
-                return 0
-            end
-            """
-            await self.redis_client.eval(
-                lua_script,
-                1,
-                self.backlog_monitor_lock_key.encode(),
-                self.node_id.encode()
-            )
-        except Exception as e:
-            logger.error(f"Error releasing monitor lock: {e}")
-
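The two methods above follow the usual Redis lock recipe: SET with NX and EX to acquire, and a compare-and-delete Lua script to release only a lock this node still owns. A standalone sketch of the same pattern with redis-py's asyncio client; the key name, TTL and uuid-based owner value are assumptions for illustration, not the consumer's actual configuration:

    import asyncio
    import uuid

    import redis.asyncio as redis

    RELEASE_LUA = """
    if redis.call("get", KEYS[1]) == ARGV[1] then
        return redis.call("del", KEYS[1])
    else
        return 0
    end
    """

    async def run_exclusive(client: redis.Redis, lock_key: str, ttl: int = 30) -> bool:
        owner = str(uuid.uuid4())  # unique value marks this holder
        # NX: set only if absent; EX: auto-expire so a crashed holder cannot block forever
        acquired = await client.set(lock_key, owner, nx=True, ex=ttl)
        if not acquired:
            return False  # another instance holds the lock
        try:
            await asyncio.sleep(0)  # do the exclusive work here
            return True
        finally:
            # delete the key only if it still carries our owner value
            await client.eval(RELEASE_LUA, 1, lock_key, owner)

    # Example (assumes a local Redis):
    # asyncio.run(run_exclusive(redis.Redis(), "example:BACKLOG_MONITOR_LOCK"))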
-    async def _collect_stream_backlog_metrics(self):
-        """Collect stream backlog metrics and save them to the database (offset-based)."""
-        try:
-            # Get the latest offset for every queue (QUEUE_OFFSETS)
-            queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
-            queue_offsets = {}
-            try:
-                # The client uses decode_responses=False, so decode manually
-                raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
-                for k, v in raw_queue_offsets.items():
-                    queue_name = k.decode() if isinstance(k, bytes) else k
-                    offset_value = v.decode() if isinstance(v, bytes) else v
-                    queue_offsets[queue_name] = int(offset_value)
-            except Exception as e:
-                logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")
-
-            # Get the consumed offset for every task group (TASK_OFFSETS)
-            task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
-            task_offsets = {}
-            try:
-                raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
-                for k, v in raw_task_offsets.items():
-                    task_key = k.decode() if isinstance(k, bytes) else k
-                    offset_value = v.decode() if isinstance(v, bytes) else v
-                    task_offsets[task_key] = int(offset_value)
-            except Exception as e:
-                logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")
-
-            # Use the stream registry instead of the SCAN command to obtain queue info
-            stream_info_map = {}  # {queue_name: [(stream_key, priority), ...]}
-
-            # Read the stream registry from Redis (a hash)
-            # Format: {"queue_name:priority": "stream_key"}
-            # For regular queues, priority is 0
-            stream_registry = await self.redis_client.hgetall(self.stream_registry_key.encode())
-
-            for queue_priority_bytes, stream_key_bytes in stream_registry.items():
-                queue_priority_str = queue_priority_bytes.decode() if isinstance(queue_priority_bytes, bytes) else str(queue_priority_bytes)
-                stream_key = stream_key_bytes.decode() if isinstance(stream_key_bytes, bytes) else str(stream_key_bytes)
-
-                # Parse queue_name and priority
-                if ':' in queue_priority_str:
-                    parts = queue_priority_str.rsplit(':', 1)
-                    if len(parts) == 2 and parts[1].isdigit():
-                        queue_name = parts[0]
-                        priority = int(parts[1])
-                    else:
-                        # If the last part is not numeric, it is a regular queue name containing a colon
-                        queue_name = queue_priority_str
-                        priority = 0
-                else:
-                    # Regular queue
-                    queue_name = queue_priority_str
-                    priority = 0
-
-                if queue_name not in stream_info_map:
-                    stream_info_map[queue_name] = []
-                stream_info_map[queue_name].append((stream_key, priority))
-
-            # If the stream registry is empty, do a one-time SCAN as initialization (first run only)
-            if not stream_info_map:
-                logger.warning(f"Stream registry is empty, performing one-time scan initialization...")
-                pattern = f"{self.prefix}:QUEUE:*".encode()
-                cursor = 0
-
-                while True:
-                    cursor, keys = await self.redis_client.scan(cursor, match=pattern, count=10000)
-
-                    for key in keys:
-                        key_str = key.decode()
-                        # Strip the "prefix:QUEUE:" prefix
-                        queue_part = key_str.replace(f"{self.prefix}:QUEUE:", "")
-
-                        # Check whether this is a priority queue (format: queue_name:priority)
-                        parts = queue_part.split(':')
-                        if len(parts) == 2 and parts[1].isdigit():
-                            # Priority queue
-                            queue_name = parts[0]
-                            priority = int(parts[1])
-                            queue_priority_key = f"{queue_name}:{priority}"
-                        elif ':' not in queue_part:
-                            # Regular queue (no colon)
-                            queue_name = queue_part
-                            priority = 0
-                            queue_priority_key = queue_name
-                        else:
-                            # Ignore keys in other formats (e.g. consumer groups)
-                            continue
-
-                        if queue_name not in stream_info_map:
-                            stream_info_map[queue_name] = []
-                        stream_info_map[queue_name].append((key, priority))
-
-                    if cursor == 0:
-                        break
-
-                # Add the discovered streams to the registry
-                if stream_info_map:
-                    pipeline = self.redis_client.pipeline()
-                    for queue_name, stream_list in stream_info_map.items():
-                        for stream_key, priority in stream_list:
-                            if priority > 0:
-                                queue_priority_key = f"{queue_name}:{priority}"
-                            else:
-                                queue_priority_key = queue_name
-                            # stream_key is already bytes (as returned by SCAN)
-                            if isinstance(stream_key, str):
-                                stream_key = stream_key.encode()
-                            pipeline.hset(self.stream_registry_key.encode(), queue_priority_key.encode(), stream_key)
-                    await pipeline.execute()
-                    logger.info(f"Registered {sum(len(stream_list) for stream_list in stream_info_map.values())} streams to registry during initialization")
-
-            if not stream_info_map:
-                logger.debug("No streams found in registry for backlog monitoring")
-                return
-
-            # Debug logging (debug level to avoid flooding the log)
-            logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
-            for queue_name, stream_list in stream_info_map.items():
-                priorities = [p for _, p in stream_list]
-                # Pick out the non-zero priorities (0 means a regular queue)
-                high_priorities = [p for p in priorities if p > 0]
-                if high_priorities:
-                    logger.debug(f" - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
-                else:
-                    logger.debug(f" - {queue_name}: regular queue only (priority=0)")
-
-            # Collect metrics for every queue (across all priorities)
-            metrics = []
-            current_time = datetime.now(timezone.utc)
-
-            for queue_name, stream_list in stream_info_map.items():
-                # Handle each priority stream separately
-                for stream_key, priority in stream_list:
-                    try:
-                        # Get the latest published offset for this queue (accounting for priority queues)
-                        if priority > 0:
-                            # Priority queue key format: queue_name:priority
-                            queue_key = f"{queue_name}:{priority}"
-                        else:
-                            queue_key = queue_name
-                        last_published_offset = queue_offsets.get(queue_key, 0)
-
-                        # Get stream info
-                        stream_info = await self.redis_client.xinfo_stream(stream_key)
-                        stream_length = stream_info.get(b'length', 0)
-
-                        # Get consumer group info
-                        has_consumer_groups = False
-                        try:
-                            groups = await self.redis_client.xinfo_groups(stream_key)
-
-                            for group in groups:
-                                # Handle group_name
-                                raw_name = group.get('name', b'')
-                                if isinstance(raw_name, bytes):
-                                    group_name = raw_name.decode() if raw_name else ''
-                                else:
-                                    group_name = str(raw_name) if raw_name else ''
-
-                                if not group_name:
-                                    group_name = 'unknown'
-
-                                # Filter out internal consumer groups
-                                if is_internal_consumer(group_name):
-                                    # logger.info(f"Skipping internal consumer group: {group_name}")
-                                    continue
-
-                                # Handle pending - it is already an int
-                                pending_count = group.get('pending', 0)
-
-                                # Get this group's consumed offset from TASK_OFFSETS
-                                # Key format: f"{queue_name}:{group_name}" (priority not included)
-                                task_offset_key = f"{queue_name}:{group_name}"
-                                last_acked_offset = task_offsets.get(task_offset_key, 0)
-
-                                # Compute the backlog metrics
-                                # 1. Total backlog = latest queue offset - offset acked by the consumer group
-                                total_backlog = max(0, last_published_offset - last_acked_offset)
-
-                                # 2. Undelivered backlog = total backlog - pending count
-                                backlog_undelivered = max(0, total_backlog - pending_count)
-
-                                # 3. Delivered but unacked = pending count
-                                backlog_delivered_unacked = pending_count
-
-                                # 4. Delivered offset = acked offset + pending count
-                                last_delivered_offset = last_acked_offset + pending_count
-
-                                # Create one record per consumer group
-                                metrics.append({
-                                    'namespace': self.namespace_name,
-                                    'stream_name': queue_name,
-                                    'priority': priority,  # Priority field
-                                    'consumer_group': group_name,
-                                    'last_published_offset': last_published_offset,
-                                    'last_delivered_offset': last_delivered_offset,
-                                    'last_acked_offset': last_acked_offset,
-                                    'pending_count': pending_count,
-                                    'backlog_undelivered': backlog_undelivered,
-                                    'backlog_unprocessed': total_backlog,
-                                    'created_at': current_time
-                                })
-                                has_consumer_groups = True
-
-                        except Exception as e:
-                            # This stream has no consumer groups
-                            logger.debug(f"No consumer groups for stream {stream_key.decode()}: {e}")
-
-                        # If there are no consumer groups, save stream-level metrics
-                        if not has_consumer_groups and last_published_offset > 0:
-                            metrics.append({
-                                'namespace': self.namespace_name,
-                                'stream_name': queue_name,
-                                'priority': priority,  # Priority field
-                                'consumer_group': None,
-                                'last_published_offset': last_published_offset,
-                                'last_delivered_offset': 0,
-                                'last_acked_offset': 0,
-                                'pending_count': 0,
-                                'backlog_undelivered': last_published_offset,
-                                'backlog_unprocessed': last_published_offset,
-                                'created_at': current_time
-                            })
-
-                    except Exception as e:
-                        logger.error(f"Error collecting metrics for stream {stream_key.decode()}: {e}")
-                        continue
-
-            # Save metrics to the database
-            if metrics:
-                await self._save_backlog_metrics(metrics)
-                # logger.info(f"Collected backlog metrics for {len(metrics)} stream/group combinations {time.time() }")
-
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            logger.error(f"Error collecting stream backlog metrics: {e}")
-
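The backlog figures computed above reduce to plain offset arithmetic over three inputs: the queue's last published offset, the consumer group's last acknowledged offset, and the group's pending count. A small sketch of that calculation with made-up numbers (field names mirror the metric dict above):

    def compute_backlog(last_published: int, last_acked: int, pending: int) -> dict:
        total = max(0, last_published - last_acked)  # published but not yet acknowledged
        return {
            'backlog_unprocessed': total,
            'backlog_undelivered': max(0, total - pending),  # not yet delivered to a consumer
            'backlog_delivered_unacked': pending,            # delivered, awaiting ACK
            'last_delivered_offset': last_acked + pending,
        }

    print(compute_backlog(last_published=120, last_acked=100, pending=5))
    # -> {'backlog_unprocessed': 20, 'backlog_undelivered': 15,
    #     'backlog_delivered_unacked': 5, 'last_delivered_offset': 105}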
-    async def _save_backlog_metrics(self, metrics: List[Dict]):
-        """Save backlog metrics to the database (only rows that changed)."""
-        if not metrics:
-            return
-
-        # logger.info(f"Processing {len(metrics)} metrics for deduplication")
-
-        try:
-            async with self.AsyncSessionLocal() as session:
-                # New records to be saved
-                metrics_to_save = []
-
-                # Use batched queries for performance
-                metric_keys = {}  # For quick lookup
-
-                for metric in metrics:
-                    # Build the unique key: namespace + stream_name + consumer_group + priority
-                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
-                    metric_keys[unique_key] = metric
-
-                # logger.info(f"Checking {len(metric_keys)} unique metric combinations")
-
-                # Batch-query the latest records - query in chunks to keep the SQL short
-                last_records = {}
-                metric_list = list(metric_keys.values())
-                batch_size = 50  # 50 per batch
-
-                for i in range(0, len(metric_list), batch_size):
-                    batch = metric_list[i:i + batch_size]
-
-                    # Build a parameterized query
-                    conditions = []
-                    params = {}
-                    for idx, metric in enumerate(batch):
-                        param_prefix = f"p{i + idx}"
-                        conditions.append(f"""
-                            (namespace = :{param_prefix}_ns
-                            AND stream_name = :{param_prefix}_sn
-                            AND consumer_group = :{param_prefix}_cg
-                            AND priority = :{param_prefix}_pr)
-                        """)
-                        params[f"{param_prefix}_ns"] = metric['namespace']
-                        params[f"{param_prefix}_sn"] = metric['stream_name']
-                        params[f"{param_prefix}_cg"] = metric['consumer_group']
-                        params[f"{param_prefix}_pr"] = metric['priority']
-
-                    if conditions:
-                        # Use a window function to fetch the latest record per combination
-                        query_sql = text(f"""
-                            WITH latest_records AS (
-                                SELECT
-                                    namespace,
-                                    stream_name,
-                                    consumer_group,
-                                    priority,
-                                    last_published_offset,
-                                    last_delivered_offset,
-                                    last_acked_offset,
-                                    pending_count,
-                                    backlog_undelivered,
-                                    backlog_unprocessed,
-                                    ROW_NUMBER() OVER (
-                                        PARTITION BY namespace, stream_name, consumer_group, priority
-                                        ORDER BY created_at DESC
-                                    ) as rn
-                                FROM stream_backlog_monitor
-                                WHERE ({' OR '.join(conditions)})
-                            )
-                            SELECT
-                                namespace,
-                                stream_name,
-                                consumer_group,
-                                priority,
-                                last_published_offset,
-                                last_delivered_offset,
-                                last_acked_offset,
-                                pending_count,
-                                backlog_undelivered,
-                                backlog_unprocessed
-                            FROM latest_records
-                            WHERE rn = 1
-                        """)
-
-                        result = await session.execute(query_sql, params)
-                        for row in result:
-                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
-                            last_records[key] = row
-                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")
-
-                # Deduplication check for each metric
-                for unique_key, metric in metric_keys.items():
-                    should_save = False
-
-                    if unique_key not in last_records:
-                        # No previous record, save it
-                        should_save = True
-                        # logger.info(f"New metric for {unique_key}, will save")
-                    else:
-                        # Compare whether the key fields changed
-                        last_record = last_records[unique_key]
-
-                        # Verbose debug logging
-                        changes = []
-                        logger.debug(f"Comparing for {unique_key}:")
-                        logger.debug(f" DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
-                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
-                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
-                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
-                        logger.debug(f" New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
-                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
-                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
-                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")
-
-                        # Compare with consistent types (cast everything to int)
-                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
-                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0
-
-                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
-                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0
-
-                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
-                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0
-
-                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
-                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0
-
-                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
-                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0
-
-                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
-                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0
-
-                        if db_published != new_published:
-                            changes.append(f"published: {db_published} -> {new_published}")
-                        if db_delivered != new_delivered:
-                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
-                        if db_acked != new_acked:
-                            changes.append(f"acked: {db_acked} -> {new_acked}")
-                        if db_pending != new_pending:
-                            changes.append(f"pending: {db_pending} -> {new_pending}")
-                        if db_undelivered != new_undelivered:
-                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
-                        if db_unprocessed != new_unprocessed:
-                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")
-
-                        if changes:
-                            should_save = True
-                            # logger.info(f"Metric changed for {unique_key}: {', '.join(changes)}")
-                        else:
-                            logger.debug(f"Metric unchanged for {unique_key}, skipping")
-
-                    if should_save:
-                        metrics_to_save.append(metric)
-
-                # Bulk-insert the changed monitoring rows
-                if metrics_to_save:
-                    insert_sql = text("""
-                        INSERT INTO stream_backlog_monitor
-                        (namespace, stream_name, priority, consumer_group, last_published_offset,
-                         last_delivered_offset, last_acked_offset, pending_count,
-                         backlog_undelivered, backlog_unprocessed, created_at)
-                        VALUES
-                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
-                         :last_delivered_offset, :last_acked_offset, :pending_count,
-                         :backlog_undelivered, :backlog_unprocessed, :created_at)
-                    """)
-
-                    # Insert row by row
-                    for metric_data in metrics_to_save:
-                        await session.execute(insert_sql, metric_data)
-
-                    await session.commit()
-                    # logger.info(f"Saved {len(metrics_to_save)} changed metrics out of {len(metrics)} total")
-                else:
-                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")
-
-        except Exception as e:
-            logger.error(f"Error saving backlog metrics to database: {e}")
-
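The dedup step above inserts a row only when one of the tracked offset or backlog fields differs from the most recent stored row for the same (namespace, stream, group, priority) combination. A rough, dict-based sketch of that comparison; the helper name and field tuple are illustrative stand-ins for the SQLAlchemy rows used above:

    from typing import Optional

    TRACKED_FIELDS = (
        'last_published_offset', 'last_delivered_offset', 'last_acked_offset',
        'pending_count', 'backlog_undelivered', 'backlog_unprocessed',
    )

    def has_changed(previous: Optional[dict], current: dict) -> bool:
        if previous is None:
            return True  # no history yet: always persist the first sample
        # cast to int so numeric columns compare cleanly against plain ints
        return any(int(previous.get(f) or 0) != int(current.get(f) or 0) for f in TRACKED_FIELDS)

    # Example: has_changed({'pending_count': 5}, {'pending_count': 7}) -> True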
-    def _parse_stream_message(self, task_id: str, data: dict) -> Optional[dict]:
-        """Parse a stream message into task info (returns the full set of fields)."""
-        try:
-            from jettask.utils.serializer import loads_str
-            if b'data' in data:
-                task_data = loads_str(data[b'data'])
-            else:
-                task_data = {}
-                for k, v in data.items():
-                    key = k.decode('utf-8') if isinstance(k, bytes) else k
-                    if isinstance(v, bytes):
-                        try:
-                            value = loads_str(v)
-                        except:
-                            value = str(v)
-                    else:
-                        value = v
-                    task_data[key] = value
-            # If a namespace is configured, check whether the message belongs to it
-            # if self.namespace_id:
-            #     msg_namespace_id = task_data.get('__namespace_id')
-            #     # Skip if the message has no namespace_id and this is not the default namespace
-            #     if msg_namespace_id != self.namespace_id:
-            #         if not (msg_namespace_id is None and self.namespace_id == 'default'):
-            #             logger.debug(f"Skipping message from different namespace: {msg_namespace_id} != {self.namespace_id}")
-            #             return None
-            queue_name = task_data['queue']
-            task_name = task_data.get('name', task_data.get('task', 'unknown'))
-            created_at = None
-            if 'trigger_time' in task_data:
-                try:
-                    timestamp = float(task_data['trigger_time'])
-                    created_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)
-                except:
-                    pass
-            # Return the full set of fields, including those that may be None
-            return {
-                'id': task_id,
-                'queue_name': queue_name,
-                'task_name': task_name,
-                'task_data': json.dumps(task_data),
-                'priority': int(task_data.get('priority', 0)),
-                'retry_count': int(task_data.get('retry', 0)),
-                'max_retry': int(task_data.get('max_retry', 3)),
-                'status': 'pending',
-                'result': None,  # A new task has no result
-                'error_message': None,  # A new task has no error message
-                'created_at': created_at,
-                'started_at': None,  # Not started yet
-                'completed_at': None,  # Not completed yet
-                'scheduled_task_id': task_data.get('scheduled_task_id'),  # Scheduled task ID
-                'metadata': json.dumps(task_data.get('metadata', {})),
-                'worker_id': None,  # No worker assigned yet
-                'execution_time': None,  # No execution time yet
-                'duration': None,  # No duration yet
-                'namespace_id': self.namespace_id  # Namespace ID
-            }
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            logger.error(f"Error parsing stream message for task {task_id}: {e}")
-            return None
-
-
-async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig,
-                          consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
-    """Run the PostgreSQL consumer."""
-    # Read monitor configuration from environment variables
-    enable_backlog_monitor = os.getenv('JETTASK_ENABLE_BACKLOG_MONITOR', 'true').lower() == 'true'
-    backlog_monitor_interval = int(os.getenv('JETTASK_BACKLOG_MONITOR_INTERVAL', '60'))
-
-    logger.info(f"Backlog monitor config: enabled={enable_backlog_monitor}, interval={backlog_monitor_interval}s")
-
-    consumer = PostgreSQLConsumer(
-        pg_config,
-        redis_config,
-        consumer_strategy=consumer_strategy,
-        enable_backlog_monitor=enable_backlog_monitor,
-        backlog_monitor_interval=backlog_monitor_interval
-    )
-
-    try:
-        await consumer.start()
-        while True:
-            await asyncio.sleep(1)
-
-    except KeyboardInterrupt:
-        logger.debug("Received interrupt signal")
-    finally:
-        await consumer.stop()
-
-
-def main():
-    """Main entry point."""
-    from dotenv import load_dotenv
-
-    load_dotenv()
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
-
-    pg_config = PostgreSQLConfig(
-        host=os.getenv('JETTASK_PG_HOST', 'localhost'),
-        port=int(os.getenv('JETTASK_PG_PORT', '5432')),
-        database=os.getenv('JETTASK_PG_DB', 'jettask'),
-        user=os.getenv('JETTASK_PG_USER', 'jettask'),
-        password=os.getenv('JETTASK_PG_PASSWORD', '123456'),
-    )
-
-    redis_config = RedisConfig(
-        host=os.getenv('REDIS_HOST', 'localhost'),
-        port=int(os.getenv('REDIS_PORT', '6379')),
-        db=int(os.getenv('REDIS_DB', '0')),
-        password=os.getenv('REDIS_PASSWORD'),
-    )
-
-    # Read the consumer strategy from environment variables; default is HEARTBEAT
-    strategy_name = os.getenv('JETTASK_CONSUMER_STRATEGY', 'HEARTBEAT').upper()
-    consumer_strategy = ConsumerStrategy.HEARTBEAT  # Default
-
-    if strategy_name == 'FIXED':
-        consumer_strategy = ConsumerStrategy.FIXED
-    elif strategy_name == 'POD':
-        consumer_strategy = ConsumerStrategy.POD
-    elif strategy_name == 'HEARTBEAT':
-        consumer_strategy = ConsumerStrategy.HEARTBEAT
-    else:
-        logger.debug(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")
-
-    logger.debug(f"Using consumer strategy: {consumer_strategy.value}")
-
-    asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy))
-
-
-if __name__ == '__main__':
-    main()
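For reference, the removed entry point above is configured entirely through environment variables (JETTASK_PG_*, REDIS_*, JETTASK_CONSUMER_STRATEGY, JETTASK_ENABLE_BACKLOG_MONITOR, JETTASK_BACKLOG_MONITOR_INTERVAL). A minimal sketch of exercising it from Python; the values below are placeholders for illustration, not recommended settings:

    import os

    # Placeholders only - point these at your own services before running.
    os.environ.setdefault('JETTASK_PG_HOST', 'localhost')
    os.environ.setdefault('JETTASK_PG_DB', 'jettask')
    os.environ.setdefault('REDIS_HOST', 'localhost')
    os.environ.setdefault('JETTASK_CONSUMER_STRATEGY', 'HEARTBEAT')
    os.environ.setdefault('JETTASK_ENABLE_BACKLOG_MONITOR', 'true')
    os.environ.setdefault('JETTASK_BACKLOG_MONITOR_INTERVAL', '60')

    main()  # runs asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy)) as defined above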