jettask 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to a supported public registry. It is provided for informational purposes only.
- jettask/constants.py +213 -0
- jettask/core/app.py +525 -205
- jettask/core/cli.py +193 -185
- jettask/core/consumer_manager.py +126 -34
- jettask/core/context.py +3 -0
- jettask/core/enums.py +137 -0
- jettask/core/event_pool.py +501 -168
- jettask/core/message.py +147 -0
- jettask/core/offline_worker_recovery.py +181 -114
- jettask/core/task.py +10 -174
- jettask/core/task_batch.py +153 -0
- jettask/core/unified_manager_base.py +243 -0
- jettask/core/worker_scanner.py +54 -54
- jettask/executors/asyncio.py +184 -64
- jettask/webui/backend/config.py +51 -0
- jettask/webui/backend/data_access.py +2083 -92
- jettask/webui/backend/data_api.py +3294 -0
- jettask/webui/backend/dependencies.py +261 -0
- jettask/webui/backend/init_meta_db.py +158 -0
- jettask/webui/backend/main.py +1358 -69
- jettask/webui/backend/main_unified.py +78 -0
- jettask/webui/backend/main_v2.py +394 -0
- jettask/webui/backend/namespace_api.py +295 -0
- jettask/webui/backend/namespace_api_old.py +294 -0
- jettask/webui/backend/namespace_data_access.py +611 -0
- jettask/webui/backend/queue_backlog_api.py +727 -0
- jettask/webui/backend/queue_stats_v2.py +521 -0
- jettask/webui/backend/redis_monitor_api.py +476 -0
- jettask/webui/backend/unified_api_router.py +1601 -0
- jettask/webui/db_init.py +204 -32
- jettask/webui/frontend/package-lock.json +492 -1
- jettask/webui/frontend/package.json +4 -1
- jettask/webui/frontend/src/App.css +105 -7
- jettask/webui/frontend/src/App.jsx +49 -20
- jettask/webui/frontend/src/components/NamespaceSelector.jsx +166 -0
- jettask/webui/frontend/src/components/QueueBacklogChart.jsx +298 -0
- jettask/webui/frontend/src/components/QueueBacklogTrend.jsx +638 -0
- jettask/webui/frontend/src/components/QueueDetailsTable.css +65 -0
- jettask/webui/frontend/src/components/QueueDetailsTable.jsx +487 -0
- jettask/webui/frontend/src/components/QueueDetailsTableV2.jsx +465 -0
- jettask/webui/frontend/src/components/ScheduledTaskFilter.jsx +423 -0
- jettask/webui/frontend/src/components/TaskFilter.jsx +425 -0
- jettask/webui/frontend/src/components/TimeRangeSelector.css +21 -0
- jettask/webui/frontend/src/components/TimeRangeSelector.jsx +160 -0
- jettask/webui/frontend/src/components/layout/AppLayout.css +95 -0
- jettask/webui/frontend/src/components/layout/AppLayout.jsx +49 -0
- jettask/webui/frontend/src/components/layout/Header.css +34 -10
- jettask/webui/frontend/src/components/layout/Header.jsx +31 -23
- jettask/webui/frontend/src/components/layout/SideMenu.css +137 -0
- jettask/webui/frontend/src/components/layout/SideMenu.jsx +209 -0
- jettask/webui/frontend/src/components/layout/TabsNav.css +244 -0
- jettask/webui/frontend/src/components/layout/TabsNav.jsx +206 -0
- jettask/webui/frontend/src/components/layout/UserInfo.css +197 -0
- jettask/webui/frontend/src/components/layout/UserInfo.jsx +197 -0
- jettask/webui/frontend/src/contexts/NamespaceContext.jsx +72 -0
- jettask/webui/frontend/src/contexts/TabsContext.backup.jsx +245 -0
- jettask/webui/frontend/src/main.jsx +1 -0
- jettask/webui/frontend/src/pages/Alerts.jsx +684 -0
- jettask/webui/frontend/src/pages/Dashboard.jsx +1330 -0
- jettask/webui/frontend/src/pages/QueueDetail.jsx +1109 -10
- jettask/webui/frontend/src/pages/QueueMonitor.jsx +236 -115
- jettask/webui/frontend/src/pages/Queues.jsx +5 -1
- jettask/webui/frontend/src/pages/ScheduledTasks.jsx +809 -0
- jettask/webui/frontend/src/pages/Settings.jsx +800 -0
- jettask/webui/frontend/src/services/api.js +7 -5
- jettask/webui/frontend/src/utils/suppressWarnings.js +22 -0
- jettask/webui/frontend/src/utils/userPreferences.js +154 -0
- jettask/webui/multi_namespace_consumer.py +543 -0
- jettask/webui/pg_consumer.py +983 -246
- jettask/webui/static/dist/assets/index-7129cfe1.css +1 -0
- jettask/webui/static/dist/assets/index-8d1935cc.js +774 -0
- jettask/webui/static/dist/index.html +2 -2
- jettask/webui/task_center.py +216 -0
- jettask/webui/task_center_client.py +150 -0
- jettask/webui/unified_consumer_manager.py +193 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/METADATA +1 -1
- jettask-0.2.4.dist-info/RECORD +134 -0
- jettask/webui/pg_consumer_slow.py +0 -1099
- jettask/webui/pg_consumer_test.py +0 -678
- jettask/webui/static/dist/assets/index-823408e8.css +0 -1
- jettask/webui/static/dist/assets/index-9968b0b8.js +0 -543
- jettask/webui/test_pg_consumer_recovery.py +0 -547
- jettask/webui/test_recovery_simple.py +0 -492
- jettask/webui/test_self_recovery.py +0 -467
- jettask-0.2.1.dist-info/RECORD +0 -91
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/WHEEL +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/top_level.txt +0 -0
--- jettask/webui/pg_consumer_slow.py
+++ /dev/null
@@ -1,1099 +0,0 @@
-#!/usr/bin/env python
-"""Slow version of the PostgreSQL consumer - used to test the recovery mechanism"""
-
-import asyncio
-import json
-import logging
-import os
-import time
-from typing import Dict, List, Optional, Any
-from datetime import datetime, timezone
-from collections import defaultdict
-
-import redis.asyncio as redis
-from redis.asyncio import Redis
-from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import text
-
-from jettask.webui.config import PostgreSQLConfig, RedisConfig
-from jettask.core.consumer_manager import ConsumerManager, ConsumerStrategy
-from jettask.core.offline_worker_recovery import OfflineWorkerRecovery
-
-logger = logging.getLogger(__name__)
-
-
-class PostgreSQLConsumer:
-    """PostgreSQL consumer that consumes tasks from Redis queues and persists them to PostgreSQL"""
-
-    def __init__(self, pg_config: PostgreSQLConfig, redis_config: RedisConfig, prefix: str = "jettask",
-                 node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
-        self.pg_config = pg_config
-        self.redis_config = redis_config
-        self.prefix = prefix
-        self.redis_client: Optional[Redis] = None
-        self.async_engine = None
-        self.AsyncSessionLocal = None
-        self.consumer_group = f"{prefix}_pg_consumer1"
-
-        # Node identifier
-        import socket
-        hostname = socket.gethostname()
-        self.node_id = node_id or f"{hostname}_{os.getpid()}"
-
-        # Use ConsumerManager to manage the consumer_id
-        self.consumer_strategy = consumer_strategy
-        self.consumer_manager = None  # Initialized in start()
-        self.consumer_id = None  # Obtained from ConsumerManager
-
-        self._running = False
-        self._tasks = []
-        self._known_queues = set()
-        self._consecutive_errors = defaultdict(int)
-
-        # In-memory set of processed task IDs (used to optimize queries)
-        self._processed_task_ids = set()
-        self._processed_ids_lock = asyncio.Lock()  # Protects concurrent access
-        # Periodically clean up stale IDs (prevents unbounded memory growth)
-        self._processed_ids_max_size = 100000  # Keep at most 100k IDs
-        self._processed_ids_cleanup_interval = 300  # Clean up every 5 minutes
-
-        # Pending task updates awaiting retry (task ID -> update info)
-        self._pending_updates = {}
-        self._pending_updates_lock = asyncio.Lock()
-        self._max_pending_updates = 10000  # Keep at most 10k pending updates
-        self._retry_interval = 5  # Retry every 5 seconds
-
-        # Dynamic batch size
-        self.batch_size = 2000
-        self.min_batch_size = 500
-        self.max_batch_size = 5000
-
-    async def start(self):
-        """Start the consumer."""
-        logger.info(f"Starting PostgreSQL consumer (simplified) on node: {self.node_id}")
-
-        # Connect to Redis
-        self.redis_client = await redis.Redis(
-            host=self.redis_config.host,
-            port=self.redis_config.port,
-            db=self.redis_config.db,
-            password=self.redis_config.password,
-            decode_responses=False
-        )
-
-        # Initialize ConsumerManager (requires a synchronous Redis client)
-        import redis as sync_redis
-        sync_redis_client = sync_redis.StrictRedis(
-            host=self.redis_config.host,
-            port=self.redis_config.port,
-            db=self.redis_config.db,
-            password=self.redis_config.password,
-            decode_responses=True  # String mode, consistent with other components
-        )
-
-        # Configure ConsumerManager
-        # The initial queue list contains TASK_CHANGES; other queues are added dynamically
-        initial_queues = ['TASK_CHANGES']  # TASK_CHANGES is fixed
-        consumer_config = {
-            'redis_prefix': self.prefix,
-            'queues': initial_queues,
-            'worker_prefix': 'PG_CONSUMER',  # Use a distinct prefix to separate from task workers
-        }
-
-        self.consumer_manager = ConsumerManager(
-            redis_client=sync_redis_client,
-            strategy=self.consumer_strategy,
-            config=consumer_config
-        )
-
-        # Get a stable consumer_id (using TASK_CHANGES as the reference queue)
-        self.consumer_id = self.consumer_manager.get_consumer_name('TASK_CHANGES')
-        logger.info(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
-
-        # Create the SQLAlchemy async engine
-        if self.pg_config.dsn.startswith('postgresql://'):
-            dsn = self.pg_config.dsn.replace('postgresql://', 'postgresql+psycopg://', 1)
-        else:
-            dsn = self.pg_config.dsn
-
-        self.async_engine = create_async_engine(
-            dsn,
-            pool_size=50,
-            max_overflow=20,
-            pool_pre_ping=True,
-            pool_recycle=300,
-            echo=False
-        )
-
-        # Pre-warm the connection pool
-        logger.info("Pre-warming database connection pool...")
-        async with self.async_engine.begin() as conn:
-            await conn.execute(text("SELECT 1"))
-
-        # Create the async session factory
-        self.AsyncSessionLocal = sessionmaker(
-            self.async_engine,
-            class_=AsyncSession,
-            expire_on_commit=False
-        )
-
-        # Initialize the database schema
-        await self._init_database()
-
-        self._running = True
-
-        # Run queue discovery once so ConsumerManager has the correct queue list
-        await self._initial_queue_discovery()
-
-        # Create the offline worker recovery helper (recovers offline messages from the TASK_CHANGES stream)
-        self.offline_recovery = OfflineWorkerRecovery(
-            async_redis_client=self.redis_client,
-            redis_prefix=self.prefix,
-            worker_prefix='PG_CONSUMER',  # Use the PG_CONSUMER prefix
-            consumer_manager=self.consumer_manager
-        )
-
-        # Start consumer tasks (simplified: only the essential ones)
-        self._tasks = [
-            asyncio.create_task(self._consume_queues()),  # Consume new tasks
-            asyncio.create_task(self._consume_task_changes()),  # Consume task change events
-            asyncio.create_task(self._database_maintenance()),  # Database maintenance
-            asyncio.create_task(self._retry_pending_updates()),  # Retry pending task updates
-            asyncio.create_task(self._start_offline_recovery())  # Offline worker recovery service
-        ]
-
-        # With the HEARTBEAT strategy, ConsumerManager manages heartbeats automatically
-        if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and self.consumer_manager:
-            # Heartbeat is started and handled internally by ConsumerManager
-            logger.info("Heartbeat is managed by ConsumerManager")
-
-        logger.info("PostgreSQL consumer started successfully")
-
-    async def stop(self):
-        """Stop the consumer."""
-        logger.info("Stopping PostgreSQL consumer...")
-        self._running = False
-
-        # Stop the offline recovery service
-        if hasattr(self, 'offline_recovery'):
-            self.offline_recovery.stop()  # stop() is not an async method
-
-        # Cancel all tasks
-        for task in self._tasks:
-            task.cancel()
-
-        # Wait for tasks to finish
-        await asyncio.gather(*self._tasks, return_exceptions=True)
-
-        # Clean up ConsumerManager
-        if self.consumer_manager:
-            try:
-                self.consumer_manager.cleanup()
-                logger.info(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
-            except Exception as e:
-                logger.error(f"Error cleaning up ConsumerManager: {e}")
-
-        # Close connections
-        if self.redis_client:
-            await self.redis_client.close()
-
-        if self.async_engine:
-            await self.async_engine.dispose()
-
-        logger.info("PostgreSQL consumer stopped")
-
-    async def _init_database(self):
-        """Initialize the database schema."""
-        # Use a path relative to the current file
-        import os
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        schema_path = os.path.join(current_dir, "schema.sql")
-        try:
-            with open(schema_path, 'r') as f:
-                schema_sql = f.read()
-
-            async with self.AsyncSessionLocal() as session:
-                await session.execute(text(schema_sql))
-                await session.commit()
-                logger.info("Database schema initialized")
-        except FileNotFoundError:
-            logger.warning(f"Schema file not found at {schema_path}, skipping initialization")
-        except Exception as e:
-            logger.error(f"Failed to initialize database schema: {e}")
-
-    async def _initial_queue_discovery(self):
-        """Initial queue discovery, executed once at startup."""
-        try:
-            pattern = f"{self.prefix}:QUEUE:*"
-            new_queues = set()
-
-            async for key in self.redis_client.scan_iter(match=pattern, count=100):
-                queue_name = key.decode('utf-8').split(":")[-1]
-                new_queues.add(queue_name)
-
-            if new_queues:
-                # Merge all queues: TASK_CHANGES + dynamically discovered queues
-                all_queues = list(new_queues) + ['TASK_CHANGES']
-
-                # Update ConsumerManager's configuration
-                if self.consumer_manager:
-                    self.consumer_manager.config['queues'] = all_queues
-
-                    # Update the worker's queue information
-                    # Get the actual consumer_id (from the heartbeat strategy)
-                    if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
-                        actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
-                    else:
-                        # Extract from consumer_name (format: consumer_id-queue)
-                        actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
-
-                    worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
-                    try:
-                        # Update via the synchronous Redis client
-                        self.consumer_manager.redis_client.hset(
-                            worker_key,
-                            'queues',
-                            ','.join(all_queues)
-                        )
-                        logger.info(f"Initial queue discovery - found queues: {all_queues}")
-                    except Exception as e:
-                        logger.error(f"Error updating initial worker queues: {e}")
-
-                self._known_queues = new_queues
-
-        except Exception as e:
-            logger.error(f"Error in initial queue discovery: {e}")
-
-    async def _discover_queues(self):
-        """Periodically discover new queues."""
-        while self._running:
-            try:
-                pattern = f"{self.prefix}:QUEUE:*"
-                new_queues = set()
-
-                async for key in self.redis_client.scan_iter(match=pattern, count=100):
-                    queue_name = key.decode('utf-8').split(":")[-1]
-                    new_queues.add(queue_name)
-
-                # Create consumer groups for newly discovered queues
-                for queue in new_queues - self._known_queues:
-                    stream_key = f"{self.prefix}:QUEUE:{queue}"
-                    try:
-                        await self.redis_client.xgroup_create(
-                            stream_key, self.consumer_group, id='0', mkstream=True
-                        )
-                        logger.info(f"Created consumer group for new queue: {queue}")
-                    except redis.ResponseError:
-                        pass
-
-                # Update ConsumerManager's queue list (synchronous operation)
-                if new_queues != self._known_queues:
-                    # Merge all queues: TASK_CHANGES + dynamically discovered queues
-                    all_queues = list(new_queues) + ['TASK_CHANGES']
-
-                    # Update ConsumerManager's configuration
-                    if self.consumer_manager:
-                        self.consumer_manager.config['queues'] = all_queues
-
-                        # Update the worker's queue information
-                        # Get the actual consumer_id (from the heartbeat strategy)
-                        if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
-                            actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
-                        else:
-                            # Extract from consumer_name (format: consumer_id-queue)
-                            actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
-
-                        worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
-                        try:
-                            # Update via the synchronous Redis client
-                            self.consumer_manager.redis_client.hset(
-                                worker_key,
-                                'queues',
-                                ','.join(all_queues)
-                            )
-                            logger.info(f"Updated ConsumerManager queues: {all_queues}")
-                        except Exception as e:
-                            logger.error(f"Error updating worker queues: {e}")
-
-                self._known_queues = new_queues
-                await asyncio.sleep(30)
-
-            except Exception as e:
-                import traceback
-                traceback.print_exc()
-                logger.error(f"Error discovering queues: {e}")
-                await asyncio.sleep(10)
-
-    async def _consume_queue(self, queue_name: str):
-        """Consume tasks from a single queue."""
-        stream_key = f"{self.prefix}:QUEUE:{queue_name}"
-        check_backlog = True
-        lastid = "0-0"
-
-        # pg_consumer should use a unified consumer_id instead of creating one per queue,
-        # because its job is to consume messages from all queues and write them to the database;
-        # it is not a real task executor, so no per-queue consumer is needed
-        consumer_name = self.consumer_id
-
-        # ConsumerManager automatically recovers pending messages of offline workers;
-        # no manual recovery is needed
-
-        while self._running and queue_name in self._known_queues:
-            try:
-                myid = lastid if check_backlog else ">"
-
-                messages = await self.redis_client.xreadgroup(
-                    self.consumer_group,
-                    consumer_name,  # Use the consumer_name managed by ConsumerManager
-                    {stream_key: myid},
-                    count=10000,
-                    block=1000 if not check_backlog else 0
-                )
-
-                if not messages or (messages and len(messages[0][1]) == 0):
-                    check_backlog = False
-                    continue
-
-                if messages:
-                    await self._process_messages(messages)
-                    self._consecutive_errors[queue_name] = 0
-
-                    if messages[0] and messages[0][1]:
-                        lastid = messages[0][1][-1][0].decode('utf-8') if isinstance(messages[0][1][-1][0], bytes) else messages[0][1][-1][0]
-                        check_backlog = len(messages[0][1]) >= 2000
-
-            except redis.ResponseError as e:
-                if "NOGROUP" in str(e):
-                    try:
-                        await self.redis_client.xgroup_create(
-                            stream_key, self.consumer_group, id='0', mkstream=True
-                        )
-                        logger.info(f"Recreated consumer group for queue: {queue_name}")
-                        check_backlog = True
-                        lastid = "0-0"
-                    except:
-                        pass
-                else:
-                    logger.error(f"Redis error for queue {queue_name}: {e}")
-                    self._consecutive_errors[queue_name] += 1
-
-                if self._consecutive_errors[queue_name] > 10:
-                    logger.warning(f"Too many errors for queue {queue_name}, will retry later")
-                    await asyncio.sleep(30)
-                    self._consecutive_errors[queue_name] = 0
-
-            except Exception as e:
-                logger.error(f"Error consuming queue {queue_name}: {e}", exc_info=True)
-                self._consecutive_errors[queue_name] += 1
-                await asyncio.sleep(1)
-
-    async def _consume_queues(self):
-        """Start consumer tasks for all queues."""
-        discover_task = asyncio.create_task(self._discover_queues())
-        queue_tasks = {}
-
-        while self._running:
-            try:
-                for queue in self._known_queues:
-                    if queue not in queue_tasks or queue_tasks[queue].done():
-                        queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
-                        logger.info(f"Started consumer task for queue: {queue}")
-
-                for queue in list(queue_tasks.keys()):
-                    if queue not in self._known_queues:
-                        queue_tasks[queue].cancel()
-                        del queue_tasks[queue]
-                        logger.info(f"Stopped consumer task for removed queue: {queue}")
-
-                await asyncio.sleep(10)
-
-            except Exception as e:
-                logger.error(f"Error in consume_queues manager: {e}")
-                await asyncio.sleep(5)
-
-        discover_task.cancel()
-        for task in queue_tasks.values():
-            task.cancel()
-
-        await asyncio.gather(discover_task, *queue_tasks.values(), return_exceptions=True)
-
-    async def _process_messages(self, messages: List):
-        """Process messages and save them to PostgreSQL - slow version for testing."""
-        tasks_to_insert = []
-        ack_batch = []
-
-        # Add an artificial delay to simulate slow processing
-        logger.info(f"[SLOW] Processing {len(messages)} message batches, adding 5 second delay...")
-        await asyncio.sleep(5)  # Simulate slow processing
-
-        for stream_key, stream_messages in messages:
-            if not stream_messages:
-                continue
-
-            stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
-            queue_name = stream_key_str.split(":")[-1]
-            msg_ids_to_ack = []
-
-            for msg_id, data in stream_messages:
-                try:
-                    if not msg_id or not data:
-                        continue
-
-                    msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
-
-                    # Parse the message using the shared helper
-                    task_info = self._parse_stream_message(msg_id_str, data, queue_name)
-                    if task_info:
-                        tasks_to_insert.append(task_info)
-                        msg_ids_to_ack.append(msg_id)
-
-                except Exception as e:
-                    logger.error(f"Error processing message {msg_id}: {e}")
-
-            if msg_ids_to_ack:
-                ack_batch.append((stream_key, msg_ids_to_ack))
-
-        if tasks_to_insert:
-            await self._insert_tasks(tasks_to_insert)
-
-            # Add successfully inserted task IDs to the in-memory set
-            async with self._processed_ids_lock:
-                for task in tasks_to_insert:
-                    self._processed_task_ids.add(task['id'])
-
-                # If the set grows too large, drop the oldest half
-                if len(self._processed_task_ids) > self._processed_ids_max_size:
-                    # Keep only the most recent half of the IDs
-                    ids_list = list(self._processed_task_ids)
-                    keep_count = self._processed_ids_max_size // 2
-                    self._processed_task_ids = set(ids_list[-keep_count:])
-                    logger.debug(f"Cleaned processed IDs cache, kept {keep_count} most recent IDs")
-
-        if ack_batch:
-            pipeline = self.redis_client.pipeline()
-            for stream_key, msg_ids in ack_batch:
-                pipeline.xack(stream_key, self.consumer_group, *msg_ids)
-
-            try:
-                await pipeline.execute()
-                total_acked = sum(len(msg_ids) for _, msg_ids in ack_batch)
-                logger.debug(f"Successfully ACKed {total_acked} messages")
-            except Exception as e:
-                logger.error(f"Error executing batch ACK: {e}")
-
-    async def _insert_tasks(self, tasks: List[Dict[str, Any]]):
-        """Bulk insert tasks into PostgreSQL."""
-        if not tasks:
-            return
-
-        try:
-            async with self.AsyncSessionLocal() as session:
-                query = text("""
-                    INSERT INTO tasks (id, queue_name, task_name, task_data, priority,
-                                       retry_count, max_retry, status, metadata, created_at)
-                    VALUES (:id, :queue_name, :task_name, CAST(:task_data AS jsonb), :priority,
-                            :retry_count, :max_retry, :status, CAST(:metadata AS jsonb), :created_at)
-                    ON CONFLICT (id) DO NOTHING;
-                """)
-
-                await session.execute(query, tasks)
-                await session.commit()
-                logger.info(f"Batch inserted {len(tasks)} tasks to PostgreSQL")
-
-        except Exception as e:
-            logger.error(f"Error inserting tasks to PostgreSQL: {e}")
-
-    async def _consume_task_changes(self):
-        """Consume the task change event stream - event-driven updates (supports pending message recovery)."""
-        change_stream_key = f"{self.prefix}:TASK_CHANGES"
-        consumer_group = f"{self.prefix}_changes_consumer"
-
-        # Use the consumer name managed by ConsumerManager
-        # so ConsumerManager can correctly track and recover this stream's pending messages
-        consumer_name = self.consumer_manager.get_consumer_name('pg_consumer')
-
-        # Create the consumer group
-        try:
-            await self.redis_client.xgroup_create(
-                change_stream_key, consumer_group, id='0', mkstream=True
-            )
-            logger.info(f"Created consumer group for task changes stream")
-        except redis.ResponseError:
-            pass
-
-        # Mirror listen_event_by_task: handle pending messages first, then new ones
-        check_backlog = True
-        lastid = "0-0"
-        batch_size = 100
-
-        while self._running:
-            try:
-                # Decide the read position: start from lastid if there is a backlog, otherwise read new messages
-                if check_backlog:
-                    myid = lastid
-                else:
-                    myid = ">"
-
-                messages = await self.redis_client.xreadgroup(
-                    consumer_group,
-                    consumer_name,  # Use the consumer name managed by ConsumerManager
-                    {change_stream_key: myid},
-                    count=batch_size,
-                    block=1000 if not check_backlog else 0  # Do not block while draining the backlog
-                )
-
-                if not messages:
-                    check_backlog = False
-                    continue
-
-                # Check whether more backlog messages remain
-                if messages and len(messages[0][1]) > 0:
-                    check_backlog = len(messages[0][1]) >= batch_size
-                else:
-                    check_backlog = False
-
-                task_ids_to_update = set()
-                ack_ids = []
-
-                for _, stream_messages in messages:
-                    for msg_id, data in stream_messages:
-                        try:
-                            # Update lastid (regardless of whether the message was processed successfully)
-                            if isinstance(msg_id, bytes):
-                                lastid = msg_id.decode('utf-8')
-                            else:
-                                lastid = str(msg_id)
-
-                            event_id = data.get(b'event_id')
-                            if event_id:
-                                if isinstance(event_id, bytes):
-                                    event_id = event_id.decode('utf-8')
-                                task_ids_to_update.add(event_id)
-                            ack_ids.append(msg_id)
-                        except Exception as e:
-                            logger.error(f"Error processing change event {msg_id}: {e}")
-
-                if task_ids_to_update:
-                    await self._update_tasks_by_event(list(task_ids_to_update))
-                    logger.info(f"Updated {len(task_ids_to_update)} tasks from change events")
-
-                if ack_ids:
-                    await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
-
-            except redis.ResponseError as e:
-                if "NOGROUP" in str(e):
-                    # Recreate the consumer group if it does not exist
-                    try:
-                        await self.redis_client.xgroup_create(
-                            change_stream_key, consumer_group, id='0', mkstream=True
-                        )
-                        logger.info(f"Recreated consumer group for task changes stream")
-                        check_backlog = True
-                        lastid = "0-0"
-                    except:
-                        pass
-                else:
-                    logger.error(f"Redis error in consume_task_changes: {e}")
-                    await asyncio.sleep(1)
-            except Exception as e:
-                logger.error(f"Error in consume_task_changes: {e}", exc_info=True)
-                await asyncio.sleep(1)
-
-    async def _update_tasks_by_event(self, task_ids: List[str]):
-        """Batch-update task statuses by event ID - slow version for testing."""
-        if not task_ids:
-            return
-
-        # Add an artificial delay to simulate slow processing
-        logger.info(f"[SLOW] Updating {len(task_ids)} task events, adding 3 second delay...")
-        await asyncio.sleep(3)  # Simulate slow processing
-
-        try:
-            pipeline = self.redis_client.pipeline()
-            for task_id in task_ids:
-                task_key = f"{self.prefix}:TASK:{task_id}"
-                pipeline.hgetall(task_key)
-
-            redis_values = await pipeline.execute()
-            updates = []
-
-            for i, task_id in enumerate(task_ids):
-                hash_data = redis_values[i]
-
-                if not hash_data:
-                    continue
-
-                update_info = self._parse_task_hash(task_id, hash_data)
-                if update_info:
-                    updates.append(update_info)
-
-            if updates:
-                await self._update_tasks(updates)
-                logger.debug(f"Updated {len(updates)} tasks from change events")
-
-        except Exception as e:
-            logger.error(f"Error updating tasks by event: {e}", exc_info=True)
-
-    def _parse_task_hash(self, task_id: str, hash_data: dict) -> Optional[dict]:
-        """Parse Redis hash data."""
-        update_info = {
-            'id': task_id,
-            'status': None,
-            'result': None,
-            'error_message': None,
-            'started_at': None,
-            'completed_at': None,
-            'worker_id': None,
-            'execution_time': None,
-            'duration': None
-        }
-
-        try:
-            from jettask.utils.serializer import loads_str
-
-            hash_dict = {}
-            for k, v in hash_data.items():
-                key = k.decode('utf-8') if isinstance(k, bytes) else k
-                if isinstance(v, bytes):
-                    try:
-                        value = loads_str(v)
-                        if isinstance(value, (dict, list)):
-                            value = json.dumps(value, ensure_ascii=False)
-                        else:
-                            value = str(value)
-                    except:
-                        try:
-                            value = v.decode('utf-8')
-                        except:
-                            value = str(v)
-                else:
-                    value = v
-                hash_dict[key] = value
-
-            update_info['status'] = hash_dict.get('status')
-            update_info['error_message'] = hash_dict.get('error_msg') or hash_dict.get('exception')
-
-            # Convert timestamps
-            for time_field in ['started_at', 'completed_at']:
-                if hash_dict.get(time_field):
-                    try:
-                        time_str = hash_dict[time_field]
-                        if isinstance(time_str, str) and time_str.startswith("b'") and time_str.endswith("'"):
-                            time_str = time_str[2:-1]
-                        update_info[time_field] = datetime.fromtimestamp(float(time_str), tz=timezone.utc)
-                    except:
-                        pass
-
-            update_info['worker_id'] = hash_dict.get('consumer') or hash_dict.get('worker_id')
-
-            # Convert numeric values
-            for num_field in ['execution_time', 'duration']:
-                if hash_dict.get(num_field):
-                    try:
-                        num_str = hash_dict[num_field]
-                        if isinstance(num_str, str) and num_str.startswith("b'") and num_str.endswith("'"):
-                            num_str = num_str[2:-1]
-                        update_info[num_field] = float(num_str)
-                    except:
-                        pass
-
-            # Handle result
-            if 'result' in hash_dict:
-                result_str = hash_dict['result']
-                if result_str == 'null':
-                    update_info['result'] = None
-                else:
-                    update_info['result'] = result_str
-
-            # Only return updates that actually carry data
-            if any(v is not None for k, v in update_info.items() if k != 'id'):
-                return update_info
-
-        except Exception as e:
-            logger.error(f"Failed to parse hash data for task {task_id}: {e}")
-
-        return None
-
-    async def _update_tasks(self, updates: List[Dict[str, Any]]):
-        """Batch-update task statuses (handles race conditions)."""
-        if not updates:
-            return
-
-        try:
-            async with self.AsyncSessionLocal() as session:
-                # Batch update using executemany
-                update_query = text("""
-                    UPDATE tasks SET
-                        status = COALESCE(:status, status),
-                        result = COALESCE(CAST(:result AS jsonb), result),
-                        error_message = COALESCE(:error_message, error_message),
-                        started_at = COALESCE(:started_at, started_at),
-                        completed_at = COALESCE(:completed_at, completed_at),
-                        worker_id = COALESCE(:worker_id, worker_id),
-                        execution_time = COALESCE(:execution_time, execution_time),
-                        duration = COALESCE(:duration, duration)
-                    WHERE id = :id
-                """)
-
-                # Execute the batch update
-                result = await session.execute(update_query, updates)
-
-                # Check the number of affected rows
-                updated_count = result.rowcount
-                expected_count = len(updates)
-
-                # Only when the affected row count differs from the expected count do we look up which records are missing
-                if updated_count < expected_count:
-                    task_ids = [u['id'] for u in updates]
-
-                    # First use the in-memory set for fast filtering
-                    async with self._processed_ids_lock:
-                        # IDs present in the in-memory set definitely exist
-                        known_existing_ids = set(task_ids) & self._processed_task_ids
-
-                    # IDs not in the set may be missing and need a DB check
-                    potential_missing_ids = set(task_ids) - known_existing_ids
-
-                    if len(known_existing_ids) + updated_count >= expected_count:
-                        # If known-existing IDs plus successful updates reach the expected count, nothing is missing
-                        missing_ids = set()
-                        logger.debug(f"Memory cache hit: avoided DB query for {len(known_existing_ids)} IDs")
-                    elif potential_missing_ids:
-                        # Only query IDs not in the in-memory set to narrow the lookup
-                        logger.debug(f"Memory cache partial hit: checking {len(potential_missing_ids)} IDs in DB (skipped {len(known_existing_ids)} cached IDs)")
-                        check_query = text("""
-                            SELECT id FROM tasks WHERE id = ANY(:ids)
-                        """)
-                        check_result = await session.execute(check_query, {'ids': list(potential_missing_ids)})
-                        existing_in_db = {row[0] for row in check_result}
-
-                        # Update the in-memory set with newly confirmed IDs
-                        async with self._processed_ids_lock:
-                            self._processed_task_ids.update(existing_in_db)
-
-                        # Determine the records that truly do not exist
-                        missing_ids = potential_missing_ids - existing_in_db
-                    else:
-                        missing_ids = set()
-                else:
-                    # All records were updated successfully
-                    missing_ids = set()
-
-                if missing_ids:
-                    # Queue missing task updates for retry instead of creating them immediately
-                    async with self._pending_updates_lock:
-                        # Build a map of update info
-                        update_map = {u['id']: u for u in updates if u['id'] in missing_ids}
-
-                        for task_id in missing_ids:
-                            if task_id in update_map:
-                                # If an older update is already queued, the new one overwrites it,
-                                # ensuring only the latest update is retried
-                                if task_id in self._pending_updates:
-                                    logger.debug(f"Replacing old pending update for task {task_id} with newer one")
-
-                                # Store the update info for retry (overwrites the old one)
-                                self._pending_updates[task_id] = update_map[task_id]
-
-                        # If the retry queue grows too large, drop the oldest half
-                        if len(self._pending_updates) > self._max_pending_updates:
-                            items = list(self._pending_updates.items())
-                            keep_count = self._max_pending_updates // 2
-                            self._pending_updates = dict(items[-keep_count:])
-                            logger.warning(f"Pending updates queue full, kept {keep_count} most recent items")
-
-                    logger.info(f"Added {len(missing_ids)} task updates to retry queue")
-
-                await session.commit()
-
-                # if updated_count > 0:
-                #     logger.info(f"Updated {updated_count} task statuses {updates=}")
-
-        except Exception as e:
-            logger.error(f"Error updating task statuses: {e}")
-
-    async def _retry_pending_updates(self):
-        """Periodically retry pending task updates."""
-        while self._running:
-            try:
-                await asyncio.sleep(self._retry_interval)  # Wait before retrying
-
-                # Collect the pending updates
-                async with self._pending_updates_lock:
-                    if not self._pending_updates:
-                        continue
-
-                    # Take all pending updates
-                    pending_items = list(self._pending_updates.items())
-                    self._pending_updates.clear()
-
-                if pending_items:
-                    logger.info(f"Retrying {len(pending_items)} pending task updates")
-
-                    # Retry the updates
-                    updates = [update_info for _, update_info in pending_items]
-                    await self._update_tasks(updates)
-
-            except Exception as e:
-                logger.error(f"Error in retry pending updates: {e}")
-                await asyncio.sleep(5)
-
-    async def _start_offline_recovery(self):
-        """Start the offline worker recovery service to recover messages from offline PG_CONSUMER workers."""
-        logger.info("Starting offline worker recovery service for PG_CONSUMER")
-
-        # Wait for consumer manager initialization and queue discovery
-        # await asyncio.sleep(5)
-
-        while self._running:
-            try:
-                total_recovered = 0
-
-                # 1. Recover messages from regular queues
-                for queue in self._known_queues:
-                    # logger.info(f'{queue=}')
-                    try:
-                        recovered = await self.offline_recovery.recover_offline_workers(
-                            queue=queue,
-                            current_consumer_name=self.consumer_id,
-                            process_message_callback=self._process_recovered_queue_message
-                        )
-
-                        if recovered > 0:
-                            logger.info(f"Recovered {recovered} messages from queue {queue}")
-                            total_recovered += recovered
-
-                    except Exception as e:
-                        logger.error(f"Error recovering queue {queue}: {e}")
-
-                # 2. Recover messages from the TASK_CHANGES stream
-                recovered = await self._recover_task_changes_offline_messages()
-                if recovered > 0:
-                    logger.info(f"Recovered {recovered} TASK_CHANGES messages")
-                    total_recovered += recovered
-
-                if total_recovered > 0:
-                    logger.info(f"Total recovered {total_recovered} messages in this cycle")
-
-                # Scan every 30 seconds
-                await asyncio.sleep(1)
-
-            except Exception as e:
-                logger.error(f"Error in offline recovery service: {e}")
-                await asyncio.sleep(10)
-
-    async def _recover_task_changes_offline_messages(self) -> int:
-        """Recover offline messages from the TASK_CHANGES stream."""
-        # Use the standard OfflineWorkerRecovery interface
-        try:
-            # Define a custom queue formatter for TASK_CHANGES
-            def task_changes_formatter(queue):
-                # For TASK_CHANGES, return the stream key directly (without the QUEUE: prefix)
-                if queue == 'TASK_CHANGES':
-                    return f"{self.prefix}:TASK_CHANGES"
-                else:
-                    return f"{self.prefix}:QUEUE:{queue}"
-
-            # Create a recovery helper dedicated to TASK_CHANGES
-            task_changes_recovery = OfflineWorkerRecovery(
-                async_redis_client=self.redis_client,
-                redis_prefix=self.prefix,
-                worker_prefix='PG_CONSUMER',
-                queue_formatter=task_changes_formatter
-            )
-
-            # Call the standard recovery method;
-            # TASK_CHANGES is passed as the queue name and handled correctly
-            recovered = await task_changes_recovery.recover_offline_workers(
-                queue='TASK_CHANGES',  # This queue name is used to look up offline workers
-                current_consumer_name=self.consumer_id,
-                process_message_callback=self._process_recovered_task_change_v2
-            )
-
-            return recovered
-
-        except Exception as e:
-            logger.error(f"Error in recover_task_changes_offline_messages: {e}")
-            return 0
-
-    async def _process_recovered_queue_message(self, msg_id, msg_data, queue, consumer_id):
-        """Handle a recovered regular-queue message (matches the OfflineWorkerRecovery callback interface)."""
-        try:
-            logger.info(f"Processing recovered message {msg_id} from queue {queue}, offline worker {consumer_id}")
-
-            # Parse the task info
-            task_info = self._parse_stream_message(msg_id, msg_data, queue)
-            if task_info:
-                # Bulk insert into the database
-                await self._batch_insert_tasks([task_info])
-
-                # ACK the message
-                stream_key = f"{self.prefix}:QUEUE:{queue}"
-                await self.redis_client.xack(stream_key, self.consumer_group, msg_id)
-
-        except Exception as e:
-            logger.error(f"Error processing recovered queue message {msg_id}: {e}")
-
-    async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
-        """Handle a recovered TASK_CHANGES message (matches the OfflineWorkerRecovery callback interface)."""
-        try:
-            # Parse the message
-            event_id = msg_data.get(b'event_id')
-            if event_id:
-                if isinstance(event_id, bytes):
-                    event_id = event_id.decode('utf-8')
-
-                logger.info(f"Processing recovered TASK_CHANGES message: {event_id} from offline worker {consumer_id}")
-
-                # Update the task status
-                await self._update_tasks_by_event([event_id])
-
-                # ACK the message
-                change_stream_key = f"{self.prefix}:TASK_CHANGES"
-                consumer_group = f"{self.prefix}_changes_consumer"
-                await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
-
-        except Exception as e:
-            logger.error(f"Error processing recovered task change {msg_id}: {e}")
-
-    async def _database_maintenance(self):
-        """Periodically run database maintenance tasks."""
-        last_analyze_time = 0
-        analyze_interval = 7200  # Run ANALYZE every 2 hours
-
-        while self._running:
-            try:
-                current_time = time.time()
-
-                if current_time - last_analyze_time > analyze_interval:
-                    async with self.AsyncSessionLocal() as session:
-                        logger.info("Running ANALYZE on tasks table...")
-                        await session.execute(text("ANALYZE tasks"))
-                        await session.commit()
-                        logger.info("ANALYZE completed successfully")
-                    last_analyze_time = current_time
-
-                await asyncio.sleep(300)  # Check every 5 minutes
-
-            except Exception as e:
-                logger.error(f"Error in database maintenance: {e}")
-                await asyncio.sleep(60)
-
-    def _parse_stream_message(self, task_id: str, data: dict, queue_name: str) -> Optional[dict]:
-        """Parse a stream message into task info (returns the full set of fields)."""
-        try:
-            from jettask.utils.serializer import loads_str
-
-            if b'data' in data:
-                task_data = loads_str(data[b'data'])
-            else:
-                task_data = {}
-                for k, v in data.items():
-                    key = k.decode('utf-8') if isinstance(k, bytes) else k
-                    if isinstance(v, bytes):
-                        try:
-                            value = loads_str(v)
-                        except:
-                            value = str(v)
-                    else:
-                        value = v
-                    task_data[key] = value
-
-            task_name = task_data.get('name', task_data.get('task', 'unknown'))
-            created_at = None
-            if 'trigger_time' in task_data:
-                try:
-                    timestamp = float(task_data['trigger_time'])
-                    created_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)
-                except:
-                    pass
-
-            # Return the full set of fields, including those that may be None
-            return {
-                'id': task_id,
-                'queue_name': queue_name,
-                'task_name': task_name,
-                'task_data': json.dumps(task_data),
-                'priority': int(task_data.get('priority', 0)),
-                'retry_count': int(task_data.get('retry', 0)),
-                'max_retry': int(task_data.get('max_retry', 3)),
-                'status': 'pending',
-                'result': None,  # New tasks have no result
-                'error_message': None,  # New tasks have no error message
-                'created_at': created_at,
-                'started_at': None,  # Not started yet
-                'completed_at': None,  # Not completed yet
-                'worker_id': None,  # No worker assigned yet
-                'execution_time': None,  # No execution time yet
-                'duration': None,  # No duration yet
-                'metadata': json.dumps(task_data.get('metadata', {}))
-            }
-        except Exception as e:
-            logger.error(f"Error parsing stream message for task {task_id}: {e}")
-            return None
-
-
-
-async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig,
-                          consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
-    """Run the PostgreSQL consumer."""
-    consumer = PostgreSQLConsumer(pg_config, redis_config, consumer_strategy=consumer_strategy)
-
-    try:
-        await consumer.start()
-        while True:
-            await asyncio.sleep(1)
-
-    except KeyboardInterrupt:
-        logger.info("Received interrupt signal")
-    finally:
-        await consumer.stop()
-
-
-def main():
-    """Main entry point."""
-    from dotenv import load_dotenv
-
-    load_dotenv()
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
-
-    pg_config = PostgreSQLConfig(
-        host=os.getenv('JETTASK_PG_HOST', 'localhost'),
-        port=int(os.getenv('JETTASK_PG_PORT', '5432')),
-        database=os.getenv('JETTASK_PG_DB', 'jettask'),
-        user=os.getenv('JETTASK_PG_USER', 'jettask'),
-        password=os.getenv('JETTASK_PG_PASSWORD', '123456'),
-    )
-
-    redis_config = RedisConfig(
-        host=os.getenv('REDIS_HOST', 'localhost'),
-        port=int(os.getenv('REDIS_PORT', '6379')),
-        db=int(os.getenv('REDIS_DB', '0')),
-        password=os.getenv('REDIS_PASSWORD'),
-    )
-
-    # Read the consumer strategy from environment variables; default to HEARTBEAT
-    strategy_name = os.getenv('JETTASK_CONSUMER_STRATEGY', 'HEARTBEAT').upper()
-    consumer_strategy = ConsumerStrategy.HEARTBEAT  # Default
-
-    if strategy_name == 'FIXED':
-        consumer_strategy = ConsumerStrategy.FIXED
-    elif strategy_name == 'POD':
-        consumer_strategy = ConsumerStrategy.POD
-    elif strategy_name == 'HEARTBEAT':
-        consumer_strategy = ConsumerStrategy.HEARTBEAT
-    else:
-        logger.warning(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")
-
-    logger.info(f"Using consumer strategy: {consumer_strategy.value}")
-
-    asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy))
-
-
-if __name__ == '__main__':
-    main()