jettask 0.2.20__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +4 -0
- jettask/cli.py +12 -8
- jettask/config/lua_scripts.py +37 -0
- jettask/config/nacos_config.py +1 -1
- jettask/core/app.py +313 -340
- jettask/core/container.py +4 -4
- jettask/{persistence → core}/namespace.py +93 -27
- jettask/core/task.py +16 -9
- jettask/core/unified_manager_base.py +136 -26
- jettask/db/__init__.py +67 -0
- jettask/db/base.py +137 -0
- jettask/{utils/db_connector.py → db/connector.py} +130 -26
- jettask/db/models/__init__.py +16 -0
- jettask/db/models/scheduled_task.py +196 -0
- jettask/db/models/task.py +77 -0
- jettask/db/models/task_run.py +85 -0
- jettask/executor/__init__.py +0 -15
- jettask/executor/core.py +76 -31
- jettask/executor/process_entry.py +29 -114
- jettask/executor/task_executor.py +4 -0
- jettask/messaging/event_pool.py +928 -685
- jettask/messaging/scanner.py +30 -0
- jettask/persistence/__init__.py +28 -103
- jettask/persistence/buffer.py +170 -0
- jettask/persistence/consumer.py +330 -249
- jettask/persistence/manager.py +304 -0
- jettask/persistence/persistence.py +391 -0
- jettask/scheduler/__init__.py +15 -3
- jettask/scheduler/{task_crud.py → database.py} +61 -57
- jettask/scheduler/loader.py +2 -2
- jettask/scheduler/{scheduler_coordinator.py → manager.py} +23 -6
- jettask/scheduler/models.py +14 -10
- jettask/scheduler/schedule.py +166 -0
- jettask/scheduler/scheduler.py +12 -11
- jettask/schemas/__init__.py +50 -1
- jettask/schemas/backlog.py +43 -6
- jettask/schemas/namespace.py +70 -19
- jettask/schemas/queue.py +19 -3
- jettask/schemas/responses.py +493 -0
- jettask/task/__init__.py +0 -2
- jettask/task/router.py +3 -0
- jettask/test_connection_monitor.py +1 -1
- jettask/utils/__init__.py +7 -5
- jettask/utils/db_init.py +8 -4
- jettask/utils/namespace_dep.py +167 -0
- jettask/utils/queue_matcher.py +186 -0
- jettask/utils/rate_limit/concurrency_limiter.py +7 -1
- jettask/utils/stream_backlog.py +1 -1
- jettask/webui/__init__.py +0 -1
- jettask/webui/api/__init__.py +4 -4
- jettask/webui/api/alerts.py +806 -71
- jettask/webui/api/example_refactored.py +400 -0
- jettask/webui/api/namespaces.py +390 -45
- jettask/webui/api/overview.py +300 -54
- jettask/webui/api/queues.py +971 -267
- jettask/webui/api/scheduled.py +1249 -56
- jettask/webui/api/settings.py +129 -7
- jettask/webui/api/workers.py +442 -0
- jettask/webui/app.py +46 -2329
- jettask/webui/middleware/__init__.py +6 -0
- jettask/webui/middleware/namespace_middleware.py +135 -0
- jettask/webui/services/__init__.py +146 -0
- jettask/webui/services/heartbeat_service.py +251 -0
- jettask/webui/services/overview_service.py +60 -51
- jettask/webui/services/queue_monitor_service.py +426 -0
- jettask/webui/services/redis_monitor_service.py +87 -0
- jettask/webui/services/settings_service.py +174 -111
- jettask/webui/services/task_monitor_service.py +222 -0
- jettask/webui/services/timeline_pg_service.py +452 -0
- jettask/webui/services/timeline_service.py +189 -0
- jettask/webui/services/worker_monitor_service.py +467 -0
- jettask/webui/utils/__init__.py +11 -0
- jettask/webui/utils/time_utils.py +122 -0
- jettask/worker/lifecycle.py +8 -2
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/METADATA +1 -1
- jettask-0.2.24.dist-info/RECORD +142 -0
- jettask/executor/executor.py +0 -338
- jettask/persistence/backlog_monitor.py +0 -567
- jettask/persistence/base.py +0 -2334
- jettask/persistence/db_manager.py +0 -516
- jettask/persistence/maintenance.py +0 -81
- jettask/persistence/message_consumer.py +0 -259
- jettask/persistence/models.py +0 -49
- jettask/persistence/offline_recovery.py +0 -196
- jettask/persistence/queue_discovery.py +0 -215
- jettask/persistence/task_persistence.py +0 -218
- jettask/persistence/task_updater.py +0 -583
- jettask/scheduler/add_execution_count.sql +0 -11
- jettask/scheduler/add_priority_field.sql +0 -26
- jettask/scheduler/add_scheduler_id.sql +0 -25
- jettask/scheduler/add_scheduler_id_index.sql +0 -10
- jettask/scheduler/make_scheduler_id_required.sql +0 -28
- jettask/scheduler/migrate_interval_seconds.sql +0 -9
- jettask/scheduler/performance_optimization.sql +0 -45
- jettask/scheduler/run_scheduler.py +0 -186
- jettask/scheduler/schema.sql +0 -84
- jettask/task/task_executor.py +0 -318
- jettask/webui/api/analytics.py +0 -323
- jettask/webui/config.py +0 -90
- jettask/webui/models/__init__.py +0 -3
- jettask/webui/models/namespace.py +0 -63
- jettask/webui/namespace_manager/__init__.py +0 -10
- jettask/webui/namespace_manager/multi.py +0 -593
- jettask/webui/namespace_manager/unified.py +0 -193
- jettask/webui/run.py +0 -46
- jettask-0.2.20.dist-info/RECORD +0 -145
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/WHEEL +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.20.dist-info → jettask-0.2.24.dist-info}/top_level.txt +0 -0
jettask/persistence/message_consumer.py
DELETED
@@ -1,259 +0,0 @@
-"""Message consumer module
-
-Consumes messages from Redis Stream queues and persists them to PostgreSQL.
-"""
-
-import asyncio
-import logging
-import traceback
-from typing import List, Dict
-from collections import defaultdict
-
-import redis.asyncio as redis
-from redis.asyncio import Redis
-
-from .task_persistence import TaskPersistence
-
-logger = logging.getLogger(__name__)
-
-
-class MessageConsumer:
-    """Message consumer
-
-    Responsibilities:
-    - Consume messages from Redis Stream queues
-    - Parse messages and persist them to the database
-    - Manage consumer tasks for multiple queues
-    - Handle error retries and ACKs
-    """
-
-    def __init__(
-        self,
-        redis_client: Redis,
-        redis_prefix: str,
-        consumer_group: str,
-        consumer_id: str,
-        task_persistence: TaskPersistence,
-        queue_discovery: 'QueueDiscovery'
-    ):
-        """Initialize the message consumer
-
-        Args:
-            redis_client: Async Redis client
-            redis_prefix: Redis key prefix
-            consumer_group: Consumer group name
-            consumer_id: Consumer ID
-            task_persistence: Task persistence handler
-            queue_discovery: Queue discovery component
-        """
-        self.redis_client = redis_client
-        self.redis_prefix = redis_prefix
-        self.consumer_group = consumer_group
-        self.consumer_id = consumer_id
-        self.task_persistence = task_persistence
-        self.queue_discovery = queue_discovery
-
-        # Error counters
-        self._consecutive_errors = defaultdict(int)
-
-        # Cache of processed task IDs (used to optimize queries)
-        self._processed_task_ids = set()
-        self._processed_ids_lock = asyncio.Lock()
-        self._processed_ids_max_size = 100000
-        self._processed_ids_cleanup_interval = 300
-
-        self._running = False
-        self._consume_task = None
-        self._queue_tasks = {}
-
-    async def start(self):
-        """Start the consumer"""
-        self._running = True
-        self._consume_task = asyncio.create_task(self._consume_queues())
-        logger.debug("MessageConsumer started")
-
-    async def stop(self):
-        """Stop the consumer"""
-        self._running = False
-
-        if self._consume_task:
-            self._consume_task.cancel()
-            try:
-                await self._consume_task
-            except asyncio.CancelledError:
-                pass
-
-        # Cancel all queue tasks
-        for task in self._queue_tasks.values():
-            task.cancel()
-
-        if self._queue_tasks:
-            await asyncio.gather(*self._queue_tasks.values(), return_exceptions=True)
-
-        logger.debug("MessageConsumer stopped")
-
-    async def _consume_queues(self):
-        """Start consumer tasks for all queues"""
-        while self._running:
-            try:
-                # Get the currently known queues
-                known_queues = self.queue_discovery.get_known_queues()
-
-                # Start a consumer task for each queue
-                for queue in known_queues:
-                    if queue not in self._queue_tasks or self._queue_tasks[queue].done():
-                        self._queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
-                        logger.debug(f"Started consumer task for queue: {queue}")
-
-                # Remove tasks for queues that no longer exist
-                for queue in list(self._queue_tasks.keys()):
-                    if queue not in known_queues:
-                        self._queue_tasks[queue].cancel()
-                        del self._queue_tasks[queue]
-                        logger.debug(f"Stopped consumer task for removed queue: {queue}")
-
-                await asyncio.sleep(10)
-
-            except Exception as e:
-                logger.error(f"Error in consume_queues manager: {e}")
-                logger.error(traceback.format_exc())
-                await asyncio.sleep(5)
-
-    async def _consume_queue(self, queue_name: str):
-        """Consume tasks from a single queue (including priority queues)"""
-        # Determine whether this is a priority queue
-        is_priority_queue = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
-
-        if is_priority_queue:
-            # Priority queue format: base_queue:priority (e.g. robust_bench2:2)
-            base_queue = queue_name.rsplit(':', 1)[0]
-            priority = queue_name.rsplit(':', 1)[1]
-            stream_key = f"{self.redis_prefix}:QUEUE:{base_queue}:{priority}"
-        else:
-            # Regular queue
-            stream_key = f"{self.redis_prefix}:QUEUE:{queue_name}"
-
-        logger.debug(f"Consuming queue: {queue_name}, stream_key: {stream_key}, is_priority: {is_priority_queue}")
-
-        check_backlog = True
-        lastid = "0-0"
-
-        # pg_consumer should use a single, shared consumer_id rather than creating one per queue:
-        # its job is to consume messages from every queue and write them to the database.
-        # It is not a real task executor, so a dedicated consumer per queue is unnecessary.
-        consumer_name = self.consumer_id
-
-        # ConsumerManager automatically recovers pending messages from offline workers,
-        # so no manual recovery is needed here.
-
-        while self._running and queue_name in self.queue_discovery.get_known_queues():
-            try:
-                myid = lastid if check_backlog else ">"
-
-                messages = await self.redis_client.xreadgroup(
-                    self.consumer_group,
-                    consumer_name,  # use the consumer_name managed by ConsumerManager
-                    {stream_key: myid},
-                    count=10000,
-                    block=1000 if not check_backlog else 0
-                )
-
-                if not messages or (messages and len(messages[0][1]) == 0):
-                    check_backlog = False
-                    continue
-
-                if messages:
-                    await self._process_messages(messages)
-                    self._consecutive_errors[queue_name] = 0
-
-                    if messages[0] and messages[0][1]:
-                        lastid = messages[0][1][-1][0].decode('utf-8') if isinstance(messages[0][1][-1][0], bytes) else messages[0][1][-1][0]
-                        check_backlog = len(messages[0][1]) >= 2000
-
-            except redis.ResponseError as e:
-                if "NOGROUP" in str(e):
-                    try:
-                        await self.redis_client.xgroup_create(
-                            stream_key, self.consumer_group, id='0', mkstream=True
-                        )
-                        logger.debug(f"Recreated consumer group for queue: {queue_name}")
-                        check_backlog = True
-                        lastid = "0-0"
-                    except:
-                        pass
-                else:
-                    logger.error(f"Redis error for queue {queue_name}: {e}")
-                    logger.error(traceback.format_exc())
-                self._consecutive_errors[queue_name] += 1
-
-                if self._consecutive_errors[queue_name] > 10:
-                    logger.debug(f"Too many errors for queue {queue_name}, will retry later")
-                    await asyncio.sleep(30)
-                    self._consecutive_errors[queue_name] = 0
-
-            except Exception as e:
-                logger.error(f"Error consuming queue {queue_name}: {e}", exc_info=True)
-                self._consecutive_errors[queue_name] += 1
-                await asyncio.sleep(1)
-
-    async def _process_messages(self, messages: List):
-        """Process messages and save them to PostgreSQL"""
-        tasks_to_insert = []
-        ack_batch = []
-
-        for stream_key, stream_messages in messages:
-            if not stream_messages:
-                continue
-
-            stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
-            msg_ids_to_ack = []
-
-            for msg_id, data in stream_messages:
-                try:
-                    if not msg_id or not data:
-                        continue
-
-                    msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
-
-                    # Parse the message with TaskPersistence
-                    task_info = self.task_persistence.parse_stream_message(msg_id_str, data)
-                    if task_info:
-                        tasks_to_insert.append(task_info)
-                        msg_ids_to_ack.append(msg_id)
-
-                except Exception as e:
-                    logger.error(f"Error processing message {msg_id}: {e}")
-                    logger.error(traceback.format_exc())
-
-            if msg_ids_to_ack:
-                ack_batch.append((stream_key, msg_ids_to_ack))
-
-        if tasks_to_insert:
-            # Insert tasks with TaskPersistence
-            inserted_count = await self.task_persistence.insert_tasks(tasks_to_insert)
-
-            # Add the successfully inserted task IDs to the in-memory set
-            async with self._processed_ids_lock:
-                for task in tasks_to_insert:
-                    self._processed_task_ids.add(task['id'])
-
-                # If the set grows too large, drop the oldest half
-                if len(self._processed_task_ids) > self._processed_ids_max_size:
-                    # Keep only the most recent half of the IDs
-                    ids_list = list(self._processed_task_ids)
-                    keep_count = self._processed_ids_max_size // 2
-                    self._processed_task_ids = set(ids_list[-keep_count:])
-                    logger.debug(f"Cleaned processed IDs cache, kept {keep_count} most recent IDs")
-
-        # ACK all messages (even if some inserts failed, ACK them to avoid reprocessing)
-        if ack_batch:
-            pipeline = self.redis_client.pipeline()
-            for stream_key, msg_ids in ack_batch:
-                pipeline.xack(stream_key, self.consumer_group, *msg_ids)
-
-            try:
-                await pipeline.execute()
-                total_acked = sum(len(msg_ids) for _, msg_ids in ack_batch)
-                logger.debug(f"Successfully ACKed {total_acked} messages")
-            except Exception as e:
-                logger.error(f"Error executing batch ACK: {e}")
jettask/persistence/models.py
DELETED
@@ -1,49 +0,0 @@
-"""SQLAlchemy models for JetTask WebUI database."""
-from datetime import datetime
-from typing import Optional, Dict, Any, List
-from sqlalchemy import (
-    Column, String, Integer, Float, DateTime, Text, JSON,
-    ARRAY, UniqueConstraint, Index, func
-)
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.dialects.postgresql import JSONB
-
-Base = declarative_base()
-
-
-class Task(Base):
-    """Task table model"""
-    __tablename__ = 'tasks'
-
-    id = Column(String(255), primary_key=True)  # event ID from the Redis Stream
-    queue_name = Column(String(255), nullable=False)
-    task_name = Column(String(255), nullable=False)
-    task_data = Column(JSONB)  # raw task payload
-    priority = Column(Integer, default=0)
-    retry_count = Column(Integer, default=0)
-    max_retry = Column(Integer, default=3)
-    status = Column(String(50), default='pending')  # pending, running, success, failed, timeout
-    result = Column(JSONB)  # execution result
-    error_message = Column(Text)
-    created_at = Column(DateTime(timezone=True), default=func.current_timestamp())
-    started_at = Column(DateTime(timezone=True))
-    completed_at = Column(DateTime(timezone=True))
-    worker_id = Column(String(255))
-    execution_time = Column(Float)  # task execution time (seconds)
-    duration = Column(Float)  # total task duration (seconds)
-    task_metadata = Column('metadata', JSONB)  # extra metadata; the database column is still named "metadata"
-
-    __table_args__ = (
-        Index('idx_tasks_queue_name', 'queue_name'),
-        Index('idx_tasks_status', 'status'),
-        # Composite index: optimizes queries by queue and status
-        Index('idx_tasks_queue_status', 'queue_name', 'status'),
-        # Time index: optimizes time-range queries
-        Index('idx_tasks_created_at', 'created_at'),
-        # Worker index: optimizes lookups of a specific worker's tasks
-        Index('idx_tasks_worker_id', 'worker_id',
-              postgresql_where=(worker_id.isnot(None))),
-    )
-
-
-# The QueueStats and Worker tables are deprecated and no longer used
jettask/persistence/offline_recovery.py
DELETED
@@ -1,196 +0,0 @@
-"""Offline worker recovery module
-
-Recovers messages from offline PG_CONSUMER workers, including offline messages on the TASK_CHANGES stream.
-"""
-
-import asyncio
-import logging
-import msgpack
-import traceback
-from typing import Optional
-
-from redis.asyncio import Redis
-from jettask.worker.recovery import OfflineWorkerRecovery
-
-logger = logging.getLogger(__name__)
-
-
-class OfflineRecoveryHandler:
-    """Offline worker recovery handler
-
-    Responsibilities:
-    - Start the offline worker recovery service
-    - Recover offline messages from the TASK_CHANGES stream
-    - Process recovered messages and update task status
-    """
-
-    def __init__(
-        self,
-        redis_client: Redis,
-        redis_prefix: str,
-        consumer_id: str,
-        task_updater: 'TaskUpdater'  # string type hint to avoid a circular import
-    ):
-        """Initialize the offline recovery handler
-
-        Args:
-            redis_client: Async Redis client
-            redis_prefix: Redis key prefix
-            consumer_id: Consumer ID
-            task_updater: Task updater instance (used to process recovered messages)
-        """
-        self.redis_client = redis_client
-        self.redis_prefix = redis_prefix
-        self.consumer_id = consumer_id
-        self.task_updater = task_updater
-
-        # Create a WorkerState instance (used to query worker state)
-        from jettask.worker.manager import WorkerState
-        self.worker_state = WorkerState(
-            redis_client=None,  # the persistence module uses the async client
-            async_redis_client=redis_client,
-            redis_prefix=redis_prefix
-        )
-
-        # Offline worker recovery helper (recovers offline messages from the TASK_CHANGES stream).
-        # Note: consumer_manager is not passed here because it is initialized at start time.
-        self.offline_recovery = None
-
-        self._running = False
-        self._recovery_task = None
-
-    def set_consumer_manager(self, consumer_manager):
-        """Set the ConsumerManager (deferred initialization)
-
-        Args:
-            consumer_manager: ConsumerManager instance
-        """
-        self.offline_recovery = OfflineWorkerRecovery(
-            async_redis_client=self.redis_client,
-            redis_prefix=self.redis_prefix,
-            worker_prefix='PG_CONSUMER',  # use the PG_CONSUMER prefix
-            consumer_manager=consumer_manager,
-            worker_state=self.worker_state  # pass the WorkerState created in __init__
-        )
-
-    async def start(self):
-        """Start the offline recovery service"""
-        if not self.offline_recovery:
-            logger.warning("OfflineRecovery not initialized, please call set_consumer_manager first")
-            return
-
-        self._running = True
-        self._recovery_task = asyncio.create_task(self._recovery_loop())
-        logger.debug("OfflineRecoveryHandler started")
-
-    async def stop(self):
-        """Stop the offline recovery service"""
-        self._running = False
-
-        if self.offline_recovery:
-            self.offline_recovery.stop()  # stop() is not an async method
-
-        if self._recovery_task:
-            self._recovery_task.cancel()
-            try:
-                await self._recovery_task
-            except asyncio.CancelledError:
-                pass
-
-        logger.debug("OfflineRecoveryHandler stopped")
-
-    async def _recovery_loop(self):
-        """Offline recovery loop"""
-        while self._running:
-            try:
-                total_recovered = 0
-
-                # Recover messages from the TASK_CHANGES stream
-                recovered = await self._recover_task_changes_offline_messages()
-                if recovered > 0:
-                    logger.debug(f"Recovered {recovered} TASK_CHANGES messages")
-                    total_recovered += recovered
-
-                if total_recovered > 0:
-                    logger.debug(f"Total recovered {total_recovered} messages in this cycle")
-
-                # Scan once per second
-                await asyncio.sleep(1)
-
-            except Exception as e:
-                logger.error(f"Error in offline recovery service: {e}")
-                await asyncio.sleep(10)
-
-    async def _recover_task_changes_offline_messages(self) -> int:
-        """Recover offline messages from the TASK_CHANGES stream"""
-        # Use the standard OfflineWorkerRecovery interface
-        try:
-            # Define a custom queue formatter for TASK_CHANGES
-            def task_changes_formatter(queue):
-                # For TASK_CHANGES, return the stream key directly (without the QUEUE: prefix)
-                if queue == 'TASK_CHANGES':
-                    return f"{self.redis_prefix}:TASK_CHANGES"
-                else:
-                    return f"{self.redis_prefix}:QUEUE:{queue}"
-
-            # Create a recovery helper dedicated to TASK_CHANGES
-            task_changes_recovery = OfflineWorkerRecovery(
-                async_redis_client=self.redis_client,
-                redis_prefix=self.redis_prefix,
-                worker_prefix='PG_CONSUMER',
-                queue_formatter=task_changes_formatter,
-                worker_state=self.worker_state  # pass the WorkerState created in __init__
-            )
-
-            # Call the standard recovery method.
-            # TASK_CHANGES is passed as the queue name and is handled correctly.
-            recovered = await task_changes_recovery.recover_offline_workers(
-                queue='TASK_CHANGES',  # this queue name is used to look up offline workers
-                current_consumer_name=self.consumer_id,
-                process_message_callback=self._process_recovered_task_change_v2
-            )
-
-            return recovered
-
-        except Exception as e:
-            logger.error(f"Error in recover_task_changes_offline_messages: {e}")
-            return 0
-
-    async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
-        """Process a recovered TASK_CHANGES message (matches the OfflineWorkerRecovery callback interface)"""
-        try:
-            logger.debug(f'Processing recovered TASK_CHANGES message {msg_data=}')
-            # Parse the message - task_id is now used instead of event_id
-            if b'task_id' in msg_data:
-                # Unpack task_id with msgpack
-                compressed_task_id = msg_data[b'task_id']
-                task_key = msgpack.unpackb(compressed_task_id)
-                task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
-
-                # Extract stream_id from the full task_key
-                # Format: namespace:TASK:stream_id:queue_name
-                stream_id = None
-                if ':TASK:' in task_key:
-                    parts = task_key.split(':TASK:')
-                    if len(parts) == 2:
-                        # Extract stream_id from the right-hand part
-                        right_parts = parts[1].split(':')
-                        if right_parts:
-                            stream_id = right_parts[0]  # extract the stream_id
-
-                if stream_id:
-                    logger.debug(f"Processing recovered TASK_CHANGES message: {stream_id} from offline worker {consumer_id}")
-                    # Update task status - pass a (stream_id, task_key) tuple
-                    # using task_updater's internal method
-                    await self.task_updater._update_tasks_by_event([(stream_id, task_key)])
-                else:
-                    logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
-
-            # ACK the message
-            change_stream_key = f"{self.redis_prefix}:TASK_CHANGES"
-            consumer_group = f"{self.redis_prefix}_changes_consumer"
-            await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
-
-        except Exception as e:
-            logger.error(f"Error processing recovered task change {msg_id}: {e}")
-            logger.error(traceback.format_exc())