jettask 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +10 -3
- jettask/cli.py +314 -228
- jettask/config/__init__.py +9 -1
- jettask/config/config.py +245 -0
- jettask/config/env_loader.py +381 -0
- jettask/config/lua_scripts.py +158 -0
- jettask/config/nacos_config.py +132 -5
- jettask/core/__init__.py +1 -1
- jettask/core/app.py +1573 -666
- jettask/core/app_importer.py +33 -16
- jettask/core/container.py +532 -0
- jettask/core/task.py +1 -4
- jettask/core/unified_manager_base.py +2 -2
- jettask/executor/__init__.py +38 -0
- jettask/executor/core.py +625 -0
- jettask/executor/executor.py +338 -0
- jettask/executor/orchestrator.py +290 -0
- jettask/executor/process_entry.py +638 -0
- jettask/executor/task_executor.py +317 -0
- jettask/messaging/__init__.py +68 -0
- jettask/messaging/event_pool.py +2188 -0
- jettask/messaging/reader.py +519 -0
- jettask/messaging/registry.py +266 -0
- jettask/messaging/scanner.py +369 -0
- jettask/messaging/sender.py +312 -0
- jettask/persistence/__init__.py +118 -0
- jettask/persistence/backlog_monitor.py +567 -0
- jettask/{backend/data_access.py → persistence/base.py} +58 -57
- jettask/persistence/consumer.py +315 -0
- jettask/{core → persistence}/db_manager.py +23 -22
- jettask/persistence/maintenance.py +81 -0
- jettask/persistence/message_consumer.py +259 -0
- jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
- jettask/persistence/offline_recovery.py +196 -0
- jettask/persistence/queue_discovery.py +215 -0
- jettask/persistence/task_persistence.py +218 -0
- jettask/persistence/task_updater.py +583 -0
- jettask/scheduler/__init__.py +2 -2
- jettask/scheduler/loader.py +6 -5
- jettask/scheduler/run_scheduler.py +1 -1
- jettask/scheduler/scheduler.py +7 -7
- jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
- jettask/task/__init__.py +16 -0
- jettask/{router.py → task/router.py} +26 -8
- jettask/task/task_center/__init__.py +9 -0
- jettask/task/task_executor.py +318 -0
- jettask/task/task_registry.py +291 -0
- jettask/test_connection_monitor.py +73 -0
- jettask/utils/__init__.py +31 -1
- jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
- jettask/utils/db_connector.py +1629 -0
- jettask/{db_init.py → utils/db_init.py} +1 -1
- jettask/utils/rate_limit/__init__.py +30 -0
- jettask/utils/rate_limit/concurrency_limiter.py +665 -0
- jettask/utils/rate_limit/config.py +145 -0
- jettask/utils/rate_limit/limiter.py +41 -0
- jettask/utils/rate_limit/manager.py +269 -0
- jettask/utils/rate_limit/qps_limiter.py +154 -0
- jettask/utils/rate_limit/task_limiter.py +384 -0
- jettask/utils/serializer.py +3 -0
- jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
- jettask/utils/time_sync.py +173 -0
- jettask/webui/__init__.py +27 -0
- jettask/{api/v1 → webui/api}/alerts.py +1 -1
- jettask/{api/v1 → webui/api}/analytics.py +2 -2
- jettask/{api/v1 → webui/api}/namespaces.py +1 -1
- jettask/{api/v1 → webui/api}/overview.py +1 -1
- jettask/{api/v1 → webui/api}/queues.py +3 -3
- jettask/{api/v1 → webui/api}/scheduled.py +1 -1
- jettask/{api/v1 → webui/api}/settings.py +1 -1
- jettask/{api.py → webui/app.py} +253 -145
- jettask/webui/namespace_manager/__init__.py +10 -0
- jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
- jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
- jettask/{run.py → webui/run.py} +2 -2
- jettask/{services → webui/services}/__init__.py +1 -3
- jettask/{services → webui/services}/overview_service.py +34 -16
- jettask/{services → webui/services}/queue_service.py +1 -1
- jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
- jettask/{services → webui/services}/settings_service.py +1 -1
- jettask/worker/__init__.py +53 -0
- jettask/worker/lifecycle.py +1507 -0
- jettask/worker/manager.py +583 -0
- jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
- jettask-0.2.20.dist-info/RECORD +145 -0
- jettask/__main__.py +0 -140
- jettask/api/__init__.py +0 -103
- jettask/backend/__init__.py +0 -1
- jettask/backend/api/__init__.py +0 -3
- jettask/backend/api/v1/__init__.py +0 -17
- jettask/backend/api/v1/monitoring.py +0 -431
- jettask/backend/api/v1/namespaces.py +0 -504
- jettask/backend/api/v1/queues.py +0 -342
- jettask/backend/api/v1/tasks.py +0 -367
- jettask/backend/core/__init__.py +0 -3
- jettask/backend/core/cache.py +0 -221
- jettask/backend/core/database.py +0 -200
- jettask/backend/core/exceptions.py +0 -102
- jettask/backend/dependencies.py +0 -261
- jettask/backend/init_meta_db.py +0 -158
- jettask/backend/main.py +0 -1426
- jettask/backend/main_unified.py +0 -78
- jettask/backend/main_v2.py +0 -394
- jettask/backend/models/__init__.py +0 -3
- jettask/backend/models/requests.py +0 -236
- jettask/backend/models/responses.py +0 -230
- jettask/backend/namespace_api_old.py +0 -267
- jettask/backend/services/__init__.py +0 -3
- jettask/backend/start.py +0 -42
- jettask/backend/unified_api_router.py +0 -1541
- jettask/cleanup_deprecated_tables.sql +0 -16
- jettask/core/consumer_manager.py +0 -1695
- jettask/core/delay_scanner.py +0 -256
- jettask/core/event_pool.py +0 -1700
- jettask/core/heartbeat_process.py +0 -222
- jettask/core/task_batch.py +0 -153
- jettask/core/worker_scanner.py +0 -271
- jettask/executors/__init__.py +0 -5
- jettask/executors/asyncio.py +0 -876
- jettask/executors/base.py +0 -30
- jettask/executors/common.py +0 -148
- jettask/executors/multi_asyncio.py +0 -309
- jettask/gradio_app.py +0 -570
- jettask/integrated_gradio_app.py +0 -1088
- jettask/main.py +0 -0
- jettask/monitoring/__init__.py +0 -3
- jettask/pg_consumer.py +0 -1896
- jettask/run_monitor.py +0 -22
- jettask/run_webui.py +0 -148
- jettask/scheduler/multi_namespace_scheduler.py +0 -294
- jettask/scheduler/unified_manager.py +0 -450
- jettask/task_center_client.py +0 -150
- jettask/utils/serializer_optimized.py +0 -33
- jettask/webui_exceptions.py +0 -67
- jettask-0.2.19.dist-info/RECORD +0 -150
- /jettask/{constants.py → config/constants.py} +0 -0
- /jettask/{backend/config.py → config/task_center.py} +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
- /jettask/{models.py → persistence/models.py} +0 -0
- /jettask/scheduler/{manager.py → task_crud.py} +0 -0
- /jettask/{schema.sql → schemas/schema.sql} +0 -0
- /jettask/{task_center.py → task/task_center/client.py} +0 -0
- /jettask/{monitoring → utils}/file_watcher.py +0 -0
- /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
- /jettask/{api/v1 → webui/api}/__init__.py +0 -0
- /jettask/{webui_config.py → webui/config.py} +0 -0
- /jettask/{webui_models → webui/models}/__init__.py +0 -0
- /jettask/{webui_models → webui/models}/namespace.py +0 -0
- /jettask/{services → webui/services}/alert_service.py +0 -0
- /jettask/{services → webui/services}/analytics_service.py +0 -0
- /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
- /jettask/{services → webui/services}/task_service.py +0 -0
- /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
- /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
@@ -2,19 +2,35 @@
|
|
2
2
|
简化的离线worker消息恢复模块
|
3
3
|
"""
|
4
4
|
import asyncio
|
5
|
+
import json
|
5
6
|
import logging
|
6
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
|
7
8
|
from redis.asyncio.lock import Lock as AsyncLock
|
8
9
|
|
9
10
|
import msgpack
|
10
11
|
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from jettask.worker.manager import WorkerState
|
14
|
+
|
11
15
|
logger = logging.getLogger(__name__)
|
12
16
|
|
13
17
|
|
14
18
|
class OfflineWorkerRecovery:
|
15
19
|
"""离线worker消息恢复处理器"""
|
16
|
-
|
17
|
-
def __init__(self, async_redis_client, consumer_manager=None, redis_prefix='jettask', worker_prefix='WORKER', queue_formatter=None):
|
20
|
+
|
21
|
+
def __init__(self, async_redis_client, consumer_manager=None, redis_prefix='jettask', worker_prefix='WORKER', queue_formatter=None, queue_registry=None, worker_state: Optional['WorkerState'] = None):
|
22
|
+
"""
|
23
|
+
初始化离线worker消息恢复处理器
|
24
|
+
|
25
|
+
Args:
|
26
|
+
async_redis_client: 异步Redis客户端
|
27
|
+
consumer_manager: 消费者管理器
|
28
|
+
redis_prefix: Redis键前缀
|
29
|
+
worker_prefix: Worker键前缀
|
30
|
+
queue_formatter: 队列格式化函数
|
31
|
+
queue_registry: 队列注册表
|
32
|
+
worker_state: WorkerState实例(用于查询Worker状态)
|
33
|
+
"""
|
18
34
|
self.async_redis_client = async_redis_client
|
19
35
|
self.consumer_manager = consumer_manager
|
20
36
|
self.redis_prefix = redis_prefix
|
@@ -22,206 +38,244 @@ class OfflineWorkerRecovery:
|
|
22
38
|
self._stop_recovery = False
|
23
39
|
# 队列格式化函数,默认使用 prefix:QUEUE:queue_name 格式
|
24
40
|
self.queue_formatter = queue_formatter or (lambda q: f"{self.redis_prefix}:QUEUE:{q}")
|
41
|
+
# 通过 consumer_manager 访问 worker_state_manager
|
42
|
+
self.worker_state_manager = consumer_manager.app.worker_state_manager if (consumer_manager and hasattr(consumer_manager, 'app') and consumer_manager.app) else None
|
43
|
+
# 队列注册表,用于获取优先级队列
|
44
|
+
self.queue_registry = queue_registry
|
45
|
+
# Worker状态查询器(必须传入)
|
46
|
+
self._worker_state: Optional['WorkerState'] = worker_state
|
25
47
|
|
26
48
|
async def recover_offline_workers(self,
|
27
49
|
queue: str,
|
28
50
|
current_consumer_name: str = None,
|
29
51
|
event_queue: Optional[asyncio.Queue] = None,
|
30
52
|
process_message_callback: Optional[callable] = None,
|
31
|
-
consumer_group_suffix: Optional[str] = None
|
53
|
+
consumer_group_suffix: Optional[str] = None,
|
54
|
+
event_queue_callback: Optional[callable] = None) -> int:
|
32
55
|
"""
|
33
56
|
恢复指定队列的离线worker的pending消息
|
57
|
+
|
58
|
+
支持优先级队列:
|
59
|
+
- 如果 queue_registry 可用,会自动获取基础队列的所有优先级队列并恢复
|
60
|
+
- 如果不可用,只恢复指定的队列
|
34
61
|
"""
|
35
62
|
total_recovered = 0
|
36
|
-
logger.debug(f'恢复指定队列的离线worker的pending
|
63
|
+
logger.debug(f'恢复指定队列的离线worker的pending消息: {queue}')
|
64
|
+
|
37
65
|
try:
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
logger.debug(f"
|
57
|
-
|
58
|
-
#
|
59
|
-
offline_workers = await self._find_offline_workers(
|
66
|
+
# 确定基础队列名(去除优先级后缀)
|
67
|
+
base_queue = queue.split(':')[0]
|
68
|
+
|
69
|
+
# 获取需要恢复的所有队列(包括优先级队列)
|
70
|
+
queues_to_recover = [queue]
|
71
|
+
|
72
|
+
# 如果 queue_registry 可用,并且是基础队列,获取所有优先级队列
|
73
|
+
if self.queue_registry and base_queue == queue:
|
74
|
+
try:
|
75
|
+
priority_queues = await self.queue_registry.get_priority_queues_for_base(queue)
|
76
|
+
if priority_queues:
|
77
|
+
logger.debug(f"Found {len(priority_queues)} priority queues for base queue {queue}: {priority_queues}")
|
78
|
+
queues_to_recover.extend(priority_queues)
|
79
|
+
else:
|
80
|
+
logger.debug(f"No priority queues found for base queue {queue}")
|
81
|
+
except Exception as e:
|
82
|
+
logger.warning(f"Error getting priority queues for {queue}: {e}, will only recover base queue")
|
83
|
+
|
84
|
+
logger.debug(f"Will recover {len(queues_to_recover)} queue(s): {queues_to_recover}")
|
85
|
+
|
86
|
+
# 只查找一次离线 worker(使用基础队列名,因为 worker 的 queues 字段只存储基础队列)
|
87
|
+
offline_workers = await self._find_offline_workers(base_queue)
|
60
88
|
if not offline_workers:
|
61
|
-
logger.debug(f"No offline workers found for queue {
|
89
|
+
logger.debug(f"No offline workers found for base queue {base_queue}")
|
62
90
|
return 0
|
63
|
-
|
64
|
-
logger.
|
65
|
-
|
66
|
-
#
|
91
|
+
|
92
|
+
logger.info(f"Found {len(offline_workers)} offline workers for base queue {base_queue}, starting recovery...")
|
93
|
+
|
94
|
+
# 对每个离线worker,提前提取 group_infos(使用基础队列)
|
95
|
+
workers_with_groups = []
|
67
96
|
for worker_key, worker_data in offline_workers:
|
97
|
+
group_infos = []
|
98
|
+
for key, value in worker_data.items():
|
99
|
+
if key.startswith('group_info:'):
|
100
|
+
try:
|
101
|
+
group_info = json.loads(value)
|
102
|
+
# 使用基础队列名进行比较
|
103
|
+
if group_info.get('queue') == base_queue:
|
104
|
+
group_infos.append(group_info)
|
105
|
+
logger.debug(f"Found group info for base queue {base_queue}: {group_info}")
|
106
|
+
except Exception as e:
|
107
|
+
logger.error(f"Error parsing group_info: {e}")
|
108
|
+
|
109
|
+
workers_with_groups.append((worker_key, worker_data, group_infos))
|
110
|
+
logger.info(f"Worker {worker_key} has {len(group_infos)} groups for base queue {base_queue}")
|
111
|
+
|
112
|
+
# 对每个队列(基础队列 + 优先级队列)恢复消息
|
113
|
+
for queue_to_recover in queues_to_recover:
|
68
114
|
if self._stop_recovery:
|
69
115
|
logger.debug("Stopping recovery due to shutdown signal")
|
70
116
|
break
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
117
|
+
|
118
|
+
logger.info(f"Recovering queue: {queue_to_recover}")
|
119
|
+
|
120
|
+
# 获取当前consumer名称
|
121
|
+
consumer_name = current_consumer_name
|
122
|
+
if not consumer_name and self.consumer_manager:
|
123
|
+
consumer_name = self.consumer_manager.get_consumer_name(base_queue)
|
124
|
+
|
125
|
+
# 统一 group_name 架构:所有队列(包括优先级队列)使用同一个 consumer name
|
126
|
+
# 不再需要为优先级队列添加后缀
|
127
|
+
|
128
|
+
if not consumer_name:
|
129
|
+
logger.error(f"Cannot get current consumer name for queue {queue_to_recover}")
|
130
|
+
continue
|
131
|
+
|
132
|
+
logger.info(f"Starting recovery for queue {queue_to_recover} with consumer {consumer_name}")
|
133
|
+
|
134
|
+
# 处理每个离线worker的这个队列的消息(传入预提取的 group_infos)
|
135
|
+
for worker_key, worker_data, group_infos in workers_with_groups:
|
136
|
+
if self._stop_recovery:
|
137
|
+
logger.debug("Stopping recovery due to shutdown signal")
|
138
|
+
break
|
139
|
+
|
140
|
+
logger.info(f"Recovering messages from worker {worker_key} for queue {queue_to_recover}")
|
141
|
+
recovered = await self._recover_worker_messages(
|
142
|
+
queue=queue_to_recover,
|
143
|
+
worker_key=worker_key,
|
144
|
+
worker_data=worker_data,
|
145
|
+
group_infos=group_infos, # 传入预提取的 group_infos
|
146
|
+
current_consumer_name=consumer_name,
|
147
|
+
event_queue=event_queue,
|
148
|
+
process_message_callback=process_message_callback,
|
149
|
+
consumer_group_suffix=consumer_group_suffix,
|
150
|
+
event_queue_callback=event_queue_callback # 传入回调函数
|
151
|
+
)
|
152
|
+
|
153
|
+
total_recovered += recovered
|
154
|
+
|
86
155
|
except Exception as e:
|
87
156
|
logger.error(f"Error recovering offline workers for queue {queue}: {e}")
|
88
|
-
|
157
|
+
import traceback
|
158
|
+
traceback.print_exc()
|
159
|
+
|
89
160
|
return total_recovered
|
90
161
|
|
91
162
|
async def _find_offline_workers(self, queue: str) -> List[Tuple[str, Dict]]:
|
92
|
-
"""查找指定队列的离线worker
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
# 跳过非worker键
|
111
|
-
if any(x in worker_key for x in [':HISTORY:', ':REUSE:LOCK', ':REUSING']):
|
112
|
-
continue
|
113
|
-
|
114
|
-
try:
|
115
|
-
worker_data = await self.async_redis_client.hgetall(worker_key)
|
116
|
-
if not worker_data:
|
117
|
-
continue
|
118
|
-
|
119
|
-
# 解码二进制数据
|
120
|
-
decoded_worker_data = {}
|
121
|
-
for k, v in worker_data.items():
|
122
|
-
key = k.decode('utf-8') if isinstance(k, bytes) else k
|
123
|
-
value = v.decode('utf-8') if isinstance(v, bytes) else v
|
124
|
-
decoded_worker_data[key] = value
|
125
|
-
|
126
|
-
# logger.debug(f'{worker_key=} {decoded_worker_data=}')
|
127
|
-
# logger.debug(f'{decoded_worker_data=}')
|
128
|
-
# 检查worker是否离线且消息未转移
|
129
|
-
is_alive = decoded_worker_data.get('is_alive', 'false').lower() == 'true'
|
130
|
-
messages_transferred = decoded_worker_data.get('messages_transferred', 'false').lower() == 'true'
|
131
|
-
# logger.debug(f'{worker_key=} {is_alive=} {messages_transferred=} {not is_alive and not messages_transferred}')
|
132
|
-
# 找到离线且消息未转移的worker
|
133
|
-
if not is_alive and not messages_transferred:
|
134
|
-
queues_str = decoded_worker_data.get('queues', '')
|
135
|
-
worker_queues = queues_str.split(',') if queues_str else []
|
136
|
-
|
137
|
-
# logger.debug(f'{worker_queues=} {queue=}')
|
138
|
-
# 检查这个worker是否负责当前队列
|
139
|
-
# 支持优先级队列:如果queue是"base:priority"格式,检查worker是否负责base队列
|
140
|
-
queue_matched = False
|
141
|
-
if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
|
142
|
-
# 这是优先级队列,提取基础队列名
|
143
|
-
base_queue = queue.rsplit(':', 1)[0]
|
144
|
-
queue_matched = base_queue in worker_queues
|
145
|
-
else:
|
146
|
-
# 普通队列
|
147
|
-
queue_matched = queue in worker_queues
|
148
|
-
if queue_matched:
|
149
|
-
offline_workers.append((worker_key, decoded_worker_data))
|
150
|
-
|
151
|
-
except Exception as e:
|
152
|
-
logger.error(f"Error processing worker key {worker_key}: {e}")
|
153
|
-
continue
|
154
|
-
|
155
|
-
# 当cursor返回0时,表示扫描完成
|
156
|
-
if cursor == 0:
|
157
|
-
break
|
158
|
-
|
159
|
-
except Exception as e:
|
160
|
-
logger.error(f"Error finding offline workers: {e}")
|
161
|
-
|
162
|
-
return offline_workers
|
163
|
+
"""查找指定队列的离线worker
|
164
|
+
|
165
|
+
委托给 WorkerState.find_offline_workers_for_queue() 方法
|
166
|
+
|
167
|
+
注意:worker_state 必须在 OfflineWorkerRecovery 初始化时传入
|
168
|
+
"""
|
169
|
+
if self._worker_state is None:
|
170
|
+
raise RuntimeError(
|
171
|
+
"WorkerState not provided to OfflineWorkerRecovery. "
|
172
|
+
"Please pass worker_state parameter during initialization."
|
173
|
+
)
|
174
|
+
|
175
|
+
return await self._worker_state.find_offline_workers_for_queue(
|
176
|
+
queue=queue,
|
177
|
+
worker_prefix=self.worker_prefix,
|
178
|
+
worker_state_manager=self.worker_state_manager
|
179
|
+
)
|
163
180
|
|
164
181
|
async def _recover_worker_messages(self,
|
165
182
|
queue: str,
|
166
183
|
worker_key: str,
|
167
184
|
worker_data: Dict,
|
185
|
+
group_infos: List[Dict],
|
168
186
|
current_consumer_name: str,
|
169
187
|
event_queue: Optional[asyncio.Queue] = None,
|
170
188
|
process_message_callback: Optional[callable] = None,
|
171
|
-
consumer_group_suffix: Optional[str] = None
|
189
|
+
consumer_group_suffix: Optional[str] = None,
|
190
|
+
event_queue_callback: Optional[callable] = None) -> int:
|
172
191
|
"""
|
173
192
|
恢复单个worker的pending消息
|
174
|
-
|
175
|
-
|
193
|
+
|
194
|
+
Args:
|
195
|
+
queue: 当前要恢复的队列(可能是基础队列或优先级队列)
|
196
|
+
worker_key: Worker的Redis键
|
197
|
+
worker_data: Worker的数据
|
198
|
+
group_infos: 预提取的group_info列表(已按基础队列过滤)
|
199
|
+
current_consumer_name: 当前consumer名称
|
200
|
+
event_queue: 事件队列
|
201
|
+
process_message_callback: 处理消息的回调
|
202
|
+
consumer_group_suffix: Consumer组后缀
|
203
|
+
|
204
|
+
Returns:
|
205
|
+
恢复的消息数量
|
176
206
|
"""
|
177
207
|
total_claimed = 0
|
178
|
-
|
208
|
+
|
179
209
|
try:
|
180
210
|
# worker_data 现在已经是解码后的字典
|
181
211
|
consumer_id = worker_data.get('consumer_id')
|
182
|
-
|
183
|
-
# 从worker_data中提取group_info字段
|
184
|
-
group_infos = []
|
185
|
-
import json
|
186
|
-
for key, value in worker_data.items():
|
187
|
-
if key.startswith('group_info:'):
|
188
|
-
try:
|
189
|
-
group_info = json.loads(value)
|
190
|
-
# 只处理属于当前队列的group
|
191
|
-
if group_info.get('queue') == queue:
|
192
|
-
group_infos.append(group_info)
|
193
|
-
logger.info(f"Found group info for queue {queue}: {group_info}")
|
194
|
-
except Exception as e:
|
195
|
-
logger.error(f"Error parsing group_info: {e}")
|
196
|
-
|
212
|
+
|
197
213
|
if not group_infos:
|
198
|
-
logger.
|
214
|
+
logger.info(f"No group_info provided for queue {queue} in worker {worker_key}")
|
199
215
|
# 即使没有group_info,也要标记为已处理,避免重复扫描
|
200
|
-
|
216
|
+
# 通过 WorkerStateManager 标记消息已转移
|
217
|
+
if self.worker_state_manager:
|
218
|
+
worker_id = worker_key.split(':')[-1]
|
219
|
+
await self.worker_state_manager.mark_messages_transferred(worker_id, transferred=True)
|
220
|
+
else:
|
221
|
+
await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
|
201
222
|
return 0
|
202
|
-
|
223
|
+
|
203
224
|
# 在处理任何group之前,先标记该worker的消息已开始转移
|
204
225
|
# 避免其他进程重复处理
|
205
|
-
|
226
|
+
if self.worker_state_manager:
|
227
|
+
worker_id = worker_key.split(':')[-1]
|
228
|
+
await self.worker_state_manager.mark_messages_transferred(worker_id, transferred=True)
|
229
|
+
else:
|
230
|
+
await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
|
206
231
|
logger.info(f"Marked worker {worker_key} as messages_transferred=true")
|
207
232
|
|
208
233
|
# 处理每个group_info
|
209
234
|
for group_info in group_infos:
|
210
|
-
|
235
|
+
base_stream_key = group_info.get('stream_key')
|
211
236
|
group_name = group_info.get('group_name')
|
212
|
-
|
237
|
+
base_offline_consumer_name = group_info.get('consumer_name')
|
213
238
|
task_name = group_info.get('task_name')
|
214
|
-
|
215
|
-
|
239
|
+
base_queue = group_info.get('queue')
|
240
|
+
|
241
|
+
if not all([base_stream_key, group_name, base_offline_consumer_name]):
|
216
242
|
logger.warning(f"Incomplete group_info: {group_info}")
|
217
243
|
continue
|
218
|
-
|
244
|
+
|
245
|
+
# 根据当前处理的队列构建正确的 stream_key 和 offline_consumer_name
|
246
|
+
# group_info 中存储的是基础队列的信息(如 robust_bench2)
|
247
|
+
# 如果当前处理的是优先级队列(如 robust_bench2:6),需要添加优先级后缀
|
248
|
+
stream_key = f"{self.redis_prefix}:QUEUE:{queue}"
|
249
|
+
|
250
|
+
# 构建离线 consumer 的名称
|
251
|
+
# 如果当前处理的是优先级队列,需要添加优先级后缀
|
252
|
+
offline_consumer_name = base_offline_consumer_name
|
253
|
+
# if base_queue and queue != base_queue:
|
254
|
+
# # 提取优先级后缀(如从 robust_bench2:6 提取 6)
|
255
|
+
# priority_suffix = queue.rsplit(':', 1)[-1]
|
256
|
+
# offline_consumer_name = f"{base_offline_consumer_name}:{priority_suffix}"
|
257
|
+
|
219
258
|
logger.info(f"Recovering task {task_name}: stream={stream_key}, group={group_name}, consumer={offline_consumer_name}")
|
220
|
-
|
221
|
-
# 跳过自己的consumer
|
222
|
-
|
223
|
-
|
259
|
+
|
260
|
+
# 跳过自己的consumer,但只有在worker仍然活跃的情况下
|
261
|
+
# 如果worker已经offline(is_alive=false或messages_transferred=true),即使consumer名称相同也应该恢复
|
262
|
+
# 这处理了worker_id被复用的情况
|
263
|
+
is_alive = worker_data.get('is_alive', 'false')
|
264
|
+
if isinstance(is_alive, bytes):
|
265
|
+
is_alive = is_alive.decode('utf-8')
|
266
|
+
is_alive = is_alive.lower() == 'true'
|
267
|
+
|
268
|
+
messages_transferred = worker_data.get('messages_transferred', 'false')
|
269
|
+
if isinstance(messages_transferred, bytes):
|
270
|
+
messages_transferred = messages_transferred.decode('utf-8')
|
271
|
+
messages_transferred = messages_transferred.lower() == 'true'
|
272
|
+
|
273
|
+
# 只有在worker活跃且消息未转移时,才跳过同名consumer
|
274
|
+
if current_consumer_name == offline_consumer_name and is_alive and not messages_transferred:
|
275
|
+
logger.info(f"Skipping own active consumer: {offline_consumer_name}")
|
224
276
|
continue
|
277
|
+
elif current_consumer_name == offline_consumer_name:
|
278
|
+
logger.info(f"Recovering same-name consumer from offline worker: {offline_consumer_name} (is_alive={is_alive}, messages_transferred={messages_transferred})")
|
225
279
|
|
226
280
|
# 使用分布式锁
|
227
281
|
lock_key = f"{self.redis_prefix}:CLAIM:LOCK:{offline_consumer_name}:{group_name}"
|
@@ -233,7 +287,7 @@ class OfflineWorkerRecovery:
|
|
233
287
|
)
|
234
288
|
|
235
289
|
if not await lock.acquire():
|
236
|
-
logger.
|
290
|
+
logger.info(f"Lock busy for {offline_consumer_name}:{group_name}")
|
237
291
|
continue
|
238
292
|
|
239
293
|
try:
|
@@ -241,18 +295,30 @@ class OfflineWorkerRecovery:
|
|
241
295
|
pending_info = await self.async_redis_client.xpending(
|
242
296
|
stream_key, group_name
|
243
297
|
)
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
298
|
+
logger.info(f"Pending info for {stream_key=} {group_name=} {task_name}: {pending_info=}")
|
299
|
+
|
300
|
+
total_pending = pending_info.get('pending', 0) if pending_info else 0
|
301
|
+
if total_pending > 0:
|
302
|
+
# 批量处理所有 pending 消息(避免遗漏)
|
303
|
+
batch_size = 100
|
304
|
+
total_claimed_count = 0
|
305
|
+
|
306
|
+
# 循环直到处理完所有消息
|
307
|
+
while True:
|
308
|
+
# 获取具体的pending消息信息(每次最多 batch_size 条)
|
309
|
+
detailed_pending = await self.async_redis_client.xpending_range(
|
310
|
+
stream_key, group_name,
|
311
|
+
min='-', max='+', count=batch_size,
|
312
|
+
consumername=offline_consumer_name
|
313
|
+
)
|
314
|
+
logger.info(f'{detailed_pending=} {stream_key=} {group_name=} {offline_consumer_name=}')
|
315
|
+
|
316
|
+
if not detailed_pending:
|
317
|
+
# 没有更多消息了
|
318
|
+
break
|
319
|
+
|
320
|
+
logger.info(f"Found {len(detailed_pending)} pending messages for {task_name} (batch {total_claimed_count // batch_size + 1})")
|
321
|
+
|
256
322
|
# 批量认领消息
|
257
323
|
message_ids = [msg['message_id'] for msg in detailed_pending]
|
258
324
|
claimed_messages = await self.async_redis_client.xclaim(
|
@@ -261,27 +327,35 @@ class OfflineWorkerRecovery:
|
|
261
327
|
min_idle_time=0,
|
262
328
|
message_ids=message_ids
|
263
329
|
)
|
264
|
-
|
330
|
+
|
265
331
|
if claimed_messages:
|
266
|
-
logger.info(f"Claimed {len(claimed_messages)} messages for task {task_name}")
|
267
|
-
|
268
|
-
|
269
|
-
#
|
270
|
-
|
332
|
+
logger.info(f"Claimed {len(claimed_messages)} messages for task {task_name} in this batch")
|
333
|
+
total_claimed_count += len(claimed_messages)
|
334
|
+
|
335
|
+
# 获取该任务的 event_queue
|
336
|
+
# 优先使用 event_queue_callback,其次使用直接传入的 event_queue
|
337
|
+
task_event_queue = None
|
338
|
+
if event_queue_callback and task_name:
|
339
|
+
task_event_queue = event_queue_callback(task_name)
|
340
|
+
elif event_queue:
|
341
|
+
task_event_queue = event_queue
|
342
|
+
|
343
|
+
# 如果有 event_queue,将消息放入队列
|
344
|
+
if task_event_queue:
|
345
|
+
logger.info(f'即将转移 {len(claimed_messages)=} 消息到 {task_name}')
|
271
346
|
for msg_id, msg_data in claimed_messages:
|
272
347
|
if isinstance(msg_id, bytes):
|
273
348
|
msg_id = msg_id.decode('utf-8')
|
274
|
-
|
349
|
+
|
275
350
|
# 解析消息数据
|
276
351
|
data_field = msg_data.get(b'data') or msg_data.get('data')
|
277
352
|
if data_field:
|
278
353
|
try:
|
279
|
-
import msgpack
|
280
354
|
parsed_data = msgpack.unpackb(data_field, raw=False)
|
281
355
|
# 添加必要的元数据
|
282
356
|
parsed_data['_task_name'] = task_name
|
283
357
|
parsed_data['queue'] = queue
|
284
|
-
|
358
|
+
|
285
359
|
# 构建任务项
|
286
360
|
task_item = {
|
287
361
|
'queue': queue,
|
@@ -290,14 +364,33 @@ class OfflineWorkerRecovery:
|
|
290
364
|
'consumer': current_consumer_name,
|
291
365
|
'group_name': group_name
|
292
366
|
}
|
293
|
-
|
294
|
-
await
|
367
|
+
|
368
|
+
await task_event_queue.put(task_item)
|
369
|
+
logger.debug(f"Put recovered message {msg_id} into event_queue for task {task_name}")
|
295
370
|
except Exception as e:
|
296
371
|
logger.error(f"Error processing claimed message: {e}")
|
372
|
+
else:
|
373
|
+
logger.warning(f"No event_queue available for task {task_name}, claimed messages will not be executed")
|
374
|
+
|
375
|
+
# 更新总计数
|
376
|
+
total_claimed += len(claimed_messages)
|
377
|
+
|
378
|
+
# 如果这批处理的消息数少于 batch_size,说明已经处理完了
|
379
|
+
if len(detailed_pending) < batch_size:
|
380
|
+
break
|
381
|
+
else:
|
382
|
+
# 没有成功 claim 到消息,退出循环
|
383
|
+
break
|
384
|
+
|
385
|
+
# 记录总恢复数量
|
386
|
+
if total_claimed_count > 0:
|
387
|
+
logger.info(f"Total claimed {total_claimed_count} messages for {task_name} from {offline_consumer_name}")
|
297
388
|
finally:
|
298
389
|
await lock.release()
|
299
390
|
|
300
391
|
except Exception as e:
|
392
|
+
import traceback
|
393
|
+
traceback.print_exc()
|
301
394
|
logger.error(f"Error recovering messages: {e}")
|
302
395
|
|
303
396
|
return total_claimed
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: jettask
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.20
|
4
4
|
Summary: A high-performance distributed task queue system with web monitoring
|
5
5
|
Author-email: JetTask Team <support@jettask.io>
|
6
6
|
License-Expression: MIT
|
@@ -20,8 +20,7 @@ Classifier: Operating System :: OS Independent
|
|
20
20
|
Requires-Python: >=3.8
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
License-File: LICENSE
|
23
|
-
Requires-Dist: redis
|
24
|
-
Requires-Dist: aioredis>=2.0.0
|
23
|
+
Requires-Dist: redis
|
25
24
|
Requires-Dist: msgpack>=1.0.0
|
26
25
|
Requires-Dist: watchdog>=3.0.0
|
27
26
|
Requires-Dist: uvloop>=0.17.0
|
@@ -49,71 +48,3 @@ Requires-Dist: flake8>=6.0; extra == "dev"
|
|
49
48
|
Requires-Dist: mypy>=1.4.0; extra == "dev"
|
50
49
|
Requires-Dist: coverage>=7.0; extra == "dev"
|
51
50
|
Dynamic: license-file
|
52
|
-
|
53
|
-
# JetTask
|
54
|
-
|
55
|
-
一个高性能的分布式任务队列系统,支持Web监控界面。
|
56
|
-
|
57
|
-
## 特性
|
58
|
-
|
59
|
-
- 🚀 高性能异步任务执行
|
60
|
-
- 📊 实时Web监控界面
|
61
|
-
- ⏰ 支持定时任务和延迟任务
|
62
|
-
- 🔄 任务重试和错误处理
|
63
|
-
- 🎯 多队列和优先级支持
|
64
|
-
- 🌍 多命名空间隔离
|
65
|
-
- 📈 任务统计和性能监控
|
66
|
-
- 🔧 简单易用的API
|
67
|
-
|
68
|
-
## 安装
|
69
|
-
|
70
|
-
```bash
|
71
|
-
pip install jettask
|
72
|
-
```
|
73
|
-
|
74
|
-
## 快速开始
|
75
|
-
|
76
|
-
### 1. 创建任务
|
77
|
-
|
78
|
-
```python
|
79
|
-
from jettask import JetTask
|
80
|
-
|
81
|
-
app = JetTask()
|
82
|
-
|
83
|
-
@app.task(queue="default")
|
84
|
-
async def hello_task(name):
|
85
|
-
return f"Hello, {name}!"
|
86
|
-
```
|
87
|
-
|
88
|
-
### 2. 启动Worker
|
89
|
-
|
90
|
-
```bash
|
91
|
-
jettask worker -a app:app --queues default
|
92
|
-
```
|
93
|
-
|
94
|
-
### 3. 发送任务
|
95
|
-
|
96
|
-
```python
|
97
|
-
result = await hello_task.send("World")
|
98
|
-
print(result) # Hello, World!
|
99
|
-
```
|
100
|
-
|
101
|
-
### 4. 启动Web监控界面
|
102
|
-
|
103
|
-
```bash
|
104
|
-
# 启动API服务
|
105
|
-
jettask api
|
106
|
-
|
107
|
-
# 启动前端界面
|
108
|
-
jettask frontend
|
109
|
-
```
|
110
|
-
|
111
|
-
然后访问 http://localhost:3000 查看监控界面。
|
112
|
-
|
113
|
-
## 文档
|
114
|
-
|
115
|
-
详细文档请参见 [docs/](docs/) 目录。
|
116
|
-
|
117
|
-
## 许可证
|
118
|
-
|
119
|
-
MIT License
|