jettask 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +10 -3
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.19.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/{core/offline_worker_recovery.py → worker/recovery.py}
@@ -2,19 +2,35 @@
 Simplified offline-worker message recovery module
 """
 import asyncio
+import json
 import logging
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
 from redis.asyncio.lock import Lock as AsyncLock
 
 import msgpack
 
+if TYPE_CHECKING:
+    from jettask.worker.manager import WorkerState
+
 logger = logging.getLogger(__name__)
 
 
 class OfflineWorkerRecovery:
     """Offline-worker message recovery handler"""
-
-    def __init__(self, async_redis_client, consumer_manager=None, redis_prefix='jettask', worker_prefix='WORKER', queue_formatter=None):
+
+    def __init__(self, async_redis_client, consumer_manager=None, redis_prefix='jettask', worker_prefix='WORKER', queue_formatter=None, queue_registry=None, worker_state: Optional['WorkerState'] = None):
+        """
+        Initialize the offline-worker message recovery handler
+
+        Args:
+            async_redis_client: async Redis client
+            consumer_manager: consumer manager
+            redis_prefix: Redis key prefix
+            worker_prefix: worker key prefix
+            queue_formatter: queue name formatting function
+            queue_registry: queue registry
+            worker_state: WorkerState instance (used to query worker state)
+        """
         self.async_redis_client = async_redis_client
         self.consumer_manager = consumer_manager
         self.redis_prefix = redis_prefix
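
The new `TYPE_CHECKING` guard above lets `worker_state` be annotated with `WorkerState` without importing `jettask.worker.manager` at runtime (typically done to break an import cycle; the diff itself does not state the reason). A minimal standalone sketch of the pattern, not the package's actual class:

```python
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright), never at runtime,
    # so the runtime import graph stays free of this dependency.
    from jettask.worker.manager import WorkerState


class Recovery:
    def __init__(self, worker_state: Optional["WorkerState"] = None):
        # The quoted annotation is resolved lazily, so no runtime import is needed.
        self._worker_state = worker_state
```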
@@ -22,206 +38,244 @@ class OfflineWorkerRecovery:
         self._stop_recovery = False
         # Queue formatter; defaults to the prefix:QUEUE:queue_name format
         self.queue_formatter = queue_formatter or (lambda q: f"{self.redis_prefix}:QUEUE:{q}")
+        # Access worker_state_manager through consumer_manager
+        self.worker_state_manager = consumer_manager.app.worker_state_manager if (consumer_manager and hasattr(consumer_manager, 'app') and consumer_manager.app) else None
+        # Queue registry, used to look up priority queues
+        self.queue_registry = queue_registry
+        # Worker state reader (must be provided)
+        self._worker_state: Optional['WorkerState'] = worker_state
 
     async def recover_offline_workers(self,
                                       queue: str,
                                       current_consumer_name: str = None,
                                       event_queue: Optional[asyncio.Queue] = None,
                                       process_message_callback: Optional[callable] = None,
-                                      consumer_group_suffix: Optional[str] = None) -> int:
+                                      consumer_group_suffix: Optional[str] = None,
+                                      event_queue_callback: Optional[callable] = None) -> int:
         """
         Recover pending messages of offline workers for the given queue
+
+        Priority queue support:
+        - If queue_registry is available, all priority queues of the base queue are discovered and recovered automatically
+        - Otherwise, only the specified queue is recovered
         """
         total_recovered = 0
-        logger.debug(f'Recovering pending messages of offline workers for the given queue')
+        logger.debug(f'Recovering pending messages of offline workers for queue: {queue}')
+
         try:
-            # Get the current consumer name
-            if not current_consumer_name and self.consumer_manager:
-                # For priority queues, use the base queue name to get the consumer
-                base_queue = queue
-                if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
-                    base_queue = queue.rsplit(':', 1)[0]
-
-                current_consumer_name = self.consumer_manager.get_consumer_name(base_queue)
-
-                # For priority queues, the consumer name needs the queue suffix appended
-                if current_consumer_name and base_queue != queue:
-                    priority_suffix = queue.rsplit(':', 1)[-1]
-                    current_consumer_name = f"{current_consumer_name}:{priority_suffix}"
-
-            if not current_consumer_name:
-                logger.error(f"Cannot get current consumer name for queue {queue}")
-                return 0
-
-            logger.debug(f"Starting recovery for queue {queue} with consumer {current_consumer_name}")
-
-            # Get all offline workers
-            offline_workers = await self._find_offline_workers(queue)
+            # Determine the base queue name (strip the priority suffix)
+            base_queue = queue.split(':')[0]
+
+            # Collect all queues to recover (including priority queues)
+            queues_to_recover = [queue]
+
+            # If queue_registry is available and this is a base queue, fetch all of its priority queues
+            if self.queue_registry and base_queue == queue:
+                try:
+                    priority_queues = await self.queue_registry.get_priority_queues_for_base(queue)
+                    if priority_queues:
+                        logger.debug(f"Found {len(priority_queues)} priority queues for base queue {queue}: {priority_queues}")
+                        queues_to_recover.extend(priority_queues)
+                    else:
+                        logger.debug(f"No priority queues found for base queue {queue}")
+                except Exception as e:
+                    logger.warning(f"Error getting priority queues for {queue}: {e}, will only recover base queue")
+
+            logger.debug(f"Will recover {len(queues_to_recover)} queue(s): {queues_to_recover}")
+
+            # Look up offline workers only once (using the base queue name, since a worker's queues field only stores base queues)
+            offline_workers = await self._find_offline_workers(base_queue)
             if not offline_workers:
-                logger.debug(f"No offline workers found for queue {queue}")
+                logger.debug(f"No offline workers found for base queue {base_queue}")
                 return 0
-
-            logger.debug(f"Found {len(offline_workers)} offline workers for queue {queue}")
-
-            # Process each offline worker
+
+            logger.info(f"Found {len(offline_workers)} offline workers for base queue {base_queue}, starting recovery...")
+
+            # For each offline worker, extract group_infos up front (using the base queue)
+            workers_with_groups = []
             for worker_key, worker_data in offline_workers:
+                group_infos = []
+                for key, value in worker_data.items():
+                    if key.startswith('group_info:'):
+                        try:
+                            group_info = json.loads(value)
+                            # Compare against the base queue name
+                            if group_info.get('queue') == base_queue:
+                                group_infos.append(group_info)
+                                logger.debug(f"Found group info for base queue {base_queue}: {group_info}")
+                        except Exception as e:
+                            logger.error(f"Error parsing group_info: {e}")
+
+                workers_with_groups.append((worker_key, worker_data, group_infos))
+                logger.info(f"Worker {worker_key} has {len(group_infos)} groups for base queue {base_queue}")
+
+            # Recover messages for each queue (base queue + priority queues)
+            for queue_to_recover in queues_to_recover:
                 if self._stop_recovery:
                     logger.debug("Stopping recovery due to shutdown signal")
                     break
-
-                # logger.debug(f'Recovering pending messages of offline workers {offline_workers=}')
-                # logger.info(f"Processing offline worker: {worker_key} {worker_data=} {queue=}")
-                recovered = await self._recover_worker_messages(
-                    queue=queue,
-                    worker_key=worker_key,
-                    worker_data=worker_data,
-                    current_consumer_name=current_consumer_name,
-                    event_queue=event_queue,
-                    process_message_callback=process_message_callback,
-                    consumer_group_suffix=consumer_group_suffix
-                )
-
-                total_recovered += recovered
-
+
+                logger.info(f"Recovering queue: {queue_to_recover}")
+
+                # Get the current consumer name
+                consumer_name = current_consumer_name
+                if not consumer_name and self.consumer_manager:
+                    consumer_name = self.consumer_manager.get_consumer_name(base_queue)
+
+                # Unified group_name architecture: all queues (including priority queues) use the same consumer name
+                # No priority suffix needs to be appended any more
+
+                if not consumer_name:
+                    logger.error(f"Cannot get current consumer name for queue {queue_to_recover}")
+                    continue
+
+                logger.info(f"Starting recovery for queue {queue_to_recover} with consumer {consumer_name}")
+
+                # Process this queue's messages for each offline worker (passing in the pre-extracted group_infos)
+                for worker_key, worker_data, group_infos in workers_with_groups:
+                    if self._stop_recovery:
+                        logger.debug("Stopping recovery due to shutdown signal")
+                        break
+
+                    logger.info(f"Recovering messages from worker {worker_key} for queue {queue_to_recover}")
+                    recovered = await self._recover_worker_messages(
+                        queue=queue_to_recover,
+                        worker_key=worker_key,
+                        worker_data=worker_data,
+                        group_infos=group_infos,  # pass in the pre-extracted group_infos
+                        current_consumer_name=consumer_name,
+                        event_queue=event_queue,
+                        process_message_callback=process_message_callback,
+                        consumer_group_suffix=consumer_group_suffix,
+                        event_queue_callback=event_queue_callback  # pass in the callback
+                    )
+
+                    total_recovered += recovered
+
         except Exception as e:
             logger.error(f"Error recovering offline workers for queue {queue}: {e}")
-
+            import traceback
+            traceback.print_exc()
+
         return total_recovered
 
     async def _find_offline_workers(self, queue: str) -> List[Tuple[str, Dict]]:
-        """Find offline workers for the given queue"""
-        offline_workers = []
-
-        try:
-            # Scan all workers
-            pattern = f"{self.redis_prefix}:{self.worker_prefix}:*"
-            cursor = 0
-            while True:
-                cursor, keys = await self.async_redis_client.scan(
-                    cursor=cursor,
-                    match=pattern,
-                    count=100
-                )
-
-                for worker_key in keys:
-                    if isinstance(worker_key, bytes):
-                        worker_key = worker_key.decode('utf-8')
-
-                    # Skip non-worker keys
-                    if any(x in worker_key for x in [':HISTORY:', ':REUSE:LOCK', ':REUSING']):
-                        continue
-
-                    try:
-                        worker_data = await self.async_redis_client.hgetall(worker_key)
-                        if not worker_data:
-                            continue
-
-                        # Decode binary data
-                        decoded_worker_data = {}
-                        for k, v in worker_data.items():
-                            key = k.decode('utf-8') if isinstance(k, bytes) else k
-                            value = v.decode('utf-8') if isinstance(v, bytes) else v
-                            decoded_worker_data[key] = value
-
-                        # logger.debug(f'{worker_key=} {decoded_worker_data=}')
-                        # logger.debug(f'{decoded_worker_data=}')
-                        # Check whether the worker is offline and its messages have not been transferred
-                        is_alive = decoded_worker_data.get('is_alive', 'false').lower() == 'true'
-                        messages_transferred = decoded_worker_data.get('messages_transferred', 'false').lower() == 'true'
-                        # logger.debug(f'{worker_key=} {is_alive=} {messages_transferred=} {not is_alive and not messages_transferred}')
-                        # Found a worker that is offline and whose messages have not been transferred
-                        if not is_alive and not messages_transferred:
-                            queues_str = decoded_worker_data.get('queues', '')
-                            worker_queues = queues_str.split(',') if queues_str else []
-
-                            # logger.debug(f'{worker_queues=} {queue=}')
-                            # Check whether this worker serves the current queue
-                            # Priority queue support: if queue is in "base:priority" form, check whether the worker serves the base queue
-                            queue_matched = False
-                            if ':' in queue and queue.rsplit(':', 1)[-1].isdigit():
-                                # This is a priority queue; extract the base queue name
-                                base_queue = queue.rsplit(':', 1)[0]
-                                queue_matched = base_queue in worker_queues
-                            else:
-                                # Plain queue
-                                queue_matched = queue in worker_queues
-                            if queue_matched:
-                                offline_workers.append((worker_key, decoded_worker_data))
-
-                    except Exception as e:
-                        logger.error(f"Error processing worker key {worker_key}: {e}")
-                        continue
-
-                # A cursor of 0 means the scan is complete
-                if cursor == 0:
-                    break
-
-        except Exception as e:
-            logger.error(f"Error finding offline workers: {e}")
-
-        return offline_workers
+        """Find offline workers for the given queue
+
+        Delegates to WorkerState.find_offline_workers_for_queue()
+
+        Note: worker_state must be provided when OfflineWorkerRecovery is initialized
+        """
+        if self._worker_state is None:
+            raise RuntimeError(
+                "WorkerState not provided to OfflineWorkerRecovery. "
+                "Please pass worker_state parameter during initialization."
+            )
+
+        return await self._worker_state.find_offline_workers_for_queue(
+            queue=queue,
+            worker_prefix=self.worker_prefix,
+            worker_state_manager=self.worker_state_manager
+        )
 
     async def _recover_worker_messages(self,
                                        queue: str,
                                        worker_key: str,
                                        worker_data: Dict,
+                                       group_infos: List[Dict],
                                        current_consumer_name: str,
                                        event_queue: Optional[asyncio.Queue] = None,
                                        process_message_callback: Optional[callable] = None,
-                                       consumer_group_suffix: Optional[str] = None) -> int:
+                                       consumer_group_suffix: Optional[str] = None,
+                                       event_queue_callback: Optional[callable] = None) -> int:
         """
         Recover the pending messages of a single worker
-
-        Reads all group_info fields from worker_data and recovers the corresponding pending messages
+
+        Args:
+            queue: the queue currently being recovered (may be a base queue or a priority queue)
+            worker_key: the worker's Redis key
+            worker_data: the worker's data
+            group_infos: pre-extracted list of group_info entries (already filtered by base queue)
+            current_consumer_name: the current consumer name
+            event_queue: the event queue
+            process_message_callback: callback for processing messages
+            consumer_group_suffix: consumer group suffix
+
+        Returns:
+            Number of recovered messages
         """
         total_claimed = 0
-
+
         try:
             # worker_data is already a decoded dict at this point
            consumer_id = worker_data.get('consumer_id')
-
-            # Extract group_info fields from worker_data
-            group_infos = []
-            import json
-            for key, value in worker_data.items():
-                if key.startswith('group_info:'):
-                    try:
-                        group_info = json.loads(value)
-                        # Only handle groups belonging to the current queue
-                        if group_info.get('queue') == queue:
-                            group_infos.append(group_info)
-                            logger.info(f"Found group info for queue {queue}: {group_info}")
-                    except Exception as e:
-                        logger.error(f"Error parsing group_info: {e}")
-
+
             if not group_infos:
-                logger.debug(f"No group_info found for queue {queue} in worker {worker_key}")
+                logger.info(f"No group_info provided for queue {queue} in worker {worker_key}")
                 # Even without group_info, mark the worker as processed to avoid repeated scanning
-                await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
+                # Mark the messages as transferred through the WorkerStateManager
+                if self.worker_state_manager:
+                    worker_id = worker_key.split(':')[-1]
+                    await self.worker_state_manager.mark_messages_transferred(worker_id, transferred=True)
+                else:
+                    await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
                 return 0
-
+
             # Before processing any group, mark this worker's messages as being transferred
             # so other processes do not handle them again
-            await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
+            if self.worker_state_manager:
+                worker_id = worker_key.split(':')[-1]
+                await self.worker_state_manager.mark_messages_transferred(worker_id, transferred=True)
+            else:
+                await self.async_redis_client.hset(worker_key, 'messages_transferred', 'true')
             logger.info(f"Marked worker {worker_key} as messages_transferred=true")
 
             # Process each group_info
             for group_info in group_infos:
-                stream_key = group_info.get('stream_key')
+                base_stream_key = group_info.get('stream_key')
                 group_name = group_info.get('group_name')
-                offline_consumer_name = group_info.get('consumer_name')
+                base_offline_consumer_name = group_info.get('consumer_name')
                 task_name = group_info.get('task_name')
-
-                if not all([stream_key, group_name, offline_consumer_name]):
+                base_queue = group_info.get('queue')
+
+                if not all([base_stream_key, group_name, base_offline_consumer_name]):
                     logger.warning(f"Incomplete group_info: {group_info}")
                     continue
-
+
+                # Build the correct stream_key and offline_consumer_name for the queue currently being processed
+                # group_info stores the base queue's information (e.g. robust_bench2)
+                # If the current queue is a priority queue (e.g. robust_bench2:6), the priority suffix must be appended
+                stream_key = f"{self.redis_prefix}:QUEUE:{queue}"
+
+                # Build the offline consumer's name
+                # If the current queue is a priority queue, the priority suffix must be appended
+                offline_consumer_name = base_offline_consumer_name
+                # if base_queue and queue != base_queue:
+                #     # Extract the priority suffix (e.g. 6 from robust_bench2:6)
+                #     priority_suffix = queue.rsplit(':', 1)[-1]
+                #     offline_consumer_name = f"{base_offline_consumer_name}:{priority_suffix}"
+
                 logger.info(f"Recovering task {task_name}: stream={stream_key}, group={group_name}, consumer={offline_consumer_name}")
-
-                # Skip our own consumer
-                if current_consumer_name == offline_consumer_name:
-                    logger.debug(f"Skipping own consumer: {offline_consumer_name}")
+
+                # Skip our own consumer, but only while the worker is still alive
+                # If the worker is already offline (is_alive=false or messages_transferred=true), recover even if the consumer name matches
+                # This handles the case where a worker_id has been reused
+                is_alive = worker_data.get('is_alive', 'false')
+                if isinstance(is_alive, bytes):
+                    is_alive = is_alive.decode('utf-8')
+                is_alive = is_alive.lower() == 'true'
+
+                messages_transferred = worker_data.get('messages_transferred', 'false')
+                if isinstance(messages_transferred, bytes):
+                    messages_transferred = messages_transferred.decode('utf-8')
+                messages_transferred = messages_transferred.lower() == 'true'
+
+                # Only skip a same-named consumer when the worker is alive and its messages have not been transferred
+                if current_consumer_name == offline_consumer_name and is_alive and not messages_transferred:
+                    logger.info(f"Skipping own active consumer: {offline_consumer_name}")
                    continue
+                elif current_consumer_name == offline_consumer_name:
+                    logger.info(f"Recovering same-name consumer from offline worker: {offline_consumer_name} (is_alive={is_alive}, messages_transferred={messages_transferred})")
 
                 # Use a distributed lock
                 lock_key = f"{self.redis_prefix}:CLAIM:LOCK:{offline_consumer_name}:{group_name}"
@@ -233,7 +287,7 @@
                 )
 
                 if not await lock.acquire():
-                    logger.debug(f"Lock busy for {offline_consumer_name}:{group_name}")
+                    logger.info(f"Lock busy for {offline_consumer_name}:{group_name}")
                     continue
 
                 try:
@@ -241,18 +295,30 @@
                     pending_info = await self.async_redis_client.xpending(
                         stream_key, group_name
                     )
-
-                    if pending_info and pending_info.get('pending', 0) > 0:
-                        # Fetch the detailed pending message info
-                        detailed_pending = await self.async_redis_client.xpending_range(
-                            stream_key, group_name,
-                            min='-', max='+', count=100,
-                            consumername=offline_consumer_name
-                        )
-
-                        if detailed_pending:
-                            logger.info(f"Found {len(detailed_pending)} pending messages for {task_name}")
-
+                    logger.info(f"Pending info for {stream_key=} {group_name=} {task_name}: {pending_info=}")
+
+                    total_pending = pending_info.get('pending', 0) if pending_info else 0
+                    if total_pending > 0:
+                        # Process all pending messages in batches (so none are missed)
+                        batch_size = 100
+                        total_claimed_count = 0
+
+                        # Loop until every message has been handled
+                        while True:
+                            # Fetch the detailed pending message info (at most batch_size entries per call)
+                            detailed_pending = await self.async_redis_client.xpending_range(
+                                stream_key, group_name,
+                                min='-', max='+', count=batch_size,
+                                consumername=offline_consumer_name
+                            )
+                            logger.info(f'{detailed_pending=} {stream_key=} {group_name=} {offline_consumer_name=}')
+
+                            if not detailed_pending:
+                                # No more messages
+                                break
+
+                            logger.info(f"Found {len(detailed_pending)} pending messages for {task_name} (batch {total_claimed_count // batch_size + 1})")
+
                             # Claim messages in bulk
                             message_ids = [msg['message_id'] for msg in detailed_pending]
                             claimed_messages = await self.async_redis_client.xclaim(
@@ -261,27 +327,35 @@
                                 min_idle_time=0,
                                 message_ids=message_ids
                             )
-
+
                             if claimed_messages:
-                                logger.info(f"Claimed {len(claimed_messages)} messages for task {task_name}")
-                                total_claimed += len(claimed_messages)
-
-                                # If an event_queue was provided, put the messages on it
-                                if event_queue:
+                                logger.info(f"Claimed {len(claimed_messages)} messages for task {task_name} in this batch")
+                                total_claimed_count += len(claimed_messages)
+
+                                # Get the event_queue for this task
+                                # Prefer event_queue_callback, then fall back to the event_queue passed in directly
+                                task_event_queue = None
+                                if event_queue_callback and task_name:
+                                    task_event_queue = event_queue_callback(task_name)
+                                elif event_queue:
+                                    task_event_queue = event_queue
+
+                                # If there is an event_queue, put the messages on it
+                                if task_event_queue:
+                                    logger.info(f'About to transfer {len(claimed_messages)=} messages to {task_name}')
                                     for msg_id, msg_data in claimed_messages:
                                         if isinstance(msg_id, bytes):
                                             msg_id = msg_id.decode('utf-8')
-
+
                                         # Parse the message data
                                         data_field = msg_data.get(b'data') or msg_data.get('data')
                                         if data_field:
                                             try:
-                                                import msgpack
                                                 parsed_data = msgpack.unpackb(data_field, raw=False)
                                                 # Add the required metadata
                                                 parsed_data['_task_name'] = task_name
                                                 parsed_data['queue'] = queue
-
+
                                                 # Build the task item
                                                 task_item = {
                                                     'queue': queue,
@@ -290,14 +364,33 @@
                                                     'consumer': current_consumer_name,
                                                     'group_name': group_name
                                                 }
-
-                                                await event_queue.put(task_item)
+
+                                                await task_event_queue.put(task_item)
+                                                logger.debug(f"Put recovered message {msg_id} into event_queue for task {task_name}")
                                             except Exception as e:
                                                 logger.error(f"Error processing claimed message: {e}")
+                                else:
+                                    logger.warning(f"No event_queue available for task {task_name}, claimed messages will not be executed")
+
+                                # Update the running total
+                                total_claimed += len(claimed_messages)
+
+                                # If this batch had fewer messages than batch_size, everything has been processed
+                                if len(detailed_pending) < batch_size:
+                                    break
+                            else:
+                                # No messages were successfully claimed; exit the loop
+                                break
+
+                        # Log the total number of recovered messages
+                        if total_claimed_count > 0:
+                            logger.info(f"Total claimed {total_claimed_count} messages for {task_name} from {offline_consumer_name}")
                 finally:
                     await lock.release()
 
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             logger.error(f"Error recovering messages: {e}")
 
         return total_claimed
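
The most substantial behavioral change in the hunks above is that an offline consumer's pending entries are now drained in batches of 100 via repeated `XPENDING`/`XCLAIM` calls instead of a single call capped at 100 messages. A self-contained sketch of that pattern with redis-py's asyncio client (the stream, group, and consumer names are illustrative, not taken from the package):

```python
import asyncio
import redis.asyncio as redis

async def claim_pending_in_batches(r, stream_key, group_name,
                                   offline_consumer, new_consumer,
                                   batch_size=100):
    """Claim all pending messages of an offline consumer, batch by batch."""
    total_claimed = 0
    while True:
        # Fetch up to batch_size pending entries still owned by the offline consumer.
        pending = await r.xpending_range(
            stream_key, group_name,
            min='-', max='+', count=batch_size,
            consumername=offline_consumer,
        )
        if not pending:
            break  # nothing left to recover

        message_ids = [entry['message_id'] for entry in pending]
        claimed = await r.xclaim(
            stream_key, group_name, new_consumer,
            min_idle_time=0, message_ids=message_ids,
        )
        if not claimed:
            break  # another process claimed them first
        total_claimed += len(claimed)

        if len(pending) < batch_size:
            break  # last (partial) batch
    return total_claimed

async def main():
    r = redis.Redis()
    n = await claim_pending_in_batches(
        r, 'jettask:QUEUE:default', 'my_group',
        offline_consumer='dead-worker-1', new_consumer='worker-2',
    )
    print(f'claimed {n} messages')

asyncio.run(main())
```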

{jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: jettask
-Version: 0.2.19
+Version: 0.2.20
 Summary: A high-performance distributed task queue system with web monitoring
 Author-email: JetTask Team <support@jettask.io>
 License-Expression: MIT
@@ -20,8 +20,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: redis>=4.5.0
-Requires-Dist: aioredis>=2.0.0
+Requires-Dist: redis
 Requires-Dist: msgpack>=1.0.0
 Requires-Dist: watchdog>=3.0.0
 Requires-Dist: uvloop>=0.17.0
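
The dependency change above drops the separate `aioredis` package and keeps only redis-py; the refactored code (e.g. `from redis.asyncio.lock import Lock as AsyncLock` in worker/recovery.py) already uses redis-py's built-in asyncio support. A rough sketch of what that shift looks like for client code (the key name is illustrative):

```python
# 0.2.19-era style with the separate aioredis dependency:
# import aioredis
# client = aioredis.from_url("redis://localhost:6379")

# 0.2.20 relies on redis-py's asyncio API instead:
import redis.asyncio as redis
from redis.asyncio.lock import Lock as AsyncLock

client = redis.from_url("redis://localhost:6379")
# Same lock primitive the recovery code imports as AsyncLock.
lock = AsyncLock(client, "jettask:CLAIM:LOCK:example", timeout=30)
```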
@@ -49,71 +48,3 @@ Requires-Dist: flake8>=6.0; extra == "dev"
 Requires-Dist: mypy>=1.4.0; extra == "dev"
 Requires-Dist: coverage>=7.0; extra == "dev"
 Dynamic: license-file
-
-# JetTask
-
-A high-performance distributed task queue system with a web monitoring UI.
-
-## Features
-
-- 🚀 High-performance asynchronous task execution
-- 📊 Real-time web monitoring UI
-- ⏰ Scheduled and delayed tasks
-- 🔄 Task retries and error handling
-- 🎯 Multiple queues with priority support
-- 🌍 Multi-namespace isolation
-- 📈 Task statistics and performance monitoring
-- 🔧 Simple, easy-to-use API
-
-## Installation
-
-```bash
-pip install jettask
-```
-
-## Quick Start
-
-### 1. Define a task
-
-```python
-from jettask import JetTask
-
-app = JetTask()
-
-@app.task(queue="default")
-async def hello_task(name):
-    return f"Hello, {name}!"
-```
-
-### 2. Start a worker
-
-```bash
-jettask worker -a app:app --queues default
-```
-
-### 3. Send a task
-
-```python
-result = await hello_task.send("World")
-print(result) # Hello, World!
-```
-
-### 4. Start the web monitoring UI
-
-```bash
-# Start the API service
-jettask api
-
-# Start the frontend
-jettask frontend
-```
-
-Then open http://localhost:3000 to view the monitoring UI.
-
-## Documentation
-
-See the [docs/](docs/) directory for full documentation.
-
-## License
-
-MIT License