jettask 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +60 -2
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.18.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/pg_consumer.py DELETED
@@ -1,1896 +0,0 @@
1
- #!/usr/bin/env python
2
- """简化版的 PostgreSQL Consumer - 只保留必要功能"""
3
-
4
- import asyncio
5
- import json
6
- import logging
7
- import msgpack
8
- import os
9
- import time
10
- from typing import Dict, List, Optional, Any, Set
11
- from datetime import datetime, timezone
12
- from collections import defaultdict
13
-
14
- import redis.asyncio as redis
15
- from redis.asyncio import Redis
16
- from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
17
- from sqlalchemy.orm import sessionmaker
18
- from sqlalchemy import text
19
-
20
- from jettask.webui_config import PostgreSQLConfig, RedisConfig
21
- from jettask.core.consumer_manager import ConsumerManager, ConsumerStrategy
22
- from jettask.core.offline_worker_recovery import OfflineWorkerRecovery
23
- from jettask.constants import is_internal_consumer, TASK_STATUS_PRIORITY
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
- # Debug-file writing is commented out to avoid permission issues
- # logger_f = open(f'./pg_consumer.txt', 'a+')
-
- # Use the task status priorities defined in constants.py
- # STATUS_PRIORITY has been imported from constants.py as TASK_STATUS_PRIORITY
- class PostgreSQLConsumer:
- """PostgreSQL consumer: consumes tasks from Redis queues and persists them to PostgreSQL
-
- Supports multi-tenant (namespace) isolation
- """
37
-
38
- def __init__(self, pg_config: PostgreSQLConfig, redis_config: RedisConfig, prefix: str = "jettask",
39
- node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT,
40
- namespace_id: str = None, namespace_name: str = None,
41
- enable_backlog_monitor: bool = True, backlog_monitor_interval: int = 1):
42
- self.pg_config = pg_config
43
- self.redis_config = redis_config
44
- self.prefix = prefix
45
-
46
- # 命名空间支持
47
- self.namespace_id = namespace_id
48
- self.namespace_name = namespace_name or "default"
49
- self.redis_client: Optional[Redis] = None
50
- self.async_engine = None
51
- self.AsyncSessionLocal = None
52
- self.consumer_group = f"{prefix}_pg_consumer"
53
-
54
- # 节点标识
55
- import socket
56
- hostname = socket.gethostname()
57
- self.node_id = node_id or f"{hostname}_{os.getpid()}"
58
-
59
- # 使用 ConsumerManager 来管理 consumer_id
60
- self.consumer_strategy = consumer_strategy
61
- self.consumer_manager = None # 将在 start() 中初始化
62
- self.consumer_id = None # 将从 ConsumerManager 获取
63
-
64
- self._running = False
65
- self._tasks = []
66
- self._known_queues = set()
67
- self._consecutive_errors = defaultdict(int)
68
-
69
- # 内存中维护已处理的任务ID集合(用于优化查询)
70
- self._processed_task_ids = set()
71
- self._processed_ids_lock = asyncio.Lock() # 保护并发访问
72
- # 定期清理过期的ID(防止内存无限增长)
73
- self._processed_ids_max_size = 100000 # 最多保存10万个ID
74
- self._processed_ids_cleanup_interval = 300 # 每5分钟清理一次
75
-
76
- # 待重试的任务更新(任务ID -> 更新信息)
77
- self._pending_updates = {}
78
- self._pending_updates_lock = asyncio.Lock()
79
- self._max_pending_updates = 10000 # 最多保存1万个待重试更新
80
- self._retry_interval = 5 # 每5秒重试一次
81
-
82
- # 动态批次大小
83
- self.batch_size = 2000
84
- self.min_batch_size = 500
85
- self.max_batch_size = 5000
86
-
87
- # Stream积压监控配置
88
- self.enable_backlog_monitor = enable_backlog_monitor # 是否启用积压监控
89
- self.backlog_monitor_interval = backlog_monitor_interval # 监控采集间隔(秒)
90
- self.backlog_monitor_lock_key = f"{prefix}:BACKLOG_MONITOR_LOCK" # 分布式锁键
91
- self.backlog_monitor_lock_ttl = backlog_monitor_interval * 2 # 锁的TTL(秒),设为采集间隔的2倍
92
-
93
- # 队列注册表(替代scan命令)
94
- self.queue_registry_key = f"{prefix}:QUEUE_REGISTRY" # 队列注册表的Redis key
95
- self.stream_registry_key = f"{prefix}:STREAM_REGISTRY" # Stream注册表的Redis key(用于积压监控)
96
-
97
- async def start(self):
98
- """启动消费者"""
99
- logger.info(f"Starting PostgreSQL consumer (simplified) on node: {self.node_id}")
100
-
101
- # 连接Redis
102
- # 构建连接参数,只在密码非空时传递
103
- async_redis_params = {
104
- 'host': self.redis_config.host,
105
- 'port': self.redis_config.port,
106
- 'db': self.redis_config.db,
107
- 'decode_responses': False
108
- }
109
- if self.redis_config.password:
110
- async_redis_params['password'] = self.redis_config.password
111
-
112
- self.redis_client = await redis.Redis(**async_redis_params)
113
-
114
- # 初始化 ConsumerManager(需要同步的 Redis 客户端)
115
- import redis as sync_redis
116
- # 构建连接参数,只在密码非空时传递
117
- sync_redis_params = {
118
- 'host': self.redis_config.host,
119
- 'port': self.redis_config.port,
120
- 'db': self.redis_config.db,
121
- 'decode_responses': True # 使用字符串模式,与其他组件保持一致
122
- }
123
- if self.redis_config.password:
124
- sync_redis_params['password'] = self.redis_config.password
125
-
126
- sync_redis_client = sync_redis.StrictRedis(**sync_redis_params)
127
-
128
- # 配置 ConsumerManager
129
- # 初始队列列表包含TASK_CHANGES,其他队列会动态添加
130
- initial_queues = ['TASK_CHANGES'] # TASK_CHANGES是固定的
131
- consumer_config = {
132
- 'redis_prefix': self.prefix,
133
- 'queues': initial_queues,
134
- 'worker_prefix': 'PG_CONSUMER', # 使用不同的前缀,与task worker区分开
135
- }
136
-
137
- self.consumer_manager = ConsumerManager(
138
- redis_client=sync_redis_client,
139
- strategy=self.consumer_strategy,
140
- config=consumer_config
141
- )
142
-
143
- # 获取稳定的 consumer_id(使用TASK_CHANGES作为基准队列)
144
- self.consumer_id = self.consumer_manager.get_consumer_name('TASK_CHANGES')
145
- logger.debug(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
146
-
147
- # 创建SQLAlchemy异步引擎
148
- if self.pg_config.dsn.startswith('postgresql://'):
149
- dsn = self.pg_config.dsn.replace('postgresql://', 'postgresql+asyncpg://', 1)
150
- else:
151
- dsn = self.pg_config.dsn
152
-
153
- self.async_engine = create_async_engine(
154
- dsn,
155
- pool_size=50,
156
- max_overflow=20,
157
- pool_pre_ping=True,
158
- pool_recycle=300,
159
- echo=False
160
- )
161
-
162
- # 预热连接池
163
- logger.debug("Pre-warming database connection pool...")
164
- async with self.async_engine.begin() as conn:
165
- await conn.execute(text("SELECT 1"))
166
-
167
- # 创建异步会话工厂
168
- self.AsyncSessionLocal = sessionmaker(
169
- self.async_engine,
170
- class_=AsyncSession,
171
- expire_on_commit=False
172
- )
173
-
174
-
175
- self._running = True
176
-
177
- # 先进行一次队列发现,确保ConsumerManager有正确的队列列表
178
- await self._initial_queue_discovery()
179
-
180
- # 创建离线worker恢复器(用于恢复TASK_CHANGES stream的离线消息)
181
- self.offline_recovery = OfflineWorkerRecovery(
182
- async_redis_client=self.redis_client,
183
- redis_prefix=self.prefix,
184
- worker_prefix='PG_CONSUMER', # 使用PG_CONSUMER前缀
185
- consumer_manager=self.consumer_manager
186
- )
187
-
188
- # 启动消费任务(简化版:只保留必要的任务)
189
- self._tasks = [
190
- asyncio.create_task(self._consume_queues()), # 消费新任务
191
- asyncio.create_task(self._consume_task_changes()), # 消费任务变更事件
192
- asyncio.create_task(self._database_maintenance()), # 数据库维护
193
- asyncio.create_task(self._retry_pending_updates()), # 重试待更新的任务
194
- asyncio.create_task(self._start_offline_recovery()) # 离线worker恢复服务
195
- ]
196
-
197
- # 如果启用了积压监控,添加监控任务
198
- if self.enable_backlog_monitor:
199
- self._tasks.append(
200
- asyncio.create_task(self._stream_backlog_monitor()) # Stream积压监控
201
- )
202
- logger.info(f"Stream backlog monitor enabled with {self.backlog_monitor_interval}s interval")
203
-
204
- # 如果使用 HEARTBEAT 策略,ConsumerManager 会自动管理心跳
205
- if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and self.consumer_manager:
206
- # 启动心跳(ConsumerManager 内部会处理)
207
- logger.debug("Heartbeat is managed by ConsumerManager")
208
-
209
- logger.debug("PostgreSQL consumer started successfully")
210
-
211
- async def stop(self):
212
- """停止消费者"""
213
- logger.debug("Stopping PostgreSQL consumer...")
214
- self._running = False
215
-
216
- # 停止离线恢复服务
217
- if hasattr(self, 'offline_recovery'):
218
- self.offline_recovery.stop() # stop() 不是异步方法
219
-
220
- # 取消所有任务
221
- for task in self._tasks:
222
- task.cancel()
223
-
224
- # 等待任务完成
225
- await asyncio.gather(*self._tasks, return_exceptions=True)
226
-
227
- # 清理 ConsumerManager
228
- if self.consumer_manager:
229
- try:
230
- self.consumer_manager.cleanup()
231
- logger.debug(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
232
- except Exception as e:
233
- logger.error(f"Error cleaning up ConsumerManager: {e}")
234
-
235
- # 关闭连接
236
- if self.redis_client:
237
- await self.redis_client.close()
238
-
239
- if self.async_engine:
240
- await self.async_engine.dispose()
241
-
242
- logger.debug("PostgreSQL consumer stopped")
243
-
244
- async def _initial_queue_discovery(self):
245
- """初始队列发现,在启动时执行一次 - 使用队列注册表替代scan"""
246
- try:
247
- new_queues = set()
248
- logger.info(f"Starting initial queue discovery from queue registry: {self.queue_registry_key}")
249
-
250
- # 从队列注册表获取所有队列
251
- queue_members = await self.redis_client.smembers(self.queue_registry_key.encode())
252
- for queue_name_bytes in queue_members:
253
- queue_name = queue_name_bytes.decode('utf-8') if isinstance(queue_name_bytes, bytes) else str(queue_name_bytes)
254
- new_queues.add(queue_name)
255
- logger.info(f"Found registered queue: {queue_name}")
256
-
257
- # 如果注册表为空,进行一次性的scan作为初始化(仅在首次运行时)
258
- if not new_queues:
259
- logger.warning(f"Queue registry is empty, performing one-time scan initialization...")
260
- pattern = f"{self.prefix}:QUEUE:*"
261
- async for key in self.redis_client.scan_iter(match=pattern, count=100):
262
- key_str = key.decode('utf-8')
263
- parts = key_str.split(":")
264
- if len(parts) >= 3:
265
- # 去掉前缀和QUEUE部分
266
- queue_parts = parts[2:] # 从第3部分开始是队列名
267
- queue_name = ":".join(queue_parts) # 重新组合,保留优先级部分
268
- new_queues.add(queue_name)
269
- logger.info(f"Found queue during scan: {queue_name} from key: {key_str}")
270
-
271
- # 将发现的队列添加到注册表中
272
- if new_queues:
273
- pipeline = self.redis_client.pipeline()
274
- for queue_name in new_queues:
275
- pipeline.sadd(self.queue_registry_key.encode(), queue_name.encode())
276
- await pipeline.execute()
277
- logger.info(f"Registered {len(new_queues)} queues to registry during initialization")
278
-
279
- if new_queues:
280
- logger.info(f"Initial queue discovery found {len(new_queues)} queues: {new_queues}")
281
- # 合并所有队列:TASK_CHANGES + 动态发现的队列
282
- all_queues = list(new_queues) + ['TASK_CHANGES']
283
-
284
- # 更新ConsumerManager的配置
285
- if self.consumer_manager:
286
- self.consumer_manager.config['queues'] = all_queues
287
-
288
- # 更新worker的队列信息
289
- # 获取实际的consumer_id(从心跳策略中)
290
- if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
291
- actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
292
- else:
293
- # 从consumer_name中提取(格式:consumer_id-queue)
294
- actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
295
-
296
- worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
297
- try:
298
- # 使用同步Redis客户端更新
299
- self.consumer_manager.redis_client.hset(
300
- worker_key,
301
- 'queues',
302
- ','.join(all_queues)
303
- )
304
- logger.debug(f"Initial queue discovery - found queues: {all_queues}")
305
- except Exception as e:
306
- logger.error(f"Error updating initial worker queues: {e}")
307
-
308
- self._known_queues = new_queues
309
-
310
- except Exception as e:
311
- logger.error(f"Error in initial queue discovery: {e}")
312
-
313
- async def _discover_queues(self):
314
- """定期发现新队列 - 使用队列注册表替代scan"""
315
- while self._running:
316
- try:
317
- new_queues = set()
318
-
319
- # 从队列注册表获取所有队列
320
- queue_members = await self.redis_client.smembers(self.queue_registry_key.encode())
321
- for queue_name_bytes in queue_members:
322
- queue_name = queue_name_bytes.decode('utf-8') if isinstance(queue_name_bytes, bytes) else str(queue_name_bytes)
323
- new_queues.add(queue_name)
324
-
325
- # 优化:添加日志,只在队列数量或内容发生变化时记录
326
- if len(new_queues) != len(self._known_queues) or new_queues != self._known_queues:
327
- logger.debug(f"Queue registry contains {len(new_queues)} queues: {sorted(new_queues)}")
328
-
329
- # 为新发现的队列创建消费者组(注意:新队列应该通过生产者自动注册)
330
- new_discovered = new_queues - self._known_queues
331
- if new_discovered:
332
- for queue in new_discovered:
333
- # 正确构建stream_key,保留优先级部分
334
- stream_key = f"{self.prefix}:QUEUE:{queue}"
335
- try:
336
- await self.redis_client.xgroup_create(
337
- stream_key, self.consumer_group, id='0', mkstream=True
338
- )
339
- logger.info(f"Created consumer group for new queue: {queue} with stream_key: {stream_key}")
340
- except redis.ResponseError:
341
- pass
342
-
343
- # 更新ConsumerManager的队列列表(同步操作)
344
- if new_queues != self._known_queues:
345
- logger.info(f"Queue discovery: found {len(new_queues)} queues: {new_queues}")
346
- # 合并所有队列:TASK_CHANGES + 动态发现的队列
347
- all_queues = list(new_queues) + ['TASK_CHANGES']
348
-
349
- # 更新ConsumerManager的配置
350
- if self.consumer_manager:
351
- self.consumer_manager.config['queues'] = all_queues
352
-
353
- # 更新worker的队列信息
354
- # 获取实际的consumer_id(从心跳策略中)
355
- if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and hasattr(self.consumer_manager, '_heartbeat_strategy'):
356
- actual_consumer_id = self.consumer_manager._heartbeat_strategy.consumer_id
357
- else:
358
- # 从consumer_name中提取(格式:consumer_id-queue)
359
- actual_consumer_id = self.consumer_id.rsplit('-', 1)[0] if '-' in self.consumer_id else self.consumer_id
360
-
361
- worker_key = f"{self.prefix}:{self.consumer_manager.config.get('worker_prefix', 'PG_CONSUMER')}:{actual_consumer_id}"
362
- try:
363
- # 使用同步Redis客户端更新
364
- self.consumer_manager.redis_client.hset(
365
- worker_key,
366
- 'queues',
367
- ','.join(all_queues)
368
- )
369
- logger.debug(f"Updated ConsumerManager queues: {all_queues}")
370
- except Exception as e:
371
- logger.error(f"Error updating worker queues: {e}")
372
-
373
- self._known_queues = new_queues
374
- await asyncio.sleep(10) # 保持较短的检查间隔,确保新队列能及时发现
375
-
376
- except Exception as e:
377
- import traceback
378
- traceback.print_exc()
379
- logger.error(f"Error discovering queues: {e}")
380
- await asyncio.sleep(10)
381
-
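`_initial_queue_discovery` and `_discover_queues` above both read a producer-maintained Redis set instead of SCANning the keyspace. A compact sketch of that registry pattern, assuming `redis.asyncio` and the same `{prefix}:QUEUE_REGISTRY` key (the helper names are illustrative):

```python
# Sketch of the queue-registry pattern (register on produce, read on discover).
import redis.asyncio as redis

PREFIX = "jettask"
QUEUE_REGISTRY_KEY = f"{PREFIX}:QUEUE_REGISTRY"


async def register_queue(r: redis.Redis, queue_name: str) -> None:
    """Producer side: record the queue once, when its stream is first written to."""
    await r.sadd(QUEUE_REGISTRY_KEY, queue_name)


async def discover_queues(r: redis.Redis) -> set[str]:
    """Consumer side: read the registry instead of SCANning '{prefix}:QUEUE:*'."""
    members = await r.smembers(QUEUE_REGISTRY_KEY)
    return {m.decode() if isinstance(m, bytes) else str(m) for m in members}
```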
382
- async def _consume_queue(self, queue_name: str):
383
- """消费单个队列的任务(包括优先级队列)"""
384
- # logger.info(f"Starting to consume queue: {queue_name}")
385
- # 判断是否是优先级队列
386
- is_priority_queue = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
387
-
388
- if is_priority_queue:
389
- # 优先级队列格式:base_queue:priority (如 robust_bench2:2)
390
- base_queue = queue_name.rsplit(':', 1)[0]
391
- priority = queue_name.rsplit(':', 1)[1]
392
- stream_key = f"{self.prefix}:QUEUE:{base_queue}:{priority}"
393
- else:
394
- # 普通队列
395
- stream_key = f"{self.prefix}:QUEUE:{queue_name}"
396
-
397
- logger.debug(f"Consuming queue: {queue_name}, stream_key: {stream_key}, is_priority: {is_priority_queue}")
398
-
399
- check_backlog = True
400
- lastid = "0-0"
401
-
402
- # pg_consumer 应该使用统一的 consumer_id,而不是为每个队列创建新的
403
- # 因为 pg_consumer 的职责是消费所有队列的消息并写入数据库
404
- # 它不是真正的任务执行者,所以不需要为每个队列创建独立的 consumer
405
- consumer_name = self.consumer_id
406
-
407
- # ConsumerManager会自动处理离线worker的pending消息恢复
408
- # 不需要手动恢复
409
-
410
- while self._running and queue_name in self._known_queues:
411
- try:
412
- myid = lastid if check_backlog else ">"
413
-
414
- messages = await self.redis_client.xreadgroup(
415
- self.consumer_group,
416
- consumer_name, # 使用ConsumerManager管理的consumer_name
417
- {stream_key: myid},
418
- count=10000,
419
- block=1000 if not check_backlog else 0
420
- )
421
- if not messages or (messages and len(messages[0][1]) == 0):
422
- check_backlog = False
423
- continue
424
-
425
- if messages:
426
- await self._process_messages(messages)
427
- self._consecutive_errors[queue_name] = 0
428
-
429
- if messages[0] and messages[0][1]:
430
- lastid = messages[0][1][-1][0].decode('utf-8') if isinstance(messages[0][1][-1][0], bytes) else messages[0][1][-1][0]
431
- check_backlog = len(messages[0][1]) >= 2000
432
-
433
- except redis.ResponseError as e:
434
- if "NOGROUP" in str(e):
435
- try:
436
- await self.redis_client.xgroup_create(
437
- stream_key, self.consumer_group, id='0', mkstream=True
438
- )
439
- logger.debug(f"Recreated consumer group for queue: {queue_name}")
440
- check_backlog = True
441
- lastid = "0-0"
442
- except:
443
- pass
444
- else:
445
- logger.error(f"Redis error for queue {queue_name}: {e}")
446
- self._consecutive_errors[queue_name] += 1
447
-
448
- if self._consecutive_errors[queue_name] > 10:
449
- logger.debug(f"Too many errors for queue {queue_name}, will retry later")
450
- await asyncio.sleep(30)
451
- self._consecutive_errors[queue_name] = 0
452
-
453
- except Exception as e:
454
- logger.error(f"Error consuming queue {queue_name}: {e}", exc_info=True)
455
- self._consecutive_errors[queue_name] += 1
456
- await asyncio.sleep(1)
457
-
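The loop above follows the usual "drain my pending backlog first, then block for new entries" XREADGROUP pattern. A self-contained sketch of just that control flow, assuming `redis.asyncio` (stream, group, and consumer names are placeholders):

```python
# Backlog-first XREADGROUP loop, mirroring _consume_queue above.
import redis.asyncio as redis


async def consume_stream(r: redis.Redis, stream_key: str, group: str, consumer: str) -> None:
    check_backlog = True          # start by re-reading entries already delivered to this consumer
    last_id = "0-0"
    while True:
        read_id = last_id if check_backlog else ">"        # ">" means "only new entries"
        messages = await r.xreadgroup(
            group, consumer, {stream_key: read_id},
            count=1000,
            block=1000 if not check_backlog else 0,        # mirrors the original call
        )
        if not messages or not messages[0][1]:
            check_backlog = False                          # backlog drained; switch to new entries
            continue
        entries = messages[0][1]
        for msg_id, fields in entries:
            ...                                            # parse, batch-insert, then XACK
        raw_id = entries[-1][0]
        last_id = raw_id.decode() if isinstance(raw_id, bytes) else raw_id
        check_backlog = len(entries) >= 1000               # keep draining while reads come back full
```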
458
- async def _consume_queues(self):
459
- """启动所有队列的消费任务"""
460
- discover_task = asyncio.create_task(self._discover_queues())
461
- queue_tasks = {}
462
- while self._running:
463
- try:
464
- for queue in self._known_queues:
465
- if queue not in queue_tasks or queue_tasks[queue].done():
466
- queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
467
- logger.debug(f"Started consumer task for queue: {queue}")
468
-
469
- for queue in list(queue_tasks.keys()):
470
- if queue not in self._known_queues:
471
- queue_tasks[queue].cancel()
472
- del queue_tasks[queue]
473
- logger.debug(f"Stopped consumer task for removed queue: {queue}")
474
-
475
- await asyncio.sleep(10)
476
-
477
- except Exception as e:
478
- logger.error(f"Error in consume_queues manager: {e}")
479
- await asyncio.sleep(5)
480
-
481
- discover_task.cancel()
482
- for task in queue_tasks.values():
483
- task.cancel()
484
-
485
- await asyncio.gather(discover_task, *queue_tasks.values(), return_exceptions=True)
486
-
487
- async def _process_messages(self, messages: List):
488
- """处理消息并保存到PostgreSQL"""
489
- tasks_to_insert = []
490
- ack_batch = []
491
-
492
- for stream_key, stream_messages in messages:
493
- if not stream_messages:
494
- continue
495
-
496
- stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
497
- msg_ids_to_ack = []
498
-
499
- for msg_id, data in stream_messages:
500
- try:
501
- if not msg_id or not data:
502
- continue
503
-
504
- msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
505
-
506
- # 使用公共方法解析消息
507
- task_info = self._parse_stream_message(msg_id_str, data)
508
- if task_info:
509
- tasks_to_insert.append(task_info)
510
- msg_ids_to_ack.append(msg_id)
511
-
512
- except Exception as e:
513
- logger.error(f"Error processing message {msg_id}: {e}")
514
-
515
- if msg_ids_to_ack:
516
- ack_batch.append((stream_key, msg_ids_to_ack))
517
-
518
- if tasks_to_insert:
519
- await self._insert_tasks(tasks_to_insert)
520
-
521
- # 将成功插入的任务ID添加到内存集合中
522
- async with self._processed_ids_lock:
523
- for task in tasks_to_insert:
524
- self._processed_task_ids.add(task['id'])
525
-
526
- # 如果集合过大,清理最早的一半
527
- if len(self._processed_task_ids) > self._processed_ids_max_size:
528
- # 只保留最新的一半ID
529
- ids_list = list(self._processed_task_ids)
530
- keep_count = self._processed_ids_max_size // 2
531
- self._processed_task_ids = set(ids_list[-keep_count:])
532
- logger.debug(f"Cleaned processed IDs cache, kept {keep_count} most recent IDs")
533
-
534
- if ack_batch:
535
- pipeline = self.redis_client.pipeline()
536
- for stream_key, msg_ids in ack_batch:
537
- pipeline.xack(stream_key, self.consumer_group, *msg_ids)
538
-
539
- try:
540
- await pipeline.execute()
541
- total_acked = sum(len(msg_ids) for _, msg_ids in ack_batch)
542
- logger.debug(f"Successfully ACKed {total_acked} messages")
543
- except Exception as e:
544
- logger.error(f"Error executing batch ACK: {e}")
545
-
546
- async def _insert_tasks(self, tasks: List[Dict[str, Any]]):
547
- """批量插入任务到PostgreSQL(只处理tasks表)"""
548
- if not tasks:
549
- return
550
-
551
- logger.info(f"Attempting to insert {len(tasks)} tasks to tasks table")
552
- try:
553
- async with self.AsyncSessionLocal() as session:
554
- # 插入tasks表 - 使用批量INSERT忽略冲突
555
- # 由于stream_id在实践中是唯一的,我们可以简单地忽略重复
556
- tasks_query = text("""
557
- INSERT INTO tasks (stream_id, queue, namespace, scheduled_task_id,
558
- payload, priority, created_at, source, metadata)
559
- VALUES (:stream_id, :queue, :namespace, :scheduled_task_id,
560
- CAST(:payload AS jsonb), :priority, :created_at, :source, CAST(:metadata AS jsonb))
561
- ON CONFLICT DO NOTHING
562
- RETURNING stream_id;
563
- """)
564
-
565
- # 准备tasks表的数据
566
- tasks_data = []
567
- for task in tasks:
568
- task_data = json.loads(task['task_data'])
569
-
570
- # 从task_data中获取scheduled_task_id
571
- scheduled_task_id = task_data.get('scheduled_task_id') or task.get('scheduled_task_id')
572
-
573
- # 根据是否有scheduled_task_id来判断任务来源
574
- if scheduled_task_id:
575
- source = 'scheduler' # 定时任务
576
- else:
577
- source = 'redis_stream' # 普通任务
578
-
579
- tasks_data.append({
580
- 'stream_id': task['id'], # Redis Stream ID作为stream_id
581
- 'queue': task['queue_name'],
582
- 'namespace': self.namespace_name,
583
- 'scheduled_task_id': str(scheduled_task_id) if scheduled_task_id else None,
584
- 'payload': task['task_data'], # 完整的任务数据
585
- 'priority': task['priority'],
586
- 'created_at': task['created_at'],
587
- 'source': source,
588
- 'metadata': task.get('metadata', '{}')
589
- })
590
-
591
- # 批量插入 - 使用executemany提高性能
592
- logger.debug(f"Executing batch insert with {len(tasks_data)} tasks")
593
-
594
- try:
595
- # 使用executemany批量插入
596
- result = await session.execute(tasks_query, tasks_data)
597
-
598
- # 获取实际插入的记录数
599
- inserted_count = result.rowcount
600
-
601
- # if inserted_count > 0:
602
- # logger.info(f"Successfully inserted {inserted_count} new tasks to tasks table")
603
- # else:
604
- # logger.info(f"No new tasks inserted (all may be duplicates)")
605
-
606
- await session.commit()
607
- logger.debug("Tasks table batch insert transaction completed")
608
-
609
- except Exception as e:
610
- logger.error(f"Error in batch insert, trying fallback: {e}")
611
- await session.rollback()
612
-
613
- # 如果批量插入失败,降级为小批量插入(每批10条)
614
- batch_size = 10
615
- total_inserted = 0
616
-
617
- for i in range(0, len(tasks_data), batch_size):
618
- batch = tasks_data[i:i+batch_size]
619
- try:
620
- result = await session.execute(tasks_query, batch)
621
- batch_inserted = result.rowcount
622
- if batch_inserted > 0:
623
- total_inserted += batch_inserted
624
- await session.commit()
625
- except Exception as batch_error:
626
- logger.error(f"Batch {i//batch_size + 1} failed: {batch_error}")
627
- await session.rollback()
628
-
629
- if total_inserted > 0:
630
- logger.info(f"Fallback insert completed: {total_inserted} tasks inserted")
631
- else:
632
- logger.info(f"No new tasks inserted in fallback mode")
633
-
634
- except Exception as e:
635
- logger.error(f"Error inserting tasks to PostgreSQL: {e}")
636
-
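The insert above relies on passing a list of parameter dicts to `session.execute()` so SQLAlchemy runs the statement executemany-style, with `ON CONFLICT DO NOTHING` absorbing duplicates. A trimmed-down sketch of that idea (the two-column table is illustrative, not the real `tasks` schema):

```python
# Batched insert-or-ignore sketch with SQLAlchemy async sessions.
from sqlalchemy import text


async def insert_ignore_duplicates(session_factory, rows: list[dict]) -> int:
    query = text("""
        INSERT INTO tasks (stream_id, payload)
        VALUES (:stream_id, CAST(:payload AS jsonb))
        ON CONFLICT DO NOTHING
    """)
    async with session_factory() as session:
        # A list of dicts triggers executemany-style execution for the whole batch.
        result = await session.execute(query, rows)
        await session.commit()
        # rowcount is used the same way the original code counts inserted rows.
        return result.rowcount
```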
637
- async def _consume_task_changes(self):
638
- """消费任务变更事件流 - 基于事件驱动的更新(支持pending消息恢复)"""
639
- change_stream_key = f"{self.prefix}:TASK_CHANGES"
640
- consumer_group = f"{self.prefix}_changes_consumer"
641
-
642
- # 使用 ConsumerManager 管理的 consumer name
643
- # 这样 ConsumerManager 才能正确跟踪和恢复这个流的待处理消息
644
- consumer_name = self.consumer_manager.get_consumer_name('TASK_CHANGES')
645
-
646
- # 创建消费者组
647
- try:
648
- await self.redis_client.xgroup_create(
649
- change_stream_key, consumer_group, id='0', mkstream=True
650
- )
651
- logger.debug(f"Created consumer group for task changes stream")
652
- except redis.ResponseError:
653
- pass
654
-
655
- # 模仿 listen_event_by_task 的写法:先处理pending消息,再处理新消息
656
- check_backlog = True
657
- lastid = "0-0"
658
- batch_size = 1000
659
-
660
- while self._running:
661
- try:
662
- # 决定读取位置:如果有backlog,从lastid开始;否则读取新消息
663
- if check_backlog:
664
- myid = lastid
665
- else:
666
- myid = ">"
667
-
668
- messages = await self.redis_client.xreadgroup(
669
- consumer_group,
670
- consumer_name, # 使用 ConsumerManager 管理的 consumer name
671
- {change_stream_key: myid},
672
- count=batch_size,
673
- block=1000 if not check_backlog else 0 # backlog时不阻塞
674
- )
675
-
676
- if not messages:
677
- check_backlog = False
678
- continue
679
-
680
- # 检查是否还有更多backlog消息
681
- if messages and len(messages[0][1]) > 0:
682
- check_backlog = len(messages[0][1]) >= batch_size
683
- else:
684
- check_backlog = False
685
-
686
- # 收集消息ID和对应的task_id
687
- msg_to_task = {} # msg_id -> task_id 映射
688
-
689
- for _, stream_messages in messages:
690
- for msg_id, data in stream_messages:
691
- try:
692
- # 更新lastid(无论消息是否处理成功)
693
- if isinstance(msg_id, bytes):
694
- lastid = msg_id.decode('utf-8')
695
- else:
696
- lastid = str(msg_id)
697
-
698
- task_key = data[b'id']
699
- task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
700
-
701
- # 从完整的task_key格式提取stream_id
702
- # 格式: namespace:TASK:stream_id:queue_name
703
- stream_id = None
704
- if ':TASK:' in task_key:
705
- parts = task_key.split(':TASK:')
706
- if len(parts) == 2:
707
- # 再从右边部分提取stream_id
708
- right_parts = parts[1].split(':')
709
- if right_parts:
710
- stream_id = right_parts[0] # 提取stream_id
711
-
712
- if stream_id:
713
- # 存储元组: (stream_id, task_key)
714
- msg_to_task[msg_id] = (stream_id, task_key)
715
- else:
716
- logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
717
- except Exception as e:
718
- import traceback
719
- traceback.print_exc()
720
- logger.error(f"Error processing change event {msg_id}: {e} {data=}")
721
- # 解析失败的消息也应该ACK,避免一直重试
722
- await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
723
-
724
- if msg_to_task:
725
- # 批量更新任务,返回成功更新的task_id列表
726
- # msg_to_task 的值现在是元组 (stream_id, task_key)
727
- id_tuples = list(set(msg_to_task.values()))
728
- logger.info(f"Processing {len(id_tuples)} task updates from change stream")
729
- # logger_f.write(f'{id_tuples=} \n')
730
- successful_tuples = await self._update_tasks_by_event(id_tuples)
731
-
732
- # 只ACK成功更新的消息
733
- ack_ids = []
734
- failed_count = 0
735
- for msg_id, id_tuple in msg_to_task.items():
736
- if successful_tuples and id_tuple in successful_tuples:
737
- ack_ids.append(msg_id)
738
- else:
739
- failed_count += 1
740
-
741
- if ack_ids:
742
- await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
743
- if len(ack_ids) > 0:
744
- logger.info(f"Updated {len(ack_ids)} task statuses")
745
-
746
- if failed_count > 0:
747
- logger.debug(f"Failed to update {failed_count} tasks, will retry")
748
-
749
- except redis.ResponseError as e:
750
- if "NOGROUP" in str(e):
751
- # 如果消费者组不存在,重新创建
752
- try:
753
- await self.redis_client.xgroup_create(
754
- change_stream_key, consumer_group, id='0', mkstream=True
755
- )
756
- logger.debug(f"Recreated consumer group for task changes stream")
757
- check_backlog = True
758
- lastid = "0-0"
759
- except:
760
- pass
761
- else:
762
- logger.error(f"Redis error in consume_task_changes: {e}")
763
- await asyncio.sleep(1)
764
- except Exception as e:
765
- logger.error(f"Error in consume_task_changes: {e}", exc_info=True)
766
- await asyncio.sleep(1)
767
-
768
- async def _update_tasks_by_event(self, id_tuples: List[tuple]) -> Set[tuple]:
769
- """基于事件ID批量更新任务状态
770
-
771
- Args:
772
- id_tuples: 元组列表,每个元组为 (stream_id, task_key)
773
-
774
- Returns:
775
- 成功更新的元组集合
776
- """
777
- if not id_tuples:
778
- return set()
779
-
780
- successful_tuples = set()
781
-
782
- try:
783
- pipeline = self.redis_client.pipeline()
784
- for stream_id, task_key in id_tuples:
785
- pipeline.hgetall(task_key)
786
-
787
- redis_values = await pipeline.execute()
788
- updates = []
789
- valid_tuples = [] # 记录有效的元组
790
- if len(id_tuples) != len(redis_values):
791
- logger.error(f'Mismatch: {len(id_tuples)=} {len(redis_values)=}')
792
- # 不抛出异常,继续处理能处理的
793
-
794
- for i, (stream_id, task_key) in enumerate(id_tuples):
795
- if i >= len(redis_values):
796
- logger.error(f'Missing redis value for task_key={task_key}')
797
- continue
798
-
799
- hash_data = redis_values[i]
800
-
801
- if not hash_data:
802
- logger.debug(f'No hash data for task_key={task_key}')
803
- continue
804
-
805
- try:
806
- # 从task_key解析出consumer_group
807
- # task_key格式: namespace:TASK:stream_id:group_name
808
- # 其中group_name就是完整的consumer_group(格式: jettask:QUEUE:queue_name:task_name)
809
- parts = task_key.split(':', 3) # 最多分割成4部分
810
- if len(parts) == 4:
811
- # parts[0] = namespace (如 'default')
812
- # parts[1] = 'TASK'
813
- # parts[2] = stream_id
814
- # parts[3] = group_name (consumer_group)
815
- consumer_group = parts[3] # 直接使用group_name作为consumer_group
816
- logger.debug(f"Extracted consumer_group from task_key: {consumer_group}")
817
- else:
818
- logger.warning(f"Cannot parse consumer_group from task_key: {task_key}")
819
- continue
820
-
821
- # 从consumer_group中提取task_name
822
- # consumer_group格式: prefix:QUEUE:queue:task_name (如 jettask:QUEUE:robust_bench2:robust_benchmark.benchmark_task)
823
- task_name = None
824
- if consumer_group:
825
- parts = consumer_group.split(':')
826
- if len(parts) >= 4:
827
- # 最后一部分是task_name
828
- task_name = parts[-1]
829
- logger.debug(f"Extracted task_name '{task_name}' from consumer_group '{consumer_group}'")
830
-
831
- # 使用stream_id作为任务ID
832
- update_info = self._parse_task_hash(stream_id, hash_data)
833
- if update_info:
834
- # 添加consumer_group和task_name到更新信息中
835
- update_info['consumer_group'] = consumer_group
836
- update_info['task_name'] = task_name or 'unknown' # 如果无法提取task_name,使用'unknown'
837
- # consumer_name就是worker_id(执行任务的实际worker)
838
- update_info['consumer_name'] = update_info.get('worker_id')
839
- updates.append(update_info)
840
- valid_tuples.append((stream_id, task_key))
841
- else:
842
- logger.debug(f'Failed to parse stream_id={stream_id} hash_data={hash_data}')
843
- except Exception as e:
844
- logger.error(f'Error parsing task stream_id={stream_id}: {e}')
845
- continue
846
- if updates:
847
- logger.info(f"Attempting to update {len(updates)} tasks, first few: {[u['id'] for u in updates[:3]]}")
848
- # logger_f.write(f'{updates=} \n')
849
- try:
850
- # _update_tasks 现在返回成功更新的ID集合
851
- batch_successful = await self._update_tasks(updates)
852
- # 将成功的stream_id映射回元组
853
- for stream_id in batch_successful:
854
- for tuple_item in valid_tuples:
855
- if tuple_item[0] == stream_id: # stream_id匹配
856
- successful_tuples.add(tuple_item)
857
- if batch_successful:
858
- logger.info(f"Successfully updated {len(batch_successful)} tasks from change events")
859
- else:
860
- logger.warning(f"No tasks were successfully updated")
861
- except Exception as e:
862
- logger.error(f"Error in batch update: {e}")
863
- # 批量更新失败,尝试逐个更新
864
- for update, tuple_item in zip(updates, valid_tuples):
865
- try:
866
- single_successful = await self._update_tasks([update])
867
- if update['id'] in single_successful:
868
- successful_tuples.add(tuple_item)
869
- except Exception as single_error:
870
- logger.error(f"Failed to update task {tuple_item[0]}: {single_error}")
871
-
872
- except Exception as e:
873
- logger.error(f"Error updating tasks by event: {e}", exc_info=True)
874
- logger.debug(f'{successful_tuples=}')
875
- return successful_tuples
876
-
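Because the `namespace:TASK:stream_id:group_name` key format is parsed in several places above, a tiny standalone helper (purely illustrative, not part of the original module) makes the split explicit:

```python
# Illustrative helper for the "namespace:TASK:stream_id:group_name" task keys.
def parse_task_key(task_key: str) -> tuple[str | None, str | None]:
    parts = task_key.split(":", 3)      # -> [namespace, "TASK", stream_id, group_name]
    if len(parts) == 4 and parts[1] == "TASK":
        stream_id, consumer_group = parts[2], parts[3]
        return stream_id, consumer_group
    return None, None


# Example (hypothetical key): parse_task_key("default:TASK:1712345678901-0:jettask:QUEUE:orders:send_email")
# -> ("1712345678901-0", "jettask:QUEUE:orders:send_email")
```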
877
- def _parse_task_hash(self, task_id: str, hash_data: dict) -> Optional[dict]:
878
- """解析Redis Hash数据"""
879
- update_info = {
880
- 'id': task_id,
881
- 'status': None,
882
- 'result': None,
883
- 'error_message': None,
884
- 'started_at': None,
885
- 'completed_at': None,
886
- 'worker_id': None,
887
- 'execution_time': None,
888
- 'duration': None
889
- }
890
-
891
- try:
892
- from jettask.utils.serializer import loads_str
893
-
894
- hash_dict = {}
895
- for k, v in hash_data.items():
896
- key = k.decode('utf-8') if isinstance(k, bytes) else k
897
- if isinstance(v, bytes):
898
- try:
899
- value = loads_str(v)
900
- if isinstance(value, (dict, list)):
901
- value = json.dumps(value, ensure_ascii=False)
902
- else:
903
- value = str(value)
904
- except:
905
- try:
906
- value = v.decode('utf-8')
907
- except:
908
- value = str(v)
909
- else:
910
- value = v
911
- hash_dict[key] = value
912
-
913
- update_info['status'] = hash_dict.get('status')
914
- update_info['error_message'] = hash_dict.get('error_msg') or hash_dict.get('exception')
915
-
916
- # 转换时间戳
917
- for time_field in ['started_at', 'completed_at']:
918
- if hash_dict.get(time_field):
919
- try:
920
- time_str = hash_dict[time_field]
921
- if isinstance(time_str, str) and time_str.startswith("b'") and time_str.endswith("'"):
922
- time_str = time_str[2:-1]
923
- update_info[time_field] = datetime.fromtimestamp(float(time_str), tz=timezone.utc)
924
- except:
925
- pass
926
-
927
- update_info['worker_id'] = hash_dict.get('consumer') or hash_dict.get('worker_id')
928
-
929
- # 转换数值 - 直接存储原始秒数值
930
- for num_field in ['execution_time', 'duration']:
931
- if hash_dict.get(num_field):
932
- try:
933
- num_str = hash_dict[num_field]
934
- # 直接存储浮点数秒值
935
- update_info[num_field] = float(num_str)
936
- except:
937
- pass
938
-
939
- # 处理result
940
- if 'result' in hash_dict:
941
- result_str = hash_dict['result']
942
- if result_str == 'null':
943
- update_info['result'] = None
944
- else:
945
- update_info['result'] = result_str
946
-
947
- # 只返回有数据的更新
948
- if any(v is not None for k, v in update_info.items() if k != 'id'):
949
- return update_info
950
-
951
- except Exception as e:
952
- logger.error(f"Failed to parse hash data for task {task_id}: {e}")
953
-
954
- return None
955
-
956
- async def _update_tasks(self, updates: List[Dict[str, Any]]) -> Set[str]:
957
- """批量更新任务状态(使用UPSERT逻辑处理task_runs表)
958
-
959
- Returns:
960
- 成功更新的stream_id集合
961
- """
962
- if not updates:
963
- return set()
964
-
965
- try:
966
- async with self.AsyncSessionLocal() as session:
967
- # V3结构:使用UPSERT逻辑处理task_runs表
968
- stream_ids = [u['id'] for u in updates]
969
- logger.info(f"Upserting {len(stream_ids)} task_runs records")
970
-
971
- # 对于分区表,我们需要使用不同的UPSERT策略
972
- # 先尝试UPDATE,如果没有更新到任何行,则INSERT
973
- upsert_query = text("""
974
- WITH updated AS (
975
- UPDATE task_runs SET
976
- consumer_name = COALESCE(CAST(:consumer_name AS TEXT), consumer_name),
977
- status = CASE
978
- WHEN CAST(:status AS TEXT) IS NULL THEN status
979
- WHEN status = 'pending' THEN COALESCE(CAST(:status AS TEXT), status)
980
- WHEN status = 'running' AND CAST(:status AS TEXT) IN ('success', 'failed', 'timeout', 'skipped') THEN CAST(:status AS TEXT)
981
- WHEN status IN ('success', 'failed', 'timeout', 'skipped') THEN status
982
- ELSE COALESCE(CAST(:status AS TEXT), status)
983
- END,
984
- result = CASE
985
- WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN result
986
- ELSE COALESCE(CAST(:result AS jsonb), result)
987
- END,
988
- error_message = CASE
989
- WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN error_message
990
- ELSE COALESCE(CAST(:error_message AS TEXT), error_message)
991
- END,
992
- start_time = COALESCE(CAST(:started_at AS TIMESTAMPTZ), start_time),
993
- end_time = CASE
994
- WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN end_time
995
- ELSE COALESCE(CAST(:completed_at AS TIMESTAMPTZ), end_time)
996
- END,
997
- worker_id = COALESCE(CAST(:worker_id AS TEXT), worker_id),
998
- duration = COALESCE(CAST(:duration AS DOUBLE PRECISION), duration),
999
- execution_time = COALESCE(CAST(:execution_time AS DOUBLE PRECISION), execution_time),
1000
- updated_at = CURRENT_TIMESTAMP
1001
- WHERE stream_id = :stream_id AND consumer_group = :consumer_group
1002
- RETURNING stream_id
1003
- )
1004
- INSERT INTO task_runs (
1005
- stream_id, task_name, consumer_group, consumer_name, status, result, error_message,
1006
- start_time, end_time, worker_id, duration, execution_time,
1007
- created_at, updated_at
1008
- )
1009
- SELECT
1010
- :stream_id, :task_name, :consumer_group, :consumer_name,
1011
- COALESCE(CAST(:status AS TEXT), 'pending'),
1012
- CAST(:result AS jsonb),
1013
- CAST(:error_message AS TEXT),
1014
- CAST(:started_at AS TIMESTAMPTZ),
1015
- CAST(:completed_at AS TIMESTAMPTZ),
1016
- CAST(:worker_id AS TEXT),
1017
- CAST(:duration AS DOUBLE PRECISION),
1018
- CAST(:execution_time AS DOUBLE PRECISION),
1019
- CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
1020
- WHERE NOT EXISTS (SELECT 1 FROM updated)
1021
- RETURNING stream_id;
1022
- """)
1023
-
1024
- # 为每个更新转换参数名称(从id改为stream_id)
1025
- run_updates = []
1026
- for update in updates:
1027
- run_update = update.copy()
1028
- run_update['stream_id'] = run_update.pop('id') # 将id改为stream_id
1029
- # consumer_group 已经在 update_info 中了,不需要额外处理
1030
- run_updates.append(run_update)
1031
-
1032
- # 批量执行UPSERT - 使用事务批处理提高性能
1033
- successful_count = 0
1034
- batch_size = 20 # 每批处理20条记录
1035
-
1036
- for i in range(0, len(run_updates), batch_size):
1037
- batch = run_updates[i:i+batch_size]
1038
-
1039
- try:
1040
- # 在一个事务中处理整批
1041
- for run_update in batch:
1042
- result = await session.execute(upsert_query, run_update)
1043
- if result.rowcount > 0:
1044
- successful_count += 1
1045
-
1046
- # 批量提交
1047
- await session.commit()
1048
- logger.debug(f"Batch {i//batch_size + 1} committed: {len(batch)} records")
1049
-
1050
- except Exception as e:
1051
- logger.error(f"Batch {i//batch_size + 1} failed, trying individual records: {e}")
1052
- await session.rollback()
1053
-
1054
- # 如果批处理失败,回退到逐个处理这批记录
1055
- for run_update in batch:
1056
- try:
1057
- result = await session.execute(upsert_query, run_update)
1058
- await session.commit()
1059
- if result.rowcount > 0:
1060
- successful_count += 1
1061
- except Exception as individual_error:
1062
- logger.error(f"Individual upsert failed for {run_update.get('stream_id')}: {individual_error}")
1063
- await session.rollback()
1064
-
1065
- # 记录成功更新的数量
1066
- if successful_count > 0:
1067
- logger.info(f"Upserted {successful_count}/{len(run_updates)} task_runs records")
1068
-
1069
- # 检查哪些任务是完成状态,需要从Redis中删除
1070
- completed_task_keys = []
1071
- for update in updates:
1072
- status = update.get('status')
1073
- # 如果状态是完成状态(success, error, cancel等)
1074
- if status in ['success', 'error', 'failed', 'cancel', 'cancelled', 'timeout', 'skipped']:
1075
- # 构建task_key
1076
- # task_key格式: namespace:TASK:stream_id:group_name
1077
- stream_id = update['id']
1078
- consumer_group = update.get('consumer_group')
1079
- if consumer_group:
1080
- # 从consumer_group提取namespace
1081
- # consumer_group格式: prefix:QUEUE:queue:task_name
1082
- parts = consumer_group.split(':', 1)
1083
- namespace = parts[0] if parts else 'default'
1084
- task_key = f"{namespace}:TASK:{stream_id}:{consumer_group}"
1085
- completed_task_keys.append(task_key)
1086
- logger.info(f"Task {stream_id} with status {status} will be deleted from Redis: {task_key}")
1087
-
1088
- # 从Redis中删除已完成的任务
1089
- if completed_task_keys:
1090
- try:
1091
- pipeline = self.redis_client.pipeline()
1092
- for task_key in completed_task_keys:
1093
- pipeline.delete(task_key)
1094
- deleted_results = await pipeline.execute()
1095
- deleted_count = sum(1 for r in deleted_results if r > 0)
1096
- if deleted_count > 0:
1097
- logger.info(f"Deleted {deleted_count} completed tasks from Redis")
1098
- except Exception as e:
1099
- logger.error(f"Error deleting completed tasks from Redis: {e}")
1100
-
1101
- # UPSERT 操作总是成功的,返回所有stream_id
1102
- # 不需要复杂的错误处理,因为UPSERT保证了操作的原子性
1103
- return set(stream_ids)
1104
-
1105
- except Exception as e:
1106
- logger.error(f"Error upserting task statuses: {e}")
1107
- return set() # 出错时返回空集
1108
-
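The statement above implements UPSERT as "UPDATE inside a CTE, then INSERT only if nothing was updated", which works on partitioned tables where a plain `ON CONFLICT` target may not be available. A condensed sketch with the columns trimmed to the essentials (parameter names are illustrative):

```python
# Update-first / insert-if-missing UPSERT sketch for a partitioned table.
from sqlalchemy import text

UPSERT_RUN = text("""
    WITH updated AS (
        UPDATE task_runs
        SET status = COALESCE(CAST(:status AS TEXT), status),
            updated_at = CURRENT_TIMESTAMP
        WHERE stream_id = :stream_id AND consumer_group = :consumer_group
        RETURNING stream_id
    )
    INSERT INTO task_runs (stream_id, consumer_group, status, created_at, updated_at)
    SELECT :stream_id, :consumer_group, COALESCE(CAST(:status AS TEXT), 'pending'),
           CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
    WHERE NOT EXISTS (SELECT 1 FROM updated)
    RETURNING stream_id;
""")


async def upsert_task_run(session, stream_id: str, consumer_group: str, status: str | None) -> None:
    await session.execute(
        UPSERT_RUN,
        {"stream_id": stream_id, "consumer_group": consumer_group, "status": status},
    )
    # Note: the statement's rowcount reflects only the outer INSERT, so an UPDATE-only
    # pass can report 0 even though a row changed (the original code counts it the same way).
```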
1109
- async def _retry_pending_updates(self):
1110
- """定期重试待更新的任务"""
1111
- while self._running:
1112
- try:
1113
- await asyncio.sleep(self._retry_interval) # 等待一段时间
1114
-
1115
- # 获取待重试的更新
1116
- async with self._pending_updates_lock:
1117
- if not self._pending_updates:
1118
- continue
1119
-
1120
- # 取出所有待重试的更新
1121
- pending_items = list(self._pending_updates.items())
1122
- self._pending_updates.clear()
1123
-
1124
- if pending_items:
1125
-
1126
- # 重新尝试更新
1127
- updates = [update_info for _, update_info in pending_items]
1128
- logger.debug(f"Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}")
1129
- logger_f.write(f'{time.time()=} Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}\n')
1130
- logger_f.flush()
1131
- await self._update_tasks(updates)
1132
-
1133
- except Exception as e:
1134
- logger.error(f"Error in retry pending updates: {e}")
1135
- await asyncio.sleep(5)
1136
-
1137
- async def _start_offline_recovery(self):
1138
- """启动离线worker恢复服务,恢复离线PG_CONSUMER的消息"""
1139
- logger.debug("Starting offline worker recovery service for PG_CONSUMER")
1140
-
1141
- # 等待consumer manager初始化和队列发现
1142
- # await asyncio.sleep(5)
1143
-
1144
- while self._running:
1145
- try:
1146
- total_recovered = 0
1147
-
1148
- # 1. 恢复普通队列的消息
1149
- # for queue in self._known_queues:
1150
- # # logger.info(f'{queue=}')
1151
- # try:
1152
- # recovered = await self.offline_recovery.recover_offline_workers(
1153
- # queue=queue,
1154
- # current_consumer_name=self.consumer_id,
1155
- # process_message_callback=self._process_recovered_queue_message
1156
- # )
1157
-
1158
- # if recovered > 0:
1159
- # logger.info(f"Recovered {recovered} messages from queue {queue}")
1160
- # total_recovered += recovered
1161
-
1162
- # except Exception as e:
1163
- # logger.error(f"Error recovering queue {queue}: {e}")
1164
-
1165
- # 2. 恢复TASK_CHANGES stream的消息
1166
- recovered = await self._recover_task_changes_offline_messages()
1167
- if recovered > 0:
1168
- logger.debug(f"Recovered {recovered} TASK_CHANGES messages")
1169
- total_recovered += recovered
1170
-
1171
- if total_recovered > 0:
1172
- logger.debug(f"Total recovered {total_recovered} messages in this cycle")
1173
-
1174
- # 每30秒扫描一次
1175
- await asyncio.sleep(1)
1176
-
1177
- except Exception as e:
1178
- logger.error(f"Error in offline recovery service: {e}")
1179
- await asyncio.sleep(10)
1180
-
1181
- async def _recover_task_changes_offline_messages(self) -> int:
1182
- """恢复TASK_CHANGES stream的离线消息"""
1183
- # 使用 OfflineWorkerRecovery 的标准接口
1184
- try:
1185
- # 为TASK_CHANGES定义自定义的队列格式化器
1186
- def task_changes_formatter(queue):
1187
- # 对于TASK_CHANGES,直接返回stream key(不加QUEUE:前缀)
1188
- if queue == 'TASK_CHANGES':
1189
- return f"{self.prefix}:TASK_CHANGES"
1190
- else:
1191
- return f"{self.prefix}:QUEUE:{queue}"
1192
-
1193
- # 创建专门用于TASK_CHANGES的恢复器
1194
- task_changes_recovery = OfflineWorkerRecovery(
1195
- async_redis_client=self.redis_client,
1196
- redis_prefix=self.prefix,
1197
- worker_prefix='PG_CONSUMER',
1198
- queue_formatter=task_changes_formatter
1199
- )
1200
-
1201
- # 调用标准的恢复方法
1202
- # TASK_CHANGES作为队列名传入,会被正确处理
1203
- recovered = await task_changes_recovery.recover_offline_workers(
1204
- queue='TASK_CHANGES', # 这个队列名会用于查找离线worker
1205
- current_consumer_name=self.consumer_id,
1206
- process_message_callback=self._process_recovered_task_change_v2
1207
- )
1208
-
1209
- return recovered
1210
-
1211
- except Exception as e:
1212
- logger.error(f"Error in recover_task_changes_offline_messages: {e}")
1213
- return 0
1214
-
1215
- async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
1216
- """处理恢复的TASK_CHANGES消息(符合OfflineWorkerRecovery的回调接口)"""
1217
- try:
1218
- logger.debug(f'处理恢复的TASK_CHANGES消息(符合OfflineWorkerRecovery的回调接口) {msg_data=}')
1219
- # 解析消息 - 现在使用task_id而不是event_id
1220
- if b'task_id' in msg_data:
1221
- # 使用msgpack解压task_id
1222
- compressed_task_id = msg_data[b'task_id']
1223
- task_key = msgpack.unpackb(compressed_task_id)
1224
- task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
1225
-
1226
- # 从完整的task_key格式提取stream_id
1227
- # 格式: namespace:TASK:stream_id:queue_name
1228
- stream_id = None
1229
- if ':TASK:' in task_key:
1230
- parts = task_key.split(':TASK:')
1231
- if len(parts) == 2:
1232
- # 再从右边部分提取stream_id
1233
- right_parts = parts[1].split(':')
1234
- if right_parts:
1235
- stream_id = right_parts[0] # 提取stream_id
1236
-
1237
- if stream_id:
1238
- logger.debug(f"Processing recovered TASK_CHANGES message: {stream_id} from offline worker {consumer_id}")
1239
- # 更新任务状态 - 传入(stream_id, task_key)元组
1240
- await self._update_tasks_by_event([(stream_id, task_key)])
1241
- else:
1242
- logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
1243
-
1244
- # ACK消息
1245
- change_stream_key = f"{self.prefix}:TASK_CHANGES"
1246
- consumer_group = f"{self.prefix}_changes_consumer"
1247
- await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
1248
-
1249
- except Exception as e:
1250
- logger.error(f"Error processing recovered task change {msg_id}: {e}")
1251
-
1252
- async def _database_maintenance(self):
1253
- """定期执行数据库维护任务"""
1254
- last_analyze_time = 0
1255
- analyze_interval = 7200 # 每2小时执行一次ANALYZE
1256
-
1257
- while self._running:
1258
- try:
1259
- current_time = time.time()
1260
-
1261
- if current_time - last_analyze_time > analyze_interval:
1262
- async with self.AsyncSessionLocal() as session:
1263
- logger.debug("Running ANALYZE on tasks and task_runs tables...")
1264
- await session.execute(text("ANALYZE tasks"))
1265
- await session.execute(text("ANALYZE task_runs"))
1266
- await session.commit()
1267
- logger.debug("ANALYZE completed successfully for both tables")
1268
- last_analyze_time = current_time
1269
-
1270
- await asyncio.sleep(300) # 每5分钟检查一次
1271
-
1272
- except Exception as e:
1273
- logger.error(f"Error in database maintenance: {e}")
1274
- await asyncio.sleep(60)
1275
-
1276
- async def _stream_backlog_monitor(self):
1277
- """Stream积压监控任务 - 使用分布式锁确保只有一个实例采集"""
1278
- # await asyncio.sleep(10) # 启动后延迟10秒开始
1279
-
1280
- while self._running:
1281
- try:
1282
- # 尝试获取分布式锁
1283
- lock_acquired = await self._try_acquire_monitor_lock()
1284
-
1285
- if lock_acquired:
1286
- try:
1287
- logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
1288
- await self._collect_stream_backlog_metrics()
1289
- logger.debug("Stream backlog metrics collected successfully")
1290
- finally:
1291
- # 释放锁
1292
- await self._release_monitor_lock()
1293
- else:
1294
- logger.debug("Another instance is collecting backlog metrics, skipping...")
1295
-
1296
- # 等待下一次采集
1297
- await asyncio.sleep(self.backlog_monitor_interval)
1298
-
1299
- except Exception as e:
1300
- logger.error(f"Error in stream backlog monitor: {e}")
1301
- await asyncio.sleep(30) # 出错后等待30秒
1302
-
1303
- async def _try_acquire_monitor_lock(self) -> bool:
1304
- """尝试获取监控锁(使用Redis原生锁)"""
1305
- try:
1306
- # 使用SET NX EX命令实现分布式锁
1307
- # NX: 只在键不存在时设置
1308
- # EX: 设置过期时间(秒)
1309
- result = await self.redis_client.set(
1310
- self.backlog_monitor_lock_key.encode(),
1311
- self.node_id.encode(), # 锁的值为当前节点ID
1312
- nx=True, # 只在不存在时设置
1313
- ex=self.backlog_monitor_lock_ttl # 过期时间
1314
- )
1315
- return result is not None
1316
- except Exception as e:
1317
- logger.error(f"Error acquiring monitor lock: {e}")
1318
- return False
1319
-
1320
- async def _release_monitor_lock(self):
1321
- """释放监控锁(只释放自己持有的锁)"""
1322
- try:
1323
- # 使用Lua脚本确保只释放自己持有的锁
1324
- lua_script = """
1325
- if redis.call("get", KEYS[1]) == ARGV[1] then
1326
- return redis.call("del", KEYS[1])
1327
- else
1328
- return 0
1329
- end
1330
- """
1331
- await self.redis_client.eval(
1332
- lua_script,
1333
- 1,
1334
- self.backlog_monitor_lock_key.encode(),
1335
- self.node_id.encode()
1336
- )
1337
- except Exception as e:
1338
- logger.error(f"Error releasing monitor lock: {e}")
1339
-
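The two methods above form the standard single-writer lock: `SET key value NX EX ttl` to acquire, and a Lua compare-and-delete so an instance can only release a lock it still owns. A standalone sketch assuming `redis.asyncio` (key and TTL values are illustrative):

```python
# SET NX EX + Lua compare-and-delete distributed lock sketch.
import redis.asyncio as redis

RELEASE_LUA = """
if redis.call("get", KEYS[1]) == ARGV[1] then
    return redis.call("del", KEYS[1])
else
    return 0
end
"""


async def try_acquire(r: redis.Redis, lock_key: str, owner_id: str, ttl_seconds: int) -> bool:
    # Succeeds only if the key does not exist yet; the TTL guards against crashed holders.
    return await r.set(lock_key, owner_id, nx=True, ex=ttl_seconds) is not None


async def release(r: redis.Redis, lock_key: str, owner_id: str) -> bool:
    # The Lua script keeps "check owner" and "delete" atomic, so this instance cannot
    # delete a lock that expired and was re-acquired by another node in the meantime.
    return bool(await r.eval(RELEASE_LUA, 1, lock_key, owner_id))
```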
1340
-    async def _collect_stream_backlog_metrics(self):
-        """Collect stream backlog metrics and persist them to the database (offset-based)."""
-        try:
-            # Fetch the latest published offset for every queue (QUEUE_OFFSETS)
-            queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
-            queue_offsets = {}
-            try:
-                # The client uses decode_responses=False, so decode manually
-                raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
-                for k, v in raw_queue_offsets.items():
-                    queue_name = k.decode() if isinstance(k, bytes) else k
-                    offset_value = v.decode() if isinstance(v, bytes) else v
-                    queue_offsets[queue_name] = int(offset_value)
-            except Exception as e:
-                logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")
-
-            # Fetch the consumed offset for every task group (TASK_OFFSETS)
-            task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
-            task_offsets = {}
-            try:
-                raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
-                for k, v in raw_task_offsets.items():
-                    task_key = k.decode() if isinstance(k, bytes) else k
-                    offset_value = v.decode() if isinstance(v, bytes) else v
-                    task_offsets[task_key] = int(offset_value)
-            except Exception as e:
-                logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")
-
1368
-            # Use the stream registry instead of SCAN to discover queues
-            stream_info_map = {}  # {queue_name: [(stream_key, priority), ...]}
-
-            # Fetch the stream registry from Redis (a hash)
-            # Format: {"queue_name:priority": "stream_key"}
-            # For regular queues the priority is 0
-            stream_registry = await self.redis_client.hgetall(self.stream_registry_key.encode())
-
-            for queue_priority_bytes, stream_key_bytes in stream_registry.items():
-                queue_priority_str = queue_priority_bytes.decode() if isinstance(queue_priority_bytes, bytes) else str(queue_priority_bytes)
-                stream_key = stream_key_bytes.decode() if isinstance(stream_key_bytes, bytes) else str(stream_key_bytes)
-
-                # Split out queue_name and priority
-                if ':' in queue_priority_str:
-                    parts = queue_priority_str.rsplit(':', 1)
-                    if len(parts) == 2 and parts[1].isdigit():
-                        queue_name = parts[0]
-                        priority = int(parts[1])
-                    else:
-                        # Last segment is not numeric: a regular queue name that happens to contain a colon
-                        queue_name = queue_priority_str
-                        priority = 0
-                else:
-                    # Regular queue
-                    queue_name = queue_priority_str
-                    priority = 0
-
-                if queue_name not in stream_info_map:
-                    stream_info_map[queue_name] = []
-                stream_info_map[queue_name].append((stream_key, priority))
-
1399
-            # If the stream registry is empty, fall back to a one-time SCAN to initialize it (first run only)
-            if not stream_info_map:
-                logger.warning("Stream registry is empty, performing one-time scan initialization...")
-                pattern = f"{self.prefix}:QUEUE:*".encode()
-                cursor = 0
-
-                while True:
-                    cursor, keys = await self.redis_client.scan(cursor, match=pattern, count=10000)
-
-                    for key in keys:
-                        key_str = key.decode()
-                        # Strip the "prefix:QUEUE:" prefix
-                        queue_part = key_str.replace(f"{self.prefix}:QUEUE:", "")
-
-                        # Check whether this is a priority queue (format: queue_name:priority)
-                        parts = queue_part.split(':')
-                        if len(parts) == 2 and parts[1].isdigit():
-                            # Priority queue
-                            queue_name = parts[0]
-                            priority = int(parts[1])
-                            queue_priority_key = f"{queue_name}:{priority}"
-                        elif ':' not in queue_part:
-                            # Regular queue (no colon)
-                            queue_name = queue_part
-                            priority = 0
-                            queue_priority_key = queue_name
-                        else:
-                            # Ignore keys in other formats (consumer groups, etc.)
-                            continue
-
-                        if queue_name not in stream_info_map:
-                            stream_info_map[queue_name] = []
-                        stream_info_map[queue_name].append((key, priority))
-
-                    if cursor == 0:
-                        break
-
-                # Add the discovered streams to the registry
-                if stream_info_map:
-                    pipeline = self.redis_client.pipeline()
-                    for queue_name, stream_list in stream_info_map.items():
-                        for stream_key, priority in stream_list:
-                            if priority > 0:
-                                queue_priority_key = f"{queue_name}:{priority}"
-                            else:
-                                queue_priority_key = queue_name
-                            # stream_key is already bytes (as returned by scan)
-                            if isinstance(stream_key, str):
-                                stream_key = stream_key.encode()
-                            pipeline.hset(self.stream_registry_key.encode(), queue_priority_key.encode(), stream_key)
-                    await pipeline.execute()
-                    logger.info(f"Registered {sum(len(stream_list) for stream_list in stream_info_map.values())} streams to registry during initialization")
-
1452
-            if not stream_info_map:
-                logger.debug("No streams found in registry for backlog monitoring")
-                return
-
-            # Debug logging (kept at debug level to avoid log spam)
-            logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
-            for queue_name, stream_list in stream_info_map.items():
-                priorities = [p for _, p in stream_list]
-                # Pick out non-zero priorities (0 means a regular queue)
-                high_priorities = [p for p in priorities if p > 0]
-                if high_priorities:
-                    logger.debug(f" - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
-                else:
-                    logger.debug(f" - {queue_name}: regular queue only (priority=0)")
-
-            # Collect metrics for every queue (across all priorities)
-            metrics = []
-            current_time = datetime.now(timezone.utc)
-
1471
-            for queue_name, stream_list in stream_info_map.items():
-                # Handle each priority queue separately
-                for stream_key, priority in stream_list:
-                    try:
-                        # Look up the latest published offset for this queue (priority-aware)
-                        if priority > 0:
-                            # Priority queue keys have the form queue_name:priority
-                            queue_key = f"{queue_name}:{priority}"
-                        else:
-                            queue_key = queue_name
-                        last_published_offset = queue_offsets.get(queue_key, 0)
-
-                        # Fetch stream info
-                        stream_info = await self.redis_client.xinfo_stream(stream_key)
-                        stream_length = stream_info.get(b'length', 0)
-
-                        # Fetch consumer group info
-                        has_consumer_groups = False
-                        try:
-                            groups = await self.redis_client.xinfo_groups(stream_key)
-
-                            for group in groups:
-                                # Normalize the group name
-                                raw_name = group.get('name', b'')
-                                if isinstance(raw_name, bytes):
-                                    group_name = raw_name.decode() if raw_name else ''
-                                else:
-                                    group_name = str(raw_name) if raw_name else ''
-
-                                if not group_name:
-                                    group_name = 'unknown'
-
-                                # Filter out internal consumer groups
-                                if is_internal_consumer(group_name):
-                                    # logger.info(f"Skipping internal consumer group: {group_name}")
-                                    continue
-
-                                # 'pending' is already an int
-                                pending_count = group.get('pending', 0)
-
-                                # Look up this group's consumed offset in TASK_OFFSETS
-                                # Key format: f"{queue_name}:{group_name}" (priority is not included)
-                                task_offset_key = f"{queue_name}:{group_name}"
-                                last_acked_offset = task_offsets.get(task_offset_key, 0)
-
1516
-                                # Compute the backlog metrics
-                                # 1. Total backlog = latest published offset - the group's acked offset
-                                total_backlog = max(0, last_published_offset - last_acked_offset)
-
-                                # 2. Undelivered backlog = total backlog - pending count
-                                backlog_undelivered = max(0, total_backlog - pending_count)
-
-                                # 3. Delivered but unacknowledged = pending count
-                                backlog_delivered_unacked = pending_count
-
-                                # 4. Delivered offset = acked offset + pending count
-                                last_delivered_offset = last_acked_offset + pending_count
-
-                                # Create one record per consumer group
-                                metrics.append({
-                                    'namespace': self.namespace_name,
-                                    'stream_name': queue_name,
-                                    'priority': priority,  # priority column
-                                    'consumer_group': group_name,
-                                    'last_published_offset': last_published_offset,
-                                    'last_delivered_offset': last_delivered_offset,
-                                    'last_acked_offset': last_acked_offset,
-                                    'pending_count': pending_count,
-                                    'backlog_undelivered': backlog_undelivered,
-                                    'backlog_unprocessed': total_backlog,
-                                    'created_at': current_time
-                                })
-                                has_consumer_groups = True
-
-                        except Exception as e:
-                            # This stream has no consumer groups
-                            logger.debug(f"No consumer groups for stream {stream_key.decode()}: {e}")
-
-                        # Without consumer groups, store stream-level metrics instead
-                        if not has_consumer_groups and last_published_offset > 0:
-                            metrics.append({
-                                'namespace': self.namespace_name,
-                                'stream_name': queue_name,
-                                'priority': priority,  # priority column
-                                'consumer_group': None,
-                                'last_published_offset': last_published_offset,
-                                'last_delivered_offset': 0,
-                                'last_acked_offset': 0,
-                                'pending_count': 0,
-                                'backlog_undelivered': last_published_offset,
-                                'backlog_unprocessed': last_published_offset,
-                                'created_at': current_time
-                            })
-
-                    except Exception as e:
-                        logger.error(f"Error collecting metrics for stream {stream_key.decode()}: {e}")
-                        continue
-
-            # Persist the metrics to the database
-            if metrics:
-                await self._save_backlog_metrics(metrics)
-                # logger.info(f"Collected backlog metrics for {len(metrics)} stream/group combinations {time.time() }")
-
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            logger.error(f"Error collecting stream backlog metrics: {e}")
-
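For reference, the backlog figures computed above are plain integer arithmetic over three inputs: the queue's last published offset (QUEUE_OFFSETS), the group's last acknowledged offset (TASK_OFFSETS), and the group's pending count from XINFO GROUPS. A small, self-contained restatement with hypothetical numbers:

def backlog_metrics(last_published: int, last_acked: int, pending: int) -> dict:
    """Derive backlog figures from queue and consumer-group offsets."""
    total_backlog = max(0, last_published - last_acked)   # published but not yet acked
    undelivered = max(0, total_backlog - pending)         # not yet handed to any consumer
    delivered_unacked = pending                           # handed out, awaiting ack
    last_delivered = last_acked + pending                 # inferred delivery watermark
    return {
        "backlog_unprocessed": total_backlog,
        "backlog_undelivered": undelivered,
        "backlog_delivered_unacked": delivered_unacked,
        "last_delivered_offset": last_delivered,
    }

# e.g. published=1000, acked=940, pending=25 -> unprocessed=60, undelivered=35, delivered watermark=965
print(backlog_metrics(1000, 940, 25))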
1579
-    async def _save_backlog_metrics(self, metrics: List[Dict]):
-        """Persist backlog metrics to the database (only rows whose values changed)."""
-        if not metrics:
-            return
-
-        # logger.info(f"Processing {len(metrics)} metrics for deduplication")
-
-        try:
-            async with self.AsyncSessionLocal() as session:
-                # New records to be saved
-                metrics_to_save = []
-
-                # Batch the lookups for performance
-                metric_keys = {}  # for fast lookup
-
-                for metric in metrics:
-                    # Build the unique key: namespace + stream_name + consumer_group + priority
-                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
-                    metric_keys[unique_key] = metric
-
-                # logger.info(f"Checking {len(metric_keys)} unique metric combinations")
-
-                # Query the latest records in batches to keep the SQL short
-                last_records = {}
-                metric_list = list(metric_keys.values())
-                batch_size = 50  # 50 combinations per query
-
-                for i in range(0, len(metric_list), batch_size):
-                    batch = metric_list[i:i + batch_size]
-
-                    # Build a parameterized query
-                    conditions = []
-                    params = {}
-                    for idx, metric in enumerate(batch):
-                        param_prefix = f"p{i + idx}"
-                        conditions.append(f"""
-                            (namespace = :{param_prefix}_ns
-                             AND stream_name = :{param_prefix}_sn
-                             AND consumer_group = :{param_prefix}_cg
-                             AND priority = :{param_prefix}_pr)
-                        """)
-                        params[f"{param_prefix}_ns"] = metric['namespace']
-                        params[f"{param_prefix}_sn"] = metric['stream_name']
-                        params[f"{param_prefix}_cg"] = metric['consumer_group']
-                        params[f"{param_prefix}_pr"] = metric['priority']
-
-                    if conditions:
-                        # Use a window function to pick the latest record per combination
-                        query_sql = text(f"""
-                            WITH latest_records AS (
-                                SELECT
-                                    namespace,
-                                    stream_name,
-                                    consumer_group,
-                                    priority,
-                                    last_published_offset,
-                                    last_delivered_offset,
-                                    last_acked_offset,
-                                    pending_count,
-                                    backlog_undelivered,
-                                    backlog_unprocessed,
-                                    ROW_NUMBER() OVER (
-                                        PARTITION BY namespace, stream_name, consumer_group, priority
-                                        ORDER BY created_at DESC
-                                    ) as rn
-                                FROM stream_backlog_monitor
-                                WHERE ({' OR '.join(conditions)})
-                            )
-                            SELECT
-                                namespace,
-                                stream_name,
-                                consumer_group,
-                                priority,
-                                last_published_offset,
-                                last_delivered_offset,
-                                last_acked_offset,
-                                pending_count,
-                                backlog_undelivered,
-                                backlog_unprocessed
-                            FROM latest_records
-                            WHERE rn = 1
-                        """)
-
-                        result = await session.execute(query_sql, params)
-                        for row in result:
-                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
-                            last_records[key] = row
-                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")
-
1668
-                # Deduplicate: decide for each metric whether it needs saving
-                for unique_key, metric in metric_keys.items():
-                    should_save = False
-
-                    if unique_key not in last_records:
-                        # No previous record, so save it
-                        should_save = True
-                        # logger.info(f"New metric for {unique_key}, will save")
-                    else:
-                        # Compare the key counters for changes
-                        last_record = last_records[unique_key]
-
-                        # Verbose debug logging
-                        changes = []
-                        logger.debug(f"Comparing for {unique_key}:")
-                        logger.debug(f"  DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
-                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
-                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
-                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
-                        logger.debug(f"  New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
-                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
-                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
-                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")
-
-                        # Normalize types before comparing (cast everything to int)
-                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
-                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0
-
-                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
-                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0
-
-                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
-                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0
-
-                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
-                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0
-
-                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
-                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0
-
-                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
-                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0
-
-                        if db_published != new_published:
-                            changes.append(f"published: {db_published} -> {new_published}")
-                        if db_delivered != new_delivered:
-                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
-                        if db_acked != new_acked:
-                            changes.append(f"acked: {db_acked} -> {new_acked}")
-                        if db_pending != new_pending:
-                            changes.append(f"pending: {db_pending} -> {new_pending}")
-                        if db_undelivered != new_undelivered:
-                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
-                        if db_unprocessed != new_unprocessed:
-                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")
-
-                        if changes:
-                            should_save = True
-                            # logger.info(f"Metric changed for {unique_key}: {', '.join(changes)}")
-                        else:
-                            logger.debug(f"Metric unchanged for {unique_key}, skipping")
-
-                    if should_save:
-                        metrics_to_save.append(metric)
-
-                # Bulk-insert the monitoring rows that changed
-                if metrics_to_save:
-                    insert_sql = text("""
-                        INSERT INTO stream_backlog_monitor
-                        (namespace, stream_name, priority, consumer_group, last_published_offset,
-                         last_delivered_offset, last_acked_offset, pending_count,
-                         backlog_undelivered, backlog_unprocessed, created_at)
-                        VALUES
-                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
-                         :last_delivered_offset, :last_acked_offset, :pending_count,
-                         :backlog_undelivered, :backlog_unprocessed, :created_at)
-                    """)
-
-                    # Insert row by row (this execute call does not take a list of parameter sets)
-                    for metric_data in metrics_to_save:
-                        await session.execute(insert_sql, metric_data)
-
-                    await session.commit()
-                    # logger.info(f"Saved {len(metrics_to_save)} changed metrics out of {len(metrics)} total")
-                else:
-                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")
-
-        except Exception as e:
-            logger.error(f"Error saving backlog metrics to database: {e}")
-
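The save path above is change-detection: a windowed query (ROW_NUMBER() ... WHERE rn = 1) fetches the latest stored row per (namespace, stream, group, priority), and a new row is inserted only when one of the monitored counters differs. The comparison itself reduces to something like this sketch; the helper is illustrative, with field names mirroring the columns above:

from typing import Optional

MONITORED_FIELDS = (
    "last_published_offset", "last_delivered_offset", "last_acked_offset",
    "pending_count", "backlog_undelivered", "backlog_unprocessed",
)

def has_changed(previous: Optional[dict], current: dict) -> bool:
    """Return True when any monitored counter differs from the last stored row."""
    if previous is None:
        return True  # first sample for this (namespace, stream, group, priority)
    return any(int(previous.get(f) or 0) != int(current.get(f) or 0) for f in MONITORED_FIELDS)

# Only samples where has_changed(...) is True would be added to the INSERT batch.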
1758
-    def _parse_stream_message(self, task_id: str, data: dict) -> Optional[dict]:
-        """Parse a stream message into task info (returns the full set of fields)."""
-        try:
-            from jettask.utils.serializer import loads_str
-            if b'data' in data:
-                task_data = loads_str(data[b'data'])
-            else:
-                task_data = {}
-                for k, v in data.items():
-                    key = k.decode('utf-8') if isinstance(k, bytes) else k
-                    if isinstance(v, bytes):
-                        try:
-                            value = loads_str(v)
-                        except Exception:
-                            value = str(v)
-                    else:
-                        value = v
-                    task_data[key] = value
-            # If a namespace is configured, check whether the message belongs to it
-            # if self.namespace_id:
-            #     msg_namespace_id = task_data.get('__namespace_id')
-            #     # Skip if the message has no namespace_id and this is not the default namespace
-            #     if msg_namespace_id != self.namespace_id:
-            #         if not (msg_namespace_id is None and self.namespace_id == 'default'):
-            #             logger.debug(f"Skipping message from different namespace: {msg_namespace_id} != {self.namespace_id}")
-            #             return None
-            queue_name = task_data['queue']
-            task_name = task_data.get('name', task_data.get('task', 'unknown'))
-            created_at = None
-            if 'trigger_time' in task_data:
-                try:
-                    timestamp = float(task_data['trigger_time'])
-                    created_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)
-                except Exception:
-                    pass
-            # Return the full set of fields, including those that may be None
-            return {
-                'id': task_id,
-                'queue_name': queue_name,
-                'task_name': task_name,
-                'task_data': json.dumps(task_data),
-                'priority': int(task_data.get('priority', 0)),
-                'retry_count': int(task_data.get('retry', 0)),
-                'max_retry': int(task_data.get('max_retry', 3)),
-                'status': 'pending',
-                'result': None,  # a new task has no result yet
-                'error_message': None,  # a new task has no error message yet
-                'created_at': created_at,
-                'started_at': None,  # a new task has not started yet
-                'completed_at': None,  # a new task has not completed yet
-                'scheduled_task_id': task_data.get('scheduled_task_id'),  # scheduler task ID
-                'metadata': json.dumps(task_data.get('metadata', {})),
-                'worker_id': None,  # no worker assigned yet
-                'execution_time': None,  # no execution time yet
-                'duration': None,  # no duration yet
-                'namespace_id': self.namespace_id  # namespace ID
-            }
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            logger.error(f"Error parsing stream message for task {task_id}: {e}")
-            return None
-
-
-
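_parse_stream_message normalizes a raw stream entry (bytes keys and values) into the column dict used for persistence. A simplified sketch of just the decoding step, using json in place of jettask's loads_str purely for illustration:

import json
from datetime import datetime, timezone

def normalize_entry(entry: dict) -> dict:
    """Decode a raw stream entry into str keys and Python values (json stands in for loads_str)."""
    decoded = {}
    for k, v in entry.items():
        key = k.decode() if isinstance(k, bytes) else k
        if isinstance(v, bytes):
            try:
                v = json.loads(v)
            except (ValueError, UnicodeDecodeError):
                v = v.decode(errors="replace")
        decoded[key] = v
    return decoded

# Hypothetical entry: a JSON-encoded queue name and a Unix trigger time
sample = {b"queue": b'"orders"', b"trigger_time": b"1715000000.5"}
task = normalize_entry(sample)
created_at = datetime.fromtimestamp(float(task["trigger_time"]), tz=timezone.utc)
print(task["queue"], created_at)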
1823
-async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig,
-                          consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
-    """Run the PostgreSQL consumer."""
-    # Read the monitoring configuration from environment variables
-    enable_backlog_monitor = os.getenv('JETTASK_ENABLE_BACKLOG_MONITOR', 'true').lower() == 'true'
-    backlog_monitor_interval = int(os.getenv('JETTASK_BACKLOG_MONITOR_INTERVAL', '60'))
-
-    logger.info(f"Backlog monitor config: enabled={enable_backlog_monitor}, interval={backlog_monitor_interval}s")
-
-    consumer = PostgreSQLConsumer(
-        pg_config,
-        redis_config,
-        consumer_strategy=consumer_strategy,
-        enable_backlog_monitor=enable_backlog_monitor,
-        backlog_monitor_interval=backlog_monitor_interval
-    )
-
-    try:
-        await consumer.start()
-        while True:
-            await asyncio.sleep(1)
-
-    except KeyboardInterrupt:
-        logger.debug("Received interrupt signal")
-    finally:
-        await consumer.stop()
-
-
1851
-def main():
-    """Main entry point."""
-    from dotenv import load_dotenv
-
-    load_dotenv()
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
-
-    pg_config = PostgreSQLConfig(
-        host=os.getenv('JETTASK_PG_HOST', 'localhost'),
-        port=int(os.getenv('JETTASK_PG_PORT', '5432')),
-        database=os.getenv('JETTASK_PG_DB', 'jettask'),
-        user=os.getenv('JETTASK_PG_USER', 'jettask'),
-        password=os.getenv('JETTASK_PG_PASSWORD', '123456'),
-    )
-
-    redis_config = RedisConfig(
-        host=os.getenv('REDIS_HOST', 'localhost'),
-        port=int(os.getenv('REDIS_PORT', '6379')),
-        db=int(os.getenv('REDIS_DB', '0')),
-        password=os.getenv('REDIS_PASSWORD'),
-    )
-
-    # Read the consumer strategy from the environment, defaulting to HEARTBEAT
-    strategy_name = os.getenv('JETTASK_CONSUMER_STRATEGY', 'HEARTBEAT').upper()
-    consumer_strategy = ConsumerStrategy.HEARTBEAT  # default
-
-    if strategy_name == 'FIXED':
-        consumer_strategy = ConsumerStrategy.FIXED
-    elif strategy_name == 'POD':
-        consumer_strategy = ConsumerStrategy.POD
-    elif strategy_name == 'HEARTBEAT':
-        consumer_strategy = ConsumerStrategy.HEARTBEAT
-    else:
-        logger.debug(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")
-
-    logger.debug(f"Using consumer strategy: {consumer_strategy.value}")
-
-    asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy))
-
-
-if __name__ == '__main__':
-    main()
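The FIXED/POD/HEARTBEAT branch in main() is a straightforward name-to-enum lookup. An equivalent sketch; the ConsumerStrategy below is a stand-in enum whose member values are assumed, since the real class lives elsewhere in jettask:

from enum import Enum

class ConsumerStrategy(Enum):  # stand-in with the members referenced above
    FIXED = "fixed"
    POD = "pod"
    HEARTBEAT = "heartbeat"

def pick_strategy(name: str) -> ConsumerStrategy:
    """Resolve JETTASK_CONSUMER_STRATEGY, falling back to HEARTBEAT for unknown values."""
    return ConsumerStrategy.__members__.get(name.upper(), ConsumerStrategy.HEARTBEAT)

assert pick_strategy("pod") is ConsumerStrategy.POD
assert pick_strategy("unknown") is ConsumerStrategy.HEARTBEAT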