jettask 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. jettask/constants.py +213 -0
  2. jettask/core/app.py +525 -205
  3. jettask/core/cli.py +193 -185
  4. jettask/core/consumer_manager.py +126 -34
  5. jettask/core/context.py +3 -0
  6. jettask/core/enums.py +137 -0
  7. jettask/core/event_pool.py +501 -168
  8. jettask/core/message.py +147 -0
  9. jettask/core/offline_worker_recovery.py +181 -114
  10. jettask/core/task.py +10 -174
  11. jettask/core/task_batch.py +153 -0
  12. jettask/core/unified_manager_base.py +243 -0
  13. jettask/core/worker_scanner.py +54 -54
  14. jettask/executors/asyncio.py +184 -64
  15. jettask/webui/backend/config.py +51 -0
  16. jettask/webui/backend/data_access.py +2083 -92
  17. jettask/webui/backend/data_api.py +3294 -0
  18. jettask/webui/backend/dependencies.py +261 -0
  19. jettask/webui/backend/init_meta_db.py +158 -0
  20. jettask/webui/backend/main.py +1358 -69
  21. jettask/webui/backend/main_unified.py +78 -0
  22. jettask/webui/backend/main_v2.py +394 -0
  23. jettask/webui/backend/namespace_api.py +295 -0
  24. jettask/webui/backend/namespace_api_old.py +294 -0
  25. jettask/webui/backend/namespace_data_access.py +611 -0
  26. jettask/webui/backend/queue_backlog_api.py +727 -0
  27. jettask/webui/backend/queue_stats_v2.py +521 -0
  28. jettask/webui/backend/redis_monitor_api.py +476 -0
  29. jettask/webui/backend/unified_api_router.py +1601 -0
  30. jettask/webui/db_init.py +204 -32
  31. jettask/webui/frontend/package-lock.json +492 -1
  32. jettask/webui/frontend/package.json +4 -1
  33. jettask/webui/frontend/src/App.css +105 -7
  34. jettask/webui/frontend/src/App.jsx +49 -20
  35. jettask/webui/frontend/src/components/NamespaceSelector.jsx +166 -0
  36. jettask/webui/frontend/src/components/QueueBacklogChart.jsx +298 -0
  37. jettask/webui/frontend/src/components/QueueBacklogTrend.jsx +638 -0
  38. jettask/webui/frontend/src/components/QueueDetailsTable.css +65 -0
  39. jettask/webui/frontend/src/components/QueueDetailsTable.jsx +487 -0
  40. jettask/webui/frontend/src/components/QueueDetailsTableV2.jsx +465 -0
  41. jettask/webui/frontend/src/components/ScheduledTaskFilter.jsx +423 -0
  42. jettask/webui/frontend/src/components/TaskFilter.jsx +425 -0
  43. jettask/webui/frontend/src/components/TimeRangeSelector.css +21 -0
  44. jettask/webui/frontend/src/components/TimeRangeSelector.jsx +160 -0
  45. jettask/webui/frontend/src/components/layout/AppLayout.css +95 -0
  46. jettask/webui/frontend/src/components/layout/AppLayout.jsx +49 -0
  47. jettask/webui/frontend/src/components/layout/Header.css +34 -10
  48. jettask/webui/frontend/src/components/layout/Header.jsx +31 -23
  49. jettask/webui/frontend/src/components/layout/SideMenu.css +137 -0
  50. jettask/webui/frontend/src/components/layout/SideMenu.jsx +209 -0
  51. jettask/webui/frontend/src/components/layout/TabsNav.css +244 -0
  52. jettask/webui/frontend/src/components/layout/TabsNav.jsx +206 -0
  53. jettask/webui/frontend/src/components/layout/UserInfo.css +197 -0
  54. jettask/webui/frontend/src/components/layout/UserInfo.jsx +197 -0
  55. jettask/webui/frontend/src/contexts/NamespaceContext.jsx +72 -0
  56. jettask/webui/frontend/src/contexts/TabsContext.backup.jsx +245 -0
  57. jettask/webui/frontend/src/main.jsx +1 -0
  58. jettask/webui/frontend/src/pages/Alerts.jsx +684 -0
  59. jettask/webui/frontend/src/pages/Dashboard.jsx +1330 -0
  60. jettask/webui/frontend/src/pages/QueueDetail.jsx +1109 -10
  61. jettask/webui/frontend/src/pages/QueueMonitor.jsx +236 -115
  62. jettask/webui/frontend/src/pages/Queues.jsx +5 -1
  63. jettask/webui/frontend/src/pages/ScheduledTasks.jsx +809 -0
  64. jettask/webui/frontend/src/pages/Settings.jsx +800 -0
  65. jettask/webui/frontend/src/services/api.js +7 -5
  66. jettask/webui/frontend/src/utils/suppressWarnings.js +22 -0
  67. jettask/webui/frontend/src/utils/userPreferences.js +154 -0
  68. jettask/webui/multi_namespace_consumer.py +543 -0
  69. jettask/webui/pg_consumer.py +983 -246
  70. jettask/webui/static/dist/assets/index-7129cfe1.css +1 -0
  71. jettask/webui/static/dist/assets/index-8d1935cc.js +774 -0
  72. jettask/webui/static/dist/index.html +2 -2
  73. jettask/webui/task_center.py +216 -0
  74. jettask/webui/task_center_client.py +150 -0
  75. jettask/webui/unified_consumer_manager.py +193 -0
  76. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/METADATA +1 -1
  77. jettask-0.2.4.dist-info/RECORD +134 -0
  78. jettask/webui/pg_consumer_slow.py +0 -1099
  79. jettask/webui/pg_consumer_test.py +0 -678
  80. jettask/webui/static/dist/assets/index-823408e8.css +0 -1
  81. jettask/webui/static/dist/assets/index-9968b0b8.js +0 -543
  82. jettask/webui/test_pg_consumer_recovery.py +0 -547
  83. jettask/webui/test_recovery_simple.py +0 -492
  84. jettask/webui/test_self_recovery.py +0 -467
  85. jettask-0.2.1.dist-info/RECORD +0 -91
  86. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/WHEEL +0 -0
  87. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/entry_points.txt +0 -0
  88. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/licenses/LICENSE +0 -0
  89. {jettask-0.2.1.dist-info → jettask-0.2.4.dist-info}/top_level.txt +0 -0
@@ -4,9 +4,10 @@
4
4
  import asyncio
5
5
  import json
6
6
  import logging
7
+ import msgpack
7
8
  import os
8
9
  import time
9
- from typing import Dict, List, Optional, Any
10
+ from typing import Dict, List, Optional, Any, Set
10
11
  from datetime import datetime, timezone
11
12
  from collections import defaultdict
12
13
 
@@ -19,22 +20,36 @@ from sqlalchemy import text
19
20
  from jettask.webui.config import PostgreSQLConfig, RedisConfig
20
21
  from jettask.core.consumer_manager import ConsumerManager, ConsumerStrategy
21
22
  from jettask.core.offline_worker_recovery import OfflineWorkerRecovery
23
+ from jettask.constants import is_internal_consumer, TASK_STATUS_PRIORITY
22
24
 
23
25
  logger = logging.getLogger(__name__)
24
26
 
27
+ # 注释掉调试文件写入,避免权限问题
28
+ # logger_f = open(f'./pg_consumer.txt', 'a+')
25
29
 
30
+ # 使用 constants.py 中定义的任务状态优先级
31
+ # STATUS_PRIORITY 已从 constants.py 导入为 TASK_STATUS_PRIORITY
26
32
  class PostgreSQLConsumer:
27
- """PostgreSQL消费者,从Redis队列消费任务并持久化到PostgreSQL"""
33
+ """PostgreSQL消费者,从Redis队列消费任务并持久化到PostgreSQL
34
+
35
+ 支持多租户(命名空间)隔离
36
+ """
28
37
 
29
38
  def __init__(self, pg_config: PostgreSQLConfig, redis_config: RedisConfig, prefix: str = "jettask",
30
- node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
39
+ node_id: str = None, consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT,
40
+ namespace_id: str = None, namespace_name: str = None,
41
+ enable_backlog_monitor: bool = True, backlog_monitor_interval: int = 1):
31
42
  self.pg_config = pg_config
32
43
  self.redis_config = redis_config
33
44
  self.prefix = prefix
45
+
46
+ # 命名空间支持
47
+ self.namespace_id = namespace_id
48
+ self.namespace_name = namespace_name or "default"
34
49
  self.redis_client: Optional[Redis] = None
35
50
  self.async_engine = None
36
51
  self.AsyncSessionLocal = None
37
- self.consumer_group = f"{prefix}_pg_consumer1"
52
+ self.consumer_group = f"{prefix}_pg_consumer"
38
53
 
39
54
  # 节点标识
40
55
  import socket
@@ -69,28 +84,42 @@ class PostgreSQLConsumer:
69
84
  self.min_batch_size = 500
70
85
  self.max_batch_size = 5000
71
86
 
87
+ # Stream积压监控配置
88
+ self.enable_backlog_monitor = enable_backlog_monitor # 是否启用积压监控
89
+ self.backlog_monitor_interval = backlog_monitor_interval # 监控采集间隔(秒)
90
+ self.backlog_monitor_lock_key = f"{prefix}:BACKLOG_MONITOR_LOCK" # 分布式锁键
91
+ self.backlog_monitor_lock_ttl = backlog_monitor_interval * 2 # 锁的TTL(秒),设为采集间隔的2倍
92
+
72
93
  async def start(self):
73
94
  """启动消费者"""
74
95
  logger.info(f"Starting PostgreSQL consumer (simplified) on node: {self.node_id}")
75
96
 
76
97
  # 连接Redis
77
- self.redis_client = await redis.Redis(
78
- host=self.redis_config.host,
79
- port=self.redis_config.port,
80
- db=self.redis_config.db,
81
- password=self.redis_config.password,
82
- decode_responses=False
83
- )
98
+ # 构建连接参数,只在密码非空时传递
99
+ async_redis_params = {
100
+ 'host': self.redis_config.host,
101
+ 'port': self.redis_config.port,
102
+ 'db': self.redis_config.db,
103
+ 'decode_responses': False
104
+ }
105
+ if self.redis_config.password:
106
+ async_redis_params['password'] = self.redis_config.password
107
+
108
+ self.redis_client = await redis.Redis(**async_redis_params)
84
109
 
85
110
  # 初始化 ConsumerManager(需要同步的 Redis 客户端)
86
111
  import redis as sync_redis
87
- sync_redis_client = sync_redis.StrictRedis(
88
- host=self.redis_config.host,
89
- port=self.redis_config.port,
90
- db=self.redis_config.db,
91
- password=self.redis_config.password,
92
- decode_responses=True # 使用字符串模式,与其他组件保持一致
93
- )
112
+ # 构建连接参数,只在密码非空时传递
113
+ sync_redis_params = {
114
+ 'host': self.redis_config.host,
115
+ 'port': self.redis_config.port,
116
+ 'db': self.redis_config.db,
117
+ 'decode_responses': True # 使用字符串模式,与其他组件保持一致
118
+ }
119
+ if self.redis_config.password:
120
+ sync_redis_params['password'] = self.redis_config.password
121
+
122
+ sync_redis_client = sync_redis.StrictRedis(**sync_redis_params)
94
123
 
95
124
  # 配置 ConsumerManager
96
125
  # 初始队列列表包含TASK_CHANGES,其他队列会动态添加
@@ -109,7 +138,7 @@ class PostgreSQLConsumer:
109
138
 
110
139
  # 获取稳定的 consumer_id(使用TASK_CHANGES作为基准队列)
111
140
  self.consumer_id = self.consumer_manager.get_consumer_name('TASK_CHANGES')
112
- logger.info(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
141
+ logger.debug(f"Using consumer_id: {self.consumer_id} with strategy: {self.consumer_strategy.value}")
113
142
 
114
143
  # 创建SQLAlchemy异步引擎
115
144
  if self.pg_config.dsn.startswith('postgresql://'):
@@ -127,7 +156,7 @@ class PostgreSQLConsumer:
127
156
  )
128
157
 
129
158
  # 预热连接池
130
- logger.info("Pre-warming database connection pool...")
159
+ logger.debug("Pre-warming database connection pool...")
131
160
  async with self.async_engine.begin() as conn:
132
161
  await conn.execute(text("SELECT 1"))
133
162
 
@@ -138,8 +167,6 @@ class PostgreSQLConsumer:
138
167
  expire_on_commit=False
139
168
  )
140
169
 
141
- # 初始化数据库架构
142
- await self._init_database()
143
170
 
144
171
  self._running = True
145
172
 
@@ -163,16 +190,23 @@ class PostgreSQLConsumer:
163
190
  asyncio.create_task(self._start_offline_recovery()) # 离线worker恢复服务
164
191
  ]
165
192
 
193
+ # 如果启用了积压监控,添加监控任务
194
+ if self.enable_backlog_monitor:
195
+ self._tasks.append(
196
+ asyncio.create_task(self._stream_backlog_monitor()) # Stream积压监控
197
+ )
198
+ logger.info(f"Stream backlog monitor enabled with {self.backlog_monitor_interval}s interval")
199
+
166
200
  # 如果使用 HEARTBEAT 策略,ConsumerManager 会自动管理心跳
167
201
  if self.consumer_strategy == ConsumerStrategy.HEARTBEAT and self.consumer_manager:
168
202
  # 启动心跳(ConsumerManager 内部会处理)
169
- logger.info("Heartbeat is managed by ConsumerManager")
203
+ logger.debug("Heartbeat is managed by ConsumerManager")
170
204
 
171
- logger.info("PostgreSQL consumer started successfully")
205
+ logger.debug("PostgreSQL consumer started successfully")
172
206
 
173
207
  async def stop(self):
174
208
  """停止消费者"""
175
- logger.info("Stopping PostgreSQL consumer...")
209
+ logger.debug("Stopping PostgreSQL consumer...")
176
210
  self._running = False
177
211
 
178
212
  # 停止离线恢复服务
@@ -190,7 +224,7 @@ class PostgreSQLConsumer:
190
224
  if self.consumer_manager:
191
225
  try:
192
226
  self.consumer_manager.cleanup()
193
- logger.info(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
227
+ logger.debug(f"Cleaned up ConsumerManager for consumer: {self.consumer_id}")
194
228
  except Exception as e:
195
229
  logger.error(f"Error cleaning up ConsumerManager: {e}")
196
230
 
@@ -201,38 +235,30 @@ class PostgreSQLConsumer:
201
235
  if self.async_engine:
202
236
  await self.async_engine.dispose()
203
237
 
204
- logger.info("PostgreSQL consumer stopped")
205
-
206
- async def _init_database(self):
207
- """初始化数据库架构"""
208
- # 使用相对于当前文件的路径
209
- import os
210
- current_dir = os.path.dirname(os.path.abspath(__file__))
211
- schema_path = os.path.join(current_dir, "schema.sql")
212
- try:
213
- with open(schema_path, 'r') as f:
214
- schema_sql = f.read()
215
-
216
- async with self.AsyncSessionLocal() as session:
217
- await session.execute(text(schema_sql))
218
- await session.commit()
219
- logger.info("Database schema initialized")
220
- except FileNotFoundError:
221
- logger.warning(f"Schema file not found at {schema_path}, skipping initialization")
222
- except Exception as e:
223
- logger.error(f"Failed to initialize database schema: {e}")
224
-
238
+ logger.debug("PostgreSQL consumer stopped")
239
+
225
240
  async def _initial_queue_discovery(self):
226
241
  """初始队列发现,在启动时执行一次"""
227
242
  try:
228
243
  pattern = f"{self.prefix}:QUEUE:*"
229
244
  new_queues = set()
245
+ logger.info(f"Starting initial queue discovery with pattern: {pattern}")
230
246
 
231
247
  async for key in self.redis_client.scan_iter(match=pattern, count=100):
232
- queue_name = key.decode('utf-8').split(":")[-1]
233
- new_queues.add(queue_name)
248
+ # key中提取队列名,格式可能是:
249
+ # - prefix:QUEUE:queue_name (普通队列)
250
+ # - prefix:QUEUE:queue_name:priority (优先级队列)
251
+ key_str = key.decode('utf-8')
252
+ parts = key_str.split(":")
253
+ if len(parts) >= 3:
254
+ # 去掉前缀和QUEUE部分
255
+ queue_parts = parts[2:] # 从第3部分开始是队列名
256
+ queue_name = ":".join(queue_parts) # 重新组合,保留优先级部分
257
+ new_queues.add(queue_name)
258
+ logger.info(f"Found queue: {queue_name} from key: {key_str}")
234
259
 
235
260
  if new_queues:
261
+ logger.info(f"Initial queue discovery found {len(new_queues)} queues: {new_queues}")
236
262
  # 合并所有队列:TASK_CHANGES + 动态发现的队列
237
263
  all_queues = list(new_queues) + ['TASK_CHANGES']
238
264
 
@@ -256,7 +282,7 @@ class PostgreSQLConsumer:
256
282
  'queues',
257
283
  ','.join(all_queues)
258
284
  )
259
- logger.info(f"Initial queue discovery - found queues: {all_queues}")
285
+ logger.debug(f"Initial queue discovery - found queues: {all_queues}")
260
286
  except Exception as e:
261
287
  logger.error(f"Error updating initial worker queues: {e}")
262
288
 
@@ -271,24 +297,34 @@ class PostgreSQLConsumer:
271
297
  try:
272
298
  pattern = f"{self.prefix}:QUEUE:*"
273
299
  new_queues = set()
274
-
300
+ # logger.info(f'{pattern=}')
275
301
  async for key in self.redis_client.scan_iter(match=pattern, count=100):
276
- queue_name = key.decode('utf-8').split(":")[-1]
277
- new_queues.add(queue_name)
302
+ # key中提取队列名,格式可能是:
303
+ # - prefix:QUEUE:queue_name (普通队列)
304
+ # - prefix:QUEUE:queue_name:priority (优先级队列)
305
+ key_str = key.decode('utf-8')
306
+ parts = key_str.split(":")
307
+ if len(parts) >= 3:
308
+ # 去掉前缀和QUEUE部分
309
+ queue_parts = parts[2:] # 从第3部分开始是队列名
310
+ queue_name = ":".join(queue_parts) # 重新组合,保留优先级部分
311
+ new_queues.add(queue_name)
278
312
 
279
313
  # 为新发现的队列创建消费者组
280
314
  for queue in new_queues - self._known_queues:
315
+ # 正确构建stream_key,保留优先级部分
281
316
  stream_key = f"{self.prefix}:QUEUE:{queue}"
282
317
  try:
283
318
  await self.redis_client.xgroup_create(
284
319
  stream_key, self.consumer_group, id='0', mkstream=True
285
320
  )
286
- logger.info(f"Created consumer group for new queue: {queue}")
321
+ logger.info(f"Created consumer group for new queue: {queue} with stream_key: {stream_key}")
287
322
  except redis.ResponseError:
288
323
  pass
289
324
 
290
325
  # 更新ConsumerManager的队列列表(同步操作)
291
326
  if new_queues != self._known_queues:
327
+ logger.info(f"Queue discovery: found {len(new_queues)} queues: {new_queues}")
292
328
  # 合并所有队列:TASK_CHANGES + 动态发现的队列
293
329
  all_queues = list(new_queues) + ['TASK_CHANGES']
294
330
 
@@ -312,7 +348,7 @@ class PostgreSQLConsumer:
312
348
  'queues',
313
349
  ','.join(all_queues)
314
350
  )
315
- logger.info(f"Updated ConsumerManager queues: {all_queues}")
351
+ logger.debug(f"Updated ConsumerManager queues: {all_queues}")
316
352
  except Exception as e:
317
353
  logger.error(f"Error updating worker queues: {e}")
318
354
 
@@ -326,8 +362,22 @@ class PostgreSQLConsumer:
326
362
  await asyncio.sleep(10)
327
363
 
328
364
  async def _consume_queue(self, queue_name: str):
329
- """消费单个队列的任务"""
330
- stream_key = f"{self.prefix}:QUEUE:{queue_name}"
365
+ """消费单个队列的任务(包括优先级队列)"""
366
+ # logger.info(f"Starting to consume queue: {queue_name}")
367
+ # 判断是否是优先级队列
368
+ is_priority_queue = ':' in queue_name and queue_name.rsplit(':', 1)[-1].isdigit()
369
+
370
+ if is_priority_queue:
371
+ # 优先级队列格式:base_queue:priority (如 robust_bench2:2)
372
+ base_queue = queue_name.rsplit(':', 1)[0]
373
+ priority = queue_name.rsplit(':', 1)[1]
374
+ stream_key = f"{self.prefix}:QUEUE:{base_queue}:{priority}"
375
+ else:
376
+ # 普通队列
377
+ stream_key = f"{self.prefix}:QUEUE:{queue_name}"
378
+
379
+ logger.debug(f"Consuming queue: {queue_name}, stream_key: {stream_key}, is_priority: {is_priority_queue}")
380
+
331
381
  check_backlog = True
332
382
  lastid = "0-0"
333
383
 
@@ -350,7 +400,6 @@ class PostgreSQLConsumer:
350
400
  count=10000,
351
401
  block=1000 if not check_backlog else 0
352
402
  )
353
-
354
403
  if not messages or (messages and len(messages[0][1]) == 0):
355
404
  check_backlog = False
356
405
  continue
@@ -369,7 +418,7 @@ class PostgreSQLConsumer:
369
418
  await self.redis_client.xgroup_create(
370
419
  stream_key, self.consumer_group, id='0', mkstream=True
371
420
  )
372
- logger.info(f"Recreated consumer group for queue: {queue_name}")
421
+ logger.debug(f"Recreated consumer group for queue: {queue_name}")
373
422
  check_backlog = True
374
423
  lastid = "0-0"
375
424
  except:
@@ -379,7 +428,7 @@ class PostgreSQLConsumer:
379
428
  self._consecutive_errors[queue_name] += 1
380
429
 
381
430
  if self._consecutive_errors[queue_name] > 10:
382
- logger.warning(f"Too many errors for queue {queue_name}, will retry later")
431
+ logger.debug(f"Too many errors for queue {queue_name}, will retry later")
383
432
  await asyncio.sleep(30)
384
433
  self._consecutive_errors[queue_name] = 0
385
434
 
@@ -392,19 +441,18 @@ class PostgreSQLConsumer:
392
441
  """启动所有队列的消费任务"""
393
442
  discover_task = asyncio.create_task(self._discover_queues())
394
443
  queue_tasks = {}
395
-
396
444
  while self._running:
397
445
  try:
398
446
  for queue in self._known_queues:
399
447
  if queue not in queue_tasks or queue_tasks[queue].done():
400
448
  queue_tasks[queue] = asyncio.create_task(self._consume_queue(queue))
401
- logger.info(f"Started consumer task for queue: {queue}")
449
+ logger.debug(f"Started consumer task for queue: {queue}")
402
450
 
403
451
  for queue in list(queue_tasks.keys()):
404
452
  if queue not in self._known_queues:
405
453
  queue_tasks[queue].cancel()
406
454
  del queue_tasks[queue]
407
- logger.info(f"Stopped consumer task for removed queue: {queue}")
455
+ logger.debug(f"Stopped consumer task for removed queue: {queue}")
408
456
 
409
457
  await asyncio.sleep(10)
410
458
 
@@ -428,7 +476,6 @@ class PostgreSQLConsumer:
428
476
  continue
429
477
 
430
478
  stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else stream_key
431
- queue_name = stream_key_str.split(":")[-1]
432
479
  msg_ids_to_ack = []
433
480
 
434
481
  for msg_id, data in stream_messages:
@@ -439,7 +486,7 @@ class PostgreSQLConsumer:
439
486
  msg_id_str = msg_id.decode('utf-8') if isinstance(msg_id, bytes) else str(msg_id)
440
487
 
441
488
  # 使用公共方法解析消息
442
- task_info = self._parse_stream_message(msg_id_str, data, queue_name)
489
+ task_info = self._parse_stream_message(msg_id_str, data)
443
490
  if task_info:
444
491
  tasks_to_insert.append(task_info)
445
492
  msg_ids_to_ack.append(msg_id)
@@ -479,23 +526,92 @@ class PostgreSQLConsumer:
479
526
  logger.error(f"Error executing batch ACK: {e}")
480
527
 
481
528
  async def _insert_tasks(self, tasks: List[Dict[str, Any]]):
482
- """批量插入任务到PostgreSQL"""
529
+ """批量插入任务到PostgreSQL(只处理tasks表)"""
483
530
  if not tasks:
484
531
  return
485
532
 
533
+ logger.info(f"Attempting to insert {len(tasks)} tasks to tasks table")
486
534
  try:
487
535
  async with self.AsyncSessionLocal() as session:
488
- query = text("""
489
- INSERT INTO tasks (id, queue_name, task_name, task_data, priority,
490
- retry_count, max_retry, status, metadata, created_at)
491
- VALUES (:id, :queue_name, :task_name, CAST(:task_data AS jsonb), :priority,
492
- :retry_count, :max_retry, :status, CAST(:metadata AS jsonb), :created_at)
493
- ON CONFLICT (id) DO NOTHING;
536
+ # 插入tasks表 - 使用批量INSERT忽略冲突
537
+ # 由于stream_id在实践中是唯一的,我们可以简单地忽略重复
538
+ tasks_query = text("""
539
+ INSERT INTO tasks (stream_id, queue, namespace, scheduled_task_id,
540
+ payload, priority, created_at, source, metadata)
541
+ VALUES (:stream_id, :queue, :namespace, :scheduled_task_id,
542
+ CAST(:payload AS jsonb), :priority, :created_at, :source, CAST(:metadata AS jsonb))
543
+ ON CONFLICT DO NOTHING
544
+ RETURNING stream_id;
494
545
  """)
495
546
 
496
- await session.execute(query, tasks)
497
- await session.commit()
498
- logger.info(f"Batch inserted {len(tasks)} tasks to PostgreSQL")
547
+ # 准备tasks表的数据
548
+ tasks_data = []
549
+ for task in tasks:
550
+ task_data = json.loads(task['task_data'])
551
+
552
+ # 从task_data中获取scheduled_task_id
553
+ scheduled_task_id = task_data.get('scheduled_task_id') or task.get('scheduled_task_id')
554
+
555
+ # 根据是否有scheduled_task_id来判断任务来源
556
+ if scheduled_task_id:
557
+ source = 'scheduler' # 定时任务
558
+ else:
559
+ source = 'redis_stream' # 普通任务
560
+
561
+ tasks_data.append({
562
+ 'stream_id': task['id'], # Redis Stream ID作为stream_id
563
+ 'queue': task['queue_name'],
564
+ 'namespace': self.namespace_name,
565
+ 'scheduled_task_id': str(scheduled_task_id) if scheduled_task_id else None,
566
+ 'payload': task['task_data'], # 完整的任务数据
567
+ 'priority': task['priority'],
568
+ 'created_at': task['created_at'],
569
+ 'source': source,
570
+ 'metadata': task.get('metadata', '{}')
571
+ })
572
+
573
+ # 批量插入 - 使用executemany提高性能
574
+ logger.debug(f"Executing batch insert with {len(tasks_data)} tasks")
575
+
576
+ try:
577
+ # 使用executemany批量插入
578
+ result = await session.execute(tasks_query, tasks_data)
579
+
580
+ # 获取实际插入的记录数
581
+ inserted_count = result.rowcount
582
+
583
+ # if inserted_count > 0:
584
+ # logger.info(f"Successfully inserted {inserted_count} new tasks to tasks table")
585
+ # else:
586
+ # logger.info(f"No new tasks inserted (all may be duplicates)")
587
+
588
+ await session.commit()
589
+ logger.debug("Tasks table batch insert transaction completed")
590
+
591
+ except Exception as e:
592
+ logger.error(f"Error in batch insert, trying fallback: {e}")
593
+ await session.rollback()
594
+
595
+ # 如果批量插入失败,降级为小批量插入(每批10条)
596
+ batch_size = 10
597
+ total_inserted = 0
598
+
599
+ for i in range(0, len(tasks_data), batch_size):
600
+ batch = tasks_data[i:i+batch_size]
601
+ try:
602
+ result = await session.execute(tasks_query, batch)
603
+ batch_inserted = result.rowcount
604
+ if batch_inserted > 0:
605
+ total_inserted += batch_inserted
606
+ await session.commit()
607
+ except Exception as batch_error:
608
+ logger.error(f"Batch {i//batch_size + 1} failed: {batch_error}")
609
+ await session.rollback()
610
+
611
+ if total_inserted > 0:
612
+ logger.info(f"Fallback insert completed: {total_inserted} tasks inserted")
613
+ else:
614
+ logger.info(f"No new tasks inserted in fallback mode")
499
615
 
500
616
  except Exception as e:
501
617
  logger.error(f"Error inserting tasks to PostgreSQL: {e}")
@@ -507,21 +623,21 @@ class PostgreSQLConsumer:
507
623
 
508
624
  # 使用 ConsumerManager 管理的 consumer name
509
625
  # 这样 ConsumerManager 才能正确跟踪和恢复这个流的待处理消息
510
- consumer_name = self.consumer_manager.get_consumer_name('pg_consumer')
626
+ consumer_name = self.consumer_manager.get_consumer_name('TASK_CHANGES')
511
627
 
512
628
  # 创建消费者组
513
629
  try:
514
630
  await self.redis_client.xgroup_create(
515
631
  change_stream_key, consumer_group, id='0', mkstream=True
516
632
  )
517
- logger.info(f"Created consumer group for task changes stream")
633
+ logger.debug(f"Created consumer group for task changes stream")
518
634
  except redis.ResponseError:
519
635
  pass
520
636
 
521
637
  # 模仿 listen_event_by_task 的写法:先处理pending消息,再处理新消息
522
638
  check_backlog = True
523
639
  lastid = "0-0"
524
- batch_size = 100
640
+ batch_size = 1000
525
641
 
526
642
  while self._running:
527
643
  try:
@@ -549,8 +665,8 @@ class PostgreSQLConsumer:
549
665
  else:
550
666
  check_backlog = False
551
667
 
552
- task_ids_to_update = set()
553
- ack_ids = []
668
+ # 收集消息ID和对应的task_id
669
+ msg_to_task = {} # msg_id -> task_id 映射
554
670
 
555
671
  for _, stream_messages in messages:
556
672
  for msg_id, data in stream_messages:
@@ -561,21 +677,56 @@ class PostgreSQLConsumer:
561
677
  else:
562
678
  lastid = str(msg_id)
563
679
 
564
- event_id = data.get(b'event_id')
565
- if event_id:
566
- if isinstance(event_id, bytes):
567
- event_id = event_id.decode('utf-8')
568
- task_ids_to_update.add(event_id)
569
- ack_ids.append(msg_id)
680
+ task_key = data[b'id']
681
+ task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
682
+
683
+ # 从完整的task_key格式提取stream_id
684
+ # 格式: namespace:TASK:stream_id:queue_name
685
+ stream_id = None
686
+ if ':TASK:' in task_key:
687
+ parts = task_key.split(':TASK:')
688
+ if len(parts) == 2:
689
+ # 再从右边部分提取stream_id
690
+ right_parts = parts[1].split(':')
691
+ if right_parts:
692
+ stream_id = right_parts[0] # 提取stream_id
693
+
694
+ if stream_id:
695
+ # 存储元组: (stream_id, task_key)
696
+ msg_to_task[msg_id] = (stream_id, task_key)
697
+ else:
698
+ logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
570
699
  except Exception as e:
571
- logger.error(f"Error processing change event {msg_id}: {e}")
572
-
573
- if task_ids_to_update:
574
- await self._update_tasks_by_event(list(task_ids_to_update))
575
- logger.info(f"Updated {len(task_ids_to_update)} tasks from change events")
700
+ import traceback
701
+ traceback.print_exc()
702
+ logger.error(f"Error processing change event {msg_id}: {e} {data=}")
703
+ # 解析失败的消息也应该ACK,避免一直重试
704
+ await self.redis_client.xack(change_stream_key, consumer_group, msg_id)
576
705
 
577
- if ack_ids:
578
- await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
706
+ if msg_to_task:
707
+ # 批量更新任务,返回成功更新的task_id列表
708
+ # msg_to_task 的值现在是元组 (stream_id, task_key)
709
+ id_tuples = list(set(msg_to_task.values()))
710
+ logger.info(f"Processing {len(id_tuples)} task updates from change stream")
711
+ # logger_f.write(f'{id_tuples=} \n')
712
+ successful_tuples = await self._update_tasks_by_event(id_tuples)
713
+
714
+ # 只ACK成功更新的消息
715
+ ack_ids = []
716
+ failed_count = 0
717
+ for msg_id, id_tuple in msg_to_task.items():
718
+ if successful_tuples and id_tuple in successful_tuples:
719
+ ack_ids.append(msg_id)
720
+ else:
721
+ failed_count += 1
722
+
723
+ if ack_ids:
724
+ await self.redis_client.xack(change_stream_key, consumer_group, *ack_ids)
725
+ if len(ack_ids) > 0:
726
+ logger.info(f"Updated {len(ack_ids)} task statuses")
727
+
728
+ if failed_count > 0:
729
+ logger.debug(f"Failed to update {failed_count} tasks, will retry")
579
730
 
580
731
  except redis.ResponseError as e:
581
732
  if "NOGROUP" in str(e):
@@ -584,7 +735,7 @@ class PostgreSQLConsumer:
584
735
  await self.redis_client.xgroup_create(
585
736
  change_stream_key, consumer_group, id='0', mkstream=True
586
737
  )
587
- logger.info(f"Recreated consumer group for task changes stream")
738
+ logger.debug(f"Recreated consumer group for task changes stream")
588
739
  check_backlog = True
589
740
  lastid = "0-0"
590
741
  except:
@@ -596,36 +747,114 @@ class PostgreSQLConsumer:
596
747
  logger.error(f"Error in consume_task_changes: {e}", exc_info=True)
597
748
  await asyncio.sleep(1)
598
749
 
599
- async def _update_tasks_by_event(self, task_ids: List[str]):
600
- """基于事件ID批量更新任务状态"""
601
- if not task_ids:
602
- return
750
+ async def _update_tasks_by_event(self, id_tuples: List[tuple]) -> Set[tuple]:
751
+ """基于事件ID批量更新任务状态
752
+
753
+ Args:
754
+ id_tuples: 元组列表,每个元组为 (stream_id, task_key)
755
+
756
+ Returns:
757
+ 成功更新的元组集合
758
+ """
759
+ if not id_tuples:
760
+ return set()
761
+
762
+ successful_tuples = set()
603
763
 
604
764
  try:
605
765
  pipeline = self.redis_client.pipeline()
606
- for task_id in task_ids:
607
- task_key = f"{self.prefix}:TASK:{task_id}"
766
+ for stream_id, task_key in id_tuples:
608
767
  pipeline.hgetall(task_key)
609
768
 
610
769
  redis_values = await pipeline.execute()
611
770
  updates = []
771
+ valid_tuples = [] # 记录有效的元组
772
+ if len(id_tuples) != len(redis_values):
773
+ logger.error(f'Mismatch: {len(id_tuples)=} {len(redis_values)=}')
774
+ # 不抛出异常,继续处理能处理的
612
775
 
613
- for i, task_id in enumerate(task_ids):
776
+ for i, (stream_id, task_key) in enumerate(id_tuples):
777
+ if i >= len(redis_values):
778
+ logger.error(f'Missing redis value for task_key={task_key}')
779
+ continue
780
+
614
781
  hash_data = redis_values[i]
615
782
 
616
783
  if not hash_data:
784
+ logger.debug(f'No hash data for task_key={task_key}')
617
785
  continue
618
786
 
619
- update_info = self._parse_task_hash(task_id, hash_data)
620
- if update_info:
621
- updates.append(update_info)
622
-
787
+ try:
788
+ # 从task_key解析出consumer_group
789
+ # task_key格式: namespace:TASK:stream_id:group_name
790
+ # 其中group_name就是完整的consumer_group(格式: jettask:QUEUE:queue_name:task_name)
791
+ parts = task_key.split(':', 3) # 最多分割成4部分
792
+ if len(parts) == 4:
793
+ # parts[0] = namespace (如 'default')
794
+ # parts[1] = 'TASK'
795
+ # parts[2] = stream_id
796
+ # parts[3] = group_name (consumer_group)
797
+ consumer_group = parts[3] # 直接使用group_name作为consumer_group
798
+ logger.debug(f"Extracted consumer_group from task_key: {consumer_group}")
799
+ else:
800
+ logger.warning(f"Cannot parse consumer_group from task_key: {task_key}")
801
+ continue
802
+
803
+ # 从consumer_group中提取task_name
804
+ # consumer_group格式: prefix:QUEUE:queue:task_name (如 jettask:QUEUE:robust_bench2:robust_benchmark.benchmark_task)
805
+ task_name = None
806
+ if consumer_group:
807
+ parts = consumer_group.split(':')
808
+ if len(parts) >= 4:
809
+ # 最后一部分是task_name
810
+ task_name = parts[-1]
811
+ logger.debug(f"Extracted task_name '{task_name}' from consumer_group '{consumer_group}'")
812
+
813
+ # 使用stream_id作为任务ID
814
+ update_info = self._parse_task_hash(stream_id, hash_data)
815
+ if update_info:
816
+ # 添加consumer_group和task_name到更新信息中
817
+ update_info['consumer_group'] = consumer_group
818
+ update_info['task_name'] = task_name or 'unknown' # 如果无法提取task_name,使用'unknown'
819
+ # consumer_name就是worker_id(执行任务的实际worker)
820
+ update_info['consumer_name'] = update_info.get('worker_id')
821
+ updates.append(update_info)
822
+ valid_tuples.append((stream_id, task_key))
823
+ else:
824
+ logger.debug(f'Failed to parse stream_id={stream_id} hash_data={hash_data}')
825
+ except Exception as e:
826
+ logger.error(f'Error parsing task stream_id={stream_id}: {e}')
827
+ continue
623
828
  if updates:
624
- await self._update_tasks(updates)
625
- logger.debug(f"Updated {len(updates)} tasks from change events")
626
-
829
+ logger.info(f"Attempting to update {len(updates)} tasks, first few: {[u['id'] for u in updates[:3]]}")
830
+ # logger_f.write(f'{updates=} \n')
831
+ try:
832
+ # _update_tasks 现在返回成功更新的ID集合
833
+ batch_successful = await self._update_tasks(updates)
834
+ # 将成功的stream_id映射回元组
835
+ for stream_id in batch_successful:
836
+ for tuple_item in valid_tuples:
837
+ if tuple_item[0] == stream_id: # stream_id匹配
838
+ successful_tuples.add(tuple_item)
839
+ if batch_successful:
840
+ logger.info(f"Successfully updated {len(batch_successful)} tasks from change events")
841
+ else:
842
+ logger.warning(f"No tasks were successfully updated")
843
+ except Exception as e:
844
+ logger.error(f"Error in batch update: {e}")
845
+ # 批量更新失败,尝试逐个更新
846
+ for update, tuple_item in zip(updates, valid_tuples):
847
+ try:
848
+ single_successful = await self._update_tasks([update])
849
+ if update['id'] in single_successful:
850
+ successful_tuples.add(tuple_item)
851
+ except Exception as single_error:
852
+ logger.error(f"Failed to update task {tuple_item[0]}: {single_error}")
853
+
627
854
  except Exception as e:
628
855
  logger.error(f"Error updating tasks by event: {e}", exc_info=True)
856
+ logger.debug(f'{successful_tuples=}')
857
+ return successful_tuples
629
858
 
630
859
  def _parse_task_hash(self, task_id: str, hash_data: dict) -> Optional[dict]:
631
860
  """解析Redis Hash数据"""
@@ -679,13 +908,12 @@ class PostgreSQLConsumer:
679
908
 
680
909
  update_info['worker_id'] = hash_dict.get('consumer') or hash_dict.get('worker_id')
681
910
 
682
- # 转换数值
911
+ # 转换数值 - 直接存储原始秒数值
683
912
  for num_field in ['execution_time', 'duration']:
684
913
  if hash_dict.get(num_field):
685
914
  try:
686
915
  num_str = hash_dict[num_field]
687
- if isinstance(num_str, str) and num_str.startswith("b'") and num_str.endswith("'"):
688
- num_str = num_str[2:-1]
916
+ # 直接存储浮点数秒值
689
917
  update_info[num_field] = float(num_str)
690
918
  except:
691
919
  pass
@@ -707,103 +935,158 @@ class PostgreSQLConsumer:
707
935
 
708
936
  return None
709
937
 
710
- async def _update_tasks(self, updates: List[Dict[str, Any]]):
711
- """批量更新任务状态(处理竞态条件)"""
938
+ async def _update_tasks(self, updates: List[Dict[str, Any]]) -> Set[str]:
939
+ """批量更新任务状态(使用UPSERT逻辑处理task_runs表)
940
+
941
+ Returns:
942
+ 成功更新的stream_id集合
943
+ """
712
944
  if not updates:
713
- return
945
+ return set()
714
946
 
715
947
  try:
716
948
  async with self.AsyncSessionLocal() as session:
717
- # 使用 executemany 批量更新
718
- update_query = text("""
719
- UPDATE tasks SET
720
- status = COALESCE(:status, status),
721
- result = COALESCE(CAST(:result AS jsonb), result),
722
- error_message = COALESCE(:error_message, error_message),
723
- started_at = COALESCE(:started_at, started_at),
724
- completed_at = COALESCE(:completed_at, completed_at),
725
- worker_id = COALESCE(:worker_id, worker_id),
726
- execution_time = COALESCE(:execution_time, execution_time),
727
- duration = COALESCE(:duration, duration)
728
- WHERE id = :id
949
+ # V3结构:使用UPSERT逻辑处理task_runs表
950
+ stream_ids = [u['id'] for u in updates]
951
+ logger.info(f"Upserting {len(stream_ids)} task_runs records")
952
+
953
+ # 对于分区表,我们需要使用不同的UPSERT策略
954
+ # 先尝试UPDATE,如果没有更新到任何行,则INSERT
955
+ upsert_query = text("""
956
+ WITH updated AS (
957
+ UPDATE task_runs SET
958
+ consumer_name = COALESCE(CAST(:consumer_name AS TEXT), consumer_name),
959
+ status = CASE
960
+ WHEN CAST(:status AS TEXT) IS NULL THEN status
961
+ WHEN status = 'pending' THEN COALESCE(CAST(:status AS TEXT), status)
962
+ WHEN status = 'running' AND CAST(:status AS TEXT) IN ('success', 'failed', 'timeout', 'skipped') THEN CAST(:status AS TEXT)
963
+ WHEN status IN ('success', 'failed', 'timeout', 'skipped') THEN status
964
+ ELSE COALESCE(CAST(:status AS TEXT), status)
965
+ END,
966
+ result = CASE
967
+ WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN result
968
+ ELSE COALESCE(CAST(:result AS jsonb), result)
969
+ END,
970
+ error_message = CASE
971
+ WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN error_message
972
+ ELSE COALESCE(CAST(:error_message AS TEXT), error_message)
973
+ END,
974
+ start_time = COALESCE(CAST(:started_at AS TIMESTAMPTZ), start_time),
975
+ end_time = CASE
976
+ WHEN status IN ('success', 'failed', 'timeout', 'skipped') AND CAST(:status AS TEXT) NOT IN ('success', 'failed', 'timeout', 'skipped') THEN end_time
977
+ ELSE COALESCE(CAST(:completed_at AS TIMESTAMPTZ), end_time)
978
+ END,
979
+ worker_id = COALESCE(CAST(:worker_id AS TEXT), worker_id),
980
+ duration = COALESCE(CAST(:duration AS DOUBLE PRECISION), duration),
981
+ execution_time = COALESCE(CAST(:execution_time AS DOUBLE PRECISION), execution_time),
982
+ updated_at = CURRENT_TIMESTAMP
983
+ WHERE stream_id = :stream_id AND consumer_group = :consumer_group
984
+ RETURNING stream_id
985
+ )
986
+ INSERT INTO task_runs (
987
+ stream_id, task_name, consumer_group, consumer_name, status, result, error_message,
988
+ start_time, end_time, worker_id, duration, execution_time,
989
+ created_at, updated_at
990
+ )
991
+ SELECT
992
+ :stream_id, :task_name, :consumer_group, :consumer_name,
993
+ COALESCE(CAST(:status AS TEXT), 'pending'),
994
+ CAST(:result AS jsonb),
995
+ CAST(:error_message AS TEXT),
996
+ CAST(:started_at AS TIMESTAMPTZ),
997
+ CAST(:completed_at AS TIMESTAMPTZ),
998
+ CAST(:worker_id AS TEXT),
999
+ CAST(:duration AS DOUBLE PRECISION),
1000
+ CAST(:execution_time AS DOUBLE PRECISION),
1001
+ CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
1002
+ WHERE NOT EXISTS (SELECT 1 FROM updated)
1003
+ RETURNING stream_id;
729
1004
  """)
730
1005
 
731
- # 批量执行更新
732
- result = await session.execute(update_query, updates)
1006
+ # 为每个更新转换参数名称(从id改为stream_id)
1007
+ run_updates = []
1008
+ for update in updates:
1009
+ run_update = update.copy()
1010
+ run_update['stream_id'] = run_update.pop('id') # 将id改为stream_id
1011
+ # consumer_group 已经在 update_info 中了,不需要额外处理
1012
+ run_updates.append(run_update)
733
1013
 
734
- # 检查受影响的行数
735
- updated_count = result.rowcount
736
- expected_count = len(updates)
1014
+ # 批量执行UPSERT - 使用事务批处理提高性能
1015
+ successful_count = 0
1016
+ batch_size = 20 # 每批处理20条记录
737
1017
 
738
- # 只有当受影响行数与预期不一致时,才去查询具体哪些记录不存在
739
- if updated_count < expected_count:
740
- task_ids = [u['id'] for u in updates]
741
-
742
- # 先使用内存集合进行快速过滤
743
- async with self._processed_ids_lock:
744
- # 过滤出可能存在的ID(在内存集合中的肯定存在)
745
- known_existing_ids = set(task_ids) & self._processed_task_ids
1018
+ for i in range(0, len(run_updates), batch_size):
1019
+ batch = run_updates[i:i+batch_size]
746
1020
 
747
- # 计算可能缺失的ID(不在内存集合中的需要查询确认)
748
- potential_missing_ids = set(task_ids) - known_existing_ids
749
-
750
- if len(known_existing_ids) + updated_count >= expected_count:
751
- # 如果已知存在的ID数量加上更新成功的数量已经达到预期,说明没有缺失
752
- missing_ids = set()
753
- logger.debug(f"Memory cache hit: avoided DB query for {len(known_existing_ids)} IDs")
754
- elif potential_missing_ids:
755
- # 只查询不在内存集合中的ID,减少查询范围
756
- logger.debug(f"Memory cache partial hit: checking {len(potential_missing_ids)} IDs in DB (skipped {len(known_existing_ids)} cached IDs)")
757
- check_query = text("""
758
- SELECT id FROM tasks WHERE id = ANY(:ids)
759
- """)
760
- check_result = await session.execute(check_query, {'ids': list(potential_missing_ids)})
761
- existing_in_db = {row[0] for row in check_result}
762
-
763
- # 更新内存集合(发现的新ID加入集合)
764
- async with self._processed_ids_lock:
765
- self._processed_task_ids.update(existing_in_db)
1021
+ try:
1022
+ # 在一个事务中处理整批
1023
+ for run_update in batch:
1024
+ result = await session.execute(upsert_query, run_update)
1025
+ if result.rowcount > 0:
1026
+ successful_count += 1
766
1027
 
767
- # 找出确实不存在的记录
768
- missing_ids = potential_missing_ids - existing_in_db
769
- else:
770
- missing_ids = set()
771
- else:
772
- # 所有记录都更新成功
773
- missing_ids = set()
774
-
775
- if missing_ids:
776
- # 将缺失的任务更新加入待重试队列,而不是立即创建
777
- async with self._pending_updates_lock:
778
- # 创建更新信息映射
779
- update_map = {u['id']: u for u in updates if u['id'] in missing_ids}
1028
+ # 批量提交
1029
+ await session.commit()
1030
+ logger.debug(f"Batch {i//batch_size + 1} committed: {len(batch)} records")
780
1031
 
781
- for task_id in missing_ids:
782
- if task_id in update_map:
783
- # 如果已经有旧的更新在队列中,新的更新会覆盖它
784
- # 这确保了只有最新的更新会被重试
785
- if task_id in self._pending_updates:
786
- logger.debug(f"Replacing old pending update for task {task_id} with newer one")
787
-
788
- # 保存更新信息,等待重试(会覆盖旧的)
789
- self._pending_updates[task_id] = update_map[task_id]
1032
+ except Exception as e:
1033
+ logger.error(f"Batch {i//batch_size + 1} failed, trying individual records: {e}")
1034
+ await session.rollback()
790
1035
 
791
- # 如果待重试队列过大,清理最早的一半
792
- if len(self._pending_updates) > self._max_pending_updates:
793
- items = list(self._pending_updates.items())
794
- keep_count = self._max_pending_updates // 2
795
- self._pending_updates = dict(items[-keep_count:])
796
- logger.warning(f"Pending updates queue full, kept {keep_count} most recent items")
797
-
798
- logger.info(f"Added {len(missing_ids)} task updates to retry queue")
1036
+ # 如果批处理失败,回退到逐个处理这批记录
1037
+ for run_update in batch:
1038
+ try:
1039
+ result = await session.execute(upsert_query, run_update)
1040
+ await session.commit()
1041
+ if result.rowcount > 0:
1042
+ successful_count += 1
1043
+ except Exception as individual_error:
1044
+ logger.error(f"Individual upsert failed for {run_update.get('stream_id')}: {individual_error}")
1045
+ await session.rollback()
799
1046
 
800
- await session.commit()
1047
+ # 记录成功更新的数量
1048
+ if successful_count > 0:
1049
+ logger.info(f"Upserted {successful_count}/{len(run_updates)} task_runs records")
801
1050
 
802
- # if updated_count > 0:
803
- # logger.info(f"Updated {updated_count} task statuses {updates=}")
1051
+ # 检查哪些任务是完成状态,需要从Redis中删除
1052
+ completed_task_keys = []
1053
+ for update in updates:
1054
+ status = update.get('status')
1055
+ # 如果状态是完成状态(success, error, cancel等)
1056
+ if status in ['success', 'error', 'failed', 'cancel', 'cancelled', 'timeout', 'skipped']:
1057
+ # 构建task_key
1058
+ # task_key格式: namespace:TASK:stream_id:group_name
1059
+ stream_id = update['id']
1060
+ consumer_group = update.get('consumer_group')
1061
+ if consumer_group:
1062
+ # 从consumer_group提取namespace
1063
+ # consumer_group格式: prefix:QUEUE:queue:task_name
1064
+ parts = consumer_group.split(':', 1)
1065
+ namespace = parts[0] if parts else 'default'
1066
+ task_key = f"{namespace}:TASK:{stream_id}:{consumer_group}"
1067
+ completed_task_keys.append(task_key)
1068
+ logger.info(f"Task {stream_id} with status {status} will be deleted from Redis: {task_key}")
1069
+
1070
+ # 从Redis中删除已完成的任务
1071
+ if completed_task_keys:
1072
+ try:
1073
+ pipeline = self.redis_client.pipeline()
1074
+ for task_key in completed_task_keys:
1075
+ pipeline.delete(task_key)
1076
+ deleted_results = await pipeline.execute()
1077
+ deleted_count = sum(1 for r in deleted_results if r > 0)
1078
+ if deleted_count > 0:
1079
+ logger.info(f"Deleted {deleted_count} completed tasks from Redis")
1080
+ except Exception as e:
1081
+ logger.error(f"Error deleting completed tasks from Redis: {e}")
1082
+
1083
+ # UPSERT 操作总是成功的,返回所有stream_id
1084
+ # 不需要复杂的错误处理,因为UPSERT保证了操作的原子性
1085
+ return set(stream_ids)
804
1086
 
805
1087
  except Exception as e:
806
- logger.error(f"Error updating task statuses: {e}")
1088
+ logger.error(f"Error upserting task statuses: {e}")
1089
+ return set() # 出错时返回空集
807
1090
 
808
1091
  async def _retry_pending_updates(self):
809
1092
  """定期重试待更新的任务"""
@@ -821,10 +1104,12 @@ class PostgreSQLConsumer:
821
1104
  self._pending_updates.clear()
822
1105
 
823
1106
  if pending_items:
824
- logger.info(f"Retrying {len(pending_items)} pending task updates")
825
1107
 
826
1108
  # 重新尝试更新
827
1109
  updates = [update_info for _, update_info in pending_items]
1110
+ logger.debug(f"Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}")
1111
+ logger_f.write(f'{time.time()=} Retrying {len(pending_items)} pending task updates {[_ for _, update_info in pending_items]=}\n')
1112
+ logger_f.flush()
828
1113
  await self._update_tasks(updates)
829
1114
 
830
1115
  except Exception as e:
@@ -833,7 +1118,7 @@ class PostgreSQLConsumer:
833
1118
 
834
1119
  async def _start_offline_recovery(self):
835
1120
  """启动离线worker恢复服务,恢复离线PG_CONSUMER的消息"""
836
- logger.info("Starting offline worker recovery service for PG_CONSUMER")
1121
+ logger.debug("Starting offline worker recovery service for PG_CONSUMER")
837
1122
 
838
1123
  # 等待consumer manager初始化和队列发现
839
1124
  # await asyncio.sleep(5)
@@ -843,30 +1128,30 @@ class PostgreSQLConsumer:
843
1128
  total_recovered = 0
844
1129
 
845
1130
  # 1. 恢复普通队列的消息
846
- for queue in self._known_queues:
847
- # logger.info(f'{queue=}')
848
- try:
849
- recovered = await self.offline_recovery.recover_offline_workers(
850
- queue=queue,
851
- current_consumer_name=self.consumer_id,
852
- process_message_callback=self._process_recovered_queue_message
853
- )
1131
+ # for queue in self._known_queues:
1132
+ # # logger.info(f'{queue=}')
1133
+ # try:
1134
+ # recovered = await self.offline_recovery.recover_offline_workers(
1135
+ # queue=queue,
1136
+ # current_consumer_name=self.consumer_id,
1137
+ # process_message_callback=self._process_recovered_queue_message
1138
+ # )
854
1139
 
855
- if recovered > 0:
856
- logger.info(f"Recovered {recovered} messages from queue {queue}")
857
- total_recovered += recovered
1140
+ # if recovered > 0:
1141
+ # logger.info(f"Recovered {recovered} messages from queue {queue}")
1142
+ # total_recovered += recovered
858
1143
 
859
- except Exception as e:
860
- logger.error(f"Error recovering queue {queue}: {e}")
1144
+ # except Exception as e:
1145
+ # logger.error(f"Error recovering queue {queue}: {e}")
861
1146
 
862
1147
  # 2. 恢复TASK_CHANGES stream的消息
863
1148
  recovered = await self._recover_task_changes_offline_messages()
864
1149
  if recovered > 0:
865
- logger.info(f"Recovered {recovered} TASK_CHANGES messages")
1150
+ logger.debug(f"Recovered {recovered} TASK_CHANGES messages")
866
1151
  total_recovered += recovered
867
1152
 
868
1153
  if total_recovered > 0:
869
- logger.info(f"Total recovered {total_recovered} messages in this cycle")
1154
+ logger.debug(f"Total recovered {total_recovered} messages in this cycle")
870
1155
 
871
1156
  # 每30秒扫描一次
872
1157
  await asyncio.sleep(1)
@@ -909,37 +1194,34 @@ class PostgreSQLConsumer:
909
1194
  logger.error(f"Error in recover_task_changes_offline_messages: {e}")
910
1195
  return 0
911
1196
 
912
- async def _process_recovered_queue_message(self, msg_id, msg_data, queue, consumer_id):
913
- """处理恢复的普通队列消息(符合OfflineWorkerRecovery的回调接口)"""
914
- try:
915
- logger.info(f"Processing recovered message {msg_id} from queue {queue}, offline worker {consumer_id}")
916
-
917
- # 解析任务信息
918
- task_info = self._parse_stream_message(msg_id, msg_data, queue)
919
- if task_info:
920
- # 批量插入到数据库
921
- await self._batch_insert_tasks([task_info])
922
-
923
- # ACK消息
924
- stream_key = f"{self.prefix}:QUEUE:{queue}"
925
- await self.redis_client.xack(stream_key, self.consumer_group, msg_id)
926
-
927
- except Exception as e:
928
- logger.error(f"Error processing recovered queue message {msg_id}: {e}")
929
-
930
1197
  async def _process_recovered_task_change_v2(self, msg_id, msg_data, queue, consumer_id):
931
1198
  """处理恢复的TASK_CHANGES消息(符合OfflineWorkerRecovery的回调接口)"""
932
1199
  try:
933
- # 解析消息
934
- event_id = msg_data.get(b'event_id')
935
- if event_id:
936
- if isinstance(event_id, bytes):
937
- event_id = event_id.decode('utf-8')
1200
+ logger.debug(f'处理恢复的TASK_CHANGES消息(符合OfflineWorkerRecovery的回调接口) {msg_data=}')
1201
+ # 解析消息 - 现在使用task_id而不是event_id
1202
+ if b'task_id' in msg_data:
1203
+ # 使用msgpack解压task_id
1204
+ compressed_task_id = msg_data[b'task_id']
1205
+ task_key = msgpack.unpackb(compressed_task_id)
1206
+ task_key = task_key.decode('utf-8') if isinstance(task_key, bytes) else str(task_key)
938
1207
 
939
- logger.info(f"Processing recovered TASK_CHANGES message: {event_id} from offline worker {consumer_id}")
1208
+ # 从完整的task_key格式提取stream_id
1209
+ # 格式: namespace:TASK:stream_id:queue_name
1210
+ stream_id = None
1211
+ if ':TASK:' in task_key:
1212
+ parts = task_key.split(':TASK:')
1213
+ if len(parts) == 2:
1214
+ # 再从右边部分提取stream_id
1215
+ right_parts = parts[1].split(':')
1216
+ if right_parts:
1217
+ stream_id = right_parts[0] # 提取stream_id
940
1218
 
941
- # 更新任务状态
942
- await self._update_tasks_by_event([event_id])
1219
+ if stream_id:
1220
+ logger.debug(f"Processing recovered TASK_CHANGES message: {stream_id} from offline worker {consumer_id}")
1221
+ # 更新任务状态 - 传入(stream_id, task_key)元组
1222
+ await self._update_tasks_by_event([(stream_id, task_key)])
1223
+ else:
1224
+ logger.warning(f"Cannot extract stream_id from task_key: {task_key}")
943
1225
 
944
1226
  # ACK消息
945
1227
  change_stream_key = f"{self.prefix}:TASK_CHANGES"
@@ -960,10 +1242,11 @@ class PostgreSQLConsumer:
960
1242
 
961
1243
  if current_time - last_analyze_time > analyze_interval:
962
1244
  async with self.AsyncSessionLocal() as session:
963
- logger.info("Running ANALYZE on tasks table...")
1245
+ logger.debug("Running ANALYZE on tasks and task_runs tables...")
964
1246
  await session.execute(text("ANALYZE tasks"))
1247
+ await session.execute(text("ANALYZE task_runs"))
965
1248
  await session.commit()
966
- logger.info("ANALYZE completed successfully")
1249
+ logger.debug("ANALYZE completed successfully for both tables")
967
1250
  last_analyze_time = current_time
968
1251
 
969
1252
  await asyncio.sleep(300) # 每5分钟检查一次
@@ -972,11 +1255,442 @@ class PostgreSQLConsumer:
972
1255
  logger.error(f"Error in database maintenance: {e}")
973
1256
  await asyncio.sleep(60)
974
1257
 
975
- def _parse_stream_message(self, task_id: str, data: dict, queue_name: str) -> Optional[dict]:
1258
+ async def _stream_backlog_monitor(self):
1259
+ """Stream积压监控任务 - 使用分布式锁确保只有一个实例采集"""
1260
+ # await asyncio.sleep(10) # 启动后延迟10秒开始
1261
+
1262
+ while self._running:
1263
+ try:
1264
+ # 尝试获取分布式锁
1265
+ lock_acquired = await self._try_acquire_monitor_lock()
1266
+
1267
+ if lock_acquired:
1268
+ try:
1269
+ logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
1270
+ await self._collect_stream_backlog_metrics()
1271
+ logger.debug("Stream backlog metrics collected successfully")
1272
+ finally:
1273
+ # 释放锁
1274
+ await self._release_monitor_lock()
1275
+ else:
1276
+ logger.debug("Another instance is collecting backlog metrics, skipping...")
1277
+
1278
+ # 等待下一次采集
1279
+ await asyncio.sleep(self.backlog_monitor_interval)
1280
+
1281
+ except Exception as e:
1282
+ logger.error(f"Error in stream backlog monitor: {e}")
1283
+ await asyncio.sleep(30) # 出错后等待30秒
1284
+
1285
+ async def _try_acquire_monitor_lock(self) -> bool:
1286
+ """尝试获取监控锁(使用Redis原生锁)"""
1287
+ try:
1288
+ # 使用SET NX EX命令实现分布式锁
1289
+ # NX: 只在键不存在时设置
1290
+ # EX: 设置过期时间(秒)
1291
+ result = await self.redis_client.set(
1292
+ self.backlog_monitor_lock_key.encode(),
1293
+ self.node_id.encode(), # 锁的值为当前节点ID
1294
+ nx=True, # 只在不存在时设置
1295
+ ex=self.backlog_monitor_lock_ttl # 过期时间
1296
+ )
1297
+ return result is not None
1298
+ except Exception as e:
1299
+ logger.error(f"Error acquiring monitor lock: {e}")
1300
+ return False
1301
+
1302
+ async def _release_monitor_lock(self):
1303
+ """释放监控锁(只释放自己持有的锁)"""
1304
+ try:
1305
+ # 使用Lua脚本确保只释放自己持有的锁
1306
+ lua_script = """
1307
+ if redis.call("get", KEYS[1]) == ARGV[1] then
1308
+ return redis.call("del", KEYS[1])
1309
+ else
1310
+ return 0
1311
+ end
1312
+ """
1313
+ await self.redis_client.eval(
1314
+ lua_script,
1315
+ 1,
1316
+ self.backlog_monitor_lock_key.encode(),
1317
+ self.node_id.encode()
1318
+ )
1319
+ except Exception as e:
1320
+ logger.error(f"Error releasing monitor lock: {e}")
1321
+
1322
+ async def _collect_stream_backlog_metrics(self):
1323
+ """采集Stream积压指标并保存到数据库(使用offset方式)"""
1324
+ try:
1325
+ # 获取所有队列的最新offset (QUEUE_OFFSETS)
1326
+ queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
1327
+ queue_offsets = {}
1328
+ try:
1329
+ # 使用decode_responses=False的客户端,手动解码
1330
+ raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
1331
+ for k, v in raw_queue_offsets.items():
1332
+ queue_name = k.decode() if isinstance(k, bytes) else k
1333
+ offset_value = v.decode() if isinstance(v, bytes) else v
1334
+ queue_offsets[queue_name] = int(offset_value)
1335
+ except Exception as e:
1336
+ logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")
1337
+
1338
+ # 获取所有任务组的消费offset (TASK_OFFSETS)
1339
+ task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
1340
+ task_offsets = {}
1341
+ try:
1342
+ raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
1343
+ for k, v in raw_task_offsets.items():
1344
+ task_key = k.decode() if isinstance(k, bytes) else k
1345
+ offset_value = v.decode() if isinstance(v, bytes) else v
1346
+ task_offsets[task_key] = int(offset_value)
1347
+ except Exception as e:
1348
+ logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")
1349
+
1350
+ # 使用SCAN命令扫描所有队列Stream(包括普通队列和优先级队列)
1351
+ stream_info_map = {} # {queue_name: [(stream_key, priority), ...]}
1352
+ pattern = f"{self.prefix}:QUEUE:*".encode()
1353
+ cursor = 0
1354
+
1355
+                # Use SCAN with a large count value to reduce round trips
+                while True:
+                    cursor, keys = await self.redis_client.scan(cursor, match=pattern, count=10000)
+
+                    for key in keys:
+                        key_str = key.decode()
+                        # Strip the "prefix:QUEUE:" prefix
+                        queue_part = key_str.replace(f"{self.prefix}:QUEUE:", "")
+
+                        # Check whether this is a priority queue (format: queue_name:priority)
+                        parts = queue_part.split(':')
+                        if len(parts) == 2 and parts[1].isdigit():
+                            # Priority queue
+                            queue_name = parts[0]
+                            priority = int(parts[1])
+                            if queue_name not in stream_info_map:
+                                stream_info_map[queue_name] = []
+                            stream_info_map[queue_name].append((key, priority))
+                        elif ':' not in queue_part:
+                            # Regular queue (no colon in the name)
+                            queue_name = queue_part
+                            if queue_name not in stream_info_map:
+                                stream_info_map[queue_name] = []
+                            stream_info_map[queue_name].append((key, 0))  # regular queues use priority 0
+                        # Ignore keys in any other format (e.g. consumer groups)
+
+                    if cursor == 0:
+                        break
+
+                if not stream_info_map:
+                    logger.debug("No streams found for backlog monitoring")
+                    return
+
+                # Debug logging (debug level to avoid flooding the log)
+                logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
+                for queue_name, stream_list in stream_info_map.items():
+                    priorities = [p for _, p in stream_list]
+                    # Pick out non-zero priorities (0 means a regular queue)
+                    high_priorities = [p for p in priorities if p > 0]
+                    if high_priorities:
+                        logger.debug(f"  - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
+                    else:
+                        logger.debug(f"  - {queue_name}: regular queue only (priority=0)")
+
+                # Collect metrics for every queue (across all priorities)
+                metrics = []
+                current_time = datetime.now(timezone.utc)
+
+                for queue_name, stream_list in stream_info_map.items():
+                    # Handle each priority stream separately
+                    for stream_key, priority in stream_list:
+                        try:
+                            # Get the queue's latest offset (accounting for priority queues)
+                            if priority > 0:
+                                # Priority queue key format: queue_name:priority
+                                queue_key = f"{queue_name}:{priority}"
+                            else:
+                                queue_key = queue_name
+                            last_published_offset = queue_offsets.get(queue_key, 0)
+
+                            # Fetch stream info
+                            stream_info = await self.redis_client.xinfo_stream(stream_key)
+                            stream_length = stream_info.get(b'length', 0)
+
+                            # Fetch consumer group info
+                            has_consumer_groups = False
+                            try:
+                                groups = await self.redis_client.xinfo_groups(stream_key)
+
+                                for group in groups:
+                                    # Normalize group_name
+                                    raw_name = group.get('name', b'')
+                                    if isinstance(raw_name, bytes):
+                                        group_name = raw_name.decode() if raw_name else ''
+                                    else:
+                                        group_name = str(raw_name) if raw_name else ''
+
+                                    if not group_name:
+                                        group_name = 'unknown'
+
+                                    # Skip internal consumer groups
+                                    if is_internal_consumer(group_name):
+                                        # logger.info(f"Skipping internal consumer group: {group_name}")
+                                        continue
+
+                                    # pending is already an int
+                                    pending_count = group.get('pending', 0)
+
+                                    # Read this group's consumed offset from TASK_OFFSETS
+                                    # Key format: f"{queue_name}:{group_name}" (no priority component)
+                                    task_offset_key = f"{queue_name}:{group_name}"
+                                    last_acked_offset = task_offsets.get(task_offset_key, 0)
+
+                                    # Compute the backlog metrics
+                                    # 1. Total backlog = latest published offset - offset acked by the group
+                                    total_backlog = max(0, last_published_offset - last_acked_offset)
+
+                                    # 2. Undelivered backlog = total backlog - pending count
+                                    backlog_undelivered = max(0, total_backlog - pending_count)
+
+                                    # 3. Delivered but not yet acked = pending count
+                                    backlog_delivered_unacked = pending_count
+
+                                    # 4. Delivered offset = acked offset + pending count
+                                    last_delivered_offset = last_acked_offset + pending_count
+
+                                    # One record per consumer group
+                                    metrics.append({
+                                        'namespace': self.namespace_name,
+                                        'stream_name': queue_name,
+                                        'priority': priority,  # priority field
+                                        'consumer_group': group_name,
+                                        'last_published_offset': last_published_offset,
+                                        'last_delivered_offset': last_delivered_offset,
+                                        'last_acked_offset': last_acked_offset,
+                                        'pending_count': pending_count,
+                                        'backlog_undelivered': backlog_undelivered,
+                                        'backlog_unprocessed': total_backlog,
+                                        'created_at': current_time
+                                    })
+                                    has_consumer_groups = True
+
+                            except Exception as e:
+                                # This stream has no consumer groups
+                                logger.debug(f"No consumer groups for stream {stream_key.decode()}: {e}")
+
+                            # If there are no consumer groups, record stream-level metrics
+                            if not has_consumer_groups and last_published_offset > 0:
+                                metrics.append({
+                                    'namespace': self.namespace_name,
+                                    'stream_name': queue_name,
+                                    'priority': priority,  # priority field
+                                    'consumer_group': None,
+                                    'last_published_offset': last_published_offset,
+                                    'last_delivered_offset': 0,
+                                    'last_acked_offset': 0,
+                                    'pending_count': 0,
+                                    'backlog_undelivered': last_published_offset,
+                                    'backlog_unprocessed': last_published_offset,
+                                    'created_at': current_time
+                                })
+
+                        except Exception as e:
+                            logger.error(f"Error collecting metrics for stream {stream_key.decode()}: {e}")
+                            continue
+
+                # Persist metrics to the database
+                if metrics:
+                    await self._save_backlog_metrics(metrics)
+                    # logger.info(f"Collected backlog metrics for {len(metrics)} stream/group combinations {time.time() }")
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            logger.error(f"Error collecting stream backlog metrics: {e}")
+
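Taken together, the collector above boils each stream/consumer-group pair down to a handful of counters derived from two offsets plus the group's pending count. A minimal, self-contained sketch of that arithmetic follows; the dataclass and property names mirror the metrics dict above but are illustrative, not part of jettask's API:

```python
from dataclasses import dataclass


@dataclass
class GroupBacklog:
    """Hypothetical container mirroring the counters written to stream_backlog_monitor."""
    last_published_offset: int   # highest offset published to the stream
    last_acked_offset: int       # highest offset the consumer group has acknowledged
    pending_count: int           # messages delivered to consumers but not yet acked

    @property
    def backlog_unprocessed(self) -> int:
        # Total backlog: everything published that the group has not acknowledged.
        return max(0, self.last_published_offset - self.last_acked_offset)

    @property
    def backlog_undelivered(self) -> int:
        # Messages not yet handed to any consumer: total backlog minus in-flight messages.
        return max(0, self.backlog_unprocessed - self.pending_count)

    @property
    def last_delivered_offset(self) -> int:
        # Offset up to which messages have at least been delivered (acked + in-flight).
        return self.last_acked_offset + self.pending_count


# Example: 1200 published, 1100 acked, 30 still pending.
b = GroupBacklog(last_published_offset=1200, last_acked_offset=1100, pending_count=30)
assert b.backlog_unprocessed == 100
assert b.backlog_undelivered == 70
assert b.last_delivered_offset == 1130
```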
+    async def _save_backlog_metrics(self, metrics: List[Dict]):
+        """Persist backlog metrics to the database (only rows whose values changed)."""
+        if not metrics:
+            return
+
+        # logger.info(f"Processing {len(metrics)} metrics for deduplication")
+
+        try:
+            async with self.AsyncSessionLocal() as session:
+                # New records to save
+                metrics_to_save = []
+
+                # Batch the lookups to keep the query count low
+                metric_keys = {}  # for fast lookup
+
+                for metric in metrics:
+                    # Build the unique key: namespace + stream_name + consumer_group + priority
+                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
+                    metric_keys[unique_key] = metric
+
+                # logger.info(f"Checking {len(metric_keys)} unique metric combinations")
+
+                # Batch-query the latest records, chunked so the SQL statement stays short
+                last_records = {}
+                metric_list = list(metric_keys.values())
+                batch_size = 50  # 50 combinations per query
+
+                for i in range(0, len(metric_list), batch_size):
+                    batch = metric_list[i:i + batch_size]
+
+                    # Build a parameterized query
+                    conditions = []
+                    params = {}
+                    for idx, metric in enumerate(batch):
+                        param_prefix = f"p{i + idx}"
+                        conditions.append(f"""
+                            (namespace = :{param_prefix}_ns
+                             AND stream_name = :{param_prefix}_sn
+                             AND consumer_group = :{param_prefix}_cg
+                             AND priority = :{param_prefix}_pr)
+                        """)
+                        params[f"{param_prefix}_ns"] = metric['namespace']
+                        params[f"{param_prefix}_sn"] = metric['stream_name']
+                        params[f"{param_prefix}_cg"] = metric['consumer_group']
+                        params[f"{param_prefix}_pr"] = metric['priority']
+
+                    if conditions:
+                        # Use a window function to fetch the latest record per combination
+                        query_sql = text(f"""
+                            WITH latest_records AS (
+                                SELECT
+                                    namespace,
+                                    stream_name,
+                                    consumer_group,
+                                    priority,
+                                    last_published_offset,
+                                    last_delivered_offset,
+                                    last_acked_offset,
+                                    pending_count,
+                                    backlog_undelivered,
+                                    backlog_unprocessed,
+                                    ROW_NUMBER() OVER (
+                                        PARTITION BY namespace, stream_name, consumer_group, priority
+                                        ORDER BY created_at DESC
+                                    ) as rn
+                                FROM stream_backlog_monitor
+                                WHERE ({' OR '.join(conditions)})
+                            )
+                            SELECT
+                                namespace,
+                                stream_name,
+                                consumer_group,
+                                priority,
+                                last_published_offset,
+                                last_delivered_offset,
+                                last_acked_offset,
+                                pending_count,
+                                backlog_undelivered,
+                                backlog_unprocessed
+                            FROM latest_records
+                            WHERE rn = 1
+                        """)
+
+                        result = await session.execute(query_sql, params)
+                        for row in result:
+                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
+                            last_records[key] = row
+                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")
+
+                # Deduplicate: check each metric against the latest stored record
+                for unique_key, metric in metric_keys.items():
+                    should_save = False
+
+                    if unique_key not in last_records:
+                        # No previous record; save it
+                        should_save = True
+                        # logger.info(f"New metric for {unique_key}, will save")
+                    else:
+                        # Compare the key counters for changes
+                        last_record = last_records[unique_key]
+
+                        # Detailed debug logging
+                        changes = []
+                        logger.debug(f"Comparing for {unique_key}:")
+                        logger.debug(f"  DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
+                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
+                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
+                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
+                        logger.debug(f"  New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
+                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
+                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
+                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")
+
+                        # Normalize everything to int so the comparison is type-consistent
+                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
+                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0
+
+                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
+                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0
+
+                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
+                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0
+
+                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
+                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0
+
+                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
+                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0
+
+                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
+                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0
+
+                        if db_published != new_published:
+                            changes.append(f"published: {db_published} -> {new_published}")
+                        if db_delivered != new_delivered:
+                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
+                        if db_acked != new_acked:
+                            changes.append(f"acked: {db_acked} -> {new_acked}")
+                        if db_pending != new_pending:
+                            changes.append(f"pending: {db_pending} -> {new_pending}")
+                        if db_undelivered != new_undelivered:
+                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
+                        if db_unprocessed != new_unprocessed:
+                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")
+
+                        if changes:
+                            should_save = True
+                            # logger.info(f"Metric changed for {unique_key}: {', '.join(changes)}")
+                        else:
+                            logger.debug(f"Metric unchanged for {unique_key}, skipping")
+
+                    if should_save:
+                        metrics_to_save.append(metric)
+
+                # Insert only the monitoring rows that changed
+                if metrics_to_save:
+                    insert_sql = text("""
+                        INSERT INTO stream_backlog_monitor
+                        (namespace, stream_name, priority, consumer_group, last_published_offset,
+                         last_delivered_offset, last_acked_offset, pending_count,
+                         backlog_undelivered, backlog_unprocessed, created_at)
+                        VALUES
+                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
+                         :last_delivered_offset, :last_acked_offset, :pending_count,
+                         :backlog_undelivered, :backlog_unprocessed, :created_at)
+                    """)
+
+                    # Insert row by row (one parameter dict per execute call)
+                    for metric_data in metrics_to_save:
+                        await session.execute(insert_sql, metric_data)
+
+                    await session.commit()
+                    # logger.info(f"Saved {len(metrics_to_save)} changed metrics out of {len(metrics)} total")
+                else:
+                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")
+
+        except Exception as e:
+            logger.error(f"Error saving backlog metrics to database: {e}")
+
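The save path above avoids writing duplicate rows: it fetches the latest stored row per (namespace, stream_name, consumer_group, priority) with a ROW_NUMBER() window query and inserts a new row only when one of the six counters changed. A small stand-alone sketch of that change check; the helper name is ours, and `last_row` stands in for a SQLAlchemy result row exposing the same columns:

```python
MONITOR_FIELDS = (
    "last_published_offset", "last_delivered_offset", "last_acked_offset",
    "pending_count", "backlog_undelivered", "backlog_unprocessed",
)


def metric_changed(last_row, metric: dict) -> bool:
    """Return True if any monitored counter differs from the latest stored row.

    Values are normalized to int so None/Decimal/str mismatches between the
    database row and the freshly collected dict do not trigger spurious inserts.
    """
    def as_int(value) -> int:
        return int(value) if value is not None else 0

    return any(
        as_int(getattr(last_row, field)) != as_int(metric[field])
        for field in MONITOR_FIELDS
    )
```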
+    def _parse_stream_message(self, task_id: str, data: dict) -> Optional[dict]:
         """Parse a Stream message into task info (returns the full set of fields)."""
         try:
             from jettask.utils.serializer import loads_str
-
             if b'data' in data:
                 task_data = loads_str(data[b'data'])
             else:
@@ -991,7 +1705,15 @@ class PostgreSQLConsumer:
                     else:
                         value = v
                     task_data[key] = value
-
+            # If a namespace is configured, check whether the message belongs to it
+            # if self.namespace_id:
+            #     msg_namespace_id = task_data.get('__namespace_id')
+            #     # Skip if the message has no namespace_id and the current namespace is not the default
+            #     if msg_namespace_id != self.namespace_id:
+            #         if not (msg_namespace_id is None and self.namespace_id == 'default'):
+            #             logger.debug(f"Skipping message from different namespace: {msg_namespace_id} != {self.namespace_id}")
+            #             return None
+            queue_name = task_data['queue']
             task_name = task_data.get('name', task_data.get('task', 'unknown'))
             created_at = None
             if 'trigger_time' in task_data:
@@ -1000,7 +1722,6 @@ class PostgreSQLConsumer:
                     created_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                 except:
                     pass
-
             # Return the full set of fields, including ones that may be None
             return {
                 'id': task_id,
@@ -1016,12 +1737,16 @@ class PostgreSQLConsumer:
                 'created_at': created_at,
                 'started_at': None,  # new task, not started yet
                 'completed_at': None,  # new task, not completed yet
+                'scheduled_task_id': task_data.get('scheduled_task_id'),  # scheduled task id
+                'metadata': json.dumps(task_data.get('metadata', {})),
                 'worker_id': None,  # no worker assigned yet
                 'execution_time': None,  # not executed yet
                 'duration': None,  # no duration yet
-                'metadata': json.dumps(task_data.get('metadata', {}))
+                'namespace_id': self.namespace_id  # namespace this consumer serves
             }
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             logger.error(f"Error parsing stream message for task {task_id}: {e}")
             return None
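For orientation, the parsed record now also carries `scheduled_task_id`, a JSON-serialized `metadata` blob, and the consumer's `namespace_id`. A rough sketch of the same transformation, using `json` in place of `jettask.utils.serializer.loads_str` and an invented payload purely for illustration:

```python
import json
from datetime import datetime, timezone


def parse_for_illustration(task_id: str, raw: dict, namespace_id: str = "default") -> dict:
    # Stand-in for _parse_stream_message: assumes the payload sits under b'data' as JSON.
    task_data = json.loads(raw[b"data"])
    created_at = None
    if "trigger_time" in task_data:
        created_at = datetime.fromtimestamp(float(task_data["trigger_time"]), tz=timezone.utc)
    return {
        "id": task_id,
        "queue": task_data["queue"],
        "task_name": task_data.get("name", "unknown"),
        "created_at": created_at,
        "scheduled_task_id": task_data.get("scheduled_task_id"),
        "metadata": json.dumps(task_data.get("metadata", {})),
        "namespace_id": namespace_id,
    }


payload = json.dumps({"queue": "default", "name": "demo_task", "trigger_time": 1700000000}).encode()
row = parse_for_illustration("1700000000000-0", {b"data": payload})
assert row["namespace_id"] == "default" and row["scheduled_task_id"] is None
```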
 
@@ -1030,7 +1755,19 @@ class PostgreSQLConsumer:
 async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig,
                           consumer_strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT):
     """Run the PostgreSQL consumer."""
-    consumer = PostgreSQLConsumer(pg_config, redis_config, consumer_strategy=consumer_strategy)
+    # Read the monitoring configuration from environment variables
+    enable_backlog_monitor = os.getenv('JETTASK_ENABLE_BACKLOG_MONITOR', 'true').lower() == 'true'
+    backlog_monitor_interval = int(os.getenv('JETTASK_BACKLOG_MONITOR_INTERVAL', '60'))
+
+    logger.info(f"Backlog monitor config: enabled={enable_backlog_monitor}, interval={backlog_monitor_interval}s")
+
+    consumer = PostgreSQLConsumer(
+        pg_config,
+        redis_config,
+        consumer_strategy=consumer_strategy,
+        enable_backlog_monitor=enable_backlog_monitor,
+        backlog_monitor_interval=backlog_monitor_interval
+    )

     try:
         await consumer.start()
@@ -1038,7 +1775,7 @@ async def run_pg_consumer(pg_config: PostgreSQLConfig, redis_config: RedisConfig
             await asyncio.sleep(1)

     except KeyboardInterrupt:
-        logger.info("Received interrupt signal")
+        logger.debug("Received interrupt signal")
     finally:
         await consumer.stop()

@@ -1080,9 +1817,9 @@ def main():
     elif strategy_name == 'HEARTBEAT':
         consumer_strategy = ConsumerStrategy.HEARTBEAT
     else:
-        logger.warning(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")
+        logger.debug(f"Unknown consumer strategy: {strategy_name}, using HEARTBEAT")

-    logger.info(f"Using consumer strategy: {consumer_strategy.value}")
+    logger.debug(f"Using consumer strategy: {consumer_strategy.value}")

     asyncio.run(run_pg_consumer(pg_config, redis_config, consumer_strategy))
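The backlog monitor is switched on and tuned entirely through the two environment variables read in `run_pg_consumer` (enabled by default, sampled every 60 seconds). A minimal sketch of overriding them before starting the consumer; the helper names are ours, and the `pg_config` / `redis_config` objects are left to the caller because their constructors are defined elsewhere in this module:

```python
import asyncio
import os


def configure_backlog_monitor(enabled: bool = True, interval_seconds: int = 30) -> None:
    """Set the two switches read by run_pg_consumer before the consumer is built."""
    os.environ["JETTASK_ENABLE_BACKLOG_MONITOR"] = "true" if enabled else "false"
    os.environ["JETTASK_BACKLOG_MONITOR_INTERVAL"] = str(interval_seconds)


def launch(pg_config, redis_config) -> None:
    # pg_config / redis_config are the PostgreSQLConfig / RedisConfig instances
    # built elsewhere in this module; run_pg_consumer is the coroutine defined above.
    configure_backlog_monitor(enabled=True, interval_seconds=30)
    asyncio.run(run_pg_consumer(pg_config, redis_config))
```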