jettask 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +60 -2
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.18.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.18.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/persistence/backlog_monitor.py
@@ -0,0 +1,567 @@
+"""Stream backlog monitoring module
+
+Monitors the backlog of Redis Streams, including:
+- Collecting backlog metrics for each queue
+- Using a distributed lock so that only one instance collects at a time
+- Saving backlog data to the PostgreSQL database
+"""
+
+import asyncio
+import logging
+import traceback
+from typing import List, Dict, Optional
+from datetime import datetime, timezone
+
+from redis.asyncio import Redis
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import sessionmaker
+
+from jettask.config.constants import is_internal_consumer
+
+logger = logging.getLogger(__name__)
+
+
+class BacklogMonitor:
+    """Stream backlog monitor
+
+    Responsibilities:
+    - Periodically collect backlog metrics for Redis Streams
+    - Use a distributed lock to ensure single-instance collection
+    - Save the metrics to PostgreSQL
+    """
+
+    def __init__(
+        self,
+        redis_client: Redis,
+        async_session_local: sessionmaker,
+        redis_prefix: str,
+        namespace_name: str,
+        node_id: str,
+        enable_monitor: bool = True,
+        monitor_interval: int = 1
+    ):
+        """Initialize the backlog monitor
+
+        Args:
+            redis_client: Async Redis client
+            async_session_local: SQLAlchemy session factory
+            redis_prefix: Redis key prefix
+            namespace_name: Namespace name
+            node_id: Node ID
+            enable_monitor: Whether monitoring is enabled
+            monitor_interval: Collection interval in seconds
+        """
+        self.redis_client = redis_client
+        self.AsyncSessionLocal = async_session_local
+        self.redis_prefix = redis_prefix
+        self.namespace_name = namespace_name
+        self.node_id = node_id
+
+        self.enable_monitor = enable_monitor
+        self.monitor_interval = monitor_interval
+
+        # Distributed lock configuration
+        self.lock_key = f"{redis_prefix}:BACKLOG_MONITOR_LOCK"
+        self.lock_ttl = monitor_interval * 2  # Lock TTL in seconds, twice the collection interval
+
+        # Stream registry key
+        self.stream_registry_key = f"{redis_prefix}:STREAM_REGISTRY"
+
+        self._running = False
+        self._monitor_task = None
+
+    async def start(self):
+        """Start the monitoring task."""
+        if not self.enable_monitor:
+            logger.info("Backlog monitor is disabled")
+            return
+
+        self._running = True
+        self._monitor_task = asyncio.create_task(self._monitor_loop())
+        logger.info(f"Backlog monitor started with {self.monitor_interval}s interval")
+
+    async def stop(self):
+        """Stop the monitoring task."""
+        self._running = False
+        if self._monitor_task:
+            self._monitor_task.cancel()
+            try:
+                await self._monitor_task
+            except asyncio.CancelledError:
+                pass
+        logger.debug("Backlog monitor stopped")
+
+    async def _monitor_loop(self):
+        """Monitoring loop."""
+        while self._running:
+            try:
+                # Try to acquire the distributed lock
+                lock_acquired = await self._try_acquire_lock()
+
+                if lock_acquired:
+                    try:
+                        logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
+                        await self._collect_stream_backlog_metrics()
+                        logger.debug("Stream backlog metrics collected successfully")
+                    finally:
+                        # Release the lock
+                        await self._release_lock()
+                else:
+                    logger.debug("Another instance is collecting backlog metrics, skipping...")
+
+                # Wait for the next collection cycle
+                await asyncio.sleep(self.monitor_interval)
+
+            except Exception as e:
+                logger.error(f"Error in stream backlog monitor: {e}")
+                await asyncio.sleep(30)  # Wait 30 seconds after an error
+
+    async def _try_acquire_lock(self) -> bool:
+        """Try to acquire the monitor lock (using a native Redis lock)."""
+        try:
+            # Implement the distributed lock with SET NX EX
+            # NX: only set the key if it does not exist
+            # EX: set an expiration time in seconds
+            result = await self.redis_client.set(
+                self.lock_key.encode(),
+                self.node_id.encode(),  # The lock value is the current node ID
+                nx=True,  # Only set if the key does not already exist
+                ex=self.lock_ttl  # Expiration time
+            )
+            return result is not None
+        except Exception as e:
+            logger.error(f"Error acquiring monitor lock: {e}")
+            return False
+
+    async def _release_lock(self):
+        """Release the monitor lock (only the lock held by this node)."""
+        try:
+            # Use a Lua script to make sure we only release our own lock
+            lua_script = """
+            if redis.call("get", KEYS[1]) == ARGV[1] then
+                return redis.call("del", KEYS[1])
+            else
+                return 0
+            end
+            """
+            await self.redis_client.eval(
+                lua_script,
+                1,
+                self.lock_key.encode(),
+                self.node_id.encode()
+            )
+        except Exception as e:
+            logger.error(f"Error releasing monitor lock: {e}")
+
+    async def _collect_stream_backlog_metrics(self):
+        """Collect stream backlog metrics and save them to the database (offset-based)."""
+        try:
+            # Get the latest offset of every queue (QUEUE_OFFSETS)
+            queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
+            queue_offsets = {}
+            try:
+                # The client uses decode_responses=False, so decode manually
+                raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
+                for k, v in raw_queue_offsets.items():
+                    queue_name = k.decode() if isinstance(k, bytes) else k
+                    offset_value = v.decode() if isinstance(v, bytes) else v
+                    queue_offsets[queue_name] = int(offset_value)
+            except Exception as e:
+                logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")
+
+            # Get the consumption offset of every task group (TASK_OFFSETS)
+            task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
+            task_offsets = {}
+            try:
+                raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
+                for k, v in raw_task_offsets.items():
+                    task_key = k.decode() if isinstance(k, bytes) else k
+                    offset_value = v.decode() if isinstance(v, bytes) else v
+                    task_offsets[task_key] = int(offset_value)
+            except Exception as e:
+                logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")
+
+            # Use the stream registry instead of the SCAN command to discover queues
+            stream_info_map = {}  # {queue_name: [(stream_key, priority), ...]}
+
+            # Read the stream registry from Redis (a Hash)
+            # Format: {"queue_name:priority": "stream_key"}
+            # For regular queues, priority is 0
+            stream_registry = await self.redis_client.hgetall(self.stream_registry_key.encode())
+
+            for queue_priority_bytes, stream_key_bytes in stream_registry.items():
+                queue_priority_str = queue_priority_bytes.decode() if isinstance(queue_priority_bytes, bytes) else str(queue_priority_bytes)
+                stream_key = stream_key_bytes.decode() if isinstance(stream_key_bytes, bytes) else str(stream_key_bytes)
+
+                # Parse queue_name and priority
+                if ':' in queue_priority_str:
+                    parts = queue_priority_str.rsplit(':', 1)
+                    if len(parts) == 2 and parts[1].isdigit():
+                        queue_name = parts[0]
+                        priority = int(parts[1])
+                    else:
+                        # If the last part is not a number, this is a regular queue name that contains a colon
+                        queue_name = queue_priority_str
+                        priority = 0
+                else:
+                    # Regular queue
+                    queue_name = queue_priority_str
+                    priority = 0
+
+                if queue_name not in stream_info_map:
+                    stream_info_map[queue_name] = []
+                stream_info_map[queue_name].append((stream_key, priority))
+
+            # If the stream registry is empty, do a one-off scan as initialization (first run only)
+            if not stream_info_map:
+                # Use the queue registry to fetch queues, avoiding SCAN
+                from jettask.messaging.registry import QueueRegistry
+                registry = QueueRegistry(
+                    redis_client=None,
+                    async_redis_client=self.redis_client,
+                    redis_prefix=self.redis_prefix
+                )
+
+                # Get all queues
+                all_queues = await registry.get_all_queues()
+
+                for queue_full_name in all_queues:
+                    # Build the stream key
+                    stream_key = f"{self.redis_prefix}:QUEUE:{queue_full_name}".encode()
+
+                    # Check whether the stream exists
+                    if await self.redis_client.exists(stream_key):
+                        # Parse the queue name and priority
+                        parts = queue_full_name.split(':')
+                        if len(parts) >= 2 and parts[-1].isdigit():
+                            # Priority queue
+                            queue_name = ':'.join(parts[:-1])
+                            priority = int(parts[-1])
+                        else:
+                            # Regular queue
+                            queue_name = queue_full_name
+                            priority = 0
+
+                        if queue_name not in stream_info_map:
+                            stream_info_map[queue_name] = []
+                        stream_info_map[queue_name].append((stream_key, priority))
+
+                # Add the discovered streams to the registry
+                if stream_info_map:
+                    pipeline = self.redis_client.pipeline()
+                    for queue_name, stream_list in stream_info_map.items():
+                        for stream_key, priority in stream_list:
+                            if priority > 0:
+                                queue_priority_key = f"{queue_name}:{priority}"
+                            else:
+                                queue_priority_key = queue_name
+                            # Make sure stream_key is bytes
+                            if isinstance(stream_key, str):
+                                stream_key = stream_key.encode()
+                            pipeline.hset(self.stream_registry_key.encode(), queue_priority_key.encode(), stream_key)
+                    await pipeline.execute()
+                    logger.info(f"Registered {sum(len(stream_list) for stream_list in stream_info_map.values())} streams to registry during initialization")
+
+            if not stream_info_map:
+                logger.debug("No streams found in registry for backlog monitoring")
+                return
+
+            # Debug logging (debug level to avoid flooding the log)
+            logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
+            for queue_name, stream_list in stream_info_map.items():
+                priorities = [p for _, p in stream_list]
+                # Pick out the non-zero priorities (0 means a regular queue)
+                high_priorities = [p for p in priorities if p > 0]
+                if high_priorities:
+                    logger.debug(f" - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
+                else:
+                    logger.debug(f" - {queue_name}: regular queue only (priority=0)")
+
+            # Collect metrics for each queue (aggregating all priorities)
+            metrics = []
+            current_time = datetime.now(timezone.utc)
+
+            for queue_name, stream_list in stream_info_map.items():
+                # Handle each priority queue separately
+                for stream_key, priority in stream_list:
+                    try:
+                        # Get the latest offset of this queue (taking priority queues into account)
+                        if priority > 0:
+                            # Priority queue key format: queue_name:priority
+                            queue_key = f"{queue_name}:{priority}"
+                        else:
+                            queue_key = queue_name
+                        last_published_offset = queue_offsets.get(queue_key, 0)
+
+                        # Get stream info
+                        stream_info = await self.redis_client.xinfo_stream(stream_key)
+                        stream_length = stream_info.get(b'length', 0)
+
+                        # Get consumer group info
+                        has_consumer_groups = False
+                        try:
+                            groups = await self.redis_client.xinfo_groups(stream_key)
+
+                            for group in groups:
+                                # Handle group_name
+                                raw_name = group.get('name', b'')
+                                if isinstance(raw_name, bytes):
+                                    group_name = raw_name.decode() if raw_name else ''
+                                else:
+                                    group_name = str(raw_name) if raw_name else ''
+
+                                if not group_name:
+                                    group_name = 'unknown'
+
+                                # Filter out internal consumer groups
+                                if is_internal_consumer(group_name):
+                                    # logger.info(f"Skipping internal consumer group: {group_name}")
+                                    continue
+
+                                # Handle pending - already an int
+                                pending_count = group.get('pending', 0)
+
+                                # Get this group's consumption offset from TASK_OFFSETS
+                                # Extract task_name from group_name (the last segment)
+                                task_name = group_name.split(':')[-1]
+                                # Build the field: queue name (with priority) + task name
+                                # Example: robust_bench2:8:benchmark_task
+                                task_offset_key = f"{queue_key}:{task_name}"
+                                last_acked_offset = task_offsets.get(task_offset_key, 0)
+
+                                # Compute the backlog metrics
+                                # 1. Total backlog = latest queue offset - offset acked by the consumer group
+                                total_backlog = max(0, last_published_offset - last_acked_offset)
+
+                                # 2. Undelivered backlog = total backlog - pending count
+                                backlog_undelivered = max(0, total_backlog - pending_count)
+
+                                # 3. Delivered but unacknowledged = pending count
+                                backlog_delivered_unacked = pending_count
+
+                                # 4. Delivered offset = acked offset + pending count
+                                last_delivered_offset = last_acked_offset + pending_count
+
+                                # Create one record per consumer group
+                                metrics.append({
+                                    'namespace': self.namespace_name,
+                                    'stream_name': queue_name,
+                                    'priority': priority,  # Priority field
+                                    'consumer_group': group_name,
+                                    'last_published_offset': last_published_offset,
+                                    'last_delivered_offset': last_delivered_offset,
+                                    'last_acked_offset': last_acked_offset,
+                                    'pending_count': pending_count,
+                                    'backlog_undelivered': backlog_undelivered,
+                                    'backlog_unprocessed': total_backlog,
+                                    'created_at': current_time
+                                })
+                                has_consumer_groups = True
+
+                        except Exception as e:
+                            # This queue has no consumer groups
+                            stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else str(stream_key)
+                            logger.debug(f"No consumer groups for stream {stream_key_str}: {e}")
+
+                        # If there are no consumer groups, save stream-level metrics
+                        if not has_consumer_groups and last_published_offset > 0:
+                            metrics.append({
+                                'namespace': self.namespace_name,
+                                'stream_name': queue_name,
+                                'priority': priority,  # Priority field
+                                'consumer_group': None,
+                                'last_published_offset': last_published_offset,
+                                'last_delivered_offset': 0,
+                                'last_acked_offset': 0,
+                                'pending_count': 0,
+                                'backlog_undelivered': last_published_offset,
+                                'backlog_unprocessed': last_published_offset,
+                                'created_at': current_time
+                            })
+
+                    except Exception as e:
+                        stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else str(stream_key)
+                        logger.error(f"Error collecting metrics for stream {stream_key_str}: {e}")
+                        continue
+
+            # Save the metrics to the database
+            if metrics:
+                await self._save_backlog_metrics(metrics)
+
+        except Exception as e:
+            logger.error(f"Error collecting stream backlog metrics: {e}")
+            logger.error(traceback.format_exc())
+
+    async def _save_backlog_metrics(self, metrics: List[Dict]):
+        """Save backlog metrics to the database (only records whose values changed)."""
+        if not metrics:
+            return
+
+        try:
+            async with self.AsyncSessionLocal() as session:
+                # New records to save
+                metrics_to_save = []
+
+                # Use batched queries to improve performance
+                metric_keys = {}  # For fast lookup
+
+                for metric in metrics:
+                    # Build the unique key: namespace + stream_name + consumer_group + priority
+                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
+                    metric_keys[unique_key] = metric
+
+                # Batch-query the latest records - query in batches to avoid overly long SQL
+                last_records = {}
+                metric_list = list(metric_keys.values())
+                batch_size = 50  # 50 per batch
+
+                for i in range(0, len(metric_list), batch_size):
+                    batch = metric_list[i:i + batch_size]
+
+                    # Build a parameterized query
+                    conditions = []
+                    params = {}
+                    for idx, metric in enumerate(batch):
+                        param_prefix = f"p{i + idx}"
+                        conditions.append(f"""
+                            (namespace = :{param_prefix}_ns
+                             AND stream_name = :{param_prefix}_sn
+                             AND consumer_group = :{param_prefix}_cg
+                             AND priority = :{param_prefix}_pr)
+                        """)
+                        params[f"{param_prefix}_ns"] = metric['namespace']
+                        params[f"{param_prefix}_sn"] = metric['stream_name']
+                        params[f"{param_prefix}_cg"] = metric['consumer_group']
+                        params[f"{param_prefix}_pr"] = metric['priority']
+
+                    if conditions:
+                        # Use a window function to get the latest record for each combination
+                        query_sql = text(f"""
+                            WITH latest_records AS (
+                                SELECT
+                                    namespace,
+                                    stream_name,
+                                    consumer_group,
+                                    priority,
+                                    last_published_offset,
+                                    last_delivered_offset,
+                                    last_acked_offset,
+                                    pending_count,
+                                    backlog_undelivered,
+                                    backlog_unprocessed,
+                                    ROW_NUMBER() OVER (
+                                        PARTITION BY namespace, stream_name, consumer_group, priority
+                                        ORDER BY created_at DESC
+                                    ) as rn
+                                FROM stream_backlog_monitor
+                                WHERE ({' OR '.join(conditions)})
+                            )
+                            SELECT
+                                namespace,
+                                stream_name,
+                                consumer_group,
+                                priority,
+                                last_published_offset,
+                                last_delivered_offset,
+                                last_acked_offset,
+                                pending_count,
+                                backlog_undelivered,
+                                backlog_unprocessed
+                            FROM latest_records
+                            WHERE rn = 1
+                        """)
+
+                        result = await session.execute(query_sql, params)
+                        for row in result:
+                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
+                            last_records[key] = row
+                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")
+
+                # Deduplication check for each metric
+                for unique_key, metric in metric_keys.items():
+                    should_save = False
+
+                    if unique_key not in last_records:
+                        # No historical record, so it needs to be saved
+                        should_save = True
+                    else:
+                        # Compare the key metrics to see whether anything changed
+                        last_record = last_records[unique_key]
+
+                        # Verbose debug logging
+                        changes = []
+                        logger.debug(f"Comparing for {unique_key}:")
+                        logger.debug(f" DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
+                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
+                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
+                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
+                        logger.debug(f" New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
+                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
+                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
+                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")
+
+                        # Compare with consistent types (convert everything to int)
+                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
+                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0
+
+                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
+                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0
+
+                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
+                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0
+
+                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
+                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0
+
+                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
+                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0
+
+                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
+                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0
+
+                        if db_published != new_published:
+                            changes.append(f"published: {db_published} -> {new_published}")
+                        if db_delivered != new_delivered:
+                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
+                        if db_acked != new_acked:
+                            changes.append(f"acked: {db_acked} -> {new_acked}")
+                        if db_pending != new_pending:
+                            changes.append(f"pending: {db_pending} -> {new_pending}")
+                        if db_undelivered != new_undelivered:
+                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
+                        if db_unprocessed != new_unprocessed:
+                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")
+
+                        if changes:
+                            should_save = True
+                        else:
+                            logger.debug(f"Metric unchanged for {unique_key}, skipping")
+
+                    if should_save:
+                        metrics_to_save.append(metric)
+
+                # Insert the monitoring records that changed
+                if metrics_to_save:
+                    insert_sql = text("""
+                        INSERT INTO stream_backlog_monitor
+                        (namespace, stream_name, priority, consumer_group, last_published_offset,
+                         last_delivered_offset, last_acked_offset, pending_count,
+                         backlog_undelivered, backlog_unprocessed, created_at)
+                        VALUES
+                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
+                         :last_delivered_offset, :last_acked_offset, :pending_count,
+                         :backlog_undelivered, :backlog_unprocessed, :created_at)
+                    """)
+
+                    # Insert one row at a time (execute is not given a batched parameter list here)
+                    for metric_data in metrics_to_save:
+                        await session.execute(insert_sql, metric_data)
+
+                    await session.commit()
+                else:
+                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")
+
+        except Exception as e:
+            logger.error(f"Error saving backlog metrics to database: {e}")
+            logger.error(traceback.format_exc())
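
For reference, a minimal usage sketch of the new BacklogMonitor (not part of the package itself): it assumes the import path jettask.persistence.backlog_monitor, placeholder Redis/PostgreSQL URLs, and redis-py 5+ for aclose().

import asyncio

from redis.asyncio import Redis
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker

from jettask.persistence.backlog_monitor import BacklogMonitor


async def main():
    # Placeholder connection settings; adjust to your environment.
    redis_client = Redis.from_url("redis://localhost:6379/0", decode_responses=False)
    engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/jettask")
    session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    monitor = BacklogMonitor(
        redis_client=redis_client,
        async_session_local=session_factory,
        redis_prefix="jettask",       # Redis key prefix (assumed value)
        namespace_name="default",     # namespace name (assumed value)
        node_id="node-1",             # used as the distributed-lock value
        enable_monitor=True,
        monitor_interval=1,           # collect once per second
    )

    await monitor.start()
    try:
        await asyncio.sleep(60)       # let it collect for a while
    finally:
        await monitor.stop()
        await redis_client.aclose()   # redis-py >= 5
        await engine.dispose()


if __name__ == "__main__":
    asyncio.run(main())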