jettask 0.2.19__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jettask/__init__.py +12 -3
- jettask/cli.py +314 -228
- jettask/config/__init__.py +9 -1
- jettask/config/config.py +245 -0
- jettask/config/env_loader.py +381 -0
- jettask/config/lua_scripts.py +158 -0
- jettask/config/nacos_config.py +132 -5
- jettask/core/__init__.py +1 -1
- jettask/core/app.py +1573 -666
- jettask/core/app_importer.py +33 -16
- jettask/core/container.py +532 -0
- jettask/core/task.py +1 -4
- jettask/core/unified_manager_base.py +2 -2
- jettask/executor/__init__.py +38 -0
- jettask/executor/core.py +625 -0
- jettask/executor/executor.py +338 -0
- jettask/executor/orchestrator.py +290 -0
- jettask/executor/process_entry.py +638 -0
- jettask/executor/task_executor.py +317 -0
- jettask/messaging/__init__.py +68 -0
- jettask/messaging/event_pool.py +2188 -0
- jettask/messaging/reader.py +519 -0
- jettask/messaging/registry.py +266 -0
- jettask/messaging/scanner.py +369 -0
- jettask/messaging/sender.py +312 -0
- jettask/persistence/__init__.py +118 -0
- jettask/persistence/backlog_monitor.py +567 -0
- jettask/{backend/data_access.py → persistence/base.py} +58 -57
- jettask/persistence/consumer.py +315 -0
- jettask/{core → persistence}/db_manager.py +23 -22
- jettask/persistence/maintenance.py +81 -0
- jettask/persistence/message_consumer.py +259 -0
- jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
- jettask/persistence/offline_recovery.py +196 -0
- jettask/persistence/queue_discovery.py +215 -0
- jettask/persistence/task_persistence.py +218 -0
- jettask/persistence/task_updater.py +583 -0
- jettask/scheduler/__init__.py +2 -2
- jettask/scheduler/loader.py +6 -5
- jettask/scheduler/run_scheduler.py +1 -1
- jettask/scheduler/scheduler.py +7 -7
- jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
- jettask/task/__init__.py +16 -0
- jettask/{router.py → task/router.py} +26 -8
- jettask/task/task_center/__init__.py +9 -0
- jettask/task/task_executor.py +318 -0
- jettask/task/task_registry.py +291 -0
- jettask/test_connection_monitor.py +73 -0
- jettask/utils/__init__.py +31 -1
- jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
- jettask/utils/db_connector.py +1629 -0
- jettask/{db_init.py → utils/db_init.py} +1 -1
- jettask/utils/rate_limit/__init__.py +30 -0
- jettask/utils/rate_limit/concurrency_limiter.py +665 -0
- jettask/utils/rate_limit/config.py +145 -0
- jettask/utils/rate_limit/limiter.py +41 -0
- jettask/utils/rate_limit/manager.py +269 -0
- jettask/utils/rate_limit/qps_limiter.py +154 -0
- jettask/utils/rate_limit/task_limiter.py +384 -0
- jettask/utils/serializer.py +3 -0
- jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
- jettask/utils/time_sync.py +173 -0
- jettask/webui/__init__.py +27 -0
- jettask/{api/v1 → webui/api}/alerts.py +1 -1
- jettask/{api/v1 → webui/api}/analytics.py +2 -2
- jettask/{api/v1 → webui/api}/namespaces.py +1 -1
- jettask/{api/v1 → webui/api}/overview.py +1 -1
- jettask/{api/v1 → webui/api}/queues.py +3 -3
- jettask/{api/v1 → webui/api}/scheduled.py +1 -1
- jettask/{api/v1 → webui/api}/settings.py +1 -1
- jettask/{api.py → webui/app.py} +253 -145
- jettask/webui/namespace_manager/__init__.py +10 -0
- jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
- jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
- jettask/{run.py → webui/run.py} +2 -2
- jettask/{services → webui/services}/__init__.py +1 -3
- jettask/{services → webui/services}/overview_service.py +34 -16
- jettask/{services → webui/services}/queue_service.py +1 -1
- jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
- jettask/{services → webui/services}/settings_service.py +1 -1
- jettask/worker/__init__.py +53 -0
- jettask/worker/lifecycle.py +1507 -0
- jettask/worker/manager.py +583 -0
- jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
- {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/METADATA +2 -71
- jettask-0.2.23.dist-info/RECORD +145 -0
- jettask/__main__.py +0 -140
- jettask/api/__init__.py +0 -103
- jettask/backend/__init__.py +0 -1
- jettask/backend/api/__init__.py +0 -3
- jettask/backend/api/v1/__init__.py +0 -17
- jettask/backend/api/v1/monitoring.py +0 -431
- jettask/backend/api/v1/namespaces.py +0 -504
- jettask/backend/api/v1/queues.py +0 -342
- jettask/backend/api/v1/tasks.py +0 -367
- jettask/backend/core/__init__.py +0 -3
- jettask/backend/core/cache.py +0 -221
- jettask/backend/core/database.py +0 -200
- jettask/backend/core/exceptions.py +0 -102
- jettask/backend/dependencies.py +0 -261
- jettask/backend/init_meta_db.py +0 -158
- jettask/backend/main.py +0 -1426
- jettask/backend/main_unified.py +0 -78
- jettask/backend/main_v2.py +0 -394
- jettask/backend/models/__init__.py +0 -3
- jettask/backend/models/requests.py +0 -236
- jettask/backend/models/responses.py +0 -230
- jettask/backend/namespace_api_old.py +0 -267
- jettask/backend/services/__init__.py +0 -3
- jettask/backend/start.py +0 -42
- jettask/backend/unified_api_router.py +0 -1541
- jettask/cleanup_deprecated_tables.sql +0 -16
- jettask/core/consumer_manager.py +0 -1695
- jettask/core/delay_scanner.py +0 -256
- jettask/core/event_pool.py +0 -1700
- jettask/core/heartbeat_process.py +0 -222
- jettask/core/task_batch.py +0 -153
- jettask/core/worker_scanner.py +0 -271
- jettask/executors/__init__.py +0 -5
- jettask/executors/asyncio.py +0 -876
- jettask/executors/base.py +0 -30
- jettask/executors/common.py +0 -148
- jettask/executors/multi_asyncio.py +0 -309
- jettask/gradio_app.py +0 -570
- jettask/integrated_gradio_app.py +0 -1088
- jettask/main.py +0 -0
- jettask/monitoring/__init__.py +0 -3
- jettask/pg_consumer.py +0 -1896
- jettask/run_monitor.py +0 -22
- jettask/run_webui.py +0 -148
- jettask/scheduler/multi_namespace_scheduler.py +0 -294
- jettask/scheduler/unified_manager.py +0 -450
- jettask/task_center_client.py +0 -150
- jettask/utils/serializer_optimized.py +0 -33
- jettask/webui_exceptions.py +0 -67
- jettask-0.2.19.dist-info/RECORD +0 -150
- /jettask/{constants.py → config/constants.py} +0 -0
- /jettask/{backend/config.py → config/task_center.py} +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
- /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
- /jettask/{models.py → persistence/models.py} +0 -0
- /jettask/scheduler/{manager.py → task_crud.py} +0 -0
- /jettask/{schema.sql → schemas/schema.sql} +0 -0
- /jettask/{task_center.py → task/task_center/client.py} +0 -0
- /jettask/{monitoring → utils}/file_watcher.py +0 -0
- /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
- /jettask/{api/v1 → webui/api}/__init__.py +0 -0
- /jettask/{webui_config.py → webui/config.py} +0 -0
- /jettask/{webui_models → webui/models}/__init__.py +0 -0
- /jettask/{webui_models → webui/models}/namespace.py +0 -0
- /jettask/{services → webui/services}/alert_service.py +0 -0
- /jettask/{services → webui/services}/analytics_service.py +0 -0
- /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
- /jettask/{services → webui/services}/task_service.py +0 -0
- /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
- /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/WHEEL +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/entry_points.txt +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/licenses/LICENSE +0 -0
- {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/top_level.txt +0 -0
jettask/persistence/backlog_monitor.py
@@ -0,0 +1,567 @@
"""Stream backlog monitoring module

Responsible for monitoring the backlog of Redis Streams, including:
- Collecting backlog metrics for each queue
- Using a distributed lock so that only one instance collects at a time
- Saving the backlog data to the PostgreSQL database
"""

import asyncio
import logging
import traceback
from typing import List, Dict, Optional
from datetime import datetime, timezone

from redis.asyncio import Redis
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import sessionmaker

from jettask.config.constants import is_internal_consumer

logger = logging.getLogger(__name__)


class BacklogMonitor:
    """Stream backlog monitor

    Responsibilities:
    - Periodically collect Redis Stream backlog metrics
    - Use a distributed lock to ensure single-instance collection
    - Persist the metrics to PostgreSQL
    """

    def __init__(
        self,
        redis_client: Redis,
        async_session_local: sessionmaker,
        redis_prefix: str,
        namespace_name: str,
        node_id: str,
        enable_monitor: bool = True,
        monitor_interval: int = 1
    ):
        """Initialize the backlog monitor

        Args:
            redis_client: Async Redis client
            async_session_local: SQLAlchemy session factory
            redis_prefix: Redis key prefix
            namespace_name: Namespace name
            node_id: Node ID
            enable_monitor: Whether monitoring is enabled
            monitor_interval: Collection interval in seconds
        """
        self.redis_client = redis_client
        self.AsyncSessionLocal = async_session_local
        self.redis_prefix = redis_prefix
        self.namespace_name = namespace_name
        self.node_id = node_id

        self.enable_monitor = enable_monitor
        self.monitor_interval = monitor_interval

        # Distributed lock configuration
        self.lock_key = f"{redis_prefix}:BACKLOG_MONITOR_LOCK"
        self.lock_ttl = monitor_interval * 2  # Lock TTL in seconds, twice the collection interval

        # Stream registry key
        self.stream_registry_key = f"{redis_prefix}:STREAM_REGISTRY"

        self._running = False
        self._monitor_task = None

    async def start(self):
        """Start the monitoring task."""
        if not self.enable_monitor:
            logger.info("Backlog monitor is disabled")
            return

        self._running = True
        self._monitor_task = asyncio.create_task(self._monitor_loop())
        logger.info(f"Backlog monitor started with {self.monitor_interval}s interval")

    async def stop(self):
        """Stop the monitoring task."""
        self._running = False
        if self._monitor_task:
            self._monitor_task.cancel()
            try:
                await self._monitor_task
            except asyncio.CancelledError:
                pass
        logger.debug("Backlog monitor stopped")

    async def _monitor_loop(self):
        """Monitoring loop."""
        while self._running:
            try:
                # Try to acquire the distributed lock
                lock_acquired = await self._try_acquire_lock()

                if lock_acquired:
                    try:
                        logger.debug(f"Acquired backlog monitor lock, collecting metrics...")
                        await self._collect_stream_backlog_metrics()
                        logger.debug("Stream backlog metrics collected successfully")
                    finally:
                        # Release the lock
                        await self._release_lock()
                else:
                    logger.debug("Another instance is collecting backlog metrics, skipping...")

                # Wait for the next collection cycle
                await asyncio.sleep(self.monitor_interval)

            except Exception as e:
                logger.error(f"Error in stream backlog monitor: {e}")
                await asyncio.sleep(30)  # Wait 30 seconds after an error

    async def _try_acquire_lock(self) -> bool:
        """Try to acquire the monitor lock (native Redis lock)."""
        try:
            # Implement the distributed lock with SET NX EX
            # NX: only set if the key does not exist
            # EX: expiration time in seconds
            result = await self.redis_client.set(
                self.lock_key.encode(),
                self.node_id.encode(),  # The lock value is the current node ID
                nx=True,  # Only set when absent
                ex=self.lock_ttl  # Expiration time
            )
            return result is not None
        except Exception as e:
            logger.error(f"Error acquiring monitor lock: {e}")
            return False

    async def _release_lock(self):
        """Release the monitor lock (only if this node holds it)."""
        try:
            # Use a Lua script to ensure we only release a lock we hold
            lua_script = """
            if redis.call("get", KEYS[1]) == ARGV[1] then
                return redis.call("del", KEYS[1])
            else
                return 0
            end
            """
            await self.redis_client.eval(
                lua_script,
                1,
                self.lock_key.encode(),
                self.node_id.encode()
            )
        except Exception as e:
            logger.error(f"Error releasing monitor lock: {e}")

    async def _collect_stream_backlog_metrics(self):
        """Collect Stream backlog metrics and save them to the database (offset-based)."""
        try:
            # Get the latest offset of every queue (QUEUE_OFFSETS)
            queue_offsets_key = f"{self.namespace_name}:QUEUE_OFFSETS"
            queue_offsets = {}
            try:
                # The client uses decode_responses=False, so decode manually
                raw_queue_offsets = await self.redis_client.hgetall(queue_offsets_key.encode())
                for k, v in raw_queue_offsets.items():
                    queue_name = k.decode() if isinstance(k, bytes) else k
                    offset_value = v.decode() if isinstance(v, bytes) else v
                    queue_offsets[queue_name] = int(offset_value)
            except Exception as e:
                logger.debug(f"No QUEUE_OFFSETS found for {queue_offsets_key}: {e}")

            # Get the consumed offset of every task group (TASK_OFFSETS)
            task_offsets_key = f"{self.namespace_name}:TASK_OFFSETS"
            task_offsets = {}
            try:
                raw_task_offsets = await self.redis_client.hgetall(task_offsets_key.encode())
                for k, v in raw_task_offsets.items():
                    task_key = k.decode() if isinstance(k, bytes) else k
                    offset_value = v.decode() if isinstance(v, bytes) else v
                    task_offsets[task_key] = int(offset_value)
            except Exception as e:
                logger.debug(f"No TASK_OFFSETS found for {task_offsets_key}: {e}")

            # Use the Stream registry instead of SCAN to obtain queue information
            stream_info_map = {}  # {queue_name: [(stream_key, priority), ...]}

            # Fetch the stream registry from Redis (hash structure)
            # Format: {"queue_name:priority": "stream_key"}
            # For regular queues, priority is 0
            stream_registry = await self.redis_client.hgetall(self.stream_registry_key.encode())

            for queue_priority_bytes, stream_key_bytes in stream_registry.items():
                queue_priority_str = queue_priority_bytes.decode() if isinstance(queue_priority_bytes, bytes) else str(queue_priority_bytes)
                stream_key = stream_key_bytes.decode() if isinstance(stream_key_bytes, bytes) else str(stream_key_bytes)

                # Parse queue_name and priority
                if ':' in queue_priority_str:
                    parts = queue_priority_str.rsplit(':', 1)
                    if len(parts) == 2 and parts[1].isdigit():
                        queue_name = parts[0]
                        priority = int(parts[1])
                    else:
                        # If the last part is not a number, it is a regular queue name that contains a colon
                        queue_name = queue_priority_str
                        priority = 0
                else:
                    # Regular queue
                    queue_name = queue_priority_str
                    priority = 0

                if queue_name not in stream_info_map:
                    stream_info_map[queue_name] = []
                stream_info_map[queue_name].append((stream_key, priority))

            # If the Stream registry is empty, do a one-off scan as initialization (first run only)
            if not stream_info_map:
                # Use the queue registry to obtain queues, avoiding SCAN
                from jettask.messaging.registry import QueueRegistry
                registry = QueueRegistry(
                    redis_client=None,
                    async_redis_client=self.redis_client,
                    redis_prefix=self.redis_prefix
                )

                # Get all queues
                all_queues = await registry.get_all_queues()

                for queue_full_name in all_queues:
                    # Build the stream key
                    stream_key = f"{self.redis_prefix}:QUEUE:{queue_full_name}".encode()

                    # Check whether the stream exists
                    if await self.redis_client.exists(stream_key):
                        # Parse the queue name and priority
                        parts = queue_full_name.split(':')
                        if len(parts) >= 2 and parts[-1].isdigit():
                            # Priority queue
                            queue_name = ':'.join(parts[:-1])
                            priority = int(parts[-1])
                        else:
                            # Regular queue
                            queue_name = queue_full_name
                            priority = 0

                        if queue_name not in stream_info_map:
                            stream_info_map[queue_name] = []
                        stream_info_map[queue_name].append((stream_key, priority))

                # Add the discovered stream information to the registry
                if stream_info_map:
                    pipeline = self.redis_client.pipeline()
                    for queue_name, stream_list in stream_info_map.items():
                        for stream_key, priority in stream_list:
                            if priority > 0:
                                queue_priority_key = f"{queue_name}:{priority}"
                            else:
                                queue_priority_key = queue_name
                            # Make sure stream_key is bytes
                            if isinstance(stream_key, str):
                                stream_key = stream_key.encode()
                            pipeline.hset(self.stream_registry_key.encode(), queue_priority_key.encode(), stream_key)
                    await pipeline.execute()
                    logger.info(f"Registered {sum(len(stream_list) for stream_list in stream_info_map.values())} streams to registry during initialization")

            if not stream_info_map:
                logger.debug("No streams found in registry for backlog monitoring")
                return

            # Debug logging (debug level to avoid flooding the log)
            logger.debug(f"Found {len(stream_info_map)} queues for backlog monitoring")
            for queue_name, stream_list in stream_info_map.items():
                priorities = [p for _, p in stream_list]
                # Filter out non-zero priorities (0 means a regular queue)
                high_priorities = [p for p in priorities if p > 0]
                if high_priorities:
                    logger.debug(f"  - {queue_name}: {len(stream_list)} streams (includes priorities: {sorted(set(priorities))})")
                else:
                    logger.debug(f"  - {queue_name}: regular queue only (priority=0)")

            # Collect metrics for each queue (aggregating all priorities)
            metrics = []
            current_time = datetime.now(timezone.utc)

            for queue_name, stream_list in stream_info_map.items():
                # Handle each priority queue separately
                for stream_key, priority in stream_list:
                    try:
                        # Get the queue's latest offset (taking priority queues into account)
                        if priority > 0:
                            # Priority queue key format: queue_name:priority
                            queue_key = f"{queue_name}:{priority}"
                        else:
                            queue_key = queue_name
                        last_published_offset = queue_offsets.get(queue_key, 0)

                        # Get stream info
                        stream_info = await self.redis_client.xinfo_stream(stream_key)
                        stream_length = stream_info.get(b'length', 0)

                        # Get consumer group info
                        has_consumer_groups = False
                        try:
                            groups = await self.redis_client.xinfo_groups(stream_key)

                            for group in groups:
                                # Handle group_name
                                raw_name = group.get('name', b'')
                                if isinstance(raw_name, bytes):
                                    group_name = raw_name.decode() if raw_name else ''
                                else:
                                    group_name = str(raw_name) if raw_name else ''

                                if not group_name:
                                    group_name = 'unknown'

                                # Filter out internal consumer groups
                                if is_internal_consumer(group_name):
                                    # logger.info(f"Skipping internal consumer group: {group_name}")
                                    continue

                                # Handle pending - already an int
                                pending_count = group.get('pending', 0)

                                # Get this group's consumed offset from TASK_OFFSETS
                                # Extract task_name (the last segment) from group_name
                                task_name = group_name.split(':')[-1]
                                # Build the field: queue name (with priority) + task name
                                # Example: robust_bench2:8:benchmark_task
                                task_offset_key = f"{queue_key}:{task_name}"
                                last_acked_offset = task_offsets.get(task_offset_key, 0)

                                # Compute the backlog metrics
                                # 1. Total backlog = latest queue offset - offset acked by the group
                                total_backlog = max(0, last_published_offset - last_acked_offset)

                                # 2. Undelivered backlog = total backlog - pending count
                                backlog_undelivered = max(0, total_backlog - pending_count)

                                # 3. Delivered but unacked = pending count
                                backlog_delivered_unacked = pending_count

                                # 4. Delivered offset = acked offset + pending count
                                last_delivered_offset = last_acked_offset + pending_count

                                # Create one record per consumer group
                                metrics.append({
                                    'namespace': self.namespace_name,
                                    'stream_name': queue_name,
                                    'priority': priority,  # Include the priority field
                                    'consumer_group': group_name,
                                    'last_published_offset': last_published_offset,
                                    'last_delivered_offset': last_delivered_offset,
                                    'last_acked_offset': last_acked_offset,
                                    'pending_count': pending_count,
                                    'backlog_undelivered': backlog_undelivered,
                                    'backlog_unprocessed': total_backlog,
                                    'created_at': current_time
                                })
                                has_consumer_groups = True

                        except Exception as e:
                            # This queue has no consumer groups
                            stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else str(stream_key)
                            logger.debug(f"No consumer groups for stream {stream_key_str}: {e}")

                        # If there are no consumer groups, save stream-level metrics
                        if not has_consumer_groups and last_published_offset > 0:
                            metrics.append({
                                'namespace': self.namespace_name,
                                'stream_name': queue_name,
                                'priority': priority,  # Include the priority field
                                'consumer_group': None,
                                'last_published_offset': last_published_offset,
                                'last_delivered_offset': 0,
                                'last_acked_offset': 0,
                                'pending_count': 0,
                                'backlog_undelivered': last_published_offset,
                                'backlog_unprocessed': last_published_offset,
                                'created_at': current_time
                            })

                    except Exception as e:
                        stream_key_str = stream_key.decode('utf-8') if isinstance(stream_key, bytes) else str(stream_key)
                        logger.error(f"Error collecting metrics for stream {stream_key_str}: {e}")
                        continue

            # Save the metrics to the database
            if metrics:
                await self._save_backlog_metrics(metrics)

        except Exception as e:
            logger.error(f"Error collecting stream backlog metrics: {e}")
            logger.error(traceback.format_exc())

    async def _save_backlog_metrics(self, metrics: List[Dict]):
        """Save backlog metrics to the database (only rows whose values changed)."""
        if not metrics:
            return

        try:
            async with self.AsyncSessionLocal() as session:
                # New records to save
                metrics_to_save = []

                # Use batched queries for better performance
                metric_keys = {}  # For fast lookups

                for metric in metrics:
                    # Build the unique key: namespace + stream_name + consumer_group + priority
                    unique_key = f"{metric['namespace']}:{metric['stream_name']}:{metric['consumer_group']}:{metric['priority']}"
                    metric_keys[unique_key] = metric

                # Query the latest records in batches to keep the SQL statement short
                last_records = {}
                metric_list = list(metric_keys.values())
                batch_size = 50  # 50 per batch

                for i in range(0, len(metric_list), batch_size):
                    batch = metric_list[i:i + batch_size]

                    # Build the parameterized query
                    conditions = []
                    params = {}
                    for idx, metric in enumerate(batch):
                        param_prefix = f"p{i + idx}"
                        conditions.append(f"""
                            (namespace = :{param_prefix}_ns
                             AND stream_name = :{param_prefix}_sn
                             AND consumer_group = :{param_prefix}_cg
                             AND priority = :{param_prefix}_pr)
                        """)
                        params[f"{param_prefix}_ns"] = metric['namespace']
                        params[f"{param_prefix}_sn"] = metric['stream_name']
                        params[f"{param_prefix}_cg"] = metric['consumer_group']
                        params[f"{param_prefix}_pr"] = metric['priority']

                    if conditions:
                        # Use a window function to get the latest record for each combination
                        query_sql = text(f"""
                            WITH latest_records AS (
                                SELECT
                                    namespace,
                                    stream_name,
                                    consumer_group,
                                    priority,
                                    last_published_offset,
                                    last_delivered_offset,
                                    last_acked_offset,
                                    pending_count,
                                    backlog_undelivered,
                                    backlog_unprocessed,
                                    ROW_NUMBER() OVER (
                                        PARTITION BY namespace, stream_name, consumer_group, priority
                                        ORDER BY created_at DESC
                                    ) as rn
                                FROM stream_backlog_monitor
                                WHERE ({' OR '.join(conditions)})
                            )
                            SELECT
                                namespace,
                                stream_name,
                                consumer_group,
                                priority,
                                last_published_offset,
                                last_delivered_offset,
                                last_acked_offset,
                                pending_count,
                                backlog_undelivered,
                                backlog_unprocessed
                            FROM latest_records
                            WHERE rn = 1
                        """)

                        result = await session.execute(query_sql, params)
                        for row in result:
                            key = f"{row.namespace}:{row.stream_name}:{row.consumer_group}:{row.priority}"
                            last_records[key] = row
                            logger.debug(f"Found last record for {key}: published={row.last_published_offset}")

                # Deduplication check for each metric
                for unique_key, metric in metric_keys.items():
                    should_save = False

                    if unique_key not in last_records:
                        # No historical record, save it
                        should_save = True
                    else:
                        # Compare whether the key metrics changed
                        last_record = last_records[unique_key]

                        # Detailed debug logging
                        changes = []
                        logger.debug(f"Comparing for {unique_key}:")
                        logger.debug(f"  DB record: published={last_record.last_published_offset} (type={type(last_record.last_published_offset)}), "
                                     f"delivered={last_record.last_delivered_offset} (type={type(last_record.last_delivered_offset)}), "
                                     f"acked={last_record.last_acked_offset}, pending={last_record.pending_count}, "
                                     f"undelivered={last_record.backlog_undelivered}, unprocessed={last_record.backlog_unprocessed}")
                        logger.debug(f"  New metric: published={metric['last_published_offset']} (type={type(metric['last_published_offset'])}), "
                                     f"delivered={metric['last_delivered_offset']} (type={type(metric['last_delivered_offset'])}), "
                                     f"acked={metric['last_acked_offset']}, pending={metric['pending_count']}, "
                                     f"undelivered={metric['backlog_undelivered']}, unprocessed={metric['backlog_unprocessed']}")

                        # Compare with consistent types (convert everything to int)
                        db_published = int(last_record.last_published_offset) if last_record.last_published_offset is not None else 0
                        new_published = int(metric['last_published_offset']) if metric['last_published_offset'] is not None else 0

                        db_delivered = int(last_record.last_delivered_offset) if last_record.last_delivered_offset is not None else 0
                        new_delivered = int(metric['last_delivered_offset']) if metric['last_delivered_offset'] is not None else 0

                        db_acked = int(last_record.last_acked_offset) if last_record.last_acked_offset is not None else 0
                        new_acked = int(metric['last_acked_offset']) if metric['last_acked_offset'] is not None else 0

                        db_pending = int(last_record.pending_count) if last_record.pending_count is not None else 0
                        new_pending = int(metric['pending_count']) if metric['pending_count'] is not None else 0

                        db_undelivered = int(last_record.backlog_undelivered) if last_record.backlog_undelivered is not None else 0
                        new_undelivered = int(metric['backlog_undelivered']) if metric['backlog_undelivered'] is not None else 0

                        db_unprocessed = int(last_record.backlog_unprocessed) if last_record.backlog_unprocessed is not None else 0
                        new_unprocessed = int(metric['backlog_unprocessed']) if metric['backlog_unprocessed'] is not None else 0

                        if db_published != new_published:
                            changes.append(f"published: {db_published} -> {new_published}")
                        if db_delivered != new_delivered:
                            changes.append(f"delivered: {db_delivered} -> {new_delivered}")
                        if db_acked != new_acked:
                            changes.append(f"acked: {db_acked} -> {new_acked}")
                        if db_pending != new_pending:
                            changes.append(f"pending: {db_pending} -> {new_pending}")
                        if db_undelivered != new_undelivered:
                            changes.append(f"undelivered: {db_undelivered} -> {new_undelivered}")
                        if db_unprocessed != new_unprocessed:
                            changes.append(f"unprocessed: {db_unprocessed} -> {new_unprocessed}")

                        if changes:
                            should_save = True
                        else:
                            logger.debug(f"Metric unchanged for {unique_key}, skipping")

                    if should_save:
                        metrics_to_save.append(metric)

                # Insert the monitoring rows that changed
                if metrics_to_save:
                    insert_sql = text("""
                        INSERT INTO stream_backlog_monitor
                        (namespace, stream_name, priority, consumer_group, last_published_offset,
                         last_delivered_offset, last_acked_offset, pending_count,
                         backlog_undelivered, backlog_unprocessed, created_at)
                        VALUES
                        (:namespace, :stream_name, :priority, :consumer_group, :last_published_offset,
                         :last_delivered_offset, :last_acked_offset, :pending_count,
                         :backlog_undelivered, :backlog_unprocessed, :created_at)
                    """)

                    # Insert rows one at a time (execute is not given a bulk parameter list here)
                    for metric_data in metrics_to_save:
                        await session.execute(insert_sql, metric_data)

                    await session.commit()
                else:
                    logger.debug(f"No metrics changed, skipped saving all {len(metrics)} records")

        except Exception as e:
            logger.error(f"Error saving backlog metrics to database: {e}")
            logger.error(traceback.format_exc())
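
A minimal usage sketch for the new persistence.backlog_monitor module, assuming a reachable Redis instance and a PostgreSQL database that already contains the stream_backlog_monitor table; the connection URLs, prefix, namespace, and node ID below are placeholders rather than values shipped with the package:

import asyncio

from redis.asyncio import Redis
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker

from jettask.persistence.backlog_monitor import BacklogMonitor


async def main():
    # Placeholder connection settings -- adjust for your environment.
    redis_client = Redis.from_url("redis://localhost:6379/0", decode_responses=False)
    engine = create_async_engine("postgresql+asyncpg://user:pass@localhost:5432/jettask")
    session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    monitor = BacklogMonitor(
        redis_client=redis_client,
        async_session_local=session_factory,
        redis_prefix="jettask",
        namespace_name="default",
        node_id="node-1",
        monitor_interval=5,
    )
    await monitor.start()
    try:
        # Let the monitor run a few collection cycles.
        await asyncio.sleep(30)
    finally:
        await monitor.stop()
        await redis_client.close()
        await engine.dispose()


if __name__ == "__main__":
    asyncio.run(main())

For reference, the arithmetic in _collect_stream_backlog_metrics works out as follows: if a queue's last published offset is 1000, a group's acked offset is 900, and 30 messages are pending, then backlog_unprocessed = 100, backlog_undelivered = 70, and last_delivered_offset = 930.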