aury-boot 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/adapter/http.py +17 -6
- aury/boot/application/app/base.py +1 -0
- aury/boot/application/app/components.py +93 -3
- aury/boot/application/config/settings.py +80 -2
- aury/boot/commands/init.py +20 -0
- aury/boot/commands/pkg.py +31 -1
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +1 -0
- aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl +239 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +15 -0
- aury/boot/common/logging/setup.py +8 -3
- aury/boot/infrastructure/cache/redis.py +82 -16
- aury/boot/infrastructure/channel/__init__.py +2 -1
- aury/boot/infrastructure/channel/backends/__init__.py +2 -1
- aury/boot/infrastructure/channel/backends/redis_cluster.py +124 -0
- aury/boot/infrastructure/channel/backends/redis_cluster_channel.py +139 -0
- aury/boot/infrastructure/channel/base.py +2 -0
- aury/boot/infrastructure/channel/manager.py +9 -1
- aury/boot/infrastructure/clients/redis/manager.py +90 -19
- aury/boot/infrastructure/database/manager.py +6 -4
- aury/boot/infrastructure/monitoring/__init__.py +10 -2
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +33 -16
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +14 -13
- aury/boot/infrastructure/monitoring/profiling/__init__.py +664 -0
- aury/boot/infrastructure/scheduler/__init__.py +2 -0
- aury/boot/infrastructure/scheduler/jobstores/__init__.py +10 -0
- aury/boot/infrastructure/scheduler/jobstores/redis_cluster.py +255 -0
- aury/boot/infrastructure/scheduler/manager.py +15 -3
- aury/boot/toolkit/http/__init__.py +180 -85
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/METADATA +14 -4
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/RECORD +33 -27
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
"""Profiling 模块。
|
|
2
|
+
|
|
3
|
+
提供持续性能分析和问题时刻状态快照功能:
|
|
4
|
+
- Pyroscope 集成:持续采样生成火焰图
|
|
5
|
+
- 事件循环阻塞检测:检测同步代码阻塞协程
|
|
6
|
+
|
|
7
|
+
使用方式:
|
|
8
|
+
# 通过配置启用
|
|
9
|
+
PROFILING__ENABLED=true
|
|
10
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
11
|
+
|
|
12
|
+
# 事件循环阻塞检测
|
|
13
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
14
|
+
PROFILING__BLOCKING_THRESHOLD_MS=100
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import sys
|
|
21
|
+
import threading
|
|
22
|
+
import time
|
|
23
|
+
import traceback
|
|
24
|
+
from collections import deque
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from datetime import datetime
|
|
27
|
+
from typing import TYPE_CHECKING, Any
|
|
28
|
+
|
|
29
|
+
from aury.boot.common.logging import logger
|
|
30
|
+
|
|
31
|
+
# Pyroscope 可选依赖
|
|
32
|
+
try:
|
|
33
|
+
import pyroscope
|
|
34
|
+
PYROSCOPE_AVAILABLE = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
pyroscope = None # type: ignore[assignment]
|
|
37
|
+
PYROSCOPE_AVAILABLE = False
|
|
38
|
+
|
|
39
|
+
# psutil 可选依赖(用于进程资源监控)
|
|
40
|
+
try:
|
|
41
|
+
import psutil
|
|
42
|
+
PSUTIL_AVAILABLE = True
|
|
43
|
+
except ImportError:
|
|
44
|
+
psutil = None # type: ignore[assignment]
|
|
45
|
+
PSUTIL_AVAILABLE = False
|
|
46
|
+
|
|
47
|
+
if TYPE_CHECKING:
|
|
48
|
+
from collections.abc import Callable
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# =============================================================================
|
|
52
|
+
# 配置
|
|
53
|
+
# =============================================================================
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class ProfilingConfig:
    """Settings for the profiling subsystem.

    One flat record holding the Pyroscope options, the event-loop blocking
    detector options and the tags handed to Pyroscope.
    """

    # ---- Pyroscope ----
    # ProfilingManager.start() starts Pyroscope only when this is True.
    enabled: bool = False
    # Server address; PyroscopeProfiler.start() refuses to start without it.
    pyroscope_endpoint: str | None = None
    # Optional auth token; an empty string is sent when it is None.
    pyroscope_auth_token: str | None = None
    # Reported to Pyroscope as the application name.
    service_name: str = "aury-service"
    environment: str = "development"

    # ---- Event-loop blocking detector ----
    blocking_detector_enabled: bool = False
    # Pause between two consecutive checks, in milliseconds.
    blocking_check_interval_ms: float = 100
    # A check slower than this (milliseconds) counts as a blocking event.
    blocking_threshold_ms: float = 100
    # At or above this (milliseconds) the event is logged/alerted as severe.
    blocking_severe_threshold_ms: float = 500
    blocking_alert_enabled: bool = True
    # Minimum number of seconds between two alerts.
    blocking_alert_cooldown_seconds: float = 60
    # Maximum number of blocking events kept in memory.
    blocking_max_history: int = 50

    # Length of the sliding statistics window, in seconds (default 5 minutes).
    blocking_stats_window_seconds: float = 300

    # ---- Tags ----
    # Passed to pyroscope.configure(tags=...).
    tags: dict[str, str] = field(default_factory=dict)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# =============================================================================
|
|
84
|
+
# Pyroscope 集成
|
|
85
|
+
# =============================================================================
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class PyroscopeProfiler:
    """Thin lifecycle wrapper around the optional ``pyroscope`` package.

    ``start()`` configures Pyroscope from a :class:`ProfilingConfig` and
    ``stop()`` shuts it down again; both are no-ops when already in the
    requested state.
    """

    def __init__(self, config: ProfilingConfig) -> None:
        self._initialized = False
        self._config = config

    def start(self) -> bool:
        """Configure Pyroscope.

        Returns:
            True when profiling is (already) running, False when the package
            is missing, no endpoint is configured, or configuration failed.
        """
        if self._initialized:
            return True

        if not PYROSCOPE_AVAILABLE:
            logger.warning("Pyroscope 未安装,跳过 profiling 初始化 (pip install pyroscope-io)")
            return False

        if not self._config.pyroscope_endpoint:
            logger.warning("Pyroscope endpoint 未配置,跳过初始化")
            return False

        try:
            cfg = self._config
            pyroscope.configure(
                application_name=cfg.service_name,
                server_address=cfg.pyroscope_endpoint,
                # Pyroscope is given an empty string rather than None.
                auth_token=cfg.pyroscope_auth_token or "",
                tags=cfg.tags,
            )
            self._initialized = True
            started_msg = (
                f"Pyroscope profiling 已启动 | endpoint={cfg.pyroscope_endpoint} service={cfg.service_name}"
            )
            logger.info(started_msg)
            return True
        except Exception as e:
            logger.error(f"Pyroscope 初始化失败: {e}")
            return False

    def stop(self) -> None:
        """Shut Pyroscope down; does nothing when it was never started."""
        if self._initialized:
            try:
                if PYROSCOPE_AVAILABLE:
                    pyroscope.shutdown()
                self._initialized = False
                logger.info("Pyroscope profiling 已停止")
            except Exception as e:
                # A failed shutdown leaves the profiler marked as running.
                logger.warning(f"Pyroscope 关闭失败: {e}")

    @property
    def is_running(self) -> bool:
        """Whether ``start()`` has succeeded and ``stop()`` has not yet."""
        return self._initialized
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# =============================================================================
|
|
149
|
+
# 事件循环阻塞检测
|
|
150
|
+
# =============================================================================
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class BlockingEvent:
    """One detected event-loop stall, as stored in the detector history."""

    # When the event was recorded.
    timestamp: datetime
    # Measured delay in milliseconds.
    blocked_ms: float
    # The "best" sampled stack: the one containing the most user-code frames.
    main_thread_stack: list[dict[str, Any]]
    # Every distinct stack sampled while the stall was in progress.
    all_sampled_stacks: list[list[dict[str, Any]]] = field(default_factory=list)
    # Process snapshot (CPU, RSS, threads, fds); None when unavailable.
    process_stats: dict[str, Any] | None = None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class EventLoopBlockingDetector:
    """Detect synchronous code that blocks the asyncio event loop.

    A daemon thread periodically submits a no-op coroutine to the monitored
    loop and measures how long the loop takes to run it. While a ping is
    outstanding the thread samples the main thread's call stack. When the
    delay exceeds ``blocking_threshold_ms`` a :class:`BlockingEvent` is
    stored, logged and, if enabled and outside the cooldown, sent as an alert.

    Useful for tracking down:
    - synchronous I/O inside coroutines
    - CPU-heavy code running on the event loop
    - deadlocks or long lock waits

    NOTE(review): stacks are always captured from the process main thread,
    so the monitored loop is presumably expected to run there — confirm.
    """

    # Poll/sample period (seconds) while a ping is outstanding.
    _SAMPLE_INTERVAL_SECONDS = 0.01

    def __init__(self, config: ProfilingConfig) -> None:
        self._config = config
        self._running = False
        self._thread: threading.Thread | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._blocking_events: list[BlockingEvent] = []
        self._lock = threading.Lock()
        # Sliding-window statistics: one (wall-clock time, was_blocked) per check.
        self._check_history: deque[tuple[float, bool]] = deque()
        self._last_alert_time: float = 0

    def start(self, loop: asyncio.AbstractEventLoop | None = None) -> None:
        """Start the monitor thread.

        Args:
            loop: Loop to monitor. Defaults to the currently running loop;
                when there is none the detector logs a warning and stays off.
        """
        if self._running:
            return

        try:
            self._loop = loop or asyncio.get_running_loop()
        except RuntimeError:
            logger.warning("无法获取事件循环,阻塞检测器未启动")
            return

        self._running = True
        self._thread = threading.Thread(
            target=self._monitor_loop,
            daemon=True,
            name="blocking-detector",
        )
        self._thread.start()
        logger.info(
            f"事件循环阻塞检测已启动 | "
            f"阈值={self._config.blocking_threshold_ms}ms "
            f"严重阈值={self._config.blocking_severe_threshold_ms}ms"
        )

    def stop(self) -> None:
        """Ask the monitor thread to exit and wait up to one second for it."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=1.0)
            self._thread = None
        logger.info("事件循环阻塞检测已停止")

    def _monitor_loop(self) -> None:
        """Thread body: run one check per interval until stopped.

        The thread also stops by itself once the monitored loop is closed,
        instead of failing on every cycle for the rest of the process.
        """
        while self._running and self._loop:
            loop = self._loop
            if loop.is_closed():
                self._running = False
                break

            try:
                self._check_once(loop)
            except Exception as e:
                # Best effort: a failed check must never kill the monitor.
                logger.debug(f"阻塞检测循环异常: {e}")

            time.sleep(self._config.blocking_check_interval_ms / 1000)

    def _check_once(self, loop: asyncio.AbstractEventLoop) -> None:
        """Submit one ping, time it and record the outcome."""
        start_time = time.perf_counter()
        ping = self._ping()
        try:
            future = asyncio.run_coroutine_threadsafe(ping, loop)
        except RuntimeError:
            # The loop refused the callback (closed): the coroutine was never
            # scheduled, so close it to avoid a "never awaited" warning.
            ping.close()
            raise

        completed, sampled_stacks = self._wait_for_ping(future, start_time)
        elapsed_ms = (time.perf_counter() - start_time) * 1000

        if completed:
            is_blocked = elapsed_ms > self._config.blocking_threshold_ms
        else:
            # Deadline hit: drop the stale ping so they do not accumulate on
            # a loop that stays blocked across several checks.
            future.cancel()
            is_blocked = True

        if is_blocked:
            self._record_blocking(elapsed_ms, sampled_stacks)
        self._record_check(is_blocked)

    def _wait_for_ping(
        self,
        future: Any,
        start_time: float,
    ) -> tuple[bool, list[list[dict[str, Any]]]]:
        """Wait for a ping while sampling the main thread stack.

        Polls ``future`` every ``_SAMPLE_INTERVAL_SECONDS`` for at most ten
        times ``blocking_threshold_ms``. Once half the threshold has elapsed,
        each poll that times out captures a stack (consecutive duplicates are
        skipped).

        Args:
            future: The ``concurrent.futures.Future`` of the submitted ping.
            start_time: ``time.perf_counter()`` value taken at submission.

        Returns:
            ``(completed, sampled_stacks)``; ``completed`` is False only when
            the deadline passed without the ping finishing.
        """
        # Function-scope import (cheap after the first call). Before Python
        # 3.11 this class is NOT the builtin TimeoutError, so catching the
        # builtin would miss the timeout raised by Future.result().
        from concurrent.futures import TimeoutError as FutureTimeoutError

        sampled: list[list[dict[str, Any]]] = []
        threshold_ms = self._config.blocking_threshold_ms
        deadline = time.perf_counter() + threshold_ms * 10 / 1000

        while time.perf_counter() < deadline:
            try:
                future.result(timeout=self._SAMPLE_INTERVAL_SECONDS)
                return True, sampled
            except FutureTimeoutError:
                elapsed = (time.perf_counter() - start_time) * 1000
                if elapsed > threshold_ms * 0.5:
                    stack = self._capture_main_thread_stack()
                    if stack and (not sampled or stack != sampled[-1]):
                        sampled.append(stack)
            except Exception:
                # Ping cancelled or failed: stop waiting and let the caller
                # judge the check by elapsed time.
                return True, sampled

        return False, sampled

    async def _ping(self) -> None:
        """No-op coroutine; only its scheduling delay is measured."""
        pass

    def _record_blocking(
        self,
        blocked_ms: float,
        sampled_stacks: list[list[dict[str, Any]]] | None = None,
    ) -> None:
        """Store, log and possibly alert on one blocking event.

        Args:
            blocked_ms: Measured delay in milliseconds.
            sampled_stacks: Stacks captured during the stall. When empty the
                current main thread stack is captured instead.
        """
        if sampled_stacks:
            # Main stack: the sample with the most user-code frames.
            stack = self._merge_sampled_stacks(sampled_stacks)
            unique_stacks = self._dedupe_stacks(sampled_stacks)
        else:
            stack = self._capture_main_thread_stack()
            unique_stacks = [stack] if stack else []

        event = BlockingEvent(
            timestamp=datetime.now(),
            blocked_ms=round(blocked_ms, 2),
            main_thread_stack=stack,
            all_sampled_stacks=unique_stacks,
            process_stats=self._capture_process_stats(),
        )

        with self._lock:
            self._blocking_events.append(event)
            # Keep only the newest ``blocking_max_history`` events.
            overflow = len(self._blocking_events) - self._config.blocking_max_history
            if overflow > 0:
                del self._blocking_events[:overflow]

        self._log_blocking(event)

        if self._config.blocking_alert_enabled and self._loop:
            self._maybe_send_alert(event)

    def _capture_main_thread_stack(self) -> list[dict[str, Any]]:
        """Return the main thread's current stack as a list of frame dicts.

        Frames from ``<frozen ...>`` modules and from this profiling module
        are skipped; only the innermost 20 remaining frames are returned.
        Each frame dict has the keys ``file``, ``line``, ``function``, ``code``.
        """
        main_thread_id = threading.main_thread().ident
        if not main_thread_id:
            return []

        # One snapshot only: a second call could return a different mapping.
        frame = sys._current_frames().get(main_thread_id)
        if frame is None:
            return []

        stack = [
            {
                "file": filename,
                "line": lineno,
                "function": name,
                "code": line,
            }
            for filename, lineno, name, line in traceback.extract_stack(frame)
            if "<frozen" not in filename and "monitoring/profiling" not in filename
        ]
        return stack[-20:]

    def _is_user_code(self, filename: str) -> bool:
        """Heuristically tell user code from stdlib/third-party code by path."""
        if not filename:
            return False
        is_stdlib = any(p in filename for p in (
            "/lib/python", "/Lib/Python", "/opt/homebrew/Cellar/python",
            "/.pyenv/", "/Python.framework/"
        ))
        is_site_packages = "site-packages" in filename or "dist-packages" in filename
        return not is_stdlib and not is_site_packages

    def _score_stack(self, stack: list[dict[str, Any]]) -> int:
        """Return the number of user-code frames in ``stack``."""
        return sum(1 for f in stack if self._is_user_code(f.get("file", "")))

    def _stack_signature(self, stack: list[dict[str, Any]]) -> str:
        """Build a de-duplication key from the last five ``file:line`` pairs."""
        return "|".join(f"{f.get('file', '')}:{f.get('line', '')}" for f in stack[-5:])

    def _dedupe_stacks(
        self, stacks: list[list[dict[str, Any]]]
    ) -> list[list[dict[str, Any]]]:
        """Return ``stacks`` without signature duplicates, keeping first-seen order."""
        seen: set[str] = set()
        unique: list[list[dict[str, Any]]] = []
        for stack in stacks:
            sig = self._stack_signature(stack)
            if sig not in seen:
                seen.add(sig)
                unique.append(stack)
        return unique

    def _merge_sampled_stacks(
        self, sampled_stacks: list[list[dict[str, Any]]]
    ) -> list[dict[str, Any]]:
        """Return the sampled stack with the most user-code frames ([] if none)."""
        if not sampled_stacks:
            return []
        return max(sampled_stacks, key=self._score_stack)

    def _capture_process_stats(self) -> dict[str, Any] | None:
        """Snapshot CPU, RSS, thread and fd counts via psutil.

        Returns:
            The stats dict, or None when psutil is missing or the query fails.
        """
        if not PSUTIL_AVAILABLE:
            return None

        try:
            proc = psutil.Process()
            with proc.oneshot():
                return {
                    "cpu_percent": proc.cpu_percent(),
                    "memory_rss_mb": round(proc.memory_info().rss / 1024**2, 2),
                    "num_threads": proc.num_threads(),
                    # num_fds() does not exist on every platform.
                    "num_fds": proc.num_fds() if hasattr(proc, "num_fds") else None,
                }
        except Exception:
            return None

    def _format_stack(self, stack: list[dict[str, Any]], limit: int = 5, highlight_user: bool = True) -> str:
        """Render the innermost ``limit`` frames as text.

        Every frame yields a location line; the source line follows only when
        its text is available, so frames without source still show up.

        Args:
            stack: Frame dicts as produced by ``_capture_main_thread_stack``.
            limit: Number of innermost frames to render.
            highlight_user: Prefix user-code frames with an arrow marker.
        """
        lines = []
        for frame in stack[-limit:]:
            filename = frame['file']
            is_user = self._is_user_code(filename)
            prefix = "→ " if (highlight_user and is_user) else " "
            lines.append(f"{prefix}{filename}:{frame['line']} in {frame['function']}")
            if frame.get("code"):
                lines.append(f" > {frame['code']}")
        return "\n".join(lines)

    def _record_check(self, is_block: bool) -> None:
        """Append one check result to the sliding window and drop expired ones."""
        now = time.time()
        with self._lock:
            self._check_history.append((now, is_block))
            cutoff = now - self._config.blocking_stats_window_seconds
            while self._check_history and self._check_history[0][0] < cutoff:
                self._check_history.popleft()

    def _get_window_stats(self) -> tuple[int, int]:
        """Count checks inside the statistics window.

        Returns:
            (total_checks, total_blocks)
        """
        cutoff = time.time() - self._config.blocking_stats_window_seconds
        total_checks = 0
        total_blocks = 0

        with self._lock:
            for ts, is_block in self._check_history:
                if ts >= cutoff:
                    total_checks += 1
                    if is_block:
                        total_blocks += 1

        return total_checks, total_blocks

    def _log_blocking(self, event: BlockingEvent) -> None:
        """Log one event: error level when severe, warning otherwise."""
        is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
        log_fn = logger.error if is_severe else logger.warning

        total_checks, total_blocks = self._get_window_stats()
        window_minutes = int(self._config.blocking_stats_window_seconds / 60)

        stats_str = ""
        if event.process_stats:
            s = event.process_stats
            stats_str = f" | CPU={s.get('cpu_percent', 'N/A')}% RSS={s.get('memory_rss_mb', 'N/A')}MB threads={s.get('num_threads', 'N/A')}"

        has_user_code = self._score_stack(event.main_thread_stack) > 0

        stack_lines = []

        if has_user_code:
            stack_lines.append("调用栈 (→ 标记用户代码):")
            stack_lines.append(self._format_stack(event.main_thread_stack, limit=8))
        else:
            # No user frame at all: likely blocked inside a library/framework.
            stack_lines.append("调用栈 (无用户代码,可能是三方库/框架内部阻塞):")
            stack_lines.append(self._format_stack(event.main_thread_stack, limit=5, highlight_user=False))

        # Also show up to three of the other distinct sampled stacks.
        if len(event.all_sampled_stacks) > 1:
            stack_lines.append(f"\n共采样到 {len(event.all_sampled_stacks)} 个不同堆栈:")
            for i, stack in enumerate(event.all_sampled_stacks[:3], 1):
                if stack != event.main_thread_stack:
                    stack_lines.append(f"--- 采样 #{i} ---")
                    stack_lines.append(self._format_stack(stack, limit=3, highlight_user=False))

        log_fn(
            f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms "
            f"(阈值={self._config.blocking_threshold_ms}ms, "
            f"近{window_minutes}分钟={total_blocks}次, "
            f"阻塞率={total_blocks / max(total_checks, 1) * 100:.2f}%){stats_str}\n"
            + "\n".join(stack_lines)
        )

    def _maybe_send_alert(self, event: BlockingEvent) -> None:
        """Schedule an alert on the monitored loop unless still cooling down."""
        now = time.time()
        if now - self._last_alert_time < self._config.blocking_alert_cooldown_seconds:
            return

        self._last_alert_time = now
        alert = self._send_alert(event)
        try:
            asyncio.run_coroutine_threadsafe(alert, self._loop)
        except RuntimeError as e:
            # Loop closed: the coroutine was never scheduled, so release it.
            alert.close()
            logger.debug(f"发送阻塞告警失败: {e}")

    async def _send_alert(self, event: BlockingEvent) -> None:
        """Emit the alert through the alerting module; failures are only logged."""
        try:
            from aury.boot.infrastructure.monitoring.alerting import (
                AlertEventType,
                AlertSeverity,
                emit_alert,
            )

            is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
            severity = AlertSeverity.CRITICAL if is_severe else AlertSeverity.WARNING

            total_checks, total_blocks = self._get_window_stats()
            window_minutes = int(self._config.blocking_stats_window_seconds / 60)

            await emit_alert(
                AlertEventType.CUSTOM,
                f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms",
                severity=severity,
                source="blocking_detector",
                blocked_ms=event.blocked_ms,
                threshold_ms=self._config.blocking_threshold_ms,
                window_minutes=window_minutes,
                total_blocks=total_blocks,
                block_rate=f"{total_blocks / max(total_checks, 1) * 100:.2f}%",
                stacktrace=self._format_stack(event.main_thread_stack),
                process_stats=event.process_stats,
            )
        except Exception as e:
            logger.debug(f"发送阻塞告警失败: {e}")

    def get_status(self) -> dict[str, Any]:
        """Return running flag, effective config, window stats and stored events."""
        total_checks, total_blocks = self._get_window_stats()
        window_minutes = int(self._config.blocking_stats_window_seconds / 60)

        with self._lock:
            events = [
                {
                    "timestamp": e.timestamp.isoformat(),
                    "blocked_ms": e.blocked_ms,
                    "stack": e.main_thread_stack,
                    "process_stats": e.process_stats,
                }
                for e in self._blocking_events
            ]

        return {
            "running": self._running,
            "config": {
                "check_interval_ms": self._config.blocking_check_interval_ms,
                "threshold_ms": self._config.blocking_threshold_ms,
                "severe_threshold_ms": self._config.blocking_severe_threshold_ms,
                "alert_enabled": self._config.blocking_alert_enabled,
                "stats_window_seconds": self._config.blocking_stats_window_seconds,
            },
            "stats": {
                "window_minutes": window_minutes,
                "total_checks": total_checks,
                "total_blocks": total_blocks,
                "block_rate_percent": round(
                    total_blocks / max(total_checks, 1) * 100, 2
                ),
            },
            "recent_events": events,
        }

    def clear_history(self) -> None:
        """Drop all stored events and sliding-window check records."""
        with self._lock:
            self._blocking_events.clear()
            self._check_history.clear()

    @property
    def is_running(self) -> bool:
        """Whether the detector is currently marked as running."""
        return self._running
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
# =============================================================================
|
|
572
|
+
# 统一管理器
|
|
573
|
+
# =============================================================================
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
class ProfilingManager:
    """Single entry point owning the Pyroscope profiler and the blocking detector.

    ``configure()`` builds both components from one :class:`ProfilingConfig`;
    ``start()`` and ``stop()`` drive their lifecycle.
    """

    _instance: "ProfilingManager | None" = None

    def __init__(self) -> None:
        self._config: ProfilingConfig | None = None
        self._pyroscope: PyroscopeProfiler | None = None
        self._blocking_detector: EventLoopBlockingDetector | None = None

    @classmethod
    def get_instance(cls) -> "ProfilingManager":
        """Return the shared manager, creating it on first use."""
        existing = cls._instance
        if existing is not None:
            return existing
        cls._instance = cls()
        return cls._instance

    def configure(self, config: ProfilingConfig) -> None:
        """Store ``config`` and build fresh components from it."""
        self._config = config
        self._pyroscope = PyroscopeProfiler(config)
        self._blocking_detector = EventLoopBlockingDetector(config)

    async def start(self) -> None:
        """Start each component that is enabled in the configuration."""
        config = self._config
        if not config:
            logger.warning("ProfilingManager 未配置")
            return

        profiler = self._pyroscope
        if config.enabled and profiler:
            profiler.start()

        detector = self._blocking_detector
        if config.blocking_detector_enabled and detector:
            detector.start()

    async def stop(self) -> None:
        """Stop Pyroscope first, then the blocking detector."""
        for component in (self._pyroscope, self._blocking_detector):
            if component:
                component.stop()

    @property
    def pyroscope(self) -> PyroscopeProfiler | None:
        """The Pyroscope profiler, or None before ``configure()``."""
        return self._pyroscope

    @property
    def blocking_detector(self) -> EventLoopBlockingDetector | None:
        """The blocking detector, or None before ``configure()``."""
        return self._blocking_detector

    def get_status(self) -> dict[str, Any]:
        """Return a status dict covering both components."""
        profiler = self._pyroscope
        detector = self._blocking_detector

        profiler_running = profiler.is_running if profiler else False
        if detector:
            detector_status = detector.get_status()
        else:
            detector_status = {"running": False}

        return {
            "pyroscope": {
                "available": PYROSCOPE_AVAILABLE,
                "running": profiler_running,
            },
            "blocking_detector": detector_status,
        }
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
# 便捷访问
|
|
650
|
+
def get_profiling_manager() -> ProfilingManager:
    """Return the process-wide :class:`ProfilingManager` singleton."""
    manager = ProfilingManager.get_instance()
    return manager
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
# Names exported by ``from <this module> import *``: the public classes, the
# manager accessor and the two optional-dependency availability flags.
__all__ = [
    "BlockingEvent",
    "EventLoopBlockingDetector",
    "ProfilingConfig",
    "ProfilingManager",
    "PyroscopeProfiler",
    "get_profiling_manager",
    "PSUTIL_AVAILABLE",
    "PYROSCOPE_AVAILABLE",
]
|
|
@@ -8,9 +8,11 @@ from .exceptions import (
|
|
|
8
8
|
SchedulerError,
|
|
9
9
|
SchedulerJobError,
|
|
10
10
|
)
|
|
11
|
+
from .jobstores import RedisClusterJobStore
|
|
11
12
|
from .manager import SchedulerManager
|
|
12
13
|
|
|
13
14
|
__all__ = [
|
|
15
|
+
"RedisClusterJobStore",
|
|
14
16
|
"SchedulerBackendError",
|
|
15
17
|
"SchedulerError",
|
|
16
18
|
"SchedulerJobError",
|