aury-boot 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/adapter/http.py +17 -6
- aury/boot/application/app/base.py +1 -0
- aury/boot/application/app/components.py +81 -2
- aury/boot/application/config/settings.py +73 -0
- aury/boot/application/migrations/setup.py +14 -2
- aury/boot/commands/init.py +20 -0
- aury/boot/commands/pkg.py +31 -1
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +1 -0
- aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl +239 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +15 -0
- aury/boot/common/logging/setup.py +8 -3
- aury/boot/infrastructure/database/manager.py +6 -4
- aury/boot/infrastructure/monitoring/__init__.py +10 -2
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +32 -16
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +14 -13
- aury/boot/infrastructure/monitoring/profiling/__init__.py +573 -0
- aury/boot/infrastructure/mq/backends/redis_stream.py +8 -1
- aury/boot/infrastructure/scheduler/manager.py +15 -3
- aury/boot/toolkit/http/__init__.py +180 -85
- {aury_boot-0.0.38.dist-info → aury_boot-0.0.40.dist-info}/METADATA +10 -4
- {aury_boot-0.0.38.dist-info → aury_boot-0.0.40.dist-info}/RECORD +24 -22
- {aury_boot-0.0.38.dist-info → aury_boot-0.0.40.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.38.dist-info → aury_boot-0.0.40.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
"""Profiling 模块。
|
|
2
|
+
|
|
3
|
+
提供持续性能分析和问题时刻状态快照功能:
|
|
4
|
+
- Pyroscope 集成:持续采样生成火焰图
|
|
5
|
+
- 事件循环阻塞检测:检测同步代码阻塞协程
|
|
6
|
+
|
|
7
|
+
使用方式:
|
|
8
|
+
# 通过配置启用
|
|
9
|
+
PROFILING__ENABLED=true
|
|
10
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
11
|
+
|
|
12
|
+
# 事件循环阻塞检测
|
|
13
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
14
|
+
PROFILING__BLOCKING_THRESHOLD_MS=100
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import sys
|
|
21
|
+
import threading
|
|
22
|
+
import time
|
|
23
|
+
import traceback
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from datetime import datetime
|
|
26
|
+
from typing import TYPE_CHECKING, Any
|
|
27
|
+
|
|
28
|
+
from aury.boot.common.logging import logger
|
|
29
|
+
|
|
30
|
+
# Pyroscope 可选依赖
|
|
31
|
+
try:
|
|
32
|
+
import pyroscope
|
|
33
|
+
PYROSCOPE_AVAILABLE = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
pyroscope = None # type: ignore[assignment]
|
|
36
|
+
PYROSCOPE_AVAILABLE = False
|
|
37
|
+
|
|
38
|
+
# psutil 可选依赖(用于进程资源监控)
|
|
39
|
+
try:
|
|
40
|
+
import psutil
|
|
41
|
+
PSUTIL_AVAILABLE = True
|
|
42
|
+
except ImportError:
|
|
43
|
+
psutil = None # type: ignore[assignment]
|
|
44
|
+
PSUTIL_AVAILABLE = False
|
|
45
|
+
|
|
46
|
+
if TYPE_CHECKING:
|
|
47
|
+
from collections.abc import Callable
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# =============================================================================
|
|
51
|
+
# 配置
|
|
52
|
+
# =============================================================================
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class ProfilingConfig:
    """Settings consumed by the profiling components in this module.

    One value object carries the Pyroscope connection options, the tuning
    knobs of the event-loop blocking detector, and free-form tags.
    """

    # ---- Pyroscope ---------------------------------------------------------
    # Master switch: the manager only starts Pyroscope when this is true.
    enabled: bool = False
    # Pyroscope server address; the profiler refuses to start without it.
    pyroscope_endpoint: str | None = None
    # Optional auth token; an empty string is sent when it is unset.
    pyroscope_auth_token: str | None = None
    # Reported to Pyroscope as the application name.
    service_name: str = "aury-service"
    # Deployment environment label (not read anywhere in this module).
    environment: str = "development"

    # ---- Event-loop blocking detector --------------------------------------
    # The manager only starts the detector when this is true.
    blocking_detector_enabled: bool = False
    # Pause between two consecutive responsiveness checks, in milliseconds.
    blocking_check_interval_ms: float = 100
    # A ping delayed longer than this many milliseconds counts as blocking.
    blocking_threshold_ms: float = 100
    # At or above this delay the event is logged at error level and the
    # alert severity is raised to critical.
    blocking_severe_threshold_ms: float = 500
    # Whether recorded blocking events may trigger alerts.
    blocking_alert_enabled: bool = True
    # Minimum number of seconds between two alerts.
    blocking_alert_cooldown_seconds: float = 60
    # Maximum number of blocking events kept in the in-memory history.
    blocking_max_history: int = 50

    # ---- Tags --------------------------------------------------------------
    # Passed to Pyroscope as-is; a fresh dict is created per instance.
    tags: dict[str, str] = field(default_factory=dict)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# =============================================================================
|
|
80
|
+
# Pyroscope 集成
|
|
81
|
+
# =============================================================================
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class PyroscopeProfiler:
    """Thin lifecycle wrapper around the optional ``pyroscope`` package.

    ``start`` configures the Pyroscope client from a ``ProfilingConfig`` and
    ``stop`` shuts it down again; both are safe to call repeatedly.
    """

    def __init__(self, config: ProfilingConfig) -> None:
        """Remember *config*; nothing is started until ``start`` is called."""
        self._initialized = False
        self._config = config

    def start(self) -> bool:
        """Configure Pyroscope and report whether profiling is active.

        Returns:
            ``True`` if profiling was already running or has just been
            configured; ``False`` if the package is missing, no endpoint is
            configured, or configuration raised.
        """
        # Already configured: nothing to do.
        if self._initialized:
            return True

        if not PYROSCOPE_AVAILABLE:
            logger.warning("Pyroscope 未安装,跳过 profiling 初始化 (pip install pyroscope-io)")
            return False

        cfg = self._config
        if not cfg.pyroscope_endpoint:
            logger.warning("Pyroscope endpoint 未配置,跳过初始化")
            return False

        try:
            options = {
                "application_name": cfg.service_name,
                "server_address": cfg.pyroscope_endpoint,
                # The client receives an empty string when no token is set.
                "auth_token": cfg.pyroscope_auth_token or "",
                "tags": cfg.tags,
            }
            pyroscope.configure(**options)
            self._initialized = True
            message = (
                f"Pyroscope profiling 已启动 | "
                f"endpoint={self._config.pyroscope_endpoint} "
                f"service={self._config.service_name}"
            )
            logger.info(message)
            return True
        except Exception as exc:
            # Profiling is best-effort: report the failure, never raise.
            logger.error(f"Pyroscope 初始化失败: {exc}")
            return False

    def stop(self) -> None:
        """Shut Pyroscope down if it was started; failures are only logged."""
        if self._initialized:
            try:
                if PYROSCOPE_AVAILABLE:
                    pyroscope.shutdown()
                self._initialized = False
                logger.info("Pyroscope profiling 已停止")
            except Exception as exc:
                logger.warning(f"Pyroscope 关闭失败: {exc}")

    @property
    def is_running(self) -> bool:
        """Whether ``start`` has succeeded and ``stop`` has not completed since."""
        return self._initialized
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# =============================================================================
|
|
145
|
+
# 事件循环阻塞检测
|
|
146
|
+
# =============================================================================
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
class BlockingEvent:
    """A single recorded occurrence of the event loop being blocked."""

    # When the event was recorded.
    timestamp: datetime
    # How long the loop failed to respond, in milliseconds.
    blocked_ms: float
    # Captured main-thread frames; each frame is a dict with the keys
    # "file", "line", "function" and "code".
    main_thread_stack: list[dict[str, Any]]
    # Process resource snapshot, or None when none could be captured.
    process_stats: dict[str, Any] | None = None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class EventLoopBlockingDetector:
    """Detect synchronous code that blocks an asyncio event loop.

    How it works: a daemon thread repeatedly schedules a no-op coroutine on
    the monitored loop and measures how long the loop takes to run it. While
    a ping is outstanding the thread samples the main thread's call stack.
    When the delay exceeds ``blocking_threshold_ms`` the event is stored
    together with the most informative sampled stack and a process resource
    snapshot, logged, and optionally reported as an alert.

    Useful for tracking down:
    - synchronous I/O inside coroutines
    - CPU-bound code running on the event loop
    - deadlocks or long lock waits

    NOTE(review): stacks are always taken from the *main* thread, which
    assumes the monitored loop runs there - confirm for other deployments.
    """

    # Seconds between two stack samples while a ping is outstanding.
    _SAMPLE_INTERVAL_S = 0.01
    # A check gives up after this multiple of the blocking threshold.
    _TIMEOUT_FACTOR = 10
    # Number of innermost frames kept per captured stack.
    _MAX_STACK_FRAMES = 20

    def __init__(self, config: ProfilingConfig) -> None:
        """Create an idle detector; call ``start`` to begin monitoring."""
        self._config = config
        self._running = False
        self._thread: threading.Thread | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._blocking_events: list[BlockingEvent] = []
        self._lock = threading.Lock()
        # Set by ``stop`` so the monitor thread wakes from its interval wait.
        self._stop_event = threading.Event()
        self._total_checks = 0
        self._total_blocks = 0
        # Monotonic time of the last alert; -inf lets the first one through.
        self._last_alert_time: float = float("-inf")
        # Cached psutil process handle, created on first use.
        self._process: Any = None

    def start(self, loop: asyncio.AbstractEventLoop | None = None) -> None:
        """Start the monitor thread.

        Args:
            loop: Event loop to monitor. Defaults to the loop running in the
                calling thread; if there is none, a warning is logged and
                the detector stays stopped.
        """
        if self._running:
            return

        try:
            self._loop = loop or asyncio.get_running_loop()
        except RuntimeError:
            logger.warning("无法获取事件循环,阻塞检测器未启动")
            return

        self._stop_event.clear()
        self._running = True
        self._thread = threading.Thread(
            target=self._monitor_loop,
            daemon=True,
            name="blocking-detector",
        )
        self._thread.start()
        logger.info(
            f"事件循环阻塞检测已启动 | "
            f"阈值={self._config.blocking_threshold_ms}ms "
            f"严重阈值={self._config.blocking_severe_threshold_ms}ms"
        )

    def stop(self) -> None:
        """Stop the monitor thread; a no-op if the detector never started."""
        thread = self._thread
        if not self._running and thread is None:
            return

        self._running = False
        # Wake the monitor thread if it is waiting between two checks.
        self._stop_event.set()
        if thread is not None:
            thread.join(timeout=1.0)
            self._thread = None
        logger.info("事件循环阻塞检测已停止")

    def _monitor_loop(self) -> None:
        """Body of the monitor thread: one responsiveness check per interval."""
        # psutil measures CPU usage relative to the previous call, so take a
        # throwaway reading now to give the first real snapshot a baseline.
        self._capture_process_stats()

        while self._running:
            loop = self._loop
            if not loop:
                break
            if loop.is_closed():
                # Nothing can be scheduled on a closed loop any more, so end
                # the thread instead of failing on every interval.
                logger.debug("事件循环已关闭,阻塞检测线程退出")
                if self._thread is threading.current_thread():
                    self._running = False
                break

            try:
                self._check_once(loop)
            except Exception as exc:
                # Monitoring is best-effort and must never kill the thread.
                logger.debug(f"阻塞检测出错: {exc}")

            self._stop_event.wait(self._config.blocking_check_interval_ms / 1000)

    def _check_once(self, loop: asyncio.AbstractEventLoop) -> None:
        """Ping *loop* once and record a blocking event if it answers late."""
        # Function-scope import keeps this class self-contained. Before
        # Python 3.11 this exception is NOT the builtin ``TimeoutError``.
        from concurrent.futures import TimeoutError as FutureTimeoutError

        threshold_ms = self._config.blocking_threshold_ms
        started = time.perf_counter()
        future = asyncio.run_coroutine_threadsafe(self._ping(), loop)
        deadline = started + threshold_ms * self._TIMEOUT_FACTOR / 1000

        samples: list[list[dict[str, Any]]] = []
        answered = False
        while self._running and time.perf_counter() < deadline:
            try:
                future.result(timeout=self._SAMPLE_INTERVAL_S)
            except FutureTimeoutError:
                # Still waiting. Once half the threshold has passed, start
                # sampling the stack and keep each distinct sample.
                waited_ms = (time.perf_counter() - started) * 1000
                if waited_ms > threshold_ms * 0.5:
                    stack = self._capture_main_thread_stack()
                    if stack and (not samples or stack != samples[-1]):
                        samples.append(stack)
            else:
                answered = True
                break

        if not answered and not self._running:
            # Interrupted by ``stop``; the delay observed so far says nothing.
            return

        elapsed_ms = (time.perf_counter() - started) * 1000
        # Count the check before recording so the block rate that is logged
        # and alerted already includes it and cannot exceed 100%.
        self._total_checks += 1
        if not answered or elapsed_ms > threshold_ms:
            self._record_blocking(elapsed_ms, samples)

    async def _ping(self) -> None:
        """No-op coroutine; only its scheduling delay is of interest."""

    def _record_blocking(
        self,
        blocked_ms: float,
        sampled_stacks: list[list[dict[str, Any]]] | None = None,
    ) -> None:
        """Store, log and possibly alert on one blocking event.

        Args:
            blocked_ms: Measured delay in milliseconds.
            sampled_stacks: Stacks sampled while the loop was blocked. When
                empty or ``None`` the current main-thread stack is captured.
        """
        self._total_blocks += 1

        # Stacks sampled during the block are preferred over the current one.
        if sampled_stacks:
            stack = self._merge_sampled_stacks(sampled_stacks)
        else:
            stack = self._capture_main_thread_stack()

        event = BlockingEvent(
            timestamp=datetime.now(),
            blocked_ms=round(blocked_ms, 2),
            main_thread_stack=stack,
            process_stats=self._capture_process_stats(),
        )

        with self._lock:
            self._blocking_events.append(event)
            if len(self._blocking_events) > self._config.blocking_max_history:
                self._blocking_events.pop(0)

        self._log_blocking(event)

        if self._config.blocking_alert_enabled and self._loop:
            self._maybe_send_alert(event)

    def _capture_main_thread_stack(self) -> list[dict[str, Any]]:
        """Return up to 20 innermost frames of the main thread.

        Frames from ``<frozen ...>`` modules and from this profiling package
        are left out. Each frame is a dict with the keys ``file``, ``line``,
        ``function`` and ``code``. Returns ``[]`` if the main thread has no
        frame.
        """
        main_ident = threading.main_thread().ident
        # Take a single snapshot so the lookup cannot race with a second one.
        frame = sys._current_frames().get(main_ident) if main_ident else None
        if frame is None:
            return []

        frames = []
        for summary in traceback.extract_stack(frame):
            filename = summary.filename
            # Normalise separators so the self-filter also works on Windows.
            if "<frozen" in filename or "monitoring/profiling" in filename.replace("\\", "/"):
                continue
            frames.append({
                "file": filename,
                "line": summary.lineno,
                "function": summary.name,
                "code": summary.line,
            })
        return frames[-self._MAX_STACK_FRAMES:]

    @staticmethod
    def _count_user_frames(stack: list[dict[str, Any]]) -> int:
        """Count frames whose path looks like neither stdlib nor installed packages."""
        stdlib_markers = ("/lib/python", "/Lib/Python", "/opt/homebrew/", "/.pyenv/")
        package_markers = ("site-packages", "dist-packages")
        count = 0
        for frame in stack:
            filename = frame.get("file", "")
            if any(marker in filename for marker in stdlib_markers + package_markers):
                continue
            count += 1
        return count

    def _merge_sampled_stacks(
        self, sampled_stacks: list[list[dict[str, Any]]]
    ) -> list[dict[str, Any]]:
        """Pick the most informative stack out of several samples.

        The stack with the most user-code frames wins; on a tie the earliest
        sample is returned. Returns ``[]`` for empty input.
        """
        if not sampled_stacks:
            return []
        return max(sampled_stacks, key=self._count_user_frames)

    def _capture_process_stats(self) -> dict[str, Any] | None:
        """Return a resource snapshot of this process, or ``None``.

        ``None`` is returned when psutil is not installed or the snapshot
        fails. One ``psutil.Process`` handle is reused across calls because
        ``cpu_percent()`` compares against the previous call on the same
        handle; a fresh handle would always report 0.0.
        """
        if not PSUTIL_AVAILABLE:
            return None

        try:
            proc = self._process
            if proc is None:
                proc = self._process = psutil.Process()
            with proc.oneshot():
                return {
                    "cpu_percent": proc.cpu_percent(),
                    "memory_rss_mb": round(proc.memory_info().rss / 1024**2, 2),
                    "num_threads": proc.num_threads(),
                    # num_fds is not available on every platform.
                    "num_fds": proc.num_fds() if hasattr(proc, "num_fds") else None,
                }
        except Exception:
            return None

    def _format_stack(self, stack: list[dict[str, Any]], limit: int = 5) -> str:
        """Render the last *limit* frames of *stack* as text.

        Frames without a source line are skipped.
        """
        lines = []
        for frame in stack[-limit:]:
            if frame.get("code"):
                lines.append(f" {frame['file']}:{frame['line']} in {frame['function']}")
                lines.append(f" > {frame['code']}")
        return "\n".join(lines)

    def _block_rate_percent(self) -> float:
        """Share of checks that detected blocking, as a percentage."""
        return self._total_blocks / max(self._total_checks, 1) * 100

    def _log_blocking(self, event: BlockingEvent) -> None:
        """Log *event*: error level at or above the severe threshold, else warning."""
        is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
        log_fn = logger.error if is_severe else logger.warning

        stack_str = self._format_stack(event.main_thread_stack)

        stats_str = ""
        if event.process_stats:
            s = event.process_stats
            stats_str = f" | CPU={s.get('cpu_percent', 'N/A')}% RSS={s.get('memory_rss_mb', 'N/A')}MB threads={s.get('num_threads', 'N/A')}"

        log_fn(
            f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms "
            f"(阈值={self._config.blocking_threshold_ms}ms, "
            f"累计={self._total_blocks}次, "
            f"阻塞率={self._block_rate_percent():.2f}%){stats_str}\n"
            f"调用栈:\n{stack_str}"
        )

    def _maybe_send_alert(self, event: BlockingEvent) -> None:
        """Schedule an alert for *event* unless the cooldown is still active."""
        # A monotonic clock keeps the cooldown immune to wall-clock jumps.
        now = time.monotonic()
        if now - self._last_alert_time < self._config.blocking_alert_cooldown_seconds:
            return

        self._last_alert_time = now
        asyncio.run_coroutine_threadsafe(self._send_alert(event), self._loop)

    async def _send_alert(self, event: BlockingEvent) -> None:
        """Emit the alert on the event loop; failures are only logged."""
        try:
            # Imported lazily, as in the original code.
            from aury.boot.infrastructure.monitoring.alerting import (
                AlertEventType,
                AlertSeverity,
                emit_alert,
            )

            is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
            severity = AlertSeverity.CRITICAL if is_severe else AlertSeverity.WARNING

            await emit_alert(
                AlertEventType.CUSTOM,
                f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms",
                severity=severity,
                source="blocking_detector",
                blocked_ms=event.blocked_ms,
                threshold_ms=self._config.blocking_threshold_ms,
                total_blocks=self._total_blocks,
                block_rate=f"{self._block_rate_percent():.2f}%",
                stacktrace=self._format_stack(event.main_thread_stack),
                process_stats=event.process_stats,
            )
        except Exception as e:
            logger.debug(f"发送阻塞告警失败: {e}")

    def get_status(self) -> dict[str, Any]:
        """Return running state, effective config, counters and recent events."""
        with self._lock:
            events = [
                {
                    "timestamp": e.timestamp.isoformat(),
                    "blocked_ms": e.blocked_ms,
                    "stack": e.main_thread_stack,
                    "process_stats": e.process_stats,
                }
                for e in self._blocking_events
            ]

        return {
            "running": self._running,
            "config": {
                "check_interval_ms": self._config.blocking_check_interval_ms,
                "threshold_ms": self._config.blocking_threshold_ms,
                "severe_threshold_ms": self._config.blocking_severe_threshold_ms,
                "alert_enabled": self._config.blocking_alert_enabled,
            },
            "stats": {
                "total_checks": self._total_checks,
                "total_blocks": self._total_blocks,
                "block_rate_percent": round(self._block_rate_percent(), 2),
            },
            "recent_events": events,
        }

    def clear_history(self) -> None:
        """Drop all recorded events and reset both counters."""
        with self._lock:
            self._blocking_events.clear()
            self._total_checks = 0
            self._total_blocks = 0

    @property
    def is_running(self) -> bool:
        """Whether the detector is currently marked as running."""
        return self._running
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# =============================================================================
|
|
481
|
+
# 统一管理器
|
|
482
|
+
# =============================================================================
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
class ProfilingManager:
    """Single entry point for the profiling components.

    Creates a ``PyroscopeProfiler`` and an ``EventLoopBlockingDetector`` from
    one ``ProfilingConfig`` and starts and stops them together.
    """

    # Process-wide instance handed out by ``get_instance``.
    _instance: "ProfilingManager | None" = None

    def __init__(self) -> None:
        """Create an unconfigured manager; call ``configure`` before ``start``."""
        self._config: ProfilingConfig | None = None
        self._pyroscope: PyroscopeProfiler | None = None
        self._blocking_detector: EventLoopBlockingDetector | None = None

    @classmethod
    def get_instance(cls) -> "ProfilingManager":
        """Return the shared manager, creating it on first use."""
        instance = cls._instance
        if instance is None:
            instance = cls._instance = cls()
        return instance

    def configure(self, config: ProfilingConfig) -> None:
        """Store *config* and build fresh components from it."""
        self._config = config
        self._pyroscope = PyroscopeProfiler(config)
        self._blocking_detector = EventLoopBlockingDetector(config)

    async def start(self) -> None:
        """Start every component that the configuration enables.

        Logs a warning and does nothing when ``configure`` was never called.
        """
        config = self._config
        if not config:
            logger.warning("ProfilingManager 未配置")
            return

        # Pyroscope continuous profiling.
        if config.enabled and self._pyroscope:
            self._pyroscope.start()

        # Event-loop blocking detector.
        if config.blocking_detector_enabled and self._blocking_detector:
            self._blocking_detector.start()

    async def stop(self) -> None:
        """Stop both components, if they exist."""
        profiler = self._pyroscope
        if profiler:
            profiler.stop()

        detector = self._blocking_detector
        if detector:
            detector.stop()

    @property
    def pyroscope(self) -> PyroscopeProfiler | None:
        """The Pyroscope profiler, or ``None`` before ``configure``."""
        return self._pyroscope

    @property
    def blocking_detector(self) -> EventLoopBlockingDetector | None:
        """The blocking detector, or ``None`` before ``configure``."""
        return self._blocking_detector

    def get_status(self) -> dict[str, Any]:
        """Return a status dict covering both components."""
        profiler = self._pyroscope
        pyroscope_status = {
            "available": PYROSCOPE_AVAILABLE,
            "running": profiler.is_running if profiler else False,
        }

        detector = self._blocking_detector
        if detector:
            detector_status = detector.get_status()
        else:
            detector_status = {"running": False}

        return {
            "pyroscope": pyroscope_status,
            "blocking_detector": detector_status,
        }
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
# 便捷访问
|
|
559
|
+
def get_profiling_manager() -> ProfilingManager:
    """Return the shared ``ProfilingManager`` (shortcut for ``get_instance``)."""
    manager = ProfilingManager.get_instance()
    return manager
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
__all__ = [
|
|
565
|
+
"BlockingEvent",
|
|
566
|
+
"EventLoopBlockingDetector",
|
|
567
|
+
"ProfilingConfig",
|
|
568
|
+
"ProfilingManager",
|
|
569
|
+
"PyroscopeProfiler",
|
|
570
|
+
"get_profiling_manager",
|
|
571
|
+
"PSUTIL_AVAILABLE",
|
|
572
|
+
"PYROSCOPE_AVAILABLE",
|
|
573
|
+
]
|
|
@@ -65,6 +65,10 @@ class RedisStreamMQ(IMQ):
|
|
|
65
65
|
self._max_len = max_len
|
|
66
66
|
self._consuming = False
|
|
67
67
|
self._owns_client = False
|
|
68
|
+
self._log_sample_counter = 0 # 日志采样计数器
|
|
69
|
+
|
|
70
|
+
# 日志采样率:每 N 个 send 打印 1 次
|
|
71
|
+
LOG_SAMPLE_RATE = 100
|
|
68
72
|
|
|
69
73
|
async def _ensure_client(self) -> None:
|
|
70
74
|
"""确保 Redis 客户端已初始化。"""
|
|
@@ -122,7 +126,10 @@ class RedisStreamMQ(IMQ):
|
|
|
122
126
|
else:
|
|
123
127
|
msg_id = await self._client.connection.xadd(stream_key, data)
|
|
124
128
|
|
|
125
|
-
|
|
129
|
+
# 采样日志:每 N 个消息打印 1 次
|
|
130
|
+
self._log_sample_counter += 1
|
|
131
|
+
if self._log_sample_counter % self.LOG_SAMPLE_RATE == 1:
|
|
132
|
+
logger.debug(f"发送消息到 Stream: {stream_key}, id={msg_id}, count={self._log_sample_counter}")
|
|
126
133
|
return message.id
|
|
127
134
|
|
|
128
135
|
async def receive(
|
|
@@ -179,6 +179,12 @@ class SchedulerManager:
|
|
|
179
179
|
if timezone:
|
|
180
180
|
scheduler_kwargs["timezone"] = timezone
|
|
181
181
|
|
|
182
|
+
# 默认使用 AsyncIOExecutor 避免信号量泄漏
|
|
183
|
+
# ThreadPoolExecutor 在 uvicorn reload/多进程模式下会导致信号量泄漏
|
|
184
|
+
if "executors" not in scheduler_kwargs:
|
|
185
|
+
from apscheduler.executors.asyncio import AsyncIOExecutor
|
|
186
|
+
scheduler_kwargs["executors"] = {"default": AsyncIOExecutor()}
|
|
187
|
+
|
|
182
188
|
instance._scheduler = AsyncIOScheduler(**scheduler_kwargs)
|
|
183
189
|
instance._initialized = True
|
|
184
190
|
cls._instances[name] = instance
|
|
@@ -529,10 +535,16 @@ class SchedulerManager:
|
|
|
529
535
|
else:
|
|
530
536
|
logger.info("调度器已启动,无定时任务")
|
|
531
537
|
|
|
532
|
-
def shutdown(self) -> None:
|
|
533
|
-
"""关闭调度器。
|
|
538
|
+
def shutdown(self, wait: bool = True) -> None:
|
|
539
|
+
"""关闭调度器。
|
|
540
|
+
|
|
541
|
+
Args:
|
|
542
|
+
wait: 是否等待所有正在执行的任务完成。
|
|
543
|
+
默认 True,确保资源正确释放,避免信号量泄漏。
|
|
544
|
+
"""
|
|
534
545
|
if self._scheduler and self._scheduler.running:
|
|
535
|
-
self._scheduler.shutdown()
|
|
546
|
+
self._scheduler.shutdown(wait=wait)
|
|
547
|
+
self._started = False
|
|
536
548
|
logger.info("调度器已关闭")
|
|
537
549
|
|
|
538
550
|
def pause(self) -> None:
|