aury-boot 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,573 @@
1
+ """Profiling 模块。
2
+
3
+ 提供持续性能分析和问题时刻状态快照功能:
4
+ - Pyroscope 集成:持续采样生成火焰图
5
+ - 事件循环阻塞检测:检测同步代码阻塞协程
6
+
7
+ 使用方式:
8
+ # 通过配置启用
9
+ PROFILING__ENABLED=true
10
+ PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
11
+
12
+ # 事件循环阻塞检测
13
+ PROFILING__BLOCKING_DETECTOR_ENABLED=true
14
+ PROFILING__BLOCKING_THRESHOLD_MS=100
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import sys
21
+ import threading
22
+ import time
23
+ import traceback
24
+ from dataclasses import dataclass, field
25
+ from datetime import datetime
26
+ from typing import TYPE_CHECKING, Any
27
+
28
+ from aury.boot.common.logging import logger
29
+
30
+ # Pyroscope 可选依赖
31
+ try:
32
+ import pyroscope
33
+ PYROSCOPE_AVAILABLE = True
34
+ except ImportError:
35
+ pyroscope = None # type: ignore[assignment]
36
+ PYROSCOPE_AVAILABLE = False
37
+
38
+ # psutil 可选依赖(用于进程资源监控)
39
+ try:
40
+ import psutil
41
+ PSUTIL_AVAILABLE = True
42
+ except ImportError:
43
+ psutil = None # type: ignore[assignment]
44
+ PSUTIL_AVAILABLE = False
45
+
46
+ if TYPE_CHECKING:
47
+ from collections.abc import Callable
48
+
49
+
50
+ # =============================================================================
51
+ # 配置
52
+ # =============================================================================
53
+
54
+
55
@dataclass
class ProfilingConfig:
    """Settings for the profiling subsystem.

    Groups the Pyroscope options, the event-loop blocking detector options
    and the tags that are handed to Pyroscope.
    """

    # --- Pyroscope ---------------------------------------------------------
    # Gates Pyroscope start-up in ProfilingManager.start().
    enabled: bool = False
    # Passed to pyroscope.configure() as server_address; empty/None skips start-up.
    pyroscope_endpoint: str | None = None
    # Passed as auth_token (an empty string is sent when unset).
    pyroscope_auth_token: str | None = None
    # Passed as application_name.
    service_name: str = "aury-service"
    # NOTE(review): not read anywhere in the visible module — confirm its consumer.
    environment: str = "development"

    # --- Event-loop blocking detector --------------------------------------
    # Gates detector start-up in ProfilingManager.start().
    blocking_detector_enabled: bool = False
    # Pause between two consecutive checks, in milliseconds.
    blocking_check_interval_ms: float = 100
    # A check slower than this many milliseconds is recorded as a stall.
    blocking_threshold_ms: float = 100
    # Stalls at or above this many milliseconds are logged/alerted as severe.
    blocking_severe_threshold_ms: float = 500
    # Whether recorded stalls may be forwarded as alerts.
    blocking_alert_enabled: bool = True
    # Minimum number of seconds between two alerts.
    blocking_alert_cooldown_seconds: float = 60
    # Maximum number of stall records kept in memory.
    blocking_max_history: int = 50

    # --- Tags ---------------------------------------------------------------
    # Passed to pyroscope.configure() as tags.
    tags: dict[str, str] = field(default_factory=dict)
77
+
78
+
79
+ # =============================================================================
80
+ # Pyroscope 集成
81
+ # =============================================================================
82
+
83
+
84
class PyroscopeProfiler:
    """Lifecycle wrapper around the optional ``pyroscope`` package.

    Configures Grafana Pyroscope from a ``ProfilingConfig`` and remembers
    whether configuration succeeded.
    """

    def __init__(self, config: ProfilingConfig) -> None:
        self._initialized = False
        self._config = config

    def start(self) -> bool:
        """Configure Pyroscope.

        Returns:
            True when profiling is (or already was) configured, False when the
            package is missing, no endpoint is set, or configuration raised.
        """
        if self._initialized:
            return True

        cfg = self._config

        if not PYROSCOPE_AVAILABLE:
            logger.warning("Pyroscope 未安装,跳过 profiling 初始化 (pip install pyroscope-io)")
            return False
        if not cfg.pyroscope_endpoint:
            logger.warning("Pyroscope endpoint 未配置,跳过初始化")
            return False

        try:
            pyroscope.configure(
                application_name=cfg.service_name,
                server_address=cfg.pyroscope_endpoint,
                # The token is optional; an empty string is sent when unset.
                auth_token=cfg.pyroscope_auth_token or "",
                tags=cfg.tags,
            )
            self._initialized = True
            logger.info(
                f"Pyroscope profiling 已启动 | "
                f"endpoint={cfg.pyroscope_endpoint} "
                f"service={cfg.service_name}"
            )
            return True
        except Exception as e:
            logger.error(f"Pyroscope 初始化失败: {e}")
            return False

    def stop(self) -> None:
        """Shut Pyroscope down; a no-op when it was never started.

        A failing shutdown is logged and leaves the profiler marked as running.
        """
        if self._initialized:
            try:
                if PYROSCOPE_AVAILABLE:
                    pyroscope.shutdown()
                self._initialized = False
                logger.info("Pyroscope profiling 已停止")
            except Exception as e:
                logger.warning(f"Pyroscope 关闭失败: {e}")

    @property
    def is_running(self) -> bool:
        """Whether ``start()`` has succeeded and ``stop()`` has not since."""
        return self._initialized
142
+
143
+
144
+ # =============================================================================
145
+ # 事件循环阻塞检测
146
+ # =============================================================================
147
+
148
+
149
@dataclass
class BlockingEvent:
    """Record of one detected event-loop stall."""

    # When the stall was recorded.
    timestamp: datetime
    # Measured delay of the probe, in milliseconds.
    blocked_ms: float
    # Captured main-thread frames; each dict has "file", "line", "function", "code".
    main_thread_stack: list[dict[str, Any]]
    # Process statistics captured with the event, or None when unavailable.
    process_stats: dict[str, Any] | None = None
157
+
158
+
159
class EventLoopBlockingDetector:
    """Detects synchronous code that stalls the asyncio event loop.

    A daemon thread periodically submits a no-op coroutine to the monitored
    loop and measures how long it takes to complete. When the delay exceeds
    ``blocking_threshold_ms`` the main thread's call stack and, when psutil is
    installed, process statistics are captured, logged, kept in a bounded
    history and optionally forwarded as an alert.

    The captured stack is always that of ``threading.main_thread()``.

    Useful for tracking down:
    - synchronous I/O inside coroutines
    - CPU-bound code running on the loop
    - deadlocks or long lock waits
    """

    def __init__(self, config: ProfilingConfig) -> None:
        self._config = config
        self._running = False
        self._thread: threading.Thread | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._blocking_events: list[BlockingEvent] = []
        self._lock = threading.Lock()
        self._total_checks = 0
        self._total_blocks = 0
        self._last_alert_time: float = 0
        # psutil.Process handle reused across events: cpu_percent() measures
        # usage since the previous call on the SAME object, so a fresh object
        # per event would always report 0.0.
        self._process: Any = None

    def start(self, loop: asyncio.AbstractEventLoop | None = None) -> None:
        """Start the monitoring thread.

        Args:
            loop: Event loop to monitor. Defaults to the running loop; when
                there is none, a warning is logged and nothing is started.
        """
        if self._running:
            return

        try:
            self._loop = loop or asyncio.get_running_loop()
        except RuntimeError:
            logger.warning("无法获取事件循环,阻塞检测器未启动")
            return

        if PSUTIL_AVAILABLE:
            try:
                # Establish the CPU baseline now so the first event reports
                # usage since start-up.
                self._get_process()
            except Exception:
                self._process = None  # process stats are best-effort

        self._running = True
        self._thread = threading.Thread(
            target=self._monitor_loop,
            daemon=True,
            name="blocking-detector",
        )
        self._thread.start()
        logger.info(
            f"事件循环阻塞检测已启动 | "
            f"阈值={self._config.blocking_threshold_ms}ms "
            f"严重阈值={self._config.blocking_severe_threshold_ms}ms"
        )

    def stop(self) -> None:
        """Ask the monitoring thread to exit and wait up to one second for it."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=1.0)
            self._thread = None
        logger.info("事件循环阻塞检测已停止")

    def _monitor_loop(self) -> None:
        """Body of the monitoring thread.

        Each iteration submits ``_ping()`` to the loop and polls for its
        completion every 10 ms for at most ten times the threshold. Once half
        the threshold has elapsed, the main-thread stack is sampled on every
        poll so the recorded stack reflects what was running during the stall.
        """
        # Future.result() raises concurrent.futures.TimeoutError, which is only
        # an alias of the builtin TimeoutError from Python 3.11 on. Catching
        # the concurrent.futures name works on every version.
        from concurrent.futures import TimeoutError as FutureTimeoutError

        sample_interval = 0.01  # poll / stack-sampling period: 10 ms

        while self._running and self._loop:
            if self._loop.is_closed():
                # Nothing left to monitor. Submitting to a closed loop would
                # only raise and leave an un-awaited coroutine behind.
                self._running = False
                break

            try:
                threshold_ms = self._config.blocking_threshold_ms
                start_time = time.perf_counter()
                future = asyncio.run_coroutine_threadsafe(self._ping(), self._loop)
                sampled_stacks: list[list[dict[str, Any]]] = []

                try:
                    deadline = time.perf_counter() + threshold_ms * 10 / 1000
                    # Also watch _running so stop() is honoured mid-check.
                    while self._running and time.perf_counter() < deadline:
                        try:
                            future.result(timeout=sample_interval)
                            break  # the loop answered
                        except FutureTimeoutError:
                            elapsed = (time.perf_counter() - start_time) * 1000
                            if elapsed > threshold_ms * 0.5:
                                stack = self._capture_main_thread_stack()
                                # Keep only stacks that differ from the previous sample.
                                if stack and (not sampled_stacks or stack != sampled_stacks[-1]):
                                    sampled_stacks.append(stack)
                except Exception:
                    # The probe failed for another reason; still evaluate the
                    # elapsed time below.
                    pass

                if not future.done():
                    # Do not leave stale probes queued on a blocked loop.
                    future.cancel()

                if not self._running:
                    # stop() was requested during the check; the delay may be
                    # caused by shutdown itself, so do not record it.
                    break

                elapsed_ms = (time.perf_counter() - start_time) * 1000
                # Count the check before recording so the block rate reported
                # with the event includes the current check.
                self._total_checks += 1
                if elapsed_ms > threshold_ms:
                    self._record_blocking(elapsed_ms, sampled_stacks)
            except Exception:
                pass  # the event loop may have been closed

            time.sleep(self._config.blocking_check_interval_ms / 1000)

    async def _ping(self) -> None:
        """No-op probe used to measure the loop's response time."""
        pass

    def _record_blocking(
        self,
        blocked_ms: float,
        sampled_stacks: list[list[dict[str, Any]]] | None = None,
    ) -> None:
        """Store, log and possibly alert on one stall.

        Args:
            blocked_ms: Measured delay in milliseconds.
            sampled_stacks: Stacks sampled while waiting. When empty or None,
                the current main-thread stack is captured instead.
        """
        self._total_blocks += 1

        if sampled_stacks:
            stack = self._merge_sampled_stacks(sampled_stacks)
        else:
            stack = self._capture_main_thread_stack()

        event = BlockingEvent(
            timestamp=datetime.now(),
            blocked_ms=round(blocked_ms, 2),
            main_thread_stack=stack,
            process_stats=self._capture_process_stats(),
        )

        with self._lock:
            self._blocking_events.append(event)
            # Bounded history: drop the oldest record when over the limit.
            if len(self._blocking_events) > self._config.blocking_max_history:
                self._blocking_events.pop(0)

        self._log_blocking(event)

        if self._config.blocking_alert_enabled and self._loop:
            self._maybe_send_alert(event)

    def _capture_main_thread_stack(self) -> list[dict[str, Any]]:
        """Return up to the 20 innermost frames of the main thread.

        Frames from ``<frozen ...>`` modules and from files whose path contains
        ``monitoring/profiling`` are omitted. Returns an empty list when the
        main thread's frame is not available.
        """
        main_thread_id = threading.main_thread().ident
        if not main_thread_id:
            return []

        # Take a single snapshot so the lookup cannot race with a second call.
        frame = sys._current_frames().get(main_thread_id)
        if frame is None:
            return []

        stack = [
            {
                "file": summary.filename,
                "line": summary.lineno,
                "function": summary.name,
                "code": summary.line,
            }
            for summary in traceback.extract_stack(frame)
            if "<frozen" not in summary.filename
            and "monitoring/profiling" not in summary.filename
        ]
        return stack[-20:]

    def _merge_sampled_stacks(
        self, sampled_stacks: list[list[dict[str, Any]]]
    ) -> list[dict[str, Any]]:
        """Pick the sampled stack with the most user-code frames.

        A frame counts as user code when its path matches neither the
        interpreter/stdlib markers nor ``site-packages``/``dist-packages``.
        On a tie the earliest sample wins. Returns an empty list for empty input.
        """
        if not sampled_stacks:
            return []

        stdlib_markers = ("/lib/python", "/Lib/Python", "/opt/homebrew/", "/.pyenv/")
        package_markers = ("site-packages", "dist-packages")

        def score_stack(stack: list[dict[str, Any]]) -> int:
            user_frames = 0
            for frame in stack:
                filename = frame.get("file", "")
                is_stdlib = any(marker in filename for marker in stdlib_markers)
                is_site_packages = any(marker in filename for marker in package_markers)
                if not is_stdlib and not is_site_packages:
                    user_frames += 1
            return user_frames

        return max(sampled_stacks, key=score_stack)

    def _get_process(self) -> Any:
        """Return the cached ``psutil.Process``, creating and priming it once."""
        if self._process is None:
            proc = psutil.Process()
            # The first cpu_percent() call only establishes the baseline.
            proc.cpu_percent()
            self._process = proc
        return self._process

    def _capture_process_stats(self) -> dict[str, Any] | None:
        """Return a snapshot of process statistics, or None when unavailable.

        ``cpu_percent`` is the usage since the previous snapshot (or since the
        handle was created). None is returned when psutil is not installed or
        any psutil call fails.
        """
        if not PSUTIL_AVAILABLE:
            return None

        try:
            proc = self._get_process()
            with proc.oneshot():
                return {
                    "cpu_percent": proc.cpu_percent(),
                    "memory_rss_mb": round(proc.memory_info().rss / 1024**2, 2),
                    "num_threads": proc.num_threads(),
                    # num_fds is not available on every platform.
                    "num_fds": proc.num_fds() if hasattr(proc, "num_fds") else None,
                }
        except Exception:
            return None

    def _format_stack(self, stack: list[dict[str, Any]], limit: int = 5) -> str:
        """Render the last ``limit`` frames as text.

        Every frame contributes a location line; the source line is added only
        when it is known, so frames without source are still shown.
        """
        lines = []
        for frame in stack[-limit:]:
            lines.append(f" {frame['file']}:{frame['line']} in {frame['function']}")
            if frame.get("code"):
                lines.append(f" > {frame['code']}")
        return "\n".join(lines)

    def _block_rate_percent(self) -> float:
        """Return recorded stalls as a percentage of performed checks."""
        return self._total_blocks / max(self._total_checks, 1) * 100

    def _log_blocking(self, event: BlockingEvent) -> None:
        """Log the event: error level when severe, warning level otherwise."""
        is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
        log_fn = logger.error if is_severe else logger.warning

        stack_str = self._format_stack(event.main_thread_stack)

        stats_str = ""
        if event.process_stats:
            s = event.process_stats
            stats_str = f" | CPU={s.get('cpu_percent', 'N/A')}% RSS={s.get('memory_rss_mb', 'N/A')}MB threads={s.get('num_threads', 'N/A')}"

        log_fn(
            f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms "
            f"(阈值={self._config.blocking_threshold_ms}ms, "
            f"累计={self._total_blocks}次, "
            f"阻塞率={self._block_rate_percent():.2f}%){stats_str}\n"
            f"调用栈:\n{stack_str}"
        )

    def _maybe_send_alert(self, event: BlockingEvent) -> None:
        """Schedule an alert on the loop unless one was sent within the cooldown."""
        now = time.time()
        if now - self._last_alert_time < self._config.blocking_alert_cooldown_seconds:
            return

        self._last_alert_time = now
        asyncio.run_coroutine_threadsafe(self._send_alert(event), self._loop)

    async def _send_alert(self, event: BlockingEvent) -> None:
        """Emit the alert through the project's alerting module.

        Any failure, including a failing import, is logged at debug level and
        otherwise ignored.
        """
        try:
            from aury.boot.infrastructure.monitoring.alerting import (
                AlertEventType,
                AlertSeverity,
                emit_alert,
            )

            is_severe = event.blocked_ms >= self._config.blocking_severe_threshold_ms
            severity = AlertSeverity.CRITICAL if is_severe else AlertSeverity.WARNING

            await emit_alert(
                AlertEventType.CUSTOM,
                f"事件循环阻塞{'(严重)' if is_severe else ''}: {event.blocked_ms:.0f}ms",
                severity=severity,
                source="blocking_detector",
                blocked_ms=event.blocked_ms,
                threshold_ms=self._config.blocking_threshold_ms,
                total_blocks=self._total_blocks,
                block_rate=f"{self._block_rate_percent():.2f}%",
                stacktrace=self._format_stack(event.main_thread_stack),
                process_stats=event.process_stats,
            )
        except Exception as e:
            logger.debug(f"发送阻塞告警失败: {e}")

    def get_status(self) -> dict[str, Any]:
        """Return running state, effective configuration, counters and history."""
        with self._lock:
            events = [
                {
                    "timestamp": e.timestamp.isoformat(),
                    "blocked_ms": e.blocked_ms,
                    "stack": e.main_thread_stack,
                    "process_stats": e.process_stats,
                }
                for e in self._blocking_events
            ]

        return {
            "running": self._running,
            "config": {
                "check_interval_ms": self._config.blocking_check_interval_ms,
                "threshold_ms": self._config.blocking_threshold_ms,
                "severe_threshold_ms": self._config.blocking_severe_threshold_ms,
                "alert_enabled": self._config.blocking_alert_enabled,
            },
            "stats": {
                "total_checks": self._total_checks,
                "total_blocks": self._total_blocks,
                "block_rate_percent": round(self._block_rate_percent(), 2),
            },
            "recent_events": events,
        }

    def clear_history(self) -> None:
        """Drop all recorded events and reset both counters."""
        with self._lock:
            self._blocking_events.clear()
            self._total_checks = 0
            self._total_blocks = 0

    @property
    def is_running(self) -> bool:
        """Whether the detector is currently marked as running."""
        return self._running
478
+
479
+
480
+ # =============================================================================
481
+ # 统一管理器
482
+ # =============================================================================
483
+
484
+
485
class ProfilingManager:
    """Single entry point for the profiling components.

    Owns one ``PyroscopeProfiler`` and one ``EventLoopBlockingDetector`` and
    starts/stops them together.
    """

    _instance: "ProfilingManager | None" = None

    def __init__(self) -> None:
        # All three stay None until configure() is called.
        self._config: ProfilingConfig | None = None
        self._pyroscope: PyroscopeProfiler | None = None
        self._blocking_detector: EventLoopBlockingDetector | None = None

    @classmethod
    def get_instance(cls) -> "ProfilingManager":
        """Return the shared instance, creating it on first use."""
        instance = cls._instance
        if instance is None:
            instance = cls._instance = cls()
        return instance

    def configure(self, config: ProfilingConfig) -> None:
        """Store ``config`` and build fresh components from it."""
        self._config = config
        self._pyroscope = PyroscopeProfiler(config)
        self._blocking_detector = EventLoopBlockingDetector(config)

    async def start(self) -> None:
        """Start each component that is enabled in the configuration.

        Logs a warning and does nothing when ``configure()`` was not called.
        """
        cfg = self._config
        if not cfg:
            logger.warning("ProfilingManager 未配置")
            return

        if cfg.enabled and self._pyroscope:
            self._pyroscope.start()

        if cfg.blocking_detector_enabled and self._blocking_detector:
            self._blocking_detector.start()

    async def stop(self) -> None:
        """Stop Pyroscope first, then the blocking detector, when present."""
        for component in (self._pyroscope, self._blocking_detector):
            if component:
                component.stop()

    @property
    def pyroscope(self) -> PyroscopeProfiler | None:
        """The Pyroscope profiler, or None before ``configure()``."""
        return self._pyroscope

    @property
    def blocking_detector(self) -> EventLoopBlockingDetector | None:
        """The blocking detector, or None before ``configure()``."""
        return self._blocking_detector

    def get_status(self) -> dict[str, Any]:
        """Return the status of both components as a dict."""
        profiler = self._pyroscope
        detector = self._blocking_detector

        pyroscope_status = {
            "available": PYROSCOPE_AVAILABLE,
            "running": profiler.is_running if profiler else False,
        }
        detector_status = detector.get_status() if detector else {"running": False}

        return {
            "pyroscope": pyroscope_status,
            "blocking_detector": detector_status,
        }
556
+
557
+
558
+ # 便捷访问
559
def get_profiling_manager() -> ProfilingManager:
    """Return the shared ``ProfilingManager`` (shortcut for ``ProfilingManager.get_instance()``)."""
    manager = ProfilingManager.get_instance()
    return manager
562
+
563
+
564
+ __all__ = [
565
+ "BlockingEvent",
566
+ "EventLoopBlockingDetector",
567
+ "ProfilingConfig",
568
+ "ProfilingManager",
569
+ "PyroscopeProfiler",
570
+ "get_profiling_manager",
571
+ "PSUTIL_AVAILABLE",
572
+ "PYROSCOPE_AVAILABLE",
573
+ ]
@@ -65,6 +65,10 @@ class RedisStreamMQ(IMQ):
65
65
  self._max_len = max_len
66
66
  self._consuming = False
67
67
  self._owns_client = False
68
+ self._log_sample_counter = 0 # 日志采样计数器
69
+
70
+ # 日志采样率:每 N 个 send 打印 1 次
71
+ LOG_SAMPLE_RATE = 100
68
72
 
69
73
  async def _ensure_client(self) -> None:
70
74
  """确保 Redis 客户端已初始化。"""
@@ -122,7 +126,10 @@ class RedisStreamMQ(IMQ):
122
126
  else:
123
127
  msg_id = await self._client.connection.xadd(stream_key, data)
124
128
 
125
- logger.debug(f"发送消息到 Stream: {stream_key}, id={msg_id}")
129
+ # 采样日志:每 N 个消息打印 1 次
130
+ self._log_sample_counter += 1
131
+ if self._log_sample_counter % self.LOG_SAMPLE_RATE == 1:
132
+ logger.debug(f"发送消息到 Stream: {stream_key}, id={msg_id}, count={self._log_sample_counter}")
126
133
  return message.id
127
134
 
128
135
  async def receive(
@@ -179,6 +179,12 @@ class SchedulerManager:
179
179
  if timezone:
180
180
  scheduler_kwargs["timezone"] = timezone
181
181
 
182
+ # 默认使用 AsyncIOExecutor 避免信号量泄漏
183
+ # ThreadPoolExecutor 在 uvicorn reload/多进程模式下会导致信号量泄漏
184
+ if "executors" not in scheduler_kwargs:
185
+ from apscheduler.executors.asyncio import AsyncIOExecutor
186
+ scheduler_kwargs["executors"] = {"default": AsyncIOExecutor()}
187
+
182
188
  instance._scheduler = AsyncIOScheduler(**scheduler_kwargs)
183
189
  instance._initialized = True
184
190
  cls._instances[name] = instance
@@ -529,10 +535,16 @@ class SchedulerManager:
529
535
  else:
530
536
  logger.info("调度器已启动,无定时任务")
531
537
 
532
- def shutdown(self) -> None:
533
- """关闭调度器。"""
538
+ def shutdown(self, wait: bool = True) -> None:
539
+ """关闭调度器。
540
+
541
+ Args:
542
+ wait: 是否等待所有正在执行的任务完成。
543
+ 默认 True,确保资源正确释放,避免信号量泄漏。
544
+ """
534
545
  if self._scheduler and self._scheduler.running:
535
- self._scheduler.shutdown()
546
+ self._scheduler.shutdown(wait=wait)
547
+ self._started = False
536
548
  logger.info("调度器已关闭")
537
549
 
538
550
  def pause(self) -> None: