aury-boot 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. aury/boot/_version.py +2 -2
  2. aury/boot/application/adapter/http.py +17 -6
  3. aury/boot/application/app/base.py +1 -0
  4. aury/boot/application/app/components.py +93 -3
  5. aury/boot/application/config/settings.py +80 -2
  6. aury/boot/commands/init.py +20 -0
  7. aury/boot/commands/pkg.py +31 -1
  8. aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +1 -0
  9. aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl +239 -0
  10. aury/boot/commands/templates/project/env_templates/monitoring.tpl +15 -0
  11. aury/boot/common/logging/setup.py +8 -3
  12. aury/boot/infrastructure/cache/redis.py +82 -16
  13. aury/boot/infrastructure/channel/__init__.py +2 -1
  14. aury/boot/infrastructure/channel/backends/__init__.py +2 -1
  15. aury/boot/infrastructure/channel/backends/redis_cluster.py +124 -0
  16. aury/boot/infrastructure/channel/backends/redis_cluster_channel.py +139 -0
  17. aury/boot/infrastructure/channel/base.py +2 -0
  18. aury/boot/infrastructure/channel/manager.py +9 -1
  19. aury/boot/infrastructure/clients/redis/manager.py +90 -19
  20. aury/boot/infrastructure/database/manager.py +6 -4
  21. aury/boot/infrastructure/monitoring/__init__.py +10 -2
  22. aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +33 -16
  23. aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +14 -13
  24. aury/boot/infrastructure/monitoring/profiling/__init__.py +664 -0
  25. aury/boot/infrastructure/scheduler/__init__.py +2 -0
  26. aury/boot/infrastructure/scheduler/jobstores/__init__.py +10 -0
  27. aury/boot/infrastructure/scheduler/jobstores/redis_cluster.py +255 -0
  28. aury/boot/infrastructure/scheduler/manager.py +15 -3
  29. aury/boot/toolkit/http/__init__.py +180 -85
  30. {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/METADATA +14 -4
  31. {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/RECORD +33 -27
  32. {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/WHEEL +0 -0
  33. {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/entry_points.txt +0 -0
aury/boot/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.0.39'
32
- __version_tuple__ = version_tuple = (0, 0, 39)
31
+ __version__ = version = '0.0.41'
32
+ __version_tuple__ = version_tuple = (0, 0, 41)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -25,10 +25,14 @@ from __future__ import annotations
25
25
 
26
26
  from typing import Any
27
27
 
28
- import httpx
29
-
30
28
  from aury.boot.common.logging import get_trace_id, logger
31
- from aury.boot.toolkit.http import HttpClient, RetryConfig
29
+ from aury.boot.toolkit.http import (
30
+ HttpClient,
31
+ HttpNetworkError,
32
+ HttpStatusError,
33
+ HttpTimeoutError,
34
+ RetryConfig,
35
+ )
32
36
 
33
37
  from .base import BaseAdapter
34
38
  from .config import AdapterSettings
@@ -185,7 +189,7 @@ class HttpAdapter(BaseAdapter):
185
189
  json: JSON 请求体
186
190
  data: 表单数据
187
191
  files: 上传文件
188
- **kwargs: 其他 httpx 参数
192
+ **kwargs: 其他 aiohttp 参数
189
193
 
190
194
  Returns:
191
195
  dict: 响应 JSON
@@ -228,7 +232,7 @@ class HttpAdapter(BaseAdapter):
228
232
  "content": response.text,
229
233
  }
230
234
 
231
- except httpx.TimeoutException as exc:
235
+ except HttpTimeoutError as exc:
232
236
  raise AdapterTimeoutError(
233
237
  f"请求超时: {method} {path}",
234
238
  adapter_name=self.name,
@@ -236,7 +240,7 @@ class HttpAdapter(BaseAdapter):
236
240
  cause=exc,
237
241
  ) from exc
238
242
 
239
- except httpx.HTTPStatusError as exc:
243
+ except HttpStatusError as exc:
240
244
  # HTTP 错误状态码
241
245
  response = exc.response
242
246
  try:
@@ -255,6 +259,13 @@ class HttpAdapter(BaseAdapter):
255
259
  cause=exc,
256
260
  ) from exc
257
261
 
262
+ except HttpNetworkError as exc:
263
+ raise AdapterError(
264
+ f"网络错误: {method} {path} - {exc}",
265
+ adapter_name=self.name,
266
+ cause=exc,
267
+ ) from exc
268
+
258
269
  except Exception as exc:
259
270
  raise AdapterError(
260
271
  f"请求失败: {method} {path} - {type(exc).__name__}: {exc}",
@@ -313,6 +313,7 @@ class FoundationApp(FastAPI):
313
313
  enable_file_rotation=config.log.enable_file_rotation,
314
314
  enable_console=config.log.enable_console,
315
315
  logger_levels=logger_levels,
316
+ enqueue=config.log.enqueue,
316
317
  )
317
318
 
318
319
  # 注册 access 日志(HTTP 请求日志)
@@ -318,13 +318,37 @@ class SchedulerComponent(Component):
318
318
  # jobstores: 根据 URL 自动选择存储后端
319
319
  if scheduler_config.jobstore_url:
320
320
  url = scheduler_config.jobstore_url
321
- if url.startswith("redis://"):
321
+ if url.startswith("redis-cluster://"):
322
+ # Redis Cluster 模式
322
323
  try:
324
+ from aury.boot.infrastructure.scheduler.jobstores import RedisClusterJobStore
325
+
326
+ scheduler_kwargs["jobstores"] = {
327
+ "default": RedisClusterJobStore(url=url)
328
+ }
329
+ logger.info(f"调度器使用 Redis Cluster 存储: {url.split('@')[-1].split('/')[0]}")
330
+ except ImportError:
331
+ logger.warning("Redis Cluster jobstore 需要安装 redis[cluster]: pip install 'redis[cluster]'")
332
+ elif url.startswith("redis://"):
333
+ try:
334
+ from urllib.parse import urlparse
323
335
  from apscheduler.jobstores.redis import RedisJobStore
336
+
337
+ # 解析 Redis URL
338
+ parsed = urlparse(url)
339
+ redis_kwargs: dict = {
340
+ "host": parsed.hostname or "localhost",
341
+ "port": parsed.port or 6379,
342
+ }
343
+ if parsed.password:
344
+ redis_kwargs["password"] = parsed.password
345
+ if parsed.path and parsed.path != "/":
346
+ redis_kwargs["db"] = int(parsed.path.lstrip("/") or 0)
347
+
324
348
  scheduler_kwargs["jobstores"] = {
325
- "default": RedisJobStore.from_url(url)
349
+ "default": RedisJobStore(**redis_kwargs)
326
350
  }
327
- logger.info(f"调度器使用 Redis 存储: {url.split('@')[-1]}")
351
+ logger.info(f"调度器使用 Redis 存储: {parsed.hostname}:{parsed.port}")
328
352
  except ImportError:
329
353
  logger.warning("Redis jobstore 需要安装 redis: pip install redis")
330
354
  else:
@@ -766,6 +790,70 @@ class AlertComponent(Component):
766
790
  pass
767
791
 
768
792
 
793
+ class ProfilingComponent(Component):
794
+ """Profiling 组件。
795
+
796
+ 提供持续性能分析和事件循环阻塞检测:
797
+ - Pyroscope:持续采样生成火焰图
798
+ - 阻塞检测:检测同步代码阻塞事件循环
799
+ """
800
+
801
+ name = "profiling"
802
+ enabled = True
803
+ depends_on: ClassVar[list[str]] = ["alert"] # 告警依赖
804
+
805
+ def can_enable(self, config: BaseConfig) -> bool:
806
+ """当启用 Pyroscope 或阻塞检测时启用。"""
807
+ return self.enabled and (
808
+ config.profiling.enabled or config.profiling.blocking_detector_enabled
809
+ )
810
+
811
+ async def setup(self, app: FoundationApp, config: BaseConfig) -> None:
812
+ """初始化 Profiling 组件。"""
813
+ try:
814
+ from aury.boot.infrastructure.monitoring.profiling import (
815
+ ProfilingConfig,
816
+ ProfilingManager,
817
+ )
818
+
819
+ profiling_config = ProfilingConfig(
820
+ enabled=config.profiling.enabled,
821
+ pyroscope_endpoint=config.profiling.pyroscope_endpoint,
822
+ pyroscope_auth_token=config.profiling.pyroscope_auth_token,
823
+ service_name=config.service.name,
824
+ environment=config.service.environment,
825
+ blocking_detector_enabled=config.profiling.blocking_detector_enabled,
826
+ blocking_check_interval_ms=config.profiling.blocking_check_interval_ms,
827
+ blocking_threshold_ms=config.profiling.blocking_threshold_ms,
828
+ blocking_severe_threshold_ms=config.profiling.blocking_severe_threshold_ms,
829
+ blocking_alert_enabled=config.profiling.blocking_alert_enabled,
830
+ blocking_alert_cooldown_seconds=config.profiling.blocking_alert_cooldown_seconds,
831
+ blocking_max_history=config.profiling.blocking_max_history,
832
+ tags=config.profiling.pyroscope_tags,
833
+ )
834
+
835
+ manager = ProfilingManager.get_instance()
836
+ manager.configure(profiling_config)
837
+ await manager.start()
838
+
839
+ # 保存到 app.state
840
+ app.state.profiling_manager = manager
841
+
842
+ except ImportError as e:
843
+ logger.warning(f"Profiling 依赖未安装,跳过初始化: {e}")
844
+ except Exception as e:
845
+ logger.warning(f"Profiling 初始化失败(非关键): {e}")
846
+
847
+ async def teardown(self, app: FoundationApp) -> None:
848
+ """停止 Profiling 组件。"""
849
+ try:
850
+ manager = getattr(app.state, "profiling_manager", None)
851
+ if manager:
852
+ await manager.stop()
853
+ except Exception as e:
854
+ logger.warning(f"Profiling 关闭失败: {e}")
855
+
856
+
769
857
  class EventBusComponent(Component):
770
858
  """事件总线组件。
771
859
 
@@ -815,6 +903,7 @@ FoundationApp.plugins = [
815
903
  # 设置默认组件
816
904
  FoundationApp.components = [
817
905
  AlertComponent, # 最先初始化告警管理器
906
+ ProfilingComponent, # Profiling 依赖告警
818
907
  DatabaseComponent,
819
908
  MigrationComponent,
820
909
  AdminConsoleComponent,
@@ -837,6 +926,7 @@ __all__ = [
837
926
  "EventBusComponent",
838
927
  "MessageQueueComponent",
839
928
  "MigrationComponent",
929
+ "ProfilingComponent",
840
930
  "SchedulerComponent",
841
931
  "StorageComponent",
842
932
  "TaskComponent",
@@ -263,15 +263,19 @@ class ChannelSettings(BaseModel):
263
263
  支持的后端类型:
264
264
  - memory: 内存后端(默认,单进程)
265
265
  - redis: Redis Pub/Sub(多进程/分布式)
266
+ - redis_cluster: Redis Cluster Sharded Pub/Sub(Redis 7.0+)
267
+
268
+ 注意:URL 的 scheme 会自动决定后端类型:
269
+ - redis-cluster://... 自动使用 redis_cluster 后端
266
270
  """
267
271
 
268
272
  backend: str = Field(
269
273
  default="",
270
- description="通道后端 (memory/redis),空字符串表示不启用"
274
+ description="通道后端 (memory/redis/redis_cluster),空字符串表示不启用"
271
275
  )
272
276
  url: str | None = Field(
273
277
  default=None,
274
- description="Redis URL(当 backend=redis 时需要)"
278
+ description="连接 URL(redis://... redis-cluster://...)"
275
279
  )
276
280
 
277
281
 
@@ -401,6 +405,14 @@ class LogSettings(BaseModel):
401
405
  default=False,
402
406
  description="是否记录 WebSocket 消息内容(注意性能和敏感数据)"
403
407
  )
408
+ enqueue: bool = Field(
409
+ default=False,
410
+ description=(
411
+ "是否启用多进程安全队列。"
412
+ "启用后日志通过 multiprocessing.Queue 传输,"
413
+ "可能导致事件循环阻塞。建议在 asyncio 应用中保持 False"
414
+ )
415
+ )
404
416
 
405
417
 
406
418
  class ServiceSettings(BaseModel):
@@ -464,6 +476,7 @@ class SchedulerSettings(BaseModel):
464
476
  default=None,
465
477
  description=(
466
478
  "任务存储 URL。支持:\n"
479
+ "- redis-cluster://password@host:port(Redis Cluster 存储)\n"
467
480
  "- redis://localhost:6379/0(Redis 存储)\n"
468
481
  "- sqlite:///jobs.db(SQLite 存储)\n"
469
482
  "- postgresql://user:pass@host/db(PostgreSQL 存储)\n"
@@ -741,6 +754,70 @@ class AlertSettings(BaseModel):
741
754
  return self._notifiers
742
755
 
743
756
 
757
+ class ProfilingSettings(BaseModel):
758
+ """Profiling 配置。
759
+
760
+ 环境变量格式: PROFILING__{FIELD}
761
+ 示例: PROFILING__ENABLED, PROFILING__PYROSCOPE_ENDPOINT
762
+
763
+ 功能说明:
764
+ - Pyroscope:持续采样生成火焰图(需安装 pyroscope-io)
765
+ - 阻塞检测:检测同步代码阻塞事件循环(需安装 psutil)
766
+ """
767
+
768
+ # Pyroscope 持续 Profiling
769
+ enabled: bool = Field(
770
+ default=False,
771
+ description="是否启用 Pyroscope 持续 profiling"
772
+ )
773
+ pyroscope_endpoint: str | None = Field(
774
+ default=None,
775
+ description="Pyroscope 服务端点(如 http://pyroscope:4040)"
776
+ )
777
+ pyroscope_auth_token: str | None = Field(
778
+ default=None,
779
+ description="Pyroscope 认证 token(可选)"
780
+ )
781
+ pyroscope_sample_rate: int = Field(
782
+ default=100,
783
+ description="Pyroscope 采样率 (Hz),降低可减少开销"
784
+ )
785
+ pyroscope_tags: dict[str, str] = Field(
786
+ default_factory=dict,
787
+ description="Pyroscope 自定义标签"
788
+ )
789
+
790
+ # 事件循环阻塞检测
791
+ blocking_detector_enabled: bool = Field(
792
+ default=False,
793
+ description="是否启用事件循环阻塞检测"
794
+ )
795
+ blocking_check_interval_ms: float = Field(
796
+ default=100,
797
+ description="阻塞检测间隔 (ms)"
798
+ )
799
+ blocking_threshold_ms: float = Field(
800
+ default=100,
801
+ description="阻塞阈值 (ms),超过此时间记录阻塞事件"
802
+ )
803
+ blocking_severe_threshold_ms: float = Field(
804
+ default=500,
805
+ description="严重阻塞阈值 (ms),超过此时间触发严重告警"
806
+ )
807
+ blocking_alert_enabled: bool = Field(
808
+ default=True,
809
+ description="检测到阻塞时是否发送告警"
810
+ )
811
+ blocking_alert_cooldown_seconds: float = Field(
812
+ default=60,
813
+ description="阻塞告警冷却时间 (秒),避免告警风暴"
814
+ )
815
+ blocking_max_history: int = Field(
816
+ default=50,
817
+ description="保留的阻塞事件历史数量"
818
+ )
819
+
820
+
744
821
  class MigrationSettings(BaseModel):
745
822
  """数据库迁移配置。
746
823
 
@@ -1014,6 +1091,7 @@ class BaseConfig(BaseSettings):
1014
1091
  # ========== 监控告警 ==========
1015
1092
  telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
1016
1093
  alert: AlertSettings = Field(default_factory=AlertSettings)
1094
+ profiling: ProfilingSettings = Field(default_factory=ProfilingSettings)
1017
1095
 
1018
1096
  model_config = SettingsConfigDict(
1019
1097
  case_sensitive=False,
@@ -542,6 +542,18 @@ def _collect_interactive_config() -> dict:
542
542
 
543
543
  config["features"] = features
544
544
 
545
+ # 5.5 监控配置
546
+ console.print()
547
+ console.print("[bold]📊 监控配置[/bold]")
548
+ config["with_otel"] = Confirm.ask(
549
+ " 启用 OpenTelemetry 链路追踪",
550
+ default=True,
551
+ )
552
+ config["with_profiling"] = Confirm.ask(
553
+ " 启用 Profiling (火焰图/阻塞检测)",
554
+ default=False,
555
+ )
556
+
545
557
  # 6. 开发工具
546
558
  console.print()
547
559
  config["with_dev"] = Confirm.ask(
@@ -589,6 +601,12 @@ def _build_dependency_list(config: dict) -> list[str]:
589
601
  if config.get("with_admin_console", True):
590
602
  extras.add("admin")
591
603
 
604
+ # 监控
605
+ if config.get("with_otel", True):
606
+ extras.add("otel")
607
+ if config.get("with_profiling", False):
608
+ extras.add("profiling")
609
+
592
610
  # 开发工具
593
611
  if config.get("with_dev"):
594
612
  extras.add("dev")
@@ -616,6 +634,8 @@ def _show_config_summary(config: dict) -> None:
616
634
  ("服务模式", config.get("service_mode", "api")),
617
635
  ("管理后台", "是" if config.get("with_admin_console", True) else "否"),
618
636
  ("可选功能", ", ".join(config.get("features", [])) or "无"),
637
+ ("OpenTelemetry", "是" if config.get("with_otel", True) else "否"),
638
+ ("Profiling", "是" if config.get("with_profiling", False) else "否"),
619
639
  ("开发工具", "是" if config.get("with_dev") else "否"),
620
640
  ("Docker", "是" if config.get("with_docker") else "否"),
621
641
  ]
aury/boot/commands/pkg.py CHANGED
@@ -45,6 +45,7 @@ class Category(str, Enum):
45
45
  SCHEDULER = "scheduler"
46
46
  ADMIN = "admin"
47
47
  STORAGE = "storage"
48
+ MONITORING = "monitoring"
48
49
  ECOSYSTEM = "ecosystem"
49
50
 
50
51
 
@@ -132,6 +133,29 @@ MODULES: dict[str, ModuleInfo] = {
132
133
  category=Category.STORAGE,
133
134
  deps=["aury-sdk-storage[aws]"],
134
135
  ),
136
+ # 监控
137
+ "otel": ModuleInfo(
138
+ name="otel",
139
+ desc="OpenTelemetry 链路追踪",
140
+ usage="启用 TELEMETRY__ENABLED 自动 instrument FastAPI/SQLAlchemy/httpx",
141
+ category=Category.MONITORING,
142
+ deps=["opentelemetry-api", "opentelemetry-sdk", "opentelemetry-instrumentation-fastapi",
143
+ "opentelemetry-instrumentation-sqlalchemy", "opentelemetry-instrumentation-httpx"],
144
+ ),
145
+ "otel-exporter": ModuleInfo(
146
+ name="otel-exporter",
147
+ desc="OpenTelemetry OTLP 导出器",
148
+ usage="导出 Traces/Metrics/Logs 到 Jaeger/Prometheus/Loki",
149
+ category=Category.MONITORING,
150
+ deps=["opentelemetry-exporter-otlp"],
151
+ ),
152
+ "profiling": ModuleInfo(
153
+ name="profiling",
154
+ desc="Profiling 性能分析",
155
+ usage="Pyroscope 火焰图 + 事件循环阻塞检测",
156
+ category=Category.MONITORING,
157
+ deps=["pyroscope-io", "psutil"],
158
+ ),
135
159
  # 生态包
136
160
  "storage-aws": ModuleInfo(
137
161
  name="storage-aws",
@@ -182,7 +206,12 @@ PRESETS: dict[str, PresetInfo] = {
182
206
  "full": PresetInfo(
183
207
  name="full",
184
208
  desc="完整功能(所有模块)",
185
- modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos"],
209
+ modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos", "otel", "profiling"],
210
+ ),
211
+ "monitoring": PresetInfo(
212
+ name="monitoring",
213
+ desc="完整监控(OTel + Profiling)",
214
+ modules=["otel", "otel-exporter", "profiling"],
186
215
  ),
187
216
  }
188
217
 
@@ -195,6 +224,7 @@ CATEGORY_NAMES: dict[Category, str] = {
195
224
  Category.SCHEDULER: "📦 定时调度",
196
225
  Category.ADMIN: "📦 管理后台",
197
226
  Category.STORAGE: "📦 对象存储",
227
+ Category.MONITORING: "📊 监控分析",
198
228
  Category.ECOSYSTEM: "🌐 生态包",
199
229
  }
200
230
 
@@ -60,3 +60,4 @@ CLI 命令参考请查看 [99-cli.md](./99-cli.md)。
60
60
 
61
61
  ### 监控与告警
62
62
  - [17-alerting.md](./17-alerting.md) - 告警系统(慢请求/慢SQL/异常 → 飞书)
63
+ - [18-monitoring-profiling.md](./18-monitoring-profiling.md) - 监控与 Profiling(火焰图/阻塞检测)
@@ -0,0 +1,239 @@
1
+ # 监控与 Profiling
2
+
3
+ 本文档介绍 {project_name} 项目中的监控和性能分析配置。
4
+
5
+ ## 监控能力概览
6
+
7
+ | 功能 | 用途 | 建议环境 |
8
+ |------|------|----------|
9
+ | OpenTelemetry | 链路追踪、慢请求检测 | 所有环境 |
10
+ | 告警系统 | 异常/慢请求通知 | 所有环境 |
11
+ | Pyroscope | 持续 Profiling、火焰图 | 测试/灰度 |
12
+ | 阻塞检测 | 检测同步代码阻塞协程 | 测试/按需 |
13
+
14
+ ---
15
+
16
+ ## 不同环境的最佳实践
17
+
18
+ ### 开发环境
19
+
20
+ ```bash
21
+ # .env.development
22
+ TELEMETRY__ENABLED=false
23
+ ALERT__ENABLED=false
24
+ PROFILING__ENABLED=false
25
+ PROFILING__BLOCKING_DETECTOR_ENABLED=true # 开发时检测阻塞问题
26
+ ```
27
+
28
+ ### 测试/灰度环境
29
+
30
+ ```bash
31
+ # .env.staging
32
+ TELEMETRY__ENABLED=true
33
+ TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
34
+
35
+ ALERT__ENABLED=true
36
+ ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
37
+ ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
38
+
39
+ # 开启 Profiling 排查性能问题
40
+ PROFILING__ENABLED=true
41
+ PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
42
+
43
+ PROFILING__BLOCKING_DETECTOR_ENABLED=true
44
+ ```
45
+
46
+ ### 生产环境
47
+
48
+ ```bash
49
+ # .env.production
50
+ # 链路追踪 - 必开
51
+ TELEMETRY__ENABLED=true
52
+ TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
53
+ TELEMETRY__SAMPLING_RATE=0.1 # 采样 10% 减少开销
54
+
55
+ # 告警 - 必开
56
+ ALERT__ENABLED=true
57
+ ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
58
+ ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
59
+
60
+ # Profiling - 按需(有约 2-5% CPU 开销)
61
+ PROFILING__ENABLED=false # 出问题时临时开启
62
+ # PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
63
+ # PROFILING__PYROSCOPE_SAMPLE_RATE=10 # 降低采样率减少开销
64
+
65
+ # 阻塞检测 - 按需
66
+ PROFILING__BLOCKING_DETECTOR_ENABLED=false # 出问题时临时开启
67
+ ```
68
+
69
+ ---
70
+
71
+ ## OpenTelemetry
72
+
73
+ 自动 instrument:
74
+ - FastAPI 请求
75
+ - SQLAlchemy SQL 查询
76
+ - httpx 外部调用
77
+
78
+ ### 配置
79
+
80
+ ```bash
81
+ TELEMETRY__ENABLED=true
82
+ TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317 # 可选
83
+ TELEMETRY__SAMPLING_RATE=1.0 # 采样率,1.0=100%
84
+ ```
85
+
86
+ ### 手动 Span
87
+
88
+ ```python
89
+ from aury.boot.infrastructure.monitoring.tracing import span, trace_span
90
+
91
+ # 装饰器方式
92
+ @trace_span(name="call_external_api")
93
+ async def call_api():
94
+ ...
95
+
96
+ # 上下文管理器
97
+ async def process():
98
+ with span("step_1"):
99
+ await do_step_1()
100
+ with span("step_2"):
101
+ await do_step_2()
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Pyroscope 持续 Profiling
107
+
108
+ 生成 CPU 火焰图,定位性能瓶颈。
109
+
110
+ ### 安装
111
+
112
+ ```bash
113
+ pip install pyroscope-io
114
+ ```
115
+
116
+ ### 配置
117
+
118
+ ```bash
119
+ PROFILING__ENABLED=true
120
+ PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
121
+ PROFILING__PYROSCOPE_SAMPLE_RATE=100 # 采样率 Hz
122
+ ```
123
+
124
+ ### 部署 Pyroscope
125
+
126
+ ```yaml
127
+ # docker-compose.yml
128
+ services:
129
+ pyroscope:
130
+ image: grafana/pyroscope:latest
131
+ ports:
132
+ - "4040:4040"
133
+ ```
134
+
135
+ 访问 http://localhost:4040 查看火焰图。
136
+
137
+ ---
138
+
139
+ ## 事件循环阻塞检测
140
+
141
+ 检测同步代码阻塞 asyncio 事件循环的问题。
142
+
143
+ ### 安装
144
+
145
+ ```bash
146
+ pip install psutil
147
+ ```
148
+
149
+ ### 配置
150
+
151
+ ```bash
152
+ PROFILING__BLOCKING_DETECTOR_ENABLED=true
153
+ PROFILING__BLOCKING_THRESHOLD_MS=100 # 阻塞阈值
154
+ PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500 # 严重阈值
155
+ PROFILING__BLOCKING_ALERT_ENABLED=true # 阻塞时发送告警
156
+ PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60 # 告警冷却
157
+ ```
158
+
159
+ ### 工作原理
160
+
161
+ 1. 后台线程按检测间隔(默认每 100ms,可通过 `PROFILING__BLOCKING_CHECK_INTERVAL_MS` 调整)向事件循环投递空任务
162
+ 2. 如果响应延迟 > 阈值,说明事件循环被阻塞
163
+ 3. 自动捕获主线程调用栈 + 进程状态
164
+ 4. 若 `PROFILING__BLOCKING_ALERT_ENABLED=true`,则发送告警(含阻塞代码位置),并受告警冷却时间限制以避免告警风暴
165
+
166
+ ### 告警示例
167
+
168
+ ```
169
+ 事件循环阻塞(严重): 520ms
170
+
171
+ 调用栈:
172
+ app/services/sync_io.py:42 in read_file
173
+ > data = open(path).read() # 同步 IO!
174
+
175
+ 进程状态:
176
+ cpu: 95%, memory: 256MB, threads: 12
177
+ ```
178
+
179
+ ### 常见阻塞原因
180
+
181
+ - `time.sleep()` 应使用 `asyncio.sleep()`
182
+ - `open().read()` 应使用 `aiofiles`
183
+ - `requests.get()` 应使用 `httpx.AsyncClient` 或 `aiohttp`(同步的 `httpx.get()` 同样会阻塞事件循环)
184
+ - CPU 密集计算 应使用 `run_in_executor()`
185
+
186
+ ---
187
+
188
+ ## 环境变量参考
189
+
190
+ ### Telemetry
191
+
192
+ | 变量 | 说明 | 默认值 |
193
+ |------|------|--------|
194
+ | `TELEMETRY__ENABLED` | 是否启用 | `false` |
195
+ | `TELEMETRY__TRACES_ENDPOINT` | Traces 导出端点 | - |
196
+ | `TELEMETRY__LOGS_ENDPOINT` | Logs 导出端点 | - |
197
+ | `TELEMETRY__METRICS_ENDPOINT` | Metrics 导出端点 | - |
198
+ | `TELEMETRY__SAMPLING_RATE` | 采样率(0.0–1.0,1.0=100%) | `1.0` |
199
+
200
+ ### Profiling
201
+
202
+ | 变量 | 说明 | 默认值 |
203
+ |------|------|--------|
204
+ | `PROFILING__ENABLED` | 是否启用 Pyroscope | `false` |
205
+ | `PROFILING__PYROSCOPE_ENDPOINT` | Pyroscope 端点 | - |
206
+ | `PROFILING__PYROSCOPE_SAMPLE_RATE` | 采样率 (Hz) | `100` |
207
+ | `PROFILING__BLOCKING_DETECTOR_ENABLED` | 阻塞检测 | `false` |
208
+ | `PROFILING__BLOCKING_THRESHOLD_MS` | 阻塞阈值 (ms) | `100` |
209
+ | `PROFILING__BLOCKING_SEVERE_THRESHOLD_MS` | 严重阻塞阈值 (ms) | `500` |
210
+ | `PROFILING__BLOCKING_ALERT_ENABLED` | 阻塞告警 | `true` |
211
+ | `PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS` | 告警冷却时间 (秒) | `60` |
212
+
213
+ ---
214
+
215
+ ## 推荐的监控栈
216
+
217
+ ### 开源方案
218
+
219
+ ```
220
+ OpenTelemetry → Jaeger (Traces)
221
+ → Grafana Loki (Logs)
222
+ → Prometheus (Metrics)
223
+
224
+ Pyroscope → 火焰图
225
+ ```
226
+
227
+ ### 云服务方案
228
+
229
+ - **阿里云 ARMS** - APM + 告警
230
+ - **腾讯云 APM** - APM + 告警(与阿里云 ARMS 类似)
231
+ - **Datadog** - 全功能 APM
232
+ - **Sentry** - 错误监控
233
+
234
+ ---
235
+
236
+ ## 相关文档
237
+
238
+ - [17-alerting.md](./17-alerting.md) - 告警系统详细配置
239
+ - [11-logging.md](./11-logging.md) - 日志配置
@@ -12,6 +12,21 @@ TELEMETRY__ENABLED=false
12
12
  # TELEMETRY__LOGS_ENDPOINT=http://loki:3100
13
13
  # TELEMETRY__METRICS_ENDPOINT=http://prometheus:9090
14
14
 
15
+ # ---------- Profiling 性能分析 ----------
16
+ # Pyroscope 持续 Profiling(生成火焰图)
17
+ # 需安装:pip install pyroscope-io
18
+ PROFILING__ENABLED=false
19
+ # PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
20
+ # PROFILING__PYROSCOPE_SAMPLE_RATE=100
21
+
22
+ # 事件循环阻塞检测(检测同步代码阻塞协程)
23
+ # 需安装:pip install psutil
24
+ PROFILING__BLOCKING_DETECTOR_ENABLED=false
25
+ # PROFILING__BLOCKING_THRESHOLD_MS=100
26
+ # PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500
27
+ # PROFILING__BLOCKING_ALERT_ENABLED=true
28
+ # PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60
29
+
15
30
  # ---------- 告警系统 ----------
16
31
  ALERT__ENABLED=false
17
32
  # 慢操作阈值