aury-boot 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/adapter/http.py +17 -6
- aury/boot/application/app/base.py +1 -0
- aury/boot/application/app/components.py +93 -3
- aury/boot/application/config/settings.py +80 -2
- aury/boot/commands/init.py +20 -0
- aury/boot/commands/pkg.py +31 -1
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +1 -0
- aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl +239 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +15 -0
- aury/boot/common/logging/setup.py +8 -3
- aury/boot/infrastructure/cache/redis.py +82 -16
- aury/boot/infrastructure/channel/__init__.py +2 -1
- aury/boot/infrastructure/channel/backends/__init__.py +2 -1
- aury/boot/infrastructure/channel/backends/redis_cluster.py +124 -0
- aury/boot/infrastructure/channel/backends/redis_cluster_channel.py +139 -0
- aury/boot/infrastructure/channel/base.py +2 -0
- aury/boot/infrastructure/channel/manager.py +9 -1
- aury/boot/infrastructure/clients/redis/manager.py +90 -19
- aury/boot/infrastructure/database/manager.py +6 -4
- aury/boot/infrastructure/monitoring/__init__.py +10 -2
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +33 -16
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +14 -13
- aury/boot/infrastructure/monitoring/profiling/__init__.py +664 -0
- aury/boot/infrastructure/scheduler/__init__.py +2 -0
- aury/boot/infrastructure/scheduler/jobstores/__init__.py +10 -0
- aury/boot/infrastructure/scheduler/jobstores/redis_cluster.py +255 -0
- aury/boot/infrastructure/scheduler/manager.py +15 -3
- aury/boot/toolkit/http/__init__.py +180 -85
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/METADATA +14 -4
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/RECORD +33 -27
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.41.dist-info}/entry_points.txt +0 -0
aury/boot/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.0.39'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0, 39)
|
|
31
|
+
__version__ = version = '0.0.41'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 0, 41)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
aury/boot/application/adapter/http.py
CHANGED
|
@@ -25,10 +25,14 @@ from __future__ import annotations
|
|
|
25
25
|
|
|
26
26
|
from typing import Any
|
|
27
27
|
|
|
28
|
-
import httpx
|
|
29
|
-
|
|
30
28
|
from aury.boot.common.logging import get_trace_id, logger
|
|
31
|
-
from aury.boot.toolkit.http import
|
|
29
|
+
from aury.boot.toolkit.http import (
|
|
30
|
+
HttpClient,
|
|
31
|
+
HttpNetworkError,
|
|
32
|
+
HttpStatusError,
|
|
33
|
+
HttpTimeoutError,
|
|
34
|
+
RetryConfig,
|
|
35
|
+
)
|
|
32
36
|
|
|
33
37
|
from .base import BaseAdapter
|
|
34
38
|
from .config import AdapterSettings
|
|
@@ -185,7 +189,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
185
189
|
json: JSON 请求体
|
|
186
190
|
data: 表单数据
|
|
187
191
|
files: 上传文件
|
|
188
|
-
**kwargs: 其他
|
|
192
|
+
**kwargs: 其他 aiohttp 参数
|
|
189
193
|
|
|
190
194
|
Returns:
|
|
191
195
|
dict: 响应 JSON
|
|
@@ -228,7 +232,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
228
232
|
"content": response.text,
|
|
229
233
|
}
|
|
230
234
|
|
|
231
|
-
except
|
|
235
|
+
except HttpTimeoutError as exc:
|
|
232
236
|
raise AdapterTimeoutError(
|
|
233
237
|
f"请求超时: {method} {path}",
|
|
234
238
|
adapter_name=self.name,
|
|
@@ -236,7 +240,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
236
240
|
cause=exc,
|
|
237
241
|
) from exc
|
|
238
242
|
|
|
239
|
-
except
|
|
243
|
+
except HttpStatusError as exc:
|
|
240
244
|
# HTTP 错误状态码
|
|
241
245
|
response = exc.response
|
|
242
246
|
try:
|
|
@@ -255,6 +259,13 @@ class HttpAdapter(BaseAdapter):
|
|
|
255
259
|
cause=exc,
|
|
256
260
|
) from exc
|
|
257
261
|
|
|
262
|
+
except HttpNetworkError as exc:
|
|
263
|
+
raise AdapterError(
|
|
264
|
+
f"网络错误: {method} {path} - {exc}",
|
|
265
|
+
adapter_name=self.name,
|
|
266
|
+
cause=exc,
|
|
267
|
+
) from exc
|
|
268
|
+
|
|
258
269
|
except Exception as exc:
|
|
259
270
|
raise AdapterError(
|
|
260
271
|
f"请求失败: {method} {path} - {type(exc).__name__}: {exc}",
|
|
aury/boot/application/app/components.py
CHANGED
|
@@ -318,13 +318,37 @@ class SchedulerComponent(Component):
|
|
|
318
318
|
# jobstores: 根据 URL 自动选择存储后端
|
|
319
319
|
if scheduler_config.jobstore_url:
|
|
320
320
|
url = scheduler_config.jobstore_url
|
|
321
|
-
if url.startswith("redis://"):
|
|
321
|
+
if url.startswith("redis-cluster://"):
|
|
322
|
+
# Redis Cluster 模式
|
|
322
323
|
try:
|
|
324
|
+
from aury.boot.infrastructure.scheduler.jobstores import RedisClusterJobStore
|
|
325
|
+
|
|
326
|
+
scheduler_kwargs["jobstores"] = {
|
|
327
|
+
"default": RedisClusterJobStore(url=url)
|
|
328
|
+
}
|
|
329
|
+
logger.info(f"调度器使用 Redis Cluster 存储: {url.split('@')[-1].split('/')[0]}")
|
|
330
|
+
except ImportError:
|
|
331
|
+
logger.warning("Redis Cluster jobstore 需要安装 redis[cluster]: pip install 'redis[cluster]'")
|
|
332
|
+
elif url.startswith("redis://"):
|
|
333
|
+
try:
|
|
334
|
+
from urllib.parse import urlparse
|
|
323
335
|
from apscheduler.jobstores.redis import RedisJobStore
|
|
336
|
+
|
|
337
|
+
# 解析 Redis URL
|
|
338
|
+
parsed = urlparse(url)
|
|
339
|
+
redis_kwargs: dict = {
|
|
340
|
+
"host": parsed.hostname or "localhost",
|
|
341
|
+
"port": parsed.port or 6379,
|
|
342
|
+
}
|
|
343
|
+
if parsed.password:
|
|
344
|
+
redis_kwargs["password"] = parsed.password
|
|
345
|
+
if parsed.path and parsed.path != "/":
|
|
346
|
+
redis_kwargs["db"] = int(parsed.path.lstrip("/") or 0)
|
|
347
|
+
|
|
324
348
|
scheduler_kwargs["jobstores"] = {
|
|
325
|
-
"default": RedisJobStore
|
|
349
|
+
"default": RedisJobStore(**redis_kwargs)
|
|
326
350
|
}
|
|
327
|
-
logger.info(f"调度器使用 Redis 存储: {
|
|
351
|
+
logger.info(f"调度器使用 Redis 存储: {parsed.hostname}:{parsed.port}")
|
|
328
352
|
except ImportError:
|
|
329
353
|
logger.warning("Redis jobstore 需要安装 redis: pip install redis")
|
|
330
354
|
else:
|
|
@@ -766,6 +790,70 @@ class AlertComponent(Component):
|
|
|
766
790
|
pass
|
|
767
791
|
|
|
768
792
|
|
|
793
|
+
class ProfilingComponent(Component):
|
|
794
|
+
"""Profiling 组件。
|
|
795
|
+
|
|
796
|
+
提供持续性能分析和事件循环阻塞检测:
|
|
797
|
+
- Pyroscope:持续采样生成火焰图
|
|
798
|
+
- 阻塞检测:检测同步代码阻塞事件循环
|
|
799
|
+
"""
|
|
800
|
+
|
|
801
|
+
name = "profiling"
|
|
802
|
+
enabled = True
|
|
803
|
+
depends_on: ClassVar[list[str]] = ["alert"] # 告警依赖
|
|
804
|
+
|
|
805
|
+
def can_enable(self, config: BaseConfig) -> bool:
|
|
806
|
+
"""当启用 Pyroscope 或阻塞检测时启用。"""
|
|
807
|
+
return self.enabled and (
|
|
808
|
+
config.profiling.enabled or config.profiling.blocking_detector_enabled
|
|
809
|
+
)
|
|
810
|
+
|
|
811
|
+
async def setup(self, app: FoundationApp, config: BaseConfig) -> None:
|
|
812
|
+
"""初始化 Profiling 组件。"""
|
|
813
|
+
try:
|
|
814
|
+
from aury.boot.infrastructure.monitoring.profiling import (
|
|
815
|
+
ProfilingConfig,
|
|
816
|
+
ProfilingManager,
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
profiling_config = ProfilingConfig(
|
|
820
|
+
enabled=config.profiling.enabled,
|
|
821
|
+
pyroscope_endpoint=config.profiling.pyroscope_endpoint,
|
|
822
|
+
pyroscope_auth_token=config.profiling.pyroscope_auth_token,
|
|
823
|
+
service_name=config.service.name,
|
|
824
|
+
environment=config.service.environment,
|
|
825
|
+
blocking_detector_enabled=config.profiling.blocking_detector_enabled,
|
|
826
|
+
blocking_check_interval_ms=config.profiling.blocking_check_interval_ms,
|
|
827
|
+
blocking_threshold_ms=config.profiling.blocking_threshold_ms,
|
|
828
|
+
blocking_severe_threshold_ms=config.profiling.blocking_severe_threshold_ms,
|
|
829
|
+
blocking_alert_enabled=config.profiling.blocking_alert_enabled,
|
|
830
|
+
blocking_alert_cooldown_seconds=config.profiling.blocking_alert_cooldown_seconds,
|
|
831
|
+
blocking_max_history=config.profiling.blocking_max_history,
|
|
832
|
+
tags=config.profiling.pyroscope_tags,
|
|
833
|
+
)
|
|
834
|
+
|
|
835
|
+
manager = ProfilingManager.get_instance()
|
|
836
|
+
manager.configure(profiling_config)
|
|
837
|
+
await manager.start()
|
|
838
|
+
|
|
839
|
+
# 保存到 app.state
|
|
840
|
+
app.state.profiling_manager = manager
|
|
841
|
+
|
|
842
|
+
except ImportError as e:
|
|
843
|
+
logger.warning(f"Profiling 依赖未安装,跳过初始化: {e}")
|
|
844
|
+
except Exception as e:
|
|
845
|
+
logger.warning(f"Profiling 初始化失败(非关键): {e}")
|
|
846
|
+
|
|
847
|
+
async def teardown(self, app: FoundationApp) -> None:
|
|
848
|
+
"""停止 Profiling 组件。"""
|
|
849
|
+
try:
|
|
850
|
+
manager = getattr(app.state, "profiling_manager", None)
|
|
851
|
+
if manager:
|
|
852
|
+
await manager.stop()
|
|
853
|
+
except Exception as e:
|
|
854
|
+
logger.warning(f"Profiling 关闭失败: {e}")
|
|
855
|
+
|
|
856
|
+
|
|
769
857
|
class EventBusComponent(Component):
|
|
770
858
|
"""事件总线组件。
|
|
771
859
|
|
|
@@ -815,6 +903,7 @@ FoundationApp.plugins = [
|
|
|
815
903
|
# 设置默认组件
|
|
816
904
|
FoundationApp.components = [
|
|
817
905
|
AlertComponent, # 最先初始化告警管理器
|
|
906
|
+
ProfilingComponent, # Profiling 依赖告警
|
|
818
907
|
DatabaseComponent,
|
|
819
908
|
MigrationComponent,
|
|
820
909
|
AdminConsoleComponent,
|
|
@@ -837,6 +926,7 @@ __all__ = [
|
|
|
837
926
|
"EventBusComponent",
|
|
838
927
|
"MessageQueueComponent",
|
|
839
928
|
"MigrationComponent",
|
|
929
|
+
"ProfilingComponent",
|
|
840
930
|
"SchedulerComponent",
|
|
841
931
|
"StorageComponent",
|
|
842
932
|
"TaskComponent",
|
|
aury/boot/application/config/settings.py
CHANGED
|
@@ -263,15 +263,19 @@ class ChannelSettings(BaseModel):
|
|
|
263
263
|
支持的后端类型:
|
|
264
264
|
- memory: 内存后端(默认,单进程)
|
|
265
265
|
- redis: Redis Pub/Sub(多进程/分布式)
|
|
266
|
+
- redis_cluster: Redis Cluster Sharded Pub/Sub(Redis 7.0+)
|
|
267
|
+
|
|
268
|
+
注意:URL 的 scheme 会自动决定后端类型:
|
|
269
|
+
- redis-cluster://... 自动使用 redis_cluster 后端
|
|
266
270
|
"""
|
|
267
271
|
|
|
268
272
|
backend: str = Field(
|
|
269
273
|
default="",
|
|
270
|
-
description="通道后端 (memory/redis),空字符串表示不启用"
|
|
274
|
+
description="通道后端 (memory/redis/redis_cluster),空字符串表示不启用"
|
|
271
275
|
)
|
|
272
276
|
url: str | None = Field(
|
|
273
277
|
default=None,
|
|
274
|
-
description="
|
|
278
|
+
description="连接 URL(redis://... 或 redis-cluster://...)"
|
|
275
279
|
)
|
|
276
280
|
|
|
277
281
|
|
|
@@ -401,6 +405,14 @@ class LogSettings(BaseModel):
|
|
|
401
405
|
default=False,
|
|
402
406
|
description="是否记录 WebSocket 消息内容(注意性能和敏感数据)"
|
|
403
407
|
)
|
|
408
|
+
enqueue: bool = Field(
|
|
409
|
+
default=False,
|
|
410
|
+
description=(
|
|
411
|
+
"是否启用多进程安全队列。"
|
|
412
|
+
"启用后日志通过 multiprocessing.Queue 传输,"
|
|
413
|
+
"可能导致事件循环阻塞。建议在 asyncio 应用中保持 False"
|
|
414
|
+
)
|
|
415
|
+
)
|
|
404
416
|
|
|
405
417
|
|
|
406
418
|
class ServiceSettings(BaseModel):
|
|
@@ -464,6 +476,7 @@ class SchedulerSettings(BaseModel):
|
|
|
464
476
|
default=None,
|
|
465
477
|
description=(
|
|
466
478
|
"任务存储 URL。支持:\n"
|
|
479
|
+
"- redis-cluster://password@host:port(Redis Cluster 存储)\n"
|
|
467
480
|
"- redis://localhost:6379/0(Redis 存储)\n"
|
|
468
481
|
"- sqlite:///jobs.db(SQLite 存储)\n"
|
|
469
482
|
"- postgresql://user:pass@host/db(PostgreSQL 存储)\n"
|
|
@@ -741,6 +754,70 @@ class AlertSettings(BaseModel):
|
|
|
741
754
|
return self._notifiers
|
|
742
755
|
|
|
743
756
|
|
|
757
|
+
class ProfilingSettings(BaseModel):
|
|
758
|
+
"""Profiling 配置。
|
|
759
|
+
|
|
760
|
+
环境变量格式: PROFILING__{FIELD}
|
|
761
|
+
示例: PROFILING__ENABLED, PROFILING__PYROSCOPE_ENDPOINT
|
|
762
|
+
|
|
763
|
+
功能说明:
|
|
764
|
+
- Pyroscope:持续采样生成火焰图(需安装 pyroscope-io)
|
|
765
|
+
- 阻塞检测:检测同步代码阻塞事件循环(需安装 psutil)
|
|
766
|
+
"""
|
|
767
|
+
|
|
768
|
+
# Pyroscope 持续 Profiling
|
|
769
|
+
enabled: bool = Field(
|
|
770
|
+
default=False,
|
|
771
|
+
description="是否启用 Pyroscope 持续 profiling"
|
|
772
|
+
)
|
|
773
|
+
pyroscope_endpoint: str | None = Field(
|
|
774
|
+
default=None,
|
|
775
|
+
description="Pyroscope 服务端点(如 http://pyroscope:4040)"
|
|
776
|
+
)
|
|
777
|
+
pyroscope_auth_token: str | None = Field(
|
|
778
|
+
default=None,
|
|
779
|
+
description="Pyroscope 认证 token(可选)"
|
|
780
|
+
)
|
|
781
|
+
pyroscope_sample_rate: int = Field(
|
|
782
|
+
default=100,
|
|
783
|
+
description="Pyroscope 采样率 (Hz),降低可减少开销"
|
|
784
|
+
)
|
|
785
|
+
pyroscope_tags: dict[str, str] = Field(
|
|
786
|
+
default_factory=dict,
|
|
787
|
+
description="Pyroscope 自定义标签"
|
|
788
|
+
)
|
|
789
|
+
|
|
790
|
+
# 事件循环阻塞检测
|
|
791
|
+
blocking_detector_enabled: bool = Field(
|
|
792
|
+
default=False,
|
|
793
|
+
description="是否启用事件循环阻塞检测"
|
|
794
|
+
)
|
|
795
|
+
blocking_check_interval_ms: float = Field(
|
|
796
|
+
default=100,
|
|
797
|
+
description="阻塞检测间隔 (ms)"
|
|
798
|
+
)
|
|
799
|
+
blocking_threshold_ms: float = Field(
|
|
800
|
+
default=100,
|
|
801
|
+
description="阻塞阈值 (ms),超过此时间记录阻塞事件"
|
|
802
|
+
)
|
|
803
|
+
blocking_severe_threshold_ms: float = Field(
|
|
804
|
+
default=500,
|
|
805
|
+
description="严重阻塞阈值 (ms),超过此时间触发严重告警"
|
|
806
|
+
)
|
|
807
|
+
blocking_alert_enabled: bool = Field(
|
|
808
|
+
default=True,
|
|
809
|
+
description="检测到阻塞时是否发送告警"
|
|
810
|
+
)
|
|
811
|
+
blocking_alert_cooldown_seconds: float = Field(
|
|
812
|
+
default=60,
|
|
813
|
+
description="阻塞告警冷却时间 (秒),避免告警风暴"
|
|
814
|
+
)
|
|
815
|
+
blocking_max_history: int = Field(
|
|
816
|
+
default=50,
|
|
817
|
+
description="保留的阻塞事件历史数量"
|
|
818
|
+
)
|
|
819
|
+
|
|
820
|
+
|
|
744
821
|
class MigrationSettings(BaseModel):
|
|
745
822
|
"""数据库迁移配置。
|
|
746
823
|
|
|
@@ -1014,6 +1091,7 @@ class BaseConfig(BaseSettings):
|
|
|
1014
1091
|
# ========== 监控告警 ==========
|
|
1015
1092
|
telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
|
|
1016
1093
|
alert: AlertSettings = Field(default_factory=AlertSettings)
|
|
1094
|
+
profiling: ProfilingSettings = Field(default_factory=ProfilingSettings)
|
|
1017
1095
|
|
|
1018
1096
|
model_config = SettingsConfigDict(
|
|
1019
1097
|
case_sensitive=False,
|
aury/boot/commands/init.py
CHANGED
|
@@ -542,6 +542,18 @@ def _collect_interactive_config() -> dict:
|
|
|
542
542
|
|
|
543
543
|
config["features"] = features
|
|
544
544
|
|
|
545
|
+
# 5.5 监控配置
|
|
546
|
+
console.print()
|
|
547
|
+
console.print("[bold]📊 监控配置[/bold]")
|
|
548
|
+
config["with_otel"] = Confirm.ask(
|
|
549
|
+
" 启用 OpenTelemetry 链路追踪",
|
|
550
|
+
default=True,
|
|
551
|
+
)
|
|
552
|
+
config["with_profiling"] = Confirm.ask(
|
|
553
|
+
" 启用 Profiling (火焰图/阻塞检测)",
|
|
554
|
+
default=False,
|
|
555
|
+
)
|
|
556
|
+
|
|
545
557
|
# 6. 开发工具
|
|
546
558
|
console.print()
|
|
547
559
|
config["with_dev"] = Confirm.ask(
|
|
@@ -589,6 +601,12 @@ def _build_dependency_list(config: dict) -> list[str]:
|
|
|
589
601
|
if config.get("with_admin_console", True):
|
|
590
602
|
extras.add("admin")
|
|
591
603
|
|
|
604
|
+
# 监控
|
|
605
|
+
if config.get("with_otel", True):
|
|
606
|
+
extras.add("otel")
|
|
607
|
+
if config.get("with_profiling", False):
|
|
608
|
+
extras.add("profiling")
|
|
609
|
+
|
|
592
610
|
# 开发工具
|
|
593
611
|
if config.get("with_dev"):
|
|
594
612
|
extras.add("dev")
|
|
@@ -616,6 +634,8 @@ def _show_config_summary(config: dict) -> None:
|
|
|
616
634
|
("服务模式", config.get("service_mode", "api")),
|
|
617
635
|
("管理后台", "是" if config.get("with_admin_console", True) else "否"),
|
|
618
636
|
("可选功能", ", ".join(config.get("features", [])) or "无"),
|
|
637
|
+
("OpenTelemetry", "是" if config.get("with_otel", True) else "否"),
|
|
638
|
+
("Profiling", "是" if config.get("with_profiling", False) else "否"),
|
|
619
639
|
("开发工具", "是" if config.get("with_dev") else "否"),
|
|
620
640
|
("Docker", "是" if config.get("with_docker") else "否"),
|
|
621
641
|
]
|
aury/boot/commands/pkg.py
CHANGED
|
@@ -45,6 +45,7 @@ class Category(str, Enum):
|
|
|
45
45
|
SCHEDULER = "scheduler"
|
|
46
46
|
ADMIN = "admin"
|
|
47
47
|
STORAGE = "storage"
|
|
48
|
+
MONITORING = "monitoring"
|
|
48
49
|
ECOSYSTEM = "ecosystem"
|
|
49
50
|
|
|
50
51
|
|
|
@@ -132,6 +133,29 @@ MODULES: dict[str, ModuleInfo] = {
|
|
|
132
133
|
category=Category.STORAGE,
|
|
133
134
|
deps=["aury-sdk-storage[aws]"],
|
|
134
135
|
),
|
|
136
|
+
# 监控
|
|
137
|
+
"otel": ModuleInfo(
|
|
138
|
+
name="otel",
|
|
139
|
+
desc="OpenTelemetry 链路追踪",
|
|
140
|
+
usage="启用 TELEMETRY__ENABLED 自动 instrument FastAPI/SQLAlchemy/httpx",
|
|
141
|
+
category=Category.MONITORING,
|
|
142
|
+
deps=["opentelemetry-api", "opentelemetry-sdk", "opentelemetry-instrumentation-fastapi",
|
|
143
|
+
"opentelemetry-instrumentation-sqlalchemy", "opentelemetry-instrumentation-httpx"],
|
|
144
|
+
),
|
|
145
|
+
"otel-exporter": ModuleInfo(
|
|
146
|
+
name="otel-exporter",
|
|
147
|
+
desc="OpenTelemetry OTLP 导出器",
|
|
148
|
+
usage="导出 Traces/Metrics/Logs 到 Jaeger/Prometheus/Loki",
|
|
149
|
+
category=Category.MONITORING,
|
|
150
|
+
deps=["opentelemetry-exporter-otlp"],
|
|
151
|
+
),
|
|
152
|
+
"profiling": ModuleInfo(
|
|
153
|
+
name="profiling",
|
|
154
|
+
desc="Profiling 性能分析",
|
|
155
|
+
usage="Pyroscope 火焰图 + 事件循环阻塞检测",
|
|
156
|
+
category=Category.MONITORING,
|
|
157
|
+
deps=["pyroscope-io", "psutil"],
|
|
158
|
+
),
|
|
135
159
|
# 生态包
|
|
136
160
|
"storage-aws": ModuleInfo(
|
|
137
161
|
name="storage-aws",
|
|
@@ -182,7 +206,12 @@ PRESETS: dict[str, PresetInfo] = {
|
|
|
182
206
|
"full": PresetInfo(
|
|
183
207
|
name="full",
|
|
184
208
|
desc="完整功能(所有模块)",
|
|
185
|
-
modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos"],
|
|
209
|
+
modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos", "otel", "profiling"],
|
|
210
|
+
),
|
|
211
|
+
"monitoring": PresetInfo(
|
|
212
|
+
name="monitoring",
|
|
213
|
+
desc="完整监控(OTel + Profiling)",
|
|
214
|
+
modules=["otel", "otel-exporter", "profiling"],
|
|
186
215
|
),
|
|
187
216
|
}
|
|
188
217
|
|
|
@@ -195,6 +224,7 @@ CATEGORY_NAMES: dict[Category, str] = {
|
|
|
195
224
|
Category.SCHEDULER: "📦 定时调度",
|
|
196
225
|
Category.ADMIN: "📦 管理后台",
|
|
197
226
|
Category.STORAGE: "📦 对象存储",
|
|
227
|
+
Category.MONITORING: "📊 监控分析",
|
|
198
228
|
Category.ECOSYSTEM: "🌐 生态包",
|
|
199
229
|
}
|
|
200
230
|
|
|
aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# 监控与 Profiling
|
|
2
|
+
|
|
3
|
+
本文档介绍 {project_name} 项目中的监控和性能分析配置。
|
|
4
|
+
|
|
5
|
+
## 监控能力概览
|
|
6
|
+
|
|
7
|
+
| 功能 | 用途 | 建议环境 |
|
|
8
|
+
|------|------|----------|
|
|
9
|
+
| OpenTelemetry | 链路追踪、慢请求检测 | 所有环境 |
|
|
10
|
+
| 告警系统 | 异常/慢请求通知 | 所有环境 |
|
|
11
|
+
| Pyroscope | 持续 Profiling、火焰图 | 测试/灰度 |
|
|
12
|
+
| 阻塞检测 | 检测同步代码阻塞协程 | 测试/按需 |
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 不同环境的最佳实践
|
|
17
|
+
|
|
18
|
+
### 开发环境
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# .env.development
|
|
22
|
+
TELEMETRY__ENABLED=false
|
|
23
|
+
ALERT__ENABLED=false
|
|
24
|
+
PROFILING__ENABLED=false
|
|
25
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true # 开发时检测阻塞问题
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### 测试/灰度环境
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# .env.staging
|
|
32
|
+
TELEMETRY__ENABLED=true
|
|
33
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
|
|
34
|
+
|
|
35
|
+
ALERT__ENABLED=true
|
|
36
|
+
ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
37
|
+
ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
|
|
38
|
+
|
|
39
|
+
# 开启 Profiling 排查性能问题
|
|
40
|
+
PROFILING__ENABLED=true
|
|
41
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
42
|
+
|
|
43
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 生产环境
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# .env.production
|
|
50
|
+
# 链路追踪 - 必开
|
|
51
|
+
TELEMETRY__ENABLED=true
|
|
52
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
|
|
53
|
+
TELEMETRY__SAMPLING_RATE=0.1 # 采样 10% 减少开销
|
|
54
|
+
|
|
55
|
+
# 告警 - 必开
|
|
56
|
+
ALERT__ENABLED=true
|
|
57
|
+
ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
58
|
+
ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
|
|
59
|
+
|
|
60
|
+
# Profiling - 按需(有约 2-5% CPU 开销)
|
|
61
|
+
PROFILING__ENABLED=false # 出问题时临时开启
|
|
62
|
+
# PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
63
|
+
# PROFILING__PYROSCOPE_SAMPLE_RATE=10 # 降低采样率减少开销
|
|
64
|
+
|
|
65
|
+
# 阻塞检测 - 按需
|
|
66
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=false # 出问题时临时开启
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## OpenTelemetry
|
|
72
|
+
|
|
73
|
+
自动 instrument:
|
|
74
|
+
- FastAPI 请求
|
|
75
|
+
- SQLAlchemy SQL 查询
|
|
76
|
+
- httpx 外部调用
|
|
77
|
+
|
|
78
|
+
### 配置
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
TELEMETRY__ENABLED=true
|
|
82
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317 # 可选
|
|
83
|
+
TELEMETRY__SAMPLING_RATE=1.0 # 采样率,1.0=100%
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 手动 Span
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from aury.boot.infrastructure.monitoring.tracing import span, trace_span
|
|
90
|
+
|
|
91
|
+
# 装饰器方式
|
|
92
|
+
@trace_span(name="call_external_api")
|
|
93
|
+
async def call_api():
|
|
94
|
+
...
|
|
95
|
+
|
|
96
|
+
# 上下文管理器
|
|
97
|
+
async def process():
|
|
98
|
+
with span("step_1"):
|
|
99
|
+
await do_step_1()
|
|
100
|
+
with span("step_2"):
|
|
101
|
+
await do_step_2()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Pyroscope 持续 Profiling
|
|
107
|
+
|
|
108
|
+
生成 CPU 火焰图,定位性能瓶颈。
|
|
109
|
+
|
|
110
|
+
### 安装
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install pyroscope-io
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 配置
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
PROFILING__ENABLED=true
|
|
120
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
121
|
+
PROFILING__PYROSCOPE_SAMPLE_RATE=100 # 采样率 Hz
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### 部署 Pyroscope
|
|
125
|
+
|
|
126
|
+
```yaml
|
|
127
|
+
# docker-compose.yml
|
|
128
|
+
services:
|
|
129
|
+
pyroscope:
|
|
130
|
+
image: grafana/pyroscope:latest
|
|
131
|
+
ports:
|
|
132
|
+
- "4040:4040"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
访问 http://localhost:4040 查看火焰图。
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## 事件循环阻塞检测
|
|
140
|
+
|
|
141
|
+
检测同步代码阻塞 asyncio 事件循环的问题。
|
|
142
|
+
|
|
143
|
+
### 安装
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
pip install psutil
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### 配置
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
153
|
+
PROFILING__BLOCKING_THRESHOLD_MS=100 # 阻塞阈值
|
|
154
|
+
PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500 # 严重阈值
|
|
155
|
+
PROFILING__BLOCKING_ALERT_ENABLED=true # 阻塞时发送告警
|
|
156
|
+
PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60 # 告警冷却
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 工作原理
|
|
160
|
+
|
|
161
|
+
1. 后台线程每 100ms 向事件循环投递空任务
|
|
162
|
+
2. 如果响应延迟 > 阈值,说明事件循环被阻塞
|
|
163
|
+
3. 自动捕获主线程调用栈 + 进程状态
|
|
164
|
+
4. 发送告警(含阻塞代码位置)
|
|
165
|
+
|
|
166
|
+
### 告警示例
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
事件循环阻塞(严重): 520ms
|
|
170
|
+
|
|
171
|
+
调用栈:
|
|
172
|
+
app/services/sync_io.py:42 in read_file
|
|
173
|
+
> data = open(path).read() # 同步 IO!
|
|
174
|
+
|
|
175
|
+
进程状态:
|
|
176
|
+
cpu: 95%, memory: 256MB, threads: 12
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 常见阻塞原因
|
|
180
|
+
|
|
181
|
+
- `time.sleep()` 应使用 `asyncio.sleep()`
|
|
182
|
+
- `open().read()` 应使用 `aiofiles`
|
|
183
|
+
- `requests.get()` 应使用 `httpx` 或 `aiohttp`
|
|
184
|
+
- CPU 密集计算 应使用 `run_in_executor()`
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## 环境变量参考
|
|
189
|
+
|
|
190
|
+
### Telemetry
|
|
191
|
+
|
|
192
|
+
| 变量 | 说明 | 默认值 |
|
|
193
|
+
|------|------|--------|
|
|
194
|
+
| `TELEMETRY__ENABLED` | 是否启用 | `false` |
|
|
195
|
+
| `TELEMETRY__TRACES_ENDPOINT` | Traces 导出端点 | - |
|
|
196
|
+
| `TELEMETRY__LOGS_ENDPOINT` | Logs 导出端点 | - |
|
|
197
|
+
| `TELEMETRY__METRICS_ENDPOINT` | Metrics 导出端点 | - |
|
|
198
|
+
| `TELEMETRY__SAMPLING_RATE` | 采样率 | `1.0` |
|
|
199
|
+
|
|
200
|
+
### Profiling
|
|
201
|
+
|
|
202
|
+
| 变量 | 说明 | 默认值 |
|
|
203
|
+
|------|------|--------|
|
|
204
|
+
| `PROFILING__ENABLED` | 是否启用 Pyroscope | `false` |
|
|
205
|
+
| `PROFILING__PYROSCOPE_ENDPOINT` | Pyroscope 端点 | - |
|
|
206
|
+
| `PROFILING__PYROSCOPE_SAMPLE_RATE` | 采样率 (Hz) | `100` |
|
|
207
|
+
| `PROFILING__BLOCKING_DETECTOR_ENABLED` | 阻塞检测 | `false` |
|
|
208
|
+
| `PROFILING__BLOCKING_THRESHOLD_MS` | 阻塞阈值 | `100` |
|
|
209
|
+
| `PROFILING__BLOCKING_SEVERE_THRESHOLD_MS` | 严重阈值 | `500` |
|
|
210
|
+
| `PROFILING__BLOCKING_ALERT_ENABLED` | 阻塞告警 | `true` |
|
|
211
|
+
| `PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS` | 告警冷却 | `60` |
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## 推荐的监控栈
|
|
216
|
+
|
|
217
|
+
### 开源方案
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
OpenTelemetry → Jaeger (Traces)
|
|
221
|
+
→ Grafana Loki (Logs)
|
|
222
|
+
→ Prometheus (Metrics)
|
|
223
|
+
|
|
224
|
+
Pyroscope → 火焰图
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### 云服务方案
|
|
228
|
+
|
|
229
|
+
- **阿里云 ARMS** - APM + 告警
|
|
230
|
+
- **腾讯云 APM** - 类似
|
|
231
|
+
- **Datadog** - 全功能 APM
|
|
232
|
+
- **Sentry** - 错误监控
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## 相关文档
|
|
237
|
+
|
|
238
|
+
- [17-alerting.md](./17-alerting.md) - 告警系统详细配置
|
|
239
|
+
- [11-logging.md](./11-logging.md) - 日志配置
|
|
aury/boot/commands/templates/project/env_templates/monitoring.tpl
CHANGED
|
@@ -12,6 +12,21 @@ TELEMETRY__ENABLED=false
|
|
|
12
12
|
# TELEMETRY__LOGS_ENDPOINT=http://loki:3100
|
|
13
13
|
# TELEMETRY__METRICS_ENDPOINT=http://prometheus:9090
|
|
14
14
|
|
|
15
|
+
# ---------- Profiling 性能分析 ----------
|
|
16
|
+
# Pyroscope 持续 Profiling(生成火焰图)
|
|
17
|
+
# 需安装:pip install pyroscope-io
|
|
18
|
+
PROFILING__ENABLED=false
|
|
19
|
+
# PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
20
|
+
# PROFILING__PYROSCOPE_SAMPLE_RATE=100
|
|
21
|
+
|
|
22
|
+
# 事件循环阻塞检测(检测同步代码阻塞协程)
|
|
23
|
+
# 需安装:pip install psutil
|
|
24
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=false
|
|
25
|
+
# PROFILING__BLOCKING_THRESHOLD_MS=100
|
|
26
|
+
# PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500
|
|
27
|
+
# PROFILING__BLOCKING_ALERT_ENABLED=true
|
|
28
|
+
# PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60
|
|
29
|
+
|
|
15
30
|
# ---------- 告警系统 ----------
|
|
16
31
|
ALERT__ENABLED=false
|
|
17
32
|
# 慢操作阈值
|